libis-metadata 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +39 -0
  8. data/Rakefile +6 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/metadata +5 -0
  12. data/lib/libis/metadata/cli/cli_downloader.rb +182 -0
  13. data/lib/libis/metadata/cli/cli_helper.rb +74 -0
  14. data/lib/libis/metadata/command_line.rb +25 -0
  15. data/lib/libis/metadata/downloader.rb +117 -0
  16. data/lib/libis/metadata/dublin_core_record.rb +115 -0
  17. data/lib/libis/metadata/field_format.rb +119 -0
  18. data/lib/libis/metadata/fix_field.rb +33 -0
  19. data/lib/libis/metadata/mapper.rb +80 -0
  20. data/lib/libis/metadata/mappers/flandrica.rb +76 -0
  21. data/lib/libis/metadata/mappers/kuleuven.rb +1929 -0
  22. data/lib/libis/metadata/mappers/scope.rb +46 -0
  23. data/lib/libis/metadata/marc21_record.rb +49 -0
  24. data/lib/libis/metadata/marc_record.rb +285 -0
  25. data/lib/libis/metadata/parser/basic_parser.rb +116 -0
  26. data/lib/libis/metadata/parser/dublin_core_parser.rb +35 -0
  27. data/lib/libis/metadata/parser/marc21_parser.rb +205 -0
  28. data/lib/libis/metadata/parser/marc_format_parser.rb +51 -0
  29. data/lib/libis/metadata/parser/marc_rules.rb +34 -0
  30. data/lib/libis/metadata/parser/marc_select_parser.rb +24 -0
  31. data/lib/libis/metadata/parser/patch.rb +22 -0
  32. data/lib/libis/metadata/parser/subfield_criteria_parser.rb +70 -0
  33. data/lib/libis/metadata/parsers.rb +12 -0
  34. data/lib/libis/metadata/sharepoint_mapping.rb +119 -0
  35. data/lib/libis/metadata/sharepoint_record.rb +262 -0
  36. data/lib/libis/metadata/var_field.rb +242 -0
  37. data/lib/libis/metadata/version.rb +5 -0
  38. data/lib/libis/metadata.rb +25 -0
  39. data/lib/libis-metadata.rb +1 -0
  40. data/metadata.gemspec +39 -0
  41. metadata +266 -0
@@ -0,0 +1,46 @@
1
+ # encoding: utf-8
2
+
3
+ require 'libis/tools/metadata/dublin_core_record'
4
+ require 'libis/tools/assert'
5
+
6
+ module Libis
7
+ module Tools
8
+ module Metadata
9
+ module Mappers
10
+ # noinspection RubyResolve
11
+
12
# Mixin for {::Libis::Tools::Metadata::DublinCoreRecord} that converts a DC record as exported by Scope.
module Scope

  # Main conversion method.
  #
  # A new DublinCoreRecord is built from this record's XML. When that record has an isPartOf element, an
  # isReferencedBy element carrying the same content and an 'xsi:type' of 'dcterms:URI' is created and the
  # isPartOf element is replaced by it. Records without isPartOf are returned as built.
  #
  # @return [::Libis::Tools::Metadata::DublinCoreRecord] the converted record
  def to_dc
    # The mixin only makes sense on a Dublin Core record.
    assert(is_a?(Libis::Tools::Metadata::DublinCoreRecord))

    converted = Libis::Tools::Metadata::DublinCoreRecord.new(to_xml)
    return converted unless converted.isPartOf

    # Create the new node for isReferencedBy ...
    replacement = converted.add_node(
      'isReferencedBy',
      converted.isPartOf.content,
      nil,
      'xsi:type' => 'dcterms:URI'
    )

    # ... and put it in the place of isPartOf.
    converted.isPartOf.replace(replacement)

    converted
  end

end
42
+
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,49 @@
1
+ # coding: utf-8
2
+
3
+ require 'cgi'
4
+
5
+ require_relative 'marc_record'
6
+
7
+ module Libis
8
+ module Metadata
9
+
10
# This class implements the private method 'get_all_records' that the base class leaves undefined, to
# accommodate the MARC-XML format.
class Marc21Record < Libis::Metadata::MarcRecord

  private

  # Rebuilds the internal field table from the XML node.
  #
  # - every 'leader' element is stored as a FixField under the tag 'LDR';
  # - every 'controlfield' element is stored as a FixField under its 'tag' attribute;
  # - every 'datafield' element is stored as a VarField (tag, ind1, ind2) holding all of its 'subfield' children.
  #
  # Tags shorter than three characters are converted to an integer and zero-padded to three digits.
  #
  # @return [Hash] the field table, tag => Array of FixField/VarField
  def get_all_records
    @all_records.clear

    @node.xpath('.//leader').each do |leader|
      @all_records['LDR'] << FixField.new('LDR', leader.content)
    end

    @node.xpath('.//controlfield').each do |control|
      tag = control['tag']
      tag = format('%03d', tag.to_i) if tag.size < 3
      @all_records[tag] << FixField.new(tag, control.content)
    end

    @node.xpath('.//datafield').each do |data|
      tag = data['tag']
      tag = format('%03d', tag.to_i) if tag.size < 3

      # Missing indicator attributes become empty strings through to_s.
      field = VarField.new(tag, data['ind1'].to_s, data['ind2'].to_s)
      data.xpath('.//subfield').each do |sub|
        field.add_subfield(sub['code'], sub.content)
      end

      @all_records[tag] << field
    end

    @all_records
  end

end
47
+
48
+ end
49
+ end
@@ -0,0 +1,285 @@
1
+ # coding: utf-8
2
+
3
+ require 'set'
4
+ require 'cgi'
5
+
6
+ require 'libis/tools/xml_document'
7
+ require 'libis/tools/assert'
8
+
9
+ require_relative 'fix_field'
10
+ require_relative 'var_field'
11
+ require_relative 'field_format'
12
+
13
+ module Libis
14
+ module Metadata
15
+
16
+ # noinspection RubyTooManyMethodsInspection
17
+
18
# Base class for reading MARC based records.
#
# The class does not extract the fields from the XML node itself: {#all} relies on a private method
# 'get_all_records' that is not defined here and has to be supplied by a subclass. That method must fill and
# return the internal Hash (tag => Array of FixField/VarField).
#
# For indicator selection: '#' or '' (empty) is wildcard; '_' or ' ' (space) is blank.
class MarcRecord

  # Create a new MarcRecord object.
  #
  # Namespaces are removed from the document that owns the node, so the node can be queried with plain names.
  #
  # @param [XML node] xml_node XML node from Nokogiri or XmlDocument that contains child nodes with the data for
  #   one MARC record.
  def initialize(xml_node)
    @node = xml_node
    @node.document.remove_namespaces!
    # Per-key default so that fields can be appended to a tag that was not seen before.
    @all_records = Hash.new { |h, k| h[k] = [] }
  end

  # Access to the XML node that was supplied to the constructor.
  # @return [XML node]
  def to_raw
    @node
  end

  # Returns the internal data structure (a Hash) with all the MARC data.
  #
  # The internal structure is a Hash with the tag as key and as value an Array of either FixField or VarField
  # instances. The structure is built on first use (and again as long as it is empty).
  #
  # @return [Hash] internal data structure
  def all
    return @all_records unless @all_records.empty?
    @all_records = get_all_records
  end

  # Iterates over all the MARC fields.
  #
  # If a block is supplied it will be called for each field in the MARC record. The supplied argument will be the
  # FixField or VarField instance for each field.
  #
  # @return [Array] The list of the field data or return values for each block call.
  def each
    all.values.flatten.map do |field|
      block_given? ? yield(field) : field
    end
  end

  # Get all fields matching search criteria.
  #
  # A block with one parameter can be supplied when calling this method. Each time a match is found, the block
  # will be called with the field data as argument and the return value of the block will be added to the method's
  # return value. This could for example be used to narrow the selection of the fields:
  #
  #    # Only select 700 tags where $4 subfield contains 'abc', 'def' or 'xyz'
  #    record.all_tags('700') { |v| v.subfield['4'] =~ /^(abc|def|xyz)$/ ? v : nil }.compact
  #
  # @param [String] tag Tag selection string. When it starts with three digits, the characters that follow are
  #   read as indicator 1, indicator 2 and a subfield name ('#' for wildcard, '_' for blank). If the subfield
  #   name is present, a result will be created for each instance found of that subfield. Any other string
  #   (e.g. 'LDR') is used as the tag as a whole.
  # @param [String] subfields Subfield specification. See FieldFormat class for more info; ignored for controlfields.
  # @param [Proc] select_block block that will be executed once for each field found. The block takes one argument
  #   (the field) and should return true or false. True selects the field, false rejects it.
  # @return [Array] If a block was supplied to the method call, the array will contain the result of the block
  #   for each tag found. Otherwise the array will just contain the data for each matching tag.
  def all_tags(tag, subfields = '', select_block = Proc.new {|_| true})
    t, ind1, ind2, subfield = tag =~ /^\d{3}/ ? [tag[0..2], tag[3], tag[4], tag[5]] : [tag, nil, nil, nil]
    result = get_records(t, ind1, ind2, subfield, subfields, &select_block)
    return result unless block_given?
    result.map { |record| yield record }
  end

  alias_method :each_tag, :all_tags

  # Get all fields matching search criteria.
  # As {#all_tags} but without subfield criteria.
  # @param [String] tag Tag selection string. Tag name with indicators, '#' for wildcard, '_' for blank. If an
  #   extra subfield name is added, a result will be created for each instance found of that subfield.
  # @param [Proc] select_block block that will be executed once for each field found. The block takes one argument
  #   (the field) and should return true or false. True selects the field, false rejects it. When nil, no extra
  #   selection is applied.
  # @return [Array] If a block was supplied to the method call, the array will contain the result of the block
  #   for each tag found. Otherwise the array will just contain the data for each matching tag.
  def select_fields(tag, select_block = nil, &block)
    all_tags(tag, nil, select_block, &block)
  end

  # Find the first tag matching the criteria.
  #
  # If a block is supplied, it will be called with the found field data. The return value will be whatever the
  # block returns. If no block is supplied, the field data will be returned. If nothing was found, the return
  # value is nil.
  #
  # @param [String] tag Tag selection string. Tag name with indicators, '#' for wildcard, '_' for blank.
  # @param [String] subfields Subfield specification. See FieldFormat class for more info; ignored for controlfields.
  # @return [Object] nil if nothing found; field data or whatever block returns.
  def first_tag(tag, subfields = '')
    result = all_tags(tag, subfields).first
    return nil unless result
    return result unless block_given?
    yield result
  end

  # Find all subfield values matching the criteria.
  #
  # Collects 'subfields_array(subfields)' of every matching tag into one flat list without nil entries.
  #
  # @param [String] tag Tag selection string. Tag name with indicators, '#' for wildcard, '_' for blank.
  # @param [String] subfields Subfield specification. See FieldFormat class for more info.
  # @return [Array, Boolean] without a block: the list of values. With a block: the block is called for every
  #   value and the return value tells whether any value was found.
  def all_fields(tag, subfields)
    r = all_tags(tag, subfields).map { |t| t.subfields_array(subfields) }.flatten.compact
    return r unless block_given?
    r.each { |field| yield field }
    r.size > 0
  end

  # Find the first subfield value matching the criteria.
  #
  # @param [String] tag Tag selection string. Tag name with indicators, '#' for wildcard, '_' for blank.
  # @param [String] subfields Subfield specification. See FieldFormat class for more info.
  # @return [Object, Boolean] without a block: the first value or nil. With a block: false when nothing was
  #   found, otherwise the block is called with the value and true is returned.
  def first_field(tag, subfields)
    result = all_fields(tag, subfields).first
    return result unless block_given?
    return false unless result
    yield result
    true
  end

  # Perform action on each subfield value found. Code block required.
  #
  # @param [String] tag Tag selection string. Tag name with indicators, '#' for wildcard, '_' for blank.
  # @param [String] subfields Subfield specification. See FieldFormat class for more info.
  def each_field(tag, subfields)
    all_fields(tag, subfields).each do |field|
      yield field
    end
  end

  # Dump content to string.
  # @return [String] the concatenated 'dump' output of every field
  def marc_dump
    all.values.flatten.map { |record| record.dump }.join
  end

  # Save the current MARC record to file.
  #
  # @param [String] filename name of the file; when nil/false nothing is written and the XML document that
  #   wraps the record node is returned instead.
  def save(filename)
    doc = ::Libis::Tools::XmlDocument.new
    doc.root = @node

    return doc unless filename

    doc.save filename, save_with: (::Nokogiri::XML::Node::SaveOptions::NO_EMPTY_TAGS |
                           ::Nokogiri::XML::Node::SaveOptions::AS_XML |
                           ::Nokogiri::XML::Node::SaveOptions::FORMAT
                         )
  end

  # Load XML document from file and create a new {MarcRecord} for it.
  # @param [String] filename name of XML Marc file
  def self.load(filename)
    doc = ::Libis::Tools::XmlDocument.open(filename)
    self.new(doc.root)
  end

  # Load XML document from stream and create a new {MarcRecord} for it.
  # @param [IO,String] io input stream; a String is wrapped in a StringIO first
  def self.read(io)
    io = StringIO.new(io) if io.is_a? String
    doc = ::Libis::Tools::XmlDocument.parse(io)
    self.new(doc.root)
  end

  # Dump Marc record in Aleph Sequential format.
  #
  # One line is produced per field, control fields first and variable fields after them. Every line starts with
  # the content of the first '001' field, right-aligned in 9 positions (left empty when there is no '001').
  #
  # @return [String] Aleph sequential output
  def to_aseq
    # The document number comes from the first 001 control field.
    control_number = first_tag('001')
    doc_number = format('%09s', control_number && control_number.datas)

    # 'all' is a Hash of tag => fields; the type selection has to run on the field objects themselves.
    fields = each

    # NOTE(review): the spacing around 'L' is reproduced as found; confirm against the Aleph sequential layout.
    fix_lines = fields.select { |f| f.is_a? Libis::Metadata::FixField }.map do |f|
      "#{doc_number} #{f.tag} L #{f.datas}\n"
    end

    var_lines = fields.select { |f| f.is_a? Libis::Metadata::VarField }.map do |f|
      subfields = f.keys.map do |code|
        f.subfield_array(code).map { |value| "$$#{code}#{CGI::unescapeHTML(value)}" }
      end
      "#{doc_number} #{f.tag}#{f.ind1}#{f.ind2} L #{subfields.flatten.join}\n"
    end

    (fix_lines + var_lines).join
  end

  protected

  # Format the parts with FieldFormat, without extra default options.
  # A Hash given as last part is taken as the options.
  def element(*parts)
    opts = options parts
    field_format(opts, *parts)
  end

  # As {#element}, with default option join: ' '.
  def list_s(*parts)
    opts = options parts, join: ' '
    field_format(opts, *parts)
  end

  # As {#element}, with default option join: ', '.
  def list_c(*parts)
    opts = options parts, join: ', '
    field_format(opts, *parts)
  end

  # As {#element}, with default option join: ' - '.
  def list_d(*parts)
    opts = options parts, join: ' - '
    field_format(opts, *parts)
  end

  # As {#element}, with default option join: '; '.
  def repeat(*parts)
    opts = options parts, join: '; '
    field_format(opts, *parts)
  end

  # As {#element}, with default option fix: '()'.
  def opt_r(*parts)
    opts = options parts, fix: '()'
    field_format(opts, *parts)
  end

  # As {#element}, with default option fix: '[]'.
  def opt_s(*parts)
    opts = options parts, fix: '[]'
    field_format(opts, *parts)
  end

  # Build an ODIS link from the first two characters of the lower-cased group, the id and the label.
  # @return [String]
  def odis_link(group, id, label)
    "http://www.odis.be/lnk/#{group.downcase[0, 2]}_#{id}\##{label}"
  end

  private

  # Removes a trailing options Hash from args (args is modified) and merges it over the defaults.
  # @return [Hash]
  def options(args, default = {})
    default.merge(args.last.is_a?(::Hash) ? args.pop : {})
  end

  # Runs the parts through FieldFormat with the given default options and returns the resulting String.
  def field_format(default_options, *parts)
    Libis::Metadata::FieldFormat.new(*parts).add_default_options(default_options).to_s
  end

  # Select the fields stored under a tag.
  #
  # Control fields (FixField) always match; variable fields have to match both indicators and the subfield
  # specification. The optional block gets the final say on every field that matched so far.
  #
  # @param [String] tag tag to look up
  # @param [String] ind1 indicator 1 selection: '#' or empty is wildcard, '_' or space is blank
  # @param [String] ind2 indicator 2 selection, same conventions as ind1
  # @param [String] subfield when given, every matching variable field is returned once per instance of this
  #   subfield, each copy holding only that single instance; control fields are left out of such a result
  # @param [String] subfields subfield specification handed to the field's 'match' method
  # @return [Array] the selected fields
  def get_records(tag, ind1 = '', ind2 = '', subfield = nil, subfields = '', &block)

    # Normalise without modifying the caller's strings: '_' means blank, '#' means "don't care".
    ind1 = (ind1 || '').tr('_', ' ').delete('#')
    ind2 = (ind2 || '').tr('_', ' ').delete('#')
    subfields ||= ''

    found = all[tag].select do |v|
      result = v.is_a?(Libis::Metadata::FixField) ||
          ((ind1.empty? || v.ind1 == ind1) &&
              (ind2.empty? || v.ind2 == ind2) &&
              v.match(subfields)
          )
      result &&= block.call(v) if block
      result
    end

    return found unless subfield

    # duplicate tags for subfield instances
    found.map do |field|
      # Control fields have no subfields to split on.
      next [] if field.is_a? Libis::Metadata::FixField
      Array(field.subfield_data[subfield]).map do |instance|
        duplicate_for_subfield(field, subfield, instance)
      end
    end.flatten

  end

  # Builds an independent copy of a variable field in which the given subfield holds only one instance.
  # A fresh VarField is filled instead of using 'dup', as 'dup' would share the subfield Hash with the original.
  # NOTE(review): assumes 'subfield_data' maps a subfield code to an Array of values and that 'add_subfield'
  # appends one value to a code - confirm against VarField.
  def duplicate_for_subfield(field, subfield, instance)
    copy = Libis::Metadata::VarField.new(field.tag, field.ind1, field.ind2)
    field.subfield_data.each do |code, values|
      (code == subfield ? [instance] : values).each { |value| copy.add_subfield(code, value) }
    end
    copy
  end

end
284
+ end
285
+ end
@@ -0,0 +1,116 @@
1
+ require 'parslet'
2
+ require 'parslet/convenience'
3
+
4
+ module Libis
5
+ module Metadata
6
+ module Parser
7
+ # noinspection RubyResolve
8
+
9
# Base grammar for the new style metadata parsers and converters: shared character classes, bracket and quote
# rules and helpers for bracketed or quoted groups. New, not finished and untested.
class BasicParser < Parslet::Parser
  # space
  rule(:space) { match('\s') }
  rule(:space?) { space.maybe }
  rule(:spaces) { space.repeat(1) }
  rule(:spaces?) { space.repeat }

  # numbers
  rule(:number) { match('[0-9]') }
  rule(:number?) { number.maybe }
  rule(:integer) { number.repeat(1) }

  # chars
  rule(:character) { match(/[a-z]/i) }
  rule(:character?) { character.maybe }
  rule(:characters) { character.repeat(1) }

  # word
  rule(:wordchar) { match('\w') }

  # name: one or more runs of a letter or underscore followed by word characters
  rule(:name_string) { ((character | underscore) >> wordchar.repeat).repeat(1) }

  # text: anything that is not a bracket or quote character
  rule(:other) { not_paren }
  rule(:text) { other.repeat(1) }
  rule(:text?) { text.maybe }

  # special chars
  rule(:minus) { str('-') }
  rule(:colon) { str(':') }
  rule(:semicolon) { str(';') }
  rule(:underscore) { str('_') }
  rule(:hashtag) { str('#') }
  rule(:dollar) { str('$') }
  rule(:star) { str('*') }

  # grouping: quotes count as both opening and closing delimiters
  rule(:paren) { lparen | rparen }
  rule(:lparen) { lrparen | lsparen | lcparen | squote | dquote }
  rule(:rparen) { rrparen | rsparen | rcparen | squote | dquote }

  rule(:not_paren) { paren.absent? >> any }
  rule(:not_lparen) { lrparen.absent? >> lsparen.absent? >> lcparen.absent? >> squote.absent? >> dquote.absent? >> any }
  rule(:not_rparen) { rrparen.absent? >> rsparen.absent? >> rcparen.absent? >> squote.absent? >> dquote.absent? >> any }

  rule(:lrparen) { str('(') }
  rule(:lsparen) { str('[') }
  rule(:lcparen) { str('{') }
  rule(:rrparen) { str(')') }
  rule(:rsparen) { str(']') }
  rule(:rcparen) { str('}') }

  rule(:squote) { str("'") }
  rule(:dquote) { str('"') }
  rule(:quote) { squote | dquote }

  rule(:not_squote) { squote.absent? >> any }
  rule(:not_dquote) { dquote.absent? >> any }
  rule(:not_quote) { quote.absent? >> any }

  # Closing delimiter that belongs to an opening delimiter.
  #
  # @param char opening delimiter; compared through case/when, so anything that matches the String literals
  #   with === is accepted
  # @return ')' for '(', '}' for '{', ']' for '['; any other value (e.g. a quote) is returned unchanged
  def complement(char)
    case char
    when '('
      ')'
    when '{'
      '}'
    when '['
      ']'
    else
      char
    end
  end

  # Wraps an atom between an opening delimiter and its matching closing delimiter.
  #
  # The opening delimiter is captured as :paren so that the closing one can be derived from it while parsing.
  # Both delimiters appear in the result, labelled :lparen and :rparen.
  #
  # @param foo atom that has to match between the delimiters
  # @param left_paren atom matching the opening delimiter
  def grouped(foo, left_paren = lparen)
    scope do
      opening = left_paren.capture(:paren).as(:lparen)
      closing = dynamic { |_, context| str(complement(context.captures[:paren])) }.as(:rparen)
      opening >> foo >> closing
    end
  end

  # As {#grouped}, but the delimiters are not labelled in the result.
  #
  # @param foo atom that has to match between the delimiters
  # @param left_paren atom matching the opening delimiter
  def grouped_anonymous(foo, left_paren = lparen)
    scope do
      opening = left_paren.capture(:paren)
      closing = dynamic { |_, context| str(complement(context.captures[:paren])) }
      opening >> foo >> closing
    end
  end

  # Matches a quoted text: an opening quote, an optional run of characters other than that quote, and the
  # same quote again. The optional content is labelled with key.
  #
  # @param [Symbol] key label for the quoted content in the result
  def any_quoted(key = :text)
    scope do
      opening = quote.capture(:quote)
      content = dynamic { |_, context| (str(context.captures[:quote]).absent? >> any).repeat(1) }.maybe.as(key)
      closing = dynamic { |_, context| str(context.captures[:quote]) }
      opening >> content >> closing
    end
  end

  # New instance of the Transformer constant looked up on the parser's own class.
  #
  # @return the transformer, or nil when the lookup or the instantiation raises a StandardError
  def transformer
    self.class::Transformer.new
  rescue StandardError
    nil
  end

end
113
+
114
+ end
115
+ end
116
+ end