traject 0.16.0 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/README.md +183 -191
  4. data/bench/bench.rb +1 -1
  5. data/doc/batch_execution.md +14 -0
  6. data/doc/extending.md +14 -12
  7. data/doc/indexing_rules.md +265 -0
  8. data/lib/traject/command_line.rb +12 -41
  9. data/lib/traject/debug_writer.rb +32 -13
  10. data/lib/traject/indexer.rb +101 -24
  11. data/lib/traject/indexer/settings.rb +18 -17
  12. data/lib/traject/json_writer.rb +32 -11
  13. data/lib/traject/line_writer.rb +6 -6
  14. data/lib/traject/macros/basic.rb +1 -1
  15. data/lib/traject/macros/marc21.rb +17 -13
  16. data/lib/traject/macros/marc21_semantics.rb +27 -25
  17. data/lib/traject/macros/marc_format_classifier.rb +39 -25
  18. data/lib/traject/marc4j_reader.rb +36 -22
  19. data/lib/traject/marc_extractor.rb +79 -75
  20. data/lib/traject/marc_reader.rb +33 -25
  21. data/lib/traject/mock_reader.rb +9 -10
  22. data/lib/traject/ndj_reader.rb +7 -7
  23. data/lib/traject/null_writer.rb +1 -1
  24. data/lib/traject/qualified_const_get.rb +12 -2
  25. data/lib/traject/solrj_writer.rb +61 -52
  26. data/lib/traject/thread_pool.rb +45 -45
  27. data/lib/traject/translation_map.rb +59 -27
  28. data/lib/traject/util.rb +3 -3
  29. data/lib/traject/version.rb +1 -1
  30. data/lib/traject/yaml_writer.rb +1 -1
  31. data/test/debug_writer_test.rb +7 -7
  32. data/test/indexer/each_record_test.rb +4 -4
  33. data/test/indexer/macros_marc21_semantics_test.rb +12 -12
  34. data/test/indexer/macros_marc21_test.rb +10 -10
  35. data/test/indexer/macros_test.rb +1 -1
  36. data/test/indexer/map_record_test.rb +6 -6
  37. data/test/indexer/read_write_test.rb +43 -4
  38. data/test/indexer/settings_test.rb +2 -2
  39. data/test/indexer/to_field_test.rb +8 -8
  40. data/test/marc4j_reader_test.rb +4 -4
  41. data/test/marc_extractor_test.rb +33 -25
  42. data/test/marc_format_classifier_test.rb +3 -3
  43. data/test/marc_reader_test.rb +2 -2
  44. data/test/test_helper.rb +3 -3
  45. data/test/test_support/demo_config.rb +52 -48
  46. data/test/translation_map_test.rb +22 -4
  47. data/test/translation_maps/bad_ruby.rb +2 -2
  48. data/test/translation_maps/both_map.rb +1 -1
  49. data/test/translation_maps/default_literal.rb +1 -1
  50. data/test/translation_maps/default_passthrough.rb +1 -1
  51. data/test/translation_maps/ruby_map.rb +1 -1
  52. metadata +7 -31
  53. data/doc/macros.md +0 -103
@@ -62,10 +62,10 @@ module Traject::Macros
62
62
  def self.get_sortable_author(record)
63
63
  onexx = MarcExtractor.cached("100:110:111", :first => true, :trim_punctuation => true).extract(record).first
64
64
  onexx = onexx.strip if onexx
65
-
65
+
66
66
  titles = []
67
67
  MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
68
- non_filing = field.indicator2.to_i
68
+ non_filing = field.indicator2.to_i
69
69
 
70
70
  str = field.subfields.collect {|sf| Marc21.trim_punctuation(sf.value.strip).strip}.join(" ")
71
71
  str = str.slice(non_filing, str.length)
@@ -73,7 +73,7 @@ module Traject::Macros
73
73
  end.first
74
74
  title = titles.first
75
75
  title = title.strip if title
76
-
76
+
77
77
  return [onexx, title].compact.join(" ")
78
78
  end
79
79
 
@@ -105,26 +105,26 @@ module Traject::Macros
105
105
  str
106
106
  end.first
107
107
  end
108
-
109
-
110
-
108
+
109
+
110
+
111
111
  # A generic way to strip a filing version (i.e., a string with the non-filing
112
112
  # characters stripped off)
113
113
  #
114
114
  # Always returns an array. If :include_original=>true is passed in,
115
115
  # that array will include the original string with the non-filing
116
116
  # characters still in it.
117
-
117
+
118
118
  def extract_marc_filing_version(spec='245abdefghknp', opts={})
119
119
  include_original = opts.delete(:include_original)
120
120
  if opts.size > 0
121
121
  raise RuntimeError.new("extract_marc_filing_version can take only :include_original as an argument, not #{opts.keys.map{|x| "'#{x}'"}.join(' or ')}")
122
122
  end
123
-
123
+
124
124
  extractor = Traject::MarcExtractor.cached(spec, opts)
125
-
125
+
126
126
  lambda do |record, accumulator, context|
127
- extractor.collect_matching_lines(record) do |field, spec|
127
+ extractor.collect_matching_lines(record) do |field, spec|
128
128
  str = extractor.collect_subfields(field, spec).first
129
129
  next unless str and !str.empty?
130
130
  vals = [Marc21Semantics.filing_version(field, str, spec)]
@@ -136,34 +136,34 @@ module Traject::Macros
136
136
  end
137
137
  end
138
138
  end
139
-
140
-
141
-
142
-
139
+
140
+
141
+
142
+
143
143
  # Take in a field, a string extracted from that field, and a spec and
144
- # return the filing version (i.e., the string without the
144
+ # return the filing version (i.e., the string without the
145
145
  # non-filing characters)
146
-
146
+
147
147
  def self.filing_version(field, str, spec)
148
148
  # Control fields don't have non-filing characters
149
149
  return str if field.kind_of? MARC::ControlField
150
-
150
+
151
151
  # 2nd indicator must be > 0
152
152
  ind2 = field.indicator2.to_i
153
153
  return str unless ind2 > 0
154
-
154
+
155
155
  # The spechash must either (a) have no subfields specified, or
156
156
  # (b) include the first subfield in the record
157
-
157
+
158
158
  subs = spec.subfields
159
159
  return str unless subs && subs.include?(field.subfields[0].code)
160
-
160
+
161
161
  # OK. If we got this far we actually need to strip characters off the string
162
-
162
+
163
163
  return str[ind2..-1]
164
164
  end
165
-
166
-
165
+
166
+
167
167
 
168
168
 
169
169
  # maps languages, by default out of 008[35-37] and 041a and 041d
@@ -367,6 +367,9 @@ module Traject::Macros
367
367
  return found_date
368
368
  end
369
369
 
370
+ # REGEX meant to rule out obvious non-LCC's, and only allow things
371
+ # plausibly LCC's.
372
+ LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
370
373
  # Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH)
371
374
  # from usual parts of the marc record. Maps them to high-level broad categories,
372
375
  # basically just using the first part of the LCC. Note it's just looking in bib-level
@@ -379,7 +382,6 @@ module Traject::Macros
379
382
  # or nil.
380
383
  #
381
384
  # The categories output aren't great, but they're something.
382
- LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
383
385
  def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a")
384
386
  # Trying to match things that look like LCC, and not match things
385
387
  # that don't. Is tricky.
@@ -503,4 +505,4 @@ module Traject::Macros
503
505
 
504
506
 
505
507
  end
506
- end
508
+ end
@@ -1,9 +1,19 @@
1
1
  module Traject
2
2
  module Macros
3
- # See MarcFormatClassifier class
3
+ # To use the marc_format macro, in your configuration file:
4
+ #
5
+ # require 'traject/macros/marc_formats'
6
+ # extend Traject::Macros::MarcFormats
7
+ #
8
+ # to_field("format_s") marc_formats
9
+ #
10
+ # See also MarcFormatClassifier, which can be used directly for a bit more
11
+ # control.
4
12
  module MarcFormats
5
13
  # very opinionated macro that just adds a grab bag of format/genre/types
6
- # into one field. You may want ot build your own from MarcFormatClassifier functions instead.
14
+ # from our own custom vocabulary, all into one field.
15
+ # You may want to build your own from MarcFormatClassifier functions instead.
16
+ #
7
17
  def marc_formats
8
18
  lambda do |record, accumulator|
9
19
  accumulator.concat Traject::Macros::MarcFormatClassifier.new(record).formats
@@ -12,10 +22,11 @@ module Traject
12
22
  end
13
23
 
14
24
 
15
- # Not actually a macro, but we're keeping it here for now,
16
- # a class for classifying marc according to format/genre/type.
25
+ # A tool for classifying MARC records according to format/form/genre/type,
26
+ # just using our own custom vocabulary for those things.
17
27
  #
18
- # VERY opinionated.
28
+ # used by the `marc_formats` macro, but you can also use it directly
29
+ # for a bit more control.
19
30
  class MarcFormatClassifier
20
31
  attr_reader :record
21
32
 
@@ -24,22 +35,25 @@ module Traject
24
35
  end
25
36
 
26
37
  # A very opinionated method that just kind of jams together
27
- # all the possible format/genre/types into one array of 1 to N elements.
38
+ # all the possible format/genre/types into one array of 1 to N elements.
28
39
  #
29
- # Default "Other" will be used
40
+ # If no other values are present, the default value "Other" will be used.
41
+ #
42
+ # See also individual methods which you can use to separate into
43
+ # different facets or do other custom things.
30
44
  def formats(options = {})
31
45
  options = {:default => "Other"}.merge(options)
32
46
 
33
47
  formats = []
34
48
 
35
49
  formats.concat genre
36
-
50
+
37
51
  formats << "Manuscript/Archive" if manuscript_archive?
38
52
  formats << "Microform" if microform?
39
53
  formats << "Online" if online?
40
54
 
41
55
  # In our own data, if it's an audio recording, it might show up
42
- # as print, but it's probably not.
56
+ # as print, but it's probably not.
43
57
  formats << "Print" if print? && ! (formats.include?("Non-musical Recording") || formats.include?("Musical Recording"))
44
58
 
45
59
  # If it's a Dissertation, we decide it's NOT a book
@@ -64,11 +78,11 @@ module Traject
64
78
  # Returns 1 or more values in an array from:
65
79
  # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
66
80
  # Image; Software/Data; Video/Film
67
- #
68
- # Uses leader byte 6, leader byte 7, and 007 byte 0.
81
+ #
82
+ # Uses leader byte 6, leader byte 7, and 007 byte 0.
69
83
  #
70
84
  # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
71
- # so you can customize labels if you want.
85
+ # so you can customize labels if you want.
72
86
  def genre
73
87
  marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
74
88
  marc_genre_007 = Traject::TranslationMap.new("marc_genre_007")
@@ -96,18 +110,18 @@ module Traject
96
110
  end
97
111
  end
98
112
 
99
- # Algorithm with help from Chris Case.
100
- # * If it has any RDA 338, then it's print if it has a value of
101
- # volume, sheet, or card.
113
+ # Algorithm with help from Chris Case.
114
+ # * If it has any RDA 338, then it's print if it has a value of
115
+ # volume, sheet, or card.
102
116
  # * If it does not have an RDA 338, it's print if and only if it has
103
- # NO 245$h GMD.
117
+ # NO 245$h GMD.
104
118
  #
105
- # * Here at JH, for legacy reasons we also choose to not
119
+ # * Here at JH, for legacy reasons we also choose to not
106
120
  # call it print if it's already been marked audio, but
107
- # we do that in a different method.
121
+ # we do that in a different method.
108
122
  #
109
123
  # This algorithm is definitely going to get some things wrong in
110
- # both directions, with real world data. But seems to be good enough.
124
+ # both directions, with real world data. But seems to be good enough.
111
125
  def print?
112
126
 
113
127
 
@@ -116,7 +130,7 @@ module Traject
116
130
  end
117
131
 
118
132
  if rda338.length > 0
119
- rda338.find do |field|
133
+ rda338.find do |field|
120
134
  field.subfields.find do |sf|
121
135
  (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
122
136
  (sf.code == "b" && %w{nc no nb}.include?(sf.value))
@@ -128,7 +142,7 @@ module Traject
128
142
  end
129
143
 
130
144
  # We use marc 007 to determine if this represents an online
131
- # resource. But sometimes resort to 245$h GMD too.
145
+ # resource. But sometimes resort to 245$h GMD too.
132
146
  def online?
133
147
  # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
134
148
  found_007 = record.find do |field|
@@ -140,8 +154,8 @@ module Traject
140
154
  # Otherwise, if it has a GMD ["electronic resource"], we count it
141
155
  # as online only if NO 007[0] == 'c' exists, cause if it does we already
142
156
  # know it's electronic but not remote, otherwise first try would
143
- # have found it.
144
- return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
157
+ # have found it.
158
+ return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
145
159
  end
146
160
 
147
161
  # if field 007 byte 0 is 'h', that's microform. But many of our microform
@@ -153,7 +167,7 @@ module Traject
153
167
  record.find {|f| (f.tag == "007") && (f.value[0] == "h")}
154
168
  end
155
169
 
156
- # Marked as manuscript OR archive.
170
+ # Marked as manuscript OR archive.
157
171
  def manuscript_archive?
158
172
  leader06 = record.leader.slice(6)
159
173
  leader08 = record.leader.slice(8)
@@ -177,4 +191,4 @@ module Traject
177
191
 
178
192
  end
179
193
  end
180
- end
194
+ end
@@ -2,24 +2,21 @@ require 'traject'
2
2
  require 'marc'
3
3
  require 'marc/marc4j'
4
4
 
5
- # Uses Marc4J to read the marc records, but then translates them to
6
- # ruby-marc before delivering them still, Marc4J is just inside the black
7
- # box.
5
+ # `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
6
+ # into standard ruby-marc MARC::Record objects. This reader is often faster than
7
+ # Traject::MarcReader, especially for XML, and offers support for reading Marc8
8
+ # encoded records and transcoding to UTF8.
8
9
  #
9
- # But one way to get ability to transcode from Marc8. Records it delivers
10
- # are ALWAYS in UTF8, will be transcoded if needed.
10
+ # Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
11
+ # for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
12
+ # for reading xml. The actual code for dealing with Marc4J is in the separate
13
+ # [marc-marc4j gem](https://github.com/billdueber/ruby-marc-marc4j).
11
14
  #
12
- # Also hope it gives us some performance benefit.
15
+ # See also the pure ruby Traject::MarcReader as an alternative, if you need to read
16
+ # marc-in-json, or if you don't need binary Marc8 support, it may in some cases
17
+ # be faster.
13
18
  #
14
- # Uses the Marc4J MarcPermissiveStreamReader for binary, but sometimes
15
- # in non-permissive mode, according to settings. Uses the Marc4j MarcXmlReader
16
- # for xml.
17
- #
18
- # NOTE: If you aren't reading in binary records encoded in MARC8, you may
19
- # find the pure-ruby Traject::MarcReader faster; the extra step to read
20
- # Marc4J but translate to ruby MARC::Record adds some overhead.
21
- #
22
- # Settings:
19
+ # ## Settings
23
20
  #
24
21
  # * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
25
22
  #
@@ -39,9 +36,26 @@ require 'marc/marc4j'
39
36
  # * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
40
37
  # be loaded. If unset, uses marc4j.jar bundled with traject.
41
38
  #
42
- # * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
43
- # the eventual ruby-marc record via record#original_marc4j
44
-
39
+ # * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
40
+ # the eventual ruby-marc record via record#original_marc4j. Intended for
41
+ # those that have legacy java code for which a marc4j object is needed.
42
+ #
43
+ #
44
+ # ## Example
45
+ #
46
+ # In a configuration file:
47
+ #
48
+ # require 'traject/marc4j_reader'
49
+ # settings do
50
+ # provide "reader_class_name", "Traject::Marc4JReader"
51
+ #
52
+ # #for MarcXML:
53
+ # # provide "marc_source.type", "xml"
54
+ #
55
+ # # Or instead for binary:
56
+ # provide "marc4j_reader.permissive", true
57
+ # provide "marc4j_reader.source_encoding", "MARC8"
58
+ # end
45
59
  class Traject::Marc4JReader
46
60
  include Enumerable
47
61
 
@@ -56,14 +70,14 @@ class Traject::Marc4JReader
56
70
  MARC::Record.instance_methods.include?(:"original_marc4j="))
57
71
  MARC::Record.class_eval('attr_accessor :original_marc4j')
58
72
  end
59
-
73
+
60
74
  # Creating a converter will do the following:
61
75
  # - nothing, if it detects that the marc4j jar is already loaded
62
76
  # - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
63
77
  # - load the marc4j jar file bundled with MARC::MARC4J otherwise
64
-
78
+
65
79
  @converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
66
-
80
+
67
81
  # Convenience
68
82
  java_import org.marc4j.MarcPermissiveStreamReader
69
83
  java_import org.marc4j.MarcXmlReader
@@ -121,4 +135,4 @@ class Traject::Marc4JReader
121
135
  @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
122
136
  end
123
137
 
124
- end
138
+ end
@@ -6,22 +6,23 @@ module Traject
6
6
  #
7
7
  # Examples:
8
8
  #
9
- # array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
10
- # values = MarcExtractor.new("245a:245abc").extract_marc(marc_record)
11
- # seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
12
- # bytes = MarcExtractor.new("008[35-37]")
9
+ # array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
10
+ # values = MarcExtractor.new("245a:245abc").extract_marc(marc_record)
11
+ # seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
12
+ # bytes = MarcExtractor.new("008[35-37]")
13
13
  #
14
- # == String extraction specifications
14
+ # ## String extraction specifications
15
15
  #
16
16
  # Extraction directions are supplied in strings, usually as the first
17
17
  # parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
18
- # are also the first parameter to the #marc_extract macro.
18
+ # are also the first parameter to the #marc_extract macro.
19
19
  #
20
20
  # A String specification is a string (or array of strings) which consists
21
- # of one or more Data and Control Field Specifications seperated by colons.
21
+ # of one or more Data and Control Field Specifications seperated by colons.
22
22
  #
23
23
  # A Data Field Specification is of the form:
24
- # `{tag}{|indicators|}{subfields}`
24
+ #
25
+ # * `{tag}{|indicators|}{subfields}`
25
26
  # * {tag} is three chars (usually but not necessarily numeric)
26
27
  # * {indicators} are optional two chars enclosed in pipe ('|') characters,
27
28
  # * {subfields} are optional list of chars (alphanumeric)
@@ -29,58 +30,58 @@ module Traject
29
30
  # indicator spec must be two chars, but one can be * meaning "don't care".
30
31
  # space to mean 'blank'
31
32
  #
32
- # "245|01|abc65:345abc:700|*5|:800"
33
+ # "245|01|abc65:345abc:700|*5|:800"
33
34
  #
34
35
  # A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
35
- # and includes a tag and a a byte slice specification.
36
+ # and includes a tag and a a byte slice specification.
36
37
  #
37
- # "008[35-37]:007[5]""
38
- # => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
39
- # "LDR" as a pseudo-tag to take byte slices of leader?)
38
+ # "008[35-37]:007[5]""
39
+ # => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
40
+ # "LDR" as a pseudo-tag to take byte slices of leader?)
40
41
  #
41
42
  # * subfields and indicators can only be provided for marc data/variable fields
42
43
  # * byte slice can only be provided for marc control fields (generally tags less than 010)
43
44
  #
44
- # == Subfield concatenation
45
+ # ## Subfield concatenation
45
46
  #
46
47
  # Normally, for a spec including multiple subfield codes, multiple subfields
47
48
  # from the same MARC field will be concatenated into one string separated by spaces:
48
49
  #
49
- # 600 a| Chomsky, Noam x| Philosophy.
50
- # 600 a| Chomsky, Noam x| Political and social views.
51
- # MarcExtractor.new("600ax").extract(record)
52
- # # results in two values sent to Solr:
53
- # "Chomsky, Noam Philosophy."
54
- # "Chomsky, Noam Political and social views."
50
+ # 600 a| Chomsky, Noam x| Philosophy.
51
+ # 600 a| Chomsky, Noam x| Political and social views.
52
+ # MarcExtractor.new("600ax").extract(record)
53
+ # # results in two values sent to Solr:
54
+ # "Chomsky, Noam Philosophy."
55
+ # "Chomsky, Noam Political and social views."
55
56
  #
56
57
  # You can turn off this concatenation and leave individual subfields in separate
57
58
  # strings by setting the `separator` option to nil:
58
59
  #
59
- # MarcExtractor.new("600ax", :separator => nil).extract(record)
60
- # # Results in four values being sent to Solr (or 3 if you de-dup):
61
- # "Chomksy, Noam"
62
- # "Philosophy."
63
- # "Chomsky, Noam"
64
- # "Political and social views."
60
+ # MarcExtractor.new("600ax", :separator => nil).extract(record)
61
+ # # Results in four values being sent to Solr (or 3 if you de-dup):
62
+ # "Chomksy, Noam"
63
+ # "Philosophy."
64
+ # "Chomsky, Noam"
65
+ # "Political and social views."
65
66
  #
66
67
  # However, **the default is different for specifications with only a single
67
68
  # subfield**, these are by default kept separated:
68
69
  #
69
- # 020 a| 285197145X a| 9782851971456
70
- # MarcExtractor.new("020a:020z").extract(record)
71
- # # two seperate strings sent to Solr:
72
- # "285197145X"
73
- # "9782851971456"
70
+ # 020 a| 285197145X a| 9782851971456
71
+ # MarcExtractor.new("020a:020z").extract(record)
72
+ # # two seperate strings sent to Solr:
73
+ # "285197145X"
74
+ # "9782851971456"
74
75
  #
75
76
  # For single subfield specifications, you force concatenation by
76
77
  # repeating the subfield specification:
77
78
  #
78
- # MarcExtractor.new("020aa:020zz").extract(record)
79
- # # would result in a single string sent to solr for
80
- # # the single field, by default space-separated:
81
- # "285197145X 9782851971456"
79
+ # MarcExtractor.new("020aa:020zz").extract(record)
80
+ # # would result in a single string sent to solr for
81
+ # # the single field, by default space-separated:
82
+ # "285197145X 9782851971456"
82
83
  #
83
- # == Note on Performance and MarcExtractor creation and reuse
84
+ # ## Note on Performance and MarcExtractor creation and reuse
84
85
  #
85
86
  # A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
86
87
  # benchmarking to be a bottleneck if you end up creating one for each marc record
@@ -90,15 +91,15 @@ module Traject
90
91
  # If you are creating a traject 'macro' method, here's one way to do that,
91
92
  # capturing the MarcExtractor under closure:
92
93
  #
93
- # def some_macro(spec, other_args, whatever)
94
- # extractor = MarcExtractor.new( spec )
95
- # # ...
96
- # return lambda do |record, accumulator, context|
97
- # #...
98
- # accumulator.concat extractor.extract(record)
99
- # #...
100
- # end
101
- # end
94
+ # def some_macro(spec, other_args, whatever)
95
+ # extractor = MarcExtractor.new( spec )
96
+ # # ...
97
+ # return lambda do |record, accumulator, context|
98
+ # #...
99
+ # accumulator.concat extractor.extract(record)
100
+ # #...
101
+ # end
102
+ # end
102
103
  #
103
104
  # In other cases, you may find it convenient to improve performance by
104
105
  # using the MarcExtractor#cached method, instead of MarcExtractor#new, to
@@ -107,13 +108,13 @@ module Traject
107
108
  class MarcExtractor
108
109
  attr_accessor :options, :spec_hash
109
110
 
110
- # First arg is a specification for extraction of data from a MARC record.
111
+ # First arg is a specification for extraction of data from a MARC record.
111
112
  # Specification can be given in two forms:
112
113
  #
113
114
  # * a string specification like "008[35]:020a:245abc", see top of class
114
- # for examples. A string specification is most typical argument.
115
+ # for examples. A string specification is most typical argument.
115
116
  # * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
116
- # a 'pre-parsed' specification.
117
+ # a 'pre-parsed' specification.
117
118
  #
118
119
  # Second arg is options:
119
120
  #
@@ -146,6 +147,8 @@ module Traject
146
147
  if options[:alternate_script] != false
147
148
  @interesting_tags_hash['880'] = true
148
149
  end
150
+
151
+ self.freeze
149
152
  end
150
153
 
151
154
  # Takes the same arguments as MarcExtractor.new, but will re-use an existing
@@ -164,17 +167,10 @@ module Traject
164
167
  # although if you try hard enough you can surely find a way to do something
165
168
  # you shouldn't.
166
169
  #
167
- # extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
170
+ # extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
168
171
  def self.cached(*args)
169
172
  cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
170
- extractor = (cache[args] ||= begin
171
- ex = Traject::MarcExtractor.new(*args).freeze
172
- ex.options.freeze
173
- ex.spec_hash.freeze
174
- ex
175
- end)
176
-
177
- return extractor
173
+ return ( cache[args] ||= Traject::MarcExtractor.new(*args).freeze )
178
174
  end
179
175
 
180
176
  # Check to see if a tag is interesting (meaning it may be covered by a spec
@@ -186,14 +182,14 @@ module Traject
186
182
 
187
183
  # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
188
184
  # to represent the specification. See comments at head of class for
189
- # documentation of string specification format.
185
+ # documentation of string specification format.
190
186
  #
191
187
  #
192
- # == Return value
188
+ # ## Return value
193
189
  #
194
190
  # The hash returned is keyed by tag, and has as values an array of 0 or
195
191
  # more MarcExtractor::Spec objects representing the specified extraction
196
- # operations for that tag.
192
+ # operations for that tag.
197
193
  #
198
194
  # It's an array of possibly more than one, because you can specify
199
195
  # multiple extractions on the same tag: for instance "245a:245abc"
@@ -201,7 +197,7 @@ module Traject
201
197
  # See tests for more examples.
202
198
  def self.parse_string_spec(spec_string)
203
199
  # hash defaults to []
204
- hash = Hash.new {|hash,key| hash[key] = []}
200
+ hash = Hash.new
205
201
 
206
202
  spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/s*:\s*/)
207
203
 
@@ -222,8 +218,9 @@ module Traject
222
218
  spec.indicator2 = indicators[1] if indicators[1] != "*"
223
219
  end
224
220
 
221
+ hash[spec.tag] ||= []
225
222
  hash[spec.tag] << spec
226
-
223
+
227
224
  elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
228
225
  tag, byte1, byte2 = $1, $3, $5
229
226
 
@@ -234,7 +231,8 @@ module Traject
234
231
  elsif byte1
235
232
  spec.bytes = byte1.to_i
236
233
  end
237
-
234
+
235
+ hash[spec.tag] ||= []
238
236
  hash[spec.tag] << spec
239
237
  else
240
238
  raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
@@ -286,7 +284,7 @@ module Traject
286
284
  #
287
285
  # Useful for re-use of this class for custom processing
288
286
  #
289
- # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
287
+ # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
290
288
  def collect_matching_lines(marc_record)
291
289
  results = []
292
290
  self.each_matching_line(marc_record) do |field, spec, extractor|
@@ -312,7 +310,7 @@ module Traject
312
310
  if options[:separator] && spec.joinable?
313
311
  subfields = [subfields.join(options[:separator])]
314
312
  end
315
-
313
+
316
314
  return subfields
317
315
  end
318
316
 
@@ -324,12 +322,12 @@ module Traject
324
322
  # When given an 880, will return the spec (if any) for the linked tag iff
325
323
  # we have a $6 and we want the alternate script.
326
324
  #
327
- # Returns an empty array in case of no matching extraction specs.
325
+ # Returns an empty array in case of no matching extraction specs.
328
326
  def specs_covering_field(field)
329
327
  tag = field.tag
330
328
 
331
329
  # Short-circuit the uninteresting stuff
332
- return nil unless interesting_tag?(tag)
330
+ return [] unless interesting_tag?(tag)
333
331
 
334
332
  # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
335
333
  # to do this weird encode gymnastics, which fixes it for mysterious reasons.
@@ -339,7 +337,7 @@ module Traject
339
337
  end
340
338
 
341
339
  # Take the resulting tag and get the spec from it (or the default nil if there isn't a spec for this tag)
342
- spec = self.spec_hash[tag]
340
+ spec = self.spec_hash[tag] || []
343
341
  end
344
342
 
345
343
 
@@ -348,13 +346,19 @@ module Traject
348
346
  # define #control_field? on both ControlField and DataField?
349
347
  return field.kind_of? MARC::ControlField
350
348
  end
351
-
349
+
350
+ def freeze
351
+ self.options.freeze
352
+ self.spec_hash.freeze
353
+ super
354
+ end
355
+
352
356
 
353
357
  # Represents a single specification for extracting data
354
- # from a marc field, like "600abc" or "600|1*|x".
358
+ # from a marc field, like "600abc" or "600|1*|x".
355
359
  #
356
360
  # Includes the tag for reference, although this is redundant and not actually used
357
- # in logic, since the tag is also implicit in the overall spec_hash
361
+ # in logic, since the tag is also implicit in the overall spec_hash
358
362
  # with tag => [spec1, spec2]
359
363
  class Spec
360
364
  attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes
@@ -365,7 +369,7 @@ module Traject
365
369
  end
366
370
  end
367
371
 
368
-
372
+
369
373
  # Should extracted subfields be joined, if we have a separator?
370
374
  # * '630' no subfields specified => join all subfields
371
375
  # * '630abc' multiple subfields specified = join all subfields
@@ -379,8 +383,8 @@ module Traject
379
383
 
380
384
  # Pass in a MARC field; do its indicators match the indicators
381
385
  # in this spec? nil indicators in spec mean we don't care, everything
382
- # matches.
383
- def matches_indicators?(field)
386
+ # matches.
387
+ def matches_indicators?(field)
384
388
  return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
385
389
  (self.indicator2.nil? || self.indicator2 == field.indicator2)
386
390
  end
@@ -396,7 +400,7 @@ module Traject
396
400
  return false unless spec.kind_of?(Spec)
397
401
 
398
402
  return (self.tag == spec.tag) &&
399
- (self.subfields == spec.subfields) &&
403
+ (self.subfields == spec.subfields) &&
400
404
  (self.indicator1 == spec.indicator1) &&
401
405
  (self.indicator1 == spec.indicator2) &&
402
406
  (self.bytes == spec.bytes)