traject 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/README.md +183 -191
  4. data/bench/bench.rb +1 -1
  5. data/doc/batch_execution.md +14 -0
  6. data/doc/extending.md +14 -12
  7. data/doc/indexing_rules.md +265 -0
  8. data/lib/traject/command_line.rb +12 -41
  9. data/lib/traject/debug_writer.rb +32 -13
  10. data/lib/traject/indexer.rb +101 -24
  11. data/lib/traject/indexer/settings.rb +18 -17
  12. data/lib/traject/json_writer.rb +32 -11
  13. data/lib/traject/line_writer.rb +6 -6
  14. data/lib/traject/macros/basic.rb +1 -1
  15. data/lib/traject/macros/marc21.rb +17 -13
  16. data/lib/traject/macros/marc21_semantics.rb +27 -25
  17. data/lib/traject/macros/marc_format_classifier.rb +39 -25
  18. data/lib/traject/marc4j_reader.rb +36 -22
  19. data/lib/traject/marc_extractor.rb +79 -75
  20. data/lib/traject/marc_reader.rb +33 -25
  21. data/lib/traject/mock_reader.rb +9 -10
  22. data/lib/traject/ndj_reader.rb +7 -7
  23. data/lib/traject/null_writer.rb +1 -1
  24. data/lib/traject/qualified_const_get.rb +12 -2
  25. data/lib/traject/solrj_writer.rb +61 -52
  26. data/lib/traject/thread_pool.rb +45 -45
  27. data/lib/traject/translation_map.rb +59 -27
  28. data/lib/traject/util.rb +3 -3
  29. data/lib/traject/version.rb +1 -1
  30. data/lib/traject/yaml_writer.rb +1 -1
  31. data/test/debug_writer_test.rb +7 -7
  32. data/test/indexer/each_record_test.rb +4 -4
  33. data/test/indexer/macros_marc21_semantics_test.rb +12 -12
  34. data/test/indexer/macros_marc21_test.rb +10 -10
  35. data/test/indexer/macros_test.rb +1 -1
  36. data/test/indexer/map_record_test.rb +6 -6
  37. data/test/indexer/read_write_test.rb +43 -4
  38. data/test/indexer/settings_test.rb +2 -2
  39. data/test/indexer/to_field_test.rb +8 -8
  40. data/test/marc4j_reader_test.rb +4 -4
  41. data/test/marc_extractor_test.rb +33 -25
  42. data/test/marc_format_classifier_test.rb +3 -3
  43. data/test/marc_reader_test.rb +2 -2
  44. data/test/test_helper.rb +3 -3
  45. data/test/test_support/demo_config.rb +52 -48
  46. data/test/translation_map_test.rb +22 -4
  47. data/test/translation_maps/bad_ruby.rb +2 -2
  48. data/test/translation_maps/both_map.rb +1 -1
  49. data/test/translation_maps/default_literal.rb +1 -1
  50. data/test/translation_maps/default_passthrough.rb +1 -1
  51. data/test/translation_maps/ruby_map.rb +1 -1
  52. metadata +7 -31
  53. data/doc/macros.md +0 -103
@@ -62,10 +62,10 @@ module Traject::Macros
62
62
  def self.get_sortable_author(record)
63
63
  onexx = MarcExtractor.cached("100:110:111", :first => true, :trim_punctuation => true).extract(record).first
64
64
  onexx = onexx.strip if onexx
65
-
65
+
66
66
  titles = []
67
67
  MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
68
- non_filing = field.indicator2.to_i
68
+ non_filing = field.indicator2.to_i
69
69
 
70
70
  str = field.subfields.collect {|sf| Marc21.trim_punctuation(sf.value.strip).strip}.join(" ")
71
71
  str = str.slice(non_filing, str.length)
@@ -73,7 +73,7 @@ module Traject::Macros
73
73
  end.first
74
74
  title = titles.first
75
75
  title = title.strip if title
76
-
76
+
77
77
  return [onexx, title].compact.join(" ")
78
78
  end
79
79
 
@@ -105,26 +105,26 @@ module Traject::Macros
105
105
  str
106
106
  end.first
107
107
  end
108
-
109
-
110
-
108
+
109
+
110
+
111
111
  # A generic way to strip a filing version (i.e., a string with the non-filing
112
112
  # characters stripped off)
113
113
  #
114
114
  # Always returns an array. If :include_original=>true is passed in,
115
115
  # that array will include the original string with the non-filing
116
116
  # characters still in it.
117
-
117
+
118
118
  def extract_marc_filing_version(spec='245abdefghknp', opts={})
119
119
  include_original = opts.delete(:include_original)
120
120
  if opts.size > 0
121
121
  raise RuntimeError.new("extract_marc_filing_version can take only :include_original as an argument, not #{opts.keys.map{|x| "'#{x}'"}.join(' or ')}")
122
122
  end
123
-
123
+
124
124
  extractor = Traject::MarcExtractor.cached(spec, opts)
125
-
125
+
126
126
  lambda do |record, accumulator, context|
127
- extractor.collect_matching_lines(record) do |field, spec|
127
+ extractor.collect_matching_lines(record) do |field, spec|
128
128
  str = extractor.collect_subfields(field, spec).first
129
129
  next unless str and !str.empty?
130
130
  vals = [Marc21Semantics.filing_version(field, str, spec)]
@@ -136,34 +136,34 @@ module Traject::Macros
136
136
  end
137
137
  end
138
138
  end
139
-
140
-
141
-
142
-
139
+
140
+
141
+
142
+
143
143
  # Take in a field, a string extracted from that field, and a spec and
144
- # return the filing version (i.e., the string without the
144
+ # return the filing version (i.e., the string without the
145
145
  # non-filing characters)
146
-
146
+
147
147
  def self.filing_version(field, str, spec)
148
148
  # Control fields don't have non-filing characters
149
149
  return str if field.kind_of? MARC::ControlField
150
-
150
+
151
151
  # 2nd indicator must be > 0
152
152
  ind2 = field.indicator2.to_i
153
153
  return str unless ind2 > 0
154
-
154
+
155
155
  # The spechash must either (a) have no subfields specified, or
156
156
  # (b) include the first subfield in the record
157
-
157
+
158
158
  subs = spec.subfields
159
159
  return str unless subs && subs.include?(field.subfields[0].code)
160
-
160
+
161
161
  # OK. If we got this far we actually need to strip characters off the string
162
-
162
+
163
163
  return str[ind2..-1]
164
164
  end
165
-
166
-
165
+
166
+
167
167
 
168
168
 
169
169
  # maps languages, by default out of 008[35-37] and 041a and 041d
@@ -367,6 +367,9 @@ module Traject::Macros
367
367
  return found_date
368
368
  end
369
369
 
370
+ # REGEX meant to rule out obvious non-LCC's, and only allow things
371
+ # plausibly LCC's.
372
+ LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
370
373
  # Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH)
371
374
  # from usual parts of the marc record. Maps them to high-level broad categories,
372
375
  # basically just using the first part of the LCC. Note it's just looking in bib-level
@@ -379,7 +382,6 @@ module Traject::Macros
379
382
  # or nil.
380
383
  #
381
384
  # The categories output aren't great, but they're something.
382
- LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
383
385
  def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a")
384
386
  # Trying to match things that look like LCC, and not match things
385
387
  # that don't. Is tricky.
@@ -503,4 +505,4 @@ module Traject::Macros
503
505
 
504
506
 
505
507
  end
506
- end
508
+ end
@@ -1,9 +1,19 @@
1
1
  module Traject
2
2
  module Macros
3
- # See MarcFormatClassifier class
3
+ # To use the marc_formats macro, in your configuration file:
4
+ #
5
+ # require 'traject/macros/marc_formats'
6
+ # extend Traject::Macros::MarcFormats
7
+ #
8
+ # to_field("format_s") marc_formats
9
+ #
10
+ # See also MarcFormatClassifier which can be used directly for a bit more
11
+ # control.
4
12
  module MarcFormats
5
13
  # very opinionated macro that just adds a grab bag of format/genre/types
6
- # into one field. You may want ot build your own from MarcFormatClassifier functions instead.
14
+ # from our own custom vocabulary, all into one field.
15
+ # You may want to build your own from MarcFormatClassifier functions instead.
16
+ #
7
17
  def marc_formats
8
18
  lambda do |record, accumulator|
9
19
  accumulator.concat Traject::Macros::MarcFormatClassifier.new(record).formats
@@ -12,10 +22,11 @@ module Traject
12
22
  end
13
23
 
14
24
 
15
- # Not actually a macro, but we're keeping it here for now,
16
- # a class for classifying marc according to format/genre/type.
25
+ # A tool for classifying MARC records according to format/form/genre/type,
26
+ # just using our own custom vocabulary for those things.
17
27
  #
18
- # VERY opinionated.
28
+ # used by the `marc_formats` macro, but you can also use it directly
29
+ # for a bit more control.
19
30
  class MarcFormatClassifier
20
31
  attr_reader :record
21
32
 
@@ -24,22 +35,25 @@ module Traject
24
35
  end
25
36
 
26
37
  # A very opinionated method that just kind of jams together
27
- # all the possible format/genre/types into one array of 1 to N elements.
38
+ # all the possible format/genre/types into one array of 1 to N elements.
28
39
  #
29
- # Default "Other" will be used
40
+ # If no other values are present, the default value "Other" will be used.
41
+ #
42
+ # See also individual methods which you can use to separate values into
43
+ # different facets or do other custom things.
30
44
  def formats(options = {})
31
45
  options = {:default => "Other"}.merge(options)
32
46
 
33
47
  formats = []
34
48
 
35
49
  formats.concat genre
36
-
50
+
37
51
  formats << "Manuscript/Archive" if manuscript_archive?
38
52
  formats << "Microform" if microform?
39
53
  formats << "Online" if online?
40
54
 
41
55
  # In our own data, if it's an audio recording, it might show up
42
- # as print, but it's probably not.
56
+ # as print, but it's probably not.
43
57
  formats << "Print" if print? && ! (formats.include?("Non-musical Recording") || formats.include?("Musical Recording"))
44
58
 
45
59
  # If it's a Dissertation, we decide it's NOT a book
@@ -64,11 +78,11 @@ module Traject
64
78
  # Returns 1 or more values in an array from:
65
79
  # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
66
80
  # Image; Software/Data; Video/Film
67
- #
68
- # Uses leader byte 6, leader byte 7, and 007 byte 0.
81
+ #
82
+ # Uses leader byte 6, leader byte 7, and 007 byte 0.
69
83
  #
70
84
  # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
71
- # so you can customize labels if you want.
85
+ # so you can customize labels if you want.
72
86
  def genre
73
87
  marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
74
88
  marc_genre_007 = Traject::TranslationMap.new("marc_genre_007")
@@ -96,18 +110,18 @@ module Traject
96
110
  end
97
111
  end
98
112
 
99
- # Algorithm with help from Chris Case.
100
- # * If it has any RDA 338, then it's print if it has a value of
101
- # volume, sheet, or card.
113
+ # Algorithm with help from Chris Case.
114
+ # * If it has any RDA 338, then it's print if it has a value of
115
+ # volume, sheet, or card.
102
116
  # * If it does not have an RDA 338, it's print if and only if it has
103
- # NO 245$h GMD.
117
+ # NO 245$h GMD.
104
118
  #
105
- # * Here at JH, for legacy reasons we also choose to not
119
+ # * Here at JH, for legacy reasons we also choose to not
106
120
  # call it print if it's already been marked audio, but
107
- # we do that in a different method.
121
+ # we do that in a different method.
108
122
  #
109
123
  # This algorithm is definitely going to get some things wrong in
110
- # both directions, with real world data. But seems to be good enough.
124
+ # both directions, with real world data. But seems to be good enough.
111
125
  def print?
112
126
 
113
127
 
@@ -116,7 +130,7 @@ module Traject
116
130
  end
117
131
 
118
132
  if rda338.length > 0
119
- rda338.find do |field|
133
+ rda338.find do |field|
120
134
  field.subfields.find do |sf|
121
135
  (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
122
136
  (sf.code == "b" && %w{nc no nb}.include?(sf.value))
@@ -128,7 +142,7 @@ module Traject
128
142
  end
129
143
 
130
144
  # We use marc 007 to determine if this represents an online
131
- # resource. But sometimes resort to 245$h GMD too.
145
+ # resource. But sometimes resort to 245$h GMD too.
132
146
  def online?
133
147
  # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
134
148
  found_007 = record.find do |field|
@@ -140,8 +154,8 @@ module Traject
140
154
  # Otherwise, if it has a GMD ["electronic resource"], we count it
141
155
  # as online only if NO 007[0] == 'c' exists, cause if it does we already
142
156
  # know it's electronic but not remote, otherwise first try would
143
- # have found it.
144
- return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
157
+ # have found it.
158
+ return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
145
159
  end
146
160
 
147
161
  # if field 007 byte 0 is 'h', that's microform. But many of our microform
@@ -153,7 +167,7 @@ module Traject
153
167
  record.find {|f| (f.tag == "007") && (f.value[0] == "h")}
154
168
  end
155
169
 
156
- # Marked as manuscript OR archive.
170
+ # Marked as manuscript OR archive.
157
171
  def manuscript_archive?
158
172
  leader06 = record.leader.slice(6)
159
173
  leader08 = record.leader.slice(8)
@@ -177,4 +191,4 @@ module Traject
177
191
 
178
192
  end
179
193
  end
180
- end
194
+ end
@@ -2,24 +2,21 @@ require 'traject'
2
2
  require 'marc'
3
3
  require 'marc/marc4j'
4
4
 
5
- # Uses Marc4J to read the marc records, but then translates them to
6
- # ruby-marc before delivering them still, Marc4J is just inside the black
7
- # box.
5
+ # `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
6
+ # into standard ruby-marc MARC::Record objects. This reader is often faster than
7
+ # Traject::MarcReader, especially for XML, and offers support for reading Marc8
8
+ # encoded records and transcoding to UTF8.
8
9
  #
9
- # But one way to get ability to transcode from Marc8. Records it delivers
10
- # are ALWAYS in UTF8, will be transcoded if needed.
10
+ # Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
11
+ # for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
12
+ # for reading xml. The actual code for dealing with Marc4J is in the separate
13
+ # [marc-marc4j gem](https://github.com/billdueber/ruby-marc-marc4j).
11
14
  #
12
- # Also hope it gives us some performance benefit.
15
+ # See also the pure ruby Traject::MarcReader as an alternative, if you need to read
16
+ # marc-in-json, or if you don't need binary Marc8 support, it may in some cases
17
+ # be faster.
13
18
  #
14
- # Uses the Marc4J MarcPermissiveStreamReader for binary, but sometimes
15
- # in non-permissive mode, according to settings. Uses the Marc4j MarcXmlReader
16
- # for xml.
17
- #
18
- # NOTE: If you aren't reading in binary records encoded in MARC8, you may
19
- # find the pure-ruby Traject::MarcReader faster; the extra step to read
20
- # Marc4J but translate to ruby MARC::Record adds some overhead.
21
- #
22
- # Settings:
19
+ # ## Settings
23
20
  #
24
21
  # * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
25
22
  #
@@ -39,9 +36,26 @@ require 'marc/marc4j'
39
36
  # * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
40
37
  # be loaded. If unset, uses marc4j.jar bundled with traject.
41
38
  #
42
- # * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
43
- # the eventual ruby-marc record via record#original_marc4j
44
-
39
+ # * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
40
+ # the eventual ruby-marc record via record#original_marc4j. Intended for
41
+ # those that have legacy java code for which a marc4j object is needed.
42
+ #
43
+ #
44
+ # ## Example
45
+ #
46
+ # In a configuration file:
47
+ #
48
+ # require 'traject/marc4j_reader'
49
+ # settings do
50
+ # provide "reader_class_name", "Traject::Marc4JReader"
51
+ #
52
+ # #for MarcXML:
53
+ # # provide "marc_source.type", "xml"
54
+ #
55
+ # # Or instead for binary:
56
+ # provide "marc4j_reader.permissive", true
57
+ # provide "marc4j_reader.source_encoding", "MARC8"
58
+ # end
45
59
  class Traject::Marc4JReader
46
60
  include Enumerable
47
61
 
@@ -56,14 +70,14 @@ class Traject::Marc4JReader
56
70
  MARC::Record.instance_methods.include?(:"original_marc4j="))
57
71
  MARC::Record.class_eval('attr_accessor :original_marc4j')
58
72
  end
59
-
73
+
60
74
  # Creating a converter will do the following:
61
75
  # - nothing, if it detects that the marc4j jar is already loaded
62
76
  # - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
63
77
  # - load the marc4j jar file bundled with MARC::MARC4J otherwise
64
-
78
+
65
79
  @converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
66
-
80
+
67
81
  # Convenience
68
82
  java_import org.marc4j.MarcPermissiveStreamReader
69
83
  java_import org.marc4j.MarcXmlReader
@@ -121,4 +135,4 @@ class Traject::Marc4JReader
121
135
  @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
122
136
  end
123
137
 
124
- end
138
+ end
@@ -6,22 +6,23 @@ module Traject
6
6
  #
7
7
  # Examples:
8
8
  #
9
- # array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
10
- # values = MarcExtractor.new("245a:245abc").extract_marc(marc_record)
11
- # seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
12
- # bytes = MarcExtractor.new("008[35-37]")
9
+ # array_of_stuff = MarcExtractor.new("001:245abc:700a").extract(marc_record)
10
+ # values = MarcExtractor.new("245a:245abc").extract_marc(marc_record)
11
+ # seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
12
+ # bytes = MarcExtractor.new("008[35-37]")
13
13
  #
14
- # == String extraction specifications
14
+ # ## String extraction specifications
15
15
  #
16
16
  # Extraction directions are supplied in strings, usually as the first
17
17
  # parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
18
- # are also the first parameter to the #marc_extract macro.
18
+ # are also the first parameter to the #marc_extract macro.
19
19
  #
20
20
  # A String specification is a string (or array of strings) which consists
21
- # of one or more Data and Control Field Specifications seperated by colons.
21
+ # of one or more Data and Control Field Specifications seperated by colons.
22
22
  #
23
23
  # A Data Field Specification is of the form:
24
- # `{tag}{|indicators|}{subfields}`
24
+ #
25
+ # * `{tag}{|indicators|}{subfields}`
25
26
  # * {tag} is three chars (usually but not necessarily numeric)
26
27
  # * {indicators} are optional two chars enclosed in pipe ('|') characters,
27
28
  # * {subfields} are optional list of chars (alphanumeric)
@@ -29,58 +30,58 @@ module Traject
29
30
  # indicator spec must be two chars, but one can be * meaning "don't care".
30
31
  # space to mean 'blank'
31
32
  #
32
- # "245|01|abc65:345abc:700|*5|:800"
33
+ # "245|01|abc65:345abc:700|*5|:800"
33
34
  #
34
35
  # A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
35
- # and includes a tag and a a byte slice specification.
36
+ # and includes a tag and a a byte slice specification.
36
37
  #
37
- # "008[35-37]:007[5]""
38
- # => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
39
- # "LDR" as a pseudo-tag to take byte slices of leader?)
38
+ # "008[35-37]:007[5]""
39
+ # => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
40
+ # "LDR" as a pseudo-tag to take byte slices of leader?)
40
41
  #
41
42
  # * subfields and indicators can only be provided for marc data/variable fields
42
43
  # * byte slice can only be provided for marc control fields (generally tags less than 010)
43
44
  #
44
- # == Subfield concatenation
45
+ # ## Subfield concatenation
45
46
  #
46
47
  # Normally, for a spec including multiple subfield codes, multiple subfields
47
48
  # from the same MARC field will be concatenated into one string separated by spaces:
48
49
  #
49
- # 600 a| Chomsky, Noam x| Philosophy.
50
- # 600 a| Chomsky, Noam x| Political and social views.
51
- # MarcExtractor.new("600ax").extract(record)
52
- # # results in two values sent to Solr:
53
- # "Chomsky, Noam Philosophy."
54
- # "Chomsky, Noam Political and social views."
50
+ # 600 a| Chomsky, Noam x| Philosophy.
51
+ # 600 a| Chomsky, Noam x| Political and social views.
52
+ # MarcExtractor.new("600ax").extract(record)
53
+ # # results in two values sent to Solr:
54
+ # "Chomsky, Noam Philosophy."
55
+ # "Chomsky, Noam Political and social views."
55
56
  #
56
57
  # You can turn off this concatenation and leave individual subfields in seperate
57
58
  # strings by setting the `separator` option to nil:
58
59
  #
59
- # MarcExtractor.new("600ax", :separator => nil).extract(record)
60
- # # Results in four values being sent to Solr (or 3 if you de-dup):
61
- # "Chomksy, Noam"
62
- # "Philosophy."
63
- # "Chomsky, Noam"
64
- # "Political and social views."
60
+ # MarcExtractor.new("600ax", :separator => nil).extract(record)
61
+ # # Results in four values being sent to Solr (or 3 if you de-dup):
62
+ # "Chomksy, Noam"
63
+ # "Philosophy."
64
+ # "Chomsky, Noam"
65
+ # "Political and social views."
65
66
  #
66
67
  # However, **the default is different for specifications with only a single
67
68
  # subfield**, these are by default kept seperated:
68
69
  #
69
- # 020 a| 285197145X a| 9782851971456
70
- # MarcExtractor.new("020a:020z").extract(record)
71
- # # two seperate strings sent to Solr:
72
- # "285197145X"
73
- # "9782851971456"
70
+ # 020 a| 285197145X a| 9782851971456
71
+ # MarcExtractor.new("020a:020z").extract(record)
72
+ # # two seperate strings sent to Solr:
73
+ # "285197145X"
74
+ # "9782851971456"
74
75
  #
75
76
  # For single subfield specifications, you force concatenation by
76
77
  # repeating the subfield specification:
77
78
  #
78
- # MarcExtractor.new("020aa:020zz").extract(record)
79
- # # would result in a single string sent to solr for
80
- # # the single field, by default space-separated:
81
- # "285197145X 9782851971456"
79
+ # MarcExtractor.new("020aa:020zz").extract(record)
80
+ # # would result in a single string sent to solr for
81
+ # # the single field, by default space-separated:
82
+ # "285197145X 9782851971456"
82
83
  #
83
- # == Note on Performance and MarcExtractor creation and reuse
84
+ # ## Note on Performance and MarcExtractor creation and reuse
84
85
  #
85
86
  # A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
86
87
  # benchmarking to be a bottleneck if you end up creating one for each marc record
@@ -90,15 +91,15 @@ module Traject
90
91
  # If you are creating a traject 'macro' method, here's one way to do that,
91
92
  # capturing the MarcExtractor under closure:
92
93
  #
93
- # def some_macro(spec, other_args, whatever)
94
- # extractor = MarcExtractor.new( spec )
95
- # # ...
96
- # return lambda do |record, accumulator, context|
97
- # #...
98
- # accumulator.concat extractor.extract(record)
99
- # #...
100
- # end
101
- # end
94
+ # def some_macro(spec, other_args, whatever)
95
+ # extractor = MarcExtractor.new( spec )
96
+ # # ...
97
+ # return lambda do |record, accumulator, context|
98
+ # #...
99
+ # accumulator.concat extractor.extract(record)
100
+ # #...
101
+ # end
102
+ # end
102
103
  #
103
104
  # In other cases, you may find it convenient to improve performance by
104
105
  # using the MarcExtractor#cached method, instead of MarcExtractor#new, to
@@ -107,13 +108,13 @@ module Traject
107
108
  class MarcExtractor
108
109
  attr_accessor :options, :spec_hash
109
110
 
110
- # First arg is a specification for extraction of data from a MARC record.
111
+ # First arg is a specification for extraction of data from a MARC record.
111
112
  # Specification can be given in two forms:
112
113
  #
113
114
  # * a string specification like "008[35]:020a:245abc", see top of class
114
- # for examples. A string specification is most typical argument.
115
+ # for examples. A string specification is most typical argument.
115
116
  # * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
116
- # a 'pre-parsed' specification.
117
+ # a 'pre-parsed' specification.
117
118
  #
118
119
  # Second arg is options:
119
120
  #
@@ -146,6 +147,8 @@ module Traject
146
147
  if options[:alternate_script] != false
147
148
  @interesting_tags_hash['880'] = true
148
149
  end
150
+
151
+ self.freeze
149
152
  end
150
153
 
151
154
  # Takes the same arguments as MarcExtractor.new, but will re-use an existing
@@ -164,17 +167,10 @@ module Traject
164
167
  # although if you try hard enough you can surely find a way to do something
165
168
  # you shouldn't.
166
169
  #
167
- # extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
170
+ # extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
168
171
  def self.cached(*args)
169
172
  cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
170
- extractor = (cache[args] ||= begin
171
- ex = Traject::MarcExtractor.new(*args).freeze
172
- ex.options.freeze
173
- ex.spec_hash.freeze
174
- ex
175
- end)
176
-
177
- return extractor
173
+ return ( cache[args] ||= Traject::MarcExtractor.new(*args).freeze )
178
174
  end
179
175
 
180
176
  # Check to see if a tag is interesting (meaning it may be covered by a spec
@@ -186,14 +182,14 @@ module Traject
186
182
 
187
183
  # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
188
184
  # to represent the specification. See comments at head of class for
189
- # documentation of string specification format.
185
+ # documentation of string specification format.
190
186
  #
191
187
  #
192
- # == Return value
188
+ # ## Return value
193
189
  #
194
190
  # The hash returned is keyed by tag, and has as values an array of 0 or
195
191
  # more MarcExtractor::Spec objects representing the specified extraction
196
- # operations for that tag.
192
+ # operations for that tag.
197
193
  #
198
194
  # It's an array of possibly more than one, because you can specify
199
195
  # multiple extractions on the same tag: for instance "245a:245abc"
@@ -201,7 +197,7 @@ module Traject
201
197
  # See tests for more examples.
202
198
  def self.parse_string_spec(spec_string)
203
199
  # hash defaults to []
204
- hash = Hash.new {|hash,key| hash[key] = []}
200
+ hash = Hash.new
205
201
 
206
202
  spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/s*:\s*/)
207
203
 
@@ -222,8 +218,9 @@ module Traject
222
218
  spec.indicator2 = indicators[1] if indicators[1] != "*"
223
219
  end
224
220
 
221
+ hash[spec.tag] ||= []
225
222
  hash[spec.tag] << spec
226
-
223
+
227
224
  elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
228
225
  tag, byte1, byte2 = $1, $3, $5
229
226
 
@@ -234,7 +231,8 @@ module Traject
234
231
  elsif byte1
235
232
  spec.bytes = byte1.to_i
236
233
  end
237
-
234
+
235
+ hash[spec.tag] ||= []
238
236
  hash[spec.tag] << spec
239
237
  else
240
238
  raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
@@ -286,7 +284,7 @@ module Traject
286
284
  #
287
285
  # Useful for re-use of this class for custom processing
288
286
  #
289
- # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
287
+ # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
290
288
  def collect_matching_lines(marc_record)
291
289
  results = []
292
290
  self.each_matching_line(marc_record) do |field, spec, extractor|
@@ -312,7 +310,7 @@ module Traject
312
310
  if options[:separator] && spec.joinable?
313
311
  subfields = [subfields.join(options[:separator])]
314
312
  end
315
-
313
+
316
314
  return subfields
317
315
  end
318
316
 
@@ -324,12 +322,12 @@ module Traject
324
322
  # When given an 880, will return the spec (if any) for the linked tag iff
325
323
  # we have a $6 and we want the alternate script.
326
324
  #
327
- # Returns an empty array in case of no matching extraction specs.
325
+ # Returns an empty array in case of no matching extraction specs.
328
326
  def specs_covering_field(field)
329
327
  tag = field.tag
330
328
 
331
329
  # Short-circuit the uninteresting stuff
332
- return nil unless interesting_tag?(tag)
330
+ return [] unless interesting_tag?(tag)
333
331
 
334
332
  # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
335
333
  # to do this weird encode gymnastics, which fixes it for mysterious reasons.
@@ -339,7 +337,7 @@ module Traject
339
337
  end
340
338
 
341
339
  # Take the resulting tag and get the specs for it (or an empty array if there isn't a spec for this tag)
342
- spec = self.spec_hash[tag]
340
+ spec = self.spec_hash[tag] || []
343
341
  end
344
342
 
345
343
 
@@ -348,13 +346,19 @@ module Traject
348
346
  # define #control_field? on both ControlField and DataField?
349
347
  return field.kind_of? MARC::ControlField
350
348
  end
351
-
349
+
350
+ def freeze
351
+ self.options.freeze
352
+ self.spec_hash.freeze
353
+ super
354
+ end
355
+
352
356
 
353
357
  # Represents a single specification for extracting data
354
- # from a marc field, like "600abc" or "600|1*|x".
358
+ # from a marc field, like "600abc" or "600|1*|x".
355
359
  #
356
360
  # Includes the tag for reference, although this is redundant and not actually used
357
- # in logic, since the tag is also implicit in the overall spec_hash
361
+ # in logic, since the tag is also implicit in the overall spec_hash
358
362
  # with tag => [spec1, spec2]
359
363
  class Spec
360
364
  attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes
@@ -365,7 +369,7 @@ module Traject
365
369
  end
366
370
  end
367
371
 
368
-
372
+
369
373
  # Should subfields extracted be joined, if we have a separator?
370
374
  # * '630' no subfields specified => join all subfields
371
375
  # * '630abc' multiple subfields specified = join all subfields
@@ -379,8 +383,8 @@ module Traject
379
383
 
380
384
  # Pass in a MARC field, do its indicators match indicators
381
385
  # in this spec? nil indicators in spec mean we don't care, everything
382
- # matches.
383
- def matches_indicators?(field)
386
+ # matches.
387
+ def matches_indicators?(field)
384
388
  return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
385
389
  (self.indicator2.nil? || self.indicator2 == field.indicator2)
386
390
  end
@@ -396,7 +400,7 @@ module Traject
396
400
  return false unless spec.kind_of?(Spec)
397
401
 
398
402
  return (self.tag == spec.tag) &&
399
- (self.subfields == spec.subfields) &&
403
+ (self.subfields == spec.subfields) &&
400
404
  (self.indicator1 == spec.indicator1) &&
401
405
  (self.indicator1 == spec.indicator2) &&
402
406
  (self.bytes == spec.bytes)