traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -1,6 +1,8 @@
1
1
  # Represents the context of a specific record being indexed, passed
2
2
  # to indexing logic blocks
3
3
  #
4
+ # Arg source_record_id_proc is a lambda that takes one arg (indexer-specific source record),
5
+ # and returns an ID for it suitable for use in log messages.
4
6
  class Traject::Indexer
5
7
  class Context
6
8
  def initialize(hash_init = {})
@@ -17,9 +19,13 @@ class Traject::Indexer
17
19
  end
18
20
 
19
21
  attr_accessor :clipboard, :output_hash, :logger
20
- attr_accessor :index_step, :source_record, :settings
21
- # 1-based position in stream of processed records.
22
+ attr_accessor :index_step, :source_record, :settings, :source_record_id_proc
23
+ # 'position' is a 1-based position in stream of processed records.
22
24
  attr_accessor :position
25
+ # sometimes we have multiple inputs, input_name describes the current one, and
26
+ # position_in_input the position of the record in the current input -- both can
27
+ # sometimes be blank when we don't know.
28
+ attr_accessor :input_name, :position_in_input
23
29
 
24
30
  # Should we be skipping this record?
25
31
  attr_accessor :skipmessage
@@ -41,19 +47,39 @@ class Traject::Indexer
41
47
  # in output messages, especially since this method may sometimes
42
48
  # return empty string if info on record id is not available.
43
49
  #
44
- # Returns MARC 001, then a slash, then output_hash["id"] -- if both
50
+ # Returns id from source_record (if we can get it from a source_record_id_proc),
51
+ # or nil when no source_record_id_proc is set. output_hash["id"] is not
45
52
  # included here; see #record_inspect for a label that combines both ids.
46
- #
47
- # Likely override this for a future XML or other source format version.
48
53
  def source_record_id
49
- marc_id = if self.source_record &&
50
- self.source_record.kind_of?(MARC::Record) &&
51
- self.source_record['001']
52
- self.source_record['001'].value
53
- end
54
- output_id = self.output_hash["id"]
55
-
56
- return [marc_id, output_id].compact.join("/")
54
+ source_record_id_proc && source_record_id_proc.call(source_record)
55
+ end
56
+
57
+ # a string label that can be used to refer to a particular record in log messages and
58
+ # exceptions. Includes various parts depending on what we got.
59
+ def record_inspect
60
+ str = "<"
61
+
62
+ str << "record ##{position}" if position
63
+
64
+ if input_name && position_in_input
65
+ str << " (#{input_name} ##{position_in_input}), "
66
+ elsif position
67
+ str << ", "
68
+ end
69
+
70
+ if source_id = source_record_id
71
+ str << "source_id:#{source_id} "
72
+ end
73
+
74
+ if output_id = self.output_hash["id"]
75
+ str << "output_id:#{[output_id].join(',')}"
76
+ end
77
+
78
+ str.chomp!(" ")
79
+ str.chomp!(",")
80
+ str << ">"
81
+
82
+ str
57
83
  end
58
84
 
59
85
  end
@@ -0,0 +1,30 @@
1
+ module Traject
2
+ class Indexer
3
+ # An indexer sub-class that includes "extract_marc" and other macros from
4
+ # Traject::Macros::Marc21, and also adds some marc-specific default settings.
5
+ class MarcIndexer < ::Traject::Indexer
6
+ include Traject::Macros::Marc21
7
+
8
+ def self.default_settings
9
+ @default_settings ||= begin
10
+ marc_settings = {
11
+ "reader_class_name" => "Traject::MarcReader",
12
+ "marc_source.type" => "binary",
13
+ }
14
+ super.merge(marc_settings)
15
+ end
16
+ end
17
+
18
+ # Overridden from base Indexer, to get MARC 001 for log messages.
19
+ def source_record_id_proc
20
+ @source_record_id_proc ||= lambda do |source_marc_record|
21
+ if ( source_marc_record &&
22
+ source_marc_record.kind_of?(MARC::Record) &&
23
+ source_marc_record['001'] )
24
+ source_marc_record['001'].value
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'traject/nokogiri_reader'
2
+ require 'traject/macros/nokogiri_macros'
3
+ require 'traject/oai_pmh_nokogiri_reader'
4
+
5
+ module Traject
6
+ class Indexer
7
+ # An indexer sub-class for XML, where the source records in the pipeline are
8
+ # Nokogiri::XML::Document objects. It sets a default reader of NokogiriReader, and
9
+ # includes Traject::Macros::Nokogiri (with `extract_xpath`).
10
+ #
11
+ # See docs on XML use. (TODO)
12
+ class NokogiriIndexer < ::Traject::Indexer
13
+ include Traject::Macros::NokogiriMacros
14
+
15
+ def self.default_settings
16
+ @default_settings ||= super.merge("reader_class_name" => "Traject::NokogiriReader")
17
+ end
18
+
19
+ # Overridden from base Indexer, try an `id` attribute or element on record.
20
+ def source_record_id_proc
21
+ @source_record_id_proc ||= lambda do |source_xml_record|
22
+ if ( source_xml_record &&
23
+ source_xml_record.kind_of?(Nokogiri::XML::Node) )
24
+ source_xml_record['id'] || (el = source_xml_record.at_xpath('./id') && el.text)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -11,33 +11,55 @@ class Traject::Indexer
11
11
  #
12
12
  # method #provide(key, value) is added, to do like settings[key] ||= value,
13
13
  # set only if not already set (but unlike ||=, nil or false can count as already set)
14
+ # provide WILL overwrite defaults.
14
15
  #
15
- # Also has an interesting 'defaults' system, meant to play along
16
- # with configuration file 'provide' statements. There is a built-in hash of
17
- # defaults, which will be lazily filled in if accessed and not yet
18
- # set. (nil can count as set, though!). If they haven't been lazily
19
- # set yet, then #provide will still fill them in. But you can also call
20
- # fill_in_defaults! to fill all defaults in, if you know configuration
21
- # files have all been loaded, and want to fill them in for inspection.
16
+ # Or you can use standard Hash `store` which will overwrite already set values as well
17
+ # as defaults.
18
+ #
19
+ # Has kind of a weird 'defaults' system, where you tell the hash what its defaults
20
+ # are, but they aren't actually loaded until asked for (or you can call fill_in_defaults!
21
+ # to load em all for inspection), to accommodate the `provide` API, where a caller wants to set
22
+ # only if not already set, but DO overwrite defaults.
22
23
  class Settings < Hash
24
+ # Just a hash with indifferent access and hash initializer, to use for
25
+ # our defaults hash.
26
+ class DefaultsHash < Hash
27
+ include Hashie::Extensions::MergeInitializer # can init with hash
28
+ include Hashie::Extensions::IndifferentAccess
29
+ end
30
+
23
31
  include Hashie::Extensions::MergeInitializer # can init with hash
24
32
  include Hashie::Extensions::IndifferentAccess
25
33
 
26
34
  def initialize(*args)
27
35
  super
36
+
37
+ @defaults = {}
38
+
28
39
  self.default_proc = lambda do |hash, key|
29
- if self.class.defaults.has_key?(key)
30
- return hash[key] = self.class.defaults[key]
40
+ if @defaults.has_key?(key)
41
+ return hash[key] = @defaults[key]
31
42
  else
32
43
  return nil
33
44
  end
34
45
  end
46
+
47
+ @defaults_filled = Concurrent::AtomicBoolean.new(false)
48
+ end
49
+
50
+ def with_defaults(defaults)
51
+ @defaults = DefaultsHash.new(defaults).freeze
52
+ self
53
+ end
54
+
55
+ def keys
56
+ super | @defaults.keys # explicitly-set keys plus default keys; union so a default already copied into the hash is not listed twice
35
57
  end
36
58
 
37
59
  # a cautious store, which only saves key=value if
38
60
  # there was not already a value for #key. Can be used
39
61
  # to set settings that can be overridden on command line,
40
- # or general first-set-wins settings.
62
+ # or general first-set-wins settings. DOES set over defaults.
41
63
  def provide(key, value)
42
64
  unless has_key? key
43
65
  store(key, value)
@@ -54,50 +76,11 @@ class Traject::Indexer
54
76
  replace(reverse_merge(other_hash))
55
77
  end
56
78
 
79
+ # Normally defaults are filled in on-demand, but you can trigger it here --
80
+ # but if you later try to load traject config, `provide` will no longer
81
+ # overwrite defaults!
57
82
  def fill_in_defaults!
58
- self.reverse_merge!(self.class.defaults)
59
- end
60
-
61
-
62
- def self.mri_defaults
63
- {
64
- # Reader defaults
65
- "reader_class_name" => "Traject::MarcReader",
66
- "marc_source.type" => "binary",
67
-
68
- # Writer defaults
69
- "writer_class_name" => "Traject::SolrJsonWriter",
70
- "solr_writer.batch_size" => 100,
71
- "solr_writer.thread_pool" => 1,
72
-
73
- # Threading and logging
74
- "processing_thread_pool" => self.default_processing_thread_pool,
75
- "log.batch_size.severity" => "info",
76
-
77
- # how to post-process the accumulator
78
- "allow_nil_values" => false,
79
- "allow_duplicate_values" => true,
80
-
81
- "allow_empty_fields" => false,
82
- }
83
- end
84
-
85
- def self.jruby_defaults
86
- {
87
- 'reader_class_name' => "Traject::Marc4JReader",
88
- 'marc4j_reader.permissive' => true
89
- }
90
- end
91
-
92
-
93
- def self.defaults
94
- return @@defaults if defined? @@defaults
95
- default_settings = self.mri_defaults
96
- if defined? JRUBY_VERSION
97
- default_settings.merge! self.jruby_defaults
98
- end
99
-
100
- @@defaults = default_settings
83
+ self.reverse_merge!(@defaults)
101
84
  end
102
85
 
103
86
  def inspect
@@ -30,15 +30,15 @@ class Traject::Indexer
30
30
  # Set the arity of the lambda expression just once, when we define it
31
31
  def lambda=(lam)
32
32
  @lambda_arity = 0 # assume
33
+ @lambda = lam
34
+
33
35
  return unless lam
34
36
 
35
- @lambda = lam
36
37
  if @lambda.is_a?(Proc)
37
38
  @lambda_arity = @lambda.arity
38
39
  else
39
40
  raise NamingError.new("argument to each_record must be a block/lambda, not a #{lam.class} #{self.inspect}")
40
41
  end
41
-
42
42
  end
43
43
 
44
44
  # raises if bad data
@@ -89,17 +89,17 @@ class Traject::Indexer
89
89
  end
90
90
 
91
91
 
92
- # An indexing step definition for a "to_field" step to specific
93
- # field.
92
+ # An indexing step definition for a "to_field" step to specific
93
+ # field. The first field name argument can be an array of multiple field
94
+ # names, the processed values will be added to each one.
94
95
  class ToFieldStep
95
- attr_accessor :field_name, :block, :source_location
96
- attr_reader :lambda
96
+ attr_reader :field_name, :block, :source_location, :procs
97
97
 
98
- def initialize(fieldname, lambda, block, source_location)
99
- self.field_name = fieldname.freeze
100
- self.lambda = lambda
101
- self.block = block
102
- self.source_location = source_location
98
+ def initialize(field_name, procs, block, source_location)
99
+ @field_name = field_name.freeze
100
+ @procs = procs.freeze
101
+ @block = block.freeze
102
+ @source_location = source_location.freeze
103
103
 
104
104
  validate!
105
105
  end
@@ -108,18 +108,13 @@ class Traject::Indexer
108
108
  true
109
109
  end
110
110
 
111
- def lambda=(lam)
112
- @lambda = lam
113
- @lambda_arity = @lambda ? @lambda.arity : 0
114
- end
115
-
116
111
  def validate!
117
112
 
118
- if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
119
- raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
113
+ unless (field_name.is_a?(String) && ! field_name.empty?) || (field_name.is_a?(Array) && field_name.all? { |f| f.is_a?(String) && ! f.empty? })
114
+ raise NamingError.new("to_field requires the field name (as a string), or an array of such, as the first argument at #{self.source_location})")
120
115
  end
121
116
 
122
- [self.lambda, self.block].each do |proc|
117
+ [*self.procs, self.block].each do |proc|
123
118
  # allow negative arity, meaning variable/optional, trust em on that.
124
119
  # but for positive arity, we need 2 or 3 args
125
120
  if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
@@ -130,26 +125,22 @@ class Traject::Indexer
130
125
 
131
126
  # Override inspect for developer debug messages
132
127
  def inspect
133
- "(to_field #{self.field_name} at #{self.source_location})"
128
+ "(to_field #{self.field_name.inspect} at #{self.source_location})"
134
129
  end
135
130
 
136
131
  def execute(context)
137
132
  accumulator = []
138
- sr = context.source_record
133
+ source_record = context.source_record
139
134
 
140
- if @lambda
141
- if @lambda_arity == 2
142
- @lambda.call(sr, accumulator)
135
+ [*self.procs, self.block].each do |aProc|
136
+ next unless aProc
137
+ if aProc.arity == 2
138
+ aProc.call(source_record, accumulator)
143
139
  else
144
- @lambda.call(sr, accumulator, context)
140
+ aProc.call(source_record, accumulator, context)
145
141
  end
146
142
  end
147
143
 
148
- if @block
149
- @block.call(sr, accumulator, context)
150
- end
151
-
152
-
153
144
  add_accumulator_to_context!(accumulator, context)
154
145
  return accumulator
155
146
  end
@@ -165,10 +156,13 @@ class Traject::Indexer
165
156
  accumulator.compact! unless context.settings[ALLOW_NIL_VALUES]
166
157
  return if accumulator.empty? and not (context.settings[ALLOW_EMPTY_FIELDS])
167
158
 
168
- context.output_hash[field_name] ||= []
159
+ # field_name can actually be an array of field names
160
+ Array(field_name).each do |a_field_name|
161
+ context.output_hash[a_field_name] ||= []
169
162
 
170
- existing_accumulator = context.output_hash[field_name].concat(accumulator)
171
- existing_accumulator.uniq! unless context.settings[ALLOW_DUPLICATE_VALUES]
163
+ existing_accumulator = context.output_hash[a_field_name].concat(accumulator)
164
+ existing_accumulator.uniq! unless context.settings[ALLOW_DUPLICATE_VALUES]
165
+ end
172
166
  end
173
167
  end
174
168
 
@@ -11,8 +11,8 @@ module Traject::Macros
11
11
  # def specific to Marc21.
12
12
  module Marc21
13
13
 
14
- # A combo function macro that will extract data from marc according to a string
15
- # field/substring specification, then apply various optional post-processing to it too.
14
+ # A macro that will extract data from marc according to a string
15
+ # field/substring specification.
16
16
  #
17
17
  # First argument is a string spec suitable for the MarcExtractor, see
18
18
  # MarcExtractor::parse_string_spec.
@@ -20,25 +20,42 @@ module Traject::Macros
20
20
  # Second arg is optional options, including options valid on MarcExtractor.new,
21
21
  # and others. By default, will de-duplicate results, but see :allow_duplicates
22
22
  #
23
- # * :first => true: take only first value
23
+ #
24
+ # * :allow_duplicates => boolean, default false, if set to true then will avoid
25
+ # de-duplicating the result array (array.uniq!)
26
+ #
27
+ # * :separator: (default ' ' (space)), what to use when joining multiple subfield matches from
28
+ # same field. Set to `nil` to leave them as separate values (which is actually default if only
29
+ # one subfield is given in spec, like `100a`). See MarcExtractor docs for more info.
30
+ #
31
+ # * :alternate_script: (default true). True, automatically include
32
+ # 'alternate script' MARC 880 linked fields corresponding to matched specifications. `false`, do
33
+ # not include. `:only` include _only_ linked 880s corresponding to spec, not base tags.
34
+ #
35
+ # ## Soft-Deprecated options: post-processing transformations
36
+ #
37
+ # These don't produce a deprecation warning and there is no planned horizon for them to go away, but the
38
+ # alternative of using additional transformation macros (from Traject::Macros::Transformation) composed with
39
+ # extract_marc is recommended.
40
+ #
41
+ # * :first => true: take only first value. **Instead**, use `extract_marc(whatever), first_only`
24
42
  #
25
43
  # * :translation_map => String: translate with named translation map looked up in load
26
- # path, uses Tranject::TranslationMap.new(translation_map_arg)
44
+ # path, uses Traject::TranslationMap.new(translation_map_arg).
45
+ # **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)`
27
46
  #
28
47
  # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
29
- # have shown themselves useful with Marc, using Marc21.trim_punctuation
48
+ # have shown themselves useful with Marc, using Marc21.trim_punctuation. **Instead**, use
49
+ # `extract_marc(whatever), trim_punctuation`
30
50
  #
31
- # * :default => String: if otherwise empty, add default value
32
- #
33
- # * :allow_duplicates => boolean, default false, if set to true then will avoid
34
- # de-duplicating the result array (array.uniq!)
51
+ # * :default => String: if otherwise empty, add default value. **Instead**, use `extract_marc(whatever), default("default value")`
35
52
  #
36
53
  #
37
54
  # Examples:
38
55
  #
39
- # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
40
- # to_field("id"), extract_marc("001", :first => true)
41
- # to_field("geo"), extract_marc("040a", :separator => nil, :translation_map => "marc040")
56
+ # to_field "title", extract_marc("245abcd"), trim_punctuation
57
+ # to_field "id", extract_marc("001"), first_only
58
+ # to_field "geo", extract_marc("040a", :separator => nil), translation_map("marc040")
42
59
  #
43
60
  # If you'd like extract_marc functionality but you're not creating an indexer
44
61
  # step, see Traject::Macros::Marc21.extract_marc_from module method.
@@ -122,6 +139,14 @@ module Traject::Macros
122
139
  end
123
140
  end
124
141
 
142
+ # A transformation macro version of trim_punctuation -- heuristics for trimming punctuation
143
+ # from AACR2/MARC style values, to get bare values.
144
+ def trim_punctuation
145
+ lambda do |rec, accumulator|
146
+ accumulator.collect! {|s| Marc21.trim_punctuation(s)}
147
+ end
148
+ end
149
+
125
150
 
126
151
  # A list of symbols that are valid keys in the options hash
127
152
  EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,