traject 2.3.4 → 3.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -1,6 +1,8 @@
1
1
  # Represents the context of a specific record being indexed, passed
2
2
  # to indexing logic blocks
3
3
  #
4
+ # Arg source_record_id_proc is a lambda that takes one arg (indexer-specific source record),
5
+ # and returns an ID for it suitable for use in log messages.
4
6
  class Traject::Indexer
5
7
  class Context
6
8
  def initialize(hash_init = {})
@@ -17,9 +19,13 @@ class Traject::Indexer
17
19
  end
18
20
 
19
21
  attr_accessor :clipboard, :output_hash, :logger
20
- attr_accessor :index_step, :source_record, :settings
21
- # 1-based position in stream of processed records.
22
+ attr_accessor :index_step, :source_record, :settings, :source_record_id_proc
23
+ # 'position' is a 1-based position in stream of processed records.
22
24
  attr_accessor :position
25
+ # sometimes we have multiple inputs, input_name describes the current one, and
26
+ # position_in_input the position of the record in the current input -- both can
27
+ # sometimes be blank when we don't know.
28
+ attr_accessor :input_name, :position_in_input
23
29
 
24
30
  # Should we be skipping this record?
25
31
  attr_accessor :skipmessage
@@ -41,19 +47,39 @@ class Traject::Indexer
41
47
  # in output messages, especially since this method may sometimes
42
48
  # return empty string if info on record id is not available.
43
49
  #
44
- # Returns MARC 001, then a slash, then output_hash["id"] -- if both
50
+ # Returns id from source_record (if we can get it from a source_record_id_proc),
51
+ # then a slash, then output_hash["id"] -- if both
45
52
  # are present. Otherwise may return just one, or even an empty string.
46
- #
47
- # Likely override this for a future XML or other source format version.
48
53
  def source_record_id
49
- marc_id = if self.source_record &&
50
- self.source_record.kind_of?(MARC::Record) &&
51
- self.source_record['001']
52
- self.source_record['001'].value
53
- end
54
- output_id = self.output_hash["id"]
55
-
56
- return [marc_id, output_id].compact.join("/")
54
+ source_record_id_proc && source_record_id_proc.call(source_record)
55
+ end
56
+
57
+ # a string label that can be used to refer to a particular record in log messages and
58
+ # exceptions. Includes various parts depending on what we got.
59
+ def record_inspect
60
+ str = "<"
61
+
62
+ str << "record ##{position}" if position
63
+
64
+ if input_name && position_in_input
65
+ str << " (#{input_name} ##{position_in_input}), "
66
+ elsif position
67
+ str << ", "
68
+ end
69
+
70
+ if source_id = source_record_id
71
+ str << "source_id:#{source_id} "
72
+ end
73
+
74
+ if output_id = self.output_hash["id"]
75
+ str << "output_id:#{[output_id].join(',')}"
76
+ end
77
+
78
+ str.chomp!(" ")
79
+ str.chomp!(",")
80
+ str << ">"
81
+
82
+ str
57
83
  end
58
84
 
59
85
  end
@@ -0,0 +1,30 @@
1
+ module Traject
2
+ class Indexer
3
+ # An indexer sub-class that includes "extract_marc" and other macros from
4
+ # Traject::Macros::Marc21, and also adds some marc-specific default settings.
5
+ class MarcIndexer < ::Traject::Indexer
6
+ include Traject::Macros::Marc21
7
+
8
+ def self.default_settings
9
+ @default_settings ||= begin
10
+ marc_settings = {
11
+ "reader_class_name" => "Traject::MarcReader",
12
+ "marc_source.type" => "binary",
13
+ }
14
+ super.merge(marc_settings)
15
+ end
16
+ end
17
+
18
+ # Overridden from base Indexer, to get MARC 001 for log messages.
19
+ def source_record_id_proc
20
+ @source_record_id_proc ||= lambda do |source_marc_record|
21
+ if ( source_marc_record &&
22
+ source_marc_record.kind_of?(MARC::Record) &&
23
+ source_marc_record['001'] )
24
+ source_marc_record['001'].value
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ require 'traject/nokogiri_reader'
2
+ require 'traject/macros/nokogiri_macros'
3
+ require 'traject/oai_pmh_nokogiri_reader'
4
+
5
+ module Traject
6
+ class Indexer
7
+ # An indexer sub-class for XML, where the source records in the pipeline are
8
+ # Nokogiri::XML::Document objects. It sets a default reader of NokogiriReader, and
9
+ # includes Traject::Macros::Nokogiri (with `extract_xpath`).
10
+ #
11
+ # See docs on XML use. (TODO)
12
+ class NokogiriIndexer < ::Traject::Indexer
13
+ include Traject::Macros::NokogiriMacros
14
+
15
+ def self.default_settings
16
+ @default_settings ||= super.merge("reader_class_name" => "Traject::NokogiriReader")
17
+ end
18
+
19
+ # Overridden from base Indexer, try an `id` attribute or element on record.
20
+ def source_record_id_proc
21
+ @source_record_id_proc ||= lambda do |source_xml_record|
22
+ if ( source_xml_record &&
23
+ source_xml_record.kind_of?(Nokogiri::XML::Node) )
24
+ source_xml_record['id'] || (el = source_xml_record.at_xpath('./id') && el.text)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -11,33 +11,55 @@ class Traject::Indexer
11
11
  #
12
12
  # method #provide(key, value) is added, to do like settings[key] ||= value,
13
13
  # set only if not already set (but unlike ||=, nil or false can count as already set)
14
+ # provide WILL overwrite defaults.
14
15
  #
15
- # Also has an interesting 'defaults' system, meant to play along
16
- # with configuration file 'provide' statements. There is a built-in hash of
17
- # defaults, which will be lazily filled in if accessed and not yet
18
- # set. (nil can count as set, though!). If they haven't been lazily
19
- # set yet, then #provide will still fill them in. But you can also call
20
- # fill_in_defaults! to fill all defaults in, if you know configuration
21
- # files have all been loaded, and want to fill them in for inspection.
16
+ # Or you can use standard Hash `store` which will overwrite already set values as well
17
+ # as defaults.
18
+ #
19
+ # Has kind of a weird 'defaults' system, where you tell the hash what its defaults
20
+ # are, but they aren't actually loaded until asked for (or you can call fill_in_defaults!
21
+ # to load em all for inspection), to accommodate the `provide` API, where a caller wants to set
22
+ # only if not already set, but DO overwrite defaults.
22
23
  class Settings < Hash
24
+ # Just a hash with indifferent access and hash initializer, to use for
25
+ # our defaults hash.
26
+ class DefaultsHash < Hash
27
+ include Hashie::Extensions::MergeInitializer # can init with hash
28
+ include Hashie::Extensions::IndifferentAccess
29
+ end
30
+
23
31
  include Hashie::Extensions::MergeInitializer # can init with hash
24
32
  include Hashie::Extensions::IndifferentAccess
25
33
 
26
34
  def initialize(*args)
27
35
  super
36
+
37
+ @defaults = {}
38
+
28
39
  self.default_proc = lambda do |hash, key|
29
- if self.class.defaults.has_key?(key)
30
- return hash[key] = self.class.defaults[key]
40
+ if @defaults.has_key?(key)
41
+ return hash[key] = @defaults[key]
31
42
  else
32
43
  return nil
33
44
  end
34
45
  end
46
+
47
+ @defaults_filled = Concurrent::AtomicBoolean.new(false)
48
+ end
49
+
50
+ def with_defaults(defaults)
51
+ @defaults = DefaultsHash.new(defaults).freeze
52
+ self
53
+ end
54
+
55
+ def keys
56
+ super + @defaults.keys
35
57
  end
36
58
 
37
59
  # a cautious store, which only saves key=value if
38
60
  # there was not already a value for #key. Can be used
39
61
  # to set settings that can be overridden on command line,
40
- # or general first-set-wins settings.
62
+ # or general first-set-wins settings. DOES set over defaults.
41
63
  def provide(key, value)
42
64
  unless has_key? key
43
65
  store(key, value)
@@ -54,50 +76,11 @@ class Traject::Indexer
54
76
  replace(reverse_merge(other_hash))
55
77
  end
56
78
 
79
+ # Normally defaults are filled in on-demand, but you can trigger it here --
80
+ # but if you later try to load traject config, `provide` will no longer
81
+ # overwrite defaults!
57
82
  def fill_in_defaults!
58
- self.reverse_merge!(self.class.defaults)
59
- end
60
-
61
-
62
- def self.mri_defaults
63
- {
64
- # Reader defaults
65
- "reader_class_name" => "Traject::MarcReader",
66
- "marc_source.type" => "binary",
67
-
68
- # Writer defaults
69
- "writer_class_name" => "Traject::SolrJsonWriter",
70
- "solr_writer.batch_size" => 100,
71
- "solr_writer.thread_pool" => 1,
72
-
73
- # Threading and logging
74
- "processing_thread_pool" => self.default_processing_thread_pool,
75
- "log.batch_size.severity" => "info",
76
-
77
- # how to post-process the accumulator
78
- "allow_nil_values" => false,
79
- "allow_duplicate_values" => true,
80
-
81
- "allow_empty_fields" => false,
82
- }
83
- end
84
-
85
- def self.jruby_defaults
86
- {
87
- 'reader_class_name' => "Traject::Marc4JReader",
88
- 'marc4j_reader.permissive' => true
89
- }
90
- end
91
-
92
-
93
- def self.defaults
94
- return @@defaults if defined? @@defaults
95
- default_settings = self.mri_defaults
96
- if defined? JRUBY_VERSION
97
- default_settings.merge! self.jruby_defaults
98
- end
99
-
100
- @@defaults = default_settings
83
+ self.reverse_merge!(@defaults)
101
84
  end
102
85
 
103
86
  def inspect
@@ -30,15 +30,15 @@ class Traject::Indexer
30
30
  # Set the arity of the lambda expression just once, when we define it
31
31
  def lambda=(lam)
32
32
  @lambda_arity = 0 # assume
33
+ @lambda = lam
34
+
33
35
  return unless lam
34
36
 
35
- @lambda = lam
36
37
  if @lambda.is_a?(Proc)
37
38
  @lambda_arity = @lambda.arity
38
39
  else
39
40
  raise NamingError.new("argument to each_record must be a block/lambda, not a #{lam.class} #{self.inspect}")
40
41
  end
41
-
42
42
  end
43
43
 
44
44
  # raises if bad data
@@ -89,17 +89,17 @@ class Traject::Indexer
89
89
  end
90
90
 
91
91
 
92
- # An indexing step definition for a "to_field" step to specific
93
- # field.
92
+ # An indexing step definition for a "to_field" step to specific
93
+ # field. The first field name argument can be an array of multiple field
94
+ # names, the processed values will be added to each one.
94
95
  class ToFieldStep
95
- attr_accessor :field_name, :block, :source_location
96
- attr_reader :lambda
96
+ attr_reader :field_name, :block, :source_location, :procs
97
97
 
98
- def initialize(fieldname, lambda, block, source_location)
99
- self.field_name = fieldname.freeze
100
- self.lambda = lambda
101
- self.block = block
102
- self.source_location = source_location
98
+ def initialize(field_name, procs, block, source_location)
99
+ @field_name = field_name.freeze
100
+ @procs = procs.freeze
101
+ @block = block.freeze
102
+ @source_location = source_location.freeze
103
103
 
104
104
  validate!
105
105
  end
@@ -108,18 +108,13 @@ class Traject::Indexer
108
108
  true
109
109
  end
110
110
 
111
- def lambda=(lam)
112
- @lambda = lam
113
- @lambda_arity = @lambda ? @lambda.arity : 0
114
- end
115
-
116
111
  def validate!
117
112
 
118
- if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
119
- raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
113
+ unless (field_name.is_a?(String) && ! field_name.empty?) || (field_name.is_a?(Array) && field_name.all? { |f| f.is_a?(String) && ! f.empty? })
114
+ raise NamingError.new("to_field requires the field name (as a string), or an array of such, as the first argument at #{self.source_location})")
120
115
  end
121
116
 
122
- [self.lambda, self.block].each do |proc|
117
+ [*self.procs, self.block].each do |proc|
123
118
  # allow negative arity, meaning variable/optional, trust em on that.
124
119
  # but for positive arity, we need 2 or 3 args
125
120
  if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
@@ -130,26 +125,22 @@ class Traject::Indexer
130
125
 
131
126
  # Override inspect for developer debug messages
132
127
  def inspect
133
- "(to_field #{self.field_name} at #{self.source_location})"
128
+ "(to_field #{self.field_name.inspect} at #{self.source_location})"
134
129
  end
135
130
 
136
131
  def execute(context)
137
132
  accumulator = []
138
- sr = context.source_record
133
+ source_record = context.source_record
139
134
 
140
- if @lambda
141
- if @lambda_arity == 2
142
- @lambda.call(sr, accumulator)
135
+ [*self.procs, self.block].each do |aProc|
136
+ next unless aProc
137
+ if aProc.arity == 2
138
+ aProc.call(source_record, accumulator)
143
139
  else
144
- @lambda.call(sr, accumulator, context)
140
+ aProc.call(source_record, accumulator, context)
145
141
  end
146
142
  end
147
143
 
148
- if @block
149
- @block.call(sr, accumulator, context)
150
- end
151
-
152
-
153
144
  add_accumulator_to_context!(accumulator, context)
154
145
  return accumulator
155
146
  end
@@ -165,10 +156,13 @@ class Traject::Indexer
165
156
  accumulator.compact! unless context.settings[ALLOW_NIL_VALUES]
166
157
  return if accumulator.empty? and not (context.settings[ALLOW_EMPTY_FIELDS])
167
158
 
168
- context.output_hash[field_name] ||= []
159
+ # field_name can actually be an array of field names
160
+ Array(field_name).each do |a_field_name|
161
+ context.output_hash[a_field_name] ||= []
169
162
 
170
- existing_accumulator = context.output_hash[field_name].concat(accumulator)
171
- existing_accumulator.uniq! unless context.settings[ALLOW_DUPLICATE_VALUES]
163
+ existing_accumulator = context.output_hash[a_field_name].concat(accumulator)
164
+ existing_accumulator.uniq! unless context.settings[ALLOW_DUPLICATE_VALUES]
165
+ end
172
166
  end
173
167
  end
174
168
 
@@ -11,8 +11,8 @@ module Traject::Macros
11
11
  # def specific to Marc21.
12
12
  module Marc21
13
13
 
14
- # A combo function macro that will extract data from marc according to a string
15
- # field/substring specification, then apply various optional post-processing to it too.
14
+ # A macro that will extract data from marc according to a string
15
+ # field/substring specification.
16
16
  #
17
17
  # First argument is a string spec suitable for the MarcExtractor, see
18
18
  # MarcExtractor::parse_string_spec.
@@ -20,25 +20,42 @@ module Traject::Macros
20
20
  # Second arg is optional options, including options valid on MarcExtractor.new,
21
21
  # and others. By default, will de-duplicate results, but see :allow_duplicates
22
22
  #
23
- # * :first => true: take only first value
23
+ #
24
+ # * :allow_duplicates => boolean, default false, if set to true then will avoid
25
+ # de-duplicating the result array (array.uniq!)
26
+ #
27
+ # * :separator: (default ' ' (space)), what to use when joining multiple subfield matches from
28
+ # same field. Set to `nil` to leave them as separate values (which is actually default if only
29
+ # one subfield is given in spec, like `100a`). See MarcExtractor docs for more info.
30
+ #
31
+ # * :alternate_script: (default true). True, automatically include
32
+ # 'alternate script' MARC 880 linked fields corresponding to matched specifications. `false`, do
33
+ # not include. `:only` include _only_ linked 880s corresponding to spec, not base tags.
34
+ #
35
+ # ## Soft-Deprecated options: post-processing transformations
36
+ #
37
+ # These don't produce a deprecation warning and there is no planned horizon for them to go away, but the
38
+ # alternative of using additional transformation macros (from Traject::Macros::Transformation) composed with
39
+ # extract_marc is recommended.
40
+ #
41
+ # * :first => true: take only first value. **Instead**, use `extract_marc(whatever), first_only`
24
42
  #
25
43
  # * :translation_map => String: translate with named translation map looked up in load
26
- # path, uses Tranject::TranslationMap.new(translation_map_arg)
44
+ # path, uses Traject::TranslationMap.new(translation_map_arg).
45
+ # **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)`
27
46
  #
28
47
  # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
29
- # have shown themselves useful with Marc, using Marc21.trim_punctuation
48
+ # have shown themselves useful with Marc, using Marc21.trim_punctuation. **Instead**, use
49
+ # `extract_marc(whatever), trim_punctuation`
30
50
  #
31
- # * :default => String: if otherwise empty, add default value
32
- #
33
- # * :allow_duplicates => boolean, default false, if set to true then will avoid
34
- # de-duplicating the result array (array.uniq!)
51
+ # * :default => String: if otherwise empty, add default value. **Instead**, use `extract_marc(whatever), default("default value")`
35
52
  #
36
53
  #
37
54
  # Examples:
38
55
  #
39
- # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
40
- # to_field("id"), extract_marc("001", :first => true)
41
- # to_field("geo"), extract_marc("040a", :separator => nil, :translation_map => "marc040")
56
+ # to_field("title"), extract_marc("245abcd"), trim_punctuation
57
+ # to_field("id"), extract_marc("001"), first_only
58
+ # to_field("geo"), extract_marc("040a", :separator => nil), translation_map("marc040")
42
59
  #
43
60
  # If you'd like extract_marc functionality but you're not creating an indexer
44
61
  # step, see Traject::Macros::Marc21.extract_marc_from module method.
@@ -122,6 +139,14 @@ module Traject::Macros
122
139
  end
123
140
  end
124
141
 
142
+ # A transformation macro version of trim_punctuation -- heuristics for trimming punctuation
143
+ # from AACR2/MARC style values, to get bare values.
144
+ def trim_punctuation
145
+ lambda do |rec, accumulator|
146
+ accumulator.collect! {|s| Marc21.trim_punctuation(s)}
147
+ end
148
+ end
149
+
125
150
 
126
151
  # A list of symbols that are valid keys in the options hash
127
152
  EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,