traject 2.3.4 → 3.0.0.alpha.1

Files changed (69)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
data/doc/programmatic_use.md
@@ -0,0 +1,218 @@
+ # Programmatic/Embedded Use of Traject
+
+ Traject was originally written with a core use case of batch-processing many (millions of) records as a stand-alone process, usually with the `traject` command-line.
+
+ However, people have also found it useful for programmatic use embedded within a larger application, including Rails apps. Here are some hints for using traject effectively programmatically:
+
+ ## Initializing an indexer
+
+ The first argument to the indexer constructor is an optional hash of settings, the same settings you could set in configuration. Under programmatic use, it may be more convenient or more legible to set them in the constructor. Keys can be Strings or Symbols.
+
+ ```ruby
+ indexer = Traject::Indexer.new("solr_writer.commit_on_close" => true)
+ ```
+
+ Note that keys passed in as an initializer arg will "override" any settings set with `provide` in config.
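+
+ For example, a minimal sketch (the setting name is real, the values are arbitrary):
+
+ ```ruby
+ indexer = Traject::Indexer.new("solr_writer.commit_on_close" => true)
+
+ indexer.configure do
+   settings do
+     # this `provide` is ignored: the constructor arg above takes precedence
+     provide "solr_writer.commit_on_close", false
+   end
+ end
+
+ indexer.settings["solr_writer.commit_on_close"] # => true
+ ```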
+
+ ## Configuring an indexer
+
+ Under standard use, a traject indexer is configured with mapping rules and other settings in a standalone configuration file. You can do this programmatically with `load_config_file`:
+
+ ```ruby
+ indexer.load_config_file(path_to_config)
+ ```
+
+ This can be convenient for config files you want to use either from the command line or programmatically, or for allowing other staff roles to write config files separately. You can call `load_config_file` multiple times, and order may matter -- exactly the same as command-line configuration files.
+
+ Alternately, you may instead want to do your configuration inline, using `configure` (which just does an `instance_eval`, but is encouraged for clarity and forwards-compatibility):
+
+ ```ruby
+ indexer.configure do
+   # you can choose to load config files this way
+   load_config_file(path_to_config)
+
+   to_field "whatever", extract_marc("800")
+   after_processing do
+     # whatever
+   end
+ end
+ ```
+
+ Whatever you might do in a traject config file is valid here, because this is exactly the method used when traject loads config files. This includes adding in macros with `extend SomeModule`. Again, you can call `configure` multiple times, and order may matter, just like ordinary config files.
+
+ As a convenience, you can also pass a block to the indexer constructor, which will be `instance_eval`d, intended for configuration:
+
+ ```ruby
+ indexer = Traject::Indexer.new(settings) do
+   to_field "whatever", extract_marc(whatever)
+ end
+ ```
+
+ ## Running the indexer
+
+ ### process: probably not what you want
+
+ The standard command-line traject uses the `Indexer#process(io_stream)` method to invoke processing. While you can use this method programmatically, it makes some assumptions that may make it inconvenient for programmatic use:
+
+ * It automatically instantiates a reader and writer, and the reader and writer may not be safe to use more than once, so you can't call #process more than once for a given indexer instance. This also means you can't call it concurrently on the same indexer.
+
+ * It is optimized for millions+ records; for instance, by default it uses internal threads, which you probably don't want -- and which can cause deadlock in some circumstances in a Rails 5 app. You can set the `processing_thread_pool` setting to `0` to ensure no additional threads are created by the indexer, but depending on the reader and writer involved, they may still create threads.
+
+ * It has what is probably excessive logging (and in some cases progress-bar output), assuming use as standalone command-line execution.
+
+ * It runs all `after_processing` steps, which you may not want in a few-records-at-a-time programmatic context.
+
+ As an alternative to the full high-volume pipeline in `#process`, several other methods that do less, and are more easily composable, are available: `#map_record`, `#process_record`, and `#process_with`.
+
+ ### map_record: just map a single record, handle transformed output yourself
+
+ Simplest of all, `#map_record` takes a single source record, and simply returns the output_hash transformed from it. You don't get the full Context back, and it is your responsibility to do something with this output_hash. If the record was skipped, nil is returned. Exceptions in processing are simply raised out of this method.
+
+ ```ruby
+ output_hash = indexer.map_record(record)
+ ```
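+
+ Since a skipped record returns nil, a minimal usage sketch (`record` and `do_something_with` are hypothetical):
+
+ ```ruby
+ if (output_hash = indexer.map_record(record))
+   do_something_with(output_hash)
+ end
+ ```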
+
+ ### process_record: send a single record to instance writer
+
+ `#process_record` takes a single source record, sends it through transformation, and sends the output to the instance-configured writer. No logging, threading, or error handling is done for you. Skipped records will not be sent to the writer. A `Traject::Indexer::Context` is returned from every call.
+
+ ```ruby
+ context = indexer.process_record(source_record)
+ ```
+
+ This method can be thought of as sending a single record through the indexer's pipeline and writer. For convenience, it is also aliased as `#<<`.
+
+ ```ruby
+ indexer << source_record
+ ```
+
+ `process_record` should be safe to call concurrently on an indexer shared between threads -- so long as the configured writer is thread-safe, which all built-in writers are.
+
+ You can (and may want/need to) manually call `indexer.complete` to run after_processing steps and close/flush the writer. After calling `complete`, the indexer can not be re-used for more `process_record` calls, as the writer has been closed.
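+
+ A minimal sketch of that lifecycle (`records` is a hypothetical enumerable of source records):
+
+ ```ruby
+ records.each { |record| indexer.process_record(record) }
+ indexer.complete # runs after_processing steps, closes/flushes the writer
+ ```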
+
+ ### process_with: an in-between option for easier programmatic use
+
+ `process_with` is sort of a swiss-army-knife of processing records with a Traject::Indexer.
+
+ You supply it with a reader and writer every time you call it; it does not use the instance-configured reader and writer. This means you can call it as many times as you want with the same indexer (as readers and writers are not always re-usable, and may not be safe to share between threads/invocations). `process_with` is also safe to call concurrently on an indexer shared between threads.
+
+ Since a ruby Array of source records counts as a Traject "reader" (it has an `each` yielding records), you can simply pass it an array of input. You can use the Traject::ArrayWriter as a "writer", which simply accumulates output Traject::Indexer::Contexts in memory. Or you can pass `process_with` a block instead of (or in addition to!) a passed writer arg, as a sort of inline writer. The block will receive one arg, a Context.
+
+ `process_with` does no logging, and does no concurrency (although the writer you are using may use multiple threads itself internally). It's a building block you can build whatever you like with.
+
+ ```ruby
+ writer = indexer.process_with([record1, record2, record3], Traject::ArrayWriter.new)
+ output_hashes = writer.values
+ output_contexts = writer.contexts
+ writer.clear! # if desired
+
+ # or
+
+ indexer.process_with([source_record, other_record]) do |context|
+   puts "#{context.position}: #{context.output_hash}"
+ end
+ ```
+
+ By default, any exceptions raised in processing are simply raised -- terminating processing -- for you to rescue and deal with as you like. Instead, you can provide a `rescue_with` argument with a proc/lambda that will be triggered on an exception processing a record. The proc will be passed two args, the Traject::Indexer::Context and the exception. You can choose to re-raise the original exception or any other, or swallow it, or process it however you like. If you do not raise, the indexer will continue processing subsequent records.
+
+ Skipped records are skipped, but you can hook into them with an `on_skipped` proc arg:
+
+ ```ruby
+ indexer.process_with([record1, record2, record3],
+                      Traject::ArrayWriter.new,
+                      on_skipped: proc do |context|
+                        puts "Skipped: #{context.record_inspect}"
+                      end,
+                      rescue_with: proc do |context, exception|
+                        puts "Error #{exception} in #{context.record_inspect}, continuing to process more"
+                      end)
+ ```
+
+ `process_with` will *not* call any `after_processing` steps. Call them yourself if and when you want with `indexer.run_after_processing_steps`.
+
+ Some writers have a `close` method to finalize/flush output. `process_with` will not call it; you can call `writer.close` yourself -- after calling `close` on a writer, it can generally not be re-used.
+
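+ A minimal sketch putting those pieces together (`records` is hypothetical; the `respond_to?` guard is there because not every writer has a `close`):
+
+ ```ruby
+ writer = Traject::ArrayWriter.new
+ indexer.process_with(records, writer)
+
+ indexer.run_after_processing_steps # only if you want after_processing steps
+ writer.close if writer.respond_to?(:close) # writer generally can't be re-used after this
+ ```
+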
+ ## Indexer performance, re-use, and concurrency
+
+ While the `Traject::Indexer` has generally been tuned for high performance, this does not apply to creating and configuring an indexer.
+
+ In particular, `indexer.load_config_file(path_to_config)` is not going to be high-performance, as it requires touching the file system to find and load a config file. If you are creating lots of indexer instances throughout your program's life, and doing so in a place where indexer instantiation is a performance bottleneck, this may be a problem.
+
+ I looked into trying to make `load_config_file`-type functionality more performant, but have not yet found a great way.
+
+ You may want to consider instead creating one or more configured "global" indexers (likely in a class variable rather than a ruby global variable, although it's up to you), and re-using them throughout your program's life. Since most reader-less uses of the Indexer are thread-safe, this should be safe to do even in a situation (like a Rails app under many app server environments) where a global indexer could be used concurrently by multiple threads.
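+
+ For instance, a minimal sketch (the wrapper class name, settings, and config path are hypothetical):
+
+ ```ruby
+ class AppIndexer
+   # A single configured indexer, built once and re-used.
+   def self.instance
+     @instance ||= Traject::Indexer.new("processing_thread_pool" => 0) do
+       load_config_file("config/traject/my_config.rb")
+     end
+   end
+ end
+ ```
+
+ (Lazy `||=` memoization is not atomic; if multiple threads could race to build it, create the indexer eagerly at boot instead.)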
+
+ ### Concurrency concerns
+
+ * Indexing rules must be thread-safe. They generally will be naturally, but if you are referring to external state, you have to use thread-safe data structures. [concurrent-ruby](https://github.com/ruby-concurrency/concurrent-ruby), which is already a dependency of traject, has a variety of useful thread-safe and concurrent data structures.
+
+ * Writers should be written in a thread-safe manner, assuming concurrent calls to `put`. The built-in Writers all should be. If you are writing a custom Writer, you should ensure it is thread-safe for concurrent calls to `put`.
+
+ * Readers, and the `Indexer#process` method (which uses a fixed reader), are not thread-safe. This is why, when sharing a global indexer, we want to use `process_record`, `map_record`, or `process_with` as above.
+
+ It ought to be safe to use a global Indexer concurrently in several threads, with the `map_record`, `process_record` or `process_with` methods -- so long as your indexing rules and writers are thread-safe, as they usually will be and always ought to be.
+
+ ### An example
+
+ For the simplest case, we want to turn off all built-in traject concurrency in a "global" indexer we create, and then send records to it.
+
+ ```ruby
+ $traject_indexer = Traject::Indexer.new(
+   # disable Indexer processing thread pool, to keep things simple and not interfere with Rails
+   "processing_thread_pool" => 0,
+   "solr_writer.thread_pool" => 0, # writing to solr is done inline, no threads
+
+   "solr.url" => "http://whatever",
+   "writer_class" => "SolrJsonWriter",
+   "solr_writer.batch_size" => 1, # send to solr for each record, no batching
+ ) do
+   load_config_file("whatever/config.rb")
+ end
+
+ # Now, wherever you want, simply:
+
+ $traject_indexer << source_record
+ ```
+
+ `<<` is an alias for `process_record`. The above will take the source record, process it, and send it to the writer -- which has been configured to immediately send the `add` to solr. All of this will be done in the caller thread, with no additional threads used.
+
+ If you'd like the indexing operation to be 'async' from wherever you are calling it (say, a model save), you may want to use your own concurrency/async logic (say, a Concurrent::Future, or an ActiveJob) to execute the `$traject_indexer << source_record` -- no problem. Above, we disable concurrency inside of Traject precisely so you can do whatever you want at your application layer instead.
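+
+ For example, a minimal ActiveJob sketch (the job class and record lookup are hypothetical):
+
+ ```ruby
+ class IndexRecordJob < ApplicationJob
+   queue_as :default
+
+   def perform(record_id)
+     record = MyRecordStore.find(record_id) # hypothetical way to re-fetch the source record
+     $traject_indexer << record
+   end
+ end
+ ```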
+
+ Note that the SolrJsonWriter will _not_ issue `commit` commands to Solr -- your Solr autoCommit configuration is likely sufficient, but if you need a feature where SolrJsonWriter sends commits, let us know.
+
+ The above example will do a separate HTTP POST to Solr for every record, which may not be ideal performance-wise. (On the plus side, it should re-use persistent HTTP connections if your Solr server supports that, so making your Solr server support that could be of benefit.) You may want to do something more complicated that batches things somehow -- you can possibly do that with various settings/patterns of use for SolrJsonWriter (see for instance SolrJsonWriter#flush), or perhaps you want to use `map_record` or `process_with` as primitives to build whatever you want on top:
+
+ ```ruby
+ $indexer.process_with(array_of_one_or_more_records) do |context|
+   # called for each output document
+   do_whatever_you_want_with(context.output_hash)
+ end
+ ```
+
+ For instance, [Sunspot](https://github.com/sunspot/sunspot) does some [fancy stuff](https://github.com/sunspot/sunspot/blob/0cfa5d2a27cac383127233b846e6fed63db1dcbc/sunspot/lib/sunspot/batcher.rb) to try and batch Solr adds within a given bounded context. Perhaps something similar could be done on top of the traject API if needed.
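+
+ As a rough sketch of that idea, batching on top of `map_record` (`records`, the batch size, and the `send_batch_to_solr` helper are all hypothetical; you'd write the actual Solr submission yourself):
+
+ ```ruby
+ batch = []
+
+ records.each do |record|
+   if (output_hash = indexer.map_record(record)) # nil means the record was skipped
+     batch << output_hash
+   end
+   if batch.size >= 100
+     send_batch_to_solr(batch) # e.g. one HTTP POST containing many documents
+     batch = []
+   end
+ end
+
+ send_batch_to_solr(batch) unless batch.empty?
+ ```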
+
+ ### Rails concerns, and disabling concurrency
+
+ Rails will auto-load and re-load classes in typical development configuration. Rails 5 for the first time made dev-mode auto/re-loading concurrency-safe, but at the cost of requiring _all_ code using threads to use Rails-specific APIs, or risk deadlock.
+
+ This makes things difficult for re-using non-rails-specific code that uses concurrency -- such as traject -- in a rails project.
+
+ For more information see the Rails guide on [Threading and code execution](http://guides.rubyonrails.org/threading_and_code_execution.html), and [this issue on concurrent-ruby](https://github.com/ruby-concurrency/concurrent-ruby/issues/585).
+
+ If you are using traject within Rails, and you have default dev-mode class auto/re-loading turned on, you may find that execution locks up in a deadlock involving Rails auto-loading.
+
+ One solution would be turning off Rails class reloading even in development, with `config.eager_load = true` and `config.cache_classes = true`.
+
+ Another solution would be disabling all concurrency in Traject. You can do this with traject settings, but multiple settings may be required, as different parts of traject can each use concurrency. For instance, as above, you need to set both `processing_thread_pool` and `solr_writer.thread_pool` to 0.
+
+ Alternately, you can call `Traject::ThreadPool.disable_concurrency!` -- this disables all multi-threaded concurrency in traject, process-wide and irrevocably. This can also be useful for temporary debugging.
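+
+ For instance (placement is up to you; a Rails initializer is one option):
+
+ ```ruby
+ # config/initializers/traject.rb (hypothetical location)
+ Traject::ThreadPool.disable_concurrency!
+ ```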
+
+ We may in the future explore making traject automatically use the Rails concurrency API so concurrency can just work in Rails too, but it's a bit of a mess.
data/doc/settings.md
@@ -28,7 +28,34 @@ settings are applied first of all. It's recommended you use `provide`.
 
 ### Reading (general)
 
- * `reader_class_name`: a Traject Reader class, used by the indexer as a source of records. Defaults to Traject::Marc4JReader (using the Java Marc4J library) on JRuby; Traject::MarcReader (using the ruby marc gem) otherwise. Command-line shortcut `-r`
+ * `reader_class_name`: a Traject Reader class, used by the indexer as a source of records. Default is indexer-specific: Traject::MarcReader (using the ruby marc gem) for the MARC indexer, or Traject::NokogiriReader for the XML indexer. Command-line shortcut `-r`
+
+ ### Error handling
+
+ * `mapping_rescue`: Takes a proc/lambda/callable which accepts two arguments: a Traject::Indexer::Context, and an exception. Called if an unexpected error is raised when executing indexing rules. The default, when this is unset, is to log and re-raise, which will halt execution; that usually means a bug in your mapping code that you will want to know about. See the default logic at Traject::Indexer#default_mapping_rescue.
+
+   You may instead want to skip the record and continue with indexing, or even conditionally decide which to do. In a custom handler, if you want to halt execution, you should re-raise the exception (or raise another). If you want to skip the record and continue, call `context.skip!` and do not raise.
+
+   The "stabby lambda" syntax is useful for providing a lambda object with proper parsing precedence to not need parentheses.
+
+       error_count = Concurrent::AtomicFixnum.new(0)
+       settings do
+         provide "mapping_rescue", -> (context, exception) {
+           error_count.increment
+           context.logger.error "Encountered exception: #{exception}, total errors #{error_count}"
+           if my_should_skip?(context, exception)
+             context.skip!
+           else
+             raise exception
+           end
+         }
+       end
+
+   At present `mapping_rescue` only handles exceptions raised while running mapping/indexing logic; unexpected raises in readers or writers may not be caught here.
 
 ### Threads
 
data/doc/xml.md
@@ -0,0 +1,134 @@
+ # Traject use with XML
+
+ The [NokogiriIndexer](../lib/traject/nokogiri_indexer.md) is a Traject::Indexer subclass with some defaults for XML use. It has "nokogiri" in the name because it's based around [nokogiri](https://github.com/sparklemotion/nokogiri) objects as source records in the traject pipeline.
+
+ By default it uses the NokogiriReader to read XML and yield Nokogiri::XML::Documents, and includes the NokogiriMacros mix-in, with some macros for operating on Nokogiri::XML::Documents.
+
+ ## On the command-line
+
+ You can tell the traject command-line to use the NokogiriIndexer with the `-i xml` flag:
+
+ ```bash
+ traject -i xml -c some_appropriate_config some/path/*.xml
+ traject -i xml -c some_appropriate_config specific_file.xml
+ ```
+
+ ## In your config files
+
+ ### Choosing your source record object
+
+ By default, each input XML file will be yielded as a source record into the traject pipeline. If you have things stored one-record-per-xml-document, that's just fine.
+
+ Frequently, we instead have an XML document with sub-nodes that we'd like to treat as individual records in the pipeline. Use the setting `nokogiri.each_record_xpath` for this.
+
+ If your xpath to slice into source records includes namespaces, you need to register them with `nokogiri.namespaces`. For instance, to send one page of responses from an OAI-PMH server through traject, with each OAI-PMH record sliced into a separate traject source record:
+
+ ```ruby
+ provide "nokogiri.namespaces", {
+   "oai" => "http://www.openarchives.org/OAI/2.0/",
+   "dc" => "http://purl.org/dc/elements/1.1/",
+   "oai_dc" => "http://www.openarchives.org/OAI/2.0/oai_dc/"
+ }
+
+ provide "nokogiri.each_record_xpath", "//oai:record"
+ ```
+
+ ### Using extract_xpath to get values
+
+ Generally with XML source, you'll want to extract individual pieces of text to index with traject. You do that with the `extract_xpath` macro. You can use namespaces registered with the `nokogiri.namespaces` setting.
+
+ ```ruby
+ to_field "title", extract_xpath("//dc:title")
+ ```
+
+ The documents yielded to the pipeline will have the node selected by `each_record_xpath` as the root node, so if you want to use an absolute rather than relative xpath (which is likely faster) in our OAI-PMH example, it might look like this:
+
+ ```ruby
+ to_field "title", extract_xpath("/oai:record/oai:metadata/oai:dc/dc:title")
+ ```
+
+ You can also provide prefix->namespace mappings in an individual `extract_xpath` call, to override or add to what was in `nokogiri.namespaces`, with the `ns` keyword argument:
+
+ ```ruby
+ to_field "title", extract_xpath("/oai:record/oai:metadata/oai:dc/dc:title", ns: {
+   "oai" => "http://www.openarchives.org/OAI/2.0/",
+   "dc" => "http://purl.org/dc/elements/1.1/",
+   "oai_dc" => "http://www.openarchives.org/OAI/2.0/oai_dc/"
+ })
+ ```
+
+ You can use all the standard transformation macros in Traject::Macros::Transformation:
+
+ ```ruby
+ to_field "something", extract_xpath("//value"), first_only, translation_map("some_map"), default("no value")
+ ```
+
+ ### Selecting non-text nodes
+
+ Let's say our traject source records are nokogiri documents representing XML like this:
+
+ ```xml
+ <person>
+   <name>
+     <given>Juan</given>
+     <surname>Garcia</surname>
+   </name>
+ </person>
+ ```
+
+ And let's say we do:
+
+ ```ruby
+ to_field "name", extract_xpath("//name")
+ ```
+
+ We've selected an XML node that does not just contain text, but other sub-nodes. What will end up in the traject accumulator, and sent out to the Solr index or other output? By default `extract_xpath` will extract only text nodes, in the order found in the source document, space-separated. So you'd get `"Juan Garcia"` above. Do note that this is dependent on source element order.
+
+ Which might be quite fine, especially if you are putting this into an indexed field where order may not be that important, or where source order is exactly what you want.
+
+ You can instead tell `extract_xpath` `to_text: false` to have it put the actual Nokogiri::XML::Node selected into the accumulator, perhaps for further processing to transform it to text yourself:
+
+ ```ruby
+ to_field "name", extract_xpath("//name", to_text: false) do |record, accumulator|
+   accumulator.map! do |xml_node|
+     "#{xml_node.at_xpath('./surname')&.text}, #{xml_node.at_xpath('./given')&.text}"
+   end
+ end
+ ```
+
+ If you call with `to_text: false` and just leave the `Nokogiri::XML::Node`s on the accumulator, the default SolrJsonWriter will end up casting them to strings with `to_s`, which will serialize them to XML -- which may be just what you want, if you want to put serialized XML into a Solr field. To have more control over the serialization, you may want to use a transformation step similar to the above.
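+
+ For instance, a minimal sketch serializing each selected node yourself (the field name is arbitrary):
+
+ ```ruby
+ to_field "name_xml", extract_xpath("//name", to_text: false) do |record, accumulator|
+   # serialize each Nokogiri::XML::Node to an XML string explicitly
+   accumulator.map! { |xml_node| xml_node.to_xml }
+ end
+ ```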
+
+ ## The OaiPmhNokogiriReader
+
+ [OAI-PMH](http://www.openarchives.org/OAI/openarchivesprotocol.html) input seems to be a common use case for XML with traject.
+
+ You can certainly use your own tool to save OAI-PMH responses to disk, then process them as any other XML, as above.
+
+ But we also provide a Traject::OaiPmhNokogiriReader that you may be interested in. You give it an OAI-PMH URL; it fetches via HTTP and follows resumptionTokens to send all records into the traject pipeline.
+
+ This is somewhat experimental; please let us know if you find it useful, or find any problems with it.
+
+     traject -i xml -r Traject::OaiPmhNokogiriReader -s oai_pmh.start_url="http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc" -c your_config.rb
+
+ See the header comment doc on Traject::OaiPmhNokogiriReader for more info.
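+
+ Equivalently, in a config file (a sketch; the URL is a placeholder):
+
+ ```ruby
+ settings do
+   provide "reader_class_name", "Traject::OaiPmhNokogiriReader"
+   provide "oai_pmh.start_url", "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
+ end
+ ```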
+
+ ## Performance, and JRuby
+
+ The current NokogiriReader reads the input with the DOM parser, `Nokogiri::XML.parse`, so it will require memory proportional to the size of the input documents.
+
+ I experimented with streaming parsers and spent quite a few hours on it, but couldn't quite get it there in a way that made sense and had good performance.
+
+ The NokogiriReader parser should be relatively performant though, allowing you to process hundreds of records per second in MRI.
+
+ (There is a half-finished `ExperimentalStreamingNokogiriReader` available, but it is experimental, half-finished, may disappear or change in backwards-incompatible ways at any time, problematic, not recommended for production use, etc.)
+
+ ### JRuby
+
+ It may be that nokogiri JRuby is just much slower than nokogiri MRI (at least when namespaces are involved?). It may be that our workaround to a [JRuby bug involving namespaces on moving nodes](https://github.com/sparklemotion/nokogiri/issues/1774) doesn't help.
+
+ For whatever reason, in a simple test involving OAI-PMH schema-ed data, running under JRuby processes records only about 30% as quickly as running under MRI.
+
+ **JRuby is not recommended for XML use of traject at present.**
data/lib/traject.rb
@@ -1,6 +1,11 @@
 require "traject/version"
 
+ require 'traject/hashie/indifferent_access_fix'
+
 require 'traject/indexer'
+ require 'traject/indexer/marc_indexer'
+ require 'traject/indexer/nokogiri_indexer'
+
 require 'traject/util'
 
 require 'traject/macros/basic'
data/lib/traject/array_writer.rb
@@ -0,0 +1,34 @@
+ module Traject
+   # Uses the #put method from the traject writer API to just accumulate
+   # output_hash'es in an array. Useful for testing, or for simple programmatic
+   # use.
+   #
+   # Useful with process_with:
+   #
+   #     indexer.process_with(source_array, ArrayWriter.new).values
+   #     # => array of output_hash's
+   #
+   # Recommend against using it with a huge number of records, as it will
+   # of course store them all in memory.
+   #
+   # Uses Concurrent::Arrays internally, so should be safe for use as writer
+   # in concurrency scenarios.
+   class ArrayWriter
+     attr_reader :values, :contexts
+
+     def initialize(_settings = nil)
+       @values = Concurrent::Array.new
+       @contexts = Concurrent::Array.new
+     end
+
+     def put(context)
+       contexts << context
+       values << context.output_hash
+     end
+
+     def clear!
+       @contexts.clear
+       @values.clear
+     end
+   end
+ end
data/lib/traject/command_line.rb
@@ -19,6 +19,12 @@ module Traject
     attr_accessor :indexer
     attr_accessor :console
 
+    @@indexer_class_shortcuts = {
+      "basic" => "Traject::Indexer",
+      "marc" => "Traject::Indexer::MarcIndexer",
+      "xml" => "Traject::Indexer::NokogiriIndexer"
+    }
+
     def initialize(argv=ARGV)
       self.console = $stderr
 
@@ -135,37 +141,23 @@ module Traject
       return true
     end
 
+    # @return (Array<#read>, String)
     def get_input_io(argv)
-      # ARGF might be perfect for this, but problems with it include:
-      # * jruby is broken, no way to set it's encoding, leads to encoding errors reading non-ascii
-      #   https://github.com/jruby/jruby/issues/891
-      # * It's apparently not enough like an IO object for at least one of the ruby-marc XML
-      #   readers:
-      #     NoMethodError: undefined method `to_inputstream' for ARGF:Object
-      #     init at /Users/jrochkind/.gem/jruby/1.9.3/gems/marc-0.5.1/lib/marc/xml_parsers.rb:369
-      #
-      # * It INSISTS on reading from ARGFV, making it hard to test, or use when you want to give
-      #   it a list of files on something other than ARGV.
-      #
-      # So for now we do just one file, or stdin if specified. Sorry!
-
       filename = nil
+      io_arr = nil
       if options[:stdin]
         indexer.logger.info("Reading from standard input")
-        io = $stdin
-      elsif argv.length > 1
-        self.console.puts "Sorry, traject can only handle one input file at a time right now. `#{argv}` Exiting..."
-        exit 1
+        io_arr = [$stdin]
       elsif argv.length == 0
-        io = File.open(File::NULL, 'r')
+        io_arr = [File.open(File::NULL, 'r')]
         indexer.logger.info("Warning, no file input given. Use command-line argument '--stdin' to use standard input ")
       else
-        io = File.open(argv.first, 'r')
-        filename = argv.first
+        io_arr = argv.collect { |path| File.open(path, 'r') }
+        filename = argv.join(",")
         indexer.logger.info "Reading from #{filename}"
       end
 
-      return io, filename
+      return io_arr, filename
     end
 
     def load_configuration_files!(my_indexer, conf_files)
@@ -250,6 +242,7 @@ module Traject
       on 'd', 'debug', "Include debug log, -s log.level=debug"
       on 'h', 'help', "print usage information to stderr"
       on 'c', 'conf', 'configuration file path (repeatable)', :argument => true, :as => Array
+      on :i, 'indexer', "Traject indexer class name or shortcut", :argument => true, default: "marc"
       on :s, :setting, "settings: `-s key=value` (repeatable)", :argument => true, :as => Array
       on :r, :reader, "Set reader class, shortcut for -s reader_class_name=", :argument => true
       on :o, "output_file", "output file for Writer classes that write to files", :argument => true
@@ -266,7 +259,10 @@ module Traject
     end
 
     def initialize_indexer!
-      indexer = Traject::Indexer.new self.assemble_settings_hash(self.options)
+      indexer_class_name = @@indexer_class_shortcuts[options[:indexer]] || options[:indexer]
+      klass = Traject::Indexer.qualified_const_get(indexer_class_name)
+
+      indexer = klass.new self.assemble_settings_hash(self.options)
       load_configuration_files!(indexer, options[:conf])
 
       return indexer