traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,265 @@
1
+ # Details on Traject Indexing: from custom logic to Macros
2
+
3
+ Traject macros are a way of providing re-usable index mapping rules. Before we discuss how they work, we need to remind ourselves of the basic/direct Traject `to_field` indexing method.
4
+
5
+ ## How direct indexing logic works
6
+
7
+ Here's the simplest possible direct Traject mapping logic, duplicating the effects of the `literal` macro:
8
+
9
+ ~~~ruby
10
+ to_field("title") do |record, accumulator, context|
11
+ accumulator << "FIXED LITERAL"
12
+ end
13
+ ~~~
14
+
15
+ That `do` is just ruby `block` syntax, whereby we can pass a block of ruby code as an argument to a ruby method. We pass a block taking three arguments, labeled `record`, `accumulator`, and `context`, to the `to_field` method. The third 'context' object is optional, you can define it in your block or not, depending on if you want to use it.
16
+
17
+ The block is then stored by the Traject::Indexer, and called for each record indexed, with three arguments provided.
18
+
19
+ #### record argument
20
+
21
+ The record that gets passed to your block is a MARC::Record object (or, theoretically, any object that gets returned by a traject Reader). Your logic will usually examine the record to calculate the desired output.
22
+
23
+ ### accumulator argument
24
+
25
+ The accumulator argument is an array. At the end of your custom code, the accumulator
26
+ array should hold the output you want to send off, to the field specified in the `to_field`.
27
+
28
+ The accumulator is a reference to a ruby array, and you need to **modify** that array,
29
+ manipulating it in place with Array methods that mutate the array, like `concat`, `<<`,
30
+ `map!` or even `replace`.
31
+
32
+ You can't simply assign the accumulator variable to a different array, that won't work,
33
+ you need to modify the array in-place.
34
+
35
+ # Won't work, assigning variable
36
+ to_field('foo') do |rec, acc|
37
+ acc = ["some constant"] # WRONG!
38
+ end
39
+
40
+ # Won't work, assigning variable
41
+ to_field('foo') do |rec, acc|
42
+ acc << 'bill'
43
+ acc << 'dueber'
44
+ acc = acc.map{|str| str.upcase}
45
+ end # WRONG! WRONG! WRONG! WRONG! WRONG!
46
+
47
+
48
+ # Instead, do, modify array in place
49
+ to_field('foo') {|rec, acc| acc << "some constant" }
50
+ to_field('foo') do |rec, acc|
51
+ acc << 'bill'
52
+ acc << 'dueber'
53
+ acc.map!{|str| str.upcase} #notice using "map!" not just "map"
54
+ end
55
+
56
+ ### context argument
57
+
58
+ The third optional context argument
59
+
60
+ The third optional argument is a
61
+ [Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/github/traject-project/traject/Traject/Indexer/Context))
62
+ object. Most of the time you don't need it, but you can use it for
63
+ some sophisticated functionality, for example using these Context methods:
64
+
65
+ * `context.clipboard` A hash into which you can stuff values that you want to pass from one indexing step to another. For example, if you go through a bunch of work to query a database and get a result you'll need more than once, stick the results somewhere in the clipboard. This clipboard is record-specific, and won't persist between records.
66
+ * `context.position` The position of the record in the input file (e.g., was it the first record, second, etc.). Useful for error reporting
67
+ * `context.output_hash` A hash mapping the field names (generally defined in `to_field` calls) to an array of values to be sent to the writer associated with that field. This allows you to modify what goes to the writer without going through a `to_field` call -- you can just set `context.output_hash['myfield'] = ['my', 'values']` and you're set. See below for more examples
68
+ * `context.skip!(msg)` An assertion that this record should be ignored. No more indexing steps will be called, no results will be sent to the writer, and a `debug`-level log message will be written stating that the record was skipped.
69
+
70
+
71
+ ## Gotcha: Use closures to make your code more efficient
72
+
73
+ A _closure_ is a computer-science term that means "a piece of code
74
+ that remembers all the variables that were in scope when it was
75
+ created." In ruby, lambdas and blocks are closures. Method definitions
76
+ are not, which most of us have run across much to our chagrin.
77
+
78
+ Within the context of `traject`, this means you can define a variable
79
+ outside of a `to_field` or `each_record` block and it will be available
80
+ inside those blocks. And you only have to define it once.
81
+
82
+ That's useful to do for any object that is even a bit expensive
83
+ to create -- we can maximize the performance of our traject
84
+ indexing by creating those objects once outside the block,
85
+ instead of inside the block where it will be created
86
+ once per-record (every time the block is executed):
87
+
88
+ Compare:
89
+
90
+ ```ruby
91
+ # Create the transformer for every single record
92
+ to_field 'normalized_title' do |rec, acc|
93
+ transformer = My::Custom::Format::Transformer.new # Oh no! I'm doing this for each of my 10M records!
94
+ acc << transformer.transform(rec['245'].value)
95
+ end
96
+
97
+ # Create the transformer exactly once
98
+ transformer = My::Custom::Format::Transformer.new # Ahhh. Do it once.
99
+ to_field 'normalized_title' do |rec, acc|
100
+ acc << transformer.transform(rec['245'].value)
101
+ end
102
+ ```
103
+
104
+ Certain built-in traject calls have been optimized to be high performance
105
+ so it's safe to do them inside 'inner loop' blocks though.
106
+ That includes `Traject::TranslationMap.new` and `Traject::MarcExtractor.cached("xxx")`
107
+ (note #cached rather than #new there)
108
+
109
+
110
+ ## From block to lambda
111
+
112
+ In the ruby language, in addition to creating a code block as an argument
113
+ to a method with `do |args| ... end` or `{|arg| ... }`, we can also create
114
+ a code block to hold in a variable, with the `lambda` keyword:
115
+
116
+ always_output_foo = lambda do |record, accumulator|
117
+ accumulator << "FOO"
118
+ end
119
+
120
+ traject `to_field` is written so, as a convenience, it can take a lambda expression
121
+ stored in a variable as an alternative to a block:
122
+
123
+ to_field("always_has_foo", always_output_foo)
124
+
125
+ Why is this a convenience? Well, ordinarily it's not something we
126
+ need, but in fact it's what allows traject 'macros' as re-useable
127
+ code templates.
128
+
129
+
130
+ ## Macros
131
+
132
+ A Traject macro is a way to automatically create indexing rules via re-usable "templates".
133
+
134
+ Traject macros are simply methods that return ruby lambda/proc objects, possibly creating
135
+ them based on parameters passed in.
136
+
137
+ Here is in fact how the `literal` function is implemented:
138
+
139
+ ~~~ruby
140
+ def literal(value)
141
+ return lambda do |record, accumulator, context|
142
+ # because a lambda is a closure, we can define it in terms
143
+ # of the 'value' from the scope it's defined in!
144
+ accumulator << value
145
+ end
146
+ end
147
+ to_field("something", literal("something"))
148
+ ~~~
149
+
150
+ It's really as simple as that, that's all a Traject macro is. A function that takes parameters, and based on those parameters returns a lambda; the lambda is then passed to the `to_field` indexing method, or similar methods.
151
+
152
+ How do you make these methods available to the indexer?
153
+
154
+ Define it in a module:
155
+
156
+ ~~~ruby
157
+ # in a file literal_macro.rb
158
+ module LiteralMacro
159
+ def literal(value)
160
+ return lambda do |record, accumulator, context|
161
+ # because a lambda is a closure, we can define it in terms
162
+ # of the 'value' from the scope it's defined in!
163
+ accumulator << value
164
+ end
165
+ end
166
+ end
167
+ ~~~
168
+
169
+ And then use ordinary ruby `require` and `extend` to add it to the current Indexer file, by simply including this
170
+ in one of your config files:
171
+
172
+ ~~~
173
+ require 'literal_macro.rb'
174
+ extend LiteralMacro
175
+
176
+ to_field ...
177
+ ~~~
178
+
179
+ That's it. You can use the traject command line `-I` option to set the ruby load path, so your file will be findable via `require`. Or you can distribute it in a gem, and use straight rubygems and the `gem` command in your configuration file, or Bundler with traject command-line `-g` option.
180
+
181
+ ## Using a lambda _and_ a block
182
+
183
+ Traject macros (such as `extract_marc`) create and return a lambda. If
184
+ you include a lambda _and_ a block on a `to_field` call, the latter
185
+ gets the accumulator as it was filled in by the former.
186
+
187
+ ```ruby
188
+ # Get the titles and lowercase them
189
+ to_field 'lc_title', extract_marc('245') do |rec, acc, context|
190
+ acc.map!{|title| title.downcase}
191
+ end
192
+
193
+ # Build my own lambda and use it
194
+ mylam = lambda {|rec, acc| acc << 'one'} # just add a constant
195
+ to_field('foo', mylam) do |rec, acc, context|
196
+ acc << 'two'
197
+ end #=> context.output_hash['foo'] == ['one', 'two']
198
+
199
+
200
+ # You might also want to do something like this
201
+
202
+ to_field('foo', my_macro_that_doesnt_dedup) do |rec, acc|
203
+ acc.uniq!
204
+ end
205
+ ```
206
+
207
+ ## Manipulating `context.output_hash` directly
208
+
209
+ If you ask for the context argument, a [Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/Indexer/Context)), you have access to context.output_hash, which is
210
+ the hash of transformed output that will be sent to Solr (or any other Writer)
211
+
212
+ You can look in there to see any already transformed output and use it as the source
213
+ for new output. You can actually *write* to there manually, which can be useful
214
+ to write routines that affect more than one output field at once.
215
+
216
+ **Note**: Make sure you always assign an _array_ to, e.g., `context.output_hash['foo']`, not a single value!
217
+
218
+
219
+
220
+ ## each_record
221
+
222
+ All the previous discussion was in terms of `to_field` -- `each_record` is a similar
223
+ routine, to define logic that is executed for each record, but isn't fixed to write
224
+ to a single output field.
225
+
226
+ So `each_record` blocks have no `accumulator` argument, instead they either take a single
227
+ `record` argument; or both a `record` and a `context`.
228
+
229
+ `each_record` can be used for logging or notifying; computing intermediate
230
+ results; or writing to more than one field at once.
231
+
232
+ ~~~ruby
233
+ each_record do |record, context|
234
+ if is_it_bad?(record)
235
+ context.skip!("Skipping bad record")
236
+ else
237
+ context.clipboard[:expensive_result] = calculate_expensive_thing(record)
238
+ end
239
+ end
240
+
241
+ each_record do |record, context|
242
+ (one, two) = calculate_two_things_from(record)
243
+
244
+ context.output_hash["first_field"] ||= []
245
+ context.output_hash["first_field"] << one
246
+
247
+ context.output_hash["second_field"] ||= []
248
+ context.output_hash["second_field"] << two
249
+ end
250
+ ~~~
251
+
252
+ traject doesn't come with any macros written for use with
253
+ `each_record`, but they could be created if useful --
254
+ just methods that return lambda's taking the right
255
+ args for `each_record`.
256
+
257
+ ## More tips and gotchas about indexing steps
258
+
259
+ * **All your `to_field` and `each_record` steps are run _in the order in which they were initially evaluated_**. That means that the order you call your config files can potentially make a difference if you're screwing around stuffing stuff into the context clipboard or whatnot.
260
+
261
+ * **`to_field` can be called multiple times on the same field name.** If you call the same field name multiple times, all the values will be sent to the writer.
262
+
263
+ * **Once you call `context.skip!(msg)` no more index steps will be run for that record**. So if you have any cleanup code, you'll need to make sure to call it yourself.
264
+
265
+ * **By default, `traject` indexing runs multi-threaded**. In the current implementation, the indexing steps for one record are *not* split across threads, but different records can be processed simultaneously by more than one thread. That means you need to make sure your code is thread-safe (or always set `processing_thread_pool` to 0).
@@ -0,0 +1,47 @@
1
+ # Other traject command-line commands
2
+
3
+ The traject command line supports a few other miscellaneous commands with
4
+ the "-x command" switch. The usual traject command line is actually
5
+ the `process` command, `traject -x process ...` is the same as leaving out
6
+ the `-x process`.
7
+
8
+ ## Commit
9
+
10
+ `traject -x commit` will send a 'commit' message to the Solr server
11
+ specified in setting `solr.url`. Other parts of configuration will
12
+ be ignored, but don't hurt.
13
+
14
+ traject -x commit -s solr.url=http://some.com/solr
15
+
16
+ Or with a config file that includes a solr.url setting:
17
+
18
+ traject -x commit -c config_file.rb
19
+
20
+ ## marcout
21
+
22
+ The `marcout` command will skip all processing/mapping, and simply
23
+ serialize marc out to a file stream.
24
+
25
+ This is mainly useful when you're using a custom reader to read
26
+ marc from a database or something, but could also be used to
27
+ convert marc from one format to another or something.
28
+
29
+ Will write to stdout, or set the `output_file` setting (`-o` shortcut).
30
+
31
+ Set the `marcout.type` setting to 'xml' or 'binary' for type of output.
32
+ Or to `human` for human readable display of marc (that is not meant for
33
+ machine readability, but can be good for manual diagnostics.)
34
+
35
+ If outputting type binary, setting `marcout.allow_oversized` to
36
+ true or false (boolean or string), to pass that to the MARC::Writer.
37
+ If set to true, then oversized MARC records can still be serialized,
38
+ with length bytes zero'd out -- technically illegal, but can
39
+ be read by MARC::Reader in permissive mode.
40
+
41
+ If you have MARC-XML *input*, you need to
42
+ set the `marc_source.type` setting to XML for xml input.
43
+
44
+ ~~~bash
45
+ traject -x marcout somefile.marc -o output.xml -s marcout.type=xml
46
+ traject -x marcout -s marc_source.type=xml somefile.xml -c configuration.rb
47
+ ~~~
data/doc/settings.md ADDED
@@ -0,0 +1,101 @@
1
+ # Traject settings
2
+
3
+ Traject settings are a flat list of key/value pairs -- a single
4
+ Hash, not nested. Keys are always strings, and dots (".") can be
5
+ used for grouping and namespacing.
6
+
7
+ Values are usually strings, but occasionally something else. String values can be easily
8
+ set via the command line.
9
+
10
+ Settings can be set in configuration files, usually like:
11
+
12
+ ~~~ruby
13
+ settings do
14
+ provide "key", "value"
15
+ end
16
+ ~~~
17
+
18
+ or on the command line: `-s key=value`. There are also some command line shortcuts
19
+ for commonly used settings, see `traject -h`.
20
+
21
+ `provide` will only set the key if it was previously unset, so first time to set 'wins'. And command-line
22
+ settings are applied first of all. It's recommended you use `provide`.
23
+
24
+ `store` is also available, and forces setting of the new value overriding any previous value set.
25
+
26
+ ## Known settings
27
+
28
+ * `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
29
+ yes, this is fixed to STDERR, regardless of your logging setup.
30
+ * `.` for every batch of records read and parsed
31
+ * `^` for every batch of records batched and queued for adding to solr
32
+ (possibly in thread pool)
33
+ * `%` for completing of a Solr 'add'
34
+ * `!` when threadpool for solr add has a full queue, so solr add is
35
+ going to happen in calling queue -- means solr adding can't
36
+ keep up with production.
37
+
38
+ * `json_writer.pretty_print`: used by the JsonWriter, if set to true, will output pretty printed json (with added whitespace) for easier human readability. Default false.
39
+
40
+ * `log.file`: filename to send logging, or 'STDOUT' or 'STDERR' for those streams. Default STDERR
41
+
42
+ * `log.error_file`: Default nil, if set then all log lines of ERROR and higher will be _additionally_
43
+ sent to error file named.
44
+
45
+ * `log.format`: Formatting string used by Yell logger. https://github.com/rudionrails/yell/wiki/101-formatting-log-messages
46
+
47
+ * `log.level`: Log this level and above. Default 'info', set to eg 'debug' to get potentially more logging info,
48
+ or 'error' to get less. https://github.com/rudionrails/yell/wiki/101-setting-the-log-level
49
+
50
+ * `log.batch_size`: If set to a number N (or string representation), will output a progress line to
51
+ log. (by default as INFO, but see log.batch_size.severity)
52
+
53
+ * `log.batch_size.severity`: If `log.batch_size` is set, what logger severity level to log to. Default "INFO", set to "DEBUG" etc if desired.
54
+
55
+ * `marc_source.type`: default 'binary'. Can also set to 'xml' or (not yet implemented todo) 'json'. Command line shortcut `-t`
56
+
57
+ * `marcout.allow_oversized`: Used with `-x marcout` command to output marc when outputting
58
+ as ISO 2709 binary, set to true or string "true", and the MARC::Writer will have
59
+ allow_oversized=true set, allowing oversized records to be serialized with length
60
+ bytes zero'd out -- technically illegal, but can be read by MARC::Reader in permissive mode.
61
+
62
+ * `output_file`: Output file to write to for operations that write to files: For instance the `marcout` command,
63
+ or Writer classes that write to files, like Traject::JsonWriter. Has an shortcut
64
+ `-o` on command line.
65
+
66
+ * `processing_thread_pool` Number of threads in the main thread pool used for processing
67
+ records with input rules. On JRuby or Rubinius, defaults to 1 less than the number of processors detected on your machine. On other ruby platforms, defaults to 1. Set to 0 or nil
68
+ to disable thread pool, and do all processing in main thread.
69
+
70
+ Choose a pool size based on size of your machine, and complexity of your indexing rules, you
71
+ might want to try different sizes and measure which works best for you.
72
+ Probably no reason for it ever to be more than number of cores on indexing machine.
73
+
74
+
75
+ * `reader_class_name`: a Traject Reader class, used by the indexer as a source
76
+ of records. Defaults to Traject::Marc4JReader (using the Java Marc4J
77
+ library) on JRuby; Traject::MarcReader (using the ruby marc gem) otherwise.
78
+ Command-line shortcut `-r`
79
+
80
+ * `solr.url`: URL to connect to a solr instance for indexing, eg http://example.org:8983/solr . Command-line short-cut `-u`.
81
+
82
+ * `solr.version`: Set to eg "1.4.0", "4.3.0"; currently un-used, but in the future will control
83
+ change some default settings, and/or sanity check and warn you if you're doing something
84
+ that might not work with that version of solr. Set now for help in the future.
85
+
86
+ * `solr_writer.batch_size`: size of batches that SolrJsonWriter will send docs to Solr in. Default 100. Set to nil,
87
+ 0, or 1, and SolrJsonWriter will do one http transaction per document, no batching.
88
+
89
+ * `solr_writer.commit_on_close`: default false, set to true to have the solr writer send an explicit commit message to Solr after indexing.
90
+
91
+
92
+ * `solr_writer.thread_pool`: Defaults to 1 (single bg thread). A thread pool is used for submitting docs
93
+ to solr. Set to 0 or nil to disable threading. Set to 1,
94
+ there will still be a single bg thread doing the adds.
95
+ May make sense to set higher than number of cores on your
96
+ indexing machine, as these threads will mostly be waiting
97
+ on Solr. Speed/capacity of your solr might be more relevant.
98
+ Note that processing_thread_pool threads can end up submitting
99
+ to solr too, if solr_json_writer.thread_pool is full.
100
+
101
+ * `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJsonWriter, other writers for debugging or writing to files are also available. See Traject::Indexer for more info. Command line shortcut `-w`
@@ -0,0 +1,48 @@
1
+ require 'net/http'
2
+ require 'open-uri'
3
+
4
+
5
+
6
+
7
+ namespace :load_maps do
8
+
9
+ desc "Load MARC geo codes by screen-scraping LC"
10
+ task :marc_geographic do
11
+ begin
12
+ require 'nokogiri'
13
+ rescue LoadError => e
14
+ $stderr.puts "\n load_maps:marc_geographic task requires nokogiri"
15
+ $stderr.puts " Try `gem install nokogiri` and try again. Exiting...\n\n"
16
+ exit 1
17
+ end
18
+
19
+ source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"
20
+
21
+ filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)
22
+ file = File.open( filename, "w:utf-8" )
23
+
24
+ $stderr.puts "Writing to `#{filename}` ..."
25
+
26
+ html = Nokogiri::HTML(open(source_url).read)
27
+
28
+ file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
29
+ file.puts "# Scraped from #{source_url} at #{Time.now}"
30
+ file.puts "# Intentionally includes discontinued codes."
31
+
32
+ file.puts "\n"
33
+ html.css("tr").each do |line|
34
+ code = line.css("td.code").inner_text.strip
35
+ unless code.nil? || code.empty?
36
+ code.gsub!(/^\-/, '') # treat discontinued code like any other
37
+
38
+ label = line.css("td[2]").inner_text.strip
39
+
40
+ label.gsub!(/\n */, ' ') # get rid of newlines that file now sometimes contains, bah.
41
+ label.gsub!("'", "''") # yaml escapes single-quotes by doubling them, weird but true.
42
+
43
+ file.puts "'#{code}': '#{label}'"
44
+ end
45
+ end
46
+ $stderr.puts "Done."
47
+ end
48
+ end