RubyGems - traject - Versions diffs - 2.0.0-java - Mend

traject 2.0.0-java

Files changed (104) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.travis.yml +27 -0
data/.yardopts +3 -0
data/Gemfile +12 -0
data/LICENSE.txt +20 -0
data/README.md +461 -0
data/Rakefile +21 -0
data/bench/bench.rb +30 -0
data/bin/traject +16 -0
data/doc/batch_execution.md +243 -0
data/doc/extending.md +190 -0
data/doc/indexing_rules.md +265 -0
data/doc/other_commands.md +47 -0
data/doc/settings.md +101 -0
data/lib/tasks/load_maps.rake +48 -0
data/lib/traject.rb +11 -0
data/lib/traject/command_line.rb +301 -0
data/lib/traject/csv_writer.rb +34 -0
data/lib/traject/debug_writer.rb +47 -0
data/lib/traject/delimited_writer.rb +110 -0
data/lib/traject/indexer.rb +613 -0
data/lib/traject/indexer/settings.rb +110 -0
data/lib/traject/json_writer.rb +51 -0
data/lib/traject/line_writer.rb +63 -0
data/lib/traject/macros/basic.rb +9 -0
data/lib/traject/macros/marc21.rb +223 -0
data/lib/traject/macros/marc21_semantics.rb +584 -0
data/lib/traject/macros/marc_format_classifier.rb +197 -0
data/lib/traject/marc_extractor.rb +410 -0
data/lib/traject/marc_reader.rb +89 -0
data/lib/traject/mock_reader.rb +97 -0
data/lib/traject/ndj_reader.rb +40 -0
data/lib/traject/null_writer.rb +22 -0
data/lib/traject/qualified_const_get.rb +40 -0
data/lib/traject/solr_json_writer.rb +277 -0
data/lib/traject/thread_pool.rb +161 -0
data/lib/traject/translation_map.rb +267 -0
data/lib/traject/util.rb +52 -0
data/lib/traject/version.rb +3 -0
data/lib/traject/yaml_writer.rb +9 -0
data/lib/translation_maps/lcc_top_level.yaml +26 -0
data/lib/translation_maps/marc_genre_007.yaml +9 -0
data/lib/translation_maps/marc_genre_leader.yaml +22 -0
data/lib/translation_maps/marc_geographic.yaml +589 -0
data/lib/translation_maps/marc_instruments.yaml +102 -0
data/lib/translation_maps/marc_languages.yaml +490 -0
data/test/debug_writer_test.rb +38 -0
data/test/delimited_writer_test.rb +104 -0
data/test/indexer/each_record_test.rb +59 -0
data/test/indexer/macros_marc21_semantics_test.rb +391 -0
data/test/indexer/macros_marc21_test.rb +190 -0
data/test/indexer/macros_test.rb +40 -0
data/test/indexer/map_record_test.rb +209 -0
data/test/indexer/read_write_test.rb +101 -0
data/test/indexer/settings_test.rb +152 -0
data/test/indexer/to_field_test.rb +77 -0
data/test/marc_extractor_test.rb +412 -0
data/test/marc_format_classifier_test.rb +98 -0
data/test/marc_reader_test.rb +110 -0
data/test/solr_json_writer_test.rb +248 -0
data/test/test_helper.rb +90 -0
data/test/test_support/245_no_ab.marc +1 -0
data/test/test_support/880_with_no_6.utf8.marc +1 -0
data/test/test_support/bad_subfield_code.marc +1 -0
data/test/test_support/bad_utf_byte.utf8.marc +1 -0
data/test/test_support/date_resort_to_260.marc +1 -0
data/test/test_support/date_type_r_missing_date2.marc +1 -0
data/test/test_support/date_with_u.marc +1 -0
data/test/test_support/demo_config.rb +155 -0
data/test/test_support/emptyish_record.marc +1 -0
data/test/test_support/escaped_character_reference.marc8.marc +1 -0
data/test/test_support/george_eliot.marc +1 -0
data/test/test_support/hebrew880s.marc +1 -0
data/test/test_support/louis_armstrong.marc +1 -0
data/test/test_support/manufacturing_consent.marc +1 -0
data/test/test_support/manuscript_online_thesis.marc +1 -0
data/test/test_support/microform_online_conference.marc +1 -0
data/test/test_support/multi_era.marc +1 -0
data/test/test_support/multi_geo.marc +1 -0
data/test/test_support/musical_cage.marc +1 -0
data/test/test_support/nature.marc +1 -0
data/test/test_support/one-marc8.mrc +1 -0
data/test/test_support/online_only.marc +1 -0
data/test/test_support/packed_041a_lang.marc +1 -0
data/test/test_support/test_data.utf8.json +30 -0
data/test/test_support/test_data.utf8.marc.xml +2609 -0
data/test/test_support/test_data.utf8.mrc +1 -0
data/test/test_support/test_data.utf8.mrc.gz +0 -0
data/test/test_support/the_business_ren.marc +1 -0
data/test/translation_map_test.rb +225 -0
data/test/translation_maps/bad_ruby.rb +8 -0
data/test/translation_maps/bad_yaml.yaml +1 -0
data/test/translation_maps/both_map.rb +1 -0
data/test/translation_maps/both_map.yaml +1 -0
data/test/translation_maps/default_literal.rb +10 -0
data/test/translation_maps/default_passthrough.rb +10 -0
data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
data/test/translation_maps/properties_map.properties +5 -0
data/test/translation_maps/ruby_map.rb +10 -0
data/test/translation_maps/translate_array_test.yaml +8 -0
data/test/translation_maps/yaml_map.yaml +7 -0
data/traject.gemspec +47 -0
metadata +382 -0

data/Rakefile ADDED Viewed

@@ -0,0 +1,21 @@
+begin
+  require 'bundler/setup'
+  require "bundler/gem_tasks"
+rescue LoadError
+  puts "You must `gem install bundler` and `bundle install` to run rake tasks"
+end
+require 'rake'
+require 'rake/testtask'
+task :default => [:test]
+Rake::TestTask.new do |t|
+  t.pattern = 'test/**/*_test.rb'
+  t.libs.push 'test', 'test_support'
+end
+# Not documented well, but this seems to be
+# the way to load rake tasks from other files
+#import "lib/tasks/load_map.rake"
+Dir.glob('lib/tasks/*.rake').each { |r| import r}

data/bench/bench.rb ADDED Viewed

@@ -0,0 +1,30 @@
+#!/usr/bin/env jruby
+$:.unshift File.expand_path('../../lib', __FILE__)
+require 'traject/command_line'
+require 'benchmark'
+unless ARGV.size >= 2
+  STDERR.puts "\n     Benchmark two (or more) different config files with both 0 and 3 threads against the given marc file\n"
+  STDERR.puts "\n     Usage:"
+  STDERR.puts "         jruby --server bench.rb config1.rb config2.rb [...configN.rb] filename.mrc\n\n"
+  exit
+end
+filename = ARGV.pop
+config_files = ARGV
+puts RUBY_DESCRIPTION
+Benchmark.bmbm do |x|
+  [0, 3].each do |threads|
+    config_files.each do |cf|
+      x.report("#{cf} (#{threads})") do
+        cmdline = Traject::CommandLine.new(["-c", cf, '-s', 'log.file=bench.log', '-s', "processing_thread_pool=#{threads}", filename])
+        cmdline.execute
+      end
+    end
+  end
+end

data/bin/traject ADDED Viewed

@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+# If we're loading from source instead of a gem, rubygems
+# isn't setting load paths for us, so we need to set it ourselves
+self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
+unless $LOAD_PATH.include? self_load_path
+  $LOAD_PATH << self_load_path
+end
+require 'traject/command_line'
+cmdline = Traject::CommandLine.new(ARGV)
+result = cmdline.execute
+exit 1 unless result # non-zero exit status on process telling us there's problems.

data/doc/batch_execution.md ADDED Viewed

@@ -0,0 +1,243 @@
+# Hints for running traject as a batch job
+Maybe as a cronjob. Maybe via a batch shell script that executes
+traject, and maybe even pipelines it together with other commands.
+These are things you might want to do with traject. Some potential problem points
+with suggested solutions, and additional hints.
+## Ruby version setting
+For best performance, traject should run under jruby. You will
+ordinarily have jruby installed under a ruby version switcher -- we
+recommend [chruby](https://github.com/postmodern/chruby) over other choices,
+but other popular choices include rvm and rbenv.
+Especially when running under a cron job, it can be difficult to
+set things up so traject runs under jruby -- and then when you add
+bundler into it, things can get positively byzantine. It's not you,
+this gets confusing.
+It can sometimes be useful to create a wrapper script for traject
+that takes care of making sure it's running under the right ruby
+version.
+### for chruby
+Simply run with:
+    chruby-exec jruby -- traject {other arguments}
+Whether specifying that directly in a crontab, or in a shell script
+that needs to call traject, etc. In a crontab environment, it'll actually need
+you to set PATH and SHELL variables, as specified in the [chruby docs](https://github.com/postmodern/chruby/wiki/Cron)
+So simple you might not need a wrapper script, but it might still be convenient to create one. Say
+you put a `jruby-traject` at `/usr/local/bin/jruby-traject`, that
+looks like this:
+    #!/usr/bin/env bash
+    chruby-exec jruby -- traject "$@"
+Now you can just execute `jruby-traject {arguments}`, and execute traject
+in a jruby environment. (In a crontab, you'll still need to fix your
+PATH and SHELL env variables for `chruby-exec` to work, either in the
+crontab or in this wrapper script)
+### chruby monster wrapper script
+I am still not sure if this is a good idea, but here's an example of
+a wrapper script for chruby that will take care of the ENV even
+when running in a crontab, use chruby-exec only if jruby isn't
+already the default ruby, and add in `bundle exec` too.
+~~~bash
+#!/usr/bin/env bash
+# A wrapper for traject that uses chruby to make sure jruby
+# is being used before calling traject, and then calls
+# traject with bundle exec from within our traject project
+# dir.
+# Make sure /usr/local/bin is in PATH for chruby-exec,
+# which it's not ordinarily in a cronjob.
+if [[ ":$PATH:" != *":/usr/local/bin:"* ]]
+then
+  export PATH=$PATH:/usr/local/bin
+fi
+# chruby needs SHELL set, which it won't be from a crontab
+export SHELL=/bin/bash
+# Find the dir based on location of this wrapper script,
+# then use that dir to cd to for the bundle exec to find
+# the right Gemfile.
+traject_dir=$(cd `dirname "${BASH_SOURCE[0]}"` && pwd)
+# do we need to use chruby to switch to jruby?
+if [[ "$(ruby -v)" == *jruby* ]]
+then
+  ruby_picker="" # nothing needed "
+else
+  ruby_picker="chruby-exec jruby --"
+fi
+cmd="BUNDLE_GEMFILE=$traject_dir/Gemfile $ruby_picker bundle exec traject $@"
+echo $cmd
+eval $cmd
+~~~
+This monster script can perhaps be adapted for rbenv or rvm.
+### for rbenv
+If running in an interactive shell that has had rbenv set up for
+it, you can use rbenv's standard mechanism to say to execute
+something in jruby:
+    RBENV_VERSION=jruby-1.7.2 traject {args}
+You do need to specify the exact version of jruby, I don't think
+there's any way to say 'latest installed jruby'. You could do the
+same thing for any batch scripts you're writing -- just have
+them set that `RBENV_VERSION` environment variable before
+executing traject.
+If you're running inside a cronjob, things get a bit trickier,
+because rbenv isn't normally set up in the limited environment
+of cron tasks. One way to deal with this is to have your
+cronjob explicitly execute in a bash login shell, that
+will then have rbenv set up -- so long as it's running
+under an account with rbenv set up properly!
+    # in a cronfile
+    # 10 * * * * /bin/bash -l -c 'RBENV_VERSION=jruby-1.7.2 traject {args}'
+(Better way? Doc pull requests welcome.)
+### for rvm
+See rvm's [own docs on use with cron](http://rvm.io/integration/cron), it gets a bit confusing.
+But here's one way, using a wrapper script. It does require you to
+identify and hard-code in where your rvm is installed, and exactly which
+version of jruby you want to execute with (will have to be updated if you upgrade
+jruby). (Is there a better way? Doc pull requests welcome! rvm confuses me!)
+Make a file at `/usr/local/bin/jruby-traject` that looks like this:
+~~~bash
+#!/usr/bin/env bash
+# load rvm ruby
+source /home/MY_ACCT/.rvm/environments/jruby-1.7.3
+traject "$@"
+~~~
+You have to use your actual account rvm is installed in for MY_ACCT.
+Or, if you have a global install of rvm instead of a user-account one,
+it might be at `/usr/local/rvm/environments`... instead.
+Now any account, in a crontab, in an interactive shell, wherever,
+can just execute `jruby-traject {arguments}`, and execute traject
+in a jruby environment.
+### Bundler too?
+If you're running with bundler too, you could make a wrapper file specific to
+a particular traject project and its Gemfile, by combining the `bundle exec` into
+your wrapper file.  For instance,  for chruby, this works:
+    #!/usr/bin/env bash
+    chruby-exec jruby -- BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject "$@"
+Now you can call your wrapper script from anywhere and with any active ruby,
+and execute it in jruby and with the dependencies specified in the Gemfile
+for your project.
+## Exit codes
+Traject tries to always return a well-behaved unix exit code -- 0 for success,
+non-0 for error.
+You should be able to rely on this in your batch bash scripts: if you want to abort
+further processing when traject has failed for some reason, you can check traject's
+exit code.
+If an uncaught exception happens, traject will return non-0.
+There are some kinds of errors which prevent traject from indexing
+one or more records, but traject may still continue processing
+the other records. If any records have been skipped in this way,
+traject will _also_ return a non-0 failure exit code. (Is this good?
+Does it need to be configurable?)
+In these cases, information about errors that led to skipped records should
+be output as ERROR level in the logs.
+## Logs and Error Reporting
+By default, traject outputs all logging to stderr.  This is often just what
+you want for a batch or automated process, where there might be some wrapper
+script which captures stderr and puts it where you want it.
+However, it's easy enough to tell traject to log somewhere else. Either on
+the command-line:
+    traject -s log.file=/some/other/file/log {other args}
+Or in a traject configuration file, setting the `log.file` configuration setting.
+### separate error log
+You can also separately have a duplicate log file created with ONLY log messages of
+level ERROR and higher (meaning ERROR and FATAL), with the `log.error_file` setting.
+Then, if there are any lines in this error log file at all, you know something bad
+happened, maybe your batch process needs to notify someone, or abort further
+steps in the batch process.
+    traject -s log.file=/var/log/traject.log -s log.error_file=/var/log/traject_error.log {more args}
+The error lines will be in the main log file, and also duplicated in the error
+log file.
+### Completely customizable logging with yell
+Traject uses the [yell](https://github.com/rudionrails/yell) gem for logging.
+You can configure the logger directly to implement whatever crazy logging rules you might
+want, so long as yell supports them. But yell is pretty flexible.
+Recall that traject config files are just ruby, executed in the context
+of a Traject::Indexer. You can set the Indexer's `logger` to a yell logger
+object you configure yourself however you like:
+~~~ruby
+  # inside a traject configuration file
+  self.logger = Yell.new do |l|
+    l.level = 'gte.info' # will only pass :info and above to the adapters
+    l.adapter :datefile, 'production.log', level: 'lte.warn' # anything lower or equal to :warn
+    l.adapter :datefile, 'error.log', level: 'gte.error' # anything greater or equal to :error
+  end
+~~~
+**note** it's important to use `self.logger =`, or due to
+ruby idiosyncrasies you'll just be setting a local variable, not the Indexer's
+logger attribute.
+See [yell](https://github.com/rudionrails/yell) docs for more; you can
+do whatever you can make yell do, just write ruby.
+### Bundler
+For automated batch execution, we recommend you consider using
+bundler to manage any gem dependencies. See the [Extending
+With Your Own Code](./extending.md) traject docs for
+information on how traject integrates with bundler.

data/doc/extending.md ADDED Viewed

@@ -0,0 +1,190 @@
+# Extending With Your Own Code
+Beyond very simple logic, you'll want to write your own ruby code,
+organize it in files other than traject config files, but then
+use it in traject config files.
+You might want to have code local to your traject project; or you
+might want to use ruby gems to share code between projects and developers.
+A given project may use both of these techniques.
+Here are some suggestions for how to do this, along with mention
+of a couple traject features meant to make it easier.
+## Expert Summary
+* The traject `-I` command line argument can be used to list directories to
+  add to the load path, similar to the `ruby -I` argument. You
+  can then 'require' local project files from the load path.
+  * Or modify the ruby `$LOAD_PATH` manually at the top of a traject config file you are loading.
+  * translation map files found in a
+    "./translation_maps" subdir on the load path will be found
+    for Traject translation maps.
+* You can use Bundler with traject simply by creating a Gemfile with `bundler init`,
+  and then running command line with `bundle exec traject` or
+  even `BUNDLE_GEMFILE=path/to/Gemfile bundle exec traject`
+## Custom code local to your project
+You might want local translation maps, or local ruby
+code. Here's a standard recommended way you might lay out
+this extra code in the file system, using a 'lib'
+directory kept next to your traject config files:
+~~~
+- my_traject/
+  * config_file.rb
+  - lib/
+    * my_macros.rb
+    * my_utility.rb
+    - translation_maps/
+      * my_map.yaml
+~~~
+The `my_macros.rb` file might contain a simple [macro](./macros.md)
+in a module called `MyMacros`.
+The `my_utility.rb` file might contain, say, a module of utility
+methods, `MyUtility.some_utility`, etc.
+To refer to ruby code from another file, we use the standard
+ruby `require` statement to bring in the files:
+~~~ruby
+# config_file.rb
+require 'my_macros'
+require 'my_utility'
+# Now that MyMacros is available, extend it into the indexer,
+# and use it:
+extend MyMacros
+to_field "title", my_some_macro
+# And likewise, we can use our utility methods:
+to_field "title" do |record, accumulator, context|
+  accumulator << MyUtility.some_utility(record)
+end
+~~~
+**But wait!** This won't work yet, because ruby won't be
+able to find the file in `require 'my_macros'`. To fix
+that, we want to add our local `lib` directory to the
+ruby `$LOAD_PATH`, a standard ruby feature.
+Traject provides a way for you to add to the load path
+from the traject command line, the `-I` flag:
+    traject -I ./lib -c ./config_file.rb ...
+Or, you can hard-code a `$LOAD_PATH` change directly in your
+config file. You'll have to use some weird looking
+ruby code to create a file path relative to the current
+file (the config_file.rb), and then make sure it's
+an absolute path. (Should we add a traject utility
+method for this?)
+~~~ruby
+# at top of config_file.rb...
+$LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), './lib'))
+~~~
+That's pretty much it!
+What about that translation map? The `$LOAD_PATH` modification
+took care of that too, the Traject::TranslationMap will look
+up translation map definition files
+in a `./translation_maps` subdir on the load path, as in `./lib/translation_maps` in this case.
+## Using gems in your traject project
+If there is certain logic that is common between (traject or other)
+projects, it makes sense to put it in a ruby gem.
+We won't go into detail about creating ruby gems, but we
+do recommend you use the `bundle gem my_gem_name` command to create
+a skeleton of your gem
+([one tutorial here](http://railscasts.com/episodes/245-new-gem-with-bundler?view=asciicast)).
+This will also make available rake commands to install your gem locally
+(`rake install`), or release it to the rubygems server (`rake release`).
+There are two main methods to use a gem in your traject project,
+with straight rubygems, or with bundler.
+Without bundler is simpler. Simply `gem install some_gem` from the
+command line, and now you can `require` that gem in your traject
+config file, and use what it provides:
+~~~ruby
+#some_traject_config.rb
+require 'some_gem'
+SomeGem.whatever!
+~~~
+A gem can provide traject translation map definitions
+in a `lib/translation_maps` sub-directory, and traject will be able to find those
+translation maps when the gem is loaded. (Because gems'
+`./lib` directories are by default added to the ruby load path.)
+### Or, with bundler:
+However, if you then move your traject project to another system,
+where you haven't yet installed the `some_gem`, then running
+traject with this config file will, of course, fail. Or if you
+move your traject project to another system with a slightly
+different version of `some_gem`, your traject indexing could
+behave differently in confusing ways. As the number of gems
+you are using increases, managing this gets increasingly
+confusing.
+[bundler](http://bundler.io/) was invented to make this kind of dependency management
+more straightforward and reliable. We recommend you consider using
+bundler, especially for traject installations where traject will
+be run via automated batch jobs on production servers.
+Bundler's behavior is based on a `Gemfile` that lists your
+project dependencies. You can create a starter skeleton
+by running `bundler init`, probably in the directory
+right next to your traject config files.
+Then specify what gems your traject project will use,
+possibly with version restrictions, in the [Gemfile](http://bundler.io/v1.3/gemfile.html) --
+**do** include `gem 'traject'` in the Gemfile.
+Run `bundle install` from the directory with the Gemfile, on any system
+at any time, to make sure specified gems are installed.
+**Run traject** with `bundle exec` to have bundler set up the environment
+from your Gemfile. You can `cd` into the directory containing the Gemfile,
+so bundler can find it:
+    $ cd /some/where
+    $ bundle exec traject -c some_traject_config.rb ...
+Or you can use the BUNDLE_GEMFILE environment variable to tell bundler where
+to find the Gemfile, and run from any directory at all:
+    $ BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject -c /path/to/some_config.rb ...
+Bundler will make sure the specified versions of all gems are used by
+traject, and also make sure no gems except those specified in the gemfile
+are available to the program, for a reliable reproducible environment.
+You should still `require` the gem in your traject config file,
+then just refer to what it provides in your config code as usual.
+You should check both the `Gemfile` and the `Gemfile.lock`
+that bundler creates into your source control repo. The
+`Gemfile.lock` specifies _exactly_ what versions of
+gem dependencies are currently being used, so you can get the exact
+same dependency environment on different servers.
+See the [bundler documentation](http://bundler.io/#getting-started), or google, for more information.