traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
+ begin
2
+ require 'bundler/setup'
3
+ require "bundler/gem_tasks"
4
+ rescue LoadError
5
+ puts "You must `gem install bundler` and `bundle install` to run rake tasks"
6
+ end
7
+
8
+ require 'rake'
9
+ require 'rake/testtask'
10
+
11
+ task :default => [:test]
12
+
13
+ Rake::TestTask.new do |t|
14
+ t.pattern = 'test/**/*_test.rb'
15
+ t.libs.push 'test', 'test_support'
16
+ end
17
+
18
+ # Not documented well, but this seems to be
19
+ # the way to load rake tasks from other files
20
+ #import "lib/tasks/load_map.rake"
21
+ Dir.glob('lib/tasks/*.rake').each { |r| import r}
data/bench/bench.rb ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env jruby
2
+ $:.unshift File.expand_path('../../lib', __FILE__)
3
+
4
+ require 'traject/command_line'
5
+
6
+ require 'benchmark'
7
+
8
+ unless ARGV.size >= 2
9
+ STDERR.puts "\n Benchmark two (or more) different config files with both 0 and 3 threads against the given marc file\n"
10
+ STDERR.puts "\n Usage:"
11
+ STDERR.puts " jruby --server bench.rb config1.rb config2.rb [...configN.rb] filename.mrc\n\n"
12
+ exit
13
+ end
14
+
15
+ filename = ARGV.pop
16
+ config_files = ARGV
17
+
18
+ puts RUBY_DESCRIPTION
19
+ Benchmark.bmbm do |x|
20
+ [0, 3].each do |threads|
21
+ config_files.each do |cf|
22
+ x.report("#{cf} (#{threads})") do
23
+ cmdline = Traject::CommandLine.new(["-c", cf, '-s', 'log.file=bench.log', '-s', "processing_thread_pool=#{threads}", filename])
24
+ cmdline.execute
25
+ end
26
+ end
27
+ end
28
+ end
29
+
30
+
data/bin/traject ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
+ # If we're loading from source instead of a gem, rubygems
5
+ # isn't setting load paths for us, so we need to set it ourselves
6
+ self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
7
+ unless $LOAD_PATH.include? self_load_path
8
+ $LOAD_PATH << self_load_path
9
+ end
10
+
11
+ require 'traject/command_line'
12
+
13
+ cmdline = Traject::CommandLine.new(ARGV)
14
+ result = cmdline.execute
15
+
16
+ exit 1 unless result # non-zero exit status on process telling us there's problems.
@@ -0,0 +1,243 @@
1
+ # Hints for running traject as a batch job
2
+
3
+ Maybe as a cronjob. Maybe via a batch shell script that executes
4
+ traject, and maybe even pipelines it together with other commands.
5
+
6
+ These are things you might want to do with traject. Some potential problem points
7
+ with suggested solutions, and additional hints.
8
+
9
+ ## Ruby version setting
10
+
11
+ For best performance, traject should run under jruby. You will
12
+ ordinarily have jruby installed under a ruby version switcher -- we
13
+ recommend [chruby](https://github.com/postmodern/chruby) over other choices,
14
+ but other popular choices include rvm and rbenv.
15
+
16
+ Especially when running under a cron job, it can be difficult to
17
+ set things up so traject runs under jruby -- and then when you add
18
+ bundler into it, things can get positively byzantine. It's not you,
19
+ this gets confusing.
20
+
21
+ It can sometimes be useful to create a wrapper script for traject
22
+ that takes care of making sure it's running under the right ruby
23
+ version.
24
+
25
+ ### for chruby
26
+
27
+ Simply run with:
28
+
29
+ chruby-exec jruby -- traject {other arguments}
30
+
31
+ Whether specifying that directly in a crontab, or in a shell script
32
+ that needs to call traject, etc. In a crontab environment, it'll actually need
33
+ you to set PATH and SHELL variables, as specified in the [chruby docs](https://github.com/postmodern/chruby/wiki/Cron)
34
+
35
+
36
+ So simple you might not need a wrapper script, but it might still be convenient to create one. Say
37
+ you put a `jruby-traject` at `/usr/local/bin/jruby-traject`, that
38
+ looks like this:
39
+
40
+ #!/usr/bin/env bash
41
+
42
+ chruby-exec jruby -- traject "$@"
43
+
44
+ Now you can just execute `jruby-traject {arguments}`, and execute traject
45
+ in a jruby environment. (In a crontab, you'll still need to fix your
46
+ PATH and SHELL env variables for `chruby-exec` to work, either in the
47
+ crontab or in this wrapper script)
48
+
49
+ ### chruby monster wrapper script
50
+
51
+ I am still not sure if this is a good idea, but here's an example of
52
+ a wrapper script for chruby that will take care of the ENV even
53
+ when running in a crontab, use chruby-exec only if jruby isn't
54
+ already the default ruby, and add in `bundle exec` too.
55
+
56
+ ~~~bash
57
+ #!/usr/bin/env bash
58
+
59
+ # A wrapper for traject that uses chruby to make sure jruby
60
+ # is being used before calling traject, and then calls
61
+ # traject with bundle exec from within our traject project
62
+ # dir.
63
+
64
+ # Make sure /usr/local/bin is in PATH for chruby-exec,
65
+ # which it's not ordinarily in a cronjob.
66
+ if [[ ":$PATH:" != *":/usr/local/bin:"* ]]
67
+ then
68
+ export PATH=$PATH:/usr/local/bin
69
+ fi
70
+ # chruby needs SHELL set, which it won't be from a crontab
71
+ export SHELL=/bin/bash
72
+
73
+ # Find the dir based on location of this wrapper script,
74
+ # then use that dir to cd to for the bundle exec to find
75
+ # the right Gemfile.
76
+ traject_dir=$(cd `dirname "${BASH_SOURCE[0]}"` && pwd)
77
+
78
+ # do we need to use chruby to switch to jruby?
79
+ if [[ "$(ruby -v)" == *jruby* ]]
80
+ then
81
+ ruby_picker="" # nothing needed "
82
+ else
83
+ ruby_picker="chruby-exec jruby --"
84
+ fi
85
+
86
+ cmd="BUNDLE_GEMFILE=$traject_dir/Gemfile $ruby_picker bundle exec traject $@"
87
+
88
+ echo $cmd
89
+ eval $cmd
90
+ ~~~
91
+
92
+ This monster script can perhaps be adapted for rbenv or rvm.
93
+
94
+ ### for rbenv
95
+
96
+ If running in an interactive shell that has had rbenv set up for
97
+ it, you can use rbenv's standard mechanism to say to execute
98
+ something in jruby:
99
+
100
+ RBENV_VERSION=jruby-1.7.2 traject {args}
101
+
102
+ You do need to specify the exact version of jruby, I don't think
103
+ there's any way to say 'latest installed jruby'. You could do the
104
+ same thing for any batch scripts you're writing -- just have
105
+ them set that `RBENV_VERSION` environment variable before
106
+ executing traject.
107
+
108
+ If you're running inside a cronjob, things get a bit trickier,
109
+ because rbenv isn't normally set up in the limited environment
110
+ of cron tasks. One way to deal with this is to have your
111
+ cronjob explicitly execute in a bash login shell, that
112
+ will then have rbenv set up -- so long as it's running
113
+ under an account with rbenv set up properly!
114
+
115
+ # in a cronfile
116
+ # 10 * * * * /bin/bash -l -c 'RBENV_VERSION=jruby-1.7.2 traject {args}'
117
+
118
+ (Better way? Doc pull requests welcome.)
119
+
120
+
121
+ ### for rvm
122
+
123
+ See rvm's [own docs on use with cron](http://rvm.io/integration/cron), it gets a bit confusing.
124
+ But here's one way, using a wrapper script. It does require you to
125
+ identify and hard-code in where your rvm is installed, and exactly which
126
+ version of jruby you want to execute with (will have to be updated if you upgrade
127
+ jruby). (Is there a better way? Doc pull requests welcome! rvm confuses me!)
128
+
129
+ Make a file at `/usr/local/bin/jruby-traject` that looks like this:
130
+
131
+
132
+ ~~~bash
133
+ #!/usr/bin/env bash
134
+
135
+ # load rvm ruby
136
+ source /home/MY_ACCT/.rvm/environments/jruby-1.7.3
137
+
138
+ traject "$@"
139
+ ~~~
140
+
141
+ You have to use your actual account rvm is installed in for MY_ACCT.
142
+ Or, if you have a global install of rvm instead of a user-account one,
143
+ it might be at `/usr/local/rvm/environments`... instead.
144
+
145
+ Now any account, in a crontab, in an interactive shell, wherever,
146
+ can just execute `jruby-traject {arguments}`, and execute traject
147
+ in a jruby environment.
148
+
149
+
150
+ ### Bundler too?
151
+
152
+ If you're running with bundler too, you could make a wrapper file specific to
153
+ a particular traject project and its Gemfile, by combining the `bundle exec` into
154
+ your wrapper file. For instance, for chruby, this works:
155
+
156
+ #!/usr/bin/env bash
157
+
158
+ chruby-exec jruby -- BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject "$@"
159
+
160
+ Now you can call your wrapper script from anywhere and with any active ruby,
161
+ and execute it in jruby and with the dependencies specified in the Gemfile
162
+ for your project.
163
+
164
+ ## Exit codes
165
+
166
+ Traject tries to always return a well-behaved unix exit code -- 0 for success,
167
+ non-0 for error.
168
+
169
+ You should be able to rely on this in your batch bash scripts: if you want to abort
170
+ further processing if traject failed for some reason, you can check traject's
171
+ exit code.
172
+
173
+ If an uncaught exception happens, traject will return non-0.
174
+
175
+ There are some kinds of errors which prevent traject from indexing
176
+ one or more records, but traject may still continue processing
177
+ the other records. If any records have been skipped in this way,
178
+ traject will _also_ return a non-0 failure exit code. (Is this good?
179
+ Does it need to be configurable?)
180
+
181
+ In these cases, information about errors that led to skipped records should
182
+ be output as ERROR level in the logs.
183
+
184
+ ## Logs and Error Reporting
185
+
186
+ By default, traject outputs all logging to stderr. This is often just what
187
+ you want for a batch or automated process, where there might be some wrapper
188
+ script which captures stderr and puts it where you want it.
189
+
190
+ However, it's easy enough to tell traject to log somewhere else. Either on
191
+ the command-line:
192
+
193
+ traject -s log.file=/some/other/file/log {other args}
194
+
195
+ Or in a traject configuration file, setting the `log.file` configuration setting.
196
+
197
+ ### separate error log
198
+
199
+ You can also separately have a duplicate log file created with ONLY log messages of
200
+ level ERROR and higher (meaning ERROR and FATAL), with the `log.error_file` setting.
201
+ Then, if there are any lines in this error log file at all, you know something bad
202
+ happened, maybe your batch process needs to notify someone, or abort further
203
+ steps in the batch process.
204
+
205
+ traject -s log.file=/var/log/traject.log -s log.error_file=/var/log/traject_error.log {more args}
206
+
207
+ The error lines will be in the main log file, and also duplicated in the error
208
+ log file.
209
+
210
+ ### Completely customizable logging with yell
211
+
212
+ Traject uses the [yell](https://github.com/rudionrails/yell) gem for logging.
213
+ You can configure the logger directly to implement whatever crazy logging rules you might
214
+ want, so long as yell supports them. But yell is pretty flexible.
215
+
216
+ Recall that traject config files are just ruby, executed in the context
217
+ of a Traject::Indexer. You can set the Indexer's `logger` to a yell logger
218
+ object you configure yourself however you like:
219
+
220
+ ~~~ruby
221
+ # inside a traject configuration file
222
+
223
+ self.logger = Yell.new do |l|
224
+ l.level = 'gte.info' # will only pass :info and above to the adapters
225
+
226
+ l.adapter :datefile, 'production.log', level: 'lte.warn' # anything lower or equal to :warn
227
+ l.adapter :datefile, 'error.log', level: 'gte.error' # anything greater or equal to :error
228
+ end
229
+ ~~~
230
+
231
+ **note** it's important to use `self.logger =`, or due to
232
+ ruby idiosyncrasies you'll just be setting a local variable, not the Indexer's
233
+ logger attribute.
234
+
235
+ See [yell](https://github.com/rudionrails/yell) docs for more, you can
236
+ do whatever you can make yell do -- just write ruby.
237
+
238
+ ### Bundler
239
+
240
+ For automated batch execution, we recommend you consider using
241
+ bundler to manage any gem dependencies. See the [Extending
242
+ With Your Own Code](./extending.md) traject docs for
243
+ information on how traject integrates with bundler.
data/doc/extending.md ADDED
@@ -0,0 +1,190 @@
1
+ # Extending With Your Own Code
2
+
3
+ Beyond very simple logic, you'll want to write your own ruby code,
4
+ organize it in files other than traject config files, but then
5
+ use it in traject config files.
6
+
7
+ You might want to have code local to your traject project; or you
8
+ might want to use ruby gems to share code between projects and developers.
9
+ A given project may use both of these techniques.
10
+
11
+ Here are some suggestions for how to do this, along with mention
12
+ of a couple traject features meant to make it easier.
13
+
14
+ ## Expert Summary
15
+
16
+ * The traject `-I` command line argument can be used to list directories to
17
+ add to the load path, similar to the `ruby -I` argument. You
18
+ can then 'require' local project files from the load path.
19
+ * Or modify the ruby `$LOAD_PATH` manually at the top of a traject config file you are loading.
20
+ * translation map files found in a
21
+ "./translation_maps" subdir on the load path will be found
22
+ for Traject translation maps.
23
+ * You can use Bundler with traject simply by creating a Gemfile with `bundler init`,
24
+ and then running command line with `bundle exec traject` or
25
+ even `BUNDLE_GEMFILE=path/to/Gemfile bundle exec traject`
26
+
27
+ ## Custom code local to your project
28
+
29
+ You might want local translation maps, or local ruby
30
+ code. Here's a standard recommended way you might lay out
31
+ this extra code in the file system, using a 'lib'
32
+ directory kept next to your traject config files:
33
+
34
+ ~~~
35
+ - my_traject/
36
+ * config_file.rb
37
+ - lib/
38
+ * my_macros.rb
39
+ * my_utility.rb
40
+ - translation_maps/
41
+ * my_map.yaml
42
+ ~~~
43
+
44
+
45
+ The `my_macros.rb` file might contain a simple [macro](./macros.md)
46
+ in a module called `MyMacros`.
47
+
48
+ The `my_utility.rb` file might contain, say, a module of utility
49
+ methods, `MyUtility.some_utility`, etc.
50
+
51
+ To refer to ruby code from another file, we use the standard
52
+ ruby `require` statement to bring in the files:
53
+
54
+ ~~~ruby
55
+ # config_file.rb
56
+
57
+ require 'my_macros'
58
+ require 'my_utility'
59
+
60
+ # Now that MyMacros is available, extend it into the indexer,
61
+ # and use it:
62
+
63
+ extend MyMacros
64
+
65
+ to_field "title", my_some_macro
66
+
67
+ # And likewise, we can use our utility methods:
68
+
69
+ to_field "title" do |record, accumulator, context|
70
+ accumulator << MyUtility.some_utility(record)
71
+ end
72
+ ~~~
73
+
74
+ **But wait!** This won't work yet, because ruby won't be
75
+ able to find the file in `require 'my_macros'`. To fix
76
+ that, we want to add our local `lib` directory to the
77
+ ruby `$LOAD_PATH`, a standard ruby feature.
78
+
79
+ Traject provides a way for you to add to the load path
80
+ from the traject command line, the `-I` flag:
81
+
82
+ traject -I ./lib -c ./config_file.rb ...
83
+
84
+ Or, you can hard-code a `$LOAD_PATH` change directly in your
85
+ config file. You'll have to use some weird looking
86
+ ruby code to create a file path relative to the current
87
+ file (the config_file.rb), and then make sure it's
88
+ an absolute path. (Should we add a traject utility
89
+ method for this?)
90
+
91
+ ~~~ruby
92
+ # at top of config_file.rb...
93
+
94
+ $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), './lib'))
95
+ ~~~
96
+
97
+ That's pretty much it!
98
+
99
+ What about that translation map? The `$LOAD_PATH` modification
100
+ took care of that too, the Traject::TranslationMap will look
101
+ up translation map definition files
102
+ in a `./translation_maps` subdir on the load path, as in `./lib/translation_maps` in this case.
103
+
104
+
105
+ ## Using gems in your traject project
106
+
107
+ If there is certain logic that is common between (traject or other)
108
+ projects, it makes sense to put it in a ruby gem.
109
+
110
+ We won't go into detail about creating ruby gems, but we
111
+ do recommend you use the `bundle gem my_gem_name` command to create
112
+ a skeleton of your gem
113
+ ([one tutorial here](http://railscasts.com/episodes/245-new-gem-with-bundler?view=asciicast)).
114
+ This will also make available rake commands to install your gem locally
115
+ (`rake install`), or release it to the rubygems server (`rake release`).
116
+
117
+ There are two main methods to use a gem in your traject project,
118
+ with straight rubygems, or with bundler.
119
+
120
+ Without bundler is simpler. Simply `gem install some_gem` from the
121
+ command line, and now you can `require` that gem in your traject
122
+ config file, and use what it provides:
123
+
124
+ ~~~ruby
125
+ #some_traject_config.rb
126
+
127
+ require 'some_gem'
128
+
129
+ SomeGem.whatever!
130
+ ~~~
131
+
132
+ A gem can provide traject translation map definitions
133
+ in a `lib/translation_maps` sub-directory, and traject will be able to find those
134
+ translation maps when the gem is loaded. (Because gems'
135
+ `./lib` directories are by default added to the ruby load path.)
136
+
137
+ ### Or, with bundler:
138
+
139
+ However, if you then move your traject project to another system,
140
+ where you haven't yet installed the `some_gem`, then running
141
+ traject with this config file will, of course, fail. Or if you
142
+ move your traject project to another system with a slightly
143
+ different version of `some_gem`, your traject indexing could
144
+ behave differently in confusing ways. As the number of gems
145
+ you are using increases, managing this gets increasingly
146
+ confusing.
147
+
148
+ [bundler](http://bundler.io/) was invented to make this kind of dependency management
149
+ more straightforward and reliable. We recommend you consider using
150
+ bundler, especially for traject installations where traject will
151
+ be run via automated batch jobs on production servers.
152
+
153
+ Bundler's behavior is based on a `Gemfile` that lists your
154
+ project dependencies. You can create a starter skeleton
155
+ by running `bundler init`, probably in the directory
156
+ right next to your traject config files.
157
+
158
+ Then specify what gems your traject project will use,
159
+ possibly with version restrictions, in the [Gemfile](http://bundler.io/v1.3/gemfile.html) --
160
+ **do** include `gem 'traject'` in the Gemfile.
161
+
162
+ Run `bundle install` from the directory with the Gemfile, on any system
163
+ at any time, to make sure specified gems are installed.
164
+
165
+ **Run traject** with `bundle exec` to have bundler set up the environment
166
+ from your Gemfile. You can `cd` into the directory containing the Gemfile,
167
+ so bundler can find it:
168
+
169
+ $ cd /some/where
170
+ $ bundle exec traject -c some_traject_config.rb ...
171
+
172
+ Or you can use the BUNDLE_GEMFILE environment variable to tell bundler where
173
+ to find the Gemfile, and run from any directory at all:
174
+
175
+ $ BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject -c /path/to/some_config.rb ...
176
+
177
+ Bundler will make sure the specified versions of all gems are used by
178
+ traject, and also make sure no gems except those specified in the gemfile
179
+ are available to the program, for a reliable reproducible environment.
180
+
181
+ You should still `require` the gem in your traject config file,
182
+ then just refer to what it provides in your config code as usual.
183
+
184
+ You should check both the `Gemfile` and the `Gemfile.lock`
185
+ that bundler creates into your source control repo. The
186
+ `Gemfile.lock` specifies _exactly_ what versions of
187
+ gem dependencies are currently being used, so you can get the exact
188
+ same dependency environment on different servers.
189
+
190
+ See the [bundler documentation](http://bundler.io/#getting-started), or google, for more information.