traject 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Load bundler's environment and its gem tasks when bundler is present.
# If it can't be loaded, print a hint and continue with the tasks below.
begin
  require 'bundler/setup'
  require "bundler/gem_tasks"
rescue LoadError
  puts "You must `gem install bundler` and `bundle install` to run rake tasks"
end

require 'rake'
require 'rake/testtask'

# A bare `rake` invocation runs the test task.
task default: [:test]

# Test task: picks up every *_test.rb under test/, with 'test' and
# 'test_support' added to the load path for the test run.
Rake::TestTask.new do |test_task|
  test_task.pattern = 'test/**/*_test.rb'
  test_task.libs.push('test', 'test_support')
end

# Rake's `import` is how task definitions kept in other files get loaded;
# pull in every .rake file found under lib/tasks.
Dir.glob('lib/tasks/*.rake').each do |rake_file|
  import rake_file
end
|
data/bench/bench.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env jruby
# Benchmarks one or more traject config files against a single marc file,
# running each config with processing_thread_pool set to 0 and to 3.
#
# Usage:
#   jruby --server bench.rb config1.rb config2.rb [...configN.rb] filename.mrc

# Make the lib directory of this checkout loadable.
$:.unshift File.expand_path('../../lib', __FILE__)

require 'traject/command_line'

require 'benchmark'

# Too few arguments: print usage and stop. Exit with a non-zero status so
# calling scripts can tell that no benchmark was actually run.
unless ARGV.size >= 2
  STDERR.puts "\n Benchmark two (or more) different config files with both 0 and 3 threads against the given marc file\n"
  STDERR.puts "\n Usage:"
  STDERR.puts " jruby --server bench.rb config1.rb config2.rb [...configN.rb] filename.mrc\n\n"
  exit 1
end

# The last argument is the marc file; everything before it is a config file.
filename = ARGV.pop
config_files = ARGV

puts RUBY_DESCRIPTION

# bmbm runs a rehearsal pass before the measured pass.
Benchmark.bmbm do |x|
  [0, 3].each do |threads|
    config_files.each do |cf|
      x.report("#{cf} (#{threads})") do
        # Logging goes to bench.log so it doesn't clutter the benchmark report.
        cmdline = Traject::CommandLine.new(["-c", cf, '-s', 'log.file=bench.log', '-s', "processing_thread_pool=#{threads}", filename])
        cmdline.execute
      end
    end
  end
end
|
29
|
+
|
30
|
+
|
data/bin/traject
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point for traject.
#
# When run from a source checkout rather than an installed gem, rubygems
# has not put our lib directory on the load path, so add it here unless
# it is already present.
lib_dir = File.expand_path("../lib", File.dirname(__FILE__))
$LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)

require 'traject/command_line'

# Hand the raw arguments to the command line runner. A falsy return value
# from #execute signals problems, which we report as a non-zero exit status.
succeeded = Traject::CommandLine.new(ARGV).execute

exit 1 unless succeeded
|
@@ -0,0 +1,243 @@
|
|
1
|
+
# Hints for running traject as a batch job
|
2
|
+
|
3
|
+
Maybe as a cronjob. Maybe via a batch shell script that executes
|
4
|
+
traject, and maybe even pipelines it together with other commands.
|
5
|
+
|
6
|
+
These are things you might want to do with traject. Some potential problem points
|
7
|
+
with suggested solutions, and additional hints.
|
8
|
+
|
9
|
+
## Ruby version setting
|
10
|
+
|
11
|
+
For best performance, traject should run under jruby. You will
|
12
|
+
ordinarily have jruby installed under a ruby version switcher -- we
|
13
|
+
recommend [chruby](https://github.com/postmodern/chruby) over other choices,
|
14
|
+
but other popular choices include rvm and rbenv.
|
15
|
+
|
16
|
+
Especially when running under a cron job, it can be difficult to
|
17
|
+
set things up so traject runs under jruby -- and then when you add
|
18
|
+
bundler into it, things can get positively byzantine. It's not you,
|
19
|
+
this gets confusing.
|
20
|
+
|
21
|
+
It can sometimes be useful to create a wrapper script for traject
|
22
|
+
that takes care of making sure it's running under the right ruby
|
23
|
+
version.
|
24
|
+
|
25
|
+
### for chruby
|
26
|
+
|
27
|
+
Simply run with:
|
28
|
+
|
29
|
+
chruby-exec jruby -- traject {other arguments}
|
30
|
+
|
31
|
+
Whether specifying that directly in a crontab, or in a shell script
|
32
|
+
that needs to call traject, etc. In a crontab environment, it'll actually need
|
33
|
+
you to set PATH and SHELL variables, as specified in the [chruby docs](https://github.com/postmodern/chruby/wiki/Cron)
|
34
|
+
|
35
|
+
|
36
|
+
So simple you might not need a wrapper script, but it might still be convenient to create one. Say
|
37
|
+
you put a `jruby-traject` at `/usr/local/bin/jruby-traject`, that
|
38
|
+
looks like this:
|
39
|
+
|
40
|
+
#!/usr/bin/env bash
|
41
|
+
|
42
|
+
chruby-exec jruby -- traject "$@"
|
43
|
+
|
44
|
+
Now you can just execute `jruby-traject {arguments}`, and execute traject
|
45
|
+
in a jruby environment. (In a crontab, you'll still need to fix your
|
46
|
+
PATH and SHELL env variables for `chruby-exec` to work, either in the
|
47
|
+
crontab or in this wrapper script)
|
48
|
+
|
49
|
+
### chruby monster wrapper script
|
50
|
+
|
51
|
+
I am still not sure if this is a good idea, but here's an example of
|
52
|
+
a wrapper script for chruby that will take care of the ENV even
|
53
|
+
when running in a crontab, use chruby-exec only if jruby isn't
|
54
|
+
already the default ruby, and add in `bundle exec` too.
|
55
|
+
|
56
|
+
~~~bash
|
57
|
+
#!/usr/bin/env bash
|
58
|
+
|
59
|
+
# A wrapper for traject that uses chruby to make sure jruby
|
60
|
+
# is being used before calling traject, and then calls
|
61
|
+
# traject with bundle exec from within our traject project
|
62
|
+
# dir.
|
63
|
+
|
64
|
+
# Make sure /usr/local/bin is in PATH for chruby-exec,
|
65
|
+
# which it's not ordinarily in a cronjob.
|
66
|
+
if [[ ":$PATH:" != *":/usr/local/bin:"* ]]
|
67
|
+
then
|
68
|
+
export PATH=$PATH:/usr/local/bin
|
69
|
+
fi
|
70
|
+
# chruby needs SHELL set, which it won't be from a crontab
|
71
|
+
export SHELL=/bin/bash
|
72
|
+
|
73
|
+
# Find the dir based on location of this wrapper script,
|
74
|
+
# then use that dir to cd to for the bundle exec to find
|
75
|
+
# the right Gemfile.
|
76
|
+
traject_dir=$(cd `dirname "${BASH_SOURCE[0]}"` && pwd)
|
77
|
+
|
78
|
+
# do we need to use chruby to switch to jruby?
|
79
|
+
if [[ "$(ruby -v)" == *jruby* ]]
|
80
|
+
then
|
81
|
+
ruby_picker="" # nothing needed "
|
82
|
+
else
|
83
|
+
ruby_picker="chruby-exec jruby --"
|
84
|
+
fi
|
85
|
+
|
86
|
+
cmd="BUNDLE_GEMFILE=$traject_dir/Gemfile $ruby_picker bundle exec traject $@"
|
87
|
+
|
88
|
+
echo $cmd
|
89
|
+
eval $cmd
|
90
|
+
~~~
|
91
|
+
|
92
|
+
This monster script can perhaps be adapted for rbenv or rvm.
|
93
|
+
|
94
|
+
### for rbenv
|
95
|
+
|
96
|
+
If running in an interactive shell that has had rbenv set up for
|
97
|
+
it, you can use rbenv's standard mechanism to say to execute
|
98
|
+
something in jruby:
|
99
|
+
|
100
|
+
RBENV_VERSION=jruby-1.7.2 traject {args}
|
101
|
+
|
102
|
+
You do need to specify the exact version of jruby, I don't think
|
103
|
+
there's any way to say 'latest installed jruby'. You could do the
|
104
|
+
same thing for any batch scripts you're writing -- just have
|
105
|
+
them set that `RBENV_VERSION` environment variable before
|
106
|
+
executing traject.
|
107
|
+
|
108
|
+
If you're running inside a cronjob, things get a bit trickier,
|
109
|
+
because rbenv isn't normally set up in the limited environment
|
110
|
+
of cron tasks. One way to deal with this is to have your
|
111
|
+
cronjob explicitly execute in a bash login shell, that
|
112
|
+
will then have rbenv set up -- so long as it's running
|
113
|
+
under an account with rbenv set up properly!
|
114
|
+
|
115
|
+
# in a cronfile
|
116
|
+
# 10 * * * * /bin/bash -l -c 'RBENV_VERSION=jruby-1.7.2 traject {args}'
|
117
|
+
|
118
|
+
(Better way? Doc pull requests welcome.)
|
119
|
+
|
120
|
+
|
121
|
+
### for rvm
|
122
|
+
|
123
|
+
See rvm's [own docs on use with cron](http://rvm.io/integration/cron), it gets a bit confusing.
|
124
|
+
But here's one way, using a wrapper script. It does require you to
|
125
|
+
identify and hard-code in where your rvm is installed, and exactly which
|
126
|
+
version of jruby you want to execute with (will have to be updated if you upgrade
|
127
|
+
jruby). (Is there a better way? Doc pull requests welcome! rvm confuses me!)
|
128
|
+
|
129
|
+
Make a file at `/usr/local/bin/jruby-traject` that looks like this:
|
130
|
+
|
131
|
+
|
132
|
+
~~~bash
|
133
|
+
#!/usr/bin/env bash
|
134
|
+
|
135
|
+
# load rvm ruby
|
136
|
+
source /home/MY_ACCT/.rvm/environments/jruby-1.7.3
|
137
|
+
|
138
|
+
traject "$@"
|
139
|
+
~~~
|
140
|
+
|
141
|
+
You have to use your actual account rvm is installed in for MY_ACCT.
|
142
|
+
Or, if you have a global install of rvm instead of a user-account one,
|
143
|
+
it might be at `/usr/local/rvm/environments`... instead.
|
144
|
+
|
145
|
+
Now any account, in a crontab, in an interactive shell, wherever,
|
146
|
+
can just execute `jruby-traject {arguments}`, and execute traject
|
147
|
+
in a jruby environment.
|
148
|
+
|
149
|
+
|
150
|
+
### Bundler too?
|
151
|
+
|
152
|
+
If you're running with bundler too, you could make a wrapper file specific to
|
153
|
+
a particular traject project and its Gemfile, by combining the `bundle exec` into
|
154
|
+
your wrapper file. For instance, for chruby, this works:
|
155
|
+
|
156
|
+
#!/usr/bin/env bash
|
157
|
+
|
158
|
+
chruby-exec jruby -- BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject "$@"
|
159
|
+
|
160
|
+
Now you can call your wrapper script from anywhere and with any active ruby,
|
161
|
+
and execute it in jruby and with the dependencies specified in the Gemfile
|
162
|
+
for your project.
|
163
|
+
|
164
|
+
## Exit codes
|
165
|
+
|
166
|
+
Traject tries to always return a well-behaved unix exit code -- 0 for success,
|
167
|
+
non-0 for error.
|
168
|
+
|
169
|
+
You should be able to rely on this in your batch bash scripts, if you want to abort
|
170
|
+
further processing if traject failed for some reason, you can check traject's
|
171
|
+
exit code.
|
172
|
+
|
173
|
+
If an uncaught exception happens, traject will return non-0.
|
174
|
+
|
175
|
+
There are some kinds of errors which prevent traject from indexing
|
176
|
+
one or more records, but traject may still continue processing
|
177
|
+
the other records. If any records have been skipped in this way,
|
178
|
+
traject will _also_ return a non-0 failure exit code. (Is this good?
|
179
|
+
Does it need to be configurable?)
|
180
|
+
|
181
|
+
In these cases, information about errors that led to skipped records should
|
182
|
+
be output as ERROR level in the logs.
|
183
|
+
|
184
|
+
## Logs and Error Reporting
|
185
|
+
|
186
|
+
By default, traject outputs all logging to stderr. This is often just what
|
187
|
+
you want for a batch or automated process, where there might be some wrapper
|
188
|
+
script which captures stderr and puts it where you want it.
|
189
|
+
|
190
|
+
However, it's easy enough to tell traject to log somewhere else. Either on
|
191
|
+
the command-line:
|
192
|
+
|
193
|
+
traject -s log.file=/some/other/file/log {other args}
|
194
|
+
|
195
|
+
Or in a traject configuration file, setting the `log.file` configuration setting.
|
196
|
+
|
197
|
+
### separate error log
|
198
|
+
|
199
|
+
You can also separately have a duplicate log file created with ONLY log messages of
|
200
|
+
level ERROR and higher (meaning ERROR and FATAL), with the `log.error_file` setting.
|
201
|
+
Then, if there are any lines in this error log file at all, you know something bad
|
202
|
+
happened, maybe your batch process needs to notify someone, or abort further
|
203
|
+
steps in the batch process.
|
204
|
+
|
205
|
+
traject -s log.file=/var/log/traject.log -s log.error_file=/var/log/traject_error.log {more args}
|
206
|
+
|
207
|
+
The error lines will be in the main log file, and also duplicated in the error
|
208
|
+
log file.
|
209
|
+
|
210
|
+
### Completely customizable logging with yell
|
211
|
+
|
212
|
+
Traject uses the [yell](https://github.com/rudionrails/yell) gem for logging.
|
213
|
+
You can configure the logger directly to implement whatever crazy logging rules you might
|
214
|
+
want, so long as yell supports them. But yell is pretty flexible.
|
215
|
+
|
216
|
+
Recall that traject config files are just ruby, executed in the context
|
217
|
+
of a Traject::Indexer. You can set the Indexer's `logger` to a yell logger
|
218
|
+
object you configure yourself however you like:
|
219
|
+
|
220
|
+
~~~ruby
|
221
|
+
# inside a traject configuration file
|
222
|
+
|
223
|
+
self.logger = Yell.new do |l|
|
224
|
+
l.level = 'gte.info' # will only pass :info and above to the adapters
|
225
|
+
|
226
|
+
l.adapter :datefile, 'production.log', level: 'lte.warn' # anything lower or equal to :warn
|
227
|
+
l.adapter :datefile, 'error.log', level: 'gte.error' # anything greater or equal to :error
|
228
|
+
end
|
229
|
+
~~~
|
230
|
+
|
231
|
+
**note** it's important to use `self.logger =`, or due to
|
232
|
+
ruby idiosyncrasies you'll just be setting a local variable, not the Indexer's
|
233
|
+
logger attribute.
|
234
|
+
|
235
|
+
See [yell](https://github.com/rudionrails/yell) docs for more, you can
|
236
|
+
do whatever you can make yell do, just write ruby.
|
237
|
+
|
238
|
+
### Bundler
|
239
|
+
|
240
|
+
For automated batch execution, we recommend you consider using
|
241
|
+
bundler to manage any gem dependencies. See the [Extending
|
242
|
+
With Your Own Code](./extending.md) traject docs for
|
243
|
+
information on how traject integrates with bundler.
|
data/doc/extending.md
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
# Extending With Your Own Code
|
2
|
+
|
3
|
+
Beyond very simple logic, you'll want to write your own ruby code,
|
4
|
+
organize it in files other than traject config files, but then
|
5
|
+
use it in traject config files.
|
6
|
+
|
7
|
+
You might want to have code local to your traject project; or you
|
8
|
+
might want to use ruby gems to share code between projects and developers.
|
9
|
+
A given project may use both of these techniques.
|
10
|
+
|
11
|
+
Here are some suggestions for how to do this, along with mention
|
12
|
+
of a couple traject features meant to make it easier.
|
13
|
+
|
14
|
+
## Expert Summary
|
15
|
+
|
16
|
+
* The traject `-I` command line argument can be used to list directories to
|
17
|
+
add to the load path, similar to the `ruby -I` argument. You
|
18
|
+
can then 'require' local project files from the load path.
|
19
|
+
* Or modify the ruby `$LOAD_PATH` manually at the top of a traject config file you are loading.
|
20
|
+
* translation map files found in a
|
21
|
+
"./translation_maps" subdir on the load path will be found
|
22
|
+
for Traject translation maps.
|
23
|
+
* You can use Bundler with traject simply by creating a Gemfile with `bundler init`,
|
24
|
+
and then running command line with `bundle exec traject` or
|
25
|
+
even `BUNDLE_GEMFILE=path/to/Gemfile bundle exec traject`
|
26
|
+
|
27
|
+
## Custom code local to your project
|
28
|
+
|
29
|
+
You might want local translation maps, or local ruby
|
30
|
+
code. Here's a standard recommended way you might lay out
|
31
|
+
this extra code in the file system, using a 'lib'
|
32
|
+
directory kept next to your traject config files:
|
33
|
+
|
34
|
+
~~~
|
35
|
+
- my_traject/
|
36
|
+
* config_file.rb
|
37
|
+
- lib/
|
38
|
+
* my_macros.rb
|
39
|
+
* my_utility.rb
|
40
|
+
- translation_maps/
|
41
|
+
* my_map.yaml
|
42
|
+
~~~
|
43
|
+
|
44
|
+
|
45
|
+
The `my_macros.rb` file might contain a simple [macro](./macros.md)
|
46
|
+
in a module called `MyMacros`.
|
47
|
+
|
48
|
+
The `my_utility.rb` file might contain, say, a module of utility
|
49
|
+
methods, `MyUtility.some_utility`, etc.
|
50
|
+
|
51
|
+
To refer to ruby code from another file, we use the standard
|
52
|
+
ruby `require` statement to bring in the files:
|
53
|
+
|
54
|
+
~~~ruby
|
55
|
+
# config_file.rb
|
56
|
+
|
57
|
+
require 'my_macros'
|
58
|
+
require 'my_utility'
|
59
|
+
|
60
|
+
# Now that MyMacros is available, extend it into the indexer,
|
61
|
+
# and use it:
|
62
|
+
|
63
|
+
extend MyMacros
|
64
|
+
|
65
|
+
to_field "title", my_some_macro
|
66
|
+
|
67
|
+
# And likewise, we can use our utility methods:
|
68
|
+
|
69
|
+
to_field "title" do |record, accumulator, context|
|
70
|
+
accumulator << MyUtility.some_utility(record)
|
71
|
+
end
|
72
|
+
~~~
|
73
|
+
|
74
|
+
**But wait!** This won't work yet. Because ruby won't be
|
75
|
+
able to find the file in `require 'my_macros'`. To fix
|
76
|
+
that, we want to add our local `lib` directory to the
|
77
|
+
ruby `$LOAD_PATH`, a standard ruby feature.
|
78
|
+
|
79
|
+
Traject provides a way for you to add to the load path
|
80
|
+
from the traject command line, the `-I` flag:
|
81
|
+
|
82
|
+
traject -I ./lib -c ./config_file.rb ...
|
83
|
+
|
84
|
+
Or, you can hard-code a `$LOAD_PATH` change directly in your
|
85
|
+
config file. You'll have to use some weird looking
|
86
|
+
ruby code to create a file path relative to the current
|
87
|
+
file (the config_file.rb), and then make sure it's
|
88
|
+
an absolute path. (Should we add a traject utility
|
89
|
+
method for this?)
|
90
|
+
|
91
|
+
~~~ruby
|
92
|
+
# at top of config_file.rb...
|
93
|
+
|
94
|
+
$LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), './lib'))
|
95
|
+
~~~
|
96
|
+
|
97
|
+
That's pretty much it!
|
98
|
+
|
99
|
+
What about that translation map? The `$LOAD_PATH` modification
|
100
|
+
took care of that too, the Traject::TranslationMap will look
|
101
|
+
up translation map definition files
|
102
|
+
in a `./translation_maps` subdir on the load path, as in `./lib/translation_maps` in this case.
|
103
|
+
|
104
|
+
|
105
|
+
## Using gems in your traject project
|
106
|
+
|
107
|
+
If there is certain logic that is common between (traject or other)
|
108
|
+
projects, it makes sense to put it in a ruby gem.
|
109
|
+
|
110
|
+
We won't go into detail about creating ruby gems, but we
|
111
|
+
do recommend you use the `bundle gem my_gem_name` command to create
|
112
|
+
a skeleton of your gem
|
113
|
+
([one tutorial here](http://railscasts.com/episodes/245-new-gem-with-bundler?view=asciicast)).
|
114
|
+
This will also make available rake commands to install your gem locally
|
115
|
+
(`rake install`), or release it to the rubygems server (`rake release`).
|
116
|
+
|
117
|
+
There are two main methods to use a gem in your traject project,
|
118
|
+
with straight rubygems, or with bundler.
|
119
|
+
|
120
|
+
Without bundler is simpler. Simply `gem install some_gem` from the
|
121
|
+
command line, and now you can `require` that gem in your traject
|
122
|
+
config file, and use what it provides:
|
123
|
+
|
124
|
+
~~~ruby
|
125
|
+
#some_traject_config.rb
|
126
|
+
|
127
|
+
require 'some_gem'
|
128
|
+
|
129
|
+
SomeGem.whatever!
|
130
|
+
~~~
|
131
|
+
|
132
|
+
A gem can provide traject translation map definitions
|
133
|
+
in a `lib/translation_maps` sub-directory, and traject will be able to find those
|
134
|
+
translation maps when the gem is loaded. (Because gems'
|
135
|
+
`./lib` directories are by default added to the ruby load path.)
|
136
|
+
|
137
|
+
### Or, with bundler:
|
138
|
+
|
139
|
+
However, if you then move your traject project to another system,
|
140
|
+
where you haven't yet installed the `some_gem`, then running
|
141
|
+
traject with this config file will, of course, fail. Or if you
|
142
|
+
move your traject project to another system with a slightly
|
143
|
+
different version of `some_gem`, your traject indexing could
|
144
|
+
behave differently in confusing ways. As the number of gems
|
145
|
+
you are using increases, managing this gets increasingly
|
146
|
+
confusing.
|
147
|
+
|
148
|
+
[bundler](http://bundler.io/) was invented to make this kind of dependency management
|
149
|
+
more straightforward and reliable. We recommend you consider using
|
150
|
+
bundler, especially for traject installations where traject will
|
151
|
+
be run via automated batch jobs on production servers.
|
152
|
+
|
153
|
+
Bundler's behavior is based on a `Gemfile` that lists your
|
154
|
+
project dependencies. You can create a starter skeleton
|
155
|
+
by running `bundler init`, probably in the directory
|
156
|
+
right next to your traject config files.
|
157
|
+
|
158
|
+
Then specify what gems your traject project will use,
|
159
|
+
possibly with version restrictions, in the [Gemfile](http://bundler.io/v1.3/gemfile.html) --
|
160
|
+
**do** include `gem 'traject'` in the Gemfile.
|
161
|
+
|
162
|
+
Run `bundle install` from the directory with the Gemfile, on any system
|
163
|
+
at any time, to make sure specified gems are installed.
|
164
|
+
|
165
|
+
**Run traject** with `bundle exec` to have bundler set up the environment
|
166
|
+
from your Gemfile. You can `cd` into the directory containing the Gemfile,
|
167
|
+
so bundler can find it:
|
168
|
+
|
169
|
+
$ cd /some/where
|
170
|
+
$ bundle exec traject -c some_traject_config.rb ...
|
171
|
+
|
172
|
+
Or you can use the BUNDLE_GEMFILE environment variable to tell bundler where
|
173
|
+
to find the Gemfile, and run from any directory at all:
|
174
|
+
|
175
|
+
$ BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject -c /path/to/some_config.rb ...
|
176
|
+
|
177
|
+
Bundler will make sure the specified versions of all gems are used by
|
178
|
+
traject, and also make sure no gems except those specified in the gemfile
|
179
|
+
are available to the program, for a reliable reproducible environment.
|
180
|
+
|
181
|
+
You should still `require` the gem in your traject config file,
|
182
|
+
then just refer to what it provides in your config code as usual.
|
183
|
+
|
184
|
+
You should check both the `Gemfile` and the `Gemfile.lock`
|
185
|
+
that bundler creates into your source control repo. The
|
186
|
+
`Gemfile.lock` specifies _exactly_ what versions of
|
187
|
+
gem dependencies are currently being used, so you can get the exact
|
188
|
+
same dependency environment on different servers.
|
189
|
+
|
190
|
+
See the [bundler documentation](http://bundler.io/#getting-started), or google, for more information.
|