traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
data/Rakefile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
begin # bundler is optional here: if it cannot be loaded we only print a hint (see rescue below)
|
|
2
|
+
require 'bundler/setup'
|
|
3
|
+
require "bundler/gem_tasks"
|
|
4
|
+
rescue LoadError
|
|
5
|
+
puts "You must `gem install bundler` and `bundle install` to run rake tasks"
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
require 'rake'
|
|
9
|
+
require 'rake/testtask'
|
|
10
|
+
|
|
11
|
+
task :default => [:test] # a bare `rake` runs the test task
|
|
12
|
+
|
|
13
|
+
Rake::TestTask.new do |t|
|
|
14
|
+
t.pattern = 'test/**/*_test.rb' # every *_test.rb file at any depth under test/
|
|
15
|
+
t.libs.push 'test', 'test_support' # extra directories put on the load path for test runs
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Rake's `import` is not documented well, but it seems to be
|
|
19
|
+
|
|
20
|
+
# the way to load rake tasks from other files.
|
|
21
|
+
|
|
22
|
+
#import "lib/tasks/load_map.rake"
|
|
23
|
+
|
|
24
|
+
Dir.glob('lib/tasks/*.rake').each { |r| import r} # import every .rake file found in lib/tasks
|
data/bench/bench.rb
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/usr/bin/env jruby
|
|
2
|
+
$:.unshift File.expand_path('../../lib', __FILE__) # put the lib/ directory next to bench/ first on the load path
|
|
3
|
+
|
|
4
|
+
require 'traject/command_line'
|
|
5
|
+
|
|
6
|
+
require 'benchmark'
|
|
7
|
+
|
|
8
|
+
unless ARGV.size >= 2 # NOTE(review): accepts a single config file, although the usage text says two or more
|
|
9
|
+
STDERR.puts "\n Benchmark two (or more) different config files with both 0 and 3 threads against the given marc file\n"
|
|
10
|
+
STDERR.puts "\n Usage:"
|
|
11
|
+
STDERR.puts " jruby --server bench.rb config1.rb config2.rb [...configN.rb] filename.mrc\n\n"
|
|
12
|
+
exit # NOTE(review): bare exit reports success (status 0) even on a usage error -- confirm intended
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
filename = ARGV.pop # the last argument is the marc file
|
|
16
|
+
config_files = ARGV # everything left over is a config file
|
|
17
|
+
|
|
18
|
+
puts RUBY_DESCRIPTION
|
|
19
|
+
Benchmark.bmbm do |x| # bmbm does a rehearsal pass before the measured pass
|
|
20
|
+
[0, 3].each do |threads| # each config is timed with processing_thread_pool set to 0 and to 3
|
|
21
|
+
config_files.each do |cf|
|
|
22
|
+
x.report("#{cf} (#{threads})") do
|
|
23
|
+
cmdline = Traject::CommandLine.new(["-c", cf, '-s', 'log.file=bench.log', '-s', "processing_thread_pool=#{threads}", filename])
|
|
24
|
+
cmdline.execute
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
|
data/bin/traject
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# When we are loaded from a source checkout instead of an installed gem, rubygems
|
|
5
|
+
# has not set up the load path for us, so we add our own lib/ directory ourselves.
|
|
6
|
+
self_load_path = File.expand_path("../lib", File.dirname(__FILE__))
|
|
7
|
+
unless $LOAD_PATH.include? self_load_path
|
|
8
|
+
$LOAD_PATH << self_load_path
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
require 'traject/command_line'
|
|
12
|
+
|
|
13
|
+
cmdline = Traject::CommandLine.new(ARGV)
|
|
14
|
+
result = cmdline.execute # a falsy result is treated as failure below
|
|
15
|
+
|
|
16
|
+
exit 1 unless result # non-zero exit status when execute tells us there were problems.
|
data/doc/batch_execution.md
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
# Hints for running traject as a batch job
|
|
2
|
+
|
|
3
|
+
Maybe as a cronjob. Maybe via a batch shell script that executes
|
|
4
|
+
traject, and maybe even pipelines it together with other commands.
|
|
5
|
+
|
|
6
|
+
These are things you might want to do with traject. Some potential problem points
|
|
7
|
+
with suggested solutions, and additional hints.
|
|
8
|
+
|
|
9
|
+
## Ruby version setting
|
|
10
|
+
|
|
11
|
+
For best performance, traject should run under jruby. You will
|
|
12
|
+
ordinarily have jruby installed under a ruby version switcher -- we
|
|
13
|
+
recommend [chruby](https://github.com/postmodern/chruby) over other choices,
|
|
14
|
+
but other popular choices include rvm and rbenv.
|
|
15
|
+
|
|
16
|
+
Especially when running under a cron job, it can be difficult to
|
|
17
|
+
set things up so traject runs under jruby -- and then when you add
|
|
18
|
+
bundler into it, things can get positively byzantine. It's not you,
|
|
19
|
+
this gets confusing.
|
|
20
|
+
|
|
21
|
+
It can sometimes be useful to create a wrapper script for traject
|
|
22
|
+
that takes care of making sure it's running under the right ruby
|
|
23
|
+
version.
|
|
24
|
+
|
|
25
|
+
### for chruby
|
|
26
|
+
|
|
27
|
+
Simply run with:
|
|
28
|
+
|
|
29
|
+
chruby-exec jruby -- traject {other arguments}
|
|
30
|
+
|
|
31
|
+
Whether specifying that directly in a crontab, or in a shell script
|
|
32
|
+
that needs to call traject, etc. In a crontab environment, it'll actually need
|
|
33
|
+
you to set PATH and SHELL variables, as specified in the [chruby docs](https://github.com/postmodern/chruby/wiki/Cron)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
So simple you might not need a wrapper script, but it might still be convenient to create one. Say
|
|
37
|
+
you put a `jruby-traject` at `/usr/local/bin/jruby-traject`, that
|
|
38
|
+
looks like this:
|
|
39
|
+
|
|
40
|
+
#!/usr/bin/env bash
|
|
41
|
+
|
|
42
|
+
chruby-exec jruby -- traject "$@"
|
|
43
|
+
|
|
44
|
+
Now you can just execute `jruby-traject {arguments}`, and execute traject
|
|
45
|
+
in a jruby environment. (In a crontab, you'll still need to fix your
|
|
46
|
+
PATH and SHELL env variables for `chruby-exec` to work, either in the
|
|
47
|
+
crontab or in this wrapper script)
|
|
48
|
+
|
|
49
|
+
### chruby monster wrapper script
|
|
50
|
+
|
|
51
|
+
I am still not sure if this is a good idea, but here's an example of
|
|
52
|
+
a wrapper script for chruby that will take care of the ENV even
|
|
53
|
+
when running in a crontab, use chruby-exec only if jruby isn't
|
|
54
|
+
already the default ruby, and add in `bundle exec` too.
|
|
55
|
+
|
|
56
|
+
~~~bash
|
|
57
|
+
#!/usr/bin/env bash
|
|
58
|
+
|
|
59
|
+
# A wrapper for traject that uses chruby to make sure jruby
|
|
60
|
+
# is being used before calling traject, and then calls
|
|
61
|
+
# traject with bundle exec from within our traject project
|
|
62
|
+
# dir.
|
|
63
|
+
|
|
64
|
+
# Make sure /usr/local/bin is in PATH for chruby-exec,
|
|
65
|
+
# which it's not ordinarily in a cronjob.
|
|
66
|
+
if [[ ":$PATH:" != *":/usr/local/bin:"* ]]
|
|
67
|
+
then
|
|
68
|
+
export PATH=$PATH:/usr/local/bin
|
|
69
|
+
fi
|
|
70
|
+
# chruby needs SHELL set, which it won't be from a crontab
|
|
71
|
+
export SHELL=/bin/bash
|
|
72
|
+
|
|
73
|
+
# Find the dir based on location of this wrapper script,
|
|
74
|
+
# then use that dir to cd to for the bundle exec to find
|
|
75
|
+
# the right Gemfile.
|
|
76
|
+
traject_dir=$(cd `dirname "${BASH_SOURCE[0]}"` && pwd)
|
|
77
|
+
|
|
78
|
+
# do we need to use chruby to switch to jruby?
|
|
79
|
+
if [[ "$(ruby -v)" == *jruby* ]]
|
|
80
|
+
then
|
|
81
|
+
ruby_picker="" # nothing needed "
|
|
82
|
+
else
|
|
83
|
+
ruby_picker="chruby-exec jruby --"
|
|
84
|
+
fi
|
|
85
|
+
|
|
86
|
+
cmd="BUNDLE_GEMFILE=$traject_dir/Gemfile $ruby_picker bundle exec traject $@"
|
|
87
|
+
|
|
88
|
+
echo $cmd
|
|
89
|
+
eval $cmd
|
|
90
|
+
~~~
|
|
91
|
+
|
|
92
|
+
This monster script can perhaps be adapted for rbenv or rvm.
|
|
93
|
+
|
|
94
|
+
### for rbenv
|
|
95
|
+
|
|
96
|
+
If running in an interactive shell that has had rbenv set up for
|
|
97
|
+
it, you can use rbenv's standard mechanism to say to execute
|
|
98
|
+
something in jruby:
|
|
99
|
+
|
|
100
|
+
RBENV_VERSION=jruby-1.7.2 traject {args}
|
|
101
|
+
|
|
102
|
+
You do need to specify the exact version of jruby, I don't think
|
|
103
|
+
there's any way to say 'latest installed jruby'. You could do the
|
|
104
|
+
same thing for any batch scripts you're writing -- just have
|
|
105
|
+
them set that `RBENV_VERSION` environment variable before
|
|
106
|
+
executing traject.
|
|
107
|
+
|
|
108
|
+
If you're running inside a cronjob, things get a bit trickier,
|
|
109
|
+
because rbenv isn't normally set up in the limited environment
|
|
110
|
+
of cron tasks. One way to deal with this is to have your
|
|
111
|
+
cronjob explicitly execute in a bash login shell, that
|
|
112
|
+
will then have rbenv set up -- so long as it's running
|
|
113
|
+
under an account with rbenv set up properly!
|
|
114
|
+
|
|
115
|
+
# in a cronfile
|
|
116
|
+
# 10 * * * * /bin/bash -l -c 'RBENV_VERSION=jruby-1.7.2 traject {args}'
|
|
117
|
+
|
|
118
|
+
(Better way? Doc pull requests welcome.)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
### for rvm
|
|
122
|
+
|
|
123
|
+
See rvm's [own docs on use with cron](http://rvm.io/integration/cron), it gets a bit confusing.
|
|
124
|
+
But here's one way, using a wrapper script. It does require you to
|
|
125
|
+
identify and hard-code in where your rvm is installed, and exactly which
|
|
126
|
+
version of jruby you want to execute with (will have to be updated if you upgrade
|
|
127
|
+
jruby). (Is there a better way? Doc pull requests welcome! rvm confuses me!)
|
|
128
|
+
|
|
129
|
+
Make a file at `/usr/local/bin/jruby-traject` that looks like this:
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
~~~bash
|
|
133
|
+
#!/usr/bin/env bash
|
|
134
|
+
|
|
135
|
+
# load rvm ruby
|
|
136
|
+
source /home/MY_ACCT/.rvm/environments/jruby-1.7.3
|
|
137
|
+
|
|
138
|
+
traject "$@"
|
|
139
|
+
~~~
|
|
140
|
+
|
|
141
|
+
You have to use your actual account rvm is installed in for MY_ACCT.
|
|
142
|
+
Or, if you have a global install of rvm instead of a user-account one,
|
|
143
|
+
it might be at `/usr/local/rvm/environments`... instead.
|
|
144
|
+
|
|
145
|
+
Now any account, in a crontab, in an interactive shell, wherever,
|
|
146
|
+
can just execute `jruby-traject {arguments}`, and execute traject
|
|
147
|
+
in a jruby environment.
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
### Bundler too?
|
|
151
|
+
|
|
152
|
+
If you're running with bundler too, you could make a wrapper file specific to
|
|
153
|
+
a particular traject project and its Gemfile, by combining the `bundle exec` into
|
|
154
|
+
your wrapper file. For instance, for chruby, this works:
|
|
155
|
+
|
|
156
|
+
#!/usr/bin/env bash
|
|
157
|
+
|
|
158
|
+
chruby-exec jruby -- BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject "$@"
|
|
159
|
+
|
|
160
|
+
Now you can call your wrapper script from anywhere and with any active ruby,
|
|
161
|
+
and execute it in jruby and with the dependencies specified in the Gemfile
|
|
162
|
+
for your project.
|
|
163
|
+
|
|
164
|
+
## Exit codes
|
|
165
|
+
|
|
166
|
+
Traject tries to always return a well-behaved unix exit code -- 0 for success,
|
|
167
|
+
non-0 for error.
|
|
168
|
+
|
|
169
|
+
You should be able to rely on this in your batch bash scripts, if you want to abort
|
|
170
|
+
further processing if traject failed for some reason, you can check traject's
|
|
171
|
+
exit code.
|
|
172
|
+
|
|
173
|
+
If an uncaught exception happens, traject will return non-0.
|
|
174
|
+
|
|
175
|
+
There are some kinds of errors which prevent traject from indexing
|
|
176
|
+
one or more records, but traject may still continue processing
|
|
177
|
+
the other records. If any records have been skipped in this way,
|
|
178
|
+
traject will _also_ return a non-0 failure exit code. (Is this good?
|
|
179
|
+
Does it need to be configurable?)
|
|
180
|
+
|
|
181
|
+
In these cases, information about errors that led to skipped records should
|
|
182
|
+
be output as ERROR level in the logs.
|
|
183
|
+
|
|
184
|
+
## Logs and Error Reporting
|
|
185
|
+
|
|
186
|
+
By default, traject outputs all logging to stderr. This is often just what
|
|
187
|
+
you want for a batch or automated process, where there might be some wrapper
|
|
188
|
+
script which captures stderr and puts it where you want it.
|
|
189
|
+
|
|
190
|
+
However, it's easy enough to tell traject to log somewhere else. Either on
|
|
191
|
+
the command-line:
|
|
192
|
+
|
|
193
|
+
traject -s log.file=/some/other/file/log {other args}
|
|
194
|
+
|
|
195
|
+
Or in a traject configuration file, setting the `log.file` configuration setting.
|
|
196
|
+
|
|
197
|
+
### separate error log
|
|
198
|
+
|
|
199
|
+
You can also separately have a duplicate log file created with ONLY log messages of
|
|
200
|
+
level ERROR and higher (meaning ERROR and FATAL), with the `log.error_file` setting.
|
|
201
|
+
Then, if there's any lines in this error log file at all, you know something bad
|
|
202
|
+
happened, maybe your batch process needs to notify someone, or abort further
|
|
203
|
+
steps in the batch process.
|
|
204
|
+
|
|
205
|
+
traject -s log.file=/var/log/traject.log -s log.error_file=/var/log/traject_error.log {more args}
|
|
206
|
+
|
|
207
|
+
The error lines will be in the main log file, and also duplicated in the error
|
|
208
|
+
log file.
|
|
209
|
+
|
|
210
|
+
### Completely customizable logging with yell
|
|
211
|
+
|
|
212
|
+
Traject uses the [yell](https://github.com/rudionrails/yell) gem for logging.
|
|
213
|
+
You can configure the logger directly to implement whatever crazy logging rules you might
|
|
214
|
+
want, so long as yell supports them. But yell is pretty flexible.
|
|
215
|
+
|
|
216
|
+
Recall that traject config files are just ruby, executed in the context
|
|
217
|
+
of a Traject::Indexer. You can set the Indexer's `logger` to a yell logger
|
|
218
|
+
object you configure yourself however you like:
|
|
219
|
+
|
|
220
|
+
~~~ruby
|
|
221
|
+
# inside a traject configuration file
|
|
222
|
+
|
|
223
|
+
self.logger = Yell.new do |l|
|
|
224
|
+
l.level = 'gte.info' # will only pass :info and above to the adapters
|
|
225
|
+
|
|
226
|
+
l.adapter :datefile, 'production.log', level: 'lte.warn' # anything lower or equal to :warn
|
|
227
|
+
l.adapter :datefile, 'error.log', level: 'gte.error' # anything greater or equal to :error
|
|
228
|
+
end
|
|
229
|
+
~~~
|
|
230
|
+
|
|
231
|
+
**note** it's important to use `self.logger =`, or due to
|
|
232
|
+
ruby idiosyncrasies you'll just be setting a local variable, not the Indexer's
|
|
233
|
+
logger attribute.
|
|
234
|
+
|
|
235
|
+
See [yell](https://github.com/rudionrails/yell) docs for more, you can
|
|
236
|
+
do whatever you can make yell do; just write ruby.
|
|
237
|
+
|
|
238
|
+
### Bundler
|
|
239
|
+
|
|
240
|
+
For automated batch execution, we recommend you consider using
|
|
241
|
+
bundler to manage any gem dependencies. See the [Extending
|
|
242
|
+
With Your Own Code](./extending.md) traject docs for
|
|
243
|
+
information on how traject integrates with bundler.
|
data/doc/extending.md
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# Extending With Your Own Code
|
|
2
|
+
|
|
3
|
+
Beyond very simple logic, you'll want to write your own ruby code,
|
|
4
|
+
organize it in files other than traject config files, but then
|
|
5
|
+
use it in traject config files.
|
|
6
|
+
|
|
7
|
+
You might want to have code local to your traject project; or you
|
|
8
|
+
might want to use ruby gems to share code between projects and developers.
|
|
9
|
+
A given project may use both of these techniques.
|
|
10
|
+
|
|
11
|
+
Here are some suggestions for how to do this, along with mention
|
|
12
|
+
of a couple traject features meant to make it easier.
|
|
13
|
+
|
|
14
|
+
## Expert Summary
|
|
15
|
+
|
|
16
|
+
* The traject `-I` command line argument can be used to list directories to
|
|
17
|
+
add to the load path, similar to the `ruby -I` argument. You
|
|
18
|
+
can then 'require' local project files from the load path.
|
|
19
|
+
* Or modify the ruby `$LOAD_PATH` manually at the top of a traject config file you are loading.
|
|
20
|
+
* translation map files found in a
|
|
21
|
+
"./translation_maps" subdir on the load path will be found
|
|
22
|
+
for Traject translation maps.
|
|
23
|
+
* You can use Bundler with traject simply by creating a Gemfile with `bundler init`,
|
|
24
|
+
and then running command line with `bundle exec traject` or
|
|
25
|
+
even `BUNDLE_GEMFILE=path/to/Gemfile bundle exec traject`
|
|
26
|
+
|
|
27
|
+
## Custom code local to your project
|
|
28
|
+
|
|
29
|
+
You might want local translation maps, or local ruby
|
|
30
|
+
code. Here's a standard recommended way you might lay out
|
|
31
|
+
this extra code in the file system, using a 'lib'
|
|
32
|
+
directory kept next to your traject config files:
|
|
33
|
+
|
|
34
|
+
~~~
|
|
35
|
+
- my_traject/
|
|
36
|
+
* config_file.rb
|
|
37
|
+
- lib/
|
|
38
|
+
* my_macros.rb
|
|
39
|
+
* my_utility.rb
|
|
40
|
+
- translation_maps/
|
|
41
|
+
* my_map.yaml
|
|
42
|
+
~~~
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
The `my_macros.rb` file might contain a simple [macro](./macros.md)
|
|
46
|
+
in a module called `MyMacros`.
|
|
47
|
+
|
|
48
|
+
The `my_utility.rb` file might contain, say, a module of utility
|
|
49
|
+
methods, `MyUtility.some_utility`, etc.
|
|
50
|
+
|
|
51
|
+
To refer to ruby code from another file, we use the standard
|
|
52
|
+
ruby `require` statement to bring in the files:
|
|
53
|
+
|
|
54
|
+
~~~ruby
|
|
55
|
+
# config_file.rb
|
|
56
|
+
|
|
57
|
+
require 'my_macros'
|
|
58
|
+
require 'my_utility'
|
|
59
|
+
|
|
60
|
+
# Now that MyMacros is available, extend it into the indexer,
|
|
61
|
+
# and use it:
|
|
62
|
+
|
|
63
|
+
extend MyMacros
|
|
64
|
+
|
|
65
|
+
to_field "title", my_some_macro
|
|
66
|
+
|
|
67
|
+
# And likewise, we can use our utility methods:
|
|
68
|
+
|
|
69
|
+
to_field "title" do |record, accumulator, context|
|
|
70
|
+
accumulator << MyUtility.some_utility(record)
|
|
71
|
+
end
|
|
72
|
+
~~~
|
|
73
|
+
|
|
74
|
+
**But wait!** This won't work yet. Because ruby won't be
|
|
75
|
+
able to find the file in `require 'my_macros'`. To fix
|
|
76
|
+
that, we want to add our local `lib` directory to the
|
|
77
|
+
ruby `$LOAD_PATH`, a standard ruby feature.
|
|
78
|
+
|
|
79
|
+
Traject provides a way for you to add to the load path
|
|
80
|
+
from the traject command line, the `-I` flag:
|
|
81
|
+
|
|
82
|
+
traject -I ./lib -c ./config_file.rb ...
|
|
83
|
+
|
|
84
|
+
Or, you can hard-code a `$LOAD_PATH` change directly in your
|
|
85
|
+
config file. You'll have to use some weird looking
|
|
86
|
+
ruby code to create a file path relative to the current
|
|
87
|
+
file (the config_file.rb), and then make sure it's
|
|
88
|
+
an absolute path. (Should we add a traject utility
|
|
89
|
+
method for this?)
|
|
90
|
+
|
|
91
|
+
~~~ruby
|
|
92
|
+
# at top of config_file.rb...
|
|
93
|
+
|
|
94
|
+
$LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), './lib'))
|
|
95
|
+
~~~
|
|
96
|
+
|
|
97
|
+
That's pretty much it!
|
|
98
|
+
|
|
99
|
+
What about that translation map? The `$LOAD_PATH` modification
|
|
100
|
+
took care of that too, the Traject::TranslationMap will look
|
|
101
|
+
up translation map definition files
|
|
102
|
+
in a `./translation_maps` subdir on the load path, as in `./lib/translation_maps` in this case.
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
## Using gems in your traject project
|
|
106
|
+
|
|
107
|
+
If there is certain logic that is common between (traject or other)
|
|
108
|
+
projects, it makes sense to put it in a ruby gem.
|
|
109
|
+
|
|
110
|
+
We won't go into detail about creating ruby gems, but we
|
|
111
|
+
do recommend you use the `bundle gem my_gem_name` command to create
|
|
112
|
+
a skeleton of your gem
|
|
113
|
+
([one tutorial here](http://railscasts.com/episodes/245-new-gem-with-bundler?view=asciicast)).
|
|
114
|
+
This will also make available rake commands to install your gem locally
|
|
115
|
+
(`rake install`), or release it to the rubygems server (`rake release`).
|
|
116
|
+
|
|
117
|
+
There are two main methods to use a gem in your traject project,
|
|
118
|
+
with straight rubygems, or with bundler.
|
|
119
|
+
|
|
120
|
+
Without bundler is simpler. Simply `gem install some_gem` from the
|
|
121
|
+
command line, and now you can `require` that gem in your traject
|
|
122
|
+
config file, and use what it provides:
|
|
123
|
+
|
|
124
|
+
~~~ruby
|
|
125
|
+
#some_traject_config.rb
|
|
126
|
+
|
|
127
|
+
require 'some_gem'
|
|
128
|
+
|
|
129
|
+
SomeGem.whatever!
|
|
130
|
+
~~~
|
|
131
|
+
|
|
132
|
+
A gem can provide traject translation map definitions
|
|
133
|
+
in a `lib/translation_maps` sub-directory, and traject will be able to find those
|
|
134
|
+
translation maps when the gem is loaded. (Because gems'
|
|
135
|
+
`./lib` directories are by default added to the ruby load path.)
|
|
136
|
+
|
|
137
|
+
### Or, with bundler:
|
|
138
|
+
|
|
139
|
+
However, if you then move your traject project to another system,
|
|
140
|
+
where you haven't yet installed the `some_gem`, then running
|
|
141
|
+
traject with this config file will, of course, fail. Or if you
|
|
142
|
+
move your traject project to another system with a slightly
|
|
143
|
+
different version of `some_gem`, your traject indexing could
|
|
144
|
+
behave differently in confusing ways. As the number of gems
|
|
145
|
+
you are using increases, managing this gets increasingly
|
|
146
|
+
confusing.
|
|
147
|
+
|
|
148
|
+
[bundler](http://bundler.io/) was invented to make this kind of dependency management
|
|
149
|
+
more straightforward and reliable. We recommend you consider using
|
|
150
|
+
bundler, especially for traject installations where traject will
|
|
151
|
+
be run via automated batch jobs on production servers.
|
|
152
|
+
|
|
153
|
+
Bundler's behavior is based on a `Gemfile` that lists your
|
|
154
|
+
project dependencies. You can create a starter skeleton
|
|
155
|
+
by running `bundler init`, probably in the directory
|
|
156
|
+
right next to your traject config files.
|
|
157
|
+
|
|
158
|
+
Then specify what gems your traject project will use,
|
|
159
|
+
possibly with version restrictions, in the [Gemfile](http://bundler.io/v1.3/gemfile.html) --
|
|
160
|
+
**do** include `gem 'traject'` in the Gemfile.
|
|
161
|
+
|
|
162
|
+
Run `bundle install` from the directory with the Gemfile, on any system
|
|
163
|
+
at any time, to make sure specified gems are installed.
|
|
164
|
+
|
|
165
|
+
**Run traject** with `bundle exec` to have bundler set up the environment
|
|
166
|
+
from your Gemfile. You can `cd` into the directory containing the Gemfile,
|
|
167
|
+
so bundler can find it:
|
|
168
|
+
|
|
169
|
+
$ cd /some/where
|
|
170
|
+
$ bundle exec traject -c some_traject_config.rb ...
|
|
171
|
+
|
|
172
|
+
Or you can use the BUNDLE_GEMFILE environment variable to tell bundler where
|
|
173
|
+
to find the Gemfile, and run from any directory at all:
|
|
174
|
+
|
|
175
|
+
$ BUNDLE_GEMFILE=/path/to/Gemfile bundle exec traject -c /path/to/some_config.rb ...
|
|
176
|
+
|
|
177
|
+
Bundler will make sure the specified versions of all gems are used by
|
|
178
|
+
traject, and also make sure no gems except those specified in the gemfile
|
|
179
|
+
are available to the program, for a reliable reproducible environment.
|
|
180
|
+
|
|
181
|
+
You should still `require` the gem in your traject config file,
|
|
182
|
+
then just refer to what it provides in your config code as usual.
|
|
183
|
+
|
|
184
|
+
You should check both the `Gemfile` and the `Gemfile.lock`
|
|
185
|
+
that bundler creates into your source control repo. The
|
|
186
|
+
`Gemfile.lock` specifies _exactly_ what versions of
|
|
187
|
+
gem dependencies are currently being used, so you can get the exact
|
|
188
|
+
same dependency environment on different servers.
|
|
189
|
+
|
|
190
|
+
See the [bundler documentation](http://bundler.io/#getting-started), or google, for more information.
|