traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
data/lib/traject.rb
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
require 'slop'
|
|
2
|
+
require 'traject'
|
|
3
|
+
require 'traject/indexer'
|
|
4
|
+
|
|
5
|
+
module Traject
|
|
6
|
+
# The class that executes for the Traject command line utility.
|
|
7
|
+
#
|
|
8
|
+
# Warning, does do things like exit entire program on error at present.
|
|
9
|
+
# You probably don't want to use this class for anything but an actual
|
|
10
|
+
# shell command line, if you want to execute indexing directly, just
|
|
11
|
+
# use the Traject::Indexer directly.
|
|
12
|
+
#
|
|
13
|
+
# A CommandLine object has a single persistent Indexer object it uses
|
|
14
|
+
class CommandLine
|
|
15
|
+
# orig_argv is original one passed in, remaining_argv is after destructive
|
|
16
|
+
# processing by slop, still has file args in it etc.
|
|
17
|
+
attr_accessor :orig_argv, :remaining_argv
|
|
18
|
+
attr_accessor :slop, :options
|
|
19
|
+
attr_accessor :indexer
|
|
20
|
+
attr_accessor :console
|
|
21
|
+
|
|
22
|
+
def initialize(argv=ARGV)
|
|
23
|
+
self.console = $stderr
|
|
24
|
+
|
|
25
|
+
self.orig_argv = argv.dup
|
|
26
|
+
self.remaining_argv = argv
|
|
27
|
+
|
|
28
|
+
self.slop = create_slop!
|
|
29
|
+
self.options = parse_options(self.remaining_argv)
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Runs the traject command selected by the parsed command-line options.
#
# Returns true on success or false on failure; may also raise exceptions;
# may also exit program directly itself (yeah, could use some normalization).
# When --version or --help is given, prints to the console and returns nil
# without creating an indexer.
#
# Raises ArgumentError when options[:command] is not one of "process",
# "marcout" or "commit".
def execute
  if options[:version]
    console.puts "traject version #{Traject::VERSION}"
    return
  end
  if options[:help]
    console.puts slop.help
    return
  end

  (options[:load_path] || []).each do |path|
    $LOAD_PATH << path unless $LOAD_PATH.include? path
  end

  arg_check!

  self.indexer = initialize_indexer!

  ######
  # SAFE TO LOG to indexer.logger starting here, after indexer is set up from conf files
  # with logging config.
  #####

  # Same constant the --version branch above prints.
  indexer.logger.info("traject (#{Traject::VERSION}) executing with: `#{orig_argv.join(' ')}`")

  # Okay, actual command process! All command_ methods should return true
  # on success, or false on failure.
  result =
    case options[:command]
    when "process"
      (io, filename) = get_input_io(remaining_argv)
      indexer.settings['command_line.filename'] = filename if filename
      indexer.process(io)
    when "marcout"
      (io, filename) = get_input_io(remaining_argv)
      indexer.settings['command_line.filename'] = filename if filename
      command_marcout!(io)
    when "commit"
      command_commit!
    else
      raise ArgumentError.new("Unrecognized traject command: #{options[:command]}")
    end

  return result
rescue Exception => e
  # Deliberately broad: whatever terminates execution is logged if a logger
  # is available, and then always re-raised unchanged.
  begin
    indexer && indexer.logger && indexer.logger.fatal("Traject::CommandLine: Unexpected exception, terminating execution: #{e.inspect}")
  rescue StandardError
    # A failure while logging must not mask the original exception.
    nil
  end
  raise e
end
|
|
84
|
+
|
|
85
|
+
# Sends a commit request to the Solr update handler found under the
# 'solr.url' setting and logs the response body at info level.
#
# Returns true once the response has been read.
# Raises ArgumentError when the 'solr.url' setting is missing or empty;
# errors raised while fetching the URL are not rescued here.
def command_commit!
  require 'open-uri'
  raise ArgumentError.new("No solr.url setting provided") if indexer.settings['solr.url'].to_s.empty?

  url = "#{indexer.settings['solr.url']}/update?commit=true"
  indexer.logger.info("Sending commit to: #{url}")
  # Read through the parsed URI instead of Kernel#open: Kernel#open would
  # treat a setting that is a local path or starts with "|" as a file or a
  # command to run, and it no longer fetches URLs on current Rubies.
  indexer.logger.info( URI.parse(url).read )

  return true
end
|
|
95
|
+
|
|
96
|
+
# Copies every record read from +io+ (via the indexer's configured reader)
# to a MARC output, without running any indexing rules.
#
# Settings read:
# * "marcout.type" -- "binary" (used when empty), "xml" or "human"
# * "output_file" -- path to write to; $stdout when empty
# * "marcout.allow_oversized" -- binary output only; passed on to the writer
#   as a boolean when the setting is present
#
# Returns true when all records have been written.
# Raises ArgumentError for an unrecognized "marcout.type".
def command_marcout!(io)
  require 'marc'

  output_type = indexer.settings["marcout.type"].to_s
  output_type = "binary" if output_type.empty?

  output_arg = unless indexer.settings["output_file"].to_s.empty?
    indexer.settings["output_file"]
  else
    $stdout
  end

  indexer.logger.info(" marcout writing type:#{output_type} to file:#{output_arg}")

  writer = nil
  begin
    case output_type
    when "binary"
      writer = MARC::Writer.new(output_arg)

      allow_oversized = indexer.settings["marcout.allow_oversized"]
      if allow_oversized
        allow_oversized = (allow_oversized.to_s == "true")
        writer.allow_oversized = allow_oversized
      end
    when "xml"
      writer = MARC::XMLWriter.new(output_arg)
    when "human"
      # Plain IO: each record is written in its string form.
      writer = output_arg.kind_of?(String) ? File.open(output_arg, "w:binary") : output_arg
    else
      raise ArgumentError.new("traject marcout unrecognized marcout.type: #{output_type}")
    end

    reader = indexer.reader!(io)

    reader.each do |record|
      writer.write record
    end
  ensure
    # Close the output even when reading or writing fails part-way, so the
    # file handle is released and buffered output is flushed.
    writer.close if writer
  end

  return true
end
|
|
137
|
+
|
|
138
|
+
def get_input_io(argv)
|
|
139
|
+
# ARGF might be perfect for this, but problems with it include:
|
|
140
|
+
# * jruby is broken, no way to set its encoding, leads to encoding errors reading non-ascii
|
|
141
|
+
# https://github.com/jruby/jruby/issues/891
|
|
142
|
+
# * It's apparently not enough like an IO object for at least one of the ruby-marc XML
|
|
143
|
+
# readers:
|
|
144
|
+
# NoMethodError: undefined method `to_inputstream' for ARGF:Object
|
|
145
|
+
# init at /Users/jrochkind/.gem/jruby/1.9.3/gems/marc-0.5.1/lib/marc/xml_parsers.rb:369
|
|
146
|
+
#
|
|
147
|
+
# * It INSISTS on reading from ARGV, making it hard to test, or use when you want to give
|
|
148
|
+
# it a list of files on something other than ARGV.
|
|
149
|
+
#
|
|
150
|
+
# So for now we do just one file, or stdin if specified. Sorry!
|
|
151
|
+
|
|
152
|
+
filename = nil
|
|
153
|
+
if options[:stdin]
|
|
154
|
+
indexer.logger.info("Reading from standard input")
|
|
155
|
+
io = $stdin
|
|
156
|
+
elsif argv.length > 1
|
|
157
|
+
self.console.puts "Sorry, traject can only handle one input file at a time right now. `#{argv}` Exiting..."
|
|
158
|
+
exit 1
|
|
159
|
+
elsif argv.length == 0
|
|
160
|
+
io = File.open(File::NULL, 'r')
|
|
161
|
+
indexer.logger.info("Warning, no file input given. Use command-line argument '--stdin' to use standard input ")
|
|
162
|
+
else
|
|
163
|
+
io = File.open(argv.first, 'r')
|
|
164
|
+
filename = argv.first
|
|
165
|
+
indexer.logger.info "Reading from #{filename}"
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
return io, filename
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
# Evaluates each configuration file, in order, in the context of
# +my_indexer+ (via instance_eval).
#
# my_indexer -- the object the configuration code is evaluated against
# conf_files -- list of configuration file paths; nil is treated as "no
#               configuration files" (the option hash has no entry when
#               no -c argument was given)
#
# Does not raise for configuration problems. Instead it reports on the
# console and terminates the program: exit status 2 when a file cannot be
# found, 3 when evaluating a file raises.
def load_configuration_files!(my_indexer, conf_files)
  Array(conf_files).each do |conf_path|
    begin
      # File.read closes the handle itself, so nothing is left open.
      conf_source = File.read(conf_path)
    rescue Errno::ENOENT
      console.puts "Could not find configuration file '#{conf_path}', exiting..."
      exit 2
    end

    begin
      my_indexer.instance_eval(conf_source, conf_path)
    rescue Exception => e
      # Broad on purpose: SyntaxError and LoadError are not StandardErrors,
      # and every failure here ends in the same diagnostic plus exit.
      backtrace = e.backtrace || []

      # First backtrace frame located in the configuration file itself gives
      # the line number to report; stays nil when no such frame exists.
      conf_trace = backtrace.find { |l| l.start_with? conf_path }
      line_number = conf_trace && conf_trace[/\A.*\:(\d+)\:in/, 1]

      console.puts "Error processing configuration file '#{conf_path}' at line #{line_number}"
      console.puts " #{e.class}: #{e.message}"
      if backtrace.first =~ /\A(.*)\:in/
        console.puts " from #{$1}"
      end
      exit 3
    end
  end
end
|
|
197
|
+
|
|
198
|
+
def arg_check!
|
|
199
|
+
if options[:command] == "process" && (options[:conf].nil? || options[:conf].length == 0)
|
|
200
|
+
self.console.puts "Error: Missing required configuration file"
|
|
201
|
+
self.console.puts "Exiting..."
|
|
202
|
+
self.console.puts
|
|
203
|
+
self.console.puts self.slop.help
|
|
204
|
+
exit 2
|
|
205
|
+
end
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def assemble_settings_hash(options)
|
|
210
|
+
settings = {}
|
|
211
|
+
|
|
212
|
+
# `-s key=value` command line
|
|
213
|
+
(options[:setting] || []).each do |setting_pair|
|
|
214
|
+
if setting_pair =~ /\A([^=]+)\=(.*)\Z/
|
|
215
|
+
key, value = $1, $2
|
|
216
|
+
settings[key] = value
|
|
217
|
+
else
|
|
218
|
+
self.console.puts "Unrecognized setting argument '#{setting_pair}':"
|
|
219
|
+
self.console.puts "Should be of format -s key=value"
|
|
220
|
+
exit 3
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
# other command line shortcuts for settings
|
|
225
|
+
if options[:debug]
|
|
226
|
+
settings["log.level"] = "debug"
|
|
227
|
+
end
|
|
228
|
+
if options[:'debug-mode']
|
|
229
|
+
require 'traject/debug_writer'
|
|
230
|
+
settings["writer_class_name"] = "Traject::DebugWriter"
|
|
231
|
+
settings["log.level"] = "debug"
|
|
232
|
+
settings["processing_thread_pool"] = 0
|
|
233
|
+
end
|
|
234
|
+
if options[:writer]
|
|
235
|
+
settings["writer_class_name"] = options[:writer]
|
|
236
|
+
end
|
|
237
|
+
if options[:reader]
|
|
238
|
+
settings["reader_class_name"] = options[:reader]
|
|
239
|
+
end
|
|
240
|
+
if options[:solr]
|
|
241
|
+
settings["solr.url"] = options[:solr]
|
|
242
|
+
end
|
|
243
|
+
if options[:marc_type]
|
|
244
|
+
settings["marc_source.type"] = options[:marc_type]
|
|
245
|
+
end
|
|
246
|
+
if options[:output_file]
|
|
247
|
+
settings["output_file"] = options[:output_file]
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
return settings
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def create_slop!
|
|
255
|
+
return Slop.new(:strict => true) do
|
|
256
|
+
banner "traject [options] -c configuration.rb [-c config2.rb] file.mrc"
|
|
257
|
+
|
|
258
|
+
on 'v', 'version', "print version information to stderr"
|
|
259
|
+
on 'd', 'debug', "Include debug log, -s log.level=debug"
|
|
260
|
+
on 'h', 'help', "print usage information to stderr"
|
|
261
|
+
on 'c', 'conf', 'configuration file path (repeatable)', :argument => true, :as => Array
|
|
262
|
+
on :s, :setting, "settings: `-s key=value` (repeatable)", :argument => true, :as => Array
|
|
263
|
+
on :r, :reader, "Set reader class, shortcut for -s reader_class_name=", :argument => true
|
|
264
|
+
on :o, "output_file", "output file for Writer classes that write to files", :argument => true
|
|
265
|
+
on :w, :writer, "Set writer class, shortcut for -s writer_class_name=", :argument => true
|
|
266
|
+
on :u, :solr, "Set solr url, shortcut for -s solr.url=", :argument => true
|
|
267
|
+
on :t, :marc_type, "xml, json or binary. shortcut for -s marc_source.type=", :argument => true
|
|
268
|
+
on :I, "load_path", "append paths to ruby $LOAD_PATH", :argument => true, :as => Array, :delimiter => ":"
|
|
269
|
+
|
|
270
|
+
on :x, "command", "alternate traject command: process (default); marcout; commit", :argument => true, :default => "process"
|
|
271
|
+
|
|
272
|
+
on "stdin", "read input from stdin"
|
|
273
|
+
on "debug-mode", "debug logging, single threaded, output human readable hashes"
|
|
274
|
+
end
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
def initialize_indexer!
|
|
278
|
+
indexer = Traject::Indexer.new self.assemble_settings_hash(self.options)
|
|
279
|
+
load_configuration_files!(indexer, options[:conf])
|
|
280
|
+
|
|
281
|
+
return indexer
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
def parse_options(argv)
|
|
285
|
+
|
|
286
|
+
begin
|
|
287
|
+
self.slop.parse!(argv)
|
|
288
|
+
rescue Slop::Error => e
|
|
289
|
+
self.console.puts "Error: #{e.message}"
|
|
290
|
+
self.console.puts "Exiting..."
|
|
291
|
+
self.console.puts
|
|
292
|
+
self.console.puts slop.help
|
|
293
|
+
exit 1
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
return self.slop.to_hash
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
end
|
|
301
|
+
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
require 'traject/delimited_writer'
|
|
2
|
+
require 'csv'
|
|
3
|
+
|
|
4
|
+
# A CSV-writer, for folks who like that sort of thing.
|
|
5
|
+
# Use DelimitedWriter for non-CSV lines (e.g., tab-delimited)
|
|
6
|
+
#
|
|
7
|
+
#
|
|
8
|
+
|
|
9
|
+
class Traject::CSVWriter < Traject::DelimitedWriter
|
|
10
|
+
|
|
11
|
+
def initialize(*args)
|
|
12
|
+
super
|
|
13
|
+
self.delimiter = nil # Let CSV take care of it
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def _write(data)
|
|
17
|
+
@output_file << data
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Turn the output file into a CSV writer
|
|
21
|
+
def open_output_file
|
|
22
|
+
of = super
|
|
23
|
+
CSV.new(of)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Let CSV take care of the comma escaping; only the internal (multi-value)
# delimiter is replaced here, by its escaped form.
#
# x -- any value; converted with to_s first
#
# Returns a new String. The argument is never modified, so frozen strings
# (including the frozen result of nil.to_s) are handled.
def escape(x)
  x.to_s.gsub(internal_delimiter, @eidelim)
end
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
require 'traject/line_writer'
|
|
2
|
+
|
|
3
|
+
# The Traject::DebugWriter produces a simple, human-readable output format that's
|
|
4
|
+
# also amenable to simple computer processing (e.g., with a simple grep).
|
|
5
|
+
# It's the output format used when you pass the --debug-mode switch to traject on the command line.
|
|
6
|
+
#
|
|
7
|
+
# Output format is three columns: id, output field, values (multiple
|
|
8
|
+
# values separated by '|'), and looks something like:
|
|
9
|
+
#
|
|
10
|
+
# 000001580 edition [1st ed.]
|
|
11
|
+
# 000001580 format Book | Online | Print
|
|
12
|
+
# 000001580 geo Great Britain
|
|
13
|
+
# 000001580 id 000001580
|
|
14
|
+
# 000001580 isbn 0631126902
|
|
15
|
+
#
|
|
16
|
+
# ## Settings
|
|
17
|
+
#
|
|
18
|
+
# * 'output_file' -- the name of the file to output to (command line -o shortcut).
|
|
19
|
+
# * 'output_stream' -- alternately, the IO stream
|
|
20
|
+
# * 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
|
|
21
|
+
# * 'debug_writer.format' -- How to format the id/solr field/values (default: '%-12s %-25s %s')
|
|
22
|
+
#
|
|
23
|
+
# By default, with neither output_file nor output_stream provided, writes to stdout, which
|
|
24
|
+
# can be useful for debugging diagnosis.
|
|
25
|
+
#
|
|
26
|
+
# ## Example configuration file
|
|
27
|
+
#
|
|
28
|
+
# require 'traject/debug_writer'
|
|
29
|
+
#
|
|
30
|
+
# settings do
|
|
31
|
+
# provide "writer_class_name", "Traject::DebugWriter"
|
|
32
|
+
# provide "output_file", "out.txt"
|
|
33
|
+
# end
|
|
34
|
+
class Traject::DebugWriter < Traject::LineWriter
|
|
35
|
+
DEFAULT_FORMAT = '%-12s %-25s %s'
|
|
36
|
+
DEFAULT_IDFIELD = 'id'
|
|
37
|
+
|
|
38
|
+
# Renders one record as text: one line per output field, sorted by field
# name, each formatted from the record id, the field name and the field's
# values joined with ' | '. The record's lines are followed by a blank line.
#
# context -- object whose output_hash maps field names to arrays of values
#
# Settings read: "debug_writer.idfield" and "debug_writer.format", falling
# back to DEFAULT_IDFIELD and DEFAULT_FORMAT.
#
# Returns the String for the whole record. A record without the id field
# is still written, with an empty id column, instead of raising.
def serialize(context)
  idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
  fmt = settings['debug_writer.format'] || DEFAULT_FORMAT
  h = context.output_hash
  # Looked up once for all lines; Array() turns a missing id into an empty list.
  record_id = Array(h[idfield]).first
  lines = h.keys.sort.map { |k| fmt % [record_id, k, h[k].join(' | ')] }
  lines.push "\n"
  lines.join("\n")
end
|
|
46
|
+
|
|
47
|
+
end
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
require 'traject/line_writer'
|
|
2
|
+
|
|
3
|
+
# A simple line writer that uses configuration to determine
|
|
4
|
+
# how to produce a tab-delimited file
|
|
5
|
+
#
|
|
6
|
+
# Apropos settings:
|
|
7
|
+
#
|
|
8
|
+
# * output_file -- the file to write to
|
|
9
|
+
# * output_stream -- the stream to write to, if defined and output_file is not
|
|
10
|
+
# * delimited_writer.delimiter -- What to separate fields with; default is tab
|
|
11
|
+
# * delimited_writer.internal_delimiter -- Delimiter _within_ a field, for multiple
|
|
12
|
+
# values. Default is pipe ( | )
|
|
13
|
+
# * delimited_writer.fields -- comma-separated list of the fields to output
|
|
14
|
+
# * delimited_writer.header (true/false) -- boolean that determines if we should output a header row. Default is true
|
|
15
|
+
# * delimited_writer.escape -- If a value actually contains the delimiter or internal_delimiter, what to do?
|
|
16
|
+
# If unset, will follow the procedure below. If set, will turn it into the character(s) given
|
|
17
|
+
#
|
|
18
|
+
#
|
|
19
|
+
# If `delimited_writer.escape` is not set, the writer will automatically
|
|
20
|
+
# escape delimiters/internal_delimiters in the following way:
|
|
21
|
+
# * If the delimiter is a tab, replace tabs in values with a single space
|
|
22
|
+
# * If the delimiter is anything else, prefix it with a backslash
|
|
23
|
+
|
|
24
|
+
class Traject::DelimitedWriter < Traject::LineWriter
|
|
25
|
+
|
|
26
|
+
attr_reader :delimiter, :internal_delimiter, :edelim, :eidelim
|
|
27
|
+
attr_accessor :header
|
|
28
|
+
|
|
29
|
+
def initialize(settings)
|
|
30
|
+
super
|
|
31
|
+
|
|
32
|
+
# fields to output
|
|
33
|
+
|
|
34
|
+
begin
|
|
35
|
+
@fields = settings['delimited_writer.fields'].split(",")
|
|
36
|
+
rescue NoMethodError => e
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
if e or @fields.empty?
|
|
40
|
+
raise ArgumentError.new("#{self.class.name} must have a comma-delimited list of field names to output set in setting 'delimited_writer.fields'")
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
self.delimiter = settings['delimited_writer.delimiter'] || "\t"
|
|
44
|
+
self.internal_delimiter = settings['delimited_writer.internal_delimiter'] || '|'
|
|
45
|
+
self.header = settings['delimited_writer.header'].to_s != 'false'
|
|
46
|
+
|
|
47
|
+
# Output the header if need be
|
|
48
|
+
write_header if @header
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def escaped_delimiter(d)
|
|
52
|
+
return nil if d.nil?
|
|
53
|
+
d == "\t" ? ' ' : '\\' + d
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def delimiter=(d)
|
|
57
|
+
@delimiter = d
|
|
58
|
+
@edelim = escaped_delimiter(d)
|
|
59
|
+
self
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def internal_delimiter=(d)
|
|
63
|
+
@internal_delimiter = d
|
|
64
|
+
@eidelim = escaped_delimiter(d)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def write_header
|
|
71
|
+
_write(@fields)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def _write(data)
|
|
75
|
+
output_file.puts(data.join(delimiter))
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
# Get the output values out of the context
|
|
79
|
+
def raw_output_values(context)
|
|
80
|
+
context.output_hash.values_at(*@fields)
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Escape the delimiters in whatever way has been defined: occurrences of the
# field delimiter (when one is set) and of the internal delimiter are
# replaced by their escaped forms.
#
# x -- any value; converted with to_s first
#
# Returns a new String. The argument is never modified, so frozen strings
# (including the frozen result of nil.to_s) are handled.
def escape(x)
  escaped = x.to_s
  escaped = escaped.gsub(@delimiter, @edelim) if @delimiter
  escaped.gsub(@internal_delimiter, @eidelim)
end
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Derive actual output field values from the raw values.
#
# raw -- list with one entry per output field; an entry that is an Array has
#        each element escaped and is then joined with the internal
#        delimiter, any other entry is escaped on its own
#
# Returns a new Array of Strings. Neither +raw+ nor the arrays inside it are
# modified, so the values the caller passed in keep their unescaped content.
def output_values(raw)
  raw.map do |x|
    if x.is_a? Array
      x.map { |s| escape(s) }.join(@internal_delimiter)
    else
      escape(x)
    end
  end
end
|
|
103
|
+
|
|
104
|
+
# Spit out the escaped values joined by the delimiter
|
|
105
|
+
def serialize(context)
|
|
106
|
+
output_values(raw_output_values(context))
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
end
|