traject 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
@@ -0,0 +1,110 @@
|
|
1
|
+
require 'hashie'
|
2
|
+
require 'concurrent'
|
3
|
+
|
4
|
+
class Traject::Indexer
|
5
|
+
|
6
|
+
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
7
|
+
# to other objects Traject::Indexer interacts with.
|
8
|
+
#
|
9
|
+
# Enhanced with a few features from Hashie, to make it for
|
10
|
+
# instance string/symbol indifferent
|
11
|
+
#
|
12
|
+
# method #provide(key, value) is added, to do like settings[key] ||= value,
|
13
|
+
# set only if not already set (but unlike ||=, nil or false can count as already set)
|
14
|
+
#
|
15
|
+
# Also has an interesting 'defaults' system, meant to play along
|
16
|
+
# with configuration file 'provide' statements. There is a built-in hash of
|
17
|
+
# defaults, which will be lazily filled in if accessed and not yet
|
18
|
+
# set. (nil can count as set, though!). If they haven't been lazily
|
19
|
+
# set yet, then #provide will still fill them in. But you can also call
|
20
|
+
# fill_in_defaults! to fill all defaults in, if you know configuration
|
21
|
+
# files have all been loaded, and want to fill them in for inspection.
|
22
|
+
class Settings < Hash
|
23
|
+
include Hashie::Extensions::MergeInitializer # can init with hash
|
24
|
+
include Hashie::Extensions::IndifferentAccess
|
25
|
+
|
26
|
+
def initialize(*args)
  super
  # Lazily copy a built-in default into the hash the first time a missing
  # key is read; keys without a built-in default simply read as nil.
  self.default_proc = lambda do |settings, missing_key|
    next nil unless self.class.defaults.has_key?(missing_key)

    settings[missing_key] = self.class.defaults[missing_key]
  end
end
|
36
|
+
|
37
|
+
# a cautious store, which only saves key=value if
|
38
|
+
# there was not already a value for #key. Can be used
|
39
|
+
# to set settings that can be overridden on command line,
|
40
|
+
# or general first-set-wins settings.
|
41
|
+
def provide(key, value)
  # First writer wins: an existing entry (even a nil or false one) is kept.
  return if has_key?(key)

  store(key, value)
end
|
46
|
+
|
47
|
+
# reverse_merge copied from ActiveSupport, pretty straightforward,
|
48
|
+
# modified to make sure we return a Settings
|
49
|
+
def reverse_merge(other_hash)
  # Start from the fallback values, then let our own entries win on conflict.
  # Building via self.class keeps the result an instance of the same class.
  fallback = self.class.new(other_hash)
  fallback.merge(self)
end
|
52
|
+
|
53
|
+
def reverse_merge!(other_hash)
  # In-place variant: compute the merged view, then swap our contents for it.
  combined = reverse_merge(other_hash)
  replace(combined)
end
|
56
|
+
|
57
|
+
def fill_in_defaults!
  # Eagerly add every built-in default that has not been set yet.
  builtin_defaults = self.class.defaults
  reverse_merge!(builtin_defaults)
end
|
60
|
+
|
61
|
+
|
62
|
+
# Baseline defaults for every Ruby engine; returns a fresh Hash on each call.
def self.mri_defaults
  builtin = {}
  builtin["reader_class_name"]        = "Traject::MarcReader"
  builtin["writer_class_name"]        = "Traject::SolrJsonWriter"
  builtin["marc_source.type"]         = "binary"
  builtin["solrj_writer.batch_size"]  = 200
  builtin["solrj_writer.thread_pool"] = 1
  builtin["processing_thread_pool"]   = self.default_processing_thread_pool
  builtin["log.batch_size.severity"]  = "info"
  builtin
end
|
73
|
+
|
74
|
+
# Defaults that replace or extend the baseline ones when running on JRuby.
def self.jruby_defaults
  overrides = {}
  overrides['reader_class_name']        = "Traject::Marc4JReader"
  overrides['marc4j_reader.permissive'] = true
  overrides
end
|
80
|
+
|
81
|
+
|
82
|
+
# Built-in defaults for the current engine, computed once and then cached
# in a class variable.
def self.defaults
  unless defined? @@defaults
    merged = self.mri_defaults
    # JRuby layers its own reader settings on top of the baseline.
    merged.merge!(self.jruby_defaults) if defined? JRUBY_VERSION
    @@defaults = merged
  end

  @@defaults
end
|
91
|
+
|
92
|
+
def inspect
  # Build a sanitized copy in which the value of any key ending in
  # "password" is masked, then inspect that copy.
  sanitized = {}
  each do |key, value|
    sanitized[key] = (key =~ /password\Z/) ? "[hidden]" : value
  end
  sanitized.inspect
end
|
99
|
+
|
100
|
+
protected
|
101
|
+
# Default size of the record-processing thread pool.
#
# On JRuby and Rubinius this is one less than the processor count reported
# by Concurrent.processor_count (never below 1); on every other engine it is 1.
#
# The engine is read from the RUBY_ENGINE constant. The previous check of
# ENV["RUBY_ENGINE"] only worked when the shell happened to export that
# variable, so JRuby otherwise fell back to a single thread.
def self.default_processing_thread_pool
  engine = defined?(RUBY_ENGINE) ? RUBY_ENGINE : "ruby"
  return 1 unless ["jruby", "rbx"].include?(engine)

  [1, Concurrent.processor_count - 1].max
end
|
108
|
+
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'traject/line_writer'
|
3
|
+
|
4
|
+
# The JsonWriter outputs one JSON hash per record, separated by newlines.
|
5
|
+
#
|
6
|
+
# It's newline delimited json, which should be suitable for being
|
7
|
+
# read by simple NDJ readers. (TODO: We have no checks right now to
|
8
|
+
# make sure the standard json serializers we're using don't put any
|
9
|
+
# internal newlines as whitespace in the json. Which would break NDJ
|
10
|
+
# reading. Should we?)
|
11
|
+
#
|
12
|
+
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
13
|
+
# concurrently), because output to file is wrapped in a mutex synchronize.
|
14
|
+
# This does not seem to affect performance much, as far as I could tell
|
15
|
+
# benchmarking.
|
16
|
+
#
|
17
|
+
# ## Settings
|
18
|
+
#
|
19
|
+
# * output_file A filename to send output; default will use stdout.
|
20
|
+
#
|
21
|
+
# * json_writer.pretty_print: [default: false]: Pretty-print (e.g., include newlines, indentation, etc.)
|
22
|
+
# each JSON record instead of just mashing it all together on one line. The default, no pretty-printing option
|
23
|
+
# produces one record per line, easy to process with another program.
|
24
|
+
#
|
25
|
+
# ## Example output
|
26
|
+
#
|
27
|
+
# Without pretty printing, you end up with something like this (just two records shown):
|
28
|
+
#
|
29
|
+
# {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
|
30
|
+
# {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
|
31
|
+
#
|
32
|
+
# ## Example configuration file
|
33
|
+
#
|
34
|
+
# require 'traject/json_writer'
|
35
|
+
#
|
36
|
+
# settings do
|
37
|
+
# provide "writer_class_name", "Traject::JsonWriter"
|
38
|
+
# provide "output_file", "out.json"
|
39
|
+
# end
|
40
|
+
class Traject::JsonWriter < Traject::LineWriter
|
41
|
+
|
42
|
+
# Renders the context's output hash as JSON text: pretty-printed when the
# "json_writer.pretty_print" setting is truthy, compact otherwise.
def serialize(context)
  record = context.output_hash
  return JSON.pretty_generate(record) if settings["json_writer.pretty_print"]

  JSON.generate(record)
end
|
50
|
+
|
51
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
# A writer for Traject::Indexer, that just writes out
|
4
|
+
# all the output as serialized text with #puts.
|
5
|
+
#
|
6
|
+
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
7
|
+
# concurrently), by wrapping write to actual output file in a mutex synchronize.
|
8
|
+
# This does not seem to affect performance much, as far as I could tell
|
9
|
+
# benchmarking.
|
10
|
+
#
|
11
|
+
# Output will be sent to `settings["output_file"]` string path, or else
|
12
|
+
# `settings["output_stream"]` (ruby IO object), or else stdout.
|
13
|
+
#
|
14
|
+
# This class can be sub-classed to write out different serialized
|
15
|
+
# representations -- subclasses will just override the #serialize
|
16
|
+
# method. For instance, see JsonWriter.
|
17
|
+
class Traject::LineWriter
|
18
|
+
attr_reader :settings
|
19
|
+
attr_reader :write_mutex, :output_file
|
20
|
+
|
21
|
+
def initialize(argSettings)
  @write_mutex = Mutex.new
  @settings    = argSettings

  # Resolve the destination eagerly, before any worker thread can call #put.
  @output_file = open_output_file
end
|
28
|
+
|
29
|
+
# Low-level write of one serialized record via #puts on the destination.
def _write(data)
  destination = output_file
  destination.puts(data)
end
|
32
|
+
|
33
|
+
|
34
|
+
# Default serialization: the context's output hash as-is.
def serialize(context)
  output = context.output_hash
  output
end
|
37
|
+
|
38
|
+
# Serializes one context and writes it out. Serialization happens outside
# the lock; only the write itself runs inside write_mutex.
def put(context)
  line = serialize(context)
  write_mutex.synchronize { _write(line) }
end
|
44
|
+
|
45
|
+
# Resolves the IO that records are written to and returns it:
#
# * settings["output_file"]   -- path, opened for writing as UTF-8
# * settings["output_stream"] -- used as given
# * otherwise                 -- $stdout
#
# If @output_file has already been set, that same object is returned.
# Previously a repeated call fell through and returned nil.
def open_output_file
  return @output_file if defined? @output_file

  if settings["output_file"]
    File.open(settings["output_file"], 'w:UTF-8')
  elsif settings["output_stream"]
    settings["output_stream"]
  else
    $stdout
  end
end
|
58
|
+
|
59
|
+
# Closes the destination, unless there is none or it is a terminal.
def close
  return if @output_file.nil? || @output_file.tty?

  @output_file.close
end
|
62
|
+
|
63
|
+
end
|
@@ -0,0 +1,223 @@
|
|
1
|
+
require 'traject/marc_extractor'
|
2
|
+
require 'traject/translation_map'
|
3
|
+
require 'traject/util'
|
4
|
+
require 'base64'
|
5
|
+
require 'json'
|
6
|
+
require 'marc/fastxmlwriter'
|
7
|
+
|
8
|
+
module Traject::Macros
|
9
|
+
# Some of these may be generic for any MARC, but we haven't done
|
10
|
+
# the analytical work to think it through, some of this is
|
11
|
+
# definitely specific to Marc21.
|
12
|
+
module Marc21
|
13
|
+
|
14
|
+
# A combo function macro that will extract data from marc according to a string
|
15
|
+
# field/substring specification, then apply various optional post-processing to it too.
|
16
|
+
#
|
17
|
+
# First argument is a string spec suitable for the MarcExtractor, see
|
18
|
+
# MarcExtractor::parse_string_spec.
|
19
|
+
#
|
20
|
+
# Second arg is optional options, including options valid on MarcExtractor.new,
|
21
|
+
# and others. By default, will de-duplicate results, but see :allow_duplicates
|
22
|
+
#
|
23
|
+
# * :first => true: take only first value
|
24
|
+
#
|
25
|
+
# * :translation_map => String: translate with named translation map looked up in load
|
26
|
+
# path, uses Traject::TranslationMap.new(translation_map_arg)
|
27
|
+
#
|
28
|
+
# * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
|
29
|
+
# have shown themselves useful with Marc, using Marc21.trim_punctuation
|
30
|
+
#
|
31
|
+
# * :default => String: if otherwise empty, add default value
|
32
|
+
#
|
33
|
+
# * :allow_duplicates => boolean, default false, if set to true then will avoid
|
34
|
+
# de-duplicating the result array (array.uniq!)
|
35
|
+
#
|
36
|
+
#
|
37
|
+
# Examples:
|
38
|
+
#
|
39
|
+
# to_field "title", extract_marc("245abcd", :trim_punctuation => true)
|
40
|
+
# to_field "id", extract_marc("001", :first => true)
|
41
|
+
# to_field "geo", extract_marc("040a", :separator => nil, :translation_map => "marc040")
|
42
|
+
# Returns a lambda (record, accumulator, context) that appends the values
# extracted for `spec` to the accumulator and then post-processes them with
# Marc21.apply_extraction_options.
#
# Raises RuntimeError if `options` contains any key that is not listed in
# EXTRACT_MARC_VALID_OPTIONS. The caller's options hash is never modified.
def extract_marc(spec, options = {})
  # Reject misspelled or illegal options (a string used instead of a symbol, etc).
  unknown_options = options.keys - EXTRACT_MARC_VALID_OPTIONS
  unless unknown_options.empty?
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown_options.join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
  end

  # Work on a private copy: :translation_map is removed below, and removing
  # it from the caller's hash would silently drop the translation map from
  # any later call that reuses the same hash.
  options = options.dup

  # The TranslationMap and MarcExtractor are built once here, so the lambda
  # reuses them instead of creating new ones on every execution.
  translation_map_arg = options.delete(:translation_map)
  translation_map = translation_map_arg ? Traject::TranslationMap.new(translation_map_arg) : nil

  extractor = Traject::MarcExtractor.new(spec, options)

  lambda do |record, accumulator, context|
    accumulator.concat extractor.extract(record)
    Marc21.apply_extraction_options(accumulator, options, translation_map)
  end
end
|
73
|
+
|
74
|
+
# Side-effect the accumulator with the options
|
75
|
+
def self.apply_extraction_options(accumulator, options, translation_map=nil)
  # Steps run in a fixed order, each mutating the accumulator in place:
  # first -> translate -> trim punctuation -> de-duplicate -> default.
  accumulator.replace(Array(accumulator[0])) if options[:first]

  translation_map.translate_array!(accumulator) if translation_map

  accumulator.collect! { |value| Marc21.trim_punctuation(value) } if options[:trim_punctuation]

  accumulator.uniq! unless options[:allow_duplicates]

  # The default is only added when nothing at all survived the steps above.
  accumulator << options[:default] if options[:default] && accumulator.empty?
end
|
101
|
+
|
102
|
+
|
103
|
+
# A list of symbols that are valid keys in the options hash
|
104
|
+
EXTRACT_MARC_VALID_OPTIONS = [
  :first,
  :trim_punctuation,
  :default,
  :allow_duplicates,
  :separator,
  :translation_map,
  :alternate_script
]
|
107
|
+
|
108
|
+
# Serializes complete marc record to a serialization format.
|
109
|
+
# required param :format,
|
110
|
+
# serialized_marc(:format => :binary)
|
111
|
+
#
|
112
|
+
# formats:
|
113
|
+
# [xml] MarcXML
|
114
|
+
# [json] marc-in-json (http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
|
115
|
+
# [binary] Standard ISO 2709 binary marc. By default WILL be base64-encoded,
|
116
|
+
# assumed destination a solr 'binary' field.
|
117
|
+
# * add option `:binary_escape => false` to do straight binary -- unclear
|
118
|
+
# what Solr's documented behavior is when you do this, and add a string
|
119
|
+
# with binary control chars to solr. May do different things in diff
|
120
|
+
# Solr versions, including raising exceptions.
|
121
|
+
# * add option `:allow_oversized => true` to pass that flag
|
122
|
+
# to the MARC::Writer. Oversized records will then still be
|
123
|
+
# serialized, with certain header bytes filled with ascii 0's
|
124
|
+
# -- technically illegal MARC, but can still be read by
|
125
|
+
# ruby MARC::Reader in permissive mode.
|
126
|
+
# Returns a lambda (record, accumulator, context) that appends one
# serialization of the whole record to the accumulator.
#
# options[:format] is required and must be binary, xml or json (symbol or
# string). Raises RuntimeError for any option key not listed in
# SERIALZED_MARC_VALID_OPTIONS, and ArgumentError for a missing or unknown
# format.
def serialized_marc(options)
  unknown_options = options.keys - SERIALZED_MARC_VALID_OPTIONS
  unless unknown_options.empty?
    # The message names this method; it used to be misspelled "seralized_marc".
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown_options.join(', ')}' in serialized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
  end

  format          = options[:format].to_s
  # Base64-escaping is on unless explicitly switched off with `false`.
  binary_escape   = (options[:binary_escape] != false)
  # Oversized records are only allowed when explicitly requested with `true`.
  allow_oversized = (options[:allow_oversized] == true)

  raise ArgumentError.new("Need :format => [binary|xml|json] arg") unless %w{binary xml json}.include?(format)

  lambda do |record, accumulator, context|
    case format
    when "binary"
      binary = MARC::Writer.encode(record, allow_oversized)
      binary = Base64.encode64(binary) if binary_escape
      accumulator << binary
    when "xml"
      accumulator << MARC::FastXMLWriter.encode(record)
    when "json"
      accumulator << JSON.dump(record.to_hash)
    end
  end
end
|
150
|
+
# Symbols accepted as keys in the options hash of serialized_marc.
SERIALZED_MARC_VALID_OPTIONS = [
  :format,
  :binary_escape,
  :allow_oversized
]
|
151
|
+
|
152
|
+
# Takes the whole record, by default from tags 100 to 899 inclusive,
|
153
|
+
# all subfields, and adds them to output. Subfields in a record are all
|
154
|
+
# joined by space by default.
|
155
|
+
#
|
156
|
+
# options
|
157
|
+
# [:from] default 100, only tags >= lexicographically
|
158
|
+
# [:to] default 899, only tags <= lexicographically
|
159
|
+
# [:separator] how to join subfields, default space, nil means don't join
|
160
|
+
#
|
161
|
+
# All fields in the from-to range must be MARC data fields (not control fields), or weirdness will result.
|
162
|
+
#
|
163
|
+
# Can always run this thing multiple times on the same field if you need
|
164
|
+
# non-contiguous ranges of fields.
|
165
|
+
# Returns a lambda (record, accumulator, context) that adds the subfield
# values of every field whose tag lies between :from and :to (string
# comparison, inclusive) to the accumulator.
#
# Raises RuntimeError for any option key not listed in
# EXTRACT_ALL_MARC_VALID_OPTIONS.
def extract_all_marc_values(options = {})
  unknown_options = options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS
  unless unknown_options.empty?
    # The message names this method; it used to say "extract_all_marc".
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown_options.join(', ')}' in extract_all_marc_values at #{Traject::Util.extract_caller_location(caller.first)}")
  end
  options = {:from => "100", :to => "899", :separator => ' '}.merge(options)

  # Looked up once here rather than again for every field of every record.
  from_tag, to_tag, separator = options.values_at(:from, :to, :separator)

  lambda do |record, accumulator, context|
    record.each do |field|
      next unless field.tag >= from_tag && field.tag <= to_tag
      subfield_values = field.subfields.collect {|sf| sf.value}
      next if subfield_values.empty?

      if separator
        # One joined string per field.
        accumulator << subfield_values.join(separator)
      else
        # A nil separator means every subfield value is added on its own.
        accumulator.concat subfield_values
      end
    end
  end
end
|
186
|
+
# Symbols accepted as keys in the options hash of extract_all_marc_values.
EXTRACT_ALL_MARC_VALID_OPTIONS = [
  :separator,
  :from,
  :to
]
|
187
|
+
|
188
|
+
|
189
|
+
# Trims punctuation mostly from end, and occasionally from beginning
|
190
|
+
# of string. Not nearly as complex logic as SolrMarc's version, just
|
191
|
+
# pretty simple.
|
192
|
+
#
|
193
|
+
# Removes
|
194
|
+
# * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
|
195
|
+
# * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
|
196
|
+
# * single square bracket characters if they are the start and/or end
|
197
|
+
# chars and there are no internal square brackets.
|
198
|
+
#
|
199
|
+
# Returns altered string, doesn't change original arg.
|
200
|
+
def self.trim_punctuation(str)
  # A nil (something went wrong upstream) is handed straight back.
  return str unless str

  # 1. One trailing comma, slash, semicolon, colon or space, together with
  #    the spaces around it.
  without_separator = str.sub(/ *[ ,\/;:] *\Z/, '')

  # 2. A trailing period, but only when three word characters come right
  #    before it.
  without_period = without_separator.sub(/( *\w\w\w)\. *\Z/, '\1')

  # 3. A leading and/or trailing square bracket, provided the text between
  #    them contains no square brackets of its own.
  without_period.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
end
|
216
|
+
|
217
|
+
def self.first!(arr)
  # Destructively cut away everything after index 0, leaving at most the
  # first element in place; slice! hands back whatever was removed.
  tail_size = arr.length
  arr.slice!(1, tail_size)
end
|
221
|
+
|
222
|
+
end
|
223
|
+
end
|