traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
require 'hashie'
|
|
2
|
+
require 'concurrent'
|
|
3
|
+
|
|
4
|
+
class Traject::Indexer
|
|
5
|
+
|
|
6
|
+
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
|
7
|
+
# to other objects Traject::Indexer interacts with.
|
|
8
|
+
#
|
|
9
|
+
# Enhanced with a few features from Hashie, to make it for
|
|
10
|
+
# instance string/symbol indifferent
|
|
11
|
+
#
|
|
12
|
+
# method #provide(key, value) is added, to do like settings[key] ||= value,
|
|
13
|
+
# set only if not already set (but unlike ||=, nil or false can count as already set)
|
|
14
|
+
#
|
|
15
|
+
# Also has an interesting 'defaults' system, meant to play along
|
|
16
|
+
# with configuration file 'provide' statements. There is a built-in hash of
|
|
17
|
+
# defaults, which will be lazily filled in if accessed and not yet
|
|
18
|
+
# set. (nil can count as set, though!). If they haven't been lazily
|
|
19
|
+
# set yet, then #provide will still fill them in. But you can also call
|
|
20
|
+
# fill_in_defaults! to fill all defaults in, if you know configuration
|
|
21
|
+
# files have all been loaded, and want to fill them in for inspection.
|
|
22
|
+
class Settings < Hash
|
|
23
|
+
include Hashie::Extensions::MergeInitializer # can init with hash
|
|
24
|
+
include Hashie::Extensions::IndifferentAccess
|
|
25
|
+
|
|
26
|
+
# Builds the settings hash, handing all arguments to the superclass
# initializer unchanged, then installs a default_proc so that reading a key
# which is missing from this hash but present in the class-level defaults
# stores that default here and returns it. Keys without a default read as nil
# and are not stored.
def initialize(*args)
  super
  self.default_proc = lambda do |settings, missing_key|
    builtin = self.class.defaults
    # The assignment is the value of the lambda, so the stored default is
    # what the failed lookup returns.
    builtin.has_key?(missing_key) ? (settings[missing_key] = builtin[missing_key]) : nil
  end
end
|
|
36
|
+
|
|
37
|
+
# a cautious store, which only saves key=value if
|
|
38
|
+
# there was not already a value for #key. Can be used
|
|
39
|
+
# to set settings that can be overridden on command line,
|
|
40
|
+
# or general first-set-wins settings.
|
|
41
|
+
# First-set-wins store: records +value+ under +key+ only when the key is not
# already present. Presence is checked with has_key?, so a key that currently
# holds nil or false still counts as set and is left untouched. Useful for
# settings that may already have been supplied earlier (e.g. on the command
# line) and must not be overwritten.
def provide(key, value)
  return nil if has_key?(key)

  store(key, value)
end
|
|
46
|
+
|
|
47
|
+
# reverse_merge copied from ActiveSupport, pretty straightforward,
|
|
48
|
+
# modified to make sure we return a Settings
|
|
49
|
+
# Non-destructive reverse merge (modelled on ActiveSupport's): builds a new
# instance of this object's class from +other_hash+ and merges self on top of
# it, so entries already in self win and +other_hash+ only fills the gaps.
# self is not modified.
def reverse_merge(other_hash)
  base = self.class.new(other_hash)
  base.merge(self)
end
|
|
52
|
+
|
|
53
|
+
# Destructive counterpart of #reverse_merge: computes the reverse merge with
# +other_hash+ and then replaces this hash's contents with the result.
def reverse_merge!(other_hash)
  merged = reverse_merge(other_hash)
  replace(merged)
end
|
|
56
|
+
|
|
57
|
+
# Eagerly copies every class-level default that has not been set yet into
# this hash (values already present are kept), by reverse-merging the
# class's defaults in place.
def fill_in_defaults!
  builtin_defaults = self.class.defaults
  reverse_merge!(builtin_defaults)
end
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Returns a freshly built Hash of the baseline default settings (the ones
# used before any JRuby-specific overrides are merged in). The processing
# thread pool size is obtained from default_processing_thread_pool.
def self.mri_defaults
  defaults = {}
  defaults["reader_class_name"]       = "Traject::MarcReader"
  defaults["writer_class_name"]       = "Traject::SolrJsonWriter"
  defaults["marc_source.type"]        = "binary"
  defaults["solrj_writer.batch_size"] = 200
  defaults["solrj_writer.thread_pool"] = 1
  defaults["processing_thread_pool"]  = self.default_processing_thread_pool
  defaults["log.batch_size.severity"] = "info"
  defaults
end
|
|
73
|
+
|
|
74
|
+
# Returns a freshly built Hash of the settings that override the baseline
# defaults when running under JRuby (see where JRUBY_VERSION is checked).
def self.jruby_defaults
  overrides = {}
  overrides['reader_class_name']        = "Traject::Marc4JReader"
  overrides['marc4j_reader.permissive'] = true
  overrides
end
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Returns the Hash of built-in default settings, computed once and cached.
#
# The baseline comes from mri_defaults; when JRUBY_VERSION is defined the
# entries from jruby_defaults are merged on top of it.
#
# The cache is a class-level instance variable rather than a @@class
# variable: class variables are shared by the whole inheritance tree, so a
# subclass overriding mri_defaults/jruby_defaults would either receive the
# parent's cached hash or overwrite it for the parent. With a class-level
# instance variable each class caches its own defaults.
def self.defaults
  @defaults ||= begin
    builtin = mri_defaults
    builtin.merge!(jruby_defaults) if defined?(JRUBY_VERSION)
    builtin
  end
end
|
|
91
|
+
|
|
92
|
+
# Same as a plain Hash#inspect of the contents, except that the value of any
# key ending in "password" is shown as "[hidden]", keeping credentials out
# of logs and debugging output. The settings themselves are not modified;
# a redacted copy is built and inspected.
def inspect
  redacted = each_with_object({}) do |(key, value), visible|
    visible[key] = (key =~ /password\Z/) ? "[hidden]" : value
  end
  redacted.inspect
end
|
|
99
|
+
|
|
100
|
+
protected
|
|
101
|
+
# Default size of the record-processing thread pool.
#
# On JRuby and Rubinius ("rbx") this is one less than the processor count
# reported by Concurrent.processor_count, but never less than 1; on any
# other engine it is 1.
#
# The engine is taken from the RUBY_ENGINE constant. The previous check read
# ENV["RUBY_ENGINE"], an environment variable that the interpreter itself
# does not set, so unless the launching environment happened to export it
# the multi-threaded default was never selected. The environment variable is
# still consulted as a fallback when the constant is not defined.
#
# Note: this is a singleton (class-level) method, so a bare `protected`
# placed before it does not restrict its visibility.
def self.default_processing_thread_pool
  engine = defined?(RUBY_ENGINE) ? RUBY_ENGINE : ENV["RUBY_ENGINE"]
  return 1 unless ["jruby", "rbx"].include?(engine)

  [1, Concurrent.processor_count - 1].max
end
|
|
108
|
+
|
|
109
|
+
end
|
|
110
|
+
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
require 'traject/line_writer'
|
|
3
|
+
|
|
4
|
+
# The JsonWriter outputs one JSON hash per record, separated by newlines.
|
|
5
|
+
#
|
|
6
|
+
# It's newline-delimited JSON, which should be suitable for being
|
|
7
|
+
# read by simple NDJ readers. (TODO: We have no checks right now to
|
|
8
|
+
# make sure the standard json serializers we're using don't put any
|
|
9
|
+
# internal newlines as whitespace in the json. Which would break NDJ
|
|
10
|
+
# reading. Should we?)
|
|
11
|
+
#
|
|
12
|
+
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
|
13
|
+
# concurrently), because output to file is wrapped in a mutex synchronize.
|
|
14
|
+
# This does not seem to affect performance much, as far as I could tell
|
|
15
|
+
# benchmarking.
|
|
16
|
+
#
|
|
17
|
+
# ## Settings
|
|
18
|
+
#
|
|
19
|
+
# * output_file A filename to send output; default will use stdout.
|
|
20
|
+
#
|
|
21
|
+
# * json_writer.pretty_print: [default: false]: Pretty-print (e.g., include newlines, indentation, etc.)
|
|
22
|
+
# each JSON record instead of just mashing it all together on one line. The default, no pretty-printing option
|
|
23
|
+
# produces one record per line, easy to process with another program.
|
|
24
|
+
#
|
|
25
|
+
# ## Example output
|
|
26
|
+
#
|
|
27
|
+
# Without pretty printing, you end up with something like this (just two records shown):
|
|
28
|
+
#
|
|
29
|
+
# {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
|
|
30
|
+
# {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
|
|
31
|
+
#
|
|
32
|
+
# ## Example configuration file
|
|
33
|
+
#
|
|
34
|
+
# require 'traject/json_writer'
|
|
35
|
+
#
|
|
36
|
+
# settings do
|
|
37
|
+
# provide "writer_class_name", "Traject::JsonWriter"
|
|
38
|
+
# provide "output_file", "out.json"
|
|
39
|
+
# end
|
|
40
|
+
class Traject::JsonWriter < Traject::LineWriter
|
|
41
|
+
|
|
42
|
+
# Turns the context's output_hash into a JSON string: multi-line,
# indented JSON when the "json_writer.pretty_print" setting is truthy,
# otherwise compact single-line JSON.
def serialize(context)
  record = context.output_hash
  return JSON.pretty_generate(record) if settings["json_writer.pretty_print"]

  JSON.generate(record)
end
|
|
50
|
+
|
|
51
|
+
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
require 'thread'
|
|
2
|
+
|
|
3
|
+
# A writer for Traject::Indexer, that just writes out
|
|
4
|
+
# all the output as serialized text with #puts.
|
|
5
|
+
#
|
|
6
|
+
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
|
7
|
+
# concurrently), by wrapping write to actual output file in a mutex synchronize.
|
|
8
|
+
# This does not seem to affect performance much, as far as I could tell
|
|
9
|
+
# benchmarking.
|
|
10
|
+
#
|
|
11
|
+
# Output will be sent to `settings["output_file"]` string path, or else
|
|
12
|
+
# `settings["output_stream"]` (ruby IO object), or else stdout.
|
|
13
|
+
#
|
|
14
|
+
# This class can be sub-classed to write out different serialized
|
|
15
|
+
# representations -- subclasses will just override the #serialize
|
|
16
|
+
# method. For instance, see JsonWriter.
|
|
17
|
+
class Traject::LineWriter
|
|
18
|
+
attr_reader :settings
|
|
19
|
+
attr_reader :write_mutex, :output_file
|
|
20
|
+
|
|
21
|
+
# Stores the given settings, creates the mutex that #put uses to serialize
# writes, and resolves the output destination immediately (rather than on
# first write) so that no lazy initialization happens later from worker
# threads.
def initialize(argSettings)
  @write_mutex = Mutex.new
  @settings    = argSettings

  # open_output_file reads the settings, so it must come after @settings is set.
  @output_file = open_output_file
end
|
|
28
|
+
|
|
29
|
+
# Low-level write: emits +data+ to the output destination with #puts.
# Performs no locking itself; #put wraps the call in the write mutex.
def _write(data)
  destination = output_file
  destination.puts(data)
end
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Produces what gets written for one record. This base implementation hands
# back the context's output_hash as-is; subclasses override it to return a
# specific serialized representation.
def serialize(context)
  record = context.output_hash
  record
end
|
|
37
|
+
|
|
38
|
+
# Writes one record: serializes the context first (outside the lock), then
# performs the actual write while holding the write mutex so that only one
# thread at a time emits to the output.
def put(context)
  payload = serialize(context)
  write_mutex.synchronize { _write(payload) }
end
|
|
44
|
+
|
|
45
|
+
# Resolves the destination that output is written to.
#
# If @output_file has already been set, that object is returned. (The
# earlier version skipped its body in that case and fell through to
# returning an unassigned local, i.e. nil, instead of the open handle.)
#
# Otherwise the destination is chosen from the settings, in this order:
# * settings["output_file"]   - a path, opened for writing as UTF-8
# * settings["output_stream"] - an already-open IO-like object, used as-is
# * $stdout                   - when neither setting is present
#
# This method does not assign @output_file itself; #initialize stores the
# returned value.
def open_output_file
  return @output_file if defined?(@output_file)

  if settings["output_file"]
    File.open(settings["output_file"], 'w:UTF-8')
  elsif settings["output_stream"]
    settings["output_stream"]
  else
    $stdout
  end
end
|
|
58
|
+
|
|
59
|
+
# Finishes with the output destination.
#
# * Nothing happens when no destination was ever opened.
# * The process's standard streams ($stdout/$stderr, STDOUT/STDERR) are
#   flushed but never closed. Previously only a tty check protected them, so
#   running with stdout redirected to a file or pipe closed stdout for the
#   rest of the process.
# * A terminal is left open, as before. The tty check is skipped for
#   stream objects that do not implement #tty?, instead of raising.
# * Anything else (a file opened from settings["output_file"], or a
#   caller-supplied stream) is closed, as before.
def close
  stream = @output_file
  return if stream.nil?

  if [$stdout, $stderr, STDOUT, STDERR].any? { |std| std.equal?(stream) }
    stream.flush
    return
  end

  return if stream.respond_to?(:tty?) && stream.tty?

  stream.close
end
|
|
62
|
+
|
|
63
|
+
end
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
require 'traject/marc_extractor'
|
|
2
|
+
require 'traject/translation_map'
|
|
3
|
+
require 'traject/util'
|
|
4
|
+
require 'base64'
|
|
5
|
+
require 'json'
|
|
6
|
+
require 'marc/fastxmlwriter'
|
|
7
|
+
|
|
8
|
+
module Traject::Macros
|
|
9
|
+
# Some of these may be generic for any MARC, but we haven't done
|
|
10
|
+
# the analytical work to think it through, some of this is
|
|
11
|
+
# definitely specific to Marc21.
|
|
12
|
+
module Marc21
|
|
13
|
+
|
|
14
|
+
# A combo function macro that will extract data from marc according to a string
|
|
15
|
+
# field/substring specification, then apply various optional post-processing to it too.
|
|
16
|
+
#
|
|
17
|
+
# First argument is a string spec suitable for the MarcExtractor, see
|
|
18
|
+
# MarcExtractor::parse_string_spec.
|
|
19
|
+
#
|
|
20
|
+
# Second arg is optional options, including options valid on MarcExtractor.new,
|
|
21
|
+
# and others. By default, will de-duplicate results, but see :allow_duplicates
|
|
22
|
+
#
|
|
23
|
+
# * :first => true: take only first value
|
|
24
|
+
#
|
|
25
|
+
# * :translation_map => String: translate with named translation map looked up in load
|
|
26
|
+
#   path, uses Traject::TranslationMap.new(translation_map_arg)
|
|
27
|
+
#
|
|
28
|
+
# * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
|
|
29
|
+
# have shown themselves useful with Marc, using Marc21.trim_punctuation
|
|
30
|
+
#
|
|
31
|
+
# * :default => String: if otherwise empty, add default value
|
|
32
|
+
#
|
|
33
|
+
# * :allow_duplicates => boolean, default false, if set to true then will avoid
|
|
34
|
+
# de-duplicating the result array (array.uniq!)
|
|
35
|
+
#
|
|
36
|
+
#
|
|
37
|
+
# Examples:
|
|
38
|
+
#
|
|
39
|
+
# to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
|
|
40
|
+
# to_field("id"), extract_marc("001", :first => true)
|
|
41
|
+
# to_field("geo"), extract_marc("040a", :separator => nil, :translation_map => "marc040")
|
|
42
|
+
# Builds the indexing lambda for a MARC extraction.
#
# spec    - string specification handed to Traject::MarcExtractor.new
# options - Hash of symbol keys; every key must be listed in
#           EXTRACT_MARC_VALID_OPTIONS.
#
# Returns a lambda taking (record, accumulator, context) that appends the
# extracted values to the accumulator and then post-processes it with
# Marc21.apply_extraction_options.
#
# Raises RuntimeError naming the offending keys when +options+ contains a
# key that is not valid (misspelled, or a string instead of a symbol, etc.).
#
# The caller's +options+ hash is never modified: the :translation_map entry
# is removed from a private copy. Previously it was deleted from the hash
# that was passed in, so an options hash reused for a second extract_marc
# call silently lost its translation map.
def extract_marc(spec, options = {})
  unknown = options.keys - EXTRACT_MARC_VALID_OPTIONS
  unless unknown.empty?
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown.join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
  end

  # Work on a copy so the caller's hash keeps its :translation_map key.
  options = options.dup

  # The TranslationMap and the MarcExtractor are created once, here, so the
  # lambda reuses them instead of building new ones for every record.
  translation_map_arg = options.delete(:translation_map)
  translation_map = translation_map_arg ? Traject::TranslationMap.new(translation_map_arg) : nil

  extractor = Traject::MarcExtractor.new(spec, options)

  lambda do |record, accumulator, context|
    accumulator.concat extractor.extract(record)
    Marc21.apply_extraction_options(accumulator, options, translation_map)
  end
end
|
|
73
|
+
|
|
74
|
+
# Side-effect the accumulator with the options
|
|
75
|
+
# Post-processes +accumulator+ in place according to +options+, in this
# fixed order:
#
# 1. :first            - keep only the first value
# 2. translation_map   - translate the values through the given map
# 3. :trim_punctuation - run every value through Marc21.trim_punctuation
# 4. de-duplicate, unless :allow_duplicates is set
# 5. :default          - if nothing is left, add the default value
def self.apply_extraction_options(accumulator, options, translation_map=nil)
  accumulator.replace(Array(accumulator.first)) if options[:first]

  translation_map.translate_array!(accumulator) if translation_map

  accumulator.map! { |value| Marc21.trim_punctuation(value) } if options[:trim_punctuation]

  accumulator.uniq! unless options[:allow_duplicates]

  fallback = options[:default]
  accumulator << fallback if fallback && accumulator.empty?
end
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# A list of symbols that are valid keys in the options hash
|
|
104
|
+
EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
|
|
105
|
+
:allow_duplicates, :separator, :translation_map,
|
|
106
|
+
:alternate_script]
|
|
107
|
+
|
|
108
|
+
# Serializes complete marc record to a serialization format.
|
|
109
|
+
# required param :format,
|
|
110
|
+
#     serialized_marc(:format => :binary)
|
|
111
|
+
#
|
|
112
|
+
# formats:
|
|
113
|
+
# [xml] MarcXML
|
|
114
|
+
# [json] marc-in-json (http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
|
|
115
|
+
# [binary] Standard ISO 2709 binary marc. By default WILL be base64-encoded,
|
|
116
|
+
# assumed destination a solr 'binary' field.
|
|
117
|
+
# * add option `:binary_escape => false` to do straight binary -- unclear
|
|
118
|
+
# what Solr's documented behavior is when you do this, and add a string
|
|
119
|
+
# with binary control chars to solr. May do different things in diff
|
|
120
|
+
# Solr versions, including raising exceptions.
|
|
121
|
+
#     * add option `:allow_oversized => true` to pass that flag
|
|
122
|
+
# to the MARC::Writer. Oversized records will then still be
|
|
123
|
+
# serialized, with certain header bytes filled with ascii 0's
|
|
124
|
+
# -- technically illegal MARC, but can still be read by
|
|
125
|
+
# ruby MARC::Reader in permissive mode.
|
|
126
|
+
# Builds an indexing lambda that serializes the whole record and appends
# the result to the accumulator.
#
# options - Hash; every key must be listed in SERIALZED_MARC_VALID_OPTIONS:
#   :format          - required; :binary, :xml or :json (compared as a string)
#   :binary_escape   - binary only; base64-encodes the output unless
#                      explicitly false
#   :allow_oversized - binary only; passed to MARC::Writer.encode, true only
#                      when explicitly true
#
# Raises RuntimeError when an unknown option key is given, and ArgumentError
# when :format is missing or not one of the supported values.
#
# The unknown-option message now names this method correctly (it previously
# said "seralized_marc").
def serialized_marc(options)
  unknown = options.keys - SERIALZED_MARC_VALID_OPTIONS
  unless unknown.empty?
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown.join(', ')}' in serialized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
  end

  format          = options[:format].to_s
  binary_escape   = (options[:binary_escape] != false)
  allow_oversized = (options[:allow_oversized] == true)

  raise ArgumentError.new("Need :format => [binary|xml|json] arg") unless %w{binary xml json}.include?(format)

  lambda do |record, accumulator, context|
    case format
    when "binary"
      binary = MARC::Writer.encode(record, allow_oversized)
      binary = Base64.encode64(binary) if binary_escape
      accumulator << binary
    when "xml"
      accumulator << MARC::FastXMLWriter.encode(record)
    when "json"
      accumulator << JSON.dump(record.to_hash)
    end
  end
end
|
|
150
|
+
SERIALZED_MARC_VALID_OPTIONS = [:format, :binary_escape, :allow_oversized]
|
|
151
|
+
|
|
152
|
+
# Takes the whole record, by default from tags 100 to 899 inclusive,
|
|
153
|
+
# all subfields, and adds them to output. Subfields in a record are all
|
|
154
|
+
# joined by space by default.
|
|
155
|
+
#
|
|
156
|
+
# options
|
|
157
|
+
# [:from] default 100, only tags >= lexicographically
|
|
158
|
+
# [:to] default 899, only tags <= lexicographically
|
|
159
|
+
# [:separator] how to join subfields, default space, nil means don't join
|
|
160
|
+
#
|
|
161
|
+
# All fields in from-to must be marc DATA (not control fields), or weirdness
|
|
162
|
+
#
|
|
163
|
+
# Can always run this thing multiple times on the same field if you need
|
|
164
|
+
# non-contiguous ranges of fields.
|
|
165
|
+
# Builds an indexing lambda that collects the subfield values of every
# field whose tag lies in a range, compared lexicographically as strings.
#
# options - Hash; every key must be listed in EXTRACT_ALL_MARC_VALID_OPTIONS:
#   :from      - lowest tag included, default "100"
#   :to        - highest tag included, default "899"
#   :separator - string used to join the subfield values of one field into a
#                single output value, default a single space; nil adds each
#                subfield value separately
#
# :from and :to are converted to strings once, up front, so that integer
# bounds such as :from => 100 work; previously they were compared against
# the string tag directly, which raised for every field. Fields without any
# subfield values contribute nothing.
#
# Raises RuntimeError when an unknown option key is given.
def extract_all_marc_values(options = {})
  unknown = options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS
  unless unknown.empty?
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown.join(', ')}' in extract_all_marc_values at #{Traject::Util.extract_caller_location(caller.first)}")
  end
  options = {:from => "100", :to => "899", :separator => ' '}.merge(options)

  # Resolved once here rather than looked up again for every field.
  from_tag  = options[:from].to_s
  to_tag    = options[:to].to_s
  separator = options[:separator]

  lambda do |record, accumulator, context|
    record.each do |field|
      next unless field.tag >= from_tag && field.tag <= to_tag

      subfield_values = field.subfields.collect { |sf| sf.value }
      next if subfield_values.empty?

      if separator
        accumulator << subfield_values.join(separator)
      else
        accumulator.concat subfield_values
      end
    end
  end
end
|
|
186
|
+
EXTRACT_ALL_MARC_VALID_OPTIONS = [:separator, :from, :to]
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# Trims punctuation mostly from end, and occasionally from beginning
|
|
190
|
+
# of string. Not nearly as complex logic as SolrMarc's version, just
|
|
191
|
+
# pretty simple.
|
|
192
|
+
#
|
|
193
|
+
# Removes
|
|
194
|
+
# * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
|
|
195
|
+
# * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
|
|
196
|
+
# * single square bracket characters if they are the start and/or end
|
|
197
|
+
# chars and there are no internal square brackets.
|
|
198
|
+
#
|
|
199
|
+
# Returns altered string, doesn't change original arg.
|
|
200
|
+
# Returns a copy of +str+ with common MARC punctuation trimmed; the argument
# itself is not changed. A nil (or false) argument is handed straight back.
#
# Three substitutions are applied, each at most once and in this order:
# 1. one trailing space, comma, slash, semicolon or colon, together with
#    the spaces around it
# 2. a trailing period (and spaces after it) when it directly follows three
#    word characters
# 3. a single opening and/or closing square bracket at the very start/end,
#    provided the text between them contains no square brackets
def self.trim_punctuation(str)
  return str if !str

  without_trailing = str.sub(/ *[ ,\/;:] *\Z/, '')
  without_period   = without_trailing.sub(/( *\w\w\w)\. *\Z/, '\1')
  without_period.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
end
|
|
216
|
+
|
|
217
|
+
# Truncates +arr+ in place so that at most its first element remains, by
# slicing off everything from index 1 onward. The return value is whatever
# slice! returns, i.e. the removed elements rather than the shortened array.
def self.first!(arr)
  arr.slice!(1, arr.size)
end
|
|
221
|
+
|
|
222
|
+
end
|
|
223
|
+
end
|