traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
data/lib/traject/indexer/settings.rb
@@ -0,0 +1,110 @@
+ require 'hashie'
+ require 'concurrent'
+
+ class Traject::Indexer
+
+   # A Hash of settings for a Traject::Indexer, which also ends up passed along
+   # to other objects Traject::Indexer interacts with.
+   #
+   # Enhanced with a few features from Hashie, to make it, for
+   # instance, string/symbol indifferent.
+   #
+   # A #provide(key, value) method is added, which works like settings[key] ||= value:
+   # it sets the key only if it is not already set (but unlike ||=, nil or false count as already set).
+   #
+   # Also has an interesting 'defaults' system, meant to play along
+   # with configuration file 'provide' statements. There is a built-in hash of
+   # defaults, which will be lazily filled in if accessed and not yet
+   # set. (nil can count as set, though!) If they haven't been lazily
+   # set yet, then #provide will still fill them in. But you can also call
+   # fill_in_defaults! to fill all defaults in, if you know configuration
+   # files have all been loaded, and want to fill them in for inspection.
+   class Settings < Hash
+     include Hashie::Extensions::MergeInitializer # can init with hash
+     include Hashie::Extensions::IndifferentAccess
+
+     def initialize(*args)
+       super
+       self.default_proc = lambda do |hash, key|
+         if self.class.defaults.has_key?(key)
+           return hash[key] = self.class.defaults[key]
+         else
+           return nil
+         end
+       end
+     end
+
+     # A cautious store, which only saves key=value if
+     # there was not already a value for #key. Can be used
+     # to set settings that can be overridden on the command line,
+     # or general first-set-wins settings.
+     def provide(key, value)
+       unless has_key? key
+         store(key, value)
+       end
+     end
+
+     # reverse_merge copied from ActiveSupport, pretty straightforward,
+     # modified to make sure we return a Settings
+     def reverse_merge(other_hash)
+       self.class.new(other_hash).merge(self)
+     end
+
+     def reverse_merge!(other_hash)
+       replace(reverse_merge(other_hash))
+     end
+
+     def fill_in_defaults!
+       self.reverse_merge!(self.class.defaults)
+     end
+
+
+     def self.mri_defaults
+       {
+         "reader_class_name" => "Traject::MarcReader",
+         "writer_class_name" => "Traject::SolrJsonWriter",
+         "marc_source.type" => "binary",
+         "solrj_writer.batch_size" => 200,
+         "solrj_writer.thread_pool" => 1,
+         "processing_thread_pool" => self.default_processing_thread_pool,
+         "log.batch_size.severity" => "info"
+       }
+     end
+
+     def self.jruby_defaults
+       {
+         'reader_class_name' => "Traject::Marc4JReader",
+         'marc4j_reader.permissive' => true
+       }
+     end
+
+
+     def self.defaults
+       return @@defaults if defined? @@defaults
+       default_settings = self.mri_defaults
+       if defined? JRUBY_VERSION
+         default_settings.merge! self.jruby_defaults
+       end
+
+       @@defaults = default_settings
+     end
+
+     def inspect
+       # Keep any key ending in password out of the inspect
+       self.inject({}) do |hash, (key, value)|
+         hash[key] = (key =~ /password\Z/) ? "[hidden]" : value
+         hash
+       end.inspect
+     end
+
+     protected
+
+     def self.default_processing_thread_pool
+       if ["jruby", "rbx"].include? ENV["RUBY_ENGINE"]
+         [1, Concurrent.processor_count - 1].max
+       else
+         1
+       end
+     end
+
+   end
+ end
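A minimal usage sketch for the Settings class above (not part of the package; setting keys beyond the built-in defaults are illustrative). It shows #provide's first-set-wins behavior, string/symbol indifference, and the lazy defaults:

    require 'traject'

    settings = Traject::Indexer::Settings.new("solr.url" => "http://localhost:8983/solr/core")

    # provide only stores a value when the key is not already present
    settings.provide("solr.url", "http://example.org/ignored")  # first-set wins; value unchanged
    settings.provide("log.level", "debug")                      # not set yet, so this one sticks

    settings["solr.url"]    # => "http://localhost:8983/solr/core"
    settings[:"log.level"]  # => "debug" (string/symbol indifferent)

    # Built-in defaults are filled in lazily on access:
    settings["reader_class_name"]
    # => "Traject::MarcReader" on MRI, "Traject::Marc4JReader" under JRuby (this -java gem)

    settings.fill_in_defaults!  # force all remaining defaults in, e.g. before inspecting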
data/lib/traject/json_writer.rb
@@ -0,0 +1,51 @@
+ require 'json'
+ require 'traject/line_writer'
+
+ # The JsonWriter outputs one JSON hash per record, separated by newlines.
+ #
+ # It's newline-delimited JSON, which should be suitable for being
+ # read by simple NDJ readers. (TODO: We have no checks right now to
+ # make sure the standard json serializers we're using don't put any
+ # internal newlines as whitespace in the json. Which would break NDJ
+ # reading. Should we?)
+ #
+ # Should be thread-safe (ie, multiple worker threads can be calling #put
+ # concurrently), because output to the file is wrapped in a mutex synchronize.
+ # This does not seem to affect performance much, as far as I could tell
+ # benchmarking.
+ #
+ # ## Settings
+ #
+ # * output_file: A filename to send output to; default will use stdout.
+ #
+ # * json_writer.pretty_print: [default: false] Pretty-print (e.g., include newlines, indentation, etc.)
+ #   each JSON record instead of just mashing it all together on one line. The default, no-pretty-printing
+ #   option produces one record per line, easy to process with another program.
+ #
+ # ## Example output
+ #
+ # Without pretty printing, you end up with something like this (just two records shown):
+ #
+ #     {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
+ #     {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
+ #
+ # ## Example configuration file
+ #
+ #     require 'traject/json_writer'
+ #
+ #     settings do
+ #       provide "writer_class_name", "Traject::JsonWriter"
+ #       provide "output_file", "out.json"
+ #     end
+ class Traject::JsonWriter < Traject::LineWriter
+
+   def serialize(context)
+     hash = context.output_hash
+     if settings["json_writer.pretty_print"]
+       JSON.pretty_generate(hash)
+     else
+       JSON.generate(hash)
+     end
+   end
+
+ end
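The writer can also be driven directly, outside a full indexing run. A rough sketch using a hypothetical stand-in for the context object (JsonWriter only needs #output_hash from it):

    require 'traject'
    require 'traject/json_writer'
    require 'stringio'

    # Hypothetical stand-in: the writer only calls #output_hash on the context.
    FakeContext = Struct.new(:output_hash)

    out      = StringIO.new
    settings = Traject::Indexer::Settings.new("output_stream" => out)
    writer   = Traject::JsonWriter.new(settings)

    writer.put FakeContext.new("id" => ["000001118"], "title" => ["Psychiatry as a behavioral science."])
    writer.close

    out.string  # one compact JSON object per line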
data/lib/traject/line_writer.rb
@@ -0,0 +1,63 @@
+ require 'thread'
+
+ # A writer for Traject::Indexer that just writes out
+ # all the output as serialized text with #puts.
+ #
+ # Should be thread-safe (ie, multiple worker threads can be calling #put
+ # concurrently), by wrapping writes to the actual output file in a mutex synchronize.
+ # This does not seem to affect performance much, as far as I could tell
+ # benchmarking.
+ #
+ # Output will be sent to the `settings["output_file"]` string path, or else
+ # `settings["output_stream"]` (a ruby IO object), or else stdout.
+ #
+ # This class can be sub-classed to write out different serialized
+ # representations -- subclasses will just override the #serialize
+ # method. For instance, see JsonWriter.
+ class Traject::LineWriter
+   attr_reader :settings
+   attr_reader :write_mutex, :output_file
+
+   def initialize(argSettings)
+     @settings    = argSettings
+     @write_mutex = Mutex.new
+
+     # trigger lazy loading now for thread-safety
+     @output_file = open_output_file
+   end
+
+   def _write(data)
+     output_file.puts(data)
+   end
+
+
+   def serialize(context)
+     context.output_hash
+   end
+
+   def put(context)
+     serialized = serialize(context)
+     write_mutex.synchronize do
+       _write(serialized)
+     end
+   end
+
+   def open_output_file
+     unless defined? @output_file
+       of =
+         if settings["output_file"]
+           File.open(settings["output_file"], 'w:UTF-8')
+         elsif settings["output_stream"]
+           settings["output_stream"]
+         else
+           $stdout
+         end
+     end
+     return of
+   end
+
+   def close
+     @output_file.close unless (@output_file.nil? || @output_file.tty?)
+   end
+
+ end
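Since subclasses only need to override #serialize, a minimal hypothetical subclass (not part of the gem; field names are illustrative) that writes one tab-separated id/title line per record looks like this:

    require 'traject/line_writer'

    # Hypothetical example subclass, not shipped with traject.
    class TabbedIdTitleWriter < Traject::LineWriter
      def serialize(context)
        h = context.output_hash
        [Array(h["id"]).first, Array(h["title"]).first].join("\t")
      end
    end

    # Then, in a traject configuration file:
    #
    #   settings do
    #     provide "writer_class_name", "TabbedIdTitleWriter"
    #     provide "output_file", "ids_and_titles.tsv"
    #   end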
data/lib/traject/macros/basic.rb
@@ -0,0 +1,9 @@
+ module Traject::Macros
+   module Basic
+     def literal(literal)
+       lambda do |record, accumulator, context|
+         accumulator << literal
+       end
+     end
+   end
+ end
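The literal macro above simply appends a constant value to the accumulator for every record. A one-line configuration-file sketch (the field name is illustrative):

    # In a traject configuration file:
    to_field "source_institution", literal("Our University Library")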
data/lib/traject/macros/marc21.rb
@@ -0,0 +1,223 @@
+ require 'traject/marc_extractor'
+ require 'traject/translation_map'
+ require 'traject/util'
+ require 'base64'
+ require 'json'
+ require 'marc/fastxmlwriter'
+
+ module Traject::Macros
+   # Some of these may be generic for any MARC, but we haven't done
+   # the analytical work to think it through; some of this is
+   # definitely specific to Marc21.
+   module Marc21
+
+     # A combo function macro that will extract data from marc according to a string
+     # field/substring specification, then apply various optional post-processing to it too.
+     #
+     # First argument is a string spec suitable for the MarcExtractor, see
+     # MarcExtractor::parse_string_spec.
+     #
+     # Second arg is an optional options hash, including options valid on MarcExtractor.new,
+     # and others. By default, will de-duplicate results, but see :allow_duplicates.
+     #
+     # * :first => true: take only the first value
+     #
+     # * :translation_map => String: translate with the named translation map looked up in the
+     #   load path, uses Traject::TranslationMap.new(translation_map_arg)
+     #
+     # * :trim_punctuation => true: trims leading/trailing punctuation using standard algorithms that
+     #   have shown themselves useful with Marc, using Marc21.trim_punctuation
+     #
+     # * :default => String: if otherwise empty, add default value
+     #
+     # * :allow_duplicates => boolean, default false; if set to true, skips
+     #   de-duplicating the result array (array.uniq!)
+     #
+     #
+     # Examples:
+     #
+     #     to_field "title", extract_marc("245abcd", :trim_punctuation => true)
+     #     to_field "id",    extract_marc("001", :first => true)
+     #     to_field "geo",   extract_marc("040a", :separator => nil, :translation_map => "marc040")
+     def extract_marc(spec, options = {})
+
+       # Raise an error if there are any invalid options, indicating a
+       # misspelled or illegal option, using a string instead of a symbol, etc.
+       unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
+         raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - EXTRACT_MARC_VALID_OPTIONS).join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+       end
+
+       # We create the TranslationMap and the MarcExtractor here
+       # on load, so the lambda can just refer to already created
+       # ones, and not have to create a new one per-execution.
+       #
+       # Benchmarking shows, for MarcExtractor at least, there is a
+       # significant performance advantage.
+
+       if translation_map_arg = options.delete(:translation_map)
+         translation_map = Traject::TranslationMap.new(translation_map_arg)
+       else
+         translation_map = nil
+       end
+
+       extractor = Traject::MarcExtractor.new(spec, options)
+
+       lambda do |record, accumulator, context|
+         accumulator.concat extractor.extract(record)
+         Marc21.apply_extraction_options(accumulator, options, translation_map)
+       end
+     end
+
+     # Side-effect the accumulator with the options
+     def self.apply_extraction_options(accumulator, options, translation_map=nil)
+       only_first       = options[:first]
+       trim_punctuation = options[:trim_punctuation]
+       default_value    = options[:default]
+       allow_duplicates = options[:allow_duplicates]
+
+       if only_first
+         accumulator.replace Array(accumulator[0])
+       end
+
+       if translation_map
+         translation_map.translate_array! accumulator
+       end
+
+       if trim_punctuation
+         accumulator.collect! {|s| Marc21.trim_punctuation(s)}
+       end
+
+       unless allow_duplicates
+         accumulator.uniq!
+       end
+
+       if default_value && accumulator.empty?
+         accumulator << default_value
+       end
+     end
+
+
+     # A list of symbols that are valid keys in the options hash
+     EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
+                                   :allow_duplicates, :separator, :translation_map,
+                                   :alternate_script]
+
+     # Serializes the complete marc record to a serialization format.
+     # Requires a :format param:
+     #
+     #     serialized_marc(:format => :binary)
+     #
+     # formats:
+     # [xml]    MarcXML
+     # [json]   marc-in-json (http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
+     # [binary] Standard ISO 2709 binary marc. By default WILL be base64-encoded,
+     #          assumed destination is a solr 'binary' field.
+     #          * add option `:binary_escape => false` to do straight binary -- unclear
+     #            what Solr's documented behavior is when you do this, and add a string
+     #            with binary control chars to solr. May do different things in diff
+     #            Solr versions, including raising exceptions.
+     #          * add option `:allow_oversized => true` to pass that flag
+     #            to the MARC::Writer. Oversized records will then still be
+     #            serialized, with certain header bytes filled with ascii 0's
+     #            -- technically illegal MARC, but can still be read by
+     #            ruby MARC::Reader in permissive mode.
+     def serialized_marc(options)
+       unless (options.keys - SERIALZED_MARC_VALID_OPTIONS).empty?
+         raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - SERIALZED_MARC_VALID_OPTIONS).join(', ')}' in serialized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+       end
+
+       format          = options[:format].to_s
+       binary_escape   = (options[:binary_escape] != false)
+       allow_oversized = (options[:allow_oversized] == true)
+
+       raise ArgumentError.new("Need :format => [binary|xml|json] arg") unless %w{binary xml json}.include?(format)
+
+       lambda do |record, accumulator, context|
+         case format
+         when "binary"
+           binary = MARC::Writer.encode(record, allow_oversized)
+           binary = Base64.encode64(binary) if binary_escape
+           accumulator << binary
+         when "xml"
+           accumulator << MARC::FastXMLWriter.encode(record)
+         when "json"
+           accumulator << JSON.dump(record.to_hash)
+         end
+       end
+     end
+     SERIALZED_MARC_VALID_OPTIONS = [:format, :binary_escape, :allow_oversized]
+
+     # Takes the whole record, by default from tags 100 to 899 inclusive,
+     # all subfields, and adds them to output. Subfields within a field are
+     # joined by a space by default.
+     #
+     # options
+     # [:from]      default 100, only tags >= lexicographically
+     # [:to]        default 899, only tags <= lexicographically
+     # [:separator] how to join subfields, default space, nil means don't join
+     #
+     # All fields in from-to must be marc DATA fields (not control fields), or weirdness results.
+     #
+     # You can always run this thing multiple times on the same field if you need
+     # non-contiguous ranges of fields.
+     def extract_all_marc_values(options = {})
+       unless (options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS).empty?
+         raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS).join(', ')}' in extract_all_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+       end
+       options = {:from => "100", :to => "899", :separator => ' '}.merge(options)
+
+       lambda do |record, accumulator, context|
+         record.each do |field|
+           next unless field.tag >= options[:from] && field.tag <= options[:to]
+           subfield_values = field.subfields.collect {|sf| sf.value}
+           next unless subfield_values.length > 0
+
+           if options[:separator]
+             accumulator << subfield_values.join(options[:separator])
+           else
+             accumulator.concat subfield_values
+           end
+         end
+       end
+
+     end
+     EXTRACT_ALL_MARC_VALID_OPTIONS = [:separator, :from, :to]
+
+
+     # Trims punctuation mostly from the end, and occasionally from the beginning
+     # of the string. Not nearly as complex logic as SolrMarc's version, just
+     # pretty simple.
+     #
+     # Removes
+     # * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
+     # * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
+     # * single square bracket characters if they are the start and/or end
+     #   chars and there are no internal square brackets.
+     #
+     # Returns the altered string; doesn't change the original arg.
+     def self.trim_punctuation(str)
+       # If something went wrong and we got a nil, just return it
+       return str unless str
+
+       # trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
+       str = str.sub(/ *[ ,\/;:] *\Z/, '')
+
+       # trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
+       str = str.sub(/( *\w\w\w)\. *\Z/, '\1')
+
+       # single square bracket characters if they are the start and/or end
+       # chars and there are no internal square brackets.
+       str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
+       return str
+     end
+
+     def self.first!(arr)
+       # kind of esoteric, but slice used this way does a mutating "keep only the first element", yep
+       arr.slice!(1, arr.length)
+     end
+
+   end
+ end
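Pulled together, a hedged configuration-file sketch exercising the macros above. The Solr field names are illustrative; "marc_languages" refers to one of the translation maps bundled under data/lib/translation_maps/ in the file list:

    # traject configuration file sketch (field names are examples only)
    to_field "id",            extract_marc("001", :first => true)
    to_field "title_display", extract_marc("245abk", :trim_punctuation => true)
    to_field "language",      extract_marc("008[35-37]", :translation_map => "marc_languages")
    to_field "allfields",     extract_all_marc_values(:from => "100", :to => "899")
    to_field "marc_json",     serialized_marc(:format => :json)

    # trim_punctuation can also be called directly:
    Traject::Macros::Marc21.trim_punctuation("Psychiatry as a behavioral science /")
    # => "Psychiatry as a behavioral science"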