traject 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,110 @@
1
+ require 'hashie'
2
+ require 'concurrent'
3
+
4
+ class Traject::Indexer
5
+
6
+ # A Hash of settings for a Traject::Indexer, which also ends up passed along
7
+ # to other objects Traject::Indexer interacts with.
8
+ #
9
+ # Enhanced with a few features from Hashie, to make it for
10
+ # instance string/symbol indifferent
11
+ #
12
+ # method #provide(key, value) is added, to do like settings[key] ||= value,
13
+ # set only if not already set (but unlike ||=, nil or false can count as already set)
14
+ #
15
+ # Also has an interesting 'defaults' system, meant to play along
16
+ # with configuration file 'provide' statements. There is a built-in hash of
17
+ # defaults, which will be lazily filled in if accessed and not yet
18
+ # set. (nil can count as set, though!). If they haven't been lazily
19
+ # set yet, then #provide will still fill them in. But you can also call
20
+ # fill_in_defaults! to fill all defaults in, if you know configuration
21
+ # files have all been loaded, and want to fill them in for inspection.
22
+ class Settings < Hash
23
+ include Hashie::Extensions::MergeInitializer # can init with hash
24
+ include Hashie::Extensions::IndifferentAccess
25
+
26
# Builds the settings hash from whatever arguments the superclass chain
# accepts, then installs a default_proc: reading a key that is not set but
# has an entry in the class-level defaults stores that default in the hash
# and returns it; any other unset key reads as nil and is not stored.
def initialize(*args)
  super
  self.default_proc = lambda do |hash, key|
    builtin = self.class.defaults
    return nil unless builtin.has_key?(key)

    hash[key] = builtin[key]
  end
end
36
+
37
+ # a cautious store, which only saves key=value if
38
+ # there was not already a value for #key. Can be used
39
+ # to set settings that can be overridden on command line,
40
+ # or general first-set-wins settings.
41
# First-set-wins store: saves value under key only when the key is not
# already present. Presence is decided by has_key?, so a key explicitly set
# to nil or false still counts as set. Returns the stored value, or nil when
# nothing was stored.
def provide(key, value)
  return nil if has_key?(key)

  store(key, value)
end
46
+
47
+ # reverse_merge copied from ActiveSupport, pretty straightforward,
48
+ # modified to make sure we return a Settings
49
# Non-destructive reverse merge: returns a new object of the receiver's own
# class holding other_hash's entries, overridden by the receiver's entries
# wherever both have the same key. The receiver itself is left untouched.
def reverse_merge(other_hash)
  fallback = self.class.new(other_hash)
  fallback.merge(self)
end
52
+
53
# Destructive form of reverse_merge: replaces the receiver's contents with
# the reverse-merged result and returns the receiver.
def reverse_merge!(other_hash)
  merged = reverse_merge(other_hash)
  replace(merged)
end
56
+
57
# Eagerly copies every class-level default that is not already set into this
# hash (values already present win), via reverse_merge!.
def fill_in_defaults!
  builtin = self.class.defaults
  reverse_merge!(builtin)
end
60
+
61
+
62
# Returns a freshly built Hash of the baseline default settings. The
# processing thread pool size is obtained from
# default_processing_thread_pool at call time.
def self.mri_defaults
  baseline = {}
  baseline["reader_class_name"]        = "Traject::MarcReader"
  baseline["writer_class_name"]        = "Traject::SolrJsonWriter"
  baseline["marc_source.type"]         = "binary"
  baseline["solrj_writer.batch_size"]  = 200
  baseline["solrj_writer.thread_pool"] = 1
  baseline["processing_thread_pool"]   = self.default_processing_thread_pool
  baseline["log.batch_size.severity"]  = "info"
  baseline
end
73
+
74
# Returns a freshly built Hash of the settings that are merged over the
# baseline defaults when JRUBY_VERSION is defined (see .defaults).
def self.jruby_defaults
  overrides = {}
  overrides['reader_class_name']        = "Traject::Marc4JReader"
  overrides['marc4j_reader.permissive'] = true
  overrides
end
80
+
81
+
82
# Returns the memoized Hash of built-in defaults. On first call it is built
# from mri_defaults, with jruby_defaults merged on top when JRUBY_VERSION is
# defined, and cached in the @@defaults class variable; later calls return
# that same Hash object.
def self.defaults
  unless defined? @@defaults
    combined = self.mri_defaults
    combined.merge!(self.jruby_defaults) if defined? JRUBY_VERSION
    @@defaults = combined
  end

  @@defaults
end
91
+
92
# Inspect string for the settings, with the value of every key that ends in
# "password" replaced by "[hidden]" so secrets stay out of logs and consoles.
def inspect
  redacted = each_with_object({}) do |(key, value), shown|
    shown[key] = (key =~ /password\Z/) ? "[hidden]" : value
  end
  redacted.inspect
end
99
+
100
+ protected
101
# Default size for the record-processing thread pool.
#
# On JRuby or Rubinius ("jruby" / "rbx") this is one less than
# Concurrent.processor_count, but never below 1. On every other engine it
# is 1.
#
# The engine is taken from the interpreter's RUBY_ENGINE constant. An
# environment variable named RUBY_ENGINE is normally not set, so looking only
# there would always yield 1; it is still honored here so that any setup that
# exported it keeps getting the same result.
def self.default_processing_thread_pool
  engines = [RUBY_ENGINE, ENV["RUBY_ENGINE"]]
  if (engines & ["jruby", "rbx"]).empty?
    1
  else
    [1, Concurrent.processor_count - 1].max
  end
end
108
+
109
+ end
110
+ end
@@ -0,0 +1,51 @@
1
+ require 'json'
2
+ require 'traject/line_writer'
3
+
4
+ # The JsonWriter outputs one JSON hash per record, separated by newlines.
5
+ #
6
+ # It's newline delimited json, which should be suitable for being
7
+ # read by simple NDJ readers. (TODO: We have no checks right now to
8
+ # make sure the standard json serializers we're using don't put any
9
+ # internal newlines as whitespace in the json. Which would break NDJ
10
+ # reading. Should we?)
11
+ #
12
+ # Should be thread-safe (ie, multiple worker threads can be calling #put
13
+ # concurrently), because output to file is wrapped in a mutex synchronize.
14
+ # This does not seem to affect performance much, as far as I could tell
15
+ # benchmarking.
16
+ #
17
+ # ## Settings
18
+ #
19
+ # * output_file A filename to send output; default will use stdout.
20
+ #
21
+ # * json_writer.pretty_print: [default: false]: Pretty-print (e.g., include newlines, indentation, etc.)
22
+ # each JSON record instead of just mashing it all together on one line. The default, no pretty-printing option
23
+ # produces one record per line, easy to process with another program.
24
+ #
25
+ # ## Example output
26
+ #
27
+ # Without pretty printing, you end up with something like this (just two records shown):
28
+ #
29
+ # {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
30
+ # {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
31
+ #
32
+ # ## Example configuration file
33
+ #
34
+ # require 'traject/json_writer'
35
+ #
36
+ # settings do
37
+ # provide "writer_class_name", "Traject::JsonWriter"
38
+ # provide "output_file", "out.json"
39
+ # end
40
+ class Traject::JsonWriter < Traject::LineWriter
41
+
42
# Serializes context.output_hash as JSON text. When the
# "json_writer.pretty_print" setting is truthy the multi-line
# JSON.pretty_generate form is used; otherwise JSON.generate is used.
def serialize(context)
  record = context.output_hash
  return JSON.pretty_generate(record) if settings["json_writer.pretty_print"]

  JSON.generate(record)
end
50
+
51
+ end
@@ -0,0 +1,63 @@
1
+ require 'thread'
2
+
3
+ # A writer for Traject::Indexer, that just writes out
4
+ # all the output as serialized text with #puts.
5
+ #
6
+ # Should be thread-safe (ie, multiple worker threads can be calling #put
7
+ # concurrently), by wrapping write to actual output file in a mutex synchronize.
8
+ # This does not seem to affect performance much, as far as I could tell
9
+ # benchmarking.
10
+ #
11
+ # Output will be sent to `settings["output_file"]` string path, or else
12
+ # `settings["output_stream"]` (ruby IO object), or else stdout.
13
+ #
14
+ # This class can be sub-classed to write out different serialized
15
+ # representations -- subclasses will just override the #serialize
16
+ # method. For instance, see JsonWriter.
17
+ class Traject::LineWriter
18
+ attr_reader :settings
19
+ attr_reader :write_mutex, :output_file
20
+
21
# Stores the settings, creates the mutex that guards writes, and opens the
# output destination immediately (rather than lazily on first write) so the
# stream is already in place before worker threads start calling #put.
def initialize(argSettings)
  @write_mutex = Mutex.new
  @settings    = argSettings

  # open eagerly, for thread-safety
  @output_file = open_output_file
end
28
+
29
# Low-level write: sends data to the output stream with #puts. Does no
# locking of its own; #put wraps the call in the write mutex.
def _write(data)
  destination = output_file
  destination.puts(data)
end
32
+
33
+
34
# Turns a context into what gets written: here simply the context's
# output_hash, unchanged. Subclasses override this to emit other
# serializations.
def serialize(context)
  record = context.output_hash
  record
end
37
+
38
# Serializes the context outside the lock, then writes the result while
# holding write_mutex, so only the actual write is serialized between
# threads. Returns whatever _write returns.
def put(context)
  line = serialize(context)
  write_mutex.synchronize { _write(line) }
end
44
+
45
# Resolves the output destination, in priority order:
#
# 1. settings["output_file"]   -- a path, opened for writing as UTF-8
# 2. settings["output_stream"] -- an already-open IO-like object
# 3. $stdout
#
# If @output_file has already been set, that existing value is returned and
# nothing is opened again (previously a repeat call returned nil).
#
# Returns the stream; does not assign @output_file itself.
def open_output_file
  return @output_file if defined?(@output_file)

  if settings["output_file"]
    File.open(settings["output_file"], 'w:UTF-8')
  elsif settings["output_stream"]
    settings["output_stream"]
  else
    $stdout
  end
end
58
+
59
# Closes the output stream when it is one this writer should close.
#
# Never closed:
# * a nil stream
# * the process-wide standard streams ($stdout/$stderr and STDOUT/STDERR),
#   even when they are redirected to a pipe or file and so are not ttys --
#   closing them would break any later output from the process
# * any stream that reports itself as a tty
#
# Streams that do not implement #tty? are treated as non-terminals and
# closed.
def close
  stream = @output_file
  return if stream.nil?
  return if [$stdout, $stderr, STDOUT, STDERR].any? { |std| std.equal?(stream) }
  return if stream.respond_to?(:tty?) && stream.tty?

  stream.close
end
62
+
63
+ end
@@ -0,0 +1,9 @@
1
+ module Traject::Macros
2
+ module Basic
3
# Macro that returns a three-argument lambda which appends the given
# constant value to the accumulator on every call, ignoring the record and
# the context.
def literal(literal)
  fixed_value = literal
  lambda { |record, accumulator, context| accumulator << fixed_value }
end
8
+ end
9
+ end
@@ -0,0 +1,223 @@
1
+ require 'traject/marc_extractor'
2
+ require 'traject/translation_map'
3
+ require 'traject/util'
4
+ require 'base64'
5
+ require 'json'
6
+ require 'marc/fastxmlwriter'
7
+
8
+ module Traject::Macros
9
+ # Some of these may be generic for any MARC, but we haven't done
10
+ # the analytical work to think it through, some of this is
11
+ # def specific to Marc21.
12
+ module Marc21
13
+
14
+ # A combo function macro that will extract data from marc according to a string
15
+ # field/substring specification, then apply various optional post-processing to it too.
16
+ #
17
+ # First argument is a string spec suitable for the MarcExtractor, see
18
+ # MarcExtractor::parse_string_spec.
19
+ #
20
+ # Second arg is optional options, including options valid on MarcExtractor.new,
21
+ # and others. By default, will de-duplicate results, but see :allow_duplicates
22
+ #
23
+ # * :first => true: take only first value
24
+ #
25
+ # * :translation_map => String: translate with named translation map looked up in load
26
+ # path, uses Traject::TranslationMap.new(translation_map_arg)
27
+ #
28
+ # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
29
+ # have shown themselves useful with Marc, using Marc21.trim_punctuation
30
+ #
31
+ # * :default => String: if otherwise empty, add default value
32
+ #
33
+ # * :allow_duplicates => boolean, default false, if set to true then will avoid
34
+ # de-duplicating the result array (array.uniq!)
35
+ #
36
+ #
37
+ # Examples:
38
+ #
39
+ # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
40
+ # to_field("id"), extract_marc("001", :first => true)
41
+ # to_field("geo"), extract_marc("040a", :separator => nil, :translation_map => "marc040")
42
# Builds an indexing-step lambda that extracts values from a MARC record
# according to spec and then post-processes them (see
# Marc21.apply_extraction_options).
#
# spec    - String field/subfield specification handed to
#           Traject::MarcExtractor.new.
# options - Hash of options; every key must be listed in
#           EXTRACT_MARC_VALID_OPTIONS. :translation_map is consumed here;
#           the remaining options are passed to Traject::MarcExtractor.new
#           and to apply_extraction_options.
#
# The caller's options hash is never modified: the method works on a private
# copy, so one hash (frozen or not) can be reused across several
# extract_marc calls without losing its :translation_map entry.
#
# Returns a lambda taking (record, accumulator, context).
# Raises RuntimeError when options contains an unknown key.
def extract_marc(spec, options = {})
  # Reject unknown options up front: catches misspellings, string keys used
  # in place of symbols, and so on.
  unknown = options.keys - EXTRACT_MARC_VALID_OPTIONS
  unless unknown.empty?
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown.join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
  end

  # Work on a copy so removing :translation_map below stays invisible to
  # the caller.
  options = options.dup

  # The TranslationMap and MarcExtractor are created once, here, so the
  # lambda reuses them instead of building new ones for every record.
  translation_map_arg = options.delete(:translation_map)
  translation_map = translation_map_arg ? Traject::TranslationMap.new(translation_map_arg) : nil

  extractor = Traject::MarcExtractor.new(spec, options)

  lambda do |record, accumulator, context|
    accumulator.concat extractor.extract(record)
    Marc21.apply_extraction_options(accumulator, options, translation_map)
  end
end
73
+
74
+ # Side-effect the accumulator with the options
75
# Post-processes the accumulator in place, applying in this order:
#
# 1. :first            - keep only the first value
# 2. translation_map   - translate values via translate_array!
# 3. :trim_punctuation - run each value through Marc21.trim_punctuation
# 4. de-duplication    - unless :allow_duplicates is truthy
# 5. :default          - appended only if the accumulator ended up empty
def self.apply_extraction_options(accumulator, options, translation_map=nil)
  accumulator.replace(Array(accumulator[0])) if options[:first]

  translation_map.translate_array!(accumulator) if translation_map

  accumulator.collect! { |value| Marc21.trim_punctuation(value) } if options[:trim_punctuation]

  accumulator.uniq! unless options[:allow_duplicates]

  fallback = options[:default]
  accumulator << fallback if fallback && accumulator.empty?
end
101
+
102
+
103
+ # A list of symbols that are valid keys in the options hash
104
+ EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
105
+ :allow_duplicates, :separator, :translation_map,
106
+ :alternate_script]
107
+
108
+ # Serializes complete marc record to a serialization format.
109
+ # required param :format,
110
+ # serialized_marc(:format => :binary)
111
+ #
112
+ # formats:
113
+ # [xml] MarcXML
114
+ # [json] marc-in-json (http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
115
+ # [binary] Standard ISO 2709 binary marc. By default WILL be base64-encoded,
116
+ # assumed destination a solr 'binary' field.
117
+ # * add option `:binary_escape => false` to do straight binary -- unclear
118
+ # what Solr's documented behavior is when you do this, and add a string
119
+ # with binary control chars to solr. May do different things in diff
120
+ # Solr versions, including raising exceptions.
121
+ # * add option `:allow_oversized => true` to pass that flag
122
+ # to the MARC::Writer. Oversized records will then still be
123
+ # serialized, with certain header bytes filled with ascii 0's
124
+ # -- technically illegal MARC, but can still be read by
125
+ # ruby MARC::Reader in permissive mode.
126
# Builds an indexing-step lambda that appends one serialization of the whole
# record to the accumulator.
#
# options - Hash; every key must be listed in SERIALZED_MARC_VALID_OPTIONS:
#           :format          - required; binary, xml or json (Symbol or
#                              String).
#           :binary_escape   - binary format only; Base64-encodes the output
#                              unless explicitly set to false.
#           :allow_oversized - binary format only; passed to
#                              MARC::Writer.encode, true only when given as
#                              exactly true.
#
# Returns a lambda taking (record, accumulator, context).
# Raises RuntimeError on an unknown option key (the message names this macro,
# serialized_marc) and ArgumentError on a missing or unsupported :format.
def serialized_marc(options)
  unknown = options.keys - SERIALZED_MARC_VALID_OPTIONS
  unless unknown.empty?
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown.join(', ')}' in serialized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
  end

  format          = options[:format].to_s
  binary_escape   = (options[:binary_escape] != false)
  allow_oversized = (options[:allow_oversized] == true)

  raise ArgumentError.new("Need :format => [binary|xml|json] arg") unless %w{binary xml json}.include?(format)

  lambda do |record, accumulator, context|
    case format
    when "binary"
      binary = MARC::Writer.encode(record, allow_oversized)
      binary = Base64.encode64(binary) if binary_escape
      accumulator << binary
    when "xml"
      accumulator << MARC::FastXMLWriter.encode(record)
    when "json"
      accumulator << JSON.dump(record.to_hash)
    end
  end
end
150
+ SERIALZED_MARC_VALID_OPTIONS = [:format, :binary_escape, :allow_oversized]
151
+
152
+ # Takes the whole record, by default from tags 100 to 899 inclusive,
153
+ # all subfields, and adds them to output. Subfields in a record are all
154
+ # joined by space by default.
155
+ #
156
+ # options
157
+ # [:from] default 100, only tags >= lexicographically
158
+ # [:to] default 899, only tags <= lexicographically
159
+ # [:separator] how to join subfields, default space, nil means don't join
160
+ #
161
+ # All fields in from-to must be marc DATA (not control fields), or weirdness
162
+ #
163
+ # Can always run this thing multiple times on the same field if you need
164
+ # non-contiguous ranges of fields.
165
# Builds an indexing-step lambda that collects subfield values from every
# field whose tag lies within a tag range.
#
# options - Hash; every key must be listed in EXTRACT_ALL_MARC_VALID_OPTIONS:
#           :from      - lowest tag included (String compare), default "100"
#           :to        - highest tag included (String compare), default "899"
#           :separator - String used to join one field's subfield values into
#                        a single output value, default " "; nil adds each
#                        subfield value separately.
#
# Every field in range is asked for #subfields, so the range is expected to
# contain data fields only. Fields with no subfields add nothing.
#
# Returns a lambda taking (record, accumulator, context).
# Raises RuntimeError on an unknown option key; the message names this macro,
# extract_all_marc_values.
def extract_all_marc_values(options = {})
  unknown = options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS
  unless unknown.empty?
    raise RuntimeError.new("Illegal/Unknown argument '#{unknown.join(', ')}' in extract_all_marc_values at #{Traject::Util.extract_caller_location(caller.first)}")
  end

  options = {:from => "100", :to => "899", :separator => ' '}.merge(options)

  # Resolve the options once, not once per field of every record.
  from_tag  = options[:from]
  to_tag    = options[:to]
  separator = options[:separator]

  lambda do |record, accumulator, context|
    record.each do |field|
      next unless field.tag >= from_tag && field.tag <= to_tag

      values = field.subfields.collect { |sf| sf.value }
      next if values.empty?

      if separator
        accumulator << values.join(separator)
      else
        accumulator.concat values
      end
    end
  end
end
186
+ EXTRACT_ALL_MARC_VALID_OPTIONS = [:separator, :from, :to]
187
+
188
+
189
+ # Trims punctuation mostly from end, and occasionally from beginning
190
+ # of string. Not nearly as complex logic as SolrMarc's version, just
191
+ # pretty simple.
192
+ #
193
+ # Removes
194
+ # * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
195
+ # * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
196
+ # * single square bracket characters if they are the start and/or end
197
+ # chars and there are no internal square brackets.
198
+ #
199
+ # Returns altered string, doesn't change original arg.
200
# Returns a copy of str with common MARC punctuation trimmed; the argument is
# not modified. A nil (or false) argument is returned as-is.
#
# Three substitutions are applied, in order:
# 1. one trailing comma, slash, semicolon, colon or space, along with any
#    spaces around it
# 2. a trailing period (and spaces after it) when three word characters
#    come directly before it
# 3. a leading "[" and/or trailing "]" when the string contains no other
#    square brackets
def self.trim_punctuation(str)
  return str unless str

  trailing_separator = / *[ ,\/;:] *\Z/
  trailing_period    = /( *\w\w\w)\. *\Z/
  outer_brackets     = /\A\[?([^\[\]]+)\]?\Z/

  str.sub(trailing_separator, '')
     .sub(trailing_period, '\1')
     .sub(outer_brackets, '\1')
end
216
+
217
# Mutating "first": removes everything after the first element from arr, in
# place. Returns the removed elements (nil when arr is empty).
def self.first!(arr)
  # slice! with a range starting at index 1 cuts off the whole tail
  arr.slice!(1..-1)
end
221
+
222
+ end
223
+ end