RubyGems - traject - Versions diffs - 0.16.0 → 0.17.0 - Mend

traject 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

checksums.yaml +7 -0
data/.yardopts +1 -0
data/README.md +183 -191
data/bench/bench.rb +1 -1
data/doc/batch_execution.md +14 -0
data/doc/extending.md +14 -12
data/doc/indexing_rules.md +265 -0
data/lib/traject/command_line.rb +12 -41
data/lib/traject/debug_writer.rb +32 -13
data/lib/traject/indexer.rb +101 -24
data/lib/traject/indexer/settings.rb +18 -17
data/lib/traject/json_writer.rb +32 -11
data/lib/traject/line_writer.rb +6 -6
data/lib/traject/macros/basic.rb +1 -1
data/lib/traject/macros/marc21.rb +17 -13
data/lib/traject/macros/marc21_semantics.rb +27 -25
data/lib/traject/macros/marc_format_classifier.rb +39 -25
data/lib/traject/marc4j_reader.rb +36 -22
data/lib/traject/marc_extractor.rb +79 -75
data/lib/traject/marc_reader.rb +33 -25
data/lib/traject/mock_reader.rb +9 -10
data/lib/traject/ndj_reader.rb +7 -7
data/lib/traject/null_writer.rb +1 -1
data/lib/traject/qualified_const_get.rb +12 -2
data/lib/traject/solrj_writer.rb +61 -52
data/lib/traject/thread_pool.rb +45 -45
data/lib/traject/translation_map.rb +59 -27
data/lib/traject/util.rb +3 -3
data/lib/traject/version.rb +1 -1
data/lib/traject/yaml_writer.rb +1 -1
data/test/debug_writer_test.rb +7 -7
data/test/indexer/each_record_test.rb +4 -4
data/test/indexer/macros_marc21_semantics_test.rb +12 -12
data/test/indexer/macros_marc21_test.rb +10 -10
data/test/indexer/macros_test.rb +1 -1
data/test/indexer/map_record_test.rb +6 -6
data/test/indexer/read_write_test.rb +43 -4
data/test/indexer/settings_test.rb +2 -2
data/test/indexer/to_field_test.rb +8 -8
data/test/marc4j_reader_test.rb +4 -4
data/test/marc_extractor_test.rb +33 -25
data/test/marc_format_classifier_test.rb +3 -3
data/test/marc_reader_test.rb +2 -2
data/test/test_helper.rb +3 -3
data/test/test_support/demo_config.rb +52 -48
data/test/translation_map_test.rb +22 -4
data/test/translation_maps/bad_ruby.rb +2 -2
data/test/translation_maps/both_map.rb +1 -1
data/test/translation_maps/default_literal.rb +1 -1
data/test/translation_maps/default_passthrough.rb +1 -1
data/test/translation_maps/ruby_map.rb +1 -1
metadata +7 -31
data/doc/macros.md +0 -103

data/lib/traject/macros/marc21_semantics.rb CHANGED

@@ -62,10 +62,10 @@ module Traject::Macros
     def self.get_sortable_author(record)
       onexx = MarcExtractor.cached("100:110:111", :first => true, :trim_punctuation => true).extract(record).first
       onexx = onexx.strip if onexx
       titles = []
       MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
-        non_filing = field.indicator2.to_i
+        non_filing = field.indicator2.to_i
         str = field.subfields.collect {|sf| Marc21.trim_punctuation(sf.value.strip).strip}.join(" ")
         str = str.slice(non_filing, str.length)
@@ -73,7 +73,7 @@ module Traject::Macros
       end.first
       title = titles.first
       title = title.strip if title
       return [onexx, title].compact.join("   ")
     end
@@ -105,26 +105,26 @@ module Traject::Macros
         str
       end.first
     end
     # A generic way to strip a filing version (i.e., a string with the non-filing
     # characters stripped off)
     #
     # Always returns an array. If :include_original=>true is passed in,
     # that array will include the original string with the non-filing
     # characters still in it.
     def extract_marc_filing_version(spec='245abdefghknp', opts={})
       include_original = opts.delete(:include_original)
       if opts.size > 0
         raise RuntimeError.new("extract_marc_filing_version can take only :include_original as an argument, not #{opts.keys.map{|x| "'#{x}'"}.join(' or ')}")
       end
       extractor = Traject::MarcExtractor.cached(spec, opts)
       lambda do |record, accumulator, context|
-        extractor.collect_matching_lines(record) do |field, spec|
+        extractor.collect_matching_lines(record) do |field, spec|
           str = extractor.collect_subfields(field, spec).first
           next unless str and !str.empty?
           vals = [Marc21Semantics.filing_version(field, str, spec)]
@@ -136,34 +136,34 @@ module Traject::Macros
         end
       end
     end
     # Take in a field, a string extracted from that field, and a spec and
-    # return the filing version (i.e., the string without the
+    # return the filing version (i.e., the string without the
     # non-filing characters)
     def self.filing_version(field, str, spec)
       # Control fields don't have non-filing characters, so hand the string
       # back untouched.
       return str if field.kind_of?(MARC::ControlField)

       # The second indicator, read as an integer, is the number of leading
       # characters to drop; anything that is not > 0 means "nothing to strip".
       skip = field.indicator2.to_i
       return str if skip <= 0

       # Strip only when the spec lists subfields and that list includes the
       # code of the field's first subfield; otherwise return str unchanged.
       wanted_codes = spec.subfields
       if wanted_codes && wanted_codes.include?(field.subfields[0].code)
         str[skip..-1]
       else
         str
       end
     end
     # maps languages, by default out of 008[35-37] and 041a and 041d
@@ -367,6 +367,9 @@ module Traject::Macros
       return found_date
     end
+    # REGEX meant to rule out obvious non-LCC's, and only allow things
+    # plausibly LCC's.
+    LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
     # Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH)
     # from usual parts of the marc record. Maps them to high-level broad categories,
     # basically just using the first part of the LCC. Note it's just looking in bib-level
@@ -379,7 +382,6 @@ module Traject::Macros
     # or nil.
     #
     # The categories output aren't great, but they're something.
-    LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/
     def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a")
       # Trying to match things that look like LCC, and not match things
       # that don't. Is tricky.
@@ -503,4 +505,4 @@ module Traject::Macros
   end
-end
+end

data/lib/traject/macros/marc_format_classifier.rb CHANGED

@@ -1,9 +1,19 @@
 module Traject
   module Macros
-    # See MarcFormatClassifier class
+    # To use the marc_formats macro, in your configuration file:
+    #
+    #     require 'traject/macros/marc_format_classifier'
+    #     extend Traject::Macros::MarcFormats
+    #
+    #     to_field "format_s", marc_formats
+    #
+    # See also MarcFormatClassifier, which can be used directly for a bit more
+    # control.
     module MarcFormats
       # very opinionated macro that just adds a grab bag of format/genre/types
-      # into one field. You may want ot build your own from MarcFormatClassifier functions instead.
+      # from our own custom vocabulary, all into one field.
+      # You may want to build your own from MarcFormatClassifier functions instead.
+      #
       def marc_formats
         lambda do |record, accumulator|
           accumulator.concat Traject::Macros::MarcFormatClassifier.new(record).formats
@@ -12,10 +22,11 @@ module Traject
     end
-    # Not actually a macro, but we're keeping it here for now,
-    # a class for classifying marc according to format/genre/type.
+    # A tool for classifying MARC records according to format/form/genre/type,
+    # just using our own custom vocabulary for those things.
     #
-    # VERY opinionated.
+    # used by the `marc_formats` macro, but you can also use it directly
+    # for a bit more control.
     class MarcFormatClassifier
       attr_reader :record
@@ -24,22 +35,25 @@ module Traject
       end
       # A very opinionated method that just kind of jams together
-      # all the possible format/genre/types into one array of 1 to N elements.
+      # all the possible format/genre/types into one array of 1 to N elements.
       #
-      # Default "Other" will be used
+      # If no other values are present, the default value "Other" will be used.
+      #
+      # See also the individual methods, which you can use to separate values into
+      # different facets or do other custom things.
       def formats(options = {})
         options = {:default => "Other"}.merge(options)
         formats = []
         formats.concat genre
         formats << "Manuscript/Archive" if manuscript_archive?
         formats << "Microform" if microform?
         formats << "Online"    if online?
         # In our own data, if it's an audio recording, it might show up
-        # as print, but it's probably not.
+        # as print, but it's probably not.
         formats << "Print"     if print? && ! (formats.include?("Non-musical Recording") || formats.include?("Musical Recording"))
         # If it's a Dissertation, we decide it's NOT a book
@@ -64,11 +78,11 @@ module Traject
       # Returns 1 or more values in an array from:
       # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
       # Image; Software/Data; Video/Film
-      #
-      # Uses leader byte 6, leader byte 7, and 007 byte 0.
+      #
+      # Uses leader byte 6, leader byte 7, and 007 byte 0.
       #
       # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
-      # so you can customize labels if you want.
+      # so you can customize labels if you want.
       def genre
         marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
         marc_genre_007    = Traject::TranslationMap.new("marc_genre_007")
@@ -96,18 +110,18 @@ module Traject
         end
       end
-      # Algorithm with help from Chris Case.
-      # * If it has any RDA 338, then it's print if it has a value of
-      #   volume, sheet, or card.
+      # Algorithm with help from Chris Case.
+      # * If it has any RDA 338, then it's print if it has a value of
+      #   volume, sheet, or card.
       # * If it does not have an RDA 338, it's print if and only if it has
-      #   NO 245$h GMD.
+      #   NO 245$h GMD.
       #
-      # * Here at JH, for legacy reasons we also choose to not
+      # * Here at JH, for legacy reasons we also choose to not
       #   call it print if it's already been marked audio, but
-      #   we do that in a different method.
+      #   we do that in a different method.
       #
       # This algorithm is definitely going to get some things wrong in
-      # both directions, with real world data. But seems to be good enough.
+      # both directions, with real world data. But seems to be good enough.
       def print?
@@ -116,7 +130,7 @@ module Traject
         end
         if rda338.length > 0
-          rda338.find do |field|
+          rda338.find do |field|
             field.subfields.find do |sf|
               (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
               (sf.code == "b" && %w{nc no nb}.include?(sf.value))
@@ -128,7 +142,7 @@ module Traject
       end
       # We use marc 007 to determine if this represents an online
-      # resource. But sometimes resort to 245$h GMD too.
+      # resource. But sometimes resort to 245$h GMD too.
       def online?
         # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
         found_007 = record.find do |field|
@@ -140,8 +154,8 @@ module Traject
         # Otherwise, if it has a GMD ["electronic resource"], we count it
         # as online only if NO 007[0] == 'c' exists, cause if it does we already
         # know it's electronic but not remote, otherwise first try would
-        # have found it.
-        return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
+        # have found it.
+        return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
       end
       # if field 007 byte 0 is 'h', that's microform. But many of our microform
@@ -153,7 +167,7 @@ module Traject
         record.find {|f| (f.tag == "007") && (f.value[0] == "h")}
       end
-      # Marked as manuscript OR archive.
+      # Marked as manuscript OR archive.
       def manuscript_archive?
         leader06 = record.leader.slice(6)
         leader08 = record.leader.slice(8)
@@ -177,4 +191,4 @@ module Traject
     end
   end
-end
+end

data/lib/traject/marc4j_reader.rb CHANGED

@@ -2,24 +2,21 @@ require 'traject'
 require 'marc'
 require 'marc/marc4j'
-# Uses Marc4J to read the marc records, but then translates them to
-# ruby-marc before delivering them still, Marc4J is just inside the black
-# box.
+# `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
+# into standard ruby-marc MARC::Record objects. This reader is often faster than
+# Traject::MarcReader, especially for XML, and offers support for reading Marc8
+# encoded records and transcoding to UTF8.
 #
-# But one way to get ability to transcode from Marc8. Records it delivers
-# are ALWAYS in UTF8, will be transcoded if needed.
+# Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
+# for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
+# for reading xml. The actual code for dealing with Marc4J is in the separate
+# [marc-marc4j gem](https://github.com/billdueber/ruby-marc-marc4j).
 #
-# Also hope it gives us some performance benefit.
+# See also the pure ruby Traject::MarcReader as an alternative, if you need to read
+# marc-in-json, or if you don't need binary Marc8 support, it may in some cases
+# be faster.
 #
-# Uses the Marc4J MarcPermissiveStreamReader for binary, but sometimes
-# in non-permissive mode, according to settings. Uses the Marc4j MarcXmlReader
-# for xml.
-#
-# NOTE: If you aren't reading in binary records encoded in MARC8, you may
-# find the pure-ruby Traject::MarcReader faster; the extra step to read
-# Marc4J but translate to ruby MARC::Record adds some overhead.
-#
-# Settings:
+# ## Settings
 #
 # * marc_source.type:     serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
 #
@@ -39,9 +36,26 @@ require 'marc/marc4j'
 # * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
 #                          be loaded. If unset, uses marc4j.jar bundled with traject.
 #
-# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
-#                              the eventual ruby-marc record via record#original_marc4j
+# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
+#   the eventual ruby-marc record via record#original_marc4j. Intended for
+#   those that have legacy java code for which a marc4j object is needed.
+#
+#
+# ## Example
+#
+# In a configuration file:
+#
+#     require 'traject/marc4j_reader'
+#     settings do
+#       provide "reader_class_name", "Traject::Marc4JReader"
+#
+#       #for MarcXML:
+#       # provide "marc_source.type", "xml"
+#
+#       # Or instead for binary:
+#       provide "marc4j_reader.permissive", true
+#       provide "marc4j_reader.source_encoding", "MARC8"
+#     end
 class Traject::Marc4JReader
   include Enumerable
@@ -56,14 +70,14 @@ class Traject::Marc4JReader
          MARC::Record.instance_methods.include?(:"original_marc4j="))
       MARC::Record.class_eval('attr_accessor :original_marc4j')
     end
     # Creating a converter will do the following:
     #  - nothing, if it detects that the marc4j jar is already loaded
     #  - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
     #  - load the marc4j jar file bundled with MARC::MARC4J otherwise
     @converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
     # Convenience
     java_import org.marc4j.MarcPermissiveStreamReader
     java_import org.marc4j.MarcXmlReader
@@ -121,4 +135,4 @@ class Traject::Marc4JReader
     @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
   end
-end
+end

data/lib/traject/marc_extractor.rb CHANGED

@@ -6,22 +6,23 @@ module Traject
   #
   # Examples:
   #
-  #    array_of_stuff   = MarcExtractor.new("001:245abc:700a").extract(marc_record)
-  #    values           = MarcExtractor.new("245a:245abc").extract_marc(marc_record)
-  #    seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
-  #    bytes            = MarcExtractor.new("008[35-37]")
+  #     array_of_stuff   = MarcExtractor.new("001:245abc:700a").extract(marc_record)
+  #     values           = MarcExtractor.new("245a:245abc").extract_marc(marc_record)
+  #     seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
+  #     bytes            = MarcExtractor.new("008[35-37]")
   #
-  # == String extraction specifications
+  # ## String extraction specifications
   #
   # Extraction directions are supplied in strings, usually as the first
   # parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
-  # are also the first parameter to the #marc_extract macro.
+  # are also the first parameter to the #marc_extract macro.
   #
   # A String specification is a string (or array of strings) which consists
-  # of one or more Data and Control Field Specifications seperated by colons.
+  # of one or more Data and Control Field Specifications separated by colons.
   #
   # A Data Field Specification is of the form:
-  #  `{tag}{|indicators|}{subfields}`
+  #
+  # * `{tag}{|indicators|}{subfields}`
   # * {tag} is three chars (usually but not necessarily numeric)
   # * {indicators} are optional two chars enclosed in pipe ('|') characters,
   # * {subfields} are optional list of chars (alphanumeric)
@@ -29,58 +30,58 @@ module Traject
   # indicator spec must be two chars, but one can be * meaning "don't care".
   # space to mean 'blank'
   #
-  # "245|01|abc65:345abc:700|*5|:800"
+  #     "245|01|abc65:345abc:700|*5|:800"
   #
   # A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
-  # and includes a tag and a a byte slice specification.
+  # and includes a tag and a byte slice specification.
   #
-  #  "008[35-37]:007[5]""
-  #  => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
-  #    "LDR" as a pseudo-tag to take byte slices of leader?)
+  #      "008[35-37]:007[5]"
+  #      => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007 (TODO: Should we support
+  #      "LDR" as a pseudo-tag to take byte slices of leader?)
   #
   # * subfields and indicators can only be provided for marc data/variable fields
   # * byte slice can only be provided for marc control fields (generally tags less than 010)
   #
-  # == Subfield concatenation
+  # ## Subfield concatenation
   #
   # Normally, for a spec including multiple subfield codes, multiple subfields
   # from the same MARC field will be concatenated into one string separated by spaces:
   #
-  #    600 a| Chomsky, Noam x| Philosophy.
-  #    600 a| Chomsky, Noam x| Political and social views.
-  #    MarcExtractor.new("600ax").extract(record)
-  #    # results in two values sent to Solr:
-  #    "Chomsky, Noam Philosophy."
-  #    "Chomsky, Noam Political and social views."
+  #     600 a| Chomsky, Noam x| Philosophy.
+  #     600 a| Chomsky, Noam x| Political and social views.
+  #     MarcExtractor.new("600ax").extract(record)
+  #     # results in two values sent to Solr:
+  #     "Chomsky, Noam Philosophy."
+  #     "Chomsky, Noam Political and social views."
   #
   # You can turn off this concatenation and leave individual subfields in separate
   # strings by setting the `separator` option to nil:
   #
-  #    MarcExtractor.new("600ax", :separator => nil).extract(record)
-  #    # Results in four values being sent to Solr (or 3 if you de-dup):
-  #    "Chomksy, Noam"
-  #    "Philosophy."
-  #    "Chomsky, Noam"
-  #    "Political and social views."
+  #     MarcExtractor.new("600ax", :separator => nil).extract(record)
+  #     # Results in four values being sent to Solr (or 3 if you de-dup):
+  #     "Chomsky, Noam"
+  #     "Philosophy."
+  #     "Chomsky, Noam"
+  #     "Political and social views."
   #
   # However, **the default is different for specifications with only a single
   # subfield**, these are by default kept separated:
   #
-  #    020 a| 285197145X a| 9782851971456
-  #    MarcExtractor.new("020a:020z").extract(record)
-  #    # two seperate strings sent to Solr:
-  #    "285197145X"
-  #    "9782851971456"
+  #     020 a| 285197145X a| 9782851971456
+  #     MarcExtractor.new("020a:020z").extract(record)
+  #     # two separate strings sent to Solr:
+  #     "285197145X"
+  #     "9782851971456"
   #
   # For single subfield specifications, you force concatenation by
   # repeating the subfield specification:
   #
-  #    MarcExtractor.new("020aa:020zz").extract(record)
-  #    # would result in a single string sent to solr for
-  #    # the single field, by default space-separated:
-  #    "285197145X 9782851971456"
+  #     MarcExtractor.new("020aa:020zz").extract(record)
+  #     # would result in a single string sent to solr for
+  #     # the single field, by default space-separated:
+  #     "285197145X 9782851971456"
   #
-  # == Note on Performance and MarcExtractor creation and reuse
+  # ## Note on Performance and MarcExtractor creation and reuse
   #
   # A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
   # benchmarking to be a bottleneck if you end up creating one for each marc record
@@ -90,15 +91,15 @@ module Traject
   # If you are creating a traject 'macro' method, here's one way to do that,
   # capturing the MarcExtractor under closure:
   #
-  #    def some_macro(spec, other_args, whatever)
-  #      extractor = MarcExtractor.new( spec )
-  #      # ...
-  #      return lambda do |record, accumulator, context|
-  #         #...
-  #         accumulator.concat extractor.extract(record)
-  #         #...
-  #      end
-  #    end
+  #     def some_macro(spec, other_args, whatever)
+  #       extractor = MarcExtractor.new( spec )
+  #       # ...
+  #       return lambda do |record, accumulator, context|
+  #          #...
+  #          accumulator.concat extractor.extract(record)
+  #          #...
+  #       end
+  #     end
   #
   # In other cases, you may find it convenient to improve performance by
   # using the MarcExtractor#cached method, instead of MarcExtractor#new, to
@@ -107,13 +108,13 @@ module Traject
   class MarcExtractor
     attr_accessor :options, :spec_hash
-    # First arg is a specification for extraction of data from a MARC record.
+    # First arg is a specification for extraction of data from a MARC record.
     # Specification can be given in two forms:
     #
     #  * a string specification like "008[35]:020a:245abc", see top of class
-    #    for examples. A string specification is most typical argument.
+    #    for examples. A string specification is most typical argument.
     #  * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
-    #    a 'pre-parsed' specification.
+    #    a 'pre-parsed' specification.
     #
     # Second arg is options:
     #
@@ -146,6 +147,8 @@ module Traject
       if options[:alternate_script] != false
         @interesting_tags_hash['880'] = true
       end
+      self.freeze
     end
     # Takes the same arguments as MarcExtractor.new, but will re-use an existing
@@ -164,17 +167,10 @@ module Traject
     # although if you try hard enough you can surely find a way to do something
     # you shouldn't.
     #
-    #    extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
+    #     extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
     def self.cached(*args)
       cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
-      extractor = (cache[args] ||= begin
-        ex = Traject::MarcExtractor.new(*args).freeze
-        ex.options.freeze
-        ex.spec_hash.freeze
-        ex
-      end)
-      return extractor
+      return ( cache[args] ||= Traject::MarcExtractor.new(*args).freeze )
     end
     # Check to see if a tag is interesting (meaning it may be covered by a spec
@@ -186,14 +182,14 @@ module Traject
     # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
     # to represent the specification. See comments at head of class for
-    # documentation of string specification format.
+    # documentation of string specification format.
     #
     #
-    # == Return value
+    # ## Return value
     #
     # The hash returned is keyed by tag, and has as values an array of 0
     # or more MarcExtractor::Spec objects representing the specified extraction
-    # operations for that tag.
+    # operations for that tag.
     #
     # It's an array of possibly more than one, because you can specify
     # multiple extractions on the same tag: for instance "245a:245abc"
@@ -201,7 +197,7 @@ module Traject
     # See tests for more examples.
     def self.parse_string_spec(spec_string)
       # hash defaults to []
-      hash = Hash.new {|hash,key| hash[key] = []}
+      hash = Hash.new
       spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/s*:\s*/)
@@ -222,8 +218,9 @@ module Traject
            spec.indicator2 = indicators[1] if indicators[1] != "*"
           end
+          hash[spec.tag] ||= []
           hash[spec.tag] << spec
         elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
           tag, byte1, byte2 = $1, $3, $5
@@ -234,7 +231,8 @@ module Traject
           elsif byte1
            spec.bytes = byte1.to_i
           end
+          hash[spec.tag] ||= []
           hash[spec.tag] << spec
         else
           raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
@@ -286,7 +284,7 @@ module Traject
     #
     # Useful for re-use of this class for custom processing
     #
-    # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
+    # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
     def collect_matching_lines(marc_record)
       results = []
       self.each_matching_line(marc_record) do |field, spec, extractor|
@@ -312,7 +310,7 @@ module Traject
       if options[:separator] && spec.joinable?
         subfields = [subfields.join(options[:separator])]
       end
       return subfields
     end
@@ -324,12 +322,12 @@ module Traject
     # When given an 880, will return the spec (if any) for the linked tag iff
     # we have a $6 and we want the alternate script.
     #
-    # Returns an empty array in case of no matching extraction specs.
+    # Returns an empty array in case of no matching extraction specs.
     def specs_covering_field(field)
       tag = field.tag
       # Short-circuit the uninteresting stuff
-      return nil unless interesting_tag?(tag)
+      return [] unless interesting_tag?(tag)
       # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
       # to do this weird encode gymnastics, which fixes it for mysterious reasons.
@@ -339,7 +337,7 @@ module Traject
       end
       # Take the resulting tag and get the spec from it (or the default nil if there isn't a spec for this tag)
-      spec = self.spec_hash[tag]
+      spec = self.spec_hash[tag] || []
     end
@@ -348,13 +346,19 @@ module Traject
       # define #control_field? on both ControlField and DataField?
       return field.kind_of? MARC::ControlField
     end
+    def freeze
+      self.options.freeze
+      self.spec_hash.freeze
+      super
+    end
     # Represents a single specification for extracting data
-    # from a marc field, like "600abc" or "600|1*|x".
+    # from a marc field, like "600abc" or "600|1*|x".
     #
     # Includes the tag for reference, although this is redundant and not actually used
-    # in logic, since the tag is also implicit in the overall spec_hash
+    # in logic, since the tag is also implicit in the overall spec_hash
     # with tag => [spec1, spec2]
     class Spec
       attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes
@@ -365,7 +369,7 @@ module Traject
         end
       end
       #  Should subfields extracted be joined, if we have a separator?
       #  * '630' no subfields specified => join all subfields
       #  * '630abc' multiple subfields specified = join all subfields
@@ -379,8 +383,8 @@ module Traject
       # Pass in a MARC field, do its indicators match indicators
       # in this spec? nil indicators in spec mean we don't care, everything
-      # matches.
-      def matches_indicators?(field)
+      # matches.
+      def matches_indicators?(field)
         return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
           (self.indicator2.nil? || self.indicator2 == field.indicator2)
       end
@@ -396,7 +400,7 @@ module Traject
         return false unless spec.kind_of?(Spec)
         return (self.tag == spec.tag) &&
-          (self.subfields == spec.subfields) &&
+          (self.subfields == spec.subfields) &&
           (self.indicator1 == spec.indicator1) &&
           (self.indicator1 == spec.indicator2) &&
           (self.bytes == spec.bytes)