RubyGems - traject - Versions diffs - 2.0.0-java - Mend

traject 2.0.0-java

Files changed (104) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.travis.yml +27 -0
data/.yardopts +3 -0
data/Gemfile +12 -0
data/LICENSE.txt +20 -0
data/README.md +461 -0
data/Rakefile +21 -0
data/bench/bench.rb +30 -0
data/bin/traject +16 -0
data/doc/batch_execution.md +243 -0
data/doc/extending.md +190 -0
data/doc/indexing_rules.md +265 -0
data/doc/other_commands.md +47 -0
data/doc/settings.md +101 -0
data/lib/tasks/load_maps.rake +48 -0
data/lib/traject.rb +11 -0
data/lib/traject/command_line.rb +301 -0
data/lib/traject/csv_writer.rb +34 -0
data/lib/traject/debug_writer.rb +47 -0
data/lib/traject/delimited_writer.rb +110 -0
data/lib/traject/indexer.rb +613 -0
data/lib/traject/indexer/settings.rb +110 -0
data/lib/traject/json_writer.rb +51 -0
data/lib/traject/line_writer.rb +63 -0
data/lib/traject/macros/basic.rb +9 -0
data/lib/traject/macros/marc21.rb +223 -0
data/lib/traject/macros/marc21_semantics.rb +584 -0
data/lib/traject/macros/marc_format_classifier.rb +197 -0
data/lib/traject/marc_extractor.rb +410 -0
data/lib/traject/marc_reader.rb +89 -0
data/lib/traject/mock_reader.rb +97 -0
data/lib/traject/ndj_reader.rb +40 -0
data/lib/traject/null_writer.rb +22 -0
data/lib/traject/qualified_const_get.rb +40 -0
data/lib/traject/solr_json_writer.rb +277 -0
data/lib/traject/thread_pool.rb +161 -0
data/lib/traject/translation_map.rb +267 -0
data/lib/traject/util.rb +52 -0
data/lib/traject/version.rb +3 -0
data/lib/traject/yaml_writer.rb +9 -0
data/lib/translation_maps/lcc_top_level.yaml +26 -0
data/lib/translation_maps/marc_genre_007.yaml +9 -0
data/lib/translation_maps/marc_genre_leader.yaml +22 -0
data/lib/translation_maps/marc_geographic.yaml +589 -0
data/lib/translation_maps/marc_instruments.yaml +102 -0
data/lib/translation_maps/marc_languages.yaml +490 -0
data/test/debug_writer_test.rb +38 -0
data/test/delimited_writer_test.rb +104 -0
data/test/indexer/each_record_test.rb +59 -0
data/test/indexer/macros_marc21_semantics_test.rb +391 -0
data/test/indexer/macros_marc21_test.rb +190 -0
data/test/indexer/macros_test.rb +40 -0
data/test/indexer/map_record_test.rb +209 -0
data/test/indexer/read_write_test.rb +101 -0
data/test/indexer/settings_test.rb +152 -0
data/test/indexer/to_field_test.rb +77 -0
data/test/marc_extractor_test.rb +412 -0
data/test/marc_format_classifier_test.rb +98 -0
data/test/marc_reader_test.rb +110 -0
data/test/solr_json_writer_test.rb +248 -0
data/test/test_helper.rb +90 -0
data/test/test_support/245_no_ab.marc +1 -0
data/test/test_support/880_with_no_6.utf8.marc +1 -0
data/test/test_support/bad_subfield_code.marc +1 -0
data/test/test_support/bad_utf_byte.utf8.marc +1 -0
data/test/test_support/date_resort_to_260.marc +1 -0
data/test/test_support/date_type_r_missing_date2.marc +1 -0
data/test/test_support/date_with_u.marc +1 -0
data/test/test_support/demo_config.rb +155 -0
data/test/test_support/emptyish_record.marc +1 -0
data/test/test_support/escaped_character_reference.marc8.marc +1 -0
data/test/test_support/george_eliot.marc +1 -0
data/test/test_support/hebrew880s.marc +1 -0
data/test/test_support/louis_armstrong.marc +1 -0
data/test/test_support/manufacturing_consent.marc +1 -0
data/test/test_support/manuscript_online_thesis.marc +1 -0
data/test/test_support/microform_online_conference.marc +1 -0
data/test/test_support/multi_era.marc +1 -0
data/test/test_support/multi_geo.marc +1 -0
data/test/test_support/musical_cage.marc +1 -0
data/test/test_support/nature.marc +1 -0
data/test/test_support/one-marc8.mrc +1 -0
data/test/test_support/online_only.marc +1 -0
data/test/test_support/packed_041a_lang.marc +1 -0
data/test/test_support/test_data.utf8.json +30 -0
data/test/test_support/test_data.utf8.marc.xml +2609 -0
data/test/test_support/test_data.utf8.mrc +1 -0
data/test/test_support/test_data.utf8.mrc.gz +0 -0
data/test/test_support/the_business_ren.marc +1 -0
data/test/translation_map_test.rb +225 -0
data/test/translation_maps/bad_ruby.rb +8 -0
data/test/translation_maps/bad_yaml.yaml +1 -0
data/test/translation_maps/both_map.rb +1 -0
data/test/translation_maps/both_map.yaml +1 -0
data/test/translation_maps/default_literal.rb +10 -0
data/test/translation_maps/default_passthrough.rb +10 -0
data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
data/test/translation_maps/properties_map.properties +5 -0
data/test/translation_maps/ruby_map.rb +10 -0
data/test/translation_maps/translate_array_test.yaml +8 -0
data/test/translation_maps/yaml_map.yaml +7 -0
data/traject.gemspec +47 -0
metadata +382 -0

data/lib/traject/macros/marc_format_classifier.rb ADDED Viewed

@@ -0,0 +1,197 @@
+module Traject
+  module Macros
+    # To use the marc_format macro, in your configuration file:
+    #
+    #     require 'traject/macros/marc_formats
+    #     extend Traject::Macros::MarcFormats
+    #
+    #     to_field("format_s") marc_formats
+    #
+    # See also MarcClassifier which can be used directly for a bit more
+    # control.
+    module MarcFormats
+      # very opionated macro that just adds a grab bag of format/genre/types
+      # from our own custom vocabulary, all into one field.
+      # You may want to build your own from MarcFormatClassifier functions instead.
+      #
+      def marc_formats
+        lambda do |record, accumulator|
+          accumulator.concat Traject::Macros::MarcFormatClassifier.new(record).formats
+        end
+      end
+    end
+    # A tool for classifiying MARC records according to format/form/genre/type,
+    # just using our own custom vocabulary for those things.
+    #
+    # used by the `marc_formats` macro, but you can also use it directly
+    # for a bit more control.
+    class MarcFormatClassifier
+      attr_reader :record
+      def initialize(marc_record)
+        @record = marc_record
+      end
+      # A very opinionated method that just kind of jams together
+      # all the possible format/genre/types into one array of 1 to N elements.
+      #
+      # If no other values are present, the default value "Other" will be used.
+      #
+      # See also individual methods which you can use you seperate into
+      # different facets or do other custom things.
+      def formats(options = {})
+        options = {:default => "Other"}.merge(options)
+        formats = []
+        formats.concat genre
+        formats << "Manuscript/Archive" if manuscript_archive?
+        formats << "Microform" if microform?
+        formats << "Online"    if online?
+        # In our own data, if it's an audio recording, it might show up
+        # as print, but it's probably not.
+        formats << "Print"     if print? && ! (formats.include?("Non-musical Recording") || formats.include?("Musical Recording"))
+        # If it's a Dissertation, we decide it's NOT a book
+        if thesis?
+          formats.delete("Book")
+          formats << "Dissertation/Thesis"
+        end
+        if proceeding?
+          formats <<  "Conference"
+        end
+        if formats.empty?
+          formats << options[:default]
+        end
+        return formats
+      end
+      # Returns 1 or more values in an array from:
+      # Book; Journal/Newspaper; Musical Score; Map/Globe; Non-musical Recording; Musical Recording
+      # Image; Software/Data; Video/Film
+      #
+      # Uses leader byte 6, leader byte 7, and 007 byte 0.
+      #
+      # Gets actual labels from marc_genre_leader and marc_genre_007 translation maps,
+      # so you can customize labels if you want.
+      def genre
+        marc_genre_leader = Traject::TranslationMap.new("marc_genre_leader")
+        marc_genre_007    = Traject::TranslationMap.new("marc_genre_007")
+        results = marc_genre_leader[ record.leader.slice(6,2) ] ||
+          marc_genre_leader[ record.leader.slice(6)] ||
+          record.find_all {|f| f.tag == "007"}.collect {|f| marc_genre_007[f.value.slice(0)]}
+        [results].flatten
+      end
+      # Just checks if it has a 502, if it does it's considered a thesis
+      def thesis?
+        @thesis_q ||= begin
+          ! record.find {|a| a.tag == "502"}.nil?
+        end
+      end
+      # Just checks all $6xx for a $v "Congresses"
+      def proceeding?
+        @proceeding_q ||= begin
+          ! record.find do |field|
+            field.tag.slice(0) == '6' && field.subfields.find {|sf| sf.code == "v" && sf.value =~ /^\s*(C|c)ongresses\.?\s*$/}
+          end.nil?
+        end
+      end
+      # Algorithm with help from Chris Case.
+      # * If it has any RDA 338, then it's print if it has a value of
+      #   volume, sheet, or card.
+      # * If it does not have an RDA 338, it's print if and only if it has
+      #   no 245$h GMD.
+      #
+      # * Here at JH, for legacy reasons we also choose to not
+      #   call it print if it's already been marked audio, but
+      #   we do that in a different method.
+      #
+      # Note that any record that has neither a 245 nor a 338rda is going
+      # to be marked print
+      #
+      # This algorithm is definitely going to get some things wrong in
+      # both directions, with real world data. But seems to be good enough.
+      def print?
+        rda338 = record.find_all do |field|
+          field.tag == "338" && field['2'] == "rdacarrier"
+        end
+        if rda338.length > 0
+          rda338.find do |field|
+            field.subfields.find do |sf|
+              (sf.code == "a" && %w{volume card sheet}.include?(sf.value)) ||
+              (sf.code == "b" && %w{nc no nb}.include?(sf.value))
+            end
+          end
+        else
+          normalized_gmd.length == 0
+        end
+      end
+      # We use marc 007 to determine if this represents an online
+      # resource. But sometimes resort to 245$h GMD too.
+      def online?
+        # field 007, byte 0 c="electronic" byte 1 r="remote" ==> sure Online
+        found_007 = record.fields('007').find do |field|
+          field.value.slice(0) == "c" && field.value.slice(1) == "r"
+        end
+        return true if found_007
+        # Otherwise, if it has a GMD ["electronic resource"], we count it
+        # as online only if NO 007[0] == 'c' exists, cause if it does we already
+        # know it's electronic but not remote, otherwise first try would
+        # have found it.
+        return (normalized_gmd.start_with? "[electronic resource]") && ! record.find {|f| f.tag == '007' && f.value.slice(0) == "c"}
+      end
+      # if field 007 byte 0 is 'h', that's microform. But many of our microform
+      # don't have that. If leader byte 6 is 'h', that's an obsolete way of saying
+      # microform. And finally, if GMD is
+      def microform?
+        normalized_gmd.start_with?("[microform]") ||
+        record.leader[6] == "h" ||
+        record.find {|f| (f.tag == "007") && (f.value[0] == "h")}
+      end
+      # Marked as manuscript OR archive.
+      def manuscript_archive?
+        leader06 = record.leader.slice(6)
+        leader08 = record.leader.slice(8)
+        # leader 6 t=Manuscript Language Material, d=Manuscript Music,
+        # f=Manuscript Cartographic
+        #
+        # leader 06 = 'b' is obsolete, but if it exists it means archival countrl
+        #
+        # leader 08 'a'='archival control'
+        %w{t d f b}.include?(leader06) || leader08 == "a"
+      end
+      # downcased version of the gmd, or else empty string
+      def normalized_gmd
+        @gmd ||= begin
+          ((a245 = record['245']) && a245['h'] && a245['h'].downcase) || ""
+        end
+      end
+    end
+  end
+end

data/lib/traject/marc_extractor.rb ADDED Viewed

@@ -0,0 +1,410 @@
+module Traject
+  # MarcExtractor is a class for extracting lists of strings from a MARC::Record,
+  # according to specifications. See #parse_string_spec for description of string
+  # string arguments used to specify extraction. See #initialize for options
+  # that can be set controlling extraction.
+  #
+  # Examples:
+  #
+  #     array_of_stuff   = MarcExtractor.new("001:245abc:700a").extract(marc_record)
+  #     values           = MarcExtractor.new("245a:245abc").extract_marc(marc_record)
+  #     seperated_values = MarcExtractor.new("020a:020z").extract(marc_record)
+  #     bytes            = MarcExtractor.new("008[35-37]")
+  #
+  # ## String extraction specifications
+  #
+  # Extraction directions are supplied in strings, usually as the first
+  # parameter to MarcExtractor.new or MarcExtractor.cached. These specifications
+  # are also the first parameter to the #marc_extract macro.
+  #
+  # A String specification is a string (or array of strings) which consists
+  # of one or more Data and Control Field Specifications seperated by colons.
+  #
+  # A Data Field Specification is of the form:
+  #
+  # * `{tag}{|indicators|}{subfields}`
+  # * {tag} is three chars (usually but not neccesarily numeric)
+  # * {indicators} are optional two chars enclosed in pipe ('|') characters,
+  # * {subfields} are optional list of chars (alphanumeric)
+  #
+  # indicator spec must be two chars, but one can be * meaning "don't care".
+  # space to mean 'blank'
+  #
+  #     "245|01|abc65:345abc:700|*5|:800"
+  #
+  # A Control Field Specification is used with tags for control (fixed) fields (ordinarily fields 001-010)
+  # and includes a tag and a a byte slice specification.
+  #
+  #      "008[35-37]:007[5]""
+  #      => bytes 35-37 inclusive of any field 008, and byte 5 of any field 007
+  #
+  # * subfields and indicators can only be provided for marc data/variable fields
+  # * byte slice can only be provided for marc control fields (generally tags less than 010)
+  #
+  # ## Subfield concatenation
+  #
+  # Normally, for a spec including multiple subfield codes, multiple subfields
+  # from the same MARC field will be concatenated into one string separated by spaces:
+  #
+  #     600 a| Chomsky, Noam x| Philosophy.
+  #     600 a| Chomsky, Noam x| Political and social views.
+  #     MarcExtractor.new("600ax").extract(record)
+  #     # results in two values sent to Solr:
+  #     "Chomsky, Noam Philosophy."
+  #     "Chomsky, Noam Political and social views."
+  #
+  # You can turn off this concatenation and leave individual subfields in seperate
+  # strings by setting the `separator` option to nil:
+  #
+  #     MarcExtractor.new("600ax", :separator => nil).extract(record)
+  #     # Results in four values being sent to Solr (or 3 if you de-dup):
+  #     "Chomksy, Noam"
+  #     "Philosophy."
+  #     "Chomsky, Noam"
+  #     "Political and social views."
+  #
+  # However, **the default is different for specifications with only a single
+  # subfield**, these are by default kept seperated:
+  #
+  #     020 a| 285197145X a| 9782851971456
+  #     MarcExtractor.new("020a:020z").extract(record)
+  #     # two seperate strings sent to Solr:
+  #     "285197145X"
+  #     "9782851971456"
+  #
+  # For single subfield specifications, you force concatenation by
+  # repeating the subfield specification:
+  #
+  #     MarcExtractor.new("020aa:020zz").extract(record)
+  #     # would result in a single string sent to solr for
+  #     # the single field, by default space-separated:
+  #     "285197145X 9782851971456"
+  #
+  # ## Note on Performance and MarcExtractor creation and reuse
+  #
+  # A MarcExtractor is somewhat expensive to create, and has been shown in profiling/
+  # benchmarking to be a bottleneck if you end up creating one for each marc record
+  # processed.  Instead, a single MarcExtractor should be created, and re-used
+  # per MARC record.
+  #
+  # If you are creating a traject 'macro' method, here's one way to do that,
+  # capturing the MarcExtractor under closure:
+  #
+  #     def some_macro(spec, other_args, whatever)
+  #       extractor = MarcExtractor.new( spec )
+  #       # ...
+  #       return lambda do |record, accumulator, context|
+  #          #...
+  #          accumulator.concat extractor.extract(record)
+  #          #...
+  #       end
+  #     end
+  #
+  # In other cases, you may find it convenient to improve performance by
+  # using the MarcExtractor#cached method, instead of MarcExtractor#new, to
+  # lazily create and then re-use a MarcExtractor object with
+  # particular initialization arguments.
+  class MarcExtractor
+    attr_accessor :options, :spec_hash
+    # First arg is a specification for extraction of data from a MARC record.
+    # Specification can be given in two forms:
+    #
+    #  * a string specification like "008[35]:020a:245abc", see top of class
+    #    for examples. A string specification is most typical argument.
+    #  * The output of a previous call to MarcExtractor.parse_string_spec(string_spec),
+    #    a 'pre-parsed' specification.
+    #
+    # Second arg is options:
+    #
+    # [:separator]  default ' ' (space), what to use to separate
+    #               subfield values when joining strings
+    #
+    # [:alternate_script] default :include, include linked 880s for tags
+    #                     that match spec. Also:
+    #                     * false => do not include.
+    #                     * :only => only include linked 880s, not original
+    def initialize(spec, options = {})
+      self.options = {
+        :separator => ' ',
+        :alternate_script => :include
+      }.merge(options)
+      self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)
+      # Tags are "interesting" if we have a spec that might cover it
+      @interesting_tags_hash = {}
+      # By default, interesting tags are those represented by keys in spec_hash.
+      # Add them unless we only care about alternate scripts.
+      unless options[:alternate_script] == :only
+        self.spec_hash.keys.each {|tag| @interesting_tags_hash[tag] = true}
+      end
+      # If we *are* interested in alternate scripts, add the 880
+      if options[:alternate_script] != false
+        @interesting_tags_hash['880'] = true
+      end
+      self.freeze
+    end
+    # Takes the same arguments as MarcExtractor.new, but will re-use an existing
+    # cached MarcExtractor already created with given initialization arguments,
+    # if available.
+    #
+    # This can be used to increase performance of indexing routines, as
+    # MarcExtractor creation has been shown via profiling/benchmarking
+    # to be expensive.
+    #
+    # Cache is thread-local, so should be thread-safe.
+    #
+    # You should _not_ modify the state of any MarcExtractor retrieved
+    # via cached, as the MarcExtractor will be re-used and shared (possibly
+    # between threads even!). We try to use ruby #freeze to keep you from doing so,
+    # although if you try hard enough you can surely find a way to do something
+    # you shouldn't.
+    #
+    #     extractor = MarcExtractor.cached("245abc:700a", :separator => nil)
+    def self.cached(*args)
+      cache = (Thread.current[:marc_extractor_cached] ||= Hash.new)
+      return ( cache[args] ||= Traject::MarcExtractor.new(*args).freeze )
+    end
+    # Check to see if a tag is interesting (meaning it may be covered by a spec
+    # and the passed-in options about alternate scripts)
+    def interesting_tag?(tag)
+      return @interesting_tags_hash.include?(tag)
+    end
+    # Converts from a string marc spec like "008[35]:245abc:700a" to a hash used internally
+    # to represent the specification. See comments at head of class for
+    # documentation of string specification format.
+    #
+    #
+    # ## Return value
+    #
+    # The hash returned is keyed by tag, and has as values an array of 0 or
+    # or more MarcExtractor::Spec objects representing the specified extraction
+    # operations for that tag.
+    #
+    # It's an array of possibly more than one, because you can specify
+    # multiple extractions on the same tag: for instance "245a:245abc"
+    #
+    # See tests for more examples.
+    def self.parse_string_spec(spec_string)
+      # hash defaults to []
+      hash = Hash.new
+      spec_strings = spec_string.is_a?(Array) ? spec_string.map{|s| s.split(/\s*:\s*/)}.flatten : spec_string.split(/s*:\s*/)
+      spec_strings.each do |part|
+        if (part =~ /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*]{2})\|)?([a-z0-9]*)?\Z/)
+          # variable field
+          tag, indicators, subfields = $1, $3, $4
+          spec = Spec.new(:tag => tag)
+          if subfields and !subfields.empty?
+            spec.subfields = subfields.split('')
+          end
+          if indicators
+           # if specified as '*', leave nil
+           spec.indicator1 = indicators[0] if indicators[0] != "*"
+           spec.indicator2 = indicators[1] if indicators[1] != "*"
+          end
+          hash[spec.tag] ||= []
+          hash[spec.tag] << spec
+        elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # control field, "005[4-5]"
+          tag, byte1, byte2 = $1, $3, $5
+          spec = Spec.new(:tag => tag)
+          if byte1 && byte2
+            spec.bytes = ((byte1.to_i)..(byte2.to_i))
+          elsif byte1
+           spec.bytes = byte1.to_i
+          end
+          hash[spec.tag] ||= []
+          hash[spec.tag] << spec
+        else
+          raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
+        end
+      end
+      return hash
+    end
+    # Returns array of strings, extracted values. Maybe empty array.
+    def extract(marc_record)
+      results = []
+      self.each_matching_line(marc_record) do |field, spec|
+        if control_field?(field)
+          results << (spec.bytes ? field.value.byteslice(spec.bytes) : field.value)
+        else
+          results.concat collect_subfields(field, spec)
+        end
+      end
+      return results
+    end
+    # Yields a block for every line in source record that matches
+    # spec. First arg to block is MARC::DataField or ControlField, second
+    # is the MarcExtractor::Spec that it matched on. May take account
+    # of options such as :alternate_script
+    #
+    # Third (optional) arg to block is self, the MarcExtractor object, useful for custom
+    # implementations.
+    def each_matching_line(marc_record)
+      marc_record.fields(@interesting_tags_hash.keys).each do |field|
+        # Make sure it matches indicators too, specs_covering_field
+        # doesn't check that.
+        specs_covering_field(field).each do |spec|
+          if spec.matches_indicators?(field)
+            yield(field, spec, self)
+          end
+        end
+      end
+    end
+    # line each_matching_line, takes a block to process each matching line,
+    # but collects results of block into an array -- flattens any subarrays for you!
+    #
+    # Useful for re-use of this class for custom processing
+    #
+    # yields the MARC Field, the MarcExtractor::Spec object, the MarcExtractor object.
+    def collect_matching_lines(marc_record)
+      results = []
+      self.each_matching_line(marc_record) do |field, spec, extractor|
+        results.concat [yield(field, spec, extractor)].flatten
+      end
+      return results
+    end
+    # Pass in a marc data field and a Spec object with extraction
+    # instructions, returns an ARRAY of one or more strings, subfields extracted
+    # and processed per spec. Takes account of options such
+    # as :separator
+    #
+    # Always returns array, sometimes empty array.
+    def collect_subfields(field, spec)
+      subfields = field.subfields.collect do |subfield|
+        subfield.value if spec.includes_subfield_code?(subfield.code)
+      end.compact
+      return subfields if subfields.empty? # empty array, just return it.
+      if options[:separator] && spec.joinable?
+        subfields = [subfields.join(options[:separator])]
+      end
+      return subfields
+    end
+    # Find Spec objects, if any, covering extraction from this field.
+    # Returns an array of 0 or more MarcExtractor::Spec objects
+    #
+    # When given an 880, will return the spec (if any) for the linked tag iff
+    # we have a $6 and we want the alternate script.
+    #
+    # Returns an empty array in case of no matching extraction specs.
+    def specs_covering_field(field)
+      tag = field.tag
+      # Short-circuit the unintersting stuff
+      return [] unless interesting_tag?(tag)
+      # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
+      # to do this weird encode gymnastics, which fixes it for mysterious reasons.
+      if tag == "880" && field['6']
+        tag = field["6"].encode(field["6"].encoding).byteslice(0,3)
+      end
+      # Take the resulting tag and get the spec from it (or the default nil if there isn't a spec for this tag)
+      spec = self.spec_hash[tag] || []
+    end
+    def control_field?(field)
+      # should the MARC gem have a more efficient way to do this,
+      # define #control_field? on both ControlField and DataField?
+      return field.kind_of? MARC::ControlField
+    end
+    def freeze
+      self.options.freeze
+      self.spec_hash.freeze
+      super
+    end
+    # Represents a single specification for extracting data
+    # from a marc field, like "600abc" or "600|1*|x".
+    #
+    # Includes the tag for reference, although this is redundant and not actually used
+    # in logic, since the tag is also implicit in the overall spec_hash
+    # with tag => [spec1, spec2]
+    class Spec
+      attr_accessor :tag, :subfields, :indicator1, :indicator2, :bytes
+      def initialize(hash = {})
+        hash.each_pair do |key, value|
+          self.send("#{key}=", value)
+        end
+      end
+      #  Should subfields extracted by joined, if we have a seperator?
+      #  * '630' no subfields specified => join all subfields
+      #  * '630abc' multiple subfields specified = join all subfields
+      #  * '633a' one subfield => do not join, return one value for each $a in the field
+      #  * '633aa' one subfield, doubled => do join after all, will return a single string joining all the values of all the $a's.
+      #
+      # Last case is handled implicitly at the moment when subfields == ['a', 'a']
+      def joinable?
+        (self.subfields.nil? || self.subfields.size != 1)
+      end
+      # Pass in a MARC field, do it's indicators match indicators
+      # in this spec? nil indicators in spec mean we don't care, everything
+      # matches.
+      def matches_indicators?(field)
+        return (self.indicator1.nil? || self.indicator1 == field.indicator1) &&
+          (self.indicator2.nil? || self.indicator2 == field.indicator2)
+      end
+      # Pass in a string subfield code like 'a'; does this
+      # spec include it?
+      def includes_subfield_code?(code)
+        # subfields nil means include them all
+        self.subfields.nil? || self.subfields.include?(code)
+      end
+      def ==(spec)
+        return false unless spec.kind_of?(Spec)
+        return (self.tag == spec.tag) &&
+          (self.subfields == spec.subfields) &&
+          (self.indicator1 == spec.indicator1) &&
+          (self.indicator1 == spec.indicator2) &&
+          (self.bytes == spec.bytes)
+      end
+    end
+  end
+end