RubyGems - traject - Versions diffs - 2.0.0-java - Mend

traject 2.0.0-java

Files changed (104) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.travis.yml +27 -0
data/.yardopts +3 -0
data/Gemfile +12 -0
data/LICENSE.txt +20 -0
data/README.md +461 -0
data/Rakefile +21 -0
data/bench/bench.rb +30 -0
data/bin/traject +16 -0
data/doc/batch_execution.md +243 -0
data/doc/extending.md +190 -0
data/doc/indexing_rules.md +265 -0
data/doc/other_commands.md +47 -0
data/doc/settings.md +101 -0
data/lib/tasks/load_maps.rake +48 -0
data/lib/traject.rb +11 -0
data/lib/traject/command_line.rb +301 -0
data/lib/traject/csv_writer.rb +34 -0
data/lib/traject/debug_writer.rb +47 -0
data/lib/traject/delimited_writer.rb +110 -0
data/lib/traject/indexer.rb +613 -0
data/lib/traject/indexer/settings.rb +110 -0
data/lib/traject/json_writer.rb +51 -0
data/lib/traject/line_writer.rb +63 -0
data/lib/traject/macros/basic.rb +9 -0
data/lib/traject/macros/marc21.rb +223 -0
data/lib/traject/macros/marc21_semantics.rb +584 -0
data/lib/traject/macros/marc_format_classifier.rb +197 -0
data/lib/traject/marc_extractor.rb +410 -0
data/lib/traject/marc_reader.rb +89 -0
data/lib/traject/mock_reader.rb +97 -0
data/lib/traject/ndj_reader.rb +40 -0
data/lib/traject/null_writer.rb +22 -0
data/lib/traject/qualified_const_get.rb +40 -0
data/lib/traject/solr_json_writer.rb +277 -0
data/lib/traject/thread_pool.rb +161 -0
data/lib/traject/translation_map.rb +267 -0
data/lib/traject/util.rb +52 -0
data/lib/traject/version.rb +3 -0
data/lib/traject/yaml_writer.rb +9 -0
data/lib/translation_maps/lcc_top_level.yaml +26 -0
data/lib/translation_maps/marc_genre_007.yaml +9 -0
data/lib/translation_maps/marc_genre_leader.yaml +22 -0
data/lib/translation_maps/marc_geographic.yaml +589 -0
data/lib/translation_maps/marc_instruments.yaml +102 -0
data/lib/translation_maps/marc_languages.yaml +490 -0
data/test/debug_writer_test.rb +38 -0
data/test/delimited_writer_test.rb +104 -0
data/test/indexer/each_record_test.rb +59 -0
data/test/indexer/macros_marc21_semantics_test.rb +391 -0
data/test/indexer/macros_marc21_test.rb +190 -0
data/test/indexer/macros_test.rb +40 -0
data/test/indexer/map_record_test.rb +209 -0
data/test/indexer/read_write_test.rb +101 -0
data/test/indexer/settings_test.rb +152 -0
data/test/indexer/to_field_test.rb +77 -0
data/test/marc_extractor_test.rb +412 -0
data/test/marc_format_classifier_test.rb +98 -0
data/test/marc_reader_test.rb +110 -0
data/test/solr_json_writer_test.rb +248 -0
data/test/test_helper.rb +90 -0
data/test/test_support/245_no_ab.marc +1 -0
data/test/test_support/880_with_no_6.utf8.marc +1 -0
data/test/test_support/bad_subfield_code.marc +1 -0
data/test/test_support/bad_utf_byte.utf8.marc +1 -0
data/test/test_support/date_resort_to_260.marc +1 -0
data/test/test_support/date_type_r_missing_date2.marc +1 -0
data/test/test_support/date_with_u.marc +1 -0
data/test/test_support/demo_config.rb +155 -0
data/test/test_support/emptyish_record.marc +1 -0
data/test/test_support/escaped_character_reference.marc8.marc +1 -0
data/test/test_support/george_eliot.marc +1 -0
data/test/test_support/hebrew880s.marc +1 -0
data/test/test_support/louis_armstrong.marc +1 -0
data/test/test_support/manufacturing_consent.marc +1 -0
data/test/test_support/manuscript_online_thesis.marc +1 -0
data/test/test_support/microform_online_conference.marc +1 -0
data/test/test_support/multi_era.marc +1 -0
data/test/test_support/multi_geo.marc +1 -0
data/test/test_support/musical_cage.marc +1 -0
data/test/test_support/nature.marc +1 -0
data/test/test_support/one-marc8.mrc +1 -0
data/test/test_support/online_only.marc +1 -0
data/test/test_support/packed_041a_lang.marc +1 -0
data/test/test_support/test_data.utf8.json +30 -0
data/test/test_support/test_data.utf8.marc.xml +2609 -0
data/test/test_support/test_data.utf8.mrc +1 -0
data/test/test_support/test_data.utf8.mrc.gz +0 -0
data/test/test_support/the_business_ren.marc +1 -0
data/test/translation_map_test.rb +225 -0
data/test/translation_maps/bad_ruby.rb +8 -0
data/test/translation_maps/bad_yaml.yaml +1 -0
data/test/translation_maps/both_map.rb +1 -0
data/test/translation_maps/both_map.yaml +1 -0
data/test/translation_maps/default_literal.rb +10 -0
data/test/translation_maps/default_passthrough.rb +10 -0
data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
data/test/translation_maps/properties_map.properties +5 -0
data/test/translation_maps/ruby_map.rb +10 -0
data/test/translation_maps/translate_array_test.yaml +8 -0
data/test/translation_maps/yaml_map.yaml +7 -0
data/traject.gemspec +47 -0
metadata +382 -0

data/test/debug_writer_test.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require 'test_helper'
+require 'stringio'
+require 'traject/debug_writer'
+require 'traject'
+require 'marc'
+describe 'Simple output' do
+  before do
+    @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+    @indexer = Traject::Indexer.new
+    @indexer.instance_eval do
+      to_field "id", extract_marc("001", :first => true)
+      to_field "title", extract_marc("245ab")
+    end
+    @io = StringIO.new
+    @writer = Traject::DebugWriter.new("output_stream" => @io)
+    @id = "2710183"
+    @title = "Manufacturing consent : the political economy of the mass media /"
+  end
+  it "does a simple output" do
+    @writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
+    expected = [
+      "#{@id} id #{@id}",
+      "#{@id} title #{@title}",
+      "\n"
+    ]
+    assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
+    @writer.close
+  end
+end

data/test/delimited_writer_test.rb ADDED Viewed

@@ -0,0 +1,104 @@
+# Encoding: UTF-8
+require 'test_helper'
+require 'stringio'
+require 'traject/delimited_writer'
+require 'traject/csv_writer'
+require 'csv'
+describe "Delimited/CSV Writers" do
+  before do
+    @out                 = StringIO.new
+    @settings            = {'output_stream' => @out, 'delimited_writer.fields' => 'four,one,two'}
+    @context             = Struct.new(:output_hash).new
+    @context.output_hash = {'one' => 'one', 'two' => %w[two1 two2], 'three' => 'three', 'four' => 'four'}
+  end
+  after do
+    @out.close
+  end
+  describe "Traject::DelimitedWriter" do
+    it "creates a dw with defaults" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal "\t"
+      dw.internal_delimiter.must_equal '|'
+      dw.edelim.must_equal ' '
+      dw.eidelim.must_equal '\\|'
+    end
+    it "respects different delimiter" do
+      @settings['delimited_writer.delimiter'] = '^'
+      dw                                      = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal '^'
+      dw.edelim.must_equal '\\^'
+      dw.internal_delimiter.must_equal '|'
+    end
+    it "outputs a header if asked to" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      @out.string.chomp.must_equal %w[four one two].join("\t")
+    end
+    it "doesn't output a header if asked not to" do
+      @settings['delimited_writer.header'] = 'false'
+      dw                                   = Traject::DelimitedWriter.new(@settings)
+      @out.string.must_be_empty
+    end
+    it "deals with multiple values" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
+    end
+    it "bails if delimited_writer.fields isn't set" do
+      @settings.delete 'delimited_writer.fields'
+      proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
+    end
+  end
+  describe "Traject::CSVWriter" do
+    it "unsets the delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.delimiter.must_be_nil
+    end
+    it "writes the header" do
+      cw = Traject::CSVWriter.new(@settings)
+      @out.string.chomp.must_equal 'four,one,two'
+    end
+    it "uses the internal delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
+    end
+    it "produces complex output" do
+      @context.output_hash = {
+          'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
+          'one' => 'Willard "Mitt" Romney',
+          'two' => 'Dueber, Bill'
+      }
+      canonical = StringIO.new
+      csv = CSV.new(canonical)
+      csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
+      csv << csv_vals
+      csv_output = canonical.string.chomp
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      traject_csvwriter_output = @out.string.split("\n").last.chomp
+      assert_equal(csv_output, traject_csvwriter_output)
+    end
+  end
+end

data/test/indexer/each_record_test.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'test_helper'
+describe "Traject::Indexer#each_record" do
+  before do
+    @indexer = Traject::Indexer.new
+  end
+  describe "checks arguments" do
+    it "rejects no-arg block" do
+      assert_raises(Traject::Indexer::ArityError) do
+        @indexer.each_record do
+        end
+      end
+    end
+    it "rejects three-arg block" do
+      assert_raises(Traject::Indexer::ArityError) do
+        @indexer.each_record do |one, two, three|
+        end
+      end
+    end
+    it "accepts one-arg block" do
+      @indexer.each_record do |record|
+      end
+    end
+    it "accepts two-arg block" do
+      @indexer.each_record do |record, context|
+      end
+    end
+    it "accepts variable arity block" do
+      @indexer.each_record do |*variable|
+      end
+    end
+    it "outputs error with source location" do
+      begin
+        @indexer.to_field('foo') {|one, two| }
+        @indexer.each_record {|one, two, three| }   # bad arity
+        flunk("Should have rejected bad arity ")
+      rescue Traject::Indexer::ArityError => e
+        assert_match(/each_record at .*\/.*:\d+/, e.message)
+      rescue
+        flunk("Should only fail with a ArityError")
+      end
+    end
+    it "rejects each_record with a name (e.g., using a to_field syntax)" do
+      assert_raises(Traject::Indexer::NamingError) do
+        @indexer.each_record('bad_name') {|one, two| }
+      end
+    end
+    it "reject each_record with no arguments/blocks at all" do
+      assert_raises(ArgumentError) do
+        @indexer.each_record()
+      end
+    end
+  end
+end

data/test/indexer/macros_marc21_semantics_test.rb ADDED Viewed

@@ -0,0 +1,391 @@
+# Encoding: UTF-8
+require 'test_helper'
+require 'traject/indexer'
+require 'traject/macros/marc21_semantics'
+require 'json'
+require 'marc/record'
+# See also marc_extractor_test.rb for more detailed tests on marc extraction;
+# this is just a basic test to make sure our macro works when passing through
+# to there, and with other options.
+describe "Traject::Macros::Marc21Semantics" do
+  Marc21Semantics = Traject::Macros::Marc21Semantics # shortcut
+  before do
+    @indexer = Traject::Indexer.new
+    @indexer.extend Marc21Semantics
+    @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+  end
+  it "oclcnum" do
+    @indexer.instance_eval do
+      to_field "oclcnum", oclcnum
+    end
+    output = @indexer.map_record(@record)
+    assert_equal %w{47971712},  output["oclcnum"]
+    assert_equal({}, @indexer.map_record(empty_record))
+  end
+  it "deals with all prefixed OCLC nunbers" do
+    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)ocm111111111']))
+    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)222222222']))
+    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', 'ocm333333333']))
+    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', 'ocn444444444']))
+    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)ocn555555555']))
+    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '(OCoLC)on666666666']))
+    @record.append(MARC::DataField.new('035', ' ', ' ', ['a', '777777777'])) # not OCLC number
+    @indexer.instance_eval do
+      to_field "oclcnum", oclcnum
+    end
+    output = @indexer.map_record(@record)
+    assert_equal %w{47971712 111111111 222222222 333333333 444444444 555555555 666666666},  output["oclcnum"]
+  end
+  it "#marc_series_facet" do
+    @record = MARC::Reader.new(support_file_path  "louis_armstrong.marc").to_a.first
+    @indexer.instance_eval do
+      to_field "series_facet", marc_series_facet
+    end
+    output = @indexer.map_record(@record)
+    # trims punctuation too
+    assert_equal ["Big bands"], output["series_facet"]
+    assert_equal({}, @indexer.map_record(empty_record))
+  end
+  describe "marc_sortable_author" do
+    # these probably should be taking only certain subfields, but we're copying
+    # from SolrMarc that didn't do so either and nobody noticed, so not bothering for now.
+    before do
+      @indexer.instance_eval do
+        to_field "author_sort", marc_sortable_author
+      end
+    end
+    it "collates author and title" do
+      output = @indexer.map_record(@record)
+      assert_equal ["Herman, Edward S.   Manufacturing consent the political economy of the mass media Edward S. Herman and Noam Chomsky ; with a new introduction by the authors"], output["author_sort"]
+      assert_equal [""], @indexer.map_record(empty_record)['author_sort']
+    end
+    it "respects non-filing" do
+      @record = MARC::Reader.new(support_file_path  "the_business_ren.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["Business renaissance quarterly [electronic resource]."], output["author_sort"]
+      assert_equal [""], @indexer.map_record(empty_record)['author_sort']
+    end
+  end
+  describe "marc_sortable_title" do
+    before do
+      @indexer.instance_eval { to_field "title_sort", marc_sortable_title }
+    end
+    it "works" do
+      output = @indexer.map_record(@record)
+      assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title_sort"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "respects non-filing" do
+      @record = MARC::Reader.new(support_file_path  "the_business_ren.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["Business renaissance quarterly"], output["title_sort"]
+    end
+    it "works with a record with no 245$ab" do
+      @record = MARC::Reader.new(support_file_path  "245_no_ab.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["Papers"], output["title_sort"]
+    end
+  end
+  describe "marc_languages" do
+    before do
+      @indexer.instance_eval {to_field "languages", marc_languages() }
+    end
+    it "unpacks packed 041a and translates" do
+      @record = MARC::Reader.new(support_file_path  "packed_041a_lang.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["English", "French", "German", "Italian", "Spanish", "Russian"], output["languages"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+  end
+  describe "marc_instrumentation_humanized" do
+    before do
+      @record = MARC::Reader.new(support_file_path  "musical_cage.marc").to_a.first
+      @indexer.instance_eval {to_field "instrumentation", marc_instrumentation_humanized }
+    end
+    it "translates, de-duping" do
+      output = @indexer.map_record(@record)
+      assert_equal ["Larger ensemble, Unspecified", "Piano", "Soprano voice", "Tenor voice", "Violin", "Larger ensemble, Ethnic", "Guitar", "Voices, Unspecified"], output["instrumentation"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+  end
+  describe "marc_instrument_codes_normalized" do
+    before do
+      @record = MARC::Reader.new(support_file_path  "musical_cage.marc").to_a.first
+      @indexer.instance_eval {to_field "instrument_codes", marc_instrument_codes_normalized }
+    end
+    it "normalizes, de-duping" do
+      output = @indexer.map_record(@record)
+      assert_equal ["on", "ka01", "ka", "va01", "va", "vd01", "vd", "sa01", "sa", "oy", "tb01", "tb", "vn12", "vn"],
+        output["instrument_codes"]
+    end
+    it "codes soloist 048$b" do
+      @record = MARC::Reader.new(support_file_path  "louis_armstrong.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["bb01", "bb01.s", "bb", "bb.s", "oe"], output["instrument_codes"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+  end
+  describe "publication_date" do
+    # there are way too many edge cases for us to test em all, but we'll test some of em.
+    it "works when there's no date information" do
+      assert_equal nil,  Marc21Semantics.publication_date(empty_record)
+    end
+    it "uses macro correctly with no date info" do
+      @indexer.instance_eval {to_field "date", marc_publication_date }
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "pulls out 008 date_type s" do
+      @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+      assert_equal 2002, Marc21Semantics.publication_date(@record)
+    end
+    it "uses start date for date_type c continuing resource" do
+      @record = MARC::Reader.new(support_file_path  "the_business_ren.marc").to_a.first
+      assert_equal 2006, Marc21Semantics.publication_date(@record)
+    end
+    it "returns nil when the records really got nothing" do
+      @record = MARC::Reader.new(support_file_path  "emptyish_record.marc").to_a.first
+      assert_equal nil, Marc21Semantics.publication_date(@record)
+    end
+    it "estimates with a single 'u'" do
+      @record = MARC::Reader.new(support_file_path  "date_with_u.marc").to_a.first
+      # was 184u as date1 on a continuing resource. For continuing resources,
+      # we take the first date. And need to deal with the u.
+      assert_equal 1845, Marc21Semantics.publication_date(@record)
+    end
+    it "resorts to 260c" do
+      @record = MARC::Reader.new(support_file_path  "date_resort_to_260.marc").to_a.first
+      assert_equal 1980, Marc21Semantics.publication_date(@record)
+    end
+    it "works with date type r missing date2" do
+      @record = MARC::Reader.new(support_file_path  "date_type_r_missing_date2.marc").to_a.first
+      assert_equal 1957, Marc21Semantics.publication_date(@record)
+    end
+    it "works correctly with date type 'q'" do
+      val = @record['008'].value
+      val[6] = 'q'
+      val[7..10] = '191u'
+      val[11..14] = '192u'
+      @record['008'].value = val
+      # Date should be date1 + date2 / 2 = (1910 + 1929) / 2 = 1919
+      estimate_tolerance = 30
+      assert_equal 1919, Marc21Semantics.publication_date(@record, estimate_tolerance)
+    end
+  end
+  describe "marc_lcc_to_broad_category" do
+    before do
+      @indexer.instance_eval {to_field "discipline_facet", marc_lcc_to_broad_category }
+    end
+    it "maps a simple example" do
+      @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["Language & Literature"], output["discipline_facet"]
+    end
+    it "maps to default" do
+      @record = MARC::Reader.new(support_file_path  "musical_cage.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["Unknown"], output["discipline_facet"]
+      assert_equal(["Unknown"], @indexer.map_record(empty_record)['discipline_facet'])
+    end
+    it "maps to nothing if none and no default" do
+      @indexer.instance_eval {to_field "discipline_no_default", marc_lcc_to_broad_category(:default => nil)}
+      @record = MARC::Reader.new(support_file_path  "musical_cage.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_nil output["discipline_no_default"]
+      assert_nil @indexer.map_record(empty_record)["discipline_no_default"]
+    end
+    describe "LCC_REGEX" do
+      it "rejects a non-LCC" do
+        refute_match Traject::Macros::Marc21Semantics::LCC_REGEX, "Film no. A .N285"
+      end
+    end
+  end
+  describe "marc_geo_facet" do
+    before do
+      @indexer.instance_eval {to_field "geo_facet", marc_geo_facet }
+    end
+    it "maps a complicated record" do
+      @record = MARC::Reader.new(support_file_path  "multi_geo.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"], output["geo_facet"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "maps nothing on a record with no geo" do
+      @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_nil output["geo_facet"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+  end
+  describe "marc_era_facet" do
+    before do
+      @indexer.instance_eval {to_field "era_facet", marc_era_facet}
+    end
+    it "maps a complicated record" do
+      @record = MARC::Reader.new(support_file_path  "multi_era.marc").to_a.first
+      output = @indexer.map_record(@record)
+      assert_equal ["Early modern, 1500-1700", "17th century", "Great Britain: Puritan Revolution, 1642-1660", "Great Britain: Civil War, 1642-1649", "1642-1660"],
+        output["era_facet"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+  end
+  describe "marc_lcsh_display" do
+    it "formats typical field" do
+      field = MARC::DataField.new('650', ' ', ' ', ['a', 'Psychoanalysis and literature'], ['z', 'England'], ['x', 'History'], ['y', '19th century.'])
+      str = Marc21Semantics.assemble_lcsh(field)
+      assert_equal "Psychoanalysis and literature — England — History — 19th century", str
+    end
+    it "ignores numeric subfields" do
+      field = MARC::DataField.new('650', ' ', ' ', ['a', 'Psychoanalysis and literature'], ['x', 'History'], ['0', '01234'], ['3', 'Some part'])
+      str = Marc21Semantics.assemble_lcsh(field)
+      assert_equal "Psychoanalysis and literature — History", str
+    end
+    it "doesn't put subdivision in wrong place" do
+      field = MARC::DataField.new('600', ' ', ' ', ['a', 'Eliot, George,'],['d', '1819-1880.'], ['t', 'Middlemarch'])
+      str = Marc21Semantics.assemble_lcsh(field)
+      assert_equal "Eliot, George, 1819-1880. Middlemarch", str
+    end
+    it "mixes non-subdivisions with subdivisions" do
+      field = MARC::DataField.new('600', ' ', ' ', ['a', 'Eliot, George,'],['d', '1819-1880.'], ['t', 'Middlemarch'], ['x', 'Criticism.'])
+      str = Marc21Semantics.assemble_lcsh(field)
+      assert_equal "Eliot, George, 1819-1880. Middlemarch — Criticism", str
+    end
+    it "returns nil for a field with no relevant subfields" do
+      field = MARC::DataField.new('650', ' ', ' ')
+      assert_nil Marc21Semantics.assemble_lcsh(field)
+    end
+    describe "marc_lcsh_formatted macro" do
+      it "smoke test" do
+        @record = MARC::Reader.new(support_file_path  "george_eliot.marc").to_a.first
+        @indexer.instance_eval {to_field "lcsh", marc_lcsh_formatted}
+        output = @indexer.map_record(@record)
+        assert output["lcsh"].length > 0, "outputs data"
+        assert output["lcsh"].include?("Eliot, George, 1819-1880 — Characters"), "includes a string its supposed to"
+        assert_equal({}, @indexer.map_record(empty_record))
+      end
+    end
+  end
+  describe "extract_marc_filing_version" do
+    before do
+      @record = MARC::Reader.new(support_file_path  "the_business_ren.marc").to_a.first
+    end
+    it "works as expected" do
+      @indexer.instance_eval do
+        to_field 'title_phrase', extract_marc_filing_version('245ab')
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ['Business renaissance quarterly'], output['title_phrase']
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "works with :include_original" do
+      @indexer.instance_eval do
+        to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true)
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ['The Business renaissance quarterly', 'Business renaissance quarterly'], output['title_phrase']
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "doesn't do anything if you don't include the first subfield" do
+      @indexer.instance_eval do
+        to_field 'title_phrase', extract_marc_filing_version('245h')
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ['[electronic resource].'], output['title_phrase']
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "dies if you pass it something else" do
+      assert_raises(RuntimeError) do
+        @indexer.instance_eval do
+          to_field 'title_phrase', extract_marc_filing_version('245ab', :include_original=>true, :uniq => true)
+        end
+      end
+    end
+  end
+end