traject 1.0.0.beta.1 → 1.0.0.beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/traject/indexer.rb +4 -4
- data/lib/traject/macros/marc21.rb +11 -1
- data/lib/traject/macros/marc21_semantics.rb +69 -1
- data/lib/traject/version.rb +1 -1
- data/test/indexer/macros_marc21_semantics_test.rb +47 -1
- data/test/indexer/macros_marc21_test.rb +3 -0
- data/test/test_support/demo_config.rb +1 -1
- data/test/test_support/george_eliot.marc +1 -0
- metadata +4 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: cb35b4c5ba302cb865b459bfac6859ef6be68927
         | 
| 4 | 
            +
              data.tar.gz: 14964b88428d0a827932cbf17194a77c56de1091
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: b3ae114fe4a11baaf6f6470d35d5df339a32b8f7e747f012310ee90ed55b982754d196abcc25ec0375d8bf988ca5e3ebd2c71ef7df159bc7007e6cdae9c69643
         | 
| 7 | 
            +
              data.tar.gz: 74c6c6170860cf1cd4883d644f9c23151a703ff996592abb8668a655dcff885c108c7e04e13419f6c4ba727384b3bab3dff896c177382284b8d12b98cc711225
         | 
    
        data/README.md
    CHANGED
    
    | @@ -3,7 +3,7 @@ | |
| 3 3 | 
             
            Tools for reading MARC records, transforming them with indexing rules, and indexing to Solr.
         | 
| 4 4 | 
             
            Might be used to index MARC data for a Solr-based discovery product like [Blacklight](https://github.com/projectblacklight/blacklight) or [VUFind](http://vufind.org/).
         | 
| 5 5 |  | 
| 6 | 
            -
            Traject might also be generalized to a set of tools for getting structured data from a source, and  | 
| 6 | 
            +
            Traject might also be generalized to a set of tools for getting structured data from a source, and transforming it to a hash-like object to send to a destination. 
         | 
| 7 7 |  | 
| 8 8 |  | 
| 9 9 | 
             
            **Traject is nearing 1.0, it is robust, feature-rich and being used in production by authors -- feedback invited**
         | 
    
        data/lib/traject/indexer.rb
    CHANGED
    
    | @@ -7,7 +7,6 @@ require 'traject/indexer/settings' | |
| 7 7 | 
             
            require 'traject/marc_reader'
         | 
| 8 8 | 
             
            require 'traject/marc4j_reader'
         | 
| 9 9 | 
             
            require 'traject/json_writer'
         | 
| 10 | 
            -
            require 'traject/solrj_writer'
         | 
| 11 10 |  | 
| 12 11 | 
             
            require 'traject/macros/marc21'
         | 
| 13 12 | 
             
            require 'traject/macros/basic'
         | 
| @@ -71,9 +70,10 @@ require 'traject/macros/basic' | |
| 71 70 | 
             
            #  4) Optionally implements a #skipped_record_count method, returning int count of records
         | 
| 72 71 | 
             
            #     that were skipped due to errors (and presumably logged)
         | 
| 73 72 | 
             
            #
         | 
| 74 | 
            -
            #  The default writer  | 
| 75 | 
            -
            #   | 
| 76 | 
            -
            #   | 
| 73 | 
            +
            #  The default writer is the SolrJWriter, using Java SolrJ to
         | 
| 74 | 
            +
            #  write to a Solr.  A few other built-in writers are available,
         | 
| 75 | 
            +
            #  but it's anticipated more will be created as plugins or local
         | 
| 76 | 
            +
            #  code for special purposes. 
         | 
| 77 77 | 
             
            #
         | 
| 78 78 | 
             
            #  You can set alternate writers by setting a Class object directly
         | 
| 79 79 | 
             
            #  with the #writer_class method, or by the 'writer_class_name' Setting,
         | 
| @@ -191,8 +191,18 @@ module Traject::Macros | |
| 191 191 | 
             
                #
         | 
| 192 192 | 
             
                # Returns altered string, doesn't change original arg.
         | 
| 193 193 | 
             
                def self.trim_punctuation(str)
         | 
| 194 | 
            +
                  
         | 
| 195 | 
            +
                  # If something went wrong and we got a nil, just return it
         | 
| 196 | 
            +
                  return str unless str
         | 
| 197 | 
            +
                  
         | 
| 198 | 
            +
                  # trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
         | 
| 194 199 | 
             
                  str = str.sub(/ *[ ,\/;:] *\Z/, '')
         | 
| 195 | 
            -
             | 
| 200 | 
            +
             | 
| 201 | 
            +
                  # trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
         | 
| 202 | 
            +
                  str = str.sub(/( *\w\w\w)\. *\Z/, '\1')
         | 
| 203 | 
            +
             | 
| 204 | 
            +
                  # single square bracket characters if they are the start and/or end
         | 
| 205 | 
            +
                  #   chars and there are no internal square brackets.
         | 
| 196 206 | 
             
                  str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
         | 
| 197 207 | 
             
                  return str
         | 
| 198 208 | 
             
                end
         | 
| @@ -1,3 +1,5 @@ | |
| 1 | 
            +
            # Encoding: UTF-8
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            require 'traject/marc_extractor'
         | 
| 2 4 |  | 
| 3 5 | 
             
            module Traject::Macros
         | 
| @@ -81,7 +83,8 @@ module Traject::Macros | |
| 81 83 | 
             
                # 245 a and b, with non-filing characters stripped off
         | 
| 82 84 | 
             
                def marc_sortable_title
         | 
| 83 85 | 
             
                  lambda do |record, accumulator|
         | 
| 84 | 
            -
                     | 
| 86 | 
            +
                    st = Marc21Semantics.get_sortable_title(record)
         | 
| 87 | 
            +
                    accumulator << st if st
         | 
| 85 88 | 
             
                  end
         | 
| 86 89 | 
             
                end
         | 
| 87 90 |  | 
| @@ -503,6 +506,71 @@ module Traject::Macros | |
| 503 506 | 
             
                  end
         | 
| 504 507 | 
             
                end
         | 
| 505 508 |  | 
| 509 | 
            +
                # Extracts LCSH-carrying fields and formats them
         | 
| 510 | 
            +
                # as a pre-coordinated LCSH string, for instance suitable for including
         | 
| 511 | 
            +
                # in a facet. 
         | 
| 512 | 
            +
                #
         | 
| 513 | 
            +
                # You can supply your own list of fields as a spec, but for significant
         | 
| 514 | 
            +
                # customization you probably just want to write your own method in
         | 
| 515 | 
            +
                # terms of the Marc21Semantics.assemble_lcsh method. 
         | 
| 516 | 
            +
                def marc_lcsh_formatted(options = {})
         | 
| 517 | 
            +
                  spec            = options[:spec] || "600:610:611:630:648:650:651:654:6662"
         | 
| 518 | 
            +
                  subd_separator  = options[:subdivison_separator] || " — "
         | 
| 519 | 
            +
                  other_separator = options[:other_separator] || " "
         | 
| 520 | 
            +
             | 
| 521 | 
            +
                  extractor       = MarcExtractor.new(spec)
         | 
| 522 | 
            +
             | 
| 523 | 
            +
                  return lambda do |record, accumulator|
         | 
| 524 | 
            +
                    accumulator.concat( extractor.collect_matching_lines(record) do |field, spec|
         | 
| 525 | 
            +
                      Marc21Semantics.assemble_lcsh(field, subd_separator, other_separator)
         | 
| 526 | 
            +
                    end)
         | 
| 527 | 
            +
                  end
         | 
| 528 | 
            +
             | 
| 529 | 
            +
                end
         | 
| 530 | 
            +
             | 
| 531 | 
            +
                # Takes a MARC::Field and formats it into a pre-coordinated LCSH string
         | 
| 532 | 
            +
                # with subdivision separators in the right place. 
         | 
| 533 | 
            +
                #
         | 
| 534 | 
            +
                # For 600 fields especially, need to not just join with subdivision separator
         | 
| 535 | 
            +
                # to take account of $a$d$t -- for other fields, might be able to just
         | 
| 536 | 
            +
                # join subfields, not sure. 
         | 
| 537 | 
            +
                #
         | 
| 538 | 
            +
                # WILL strip trailing period from generated string, contrary to some LCSH practice.
         | 
| 539 | 
            +
                # Our data is inconsistent on whether it has period or not, this was
         | 
| 540 | 
            +
                # the easiest way to standardize. 
         | 
| 541 | 
            +
                #
         | 
| 542 | 
            +
                # Default subdivision separator is em-dash with spaces, set to '--' if you want. 
         | 
| 543 | 
            +
                #
         | 
| 544 | 
            +
                # Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading
         | 
| 545 | 
            +
                # is not carried in the MARC record. It may be system generated as a display constant
         | 
| 546 | 
            +
                # associated with the content of subfield $v, $x, $y, and $z."
         | 
| 547 | 
            +
                # http://www.loc.gov/marc/bibliographic/bd600.html
         | 
| 548 | 
            +
                def self.assemble_lcsh(marc_field, subd_separator = " — ", other_separator = " ")
         | 
| 549 | 
            +
                  str = ""
         | 
| 550 | 
            +
                  subd_prefix_codes = %w{v x y z}
         | 
| 551 | 
            +
             | 
| 552 | 
            +
             | 
| 553 | 
            +
                  marc_field.subfields.each_with_index do |sf, i|
         | 
| 554 | 
            +
                    # ignore non-alphabetic, like numeric control subfields
         | 
| 555 | 
            +
                    next unless sf.code =~ /\A[a-z]\Z/
         | 
| 556 | 
            +
             | 
| 557 | 
            +
                    prefix = if subd_prefix_codes.include? sf.code
         | 
| 558 | 
            +
                      subd_separator
         | 
| 559 | 
            +
                    elsif i == 0
         | 
| 560 | 
            +
                      ""
         | 
| 561 | 
            +
                    else
         | 
| 562 | 
            +
                      other_separator
         | 
| 563 | 
            +
                    end
         | 
| 564 | 
            +
                    str << prefix << sf.value
         | 
| 565 | 
            +
                  end
         | 
| 566 | 
            +
             | 
| 567 | 
            +
                  str.gsub!(/\.\Z/, '')
         | 
| 568 | 
            +
             | 
| 569 | 
            +
                  return nil if str == ""
         | 
| 570 | 
            +
             | 
| 571 | 
            +
                  return str
         | 
| 572 | 
            +
                end
         | 
| 573 | 
            +
             | 
| 506 574 |  | 
| 507 575 | 
             
              end
         | 
| 508 576 | 
             
            end
         | 
    
        data/lib/traject/version.rb
    CHANGED
    
    
| @@ -1,3 +1,5 @@ | |
| 1 | 
            +
            # Encoding: UTF-8
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            require 'test_helper'
         | 
| 2 4 |  | 
| 3 5 | 
             
            require 'traject/indexer'
         | 
| @@ -231,7 +233,52 @@ describe "Traject::Macros::Marc21Semantics" do | |
| 231 233 | 
             
                  assert_equal ["Early modern, 1500-1700", "17th century", "Great Britain: Puritan Revolution, 1642-1660", "Great Britain: Civil War, 1642-1649", "1642-1660"],
         | 
| 232 234 | 
             
                    output["era_facet"]
         | 
| 233 235 | 
             
                end
         | 
| 236 | 
            +
              end
         | 
| 237 | 
            +
             | 
| 238 | 
            +
              describe "marc_lcsh_display" do
         | 
| 239 | 
            +
                it "formats typical field" do      
         | 
| 240 | 
            +
                  field = MARC::DataField.new('650', ' ', ' ', ['a', 'Psychoanalysis and literature'], ['z', 'England'], ['x', 'History'], ['y', '19th century.'])
         | 
| 241 | 
            +
                  str = Marc21Semantics.assemble_lcsh(field)
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                  assert_equal "Psychoanalysis and literature — England — History — 19th century", str
         | 
| 244 | 
            +
                end
         | 
| 245 | 
            +
             | 
| 246 | 
            +
                it "ignores numeric subfields" do
         | 
| 247 | 
            +
                  field = MARC::DataField.new('650', ' ', ' ', ['a', 'Psychoanalysis and literature'], ['x', 'History'], ['0', '01234'], ['3', 'Some part'])
         | 
| 248 | 
            +
                  str = Marc21Semantics.assemble_lcsh(field)
         | 
| 249 | 
            +
             | 
| 250 | 
            +
                  assert_equal "Psychoanalysis and literature — History", str
         | 
| 251 | 
            +
                end
         | 
| 252 | 
            +
             | 
| 253 | 
            +
                it "doesn't put subdivision in wrong place" do 
         | 
| 254 | 
            +
                  field = MARC::DataField.new('600', ' ', ' ', ['a', 'Eliot, George,'],['d', '1819-1880.'], ['t', 'Middlemarch'])
         | 
| 255 | 
            +
                  str = Marc21Semantics.assemble_lcsh(field)
         | 
| 256 | 
            +
             | 
| 257 | 
            +
                  assert_equal "Eliot, George, 1819-1880. Middlemarch", str
         | 
| 258 | 
            +
                end
         | 
| 259 | 
            +
             | 
| 260 | 
            +
                it "mixes non-subdivisions with subdivisions" do
         | 
| 261 | 
            +
                  field = MARC::DataField.new('600', ' ', ' ', ['a', 'Eliot, George,'],['d', '1819-1880.'], ['t', 'Middlemarch'], ['x', 'Criticism.'])
         | 
| 262 | 
            +
                  str = Marc21Semantics.assemble_lcsh(field)
         | 
| 263 | 
            +
             | 
| 264 | 
            +
                  assert_equal "Eliot, George, 1819-1880. Middlemarch — Criticism", str
         | 
| 265 | 
            +
                end
         | 
| 266 | 
            +
             | 
| 267 | 
            +
                it "returns nil for a field with no relevant subfields" do
         | 
| 268 | 
            +
                  field = MARC::DataField.new('650', ' ', ' ')
         | 
| 269 | 
            +
                  assert_nil Marc21Semantics.assemble_lcsh(field)
         | 
| 270 | 
            +
                end
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                describe "marc_lcsh_formatted macro" do
         | 
| 273 | 
            +
                  it "smoke test" do
         | 
| 274 | 
            +
                    @record = MARC::Reader.new(support_file_path  "george_eliot.marc").to_a.first
         | 
| 275 | 
            +
                    @indexer.instance_eval {to_field "lcsh", marc_lcsh_formatted}
         | 
| 276 | 
            +
                    output = @indexer.map_record(@record)
         | 
| 234 277 |  | 
| 278 | 
            +
                    assert output["lcsh"].length > 0, "outputs data"
         | 
| 279 | 
            +
                    assert output["lcsh"].include?("Eliot, George, 1819-1880 — Characters"), "includes a string its supposed to"
         | 
| 280 | 
            +
                  end
         | 
| 281 | 
            +
                end
         | 
| 235 282 | 
             
              end
         | 
| 236 283 |  | 
| 237 284 | 
             
              describe "extract_marc_filing_version" do
         | 
| @@ -272,7 +319,6 @@ describe "Traject::Macros::Marc21Semantics" do | |
| 272 319 | 
             
                  end
         | 
| 273 320 | 
             
                end
         | 
| 274 321 |  | 
| 275 | 
            -
             | 
| 276 322 | 
             
              end
         | 
| 277 323 |  | 
| 278 324 |  | 
| @@ -97,6 +97,9 @@ describe "Traject::Macros::Marc21" do | |
| 97 97 | 
             
                  assert_equal "one two three", Marc21.trim_punctuation("one two three]")
         | 
| 98 98 | 
             
                  assert_equal "one two three", Marc21.trim_punctuation("[one two three")
         | 
| 99 99 | 
             
                  assert_equal "one two three", Marc21.trim_punctuation("[one two three]")
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                  # This one was a bug before
         | 
| 102 | 
            +
                  assert_equal "Feminism and art", Marc21.trim_punctuation("Feminism and art.")
         | 
| 100 103 | 
             
                end
         | 
| 101 104 |  | 
| 102 105 | 
             
                it "uses :translation_map" do
         | 
| @@ -20,7 +20,7 @@ extend Traject::Macros::MarcFormats | |
| 20 20 | 
             
            # files however you like, you can call traject with as many
         | 
| 21 21 | 
             
            # config files as you like, `traject -c one.rb -c two.rb -c etc.rb`
         | 
| 22 22 | 
             
            settings do
         | 
| 23 | 
            -
              provide "solr.url", "http:// | 
| 23 | 
            +
              provide "solr.url", "http://solr.somewhere.edu:8983/solr/corename"
         | 
| 24 24 |  | 
| 25 25 | 
             
              # Only if you need to connect to a Solr 1.x:
         | 
| 26 26 | 
             
              provide "solrj_writer.parser_class_name", "XMLResponseParser"
         | 
| @@ -0,0 +1 @@ | |
| 1 | 
            +
            01359cam a2200361 a 4500001000800000005001700008008004100025010001700066020002800083020003500111035001600146040001300162043001200175049000900187050002500196082001500221100002200236245009700258260005800355300002700413440004600440504006400486600005400550600004700604600004400651600004300695650006700738650005800805650002900863910002600892994001200918991006700930232964520030805093128.0020925s2003    nyu      b   s001 0 eng    a  2002036483  a0791458334 (alk. paper)  a0791458342 (pbk. : alk. paper)  aocm50737282  aDLCcDLC  ae-uk-en  aJHEE00aPR4692.P74bP37 200300a823/.82211 aParis, Bernard J.10aRereading George Eliot :bchanging responses to her experiments in life /cBernard J. Paris.  aAlbany :bState University of New York Press,cc2003.  axiii, 220 p. ;c23 cm. 0aSUNY series in psychoanalysis and culture  aIncludes bibliographical references (p. 213-215) and index.10aEliot, George,d1819-1880xKnowledgexPsychology.10aEliot, George,d1819-1880.tDaniel Deronda10aEliot, George,d1819-1880.tMiddlemarch10aEliot, George,d1819-1880xCharacters. 0aPsychoanalysis and literaturezEnglandxHistoryy19th century. 0aPsychological fiction, EnglishxHistory and criticism 0aPsychology in literature  a2329645bHorizon bib#  aE0bJHE  aPR4692.P74 P37 2003flcbelc1cc. 1q0i3857076lembluememsel
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: traject
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.0.0.beta. | 
| 4 | 
            +
              version: 1.0.0.beta.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Jonathan Rochkind
         | 
| @@ -9,7 +9,7 @@ authors: | |
| 9 9 | 
             
            autorequire:
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date: 2013-10- | 
| 12 | 
            +
            date: 2013-10-17 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: marc
         | 
| @@ -216,6 +216,7 @@ files: | |
| 216 216 | 
             
            - test/test_support/date_with_u.marc
         | 
| 217 217 | 
             
            - test/test_support/demo_config.rb
         | 
| 218 218 | 
             
            - test/test_support/emptyish_record.marc
         | 
| 219 | 
            +
            - test/test_support/george_eliot.marc
         | 
| 219 220 | 
             
            - test/test_support/hebrew880s.marc
         | 
| 220 221 | 
             
            - test/test_support/louis_armstrong.marc
         | 
| 221 222 | 
             
            - test/test_support/manufacturing_consent.marc
         | 
| @@ -313,6 +314,7 @@ test_files: | |
| 313 314 | 
             
            - test/test_support/date_with_u.marc
         | 
| 314 315 | 
             
            - test/test_support/demo_config.rb
         | 
| 315 316 | 
             
            - test/test_support/emptyish_record.marc
         | 
| 317 | 
            +
            - test/test_support/george_eliot.marc
         | 
| 316 318 | 
             
            - test/test_support/hebrew880s.marc
         | 
| 317 319 | 
             
            - test/test_support/louis_armstrong.marc
         | 
| 318 320 | 
             
            - test/test_support/manufacturing_consent.marc
         |