traject 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +7 -0
- data/Gemfile +5 -1
- data/README.md +65 -17
- data/bench/bench.rb +30 -0
- data/bin/traject +4 -169
- data/doc/batch_execution.md +177 -0
- data/doc/extending.md +182 -0
- data/doc/other_commands.md +49 -0
- data/doc/settings.md +6 -2
- data/lib/traject.rb +1 -0
- data/lib/traject/command_line.rb +296 -0
- data/lib/traject/debug_writer.rb +28 -0
- data/lib/traject/indexer.rb +84 -20
- data/lib/traject/indexer/settings.rb +9 -1
- data/lib/traject/json_writer.rb +15 -38
- data/lib/traject/line_writer.rb +59 -0
- data/lib/traject/macros/marc21.rb +10 -5
- data/lib/traject/macros/marc21_semantics.rb +57 -25
- data/lib/traject/marc4j_reader.rb +9 -26
- data/lib/traject/marc_extractor.rb +121 -48
- data/lib/traject/mock_reader.rb +87 -0
- data/lib/traject/mock_writer.rb +34 -0
- data/lib/traject/solrj_writer.rb +1 -22
- data/lib/traject/util.rb +107 -1
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +9 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/indexer/each_record_test.rb +27 -2
- data/test/indexer/macros_marc21_semantics_test.rb +12 -1
- data/test/indexer/settings_test.rb +9 -2
- data/test/indexer/to_field_test.rb +35 -5
- data/test/marc4j_reader_test.rb +3 -0
- data/test/marc_extractor_test.rb +94 -20
- data/test/test_support/demo_config.rb +6 -3
- data/traject.gemspec +1 -2
- metadata +17 -20
| @@ -0,0 +1,28 @@ | |
| 1 | 
            +
            require 'traject/line_writer'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # A writer for Traject::Indexer that outputs each record as a series of
         | 
| 4 | 
            +
            # lines, prefixed by the id, one for each field and its values.
         | 
| 5 | 
            +
            # Multiple values are separated by pipes
         | 
| 6 | 
            +
            #
         | 
| 7 | 
            +
            # Applicable settings:
         | 
| 8 | 
            +
            #
         | 
| 9 | 
            +
            #  - 'output_file' -- the name of the file to output to
         | 
| 10 | 
            +
            #  - 'output_stream' -- alternately, the IO stream
         | 
| 11 | 
            +
            #  - 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
         | 
| 12 | 
            +
            #  - 'debug_writer.format'  -- How to format the id/solr field/values (default: '%-12s %-25s %s')
         | 
| 13 | 
            +
             | 
| 14 | 
            +
             | 
| 15 | 
            +
            class Traject::DebugWriter < Traject::LineWriter
              DEFAULT_FORMAT = '%-12s %-25s %s'
              DEFAULT_IDFIELD = 'id'

              # Renders one record as text: one formatted line per output field
              # (fields sorted by name), each line carrying the record id, the
              # field name, and the field's values joined with ' | '.
              #
              # context - object responding to #output_hash, a Hash of
              #           field name => values.
              #
              # Returns a String; the final element pushed is "\n", so the text
              # ends with a blank line separating it from the next record.
              def serialize(context)
                idfield     = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
                line_format = settings['debug_writer.format']  || DEFAULT_FORMAT
                h = context.output_hash

                # Array() tolerates a record that has no id field (nil) or a
                # scalar id, instead of raising NoMethodError on nil.first;
                # a missing id is rendered as an empty column.
                # Looked up once, since it is the same for every line.
                record_id = Array(h[idfield]).first

                lines = h.keys.sort.map do |k|
                  line_format % [record_id, k, Array(h[k]).join(' | ')]
                end
                lines.push "\n"
                lines.join("\n")
              end

            end
         | 
    
        data/lib/traject/indexer.rb
    CHANGED
    
    | @@ -50,6 +50,13 @@ require 'traject/macros/basic' | |
| 50 50 | 
             
            #  with a String name of class meeting the Writer contract.
         | 
| 51 51 | 
             
            #
         | 
| 52 52 | 
             
            class Traject::Indexer
         | 
| 53 | 
            +
              
         | 
| 54 | 
            +
              # Arity error on a passed block
         | 
| 55 | 
            +
              class ArityError < ArgumentError; end
         | 
| 56 | 
            +
              class NamingError < ArgumentError; end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
              
         | 
| 59 | 
            +
              
         | 
| 53 60 | 
             
              include Traject::QualifiedConstGet
         | 
| 54 61 |  | 
| 55 62 | 
             
              attr_writer :reader_class, :writer_class
         | 
| @@ -143,20 +150,13 @@ class Traject::Indexer | |
| 143 150 | 
             
              end
         | 
| 144 151 |  | 
| 145 152 |  | 
| 153 | 
            +
             | 
| 154 | 
            +
             | 
| 155 | 
            +
             | 
| 146 156 | 
             
              # Used to define an indexing mapping.
         | 
| 147 157 | 
             
              def to_field(field_name, aLambda = nil, &block)
         | 
| 148 158 |  | 
| 149 | 
            -
                 | 
| 150 | 
            -
                  raise ArgumentError.new("to_field requires a non-blank first argument, field name")
         | 
| 151 | 
            -
                end
         | 
| 152 | 
            -
                [aLambda, block].each do |proc|
         | 
| 153 | 
            -
                  # allow negative arity, meaning variable/optional, trust em on that.
         | 
| 154 | 
            -
                  # but for positive arrity, we need 2 or 3 args
         | 
| 155 | 
            -
                  if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
         | 
| 156 | 
            -
                    raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
         | 
| 157 | 
            -
                  end
         | 
| 158 | 
            -
                end
         | 
| 159 | 
            -
             | 
| 159 | 
            +
                verify_to_field_arguments(field_name, aLambda, block)
         | 
| 160 160 |  | 
| 161 161 | 
             
                @index_steps << {
         | 
| 162 162 | 
             
                  :field_name => field_name.to_s,
         | 
| @@ -168,15 +168,7 @@ class Traject::Indexer | |
| 168 168 | 
             
              end
         | 
| 169 169 |  | 
| 170 170 | 
             
              def each_record(aLambda = nil, &block)
         | 
| 171 | 
            -
                 | 
| 172 | 
            -
                [aLambda, block].each do |proc|
         | 
| 173 | 
            -
                  # allow negative arity, meaning variable/optional, trust em on that.
         | 
| 174 | 
            -
                  # but for positive arrity, we need 1 or 2 args
         | 
| 175 | 
            -
                  if proc && (proc.arity == 0 || proc.arity > 2)
         | 
| 176 | 
            -
                    raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
         | 
| 177 | 
            -
                  end
         | 
| 178 | 
            -
                end
         | 
| 179 | 
            -
             | 
| 171 | 
            +
                verify_each_record_arguments(aLambda, block)
         | 
| 180 172 | 
             
                @index_steps << {
         | 
| 181 173 | 
             
                  :lambda => aLambda,
         | 
| 182 174 | 
             
                  :block  => block,
         | 
| @@ -394,6 +386,78 @@ class Traject::Indexer | |
| 394 386 | 
             
              end
         | 
| 395 387 |  | 
| 396 388 |  | 
| 389 | 
            +
              
         | 
| 390 | 
            +
              
         | 
| 391 | 
            +
              # Verify that the field name is good, and throw a useful error if not
         | 
| 392 | 
            +
              def verify_field_name(field_name)
                # Only a non-empty String is an acceptable field name; anything
                # else (nil, Symbol, "", ...) is rejected below.
                return if field_name.is_a?(String) && !field_name.empty?

                # The message includes the last successfully parsed field to help
                # locate the offending line in a configuration file.
                raise NamingError, "to_field requires the field name (String) as the first argument (#{last_named_step.message})"
              end
         | 
| 397 | 
            +
             | 
| 398 | 
            +
              
         | 
| 399 | 
            +
              # Verify the various, increasingly-complex things that can be sent to to_field
         | 
| 400 | 
            +
              # to make sure it's all kosher.
         | 
| 401 | 
            +
              #
         | 
| 402 | 
            +
              # "Modification" takes place for zero-argument blocks that return a lambda
         | 
| 403 | 
            +
             | 
| 404 | 
            +
              def verify_to_field_arguments(field_name, aLambda, block)
                # Validates what was handed to to_field before the step is recorded.
                #
                # field_name     - checked by verify_field_name.
                # aLambda, block - optional callables; each one that is present
                #                  must take 2 or 3 arguments, or have negative
                #                  (variable/optional) arity, which is trusted.
                #
                # Raises NamingError when an argument is not callable, and
                # ArityError when a callable takes 0, 1, or more than 3 arguments.
                verify_field_name(field_name)

                [aLambda, block].each do |callable|
                  next unless callable

                  # A plain value here (e.g. a spec String passed where a macro
                  # call was intended) would otherwise die with a bare
                  # NoMethodError on #arity and no hint about where it happened.
                  unless callable.respond_to?(:arity)
                    raise NamingError, "error parsing field '#{field_name}': argument to to_field must be a block/lambda, not a #{callable.class} (#{last_named_step.message})"
                  end

                  arity = callable.arity
                  if arity == 0 || arity == 1 || arity > 3
                    raise ArityError, "error parsing field '#{field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{callable} (#{last_named_step.message})"
                  end
                end
              end
         | 
| 417 | 
            +
             | 
| 418 | 
            +
              # Verify the procs sent to each_record to make sure it's all kosher.
         | 
| 419 | 
            +
              
         | 
| 420 | 
            +
              def verify_each_record_arguments(aLambda, block)
                # each_record is meaningless without logic to run: at least one
                # of the lambda / block must be supplied.
                if !aLambda && !block
                  raise ArgumentError, "Missing Argument: each_record must take a block/lambda as an argument (#{last_named_step.message})"
                end

                [aLambda, block].each do |callable|
                  next unless callable

                  # Must be a real Proc (block or lambda) ...
                  unless callable.is_a?(Proc)
                    raise NamingError, "argument to each_record must be a block/lambda, not a #{callable.class} (#{last_named_step.message})"
                  end

                  # ... taking 1 or 2 arguments. Negative arity means
                  # variable/optional arguments, which is trusted as-is.
                  arg_count = callable.arity
                  if arg_count == 0 || arg_count > 2
                    raise ArityError, "block/proc given to each_record needs 1 or 2 arguments: #{callable} (#{last_named_step.message})"
                  end
                end
              end
         | 
| 438 | 
            +
              
         | 
| 439 | 
            +
              # Builds a fresh LastNamedStep over the index steps recorded so
              # far; used to add location context to configuration errors.
              def last_named_step
                LastNamedStep.new(@index_steps)
              end
         | 
| 442 | 
            +
              
         | 
| 443 | 
            +
              
         | 
| 444 | 
            +
              # A convenient way to find, and generate error messages for, the last named step (for helping locate parse errors)
         | 
| 445 | 
            +
              class LastNamedStep
                # step    - the most recent index step Hash that has a
                #           :field_name, or nil when there is none.
                # message - human-readable description of that step, for
                #           embedding in error messages.
                attr_accessor :step, :message

                # Walks the steps from newest to oldest and keeps the first one
                # that carries a :field_name (steps without one are skipped).
                def initialize(index_steps)
                  @step = index_steps.reverse_each.detect { |candidate| candidate[:field_name] }
                  @message =
                    if @step
                      "last successfully parsed field was '#{@step[:field_name]}'"
                    else
                      "there were no previous named fields successfully parsed"
                    end
                end
              end
         | 
| 458 | 
            +
              
         | 
| 459 | 
            +
             | 
| 460 | 
            +
             | 
| 397 461 | 
             
              # Represents the context of a specific record being indexed, passed
         | 
| 398 462 | 
             
              # to indexing logic blocks
         | 
| 399 463 | 
             
              #
         | 
| @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            require 'hashie'
         | 
| 2 2 |  | 
| 3 3 | 
             
            # A Hash of settings for a Traject::Indexer, which also ends up passed along
         | 
| 4 | 
            -
            # to other objects Traject::Indexer interacts with. | 
| 4 | 
            +
            # to other objects Traject::Indexer interacts with.
         | 
| 5 5 | 
             
            #
         | 
| 6 6 | 
             
            # Enhanced with a few features from Hashie, to make it for
         | 
| 7 7 | 
             
            # instance string/symbol indifferent
         | 
| @@ -71,5 +71,13 @@ class Traject::Indexer | |
| 71 71 | 
             
                  "processing_thread_pool"    => 3
         | 
| 72 72 | 
             
                  }
         | 
| 73 73 | 
             
                end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                # Same output as a plain Hash#inspect of the settings, except
                # that the value of any key ending in "password" is shown as
                # "[hidden]". The settings themselves are not modified.
                def inspect
                  redacted = {}
                  each do |key, value|
                    redacted[key] = (key =~ /password\Z/) ? "[hidden]" : value
                  end
                  redacted.inspect
                end
         | 
| 74 82 | 
             
              end
         | 
| 75 83 | 
             
            end
         | 
    
        data/lib/traject/json_writer.rb
    CHANGED
    
    | @@ -1,53 +1,30 @@ | |
| 1 1 | 
             
            require 'json'
         | 
| 2 | 
            +
            require 'traject/line_writer'
         | 
| 2 3 |  | 
| 3 4 | 
             
            # A writer for Traject::Indexer, that just writes out
         | 
| 4 5 | 
             
            # all the output as Json. It's newline delimited json, but
         | 
| 5 6 | 
             
            # right now no checks to make sure there is no internal newlines
         | 
| 6 | 
            -
            # as whitespace in the json. TODO, add that. | 
| 7 | 
            +
            # as whitespace in the json. TODO, add that.
         | 
| 7 8 | 
             
            #
         | 
| 8 | 
            -
            #  | 
| 9 | 
            -
            #  | 
| 9 | 
            +
            # Should be thread-safe (ie, multiple worker threads can be calling #put
         | 
| 10 | 
            +
            # concurrently), by wrapping write to actual output file in a mutex synchronize.
         | 
| 11 | 
            +
            # This does not seem to effect performance much, as far as I could tell
         | 
| 12 | 
            +
            # benchmarking.
         | 
| 10 13 | 
             
            #
         | 
| 11 14 | 
             
            # You can force pretty-printing with setting 'json_writer.pretty_print' of boolean
         | 
| 12 | 
            -
            # true or string 'true'.  Useful mostly for human checking of output. | 
| 15 | 
            +
            # true or string 'true'.  Useful mostly for human checking of output.
         | 
| 13 16 | 
             
            #
         | 
| 14 17 | 
             
            # Output will be sent to settings["output_file"] string path, or else
         | 
| 15 | 
            -
            # settings["output_stream"] (ruby IO object), or else stdout. | 
| 16 | 
            -
            class Traject::JsonWriter
         | 
| 17 | 
            -
              attr_reader :settings
         | 
| 18 | 
            +
            # settings["output_stream"] (ruby IO object), or else stdout.
         | 
| 19 | 
            +
            class Traject::JsonWriter < Traject::LineWriter
         | 
| 18 20 |  | 
| 19 | 
            -
              def  | 
| 20 | 
            -
                @settings = argSettings
         | 
| 21 | 
            -
              end
         | 
| 22 | 
            -
             | 
| 23 | 
            -
              def put(context)
         | 
| 21 | 
            +
              def serialize(context)
         | 
| 24 22 | 
             
                hash = context.output_hash
         | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
                  else
         | 
| 30 | 
            -
                    JSON.generate(hash)
         | 
| 31 | 
            -
                  end
         | 
| 32 | 
            -
                output_file.puts(serialized)
         | 
| 33 | 
            -
              end
         | 
| 34 | 
            -
             | 
| 35 | 
            -
              def output_file
         | 
| 36 | 
            -
                unless defined? @output_file
         | 
| 37 | 
            -
                  @output_file = 
         | 
| 38 | 
            -
                    if settings["output_file"]
         | 
| 39 | 
            -
                      File.open(settings["output_file"], 'w:UTF-8')
         | 
| 40 | 
            -
                    elsif settings["output_stream"]
         | 
| 41 | 
            -
                      settings["output_stream"]
         | 
| 42 | 
            -
                    else
         | 
| 43 | 
            -
                      $stdout
         | 
| 44 | 
            -
                    end
         | 
| 23 | 
            +
                if settings["json_writer.pretty_print"]
         | 
| 24 | 
            +
                  JSON.pretty_generate(hash)
         | 
| 25 | 
            +
                else
         | 
| 26 | 
            +
                  JSON.generate(hash)
         | 
| 45 27 | 
             
                end
         | 
| 46 | 
            -
                 | 
| 47 | 
            -
              end
         | 
| 48 | 
            -
             | 
| 49 | 
            -
              def close 
         | 
| 50 | 
            -
                @output_file.close unless (@output_file.nil? || @output_file.tty?)
         | 
| 51 | 
            -
              end
         | 
| 28 | 
            +
              end    
         | 
| 52 29 |  | 
| 53 30 | 
             
            end
         | 
| @@ -0,0 +1,59 @@ | |
| 1 | 
            +
            require 'thread'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # A writer for Traject::Indexer, that just writes out
         | 
| 4 | 
            +
            # all the output as serialized text with #puts. 
         | 
| 5 | 
            +
            #
         | 
| 6 | 
            +
            # Should be thread-safe (ie, multiple worker threads can be calling #put
         | 
| 7 | 
            +
            # concurrently), by wrapping write to actual output file in a mutex synchronize.
         | 
| 8 | 
            +
            # This does not seem to affect performance much, as far as I could tell
         | 
| 9 | 
            +
            # benchmarking.
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # Output will be sent to settings["output_file"] string path, or else
         | 
| 12 | 
            +
            # settings["output_stream"] (ruby IO object), or else stdout.
         | 
| 13 | 
            +
            #
         | 
| 14 | 
            +
            # This class can be sub-classed to write out different serialized
         | 
| 15 | 
            +
            # representations -- subclasses will just override the #serialize
         | 
| 16 | 
            +
            # method. For instance, see JsonWriter. 
         | 
| 17 | 
            +
            class Traject::LineWriter
              attr_reader :settings
              attr_reader :write_mutex

              # argSettings - settings Hash; "output_file" and "output_stream"
              #               are read (see #output_file).
              def initialize(argSettings)
                @settings     = argSettings
                @write_mutex  = Mutex.new

                # trigger lazy loading now for thread-safety
                output_file
              end

              # Turns a context into the object handed to #puts. This base
              # implementation returns the output hash unchanged; subclasses
              # override it to produce their own text representation.
              def serialize(context)
                context.output_hash
              end

              # Serializes the context and writes it with #puts. Only the write
              # itself is done while holding the mutex; serialization happens
              # outside the lock.
              def put(context)
                serialized = serialize(context)
                write_mutex.synchronize do
                  output_file.puts(serialized)
                end
              end

              # The destination stream, resolved once and then memoized:
              # a file opened for writing as UTF-8 at settings["output_file"],
              # else settings["output_stream"], else $stdout.
              def output_file
                return @output_file if defined?(@output_file)

                @output_file =
                  if settings["output_file"]
                    File.open(settings["output_file"], 'w:UTF-8')
                  elsif settings["output_stream"]
                    settings["output_stream"]
                  else
                    $stdout
                  end
              end

              # Closes the destination stream, with two exceptions:
              #
              #  * the process-wide standard streams are only flushed, never
              #    closed -- $stdout is not a tty when piped or redirected, and
              #    closing it would break every later write by the process;
              #  * terminals are left open, as before.
              def close
                return if @output_file.nil?

                if standard_stream?(@output_file)
                  @output_file.flush if @output_file.respond_to?(:flush)
                  return
                end

                # Custom stream objects are not required to implement #tty?
                return if @output_file.respond_to?(:tty?) && @output_file.tty?

                @output_file.close
              end

              private

              # True when +stream+ is one of the process's standard output/error
              # stream objects.
              def standard_stream?(stream)
                [$stdout, $stderr, STDOUT, STDERR].any? { |std| std.equal?(stream) }
              end

            end
         | 
| @@ -35,16 +35,21 @@ module Traject::Macros | |
| 35 35 | 
             
                  trim_punctuation        = options.delete(:trim_punctuation)
         | 
| 36 36 | 
             
                  default_value           = options.delete(:default)
         | 
| 37 37 |  | 
| 38 | 
            -
                  # We create the TranslationMap  | 
| 39 | 
            -
                  #  | 
| 40 | 
            -
                  #  | 
| 41 | 
            -
                  # | 
| 38 | 
            +
                  # We create the TranslationMap and the MarcExtractor here
         | 
| 39 | 
            +
                  # on load, so the lambda can just refer to already created
         | 
| 40 | 
            +
                  # ones, and not have to create a new one per-execution.
         | 
| 41 | 
            +
                  #
         | 
| 42 | 
            +
                  # Benchmarking shows for MarcExtractor at least, there is
         | 
| 43 | 
            +
                  # significant performance advantage. 
         | 
| 44 | 
            +
             | 
| 42 45 | 
             
                  if translation_map_arg  = options.delete(:translation_map)
         | 
| 43 46 | 
             
                    translation_map = Traject::TranslationMap.new(translation_map_arg)
         | 
| 44 47 | 
             
                  end
         | 
| 45 48 |  | 
| 49 | 
            +
                  extractor = Traject::MarcExtractor.new(spec, options)
         | 
| 50 | 
            +
             | 
| 46 51 | 
             
                  lambda do |record, accumulator, context|
         | 
| 47 | 
            -
                    accumulator.concat  | 
| 52 | 
            +
                    accumulator.concat extractor.extract(record)
         | 
| 48 53 |  | 
| 49 54 | 
             
                    if only_first
         | 
| 50 55 | 
             
                      Marc21.first! accumulator
         | 
| @@ -11,19 +11,30 @@ module Traject::Macros | |
| 11 11 | 
             
                # shortcut
         | 
| 12 12 | 
             
                MarcExtractor = Traject::MarcExtractor
         | 
| 13 13 |  | 
| 14 | 
            -
                # Extract OCLC numbers from, by default 035a's | 
| 14 | 
            +
                # Extract OCLC numbers from (by default) 035a's, identified by known prefixes, then stripped to
         | 
| 15 15 | 
             
                # just the num, and de-dup.
         | 
| 16 16 | 
             
                def oclcnum(extract_fields = "035a")
         | 
| 17 | 
            +
                  extractor = MarcExtractor.new(extract_fields, :seperator => nil)
         | 
| 18 | 
            +
             | 
| 17 19 | 
             
                  lambda do |record, accumulator|
         | 
| 18 | 
            -
                    list =  | 
| 19 | 
            -
                      Marc21Semantics. | 
| 20 | 
            -
                    end
         | 
| 20 | 
            +
                    list = extractor.extract(record).collect! do |o|
         | 
| 21 | 
            +
                      Marc21Semantics.oclcnum_extract(o)
         | 
| 22 | 
            +
                    end.compact
         | 
| 21 23 |  | 
| 22 24 | 
             
                    accumulator.concat list.uniq if list
         | 
| 23 25 | 
             
                  end
         | 
| 24 26 | 
             
                end
         | 
| 25 | 
            -
                 | 
| 26 | 
            -
             | 
| 27 | 
            +
                # If a num begins with a known OCLC prefix, return it without the prefix.
         | 
| 28 | 
            +
                # otherwise nil.
         | 
| 29 | 
            +
                def self.oclcnum_extract(num)
                  # num - String candidate value.
                  #
                  # Recognized prefixes, at the START of the string only:
                  # "ocm", "ocn", "on", or "(OCoLC)" optionally followed by
                  # whitespace and one of the former (e.g. "(OCoLC)ocm12345").
                  #
                  # The alternatives are grouped so that \A anchors every one of
                  # them. Ungrouped, the anchor applied to "ocm" alone and a
                  # global substitution removed "ocn"/"on"/"(OCoLC)" anywhere in
                  # the value, so e.g. "(DLC)london" was mangled and mistaken
                  # for an OCLC number.
                  #
                  # Returns the value without its prefix, or nil when the value
                  # does not start with a known prefix.
                  stripped = num.sub(/\A(?:\(OCoLC\)\s*(?:ocm|ocn|on)?|ocm|ocn|on)/, '')
                  stripped == num ? nil : stripped
                end
         | 
| 28 39 |  | 
| 29 40 |  | 
| @@ -47,12 +58,13 @@ module Traject::Macros | |
| 47 58 | 
             
                    accumulator << Marc21Semantics.get_sortable_author(record)
         | 
| 48 59 | 
             
                  end
         | 
| 49 60 | 
             
                end
         | 
| 61 | 
            +
             | 
| 50 62 | 
             
                def self.get_sortable_author(record)
         | 
| 51 | 
            -
                  onexx = MarcExtractor. | 
| 63 | 
            +
                  onexx = MarcExtractor.cached("100:110:111", :first => true).extract(record).first
         | 
| 52 64 | 
             
                  onexx = onexx.strip if onexx
         | 
| 53 65 |  | 
| 54 66 | 
             
                  titles = []
         | 
| 55 | 
            -
                  MarcExtractor. | 
| 67 | 
            +
                  MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
         | 
| 56 68 | 
             
                    non_filing = field.indicator2.to_i
         | 
| 57 69 |  | 
| 58 70 | 
             
                    str = field.subfields.collect {|sf| sf.value}.join(" ")
         | 
| @@ -72,8 +84,9 @@ module Traject::Macros | |
| 72 84 | 
             
                    accumulator << Marc21Semantics.get_sortable_title(record)
         | 
| 73 85 | 
             
                  end
         | 
| 74 86 | 
             
                end
         | 
| 87 | 
            +
             | 
| 75 88 | 
             
                def self.get_sortable_title(record)
         | 
| 76 | 
            -
                  MarcExtractor. | 
| 89 | 
            +
                  MarcExtractor.cached("245ab").collect_matching_lines(record) do |field, spec, extractor|
         | 
| 77 90 | 
             
                    str = extractor.collect_subfields(field, spec).first
         | 
| 78 91 |  | 
| 79 92 | 
             
                    if str.nil?
         | 
| @@ -105,8 +118,10 @@ module Traject::Macros | |
| 105 118 | 
             
                def marc_languages(spec = "008[35-37]:041a:041d")
         | 
| 106 119 | 
             
                  translation_map = Traject::TranslationMap.new("marc_languages")
         | 
| 107 120 |  | 
| 121 | 
            +
                  extractor = MarcExtractor.new(spec, :seperator => nil)
         | 
| 122 | 
            +
             | 
| 108 123 | 
             
                  lambda do |record, accumulator|
         | 
| 109 | 
            -
                    codes =  | 
| 124 | 
            +
                    codes = extractor.collect_matching_lines(record) do |field, spec, extractor|
         | 
| 110 125 | 
             
                      if extractor.control_field?(field)
         | 
| 111 126 | 
             
                        (spec[:bytes] ? field.value.byteslice(spec[:bytes]) : field.value)
         | 
| 112 127 | 
             
                      else
         | 
| @@ -134,10 +149,12 @@ module Traject::Macros | |
| 134 149 | 
             
                # already covered by another field we're including, so we don't want to double count it, possibly
         | 
| 135 150 | 
             
                # with slight variation.
         | 
| 136 151 | 
             
                def marc_series_facet(spec = "440a:490a:800abcdt:810abcdt:811acdeft:830adfgklmnoprst")
         | 
| 152 | 
            +
                  extractor = MarcExtractor.new(spec)
         | 
| 153 | 
            +
             | 
| 137 154 | 
             
                  lambda do |record, accumulator|
         | 
| 138 | 
            -
                     | 
| 155 | 
            +
                    accumulator.concat( extractor.collect_matching_lines(record) do |field, spec, extractor|
         | 
| 139 156 | 
             
                      extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1")
         | 
| 140 | 
            -
                    end
         | 
| 157 | 
            +
                    end.compact)
         | 
| 141 158 | 
             
                  end
         | 
| 142 159 | 
             
                end
         | 
| 143 160 |  | 
| @@ -149,8 +166,10 @@ module Traject::Macros | |
| 149 166 | 
             
                def marc_instrumentation_humanized(spec = "048ab", options = {})
         | 
| 150 167 | 
             
                  translation_map = Traject::TranslationMap.new(options[:translation_map] || "marc_instruments")
         | 
| 151 168 |  | 
| 169 | 
            +
                  extractor = MarcExtractor.new(spec, :seperator => nil)
         | 
| 170 | 
            +
             | 
| 152 171 | 
             
                  lambda do |record, accumulator|
         | 
| 153 | 
            -
                    values =  | 
| 172 | 
            +
                    values = extractor.extract(record)
         | 
| 154 173 | 
             
                    human = values.collect do |value|
         | 
| 155 174 | 
             
                      translation_map[ value.slice(0, 2) ]
         | 
| 156 175 | 
             
                    end.uniq
         | 
| @@ -169,9 +188,12 @@ module Traject::Macros | |
| 169 188 | 
             
                # codes.
         | 
| 170 189 | 
             
                def marc_instrument_codes_normalized(spec = "048")
         | 
| 171 190 | 
             
                  soloist_suffix = ".s"
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                  extractor = MarcExtractor.new("048", :seperator => nil)
         | 
| 193 | 
            +
             | 
| 172 194 | 
             
                  return lambda do |record, accumulator|
         | 
| 173 195 | 
             
                    accumulator.concat(
         | 
| 174 | 
            -
                       | 
| 196 | 
            +
                      extractor.collect_matching_lines(record) do |field, spec, extractor|
         | 
| 175 197 | 
             
                        values = []
         | 
| 176 198 |  | 
| 177 199 | 
             
                        field.subfields.each do |sf|
         | 
| @@ -219,7 +241,7 @@ module Traject::Macros | |
| 219 241 | 
             
                # See #marc_publication_date. Yeah, this is a holy mess.
         | 
| 220 242 | 
             
                # Maybe it should actually be extracted to its own class!
         | 
| 221 243 | 
             
                def self.publication_date(record, estimate_tolerance = 15, min_year = 500, max_year = (Time.new.year + 6))
         | 
| 222 | 
            -
                  field008 = MarcExtractor. | 
| 244 | 
            +
                  field008 = MarcExtractor.cached("008").extract(record).first
         | 
| 223 245 | 
             
                  found_date = nil
         | 
| 224 246 |  | 
| 225 247 | 
             
                  if field008 && field008.length >= 11
         | 
| @@ -264,7 +286,7 @@ module Traject::Macros | |
| 264 286 | 
             
                  end
         | 
| 265 287 | 
             
                  # Okay, nothing from 008, try 260
         | 
| 266 288 | 
             
                  if found_date.nil?
         | 
| 267 | 
            -
                    v260c = MarcExtractor. | 
| 289 | 
            +
                    v260c = MarcExtractor.cached("260c", :seperator => nil).extract(record).first
         | 
| 268 290 | 
             
                    # just try to take the first four digits out of there, we're not going to try
         | 
| 269 291 | 
             
                    # anything crazy.
         | 
| 270 292 | 
             
                    if v260c =~ /(\d{4})/
         | 
| @@ -298,8 +320,10 @@ module Traject::Macros | |
| 298 320 | 
             
                  default_value = options.has_key?(:default) ? options[:default] : "Unknown"
         | 
| 299 321 | 
             
                  translation_map = Traject::TranslationMap.new("lcc_top_level")
         | 
| 300 322 |  | 
| 323 | 
            +
                  extractor = MarcExtractor.new(spec, :seperator => nil)
         | 
| 324 | 
            +
             | 
| 301 325 | 
             
                  lambda do |record, accumulator|
         | 
| 302 | 
            -
                    candidates =  | 
| 326 | 
            +
                    candidates = extractor.extract(record)
         | 
| 303 327 |  | 
| 304 328 | 
             
                    candidates.reject! do |candidate|
         | 
| 305 329 | 
             
                      !(candidate =~ lcc_regex)
         | 
| @@ -328,10 +352,14 @@ module Traject::Macros | |
| 328 352 | 
             
                  a_fields_spec = options[:geo_a_fields] || "651a:691a"
         | 
| 329 353 | 
             
                  z_fields_spec = options[:geo_z_fields] || "600:610:611:630:648:650:654:655:656:690:651:691"
         | 
| 330 354 |  | 
| 355 | 
            +
                  extractor_043a      = MarcExtractor.new("043a", :seperator => nil)
         | 
| 356 | 
            +
                  extractor_a_fields  = MarcExtractor.new(a_fields_spec, :seperator => nil)
         | 
| 357 | 
            +
                  extractor_z_fields  = MarcExtractor.new(z_fields_spec)
         | 
| 358 | 
            +
             | 
| 331 359 | 
             
                  lambda do |record, accumulator|
         | 
| 332 360 |  | 
| 333 361 | 
             
                    accumulator.concat(
         | 
| 334 | 
            -
                       | 
| 362 | 
            +
                      extractor_043a.extract(record).collect do |code|
         | 
| 335 363 | 
             
                        # remove any trailing hyphens, then map
         | 
| 336 364 | 
             
                        marc_geo_map[code.gsub(/\-+\Z/, '')]
         | 
| 337 365 | 
             
                      end.compact
         | 
| @@ -339,15 +367,15 @@ module Traject::Macros | |
| 339 367 |  | 
| 340 368 | 
             
                    #LCSH 651a and 691a go in more or less normally.
         | 
| 341 369 | 
             
                    accumulator.concat(
         | 
| 342 | 
            -
                       | 
| 370 | 
            +
                      extractor_a_fields.extract(record).collect do |s|
         | 
| 343 371 | 
             
                        # remove trailing periods, which they sometimes have if they were
         | 
| 344 372 | 
             
                        # at end of LCSH.
         | 
| 345 373 | 
             
                        s.sub(/\. */, '')
         | 
| 346 374 | 
             
                      end
         | 
| 347 375 | 
             
                    )
         | 
| 348 376 |  | 
| 349 | 
            -
                    # fields we take z's from have a bit more normalization | 
| 350 | 
            -
                     | 
| 377 | 
            +
                    # fields we take z's from have a bit more normalization
         | 
| 378 | 
            +
                    extractor_z_fields.each_matching_line(record) do |field, spec, extractor|
         | 
| 351 379 | 
             
                      z_fields = field.subfields.find_all {|sf| sf.code == "z"}.collect {|sf| sf.value }
         | 
| 352 380 | 
             
                      # depending on position in total field, may be a period on the end
         | 
| 353 381 | 
             
                      # we want to remove.
         | 
| @@ -376,17 +404,21 @@ module Traject::Macros | |
| 376 404 | 
             
                  ordinary_fields_spec = "600y:610y:611y:630y:648ay:650y:654y:656y:690y"
         | 
| 377 405 | 
             
                  special_fields_spec = "651:691"
         | 
| 378 406 | 
             
                  seperator = ": "
         | 
| 407 | 
            +
             | 
| 408 | 
            +
                  extractor_ordinary_fields = MarcExtractor.new(ordinary_fields_spec)
         | 
| 409 | 
            +
                  extractor_special_fields  = MarcExtractor.new(special_fields_spec)
         | 
| 410 | 
            +
             | 
| 379 411 | 
             
                  lambda do |record, accumulator|
         | 
| 380 412 | 
             
                    # straightforward ones
         | 
| 381 413 |  | 
| 382 414 |  | 
| 383 | 
            -
                    accumulator.concat(  | 
| 415 | 
            +
                    accumulator.concat( extractor_ordinary_fields.extract(record).collect do |v|
         | 
| 384 416 | 
             
                      # May have a period we have to remove, if it was at end of tag
         | 
| 385 417 | 
             
                      v.sub(/\. *\Z/, '')
         | 
| 386 418 | 
             
                    end)
         | 
| 387 419 |  | 
| 388 | 
            -
                    # weird ones | 
| 389 | 
            -
                     | 
| 420 | 
            +
                    # weird ones
         | 
| 421 | 
            +
                    extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
         | 
| 390 422 | 
             
                      field.subfields.each do |sf|
         | 
| 391 423 | 
             
                        next unless sf.code == 'y'
         | 
| 392 424 | 
             
                        if sf.value =~ /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
         | 
| @@ -396,7 +428,7 @@ module Traject::Macros | |
| 396 428 | 
             
                          accumulator << sf.value.sub(/\. *\Z/, '')
         | 
| 397 429 | 
             
                        end
         | 
| 398 430 | 
             
                      end
         | 
| 399 | 
            -
                    end | 
| 431 | 
            +
                    end
         | 
| 400 432 | 
             
                  end
         | 
| 401 433 | 
             
                end
         | 
| 402 434 |  |