RubyGems - traject - Versions diffs - 3.0.0 → 3.4.0 - Mend

traject 3.0.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

checksums.yaml +4 -4
data/.travis.yml +3 -4
data/CHANGES.md +65 -0
data/README.md +9 -4
data/doc/indexing_rules.md +5 -6
data/doc/programmatic_use.md +25 -1
data/doc/settings.md +4 -0
data/doc/xml.md +12 -0
data/lib/traject/indexer.rb +40 -4
data/lib/traject/indexer/context.rb +45 -0
data/lib/traject/indexer/step.rb +8 -12
data/lib/traject/line_writer.rb +36 -4
data/lib/traject/macros/marc21.rb +2 -2
data/lib/traject/macros/marc21_semantics.rb +15 -12
data/lib/traject/macros/nokogiri_macros.rb +9 -3
data/lib/traject/nokogiri_reader.rb +17 -19
data/lib/traject/oai_pmh_nokogiri_reader.rb +9 -3
data/lib/traject/solr_json_writer.rb +167 -29
data/lib/traject/version.rb +1 -1
data/lib/translation_maps/marc_languages.yaml +77 -48
data/test/delimited_writer_test.rb +14 -16
data/test/indexer/class_level_configuration_test.rb +127 -0
data/test/indexer/context_test.rb +64 -1
data/test/indexer/error_handler_test.rb +18 -0
data/test/indexer/macros/macros_marc21_semantics_test.rb +4 -0
data/test/indexer/nokogiri_indexer_test.rb +35 -0
data/test/nokogiri_reader_test.rb +66 -3
data/test/solr_json_writer_test.rb +175 -7
data/test/test_support/date_resort_to_264.marc +1 -0
data/traject.gemspec +4 -4
metadata +37 -16

data/lib/traject/macros/marc21.rb CHANGED

@@ -42,11 +42,11 @@ module Traject::Macros
     #
     # * :translation_map => String: translate with named translation map looked up in load
     #       path, uses Tranject::TranslationMap.new(translation_map_arg).
-    #       **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)
+    #       **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)`
     #
     # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
     #     have shown themselves useful with Marc, using Marc21.trim_punctuation. **Instead**, use
-    #    `extract_marc(whatever), trim_punctuation
+    #    `extract_marc(whatever), trim_punctuation`
     #
     # * :default => String: if otherwise empty, add default value. **Instead**, use `extract_marc(whatever), default("default value")`
     #

data/lib/traject/macros/marc21_semantics.rb CHANGED

@@ -26,19 +26,19 @@ module Traject::Macros
         accumulator.concat list.uniq if list
       end
     end
     # If a num begins with a known OCLC prefix, return it without the prefix.
     # otherwise nil.
     #
-    # Allow (OCoLC) and/or ocn/ocm/on
+    # Allow (OCoLC) and/or ocn/ocm/on
     OCLCPAT = /
       \A\s*
       (?:(?:\(OCoLC\)) |
          (?:\(OCoLC\))?(?:(?:ocm)|(?:ocn)|(?:on))
          )(\d+)
          /x
     def self.oclcnum_extract(num)
       if m = OCLCPAT.match(num)
         return m[1]
@@ -364,13 +364,16 @@ module Traject::Macros
           end
         end
       end
-      # Okay, nothing from 008, try 260
+      # Okay, nothing from 008, first try 264, then try 260
       if found_date.nil?
+        v264c = MarcExtractor.cached("264c", :separator => nil).extract(record).first
         v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
         # just try to take the first four digits out of there, we're not going to try
         # anything crazy.
-        if m = /(\d{4})/.match(v260c)
+        if m = /(\d{4})/.match(v264c)
           found_date = m[1].to_i
+        elsif m = /(\d{4})/.match(v260c)
+            found_date = m[1].to_i
         end
       end
@@ -519,11 +522,11 @@ module Traject::Macros
     # Extracts LCSH-carrying fields, and formatting them
     # as a pre-coordinated LCSH string, for instance suitable for including
-    # in a facet.
+    # in a facet.
     #
     # You can supply your own list of fields as a spec, but for significant
     # customization you probably just want to write your own method in
-    # terms of the Marc21Semantics.assemble_lcsh method.
+    # terms of the Marc21Semantics.assemble_lcsh method.
     def marc_lcsh_formatted(options = {})
       spec            = options[:spec] || "600:610:611:630:648:650:651:654:662"
       subd_separator  = options[:subdivison_separator] || " — "
@@ -540,17 +543,17 @@ module Traject::Macros
     end
     # Takes a MARC::Field and formats it into a pre-coordinated LCSH string
-    # with subdivision seperators in the right place.
+    # with subdivision seperators in the right place.
     #
     # For 600 fields especially, need to not just join with subdivision seperator
     # to take acount of $a$d$t -- for other fields, might be able to just
-    # join subfields, not sure.
+    # join subfields, not sure.
     #
     # WILL strip trailing period from generated string, contrary to some LCSH practice.
     # Our data is inconsistent on whether it has period or not, this was
-    # the easiest way to standardize.
+    # the easiest way to standardize.
     #
-    # Default subdivision seperator is em-dash with spaces, set to '--' if you want.
+    # Default subdivision seperator is em-dash with spaces, set to '--' if you want.
     #
     # Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading
     # is not carried in the MARC record. It may be system generated as a display constant

data/lib/traject/macros/nokogiri_macros.rb CHANGED

@@ -26,9 +26,15 @@ module Traject
             # Make sure to avoid text content that was all blank, which is "between the children"
             # whitespace.
             result = result.collect do |n|
-              n.xpath('.//text()').collect(&:text).tap do |arr|
-                arr.reject! { |s| s =~ (/\A\s+\z/) }
-              end.join(" ")
+              if n.kind_of?(Nokogiri::XML::Attr)
+                # attribute value
+                n.value
+              else
+                # text from node
+                n.xpath('.//text()').collect(&:text).tap do |arr|
+                  arr.reject! { |s| s =~ (/\A\s+\z/) }
+                end.join(" ")
+              end
             end
           else
             # just put all matches in accumulator as Nokogiri::XML::Node's

data/lib/traject/nokogiri_reader.rb CHANGED

@@ -21,6 +21,9 @@ module Traject
   #   If you need to use namespaces here, you need to have them registered with
   #   `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
   #   to use them in your each_record_xpath.
+  # * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
+  #   mode, it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
+  #   of trying to take its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
   # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
   #
   # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
@@ -87,7 +90,11 @@ module Traject
     end
     def each
-      whole_input_doc = Nokogiri::XML.parse(input_stream)
+      config_proc = if settings["nokogiri.strict_mode"]
+        proc { |config| config.strict }
+      end
+      whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
       if each_record_xpath
         whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
@@ -118,35 +125,26 @@ module Traject
     private
-    # In MRI Nokogiri, this is as simple as `new_parent_doc.root = node`
+    # We simply do `new_parent_doc.root = node`
     # It seemed maybe safer to dup the node as well as remove the original from the original doc,
     # but I believe this will result in double memory usage, as unlinked nodes aren't GC'd until
     # their doc is.  I am hoping this pattern results in less memory usage.
     # https://github.com/sparklemotion/nokogiri/issues/1703
     #
-    # However, in JRuby it's a different story, JRuby doesn't properly preserve namespaces
-    # when re-parenting a node.
+    # We used to have to do something different in Jruby to work around bug:
     # https://github.com/sparklemotion/nokogiri/issues/1774
     #
-    # The nodes within the tree re-parented _know_ they are in the correct namespaces,
-    # and xpath queries require that namespace, but the appropriate xmlns attributes
-    # aren't included in the serialized XML. This JRuby-specific code seems to get
-    # things back to a consistent state.
+    # But as of nokogiri 1.9, that does not work, and is not necessary if we accept
+    # that Jruby nokogiri may put xmlns declarations on different elements than MRI,
+    # although it should be semantically equivalent for a namespace-aware parser.
+    # https://github.com/sparklemotion/nokogiri/issues/1875
+    #
+    # This as a separate method now exists largely as a historical artifact, and for this
+    # documentation.
     def reparent_node_to_root(new_parent_doc, node)
-      if Traject::Util.is_jruby?
-        original_ns_scopes = node.namespace_scopes
-      end
       new_parent_doc.root = node
-      if Traject::Util.is_jruby?
-        original_ns_scopes.each do |ns|
-          if new_parent_doc.at_xpath("//#{ns.prefix}:*", ns.prefix => ns.href)
-            new_parent_doc.root.add_namespace(ns.prefix, ns.href)
-          end
-        end
-      end
       return new_parent_doc
     end

data/lib/traject/oai_pmh_nokogiri_reader.rb CHANGED

@@ -115,9 +115,15 @@ module Traject
     # @returns [HTTP::Client] from http.rb gem
     def http_client
       @http_client ||= begin
-        # timeout setting on http.rb seems to be a mess.
-        # https://github.com/httprb/http/issues/488
-        client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
+        client = nil
+        if HTTP::VERSION.split(".").first.to_i > 3
+          client = HTTP.timeout(timeout)
+        else
+          # timeout settings on http.rb 3.x are a bit of a mess.
+          # https://github.com/httprb/http/issues/488
+          client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
+        end
         if settings["oai_pmh.try_gzip"]
           client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")

data/lib/traject/solr_json_writer.rb CHANGED

@@ -16,7 +16,30 @@ require 'concurrent' # for atomic_fixnum
 # This should work under both MRI and JRuby, with JRuby getting much
 # better performance due to the threading model.
 #
-# Relevant settings
+# Solr updates are by default sent with no commit params. This will definitely
+# maximize your performance, and *especially* for bulk/batch indexing is recommended --
+# use Solr auto commit in your Solr configuration instead, possibly with `commit_on_close`
+# setting here.
+#
+# However, if you want the writer to send `commitWithin=1000`, `commit=true`,
+# `softCommit=true`, or any other URL parameters valid for Solr update handlers,
+# you can configure this with `solr_writer.solr_update_args` setting. See:
+# https://lucene.apache.org/solr/guide/7_0/near-real-time-searching.html#passing-commit-and-commitwithin-parameters-as-part-of-the-url
+# Eg:
+#
+#     settings do
+#       provide "solr_writer.solr_update_args", { commitWithin: 1000 }
+#     end
+#
+#  (That it's a hash makes it infeasible to set/override on command line, if this is
+#  annoying for you let us know)
+#
+#  `solr_update_args` will apply to batch and individual update requests, but
+#  not to the commit sent if `commit_on_close` is set. You can instead set
+#   `solr_writer.commit_solr_update_args` for that (or pass in an arg to #commit if calling
+#   manually)
+#
+# ## Relevant settings
 #
 # * solr.url (optional if solr.update_url is set) The URL to the solr core to index into
 #
@@ -35,19 +58,32 @@ require 'concurrent' # for atomic_fixnum
 #
 # * solr_writer.skippable_exceptions: List of classes that will be rescued internal to
 #   SolrJsonWriter, and handled with max_skipped logic. Defaults to
-#   `[HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED]`
+#   `[HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED, Traject::SolrJsonWriter::BadHttpResponse]`
+#
+# * solr_writer.solr_update_args: A _hash_ of query params to send to solr update url.
+#   Will be sent with every update request. Eg `{ softCommit: true }` or `{ commitWithin: 1000 }`.
+#   See also `solr_writer.commit_solr_update_args`
 #
 # * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
 #   end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
 #   compat only.)
 #
+# * solr_writer.commit_solr_update_args: A hash of query params to send when committing.
+#   Will be used for automatic `commit_on_close`, as well as any manual calls to #commit.
+#   If set, must include {"commit" => "true"} or { "softCommit" => "true" } if you actually
+#   want commits to happen when SolrJsonWriter tries to commit! But can be used to switch to softCommits
+#   (hard commits default), or specify additional params like optimize etc.
+#
+# * solr_writer.http_timeout: Value in seconds, will be set on the httpclient as connect/receive/send
+#   timeout. No way to set them individually at present. Default nil, use HTTPClient defaults
+#   (60 for connect/receive, 120 for send).
+#
 # * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
-#   giving up as a timeout. Default 10 minutes. Solr can be slow.
+#   giving up as a timeout (http client receive_timeout). Default 10 minutes. Solr can be slow at commits. Overrides solr_writer.http_timeout for the commit request.
 #
 # * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
 #   or mock object to be used for HTTP.
+#
 class Traject::SolrJsonWriter
   include Traject::QualifiedConstGet
@@ -71,7 +107,21 @@ class Traject::SolrJsonWriter
       @max_skipped = nil
     end
-    @http_client = @settings["solr_json_writer.http_client"] || HTTPClient.new
+    @http_client = if @settings["solr_json_writer.http_client"]
+      @settings["solr_json_writer.http_client"]
+    else
+      client = HTTPClient.new
+      if @settings["solr_writer.http_timeout"]
+        client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
+      end
+      if @settings["solr_writer.basic_auth_user"] &&
+          @settings["solr_writer.basic_auth_password"]
+        client.set_auth(@settings["solr.url"], @settings["solr_writer.basic_auth_user"], @settings["solr_writer.basic_auth_password"])
+      end
+      client
+    end
     @batch_size = (settings["solr_writer.batch_size"] || DEFAULT_BATCH_SIZE).to_i
     @batch_size = 1 if @batch_size < 1
@@ -96,6 +146,9 @@ class Traject::SolrJsonWriter
     # Figure out where to send updates
     @solr_update_url = self.determine_solr_update_url
+    @solr_update_args = settings["solr_writer.solr_update_args"]
+    @commit_solr_update_args = settings["solr_writer.commit_solr_update_args"]
     logger.info("   #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
   end
@@ -123,14 +176,28 @@ class Traject::SolrJsonWriter
     send_batch( Traject::Util.drain_queue(@batched_queue) )
   end
+  # configured update url, with the passed in query_params (if any) added to it
+  # as a URL query string
+  def solr_update_url_with_query(query_params)
+    if query_params
+      @solr_update_url + '?' + URI.encode_www_form(query_params)
+    else
+      @solr_update_url
+    end
+  end
   # Send the given batch of contexts. If something goes wrong, send
   # them one at a time.
   # @param [Array<Traject::Indexer::Context>] an array of contexts
   def send_batch(batch)
     return if batch.empty?
+    logger.debug("#{self.class.name}: sending batch of #{batch.size} to Solr")
     json_package = JSON.generate(batch.map { |c| c.output_hash })
     begin
-      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+      resp = @http_client.post solr_update_url_with_query(@solr_update_args), json_package, "Content-type" => "application/json"
     rescue StandardError => exception
     end
@@ -151,34 +218,71 @@ class Traject::SolrJsonWriter
   # Send a single context to Solr, logging an error if need be
   # @param [Traject::Indexer::Context] c The context whose document you want to send
   def send_single(c)
+    logger.debug("#{self.class.name}: sending single record to Solr: #{c.output_hash}")
     json_package = JSON.generate([c.output_hash])
     begin
-      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
-      # Catch Timeouts and network errors as skipped records, but otherwise
-      # allow unexpected errors to propagate up.
-    rescue *skippable_exceptions => exception
-      # no body, local variable exception set above will be used below
-    end
+      post_url = solr_update_url_with_query(@solr_update_args)
+      resp = @http_client.post post_url, json_package, "Content-type" => "application/json"
-    if exception || resp.status != 200
-      if exception
-        msg = Traject::Util.exception_to_log_message(exception)
+      unless resp.status == 200
+        raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status} from POST #{post_url}", resp)
+      end
+      # Catch Timeouts and network errors -- as well as non-200 http responses --
+      # as skipped records, but otherwise allow unexpected errors to propagate up.
+    rescue *skippable_exceptions => exception
+      msg = if exception.kind_of?(BadHttpResponse)
+        "Solr error response: #{exception.response.status}: #{exception.response.body}"
       else
-        msg = "Solr error response: #{resp.status}: #{resp.body}"
+        Traject::Util.exception_to_log_message(exception)
       end
       logger.error "Could not add record #{c.record_inspect}: #{msg}"
       logger.debug("\t" + exception.backtrace.join("\n\t")) if exception
       logger.debug(c.source_record.to_s) if c.source_record
       @skipped_record_incrementer.increment
       if @max_skipped and skipped_record_count > @max_skipped
-        raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
+        # re-raising in rescue means the last encountered error will be available as #cause
+        # on raised exception, a feature in ruby 2.1+.
+        raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting: #{exception.message}")
       end
     end
+  end
+  # Very beginning of a delete implementation. POSTs a delete request to solr
+  # for id in arg (value of Solr UniqueID field, usually `id` field).
+  #
+  # Right now, does it inline and immediately, no use of background threads or batching.
+  # This could change.
+  #
+  # Right now, if unsuccessful for any reason, will raise immediately out of here.
+  # Could raise any of the `skippable_exceptions` (timeouts, network errors); they are
+  # not rescued here, and a non-200 response raises a RuntimeError.
+  #
+  # Will use `solr_writer.solr_update_args` settings.
+  #
+  # There is no built-in way to direct a record to be deleted from an indexing config
+  # file at the moment, this is just a loose method on the writer.
+  def delete(id)
+    logger.debug("#{self.class.name}: Sending delete to Solr for #{id}")
+    json_package = {delete: id}
+    resp = @http_client.post solr_update_url_with_query(@solr_update_args), JSON.generate(json_package), "Content-type" => "application/json"
+    if resp.status != 200
+      raise RuntimeError.new("Could not delete #{id.inspect}, http response #{resp.status}: #{resp.body}")
+    end
   end
+  # Send a delete all query.
+  #
+  # This method takes no params and will not automatically commit the deletes.
+  # @example @writer.delete_all!
+  def delete_all!
+    delete(query: "*:*")
+  end
   # Get the logger from the settings, or default to an effectively null logger
   def logger
@@ -199,14 +303,16 @@ class Traject::SolrJsonWriter
       @thread_pool.maybe_in_thread_pool { send_batch(batch) }
     end
-    # Wait for shutdown, and time it.
-    logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
-    elapsed = @thread_pool.shutdown_and_wait
-    if elapsed > 60
-      logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
+    if @thread_pool_size && @thread_pool_size > 0
+      # Wait for shutdown, and time it.
+      logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
+      elapsed = @thread_pool.shutdown_and_wait
+      if elapsed > 60
+        logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
+      end
+      logger.debug "#{self.class.name}: Thread pool shutdown complete"
+      logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
     end
-    logger.debug "#{self.class.name}: Thread pool shutdown complete"
-    logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
     # check again now that we've waited, there could still be some
     # that didn't show up before.
@@ -220,14 +326,32 @@ class Traject::SolrJsonWriter
   # Send a commit
-  def commit
+  #
+  # Called automatically by `commit_on_close` setting, but also can be called manually.
+  #
+  # If settings `solr_writer.commit_solr_update_args` is set, will be used by default.
+  # That setting needs `{ commit: true }` or  `{softCommit: true}` if you want it to
+  # actually do a commit!
+  #
+  # Optional query_params argument is the actual args to send, you must be sure
+  # to make it include "commit: true" or "softCommit: true" for it to actually commit!
+  # But you may want to include other params too, like optimize etc. query_param
+  # argument replaces setting `solr_writer.commit_solr_update_args`, they are not merged.
+  #
+  # @param [Hash] query_params optional query params to send to solr update. Default {"commit" => "true"}
+  #
+  # @example @writer.commit
+  # @example @writer.commit(softCommit: true)
+  # @example @writer.commit(commit: true, optimize: true, waitFlush: false)
+  def commit(query_params = nil)
+    query_params ||= @commit_solr_update_args || {"commit" => "true"}
     logger.info "#{self.class.name} sending commit to solr at url #{@solr_update_url}..."
     original_timeout = @http_client.receive_timeout
     @http_client.receive_timeout = (settings["commit_timeout"] || (10 * 60)).to_i
-    resp = @http_client.get(@solr_update_url, {"commit" => 'true'})
+    resp = @http_client.get(solr_update_url_with_query(query_params))
     unless resp.status == 200
       raise RuntimeError.new("Could not commit to Solr: #{resp.status} #{resp.body}")
     end
@@ -279,10 +403,24 @@ class Traject::SolrJsonWriter
   class MaxSkippedRecordsExceeded < RuntimeError ; end
+  # Adapted from HTTPClient::BadResponseError.
+  # It's got a #response accessor that will give you the HTTPClient
+  # Response object that had a bad status, although relying on that
+  # would tie you to our HTTPClient implementation that maybe should
+  # be considered an implementation detail, so I dunno.
+  class BadHttpResponse < RuntimeError
+    # HTTP::Message:: a response
+    attr_reader :response
+    def initialize(msg, response = nil) # :nodoc:
+      super(msg)
+      @response = response
+    end
+  end
   private
   def skippable_exceptions
-    @skippable_exceptions ||= (settings["solr_writer.skippable_exceptions"] || [HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED])
+    @skippable_exceptions ||= (settings["solr_writer.skippable_exceptions"] || [HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED, Traject::SolrJsonWriter::BadHttpResponse])
   end
 end