RubyGems - discovery-indexer - Versions diffs - 0.9.6 → 0.9.7 - Mend

discovery-indexer 0.9.6 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/lib/discovery-indexer.rb +2 -3
data/lib/errors.rb +8 -8
data/lib/logging.rb +3 -6
data/lib/mapper/general_mapper.rb +4 -6
data/lib/reader/modsxml.rb +16 -20
data/lib/reader/modsxml_reader.rb +2 -5
data/lib/reader/purlxml.rb +16 -23
data/lib/reader/purlxml_model.rb +23 -31
data/lib/reader/purlxml_parser.rb +2 -3
data/lib/reader/purlxml_parser_strict.rb +99 -111
data/lib/reader/purlxml_reader.rb +2 -3
data/lib/version.rb +1 -1
data/lib/writer/solr_client.rb +30 -32
data/lib/writer/solr_writer.rb +15 -15
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3c328198b5aaf90f99e8b239b37aabacf9cfcdc7
-  data.tar.gz: 7ffabf609d7b24d5ae413f4cac213a9aabef2e8e
+  metadata.gz: 9cddff2ef4144cc3d17af7bd71bdb6f68840d38c
+  data.tar.gz: ca19a7acbb80341c1d6440567e305ef0bf170542
 SHA512:
-  metadata.gz: 7ec36accf3a418d95040bc8ca561a7f08fac0fabcb6d2cac133589f451620b885c16218daa7338c2b478771978eed2d2a6ae236a0eb9cde1b836941b8eb41434
-  data.tar.gz: 2595c1da62ab979b6f12db5ca3cc64102c622924090cc0cf355e3b14b405f33fbafb859282b65bc16c1516369a407c6bae61809e19b7f0e417a92ac9ae43857c
+  metadata.gz: 65fa148806dcf8dc498b262cbc3acf811c89fcad0d81397099009e0e8eb368577477e50c9c5bbee7e0921e11b17542cf4857b2aff4e54a319f5f97ea22e1bb8f
+  data.tar.gz: a28465c230937c24168114d91f41a58e59492a16b0b479ab8e4ae41429c4682dc7eec99a4b796ff93869d37b7e34ed78a6d7485ff1500ee2bc91e99e3a6e070f

data/lib/discovery-indexer.rb CHANGED Viewed

@@ -15,9 +15,8 @@ require 'mapper/general_mapper'
 require 'writer/solr_client'
 require 'writer/solr_writer'
-#require 'utilities/extract_sub_targets'
+# require 'utilities/extract_sub_targets'
 module DiscoveryIndexer
   PURL_DEFAULT = 'http://purl.stanford.edu'
-end
+end

data/lib/errors.rb CHANGED Viewed

@@ -1,13 +1,13 @@
 module DiscoveryIndexer
   module Errors
-    MissingPurlPage = Class.new(StandardError)
-    MissingMods = Class.new(StandardError)
-    MissingPublicXml = Class.new(StandardError)
-    MissingContentMetadata = Class.new(StandardError)
-    MissingIdentityMetadata = Class.new(StandardError)
-    MissingRightsMetadata = Class.new(StandardError)
-    MissingRDF = Class.new(StandardError)
+    MissingPurlPage = Class.new(StandardError)
+    MissingMods = Class.new(StandardError)
+    MissingPublicXml = Class.new(StandardError)
+    MissingContentMetadata = Class.new(StandardError)
+    MissingIdentityMetadata = Class.new(StandardError)
+    MissingRightsMetadata = Class.new(StandardError)
+    MissingRDF = Class.new(StandardError)
     MissingDC = Class.new(StandardError)
     MissingModsPage = Class.new(StandardError)
   end
-end
+end

data/lib/logging.rb CHANGED Viewed

@@ -6,11 +6,8 @@ module DiscoveryIndexer
       def logger
         @logger ||= Logger.new(STDOUT)
       end
-      def logger=(logger)
-        @logger = logger
-      end
-    end
+      attr_writer :logger
+    end
   end
-end
+end

data/lib/mapper/general_mapper.rb CHANGED Viewed

@@ -1,29 +1,27 @@
 module DiscoveryIndexer
   module Mapper
     class GeneralMapper
       # Initializes an instance from IndexMapper
       # @param [String] druid e.g. ab123cd4567
       # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
       # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
       # @param [Hash] collection_data represents a hash of collection_druid and catkey
       # e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
-      def initialize(druid, modsxml, purlxml, collection_data={})
+      def initialize(druid, modsxml, purlxml, collection_data = {})
         @druid = druid
         @modsxml = modsxml
         @purlxml = purlxml
         @collection_data = collection_data
       end
-      # Create a Hash representing a Solr doc, with all MODS related fields populated.
+      # Create a Hash representing a Solr doc, with all MODS related fields populated.
       # @return [Hash] Hash representing the Solr document
-      def convert_to_solr_doc()
+      def convert_to_solr_doc
         solr_doc = {}
         solr_doc[:id] = @druid
         solr_doc[:title] = @modsxml.sw_full_title
-        return solr_doc
+        solr_doc
       end
     end
   end
 end

data/lib/reader/modsxml.rb CHANGED Viewed

@@ -1,44 +1,40 @@
 require 'stanford-mods'
 module DiscoveryIndexer
   module InputXml
-    # This class is the main class to access and parse the mods xml
+    # This class is the main class to access and parse the mods xml
     #    as retrieved from PURL server
     # @example to run the code
     #  druid = "aa111aa1111"
     #  p =  DiscoveryIndexer::InputXml::Modsxml.new(druid)
     #  model =  p.load()
-    #
-    #
+    #
+    #
     class Modsxml
-      # initializes a new object
+      # initializes a new object
       # @param druid [String] the druid object in the format "aa111aa1111"
       def initialize(druid)
         @druid = druid
-        @modsxml_ng_doc = nil
+        @modsxml_ng_doc = nil
       end
       # loads the mods xml to stanford mods model for the fedora object defind in the druid,
-      #    it reads the mods xml once from PURL server, and repeat the parsing with each call
-      # @return [Stanford::Mods::Record] represents the mods xml
-      def load()
-        if @modsxml_ng_doc.nil? then
-          @modsxml_ng_doc = ModsxmlReader.read(@druid)
-        end
+      # it reads the mods xml once from PURL server, and repeat the parsing with each call
+      # @return [Stanford::Mods::Record] represents the mods xml
+      def load
+        @modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?
         modsxml_model = Stanford::Mods::Record.new
         modsxml_model.from_nk_node(@modsxml_ng_doc)
-        return modsxml_model
+        modsxml_model
       end
       # loads the mods xml to stanford mods model for the fedora object defind in the druid,
-      #    it reads the mods xml from PURL server with every call
-      # @return [Stanford::Mods::Record] represents the mods xml
-      def reload()
+      # it reads the mods xml from PURL server with every call
+      # @return [Stanford::Mods::Record] represents the mods xml
+      def reload
         @modsxml_ng_doc = ModsxmlReader.read(@druid)
-        return load()
+        load
       end
     end
   end
 end

data/lib/reader/modsxml_reader.rb CHANGED Viewed

@@ -3,21 +3,18 @@ require 'open-uri'
 module DiscoveryIndexer
   module InputXml
     class ModsxmlReader
       # reads the mods xml for the fedora object that is defined , from the purl server
       # @param [String] druid e.g. ab123cd4567
       # @return [Nokogiri::XML::Document] the mods xml for the fedora object
       # @raise [MissingModsXml] if there's no mods xml available for this druid
       def self.read(druid)
         mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"
         begin
-          modsxml_ng_doc = Nokogiri::XML(open(mods_uri))
-          return modsxml_ng_doc
+          Nokogiri::XML(open(mods_uri))
         rescue
           raise DiscoveryIndexer::Errors::MissingModsPage.new(mods_uri)
         end
       end
     end
   end
-end
+end

data/lib/reader/purlxml.rb CHANGED Viewed

@@ -1,43 +1,36 @@
 module DiscoveryIndexer
   module InputXml
-    # This class is the main class to access and parse the purl xml
+    # This class is the main class to access and parse the purl xml
     #    as retrieved from PURL server
     # @example to run the code
     #  druid = "aa111aa1111"
     #  p =  DiscoveryIndexer::InputXml::Purlxml.new(druid)
     #  model =  p.load()
-    #
     class Purlxml
-      # initializes a new object
+      # initializes a new object
       # @param druid [String] the druid object in the format "aa111aa1111"
       def initialize(druid)
         @druid = druid
-        @purlxml_ng_doc = nil
+        @purlxml_ng_doc = nil
       end
       # loads the purl xml to purlxml model for the fedora object defind in the druid,
-      #    it reads the purl xml once from PURL server, and repeat the parsing with each call
-      # @return [PurlxmlModel] represents the purlxml
-      def load()
-        if @purlxml_ng_doc.nil? then
-          @purlxml_ng_doc = PurlxmlReader.read(@druid)
-        end
-        purlxml_parser = PurlxmlParserStrict.new(@druid,@purlxml_ng_doc)
-        purlxml_model = purlxml_parser.parse()
-        return purlxml_model
+      # it reads the purl xml once from PURL server, and repeat the parsing with each call
+      # @return [PurlxmlModel] represents the purlxml
+      def load
+        @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
+        purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
+        purlxml_model = purlxml_parser.parse
+        purlxml_model
       end
       # loads the purl xml to purlxml model for the fedora object defind in the druid
-      #    it reads the purl xml from PURL server with every call
-      # @return [PurlxmlModel] represents the purlxml
-      def reload()
+      # it reads the purl xml from PURL server with every call
+      # @return [PurlxmlModel] represents the purlxml
+      def reload
         @purlxml_ng_doc = PurlxmlReader.read(@druid)
-        return load()
+        load
       end
-  	end
+    end
   end
 end

data/lib/reader/purlxml_model.rb CHANGED Viewed

@@ -1,47 +1,46 @@
 module DiscoveryIndexer
   module InputXml
     class PurlxmlModel
-      #@!attribute [rw] druid
-      # @return [String] The druid value eg., ab123cd4567
-      attr_accessor :druid
-      #@!attribute [rw] public_xml
-      # @return [Nokogiri::XML] The publix xml as retrieved from purl server
+      # @!attribute [rw] druid
+      # @return [String] The druid value eg., ab123cd4567
+      attr_accessor :druid
+      # @!attribute [rw] public_xml
+      # @return [Nokogiri::XML] The publix xml as retrieved from purl server
       attr_accessor :public_xml
-      #@!attribute [rw] content_metadata
+      # @!attribute [rw] content_metadata
       # @return [Nokogiri::XML] The content_metadata as extracted from public xml
       attr_accessor :content_metadata
-      #@!attribute [rw] identity_metadata
+      # @!attribute [rw] identity_metadata
       # @return [Nokogiri::XML] The identity_metadata as extracted from public xml
       attr_accessor :identity_metadata
-      #@!attribute [rw] rights_metadata
+      # @!attribute [rw] rights_metadata
       # @return [Nokogiri::XML] The rights_metadata as extracted from public xml
       attr_accessor :rights_metadata
-      #@!attribute [rw] dc
+      # @!attribute [rw] dc
       # @return [Nokogiri::XML] The dc element as extracted from public xml
       attr_accessor :dc
-      #@!attribute [rw] rdf
+      # @!attribute [rw] rdf
       # @return [Nokogiri::XML] The rdf element as extracted from public xml
       attr_accessor :rdf
       # @!attribute [rw] release_tags_hash
       # @return [Hash] The release_tag in hash format as extracted from public xml
-      #  ReleaseData element.
+      #  ReleaseData element.
       # @example
       #  !{"target1"=>true, "target2"=>false}
-      attr_accessor :release_tags_hash
+      attr_accessor :release_tags_hash
       # @!attribute [rw] dor_content_type
       # @return [String] The dor_content_type as extracted from public xml
       #  content_metadata.
       attr_accessor :dor_content_type
       # @!attribute [rw] dor_display_type
       # @return [String] The displayType as extracted from public xml
       #  identity_metadata.
@@ -50,25 +49,25 @@ module DiscoveryIndexer
       # @!attribute [rw] is_collection
       # @return [Boolean] true if the item type is collection in the identity_metadata
       attr_accessor :is_collection
       # @!attribute [rw] collection_druids
       # @return [Array] a list of the collections that this is druid belongs to
       # @example
       #  ["aa11aaa1111","bb111bb1111"]
       attr_accessor :collection_druids
       # @!attribute [rw] file_ids
       # @return [Array] a list of the file ids in the content_metadata
       # @example
-      #  ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
+      #  ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
       attr_accessor :file_ids
       # @!attribute [rw] image_ids
       # @return [Array] a list of the image ids in the content_metadata
       # @example
-      #  ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
+      #  ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
       attr_accessor :image_ids
       # @!attribute [rw] catkey
       # @return [String] the catkey attribute in identity_metadata
       attr_accessor :catkey
@@ -76,15 +75,15 @@ module DiscoveryIndexer
       # @!attribute [rw] barcode
       # @return [String] the barcode attribute in identity_metadata
       attr_accessor :barcode
       # @!attribute [rw] label
       # @return [String] the objectLabel attribute in identity_metadata
       attr_accessor :label
       # @!attribute [rw] copyright
       # @return [String] the copyright statement from rights metadata
       attr_accessor :copyright
       # @!attribute [rw] use_and_reproduction
       # @return [String] the use and reproduction statement from rights metadata
       attr_accessor :use_and_reproduction
@@ -92,13 +91,6 @@ module DiscoveryIndexer
       # @!attribute [rw] source_id
       # @return [String] the sourceid from identity metadata
       attr_accessor :source_id
     end
   end
 end

data/lib/reader/purlxml_parser.rb CHANGED Viewed

@@ -1,13 +1,12 @@
 module DiscoveryIndexer
   module InputXml
     class PurlxmlParser
       def initialize(druid, purlxml_ng_doc)
         @purlxml_ng_doc = purlxml_ng_doc
         @druid = druid
       end
-      def parse()
+      def parse
       end
     end
   end

data/lib/reader/purlxml_parser_strict.rb CHANGED Viewed

@@ -3,154 +3,142 @@ module DiscoveryIndexer
     class PurlxmlParserStrict < PurlxmlParser
       include DiscoveryIndexer::Logging
       RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
       OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
       MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
       # it parses the purlxml into a purlxml model
       # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
-      def parse()
+      def parse
         purlxml_model = PurlxmlModel.new
         purlxml_model.druid             = @druid
         purlxml_model.public_xml        = @purlxml_ng_doc
-        purlxml_model.content_metadata  = parse_content_metadata()
-        purlxml_model.identity_metadata = parse_identity_metadata()
-        purlxml_model.rights_metadata   = parse_rights_metadata()
-        purlxml_model.dc                = parse_dc()
-        purlxml_model.rdf               = parse_rdf()
-        purlxml_model.is_collection     = parse_is_collection()
-        purlxml_model.collection_druids = parse_collection_druids()
-        purlxml_model.dor_content_type  = parse_dor_content_type()
-        purlxml_model.dor_display_type  = parse_dor_display_type()
-        purlxml_model.release_tags_hash = parse_release_tags_hash()
-        purlxml_model.file_ids          = parse_file_ids()
-        purlxml_model.image_ids         = parse_image_ids()
-        purlxml_model.catkey            = parse_catkey()
-        purlxml_model.barcode           = parse_barcode()
-        purlxml_model.label             = parse_label()
-        purlxml_model.copyright         = parse_copyright()
-        purlxml_model.use_and_reproduction = parse_use_and_reproduction()
-        purlxml_model.source_id  = parse_sourceid()
-        return purlxml_model
-      end
+        purlxml_model.content_metadata  = parse_content_metadata
+        purlxml_model.identity_metadata = parse_identity_metadata
+        purlxml_model.rights_metadata   = parse_rights_metadata
+        purlxml_model.dc                = parse_dc
+        purlxml_model.rdf               = parse_rdf
+        purlxml_model.is_collection     = parse_is_collection
+        purlxml_model.collection_druids = parse_collection_druids
+        purlxml_model.dor_content_type  = parse_dor_content_type
+        purlxml_model.dor_display_type  = parse_dor_display_type
+        purlxml_model.release_tags_hash = parse_release_tags_hash
+        purlxml_model.file_ids          = parse_file_ids
+        purlxml_model.image_ids         = parse_image_ids
+        purlxml_model.catkey            = parse_catkey
+        purlxml_model.barcode           = parse_barcode
+        purlxml_model.label             = parse_label
+        purlxml_model.copyright         = parse_copyright
+        purlxml_model.use_and_reproduction = parse_use_and_reproduction
+        purlxml_model.source_id = parse_sourceid
+        purlxml_model
+      end
       # extracts the identityMetadata for this fedora object, from the purl xml
       # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
       # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
       def parse_identity_metadata
-        begin
-          ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
-          raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
-          ng_doc
-        rescue
-          raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
-        end
+        ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
+        fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
+        ng_doc
+      rescue
+        raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
       end
-      def parse_rights_metadata
-        begin
-          ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
-          raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
-          ng_doc
-        rescue
-          raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
-        end
+      def parse_rights_metadata
+        ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
+        fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
+        ng_doc
+      rescue
+        raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
       end
       # extracts the dc field for this fedora object, from the purl xml
       # @return [Nokogiri::XML::Document] the dc for the fedora object
       # @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
       def parse_dc
-          begin
-            ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', {'dc' => OAI_DC_NAMESPACE}).to_xml(:encoding => 'utf-8'))
-            raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
-            ng_doc
-          rescue
-            raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
-          end
+        ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
+        fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
+        ng_doc
+      rescue
+        raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
       end
       # extracts the rdf field for this fedora object, from the purl xml
       # @return [Nokogiri::XML::Document] the rdf for the fedora object
       # @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
       def parse_rdf
-        begin
-          ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', {'rdf' => RDF_NAMESPACE}).to_xml)
-          raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
-          ng_doc
-        rescue
-          raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
-        end
+        ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
+        fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
+        ng_doc
+      rescue
+        raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
       end
       # extracts the release tag element for this fedora object, from the the ReleaseData element in purl xml
       # @return [Hash] the release tags for the fedora object
       def parse_release_tags_hash
-        release_tags={}
-        unless  @purlxml_ng_doc.nil?
-          release_elements =  @purlxml_ng_doc.xpath('//ReleaseData/release')
-          release_elements.each { |n|
-            unless n.attr("to").nil?
-              release_target = n.attr("to")
+        release_tags = {}
+        unless @purlxml_ng_doc.nil?
+          release_elements = @purlxml_ng_doc.xpath('//ReleaseData/release')
+          release_elements.each do |n|
+            unless n.attr('to').nil?
+              release_target = n.attr('to')
               text = n.text
-              unless text.nil?
-                release_tags[release_target]= to_boolean(text)
-              end
+              release_tags[release_target] = to_boolean(text) unless text.nil?
             end
-          }
+          end
         end
-        return release_tags
+        release_tags
       end
       # extracts the contentMetadata for this fedora object, from the purl xml
       # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
       # @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
       def parse_content_metadata
-          ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
-          ng_doc = nil if !ng_doc || ng_doc.children.empty?
-          ng_doc
+        ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
+        ng_doc = nil if !ng_doc || ng_doc.children.empty?
+        ng_doc
       end
       # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
       def parse_is_collection
-        identity_metadata  = parse_identity_metadata
+        identity_metadata = parse_identity_metadata
         unless identity_metadata.nil?
           object_type_nodes = identity_metadata.xpath('//objectType')
-          return true if object_type_nodes.find_index { |n| ['collection','set'].include? n.text.downcase}
+          return true if object_type_nodes.find_index { |n| %w(collection set).include? n.text.downcase }
         end
         false
       end
       # get the druids from isMemberOfCollection relationships in rels-ext from public_xml
       # @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfColletion relationship with, or nil if none
       def parse_collection_druids
-        ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
+        ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
         is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
         # from public_xml rels-ext
         druids = []
-        is_member_of_nodes.each { |n|
+        is_member_of_nodes.each do |n|
           druids << n.value.split('druid:').last unless n.value.empty?
-        }
+        end
         return nil if druids.empty?
         druids
       end
       # the value of the type attribute for a DOR object's contentMetadata
       #  more info about these values is here:
       #    https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
       #    https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
-      # @return [String]
+      # @return [String]
       def parse_dor_content_type
         content_md = parse_content_metadata
         dct = content_md ? content_md.xpath('contentMetadata/@type').text : nil
         DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
         dct
       end
       # the value of the displyType tag from a DOR collection's identityMetadata
-      # @return [String]
+      # @return [String]
       def parse_dor_display_type
         identity_md = parse_identity_metadata
         ddt = identity_md ? identity_md.xpath('//displayType').text : nil
@@ -161,43 +149,44 @@ module DiscoveryIndexer
       # the @id attribute of resource/file elements that match the image type, including extension
       # @return [Array<String>] filenames
       def parse_image_ids
-          ids = []
-          content_md = parse_content_metadata
-          unless content_md.nil?
-            content_md.xpath('//resource[@type="image"]/file/@id').each { |node|
-              ids << node.text if !node.text.empty?
-            }
-          return nil if ids.empty?
-          ids
+        ids = []
+        content_md = parse_content_metadata
+        return nil if content_md.nil?
+        content_md.xpath('//resource[@type="image"]/file/@id').each do |node|
+          ids << node.text unless node.text.empty?
         end
+        content_md.xpath('//resource[@type="page"]/file/@id').each do |node|
+          ids << node.text unless node.text.empty?
+        end
+        return nil if ids.empty?
+        ids
       end
       def parse_sourceid
         get_value(@purlxml_ng_doc.css('//identityMetadata/sourceId'))
       end
       def parse_copyright
         get_value(@purlxml_ng_doc.css('//rightsMetadata/copyright/human[type="copyright"]'))
       end
       def parse_use_and_reproduction
         get_value(@purlxml_ng_doc.css('//rightsMetadata/use/human[type="useAndReproduction"]'))
       end
       # the @id attribute of resource/file elements, including extension
       # @return [Array<String>] filenames
       def parse_file_ids
         ids = []
         content_md = parse_content_metadata
-        unless content_md.nil?
-            content_md.xpath('//resource/file/@id').each { |node|
-              ids << node.text if !node.text.empty?
-            }
-          return nil if ids.empty?
-          ids
-        end
-      end
+        return unless content_md.nil?
+        content_md.xpath('//resource/file/@id').each do |node|
+          ids << node.text unless node.text.empty?
+        end
+        return nil if ids.empty?
+        ids
+      end
       # @return catkey value from the DOR identity_metadata, or nil if there is no catkey
       def parse_catkey
         get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']"))
@@ -210,23 +199,22 @@ module DiscoveryIndexer
       # @return objectLabel value from the DOR identity_metadata, or nil if there is no barcode
       def parse_label
-        get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/objectLabel"))
+        get_value(@purlxml_ng_doc.xpath('/publicObject/identityMetadata/objectLabel'))
       end
       def get_value(node)
-        (node && node.first) ? node.first.content : nil
+        (node && node.first) ? node.first.content : nil
       end
       def to_boolean(text)
-        if text.nil? || text.empty? then
+        if text.nil? || text.empty?
           return false
-        elsif text.downcase.eql?("true") || text.downcase == "t" then
+        elsif text.downcase.eql?('true') || text.downcase == 't'
           return true
         else
           return false
         end
-      end
+      end
     end
   end
 end

data/lib/reader/purlxml_reader.rb CHANGED Viewed

@@ -3,14 +3,13 @@ require 'open-uri'
 module DiscoveryIndexer
   module InputXml
     class PurlxmlReader
       # reads the public xml for the fedora object that is defined , from the purl server
       # @param [String] druid e.g. ab123cd4567
       # @return [Nokogiri::XML::Document] the public xml for the fedora object
       # @raise [MissingPublicXml] if there's no purl xml available for this druid
       def self.read(druid)
         purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
         begin
           purlxml_object = Nokogiri::XML(open(purlxml_uri))
           return purlxml_object
@@ -20,4 +19,4 @@ module DiscoveryIndexer
       end
     end
   end
-end
+end

data/lib/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module DiscoveryIndexer
-    VERSION = '0.9.6'
+  VERSION = '0.9.7'
 end

data/lib/writer/solr_client.rb CHANGED Viewed

@@ -3,6 +3,7 @@ require 'rsolr'
 require 'rest-client'
 module DiscoveryIndexer
   module Writer
+    # Processes adds and deletes to the solr core
     class SolrClient
       include DiscoveryIndexer::Logging
@@ -13,7 +14,7 @@ module DiscoveryIndexer
       # @param solr_connector [RSolr::Client]  is an open connection with the solr core
       # @param max_retries [Integer] the maximum number of tries before fail
       def self.add(id, solr_doc, solr_connector, max_retries = 10)
-        process(id, solr_doc, solr_connector, max_retries, is_delete=false)
+        process(id, solr_doc, solr_connector, max_retries, false)
       end
       # Add the document to solr, retry if an error occurs.
@@ -22,79 +23,76 @@ module DiscoveryIndexer
       # @param solr_connector[RSolr::Client]  is an open connection with the solr core
       # @param max_retries [Integer] the maximum number of tries before fail
       def self.delete(id, solr_connector, max_retries = 10)
-        process(id, {}, solr_connector, max_retries, is_delete=true)
+        process(id, {}, solr_connector, max_retries, true)
       end
       # It's an internal method that receives all the requests and deal with
       # SOLR core. This method can call add, delete, or update
       #
       # @param id [String] the document id, usually it will be druid.
-      # @param solr_doc [Hash] is the solr doc in hash format
+      # @param solr_doc [Hash] is the solr doc in hash format
       # @param solr_connector [RSolr::Client]  is an open connection with the solr core
       # @param max_retries [Integer] the maximum number of tries before fail
-      def self.process(id, solr_doc, solr_connector, max_retries, is_delete=false)
-        handler = Proc.new do |exception, attempt_number, total_delay|
+      def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
+        handler = proc do |exception, attempt_number, _total_delay|
           DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
         end
-        with_retries(:max_tries => max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
+        with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
           DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
           if is_delete
             DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
             solr_connector.delete_by_id(id)
-          elsif allow_update?(solr_connector) && doc_exists?(id,solr_connector)
+          elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
             DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
-            update_solr_doc(id,solr_doc,solr_connector)
+            update_solr_doc(id, solr_doc, solr_connector)
           else
             DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
             solr_connector.add(solr_doc)
           end
           solr_connector.commit
           DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
         end
       end
       # @param solr_connector [RSolr::Client]  is an open connection with the solr core
       # @return [Boolean] true if the solr core allowing update feature
       def self.allow_update?(solr_connector)
-        return solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
+        solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
       end
       # @param id [String] the document id, usually it will be druid.
       # @param solr_connector [RSolr::Client]  is an open connection with the solr core
       # @return [Boolean] true if the solr doc defined by this id exists
-      def self.doc_exists?(id,solr_connector)
-        response=solr_connector.get 'select', :params=>{:q=>'id:"' + id + '"'}
+      def self.doc_exists?(id, solr_connector)
+        response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
         response['response']['numFound'] == 1
       end
       # It is an internal method that updates the solr doc instead of adding a new one.
-      def self.update_solr_doc(id,solr_doc,solr_connector)
+      def self.update_solr_doc(id, solr_doc, solr_connector)
         # update_solr_doc can't used RSolr because updating hash doc is not supported
         #  so we need to build the json input manually
         solr_url = solr_connector.options[:url]
-        if solr_url.end_with?("/") then
-          url="#{solr_connector.options[:url]}update?commit=true"
+        if solr_url.end_with?('/')
+          url = "#{solr_connector.options[:url]}update?commit=true"
         else
-          url="#{solr_connector.options[:url]}/update?commit=true"
+          url = "#{solr_connector.options[:url]}/update?commit=true"
         end
-        params="[{\"id\":\"#{id}\","
-        solr_doc.each do |field_name,new_values|
-          unless field_name == :id
-            params+="\"#{field_name}\":"
-            new_values=[new_values] unless new_values.class==Array
-            new_values = new_values.map {|s| s.to_s.gsub("\\","\\\\\\").gsub('"','\"').strip} # strip leading/trailing spaces and escape quotes for each value
-            params+="{\"set\":[\"#{new_values.join('","')}\"]},"
-          end
+        params = "[{\"id\":\"#{id}\","
+        solr_doc.each do |field_name, new_values|
+          next unless field_name == :id
+          params += "\"#{field_name}\":"
+          new_values = [new_values] unless new_values.class == Array
+          new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
+          params += "{\"set\":[\"#{new_values.join('","')}\"]},"
         end
         params.chomp!(',')
-        params+="}]"
-        RestClient.post url, params,:content_type => :json, :accept=>:json
+        params += '}]'
+        RestClient.post url, params, content_type: :json, accept: :json
       end
     end
   end
-end
+end

data/lib/writer/solr_writer.rb CHANGED Viewed

@@ -3,57 +3,57 @@ require 'rsolr'
 module DiscoveryIndexer
   module Writer
+    # Performs writes to solr client based upon true and false release flags
     class SolrWriter
       include DiscoveryIndexer::Logging
       def process(id, index_doc, targets, solr_targets_configs)
         @solr_targets_configs = solr_targets_configs
         index_targets = []
         delete_targets = []
-        targets.keys.each do |target|
-          if targets[target] then
+        targets.keys.each do |target|
+          if targets[target]
             index_targets.append(target)
           else
             delete_targets.append(target)
           end
         end
         # get targets with true
         solr_index_client(id, index_doc, index_targets)
         # get targets with false
         solr_delete_client(id, delete_targets)
       end
       def solr_delete_from_all(id, solr_targets_configs)
         # Get a list of all registered targets
-        @solr_targets_configs=solr_targets_configs
-        targets = @solr_targets_configs.keys()
+        @solr_targets_configs = solr_targets_configs
+        targets = @solr_targets_configs.keys
         solr_delete_client(id, targets)
       end
       def solr_index_client(id, index_doc, targets)
         targets.each do |solr_target|
-          solr_connector = get_connector_for_target(solr_target)
+          solr_connector = get_connector_for_target(solr_target)
           SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
-        end
+        end
       end
       def solr_delete_client(id, targets)
         targets.each do |solr_target|
           solr_connector = get_connector_for_target(solr_target)
           SolrClient.delete(id, solr_connector) unless solr_connector.nil?
-        end
+        end
       end
       def get_connector_for_target(solr_target)
         solr_connector = nil
-        if @solr_targets_configs.keys.include?(solr_target) then
+        if @solr_targets_configs.keys.include?(solr_target)
           config = @solr_targets_configs[solr_target]
           solr_connector = RSolr.connect(config.deep_symbolize_keys)
         end
-        return solr_connector
+        solr_connector
       end
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: discovery-indexer
 version: !ruby/object:Gem::Version
-  version: 0.9.6
+  version: 0.9.7
 platform: ruby
 authors:
 - Ahmed AlSum
+- Laney McGlohon
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-06-30 00:00:00.000000000 Z
+date: 2015-09-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -138,7 +139,7 @@ dependencies:
         version: '0'
 description: This library manages the core operations for the discovery indexing such
   as reading PURL xml, mapping to the solr document, and writing to solr core.
-email: aalsum@stanford.edu
+email: laneymcg@stanford.edu
 executables: []
 extensions: []
 extra_rdoc_files: []