discovery-indexer 0.9.6 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3c328198b5aaf90f99e8b239b37aabacf9cfcdc7
4
- data.tar.gz: 7ffabf609d7b24d5ae413f4cac213a9aabef2e8e
3
+ metadata.gz: 9cddff2ef4144cc3d17af7bd71bdb6f68840d38c
4
+ data.tar.gz: ca19a7acbb80341c1d6440567e305ef0bf170542
5
5
  SHA512:
6
- metadata.gz: 7ec36accf3a418d95040bc8ca561a7f08fac0fabcb6d2cac133589f451620b885c16218daa7338c2b478771978eed2d2a6ae236a0eb9cde1b836941b8eb41434
7
- data.tar.gz: 2595c1da62ab979b6f12db5ca3cc64102c622924090cc0cf355e3b14b405f33fbafb859282b65bc16c1516369a407c6bae61809e19b7f0e417a92ac9ae43857c
6
+ metadata.gz: 65fa148806dcf8dc498b262cbc3acf811c89fcad0d81397099009e0e8eb368577477e50c9c5bbee7e0921e11b17542cf4857b2aff4e54a319f5f97ea22e1bb8f
7
+ data.tar.gz: a28465c230937c24168114d91f41a58e59492a16b0b479ab8e4ae41429c4682dc7eec99a4b796ff93869d37b7e34ed78a6d7485ff1500ee2bc91e99e3a6e070f
@@ -15,9 +15,8 @@ require 'mapper/general_mapper'
15
15
  require 'writer/solr_client'
16
16
  require 'writer/solr_writer'
17
17
 
18
- #require 'utilities/extract_sub_targets'
19
-
18
+ # require 'utilities/extract_sub_targets'
20
19
 
21
20
  module DiscoveryIndexer
22
21
  PURL_DEFAULT = 'http://purl.stanford.edu'
23
- end
22
+ end
data/lib/errors.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  module DiscoveryIndexer
2
2
  module Errors
3
- MissingPurlPage = Class.new(StandardError)
4
- MissingMods = Class.new(StandardError)
5
- MissingPublicXml = Class.new(StandardError)
6
- MissingContentMetadata = Class.new(StandardError)
7
- MissingIdentityMetadata = Class.new(StandardError)
8
- MissingRightsMetadata = Class.new(StandardError)
9
- MissingRDF = Class.new(StandardError)
3
+ MissingPurlPage = Class.new(StandardError)
4
+ MissingMods = Class.new(StandardError)
5
+ MissingPublicXml = Class.new(StandardError)
6
+ MissingContentMetadata = Class.new(StandardError)
7
+ MissingIdentityMetadata = Class.new(StandardError)
8
+ MissingRightsMetadata = Class.new(StandardError)
9
+ MissingRDF = Class.new(StandardError)
10
10
  MissingDC = Class.new(StandardError)
11
11
  MissingModsPage = Class.new(StandardError)
12
12
  end
13
- end
13
+ end
data/lib/logging.rb CHANGED
@@ -6,11 +6,8 @@ module DiscoveryIndexer
6
6
  def logger
7
7
  @logger ||= Logger.new(STDOUT)
8
8
  end
9
-
10
- def logger=(logger)
11
- @logger = logger
12
- end
13
- end
14
9
 
10
+ attr_writer :logger
11
+ end
15
12
  end
16
- end
13
+ end
@@ -1,29 +1,27 @@
1
1
  module DiscoveryIndexer
2
2
  module Mapper
3
3
  class GeneralMapper
4
-
5
4
  # Initializes an instance from IndexMapper
6
5
  # @param [String] druid e.g. ab123cd4567
7
6
  # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
8
7
  # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
9
8
  # @param [Hash] collection_data represents a hash of collection_druid and catkey
10
9
  # e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
11
- def initialize(druid, modsxml, purlxml, collection_data={})
10
+ def initialize(druid, modsxml, purlxml, collection_data = {})
12
11
  @druid = druid
13
12
  @modsxml = modsxml
14
13
  @purlxml = purlxml
15
14
  @collection_data = collection_data
16
15
  end
17
16
 
18
- # Create a Hash representing a Solr doc, with all MODS related fields populated.
17
+ # Create a Hash representing a Solr doc, with all MODS related fields populated.
19
18
  # @return [Hash] Hash representing the Solr document
20
- def convert_to_solr_doc()
19
+ def convert_to_solr_doc
21
20
  solr_doc = {}
22
21
  solr_doc[:id] = @druid
23
22
  solr_doc[:title] = @modsxml.sw_full_title
24
- return solr_doc
23
+ solr_doc
25
24
  end
26
25
  end
27
26
  end
28
27
  end
29
-
@@ -1,44 +1,40 @@
1
1
  require 'stanford-mods'
2
2
  module DiscoveryIndexer
3
3
  module InputXml
4
-
5
- # This class is the main class to access and parse the mods xml
4
+ # This class is the main class to access and parse the mods xml
6
5
  # as retrieved from PURL server
7
6
  # @example to run the code
8
7
  # druid = "aa111aa1111"
9
8
  # p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
10
9
  # model = p.load()
11
- #
12
- #
10
+ #
11
+ #
13
12
  class Modsxml
14
- # initializes a new object
13
+ # initializes a new object
15
14
  # @param druid [String] the druid object in the format "aa111aa1111"
16
15
  def initialize(druid)
17
16
  @druid = druid
18
- @modsxml_ng_doc = nil
17
+ @modsxml_ng_doc = nil
19
18
  end
20
19
 
21
20
  # loads the mods xml to stanford mods model for the fedora object defind in the druid,
22
- # it reads the mods xml once from PURL server, and repeat the parsing with each call
23
- # @return [Stanford::Mods::Record] represents the mods xml
24
- def load()
25
- if @modsxml_ng_doc.nil? then
26
- @modsxml_ng_doc = ModsxmlReader.read(@druid)
27
- end
28
-
21
+ # it reads the mods xml once from PURL server, and repeat the parsing with each call
22
+ # @return [Stanford::Mods::Record] represents the mods xml
23
+ def load
24
+ @modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?
25
+
29
26
  modsxml_model = Stanford::Mods::Record.new
30
27
  modsxml_model.from_nk_node(@modsxml_ng_doc)
31
- return modsxml_model
28
+ modsxml_model
32
29
  end
33
-
30
+
34
31
  # loads the mods xml to stanford mods model for the fedora object defind in the druid,
35
- # it reads the mods xml from PURL server with every call
36
- # @return [Stanford::Mods::Record] represents the mods xml
37
- def reload()
32
+ # it reads the mods xml from PURL server with every call
33
+ # @return [Stanford::Mods::Record] represents the mods xml
34
+ def reload
38
35
  @modsxml_ng_doc = ModsxmlReader.read(@druid)
39
- return load()
36
+ load
40
37
  end
41
-
42
38
  end
43
39
  end
44
40
  end
@@ -3,21 +3,18 @@ require 'open-uri'
3
3
  module DiscoveryIndexer
4
4
  module InputXml
5
5
  class ModsxmlReader
6
-
7
6
  # reads the mods xml for the fedora object that is defined , from the purl server
8
7
  # @param [String] druid e.g. ab123cd4567
9
8
  # @return [Nokogiri::XML::Document] the mods xml for the fedora object
10
9
  # @raise [MissingModsXml] if there's no mods xml available for this druid
11
10
  def self.read(druid)
12
11
  mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"
13
-
14
12
  begin
15
- modsxml_ng_doc = Nokogiri::XML(open(mods_uri))
16
- return modsxml_ng_doc
13
+ Nokogiri::XML(open(mods_uri))
17
14
  rescue
18
15
  raise DiscoveryIndexer::Errors::MissingModsPage.new(mods_uri)
19
16
  end
20
17
  end
21
18
  end
22
19
  end
23
- end
20
+ end
@@ -1,43 +1,36 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
-
4
- # This class is the main class to access and parse the purl xml
3
+ # This class is the main class to access and parse the purl xml
5
4
  # as retrieved from PURL server
6
5
  # @example to run the code
7
6
  # druid = "aa111aa1111"
8
7
  # p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
9
8
  # model = p.load()
10
- #
11
9
  class Purlxml
12
-
13
- # initializes a new object
10
+ # initializes a new object
14
11
  # @param druid [String] the druid object in the format "aa111aa1111"
15
12
  def initialize(druid)
16
13
  @druid = druid
17
- @purlxml_ng_doc = nil
14
+ @purlxml_ng_doc = nil
18
15
  end
19
16
 
20
17
  # loads the purl xml to purlxml model for the fedora object defind in the druid,
21
- # it reads the purl xml once from PURL server, and repeat the parsing with each call
22
- # @return [PurlxmlModel] represents the purlxml
23
- def load()
24
- if @purlxml_ng_doc.nil? then
25
- @purlxml_ng_doc = PurlxmlReader.read(@druid)
26
- end
27
-
28
- purlxml_parser = PurlxmlParserStrict.new(@druid,@purlxml_ng_doc)
29
- purlxml_model = purlxml_parser.parse()
30
- return purlxml_model
18
+ # it reads the purl xml once from PURL server, and repeat the parsing with each call
19
+ # @return [PurlxmlModel] represents the purlxml
20
+ def load
21
+ @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
22
+ purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
23
+ purlxml_model = purlxml_parser.parse
24
+ purlxml_model
31
25
  end
32
-
26
+
33
27
  # loads the purl xml to purlxml model for the fedora object defind in the druid
34
- # it reads the purl xml from PURL server with every call
35
- # @return [PurlxmlModel] represents the purlxml
36
- def reload()
28
+ # it reads the purl xml from PURL server with every call
29
+ # @return [PurlxmlModel] represents the purlxml
30
+ def reload
37
31
  @purlxml_ng_doc = PurlxmlReader.read(@druid)
38
- return load()
32
+ load
39
33
  end
40
- end
34
+ end
41
35
  end
42
36
  end
43
-
@@ -1,47 +1,46 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
3
  class PurlxmlModel
4
-
5
- #@!attribute [rw] druid
6
- # @return [String] The druid value eg., ab123cd4567
7
- attr_accessor :druid
8
-
9
- #@!attribute [rw] public_xml
10
- # @return [Nokogiri::XML] The publix xml as retrieved from purl server
4
+ # @!attribute [rw] druid
5
+ # @return [String] The druid value eg., ab123cd4567
6
+ attr_accessor :druid
7
+
8
+ # @!attribute [rw] public_xml
9
+ # @return [Nokogiri::XML] The publix xml as retrieved from purl server
11
10
  attr_accessor :public_xml
12
11
 
13
- #@!attribute [rw] content_metadata
12
+ # @!attribute [rw] content_metadata
14
13
  # @return [Nokogiri::XML] The content_metadata as extracted from public xml
15
14
  attr_accessor :content_metadata
16
15
 
17
- #@!attribute [rw] identity_metadata
16
+ # @!attribute [rw] identity_metadata
18
17
  # @return [Nokogiri::XML] The identity_metadata as extracted from public xml
19
18
  attr_accessor :identity_metadata
20
19
 
21
- #@!attribute [rw] rights_metadata
20
+ # @!attribute [rw] rights_metadata
22
21
  # @return [Nokogiri::XML] The rights_metadata as extracted from public xml
23
22
  attr_accessor :rights_metadata
24
23
 
25
- #@!attribute [rw] dc
24
+ # @!attribute [rw] dc
26
25
  # @return [Nokogiri::XML] The dc element as extracted from public xml
27
26
  attr_accessor :dc
28
-
29
- #@!attribute [rw] rdf
27
+
28
+ # @!attribute [rw] rdf
30
29
  # @return [Nokogiri::XML] The rdf element as extracted from public xml
31
30
  attr_accessor :rdf
32
31
 
33
32
  # @!attribute [rw] release_tags_hash
34
33
  # @return [Hash] The release_tag in hash format as extracted from public xml
35
- # ReleaseData element.
34
+ # ReleaseData element.
36
35
  # @example
37
36
  # !{"target1"=>true, "target2"=>false}
38
- attr_accessor :release_tags_hash
37
+ attr_accessor :release_tags_hash
39
38
 
40
39
  # @!attribute [rw] dor_content_type
41
40
  # @return [String] The dor_content_type as extracted from public xml
42
41
  # content_metadata.
43
42
  attr_accessor :dor_content_type
44
-
43
+
45
44
  # @!attribute [rw] dor_display_type
46
45
  # @return [String] The displayType as extracted from public xml
47
46
  # identity_metadata.
@@ -50,25 +49,25 @@ module DiscoveryIndexer
50
49
  # @!attribute [rw] is_collection
51
50
  # @return [Boolean] true if the item type is collection in the identity_metadata
52
51
  attr_accessor :is_collection
53
-
52
+
54
53
  # @!attribute [rw] collection_druids
55
54
  # @return [Array] a list of the collections that this is druid belongs to
56
55
  # @example
57
56
  # ["aa11aaa1111","bb111bb1111"]
58
57
  attr_accessor :collection_druids
59
-
58
+
60
59
  # @!attribute [rw] file_ids
61
60
  # @return [Array] a list of the file ids in the content_metadata
62
61
  # @example
63
- # ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
62
+ # ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
64
63
  attr_accessor :file_ids
65
64
 
66
65
  # @!attribute [rw] image_ids
67
66
  # @return [Array] a list of the image ids in the content_metadata
68
67
  # @example
69
- # ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
68
+ # ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
70
69
  attr_accessor :image_ids
71
-
70
+
72
71
  # @!attribute [rw] catkey
73
72
  # @return [String] the catkey attribute in identity_metadata
74
73
  attr_accessor :catkey
@@ -76,15 +75,15 @@ module DiscoveryIndexer
76
75
  # @!attribute [rw] barcode
77
76
  # @return [String] the barcode attribute in identity_metadata
78
77
  attr_accessor :barcode
79
-
78
+
80
79
  # @!attribute [rw] label
81
80
  # @return [String] the objectLabel attribute in identity_metadata
82
81
  attr_accessor :label
83
-
82
+
84
83
  # @!attribute [rw] copyright
85
84
  # @return [String] the copyright statement from rights metadata
86
85
  attr_accessor :copyright
87
-
86
+
88
87
  # @!attribute [rw] use_and_reproduction
89
88
  # @return [String] the use and reproduction statement from rights metadata
90
89
  attr_accessor :use_and_reproduction
@@ -92,13 +91,6 @@ module DiscoveryIndexer
92
91
  # @!attribute [rw] source_id
93
92
  # @return [String] the sourceid from identity metadata
94
93
  attr_accessor :source_id
95
-
96
94
  end
97
95
  end
98
96
  end
99
-
100
-
101
-
102
-
103
-
104
-
@@ -1,13 +1,12 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
3
  class PurlxmlParser
4
-
5
4
  def initialize(druid, purlxml_ng_doc)
6
5
  @purlxml_ng_doc = purlxml_ng_doc
7
6
  @druid = druid
8
7
  end
9
-
10
- def parse()
8
+
9
+ def parse
11
10
  end
12
11
  end
13
12
  end
@@ -3,154 +3,142 @@ module DiscoveryIndexer
3
3
  class PurlxmlParserStrict < PurlxmlParser
4
4
  include DiscoveryIndexer::Logging
5
5
 
6
-
7
6
  RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
8
7
  OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
9
8
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
10
9
 
11
10
  # it parses the purlxml into a purlxml model
12
11
  # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
13
- def parse()
12
+ def parse
14
13
  purlxml_model = PurlxmlModel.new
15
14
  purlxml_model.druid = @druid
16
15
  purlxml_model.public_xml = @purlxml_ng_doc
17
- purlxml_model.content_metadata = parse_content_metadata()
18
- purlxml_model.identity_metadata = parse_identity_metadata()
19
- purlxml_model.rights_metadata = parse_rights_metadata()
20
- purlxml_model.dc = parse_dc()
21
- purlxml_model.rdf = parse_rdf()
22
- purlxml_model.is_collection = parse_is_collection()
23
- purlxml_model.collection_druids = parse_collection_druids()
24
- purlxml_model.dor_content_type = parse_dor_content_type()
25
- purlxml_model.dor_display_type = parse_dor_display_type()
26
- purlxml_model.release_tags_hash = parse_release_tags_hash()
27
- purlxml_model.file_ids = parse_file_ids()
28
- purlxml_model.image_ids = parse_image_ids()
29
- purlxml_model.catkey = parse_catkey()
30
- purlxml_model.barcode = parse_barcode()
31
- purlxml_model.label = parse_label()
32
- purlxml_model.copyright = parse_copyright()
33
- purlxml_model.use_and_reproduction = parse_use_and_reproduction()
34
- purlxml_model.source_id = parse_sourceid()
35
- return purlxml_model
36
- end
37
-
16
+ purlxml_model.content_metadata = parse_content_metadata
17
+ purlxml_model.identity_metadata = parse_identity_metadata
18
+ purlxml_model.rights_metadata = parse_rights_metadata
19
+ purlxml_model.dc = parse_dc
20
+ purlxml_model.rdf = parse_rdf
21
+ purlxml_model.is_collection = parse_is_collection
22
+ purlxml_model.collection_druids = parse_collection_druids
23
+ purlxml_model.dor_content_type = parse_dor_content_type
24
+ purlxml_model.dor_display_type = parse_dor_display_type
25
+ purlxml_model.release_tags_hash = parse_release_tags_hash
26
+ purlxml_model.file_ids = parse_file_ids
27
+ purlxml_model.image_ids = parse_image_ids
28
+ purlxml_model.catkey = parse_catkey
29
+ purlxml_model.barcode = parse_barcode
30
+ purlxml_model.label = parse_label
31
+ purlxml_model.copyright = parse_copyright
32
+ purlxml_model.use_and_reproduction = parse_use_and_reproduction
33
+ purlxml_model.source_id = parse_sourceid
34
+ purlxml_model
35
+ end
36
+
38
37
  # extracts the identityMetadata for this fedora object, from the purl xml
39
38
  # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
40
39
  # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
41
40
  def parse_identity_metadata
42
- begin
43
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
44
- raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
45
- ng_doc
46
- rescue
47
- raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
48
- end
41
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
42
+ fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
43
+ ng_doc
44
+ rescue
45
+ raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
49
46
  end
50
-
51
- def parse_rights_metadata
52
- begin
53
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
54
- raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
55
- ng_doc
56
- rescue
57
- raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
58
- end
47
+
48
+ def parse_rights_metadata
49
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
50
+ fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
51
+ ng_doc
52
+ rescue
53
+ raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
59
54
  end
60
-
55
+
61
56
  # extracts the dc field for this fedora object, from the purl xml
62
57
  # @return [Nokogiri::XML::Document] the dc for the fedora object
63
58
  # @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
64
59
  def parse_dc
65
- begin
66
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', {'dc' => OAI_DC_NAMESPACE}).to_xml(:encoding => 'utf-8'))
67
- raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
68
- ng_doc
69
- rescue
70
- raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
71
- end
60
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
61
+ fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
62
+ ng_doc
63
+ rescue
64
+ raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
72
65
  end
73
-
66
+
74
67
  # extracts the rdf field for this fedora object, from the purl xml
75
68
  # @return [Nokogiri::XML::Document] the rdf for the fedora object
76
69
  # @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
77
70
  def parse_rdf
78
- begin
79
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', {'rdf' => RDF_NAMESPACE}).to_xml)
80
- raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
81
- ng_doc
82
- rescue
83
- raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
84
- end
71
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
72
+ fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
73
+ ng_doc
74
+ rescue
75
+ raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
85
76
  end
86
-
87
-
77
+
88
78
  # extracts the release tag element for this fedora object, from the the ReleaseData element in purl xml
89
79
  # @return [Hash] the release tags for the fedora object
90
80
  def parse_release_tags_hash
91
- release_tags={}
92
- unless @purlxml_ng_doc.nil?
93
- release_elements = @purlxml_ng_doc.xpath('//ReleaseData/release')
94
- release_elements.each { |n|
95
- unless n.attr("to").nil?
96
- release_target = n.attr("to")
81
+ release_tags = {}
82
+ unless @purlxml_ng_doc.nil?
83
+ release_elements = @purlxml_ng_doc.xpath('//ReleaseData/release')
84
+ release_elements.each do |n|
85
+ unless n.attr('to').nil?
86
+ release_target = n.attr('to')
97
87
  text = n.text
98
- unless text.nil?
99
- release_tags[release_target]= to_boolean(text)
100
- end
88
+ release_tags[release_target] = to_boolean(text) unless text.nil?
101
89
  end
102
- }
90
+ end
103
91
  end
104
- return release_tags
92
+ release_tags
105
93
  end
106
-
94
+
107
95
  # extracts the contentMetadata for this fedora object, from the purl xml
108
96
  # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
109
97
  # @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
110
98
  def parse_content_metadata
111
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
112
- ng_doc = nil if !ng_doc || ng_doc.children.empty?
113
- ng_doc
99
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
100
+ ng_doc = nil if !ng_doc || ng_doc.children.empty?
101
+ ng_doc
114
102
  end
115
-
103
+
116
104
  # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
117
105
  def parse_is_collection
118
- identity_metadata = parse_identity_metadata
106
+ identity_metadata = parse_identity_metadata
119
107
  unless identity_metadata.nil?
120
108
  object_type_nodes = identity_metadata.xpath('//objectType')
121
- return true if object_type_nodes.find_index { |n| ['collection','set'].include? n.text.downcase}
109
+ return true if object_type_nodes.find_index { |n| %w(collection set).include? n.text.downcase }
122
110
  end
123
111
  false
124
112
  end
125
-
113
+
126
114
  # get the druids from isMemberOfCollection relationships in rels-ext from public_xml
127
115
  # @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfColletion relationship with, or nil if none
128
116
  def parse_collection_druids
129
- ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
117
+ ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
130
118
  is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
131
119
  # from public_xml rels-ext
132
120
  druids = []
133
- is_member_of_nodes.each { |n|
121
+ is_member_of_nodes.each do |n|
134
122
  druids << n.value.split('druid:').last unless n.value.empty?
135
- }
123
+ end
136
124
  return nil if druids.empty?
137
125
  druids
138
126
  end
139
-
127
+
140
128
  # the value of the type attribute for a DOR object's contentMetadata
141
129
  # more info about these values is here:
142
130
  # https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
143
131
  # https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
144
- # @return [String]
132
+ # @return [String]
145
133
  def parse_dor_content_type
146
134
  content_md = parse_content_metadata
147
135
  dct = content_md ? content_md.xpath('contentMetadata/@type').text : nil
148
136
  DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
149
137
  dct
150
138
  end
151
-
139
+
152
140
  # the value of the displyType tag from a DOR collection's identityMetadata
153
- # @return [String]
141
+ # @return [String]
154
142
  def parse_dor_display_type
155
143
  identity_md = parse_identity_metadata
156
144
  ddt = identity_md ? identity_md.xpath('//displayType').text : nil
@@ -161,43 +149,44 @@ module DiscoveryIndexer
161
149
  # the @id attribute of resource/file elements that match the image type, including extension
162
150
  # @return [Array<String>] filenames
163
151
  def parse_image_ids
164
- ids = []
165
- content_md = parse_content_metadata
166
- unless content_md.nil?
167
- content_md.xpath('//resource[@type="image"]/file/@id').each { |node|
168
- ids << node.text if !node.text.empty?
169
- }
170
- return nil if ids.empty?
171
- ids
152
+ ids = []
153
+ content_md = parse_content_metadata
154
+ return nil if content_md.nil?
155
+ content_md.xpath('//resource[@type="image"]/file/@id').each do |node|
156
+ ids << node.text unless node.text.empty?
172
157
  end
158
+ content_md.xpath('//resource[@type="page"]/file/@id').each do |node|
159
+ ids << node.text unless node.text.empty?
160
+ end
161
+ return nil if ids.empty?
162
+ ids
173
163
  end
174
164
 
175
165
  def parse_sourceid
176
166
  get_value(@purlxml_ng_doc.css('//identityMetadata/sourceId'))
177
167
  end
178
-
168
+
179
169
  def parse_copyright
180
170
  get_value(@purlxml_ng_doc.css('//rightsMetadata/copyright/human[type="copyright"]'))
181
171
  end
182
-
172
+
183
173
  def parse_use_and_reproduction
184
174
  get_value(@purlxml_ng_doc.css('//rightsMetadata/use/human[type="useAndReproduction"]'))
185
175
  end
186
-
176
+
187
177
  # the @id attribute of resource/file elements, including extension
188
178
  # @return [Array<String>] filenames
189
179
  def parse_file_ids
190
180
  ids = []
191
181
  content_md = parse_content_metadata
192
- unless content_md.nil?
193
- content_md.xpath('//resource/file/@id').each { |node|
194
- ids << node.text if !node.text.empty?
195
- }
196
- return nil if ids.empty?
197
- ids
198
- end
199
- end
200
-
182
+ return unless content_md.nil?
183
+ content_md.xpath('//resource/file/@id').each do |node|
184
+ ids << node.text unless node.text.empty?
185
+ end
186
+ return nil if ids.empty?
187
+ ids
188
+ end
189
+
201
190
  # @return catkey value from the DOR identity_metadata, or nil if there is no catkey
202
191
  def parse_catkey
203
192
  get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']"))
@@ -210,23 +199,22 @@ module DiscoveryIndexer
210
199
 
211
200
  # @return objectLabel value from the DOR identity_metadata, or nil if there is no barcode
212
201
  def parse_label
213
- get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/objectLabel"))
202
+ get_value(@purlxml_ng_doc.xpath('/publicObject/identityMetadata/objectLabel'))
214
203
  end
215
-
204
+
216
205
  def get_value(node)
217
- (node && node.first) ? node.first.content : nil
206
+ (node && node.first) ? node.first.content : nil
218
207
  end
219
-
208
+
220
209
  def to_boolean(text)
221
- if text.nil? || text.empty? then
210
+ if text.nil? || text.empty?
222
211
  return false
223
- elsif text.downcase.eql?("true") || text.downcase == "t" then
212
+ elsif text.downcase.eql?('true') || text.downcase == 't'
224
213
  return true
225
214
  else
226
215
  return false
227
216
  end
228
- end
217
+ end
229
218
  end
230
219
  end
231
220
  end
232
-
@@ -3,14 +3,13 @@ require 'open-uri'
3
3
  module DiscoveryIndexer
4
4
  module InputXml
5
5
  class PurlxmlReader
6
-
7
6
  # reads the public xml for the fedora object that is defined , from the purl server
8
7
  # @param [String] druid e.g. ab123cd4567
9
8
  # @return [Nokogiri::XML::Document] the public xml for the fedora object
10
9
  # @raise [MissingPublicXml] if there's no purl xml available for this druid
11
10
  def self.read(druid)
12
11
  purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
13
-
12
+
14
13
  begin
15
14
  purlxml_object = Nokogiri::XML(open(purlxml_uri))
16
15
  return purlxml_object
@@ -20,4 +19,4 @@ module DiscoveryIndexer
20
19
  end
21
20
  end
22
21
  end
23
- end
22
+ end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.9.6'
2
+ VERSION = '0.9.7'
3
3
  end
@@ -3,6 +3,7 @@ require 'rsolr'
3
3
  require 'rest-client'
4
4
  module DiscoveryIndexer
5
5
  module Writer
6
+ # Processes adds and deletes to the solr core
6
7
  class SolrClient
7
8
  include DiscoveryIndexer::Logging
8
9
 
@@ -13,7 +14,7 @@ module DiscoveryIndexer
13
14
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
14
15
  # @param max_retries [Integer] the maximum number of tries before fail
15
16
  def self.add(id, solr_doc, solr_connector, max_retries = 10)
16
- process(id, solr_doc, solr_connector, max_retries, is_delete=false)
17
+ process(id, solr_doc, solr_connector, max_retries, false)
17
18
  end
18
19
 
19
20
  # Add the document to solr, retry if an error occurs.
@@ -22,79 +23,76 @@ module DiscoveryIndexer
22
23
  # @param solr_connector[RSolr::Client] is an open connection with the solr core
23
24
  # @param max_retries [Integer] the maximum number of tries before fail
24
25
  def self.delete(id, solr_connector, max_retries = 10)
25
- process(id, {}, solr_connector, max_retries, is_delete=true)
26
+ process(id, {}, solr_connector, max_retries, true)
26
27
  end
27
28
 
28
29
  # It's an internal method that receives all the requests and deal with
29
30
  # SOLR core. This method can call add, delete, or update
30
31
  #
31
32
  # @param id [String] the document id, usually it will be druid.
32
- # @param solr_doc [Hash] is the solr doc in hash format
33
+ # @param solr_doc [Hash] is the solr doc in hash format
33
34
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
34
35
  # @param max_retries [Integer] the maximum number of tries before fail
35
- def self.process(id, solr_doc, solr_connector, max_retries, is_delete=false)
36
- handler = Proc.new do |exception, attempt_number, total_delay|
36
+ def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
37
+ handler = proc do |exception, attempt_number, _total_delay|
37
38
  DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
38
39
  end
39
-
40
- with_retries(:max_tries => max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
40
+
41
+ with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
41
42
  DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
42
-
43
+
43
44
  if is_delete
44
45
  DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
45
46
  solr_connector.delete_by_id(id)
46
- elsif allow_update?(solr_connector) && doc_exists?(id,solr_connector)
47
+ elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
47
48
  DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
48
- update_solr_doc(id,solr_doc,solr_connector)
49
+ update_solr_doc(id, solr_doc, solr_connector)
49
50
  else
50
51
  DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
51
52
  solr_connector.add(solr_doc)
52
53
  end
53
54
  solr_connector.commit
54
55
  DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
55
-
56
56
  end
57
57
  end
58
58
 
59
59
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
60
60
  # @return [Boolean] true if the solr core allowing update feature
61
61
  def self.allow_update?(solr_connector)
62
- return solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
62
+ solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
63
63
  end
64
64
 
65
65
  # @param id [String] the document id, usually it will be druid.
66
66
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
67
67
  # @return [Boolean] true if the solr doc defined by this id exists
68
- def self.doc_exists?(id,solr_connector)
69
- response=solr_connector.get 'select', :params=>{:q=>'id:"' + id + '"'}
68
+ def self.doc_exists?(id, solr_connector)
69
+ response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
70
70
  response['response']['numFound'] == 1
71
71
  end
72
-
72
+
73
73
  # It is an internal method that updates the solr doc instead of adding a new one.
74
- def self.update_solr_doc(id,solr_doc,solr_connector)
74
+ def self.update_solr_doc(id, solr_doc, solr_connector)
75
75
  # update_solr_doc can't used RSolr because updating hash doc is not supported
76
76
  # so we need to build the json input manually
77
77
  solr_url = solr_connector.options[:url]
78
- if solr_url.end_with?("/") then
79
- url="#{solr_connector.options[:url]}update?commit=true"
78
+ if solr_url.end_with?('/')
79
+ url = "#{solr_connector.options[:url]}update?commit=true"
80
80
  else
81
- url="#{solr_connector.options[:url]}/update?commit=true"
81
+ url = "#{solr_connector.options[:url]}/update?commit=true"
82
82
  end
83
-
84
- params="[{\"id\":\"#{id}\","
85
- solr_doc.each do |field_name,new_values|
86
- unless field_name == :id
87
- params+="\"#{field_name}\":"
88
- new_values=[new_values] unless new_values.class==Array
89
- new_values = new_values.map {|s| s.to_s.gsub("\\","\\\\\\").gsub('"','\"').strip} # strip leading/trailing spaces and escape quotes for each value
90
- params+="{\"set\":[\"#{new_values.join('","')}\"]},"
91
- end
83
+
84
+ params = "[{\"id\":\"#{id}\","
85
+ solr_doc.each do |field_name, new_values|
86
+ next unless field_name == :id
87
+ params += "\"#{field_name}\":"
88
+ new_values = [new_values] unless new_values.class == Array
89
+ new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
90
+ params += "{\"set\":[\"#{new_values.join('","')}\"]},"
92
91
  end
93
92
  params.chomp!(',')
94
- params+="}]"
95
- RestClient.post url, params,:content_type => :json, :accept=>:json
93
+ params += '}]'
94
+ RestClient.post url, params, content_type: :json, accept: :json
96
95
  end
97
-
98
96
  end
99
97
  end
100
- end
98
+ end
@@ -3,57 +3,57 @@ require 'rsolr'
3
3
 
4
4
  module DiscoveryIndexer
5
5
  module Writer
6
+ # Performs writes to solr client based upon true and false release flags
6
7
  class SolrWriter
7
8
  include DiscoveryIndexer::Logging
8
-
9
+
9
10
  def process(id, index_doc, targets, solr_targets_configs)
10
11
  @solr_targets_configs = solr_targets_configs
11
12
  index_targets = []
12
13
  delete_targets = []
13
- targets.keys.each do |target|
14
- if targets[target] then
14
+ targets.keys.each do |target|
15
+ if targets[target]
15
16
  index_targets.append(target)
16
17
  else
17
18
  delete_targets.append(target)
18
19
  end
19
20
  end
20
-
21
+
21
22
  # get targets with true
22
23
  solr_index_client(id, index_doc, index_targets)
23
24
  # get targets with false
24
25
  solr_delete_client(id, delete_targets)
25
26
  end
26
-
27
+
27
28
  def solr_delete_from_all(id, solr_targets_configs)
28
29
  # Get a list of all registered targets
29
- @solr_targets_configs=solr_targets_configs
30
- targets = @solr_targets_configs.keys()
30
+ @solr_targets_configs = solr_targets_configs
31
+ targets = @solr_targets_configs.keys
31
32
  solr_delete_client(id, targets)
32
33
  end
33
-
34
+
34
35
  def solr_index_client(id, index_doc, targets)
35
36
  targets.each do |solr_target|
36
- solr_connector = get_connector_for_target(solr_target)
37
+ solr_connector = get_connector_for_target(solr_target)
37
38
  SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
38
- end
39
+ end
39
40
  end
40
-
41
+
41
42
  def solr_delete_client(id, targets)
42
43
  targets.each do |solr_target|
43
44
  solr_connector = get_connector_for_target(solr_target)
44
45
  SolrClient.delete(id, solr_connector) unless solr_connector.nil?
45
- end
46
+ end
46
47
  end
47
48
 
48
49
  def get_connector_for_target(solr_target)
49
50
  solr_connector = nil
50
- if @solr_targets_configs.keys.include?(solr_target) then
51
+ if @solr_targets_configs.keys.include?(solr_target)
51
52
  config = @solr_targets_configs[solr_target]
52
53
  solr_connector = RSolr.connect(config.deep_symbolize_keys)
53
54
  end
54
- return solr_connector
55
+ solr_connector
55
56
  end
56
-
57
57
  end
58
58
  end
59
59
  end
metadata CHANGED
@@ -1,14 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.6
4
+ version: 0.9.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
8
+ - Laney McGlohon
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2015-06-30 00:00:00.000000000 Z
12
+ date: 2015-09-23 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: nokogiri
@@ -138,7 +139,7 @@ dependencies:
138
139
  version: '0'
139
140
  description: This library manages the core operations for the discovery indexing such
140
141
  as reading PURL xml, mapping to the solr document, and writing to solr core.
141
- email: aalsum@stanford.edu
142
+ email: laneymcg@stanford.edu
142
143
  executables: []
143
144
  extensions: []
144
145
  extra_rdoc_files: []