discovery-indexer 0.9.6 → 0.9.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3c328198b5aaf90f99e8b239b37aabacf9cfcdc7
4
- data.tar.gz: 7ffabf609d7b24d5ae413f4cac213a9aabef2e8e
3
+ metadata.gz: 9cddff2ef4144cc3d17af7bd71bdb6f68840d38c
4
+ data.tar.gz: ca19a7acbb80341c1d6440567e305ef0bf170542
5
5
  SHA512:
6
- metadata.gz: 7ec36accf3a418d95040bc8ca561a7f08fac0fabcb6d2cac133589f451620b885c16218daa7338c2b478771978eed2d2a6ae236a0eb9cde1b836941b8eb41434
7
- data.tar.gz: 2595c1da62ab979b6f12db5ca3cc64102c622924090cc0cf355e3b14b405f33fbafb859282b65bc16c1516369a407c6bae61809e19b7f0e417a92ac9ae43857c
6
+ metadata.gz: 65fa148806dcf8dc498b262cbc3acf811c89fcad0d81397099009e0e8eb368577477e50c9c5bbee7e0921e11b17542cf4857b2aff4e54a319f5f97ea22e1bb8f
7
+ data.tar.gz: a28465c230937c24168114d91f41a58e59492a16b0b479ab8e4ae41429c4682dc7eec99a4b796ff93869d37b7e34ed78a6d7485ff1500ee2bc91e99e3a6e070f
@@ -15,9 +15,8 @@ require 'mapper/general_mapper'
15
15
  require 'writer/solr_client'
16
16
  require 'writer/solr_writer'
17
17
 
18
- #require 'utilities/extract_sub_targets'
19
-
18
+ # require 'utilities/extract_sub_targets'
20
19
 
21
20
  module DiscoveryIndexer
22
21
  PURL_DEFAULT = 'http://purl.stanford.edu'
23
- end
22
+ end
data/lib/errors.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  module DiscoveryIndexer
2
2
  module Errors
3
- MissingPurlPage = Class.new(StandardError)
4
- MissingMods = Class.new(StandardError)
5
- MissingPublicXml = Class.new(StandardError)
6
- MissingContentMetadata = Class.new(StandardError)
7
- MissingIdentityMetadata = Class.new(StandardError)
8
- MissingRightsMetadata = Class.new(StandardError)
9
- MissingRDF = Class.new(StandardError)
3
+ MissingPurlPage = Class.new(StandardError)
4
+ MissingMods = Class.new(StandardError)
5
+ MissingPublicXml = Class.new(StandardError)
6
+ MissingContentMetadata = Class.new(StandardError)
7
+ MissingIdentityMetadata = Class.new(StandardError)
8
+ MissingRightsMetadata = Class.new(StandardError)
9
+ MissingRDF = Class.new(StandardError)
10
10
  MissingDC = Class.new(StandardError)
11
11
  MissingModsPage = Class.new(StandardError)
12
12
  end
13
- end
13
+ end
data/lib/logging.rb CHANGED
@@ -6,11 +6,8 @@ module DiscoveryIndexer
6
6
  def logger
7
7
  @logger ||= Logger.new(STDOUT)
8
8
  end
9
-
10
- def logger=(logger)
11
- @logger = logger
12
- end
13
- end
14
9
 
10
+ attr_writer :logger
11
+ end
15
12
  end
16
- end
13
+ end
@@ -1,29 +1,27 @@
1
1
  module DiscoveryIndexer
2
2
  module Mapper
3
3
  class GeneralMapper
4
-
5
4
  # Initializes an instance from IndexMapper
6
5
  # @param [String] druid e.g. ab123cd4567
7
6
  # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
8
7
  # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
9
8
  # @param [Hash] collection_data represents a hash of collection_druid and catkey
10
9
  # e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
11
- def initialize(druid, modsxml, purlxml, collection_data={})
10
+ def initialize(druid, modsxml, purlxml, collection_data = {})
12
11
  @druid = druid
13
12
  @modsxml = modsxml
14
13
  @purlxml = purlxml
15
14
  @collection_data = collection_data
16
15
  end
17
16
 
18
- # Create a Hash representing a Solr doc, with all MODS related fields populated.
17
+ # Create a Hash representing a Solr doc, with all MODS related fields populated.
19
18
  # @return [Hash] Hash representing the Solr document
20
- def convert_to_solr_doc()
19
+ def convert_to_solr_doc
21
20
  solr_doc = {}
22
21
  solr_doc[:id] = @druid
23
22
  solr_doc[:title] = @modsxml.sw_full_title
24
- return solr_doc
23
+ solr_doc
25
24
  end
26
25
  end
27
26
  end
28
27
  end
29
-
@@ -1,44 +1,40 @@
1
1
  require 'stanford-mods'
2
2
  module DiscoveryIndexer
3
3
  module InputXml
4
-
5
- # This class is the main class to access and parse the mods xml
4
+ # This class is the main class to access and parse the mods xml
6
5
  # as retrieved from PURL server
7
6
  # @example to run the code
8
7
  # druid = "aa111aa1111"
9
8
  # p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
10
9
  # model = p.load()
11
- #
12
- #
10
+ #
11
+ #
13
12
  class Modsxml
14
- # initializes a new object
13
+ # initializes a new object
15
14
  # @param druid [String] the druid object in the format "aa111aa1111"
16
15
  def initialize(druid)
17
16
  @druid = druid
18
- @modsxml_ng_doc = nil
17
+ @modsxml_ng_doc = nil
19
18
  end
20
19
 
21
20
  # loads the mods xml to stanford mods model for the fedora object defined in the druid,
22
- # it reads the mods xml once from PURL server, and repeat the parsing with each call
23
- # @return [Stanford::Mods::Record] represents the mods xml
24
- def load()
25
- if @modsxml_ng_doc.nil? then
26
- @modsxml_ng_doc = ModsxmlReader.read(@druid)
27
- end
28
-
21
+ # it reads the mods xml once from PURL server, and repeat the parsing with each call
22
+ # @return [Stanford::Mods::Record] represents the mods xml
23
+ def load
24
+ @modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?
25
+
29
26
  modsxml_model = Stanford::Mods::Record.new
30
27
  modsxml_model.from_nk_node(@modsxml_ng_doc)
31
- return modsxml_model
28
+ modsxml_model
32
29
  end
33
-
30
+
34
31
  # loads the mods xml to stanford mods model for the fedora object defined in the druid,
35
- # it reads the mods xml from PURL server with every call
36
- # @return [Stanford::Mods::Record] represents the mods xml
37
- def reload()
32
+ # it reads the mods xml from PURL server with every call
33
+ # @return [Stanford::Mods::Record] represents the mods xml
34
+ def reload
38
35
  @modsxml_ng_doc = ModsxmlReader.read(@druid)
39
- return load()
36
+ load
40
37
  end
41
-
42
38
  end
43
39
  end
44
40
  end
@@ -3,21 +3,18 @@ require 'open-uri'
3
3
  module DiscoveryIndexer
4
4
  module InputXml
5
5
  class ModsxmlReader
6
-
7
6
  # reads the mods xml for the fedora object that is defined, from the purl server
8
7
  # @param [String] druid e.g. ab123cd4567
9
8
  # @return [Nokogiri::XML::Document] the mods xml for the fedora object
10
9
  # @raise [DiscoveryIndexer::Errors::MissingModsPage] if there's no mods xml available for this druid
11
10
  def self.read(druid)
12
11
  mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"
13
-
14
12
  begin
15
- modsxml_ng_doc = Nokogiri::XML(open(mods_uri))
16
- return modsxml_ng_doc
13
+ Nokogiri::XML(open(mods_uri))
17
14
  rescue
18
15
  raise DiscoveryIndexer::Errors::MissingModsPage.new(mods_uri)
19
16
  end
20
17
  end
21
18
  end
22
19
  end
23
- end
20
+ end
@@ -1,43 +1,36 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
-
4
- # This class is the main class to access and parse the purl xml
3
+ # This class is the main class to access and parse the purl xml
5
4
  # as retrieved from PURL server
6
5
  # @example to run the code
7
6
  # druid = "aa111aa1111"
8
7
  # p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
9
8
  # model = p.load()
10
- #
11
9
  class Purlxml
12
-
13
- # initializes a new object
10
+ # initializes a new object
14
11
  # @param druid [String] the druid object in the format "aa111aa1111"
15
12
  def initialize(druid)
16
13
  @druid = druid
17
- @purlxml_ng_doc = nil
14
+ @purlxml_ng_doc = nil
18
15
  end
19
16
 
20
17
  # loads the purl xml to purlxml model for the fedora object defined in the druid,
21
- # it reads the purl xml once from PURL server, and repeat the parsing with each call
22
- # @return [PurlxmlModel] represents the purlxml
23
- def load()
24
- if @purlxml_ng_doc.nil? then
25
- @purlxml_ng_doc = PurlxmlReader.read(@druid)
26
- end
27
-
28
- purlxml_parser = PurlxmlParserStrict.new(@druid,@purlxml_ng_doc)
29
- purlxml_model = purlxml_parser.parse()
30
- return purlxml_model
18
+ # it reads the purl xml once from PURL server, and repeat the parsing with each call
19
+ # @return [PurlxmlModel] represents the purlxml
20
+ def load
21
+ @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
22
+ purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
23
+ purlxml_model = purlxml_parser.parse
24
+ purlxml_model
31
25
  end
32
-
26
+
33
27
  # loads the purl xml to purlxml model for the fedora object defined in the druid
34
- # it reads the purl xml from PURL server with every call
35
- # @return [PurlxmlModel] represents the purlxml
36
- def reload()
28
+ # it reads the purl xml from PURL server with every call
29
+ # @return [PurlxmlModel] represents the purlxml
30
+ def reload
37
31
  @purlxml_ng_doc = PurlxmlReader.read(@druid)
38
- return load()
32
+ load
39
33
  end
40
- end
34
+ end
41
35
  end
42
36
  end
43
-
@@ -1,47 +1,46 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
3
  class PurlxmlModel
4
-
5
- #@!attribute [rw] druid
6
- # @return [String] The druid value eg., ab123cd4567
7
- attr_accessor :druid
8
-
9
- #@!attribute [rw] public_xml
10
- # @return [Nokogiri::XML] The publix xml as retrieved from purl server
4
+ # @!attribute [rw] druid
5
+ # @return [String] The druid value eg., ab123cd4567
6
+ attr_accessor :druid
7
+
8
+ # @!attribute [rw] public_xml
9
+ # @return [Nokogiri::XML] The publix xml as retrieved from purl server
11
10
  attr_accessor :public_xml
12
11
 
13
- #@!attribute [rw] content_metadata
12
+ # @!attribute [rw] content_metadata
14
13
  # @return [Nokogiri::XML] The content_metadata as extracted from public xml
15
14
  attr_accessor :content_metadata
16
15
 
17
- #@!attribute [rw] identity_metadata
16
+ # @!attribute [rw] identity_metadata
18
17
  # @return [Nokogiri::XML] The identity_metadata as extracted from public xml
19
18
  attr_accessor :identity_metadata
20
19
 
21
- #@!attribute [rw] rights_metadata
20
+ # @!attribute [rw] rights_metadata
22
21
  # @return [Nokogiri::XML] The rights_metadata as extracted from public xml
23
22
  attr_accessor :rights_metadata
24
23
 
25
- #@!attribute [rw] dc
24
+ # @!attribute [rw] dc
26
25
  # @return [Nokogiri::XML] The dc element as extracted from public xml
27
26
  attr_accessor :dc
28
-
29
- #@!attribute [rw] rdf
27
+
28
+ # @!attribute [rw] rdf
30
29
  # @return [Nokogiri::XML] The rdf element as extracted from public xml
31
30
  attr_accessor :rdf
32
31
 
33
32
  # @!attribute [rw] release_tags_hash
34
33
  # @return [Hash] The release_tag in hash format as extracted from public xml
35
- # ReleaseData element.
34
+ # ReleaseData element.
36
35
  # @example
37
36
  # !{"target1"=>true, "target2"=>false}
38
- attr_accessor :release_tags_hash
37
+ attr_accessor :release_tags_hash
39
38
 
40
39
  # @!attribute [rw] dor_content_type
41
40
  # @return [String] The dor_content_type as extracted from public xml
42
41
  # content_metadata.
43
42
  attr_accessor :dor_content_type
44
-
43
+
45
44
  # @!attribute [rw] dor_display_type
46
45
  # @return [String] The displayType as extracted from public xml
47
46
  # identity_metadata.
@@ -50,25 +49,25 @@ module DiscoveryIndexer
50
49
  # @!attribute [rw] is_collection
51
50
  # @return [Boolean] true if the item type is collection in the identity_metadata
52
51
  attr_accessor :is_collection
53
-
52
+
54
53
  # @!attribute [rw] collection_druids
55
54
  # @return [Array] a list of the collections that this druid belongs to
56
55
  # @example
57
56
  # ["aa11aaa1111","bb111bb1111"]
58
57
  attr_accessor :collection_druids
59
-
58
+
60
59
  # @!attribute [rw] file_ids
61
60
  # @return [Array] a list of the file ids in the content_metadata
62
61
  # @example
63
- # ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
62
+ # ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
64
63
  attr_accessor :file_ids
65
64
 
66
65
  # @!attribute [rw] image_ids
67
66
  # @return [Array] a list of the image ids in the content_metadata
68
67
  # @example
69
- # ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
68
+ # ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
70
69
  attr_accessor :image_ids
71
-
70
+
72
71
  # @!attribute [rw] catkey
73
72
  # @return [String] the catkey attribute in identity_metadata
74
73
  attr_accessor :catkey
@@ -76,15 +75,15 @@ module DiscoveryIndexer
76
75
  # @!attribute [rw] barcode
77
76
  # @return [String] the barcode attribute in identity_metadata
78
77
  attr_accessor :barcode
79
-
78
+
80
79
  # @!attribute [rw] label
81
80
  # @return [String] the objectLabel attribute in identity_metadata
82
81
  attr_accessor :label
83
-
82
+
84
83
  # @!attribute [rw] copyright
85
84
  # @return [String] the copyright statement from rights metadata
86
85
  attr_accessor :copyright
87
-
86
+
88
87
  # @!attribute [rw] use_and_reproduction
89
88
  # @return [String] the use and reproduction statement from rights metadata
90
89
  attr_accessor :use_and_reproduction
@@ -92,13 +91,6 @@ module DiscoveryIndexer
92
91
  # @!attribute [rw] source_id
93
92
  # @return [String] the sourceid from identity metadata
94
93
  attr_accessor :source_id
95
-
96
94
  end
97
95
  end
98
96
  end
99
-
100
-
101
-
102
-
103
-
104
-
@@ -1,13 +1,12 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
3
  class PurlxmlParser
4
-
5
4
  def initialize(druid, purlxml_ng_doc)
6
5
  @purlxml_ng_doc = purlxml_ng_doc
7
6
  @druid = druid
8
7
  end
9
-
10
- def parse()
8
+
9
+ def parse
11
10
  end
12
11
  end
13
12
  end
@@ -3,154 +3,142 @@ module DiscoveryIndexer
3
3
  class PurlxmlParserStrict < PurlxmlParser
4
4
  include DiscoveryIndexer::Logging
5
5
 
6
-
7
6
  RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
8
7
  OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
9
8
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
10
9
 
11
10
  # it parses the purlxml into a purlxml model
12
11
  # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
13
- def parse()
12
+ def parse
14
13
  purlxml_model = PurlxmlModel.new
15
14
  purlxml_model.druid = @druid
16
15
  purlxml_model.public_xml = @purlxml_ng_doc
17
- purlxml_model.content_metadata = parse_content_metadata()
18
- purlxml_model.identity_metadata = parse_identity_metadata()
19
- purlxml_model.rights_metadata = parse_rights_metadata()
20
- purlxml_model.dc = parse_dc()
21
- purlxml_model.rdf = parse_rdf()
22
- purlxml_model.is_collection = parse_is_collection()
23
- purlxml_model.collection_druids = parse_collection_druids()
24
- purlxml_model.dor_content_type = parse_dor_content_type()
25
- purlxml_model.dor_display_type = parse_dor_display_type()
26
- purlxml_model.release_tags_hash = parse_release_tags_hash()
27
- purlxml_model.file_ids = parse_file_ids()
28
- purlxml_model.image_ids = parse_image_ids()
29
- purlxml_model.catkey = parse_catkey()
30
- purlxml_model.barcode = parse_barcode()
31
- purlxml_model.label = parse_label()
32
- purlxml_model.copyright = parse_copyright()
33
- purlxml_model.use_and_reproduction = parse_use_and_reproduction()
34
- purlxml_model.source_id = parse_sourceid()
35
- return purlxml_model
36
- end
37
-
16
+ purlxml_model.content_metadata = parse_content_metadata
17
+ purlxml_model.identity_metadata = parse_identity_metadata
18
+ purlxml_model.rights_metadata = parse_rights_metadata
19
+ purlxml_model.dc = parse_dc
20
+ purlxml_model.rdf = parse_rdf
21
+ purlxml_model.is_collection = parse_is_collection
22
+ purlxml_model.collection_druids = parse_collection_druids
23
+ purlxml_model.dor_content_type = parse_dor_content_type
24
+ purlxml_model.dor_display_type = parse_dor_display_type
25
+ purlxml_model.release_tags_hash = parse_release_tags_hash
26
+ purlxml_model.file_ids = parse_file_ids
27
+ purlxml_model.image_ids = parse_image_ids
28
+ purlxml_model.catkey = parse_catkey
29
+ purlxml_model.barcode = parse_barcode
30
+ purlxml_model.label = parse_label
31
+ purlxml_model.copyright = parse_copyright
32
+ purlxml_model.use_and_reproduction = parse_use_and_reproduction
33
+ purlxml_model.source_id = parse_sourceid
34
+ purlxml_model
35
+ end
36
+
38
37
  # extracts the identityMetadata for this fedora object, from the purl xml
39
38
  # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
40
39
  # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
41
40
  def parse_identity_metadata
42
- begin
43
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
44
- raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
45
- ng_doc
46
- rescue
47
- raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
48
- end
41
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
42
+ fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
43
+ ng_doc
44
+ rescue
45
+ raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
49
46
  end
50
-
51
- def parse_rights_metadata
52
- begin
53
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
54
- raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
55
- ng_doc
56
- rescue
57
- raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
58
- end
47
+
48
+ def parse_rights_metadata
49
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
50
+ fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
51
+ ng_doc
52
+ rescue
53
+ raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
59
54
  end
60
-
55
+
61
56
  # extracts the dc field for this fedora object, from the purl xml
62
57
  # @return [Nokogiri::XML::Document] the dc for the fedora object
63
58
  # @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
64
59
  def parse_dc
65
- begin
66
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', {'dc' => OAI_DC_NAMESPACE}).to_xml(:encoding => 'utf-8'))
67
- raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
68
- ng_doc
69
- rescue
70
- raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
71
- end
60
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
61
+ fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
62
+ ng_doc
63
+ rescue
64
+ raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
72
65
  end
73
-
66
+
74
67
  # extracts the rdf field for this fedora object, from the purl xml
75
68
  # @return [Nokogiri::XML::Document] the rdf for the fedora object
76
69
  # @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
77
70
  def parse_rdf
78
- begin
79
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', {'rdf' => RDF_NAMESPACE}).to_xml)
80
- raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
81
- ng_doc
82
- rescue
83
- raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
84
- end
71
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
72
+ fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
73
+ ng_doc
74
+ rescue
75
+ raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
85
76
  end
86
-
87
-
77
+
88
78
  # extracts the release tag element for this fedora object, from the ReleaseData element in purl xml
89
79
  # @return [Hash] the release tags for the fedora object
90
80
  def parse_release_tags_hash
91
- release_tags={}
92
- unless @purlxml_ng_doc.nil?
93
- release_elements = @purlxml_ng_doc.xpath('//ReleaseData/release')
94
- release_elements.each { |n|
95
- unless n.attr("to").nil?
96
- release_target = n.attr("to")
81
+ release_tags = {}
82
+ unless @purlxml_ng_doc.nil?
83
+ release_elements = @purlxml_ng_doc.xpath('//ReleaseData/release')
84
+ release_elements.each do |n|
85
+ unless n.attr('to').nil?
86
+ release_target = n.attr('to')
97
87
  text = n.text
98
- unless text.nil?
99
- release_tags[release_target]= to_boolean(text)
100
- end
88
+ release_tags[release_target] = to_boolean(text) unless text.nil?
101
89
  end
102
- }
90
+ end
103
91
  end
104
- return release_tags
92
+ release_tags
105
93
  end
106
-
94
+
107
95
  # extracts the contentMetadata for this fedora object, from the purl xml
108
96
  # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
109
97
  # @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
110
98
  def parse_content_metadata
111
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
112
- ng_doc = nil if !ng_doc || ng_doc.children.empty?
113
- ng_doc
99
+ ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
100
+ ng_doc = nil if !ng_doc || ng_doc.children.empty?
101
+ ng_doc
114
102
  end
115
-
103
+
116
104
  # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
117
105
  def parse_is_collection
118
- identity_metadata = parse_identity_metadata
106
+ identity_metadata = parse_identity_metadata
119
107
  unless identity_metadata.nil?
120
108
  object_type_nodes = identity_metadata.xpath('//objectType')
121
- return true if object_type_nodes.find_index { |n| ['collection','set'].include? n.text.downcase}
109
+ return true if object_type_nodes.find_index { |n| %w(collection set).include? n.text.downcase }
122
110
  end
123
111
  false
124
112
  end
125
-
113
+
126
114
  # get the druids from isMemberOfCollection relationships in rels-ext from public_xml
127
115
  # @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfCollection relationship with, or nil if none
128
116
  def parse_collection_druids
129
- ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
117
+ ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
130
118
  is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
131
119
  # from public_xml rels-ext
132
120
  druids = []
133
- is_member_of_nodes.each { |n|
121
+ is_member_of_nodes.each do |n|
134
122
  druids << n.value.split('druid:').last unless n.value.empty?
135
- }
123
+ end
136
124
  return nil if druids.empty?
137
125
  druids
138
126
  end
139
-
127
+
140
128
  # the value of the type attribute for a DOR object's contentMetadata
141
129
  # more info about these values is here:
142
130
  # https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
143
131
  # https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
144
- # @return [String]
132
+ # @return [String]
145
133
  def parse_dor_content_type
146
134
  content_md = parse_content_metadata
147
135
  dct = content_md ? content_md.xpath('contentMetadata/@type').text : nil
148
136
  DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
149
137
  dct
150
138
  end
151
-
139
+
152
140
  # the value of the displayType tag from a DOR collection's identityMetadata
153
- # @return [String]
141
+ # @return [String]
154
142
  def parse_dor_display_type
155
143
  identity_md = parse_identity_metadata
156
144
  ddt = identity_md ? identity_md.xpath('//displayType').text : nil
@@ -161,43 +149,44 @@ module DiscoveryIndexer
161
149
  # the @id attribute of resource/file elements that match the image type, including extension
162
150
  # @return [Array<String>] filenames
163
151
  def parse_image_ids
164
- ids = []
165
- content_md = parse_content_metadata
166
- unless content_md.nil?
167
- content_md.xpath('//resource[@type="image"]/file/@id').each { |node|
168
- ids << node.text if !node.text.empty?
169
- }
170
- return nil if ids.empty?
171
- ids
152
+ ids = []
153
+ content_md = parse_content_metadata
154
+ return nil if content_md.nil?
155
+ content_md.xpath('//resource[@type="image"]/file/@id').each do |node|
156
+ ids << node.text unless node.text.empty?
172
157
  end
158
+ content_md.xpath('//resource[@type="page"]/file/@id').each do |node|
159
+ ids << node.text unless node.text.empty?
160
+ end
161
+ return nil if ids.empty?
162
+ ids
173
163
  end
174
164
 
175
165
  def parse_sourceid
176
166
  get_value(@purlxml_ng_doc.css('//identityMetadata/sourceId'))
177
167
  end
178
-
168
+
179
169
  def parse_copyright
180
170
  get_value(@purlxml_ng_doc.css('//rightsMetadata/copyright/human[type="copyright"]'))
181
171
  end
182
-
172
+
183
173
  def parse_use_and_reproduction
184
174
  get_value(@purlxml_ng_doc.css('//rightsMetadata/use/human[type="useAndReproduction"]'))
185
175
  end
186
-
176
+
187
177
  # the @id attribute of resource/file elements, including extension
188
178
  # @return [Array<String>] filenames
189
179
  def parse_file_ids
190
180
  ids = []
191
181
  content_md = parse_content_metadata
192
- unless content_md.nil?
193
- content_md.xpath('//resource/file/@id').each { |node|
194
- ids << node.text if !node.text.empty?
195
- }
196
- return nil if ids.empty?
197
- ids
198
- end
199
- end
200
-
182
+ return unless content_md.nil?
183
+ content_md.xpath('//resource/file/@id').each do |node|
184
+ ids << node.text unless node.text.empty?
185
+ end
186
+ return nil if ids.empty?
187
+ ids
188
+ end
189
+
201
190
  # @return catkey value from the DOR identity_metadata, or nil if there is no catkey
202
191
  def parse_catkey
203
192
  get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']"))
@@ -210,23 +199,22 @@ module DiscoveryIndexer
210
199
 
211
200
  # @return objectLabel value from the DOR identity_metadata, or nil if there is no objectLabel
212
201
  def parse_label
213
- get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/objectLabel"))
202
+ get_value(@purlxml_ng_doc.xpath('/publicObject/identityMetadata/objectLabel'))
214
203
  end
215
-
204
+
216
205
  def get_value(node)
217
- (node && node.first) ? node.first.content : nil
206
+ (node && node.first) ? node.first.content : nil
218
207
  end
219
-
208
+
220
209
  def to_boolean(text)
221
- if text.nil? || text.empty? then
210
+ if text.nil? || text.empty?
222
211
  return false
223
- elsif text.downcase.eql?("true") || text.downcase == "t" then
212
+ elsif text.downcase.eql?('true') || text.downcase == 't'
224
213
  return true
225
214
  else
226
215
  return false
227
216
  end
228
- end
217
+ end
229
218
  end
230
219
  end
231
220
  end
232
-
@@ -3,14 +3,13 @@ require 'open-uri'
3
3
  module DiscoveryIndexer
4
4
  module InputXml
5
5
  class PurlxmlReader
6
-
7
6
  # reads the public xml for the fedora object that is defined, from the purl server
8
7
  # @param [String] druid e.g. ab123cd4567
9
8
  # @return [Nokogiri::XML::Document] the public xml for the fedora object
10
9
  # @raise [MissingPublicXml] if there's no purl xml available for this druid
11
10
  def self.read(druid)
12
11
  purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
13
-
12
+
14
13
  begin
15
14
  purlxml_object = Nokogiri::XML(open(purlxml_uri))
16
15
  return purlxml_object
@@ -20,4 +19,4 @@ module DiscoveryIndexer
20
19
  end
21
20
  end
22
21
  end
23
- end
22
+ end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.9.6'
2
+ VERSION = '0.9.7'
3
3
  end
@@ -3,6 +3,7 @@ require 'rsolr'
3
3
  require 'rest-client'
4
4
  module DiscoveryIndexer
5
5
  module Writer
6
+ # Processes adds and deletes to the solr core
6
7
  class SolrClient
7
8
  include DiscoveryIndexer::Logging
8
9
 
@@ -13,7 +14,7 @@ module DiscoveryIndexer
13
14
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
14
15
  # @param max_retries [Integer] the maximum number of tries before fail
15
16
  def self.add(id, solr_doc, solr_connector, max_retries = 10)
16
- process(id, solr_doc, solr_connector, max_retries, is_delete=false)
17
+ process(id, solr_doc, solr_connector, max_retries, false)
17
18
  end
18
19
 
19
20
  # Delete the document from solr, retry if an error occurs.
@@ -22,79 +23,76 @@ module DiscoveryIndexer
22
23
  # @param solr_connector[RSolr::Client] is an open connection with the solr core
23
24
  # @param max_retries [Integer] the maximum number of tries before fail
24
25
  def self.delete(id, solr_connector, max_retries = 10)
25
- process(id, {}, solr_connector, max_retries, is_delete=true)
26
+ process(id, {}, solr_connector, max_retries, true)
26
27
  end
27
28
 
28
29
  # It's an internal method that receives all the requests and deals with
29
30
  # SOLR core. This method can call add, delete, or update
30
31
  #
31
32
  # @param id [String] the document id, usually it will be druid.
32
- # @param solr_doc [Hash] is the solr doc in hash format
33
+ # @param solr_doc [Hash] is the solr doc in hash format
33
34
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
34
35
  # @param max_retries [Integer] the maximum number of tries before fail
35
- def self.process(id, solr_doc, solr_connector, max_retries, is_delete=false)
36
- handler = Proc.new do |exception, attempt_number, total_delay|
36
+ def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
37
+ handler = proc do |exception, attempt_number, _total_delay|
37
38
  DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
38
39
  end
39
-
40
- with_retries(:max_tries => max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
40
+
41
+ with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
41
42
  DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
42
-
43
+
43
44
  if is_delete
44
45
  DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
45
46
  solr_connector.delete_by_id(id)
46
- elsif allow_update?(solr_connector) && doc_exists?(id,solr_connector)
47
+ elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
47
48
  DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
48
- update_solr_doc(id,solr_doc,solr_connector)
49
+ update_solr_doc(id, solr_doc, solr_connector)
49
50
  else
50
51
  DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
51
52
  solr_connector.add(solr_doc)
52
53
  end
53
54
  solr_connector.commit
54
55
  DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
55
-
56
56
  end
57
57
  end
58
58
 
59
59
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
60
60
  # @return [Boolean] true if the solr core allows the update feature
61
61
  def self.allow_update?(solr_connector)
62
- return solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
62
+ solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
63
63
  end
64
64
 
65
65
  # @param id [String] the document id, usually it will be druid.
66
66
  # @param solr_connector [RSolr::Client] is an open connection with the solr core
67
67
  # @return [Boolean] true if the solr doc defined by this id exists
68
- def self.doc_exists?(id,solr_connector)
69
- response=solr_connector.get 'select', :params=>{:q=>'id:"' + id + '"'}
68
+ def self.doc_exists?(id, solr_connector)
69
+ response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
70
70
  response['response']['numFound'] == 1
71
71
  end
72
-
72
+
73
73
  # It is an internal method that updates the solr doc instead of adding a new one.
74
- def self.update_solr_doc(id,solr_doc,solr_connector)
74
+ def self.update_solr_doc(id, solr_doc, solr_connector)
75
75
  # update_solr_doc can't used RSolr because updating hash doc is not supported
76
76
  # so we need to build the json input manually
77
77
  solr_url = solr_connector.options[:url]
78
- if solr_url.end_with?("/") then
79
- url="#{solr_connector.options[:url]}update?commit=true"
78
+ if solr_url.end_with?('/')
79
+ url = "#{solr_connector.options[:url]}update?commit=true"
80
80
  else
81
- url="#{solr_connector.options[:url]}/update?commit=true"
81
+ url = "#{solr_connector.options[:url]}/update?commit=true"
82
82
  end
83
-
84
- params="[{\"id\":\"#{id}\","
85
- solr_doc.each do |field_name,new_values|
86
- unless field_name == :id
87
- params+="\"#{field_name}\":"
88
- new_values=[new_values] unless new_values.class==Array
89
- new_values = new_values.map {|s| s.to_s.gsub("\\","\\\\\\").gsub('"','\"').strip} # strip leading/trailing spaces and escape quotes for each value
90
- params+="{\"set\":[\"#{new_values.join('","')}\"]},"
91
- end
83
+
84
+ params = "[{\"id\":\"#{id}\","
85
+ solr_doc.each do |field_name, new_values|
86
+ next unless field_name == :id
87
+ params += "\"#{field_name}\":"
88
+ new_values = [new_values] unless new_values.class == Array
89
+ new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
90
+ params += "{\"set\":[\"#{new_values.join('","')}\"]},"
92
91
  end
93
92
  params.chomp!(',')
94
- params+="}]"
95
- RestClient.post url, params,:content_type => :json, :accept=>:json
93
+ params += '}]'
94
+ RestClient.post url, params, content_type: :json, accept: :json
96
95
  end
97
-
98
96
  end
99
97
  end
100
- end
98
+ end
@@ -3,57 +3,57 @@ require 'rsolr'
3
3
 
4
4
  module DiscoveryIndexer
5
5
  module Writer
6
+ # Performs writes to solr client based upon true and false release flags
6
7
  class SolrWriter
7
8
  include DiscoveryIndexer::Logging
8
-
9
+
9
10
  def process(id, index_doc, targets, solr_targets_configs)
10
11
  @solr_targets_configs = solr_targets_configs
11
12
  index_targets = []
12
13
  delete_targets = []
13
- targets.keys.each do |target|
14
- if targets[target] then
14
+ targets.keys.each do |target|
15
+ if targets[target]
15
16
  index_targets.append(target)
16
17
  else
17
18
  delete_targets.append(target)
18
19
  end
19
20
  end
20
-
21
+
21
22
  # get targets with true
22
23
  solr_index_client(id, index_doc, index_targets)
23
24
  # get targets with false
24
25
  solr_delete_client(id, delete_targets)
25
26
  end
26
-
27
+
27
28
  def solr_delete_from_all(id, solr_targets_configs)
28
29
  # Get a list of all registered targets
29
- @solr_targets_configs=solr_targets_configs
30
- targets = @solr_targets_configs.keys()
30
+ @solr_targets_configs = solr_targets_configs
31
+ targets = @solr_targets_configs.keys
31
32
  solr_delete_client(id, targets)
32
33
  end
33
-
34
+
34
35
  def solr_index_client(id, index_doc, targets)
35
36
  targets.each do |solr_target|
36
- solr_connector = get_connector_for_target(solr_target)
37
+ solr_connector = get_connector_for_target(solr_target)
37
38
  SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
38
- end
39
+ end
39
40
  end
40
-
41
+
41
42
  def solr_delete_client(id, targets)
42
43
  targets.each do |solr_target|
43
44
  solr_connector = get_connector_for_target(solr_target)
44
45
  SolrClient.delete(id, solr_connector) unless solr_connector.nil?
45
- end
46
+ end
46
47
  end
47
48
 
48
49
  def get_connector_for_target(solr_target)
49
50
  solr_connector = nil
50
- if @solr_targets_configs.keys.include?(solr_target) then
51
+ if @solr_targets_configs.keys.include?(solr_target)
51
52
  config = @solr_targets_configs[solr_target]
52
53
  solr_connector = RSolr.connect(config.deep_symbolize_keys)
53
54
  end
54
- return solr_connector
55
+ solr_connector
55
56
  end
56
-
57
57
  end
58
58
  end
59
59
  end
metadata CHANGED
@@ -1,14 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.6
4
+ version: 0.9.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
8
+ - Laney McGlohon
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2015-06-30 00:00:00.000000000 Z
12
+ date: 2015-09-23 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: nokogiri
@@ -138,7 +139,7 @@ dependencies:
138
139
  version: '0'
139
140
  description: This library manages the core operations for the discovery indexing such
140
141
  as reading PURL xml, mapping to the solr document, and writing to solr core.
141
- email: aalsum@stanford.edu
142
+ email: laneymcg@stanford.edu
142
143
  executables: []
143
144
  extensions: []
144
145
  extra_rdoc_files: []