discovery-indexer 0.9.6 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/discovery-indexer.rb +2 -3
- data/lib/errors.rb +8 -8
- data/lib/logging.rb +3 -6
- data/lib/mapper/general_mapper.rb +4 -6
- data/lib/reader/modsxml.rb +16 -20
- data/lib/reader/modsxml_reader.rb +2 -5
- data/lib/reader/purlxml.rb +16 -23
- data/lib/reader/purlxml_model.rb +23 -31
- data/lib/reader/purlxml_parser.rb +2 -3
- data/lib/reader/purlxml_parser_strict.rb +99 -111
- data/lib/reader/purlxml_reader.rb +2 -3
- data/lib/version.rb +1 -1
- data/lib/writer/solr_client.rb +30 -32
- data/lib/writer/solr_writer.rb +15 -15
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9cddff2ef4144cc3d17af7bd71bdb6f68840d38c
|
4
|
+
data.tar.gz: ca19a7acbb80341c1d6440567e305ef0bf170542
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 65fa148806dcf8dc498b262cbc3acf811c89fcad0d81397099009e0e8eb368577477e50c9c5bbee7e0921e11b17542cf4857b2aff4e54a319f5f97ea22e1bb8f
|
7
|
+
data.tar.gz: a28465c230937c24168114d91f41a58e59492a16b0b479ab8e4ae41429c4682dc7eec99a4b796ff93869d37b7e34ed78a6d7485ff1500ee2bc91e99e3a6e070f
|
data/lib/discovery-indexer.rb
CHANGED
@@ -15,9 +15,8 @@ require 'mapper/general_mapper'
|
|
15
15
|
require 'writer/solr_client'
|
16
16
|
require 'writer/solr_writer'
|
17
17
|
|
18
|
-
#require 'utilities/extract_sub_targets'
|
19
|
-
|
18
|
+
# require 'utilities/extract_sub_targets'
|
20
19
|
|
21
20
|
module DiscoveryIndexer
|
22
21
|
PURL_DEFAULT = 'http://purl.stanford.edu'
|
23
|
-
end
|
22
|
+
end
|
data/lib/errors.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module Errors
|
3
|
-
MissingPurlPage = Class.new(StandardError)
|
4
|
-
MissingMods = Class.new(StandardError)
|
5
|
-
MissingPublicXml = Class.new(StandardError)
|
6
|
-
MissingContentMetadata = Class.new(StandardError)
|
7
|
-
MissingIdentityMetadata = Class.new(StandardError)
|
8
|
-
MissingRightsMetadata = Class.new(StandardError)
|
9
|
-
MissingRDF = Class.new(StandardError)
|
3
|
+
MissingPurlPage = Class.new(StandardError)
|
4
|
+
MissingMods = Class.new(StandardError)
|
5
|
+
MissingPublicXml = Class.new(StandardError)
|
6
|
+
MissingContentMetadata = Class.new(StandardError)
|
7
|
+
MissingIdentityMetadata = Class.new(StandardError)
|
8
|
+
MissingRightsMetadata = Class.new(StandardError)
|
9
|
+
MissingRDF = Class.new(StandardError)
|
10
10
|
MissingDC = Class.new(StandardError)
|
11
11
|
MissingModsPage = Class.new(StandardError)
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
data/lib/logging.rb
CHANGED
@@ -1,29 +1,27 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module Mapper
|
3
3
|
class GeneralMapper
|
4
|
-
|
5
4
|
# Initializes an instance from IndexMapper
|
6
5
|
# @param [String] druid e.g. ab123cd4567
|
7
6
|
# @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
|
8
7
|
# @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
|
9
8
|
# @param [Hash] collection_data represents a hash of collection_druid and catkey
|
10
9
|
# e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
|
11
|
-
def initialize(druid, modsxml, purlxml, collection_data={})
|
10
|
+
def initialize(druid, modsxml, purlxml, collection_data = {})
|
12
11
|
@druid = druid
|
13
12
|
@modsxml = modsxml
|
14
13
|
@purlxml = purlxml
|
15
14
|
@collection_data = collection_data
|
16
15
|
end
|
17
16
|
|
18
|
-
# Create a Hash representing a Solr doc, with all MODS related fields populated.
|
17
|
+
# Create a Hash representing a Solr doc, with all MODS related fields populated.
|
19
18
|
# @return [Hash] Hash representing the Solr document
|
20
|
-
def convert_to_solr_doc
|
19
|
+
def convert_to_solr_doc
|
21
20
|
solr_doc = {}
|
22
21
|
solr_doc[:id] = @druid
|
23
22
|
solr_doc[:title] = @modsxml.sw_full_title
|
24
|
-
|
23
|
+
solr_doc
|
25
24
|
end
|
26
25
|
end
|
27
26
|
end
|
28
27
|
end
|
29
|
-
|
data/lib/reader/modsxml.rb
CHANGED
@@ -1,44 +1,40 @@
|
|
1
1
|
require 'stanford-mods'
|
2
2
|
module DiscoveryIndexer
|
3
3
|
module InputXml
|
4
|
-
|
5
|
-
# This class is the main class to access and parse the mods xml
|
4
|
+
# This class is the main class to access and parse the mods xml
|
6
5
|
# as retrieved from PURL server
|
7
6
|
# @example to run the code
|
8
7
|
# druid = "aa111aa1111"
|
9
8
|
# p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
|
10
9
|
# model = p.load()
|
11
|
-
#
|
12
|
-
#
|
10
|
+
#
|
11
|
+
#
|
13
12
|
class Modsxml
|
14
|
-
# initializes a new object
|
13
|
+
# initializes a new object
|
15
14
|
# @param druid [String] the druid object in the format "aa111aa1111"
|
16
15
|
def initialize(druid)
|
17
16
|
@druid = druid
|
18
|
-
@modsxml_ng_doc = nil
|
17
|
+
@modsxml_ng_doc = nil
|
19
18
|
end
|
20
19
|
|
21
20
|
# loads the mods xml to stanford mods model for the fedora object defind in the druid,
|
22
|
-
#
|
23
|
-
# @return [Stanford::Mods::Record] represents the mods xml
|
24
|
-
def load
|
25
|
-
if @modsxml_ng_doc.nil?
|
26
|
-
|
27
|
-
end
|
28
|
-
|
21
|
+
# it reads the mods xml once from PURL server, and repeat the parsing with each call
|
22
|
+
# @return [Stanford::Mods::Record] represents the mods xml
|
23
|
+
def load
|
24
|
+
@modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?
|
25
|
+
|
29
26
|
modsxml_model = Stanford::Mods::Record.new
|
30
27
|
modsxml_model.from_nk_node(@modsxml_ng_doc)
|
31
|
-
|
28
|
+
modsxml_model
|
32
29
|
end
|
33
|
-
|
30
|
+
|
34
31
|
# loads the mods xml to stanford mods model for the fedora object defind in the druid,
|
35
|
-
#
|
36
|
-
# @return [Stanford::Mods::Record] represents the mods xml
|
37
|
-
def reload
|
32
|
+
# it reads the mods xml from PURL server with every call
|
33
|
+
# @return [Stanford::Mods::Record] represents the mods xml
|
34
|
+
def reload
|
38
35
|
@modsxml_ng_doc = ModsxmlReader.read(@druid)
|
39
|
-
|
36
|
+
load
|
40
37
|
end
|
41
|
-
|
42
38
|
end
|
43
39
|
end
|
44
40
|
end
|
@@ -3,21 +3,18 @@ require 'open-uri'
|
|
3
3
|
module DiscoveryIndexer
|
4
4
|
module InputXml
|
5
5
|
class ModsxmlReader
|
6
|
-
|
7
6
|
# reads the mods xml for the fedora object that is defined , from the purl server
|
8
7
|
# @param [String] druid e.g. ab123cd4567
|
9
8
|
# @return [Nokogiri::XML::Document] the mods xml for the fedora object
|
10
9
|
# @raise [MissingModsXml] if there's no mods xml available for this druid
|
11
10
|
def self.read(druid)
|
12
11
|
mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"
|
13
|
-
|
14
12
|
begin
|
15
|
-
|
16
|
-
return modsxml_ng_doc
|
13
|
+
Nokogiri::XML(open(mods_uri))
|
17
14
|
rescue
|
18
15
|
raise DiscoveryIndexer::Errors::MissingModsPage.new(mods_uri)
|
19
16
|
end
|
20
17
|
end
|
21
18
|
end
|
22
19
|
end
|
23
|
-
end
|
20
|
+
end
|
data/lib/reader/purlxml.rb
CHANGED
@@ -1,43 +1,36 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module InputXml
|
3
|
-
|
4
|
-
# This class is the main class to access and parse the purl xml
|
3
|
+
# This class is the main class to access and parse the purl xml
|
5
4
|
# as retrieved from PURL server
|
6
5
|
# @example to run the code
|
7
6
|
# druid = "aa111aa1111"
|
8
7
|
# p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
|
9
8
|
# model = p.load()
|
10
|
-
#
|
11
9
|
class Purlxml
|
12
|
-
|
13
|
-
# initializes a new object
|
10
|
+
# initializes a new object
|
14
11
|
# @param druid [String] the druid object in the format "aa111aa1111"
|
15
12
|
def initialize(druid)
|
16
13
|
@druid = druid
|
17
|
-
@purlxml_ng_doc = nil
|
14
|
+
@purlxml_ng_doc = nil
|
18
15
|
end
|
19
16
|
|
20
17
|
# loads the purl xml to purlxml model for the fedora object defind in the druid,
|
21
|
-
#
|
22
|
-
# @return [PurlxmlModel] represents the purlxml
|
23
|
-
def load
|
24
|
-
if @purlxml_ng_doc.nil?
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
purlxml_parser = PurlxmlParserStrict.new(@druid,@purlxml_ng_doc)
|
29
|
-
purlxml_model = purlxml_parser.parse()
|
30
|
-
return purlxml_model
|
18
|
+
# it reads the purl xml once from PURL server, and repeat the parsing with each call
|
19
|
+
# @return [PurlxmlModel] represents the purlxml
|
20
|
+
def load
|
21
|
+
@purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
|
22
|
+
purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
|
23
|
+
purlxml_model = purlxml_parser.parse
|
24
|
+
purlxml_model
|
31
25
|
end
|
32
|
-
|
26
|
+
|
33
27
|
# loads the purl xml to purlxml model for the fedora object defind in the druid
|
34
|
-
#
|
35
|
-
# @return [PurlxmlModel] represents the purlxml
|
36
|
-
def reload
|
28
|
+
# it reads the purl xml from PURL server with every call
|
29
|
+
# @return [PurlxmlModel] represents the purlxml
|
30
|
+
def reload
|
37
31
|
@purlxml_ng_doc = PurlxmlReader.read(@druid)
|
38
|
-
|
32
|
+
load
|
39
33
|
end
|
40
|
-
|
34
|
+
end
|
41
35
|
end
|
42
36
|
end
|
43
|
-
|
data/lib/reader/purlxml_model.rb
CHANGED
@@ -1,47 +1,46 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module InputXml
|
3
3
|
class PurlxmlModel
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
# @return [Nokogiri::XML] The publix xml as retrieved from purl server
|
4
|
+
# @!attribute [rw] druid
|
5
|
+
# @return [String] The druid value eg., ab123cd4567
|
6
|
+
attr_accessor :druid
|
7
|
+
|
8
|
+
# @!attribute [rw] public_xml
|
9
|
+
# @return [Nokogiri::XML] The publix xml as retrieved from purl server
|
11
10
|
attr_accessor :public_xml
|
12
11
|
|
13
|
-
|
12
|
+
# @!attribute [rw] content_metadata
|
14
13
|
# @return [Nokogiri::XML] The content_metadata as extracted from public xml
|
15
14
|
attr_accessor :content_metadata
|
16
15
|
|
17
|
-
|
16
|
+
# @!attribute [rw] identity_metadata
|
18
17
|
# @return [Nokogiri::XML] The identity_metadata as extracted from public xml
|
19
18
|
attr_accessor :identity_metadata
|
20
19
|
|
21
|
-
|
20
|
+
# @!attribute [rw] rights_metadata
|
22
21
|
# @return [Nokogiri::XML] The rights_metadata as extracted from public xml
|
23
22
|
attr_accessor :rights_metadata
|
24
23
|
|
25
|
-
|
24
|
+
# @!attribute [rw] dc
|
26
25
|
# @return [Nokogiri::XML] The dc element as extracted from public xml
|
27
26
|
attr_accessor :dc
|
28
|
-
|
29
|
-
|
27
|
+
|
28
|
+
# @!attribute [rw] rdf
|
30
29
|
# @return [Nokogiri::XML] The rdf element as extracted from public xml
|
31
30
|
attr_accessor :rdf
|
32
31
|
|
33
32
|
# @!attribute [rw] release_tags_hash
|
34
33
|
# @return [Hash] The release_tag in hash format as extracted from public xml
|
35
|
-
# ReleaseData element.
|
34
|
+
# ReleaseData element.
|
36
35
|
# @example
|
37
36
|
# !{"target1"=>true, "target2"=>false}
|
38
|
-
attr_accessor :release_tags_hash
|
37
|
+
attr_accessor :release_tags_hash
|
39
38
|
|
40
39
|
# @!attribute [rw] dor_content_type
|
41
40
|
# @return [String] The dor_content_type as extracted from public xml
|
42
41
|
# content_metadata.
|
43
42
|
attr_accessor :dor_content_type
|
44
|
-
|
43
|
+
|
45
44
|
# @!attribute [rw] dor_display_type
|
46
45
|
# @return [String] The displayType as extracted from public xml
|
47
46
|
# identity_metadata.
|
@@ -50,25 +49,25 @@ module DiscoveryIndexer
|
|
50
49
|
# @!attribute [rw] is_collection
|
51
50
|
# @return [Boolean] true if the item type is collection in the identity_metadata
|
52
51
|
attr_accessor :is_collection
|
53
|
-
|
52
|
+
|
54
53
|
# @!attribute [rw] collection_druids
|
55
54
|
# @return [Array] a list of the collections that this is druid belongs to
|
56
55
|
# @example
|
57
56
|
# ["aa11aaa1111","bb111bb1111"]
|
58
57
|
attr_accessor :collection_druids
|
59
|
-
|
58
|
+
|
60
59
|
# @!attribute [rw] file_ids
|
61
60
|
# @return [Array] a list of the file ids in the content_metadata
|
62
61
|
# @example
|
63
|
-
# ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
|
62
|
+
# ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
|
64
63
|
attr_accessor :file_ids
|
65
64
|
|
66
65
|
# @!attribute [rw] image_ids
|
67
66
|
# @return [Array] a list of the image ids in the content_metadata
|
68
67
|
# @example
|
69
|
-
# ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
|
68
|
+
# ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
|
70
69
|
attr_accessor :image_ids
|
71
|
-
|
70
|
+
|
72
71
|
# @!attribute [rw] catkey
|
73
72
|
# @return [String] the catkey attribute in identity_metadata
|
74
73
|
attr_accessor :catkey
|
@@ -76,15 +75,15 @@ module DiscoveryIndexer
|
|
76
75
|
# @!attribute [rw] barcode
|
77
76
|
# @return [String] the barcode attribute in identity_metadata
|
78
77
|
attr_accessor :barcode
|
79
|
-
|
78
|
+
|
80
79
|
# @!attribute [rw] label
|
81
80
|
# @return [String] the objectLabel attribute in identity_metadata
|
82
81
|
attr_accessor :label
|
83
|
-
|
82
|
+
|
84
83
|
# @!attribute [rw] copyright
|
85
84
|
# @return [String] the copyright statement from rights metadata
|
86
85
|
attr_accessor :copyright
|
87
|
-
|
86
|
+
|
88
87
|
# @!attribute [rw] use_and_reproduction
|
89
88
|
# @return [String] the use and reproduction statement from rights metadata
|
90
89
|
attr_accessor :use_and_reproduction
|
@@ -92,13 +91,6 @@ module DiscoveryIndexer
|
|
92
91
|
# @!attribute [rw] source_id
|
93
92
|
# @return [String] the sourceid from identity metadata
|
94
93
|
attr_accessor :source_id
|
95
|
-
|
96
94
|
end
|
97
95
|
end
|
98
96
|
end
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
@@ -3,154 +3,142 @@ module DiscoveryIndexer
|
|
3
3
|
class PurlxmlParserStrict < PurlxmlParser
|
4
4
|
include DiscoveryIndexer::Logging
|
5
5
|
|
6
|
-
|
7
6
|
RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
8
7
|
OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
|
9
8
|
MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
|
10
9
|
|
11
10
|
# it parses the purlxml into a purlxml model
|
12
11
|
# @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
|
13
|
-
def parse
|
12
|
+
def parse
|
14
13
|
purlxml_model = PurlxmlModel.new
|
15
14
|
purlxml_model.druid = @druid
|
16
15
|
purlxml_model.public_xml = @purlxml_ng_doc
|
17
|
-
purlxml_model.content_metadata = parse_content_metadata
|
18
|
-
purlxml_model.identity_metadata = parse_identity_metadata
|
19
|
-
purlxml_model.rights_metadata = parse_rights_metadata
|
20
|
-
purlxml_model.dc = parse_dc
|
21
|
-
purlxml_model.rdf = parse_rdf
|
22
|
-
purlxml_model.is_collection = parse_is_collection
|
23
|
-
purlxml_model.collection_druids = parse_collection_druids
|
24
|
-
purlxml_model.dor_content_type = parse_dor_content_type
|
25
|
-
purlxml_model.dor_display_type = parse_dor_display_type
|
26
|
-
purlxml_model.release_tags_hash = parse_release_tags_hash
|
27
|
-
purlxml_model.file_ids = parse_file_ids
|
28
|
-
purlxml_model.image_ids = parse_image_ids
|
29
|
-
purlxml_model.catkey = parse_catkey
|
30
|
-
purlxml_model.barcode = parse_barcode
|
31
|
-
purlxml_model.label = parse_label
|
32
|
-
purlxml_model.copyright = parse_copyright
|
33
|
-
purlxml_model.use_and_reproduction = parse_use_and_reproduction
|
34
|
-
purlxml_model.source_id
|
35
|
-
|
36
|
-
end
|
37
|
-
|
16
|
+
purlxml_model.content_metadata = parse_content_metadata
|
17
|
+
purlxml_model.identity_metadata = parse_identity_metadata
|
18
|
+
purlxml_model.rights_metadata = parse_rights_metadata
|
19
|
+
purlxml_model.dc = parse_dc
|
20
|
+
purlxml_model.rdf = parse_rdf
|
21
|
+
purlxml_model.is_collection = parse_is_collection
|
22
|
+
purlxml_model.collection_druids = parse_collection_druids
|
23
|
+
purlxml_model.dor_content_type = parse_dor_content_type
|
24
|
+
purlxml_model.dor_display_type = parse_dor_display_type
|
25
|
+
purlxml_model.release_tags_hash = parse_release_tags_hash
|
26
|
+
purlxml_model.file_ids = parse_file_ids
|
27
|
+
purlxml_model.image_ids = parse_image_ids
|
28
|
+
purlxml_model.catkey = parse_catkey
|
29
|
+
purlxml_model.barcode = parse_barcode
|
30
|
+
purlxml_model.label = parse_label
|
31
|
+
purlxml_model.copyright = parse_copyright
|
32
|
+
purlxml_model.use_and_reproduction = parse_use_and_reproduction
|
33
|
+
purlxml_model.source_id = parse_sourceid
|
34
|
+
purlxml_model
|
35
|
+
end
|
36
|
+
|
38
37
|
# extracts the identityMetadata for this fedora object, from the purl xml
|
39
38
|
# @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
|
40
39
|
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
|
41
40
|
def parse_identity_metadata
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
|
48
|
-
end
|
41
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
|
42
|
+
fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
43
|
+
ng_doc
|
44
|
+
rescue
|
45
|
+
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
|
49
46
|
end
|
50
|
-
|
51
|
-
def parse_rights_metadata
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
|
58
|
-
end
|
47
|
+
|
48
|
+
def parse_rights_metadata
|
49
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
|
50
|
+
fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
51
|
+
ng_doc
|
52
|
+
rescue
|
53
|
+
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
|
59
54
|
end
|
60
|
-
|
55
|
+
|
61
56
|
# extracts the dc field for this fedora object, from the purl xml
|
62
57
|
# @return [Nokogiri::XML::Document] the dc for the fedora object
|
63
58
|
# @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
|
64
59
|
def parse_dc
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
|
71
|
-
end
|
60
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
|
61
|
+
fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
62
|
+
ng_doc
|
63
|
+
rescue
|
64
|
+
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
|
72
65
|
end
|
73
|
-
|
66
|
+
|
74
67
|
# extracts the rdf field for this fedora object, from the purl xml
|
75
68
|
# @return [Nokogiri::XML::Document] the rdf for the fedora object
|
76
69
|
# @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
|
77
70
|
def parse_rdf
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
|
84
|
-
end
|
71
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
|
72
|
+
fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
73
|
+
ng_doc
|
74
|
+
rescue
|
75
|
+
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
|
85
76
|
end
|
86
|
-
|
87
|
-
|
77
|
+
|
88
78
|
# extracts the release tag element for this fedora object, from the the ReleaseData element in purl xml
|
89
79
|
# @return [Hash] the release tags for the fedora object
|
90
80
|
def parse_release_tags_hash
|
91
|
-
release_tags={}
|
92
|
-
unless
|
93
|
-
release_elements =
|
94
|
-
release_elements.each
|
95
|
-
unless n.attr(
|
96
|
-
release_target = n.attr(
|
81
|
+
release_tags = {}
|
82
|
+
unless @purlxml_ng_doc.nil?
|
83
|
+
release_elements = @purlxml_ng_doc.xpath('//ReleaseData/release')
|
84
|
+
release_elements.each do |n|
|
85
|
+
unless n.attr('to').nil?
|
86
|
+
release_target = n.attr('to')
|
97
87
|
text = n.text
|
98
|
-
unless text.nil?
|
99
|
-
release_tags[release_target]= to_boolean(text)
|
100
|
-
end
|
88
|
+
release_tags[release_target] = to_boolean(text) unless text.nil?
|
101
89
|
end
|
102
|
-
|
90
|
+
end
|
103
91
|
end
|
104
|
-
|
92
|
+
release_tags
|
105
93
|
end
|
106
|
-
|
94
|
+
|
107
95
|
# extracts the contentMetadata for this fedora object, from the purl xml
|
108
96
|
# @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
|
109
97
|
# @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
|
110
98
|
def parse_content_metadata
|
111
|
-
|
112
|
-
|
113
|
-
|
99
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
|
100
|
+
ng_doc = nil if !ng_doc || ng_doc.children.empty?
|
101
|
+
ng_doc
|
114
102
|
end
|
115
|
-
|
103
|
+
|
116
104
|
# @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
|
117
105
|
def parse_is_collection
|
118
|
-
identity_metadata
|
106
|
+
identity_metadata = parse_identity_metadata
|
119
107
|
unless identity_metadata.nil?
|
120
108
|
object_type_nodes = identity_metadata.xpath('//objectType')
|
121
|
-
return true if object_type_nodes.find_index { |n|
|
109
|
+
return true if object_type_nodes.find_index { |n| %w(collection set).include? n.text.downcase }
|
122
110
|
end
|
123
111
|
false
|
124
112
|
end
|
125
|
-
|
113
|
+
|
126
114
|
# get the druids from isMemberOfCollection relationships in rels-ext from public_xml
|
127
115
|
# @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfColletion relationship with, or nil if none
|
128
116
|
def parse_collection_druids
|
129
|
-
ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' =>
|
117
|
+
ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
|
130
118
|
is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
|
131
119
|
# from public_xml rels-ext
|
132
120
|
druids = []
|
133
|
-
is_member_of_nodes.each
|
121
|
+
is_member_of_nodes.each do |n|
|
134
122
|
druids << n.value.split('druid:').last unless n.value.empty?
|
135
|
-
|
123
|
+
end
|
136
124
|
return nil if druids.empty?
|
137
125
|
druids
|
138
126
|
end
|
139
|
-
|
127
|
+
|
140
128
|
# the value of the type attribute for a DOR object's contentMetadata
|
141
129
|
# more info about these values is here:
|
142
130
|
# https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
|
143
131
|
# https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
|
144
|
-
# @return [String]
|
132
|
+
# @return [String]
|
145
133
|
def parse_dor_content_type
|
146
134
|
content_md = parse_content_metadata
|
147
135
|
dct = content_md ? content_md.xpath('contentMetadata/@type').text : nil
|
148
136
|
DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
|
149
137
|
dct
|
150
138
|
end
|
151
|
-
|
139
|
+
|
152
140
|
# the value of the displyType tag from a DOR collection's identityMetadata
|
153
|
-
# @return [String]
|
141
|
+
# @return [String]
|
154
142
|
def parse_dor_display_type
|
155
143
|
identity_md = parse_identity_metadata
|
156
144
|
ddt = identity_md ? identity_md.xpath('//displayType').text : nil
|
@@ -161,43 +149,44 @@ module DiscoveryIndexer
|
|
161
149
|
# the @id attribute of resource/file elements that match the image type, including extension
|
162
150
|
# @return [Array<String>] filenames
|
163
151
|
def parse_image_ids
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
}
|
170
|
-
return nil if ids.empty?
|
171
|
-
ids
|
152
|
+
ids = []
|
153
|
+
content_md = parse_content_metadata
|
154
|
+
return nil if content_md.nil?
|
155
|
+
content_md.xpath('//resource[@type="image"]/file/@id').each do |node|
|
156
|
+
ids << node.text unless node.text.empty?
|
172
157
|
end
|
158
|
+
content_md.xpath('//resource[@type="page"]/file/@id').each do |node|
|
159
|
+
ids << node.text unless node.text.empty?
|
160
|
+
end
|
161
|
+
return nil if ids.empty?
|
162
|
+
ids
|
173
163
|
end
|
174
164
|
|
175
165
|
def parse_sourceid
|
176
166
|
get_value(@purlxml_ng_doc.css('//identityMetadata/sourceId'))
|
177
167
|
end
|
178
|
-
|
168
|
+
|
179
169
|
def parse_copyright
|
180
170
|
get_value(@purlxml_ng_doc.css('//rightsMetadata/copyright/human[type="copyright"]'))
|
181
171
|
end
|
182
|
-
|
172
|
+
|
183
173
|
def parse_use_and_reproduction
|
184
174
|
get_value(@purlxml_ng_doc.css('//rightsMetadata/use/human[type="useAndReproduction"]'))
|
185
175
|
end
|
186
|
-
|
176
|
+
|
187
177
|
# the @id attribute of resource/file elements, including extension
|
188
178
|
# @return [Array<String>] filenames
|
189
179
|
def parse_file_ids
|
190
180
|
ids = []
|
191
181
|
content_md = parse_content_metadata
|
192
|
-
unless content_md.nil?
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
182
|
+
return unless content_md.nil?
|
183
|
+
content_md.xpath('//resource/file/@id').each do |node|
|
184
|
+
ids << node.text unless node.text.empty?
|
185
|
+
end
|
186
|
+
return nil if ids.empty?
|
187
|
+
ids
|
188
|
+
end
|
189
|
+
|
201
190
|
# @return catkey value from the DOR identity_metadata, or nil if there is no catkey
|
202
191
|
def parse_catkey
|
203
192
|
get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']"))
|
@@ -210,23 +199,22 @@ module DiscoveryIndexer
|
|
210
199
|
|
211
200
|
# @return objectLabel value from the DOR identity_metadata, or nil if there is no barcode
|
212
201
|
def parse_label
|
213
|
-
get_value(@purlxml_ng_doc.xpath(
|
202
|
+
get_value(@purlxml_ng_doc.xpath('/publicObject/identityMetadata/objectLabel'))
|
214
203
|
end
|
215
|
-
|
204
|
+
|
216
205
|
def get_value(node)
|
217
|
-
(node && node.first) ? node.first.content : nil
|
206
|
+
(node && node.first) ? node.first.content : nil
|
218
207
|
end
|
219
|
-
|
208
|
+
|
220
209
|
def to_boolean(text)
|
221
|
-
if text.nil? || text.empty?
|
210
|
+
if text.nil? || text.empty?
|
222
211
|
return false
|
223
|
-
elsif text.downcase.eql?(
|
212
|
+
elsif text.downcase.eql?('true') || text.downcase == 't'
|
224
213
|
return true
|
225
214
|
else
|
226
215
|
return false
|
227
216
|
end
|
228
|
-
end
|
217
|
+
end
|
229
218
|
end
|
230
219
|
end
|
231
220
|
end
|
232
|
-
|
@@ -3,14 +3,13 @@ require 'open-uri'
|
|
3
3
|
module DiscoveryIndexer
|
4
4
|
module InputXml
|
5
5
|
class PurlxmlReader
|
6
|
-
|
7
6
|
# reads the public xml for the fedora object that is defined , from the purl server
|
8
7
|
# @param [String] druid e.g. ab123cd4567
|
9
8
|
# @return [Nokogiri::XML::Document] the public xml for the fedora object
|
10
9
|
# @raise [MissingPublicXml] if there's no purl xml available for this druid
|
11
10
|
def self.read(druid)
|
12
11
|
purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
|
13
|
-
|
12
|
+
|
14
13
|
begin
|
15
14
|
purlxml_object = Nokogiri::XML(open(purlxml_uri))
|
16
15
|
return purlxml_object
|
@@ -20,4 +19,4 @@ module DiscoveryIndexer
|
|
20
19
|
end
|
21
20
|
end
|
22
21
|
end
|
23
|
-
end
|
22
|
+
end
|
data/lib/version.rb
CHANGED
data/lib/writer/solr_client.rb
CHANGED
@@ -3,6 +3,7 @@ require 'rsolr'
|
|
3
3
|
require 'rest-client'
|
4
4
|
module DiscoveryIndexer
|
5
5
|
module Writer
|
6
|
+
# Processes adds and deletes to the solr core
|
6
7
|
class SolrClient
|
7
8
|
include DiscoveryIndexer::Logging
|
8
9
|
|
@@ -13,7 +14,7 @@ module DiscoveryIndexer
|
|
13
14
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
14
15
|
# @param max_retries [Integer] the maximum number of tries before fail
|
15
16
|
def self.add(id, solr_doc, solr_connector, max_retries = 10)
|
16
|
-
process(id, solr_doc, solr_connector, max_retries,
|
17
|
+
process(id, solr_doc, solr_connector, max_retries, false)
|
17
18
|
end
|
18
19
|
|
19
20
|
# Add the document to solr, retry if an error occurs.
|
@@ -22,79 +23,76 @@ module DiscoveryIndexer
|
|
22
23
|
# @param solr_connector[RSolr::Client] is an open connection with the solr core
|
23
24
|
# @param max_retries [Integer] the maximum number of tries before fail
|
24
25
|
def self.delete(id, solr_connector, max_retries = 10)
|
25
|
-
process(id, {}, solr_connector, max_retries,
|
26
|
+
process(id, {}, solr_connector, max_retries, true)
|
26
27
|
end
|
27
28
|
|
28
29
|
# It's an internal method that receives all the requests and deal with
|
29
30
|
# SOLR core. This method can call add, delete, or update
|
30
31
|
#
|
31
32
|
# @param id [String] the document id, usually it will be druid.
|
32
|
-
# @param solr_doc [Hash] is the solr doc in hash format
|
33
|
+
# @param solr_doc [Hash] is the solr doc in hash format
|
33
34
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
34
35
|
# @param max_retries [Integer] the maximum number of tries before fail
|
35
|
-
def self.process(id, solr_doc, solr_connector, max_retries, is_delete=false)
|
36
|
-
handler =
|
36
|
+
def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
|
37
|
+
handler = proc do |exception, attempt_number, _total_delay|
|
37
38
|
DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
|
38
39
|
end
|
39
|
-
|
40
|
-
with_retries(:
|
40
|
+
|
41
|
+
with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
|
41
42
|
DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
|
42
|
-
|
43
|
+
|
43
44
|
if is_delete
|
44
45
|
DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
|
45
46
|
solr_connector.delete_by_id(id)
|
46
|
-
elsif allow_update?(solr_connector) && doc_exists?(id,solr_connector)
|
47
|
+
elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
|
47
48
|
DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
|
48
|
-
update_solr_doc(id,solr_doc,solr_connector)
|
49
|
+
update_solr_doc(id, solr_doc, solr_connector)
|
49
50
|
else
|
50
51
|
DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
|
51
52
|
solr_connector.add(solr_doc)
|
52
53
|
end
|
53
54
|
solr_connector.commit
|
54
55
|
DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
|
55
|
-
|
56
56
|
end
|
57
57
|
end
|
58
58
|
|
59
59
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
60
60
|
# @return [Boolean] true if the solr core allowing update feature
|
61
61
|
def self.allow_update?(solr_connector)
|
62
|
-
|
62
|
+
solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
|
63
63
|
end
|
64
64
|
|
65
65
|
# @param id [String] the document id, usually it will be druid.
|
66
66
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
67
67
|
# @return [Boolean] true if the solr doc defined by this id exists
|
68
|
-
def self.doc_exists?(id,solr_connector)
|
69
|
-
response=solr_connector.get 'select', :
|
68
|
+
def self.doc_exists?(id, solr_connector)
|
69
|
+
response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
|
70
70
|
response['response']['numFound'] == 1
|
71
71
|
end
|
72
|
-
|
72
|
+
|
73
73
|
# It is an internal method that updates the solr doc instead of adding a new one.
|
74
|
-
def self.update_solr_doc(id,solr_doc,solr_connector)
|
74
|
+
def self.update_solr_doc(id, solr_doc, solr_connector)
|
75
75
|
# update_solr_doc can't use RSolr because updating a hash doc is not supported
|
76
76
|
# so we need to build the json input manually
|
77
77
|
solr_url = solr_connector.options[:url]
|
78
|
-
if solr_url.end_with?(
|
79
|
-
url="#{solr_connector.options[:url]}update?commit=true"
|
78
|
+
if solr_url.end_with?('/')
|
79
|
+
url = "#{solr_connector.options[:url]}update?commit=true"
|
80
80
|
else
|
81
|
-
url="#{solr_connector.options[:url]}/update?commit=true"
|
81
|
+
url = "#{solr_connector.options[:url]}/update?commit=true"
|
82
82
|
end
|
83
|
-
|
84
|
-
params="[{\"id\":\"#{id}\","
|
85
|
-
solr_doc.each do |field_name,new_values|
|
86
|
-
unless field_name == :id
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
83
|
+
|
84
|
+
params = "[{\"id\":\"#{id}\","
|
85
|
+
solr_doc.each do |field_name, new_values|
|
86
|
+
next unless field_name == :id
|
87
|
+
params += "\"#{field_name}\":"
|
88
|
+
new_values = [new_values] unless new_values.class == Array
|
89
|
+
new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
|
90
|
+
params += "{\"set\":[\"#{new_values.join('","')}\"]},"
|
92
91
|
end
|
93
92
|
params.chomp!(',')
|
94
|
-
params+=
|
95
|
-
RestClient.post url, params
|
93
|
+
params += '}]'
|
94
|
+
RestClient.post url, params, content_type: :json, accept: :json
|
96
95
|
end
|
97
|
-
|
98
96
|
end
|
99
97
|
end
|
100
|
-
end
|
98
|
+
end
|
data/lib/writer/solr_writer.rb
CHANGED
@@ -3,57 +3,57 @@ require 'rsolr'
|
|
3
3
|
|
4
4
|
module DiscoveryIndexer
|
5
5
|
module Writer
|
6
|
+
# Performs writes to solr client based upon true and false release flags
|
6
7
|
class SolrWriter
|
7
8
|
include DiscoveryIndexer::Logging
|
8
|
-
|
9
|
+
|
9
10
|
def process(id, index_doc, targets, solr_targets_configs)
|
10
11
|
@solr_targets_configs = solr_targets_configs
|
11
12
|
index_targets = []
|
12
13
|
delete_targets = []
|
13
|
-
targets.keys.each do |target|
|
14
|
-
if targets[target]
|
14
|
+
targets.keys.each do |target|
|
15
|
+
if targets[target]
|
15
16
|
index_targets.append(target)
|
16
17
|
else
|
17
18
|
delete_targets.append(target)
|
18
19
|
end
|
19
20
|
end
|
20
|
-
|
21
|
+
|
21
22
|
# get targets with true
|
22
23
|
solr_index_client(id, index_doc, index_targets)
|
23
24
|
# get targets with false
|
24
25
|
solr_delete_client(id, delete_targets)
|
25
26
|
end
|
26
|
-
|
27
|
+
|
27
28
|
def solr_delete_from_all(id, solr_targets_configs)
|
28
29
|
# Get a list of all registered targets
|
29
|
-
@solr_targets_configs=solr_targets_configs
|
30
|
-
targets = @solr_targets_configs.keys
|
30
|
+
@solr_targets_configs = solr_targets_configs
|
31
|
+
targets = @solr_targets_configs.keys
|
31
32
|
solr_delete_client(id, targets)
|
32
33
|
end
|
33
|
-
|
34
|
+
|
34
35
|
def solr_index_client(id, index_doc, targets)
|
35
36
|
targets.each do |solr_target|
|
36
|
-
solr_connector = get_connector_for_target(solr_target)
|
37
|
+
solr_connector = get_connector_for_target(solr_target)
|
37
38
|
SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
|
38
|
-
end
|
39
|
+
end
|
39
40
|
end
|
40
|
-
|
41
|
+
|
41
42
|
def solr_delete_client(id, targets)
|
42
43
|
targets.each do |solr_target|
|
43
44
|
solr_connector = get_connector_for_target(solr_target)
|
44
45
|
SolrClient.delete(id, solr_connector) unless solr_connector.nil?
|
45
|
-
end
|
46
|
+
end
|
46
47
|
end
|
47
48
|
|
48
49
|
def get_connector_for_target(solr_target)
|
49
50
|
solr_connector = nil
|
50
|
-
if @solr_targets_configs.keys.include?(solr_target)
|
51
|
+
if @solr_targets_configs.keys.include?(solr_target)
|
51
52
|
config = @solr_targets_configs[solr_target]
|
52
53
|
solr_connector = RSolr.connect(config.deep_symbolize_keys)
|
53
54
|
end
|
54
|
-
|
55
|
+
solr_connector
|
55
56
|
end
|
56
|
-
|
57
57
|
end
|
58
58
|
end
|
59
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: discovery-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.6
|
4
|
+
version: 0.9.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed AlSum
|
8
|
+
- Laney McGlohon
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2015-
|
12
|
+
date: 2015-09-23 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: nokogiri
|
@@ -138,7 +139,7 @@ dependencies:
|
|
138
139
|
version: '0'
|
139
140
|
description: This library manages the core operations for the discovery indexing such
|
140
141
|
as reading PURL xml, mapping to the solr document, and writing to solr core.
|
141
|
-
email:
|
142
|
+
email: laneymcg@stanford.edu
|
142
143
|
executables: []
|
143
144
|
extensions: []
|
144
145
|
extra_rdoc_files: []
|