discovery-indexer 0.9.6 → 0.9.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/discovery-indexer.rb +2 -3
- data/lib/errors.rb +8 -8
- data/lib/logging.rb +3 -6
- data/lib/mapper/general_mapper.rb +4 -6
- data/lib/reader/modsxml.rb +16 -20
- data/lib/reader/modsxml_reader.rb +2 -5
- data/lib/reader/purlxml.rb +16 -23
- data/lib/reader/purlxml_model.rb +23 -31
- data/lib/reader/purlxml_parser.rb +2 -3
- data/lib/reader/purlxml_parser_strict.rb +99 -111
- data/lib/reader/purlxml_reader.rb +2 -3
- data/lib/version.rb +1 -1
- data/lib/writer/solr_client.rb +30 -32
- data/lib/writer/solr_writer.rb +15 -15
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9cddff2ef4144cc3d17af7bd71bdb6f68840d38c
|
4
|
+
data.tar.gz: ca19a7acbb80341c1d6440567e305ef0bf170542
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 65fa148806dcf8dc498b262cbc3acf811c89fcad0d81397099009e0e8eb368577477e50c9c5bbee7e0921e11b17542cf4857b2aff4e54a319f5f97ea22e1bb8f
|
7
|
+
data.tar.gz: a28465c230937c24168114d91f41a58e59492a16b0b479ab8e4ae41429c4682dc7eec99a4b796ff93869d37b7e34ed78a6d7485ff1500ee2bc91e99e3a6e070f
|
data/lib/discovery-indexer.rb
CHANGED
@@ -15,9 +15,8 @@ require 'mapper/general_mapper'
|
|
15
15
|
require 'writer/solr_client'
|
16
16
|
require 'writer/solr_writer'
|
17
17
|
|
18
|
-
#require 'utilities/extract_sub_targets'
|
19
|
-
|
18
|
+
# require 'utilities/extract_sub_targets'
|
20
19
|
|
21
20
|
module DiscoveryIndexer
|
22
21
|
PURL_DEFAULT = 'http://purl.stanford.edu'
|
23
|
-
end
|
22
|
+
end
|
data/lib/errors.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module Errors
|
3
|
-
MissingPurlPage = Class.new(StandardError)
|
4
|
-
MissingMods = Class.new(StandardError)
|
5
|
-
MissingPublicXml = Class.new(StandardError)
|
6
|
-
MissingContentMetadata = Class.new(StandardError)
|
7
|
-
MissingIdentityMetadata = Class.new(StandardError)
|
8
|
-
MissingRightsMetadata = Class.new(StandardError)
|
9
|
-
MissingRDF = Class.new(StandardError)
|
3
|
+
MissingPurlPage = Class.new(StandardError)
|
4
|
+
MissingMods = Class.new(StandardError)
|
5
|
+
MissingPublicXml = Class.new(StandardError)
|
6
|
+
MissingContentMetadata = Class.new(StandardError)
|
7
|
+
MissingIdentityMetadata = Class.new(StandardError)
|
8
|
+
MissingRightsMetadata = Class.new(StandardError)
|
9
|
+
MissingRDF = Class.new(StandardError)
|
10
10
|
MissingDC = Class.new(StandardError)
|
11
11
|
MissingModsPage = Class.new(StandardError)
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
data/lib/logging.rb
CHANGED
@@ -1,29 +1,27 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module Mapper
|
3
3
|
class GeneralMapper
|
4
|
-
|
5
4
|
# Initializes an instance from IndexMapper
|
6
5
|
# @param [String] druid e.g. ab123cd4567
|
7
6
|
# @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
|
8
7
|
# @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
|
9
8
|
# @param [Hash] collection_data represents a hash of collection_druid and catkey
|
10
9
|
# e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
|
11
|
-
def initialize(druid, modsxml, purlxml, collection_data={})
|
10
|
+
def initialize(druid, modsxml, purlxml, collection_data = {})
|
12
11
|
@druid = druid
|
13
12
|
@modsxml = modsxml
|
14
13
|
@purlxml = purlxml
|
15
14
|
@collection_data = collection_data
|
16
15
|
end
|
17
16
|
|
18
|
-
# Create a Hash representing a Solr doc, with all MODS related fields populated.
|
17
|
+
# Create a Hash representing a Solr doc, with all MODS related fields populated.
|
19
18
|
# @return [Hash] Hash representing the Solr document
|
20
|
-
def convert_to_solr_doc
|
19
|
+
def convert_to_solr_doc
|
21
20
|
solr_doc = {}
|
22
21
|
solr_doc[:id] = @druid
|
23
22
|
solr_doc[:title] = @modsxml.sw_full_title
|
24
|
-
|
23
|
+
solr_doc
|
25
24
|
end
|
26
25
|
end
|
27
26
|
end
|
28
27
|
end
|
29
|
-
|
data/lib/reader/modsxml.rb
CHANGED
@@ -1,44 +1,40 @@
|
|
1
1
|
require 'stanford-mods'
|
2
2
|
module DiscoveryIndexer
|
3
3
|
module InputXml
|
4
|
-
|
5
|
-
# This class is the main class to access and parse the mods xml
|
4
|
+
# This class is the main class to access and parse the mods xml
|
6
5
|
# as retrieved from PURL server
|
7
6
|
# @example to run the code
|
8
7
|
# druid = "aa111aa1111"
|
9
8
|
# p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
|
10
9
|
# model = p.load()
|
11
|
-
#
|
12
|
-
#
|
10
|
+
#
|
11
|
+
#
|
13
12
|
class Modsxml
|
14
|
-
# initializes a new object
|
13
|
+
# initializes a new object
|
15
14
|
# @param druid [String] the druid object in the format "aa111aa1111"
|
16
15
|
def initialize(druid)
|
17
16
|
@druid = druid
|
18
|
-
@modsxml_ng_doc = nil
|
17
|
+
@modsxml_ng_doc = nil
|
19
18
|
end
|
20
19
|
|
21
20
|
# loads the mods xml to stanford mods model for the fedora object defined in the druid,
|
22
|
-
#
|
23
|
-
# @return [Stanford::Mods::Record] represents the mods xml
|
24
|
-
def load
|
25
|
-
if @modsxml_ng_doc.nil?
|
26
|
-
|
27
|
-
end
|
28
|
-
|
21
|
+
# it reads the mods xml once from PURL server, and repeat the parsing with each call
|
22
|
+
# @return [Stanford::Mods::Record] represents the mods xml
|
23
|
+
def load
|
24
|
+
@modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?
|
25
|
+
|
29
26
|
modsxml_model = Stanford::Mods::Record.new
|
30
27
|
modsxml_model.from_nk_node(@modsxml_ng_doc)
|
31
|
-
|
28
|
+
modsxml_model
|
32
29
|
end
|
33
|
-
|
30
|
+
|
34
31
|
# loads the mods xml to stanford mods model for the fedora object defined in the druid,
|
35
|
-
#
|
36
|
-
# @return [Stanford::Mods::Record] represents the mods xml
|
37
|
-
def reload
|
32
|
+
# it reads the mods xml from PURL server with every call
|
33
|
+
# @return [Stanford::Mods::Record] represents the mods xml
|
34
|
+
def reload
|
38
35
|
@modsxml_ng_doc = ModsxmlReader.read(@druid)
|
39
|
-
|
36
|
+
load
|
40
37
|
end
|
41
|
-
|
42
38
|
end
|
43
39
|
end
|
44
40
|
end
|
@@ -3,21 +3,18 @@ require 'open-uri'
|
|
3
3
|
module DiscoveryIndexer
|
4
4
|
module InputXml
|
5
5
|
class ModsxmlReader
|
6
|
-
|
7
6
|
# reads the mods xml for the fedora object that is defined , from the purl server
|
8
7
|
# @param [String] druid e.g. ab123cd4567
|
9
8
|
# @return [Nokogiri::XML::Document] the mods xml for the fedora object
|
10
9
|
# @raise [DiscoveryIndexer::Errors::MissingModsPage] if there's no mods xml available for this druid
|
11
10
|
def self.read(druid)
|
12
11
|
mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"
|
13
|
-
|
14
12
|
begin
|
15
|
-
|
16
|
-
return modsxml_ng_doc
|
13
|
+
Nokogiri::XML(open(mods_uri))
|
17
14
|
rescue
|
18
15
|
raise DiscoveryIndexer::Errors::MissingModsPage.new(mods_uri)
|
19
16
|
end
|
20
17
|
end
|
21
18
|
end
|
22
19
|
end
|
23
|
-
end
|
20
|
+
end
|
data/lib/reader/purlxml.rb
CHANGED
@@ -1,43 +1,36 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module InputXml
|
3
|
-
|
4
|
-
# This class is the main class to access and parse the purl xml
|
3
|
+
# This class is the main class to access and parse the purl xml
|
5
4
|
# as retrieved from PURL server
|
6
5
|
# @example to run the code
|
7
6
|
# druid = "aa111aa1111"
|
8
7
|
# p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
|
9
8
|
# model = p.load()
|
10
|
-
#
|
11
9
|
class Purlxml
|
12
|
-
|
13
|
-
# initializes a new object
|
10
|
+
# initializes a new object
|
14
11
|
# @param druid [String] the druid object in the format "aa111aa1111"
|
15
12
|
def initialize(druid)
|
16
13
|
@druid = druid
|
17
|
-
@purlxml_ng_doc = nil
|
14
|
+
@purlxml_ng_doc = nil
|
18
15
|
end
|
19
16
|
|
20
17
|
# loads the purl xml to purlxml model for the fedora object defined in the druid,
|
21
|
-
#
|
22
|
-
# @return [PurlxmlModel] represents the purlxml
|
23
|
-
def load
|
24
|
-
if @purlxml_ng_doc.nil?
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
purlxml_parser = PurlxmlParserStrict.new(@druid,@purlxml_ng_doc)
|
29
|
-
purlxml_model = purlxml_parser.parse()
|
30
|
-
return purlxml_model
|
18
|
+
# it reads the purl xml once from PURL server, and repeat the parsing with each call
|
19
|
+
# @return [PurlxmlModel] represents the purlxml
|
20
|
+
def load
|
21
|
+
@purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
|
22
|
+
purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
|
23
|
+
purlxml_model = purlxml_parser.parse
|
24
|
+
purlxml_model
|
31
25
|
end
|
32
|
-
|
26
|
+
|
33
27
|
# loads the purl xml to purlxml model for the fedora object defined in the druid
|
34
|
-
#
|
35
|
-
# @return [PurlxmlModel] represents the purlxml
|
36
|
-
def reload
|
28
|
+
# it reads the purl xml from PURL server with every call
|
29
|
+
# @return [PurlxmlModel] represents the purlxml
|
30
|
+
def reload
|
37
31
|
@purlxml_ng_doc = PurlxmlReader.read(@druid)
|
38
|
-
|
32
|
+
load
|
39
33
|
end
|
40
|
-
|
34
|
+
end
|
41
35
|
end
|
42
36
|
end
|
43
|
-
|
data/lib/reader/purlxml_model.rb
CHANGED
@@ -1,47 +1,46 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module InputXml
|
3
3
|
class PurlxmlModel
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
# @return [Nokogiri::XML] The publix xml as retrieved from purl server
|
4
|
+
# @!attribute [rw] druid
|
5
|
+
# @return [String] The druid value eg., ab123cd4567
|
6
|
+
attr_accessor :druid
|
7
|
+
|
8
|
+
# @!attribute [rw] public_xml
|
9
|
+
# @return [Nokogiri::XML] The public xml as retrieved from purl server
|
11
10
|
attr_accessor :public_xml
|
12
11
|
|
13
|
-
|
12
|
+
# @!attribute [rw] content_metadata
|
14
13
|
# @return [Nokogiri::XML] The content_metadata as extracted from public xml
|
15
14
|
attr_accessor :content_metadata
|
16
15
|
|
17
|
-
|
16
|
+
# @!attribute [rw] identity_metadata
|
18
17
|
# @return [Nokogiri::XML] The identity_metadata as extracted from public xml
|
19
18
|
attr_accessor :identity_metadata
|
20
19
|
|
21
|
-
|
20
|
+
# @!attribute [rw] rights_metadata
|
22
21
|
# @return [Nokogiri::XML] The rights_metadata as extracted from public xml
|
23
22
|
attr_accessor :rights_metadata
|
24
23
|
|
25
|
-
|
24
|
+
# @!attribute [rw] dc
|
26
25
|
# @return [Nokogiri::XML] The dc element as extracted from public xml
|
27
26
|
attr_accessor :dc
|
28
|
-
|
29
|
-
|
27
|
+
|
28
|
+
# @!attribute [rw] rdf
|
30
29
|
# @return [Nokogiri::XML] The rdf element as extracted from public xml
|
31
30
|
attr_accessor :rdf
|
32
31
|
|
33
32
|
# @!attribute [rw] release_tags_hash
|
34
33
|
# @return [Hash] The release_tag in hash format as extracted from public xml
|
35
|
-
# ReleaseData element.
|
34
|
+
# ReleaseData element.
|
36
35
|
# @example
|
37
36
|
# !{"target1"=>true, "target2"=>false}
|
38
|
-
attr_accessor :release_tags_hash
|
37
|
+
attr_accessor :release_tags_hash
|
39
38
|
|
40
39
|
# @!attribute [rw] dor_content_type
|
41
40
|
# @return [String] The dor_content_type as extracted from public xml
|
42
41
|
# content_metadata.
|
43
42
|
attr_accessor :dor_content_type
|
44
|
-
|
43
|
+
|
45
44
|
# @!attribute [rw] dor_display_type
|
46
45
|
# @return [String] The displayType as extracted from public xml
|
47
46
|
# identity_metadata.
|
@@ -50,25 +49,25 @@ module DiscoveryIndexer
|
|
50
49
|
# @!attribute [rw] is_collection
|
51
50
|
# @return [Boolean] true if the item type is collection in the identity_metadata
|
52
51
|
attr_accessor :is_collection
|
53
|
-
|
52
|
+
|
54
53
|
# @!attribute [rw] collection_druids
|
55
54
|
# @return [Array] a list of the collections that this druid belongs to
|
56
55
|
# @example
|
57
56
|
# ["aa11aaa1111","bb111bb1111"]
|
58
57
|
attr_accessor :collection_druids
|
59
|
-
|
58
|
+
|
60
59
|
# @!attribute [rw] file_ids
|
61
60
|
# @return [Array] a list of the file ids in the content_metadata
|
62
61
|
# @example
|
63
|
-
# ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
|
62
|
+
# ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
|
64
63
|
attr_accessor :file_ids
|
65
64
|
|
66
65
|
# @!attribute [rw] image_ids
|
67
66
|
# @return [Array] a list of the image ids in the content_metadata
|
68
67
|
# @example
|
69
|
-
# ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
|
68
|
+
# ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
|
70
69
|
attr_accessor :image_ids
|
71
|
-
|
70
|
+
|
72
71
|
# @!attribute [rw] catkey
|
73
72
|
# @return [String] the catkey attribute in identity_metadata
|
74
73
|
attr_accessor :catkey
|
@@ -76,15 +75,15 @@ module DiscoveryIndexer
|
|
76
75
|
# @!attribute [rw] barcode
|
77
76
|
# @return [String] the barcode attribute in identity_metadata
|
78
77
|
attr_accessor :barcode
|
79
|
-
|
78
|
+
|
80
79
|
# @!attribute [rw] label
|
81
80
|
# @return [String] the objectLabel attribute in identity_metadata
|
82
81
|
attr_accessor :label
|
83
|
-
|
82
|
+
|
84
83
|
# @!attribute [rw] copyright
|
85
84
|
# @return [String] the copyright statement from rights metadata
|
86
85
|
attr_accessor :copyright
|
87
|
-
|
86
|
+
|
88
87
|
# @!attribute [rw] use_and_reproduction
|
89
88
|
# @return [String] the use and reproduction statement from rights metadata
|
90
89
|
attr_accessor :use_and_reproduction
|
@@ -92,13 +91,6 @@ module DiscoveryIndexer
|
|
92
91
|
# @!attribute [rw] source_id
|
93
92
|
# @return [String] the sourceid from identity metadata
|
94
93
|
attr_accessor :source_id
|
95
|
-
|
96
94
|
end
|
97
95
|
end
|
98
96
|
end
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
@@ -3,154 +3,142 @@ module DiscoveryIndexer
|
|
3
3
|
class PurlxmlParserStrict < PurlxmlParser
|
4
4
|
include DiscoveryIndexer::Logging
|
5
5
|
|
6
|
-
|
7
6
|
RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
8
7
|
OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
|
9
8
|
MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
|
10
9
|
|
11
10
|
# it parses the purlxml into a purlxml model
|
12
11
|
# @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
|
13
|
-
def parse
|
12
|
+
def parse
|
14
13
|
purlxml_model = PurlxmlModel.new
|
15
14
|
purlxml_model.druid = @druid
|
16
15
|
purlxml_model.public_xml = @purlxml_ng_doc
|
17
|
-
purlxml_model.content_metadata = parse_content_metadata
|
18
|
-
purlxml_model.identity_metadata = parse_identity_metadata
|
19
|
-
purlxml_model.rights_metadata = parse_rights_metadata
|
20
|
-
purlxml_model.dc = parse_dc
|
21
|
-
purlxml_model.rdf = parse_rdf
|
22
|
-
purlxml_model.is_collection = parse_is_collection
|
23
|
-
purlxml_model.collection_druids = parse_collection_druids
|
24
|
-
purlxml_model.dor_content_type = parse_dor_content_type
|
25
|
-
purlxml_model.dor_display_type = parse_dor_display_type
|
26
|
-
purlxml_model.release_tags_hash = parse_release_tags_hash
|
27
|
-
purlxml_model.file_ids = parse_file_ids
|
28
|
-
purlxml_model.image_ids = parse_image_ids
|
29
|
-
purlxml_model.catkey = parse_catkey
|
30
|
-
purlxml_model.barcode = parse_barcode
|
31
|
-
purlxml_model.label = parse_label
|
32
|
-
purlxml_model.copyright = parse_copyright
|
33
|
-
purlxml_model.use_and_reproduction = parse_use_and_reproduction
|
34
|
-
purlxml_model.source_id
|
35
|
-
|
36
|
-
end
|
37
|
-
|
16
|
+
purlxml_model.content_metadata = parse_content_metadata
|
17
|
+
purlxml_model.identity_metadata = parse_identity_metadata
|
18
|
+
purlxml_model.rights_metadata = parse_rights_metadata
|
19
|
+
purlxml_model.dc = parse_dc
|
20
|
+
purlxml_model.rdf = parse_rdf
|
21
|
+
purlxml_model.is_collection = parse_is_collection
|
22
|
+
purlxml_model.collection_druids = parse_collection_druids
|
23
|
+
purlxml_model.dor_content_type = parse_dor_content_type
|
24
|
+
purlxml_model.dor_display_type = parse_dor_display_type
|
25
|
+
purlxml_model.release_tags_hash = parse_release_tags_hash
|
26
|
+
purlxml_model.file_ids = parse_file_ids
|
27
|
+
purlxml_model.image_ids = parse_image_ids
|
28
|
+
purlxml_model.catkey = parse_catkey
|
29
|
+
purlxml_model.barcode = parse_barcode
|
30
|
+
purlxml_model.label = parse_label
|
31
|
+
purlxml_model.copyright = parse_copyright
|
32
|
+
purlxml_model.use_and_reproduction = parse_use_and_reproduction
|
33
|
+
purlxml_model.source_id = parse_sourceid
|
34
|
+
purlxml_model
|
35
|
+
end
|
36
|
+
|
38
37
|
# extracts the identityMetadata for this fedora object, from the purl xml
|
39
38
|
# @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
|
40
39
|
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
|
41
40
|
def parse_identity_metadata
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
|
48
|
-
end
|
41
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
|
42
|
+
fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
43
|
+
ng_doc
|
44
|
+
rescue
|
45
|
+
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
|
49
46
|
end
|
50
|
-
|
51
|
-
def parse_rights_metadata
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
|
58
|
-
end
|
47
|
+
|
48
|
+
def parse_rights_metadata
|
49
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
|
50
|
+
fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
51
|
+
ng_doc
|
52
|
+
rescue
|
53
|
+
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
|
59
54
|
end
|
60
|
-
|
55
|
+
|
61
56
|
# extracts the dc field for this fedora object, from the purl xml
|
62
57
|
# @return [Nokogiri::XML::Document] the dc for the fedora object
|
63
58
|
# @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
|
64
59
|
def parse_dc
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
|
71
|
-
end
|
60
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
|
61
|
+
fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
62
|
+
ng_doc
|
63
|
+
rescue
|
64
|
+
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
|
72
65
|
end
|
73
|
-
|
66
|
+
|
74
67
|
# extracts the rdf field for this fedora object, from the purl xml
|
75
68
|
# @return [Nokogiri::XML::Document] the rdf for the fedora object
|
76
69
|
# @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
|
77
70
|
def parse_rdf
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
|
84
|
-
end
|
71
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
|
72
|
+
fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
73
|
+
ng_doc
|
74
|
+
rescue
|
75
|
+
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
|
85
76
|
end
|
86
|
-
|
87
|
-
|
77
|
+
|
88
78
|
# extracts the release tag element for this fedora object, from the the ReleaseData element in purl xml
|
89
79
|
# @return [Hash] the release tags for the fedora object
|
90
80
|
def parse_release_tags_hash
|
91
|
-
release_tags={}
|
92
|
-
unless
|
93
|
-
release_elements =
|
94
|
-
release_elements.each
|
95
|
-
unless n.attr(
|
96
|
-
release_target = n.attr(
|
81
|
+
release_tags = {}
|
82
|
+
unless @purlxml_ng_doc.nil?
|
83
|
+
release_elements = @purlxml_ng_doc.xpath('//ReleaseData/release')
|
84
|
+
release_elements.each do |n|
|
85
|
+
unless n.attr('to').nil?
|
86
|
+
release_target = n.attr('to')
|
97
87
|
text = n.text
|
98
|
-
unless text.nil?
|
99
|
-
release_tags[release_target]= to_boolean(text)
|
100
|
-
end
|
88
|
+
release_tags[release_target] = to_boolean(text) unless text.nil?
|
101
89
|
end
|
102
|
-
|
90
|
+
end
|
103
91
|
end
|
104
|
-
|
92
|
+
release_tags
|
105
93
|
end
|
106
|
-
|
94
|
+
|
107
95
|
# extracts the contentMetadata for this fedora object, from the purl xml
|
108
96
|
# @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
|
109
97
|
# @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
|
110
98
|
def parse_content_metadata
|
111
|
-
|
112
|
-
|
113
|
-
|
99
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
|
100
|
+
ng_doc = nil if !ng_doc || ng_doc.children.empty?
|
101
|
+
ng_doc
|
114
102
|
end
|
115
|
-
|
103
|
+
|
116
104
|
# @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
|
117
105
|
def parse_is_collection
|
118
|
-
identity_metadata
|
106
|
+
identity_metadata = parse_identity_metadata
|
119
107
|
unless identity_metadata.nil?
|
120
108
|
object_type_nodes = identity_metadata.xpath('//objectType')
|
121
|
-
return true if object_type_nodes.find_index { |n|
|
109
|
+
return true if object_type_nodes.find_index { |n| %w(collection set).include? n.text.downcase }
|
122
110
|
end
|
123
111
|
false
|
124
112
|
end
|
125
|
-
|
113
|
+
|
126
114
|
# get the druids from isMemberOfCollection relationships in rels-ext from public_xml
|
127
115
|
# @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfCollection relationship with, or nil if none
|
128
116
|
def parse_collection_druids
|
129
|
-
ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' =>
|
117
|
+
ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
|
130
118
|
is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
|
131
119
|
# from public_xml rels-ext
|
132
120
|
druids = []
|
133
|
-
is_member_of_nodes.each
|
121
|
+
is_member_of_nodes.each do |n|
|
134
122
|
druids << n.value.split('druid:').last unless n.value.empty?
|
135
|
-
|
123
|
+
end
|
136
124
|
return nil if druids.empty?
|
137
125
|
druids
|
138
126
|
end
|
139
|
-
|
127
|
+
|
140
128
|
# the value of the type attribute for a DOR object's contentMetadata
|
141
129
|
# more info about these values is here:
|
142
130
|
# https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
|
143
131
|
# https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
|
144
|
-
# @return [String]
|
132
|
+
# @return [String]
|
145
133
|
def parse_dor_content_type
|
146
134
|
content_md = parse_content_metadata
|
147
135
|
dct = content_md ? content_md.xpath('contentMetadata/@type').text : nil
|
148
136
|
DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
|
149
137
|
dct
|
150
138
|
end
|
151
|
-
|
139
|
+
|
152
140
|
# the value of the displayType tag from a DOR collection's identityMetadata
|
153
|
-
# @return [String]
|
141
|
+
# @return [String]
|
154
142
|
def parse_dor_display_type
|
155
143
|
identity_md = parse_identity_metadata
|
156
144
|
ddt = identity_md ? identity_md.xpath('//displayType').text : nil
|
@@ -161,43 +149,44 @@ module DiscoveryIndexer
|
|
161
149
|
# the @id attribute of resource/file elements that match the image type, including extension
|
162
150
|
# @return [Array<String>] filenames
|
163
151
|
def parse_image_ids
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
}
|
170
|
-
return nil if ids.empty?
|
171
|
-
ids
|
152
|
+
ids = []
|
153
|
+
content_md = parse_content_metadata
|
154
|
+
return nil if content_md.nil?
|
155
|
+
content_md.xpath('//resource[@type="image"]/file/@id').each do |node|
|
156
|
+
ids << node.text unless node.text.empty?
|
172
157
|
end
|
158
|
+
content_md.xpath('//resource[@type="page"]/file/@id').each do |node|
|
159
|
+
ids << node.text unless node.text.empty?
|
160
|
+
end
|
161
|
+
return nil if ids.empty?
|
162
|
+
ids
|
173
163
|
end
|
174
164
|
|
175
165
|
def parse_sourceid
|
176
166
|
get_value(@purlxml_ng_doc.css('//identityMetadata/sourceId'))
|
177
167
|
end
|
178
|
-
|
168
|
+
|
179
169
|
def parse_copyright
|
180
170
|
get_value(@purlxml_ng_doc.css('//rightsMetadata/copyright/human[type="copyright"]'))
|
181
171
|
end
|
182
|
-
|
172
|
+
|
183
173
|
def parse_use_and_reproduction
|
184
174
|
get_value(@purlxml_ng_doc.css('//rightsMetadata/use/human[type="useAndReproduction"]'))
|
185
175
|
end
|
186
|
-
|
176
|
+
|
187
177
|
# the @id attribute of resource/file elements, including extension
|
188
178
|
# @return [Array<String>] filenames
|
189
179
|
def parse_file_ids
|
190
180
|
ids = []
|
191
181
|
content_md = parse_content_metadata
|
192
|
-
unless content_md.nil?
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
182
|
+
return unless content_md.nil?
|
183
|
+
content_md.xpath('//resource/file/@id').each do |node|
|
184
|
+
ids << node.text unless node.text.empty?
|
185
|
+
end
|
186
|
+
return nil if ids.empty?
|
187
|
+
ids
|
188
|
+
end
|
189
|
+
|
201
190
|
# @return catkey value from the DOR identity_metadata, or nil if there is no catkey
|
202
191
|
def parse_catkey
|
203
192
|
get_value(@purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']"))
|
@@ -210,23 +199,22 @@ module DiscoveryIndexer
|
|
210
199
|
|
211
200
|
# @return objectLabel value from the DOR identity_metadata, or nil if there is no objectLabel
|
212
201
|
def parse_label
|
213
|
-
get_value(@purlxml_ng_doc.xpath(
|
202
|
+
get_value(@purlxml_ng_doc.xpath('/publicObject/identityMetadata/objectLabel'))
|
214
203
|
end
|
215
|
-
|
204
|
+
|
216
205
|
def get_value(node)
|
217
|
-
(node && node.first) ? node.first.content : nil
|
206
|
+
(node && node.first) ? node.first.content : nil
|
218
207
|
end
|
219
|
-
|
208
|
+
|
220
209
|
def to_boolean(text)
|
221
|
-
if text.nil? || text.empty?
|
210
|
+
if text.nil? || text.empty?
|
222
211
|
return false
|
223
|
-
elsif text.downcase.eql?(
|
212
|
+
elsif text.downcase.eql?('true') || text.downcase == 't'
|
224
213
|
return true
|
225
214
|
else
|
226
215
|
return false
|
227
216
|
end
|
228
|
-
end
|
217
|
+
end
|
229
218
|
end
|
230
219
|
end
|
231
220
|
end
|
232
|
-
|
@@ -3,14 +3,13 @@ require 'open-uri'
|
|
3
3
|
module DiscoveryIndexer
|
4
4
|
module InputXml
|
5
5
|
class PurlxmlReader
|
6
|
-
|
7
6
|
# reads the public xml for the fedora object that is defined , from the purl server
|
8
7
|
# @param [String] druid e.g. ab123cd4567
|
9
8
|
# @return [Nokogiri::XML::Document] the public xml for the fedora object
|
10
9
|
# @raise [MissingPublicXml] if there's no purl xml available for this druid
|
11
10
|
def self.read(druid)
|
12
11
|
purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
|
13
|
-
|
12
|
+
|
14
13
|
begin
|
15
14
|
purlxml_object = Nokogiri::XML(open(purlxml_uri))
|
16
15
|
return purlxml_object
|
@@ -20,4 +19,4 @@ module DiscoveryIndexer
|
|
20
19
|
end
|
21
20
|
end
|
22
21
|
end
|
23
|
-
end
|
22
|
+
end
|
data/lib/version.rb
CHANGED
data/lib/writer/solr_client.rb
CHANGED
@@ -3,6 +3,7 @@ require 'rsolr'
|
|
3
3
|
require 'rest-client'
|
4
4
|
module DiscoveryIndexer
|
5
5
|
module Writer
|
6
|
+
# Processes adds and deletes to the solr core
|
6
7
|
class SolrClient
|
7
8
|
include DiscoveryIndexer::Logging
|
8
9
|
|
@@ -13,7 +14,7 @@ module DiscoveryIndexer
|
|
13
14
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
14
15
|
# @param max_retries [Integer] the maximum number of tries before fail
|
15
16
|
def self.add(id, solr_doc, solr_connector, max_retries = 10)
|
16
|
-
process(id, solr_doc, solr_connector, max_retries,
|
17
|
+
process(id, solr_doc, solr_connector, max_retries, false)
|
17
18
|
end
|
18
19
|
|
19
20
|
# Delete the document from solr, retry if an error occurs.
|
@@ -22,79 +23,76 @@ module DiscoveryIndexer
|
|
22
23
|
# @param solr_connector[RSolr::Client] is an open connection with the solr core
|
23
24
|
# @param max_retries [Integer] the maximum number of tries before fail
|
24
25
|
def self.delete(id, solr_connector, max_retries = 10)
|
25
|
-
process(id, {}, solr_connector, max_retries,
|
26
|
+
process(id, {}, solr_connector, max_retries, true)
|
26
27
|
end
|
27
28
|
|
28
29
|
# It's an internal method that receives all the requests and deal with
|
29
30
|
# SOLR core. This method can call add, delete, or update
|
30
31
|
#
|
31
32
|
# @param id [String] the document id, usually it will be druid.
|
32
|
-
# @param solr_doc [Hash] is the solr doc in hash format
|
33
|
+
# @param solr_doc [Hash] is the solr doc in hash format
|
33
34
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
34
35
|
# @param max_retries [Integer] the maximum number of tries before fail
|
35
|
-
def self.process(id, solr_doc, solr_connector, max_retries, is_delete=false)
|
36
|
-
handler =
|
36
|
+
def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
|
37
|
+
handler = proc do |exception, attempt_number, _total_delay|
|
37
38
|
DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
|
38
39
|
end
|
39
|
-
|
40
|
-
with_retries(:
|
40
|
+
|
41
|
+
with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
|
41
42
|
DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
|
42
|
-
|
43
|
+
|
43
44
|
if is_delete
|
44
45
|
DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
|
45
46
|
solr_connector.delete_by_id(id)
|
46
|
-
elsif allow_update?(solr_connector) && doc_exists?(id,solr_connector)
|
47
|
+
elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
|
47
48
|
DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
|
48
|
-
update_solr_doc(id,solr_doc,solr_connector)
|
49
|
+
update_solr_doc(id, solr_doc, solr_connector)
|
49
50
|
else
|
50
51
|
DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
|
51
52
|
solr_connector.add(solr_doc)
|
52
53
|
end
|
53
54
|
solr_connector.commit
|
54
55
|
DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
|
55
|
-
|
56
56
|
end
|
57
57
|
end
|
58
58
|
|
59
59
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
60
60
|
# @return [Boolean] true if the solr core allowing update feature
|
61
61
|
def self.allow_update?(solr_connector)
|
62
|
-
|
62
|
+
solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
|
63
63
|
end
|
64
64
|
|
65
65
|
# @param id [String] the document id, usually it will be druid.
|
66
66
|
# @param solr_connector [RSolr::Client] is an open connection with the solr core
|
67
67
|
# @return [Boolean] true if the solr doc defined by this id exists
|
68
|
-
def self.doc_exists?(id,solr_connector)
|
69
|
-
response=solr_connector.get 'select', :
|
68
|
+
def self.doc_exists?(id, solr_connector)
|
69
|
+
response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
|
70
70
|
response['response']['numFound'] == 1
|
71
71
|
end
|
72
|
-
|
72
|
+
|
73
73
|
# It is an internal method that updates the solr doc instead of adding a new one.
|
74
|
-
def self.update_solr_doc(id,solr_doc,solr_connector)
|
74
|
+
def self.update_solr_doc(id, solr_doc, solr_connector)
|
75
75
|
# update_solr_doc can't use RSolr because updating a hash doc is not supported
|
76
76
|
# so we need to build the json input manually
|
77
77
|
solr_url = solr_connector.options[:url]
|
78
|
-
if solr_url.end_with?(
|
79
|
-
url="#{solr_connector.options[:url]}update?commit=true"
|
78
|
+
if solr_url.end_with?('/')
|
79
|
+
url = "#{solr_connector.options[:url]}update?commit=true"
|
80
80
|
else
|
81
|
-
url="#{solr_connector.options[:url]}/update?commit=true"
|
81
|
+
url = "#{solr_connector.options[:url]}/update?commit=true"
|
82
82
|
end
|
83
|
-
|
84
|
-
params="[{\"id\":\"#{id}\","
|
85
|
-
solr_doc.each do |field_name,new_values|
|
86
|
-
unless field_name == :id
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
83
|
+
|
84
|
+
params = "[{\"id\":\"#{id}\","
|
85
|
+
solr_doc.each do |field_name, new_values|
|
86
|
+
next unless field_name == :id
|
87
|
+
params += "\"#{field_name}\":"
|
88
|
+
new_values = [new_values] unless new_values.class == Array
|
89
|
+
new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
|
90
|
+
params += "{\"set\":[\"#{new_values.join('","')}\"]},"
|
92
91
|
end
|
93
92
|
params.chomp!(',')
|
94
|
-
params+=
|
95
|
-
RestClient.post url, params
|
93
|
+
params += '}]'
|
94
|
+
RestClient.post url, params, content_type: :json, accept: :json
|
96
95
|
end
|
97
|
-
|
98
96
|
end
|
99
97
|
end
|
100
|
-
end
|
98
|
+
end
|
data/lib/writer/solr_writer.rb
CHANGED
@@ -3,57 +3,57 @@ require 'rsolr'
|
|
3
3
|
|
4
4
|
module DiscoveryIndexer
|
5
5
|
module Writer
|
6
|
+
# Performs writes to solr client based upon true and false release flags
|
6
7
|
class SolrWriter
|
7
8
|
include DiscoveryIndexer::Logging
|
8
|
-
|
9
|
+
|
9
10
|
def process(id, index_doc, targets, solr_targets_configs)
|
10
11
|
@solr_targets_configs = solr_targets_configs
|
11
12
|
index_targets = []
|
12
13
|
delete_targets = []
|
13
|
-
targets.keys.each do |target|
|
14
|
-
if targets[target]
|
14
|
+
targets.keys.each do |target|
|
15
|
+
if targets[target]
|
15
16
|
index_targets.append(target)
|
16
17
|
else
|
17
18
|
delete_targets.append(target)
|
18
19
|
end
|
19
20
|
end
|
20
|
-
|
21
|
+
|
21
22
|
# get targets with true
|
22
23
|
solr_index_client(id, index_doc, index_targets)
|
23
24
|
# get targets with false
|
24
25
|
solr_delete_client(id, delete_targets)
|
25
26
|
end
|
26
|
-
|
27
|
+
|
27
28
|
def solr_delete_from_all(id, solr_targets_configs)
|
28
29
|
# Get a list of all registered targets
|
29
|
-
@solr_targets_configs=solr_targets_configs
|
30
|
-
targets = @solr_targets_configs.keys
|
30
|
+
@solr_targets_configs = solr_targets_configs
|
31
|
+
targets = @solr_targets_configs.keys
|
31
32
|
solr_delete_client(id, targets)
|
32
33
|
end
|
33
|
-
|
34
|
+
|
34
35
|
def solr_index_client(id, index_doc, targets)
|
35
36
|
targets.each do |solr_target|
|
36
|
-
solr_connector = get_connector_for_target(solr_target)
|
37
|
+
solr_connector = get_connector_for_target(solr_target)
|
37
38
|
SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
|
38
|
-
end
|
39
|
+
end
|
39
40
|
end
|
40
|
-
|
41
|
+
|
41
42
|
def solr_delete_client(id, targets)
|
42
43
|
targets.each do |solr_target|
|
43
44
|
solr_connector = get_connector_for_target(solr_target)
|
44
45
|
SolrClient.delete(id, solr_connector) unless solr_connector.nil?
|
45
|
-
end
|
46
|
+
end
|
46
47
|
end
|
47
48
|
|
48
49
|
def get_connector_for_target(solr_target)
|
49
50
|
solr_connector = nil
|
50
|
-
if @solr_targets_configs.keys.include?(solr_target)
|
51
|
+
if @solr_targets_configs.keys.include?(solr_target)
|
51
52
|
config = @solr_targets_configs[solr_target]
|
52
53
|
solr_connector = RSolr.connect(config.deep_symbolize_keys)
|
53
54
|
end
|
54
|
-
|
55
|
+
solr_connector
|
55
56
|
end
|
56
|
-
|
57
57
|
end
|
58
58
|
end
|
59
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: discovery-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.6
|
4
|
+
version: 0.9.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed AlSum
|
8
|
+
- Laney McGlohon
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2015-
|
12
|
+
date: 2015-09-23 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: nokogiri
|
@@ -138,7 +139,7 @@ dependencies:
|
|
138
139
|
version: '0'
|
139
140
|
description: This library manages the core operations for the discovery indexing such
|
140
141
|
as reading PURL xml, mapping to the solr document, and writing to solr core.
|
141
|
-
email:
|
142
|
+
email: laneymcg@stanford.edu
|
142
143
|
executables: []
|
143
144
|
extensions: []
|
144
145
|
extra_rdoc_files: []
|