discovery-indexer 0.0.1 → 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/discovery-indexer.rb +22 -0
- data/lib/errors.rb +13 -0
- data/lib/mapper/general_mapper.rb +18 -0
- data/lib/mapper/index_mapper.rb +179 -0
- data/lib/reader/modsxml.rb +44 -0
- data/lib/reader/modsxml_reader.rb +23 -0
- data/lib/reader/purlxml.rb +43 -0
- data/lib/reader/purlxml_model.rb +29 -0
- data/lib/reader/purlxml_parser.rb +13 -0
- data/lib/reader/purlxml_parser_strict.rb +210 -0
- data/lib/reader/purlxml_reader.rb +23 -0
- data/lib/utilities/extract_sub_targets.rb +13 -0
- data/lib/version.rb +1 -1
- data/lib/writer/solr_client.rb +51 -0
- data/lib/writer/solr_writer.rb +61 -0
- metadata +130 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 073a6699afc6ef96ee080e00be2dfe3180bfb734
|
4
|
+
data.tar.gz: e4361f9aad38081a92598e53fb0f55e564807812
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29e0c0dd830b66cc7cfe61bd4da7a42d3ee6fbb55721bf01ecf1f4b1679585d7aae9cc7c6f2a35d77b80b0017b45babea77516b9a4373191b7c8899b0a9a371c
|
7
|
+
data.tar.gz: 09c61dd4fa8501865bc4081dcb584224de912921eae74bb34121d6aafd05f4a762f45cdaffdf2c0a0282d318bc501629dcb0f1228c48d1fcba9ca769d4ab07db
|
data/lib/discovery-indexer.rb
CHANGED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'reader/purlxml'
|
2
|
+
require 'reader/purlxml_reader'
|
3
|
+
require 'reader/purlxml_parser'
|
4
|
+
require 'reader/purlxml_parser_strict'
|
5
|
+
require 'reader/purlxml_model'
|
6
|
+
|
7
|
+
require 'reader/modsxml'
|
8
|
+
require 'reader/modsxml_reader'
|
9
|
+
|
10
|
+
require 'mapper/general_mapper'
|
11
|
+
require 'mapper/index_mapper'
|
12
|
+
|
13
|
+
require 'writer/solr_client'
|
14
|
+
require 'writer/solr_writer'
|
15
|
+
|
16
|
+
#require 'utilities/extract_sub_targets'
|
17
|
+
|
18
|
+
require 'errors'
|
19
|
+
|
20
|
+
# Top-level namespace of the discovery-indexer gem.
module DiscoveryIndexer
  # Base URL that is interpolated into "<base>/<druid>.xml" and
  # "<base>/<druid>.mods" when the readers fetch documents.
  # NOTE(review): this is the purl-test host -- confirm that is the intended default.
  PURL_DEFAULT = "http://purl-test.stanford.edu"
end
|
data/lib/errors.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
module DiscoveryIndexer
  # Exception classes used by the gem. Each one derives directly from
  # StandardError, so a bare +rescue+ catches all of them.
  module Errors
    class MissingPurlPage < StandardError; end

    class MissingMods < StandardError; end

    class MissingPublicXml < StandardError; end

    class MissingContentMetadata < StandardError; end

    class MissingIdentityMetadata < StandardError; end

    class MissingRightsMetadata < StandardError; end

    class MissingRDF < StandardError; end

    class MissingDC < StandardError; end

    class MissingModsPage < StandardError; end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module DiscoveryIndexer
  module Mapper
    # Base mapper: stores the inputs a mapper works from and exposes a
    # no-op #map for subclasses to override.
    class GeneralMapper
      # @param druid the object identifier, stored as @druid
      # @param modsxml the MODS representation, stored as @modsxml
      # @param purlxml the purl xml representation, stored as @purlxml
      # @param [Hash] collection_names stored as @collection_names (defaults to an empty Hash)
      def initialize(druid, modsxml, purlxml, collection_names = {})
        @druid, @modsxml, @purlxml = druid, modsxml, purlxml
        @collection_names = collection_names
      end

      # Does nothing in the base class.
      # @return [nil]
      def map
        nil
      end
    end
  end
end
|
18
|
+
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module DiscoveryIndexer
  module Mapper
    # This class is responsible for creating the solr_doc hash based on the input
    # of druid_id, modsxml, purlxml, and optional hash of collection_names
    class IndexMapper < GeneralMapper
      # Initializes an instance from IndexMapper
      # @param [String] druid e.g. ab123cd4567
      # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
      # @param [DiscoveryIndexer::InputXml::PurlxmlModel] purlxml represents the purlxml model
      # @param [Hash] collection_names represents a hash of collection_druid and
      #  collection_name !{"aa111aa1111"=>"First Collection", "bb123bb1234"=>"Second Collection"}
      def initialize(druid, modsxml, purlxml, collection_names = {})
        super(druid, modsxml, purlxml, collection_names)
      end

      # Create a Hash representing a Solr doc, with all MODS related fields populated.
      # :id is set first; each group of fields is then merged in, and finally
      # :all_search receives the MODS text with every whitespace run collapsed
      # to a single space.
      # @return [Hash] Hash representing the Solr document
      def map
        solr_doc = { :id => @druid }
        solr_doc.update(mods_to_title_fields)
        solr_doc.update(mods_to_author_fields)
        solr_doc.update(mods_to_subject_search_fields)
        solr_doc.update(mods_to_publication_fields)
        solr_doc.update(mods_to_pub_date)
        solr_doc.update(mods_to_others)

        solr_doc[:all_search] = @modsxml.text.gsub(/\s+/, ' ')
        solr_doc
      end

      # @return [Hash] Hash representing the title fields
      def mods_to_title_fields
        {
          :title_245a_search => @modsxml.sw_short_title,
          :title_245_search => @modsxml.sw_full_title,
          :title_variant_search => @modsxml.sw_addl_titles,
          :title_sort => @modsxml.sw_sort_title,
          :title_245a_display => @modsxml.sw_short_title,
          :title_display => @modsxml.sw_title_display,
          :title_full_display => @modsxml.sw_full_title
        }
      end

      # @return [Hash] Hash representing the author fields
      def mods_to_author_fields
        {
          :author_1xx_search => @modsxml.sw_main_author,
          :author_7xx_search => @modsxml.sw_addl_authors,
          :author_person_facet => @modsxml.sw_person_authors,
          :author_other_facet => @modsxml.sw_impersonal_authors,
          # the first character of the sort string is dropped
          # NOTE(review): presumably a sort-order prefix added by stanford-mods -- confirm
          :author_sort => @modsxml.sw_sort_author[1..-1],
          :author_corp_display => @modsxml.sw_corporate_authors,
          :author_meeting_display => @modsxml.sw_meeting_authors,
          :author_person_display => @modsxml.sw_person_authors,
          :author_person_full_display => @modsxml.sw_person_authors
        }
      end

      # @return [Hash] Hash representing the subject search and facet fields
      def mods_to_subject_search_fields
        {
          :topic_search => @modsxml.topic_search,
          :geographic_search => @modsxml.geographic_search,
          :subject_other_search => @modsxml.subject_other_search,
          :subject_other_subvy_search => @modsxml.subject_other_subvy_search,
          :subject_all_search => @modsxml.subject_all_search,
          :topic_facet => @modsxml.topic_facet,
          :geographic_facet => @modsxml.geographic_facet,
          :era_facet => @modsxml.era_facet
        }
      end

      # @return [Hash] Hash representing the publication fields
      def mods_to_publication_fields
        {
          :pub_search => @modsxml.place,
          :pub_date_sort => @modsxml.pub_date_sort,
          :imprint_display => @modsxml.pub_date_display,
          :pub_date => @modsxml.pub_date_facet,
          :pub_date_display => @modsxml.pub_date_display # pub_date_display may be deprecated
        }
      end

      # Adds the year fields, but only when the sortable publication date is a
      # non-negative integer.
      # @return [Hash] Hash representing the pub date; empty when there is no usable year
      def mods_to_pub_date
        doc_hash = {}
        pub_date_sort = @modsxml.pub_date_sort
        if is_positive_int?(pub_date_sort)
          doc_hash[:pub_year_tisim] = pub_date_sort # for date slider
          # put the displayable year in the correct field, :creation_year_isi for example
          year_field = date_type_sym
          doc_hash[year_field] = pub_date_sort if year_field
        end
        doc_hash
      end

      # @return [Hash] Hash representing format, language, physical description,
      #   summary, table of contents and supplementary url fields
      def mods_to_others
        {
          :format_main_ssim => format_main_ssim,
          :format => format, # for backwards compatibility
          :language => @modsxml.sw_language_facet,
          :physical => @modsxml.term_values([:physical_description, :extent]),
          :summary_search => @modsxml.term_values(:abstract),
          :toc_search => @modsxml.term_values(:tableOfContents),
          :url_suppl => @modsxml.term_values([:related_item, :location, :url])
        }
      end

      # select one or more format values from the controlled vocabulary here:
      #   http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index
      # via stanford-mods gem. Prints a warning to stdout when no value is found.
      # Note that inside this class the method shadows Kernel#format.
      # @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
      def format
        vals = @modsxml.format
        if vals.empty?
          puts "#{@druid} has no SearchWorks format from MODS - check <typeOfResource> and other implicated MODS elements"
        end
        vals
      end

      # call stanford-mods format_main to get results; prints a warning to
      # stdout when no value is found
      # @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
      def format_main_ssim
        vals = @modsxml.format_main
        if vals.empty?
          puts "#{@druid} has no SearchWorks Resource Type from MODS - check <typeOfResource> and other implicated MODS elements"
        end
        vals
      end

      # call stanford-mods sw_genre to get results
      # @return [Array<String>] value(s)
      def genre_ssim
        @modsxml.sw_genre
      end

      protected

      # Strict check: the whole value must be a base-10 integer. (String#to_i
      # would quietly turn nil, "" or "unknown" into 0 and accept them.)
      # @param str the candidate value; nil and non-String values are checked via #to_s
      # @return true if the string parses into an int, and if so, the int is >= 0
      def is_positive_int?(str)
        Integer(str.to_s, 10) >= 0
      rescue ArgumentError
        false
      end

      # determines particular flavor of displayable publication year field:
      # dateIssued wins over dateCreated
      # @return Solr field name as a symbol, or nil when neither date is present
      def date_type_sym
        issued = @modsxml.term_values([:origin_info, :dateIssued])
        return :publication_year_isi if issued && issued.length > 0

        created = @modsxml.term_values([:origin_info, :dateCreated])
        return :creation_year_isi if created && created.length > 0

        nil
      end
    end
  end
end
|
179
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'stanford-mods'
|
2
|
+
module DiscoveryIndexer
  module InputXml
    # Main entry point for obtaining the MODS xml of an object, as retrieved
    # by ModsxmlReader, wrapped in a Stanford::Mods::Record.
    # @example to run the code
    #  druid = "aa111aa1111"
    #  p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
    #  model = p.load()
    class Modsxml
      # initializes a new object; nothing is fetched until #load or #reload
      # @param druid [String] the druid object in the format "aa111aa1111"
      def initialize(druid)
        @druid = druid
        @modsxml_ng_doc = nil
      end

      # Loads the mods xml into a stanford mods model for the object defined
      # by the druid. The xml is read through ModsxmlReader only on the first
      # call; a fresh record is built from the kept document on every call.
      # @return [Stanford::Mods::Record] represents the mods xml
      def load
        @modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?

        Stanford::Mods::Record.new.tap do |record|
          record.from_nk_node(@modsxml_ng_doc)
        end
      end

      # Same as #load, but always reads the mods xml through ModsxmlReader
      # again first, replacing the kept document.
      # @return [Stanford::Mods::Record] represents the mods xml
      def reload
        @modsxml_ng_doc = ModsxmlReader.read(@druid)
        load
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module DiscoveryIndexer
  module InputXml
    # Fetches the MODS xml of an object over HTTP.
    class ModsxmlReader
      # reads the mods xml for the fedora object that is defined, from
      # "<PURL_DEFAULT>/<druid>.mods"
      # @param [String] druid e.g. ab123cd4567
      # @return [Nokogiri::XML::Document] the mods xml for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingModsPage] if fetching or parsing
      #   fails for any StandardError; the message is the URI that was requested
      def self.read(druid)
        mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"

        begin
          # URI#open is provided by open-uri. Unlike Kernel#open it keeps working
          # on Ruby 3 (where Kernel#open no longer accepts URLs), and the block
          # form closes the response stream once Nokogiri has read it.
          URI.parse(mods_uri).open { |io| Nokogiri::XML(io) }
        rescue StandardError
          raise DiscoveryIndexer::Errors::MissingModsPage, mods_uri
        end
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module DiscoveryIndexer
  module InputXml
    # Main entry point for obtaining the purl xml of an object, as retrieved
    # by PurlxmlReader and parsed by PurlxmlParserStrict.
    # @example to run the code
    #  druid = "aa111aa1111"
    #  p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
    #  model = p.load()
    class Purlxml
      # initializes a new object; nothing is fetched until #load or #reload
      # @param druid [String] the druid object in the format "aa111aa1111"
      def initialize(druid)
        @druid = druid
        @purlxml_ng_doc = nil
      end

      # Loads the purl xml into a purlxml model for the object defined by the
      # druid. The xml is read through PurlxmlReader only on the first call;
      # the kept document is parsed again on every call.
      # @return [PurlxmlModel] represents the purlxml
      def load
        @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?

        PurlxmlParserStrict.new(@purlxml_ng_doc).parse
      end

      # Same as #load, but always reads the purl xml through PurlxmlReader
      # again first, replacing the kept document.
      # @return [PurlxmlModel] represents the purlxml
      def reload
        @purlxml_ng_doc = PurlxmlReader.read(@druid)
        load
      end
    end
  end
end
|
43
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module DiscoveryIndexer
  module InputXml
    # Plain value holder for the pieces extracted from a purl public xml
    # document. Every attribute is readable and writable and starts out nil.
    class PurlxmlModel
      # the documents / sub-documents
      attr_accessor :public_xml,
                    :content_metadata,
                    :identity_metadata,
                    :rights_metadata,
                    :dc,
                    :rdf

      # values derived from those documents
      # (:dor_content_type is declared once; it used to be listed twice)
      attr_accessor :release_tags_hash,
                    :dor_content_type,
                    :is_collection,
                    :collection_druids,
                    :file_ids,
                    :image_ids,
                    :catkey,
                    :barcode,
                    :label
    end
  end
end
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
|
29
|
+
|
@@ -0,0 +1,210 @@
|
|
1
|
+
module DiscoveryIndexer
  module InputXml
    # Parses the purl public xml into a PurlxmlModel. "Strict" because the
    # identityMetadata, rightsMetadata, dc and rdf sections are required: a
    # missing one raises instead of being left empty.
    class PurlxmlParserStrict < PurlxmlParser
      RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
      OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
      MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'

      # @param [Nokogiri::XML::Document] purlxml_ng_doc the public xml; forwarded unchanged
      #   to PurlxmlParser#initialize
      #   NOTE(review): presumably stored there as @purlxml_ng_doc, which every
      #   method below reads -- confirm in purlxml_parser.rb
      def initialize(purlxml_ng_doc)
        super
      end

      # it parses the purlxml into a purlxml model
      # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
      # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata, DiscoveryIndexer::Errors::MissingRightsMetadata,
      #   DiscoveryIndexer::Errors::MissingDC, DiscoveryIndexer::Errors::MissingRDF] if a required section is absent
      def parse
        purlxml_model = PurlxmlModel.new
        purlxml_model.public_xml = @purlxml_ng_doc
        purlxml_model.content_metadata = parse_content_metadata
        purlxml_model.identity_metadata = parse_identity_metadata
        purlxml_model.rights_metadata = parse_rights_metadata
        purlxml_model.dc = parse_dc
        purlxml_model.rdf = parse_rdf
        purlxml_model.is_collection = parse_is_collection
        purlxml_model.collection_druids = parse_collection_druids
        purlxml_model.dor_content_type = parse_dor_content_type
        purlxml_model.release_tags_hash = parse_release_tags_hash
        purlxml_model.file_ids = parse_file_ids
        purlxml_model.image_ids = parse_image_ids
        purlxml_model.catkey = parse_catkey
        purlxml_model.barcode = parse_barcode
        purlxml_model.label = parse_label
        purlxml_model
      end

      # extracts the identityMetadata for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identityMetadata
      def parse_identity_metadata
        extract_required_document(DiscoveryIndexer::Errors::MissingIdentityMetadata) do
          @purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml
        end
      end

      # extracts the rightsMetadata for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the rightsMetadata for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingRightsMetadata] if there is no rightsMetadata
      def parse_rights_metadata
        extract_required_document(DiscoveryIndexer::Errors::MissingRightsMetadata) do
          @purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml
        end
      end

      # extracts the dc field for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the dc for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
      def parse_dc
        extract_required_document(DiscoveryIndexer::Errors::MissingDC) do
          @purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(:encoding => 'utf-8')
        end
      end

      # extracts the rdf field for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the rdf for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
      def parse_rdf
        extract_required_document(DiscoveryIndexer::Errors::MissingRDF) do
          @purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml
        end
      end

      # extracts the release tag elements for this fedora object, from the identity metadata in purl xml.
      # Keys are the raw "to" attribute values and values are the raw element
      # text (Strings, not booleans); <release> elements without a "to"
      # attribute are skipped.
      # @return [Hash] the release tags for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identityMetadata
      def parse_release_tags_hash
        release_tags = {}
        parse_identity_metadata.xpath('//release').each do |release|
          release_target = release.attr('to')
          next if release_target.nil?

          text = release.text
          release_tags[release_target] = text unless text.nil?
        end
        release_tags
      end

      # extracts the contentMetadata for this fedora object, from the purl xml.
      # Unlike the other sections this one is optional: nothing is raised when
      # it is absent (the strict check was deliberately disabled), and the
      # result is then an empty document.
      # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
      def parse_content_metadata
        Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
      end

      # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
      # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identityMetadata
      def parse_is_collection
        # The identityMetadata is a standalone *document*, so the path has to
        # start at its root element; a path relative to the document node
        # ('./objectType') can never match.
        object_type_nodes = parse_identity_metadata.xpath('/identityMetadata/objectType')
        object_type_nodes.any? { |n| n.text == 'collection' }
      end

      # get the druids from isMemberOfCollection relationships in rels-ext from public_xml
      # @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfColletion relationship with, or nil if none
      def parse_collection_druids
        ns_hash = { 'rdf' => RDF_NAMESPACE, 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
        is_member_of_nodes = @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
        # from public_xml rels-ext: keep what follows "druid:" in each non-empty value
        druids = is_member_of_nodes.reject { |n| n.value.empty? }.map { |n| n.value.split('druid:').last }
        druids.empty? ? nil : druids
      end

      # the value of the type attribute for a DOR object's contentMetadata
      #  more info about these values is here:
      #    https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
      #    https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
      # Prints a warning to stdout when the value is missing.
      # @return [String] the type, or "" when there is none
      def parse_dor_content_type
        content_md = parse_content_metadata
        # anchored at the root element: '@type' relative to the document node never matches
        dct = content_md ? content_md.xpath('/contentMetadata/@type').text : nil
        puts " has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
        dct
      end

      # the @id attribute of file elements inside resources of type "image", including extension
      # @return [Array<String>] filenames, or nil if none
      def parse_image_ids
        content_file_ids('/contentMetadata/resource[@type="image"]/file/@id')
      end

      # the @id attribute of every resource/file element, including extension
      # @return [Array<String>] filenames, or nil if none
      def parse_file_ids
        content_file_ids('/contentMetadata/resource/file/@id')
      end

      # @return [String] content of the first identityMetadata otherId named "catkey", or nil
      def parse_catkey
        first_node_content("/publicObject/identityMetadata/otherId[@name='catkey']")
      end

      # @return [String] content of the first identityMetadata otherId named "barcode", or nil
      def parse_barcode
        first_node_content("/publicObject/identityMetadata/otherId[@name='barcode']")
      end

      # @return [String] content of the first identityMetadata objectLabel, or nil
      def parse_label
        first_node_content('/publicObject/identityMetadata/objectLabel')
      end

      private

      # Builds a standalone document from the xml String the block returns and
      # insists that it is not empty.
      # @param [Class] error_class raised (with the inspected public xml as message) when the
      #   section is empty or when anything goes wrong while extracting it
      # @yieldreturn [String] serialized xml of the wanted section
      # @return [Nokogiri::XML::Document]
      def extract_required_document(error_class)
        ng_doc = Nokogiri::XML(yield)
        raise error_class, @purlxml_ng_doc.inspect if !ng_doc || ng_doc.children.empty?
        ng_doc
      rescue StandardError
        raise error_class, @purlxml_ng_doc.inspect
      end

      # @param [String] xpath root-anchored path selecting id attributes in the contentMetadata document
      # @return [Array<String>] the non-empty attribute values, or nil if none
      def content_file_ids(xpath)
        content_md = parse_content_metadata
        return nil if content_md.nil?

        ids = content_md.xpath(xpath).map(&:text).reject(&:empty?)
        ids.empty? ? nil : ids
      end

      # @param [String] xpath path evaluated against the whole public xml
      # @return [String] content of the first matching node, or nil
      def first_node_content(xpath)
        nodes = @purlxml_ng_doc.xpath(xpath)
        first = nodes && nodes.first
        first ? first.content : nil
      end
    end
  end
end
|
210
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module DiscoveryIndexer
  module InputXml
    # Fetches the public xml of an object over HTTP.
    class PurlxmlReader
      # reads the public xml for the fedora object that is defined, from
      # "<PURL_DEFAULT>/<druid>.xml"
      # @param [String] druid e.g. ab123cd4567
      # @return [Nokogiri::XML::Document] the public xml for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingPurlPage] if fetching or parsing
      #   fails for any StandardError; the message is the URI that was requested
      def self.read(druid)
        purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"

        begin
          # URI#open is provided by open-uri. Unlike Kernel#open it keeps working
          # on Ruby 3 (where Kernel#open no longer accepts URLs), and the block
          # form closes the response stream once Nokogiri has read it.
          URI.parse(purlxml_uri).open { |io| Nokogiri::XML(io) }
        rescue StandardError
          raise DiscoveryIndexer::Errors::MissingPurlPage, purlxml_uri
        end
      end
    end
  end
end
|
data/lib/version.rb
CHANGED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'logger'
require 'retries'
require 'rsolr'
|
3
|
+
|
4
|
+
module DiscoveryIndexer
  module Writer
    # Sends a single add or delete to a solr core, retrying on errors, and
    # commits afterwards. Progress is logged to STDOUT.
    class SolrClient
      # Add the document to solr, retry if an error occurs.
      # See https://github.com/ooyala/retries for docs on with_retries.
      # @param [Hash] solr_doc a Hash representation of the solr document
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      def self.add(solr_doc, solr_connector, max_retries = 10)
        process(solr_doc, solr_connector, max_retries, false)
      end

      # Delete the document from solr, retry if an error occurs.
      # See https://github.com/ooyala/retries for docs on with_retries.
      # @param [Hash] solr_doc that has only the id !{:id=>"ab123cd4567"}
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      def self.delete(solr_doc, solr_connector, max_retries = 10)
        process(solr_doc, solr_connector, max_retries, true)
      end

      # Shared implementation of .add and .delete.
      # @param [Hash] solr_doc the document; only solr_doc[:id] is used when deleting
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries passed to with_retries as :max_tries
      # @param [Boolean] is_delete true to delete by id, false to add the document
      def self.process(solr_doc, solr_connector, max_retries, is_delete = false)
        logger = Logger.new(STDOUT)
        id = solr_doc[:id]

        # handed to with_retries as its :handler; logs each failure
        handler = proc do |exception, attempt_number, _total_delay|
          logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
        end

        with_retries(:max_tries => max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
          logger.debug "Attempt #{attempt} for #{id}"

          if is_delete
            solr_connector.delete_by_id(id)
            logger.info "Successfully deleted #{id} on attempt #{attempt}"
          else
            solr_connector.add(solr_doc)
            logger.info "Successfully indexed #{id} on attempt #{attempt}"
          end
        end
        # the commit happens once, outside the retried section
        solr_connector.commit
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'retries'
|
2
|
+
require 'rsolr'
|
3
|
+
|
4
|
+
module DiscoveryIndexer
  module Writer
    # Routes one object to the configured solr targets: indexes it where it is
    # released and deletes it where it is not.
    class SolrWriter
      # @param [String] druid the id used for deletions
      # @param [Hash] index_doc the solr document used for indexing
      # @param [Hash] targets target name => flag; a truthy flag means "index", a
      #   falsy one "delete". Note that only nil and false are falsy in Ruby, so
      #   the String "false" counts as "index".
      # @param [Hash] solr_targets_configs target name => configuration handed to RSolr.connect
      def process(druid, index_doc, targets, solr_targets_configs)
        @solr_targets_configs = solr_targets_configs
        index_targets, delete_targets = targets.keys.partition { |target| targets[target] }

        # get targets with true
        solr_index_client(index_doc, index_targets)
        # get targets with false
        solr_delete_client(druid, delete_targets)
      end

      # Deletes the druid from every registered target.
      # @param [String] druid the id to delete
      # @param [Hash] solr_targets_configs target name => configuration handed to RSolr.connect
      def solr_delete_from_all(druid, solr_targets_configs)
        @solr_targets_configs = solr_targets_configs
        solr_delete_client(druid, @solr_targets_configs.keys)
      end

      # Adds the document to each target; targets without a configuration are skipped.
      # @param [Hash] index_doc the solr document
      # @param [Array] targets target names
      def solr_index_client(index_doc, targets)
        targets.each do |solr_target|
          solr_connector = get_connector_for_target(solr_target)
          SolrClient.add(index_doc, solr_connector) unless solr_connector.nil?
        end
      end

      # Deletes the druid from each target; targets without a configuration are skipped.
      # @param [String] druid the id to delete
      # @param [Array] targets target names
      def solr_delete_client(druid, targets)
        targets.each do |solr_target|
          solr_connector = get_connector_for_target(solr_target)
          SolrClient.delete({ :id => druid }, solr_connector) unless solr_connector.nil?
        end
      end

      # @param solr_target the target name to look up in the configs given to
      #   #process / #solr_delete_from_all
      # @return the result of RSolr.connect for that target's configuration, or
      #   nil when the target is not configured
      def get_connector_for_target(solr_target)
        return nil unless @solr_targets_configs.key?(solr_target)

        RSolr.connect(@solr_targets_configs[solr_target])
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,15 +1,127 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: discovery-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.1'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed AlSum
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
12
|
-
dependencies:
|
11
|
+
date: 2015-03-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: stanford-mods
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: retries
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rsolr
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: webmock
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: equivalent-xml
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: vcr
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
13
125
|
description: This library manages the core operations for the discovery indexing such
|
14
126
|
as reading PURL xml, mapping to the solr document, and writing to solr core.
|
15
127
|
email: aalsum@stanford.edu
|
@@ -18,7 +130,20 @@ extensions: []
|
|
18
130
|
extra_rdoc_files: []
|
19
131
|
files:
|
20
132
|
- lib/discovery-indexer.rb
|
133
|
+
- lib/errors.rb
|
134
|
+
- lib/mapper/general_mapper.rb
|
135
|
+
- lib/mapper/index_mapper.rb
|
136
|
+
- lib/reader/modsxml.rb
|
137
|
+
- lib/reader/modsxml_reader.rb
|
138
|
+
- lib/reader/purlxml.rb
|
139
|
+
- lib/reader/purlxml_model.rb
|
140
|
+
- lib/reader/purlxml_parser.rb
|
141
|
+
- lib/reader/purlxml_parser_strict.rb
|
142
|
+
- lib/reader/purlxml_reader.rb
|
143
|
+
- lib/utilities/extract_sub_targets.rb
|
21
144
|
- lib/version.rb
|
145
|
+
- lib/writer/solr_client.rb
|
146
|
+
- lib/writer/solr_writer.rb
|
22
147
|
homepage:
|
23
148
|
licenses:
|
24
149
|
- Stanford University
|
@@ -29,12 +154,12 @@ require_paths:
|
|
29
154
|
- lib
|
30
155
|
required_ruby_version: !ruby/object:Gem::Requirement
|
31
156
|
requirements:
|
32
|
-
- -
|
157
|
+
- - ">="
|
33
158
|
- !ruby/object:Gem::Version
|
34
159
|
version: '0'
|
35
160
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
161
|
requirements:
|
37
|
-
- -
|
162
|
+
- - ">="
|
38
163
|
- !ruby/object:Gem::Version
|
39
164
|
version: '0'
|
40
165
|
requirements: []
|