discovery-indexer 0.0.1 → 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/discovery-indexer.rb +22 -0
- data/lib/errors.rb +13 -0
- data/lib/mapper/general_mapper.rb +18 -0
- data/lib/mapper/index_mapper.rb +179 -0
- data/lib/reader/modsxml.rb +44 -0
- data/lib/reader/modsxml_reader.rb +23 -0
- data/lib/reader/purlxml.rb +43 -0
- data/lib/reader/purlxml_model.rb +29 -0
- data/lib/reader/purlxml_parser.rb +13 -0
- data/lib/reader/purlxml_parser_strict.rb +210 -0
- data/lib/reader/purlxml_reader.rb +23 -0
- data/lib/utilities/extract_sub_targets.rb +13 -0
- data/lib/version.rb +1 -1
- data/lib/writer/solr_client.rb +51 -0
- data/lib/writer/solr_writer.rb +61 -0
- metadata +130 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 073a6699afc6ef96ee080e00be2dfe3180bfb734
|
4
|
+
data.tar.gz: e4361f9aad38081a92598e53fb0f55e564807812
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29e0c0dd830b66cc7cfe61bd4da7a42d3ee6fbb55721bf01ecf1f4b1679585d7aae9cc7c6f2a35d77b80b0017b45babea77516b9a4373191b7c8899b0a9a371c
|
7
|
+
data.tar.gz: 09c61dd4fa8501865bc4081dcb584224de912921eae74bb34121d6aafd05f4a762f45cdaffdf2c0a0282d318bc501629dcb0f1228c48d1fcba9ca769d4ab07db
|
data/lib/discovery-indexer.rb
CHANGED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'reader/purlxml'
|
2
|
+
require 'reader/purlxml_reader'
|
3
|
+
require 'reader/purlxml_parser'
|
4
|
+
require 'reader/purlxml_parser_strict'
|
5
|
+
require 'reader/purlxml_model'
|
6
|
+
|
7
|
+
require 'reader/modsxml'
|
8
|
+
require 'reader/modsxml_reader'
|
9
|
+
|
10
|
+
require 'mapper/general_mapper'
|
11
|
+
require 'mapper/index_mapper'
|
12
|
+
|
13
|
+
require 'writer/solr_client'
|
14
|
+
require 'writer/solr_writer'
|
15
|
+
|
16
|
+
#require 'utilities/extract_sub_targets'
|
17
|
+
|
18
|
+
require 'errors'
|
19
|
+
|
20
|
+
module DiscoveryIndexer
|
21
|
+
PURL_DEFAULT = 'http://purl-test.stanford.edu'
|
22
|
+
end
|
data/lib/errors.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
module DiscoveryIndexer
|
2
|
+
module Errors
|
3
|
+
MissingPurlPage = Class.new(StandardError)
|
4
|
+
MissingMods = Class.new(StandardError)
|
5
|
+
MissingPublicXml = Class.new(StandardError)
|
6
|
+
MissingContentMetadata = Class.new(StandardError)
|
7
|
+
MissingIdentityMetadata = Class.new(StandardError)
|
8
|
+
MissingRightsMetadata = Class.new(StandardError)
|
9
|
+
MissingRDF = Class.new(StandardError)
|
10
|
+
MissingDC = Class.new(StandardError)
|
11
|
+
MissingModsPage = Class.new(StandardError)
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module DiscoveryIndexer
|
2
|
+
module Mapper
|
3
|
+
class GeneralMapper
|
4
|
+
|
5
|
+
def initialize(druid, modsxml, purlxml, collection_names={})
|
6
|
+
@druid = druid
|
7
|
+
@modsxml = modsxml
|
8
|
+
@purlxml = purlxml
|
9
|
+
@collection_names = collection_names
|
10
|
+
end
|
11
|
+
|
12
|
+
def map()
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module DiscoveryIndexer
|
2
|
+
module Mapper
|
3
|
+
|
4
|
+
# This class is responsible for creating the solr_doc hash based on the input
|
5
|
+
# of druid_id, modsxml, purlxml, and optional hash of collection_names
|
6
|
+
class IndexMapper < GeneralMapper
|
7
|
+
|
8
|
+
# Initializes an instance from IndexMapper
|
9
|
+
# @param [String] druid e.g. ab123cd4567
|
10
|
+
# @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
|
11
|
+
# @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
|
12
|
+
# @param [Hash] collection_names represents a hash of collection_druid and
|
13
|
+
# collection_name !{"aa111aa1111"=>"First Collection", "bb123bb1234"=>"Second Collection"}
|
14
|
+
def initialize(druid, modsxml, purlxml, collection_names={})
|
15
|
+
super druid, modsxml, purlxml, collection_names
|
16
|
+
end
|
17
|
+
|
18
|
+
# Create a Hash representing a Solr doc, with all MODS related fields populated.
|
19
|
+
# @return [Hash] Hash representing the Solr document
|
20
|
+
def map()
|
21
|
+
solr_doc = {}
|
22
|
+
solr_doc[:id] = @druid
|
23
|
+
solr_doc.update mods_to_title_fields
|
24
|
+
solr_doc.update mods_to_author_fields
|
25
|
+
solr_doc.update mods_to_subject_search_fields
|
26
|
+
solr_doc.update mods_to_publication_fields
|
27
|
+
solr_doc.update mods_to_pub_date
|
28
|
+
solr_doc.update mods_to_others
|
29
|
+
|
30
|
+
solr_doc[:all_search] = @modsxml.text.gsub(/\s+/, ' ')
|
31
|
+
return solr_doc
|
32
|
+
end
|
33
|
+
|
34
|
+
# @return [Hash] Hash representing the title fields
|
35
|
+
def mods_to_title_fields
|
36
|
+
# title fields
|
37
|
+
doc_hash = {
|
38
|
+
:title_245a_search => @modsxml.sw_short_title,
|
39
|
+
:title_245_search => @modsxml.sw_full_title,
|
40
|
+
:title_variant_search => @modsxml.sw_addl_titles,
|
41
|
+
:title_sort => @modsxml.sw_sort_title,
|
42
|
+
:title_245a_display => @modsxml.sw_short_title,
|
43
|
+
:title_display => @modsxml.sw_title_display,
|
44
|
+
:title_full_display => @modsxml.sw_full_title,
|
45
|
+
}
|
46
|
+
doc_hash
|
47
|
+
end
|
48
|
+
|
49
|
+
# @return [Hash] Hash representing the author fields
|
50
|
+
def mods_to_author_fields
|
51
|
+
doc_hash = {
|
52
|
+
# author fields
|
53
|
+
:author_1xx_search => @modsxml.sw_main_author,
|
54
|
+
:author_7xx_search => @modsxml.sw_addl_authors,
|
55
|
+
:author_person_facet => @modsxml.sw_person_authors,
|
56
|
+
:author_other_facet => @modsxml.sw_impersonal_authors,
|
57
|
+
:author_sort => @modsxml.sw_sort_author[1..-1],
|
58
|
+
:author_corp_display => @modsxml.sw_corporate_authors,
|
59
|
+
:author_meeting_display => @modsxml.sw_meeting_authors,
|
60
|
+
:author_person_display => @modsxml.sw_person_authors,
|
61
|
+
:author_person_full_display => @modsxml.sw_person_authors,
|
62
|
+
}
|
63
|
+
doc_hash
|
64
|
+
end
|
65
|
+
|
66
|
+
# @return [Hash] Hash representing the search fields
|
67
|
+
def mods_to_subject_search_fields
|
68
|
+
doc_hash = {
|
69
|
+
# subject search fields
|
70
|
+
:topic_search => @modsxml.topic_search,
|
71
|
+
:geographic_search => @modsxml.geographic_search,
|
72
|
+
:subject_other_search => @modsxml.subject_other_search,
|
73
|
+
:subject_other_subvy_search => @modsxml.subject_other_subvy_search,
|
74
|
+
:subject_all_search => @modsxml.subject_all_search,
|
75
|
+
:topic_facet => @modsxml.topic_facet,
|
76
|
+
:geographic_facet => @modsxml.geographic_facet,
|
77
|
+
:era_facet => @modsxml.era_facet,
|
78
|
+
}
|
79
|
+
end
|
80
|
+
|
81
|
+
# @return [Hash] Hash representing the publication fields
|
82
|
+
def mods_to_publication_fields
|
83
|
+
doc_hash = {
|
84
|
+
# publication fields
|
85
|
+
:pub_search => @modsxml.place,
|
86
|
+
:pub_date_sort => @modsxml.pub_date_sort,
|
87
|
+
:imprint_display => @modsxml.pub_date_display,
|
88
|
+
:pub_date => @modsxml.pub_date_facet,
|
89
|
+
:pub_date_display => @modsxml.pub_date_display, # pub_date_display may be deprecated
|
90
|
+
}
|
91
|
+
end
|
92
|
+
|
93
|
+
# @return [Hash] Hash representing the pub date
|
94
|
+
def mods_to_pub_date
|
95
|
+
doc_hash = {}
|
96
|
+
pub_date_sort = @modsxml.pub_date_sort
|
97
|
+
if is_positive_int? pub_date_sort
|
98
|
+
doc_hash[:pub_year_tisim] = pub_date_sort # for date slider
|
99
|
+
# put the displayable year in the correct field, :creation_year_isi for example
|
100
|
+
doc_hash[date_type_sym] = @modsxml.pub_date_sort if date_type_sym
|
101
|
+
end
|
102
|
+
return doc_hash
|
103
|
+
end
|
104
|
+
|
105
|
+
# @return [Hash] Hash representing some fields
|
106
|
+
def mods_to_others
|
107
|
+
doc_hash = {
|
108
|
+
:format_main_ssim => format_main_ssim,
|
109
|
+
:format => format, # for backwards compatibility
|
110
|
+
:language => @modsxml.sw_language_facet,
|
111
|
+
:physical => @modsxml.term_values([:physical_description, :extent]),
|
112
|
+
:summary_search => @modsxml.term_values(:abstract),
|
113
|
+
:toc_search => @modsxml.term_values(:tableOfContents),
|
114
|
+
:url_suppl => @modsxml.term_values([:related_item, :location, :url]),
|
115
|
+
}
|
116
|
+
return doc_hash
|
117
|
+
end
|
118
|
+
|
119
|
+
# select one or more format values from the controlled vocabulary here:
|
120
|
+
# http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index
|
121
|
+
# via stanford-mods gem
|
122
|
+
# @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
|
123
|
+
def format
|
124
|
+
vals = @modsxml.format
|
125
|
+
if vals.empty?
|
126
|
+
puts "#{@druid} has no SearchWorks format from MODS - check <typeOfResource> and other implicated MODS elements"
|
127
|
+
end
|
128
|
+
vals
|
129
|
+
end
|
130
|
+
|
131
|
+
# call stanford-mods format_main to get results
|
132
|
+
# @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
|
133
|
+
def format_main_ssim
|
134
|
+
vals = @modsxml.format_main
|
135
|
+
if vals.empty?
|
136
|
+
puts "#{@druid} has no SearchWorks Resource Type from MODS - check <typeOfResource> and other implicated MODS elements"
|
137
|
+
end
|
138
|
+
vals
|
139
|
+
end
|
140
|
+
|
141
|
+
# call stanford-mods sw_genre to get results
|
142
|
+
# @return [Array<String>] value(s)
|
143
|
+
def genre_ssim
|
144
|
+
@modsxml.sw_genre
|
145
|
+
end
|
146
|
+
|
147
|
+
protected
|
148
|
+
|
149
|
+
# @return true if the string parses into an int, and if so, the int is >= 0
|
150
|
+
# Tells whether +str+ is the textual form of a non-negative base-10 integer.
#
# Strict parsing is used on purpose: +to_i+ turns nil, "" and any
# non-numeric text into 0, which would make a missing date look like year 0.
# @param [String, Integer, nil] str the candidate value
# @return [Boolean] true only when the value parses as an integer >= 0
def is_positive_int?(str)
  Integer(str.to_s, 10) >= 0
rescue ArgumentError, TypeError
  # Not parseable as an integer at all.
  false
end
|
161
|
+
|
162
|
+
# determines particular flavor of displayable publication year field
|
163
|
+
# @return Solr field name as a symbol
|
164
|
+
def date_type_sym
|
165
|
+
vals = @modsxml.term_values([:origin_info,:dateIssued])
|
166
|
+
if vals and vals.length > 0
|
167
|
+
return :publication_year_isi
|
168
|
+
end
|
169
|
+
vals = @modsxml.term_values([:origin_info,:dateCreated])
|
170
|
+
if vals and vals.length > 0
|
171
|
+
return :creation_year_isi
|
172
|
+
end
|
173
|
+
nil
|
174
|
+
end
|
175
|
+
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'stanford-mods'
|
2
|
+
module DiscoveryIndexer
|
3
|
+
module InputXml
|
4
|
+
|
5
|
+
# This class is the main class to access and parse the mods xml
|
6
|
+
# as retrieved from PURL server
|
7
|
+
# @example to run the code
|
8
|
+
# druid = "aa111aa1111"
|
9
|
+
# p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
|
10
|
+
# model = p.load()
|
11
|
+
#
|
12
|
+
#
|
13
|
+
class Modsxml
|
14
|
+
# initializes a new object
|
15
|
+
# @param druid [String] the druid object in the format "aa111aa1111"
|
16
|
+
def initialize(druid)
|
17
|
+
@druid = druid
|
18
|
+
@modsxml_ng_doc = nil
|
19
|
+
end
|
20
|
+
|
21
|
+
# loads the mods xml to stanford mods model for the fedora object defined in the druid,
|
22
|
+
# it reads the mods xml once from PURL server, and repeats the parsing with each call
|
23
|
+
# @return [Stanford::Mods::Record] represents the mods xml
|
24
|
+
def load()
|
25
|
+
if @modsxml_ng_doc.nil? then
|
26
|
+
@modsxml_ng_doc = ModsxmlReader.read(@druid)
|
27
|
+
end
|
28
|
+
|
29
|
+
modsxml_model = Stanford::Mods::Record.new
|
30
|
+
modsxml_model.from_nk_node(@modsxml_ng_doc)
|
31
|
+
return modsxml_model
|
32
|
+
end
|
33
|
+
|
34
|
+
# loads the mods xml to stanford mods model for the fedora object defined in the druid,
|
35
|
+
# it reads the mods xml from PURL server with every call
|
36
|
+
# @return [Stanford::Mods::Record] represents the mods xml
|
37
|
+
def reload()
|
38
|
+
@modsxml_ng_doc = ModsxmlReader.read(@druid)
|
39
|
+
return load()
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module DiscoveryIndexer
|
4
|
+
module InputXml
|
5
|
+
class ModsxmlReader
|
6
|
+
|
7
|
+
# reads the mods xml for the fedora object that is defined , from the purl server
|
8
|
+
# @param [String] druid e.g. ab123cd4567
|
9
|
+
# @return [Nokogiri::XML::Document] the mods xml for the fedora object
|
10
|
+
# @raise [DiscoveryIndexer::Errors::MissingModsPage] if the mods xml cannot be read from the purl server for this druid
|
11
|
+
# Fetches "<PURL_DEFAULT>/<druid>.mods" and parses it with Nokogiri.
#
# The URI is opened through URI#open (provided by open-uri) in block form,
# so the underlying IO is closed as soon as parsing finishes. Kernel#open
# is no longer patched for URLs on current Ruby versions.
# @param [String] druid e.g. ab123cd4567
# @return [Nokogiri::XML::Document] the parsed mods xml
# @raise [DiscoveryIndexer::Errors::MissingModsPage] on any StandardError
#   raised while fetching or parsing; the message is the requested URI
def self.read(druid)
  mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"

  begin
    URI.parse(mods_uri).open { |io| Nokogiri::XML(io) }
  rescue
    raise DiscoveryIndexer::Errors::MissingModsPage.new(mods_uri)
  end
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module DiscoveryIndexer
|
2
|
+
module InputXml
|
3
|
+
|
4
|
+
# This class is the main class to access and parse the purl xml
|
5
|
+
# as retrieved from PURL server
|
6
|
+
# @example to run the code
|
7
|
+
# druid = "aa111aa1111"
|
8
|
+
# p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
|
9
|
+
# model = p.load()
|
10
|
+
#
|
11
|
+
class Purlxml
|
12
|
+
|
13
|
+
# initializes a new object
|
14
|
+
# @param druid [String] the druid object in the format "aa111aa1111"
|
15
|
+
def initialize(druid)
|
16
|
+
@druid = druid
|
17
|
+
@purlxml_ng_doc = nil
|
18
|
+
end
|
19
|
+
|
20
|
+
# loads the purl xml to purlxml model for the fedora object defined in the druid,
|
21
|
+
# it reads the purl xml once from PURL server, and repeats the parsing with each call
|
22
|
+
# @return [PurlxmlModel] represents the purlxml
|
23
|
+
def load()
|
24
|
+
if @purlxml_ng_doc.nil? then
|
25
|
+
@purlxml_ng_doc = PurlxmlReader.read(@druid)
|
26
|
+
end
|
27
|
+
|
28
|
+
purlxml_parser = PurlxmlParserStrict.new(@purlxml_ng_doc)
|
29
|
+
purlxml_model = purlxml_parser.parse()
|
30
|
+
return purlxml_model
|
31
|
+
end
|
32
|
+
|
33
|
+
# loads the purl xml to purlxml model for the fedora object defined in the druid,
|
34
|
+
# it reads the purl xml from PURL server with every call
|
35
|
+
# @return [PurlxmlModel] represents the purlxml
|
36
|
+
def reload()
|
37
|
+
@purlxml_ng_doc = PurlxmlReader.read(@druid)
|
38
|
+
return load()
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module DiscoveryIndexer
|
2
|
+
module InputXml
|
3
|
+
# Plain attribute container for the values extracted from one PURL public
# XML document. It holds no behaviour of its own; every attribute is a
# read/write accessor and starts out as nil.
class PurlxmlModel
  # XML sections of the public XML
  attr_accessor :public_xml
  attr_accessor :content_metadata
  attr_accessor :identity_metadata
  attr_accessor :rights_metadata
  attr_accessor :dc
  attr_accessor :rdf

  # Values derived from those sections
  attr_accessor :release_tags_hash
  attr_accessor :dor_content_type # was declared twice; one declaration is enough
  attr_accessor :is_collection
  attr_accessor :collection_druids
  attr_accessor :file_ids
  attr_accessor :image_ids
  attr_accessor :catkey
  attr_accessor :barcode
  attr_accessor :label
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
|
29
|
+
|
@@ -0,0 +1,210 @@
|
|
1
|
+
module DiscoveryIndexer
|
2
|
+
module InputXml
|
3
|
+
class PurlxmlParserStrict < PurlxmlParser
|
4
|
+
|
5
|
+
RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
6
|
+
OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
|
7
|
+
MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
|
8
|
+
|
9
|
+
def initialize(purlxml_ng_doc)
|
10
|
+
super
|
11
|
+
end
|
12
|
+
|
13
|
+
# it parses the purlxml into a purlxml model
|
14
|
+
# @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
|
15
|
+
def parse()
|
16
|
+
purlxml_model = PurlxmlModel.new
|
17
|
+
purlxml_model.public_xml = @purlxml_ng_doc
|
18
|
+
purlxml_model.content_metadata = parse_content_metadata()
|
19
|
+
purlxml_model.identity_metadata = parse_identity_metadata()
|
20
|
+
purlxml_model.rights_metadata = parse_rights_metadata()
|
21
|
+
purlxml_model.dc = parse_dc()
|
22
|
+
purlxml_model.rdf = parse_rdf()
|
23
|
+
purlxml_model.is_collection = parse_is_collection()
|
24
|
+
purlxml_model.collection_druids = parse_collection_druids()
|
25
|
+
purlxml_model.dor_content_type = parse_dor_content_type()
|
26
|
+
purlxml_model.release_tags_hash = parse_release_tags_hash()
|
27
|
+
purlxml_model.file_ids = parse_file_ids()
|
28
|
+
purlxml_model.image_ids = parse_image_ids()
|
29
|
+
purlxml_model.catkey = parse_catkey()
|
30
|
+
purlxml_model.barcode = parse_barcode()
|
31
|
+
purlxml_model.label = parse_label()
|
32
|
+
return purlxml_model
|
33
|
+
end
|
34
|
+
|
35
|
+
# extracts the identityMetadata for this fedora object, from the purl xml
|
36
|
+
# @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
|
37
|
+
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identityMetadata
|
38
|
+
def parse_identity_metadata
|
39
|
+
begin
|
40
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
|
41
|
+
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
42
|
+
ng_doc
|
43
|
+
rescue
|
44
|
+
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def parse_rights_metadata
|
49
|
+
begin
|
50
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
|
51
|
+
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
52
|
+
ng_doc
|
53
|
+
rescue
|
54
|
+
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# extracts the dc field for this fedora object, from the purl xml
|
59
|
+
# @return [Nokogiri::XML::Document] the dc for the fedora object
|
60
|
+
# @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
|
61
|
+
def parse_dc
|
62
|
+
begin
|
63
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', {'dc' => OAI_DC_NAMESPACE}).to_xml(:encoding => 'utf-8'))
|
64
|
+
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
65
|
+
ng_doc
|
66
|
+
rescue
|
67
|
+
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# extracts the rdf field for this fedora object, from the purl xml
|
72
|
+
# @return [Nokogiri::XML::Document] the rdf for the fedora object
|
73
|
+
# @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
|
74
|
+
def parse_rdf
|
75
|
+
begin
|
76
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', {'rdf' => RDF_NAMESPACE}).to_xml)
|
77
|
+
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
78
|
+
ng_doc
|
79
|
+
rescue
|
80
|
+
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
|
85
|
+
# extracts the release tag elements for this fedora object, from the identity metadata in purl xml
|
86
|
+
# @return [Hash] the release tags for the fedora object
|
87
|
+
def parse_release_tags_hash
|
88
|
+
release_tags={}
|
89
|
+
identity_metadata = parse_identity_metadata
|
90
|
+
unless identity_metadata.nil?
|
91
|
+
release_elements = identity_metadata.xpath('//release')
|
92
|
+
release_elements.each { |n|
|
93
|
+
unless n.attr("to").nil?
|
94
|
+
release_target = n.attr("to")
|
95
|
+
|
96
|
+
|
97
|
+
#target = release_target.split(":").first
|
98
|
+
#sub_target = "default"
|
99
|
+
#if target != release_target.split(":").last then
|
100
|
+
# sub_target = release_target.split(":").last
|
101
|
+
#end
|
102
|
+
text = n.text
|
103
|
+
unless text.nil?
|
104
|
+
release_tags[release_target]= text
|
105
|
+
end
|
106
|
+
end
|
107
|
+
}
|
108
|
+
return release_tags
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
# extracts the contentMetadata for this fedora object, from the purl xml
|
113
|
+
# @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
|
114
|
+
# @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
|
115
|
+
def parse_content_metadata
|
116
|
+
# begin
|
117
|
+
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
|
118
|
+
# raise DiscoveryIndexer::Errors::MissingContentMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
|
119
|
+
ng_doc
|
120
|
+
# rescue
|
121
|
+
# raise DiscoveryIndexer::Errors::MissingContentMetadata.new(@purlxml_ng_doc.inspect)
|
122
|
+
# end
|
123
|
+
end
|
124
|
+
|
125
|
+
# @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
|
126
|
+
# Checks the identityMetadata for an <objectType> whose text is "collection".
#
# parse_identity_metadata returns a standalone document whose root element is
# <identityMetadata>, so the query is anchored at that root. A path relative
# to the document node ("./objectType") can never match, because the
# document node's only element child is the root itself.
# @return [Boolean] true when such an objectType exists, false otherwise
def parse_is_collection
  identity_metadata = parse_identity_metadata
  return false if identity_metadata.nil?

  identity_metadata.xpath('/identityMetadata/objectType').any? { |n| n.text == 'collection' }
end
|
134
|
+
|
135
|
+
# get the druids from isMemberOfCollection relationships in rels-ext from public_xml
|
136
|
+
# @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfCollection relationship with, or nil if none
|
137
|
+
def parse_collection_druids
|
138
|
+
ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
|
139
|
+
is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
|
140
|
+
# from public_xml rels-ext
|
141
|
+
druids = []
|
142
|
+
is_member_of_nodes.each { |n|
|
143
|
+
druids << n.value.split('druid:').last unless n.value.empty?
|
144
|
+
}
|
145
|
+
return nil if druids.empty?
|
146
|
+
druids
|
147
|
+
end
|
148
|
+
|
149
|
+
# the value of the type attribute for a DOR object's contentMetadata
|
150
|
+
# more info about these values is here:
|
151
|
+
# https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
|
152
|
+
# https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
|
153
|
+
# @return [String]
|
154
|
+
# Reads the type attribute of the <contentMetadata> root element.
#
# parse_content_metadata returns a standalone document, so the attribute is
# addressed through the root element. A bare "@type" evaluated against the
# document node is always empty, because a document node has no attributes.
# A notice is printed when no type can be found.
# @return [String, nil] the type text ("" when the attribute is absent), or
#   nil when there is no content metadata document
def parse_dor_content_type
  content_md = parse_content_metadata
  dct = content_md ? content_md.xpath('/contentMetadata/@type').text : nil
  puts " has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
  dct
end
|
160
|
+
|
161
|
+
# the @id attribute of resource/file elements that match the display_type, including extension
|
162
|
+
# @return [Array<String>] filenames
|
163
|
+
# Collects the id attribute of every file inside an image resource.
#
# The query is anchored at the <contentMetadata> root of the document
# returned by parse_content_metadata. The former "./resource..." form was
# relative to the document node and therefore matched nothing.
# @return [Array<String>, nil] non-empty ids in document order, or nil when
#   there is no content metadata or no id was found
def parse_image_ids
  content_md = parse_content_metadata
  return nil if content_md.nil?

  ids = content_md.xpath('/contentMetadata/resource[@type="image"]/file/@id').map(&:text).reject(&:empty?)
  ids.empty? ? nil : ids
end
|
174
|
+
|
175
|
+
# Collects the id attribute of every file in every resource.
#
# The query is anchored at the <contentMetadata> root of the document
# returned by parse_content_metadata. The former "./resource..." form was
# relative to the document node and therefore matched nothing.
# @return [Array<String>, nil] non-empty ids in document order, or nil when
#   there is no content metadata or no id was found
def parse_file_ids
  content_md = parse_content_metadata
  return nil if content_md.nil?

  ids = content_md.xpath('/contentMetadata/resource/file/@id').map(&:text).reject(&:empty?)
  ids.empty? ? nil : ids
end
|
186
|
+
|
187
|
+
def parse_catkey
|
188
|
+
catkey = nil
|
189
|
+
node = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']")
|
190
|
+
catkey = node.first.content if node && node.first
|
191
|
+
return catkey
|
192
|
+
end
|
193
|
+
|
194
|
+
def parse_barcode
|
195
|
+
barcode = nil
|
196
|
+
node = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='barcode']")
|
197
|
+
barcode = node.first.content if node && node.first
|
198
|
+
return barcode
|
199
|
+
end
|
200
|
+
|
201
|
+
def parse_label
|
202
|
+
label = nil
|
203
|
+
node = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/objectLabel")
|
204
|
+
label = node.first.content if node && node.first
|
205
|
+
return label
|
206
|
+
end
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module DiscoveryIndexer
|
4
|
+
module InputXml
|
5
|
+
class PurlxmlReader
|
6
|
+
|
7
|
+
# reads the public xml for the fedora object that is defined , from the purl server
|
8
|
+
# @param [String] druid e.g. ab123cd4567
|
9
|
+
# @return [Nokogiri::XML::Document] the public xml for the fedora object
|
10
|
+
# @raise [DiscoveryIndexer::Errors::MissingPurlPage] if the purl xml cannot be read from the purl server for this druid
|
11
|
+
# Fetches "<PURL_DEFAULT>/<druid>.xml" and parses it with Nokogiri.
#
# The URI is opened through URI#open (provided by open-uri) in block form,
# so the underlying IO is closed as soon as parsing finishes. Kernel#open
# is no longer patched for URLs on current Ruby versions.
# @param [String] druid e.g. ab123cd4567
# @return [Nokogiri::XML::Document] the parsed public xml
# @raise [DiscoveryIndexer::Errors::MissingPurlPage] on any StandardError
#   raised while fetching or parsing; the message is the requested URI
def self.read(druid)
  purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"

  begin
    URI.parse(purlxml_uri).open { |io| Nokogiri::XML(io) }
  rescue
    raise DiscoveryIndexer::Errors::MissingPurlPage.new(purlxml_uri)
  end
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/version.rb
CHANGED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'logger'
require 'retries'
|
2
|
+
require 'rsolr'
|
3
|
+
|
4
|
+
module DiscoveryIndexer
|
5
|
+
module Writer
|
6
|
+
class SolrClient
|
7
|
+
|
8
|
+
# Add the document to solr, retry if an error occurs.
|
9
|
+
# See https://github.com/ooyala/retries for docs on with_retries.
|
10
|
+
# @param [Hash] solr_doc a Hash representation of the solr document
|
11
|
+
# @param [RSolr::Client] solr_connector is an open connection with the solr core
|
12
|
+
# @param [Integer] max_retries the maximum number of tries before fail
|
13
|
+
# Indexes +solr_doc+ by delegating to process with the delete flag off.
# The flag is passed as a plain positional value; writing "is_delete=false"
# inside the call only created an unused local variable.
def self.add(solr_doc, solr_connector, max_retries = 10)
  process(solr_doc, solr_connector, max_retries, false)
end
|
16
|
+
|
17
|
+
# Delete the document from solr, retry if an error occurs.
|
18
|
+
# See https://github.com/ooyala/retries for docs on with_retries.
|
19
|
+
# @param [Hash] solr_doc that has only the id !{:id=>"ab123cd4567"}
|
20
|
+
# @param [RSolr::Client] solr_connector is an open connection with the solr core
|
21
|
+
# @param [Integer] max_retries the maximum number of tries before fail
|
22
|
+
# Removes the document identified by solr_doc[:id] by delegating to process
# with the delete flag on. The flag is passed as a plain positional value;
# writing "is_delete=true" inside the call only created an unused local.
def self.delete(solr_doc, solr_connector, max_retries = 10)
  process(solr_doc, solr_connector, max_retries, true)
end
|
25
|
+
|
26
|
+
# Adds or deletes one document on the given connection, retrying on error,
# then commits.
#
# Progress is reported through a Logger on STDOUT (requires 'logger'); the
# former bare "puts id" debugging line is dropped because every attempt is
# already logged with the id. The commit happens once, after the retried
# block has succeeded, and is not itself retried.
# @param [Hash] solr_doc the document; only solr_doc[:id] is used for deletes
# @param [RSolr::Client] solr_connector connection used for add/delete/commit
# @param [Integer] max_retries passed to with_retries as :max_tries
# @param [Boolean] is_delete true to delete by id, false to add the document
def self.process(solr_doc, solr_connector, max_retries, is_delete=false)
  logger = Logger.new STDOUT
  id = solr_doc[:id]

  # Invoked by with_retries after each failed attempt.
  handler = Proc.new do |exception, attempt_number, _total_delay|
    logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
  end

  with_retries(:max_tries => max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
    logger.debug "Attempt #{attempt} for #{id}"

    if is_delete
      solr_connector.delete_by_id(id)
      logger.info "Successfully deleted #{id} on attempt #{attempt}"
    else
      solr_connector.add(solr_doc)
      logger.info "Successfully indexed #{id} on attempt #{attempt}"
    end
  end
  solr_connector.commit
end
|
48
|
+
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'retries'
|
2
|
+
require 'rsolr'
|
3
|
+
|
4
|
+
module DiscoveryIndexer
|
5
|
+
module Writer
|
6
|
+
class SolrWriter
|
7
|
+
|
8
|
+
# Sends +index_doc+ to every target flagged truthy in +targets+ and deletes
# +druid+ from every target flagged false or nil.
#
# The split uses Enumerable#partition rather than Array#append, which is not
# available on Ruby versions before 2.5. The debugging "puts targets" is
# removed.
# @param [String] druid id of the document to delete from the "false" targets
# @param [Hash] index_doc solr document to add to the "true" targets
# @param [Hash] targets target name => truthy (index) / falsy (delete)
# @param [Hash] solr_targets_configs target name => connection config; stored
#   in @solr_targets_configs for get_connector_for_target
def process(druid, index_doc, targets, solr_targets_configs)
  @solr_targets_configs = solr_targets_configs
  index_targets, delete_targets = targets.keys.partition { |target| targets[target] }

  solr_index_client(index_doc, index_targets)
  solr_delete_client(druid, delete_targets)
end
|
26
|
+
|
27
|
+
def solr_delete_from_all(druid, solr_targets_configs)
|
28
|
+
# Get a list of all registered targets
|
29
|
+
@solr_targets_configs=solr_targets_configs
|
30
|
+
targets = @solr_targets_configs.keys()
|
31
|
+
solr_delete_client(druid, targets)
|
32
|
+
end
|
33
|
+
|
34
|
+
def solr_index_client(index_doc, targets)
|
35
|
+
targets.each do |solr_target|
|
36
|
+
solr_connector = get_connector_for_target(solr_target)
|
37
|
+
SolrClient.add(index_doc, solr_connector)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def solr_delete_client(druid, targets)
|
42
|
+
targets.each do |solr_target|
|
43
|
+
solr_connector = get_connector_for_target(solr_target)
|
44
|
+
SolrClient.delete({:id=>druid}, solr_connector)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Opens an RSolr connection for a configured target.
#
# The former debugging output is removed: it printed the target name and the
# complete @solr_targets_configs hash to stdout on every call.
# @param [Object] solr_target key into @solr_targets_configs
# @return [RSolr::Client, nil] a connection built from the target's config,
#   or nil when the target is not configured
def get_connector_for_target(solr_target)
  return nil unless @solr_targets_configs.key?(solr_target)

  RSolr.connect(@solr_targets_configs[solr_target])
end
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
metadata
CHANGED
@@ -1,15 +1,127 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: discovery-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.1'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed AlSum
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
12
|
-
dependencies:
|
11
|
+
date: 2015-03-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: stanford-mods
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: retries
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rsolr
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: webmock
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: equivalent-xml
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: vcr
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
13
125
|
description: This library manages the core operations for the discovery indexing such
|
14
126
|
as reading PURL xml, mapping to the solr document, and writing to solr core.
|
15
127
|
email: aalsum@stanford.edu
|
@@ -18,7 +130,20 @@ extensions: []
|
|
18
130
|
extra_rdoc_files: []
|
19
131
|
files:
|
20
132
|
- lib/discovery-indexer.rb
|
133
|
+
- lib/errors.rb
|
134
|
+
- lib/mapper/general_mapper.rb
|
135
|
+
- lib/mapper/index_mapper.rb
|
136
|
+
- lib/reader/modsxml.rb
|
137
|
+
- lib/reader/modsxml_reader.rb
|
138
|
+
- lib/reader/purlxml.rb
|
139
|
+
- lib/reader/purlxml_model.rb
|
140
|
+
- lib/reader/purlxml_parser.rb
|
141
|
+
- lib/reader/purlxml_parser_strict.rb
|
142
|
+
- lib/reader/purlxml_reader.rb
|
143
|
+
- lib/utilities/extract_sub_targets.rb
|
21
144
|
- lib/version.rb
|
145
|
+
- lib/writer/solr_client.rb
|
146
|
+
- lib/writer/solr_writer.rb
|
22
147
|
homepage:
|
23
148
|
licenses:
|
24
149
|
- Stanford University
|
@@ -29,12 +154,12 @@ require_paths:
|
|
29
154
|
- lib
|
30
155
|
required_ruby_version: !ruby/object:Gem::Requirement
|
31
156
|
requirements:
|
32
|
-
- -
|
157
|
+
- - ">="
|
33
158
|
- !ruby/object:Gem::Version
|
34
159
|
version: '0'
|
35
160
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
161
|
requirements:
|
37
|
-
- -
|
162
|
+
- - ">="
|
38
163
|
- !ruby/object:Gem::Version
|
39
164
|
version: '0'
|
40
165
|
requirements: []
|