discovery-indexer 0.0.1 → 0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c02b6da0be42381bffd949f1afb336909bfaa768
4
- data.tar.gz: 1ba06814d93eef836acff2399624d6d55a06d2a5
3
+ metadata.gz: 073a6699afc6ef96ee080e00be2dfe3180bfb734
4
+ data.tar.gz: e4361f9aad38081a92598e53fb0f55e564807812
5
5
  SHA512:
6
- metadata.gz: 7164b01c893146b50a41429aaacfa50f5592c86b322b38e9a2fbd779ab4152bfdc0e296457bcce4ad6393bdf22b1fcf21358cfbce3cc395ef661144e3716ff12
7
- data.tar.gz: ae30908c3aa0b56c2cdcc5646452db595bd9755d86c1de6a5794a950e654c51cfae06966088a18861f544e46ce4543efd15a055e108363bdb56a2a378f0b2089
6
+ metadata.gz: 29e0c0dd830b66cc7cfe61bd4da7a42d3ee6fbb55721bf01ecf1f4b1679585d7aae9cc7c6f2a35d77b80b0017b45babea77516b9a4373191b7c8899b0a9a371c
7
+ data.tar.gz: 09c61dd4fa8501865bc4081dcb584224de912921eae74bb34121d6aafd05f4a762f45cdaffdf2c0a0282d318bc501629dcb0f1228c48d1fcba9ca769d4ab07db
@@ -0,0 +1,22 @@
1
+ require 'reader/purlxml'
2
+ require 'reader/purlxml_reader'
3
+ require 'reader/purlxml_parser'
4
+ require 'reader/purlxml_parser_strict'
5
+ require 'reader/purlxml_model'
6
+
7
+ require 'reader/modsxml'
8
+ require 'reader/modsxml_reader'
9
+
10
+ require 'mapper/general_mapper'
11
+ require 'mapper/index_mapper'
12
+
13
+ require 'writer/solr_client'
14
+ require 'writer/solr_writer'
15
+
16
+ #require 'utilities/extract_sub_targets'
17
+
18
+ require 'errors'
19
+
20
module DiscoveryIndexer
  # Base URL of the PURL server; the readers append "/<druid>.xml" or
  # "/<druid>.mods" to it. Note that this is the purl-test host.
  # Frozen so that no caller can mutate the shared string in place.
  PURL_DEFAULT = 'http://purl-test.stanford.edu'.freeze
end
data/lib/errors.rb ADDED
@@ -0,0 +1,13 @@
1
module DiscoveryIndexer
  # Exception classes used to signal that a PURL/MODS page, or one of the
  # sections expected inside the public xml, is missing.
  # Every class here descends directly from StandardError.
  module Errors
    class MissingPurlPage < StandardError; end
    class MissingMods < StandardError; end
    class MissingPublicXml < StandardError; end
    class MissingContentMetadata < StandardError; end
    class MissingIdentityMetadata < StandardError; end
    class MissingRightsMetadata < StandardError; end
    class MissingRDF < StandardError; end
    class MissingDC < StandardError; end
    class MissingModsPage < StandardError; end
  end
end
@@ -0,0 +1,18 @@
1
module DiscoveryIndexer
  module Mapper
    # Base class for mappers. It only stores its inputs; #map is an empty
    # hook that returns nil and is meant to be overridden by subclasses.
    class GeneralMapper
      # @param druid the object identifier
      # @param modsxml the MODS representation of the object
      # @param purlxml the purl xml representation of the object
      # @param [Hash] collection_names optional, defaults to an empty hash
      def initialize(druid, modsxml, purlxml, collection_names = {})
        @druid, @modsxml = druid, modsxml
        @purlxml, @collection_names = purlxml, collection_names
      end

      # No-op in the base class.
      # @return [nil]
      def map; end
    end
  end
end
18
+
@@ -0,0 +1,179 @@
1
module DiscoveryIndexer
  module Mapper
    # This class is responsible for creating the solr_doc hash based on the input
    # of druid_id, modsxml, purlxml, and optional hash of collection_names
    class IndexMapper < GeneralMapper
      # Initializes an instance of IndexMapper
      # @param [String] druid e.g. ab123cd4567
      # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
      # @param [DiscoveryIndexer::InputXml::PurlxmlModel] purlxml represents the purlxml model
      # @param [Hash] collection_names represents a hash of collection_druid and
      #  collection_name !{"aa111aa1111"=>"First Collection", "bb123bb1234"=>"Second Collection"}
      def initialize(druid, modsxml, purlxml, collection_names = {})
        super(druid, modsxml, purlxml, collection_names)
      end

      # Create a Hash representing a Solr doc, with all MODS related fields populated.
      # @return [Hash] Hash representing the Solr document
      def map
        solr_doc = { :id => @druid }
        solr_doc.update mods_to_title_fields
        solr_doc.update mods_to_author_fields
        solr_doc.update mods_to_subject_search_fields
        solr_doc.update mods_to_publication_fields
        solr_doc.update mods_to_pub_date
        solr_doc.update mods_to_others

        # full text of the MODS record with every whitespace run collapsed to one space
        solr_doc[:all_search] = @modsxml.text.gsub(/\s+/, ' ')
        solr_doc
      end

      # @return [Hash] Hash representing the title fields
      def mods_to_title_fields
        {
          :title_245a_search => @modsxml.sw_short_title,
          :title_245_search => @modsxml.sw_full_title,
          :title_variant_search => @modsxml.sw_addl_titles,
          :title_sort => @modsxml.sw_sort_title,
          :title_245a_display => @modsxml.sw_short_title,
          :title_display => @modsxml.sw_title_display,
          :title_full_display => @modsxml.sw_full_title
        }
      end

      # @return [Hash] Hash representing the author fields
      def mods_to_author_fields
        {
          :author_1xx_search => @modsxml.sw_main_author,
          :author_7xx_search => @modsxml.sw_addl_authors,
          :author_person_facet => @modsxml.sw_person_authors,
          :author_other_facet => @modsxml.sw_impersonal_authors,
          # the first character of the sort author value is dropped
          :author_sort => @modsxml.sw_sort_author[1..-1],
          :author_corp_display => @modsxml.sw_corporate_authors,
          :author_meeting_display => @modsxml.sw_meeting_authors,
          :author_person_display => @modsxml.sw_person_authors,
          :author_person_full_display => @modsxml.sw_person_authors
        }
      end

      # @return [Hash] Hash representing the subject search and facet fields
      def mods_to_subject_search_fields
        {
          :topic_search => @modsxml.topic_search,
          :geographic_search => @modsxml.geographic_search,
          :subject_other_search => @modsxml.subject_other_search,
          :subject_other_subvy_search => @modsxml.subject_other_subvy_search,
          :subject_all_search => @modsxml.subject_all_search,
          :topic_facet => @modsxml.topic_facet,
          :geographic_facet => @modsxml.geographic_facet,
          :era_facet => @modsxml.era_facet
        }
      end

      # @return [Hash] Hash representing the publication fields
      def mods_to_publication_fields
        {
          :pub_search => @modsxml.place,
          :pub_date_sort => @modsxml.pub_date_sort,
          :imprint_display => @modsxml.pub_date_display,
          :pub_date => @modsxml.pub_date_facet,
          :pub_date_display => @modsxml.pub_date_display # pub_date_display may be deprecated
        }
      end

      # Adds the year fields, but only when the sortable publication date is a
      # non-negative integer.
      # @return [Hash] Hash representing the pub date; empty when there is no usable year
      def mods_to_pub_date
        doc_hash = {}
        pub_date_sort = @modsxml.pub_date_sort
        return doc_hash unless is_positive_int?(pub_date_sort)

        doc_hash[:pub_year_tisim] = pub_date_sort # for date slider
        # put the displayable year in the correct field, :creation_year_isi for example
        year_field = date_type_sym # looked up once; each lookup queries the MODS record
        doc_hash[year_field] = pub_date_sort if year_field
        doc_hash
      end

      # @return [Hash] Hash representing format, language, physical description,
      #  abstract, table of contents and supplementary url fields
      def mods_to_others
        {
          :format_main_ssim => format_main_ssim,
          :format => format, # for backwards compatibility
          :language => @modsxml.sw_language_facet,
          :physical => @modsxml.term_values([:physical_description, :extent]),
          :summary_search => @modsxml.term_values(:abstract),
          :toc_search => @modsxml.term_values(:tableOfContents),
          :url_suppl => @modsxml.term_values([:related_item, :location, :url])
        }
      end

      # select one or more format values from the controlled vocabulary here:
      #   http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index
      # via stanford-mods gem. Prints a notice to stdout when no value is found.
      # @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
      def format
        vals = @modsxml.format
        puts "#{@druid} has no SearchWorks format from MODS - check <typeOfResource> and other implicated MODS elements" if vals.empty?
        vals
      end

      # call stanford-mods format_main to get results. Prints a notice to
      # stdout when no value is found.
      # @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
      def format_main_ssim
        vals = @modsxml.format_main
        puts "#{@druid} has no SearchWorks Resource Type from MODS - check <typeOfResource> and other implicated MODS elements" if vals.empty?
        vals
      end

      # call stanford-mods sw_genre to get results
      # @return [Array<String>] value(s)
      def genre_ssim
        @modsxml.sw_genre
      end

      protected

      # Strict check: to_i is deliberately not used because it never raises and
      # turns nil and non-numeric text into 0, which would pass as a valid year.
      # @param [Object] str candidate value, normally a String (may be nil)
      # @return [Boolean] true only if the value, ignoring surrounding
      #  whitespace, consists solely of decimal digits (i.e. an int >= 0)
      def is_positive_int?(str)
        !(str.to_s.strip =~ /\A\d+\z/).nil?
      end

      # determines particular flavor of displayable publication year field
      # @return [Symbol, nil] Solr field name as a symbol; dateIssued wins over
      #  dateCreated, nil when the record has neither
      def date_type_sym
        issued = @modsxml.term_values([:origin_info, :dateIssued])
        return :publication_year_isi if issued && !issued.empty?

        created = @modsxml.term_values([:origin_info, :dateCreated])
        return :creation_year_isi if created && !created.empty?

        nil
      end
    end
  end
end
179
+
@@ -0,0 +1,44 @@
1
+ require 'stanford-mods'
2
module DiscoveryIndexer
  module InputXml
    # Main entry point for accessing and parsing the mods xml as retrieved
    # from the PURL server.
    # @example to run the code
    #  druid = "aa111aa1111"
    #  p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
    #  model = p.load()
    class Modsxml
      # Creates a new object; nothing is fetched until #load or #reload is called.
      # @param druid [String] the druid object in the format "aa111aa1111"
      def initialize(druid)
        @druid = druid
        @modsxml_ng_doc = nil
      end

      # Loads the mods xml into a stanford mods model for the fedora object
      # defined by the druid. The xml is read from the PURL server only on the
      # first call; every call builds a fresh record from the cached document.
      # @return [Stanford::Mods::Record] represents the mods xml
      def load
        @modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?

        Stanford::Mods::Record.new.tap do |record|
          record.from_nk_node(@modsxml_ng_doc)
        end
      end

      # Same as #load, but always reads the mods xml from the PURL server
      # again first, replacing the cached document.
      # @return [Stanford::Mods::Record] represents the mods xml
      def reload
        @modsxml_ng_doc = ModsxmlReader.read(@druid)
        load
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
module DiscoveryIndexer
  module InputXml
    # Fetches the mods xml of an object from the PURL server.
    class ModsxmlReader
      # reads the mods xml for the fedora object that is defined, from the purl server
      # @param [String] druid e.g. ab123cd4567
      # @return [Nokogiri::XML::Document] the mods xml for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingModsPage] if the mods page
      #  cannot be fetched or parsed; the message is the URI that was tried
      def self.read(druid)
        mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"

        begin
          # OpenURI.open_uri instead of Kernel#open: Kernel#open no longer
          # opens URLs on Ruby 3, which made every read look like a missing
          # page. The block form also closes the stream once it is parsed.
          OpenURI.open_uri(mods_uri) { |io| Nokogiri::XML(io) }
        rescue StandardError
          raise DiscoveryIndexer::Errors::MissingModsPage, mods_uri
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module DiscoveryIndexer
  module InputXml
    # Main entry point for accessing and parsing the purl xml as retrieved
    # from the PURL server.
    # @example to run the code
    #  druid = "aa111aa1111"
    #  p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
    #  model = p.load()
    class Purlxml
      # Creates a new object; nothing is fetched until #load or #reload is called.
      # @param druid [String] the druid object in the format "aa111aa1111"
      def initialize(druid)
        @druid = druid
        @purlxml_ng_doc = nil
      end

      # Loads the purl xml into a purlxml model for the fedora object defined
      # by the druid. The xml is read from the PURL server only on the first
      # call; every call parses the cached document again.
      # @return [PurlxmlModel] represents the purlxml
      def load
        @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?

        PurlxmlParserStrict.new(@purlxml_ng_doc).parse
      end

      # Same as #load, but always reads the purl xml from the PURL server
      # again first, replacing the cached document.
      # @return [PurlxmlModel] represents the purlxml
      def reload
        @purlxml_ng_doc = PurlxmlReader.read(@druid)
        load
      end
    end
  end
end
43
+
@@ -0,0 +1,29 @@
1
module DiscoveryIndexer
  module InputXml
    # Plain value object holding the pieces extracted from a purl xml
    # document. It has no behavior of its own; every attribute is readable
    # and writable and starts out as nil.
    class PurlxmlModel
      # whole document and its top-level sections
      attr_accessor :public_xml, :content_metadata, :identity_metadata,
                    :rights_metadata, :dc, :rdf

      # values derived from those sections
      # (:dor_content_type used to be declared twice; once is enough)
      attr_accessor :release_tags_hash, :dor_content_type, :is_collection,
                    :collection_druids, :file_ids, :image_ids,
                    :catkey, :barcode, :label
    end
  end
end
24
+
25
+
26
+
27
+
28
+
29
+
@@ -0,0 +1,13 @@
1
module DiscoveryIndexer
  module InputXml
    # Base class for purl xml parsers. It keeps the document to parse;
    # #parse is an empty hook that returns nil and is meant to be overridden.
    class PurlxmlParser
      # @param purlxml_ng_doc the purl xml document to be parsed
      def initialize(purlxml_ng_doc)
        @purlxml_ng_doc = purlxml_ng_doc
      end

      # No-op in the base class.
      # @return [nil]
      def parse; end
    end
  end
end
@@ -0,0 +1,210 @@
1
module DiscoveryIndexer
  module InputXml
    # Parses a purl xml document into a PurlxmlModel. "Strict" because the
    # identityMetadata, rightsMetadata, dc and rdf sections are mandatory: a
    # missing one raises the matching DiscoveryIndexer::Errors class.
    # contentMetadata is optional.
    class PurlxmlParserStrict < PurlxmlParser
      RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
      OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
      MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'

      # @param [Nokogiri::XML::Document] purlxml_ng_doc the purl xml to parse
      def initialize(purlxml_ng_doc)
        super
      end

      # it parses the purlxml into a purlxml model
      # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
      # @raise [StandardError] one of the DiscoveryIndexer::Errors::Missing*
      #  classes when a mandatory section is absent
      def parse
        purlxml_model = PurlxmlModel.new
        purlxml_model.public_xml = @purlxml_ng_doc
        purlxml_model.content_metadata = parse_content_metadata
        purlxml_model.identity_metadata = parse_identity_metadata
        purlxml_model.rights_metadata = parse_rights_metadata
        purlxml_model.dc = parse_dc
        purlxml_model.rdf = parse_rdf
        purlxml_model.is_collection = parse_is_collection
        purlxml_model.collection_druids = parse_collection_druids
        purlxml_model.dor_content_type = parse_dor_content_type
        purlxml_model.release_tags_hash = parse_release_tags_hash
        purlxml_model.file_ids = parse_file_ids
        purlxml_model.image_ids = parse_image_ids
        purlxml_model.catkey = parse_catkey
        purlxml_model.barcode = parse_barcode
        purlxml_model.label = parse_label
        purlxml_model
      end

      # extracts the identityMetadata for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identityMetadata
      def parse_identity_metadata
        extract_required_section('/publicObject/identityMetadata',
                                 DiscoveryIndexer::Errors::MissingIdentityMetadata)
      end

      # extracts the rightsMetadata for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the rightsMetadata for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingRightsMetadata] if there is no rightsMetadata
      def parse_rights_metadata
        extract_required_section('/publicObject/rightsMetadata',
                                 DiscoveryIndexer::Errors::MissingRightsMetadata)
      end

      # extracts the dc field for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the dc for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
      def parse_dc
        extract_required_section('/publicObject/dc:dc',
                                 DiscoveryIndexer::Errors::MissingDC,
                                 { 'dc' => OAI_DC_NAMESPACE },
                                 { :encoding => 'utf-8' })
      end

      # extracts the rdf field for this fedora object, from the purl xml
      # @return [Nokogiri::XML::Document] the rdf for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
      def parse_rdf
        extract_required_section('/publicObject/rdf:RDF',
                                 DiscoveryIndexer::Errors::MissingRDF,
                                 { 'rdf' => RDF_NAMESPACE })
      end

      # extracts the release tag elements for this fedora object, from the
      # identity metadata in purl xml. Release elements without a "to"
      # attribute are ignored; a later element for the same target wins.
      # @return [Hash] the release tags for the fedora object, keyed by the
      #  "to" attribute, with the element text (a String) as value
      # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identityMetadata
      def parse_release_tags_hash
        release_tags = {}
        parse_identity_metadata.xpath('//release').each do |release|
          target = release.attr('to')
          next if target.nil?

          release_tags[target] = release.text
        end
        release_tags
      end

      # extracts the contentMetadata for this fedora object, from the purl xml.
      # Unlike the other sections this one is optional: nothing is raised when
      # it is absent, and the returned document is then empty.
      # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
      def parse_content_metadata
        Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
      end

      # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
      # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identityMetadata
      def parse_is_collection
        # The section is a document whose root element is identityMetadata, so
        # the path is anchored there. './objectType' evaluated against the
        # document node only inspects the root element and never matched.
        object_types = parse_identity_metadata.xpath('/identityMetadata/objectType')
        object_types.any? { |node| node.text == 'collection' }
      end

      # get the druids from isMemberOfCollection relationships in rels-ext from public_xml
      # @return [Array<String>] the druids (e.g. ww123yy1234) this object has
      #  isMemberOfCollection relationship with, or nil if none
      def parse_collection_druids
        ns_hash = {
          'rdf' => RDF_NAMESPACE,
          'fedora' => 'info:fedora/fedora-system:def/relations-external#',
          '' => ''
        }
        member_nodes = @purlxml_ng_doc.xpath(
          '/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash
        )
        # resource values look like "info:fedora/druid:ww123yy1234"; keep what follows "druid:"
        druids = member_nodes.map(&:value).reject(&:empty?).map { |value| value.split('druid:').last }
        druids.empty? ? nil : druids
      end

      # the value of the type attribute for a DOR object's contentMetadata
      # more info about these values is here:
      #   https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
      #   https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
      # Prints a notice to stdout when the type is missing.
      # @return [String] the type, or an empty string when there is none
      def parse_dor_content_type
        # '@type' evaluated against the document node is always empty (a
        # document has no attributes); read it from the root element instead
        dct = parse_content_metadata.xpath('/contentMetadata/@type').text
        puts " has no DOR content type (<contentMetadata> element may be missing type attribute)" if dct.empty?
        dct
      end

      # the @id attribute of resource/file elements whose resource has type
      # "image", including extension
      # @return [Array<String>] filenames, or nil if none
      def parse_image_ids
        content_file_ids('/contentMetadata/resource[@type="image"]/file/@id')
      end

      # the @id attribute of every resource/file element, including extension
      # @return [Array<String>] filenames, or nil if none
      def parse_file_ids
        content_file_ids('/contentMetadata/resource/file/@id')
      end

      # @return [String] the catkey from identityMetadata otherId, or nil if none
      def parse_catkey
        first_content("/publicObject/identityMetadata/otherId[@name='catkey']")
      end

      # @return [String] the barcode from identityMetadata otherId, or nil if none
      def parse_barcode
        first_content("/publicObject/identityMetadata/otherId[@name='barcode']")
      end

      # @return [String] the objectLabel from identityMetadata, or nil if none
      def parse_label
        first_content('/publicObject/identityMetadata/objectLabel')
      end

      private

      # Copies the nodes matched by xpath into a document of their own and
      # insists that something was found.
      # @param [String] xpath absolute path of the section in the purl xml
      # @param [Class] error_class raised, with the inspected purl xml as
      #  message, when the section is missing or cannot be extracted
      # @param [Hash] namespaces prefix => uri bindings used by xpath, if any
      # @param [Hash] xml_options options passed on to to_xml
      # @return [Nokogiri::XML::Document]
      def extract_required_section(xpath, error_class, namespaces = nil, xml_options = {})
        root = @purlxml_ng_doc.root
        nodes = namespaces ? root.xpath(xpath, namespaces) : root.xpath(xpath)
        section = Nokogiri::XML(nodes.to_xml(xml_options))
        raise error_class, @purlxml_ng_doc.inspect if section.children.empty?

        section
      rescue StandardError
        # any failure (including a purl xml without a root) counts as "missing"
        raise error_class, @purlxml_ng_doc.inspect
      end

      # @param [String] xpath path evaluated against the contentMetadata document
      # @return [Array<String>] the non-empty matched values, or nil if none
      def content_file_ids(xpath)
        ids = parse_content_metadata.xpath(xpath).map(&:text).reject(&:empty?)
        ids.empty? ? nil : ids
      end

      # @param [String] xpath path evaluated against the whole purl xml
      # @return [String] content of the first matching node, or nil if none
      def first_content(xpath)
        node = @purlxml_ng_doc.xpath(xpath).first
        node ? node.content : nil
      end
    end
  end
end
210
+
@@ -0,0 +1,23 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
module DiscoveryIndexer
  module InputXml
    # Fetches the public xml of an object from the PURL server.
    class PurlxmlReader
      # reads the public xml for the fedora object that is defined, from the purl server
      # @param [String] druid e.g. ab123cd4567
      # @return [Nokogiri::XML::Document] the public xml for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingPurlPage] if the purl page
      #  cannot be fetched or parsed; the message is the URI that was tried
      def self.read(druid)
        purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"

        begin
          # OpenURI.open_uri instead of Kernel#open: Kernel#open no longer
          # opens URLs on Ruby 3, which made every read look like a missing
          # page. The block form also closes the stream once it is parsed.
          OpenURI.open_uri(purlxml_uri) { |io| Nokogiri::XML(io) }
        rescue StandardError
          raise DiscoveryIndexer::Errors::MissingPurlPage, purlxml_uri
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module DiscoveryIndexer
  module Utilities
    # Helper for looking up a single target in a release tags hash.
    class ExtractSubTargets
      # @param target_name the key to look up
      # @param [Hash] release_tags hash of target name => value
      # @return the value stored for target_name, or nil when the hash has no
      #  such key
      def self.by_name(target_name, release_tags)
        return nil unless release_tags.keys.include?(target_name)

        release_tags[target_name]
      end
    end
  end
end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.0.1'
2
+ VERSION = '0.1'
3
3
  end
@@ -0,0 +1,51 @@
1
require 'logger'
require 'retries'
require 'rsolr'
3
+
4
module DiscoveryIndexer
  module Writer
    # Sends a single add or delete to a solr core, retrying on errors, and
    # commits afterwards. Progress is logged to STDOUT.
    class SolrClient
      # Add the document to solr, retry if an error occurs.
      # See https://github.com/ooyala/retries for docs on with_retries.
      # @param [Hash] solr_doc a Hash representation of the solr document
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      def self.add(solr_doc, solr_connector, max_retries = 10)
        process(solr_doc, solr_connector, max_retries, false)
      end

      # Delete the document from solr, retry if an error occurs.
      # See https://github.com/ooyala/retries for docs on with_retries.
      # @param [Hash] solr_doc that has only the id !{:id=>"ab123cd4567"}
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      def self.delete(solr_doc, solr_connector, max_retries = 10)
        process(solr_doc, solr_connector, max_retries, true)
      end

      # Shared implementation of add and delete. The add/delete call runs
      # inside with_retries; the commit is issued once afterwards, outside
      # the retry loop.
      # @param [Hash] solr_doc the solr document; only :id is used for a delete
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      # @param [Boolean] is_delete true to delete by id, false to add the document
      def self.process(solr_doc, solr_connector, max_retries, is_delete = false)
        logger = Logger.new(STDOUT)
        id = solr_doc[:id]
        # called by with_retries after each failed attempt
        handler = proc do |exception, attempt_number, _total_delay|
          logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
        end

        with_retries(:max_tries => max_retries, :handler => handler,
                     :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
          logger.debug "Attempt #{attempt} for #{id}"

          if is_delete
            solr_connector.delete_by_id(id)
            logger.info "Successfully deleted #{id} on attempt #{attempt}"
          else
            solr_connector.add(solr_doc)
            logger.info "Successfully indexed #{id} on attempt #{attempt}"
          end
        end
        solr_connector.commit
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'retries'
2
+ require 'rsolr'
3
+
4
module DiscoveryIndexer
  module Writer
    # Routes one object to the configured solr targets: it is indexed into
    # the targets it is released to and deleted from the others.
    class SolrWriter
      # Indexes index_doc into every target flagged as released and deletes
      # the druid from every other listed target.
      # @param [String] druid the object id, used for deletes
      # @param [Hash] index_doc the solr document to add
      # @param [Hash] targets target name => release flag. nil, false and the
      #  string "false" (any case) mean "not released"; anything else means
      #  "released"
      # @param [Hash] solr_targets_configs target name => config handed to
      #  RSolr.connect
      def process(druid, index_doc, targets, solr_targets_configs)
        @solr_targets_configs = solr_targets_configs
        index_targets, delete_targets = targets.keys.partition { |target| released?(targets[target]) }

        solr_index_client(index_doc, index_targets)
        solr_delete_client(druid, delete_targets)
      end

      # Deletes the druid from every configured target.
      # @param [String] druid the object id
      # @param [Hash] solr_targets_configs target name => config handed to
      #  RSolr.connect
      def solr_delete_from_all(druid, solr_targets_configs)
        @solr_targets_configs = solr_targets_configs
        solr_delete_client(druid, @solr_targets_configs.keys)
      end

      # Adds index_doc to each of the given targets. Targets without a
      # configuration are skipped.
      # @param [Hash] index_doc the solr document to add
      # @param [Array] targets target names
      def solr_index_client(index_doc, targets)
        targets.each do |solr_target|
          solr_connector = get_connector_for_target(solr_target)
          # a nil connector would only fail inside the retry loop of SolrClient
          SolrClient.add(index_doc, solr_connector) if solr_connector
        end
      end

      # Deletes the druid from each of the given targets. Targets without a
      # configuration are skipped.
      # @param [String] druid the object id
      # @param [Array] targets target names
      def solr_delete_client(druid, targets)
        targets.each do |solr_target|
          solr_connector = get_connector_for_target(solr_target)
          SolrClient.delete({ :id => druid }, solr_connector) if solr_connector
        end
      end

      # @param solr_target target name
      # @return a new connection built by RSolr.connect from the target's
      #  config, or nil when the target is not configured
      def get_connector_for_target(solr_target)
        return nil unless @solr_targets_configs.key?(solr_target)

        RSolr.connect(@solr_targets_configs[solr_target])
      end

      private

      # Release flags may arrive as the text of a release element, so the
      # string "false" must not be treated as truthy.
      # @param flag the release value of one target
      # @return [Boolean] whether the object should be indexed into the target
      def released?(flag)
        return false unless flag

        flag.to_s.strip.downcase != 'false'
      end
    end
  end
end
metadata CHANGED
@@ -1,15 +1,127 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: '0.1'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-09 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2015-03-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: stanford-mods
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: retries
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rsolr
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: equivalent-xml
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: vcr
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
13
125
  description: This library manages the core operations for the discovery indexing such
14
126
  as reading PURL xml, mapping to the solr document, and writing to solr core.
15
127
  email: aalsum@stanford.edu
@@ -18,7 +130,20 @@ extensions: []
18
130
  extra_rdoc_files: []
19
131
  files:
20
132
  - lib/discovery-indexer.rb
133
+ - lib/errors.rb
134
+ - lib/mapper/general_mapper.rb
135
+ - lib/mapper/index_mapper.rb
136
+ - lib/reader/modsxml.rb
137
+ - lib/reader/modsxml_reader.rb
138
+ - lib/reader/purlxml.rb
139
+ - lib/reader/purlxml_model.rb
140
+ - lib/reader/purlxml_parser.rb
141
+ - lib/reader/purlxml_parser_strict.rb
142
+ - lib/reader/purlxml_reader.rb
143
+ - lib/utilities/extract_sub_targets.rb
21
144
  - lib/version.rb
145
+ - lib/writer/solr_client.rb
146
+ - lib/writer/solr_writer.rb
22
147
  homepage:
23
148
  licenses:
24
149
  - Stanford University
@@ -29,12 +154,12 @@ require_paths:
29
154
  - lib
30
155
  required_ruby_version: !ruby/object:Gem::Requirement
31
156
  requirements:
32
- - - '>='
157
+ - - ">="
33
158
  - !ruby/object:Gem::Version
34
159
  version: '0'
35
160
  required_rubygems_version: !ruby/object:Gem::Requirement
36
161
  requirements:
37
- - - '>='
162
+ - - ">="
38
163
  - !ruby/object:Gem::Version
39
164
  version: '0'
40
165
  requirements: []