discovery-indexer 0.0.1 → 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c02b6da0be42381bffd949f1afb336909bfaa768
4
- data.tar.gz: 1ba06814d93eef836acff2399624d6d55a06d2a5
3
+ metadata.gz: 073a6699afc6ef96ee080e00be2dfe3180bfb734
4
+ data.tar.gz: e4361f9aad38081a92598e53fb0f55e564807812
5
5
  SHA512:
6
- metadata.gz: 7164b01c893146b50a41429aaacfa50f5592c86b322b38e9a2fbd779ab4152bfdc0e296457bcce4ad6393bdf22b1fcf21358cfbce3cc395ef661144e3716ff12
7
- data.tar.gz: ae30908c3aa0b56c2cdcc5646452db595bd9755d86c1de6a5794a950e654c51cfae06966088a18861f544e46ce4543efd15a055e108363bdb56a2a378f0b2089
6
+ metadata.gz: 29e0c0dd830b66cc7cfe61bd4da7a42d3ee6fbb55721bf01ecf1f4b1679585d7aae9cc7c6f2a35d77b80b0017b45babea77516b9a4373191b7c8899b0a9a371c
7
+ data.tar.gz: 09c61dd4fa8501865bc4081dcb584224de912921eae74bb34121d6aafd05f4a762f45cdaffdf2c0a0282d318bc501629dcb0f1228c48d1fcba9ca769d4ab07db
@@ -0,0 +1,22 @@
1
+ require 'reader/purlxml'
2
+ require 'reader/purlxml_reader'
3
+ require 'reader/purlxml_parser'
4
+ require 'reader/purlxml_parser_strict'
5
+ require 'reader/purlxml_model'
6
+
7
+ require 'reader/modsxml'
8
+ require 'reader/modsxml_reader'
9
+
10
+ require 'mapper/general_mapper'
11
+ require 'mapper/index_mapper'
12
+
13
+ require 'writer/solr_client'
14
+ require 'writer/solr_writer'
15
+
16
+ #require 'utilities/extract_sub_targets'
17
+
18
+ require 'errors'
19
+
20
module DiscoveryIndexer
  # Base URL of the PURL server that the readers fetch "<druid>.xml" and
  # "<druid>.mods" from. Frozen so that no caller can mutate the shared string.
  # NOTE(review): this points at the purl-test host - confirm that is intended
  # as the library default.
  PURL_DEFAULT = 'http://purl-test.stanford.edu'.freeze
end
data/lib/errors.rb ADDED
@@ -0,0 +1,13 @@
1
module DiscoveryIndexer
  # Exception types raised by the readers and parsers when an expected page
  # or metadata section cannot be obtained. All of them derive directly from
  # StandardError.
  module Errors
    class MissingPurlPage < StandardError; end
    class MissingMods < StandardError; end
    class MissingPublicXml < StandardError; end
    class MissingContentMetadata < StandardError; end
    class MissingIdentityMetadata < StandardError; end
    class MissingRightsMetadata < StandardError; end
    class MissingRDF < StandardError; end
    class MissingDC < StandardError; end
    class MissingModsPage < StandardError; end
  end
end
@@ -0,0 +1,18 @@
1
module DiscoveryIndexer
  module Mapper
    # Base class for mappers: it stores the mapping inputs and exposes a #map
    # hook that does nothing here (subclasses such as IndexMapper override it).
    class GeneralMapper
      # @param druid the object identifier
      # @param modsxml the MODS representation of the object
      # @param purlxml the PURL xml representation of the object
      # @param [Hash] collection_names optional collection data, empty by default
      def initialize(druid, modsxml, purlxml, collection_names={})
        @druid, @modsxml = druid, modsxml
        @purlxml, @collection_names = purlxml, collection_names
      end

      # No-op in the base class.
      # @return [nil]
      def map
      end
    end
  end
end
18
+
@@ -0,0 +1,179 @@
1
+ module DiscoveryIndexer
2
+ module Mapper
3
+
4
+ # This class is responsible for creating the solr_doc hash based on the input
5
+ # of druid_id, modsxml, purlxml, and optional hash of collection_names
6
+ class IndexMapper < GeneralMapper
7
+
8
# Builds an IndexMapper; all arguments are handed unchanged to GeneralMapper.
# @param [String] druid e.g. ab123cd4567
# @param [Stanford::Mods::Record] modsxml the MODS record for the druid
# @param [DiscoveryIndexer::InputXml::PurlxmlModel] purlxml the parsed purl xml model
# @param [Hash] collection_names maps collection druid to collection name,
#   e.g. !{"aa111aa1111"=>"First Collection", "bb123bb1234"=>"Second Collection"}
def initialize(druid, modsxml, purlxml, collection_names={})
  super(druid, modsxml, purlxml, collection_names)
end
17
+
18
# Builds the Solr document: the druid as :id, then every MODS-derived field
# group merged in order (later groups overwrite earlier keys), and finally
# the whitespace-normalised MODS text as :all_search.
# @return [Hash] Hash representing the Solr document
def map
  field_groups = [
    mods_to_title_fields,
    mods_to_author_fields,
    mods_to_subject_search_fields,
    mods_to_publication_fields,
    mods_to_pub_date,
    mods_to_others
  ]
  solr_doc = field_groups.inject({ :id => @druid }) { |doc, fields| doc.update(fields) }
  solr_doc[:all_search] = @modsxml.text.gsub(/\s+/, ' ')
  solr_doc
end
33
+
34
# Collects the title-related Solr fields from the MODS record.
# @return [Hash] Hash representing the title fields
def mods_to_title_fields
  {
    title_245a_search: @modsxml.sw_short_title,
    title_245_search: @modsxml.sw_full_title,
    title_variant_search: @modsxml.sw_addl_titles,
    title_sort: @modsxml.sw_sort_title,
    title_245a_display: @modsxml.sw_short_title,
    title_display: @modsxml.sw_title_display,
    title_full_display: @modsxml.sw_full_title
  }
end
48
+
49
# Collects the author-related Solr fields from the MODS record.
# :author_sort is sw_sort_author with its first character removed.
# NOTE(review): the [1..-1] slice raises NoMethodError when sw_sort_author
# returns nil - confirm that it always returns a String.
# @return [Hash] Hash representing the author fields
def mods_to_author_fields
  {
    author_1xx_search: @modsxml.sw_main_author,
    author_7xx_search: @modsxml.sw_addl_authors,
    author_person_facet: @modsxml.sw_person_authors,
    author_other_facet: @modsxml.sw_impersonal_authors,
    author_sort: @modsxml.sw_sort_author[1..-1],
    author_corp_display: @modsxml.sw_corporate_authors,
    author_meeting_display: @modsxml.sw_meeting_authors,
    author_person_display: @modsxml.sw_person_authors,
    author_person_full_display: @modsxml.sw_person_authors
  }
end
65
+
66
# Collects the subject search and facet Solr fields from the MODS record.
# @return [Hash] Hash representing the search fields
def mods_to_subject_search_fields
  {
    topic_search: @modsxml.topic_search,
    geographic_search: @modsxml.geographic_search,
    subject_other_search: @modsxml.subject_other_search,
    subject_other_subvy_search: @modsxml.subject_other_subvy_search,
    subject_all_search: @modsxml.subject_all_search,
    topic_facet: @modsxml.topic_facet,
    geographic_facet: @modsxml.geographic_facet,
    era_facet: @modsxml.era_facet
  }
end
80
+
81
# Collects the publication-related Solr fields from the MODS record.
# :imprint_display and :pub_date_display both carry pub_date_display
# (the latter may be deprecated).
# @return [Hash] Hash representing the publication fields
def mods_to_publication_fields
  {
    pub_search: @modsxml.place,
    pub_date_sort: @modsxml.pub_date_sort,
    imprint_display: @modsxml.pub_date_display,
    pub_date: @modsxml.pub_date_facet,
    pub_date_display: @modsxml.pub_date_display
  }
end
92
+
93
# Builds the publication-year fields. When the MODS sort date passes
# is_positive_int? it is stored in :pub_year_tisim (used by the date slider)
# and, if date_type_sym names a field, in that field as well.
# The sort date and the field name are each looked up once and reused.
# @return [Hash] Hash representing the pub date; empty when the date is unusable
def mods_to_pub_date
  pub_date_sort = @modsxml.pub_date_sort
  return {} unless is_positive_int? pub_date_sort

  doc_hash = { :pub_year_tisim => pub_date_sort }
  # put the displayable year in the correct field, :creation_year_isi for example
  year_field = date_type_sym
  doc_hash[year_field] = pub_date_sort if year_field
  doc_hash
end
104
+
105
# Collects the remaining Solr fields: formats, language, physical
# description, abstract, table of contents and supplementary URLs.
# :format is kept for backwards compatibility.
# @return [Hash] Hash representing some fields
def mods_to_others
  {
    format_main_ssim: format_main_ssim,
    format: format,
    language: @modsxml.sw_language_facet,
    physical: @modsxml.term_values([:physical_description, :extent]),
    summary_search: @modsxml.term_values(:abstract),
    toc_search: @modsxml.term_values(:tableOfContents),
    url_suppl: @modsxml.term_values([:related_item, :location, :url])
  }
end
118
+
119
# Returns the format value(s) reported by the MODS record (via the
# stanford-mods gem); the controlled vocabulary is listed at
# http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index
# Prints a notice to stdout when the record yields no format.
# @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
def format
  @modsxml.format.tap do |formats|
    puts "#{@druid} has no SearchWorks format from MODS - check <typeOfResource> and other implicated MODS elements" if formats.empty?
  end
end
130
+
131
# Returns the resource type value(s) reported by the MODS record's
# format_main; prints a notice to stdout when there are none.
# @return [Array<String>] value(s) in the SearchWorks controlled vocabulary, or []
def format_main_ssim
  @modsxml.format_main.tap do |resource_types|
    puts "#{@druid} has no SearchWorks Resource Type from MODS - check <typeOfResource> and other implicated MODS elements" if resource_types.empty?
  end
end
140
+
141
# Returns the genre value(s) reported by the MODS record's sw_genre.
# NOTE(review): #map as written in this class does not add this value to
# the Solr document - confirm whether that is intended.
# @return [Array<String>] value(s)
def genre_ssim
  @modsxml.sw_genre
end
146
+
147
+ protected
148
+
149
# Tells whether the value is written as a non-negative integer.
# The check is made on the text form: optional surrounding whitespace, an
# optional "+", then digits only. String#to_i is deliberately not used,
# because it turns nil, "" and non-numeric text into 0, which would pass.
# @param str the candidate value (String, Integer or nil)
# @return [Boolean] true if the value parses into an int and the int is >= 0
def is_positive_int? str
  !!(str.to_s =~ /\A\s*\+?\d+\s*\z/)
end
161
+
162
# Picks the Solr field for the displayable publication year: the
# publication field when originInfo has dateIssued values, otherwise the
# creation field when it has dateCreated values.
# @return Solr field name as a symbol, or nil when neither date is present
def date_type_sym
  [[:dateIssued, :publication_year_isi], [:dateCreated, :creation_year_isi]].each do |mods_term, solr_field|
    found = @modsxml.term_values([:origin_info, mods_term])
    return solr_field if found && found.length > 0
  end
  nil
end
175
+
176
+ end
177
+ end
178
+ end
179
+
@@ -0,0 +1,44 @@
1
+ require 'stanford-mods'
2
module DiscoveryIndexer
  module InputXml

    # Entry point for obtaining the MODS xml of a druid as a
    # Stanford::Mods::Record. The raw document comes from ModsxmlReader.
    # @example to run the code
    #  druid = "aa111aa1111"
    #  p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
    #  model = p.load()
    class Modsxml
      # @param druid [String] the druid object in the format "aa111aa1111"
      def initialize(druid)
        @druid = druid
        @modsxml_ng_doc = nil # fetched on the first #load / every #reload
      end

      # Builds a new record from the MODS document. The document is read
      # through ModsxmlReader on the first call only; the record is rebuilt
      # from the stored document on every call.
      # @return [Stanford::Mods::Record] represents the mods xml
      def load
        @modsxml_ng_doc = ModsxmlReader.read(@druid) if @modsxml_ng_doc.nil?
        Stanford::Mods::Record.new.tap { |record| record.from_nk_node(@modsxml_ng_doc) }
      end

      # Reads the MODS document again through ModsxmlReader, then builds the
      # record from the fresh document.
      # @return [Stanford::Mods::Record] represents the mods xml
      def reload
        @modsxml_ng_doc = ModsxmlReader.read(@druid)
        load
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
module DiscoveryIndexer
  module InputXml
    # Fetches the MODS xml of a druid from the PURL server.
    class ModsxmlReader

      # Reads "<PURL_DEFAULT>/<druid>.mods" and parses it with Nokogiri.
      # @param [String] druid e.g. ab123cd4567
      # @return [Nokogiri::XML::Document] the mods xml for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingModsPage] if the page cannot be
      #   fetched or parsed; the message is the URI that was tried
      def self.read(druid)
        mods_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.mods"

        begin
          # Go through URI#open (provided by open-uri) rather than Kernel#open:
          # Kernel#open treats a string starting with "|" as a command and no
          # longer opens URLs on Ruby 3. The block form closes the handle once
          # the document has been parsed.
          URI.parse(mods_uri).open { |io| Nokogiri::XML(io) }
        rescue StandardError
          raise DiscoveryIndexer::Errors::MissingModsPage.new(mods_uri)
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module DiscoveryIndexer
  module InputXml

    # Entry point for obtaining the purl xml of a druid as a parsed model.
    # The raw document comes from PurlxmlReader and is parsed with
    # PurlxmlParserStrict.
    # @example to run the code
    #  druid = "aa111aa1111"
    #  p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
    #  model = p.load()
    class Purlxml

      # @param druid [String] the druid object in the format "aa111aa1111"
      def initialize(druid)
        @druid = druid
        @purlxml_ng_doc = nil # fetched on the first #load / every #reload
      end

      # Parses the purl xml into a model. The document is read through
      # PurlxmlReader on the first call only; parsing is repeated on every call.
      # @return [PurlxmlModel] represents the purlxml
      def load
        @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
        PurlxmlParserStrict.new(@purlxml_ng_doc).parse
      end

      # Reads the purl xml again through PurlxmlReader, then parses the fresh
      # document.
      # @return [PurlxmlModel] represents the purlxml
      def reload
        @purlxml_ng_doc = PurlxmlReader.read(@druid)
        load
      end
    end
  end
end
43
+
@@ -0,0 +1,29 @@
1
module DiscoveryIndexer
  module InputXml
    # Plain value holder for everything PurlxmlParserStrict#parse extracts
    # from a purl public xml document. Every attribute is readable and
    # writable and starts out as nil.
    class PurlxmlModel
      # the source document and the sections extracted from it
      attr_accessor :public_xml, :content_metadata, :identity_metadata,
                    :rights_metadata, :dc, :rdf
      # values derived from those sections
      # (:dor_content_type is declared once; it was previously listed twice)
      attr_accessor :release_tags_hash, :dor_content_type, :is_collection,
                    :collection_druids, :file_ids, :image_ids,
                    :catkey, :barcode, :label
    end
  end
end
24
+
25
+
26
+
27
+
28
+
29
+
@@ -0,0 +1,13 @@
1
module DiscoveryIndexer
  module InputXml
    # Base class for purl xml parsers: keeps the document to parse and
    # exposes a #parse hook that does nothing here (PurlxmlParserStrict
    # overrides it).
    class PurlxmlParser

      # @param purlxml_ng_doc the purl xml document to be parsed
      def initialize(purlxml_ng_doc)
        @purlxml_ng_doc = purlxml_ng_doc
      end

      # No-op in the base class.
      # @return [nil]
      def parse
      end
    end
  end
end
@@ -0,0 +1,210 @@
1
+ module DiscoveryIndexer
2
+ module InputXml
3
+ class PurlxmlParserStrict < PurlxmlParser
4
+
5
+ RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
6
+ OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
7
+ MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
8
+
9
# Hands the document to PurlxmlParser#initialize unchanged.
# @param purlxml_ng_doc the purl xml document to be parsed
def initialize(purlxml_ng_doc)
  super(purlxml_ng_doc)
end
12
+
13
# Fills a new PurlxmlModel: the source document itself plus the result of
# every parse_* extractor, evaluated in the order listed below.
# @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
def parse
  PurlxmlModel.new.tap do |model|
    model.public_xml        = @purlxml_ng_doc
    model.content_metadata  = parse_content_metadata
    model.identity_metadata = parse_identity_metadata
    model.rights_metadata   = parse_rights_metadata
    model.dc                = parse_dc
    model.rdf               = parse_rdf
    model.is_collection     = parse_is_collection
    model.collection_druids = parse_collection_druids
    model.dor_content_type  = parse_dor_content_type
    model.release_tags_hash = parse_release_tags_hash
    model.file_ids          = parse_file_ids
    model.image_ids         = parse_image_ids
    model.catkey            = parse_catkey
    model.barcode           = parse_barcode
    model.label             = parse_label
  end
end
34
+
35
# Extracts /publicObject/identityMetadata from the purl xml and re-parses
# it as a document of its own.
# @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no
#   identityMetadata or it cannot be extracted; the message is the inspected purl xml
def parse_identity_metadata
  ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
  return ng_doc unless ng_doc.children.empty?
  raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
rescue DiscoveryIndexer::Errors::MissingIdentityMetadata
  # our own error: pass it on as is instead of building it a second time
  raise
rescue StandardError
  # any other failure while extracting is reported as a missing section
  raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
end
47
+
48
# Extracts /publicObject/rightsMetadata from the purl xml and re-parses it
# as a document of its own.
# @return [Nokogiri::XML::Document] the rightsMetadata for the fedora object
# @raise [DiscoveryIndexer::Errors::MissingRightsMetadata] if there is no
#   rightsMetadata or it cannot be extracted; the message is the inspected purl xml
def parse_rights_metadata
  ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
  return ng_doc unless ng_doc.children.empty?
  raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
rescue DiscoveryIndexer::Errors::MissingRightsMetadata
  # our own error: pass it on as is instead of building it a second time
  raise
rescue StandardError
  # any other failure while extracting is reported as a missing section
  raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
end
57
+
58
# Extracts the oai_dc element (/publicObject/dc:dc) from the purl xml and
# re-parses it, serialised as utf-8, as a document of its own.
# @return [Nokogiri::XML::Document] the dc for the fedora object
# @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element or it
#   cannot be extracted; the message is the inspected purl xml
def parse_dc
  dc_nodes = @purlxml_ng_doc.root.xpath('/publicObject/dc:dc', {'dc' => OAI_DC_NAMESPACE})
  ng_doc = Nokogiri::XML(dc_nodes.to_xml(:encoding => 'utf-8'))
  return ng_doc unless ng_doc.children.empty?
  raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
rescue DiscoveryIndexer::Errors::MissingDC
  # our own error: pass it on as is instead of building it a second time
  raise
rescue StandardError
  # any other failure while extracting is reported as a missing section
  raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
end
70
+
71
# Extracts the RDF element (/publicObject/rdf:RDF) from the purl xml and
# re-parses it as a document of its own.
# @return [Nokogiri::XML::Document] the rdf for the fedora object
# @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element or
#   it cannot be extracted; the message is the inspected purl xml
def parse_rdf
  ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', {'rdf' => RDF_NAMESPACE}).to_xml)
  return ng_doc unless ng_doc.children.empty?
  raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
rescue DiscoveryIndexer::Errors::MissingRDF
  # our own error: pass it on as is instead of building it a second time
  raise
rescue StandardError
  # any other failure while extracting is reported as a missing section
  raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
end
83
+
84
+
85
# Collects the <release> elements of the identity metadata into a hash
# keyed by their "to" attribute, with the element text as value. Elements
# without a "to" attribute are skipped; when the same target occurs more
# than once the last one wins.
# @return [Hash] the release tags for the fedora object; empty when there are
#   none (never nil)
def parse_release_tags_hash
  identity_metadata = parse_identity_metadata
  return {} if identity_metadata.nil?

  identity_metadata.xpath('//release').each_with_object({}) do |release_node, release_tags|
    release_target = release_node.attr("to")
    text = release_node.text
    release_tags[release_target] = text unless release_target.nil? || text.nil?
  end
end
111
+
112
# Extracts /publicObject/contentMetadata from the purl xml and re-parses it
# as a document of its own. Unlike the other section extractors this one
# does not raise when the section is absent: the parsed document is returned
# as is, even when it has no children.
# @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
def parse_content_metadata
  Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
end
124
+
125
# Tells whether the identity metadata declares the object a collection.
# The identity metadata is a document of its own whose root element is
# identityMetadata, so the objectType elements are addressed from that root;
# a path relative to the document node ("./objectType") matches nothing.
# @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
def parse_is_collection
  identity_metadata = parse_identity_metadata
  return false if identity_metadata.nil?

  identity_metadata.xpath('/identityMetadata/objectType').any? { |n| n.text == 'collection' }
end
134
+
135
# Reads the isMemberOfCollection relationships (rels-ext) of the purl xml
# and keeps, for each non-empty rdf:resource value, the part following
# 'druid:'.
# @return [Array<String>] the druids (e.g. ww123yy1234) this object has
#   isMemberOfCollection relationship with, or nil if none
def parse_collection_druids
  ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
  resource_attrs = @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
  druids = resource_attrs.reject { |attr| attr.value.empty? }
                         .map { |attr| attr.value.split('druid:').last }
  druids.empty? ? nil : druids
end
148
+
149
# The value of the type attribute of a DOR object's contentMetadata.
# More info about these values is here:
#   https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
#   https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
# The content metadata is a document of its own whose root element is
# contentMetadata, so the attribute is addressed from that root; a bare
# "@type" evaluated on the document node matches nothing.
# Prints a notice to stdout when no type is found.
# @return [String] the content type; empty when the attribute is absent
def parse_dor_content_type
  content_md = parse_content_metadata
  dct = content_md ? content_md.xpath('/contentMetadata/@type').text : nil
  puts " has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
  dct
end
160
+
161
# The @id attribute of the file elements inside resources of type "image",
# including extension. The content metadata is a document of its own whose
# root element is contentMetadata, so resources are addressed from that
# root; a path relative to the document node ("./resource") matches nothing.
# @return [Array<String>] filenames, or nil when there are none
def parse_image_ids
  content_md = parse_content_metadata
  return nil if content_md.nil?

  ids = content_md.xpath('/contentMetadata/resource[@type="image"]/file/@id').map(&:text).reject(&:empty?)
  ids.empty? ? nil : ids
end
174
+
175
# The @id attribute of every file element inside any resource of the
# content metadata. The content metadata is a document of its own whose
# root element is contentMetadata, so resources are addressed from that
# root; a path relative to the document node ("./resource") matches nothing.
# @return [Array<String>] filenames, or nil when there are none
def parse_file_ids
  content_md = parse_content_metadata
  return nil if content_md.nil?

  ids = content_md.xpath('/contentMetadata/resource/file/@id').map(&:text).reject(&:empty?)
  ids.empty? ? nil : ids
end
186
+
187
# Reads the first identityMetadata otherId element named 'catkey'.
# @return [String, nil] the content of that element, or nil when there is none
def parse_catkey
  matches = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']")
  return nil unless matches && matches.first
  matches.first.content
end
193
+
194
# Reads the first identityMetadata otherId element named 'barcode'.
# @return [String, nil] the content of that element, or nil when there is none
def parse_barcode
  matches = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='barcode']")
  return nil unless matches && matches.first
  matches.first.content
end
200
+
201
# Reads the first identityMetadata objectLabel element.
# @return [String, nil] the content of that element, or nil when there is none
def parse_label
  matches = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/objectLabel")
  return nil unless matches && matches.first
  matches.first.content
end
207
+ end
208
+ end
209
+ end
210
+
@@ -0,0 +1,23 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
module DiscoveryIndexer
  module InputXml
    # Fetches the public (purl) xml of a druid from the PURL server.
    class PurlxmlReader

      # Reads "<PURL_DEFAULT>/<druid>.xml" and parses it with Nokogiri.
      # @param [String] druid e.g. ab123cd4567
      # @return [Nokogiri::XML::Document] the public xml for the fedora object
      # @raise [DiscoveryIndexer::Errors::MissingPurlPage] if the page cannot be
      #   fetched or parsed; the message is the URI that was tried
      def self.read(druid)
        purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"

        begin
          # Go through URI#open (provided by open-uri) rather than Kernel#open:
          # Kernel#open treats a string starting with "|" as a command and no
          # longer opens URLs on Ruby 3. The block form closes the handle once
          # the document has been parsed.
          URI.parse(purlxml_uri).open { |io| Nokogiri::XML(io) }
        rescue StandardError
          raise DiscoveryIndexer::Errors::MissingPurlPage.new(purlxml_uri)
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module DiscoveryIndexer
  module Utilities
    # Helper for looking up a single entry in a release-tags hash.
    class ExtractSubTargets

      # Looks up the value stored for a target name.
      # Uses Hash#key? instead of scanning a freshly built keys array.
      # @param target_name the key to look for
      # @param [Hash] release_tags the hash to search
      # @return the value stored under target_name, or nil when the key is absent
      def self.by_name target_name, release_tags
        release_tags[target_name] if release_tags.key?(target_name)
      end

    end
  end
end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.0.1'
2
+ VERSION = '0.1'
3
3
  end
@@ -0,0 +1,51 @@
1
+ require 'retries'
2
+ require 'rsolr'
3
+
4
require 'logger' # Logger is used below; required here so the class loads on its own

module DiscoveryIndexer
  module Writer
    # Sends a single add or delete to a Solr connection, retrying on errors,
    # and commits afterwards.
    class SolrClient

      # Add the document to solr, retry if an error occurs.
      # See https://github.com/ooyala/retries for docs on with_retries.
      # @param [Hash] solr_doc a Hash representation of the solr document
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      def self.add(solr_doc, solr_connector, max_retries = 10)
        process(solr_doc, solr_connector, max_retries, false)
      end

      # Delete the document from solr by its id, retry if an error occurs.
      # See https://github.com/ooyala/retries for docs on with_retries.
      # @param [Hash] solr_doc that has only the id !{:id=>"ab123cd4567"}
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      def self.delete(solr_doc, solr_connector, max_retries = 10)
        process(solr_doc, solr_connector, max_retries, true)
      end

      # Runs the add (or delete, when is_delete is true) inside with_retries,
      # logging every attempt to STDOUT, then commits on the connection.
      # @param [Hash] solr_doc the document; its :id is used for deletes and logging
      # @param [RSolr::Client] solr_connector is an open connection with the solr core
      # @param [Integer] max_retries the maximum number of tries before fail
      # @param [Boolean] is_delete true to delete by id, false to add the document
      # @return the result of solr_connector.commit
      def self.process(solr_doc, solr_connector, max_retries, is_delete=false)
        logger = Logger.new STDOUT
        id = solr_doc[:id]
        puts id
        handler = Proc.new do |exception, attempt_number, total_delay|
          logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
        end

        with_retries(:max_tries => max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
          logger.debug "Attempt #{attempt} for #{id}"

          if is_delete
            solr_connector.delete_by_id(id)
            logger.info "Successfully deleted #{id} on attempt #{attempt}"
          else
            solr_connector.add(solr_doc)
            logger.info "Successfully indexed #{id} on attempt #{attempt}"
          end
        end
        solr_connector.commit
      end

    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'retries'
2
+ require 'rsolr'
3
+
4
module DiscoveryIndexer
  module Writer
    # Routes a document to the configured Solr targets: it is indexed into
    # the targets flagged true and deleted from the targets flagged false.
    class SolrWriter

      # @param [String] druid the id used when deleting
      # @param [Hash] index_doc the solr document used when indexing
      # @param [Hash] targets target name => flag; a truthy flag means index,
      #   a falsy one means delete
      #   NOTE(review): the test is plain truthiness, so the String "false"
      #   counts as true - confirm callers pass real booleans
      # @param [Hash] solr_targets_configs target name => RSolr connection options
      def process(druid, index_doc, targets, solr_targets_configs)
        @solr_targets_configs = solr_targets_configs
        # partition with << semantics; Array#append is not available before Ruby 2.5
        index_targets, delete_targets = targets.keys.partition { |target| targets[target] }

        solr_index_client(index_doc, index_targets)
        solr_delete_client(druid, delete_targets)
      end

      # Deletes the druid from every configured target.
      # @param [String] druid the id to delete
      # @param [Hash] solr_targets_configs target name => RSolr connection options
      def solr_delete_from_all(druid, solr_targets_configs)
        @solr_targets_configs = solr_targets_configs
        solr_delete_client(druid, @solr_targets_configs.keys)
      end

      # Adds the document to each listed target; targets that have no
      # configuration are skipped with a warning.
      # @param [Hash] index_doc the solr document
      # @param [Array] targets target names
      def solr_index_client(index_doc, targets)
        targets.each do |solr_target|
          solr_connector = get_connector_for_target(solr_target)
          if solr_connector.nil?
            # without a connector the add would only fail on nil and be retried
            warn "No Solr configuration for target #{solr_target}; skipping index"
            next
          end
          SolrClient.add(index_doc, solr_connector)
        end
      end

      # Deletes the druid from each listed target; targets that have no
      # configuration are skipped with a warning.
      # @param [String] druid the id to delete
      # @param [Array] targets target names
      def solr_delete_client(druid, targets)
        targets.each do |solr_target|
          solr_connector = get_connector_for_target(solr_target)
          if solr_connector.nil?
            # without a connector the delete would only fail on nil and be retried
            warn "No Solr configuration for target #{solr_target}; skipping delete"
            next
          end
          SolrClient.delete({:id=>druid}, solr_connector)
        end
      end

      # Opens a connection for a configured target. The configuration is no
      # longer printed, since connection options may hold credentials.
      # @param solr_target the target name
      # @return the RSolr connection, or nil when the target is not configured
      def get_connector_for_target(solr_target)
        return nil unless @solr_targets_configs.key?(solr_target)
        RSolr.connect(@solr_targets_configs[solr_target])
      end

    end
  end
end
metadata CHANGED
@@ -1,15 +1,127 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: '0.1'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-09 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2015-03-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: stanford-mods
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: retries
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rsolr
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: equivalent-xml
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: vcr
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
13
125
  description: This library manages the core operations for the discovery indexing such
14
126
  as reading PURL xml, mapping to the solr document, and writing to solr core.
15
127
  email: aalsum@stanford.edu
@@ -18,7 +130,20 @@ extensions: []
18
130
  extra_rdoc_files: []
19
131
  files:
20
132
  - lib/discovery-indexer.rb
133
+ - lib/errors.rb
134
+ - lib/mapper/general_mapper.rb
135
+ - lib/mapper/index_mapper.rb
136
+ - lib/reader/modsxml.rb
137
+ - lib/reader/modsxml_reader.rb
138
+ - lib/reader/purlxml.rb
139
+ - lib/reader/purlxml_model.rb
140
+ - lib/reader/purlxml_parser.rb
141
+ - lib/reader/purlxml_parser_strict.rb
142
+ - lib/reader/purlxml_reader.rb
143
+ - lib/utilities/extract_sub_targets.rb
21
144
  - lib/version.rb
145
+ - lib/writer/solr_client.rb
146
+ - lib/writer/solr_writer.rb
22
147
  homepage:
23
148
  licenses:
24
149
  - Stanford University
@@ -29,12 +154,12 @@ require_paths:
29
154
  - lib
30
155
  required_ruby_version: !ruby/object:Gem::Requirement
31
156
  requirements:
32
- - - '>='
157
+ - - ">="
33
158
  - !ruby/object:Gem::Version
34
159
  version: '0'
35
160
  required_rubygems_version: !ruby/object:Gem::Requirement
36
161
  requirements:
37
- - - '>='
162
+ - - ">="
38
163
  - !ruby/object:Gem::Version
39
164
  version: '0'
40
165
  requirements: []