harvestdor-indexer 0.0.3 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/README.rdoc +4 -0
- data/Rakefile +2 -1
- data/harvestdor-indexer.gemspec +1 -1
- data/lib/harvestdor-indexer.rb +112 -27
- data/lib/harvestdor-indexer/version.rb +1 -1
- data/spec/unit/harvestdor-indexer_spec.rb +45 -61
- metadata +19 -33
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OGYzNWY3ZWNkM2VlNzdhY2YyN2UxZGQ3NzM2MTQyOTRmYzRiZDVjMw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YTE4YWFhMWEzOGQ0NDJhMDdlNGRlNGFiNjNiMjE1ZWJlNWMyNzQ2Yg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ODdlZDM2MTNkYmYzODNlZmVhY2I0YzJjNGRhMGE3MjMyYzNjNTZiYTJmNDMy
|
10
|
+
ZWI1ZTUyZjZmNWFlNTY3NGUzZGMzMzVlMDY0MTQxNjRiNzRlM2U0OGI3MzU4
|
11
|
+
NTZhNWUwMjE2MjNjMDJjODU4MTA3YzVjOGY5NzQzZGNkMGE1ZDM=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MTYyMzMzNDA2OTM5NWU4ZjdlNGQxYzc3ODU5MTBmNjI5MGM0OTM3NjNkZDdm
|
14
|
+
OWIwY2M1Yjk5OTEyOGIxNGFmYmEwMGNmZDhkZmQ4MmM5MWQxMDAzMmZhMTdl
|
15
|
+
YmNkODM1MDZmMTI4OThhYmFlYWFlMTc5MzEwYzBlMTIyNzg4Yzk=
|
data/README.rdoc
CHANGED
@@ -108,6 +108,10 @@ I suggest you run your code on harvestdor-dev, as it is already set up to be abl
|
|
108
108
|
|
109
109
|
== Releases
|
110
110
|
|
111
|
+
* <b>0.0.7</b> adding additional logging of error, success counts, and time to index and harvest
|
112
|
+
* <b>0.0.6</b> tweak error handling for public xml pieces
|
113
|
+
* <b>0.0.5</b> make rake release a no-op
|
114
|
+
* <b>0.0.4</b> add confstruct runtime dependency
|
111
115
|
* <b>0.0.3</b> add methods for public_xml, content_metadata, identity_metadata ...
|
112
116
|
* <b>0.0.2</b> better model code for index method (thanks, Bess!)
|
113
117
|
* <b>0.0.1</b> initial commit
|
data/Rakefile
CHANGED
data/harvestdor-indexer.gemspec
CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |gem|
|
|
24
24
|
gem.add_dependency 'stanford-mods'
|
25
25
|
|
26
26
|
# Runtime dependencies
|
27
|
-
|
27
|
+
gem.add_runtime_dependency 'confstruct'
|
28
28
|
|
29
29
|
# Development dependencies
|
30
30
|
# Bundler will install these gems too if you've checked out harvestdor-indexer source from git and run 'bundle install'
|
data/lib/harvestdor-indexer.rb
CHANGED
@@ -15,7 +15,15 @@ module Harvestdor
|
|
15
15
|
# Base class to harvest from DOR via harvestdor gem and then index
|
16
16
|
class Indexer
|
17
17
|
|
18
|
+
attr_accessor :error_count, :success_count, :max_retries
|
19
|
+
attr_accessor :total_time_to_parse,:total_time_to_solr
|
20
|
+
|
18
21
|
def initialize yml_path, options = {}
|
22
|
+
@success_count=0 # the number of objects successfully indexed
|
23
|
+
@error_count=0 # the number of objects that failed
|
24
|
+
@max_retries=5 # the number of times to retry an object
|
25
|
+
@total_time_to_solr=0
|
26
|
+
@total_time_to_parse=0
|
19
27
|
@yml_path = yml_path
|
20
28
|
config.configure(YAML.load_file(yml_path)) if yml_path
|
21
29
|
config.configure options
|
@@ -35,19 +43,62 @@ module Harvestdor
|
|
35
43
|
# create a Solr profiling document for each druid
|
36
44
|
# write the result to the Solr index
|
37
45
|
def harvest_and_index
|
46
|
+
start_time=Time.now
|
47
|
+
logger.info("Started harvest_and_index at #{start_time}")
|
38
48
|
if whitelist.empty?
|
39
49
|
druids.each { |druid| index druid }
|
40
50
|
else
|
41
51
|
whitelist.each { |druid| index druid }
|
42
52
|
end
|
43
53
|
solr_client.commit
|
44
|
-
|
54
|
+
total_time=elapsed_time(start_time)
|
55
|
+
total_objects=@success_count+@error_count
|
56
|
+
logger.info("Finished harvest_and_index at #{Time.now}: final Solr commit returned")
|
57
|
+
logger.info("Total elapsed time for harvest and index: #{(total_time/60.0)} minutes")
|
58
|
+
logger.info("Avg solr commit time per object (successful): #{@total_time_to_solr/@success_count} seconds") unless (@total_time_to_solr == 0 || @success_count == 0)
|
59
|
+
logger.info("Avg solr commit time per object (all): #{@total_time_to_solr/total_objects} seconds") unless (@total_time_to_solr == 0 || @error_count == 0 || total_objects == 0)
|
60
|
+
logger.info("Avg parse time per object (successful): #{@total_time_to_parse/@success_count} seconds") unless (@total_time_to_parse == 0 || @success_count == 0)
|
61
|
+
logger.info("Avg parse time per object (all): #{@total_time_to_parse/total_objects} seconds") unless (@total_time_to_parse == 0 || @error_count == 0 || total_objects == 0)
|
62
|
+
logger.info("Avg complete index time per object (successful): #{total_time/@success_count} seconds") unless (@success_count == 0)
|
63
|
+
logger.info("Avg complete index time per object (all): #{total_time/total_objects} seconds") unless (@error_count == 0 || total_object == 0)
|
64
|
+
logger.info("Successful count: #{@success_count}")
|
65
|
+
logger.info("Error count: #{@error_count}")
|
66
|
+
logger.info("Total records processed: #{total_objects}")
|
45
67
|
end
|
46
68
|
|
47
69
|
# return Array of druids contained in the OAI harvest indicated by OAI params in yml configuration file
|
48
70
|
# @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
|
49
71
|
def druids
|
50
|
-
@druids
|
72
|
+
if @druids.nil?
|
73
|
+
start_time=Time.now
|
74
|
+
logger.info("Starting OAI harvest of druids at #{start_time}.")
|
75
|
+
@druids = harvestdor_client.druids_via_oai
|
76
|
+
logger.info("Completed OAI harves of druids at #{Time.now}. Found #{@druids.size} druids. Total elapsed time for OAI harvest = #{elapsed_time(start_time,:minutes)} minutes")
|
77
|
+
end
|
78
|
+
return @druids
|
79
|
+
end
|
80
|
+
|
81
|
+
#add the document to solr, retry if an error occurs
|
82
|
+
def solr_add(doc, id, do_retry=true)
|
83
|
+
#if do_retry is false, skip retrying
|
84
|
+
tries=do_retry ? 0 : 999
|
85
|
+
while tries < @max_retries
|
86
|
+
begin
|
87
|
+
tries+=1
|
88
|
+
solr_client.add(doc)
|
89
|
+
#return if successful
|
90
|
+
return
|
91
|
+
rescue => e
|
92
|
+
if tries<@max_retries
|
93
|
+
logger.warn "#{id}: #{e.message}, retrying"
|
94
|
+
else
|
95
|
+
@error_count+=1
|
96
|
+
logger.error "Failed saving #{id}: #{e.message}"
|
97
|
+
logger.error e.backtrace
|
98
|
+
return
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
51
102
|
end
|
52
103
|
|
53
104
|
# create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
|
@@ -59,6 +110,8 @@ module Harvestdor
|
|
59
110
|
logger.fatal("You must override the index method to transform druids into Solr docs and add them to Solr")
|
60
111
|
|
61
112
|
begin
|
113
|
+
start_time=Time.now
|
114
|
+
logger.info("About to index #{druid} at #{start_time}")
|
62
115
|
#logger.debug "About to index #{druid}"
|
63
116
|
doc_hash = {}
|
64
117
|
doc_hash[:id] = druid
|
@@ -69,10 +122,12 @@ module Harvestdor
|
|
69
122
|
|
70
123
|
solr_client.add(doc_hash)
|
71
124
|
|
72
|
-
|
125
|
+
logger.info("Indexed #{druid} in #{elapsed_time(start_time)} seconds")
|
126
|
+
@success_count+=1
|
73
127
|
# TODO: provide call to code to update DOR object's workflow datastream??
|
74
128
|
rescue => e
|
75
|
-
|
129
|
+
@error_count+=1
|
130
|
+
logger.error "Failed to index #{druid} in #{elapsed_time(start_time)} seconds: #{e.message}"
|
76
131
|
end
|
77
132
|
end
|
78
133
|
end
|
@@ -81,7 +136,9 @@ module Harvestdor
|
|
81
136
|
# @param [String] druid e.g. ab123cd4567
|
82
137
|
# @return [Stanford::Mods::Record] created from the MODS xml for the druid
|
83
138
|
def smods_rec druid
|
139
|
+
start_time=Time.now
|
84
140
|
ng_doc = harvestdor_client.mods druid
|
141
|
+
logger.info("Fetched MODs for #{druid} in #{elapsed_time(start_time)} seconds")
|
85
142
|
raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
|
86
143
|
mods_rec = Stanford::Mods::Record.new
|
87
144
|
mods_rec.from_nk_node(ng_doc.root)
|
@@ -92,45 +149,59 @@ module Harvestdor
|
|
92
149
|
# @param [String] druid e.g. ab123cd4567
|
93
150
|
# @return [Nokogiri::XML::Document] the public xml for the DOR object
|
94
151
|
def public_xml druid
|
152
|
+
start_time=Time.now
|
95
153
|
ng_doc = harvestdor_client.public_xml druid
|
154
|
+
logger.info("Fetched public_xml for #{druid} in #{elapsed_time(start_time)} seconds")
|
96
155
|
raise "No public xml for #{druid}" if !ng_doc
|
97
156
|
raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
|
98
157
|
ng_doc
|
99
158
|
end
|
100
|
-
|
101
|
-
# the contentMetadata for this DOR object, from the purl public xml
|
102
|
-
# @param [
|
159
|
+
|
160
|
+
# the contentMetadata for this DOR object, ultimately from the purl public xml
|
161
|
+
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
162
|
+
# a Nokogiri::XML::Document containing the public_xml for an object
|
103
163
|
# @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
|
104
|
-
def content_metadata
|
105
|
-
|
106
|
-
|
164
|
+
def content_metadata object
|
165
|
+
start_time=Time.now
|
166
|
+
ng_doc = harvestdor_client.content_metadata object
|
167
|
+
logger.info("Fetched content_metadata in #{elapsed_time(start_time)} seconds")
|
168
|
+
raise "No contentMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
107
169
|
ng_doc
|
108
170
|
end
|
109
|
-
|
110
|
-
# the identityMetadata for this DOR object, from the purl public xml
|
111
|
-
# @param [
|
171
|
+
|
172
|
+
# the identityMetadata for this DOR object, ultimately from the purl public xml
|
173
|
+
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
174
|
+
# a Nokogiri::XML::Document containing the public_xml for an object
|
112
175
|
# @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
|
113
|
-
def identity_metadata
|
114
|
-
|
115
|
-
|
176
|
+
def identity_metadata object
|
177
|
+
start_time=Time.now
|
178
|
+
ng_doc = harvestdor_client.identity_metadata object
|
179
|
+
logger.info("Fetched identity_metadata in #{elapsed_time(start_time)} seconds")
|
180
|
+
raise "No identityMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
116
181
|
ng_doc
|
117
182
|
end
|
118
183
|
|
119
|
-
# the rightsMetadata for this DOR object, from the purl public xml
|
120
|
-
# @param [
|
184
|
+
# the rightsMetadata for this DOR object, ultimately from the purl public xml
|
185
|
+
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
186
|
+
# a Nokogiri::XML::Document containing the public_xml for an object
|
121
187
|
# @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
|
122
|
-
def rights_metadata
|
123
|
-
|
124
|
-
|
188
|
+
def rights_metadata object
|
189
|
+
start_time=Time.now
|
190
|
+
ng_doc = harvestdor_client.rights_metadata object
|
191
|
+
logger.info("Fetched rights_metadata in #{elapsed_time(start_time)} seconds")
|
192
|
+
raise "No rightsMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
125
193
|
ng_doc
|
126
194
|
end
|
127
195
|
|
128
|
-
# the RDF for this DOR object, from the purl public xml
|
129
|
-
# @param [
|
196
|
+
# the RDF for this DOR object, ultimately from the purl public xml
|
197
|
+
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
198
|
+
# a Nokogiri::XML::Document containing the public_xml for an object
|
130
199
|
# @return [Nokogiri::XML::Document] the RDF for the DOR object
|
131
|
-
def rdf
|
132
|
-
|
133
|
-
|
200
|
+
def rdf object
|
201
|
+
start_time=Time.now
|
202
|
+
ng_doc = harvestdor_client.rdf object
|
203
|
+
logger.info("Fetched rdf in #{elapsed_time(start_time)} seconds")
|
204
|
+
raise "No RDF for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
134
205
|
ng_doc
|
135
206
|
end
|
136
207
|
|
@@ -162,6 +233,20 @@ module Harvestdor
|
|
162
233
|
@harvestdor_client ||= Harvestdor::Client.new({:config_yml_path => @yml_path})
|
163
234
|
end
|
164
235
|
|
236
|
+
def elapsed_time(start_time,units=:seconds)
|
237
|
+
elapsed_seconds=Time.now-start_time
|
238
|
+
case units
|
239
|
+
when :seconds
|
240
|
+
return elapsed_seconds.round(2)
|
241
|
+
when :minutes
|
242
|
+
return (elapsed_seconds/60.0).round(1)
|
243
|
+
when :hours
|
244
|
+
return (elapsed_seconds/3600.0).round(2)
|
245
|
+
else
|
246
|
+
return elapsed_seconds
|
247
|
+
end
|
248
|
+
end
|
249
|
+
|
165
250
|
# populate @blacklist as an Array of druids ('oo000oo0000') that will NOT be processed
|
166
251
|
# by reading the File at the indicated path
|
167
252
|
# @param [String] path - path of file containing a list of druids
|
@@ -200,7 +285,7 @@ module Harvestdor
|
|
200
285
|
logger.fatal msg
|
201
286
|
raise msg
|
202
287
|
end
|
203
|
-
|
288
|
+
|
204
289
|
# Global, memoized, lazy initialized instance of a logger
|
205
290
|
# @param [String] log_dir directory that holds the log file
|
206
291
|
# @param [String] log_name name of log file
|
@@ -13,13 +13,29 @@ describe Harvestdor::Indexer do
|
|
13
13
|
@whitelist_path = File.join(File.dirname(__FILE__), "../config/ap_whitelist.txt")
|
14
14
|
end
|
15
15
|
|
16
|
+
describe "access methods" do
|
17
|
+
it "initializes success count" do
|
18
|
+
@indexer.success_count.should == 0
|
19
|
+
end
|
20
|
+
it "initializes error count" do
|
21
|
+
@indexer.error_count.should == 0
|
22
|
+
end
|
23
|
+
it "initializes max_retries" do
|
24
|
+
@indexer.max_retries.should == 5
|
25
|
+
end
|
26
|
+
it "allows overriding of max_retries" do
|
27
|
+
@indexer.max_retries=6
|
28
|
+
@indexer.max_retries.should == 6
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
16
32
|
describe "logging" do
|
17
33
|
it "should write the log file to the directory indicated by log_dir" do
|
18
34
|
@indexer.logger.info("indexer_spec logging test message")
|
19
35
|
File.exists?(File.join(@yaml['log_dir'], @yaml['log_name'])).should == true
|
20
36
|
end
|
21
37
|
end
|
22
|
-
|
38
|
+
|
23
39
|
it "should initialize the harvestdor_client from the config" do
|
24
40
|
@hdor_client.should be_an_instance_of(Harvestdor::Client)
|
25
41
|
@hdor_client.config.default_set.should == @yaml['default_set']
|
@@ -32,7 +48,7 @@ describe Harvestdor::Indexer do
|
|
32
48
|
}
|
33
49
|
end
|
34
50
|
it "should call druids_via_oai and then call :add on rsolr connection" do
|
35
|
-
@
|
51
|
+
@indexer.should_receive(:druids).and_return([@fake_druid])
|
36
52
|
@indexer.solr_client.should_receive(:add).with(@doc_hash)
|
37
53
|
@indexer.solr_client.should_receive(:commit)
|
38
54
|
@indexer.harvest_and_index
|
@@ -76,7 +92,7 @@ describe Harvestdor::Indexer do
|
|
76
92
|
end
|
77
93
|
|
78
94
|
it "druids method should call druids_via_oai method on harvestdor_client" do
|
79
|
-
@hdor_client.should_receive(:druids_via_oai)
|
95
|
+
@hdor_client.should_receive(:druids_via_oai).and_return([@fake_druid])
|
80
96
|
@indexer.druids
|
81
97
|
end
|
82
98
|
|
@@ -129,16 +145,13 @@ describe Harvestdor::Indexer do
|
|
129
145
|
@hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(Nokogiri::XML("<publicObject/>"))
|
130
146
|
expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, Regexp.new("^Empty public xml for #{@fake_druid}: <"))
|
131
147
|
end
|
132
|
-
it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
|
133
|
-
expect { @indexer.public_xml(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
|
134
|
-
end
|
135
148
|
it "raises error if there is no public_xml page for the druid" do
|
136
149
|
@hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(nil)
|
137
150
|
expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, "No public xml for #{@fake_druid}")
|
138
151
|
end
|
139
152
|
end
|
140
153
|
context "#content_metadata" do
|
141
|
-
it "returns a Nokogiri::XML::Document derived from the public xml" do
|
154
|
+
it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
|
142
155
|
Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
|
143
156
|
cm = @indexer.content_metadata(@fake_druid)
|
144
157
|
cm.should be_kind_of(Nokogiri::XML::Document)
|
@@ -147,25 +160,23 @@ describe Harvestdor::Indexer do
|
|
147
160
|
cm.root.attributes['objectId'].text.should == @fake_druid
|
148
161
|
cm.root.text.strip.should == 'foo'
|
149
162
|
end
|
150
|
-
it "
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
163
|
+
it "if passed a Nokogiri::XML::Document of the public xml, it does no fetch" do
|
164
|
+
URI::HTTP.any_instance.should_not_receive(:open)
|
165
|
+
@hdor_client.should_receive(:content_metadata).and_call_original
|
166
|
+
cm = @indexer.content_metadata(@ng_pub_xml)
|
167
|
+
cm.should be_kind_of(Nokogiri::XML::Document)
|
168
|
+
cm.root.should_not == nil
|
169
|
+
cm.root.name.should == 'contentMetadata'
|
170
|
+
cm.root.attributes['objectId'].text.should == @fake_druid
|
171
|
+
cm.root.text.strip.should == 'foo'
|
157
172
|
end
|
158
173
|
it "raises RuntimeError if nil is returned by Harvestdor::Client.contentMetadata for the druid" do
|
159
174
|
@hdor_client.should_receive(:content_metadata).with(@fake_druid).and_return(nil)
|
160
|
-
expect { @indexer.content_metadata(@fake_druid) }.to raise_error(RuntimeError, "No contentMetadata for #{@fake_druid}")
|
161
|
-
end
|
162
|
-
it "raises MissingContentMetadata error if there is no contentMetadata in the public_xml for the druid" do
|
163
|
-
URI::HTTP.any_instance.should_receive(:open)
|
164
|
-
expect { @indexer.content_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingContentMetadata)
|
175
|
+
expect { @indexer.content_metadata(@fake_druid) }.to raise_error(RuntimeError, "No contentMetadata for \"#{@fake_druid}\"")
|
165
176
|
end
|
166
177
|
end
|
167
178
|
context "#identity_metadata" do
|
168
|
-
it "returns a Nokogiri::XML::Document derived from the public xml" do
|
179
|
+
it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
|
169
180
|
Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
|
170
181
|
im = @indexer.identity_metadata(@fake_druid)
|
171
182
|
im.should be_kind_of(Nokogiri::XML::Document)
|
@@ -173,25 +184,22 @@ describe Harvestdor::Indexer do
|
|
173
184
|
im.root.name.should == 'identityMetadata'
|
174
185
|
im.root.text.strip.should == "druid:#{@fake_druid}"
|
175
186
|
end
|
176
|
-
it "
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
187
|
+
it "if passed a Nokogiri::XML::Document of the public xml, it does no fetch" do
|
188
|
+
URI::HTTP.any_instance.should_not_receive(:open)
|
189
|
+
@hdor_client.should_receive(:identity_metadata).and_call_original
|
190
|
+
im = @indexer.identity_metadata(@ng_pub_xml)
|
191
|
+
im.should be_kind_of(Nokogiri::XML::Document)
|
192
|
+
im.root.should_not == nil
|
193
|
+
im.root.name.should == 'identityMetadata'
|
194
|
+
im.root.text.strip.should == "druid:#{@fake_druid}"
|
183
195
|
end
|
184
196
|
it "raises RuntimeError if nil is returned by Harvestdor::Client.identityMetadata for the druid" do
|
185
197
|
@hdor_client.should_receive(:identity_metadata).with(@fake_druid).and_return(nil)
|
186
|
-
expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(RuntimeError, "No identityMetadata for #{@fake_druid}")
|
187
|
-
end
|
188
|
-
it "raises MissingIdentityMetadata error if there is no identityMetadata in the public_xml for the druid" do
|
189
|
-
URI::HTTP.any_instance.should_receive(:open)
|
190
|
-
expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingIdentityMetadata)
|
198
|
+
expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(RuntimeError, "No identityMetadata for \"#{@fake_druid}\"")
|
191
199
|
end
|
192
200
|
end
|
193
201
|
context "#rights_metadata" do
|
194
|
-
it "returns a Nokogiri::XML::Document derived from the public xml" do
|
202
|
+
it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
|
195
203
|
Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
|
196
204
|
im = @indexer.rights_metadata(@fake_druid)
|
197
205
|
im.should be_kind_of(Nokogiri::XML::Document)
|
@@ -199,25 +207,13 @@ describe Harvestdor::Indexer do
|
|
199
207
|
im.root.name.should == 'rightsMetadata'
|
200
208
|
im.root.text.strip.should == "bar"
|
201
209
|
end
|
202
|
-
it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
|
203
|
-
expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
|
204
|
-
end
|
205
|
-
it "should raise exception if there is no rightsMetadata in the public xml" do
|
206
|
-
pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@cntnt_md_xml}</publicObject>"
|
207
|
-
Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(Nokogiri::XML(pub_xml))
|
208
|
-
expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(RuntimeError, "No rightsMetadata for #{@fake_druid}")
|
209
|
-
end
|
210
210
|
it "raises RuntimeError if nil is returned by Harvestdor::Client.rightsMetadata for the druid" do
|
211
211
|
@hdor_client.should_receive(:rights_metadata).with(@fake_druid).and_return(nil)
|
212
|
-
expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(RuntimeError, "No rightsMetadata for #{@fake_druid}")
|
213
|
-
end
|
214
|
-
it "raises MissingRightsMetadata error if there is no rightsMetadata in the public_xml for the druid" do
|
215
|
-
URI::HTTP.any_instance.should_receive(:open)
|
216
|
-
expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingRightsMetadata)
|
212
|
+
expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(RuntimeError, "No rightsMetadata for \"#{@fake_druid}\"")
|
217
213
|
end
|
218
214
|
end
|
219
215
|
context "#rdf" do
|
220
|
-
it "returns a Nokogiri::XML::Document derived from the public xml" do
|
216
|
+
it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
|
221
217
|
Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
|
222
218
|
im = @indexer.rdf(@fake_druid)
|
223
219
|
im.should be_kind_of(Nokogiri::XML::Document)
|
@@ -225,23 +221,11 @@ describe Harvestdor::Indexer do
|
|
225
221
|
im.root.name.should == 'RDF'
|
226
222
|
im.root.text.strip.should == "relationship!"
|
227
223
|
end
|
228
|
-
it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
|
229
|
-
expect { @indexer.rdf(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
|
230
|
-
end
|
231
|
-
it "should raise exception if there is no rdf in the public xml" do
|
232
|
-
pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@cntnt_md_xml}</publicObject>"
|
233
|
-
Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(Nokogiri::XML(pub_xml))
|
234
|
-
expect { @indexer.rdf(@fake_druid) }.to raise_error(RuntimeError, "No RDF for #{@fake_druid}")
|
235
|
-
end
|
236
224
|
it "raises RuntimeError if nil is returned by Harvestdor::Client.rdf for the druid" do
|
237
225
|
@hdor_client.should_receive(:rdf).with(@fake_druid).and_return(nil)
|
238
|
-
expect { @indexer.rdf(@fake_druid) }.to raise_error(RuntimeError, "No RDF for #{@fake_druid}")
|
239
|
-
end
|
240
|
-
it "raises MissingRDF error if there is no rdf in the public_xml for the druid" do
|
241
|
-
URI::HTTP.any_instance.should_receive(:open)
|
242
|
-
expect { @indexer.rdf(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingRDF)
|
226
|
+
expect { @indexer.rdf(@fake_druid) }.to raise_error(RuntimeError, "No RDF for \"#{@fake_druid}\"")
|
243
227
|
end
|
244
|
-
end
|
228
|
+
end
|
245
229
|
end
|
246
230
|
|
247
231
|
context "blacklist" do
|
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: harvestdor-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.10
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Naomi Dushay
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-10-18 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rsolr
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,7 +27,6 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: harvestdor
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
31
|
- - ! '>='
|
36
32
|
- !ruby/object:Gem::Version
|
@@ -38,7 +34,6 @@ dependencies:
|
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
38
|
- - ! '>='
|
44
39
|
- !ruby/object:Gem::Version
|
@@ -46,7 +41,6 @@ dependencies:
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: stanford-mods
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
45
|
- - ! '>='
|
52
46
|
- !ruby/object:Gem::Version
|
@@ -54,7 +48,20 @@ dependencies:
|
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: confstruct
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
58
65
|
requirements:
|
59
66
|
- - ! '>='
|
60
67
|
- !ruby/object:Gem::Version
|
@@ -62,7 +69,6 @@ dependencies:
|
|
62
69
|
- !ruby/object:Gem::Dependency
|
63
70
|
name: lyberteam-gems-devel
|
64
71
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
72
|
requirements:
|
67
73
|
- - ! '>='
|
68
74
|
- !ruby/object:Gem::Version
|
@@ -70,7 +76,6 @@ dependencies:
|
|
70
76
|
type: :development
|
71
77
|
prerelease: false
|
72
78
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
79
|
requirements:
|
75
80
|
- - ! '>='
|
76
81
|
- !ruby/object:Gem::Version
|
@@ -78,7 +83,6 @@ dependencies:
|
|
78
83
|
- !ruby/object:Gem::Dependency
|
79
84
|
name: rake
|
80
85
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
86
|
requirements:
|
83
87
|
- - ! '>='
|
84
88
|
- !ruby/object:Gem::Version
|
@@ -86,7 +90,6 @@ dependencies:
|
|
86
90
|
type: :development
|
87
91
|
prerelease: false
|
88
92
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
93
|
requirements:
|
91
94
|
- - ! '>='
|
92
95
|
- !ruby/object:Gem::Version
|
@@ -94,7 +97,6 @@ dependencies:
|
|
94
97
|
- !ruby/object:Gem::Dependency
|
95
98
|
name: rdoc
|
96
99
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
100
|
requirements:
|
99
101
|
- - ! '>='
|
100
102
|
- !ruby/object:Gem::Version
|
@@ -102,7 +104,6 @@ dependencies:
|
|
102
104
|
type: :development
|
103
105
|
prerelease: false
|
104
106
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
107
|
requirements:
|
107
108
|
- - ! '>='
|
108
109
|
- !ruby/object:Gem::Version
|
@@ -110,7 +111,6 @@ dependencies:
|
|
110
111
|
- !ruby/object:Gem::Dependency
|
111
112
|
name: yard
|
112
113
|
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
114
|
requirements:
|
115
115
|
- - ! '>='
|
116
116
|
- !ruby/object:Gem::Version
|
@@ -118,7 +118,6 @@ dependencies:
|
|
118
118
|
type: :development
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
121
|
requirements:
|
123
122
|
- - ! '>='
|
124
123
|
- !ruby/object:Gem::Version
|
@@ -126,7 +125,6 @@ dependencies:
|
|
126
125
|
- !ruby/object:Gem::Dependency
|
127
126
|
name: rspec
|
128
127
|
requirement: !ruby/object:Gem::Requirement
|
129
|
-
none: false
|
130
128
|
requirements:
|
131
129
|
- - ! '>='
|
132
130
|
- !ruby/object:Gem::Version
|
@@ -134,7 +132,6 @@ dependencies:
|
|
134
132
|
type: :development
|
135
133
|
prerelease: false
|
136
134
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
135
|
requirements:
|
139
136
|
- - ! '>='
|
140
137
|
- !ruby/object:Gem::Version
|
@@ -142,7 +139,6 @@ dependencies:
|
|
142
139
|
- !ruby/object:Gem::Dependency
|
143
140
|
name: simplecov
|
144
141
|
requirement: !ruby/object:Gem::Requirement
|
145
|
-
none: false
|
146
142
|
requirements:
|
147
143
|
- - ! '>='
|
148
144
|
- !ruby/object:Gem::Version
|
@@ -150,7 +146,6 @@ dependencies:
|
|
150
146
|
type: :development
|
151
147
|
prerelease: false
|
152
148
|
version_requirements: !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
149
|
requirements:
|
155
150
|
- - ! '>='
|
156
151
|
- !ruby/object:Gem::Version
|
@@ -158,7 +153,6 @@ dependencies:
|
|
158
153
|
- !ruby/object:Gem::Dependency
|
159
154
|
name: simplecov-rcov
|
160
155
|
requirement: !ruby/object:Gem::Requirement
|
161
|
-
none: false
|
162
156
|
requirements:
|
163
157
|
- - ! '>='
|
164
158
|
- !ruby/object:Gem::Version
|
@@ -166,7 +160,6 @@ dependencies:
|
|
166
160
|
type: :development
|
167
161
|
prerelease: false
|
168
162
|
version_requirements: !ruby/object:Gem::Requirement
|
169
|
-
none: false
|
170
163
|
requirements:
|
171
164
|
- - ! '>='
|
172
165
|
- !ruby/object:Gem::Version
|
@@ -196,33 +189,26 @@ files:
|
|
196
189
|
- spec/unit/harvestdor-indexer_spec.rb
|
197
190
|
homepage: https://consul.stanford.edu/display/chimera/Chimera+project
|
198
191
|
licenses: []
|
192
|
+
metadata: {}
|
199
193
|
post_install_message:
|
200
194
|
rdoc_options: []
|
201
195
|
require_paths:
|
202
196
|
- lib
|
203
197
|
required_ruby_version: !ruby/object:Gem::Requirement
|
204
|
-
none: false
|
205
198
|
requirements:
|
206
199
|
- - ! '>='
|
207
200
|
- !ruby/object:Gem::Version
|
208
201
|
version: '0'
|
209
|
-
segments:
|
210
|
-
- 0
|
211
|
-
hash: -2920299245033359379
|
212
202
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
213
|
-
none: false
|
214
203
|
requirements:
|
215
204
|
- - ! '>='
|
216
205
|
- !ruby/object:Gem::Version
|
217
206
|
version: '0'
|
218
|
-
segments:
|
219
|
-
- 0
|
220
|
-
hash: -2920299245033359379
|
221
207
|
requirements: []
|
222
208
|
rubyforge_project:
|
223
|
-
rubygems_version:
|
209
|
+
rubygems_version: 2.0.7
|
224
210
|
signing_key:
|
225
|
-
specification_version:
|
211
|
+
specification_version: 4
|
226
212
|
summary: Harvest DOR object metadata and index it to Solr
|
227
213
|
test_files:
|
228
214
|
- spec/config/ap.yml
|