harvestdor-indexer 1.0.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +4 -2
- data/Gemfile +1 -1
- data/README.rdoc +1 -0
- data/harvestdor-indexer.gemspec +4 -2
- data/lib/harvestdor-indexer.rb +1 -317
- data/lib/harvestdor/indexer.rb +159 -0
- data/lib/harvestdor/indexer/metrics.rb +53 -0
- data/lib/harvestdor/indexer/resource.rb +174 -0
- data/lib/harvestdor/indexer/solr.rb +39 -0
- data/lib/{harvestdor-indexer → harvestdor/indexer}/version.rb +1 -1
- data/spec/config/ap.yml +32 -44
- data/spec/fixtures/vcr_cassettes/get_collection_druids_call.yml +96 -0
- data/spec/fixtures/vcr_cassettes/process_druids_whitelist_call.yml +1494 -16
- data/spec/fixtures/vcr_cassettes/single_rsolr_connection_call.yml +80 -27
- data/spec/spec_helper.rb +1 -1
- data/spec/unit/harvestdor-indexer-resource_spec.rb +174 -0
- data/spec/unit/harvestdor-indexer-solr_spec.rb +32 -0
- data/spec/unit/harvestdor-indexer_spec.rb +47 -291
- data/spec/unit/harvestdor/indexer/metrics_spec.rb +46 -0
- metadata +45 -10
- data/config/dor-fetcher-client.yml +0 -4
- data/spec/config/ap_blacklist.txt +0 -5
@@ -0,0 +1,53 @@
|
|
1
|
+
module Harvestdor
|
2
|
+
##
|
3
|
+
# Harvest metrics tracker
|
4
|
+
class Indexer::Metrics
|
5
|
+
attr_accessor :error_count, :success_count, :logger
|
6
|
+
|
7
|
+
def initialize options = {}
|
8
|
+
@success_count=0 # the number of objects successfully indexed
|
9
|
+
@error_count=0 # the number of objects that failed
|
10
|
+
@logger = options[:logger] || Logger.new(STDERR)
|
11
|
+
end
|
12
|
+
|
13
|
+
##
|
14
|
+
# Wrap an operation in a tally block; if the block completes without throwing
|
15
|
+
# an exception, tally a success. If the block throws an exception, catch it
|
16
|
+
# and tally a failure.
|
17
|
+
#
|
18
|
+
# Callers can provide an :on_error handler to receive the exception and process
|
19
|
+
# it appropriately.
|
20
|
+
#
|
21
|
+
# @param [Hash] options
|
22
|
+
# @option options [#call] :on_error Callback that will receive any exception thrown by the block
|
23
|
+
def tally options = {}, &block
|
24
|
+
begin
|
25
|
+
block.call
|
26
|
+
success!
|
27
|
+
rescue => e
|
28
|
+
error!
|
29
|
+
logger.error "Failed to process: #{e.message}"
|
30
|
+
options[:on_error].call e if options[:on_error]
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
##
|
35
|
+
# Record a successful run
|
36
|
+
def success!
|
37
|
+
@success_count += 1
|
38
|
+
end
|
39
|
+
|
40
|
+
##
|
41
|
+
# Record an error
|
42
|
+
def error!
|
43
|
+
@error_count += 1
|
44
|
+
end
|
45
|
+
|
46
|
+
##
|
47
|
+
# Total number of runs
|
48
|
+
def total
|
49
|
+
@success_count + @error_count
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'active_support/benchmarkable'
|
2
|
+
|
3
|
+
module Harvestdor
|
4
|
+
class Indexer::Resource
|
5
|
+
include ActiveSupport::Benchmarkable
|
6
|
+
|
7
|
+
attr_reader :indexer, :druid, :options
|
8
|
+
|
9
|
+
def initialize indexer, druid, options = {}
|
10
|
+
@indexer = indexer
|
11
|
+
@druid = druid
|
12
|
+
@options = options
|
13
|
+
end
|
14
|
+
|
15
|
+
def bare_druid
|
16
|
+
@bare_druid ||= druid.gsub("druid:", "")
|
17
|
+
end
|
18
|
+
|
19
|
+
##
|
20
|
+
# The harvestdor client used for retrieving resources
|
21
|
+
def harvestdor_client
|
22
|
+
indexer.harvestdor_client
|
23
|
+
end
|
24
|
+
|
25
|
+
def dor_fetcher_client
|
26
|
+
indexer.dor_fetcher_client
|
27
|
+
end
|
28
|
+
|
29
|
+
##
|
30
|
+
# Get the logger
|
31
|
+
def logger
|
32
|
+
options[:logger] || (indexer.logger if indexer.respond_to? :logger) || Logger.new(STDERR)
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# Is this resource a collection?
|
37
|
+
def collection?
|
38
|
+
identity_metadata.xpath("/identityMetadata/objectType").any? { |x| x.text == "collection" }
|
39
|
+
end
|
40
|
+
|
41
|
+
# get the druids from isMemberOfCollection relationships in rels-ext from public_xml
|
42
|
+
# @return [Array<Harvestdor::Indexer::Resource>] resources for the druids (e.g. ww123yy1234) this object has an isMemberOfCollection relationship with; empty if none
|
43
|
+
def collections
|
44
|
+
@collections ||= begin
|
45
|
+
ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
|
46
|
+
is_member_of_nodes ||= public_xml.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
|
47
|
+
|
48
|
+
is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
|
49
|
+
Harvestdor::Indexer::Resource.new(indexer, n.value.gsub("info:fedora/", ""))
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
##
|
55
|
+
# Return the items in this collection
|
56
|
+
def items
|
57
|
+
@items ||= begin
|
58
|
+
druids = dor_fetcher_client.druid_array(dor_fetcher_client.get_collection(bare_druid, {}))
|
59
|
+
druids.map { |x| Harvestdor::Indexer::Resource.new(indexer, x) }
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
# get the objectLabel for this resource's druid from its purl page identityMetadata
|
64
|
+
# (takes no arguments; uses this resource's druid, e.g. ab123cd4567)
|
65
|
+
# @return [String] the value of the <objectLabel> element in the identityMetadata for the object
|
66
|
+
def identity_md_obj_label
|
67
|
+
logger.error("#{druid} missing identityMetadata") unless identity_metadata
|
68
|
+
identity_metadata.xpath('identityMetadata/objectLabel').text
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
# return the MODS for the druid as a Stanford::Mods::Record object
|
73
|
+
# (takes no arguments; uses this resource's druid, e.g. ab123cd4567)
|
74
|
+
# @return [Stanford::Mods::Record] created from the MODS xml for the druid
|
75
|
+
def smods_rec
|
76
|
+
@smods_rec ||= benchmark "smods_rec(#{druid})", level: :debug do
|
77
|
+
ng_doc = mods
|
78
|
+
raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
|
79
|
+
mods_rec = Stanford::Mods::Record.new
|
80
|
+
mods_rec.from_nk_node(ng_doc.root)
|
81
|
+
mods_rec
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def mods
|
86
|
+
@mods ||= harvestdor_client.mods bare_druid
|
87
|
+
end
|
88
|
+
|
89
|
+
# the public xml for this DOR object, from the purl page
|
90
|
+
# (takes no arguments; uses this resource's druid, e.g. ab123cd4567)
|
91
|
+
# @return [Nokogiri::XML::Document] the public xml for the DOR object
|
92
|
+
def public_xml
|
93
|
+
@public_xml ||= benchmark "public_xml(#{druid})", level: :debug do
|
94
|
+
ng_doc = harvestdor_client.public_xml bare_druid
|
95
|
+
raise "No public xml for #{druid}" if !ng_doc
|
96
|
+
raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
|
97
|
+
ng_doc
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
##
|
102
|
+
# Has the public_xml been previously retrieved?
|
103
|
+
def public_xml?
|
104
|
+
!!@public_xml
|
105
|
+
end
|
106
|
+
|
107
|
+
##
|
108
|
+
# Get the public_xml, if retrieved, or the druid. This is used to short-circuit
|
109
|
+
# retrieving metadata out of the public xml.
|
110
|
+
def public_xml_or_druid
|
111
|
+
if public_xml?
|
112
|
+
public_xml
|
113
|
+
else
|
114
|
+
bare_druid
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
# the contentMetadata for this DOR object, ultimately from the purl public xml
|
119
|
+
# (takes no arguments; the harvestdor client is given this resource's bare druid, e.g. ab123cd4567, or
|
120
|
+
# the Nokogiri::XML::Document containing the public_xml, if that was already retrieved)
|
121
|
+
# @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
|
122
|
+
def content_metadata
|
123
|
+
ng_doc = benchmark "content_metadata (#{druid})", level: :debug do
|
124
|
+
harvestdor_client.content_metadata public_xml_or_druid
|
125
|
+
end
|
126
|
+
raise "No contentMetadata for \"#{druid}\"" if !ng_doc || ng_doc.children.empty?
|
127
|
+
ng_doc
|
128
|
+
end
|
129
|
+
|
130
|
+
# the identityMetadata for this DOR object, ultimately from the purl public xml
|
131
|
+
# (takes no arguments; the harvestdor client is given this resource's bare druid, e.g. ab123cd4567, or
|
132
|
+
# the Nokogiri::XML::Document containing the public_xml, if that was already retrieved)
|
133
|
+
# @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
|
134
|
+
def identity_metadata
|
135
|
+
ng_doc = benchmark "identity_metadata (#{druid})", level: :debug do
|
136
|
+
harvestdor_client.identity_metadata public_xml_or_druid
|
137
|
+
end
|
138
|
+
raise "No identityMetadata for \"#{druid}\"" if !ng_doc || ng_doc.children.empty?
|
139
|
+
ng_doc
|
140
|
+
end
|
141
|
+
|
142
|
+
# the rightsMetadata for this DOR object, ultimately from the purl public xml
|
143
|
+
# (takes no arguments; the harvestdor client is given this resource's bare druid, e.g. ab123cd4567, or
|
144
|
+
# the Nokogiri::XML::Document containing the public_xml, if that was already retrieved)
|
145
|
+
# @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
|
146
|
+
def rights_metadata
|
147
|
+
ng_doc = benchmark "rights_metadata (#{druid})", level: :debug do
|
148
|
+
harvestdor_client.rights_metadata public_xml_or_druid
|
149
|
+
end
|
150
|
+
raise "No rightsMetadata for \"#{druid}\"" if !ng_doc || ng_doc.children.empty?
|
151
|
+
ng_doc
|
152
|
+
end
|
153
|
+
|
154
|
+
# the RDF for this DOR object, ultimately from the purl public xml
|
155
|
+
# (takes no arguments; the harvestdor client is given this resource's bare druid, e.g. ab123cd4567, or
|
156
|
+
# the Nokogiri::XML::Document containing the public_xml, if that was already retrieved)
|
157
|
+
# @return [Nokogiri::XML::Document] the RDF for the DOR object
|
158
|
+
def rdf
|
159
|
+
ng_doc = benchmark "rdf (#{druid})", level: :debug do
|
160
|
+
harvestdor_client.rdf public_xml_or_druid
|
161
|
+
end
|
162
|
+
raise "No RDF for \"#{druid}\"" if !ng_doc || ng_doc.children.empty?
|
163
|
+
ng_doc
|
164
|
+
end
|
165
|
+
|
166
|
+
def eql?(other)
|
167
|
+
other.is_a? Harvestdor::Indexer::Resource and other.indexer == indexer and other.druid == druid
|
168
|
+
end
|
169
|
+
|
170
|
+
def hash
|
171
|
+
druid.hash ^ indexer.hash
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Harvestdor
|
2
|
+
class Indexer::Solr
|
3
|
+
attr_accessor :client, :indexer, :config
|
4
|
+
|
5
|
+
def initialize indexer, config = {}
|
6
|
+
@indexer = indexer
|
7
|
+
@client = RSolr.connect(config)
|
8
|
+
@config = Confstruct::Configuration.new config
|
9
|
+
@config.max_retries ||= 10
|
10
|
+
end
|
11
|
+
|
12
|
+
def logger
|
13
|
+
indexer.logger
|
14
|
+
end
|
15
|
+
|
16
|
+
def commit!
|
17
|
+
client.commit
|
18
|
+
end
|
19
|
+
|
20
|
+
# Add the document to solr, retry if an error occurs.
|
21
|
+
# See https://github.com/ooyala/retries for docs on with_retries.
|
22
|
+
# @param [Hash] doc a Hash representation of the solr document
|
23
|
+
def add(doc)
|
24
|
+
id = doc[:id]
|
25
|
+
|
26
|
+
handler = Proc.new do |exception, attempt_number, total_delay|
|
27
|
+
logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
|
28
|
+
# logger.debug exception.backtrace
|
29
|
+
end
|
30
|
+
|
31
|
+
with_retries(:max_tries => config.max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
|
32
|
+
logger.debug "Attempt #{attempt} for #{id}"
|
33
|
+
client.add(doc)
|
34
|
+
logger.info "Successfully indexed #{id} on attempt #{attempt}"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
end
|
data/spec/config/ap.yml
CHANGED
@@ -1,28 +1,6 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
# 3. blacklist or whitelist if you are using them
|
5
|
-
# 4. Solr baseurl
|
6
|
-
|
7
|
-
# log_dir: directory for log file (default logs, relative to harvestdor gem path)
|
8
|
-
log_dir: spec/test_logs
|
9
|
-
|
10
|
-
# log_name: name of log file (default: harvestdor.log)
|
11
|
-
log_name: ap-test.log
|
12
|
-
|
13
|
-
# purl: url for the DOR purl server (used to get ContentMetadata, etc.)
|
14
|
-
purl: http://purl.stanford.edu
|
15
|
-
|
16
|
-
# ---------- White and Black list parameters -----
|
17
|
-
|
18
|
-
# name of file containing druids that will NOT be processed even if they are harvested
|
19
|
-
# via DorFetcher either give absolute path or path relative to where the command will
|
20
|
-
# be executed
|
21
|
-
#blacklist: config/ap_blacklist.txt
|
22
|
-
|
23
|
-
# name of file containing druids that WILL be processed (all others will be ignored)
|
24
|
-
# either give absolute path or path relative to where the command will be executed
|
25
|
-
#whitelist: config/ap_whitelist.txt
|
1
|
+
dor_fetcher:
|
2
|
+
service_url: http://127.0.0.1:3000
|
3
|
+
skip_heartbeat: true
|
26
4
|
|
27
5
|
# ----------- SOLR index (that we're writing INTO) parameters ------------
|
28
6
|
solr:
|
@@ -31,25 +9,35 @@ solr:
|
|
31
9
|
# timeouts are in seconds; read_timeout -> open/read, open_timeout -> connection open
|
32
10
|
read_timeout: 60
|
33
11
|
open_timeout: 60
|
12
|
+
max_retries: 10
|
34
13
|
|
35
|
-
#
|
36
|
-
# can be overridden on calls to harvest_ids and harvest_records
|
37
|
-
default_set: is_governed_by_yg867hg1375
|
38
|
-
|
39
|
-
# default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
|
40
|
-
# can be overridden on calls to harvest_ids and harvest_records
|
41
|
-
|
42
|
-
# default_from_date: default from date for harvest (default: nil)
|
43
|
-
# can be overridden on calls to harvest_ids and harvest_records
|
14
|
+
# ---------- White and Black list parameters -----
|
44
15
|
|
45
|
-
#
|
46
|
-
#
|
16
|
+
# name of file containing druids that WILL be processed (all others will be ignored)
|
17
|
+
# either give absolute path or path relative to where the command will be executed
|
18
|
+
#whitelist: config/ap_whitelist.txt
|
47
19
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
#
|
53
|
-
|
54
|
-
|
55
|
-
|
20
|
+
harvestdor:
|
21
|
+
# You will want to copy this file and change the following settings:
|
22
|
+
# 1. log_dir, log_name
|
23
|
+
# 2. default_set
|
24
|
+
# 3. blacklist or whitelist if you are using them
|
25
|
+
# 4. Solr baseurl
|
26
|
+
|
27
|
+
# log_dir: directory for log file (default logs, relative to harvestdor gem path)
|
28
|
+
log_dir: spec/test_logs
|
29
|
+
|
30
|
+
# log_name: name of log file (default: harvestdor.log)
|
31
|
+
log_name: ap-test.log
|
32
|
+
|
33
|
+
# purl: url for the DOR purl server (used to get ContentMetadata, etc.)
|
34
|
+
purl: http://purl.stanford.edu
|
35
|
+
|
36
|
+
# Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
|
37
|
+
http_options:
|
38
|
+
ssl:
|
39
|
+
verify: false
|
40
|
+
# timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
|
41
|
+
request:
|
42
|
+
timeout: 180
|
43
|
+
open_timeout: 180
|
@@ -53,4 +53,100 @@ http_interactions:
|
|
53
53
|
social customs, and people."}],"counts":{"collections":1,"items":5,"total_count":6}}'
|
54
54
|
http_version:
|
55
55
|
recorded_at: Wed, 12 Nov 2014 19:34:03 GMT
|
56
|
+
- request:
|
57
|
+
method: get
|
58
|
+
uri: http://purl.stanford.edu/yg867hg1375.xml
|
59
|
+
body:
|
60
|
+
encoding: US-ASCII
|
61
|
+
string: ''
|
62
|
+
headers:
|
63
|
+
Accept-Encoding:
|
64
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
65
|
+
Accept:
|
66
|
+
- "*/*"
|
67
|
+
User-Agent:
|
68
|
+
- Ruby
|
69
|
+
response:
|
70
|
+
status:
|
71
|
+
code: 200
|
72
|
+
message: ''
|
73
|
+
headers:
|
74
|
+
Date:
|
75
|
+
- Wed, 17 Dec 2014 19:39:37 GMT
|
76
|
+
Server:
|
77
|
+
- Apache/2.2.15 (Red Hat)
|
78
|
+
X-Powered-By:
|
79
|
+
- Phusion Passenger (mod_rails/mod_rack) 3.0.19
|
80
|
+
X-Ua-Compatible:
|
81
|
+
- IE=Edge,chrome=1
|
82
|
+
Etag:
|
83
|
+
- '"67aa6d1481ba1537ae63af5aaf493f84"'
|
84
|
+
Cache-Control:
|
85
|
+
- max-age=0, private, must-revalidate
|
86
|
+
X-Request-Id:
|
87
|
+
- f2e753d56bf896cde6e941be0f51d05a
|
88
|
+
X-Runtime:
|
89
|
+
- '0.007983'
|
90
|
+
X-Rack-Cache:
|
91
|
+
- miss
|
92
|
+
Status:
|
93
|
+
- '200'
|
94
|
+
Content-Length:
|
95
|
+
- '2180'
|
96
|
+
Content-Type:
|
97
|
+
- application/xml; charset=utf-8
|
98
|
+
body:
|
99
|
+
encoding: UTF-8
|
100
|
+
string: |
|
101
|
+
<publicObject id="druid:yg867hg1375" published="2013-11-11T15:34:32-08:00">
|
102
|
+
<identityMetadata>
|
103
|
+
<objectId>druid:yg867hg1375</objectId>
|
104
|
+
<objectCreator>DOR</objectCreator>
|
105
|
+
<objectLabel>Francis E. Stafford photographs, 1909-1933</objectLabel>
|
106
|
+
<objectType>collection</objectType>
|
107
|
+
<adminPolicy>druid:vb546ms7107</adminPolicy>
|
108
|
+
<otherId name="catkey">9615156</otherId>
|
109
|
+
<otherId name="uuid">8f1feb20-4b29-11e3-8e31-0050569b3c3c</otherId>
|
110
|
+
<tag>Remediated By : 3.25.3</tag>
|
111
|
+
</identityMetadata>
|
112
|
+
<xml/>
|
113
|
+
<rightsMetadata>
|
114
|
+
<access type="discover">
|
115
|
+
<machine>
|
116
|
+
<world/>
|
117
|
+
</machine>
|
118
|
+
</access>
|
119
|
+
<access type="read">
|
120
|
+
<machine>
|
121
|
+
<world/>
|
122
|
+
</machine>
|
123
|
+
</access>
|
124
|
+
<use>
|
125
|
+
<human type="useAndReproduction"/>
|
126
|
+
<human type="creativeCommons"/>
|
127
|
+
<machine type="creativeCommons"/>
|
128
|
+
</use>
|
129
|
+
<copyright>
|
130
|
+
<human/>
|
131
|
+
</copyright>
|
132
|
+
</rightsMetadata>
|
133
|
+
<rdf:RDF xmlns:fedora-model="info:fedora/fedora-system:def/model#" xmlns:hydra="http://projecthydra.org/ns/relations#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
134
|
+
<rdf:Description rdf:about="info:fedora/druid:yg867hg1375">
|
135
|
+
</rdf:Description>
|
136
|
+
</rdf:RDF>
|
137
|
+
<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:srw_dc="info:srw/schema/1/dc-schema" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
|
138
|
+
<dc:title>Francis E. Stafford photographs, 1909-1933</dc:title>
|
139
|
+
<dc:contributor>Stafford, Francis E., 1884-1938</dc:contributor>
|
140
|
+
<dc:type>Collection</dc:type>
|
141
|
+
<dc:date>1909-1933</dc:date>
|
142
|
+
<dc:language>und</dc:language>
|
143
|
+
<dc:format>3 oversize boxes.</dc:format>
|
144
|
+
<dc:description>Photographs of scenes in China, mainly between 1909 and 1915.</dc:description>
|
145
|
+
<dc:rights>Closed. Digital use copies available.</dc:rights>
|
146
|
+
<dc:description type="biographical/historical">American missionary in China, 1909-1915 and 1932-1933.</dc:description>
|
147
|
+
<dc:coverage>China</dc:coverage>
|
148
|
+
</oai_dc:dc>
|
149
|
+
</publicObject>
|
150
|
+
http_version:
|
151
|
+
recorded_at: Wed, 17 Dec 2014 19:39:38 GMT
|
56
152
|
recorded_with: VCR 2.9.3
|