harvestdor-indexer 1.0.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +4 -2
- data/Gemfile +1 -1
- data/README.rdoc +1 -0
- data/harvestdor-indexer.gemspec +4 -2
- data/lib/harvestdor-indexer.rb +1 -317
- data/lib/harvestdor/indexer.rb +159 -0
- data/lib/harvestdor/indexer/metrics.rb +53 -0
- data/lib/harvestdor/indexer/resource.rb +174 -0
- data/lib/harvestdor/indexer/solr.rb +39 -0
- data/lib/{harvestdor-indexer → harvestdor/indexer}/version.rb +1 -1
- data/spec/config/ap.yml +32 -44
- data/spec/fixtures/vcr_cassettes/get_collection_druids_call.yml +96 -0
- data/spec/fixtures/vcr_cassettes/process_druids_whitelist_call.yml +1494 -16
- data/spec/fixtures/vcr_cassettes/single_rsolr_connection_call.yml +80 -27
- data/spec/spec_helper.rb +1 -1
- data/spec/unit/harvestdor-indexer-resource_spec.rb +174 -0
- data/spec/unit/harvestdor-indexer-solr_spec.rb +32 -0
- data/spec/unit/harvestdor-indexer_spec.rb +47 -291
- data/spec/unit/harvestdor/indexer/metrics_spec.rb +46 -0
- metadata +45 -10
- data/config/dor-fetcher-client.yml +0 -4
- data/spec/config/ap_blacklist.txt +0 -5
@@ -0,0 +1,53 @@
|
|
1
|
+
module Harvestdor
  ##
  # Harvest metrics tracker: keeps running counts of the operations that
  # succeeded and of those that failed.
  class Indexer::Metrics
    attr_accessor :error_count, :success_count, :logger

    ##
    # Start with both counters at zero.
    #
    # @param [Hash] options
    # @option options [Logger] :logger used to report failures; a logger
    #   writing to STDERR is created when none is given
    def initialize(options = {})
      @success_count = 0 # the number of objects successfully indexed
      @error_count = 0 # the number of objects that failed
      @logger = options[:logger] || Logger.new(STDERR)
    end

    ##
    # Run the given block and count the outcome: a block that finishes
    # normally is tallied as a success; a block that raises is tallied as a
    # failure, the exception message is logged, and the exception is not
    # re-raised.
    #
    # Callers can provide an :on_error handler to receive the exception and
    # process it appropriately.
    #
    # @param [Hash] options
    # @option options [#call] :on_error callback that will receive any exception raised by the block
    def tally(options = {}, &block)
      block.call
      success!
    rescue => e
      error!
      logger.error "Failed to process: #{e.message}"
      on_error = options[:on_error]
      on_error.call e if on_error
    end

    ##
    # Record a successful run
    def success!
      @success_count = @success_count + 1
    end

    ##
    # Record an error
    def error!
      @error_count = @error_count + 1
    end

    ##
    # Total number of runs (successes plus errors)
    def total
      @success_count + @error_count
    end
  end
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'active_support/benchmarkable'
|
2
|
+
|
3
|
+
module Harvestdor
  ##
  # A single DOR object, identified by its druid, whose metadata is fetched
  # through the clients exposed by the owning indexer.
  class Indexer::Resource
    include ActiveSupport::Benchmarkable

    attr_reader :indexer, :druid, :options

    # @param [Object] indexer supplies harvestdor_client and dor_fetcher_client (and, optionally, logger)
    # @param [String] druid the object identifier, with or without the "druid:" prefix
    # @param [Hash] options
    # @option options [Logger] :logger takes precedence over the indexer's logger
    def initialize(indexer, druid, options = {})
      @indexer = indexer
      @druid = druid
      @options = options
    end

    # @return [String] the druid with every "druid:" occurrence removed, e.g. ab123cd4567
    def bare_druid
      @bare_druid ||= druid.gsub("druid:", "")
    end

    ##
    # The harvestdor client used for retrieving resources
    def harvestdor_client
      indexer.harvestdor_client
    end

    ##
    # The dor-fetcher client used for listing the members of a collection
    def dor_fetcher_client
      indexer.dor_fetcher_client
    end

    ##
    # Get the logger: the :logger option if given, otherwise the indexer's
    # logger if it has one, otherwise a logger writing to STDERR. The STDERR
    # fallback is created once and reused instead of being rebuilt per call.
    def logger
      options[:logger] ||
        (indexer.logger if indexer.respond_to? :logger) ||
        (@fallback_logger ||= Logger.new(STDERR))
    end

    ##
    # Is this resource a collection?
    # @raise [RuntimeError] if no identityMetadata can be retrieved
    def collection?
      identity_metadata.xpath("/identityMetadata/objectType").any? { |x| x.text == "collection" }
    end

    # get the collections this object belongs to, from the isMemberOfCollection
    # relationships in rels-ext from public_xml
    # @return [Array<Harvestdor::Indexer::Resource>] one resource per non-empty
    #   isMemberOfCollection relationship; an empty array if there are none
    def collections
      @collections ||= begin
        ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
        is_member_of_nodes = public_xml.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)

        is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
          Harvestdor::Indexer::Resource.new(indexer, n.value.gsub("info:fedora/", ""))
        end
      end
    end

    ##
    # Return the items in this collection, as reported by the dor-fetcher client
    # @return [Array<Harvestdor::Indexer::Resource>]
    def items
      @items ||= begin
        druids = dor_fetcher_client.druid_array(dor_fetcher_client.get_collection(bare_druid, {}))
        druids.map { |x| Harvestdor::Indexer::Resource.new(indexer, x) }
      end
    end

    # get this object's objectLabel from its identityMetadata
    # @return [String] the value of the <objectLabel> element in the identityMetadata for the object
    # @raise [RuntimeError] if no identityMetadata can be retrieved
    def identity_md_obj_label
      # identity_metadata raises when nothing is found, so no nil check is
      # needed here; calling it once also avoids a second lookup.
      identity_metadata.xpath('identityMetadata/objectLabel').text
    end

    # return the MODS for this object as a Stanford::Mods::Record object
    # @return [Stanford::Mods::Record] created from the MODS xml for the druid
    # @raise [RuntimeError] if the MODS document contains no text nodes
    def smods_rec
      @smods_rec ||= benchmark "smods_rec(#{druid})", level: :debug do
        ng_doc = mods
        raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
        mods_rec = Stanford::Mods::Record.new
        mods_rec.from_nk_node(ng_doc.root)
        mods_rec
      end
    end

    # the MODS xml for this object, as returned by the harvestdor client (memoized)
    def mods
      @mods ||= harvestdor_client.mods bare_druid
    end

    # the public xml for this DOR object, from the purl page (memoized)
    # @return [Nokogiri::XML::Document] the public xml for the DOR object
    # @raise [RuntimeError] if the client returns nothing or a document without text nodes
    def public_xml
      @public_xml ||= benchmark "public_xml(#{druid})", level: :debug do
        ng_doc = harvestdor_client.public_xml bare_druid
        raise "No public xml for #{druid}" unless ng_doc
        raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
        ng_doc
      end
    end

    ##
    # Has the public_xml been previously retrieved?
    def public_xml?
      !!@public_xml
    end

    ##
    # Get the public_xml, if retrieved, or the druid. This is used to short-circuit
    # retrieving metadata out of the public xml.
    def public_xml_or_druid
      if public_xml?
        public_xml
      else
        bare_druid
      end
    end

    # the contentMetadata for this DOR object, taken from the already-retrieved
    # public xml when available, otherwise looked up by druid
    # @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
    # @raise [RuntimeError] if nothing (or a document without children) is returned
    def content_metadata
      fetch_metadata :content_metadata, "contentMetadata"
    end

    # the identityMetadata for this DOR object, taken from the already-retrieved
    # public xml when available, otherwise looked up by druid
    # @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
    # @raise [RuntimeError] if nothing (or a document without children) is returned
    def identity_metadata
      fetch_metadata :identity_metadata, "identityMetadata"
    end

    # the rightsMetadata for this DOR object, taken from the already-retrieved
    # public xml when available, otherwise looked up by druid
    # @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
    # @raise [RuntimeError] if nothing (or a document without children) is returned
    def rights_metadata
      fetch_metadata :rights_metadata, "rightsMetadata"
    end

    # the RDF for this DOR object, taken from the already-retrieved
    # public xml when available, otherwise looked up by druid
    # @return [Nokogiri::XML::Document] the RDF for the DOR object
    # @raise [RuntimeError] if nothing (or a document without children) is returned
    def rdf
      fetch_metadata :rdf, "RDF"
    end

    # Two resources are eql? when they share the same indexer and druid.
    def eql?(other)
      other.is_a?(Harvestdor::Indexer::Resource) && other.indexer == indexer && other.druid == druid
    end

    # Hash code consistent with eql?, derived from the druid and the indexer.
    def hash
      druid.hash ^ indexer.hash
    end

    private

    # Shared implementation of the metadata accessors: calls the named
    # harvestdor client method with public_xml_or_druid inside a debug-level
    # benchmark, and raises unless a document with children comes back.
    # @param [Symbol] client_method the harvestdor client method to call; also used as the benchmark label
    # @param [String] label the datastream name used in the error message
    def fetch_metadata(client_method, label)
      ng_doc = benchmark "#{client_method} (#{druid})", level: :debug do
        harvestdor_client.public_send(client_method, public_xml_or_druid)
      end
      raise "No #{label} for \"#{druid}\"" if !ng_doc || ng_doc.children.empty?
      ng_doc
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Harvestdor
  ##
  # Solr connection for an indexer: holds the RSolr client and its
  # configuration, and retries document adds that fail.
  class Indexer::Solr
    attr_accessor :client, :indexer, :config

    # @param [Object] indexer the owning indexer; its logger is used for all messages
    # @param [Hash] config passed to RSolr.connect and also kept as a
    #   Confstruct configuration; max_retries is set to 10 when not already set
    def initialize(indexer, config = {})
      @indexer = indexer
      @client = RSolr.connect(config)
      @config = Confstruct::Configuration.new config
      @config.max_retries ||= 10
    end

    # The indexer's logger
    def logger
      indexer.logger
    end

    # Send a commit to Solr
    def commit!
      client.commit
    end

    # Add the document to solr, retry if an error occurs.
    # See https://github.com/ooyala/retries for docs on with_retries.
    # @param [Hash] doc a Hash representation of the solr document
    def add(doc)
      id = doc[:id]

      # Invoked by with_retries after each failed attempt.
      on_failure = proc do |exception, attempt_number, _total_delay|
        logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
      end

      retry_options = {
        :max_tries => config.max_retries,
        :handler => on_failure,
        :base_sleep_seconds => 1,
        :max_sleep_seconds => 5
      }

      with_retries(retry_options) do |attempt|
        logger.debug "Attempt #{attempt} for #{id}"
        client.add(doc)
        logger.info "Successfully indexed #{id} on attempt #{attempt}"
      end
    end
  end
end
|
data/spec/config/ap.yml
CHANGED
@@ -1,28 +1,6 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
# 3. blacklist or whitelist if you are using them
|
5
|
-
# 4. Solr baseurl
|
6
|
-
|
7
|
-
# log_dir: directory for log file (default logs, relative to harvestdor gem path)
|
8
|
-
log_dir: spec/test_logs
|
9
|
-
|
10
|
-
# log_name: name of log file (default: harvestdor.log)
|
11
|
-
log_name: ap-test.log
|
12
|
-
|
13
|
-
# purl: url for the DOR purl server (used to get ContentMetadata, etc.)
|
14
|
-
purl: http://purl.stanford.edu
|
15
|
-
|
16
|
-
# ---------- White and Black list parameters -----
|
17
|
-
|
18
|
-
# name of file containing druids that will NOT be processed even if they are harvested
|
19
|
-
# via DorFetcher either give absolute path or path relative to where the command will
|
20
|
-
# be executed
|
21
|
-
#blacklist: config/ap_blacklist.txt
|
22
|
-
|
23
|
-
# name of file containing druids that WILL be processed (all others will be ignored)
|
24
|
-
# either give absolute path or path relative to where the command will be executed
|
25
|
-
#whitelist: config/ap_whitelist.txt
|
1
|
+
dor_fetcher:
|
2
|
+
service_url: http://127.0.0.1:3000
|
3
|
+
skip_heartbeat: true
|
26
4
|
|
27
5
|
# ----------- SOLR index (that we're writing INTO) parameters ------------
|
28
6
|
solr:
|
@@ -31,25 +9,35 @@ solr:
|
|
31
9
|
# timeouts are in seconds; read_timeout -> open/read, open_timeout -> connection open
|
32
10
|
read_timeout: 60
|
33
11
|
open_timeout: 60
|
12
|
+
max_retries: 10
|
34
13
|
|
35
|
-
#
|
36
|
-
# can be overridden on calls to harvest_ids and harvest_records
|
37
|
-
default_set: is_governed_by_yg867hg1375
|
38
|
-
|
39
|
-
# default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
|
40
|
-
# can be overridden on calls to harvest_ids and harvest_records
|
41
|
-
|
42
|
-
# default_from_date: default from date for harvest (default: nil)
|
43
|
-
# can be overridden on calls to harvest_ids and harvest_records
|
14
|
+
# ---------- White and Black list parameters -----
|
44
15
|
|
45
|
-
#
|
46
|
-
#
|
16
|
+
# name of file containing druids that WILL be processed (all others will be ignored)
|
17
|
+
# either give absolute path or path relative to where the command will be executed
|
18
|
+
#whitelist: config/ap_whitelist.txt
|
47
19
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
#
|
53
|
-
|
54
|
-
|
55
|
-
|
20
|
+
harvestdor:
|
21
|
+
# You will want to copy this file and change the following settings:
|
22
|
+
# 1. log_dir, log_name
|
23
|
+
# 2. default_set
|
24
|
+
# 3. blacklist or whitelist if you are using them
|
25
|
+
# 4. Solr baseurl
|
26
|
+
|
27
|
+
# log_dir: directory for log file (default logs, relative to harvestdor gem path)
|
28
|
+
log_dir: spec/test_logs
|
29
|
+
|
30
|
+
# log_name: name of log file (default: harvestdor.log)
|
31
|
+
log_name: ap-test.log
|
32
|
+
|
33
|
+
# purl: url for the DOR purl server (used to get ContentMetadata, etc.)
|
34
|
+
purl: http://purl.stanford.edu
|
35
|
+
|
36
|
+
# Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
|
37
|
+
http_options:
|
38
|
+
ssl:
|
39
|
+
verify: false
|
40
|
+
# timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
|
41
|
+
request:
|
42
|
+
timeout: 180
|
43
|
+
open_timeout: 180
|
@@ -53,4 +53,100 @@ http_interactions:
|
|
53
53
|
social customs, and people."}],"counts":{"collections":1,"items":5,"total_count":6}}'
|
54
54
|
http_version:
|
55
55
|
recorded_at: Wed, 12 Nov 2014 19:34:03 GMT
|
56
|
+
- request:
|
57
|
+
method: get
|
58
|
+
uri: http://purl.stanford.edu/yg867hg1375.xml
|
59
|
+
body:
|
60
|
+
encoding: US-ASCII
|
61
|
+
string: ''
|
62
|
+
headers:
|
63
|
+
Accept-Encoding:
|
64
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
65
|
+
Accept:
|
66
|
+
- "*/*"
|
67
|
+
User-Agent:
|
68
|
+
- Ruby
|
69
|
+
response:
|
70
|
+
status:
|
71
|
+
code: 200
|
72
|
+
message: ''
|
73
|
+
headers:
|
74
|
+
Date:
|
75
|
+
- Wed, 17 Dec 2014 19:39:37 GMT
|
76
|
+
Server:
|
77
|
+
- Apache/2.2.15 (Red Hat)
|
78
|
+
X-Powered-By:
|
79
|
+
- Phusion Passenger (mod_rails/mod_rack) 3.0.19
|
80
|
+
X-Ua-Compatible:
|
81
|
+
- IE=Edge,chrome=1
|
82
|
+
Etag:
|
83
|
+
- '"67aa6d1481ba1537ae63af5aaf493f84"'
|
84
|
+
Cache-Control:
|
85
|
+
- max-age=0, private, must-revalidate
|
86
|
+
X-Request-Id:
|
87
|
+
- f2e753d56bf896cde6e941be0f51d05a
|
88
|
+
X-Runtime:
|
89
|
+
- '0.007983'
|
90
|
+
X-Rack-Cache:
|
91
|
+
- miss
|
92
|
+
Status:
|
93
|
+
- '200'
|
94
|
+
Content-Length:
|
95
|
+
- '2180'
|
96
|
+
Content-Type:
|
97
|
+
- application/xml; charset=utf-8
|
98
|
+
body:
|
99
|
+
encoding: UTF-8
|
100
|
+
string: |
|
101
|
+
<publicObject id="druid:yg867hg1375" published="2013-11-11T15:34:32-08:00">
|
102
|
+
<identityMetadata>
|
103
|
+
<objectId>druid:yg867hg1375</objectId>
|
104
|
+
<objectCreator>DOR</objectCreator>
|
105
|
+
<objectLabel>Francis E. Stafford photographs, 1909-1933</objectLabel>
|
106
|
+
<objectType>collection</objectType>
|
107
|
+
<adminPolicy>druid:vb546ms7107</adminPolicy>
|
108
|
+
<otherId name="catkey">9615156</otherId>
|
109
|
+
<otherId name="uuid">8f1feb20-4b29-11e3-8e31-0050569b3c3c</otherId>
|
110
|
+
<tag>Remediated By : 3.25.3</tag>
|
111
|
+
</identityMetadata>
|
112
|
+
<xml/>
|
113
|
+
<rightsMetadata>
|
114
|
+
<access type="discover">
|
115
|
+
<machine>
|
116
|
+
<world/>
|
117
|
+
</machine>
|
118
|
+
</access>
|
119
|
+
<access type="read">
|
120
|
+
<machine>
|
121
|
+
<world/>
|
122
|
+
</machine>
|
123
|
+
</access>
|
124
|
+
<use>
|
125
|
+
<human type="useAndReproduction"/>
|
126
|
+
<human type="creativeCommons"/>
|
127
|
+
<machine type="creativeCommons"/>
|
128
|
+
</use>
|
129
|
+
<copyright>
|
130
|
+
<human/>
|
131
|
+
</copyright>
|
132
|
+
</rightsMetadata>
|
133
|
+
<rdf:RDF xmlns:fedora-model="info:fedora/fedora-system:def/model#" xmlns:hydra="http://projecthydra.org/ns/relations#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
134
|
+
<rdf:Description rdf:about="info:fedora/druid:yg867hg1375">
|
135
|
+
</rdf:Description>
|
136
|
+
</rdf:RDF>
|
137
|
+
<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:srw_dc="info:srw/schema/1/dc-schema" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
|
138
|
+
<dc:title>Francis E. Stafford photographs, 1909-1933</dc:title>
|
139
|
+
<dc:contributor>Stafford, Francis E., 1884-1938</dc:contributor>
|
140
|
+
<dc:type>Collection</dc:type>
|
141
|
+
<dc:date>1909-1933</dc:date>
|
142
|
+
<dc:language>und</dc:language>
|
143
|
+
<dc:format>3 oversize boxes.</dc:format>
|
144
|
+
<dc:description>Photographs of scenes in China, mainly between 1909 and 1915.</dc:description>
|
145
|
+
<dc:rights>Closed. Digital use copies available.</dc:rights>
|
146
|
+
<dc:description type="biographical/historical">American missionary in China, 1909-1915 and 1932-1933.</dc:description>
|
147
|
+
<dc:coverage>China</dc:coverage>
|
148
|
+
</oai_dc:dc>
|
149
|
+
</publicObject>
|
150
|
+
http_version:
|
151
|
+
recorded_at: Wed, 17 Dec 2014 19:39:38 GMT
|
56
152
|
recorded_with: VCR 2.9.3
|