harvestdor-indexer 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,53 @@
1
require 'logger'

module Harvestdor
  class Indexer
    ##
    # Harvest metrics tracker: keeps a running count of operations that
    # succeeded and operations that failed.
    class Metrics
      attr_accessor :error_count, :success_count, :logger

      # @param [Hash] options
      # @option options [Logger] :logger where failures are reported; when
      #   absent a Logger writing to STDERR is created
      def initialize(options = {})
        @success_count = 0 # the number of objects successfully indexed
        @error_count = 0   # the number of objects that failed
        @logger = options[:logger] || Logger.new(STDERR)
      end

      ##
      # Wrap an operation in a tally block; if the block completes without
      # raising, tally a success. If the block raises a StandardError, rescue
      # it, tally a failure and log the exception message.
      #
      # Callers can provide an :on_error handler to receive the exception and
      # process it appropriately. An exception raised by the handler itself is
      # not rescued here.
      #
      # @param [Hash] options
      # @option options [#call] :on_error callback that will receive any
      #   exception raised by the block
      def tally(options = {}, &block)
        block.call
        success!
      rescue => e
        error!
        logger.error "Failed to process: #{e.message}"
        options[:on_error].call e if options[:on_error]
      end

      ##
      # Record a successful run
      def success!
        @success_count += 1
      end

      ##
      # Record an error
      def error!
        @error_count += 1
      end

      ##
      # Total number of runs (successes plus errors)
      def total
        @success_count + @error_count
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
+ require 'active_support/benchmarkable'
2
+
3
module Harvestdor
  ##
  # A DOR object addressed by druid. All metadata is retrieved through the
  # clients exposed by the owning indexer.
  class Indexer::Resource
    include ActiveSupport::Benchmarkable

    attr_reader :indexer, :druid, :options

    # @param [Object] indexer supplies harvestdor_client and dor_fetcher_client
    #   (and a logger, when it responds to :logger)
    # @param [String] druid the object identifier; a "druid:" prefix is allowed
    # @param [Hash] options
    # @option options [Logger] :logger used in preference to the indexer's logger
    def initialize(indexer, druid, options = {})
      @indexer = indexer
      @druid = druid
      @options = options
    end

    # @return [String] the druid with every "druid:" occurrence removed
    def bare_druid
      @bare_druid ||= druid.gsub("druid:", "")
    end

    ##
    # The harvestdor client used for retrieving resources
    def harvestdor_client
      indexer.harvestdor_client
    end

    ##
    # The dor-fetcher client, as provided by the indexer
    def dor_fetcher_client
      indexer.dor_fetcher_client
    end

    ##
    # Get the logger: options[:logger], else the indexer's logger, else a
    # STDERR logger. The fallback is created once and reused, because
    # benchmark asks for the logger on every call.
    def logger
      options[:logger] ||
        (indexer.logger if indexer.respond_to? :logger) ||
        (@fallback_logger ||= Logger.new(STDERR))
    end

    ##
    # Is this resource a collection?
    # @return [Boolean] true when identityMetadata has an objectType of "collection"
    def collection?
      identity_metadata.xpath("/identityMetadata/objectType").any? { |x| x.text == "collection" }
    end

    # get the collections named by isMemberOfCollection relationships in rels-ext from public_xml
    # @return [Array<Harvestdor::Indexer::Resource>] one resource per non-empty
    #   isMemberOfCollection relationship; an empty Array when there are none
    def collections
      @collections ||= begin
        ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
        is_member_of_nodes = public_xml.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)

        is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
          Harvestdor::Indexer::Resource.new(indexer, n.value.gsub("info:fedora/", ""))
        end
      end
    end

    ##
    # Return the items in this collection
    # @return [Array<Harvestdor::Indexer::Resource>] one resource per druid
    #   reported by the dor-fetcher client for this collection
    def items
      @items ||= begin
        druids = dor_fetcher_client.druid_array(dor_fetcher_client.get_collection(bare_druid, {}))
        druids.map { |x| Harvestdor::Indexer::Resource.new(indexer, x) }
      end
    end

    # get this object's objectLabel from its identityMetadata
    # @return [String] the value of the <objectLabel> element in the identityMetadata for the object
    # @raise [RuntimeError] from identity_metadata when no identityMetadata is available
    def identity_md_obj_label
      # identity_metadata raises when the datastream is missing, so it is
      # fetched exactly once and needs no separate nil check.
      identity_metadata.xpath('identityMetadata/objectLabel').text
    end

    # return the MODS for this object as a Stanford::Mods::Record object
    # @return [Stanford::Mods::Record] created from the MODS xml for the druid
    # @raise [RuntimeError] when the MODS document has no root or no text nodes
    def smods_rec
      @smods_rec ||= benchmark "smods_rec(#{druid})", level: :debug do
        ng_doc = mods
        # a document without a root element is as empty as one without text
        raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.nil? || ng_doc.root.xpath('//text()').empty?
        mods_rec = Stanford::Mods::Record.new
        mods_rec.from_nk_node(ng_doc.root)
        mods_rec
      end
    end

    # the MODS document for this object, as returned by the harvestdor client
    def mods
      @mods ||= harvestdor_client.mods bare_druid
    end

    # the public xml for this DOR object, from the purl page
    # @return [Nokogiri::XML::Document] the public xml for the DOR object
    # @raise [RuntimeError] when no document is returned, or it has no root or no text nodes
    def public_xml
      @public_xml ||= benchmark "public_xml(#{druid})", level: :debug do
        ng_doc = harvestdor_client.public_xml bare_druid
        raise "No public xml for #{druid}" if !ng_doc
        raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.nil? || ng_doc.root.xpath('//text()').empty?
        ng_doc
      end
    end

    ##
    # Has the public_xml been previously retrieved?
    def public_xml?
      !!@public_xml
    end

    ##
    # Get the public_xml, if retrieved, or the druid. This is used to short-circuit
    # retrieving metadata out of the public xml.
    def public_xml_or_druid
      if public_xml?
        public_xml
      else
        bare_druid
      end
    end

    # the contentMetadata for this DOR object, ultimately from the purl public xml
    # @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
    # @raise [RuntimeError] when the result is missing or has no children
    def content_metadata
      fetch_datastream :content_metadata, "contentMetadata"
    end

    # the identityMetadata for this DOR object, ultimately from the purl public xml
    # @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
    # @raise [RuntimeError] when the result is missing or has no children
    def identity_metadata
      fetch_datastream :identity_metadata, "identityMetadata"
    end

    # the rightsMetadata for this DOR object, ultimately from the purl public xml
    # @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
    # @raise [RuntimeError] when the result is missing or has no children
    def rights_metadata
      fetch_datastream :rights_metadata, "rightsMetadata"
    end

    # the RDF for this DOR object, ultimately from the purl public xml
    # @return [Nokogiri::XML::Document] the RDF for the DOR object
    # @raise [RuntimeError] when the result is missing or has no children
    def rdf
      fetch_datastream :rdf, "RDF"
    end

    # Two resources are equal when they share the same indexer and druid.
    def eql?(other)
      other.is_a?(Harvestdor::Indexer::Resource) && other.indexer == indexer && other.druid == druid
    end
    # keep == in step with eql?/hash so comparison and hashing agree
    alias_method :==, :eql?

    def hash
      druid.hash ^ indexer.hash
    end

    private

    # Fetch one datastream through the harvestdor client, timing the call and
    # rejecting a missing or childless result.
    # @param [Symbol] client_method the harvestdor client method to invoke
    # @param [String] label the datastream name used in the error message
    def fetch_datastream(client_method, label)
      ng_doc = benchmark "#{client_method} (#{druid})", level: :debug do
        harvestdor_client.public_send(client_method, public_xml_or_druid)
      end
      raise "No #{label} for \"#{druid}\"" if !ng_doc || ng_doc.children.empty?
      ng_doc
    end
  end
end
@@ -0,0 +1,39 @@
1
module Harvestdor
  ##
  # Wraps the RSolr client used to write documents into the Solr index.
  class Indexer::Solr
    attr_accessor :client, :indexer, :config

    # @param [Object] indexer the owning indexer; its logger is used for output
    # @param [Hash] config passed to RSolr.connect and kept as a Confstruct
    #   configuration; max_retries is set to 10 when not already set
    def initialize(indexer, config = {})
      @indexer = indexer
      @client = RSolr.connect(config)
      @config = Confstruct::Configuration.new(config)
      @config.max_retries ||= 10
    end

    # The indexer's logger
    def logger
      indexer.logger
    end

    # Ask the Solr client to commit
    def commit!
      client.commit
    end

    # Add the document to solr, retry if an error occurs.
    # See https://github.com/ooyala/retries for docs on with_retries.
    # @param [Hash] doc a Hash representation of the solr document
    def add(doc)
      doc_id = doc[:id]

      # invoked by with_retries with the exception, the attempt number and the
      # accumulated delay; only the first two are reported
      on_failure = proc do |exception, attempt_number, _total_delay|
        logger.debug "#{exception.class} on attempt #{attempt_number} for #{doc_id}"
      end

      retry_options = {
        :max_tries => config.max_retries,
        :handler => on_failure,
        :base_sleep_seconds => 1,
        :max_sleep_seconds => 5
      }

      with_retries(retry_options) do |attempt|
        logger.debug "Attempt #{attempt} for #{doc_id}"
        client.add(doc)
        logger.info "Successfully indexed #{doc_id} on attempt #{attempt}"
      end
    end
  end
end
@@ -1,6 +1,6 @@
1
1
  module Harvestdor
2
2
  class Indexer
3
3
  # this is the Ruby Gem version
4
- VERSION = "1.0.4"
4
+ VERSION = "2.0.0"
5
5
  end
6
6
  end
@@ -1,28 +1,6 @@
1
- # You will want to copy this file and change the following settings:
2
- # 1. log_dir, log_name
3
- # 2. default_set
4
- # 3. blacklist or whitelist if you are using them
5
- # 4. Solr baseurl
6
-
7
- # log_dir: directory for log file (default logs, relative to harvestdor gem path)
8
- log_dir: spec/test_logs
9
-
10
- # log_name: name of log file (default: harvestdor.log)
11
- log_name: ap-test.log
12
-
13
- # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
14
- purl: http://purl.stanford.edu
15
-
16
- # ---------- White and Black list parameters -----
17
-
18
- # name of file containing druids that will NOT be processed even if they are harvested
19
- # via DorFetcher either give absolute path or path relative to where the command will
20
- # be executed
21
- #blacklist: config/ap_blacklist.txt
22
-
23
- # name of file containing druids that WILL be processed (all others will be ignored)
24
- # either give absolute path or path relative to where the command will be executed
25
- #whitelist: config/ap_whitelist.txt
1
+ dor_fetcher:
2
+ service_url: http://127.0.0.1:3000
3
+ skip_heartbeat: true
26
4
 
27
5
  # ----------- SOLR index (that we're writing INTO) parameters ------------
28
6
  solr:
@@ -31,25 +9,35 @@ solr:
31
9
  # timeouts are in seconds; read_timeout -> open/read, open_timeout -> connection open
32
10
  read_timeout: 60
33
11
  open_timeout: 60
12
+ max_retries: 10
34
13
 
35
- # default_set: default set for harvest (default: nil)
36
- # can be overridden on calls to harvest_ids and harvest_records
37
- default_set: is_governed_by_yg867hg1375
38
-
39
- # default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
40
- # can be overridden on calls to harvest_ids and harvest_records
41
-
42
- # default_from_date: default from date for harvest (default: nil)
43
- # can be overridden on calls to harvest_ids and harvest_records
14
+ # ---------- White and Black list parameters -----
44
15
 
45
- # default_until_date: default until date for harvest (default: nil)
46
- # can be overridden on calls to harvest_ids and harvest_records
16
+ # name of file containing druids that WILL be processed (all others will be ignored)
17
+ # either give absolute path or path relative to where the command will be executed
18
+ #whitelist: config/ap_whitelist.txt
47
19
 
48
- # Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
49
- http_options:
50
- ssl:
51
- verify: false
52
- # timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
53
- request:
54
- timeout: 180
55
- open_timeout: 180
20
+ harvestdor:
21
+ # You will want to copy this file and change the following settings:
22
+ # 1. log_dir, log_name
23
+ # 2. default_set
24
+ # 3. blacklist or whitelist if you are using them
25
+ # 4. Solr baseurl
26
+
27
+ # log_dir: directory for log file (default logs, relative to harvestdor gem path)
28
+ log_dir: spec/test_logs
29
+
30
+ # log_name: name of log file (default: harvestdor.log)
31
+ log_name: ap-test.log
32
+
33
+ # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
34
+ purl: http://purl.stanford.edu
35
+
36
+ # Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
37
+ http_options:
38
+ ssl:
39
+ verify: false
40
+ # timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
41
+ request:
42
+ timeout: 180
43
+ open_timeout: 180
@@ -53,4 +53,100 @@ http_interactions:
53
53
  social customs, and people."}],"counts":{"collections":1,"items":5,"total_count":6}}'
54
54
  http_version:
55
55
  recorded_at: Wed, 12 Nov 2014 19:34:03 GMT
56
+ - request:
57
+ method: get
58
+ uri: http://purl.stanford.edu/yg867hg1375.xml
59
+ body:
60
+ encoding: US-ASCII
61
+ string: ''
62
+ headers:
63
+ Accept-Encoding:
64
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
65
+ Accept:
66
+ - "*/*"
67
+ User-Agent:
68
+ - Ruby
69
+ response:
70
+ status:
71
+ code: 200
72
+ message: ''
73
+ headers:
74
+ Date:
75
+ - Wed, 17 Dec 2014 19:39:37 GMT
76
+ Server:
77
+ - Apache/2.2.15 (Red Hat)
78
+ X-Powered-By:
79
+ - Phusion Passenger (mod_rails/mod_rack) 3.0.19
80
+ X-Ua-Compatible:
81
+ - IE=Edge,chrome=1
82
+ Etag:
83
+ - '"67aa6d1481ba1537ae63af5aaf493f84"'
84
+ Cache-Control:
85
+ - max-age=0, private, must-revalidate
86
+ X-Request-Id:
87
+ - f2e753d56bf896cde6e941be0f51d05a
88
+ X-Runtime:
89
+ - '0.007983'
90
+ X-Rack-Cache:
91
+ - miss
92
+ Status:
93
+ - '200'
94
+ Content-Length:
95
+ - '2180'
96
+ Content-Type:
97
+ - application/xml; charset=utf-8
98
+ body:
99
+ encoding: UTF-8
100
+ string: |
101
+ <publicObject id="druid:yg867hg1375" published="2013-11-11T15:34:32-08:00">
102
+ <identityMetadata>
103
+ <objectId>druid:yg867hg1375</objectId>
104
+ <objectCreator>DOR</objectCreator>
105
+ <objectLabel>Francis E. Stafford photographs, 1909-1933</objectLabel>
106
+ <objectType>collection</objectType>
107
+ <adminPolicy>druid:vb546ms7107</adminPolicy>
108
+ <otherId name="catkey">9615156</otherId>
109
+ <otherId name="uuid">8f1feb20-4b29-11e3-8e31-0050569b3c3c</otherId>
110
+ <tag>Remediated By : 3.25.3</tag>
111
+ </identityMetadata>
112
+ <xml/>
113
+ <rightsMetadata>
114
+ <access type="discover">
115
+ <machine>
116
+ <world/>
117
+ </machine>
118
+ </access>
119
+ <access type="read">
120
+ <machine>
121
+ <world/>
122
+ </machine>
123
+ </access>
124
+ <use>
125
+ <human type="useAndReproduction"/>
126
+ <human type="creativeCommons"/>
127
+ <machine type="creativeCommons"/>
128
+ </use>
129
+ <copyright>
130
+ <human/>
131
+ </copyright>
132
+ </rightsMetadata>
133
+ <rdf:RDF xmlns:fedora-model="info:fedora/fedora-system:def/model#" xmlns:hydra="http://projecthydra.org/ns/relations#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
134
+ <rdf:Description rdf:about="info:fedora/druid:yg867hg1375">
135
+ </rdf:Description>
136
+ </rdf:RDF>
137
+ <oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:srw_dc="info:srw/schema/1/dc-schema" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
138
+ <dc:title>Francis E. Stafford photographs, 1909-1933</dc:title>
139
+ <dc:contributor>Stafford, Francis E., 1884-1938</dc:contributor>
140
+ <dc:type>Collection</dc:type>
141
+ <dc:date>1909-1933</dc:date>
142
+ <dc:language>und</dc:language>
143
+ <dc:format>3 oversize boxes.</dc:format>
144
+ <dc:description>Photographs of scenes in China, mainly between 1909 and 1915.</dc:description>
145
+ <dc:rights>Closed. Digital use copies available.</dc:rights>
146
+ <dc:description type="biographical/historical">American missionary in China, 1909-1915 and 1932-1933.</dc:description>
147
+ <dc:coverage>China</dc:coverage>
148
+ </oai_dc:dc>
149
+ </publicObject>
150
+ http_version:
151
+ recorded_at: Wed, 17 Dec 2014 19:39:38 GMT
56
152
  recorded_with: VCR 2.9.3