cul-fedora 0.8.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/cul-fedora.gemspec +5 -13
- data/lib/cul-fedora.rb +5 -1
- data/lib/cul-fedora/item.rb +27 -16
- data/lib/cul-fedora/server.rb +10 -4
- data/lib/cul-fedora/solr.rb +64 -51
- metadata +9 -15
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.6
|
1
|
+
1.0.0
|
data/cul-fedora.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{cul-fedora}
|
8
|
-
s.version = "0.8.6"
|
8
|
+
s.version = "1.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["James Stuart"]
|
12
|
-
s.date = %q{2011-
|
11
|
+
s.authors = [%q{James Stuart}]
|
12
|
+
s.date = %q{2011-09-19}
|
13
13
|
s.description = %q{Columbia-specific Fedora libraries}
|
14
14
|
s.email = %q{tastyhat@jamesstuart.org}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -62,19 +62,11 @@ Gem::Specification.new do |s|
|
|
62
62
|
"test_fedora_item.rb"
|
63
63
|
]
|
64
64
|
s.homepage = %q{http://github.com/tastyhat/cul-fedora}
|
65
|
-
s.require_paths = ["lib"]
|
66
|
-
s.rubygems_version = %q{1.
|
65
|
+
s.require_paths = [%q{lib}]
|
66
|
+
s.rubygems_version = %q{1.8.6}
|
67
67
|
s.summary = %q{Columbia University Fedora Hooks}
|
68
|
-
s.test_files = [
|
69
|
-
"test/helper.rb",
|
70
|
-
"test/test_cul-fedora.rb",
|
71
|
-
"test/test_fedora_item.rb",
|
72
|
-
"test/test_fedora_server.rb",
|
73
|
-
"test/test_fedora_solr.rb"
|
74
|
-
]
|
75
68
|
|
76
69
|
if s.respond_to? :specification_version then
|
77
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
78
70
|
s.specification_version = 3
|
79
71
|
|
80
72
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
data/lib/cul-fedora.rb
CHANGED
data/lib/cul-fedora/item.rb
CHANGED
@@ -1,4 +1,9 @@
|
|
1
1
|
require "open3"
|
2
|
+
begin
|
3
|
+
require "active_support/core_ext/array/extract_options"
|
4
|
+
rescue
|
5
|
+
require "activesupport"
|
6
|
+
end
|
2
7
|
|
3
8
|
module Cul
|
4
9
|
module Fedora
|
@@ -40,6 +45,7 @@ module Cul
|
|
40
45
|
request
|
41
46
|
return true
|
42
47
|
rescue Exception => e # we should really do some better checking of error type etc here
|
48
|
+
logger.error e.message
|
43
49
|
return false
|
44
50
|
end
|
45
51
|
end
|
@@ -82,7 +88,8 @@ module Cul
|
|
82
88
|
i = i + MAX_LIST_MEMBERS_PER_REQUEST
|
83
89
|
end
|
84
90
|
return items
|
85
|
-
rescue
|
91
|
+
rescue Exception => e
|
92
|
+
logger.error e.message
|
86
93
|
[]
|
87
94
|
end
|
88
95
|
end
|
@@ -90,8 +97,9 @@ module Cul
|
|
90
97
|
def getSize()
|
91
98
|
begin
|
92
99
|
request(:method => "/objects", :sdef => "methods/ldpd:sdef.Aggregator", :request => "getSize").to_i
|
93
|
-
rescue
|
94
|
-
|
100
|
+
rescue Exception => e
|
101
|
+
logger.error e.message
|
102
|
+
return -1
|
95
103
|
end
|
96
104
|
end
|
97
105
|
|
@@ -103,6 +111,7 @@ module Cul
|
|
103
111
|
@server.item(metadata.attributes["uri"].value)
|
104
112
|
end
|
105
113
|
rescue Exception => e
|
114
|
+
logger.error e.message
|
106
115
|
[]
|
107
116
|
end
|
108
117
|
end
|
@@ -113,7 +122,8 @@ module Cul
|
|
113
122
|
result.xpath("/rdf:RDF/rdf:Description/*[local-name()='memberOf']").collect do |member|
|
114
123
|
@server.item(member.attributes["resource"].value)
|
115
124
|
end
|
116
|
-
rescue
|
125
|
+
rescue Exception => e
|
126
|
+
logger.error e.message
|
117
127
|
[]
|
118
128
|
end
|
119
129
|
end
|
@@ -136,6 +146,7 @@ module Cul
|
|
136
146
|
author_roles = ["author","creator","editor","speaker","moderator","interviewee","interviewer","contributor"]
|
137
147
|
other_name_roles = ["thesis advisor"]
|
138
148
|
corporate_author_roles = ["author"]
|
149
|
+
corporate_department_roles = ["originator"]
|
139
150
|
|
140
151
|
organizations = []
|
141
152
|
departments = []
|
@@ -182,7 +193,7 @@ module Cul
|
|
182
193
|
note_org = true
|
183
194
|
all_author_names << fullname
|
184
195
|
if(!name_node["ID"].nil?)
|
185
|
-
add_field.call("
|
196
|
+
add_field.call("author_uni", name_node["ID"])
|
186
197
|
end
|
187
198
|
add_field.call("author_search", fullname.downcase)
|
188
199
|
add_field.call("author_facet", fullname)
|
@@ -209,7 +220,7 @@ module Cul
|
|
209
220
|
end
|
210
221
|
|
211
222
|
mods.css("name[@type='corporate']").each do |corp_name_node|
|
212
|
-
if(!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator"))
|
223
|
+
if((!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator")) || corp_name_node.css("role>roleTerm").collect(&:content).any? { |role| corporate_department_roles.include?(role) })
|
213
224
|
name_part = corp_name_node.at_css("namePart").text
|
214
225
|
if(name_part.include?(". "))
|
215
226
|
name_part_split = name_part.split(". ")
|
@@ -230,13 +241,12 @@ module Cul
|
|
230
241
|
end
|
231
242
|
end
|
232
243
|
|
233
|
-
add_field.call("
|
234
|
-
add_field.call("
|
244
|
+
add_field.call("author_display",all_author_names.join("; "))
|
245
|
+
add_field.call("pub_date_facet", mods.at_css("*[@keyDate='yes']"))
|
235
246
|
|
236
247
|
mods.css("genre").each do |genre_node|
|
237
248
|
add_field.call("genre_facet", genre_node)
|
238
249
|
add_field.call("genre_search", genre_node)
|
239
|
-
|
240
250
|
end
|
241
251
|
|
242
252
|
|
@@ -247,14 +257,14 @@ module Cul
|
|
247
257
|
if(subject_node.attributes.count == 0)
|
248
258
|
subject_node.css("topic").each do |topic_node|
|
249
259
|
add_field.call("keyword_search", topic_node.content.downcase)
|
250
|
-
add_field.call("
|
260
|
+
add_field.call("subject_facet", topic_node)
|
251
261
|
add_field.call("subject_search", topic_node)
|
252
262
|
end
|
253
263
|
end
|
254
264
|
end
|
255
265
|
|
256
266
|
|
257
|
-
add_field.call("
|
267
|
+
add_field.call("table_of_contents", mods.at_css("tableOfContents"))
|
258
268
|
|
259
269
|
mods.css("note").each { |note| add_field.call("notes", note) }
|
260
270
|
|
@@ -277,7 +287,7 @@ module Cul
|
|
277
287
|
|
278
288
|
if(related_series = mods.at_css("relatedItem[@type='series']"))
|
279
289
|
if(related_series.has_attribute?("ID"))
|
280
|
-
add_field.call("
|
290
|
+
add_field.call("series_facet", related_series.at_css("titleInfo>title"))
|
281
291
|
end
|
282
292
|
end
|
283
293
|
|
@@ -290,23 +300,24 @@ module Cul
|
|
290
300
|
|
291
301
|
mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
|
292
302
|
mods.css("subject>geographic").each do |geo|
|
293
|
-
add_field.call("
|
303
|
+
add_field.call("geographic_area_display", geo)
|
294
304
|
add_field.call("geographic_area_search", geo)
|
295
305
|
end
|
296
306
|
|
297
|
-
|
307
|
+
# This is just a placeholder, reminding us that we need to implement citations in some way
|
308
|
+
# add_field.call("export_as_mla_citation_txt","")
|
298
309
|
|
299
310
|
if(organizations.count > 0)
|
300
311
|
organizations = organizations.uniq
|
301
312
|
organizations.each do |organization|
|
302
|
-
add_field.call("
|
313
|
+
add_field.call("organization_facet", organization)
|
303
314
|
end
|
304
315
|
end
|
305
316
|
|
306
317
|
if(departments.count > 0)
|
307
318
|
departments = departments.uniq
|
308
319
|
departments.each do |department|
|
309
|
-
add_field.call("
|
320
|
+
add_field.call("department_facet", department.to_s.sub(", Department of", "").strip)
|
310
321
|
end
|
311
322
|
end
|
312
323
|
|
data/lib/cul-fedora/server.rb
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
begin
|
2
|
+
require "active_support/core_ext/array/extract_options"
|
3
|
+
rescue
|
4
|
+
require "activesupport"
|
5
|
+
end
|
6
|
+
|
1
7
|
module Cul
|
2
8
|
module Fedora
|
3
9
|
class Server
|
@@ -6,10 +12,10 @@ module Cul
|
|
6
12
|
|
7
13
|
def initialize(*args)
|
8
14
|
options = args.extract_options!
|
9
|
-
@riurl = options[:riurl] || raise(ArgumentError, "Must provide riurl argument")
|
10
|
-
@riquery = options[:riquery] || raise(ArgumentError, "Must provide riquery argument")
|
11
|
-
@hc = options[:http_client]
|
12
|
-
@logger = options[:logger]
|
15
|
+
@riurl = options[:riurl] || options["riurl"] || raise(ArgumentError, "Must provide riurl argument")
|
16
|
+
@riquery = options[:riquery] || options["riquery"] || raise(ArgumentError, "Must provide riquery argument")
|
17
|
+
@hc = options[:http_client] || options["http_client"]
|
18
|
+
@logger = options[:logger] || options["logger"]
|
13
19
|
end
|
14
20
|
|
15
21
|
def logger
|
data/lib/cul-fedora/solr.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
+
begin
|
2
|
+
require "active_support/core_ext/array/extract_options"
|
3
|
+
rescue
|
4
|
+
require "activesupport"
|
5
|
+
end
|
6
|
+
|
1
7
|
module Cul
|
2
8
|
module Fedora
|
3
9
|
class Solr
|
4
10
|
|
5
11
|
attr_reader :url
|
6
12
|
|
7
|
-
def initialize(
|
8
|
-
@url =
|
9
|
-
@logger =
|
13
|
+
def initialize(options = {})
|
14
|
+
@url = options[:url] || options["url"] || raise(ArgumentError, "must provide url")
|
15
|
+
@logger = options[:logger] || options["logger"]
|
10
16
|
end
|
11
17
|
|
12
18
|
def logger
|
@@ -22,31 +28,53 @@ module Cul
|
|
22
28
|
end
|
23
29
|
|
24
30
|
def delete_index
|
31
|
+
logger.info "Deleting Solr index..."
|
25
32
|
rsolr.delete_by_query("*:*")
|
26
33
|
rsolr.commit
|
27
34
|
end
|
28
35
|
|
29
|
-
def delete_removed(fedora_server)
|
36
|
+
def delete_removed(fedora_server, fedora_item_pids = nil)
|
37
|
+
|
38
|
+
removed = identify_removed(fedora_server)
|
39
|
+
logger.info "Deleting items removed from Fedora..."
|
40
|
+
removed.each do |id|
|
41
|
+
logger.info "Deleting " + id + "..."
|
42
|
+
rsolr.delete_by_query("id:" + id.to_s.gsub(/:/,'\\:'))
|
43
|
+
end
|
44
|
+
|
45
|
+
rsolr.commit
|
30
46
|
|
47
|
+
end
|
48
|
+
|
49
|
+
def identify_removed(fedora_server, fedora_item_pids = nil)
|
31
50
|
start = 0
|
32
51
|
rows = 500
|
52
|
+
removed = []
|
33
53
|
results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
|
34
|
-
logger.info "
|
54
|
+
logger.info "Identifying items removed from Fedora..."
|
35
55
|
while(!results["response"]["docs"].empty?)
|
36
56
|
|
57
|
+
logger.info("Checking Solr index from " + start.to_s + " to " + (start + rows).to_s + "...")
|
37
58
|
results["response"]["docs"].each do |doc|
|
38
|
-
|
39
|
-
|
40
|
-
|
59
|
+
|
60
|
+
if(fedora_item_pids.nil?)
|
61
|
+
if(!fedora_server.item(doc["id"]).exists?)
|
62
|
+
logger.info "Noting removed item " + doc["id"] + "..."
|
63
|
+
removed << doc["id"].to_s
|
64
|
+
end
|
65
|
+
else
|
66
|
+
if(!fedora_item_pids.include?(doc["id"].to_s))
|
67
|
+
logger.info "Noting removed item " + doc["id"] + "..."
|
68
|
+
removed << doc["id"].to_s
|
69
|
+
end
|
41
70
|
end
|
71
|
+
|
42
72
|
end
|
43
73
|
|
44
74
|
start = start + rows
|
45
75
|
results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
|
46
76
|
end
|
47
|
-
|
48
|
-
rsolr.commit
|
49
|
-
|
77
|
+
return removed
|
50
78
|
end
|
51
79
|
|
52
80
|
def ingest(options = {})
|
@@ -63,14 +91,9 @@ module Cul
|
|
63
91
|
|
64
92
|
delete = options.delete(:delete_removed) || false
|
65
93
|
overwrite = options.delete(:overwrite) || false
|
66
|
-
|
67
|
-
skip = options.delete(:skip) || nil
|
68
|
-
|
69
|
-
processed_successfully = 0
|
94
|
+
skip = options.delete(:skip) || []
|
70
95
|
|
71
|
-
|
72
|
-
delete_removed(fedora_server)
|
73
|
-
end
|
96
|
+
indexed_count = 0
|
74
97
|
|
75
98
|
logger.info "Preparing the items for indexing..."
|
76
99
|
collections.each do |collection|
|
@@ -79,24 +102,26 @@ module Cul
|
|
79
102
|
|
80
103
|
items.sort!
|
81
104
|
|
82
|
-
to_add = []
|
83
105
|
results = Hash.new { |h,k| h[k] = [] }
|
84
|
-
errors =
|
106
|
+
errors = []
|
107
|
+
|
108
|
+
item_pids = []
|
109
|
+
items.each do |item|
|
110
|
+
item_pids << item.pid
|
111
|
+
end
|
112
|
+
if delete == true
|
113
|
+
delete_removed(fedora_server, item_pids)
|
114
|
+
end
|
85
115
|
|
86
116
|
logger.info "Preparing to index " + items.length.to_s + " items..."
|
87
117
|
|
88
118
|
items.each do |i|
|
89
119
|
|
90
|
-
if(ignore.index(i.pid).nil? == false)
|
91
|
-
logger.info "Ignoring " + i.pid + "..."
|
120
|
+
if(ignore.index(i.pid).nil? == false || skip.index(i.pid).nil? == false)
|
121
|
+
logger.info "Ignoring/skipping " + i.pid + "..."
|
122
|
+
results[:skipped] << i.pid
|
92
123
|
next
|
93
124
|
end
|
94
|
-
|
95
|
-
if process && skip && skip > 0
|
96
|
-
skip -= 1
|
97
|
-
next
|
98
|
-
end
|
99
|
-
|
100
125
|
|
101
126
|
if item_exists?(i)
|
102
127
|
unless overwrite == true
|
@@ -104,45 +129,33 @@ module Cul
|
|
104
129
|
next
|
105
130
|
end
|
106
131
|
end
|
107
|
-
|
108
132
|
|
109
133
|
logger.info "Indexing " + i.pid + "..."
|
110
134
|
|
111
135
|
result_hash = i.send("index_for_#{format}", options)
|
112
136
|
|
113
|
-
results[result_hash[:status]]
|
137
|
+
results[result_hash[:status]] << i.pid
|
114
138
|
|
115
139
|
case result_hash[:status]
|
116
140
|
when :success
|
117
|
-
|
118
|
-
|
141
|
+
begin
|
142
|
+
rsolr.add(result_hash[:results])
|
143
|
+
indexed_count += 1
|
144
|
+
rescue Exception => e
|
145
|
+
errors << i.pid
|
146
|
+
logger.error e.message
|
147
|
+
end
|
119
148
|
when :error
|
120
|
-
errors
|
121
|
-
|
122
|
-
|
123
|
-
if process
|
124
|
-
process -= 1
|
125
|
-
break if process <= 0
|
149
|
+
errors << i.pid
|
150
|
+
logger.error result_hash[:error_message]
|
126
151
|
end
|
127
152
|
|
128
|
-
if to_add.length >= 500
|
129
|
-
logger.info "Adding batch to commit queue..."
|
130
|
-
rsolr.add(to_add)
|
131
|
-
to_add.clear
|
132
|
-
end
|
133
|
-
|
134
|
-
end
|
135
|
-
|
136
|
-
if to_add.length > 0
|
137
|
-
logger.info "Adding batch to commit queue..."
|
138
|
-
rsolr.add(to_add)
|
139
|
-
to_add.clear
|
140
153
|
end
|
141
154
|
|
142
155
|
logger.info "Committing changes to Solr..."
|
143
156
|
rsolr.commit
|
144
157
|
|
145
|
-
return {:results => results, :errors => errors, :
|
158
|
+
return {:results => results, :errors => errors, :indexed_count => indexed_count}
|
146
159
|
|
147
160
|
end
|
148
161
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cul-fedora
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
7
9
|
- 0
|
8
|
-
|
9
|
-
- 6
|
10
|
-
version: 0.8.6
|
10
|
+
version: 1.0.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- James Stuart
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-09-19 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: shoulda
|
@@ -175,7 +174,6 @@ files:
|
|
175
174
|
- test/test_fedora_server.rb
|
176
175
|
- test/test_fedora_solr.rb
|
177
176
|
- test_fedora_item.rb
|
178
|
-
has_rdoc: true
|
179
177
|
homepage: http://github.com/tastyhat/cul-fedora
|
180
178
|
licenses: []
|
181
179
|
|
@@ -205,13 +203,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
203
|
requirements: []
|
206
204
|
|
207
205
|
rubyforge_project:
|
208
|
-
rubygems_version: 1.
|
206
|
+
rubygems_version: 1.8.6
|
209
207
|
signing_key:
|
210
208
|
specification_version: 3
|
211
209
|
summary: Columbia University Fedora Hooks
|
212
|
-
test_files:
|
213
|
-
|
214
|
-
- test/test_cul-fedora.rb
|
215
|
-
- test/test_fedora_item.rb
|
216
|
-
- test/test_fedora_server.rb
|
217
|
-
- test/test_fedora_solr.rb
|
210
|
+
test_files: []
|
211
|
+
|