cul-fedora 0.8.6 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/cul-fedora.gemspec +5 -13
- data/lib/cul-fedora.rb +5 -1
- data/lib/cul-fedora/item.rb +27 -16
- data/lib/cul-fedora/server.rb +10 -4
- data/lib/cul-fedora/solr.rb +64 -51
- metadata +9 -15
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.6
|
1
|
+
1.0.0
|
data/cul-fedora.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{cul-fedora}
|
8
|
-
s.version = "0.8.6"
|
8
|
+
s.version = "1.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = [
|
12
|
-
s.date = %q{2011-
|
11
|
+
s.authors = [%q{James Stuart}]
|
12
|
+
s.date = %q{2011-09-19}
|
13
13
|
s.description = %q{Columbia-specific Fedora libraries}
|
14
14
|
s.email = %q{tastyhat@jamesstuart.org}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -62,19 +62,11 @@ Gem::Specification.new do |s|
|
|
62
62
|
"test_fedora_item.rb"
|
63
63
|
]
|
64
64
|
s.homepage = %q{http://github.com/tastyhat/cul-fedora}
|
65
|
-
s.require_paths = [
|
66
|
-
s.rubygems_version = %q{1.
|
65
|
+
s.require_paths = [%q{lib}]
|
66
|
+
s.rubygems_version = %q{1.8.6}
|
67
67
|
s.summary = %q{Columbia University Fedora Hooks}
|
68
|
-
s.test_files = [
|
69
|
-
"test/helper.rb",
|
70
|
-
"test/test_cul-fedora.rb",
|
71
|
-
"test/test_fedora_item.rb",
|
72
|
-
"test/test_fedora_server.rb",
|
73
|
-
"test/test_fedora_solr.rb"
|
74
|
-
]
|
75
68
|
|
76
69
|
if s.respond_to? :specification_version then
|
77
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
78
70
|
s.specification_version = 3
|
79
71
|
|
80
72
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
data/lib/cul-fedora.rb
CHANGED
data/lib/cul-fedora/item.rb
CHANGED
@@ -1,4 +1,9 @@
|
|
1
1
|
require "open3"
|
2
|
+
begin
|
3
|
+
require "active_support/core_ext/array/extract_options"
|
4
|
+
rescue
|
5
|
+
require "activesupport"
|
6
|
+
end
|
2
7
|
|
3
8
|
module Cul
|
4
9
|
module Fedora
|
@@ -40,6 +45,7 @@ module Cul
|
|
40
45
|
request
|
41
46
|
return true
|
42
47
|
rescue Exception => e # we should really do some better checking of error type etc here
|
48
|
+
logger.error e.message
|
43
49
|
return false
|
44
50
|
end
|
45
51
|
end
|
@@ -82,7 +88,8 @@ module Cul
|
|
82
88
|
i = i + MAX_LIST_MEMBERS_PER_REQUEST
|
83
89
|
end
|
84
90
|
return items
|
85
|
-
rescue
|
91
|
+
rescue Exception => e
|
92
|
+
logger.error e.message
|
86
93
|
[]
|
87
94
|
end
|
88
95
|
end
|
@@ -90,8 +97,9 @@ module Cul
|
|
90
97
|
def getSize()
|
91
98
|
begin
|
92
99
|
request(:method => "/objects", :sdef => "methods/ldpd:sdef.Aggregator", :request => "getSize").to_i
|
93
|
-
rescue
|
94
|
-
|
100
|
+
rescue Exception => e
|
101
|
+
logger.error e.message
|
102
|
+
return -1
|
95
103
|
end
|
96
104
|
end
|
97
105
|
|
@@ -103,6 +111,7 @@ module Cul
|
|
103
111
|
@server.item(metadata.attributes["uri"].value)
|
104
112
|
end
|
105
113
|
rescue Exception => e
|
114
|
+
logger.error e.message
|
106
115
|
[]
|
107
116
|
end
|
108
117
|
end
|
@@ -113,7 +122,8 @@ module Cul
|
|
113
122
|
result.xpath("/rdf:RDF/rdf:Description/*[local-name()='memberOf']").collect do |member|
|
114
123
|
@server.item(member.attributes["resource"].value)
|
115
124
|
end
|
116
|
-
rescue
|
125
|
+
rescue Exception => e
|
126
|
+
logger.error e.message
|
117
127
|
[]
|
118
128
|
end
|
119
129
|
end
|
@@ -136,6 +146,7 @@ module Cul
|
|
136
146
|
author_roles = ["author","creator","editor","speaker","moderator","interviewee","interviewer","contributor"]
|
137
147
|
other_name_roles = ["thesis advisor"]
|
138
148
|
corporate_author_roles = ["author"]
|
149
|
+
corporate_department_roles = ["originator"]
|
139
150
|
|
140
151
|
organizations = []
|
141
152
|
departments = []
|
@@ -182,7 +193,7 @@ module Cul
|
|
182
193
|
note_org = true
|
183
194
|
all_author_names << fullname
|
184
195
|
if(!name_node["ID"].nil?)
|
185
|
-
add_field.call("
|
196
|
+
add_field.call("author_uni", name_node["ID"])
|
186
197
|
end
|
187
198
|
add_field.call("author_search", fullname.downcase)
|
188
199
|
add_field.call("author_facet", fullname)
|
@@ -209,7 +220,7 @@ module Cul
|
|
209
220
|
end
|
210
221
|
|
211
222
|
mods.css("name[@type='corporate']").each do |corp_name_node|
|
212
|
-
if(!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator"))
|
223
|
+
if((!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator")) || corp_name_node.css("role>roleTerm").collect(&:content).any? { |role| corporate_department_roles.include?(role) })
|
213
224
|
name_part = corp_name_node.at_css("namePart").text
|
214
225
|
if(name_part.include?(". "))
|
215
226
|
name_part_split = name_part.split(". ")
|
@@ -230,13 +241,12 @@ module Cul
|
|
230
241
|
end
|
231
242
|
end
|
232
243
|
|
233
|
-
add_field.call("
|
234
|
-
add_field.call("
|
244
|
+
add_field.call("author_display",all_author_names.join("; "))
|
245
|
+
add_field.call("pub_date_facet", mods.at_css("*[@keyDate='yes']"))
|
235
246
|
|
236
247
|
mods.css("genre").each do |genre_node|
|
237
248
|
add_field.call("genre_facet", genre_node)
|
238
249
|
add_field.call("genre_search", genre_node)
|
239
|
-
|
240
250
|
end
|
241
251
|
|
242
252
|
|
@@ -247,14 +257,14 @@ module Cul
|
|
247
257
|
if(subject_node.attributes.count == 0)
|
248
258
|
subject_node.css("topic").each do |topic_node|
|
249
259
|
add_field.call("keyword_search", topic_node.content.downcase)
|
250
|
-
add_field.call("
|
260
|
+
add_field.call("subject_facet", topic_node)
|
251
261
|
add_field.call("subject_search", topic_node)
|
252
262
|
end
|
253
263
|
end
|
254
264
|
end
|
255
265
|
|
256
266
|
|
257
|
-
add_field.call("
|
267
|
+
add_field.call("table_of_contents", mods.at_css("tableOfContents"))
|
258
268
|
|
259
269
|
mods.css("note").each { |note| add_field.call("notes", note) }
|
260
270
|
|
@@ -277,7 +287,7 @@ module Cul
|
|
277
287
|
|
278
288
|
if(related_series = mods.at_css("relatedItem[@type='series']"))
|
279
289
|
if(related_series.has_attribute?("ID"))
|
280
|
-
add_field.call("
|
290
|
+
add_field.call("series_facet", related_series.at_css("titleInfo>title"))
|
281
291
|
end
|
282
292
|
end
|
283
293
|
|
@@ -290,23 +300,24 @@ module Cul
|
|
290
300
|
|
291
301
|
mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
|
292
302
|
mods.css("subject>geographic").each do |geo|
|
293
|
-
add_field.call("
|
303
|
+
add_field.call("geographic_area_display", geo)
|
294
304
|
add_field.call("geographic_area_search", geo)
|
295
305
|
end
|
296
306
|
|
297
|
-
|
307
|
+
# This is just a placeholder, reminding us that we need to implement citations in some way
|
308
|
+
# add_field.call("export_as_mla_citation_txt","")
|
298
309
|
|
299
310
|
if(organizations.count > 0)
|
300
311
|
organizations = organizations.uniq
|
301
312
|
organizations.each do |organization|
|
302
|
-
add_field.call("
|
313
|
+
add_field.call("organization_facet", organization)
|
303
314
|
end
|
304
315
|
end
|
305
316
|
|
306
317
|
if(departments.count > 0)
|
307
318
|
departments = departments.uniq
|
308
319
|
departments.each do |department|
|
309
|
-
add_field.call("
|
320
|
+
add_field.call("department_facet", department.to_s.sub(", Department of", "").strip)
|
310
321
|
end
|
311
322
|
end
|
312
323
|
|
data/lib/cul-fedora/server.rb
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
begin
|
2
|
+
require "active_support/core_ext/array/extract_options"
|
3
|
+
rescue
|
4
|
+
require "activesupport"
|
5
|
+
end
|
6
|
+
|
1
7
|
module Cul
|
2
8
|
module Fedora
|
3
9
|
class Server
|
@@ -6,10 +12,10 @@ module Cul
|
|
6
12
|
|
7
13
|
def initialize(*args)
|
8
14
|
options = args.extract_options!
|
9
|
-
@riurl = options[:riurl] || raise(ArgumentError, "Must provide riurl argument")
|
10
|
-
@riquery = options[:riquery] || raise(ArgumentError, "Must provide riquery argument")
|
11
|
-
@hc = options[:http_client]
|
12
|
-
@logger = options[:logger]
|
15
|
+
@riurl = options[:riurl] || options["riurl"] || raise(ArgumentError, "Must provide riurl argument")
|
16
|
+
@riquery = options[:riquery] || options["riquery"] || raise(ArgumentError, "Must provide riquery argument")
|
17
|
+
@hc = options[:http_client] || options["http_client"]
|
18
|
+
@logger = options[:logger] || options["logger"]
|
13
19
|
end
|
14
20
|
|
15
21
|
def logger
|
data/lib/cul-fedora/solr.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
+
begin
|
2
|
+
require "active_support/core_ext/array/extract_options"
|
3
|
+
rescue
|
4
|
+
require "activesupport"
|
5
|
+
end
|
6
|
+
|
1
7
|
module Cul
|
2
8
|
module Fedora
|
3
9
|
class Solr
|
4
10
|
|
5
11
|
attr_reader :url
|
6
12
|
|
7
|
-
def initialize(
|
8
|
-
@url =
|
9
|
-
@logger =
|
13
|
+
def initialize(options = {})
|
14
|
+
@url = options[:url] || options["url"] || raise(ArgumentError, "must provide url")
|
15
|
+
@logger = options[:logger] || options["logger"]
|
10
16
|
end
|
11
17
|
|
12
18
|
def logger
|
@@ -22,31 +28,53 @@ module Cul
|
|
22
28
|
end
|
23
29
|
|
24
30
|
def delete_index
|
31
|
+
logger.info "Deleting Solr index..."
|
25
32
|
rsolr.delete_by_query("*:*")
|
26
33
|
rsolr.commit
|
27
34
|
end
|
28
35
|
|
29
|
-
def delete_removed(fedora_server)
|
36
|
+
def delete_removed(fedora_server, fedora_item_pids = nil)
|
37
|
+
|
38
|
+
removed = identify_removed(fedora_server)
|
39
|
+
logger.info "Deleting items removed from Fedora..."
|
40
|
+
removed.each do |id|
|
41
|
+
logger.info "Deleting " + id + "..."
|
42
|
+
rsolr.delete_by_query("id:" + id.to_s.gsub(/:/,'\\:'))
|
43
|
+
end
|
44
|
+
|
45
|
+
rsolr.commit
|
30
46
|
|
47
|
+
end
|
48
|
+
|
49
|
+
def identify_removed(fedora_server, fedora_item_pids = nil)
|
31
50
|
start = 0
|
32
51
|
rows = 500
|
52
|
+
removed = []
|
33
53
|
results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
|
34
|
-
logger.info "
|
54
|
+
logger.info "Identifying items removed from Fedora..."
|
35
55
|
while(!results["response"]["docs"].empty?)
|
36
56
|
|
57
|
+
logger.info("Checking Solr index from " + start.to_s + " to " + (start + rows).to_s + "...")
|
37
58
|
results["response"]["docs"].each do |doc|
|
38
|
-
|
39
|
-
|
40
|
-
|
59
|
+
|
60
|
+
if(fedora_item_pids.nil?)
|
61
|
+
if(!fedora_server.item(doc["id"]).exists?)
|
62
|
+
logger.info "Noting removed item " + doc["id"] + "..."
|
63
|
+
removed << doc["id"].to_s
|
64
|
+
end
|
65
|
+
else
|
66
|
+
if(!fedora_item_pids.include?(doc["id"].to_s))
|
67
|
+
logger.info "Noting removed item " + doc["id"] + "..."
|
68
|
+
removed << doc["id"].to_s
|
69
|
+
end
|
41
70
|
end
|
71
|
+
|
42
72
|
end
|
43
73
|
|
44
74
|
start = start + rows
|
45
75
|
results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
|
46
76
|
end
|
47
|
-
|
48
|
-
rsolr.commit
|
49
|
-
|
77
|
+
return removed
|
50
78
|
end
|
51
79
|
|
52
80
|
def ingest(options = {})
|
@@ -63,14 +91,9 @@ module Cul
|
|
63
91
|
|
64
92
|
delete = options.delete(:delete_removed) || false
|
65
93
|
overwrite = options.delete(:overwrite) || false
|
66
|
-
|
67
|
-
skip = options.delete(:skip) || nil
|
68
|
-
|
69
|
-
processed_successfully = 0
|
94
|
+
skip = options.delete(:skip) || []
|
70
95
|
|
71
|
-
|
72
|
-
delete_removed(fedora_server)
|
73
|
-
end
|
96
|
+
indexed_count = 0
|
74
97
|
|
75
98
|
logger.info "Preparing the items for indexing..."
|
76
99
|
collections.each do |collection|
|
@@ -79,24 +102,26 @@ module Cul
|
|
79
102
|
|
80
103
|
items.sort!
|
81
104
|
|
82
|
-
to_add = []
|
83
105
|
results = Hash.new { |h,k| h[k] = [] }
|
84
|
-
errors =
|
106
|
+
errors = []
|
107
|
+
|
108
|
+
item_pids = []
|
109
|
+
items.each do |item|
|
110
|
+
item_pids << item.pid
|
111
|
+
end
|
112
|
+
if delete == true
|
113
|
+
delete_removed(fedora_server, item_pids)
|
114
|
+
end
|
85
115
|
|
86
116
|
logger.info "Preparing to index " + items.length.to_s + " items..."
|
87
117
|
|
88
118
|
items.each do |i|
|
89
119
|
|
90
|
-
if(ignore.index(i.pid).nil? == false)
|
91
|
-
logger.info "Ignoring " + i.pid + "..."
|
120
|
+
if(ignore.index(i.pid).nil? == false || skip.index(i.pid).nil? == false)
|
121
|
+
logger.info "Ignoring/skipping " + i.pid + "..."
|
122
|
+
results[:skipped] << i.pid
|
92
123
|
next
|
93
124
|
end
|
94
|
-
|
95
|
-
if process && skip && skip > 0
|
96
|
-
skip -= 1
|
97
|
-
next
|
98
|
-
end
|
99
|
-
|
100
125
|
|
101
126
|
if item_exists?(i)
|
102
127
|
unless overwrite == true
|
@@ -104,45 +129,33 @@ module Cul
|
|
104
129
|
next
|
105
130
|
end
|
106
131
|
end
|
107
|
-
|
108
132
|
|
109
133
|
logger.info "Indexing " + i.pid + "..."
|
110
134
|
|
111
135
|
result_hash = i.send("index_for_#{format}", options)
|
112
136
|
|
113
|
-
results[result_hash[:status]]
|
137
|
+
results[result_hash[:status]] << i.pid
|
114
138
|
|
115
139
|
case result_hash[:status]
|
116
140
|
when :success
|
117
|
-
|
118
|
-
|
141
|
+
begin
|
142
|
+
rsolr.add(result_hash[:results])
|
143
|
+
indexed_count += 1
|
144
|
+
rescue Exception => e
|
145
|
+
errors << i.pid
|
146
|
+
logger.error e.message
|
147
|
+
end
|
119
148
|
when :error
|
120
|
-
errors
|
121
|
-
|
122
|
-
|
123
|
-
if process
|
124
|
-
process -= 1
|
125
|
-
break if process <= 0
|
149
|
+
errors << i.pid
|
150
|
+
logger.error result_hash[:error_message]
|
126
151
|
end
|
127
152
|
|
128
|
-
if to_add.length >= 500
|
129
|
-
logger.info "Adding batch to commit queue..."
|
130
|
-
rsolr.add(to_add)
|
131
|
-
to_add.clear
|
132
|
-
end
|
133
|
-
|
134
|
-
end
|
135
|
-
|
136
|
-
if to_add.length > 0
|
137
|
-
logger.info "Adding batch to commit queue..."
|
138
|
-
rsolr.add(to_add)
|
139
|
-
to_add.clear
|
140
153
|
end
|
141
154
|
|
142
155
|
logger.info "Committing changes to Solr..."
|
143
156
|
rsolr.commit
|
144
157
|
|
145
|
-
return {:results => results, :errors => errors, :
|
158
|
+
return {:results => results, :errors => errors, :indexed_count => indexed_count}
|
146
159
|
|
147
160
|
end
|
148
161
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cul-fedora
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
7
9
|
- 0
|
8
|
-
|
9
|
-
- 6
|
10
|
-
version: 0.8.6
|
10
|
+
version: 1.0.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- James Stuart
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-09-19 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: shoulda
|
@@ -175,7 +174,6 @@ files:
|
|
175
174
|
- test/test_fedora_server.rb
|
176
175
|
- test/test_fedora_solr.rb
|
177
176
|
- test_fedora_item.rb
|
178
|
-
has_rdoc: true
|
179
177
|
homepage: http://github.com/tastyhat/cul-fedora
|
180
178
|
licenses: []
|
181
179
|
|
@@ -205,13 +203,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
203
|
requirements: []
|
206
204
|
|
207
205
|
rubyforge_project:
|
208
|
-
rubygems_version: 1.
|
206
|
+
rubygems_version: 1.8.6
|
209
207
|
signing_key:
|
210
208
|
specification_version: 3
|
211
209
|
summary: Columbia University Fedora Hooks
|
212
|
-
test_files:
|
213
|
-
|
214
|
-
- test/test_cul-fedora.rb
|
215
|
-
- test/test_fedora_item.rb
|
216
|
-
- test/test_fedora_server.rb
|
217
|
-
- test/test_fedora_solr.rb
|
210
|
+
test_files: []
|
211
|
+
|