cul-fedora 0.8.6 → 1.0.0

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.6
1
+ 1.0.0
data/cul-fedora.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cul-fedora}
8
- s.version = "0.8.6"
8
+ s.version = "1.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["James Stuart"]
12
- s.date = %q{2011-07-14}
11
+ s.authors = [%q{James Stuart}]
12
+ s.date = %q{2011-09-19}
13
13
  s.description = %q{Columbia-specific Fedora libraries}
14
14
  s.email = %q{tastyhat@jamesstuart.org}
15
15
  s.extra_rdoc_files = [
@@ -62,19 +62,11 @@ Gem::Specification.new do |s|
62
62
  "test_fedora_item.rb"
63
63
  ]
64
64
  s.homepage = %q{http://github.com/tastyhat/cul-fedora}
65
- s.require_paths = ["lib"]
66
- s.rubygems_version = %q{1.3.7}
65
+ s.require_paths = [%q{lib}]
66
+ s.rubygems_version = %q{1.8.6}
67
67
  s.summary = %q{Columbia University Fedora Hooks}
68
- s.test_files = [
69
- "test/helper.rb",
70
- "test/test_cul-fedora.rb",
71
- "test/test_fedora_item.rb",
72
- "test/test_fedora_server.rb",
73
- "test/test_fedora_solr.rb"
74
- ]
75
68
 
76
69
  if s.respond_to? :specification_version then
77
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
78
70
  s.specification_version = 3
79
71
 
80
72
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
data/lib/cul-fedora.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  require "httpclient"
2
2
  require "nokogiri"
3
- require "activesupport"
3
+ begin
4
+ require "active_support"
5
+ rescue
6
+ require "activesupport"
7
+ end
4
8
  require "rsolr"
5
9
  require "rsolr-ext"
6
10
  require "open3"
@@ -1,4 +1,9 @@
1
1
  require "open3"
2
+ begin
3
+ require "active_support/core_ext/array/extract_options"
4
+ rescue
5
+ require "activesupport"
6
+ end
2
7
 
3
8
  module Cul
4
9
  module Fedora
@@ -40,6 +45,7 @@ module Cul
40
45
  request
41
46
  return true
42
47
  rescue Exception => e # we should really do some better checking of error type etc here
48
+ logger.error e.message
43
49
  return false
44
50
  end
45
51
  end
@@ -82,7 +88,8 @@ module Cul
82
88
  i = i + MAX_LIST_MEMBERS_PER_REQUEST
83
89
  end
84
90
  return items
85
- rescue
91
+ rescue Exception => e
92
+ logger.error e.message
86
93
  []
87
94
  end
88
95
  end
@@ -90,8 +97,9 @@ module Cul
90
97
  def getSize()
91
98
  begin
92
99
  request(:method => "/objects", :sdef => "methods/ldpd:sdef.Aggregator", :request => "getSize").to_i
93
- rescue
94
- -1
100
+ rescue Exception => e
101
+ logger.error e.message
102
+ return -1
95
103
  end
96
104
  end
97
105
 
@@ -103,6 +111,7 @@ module Cul
103
111
  @server.item(metadata.attributes["uri"].value)
104
112
  end
105
113
  rescue Exception => e
114
+ logger.error e.message
106
115
  []
107
116
  end
108
117
  end
@@ -113,7 +122,8 @@ module Cul
113
122
  result.xpath("/rdf:RDF/rdf:Description/*[local-name()='memberOf']").collect do |member|
114
123
  @server.item(member.attributes["resource"].value)
115
124
  end
116
- rescue
125
+ rescue Exception => e
126
+ logger.error e.message
117
127
  []
118
128
  end
119
129
  end
@@ -136,6 +146,7 @@ module Cul
136
146
  author_roles = ["author","creator","editor","speaker","moderator","interviewee","interviewer","contributor"]
137
147
  other_name_roles = ["thesis advisor"]
138
148
  corporate_author_roles = ["author"]
149
+ corporate_department_roles = ["originator"]
139
150
 
140
151
  organizations = []
141
152
  departments = []
@@ -182,7 +193,7 @@ module Cul
182
193
  note_org = true
183
194
  all_author_names << fullname
184
195
  if(!name_node["ID"].nil?)
185
- add_field.call("author_id_uni", name_node["ID"])
196
+ add_field.call("author_uni", name_node["ID"])
186
197
  end
187
198
  add_field.call("author_search", fullname.downcase)
188
199
  add_field.call("author_facet", fullname)
@@ -209,7 +220,7 @@ module Cul
209
220
  end
210
221
 
211
222
  mods.css("name[@type='corporate']").each do |corp_name_node|
212
- if(!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator"))
223
+ if((!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator")) || corp_name_node.css("role>roleTerm").collect(&:content).any? { |role| corporate_department_roles.include?(role) })
213
224
  name_part = corp_name_node.at_css("namePart").text
214
225
  if(name_part.include?(". "))
215
226
  name_part_split = name_part.split(". ")
@@ -230,13 +241,12 @@ module Cul
230
241
  end
231
242
  end
232
243
 
233
- add_field.call("authors_display",all_author_names.join("; "))
234
- add_field.call("pub_date", mods.at_css("*[@keyDate='yes']"))
244
+ add_field.call("author_display",all_author_names.join("; "))
245
+ add_field.call("pub_date_facet", mods.at_css("*[@keyDate='yes']"))
235
246
 
236
247
  mods.css("genre").each do |genre_node|
237
248
  add_field.call("genre_facet", genre_node)
238
249
  add_field.call("genre_search", genre_node)
239
-
240
250
  end
241
251
 
242
252
 
@@ -247,14 +257,14 @@ module Cul
247
257
  if(subject_node.attributes.count == 0)
248
258
  subject_node.css("topic").each do |topic_node|
249
259
  add_field.call("keyword_search", topic_node.content.downcase)
250
- add_field.call("subject", topic_node)
260
+ add_field.call("subject_facet", topic_node)
251
261
  add_field.call("subject_search", topic_node)
252
262
  end
253
263
  end
254
264
  end
255
265
 
256
266
 
257
- add_field.call("tableOfContents", mods.at_css("tableOfContents"))
267
+ add_field.call("table_of_contents", mods.at_css("tableOfContents"))
258
268
 
259
269
  mods.css("note").each { |note| add_field.call("notes", note) }
260
270
 
@@ -277,7 +287,7 @@ module Cul
277
287
 
278
288
  if(related_series = mods.at_css("relatedItem[@type='series']"))
279
289
  if(related_series.has_attribute?("ID"))
280
- add_field.call("series", related_series.at_css("titleInfo>title"))
290
+ add_field.call("series_facet", related_series.at_css("titleInfo>title"))
281
291
  end
282
292
  end
283
293
 
@@ -290,23 +300,24 @@ module Cul
290
300
 
291
301
  mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
292
302
  mods.css("subject>geographic").each do |geo|
293
- add_field.call("geographic_area", geo)
303
+ add_field.call("geographic_area_display", geo)
294
304
  add_field.call("geographic_area_search", geo)
295
305
  end
296
306
 
297
- add_field.call("export_as_mla_citation_txt","")
307
+ # This is just a placeholder, reminding us that we need to implement citations in some way
308
+ # add_field.call("export_as_mla_citation_txt","")
298
309
 
299
310
  if(organizations.count > 0)
300
311
  organizations = organizations.uniq
301
312
  organizations.each do |organization|
302
- add_field.call("affiliation_organization", organization)
313
+ add_field.call("organization_facet", organization)
303
314
  end
304
315
  end
305
316
 
306
317
  if(departments.count > 0)
307
318
  departments = departments.uniq
308
319
  departments.each do |department|
309
- add_field.call("affiliation_department", department.to_s.sub(", Department of", "").strip)
320
+ add_field.call("department_facet", department.to_s.sub(", Department of", "").strip)
310
321
  end
311
322
  end
312
323
 
@@ -1,3 +1,9 @@
1
+ begin
2
+ require "active_support/core_ext/array/extract_options"
3
+ rescue
4
+ require "activesupport"
5
+ end
6
+
1
7
  module Cul
2
8
  module Fedora
3
9
  class Server
@@ -6,10 +12,10 @@ module Cul
6
12
 
7
13
  def initialize(*args)
8
14
  options = args.extract_options!
9
- @riurl = options[:riurl] || raise(ArgumentError, "Must provide riurl argument")
10
- @riquery = options[:riquery] || raise(ArgumentError, "Must provide riquery argument")
11
- @hc = options[:http_client]
12
- @logger = options[:logger]
15
+ @riurl = options[:riurl] || options["riurl"] || raise(ArgumentError, "Must provide riurl argument")
16
+ @riquery = options[:riquery] || options["riquery"] || raise(ArgumentError, "Must provide riquery argument")
17
+ @hc = options[:http_client] || options["http_client"]
18
+ @logger = options[:logger] || options["logger"]
13
19
  end
14
20
 
15
21
  def logger
@@ -1,12 +1,18 @@
1
+ begin
2
+ require "active_support/core_ext/array/extract_options"
3
+ rescue
4
+ require "activesupport"
5
+ end
6
+
1
7
  module Cul
2
8
  module Fedora
3
9
  class Solr
4
10
 
5
11
  attr_reader :url
6
12
 
7
- def initialize(config = {})
8
- @url = config[:url] || raise(ArgumentError, "must provide url")
9
- @logger = config[:logger]
13
+ def initialize(options = {})
14
+ @url = options[:url] || options["url"] || raise(ArgumentError, "must provide url")
15
+ @logger = options[:logger] || options["logger"]
10
16
  end
11
17
 
12
18
  def logger
@@ -22,31 +28,53 @@ module Cul
22
28
  end
23
29
 
24
30
  def delete_index
31
+ logger.info "Deleting Solr index..."
25
32
  rsolr.delete_by_query("*:*")
26
33
  rsolr.commit
27
34
  end
28
35
 
29
- def delete_removed(fedora_server)
36
+ def delete_removed(fedora_server, fedora_item_pids = nil)
37
+
38
+ removed = identify_removed(fedora_server)
39
+ logger.info "Deleting items removed from Fedora..."
40
+ removed.each do |id|
41
+ logger.info "Deleting " + id + "..."
42
+ rsolr.delete_by_query("id:" + id.to_s.gsub(/:/,'\\:'))
43
+ end
44
+
45
+ rsolr.commit
30
46
 
47
+ end
48
+
49
+ def identify_removed(fedora_server, fedora_item_pids = nil)
31
50
  start = 0
32
51
  rows = 500
52
+ removed = []
33
53
  results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
34
- logger.info "Deleting items removed from Fedora..."
54
+ logger.info "Identifying items removed from Fedora..."
35
55
  while(!results["response"]["docs"].empty?)
36
56
 
57
+ logger.info("Checking Solr index from " + start.to_s + " to " + (start + rows).to_s + "...")
37
58
  results["response"]["docs"].each do |doc|
38
- if(!fedora_server.item(doc["id"]).exists?)
39
- logger.info "Deleting " + doc["id"] + "..."
40
- rsolr.delete_by_query("id:" + doc["id"].to_s.gsub(/:/,'\\:'))
59
+
60
+ if(fedora_item_pids.nil?)
61
+ if(!fedora_server.item(doc["id"]).exists?)
62
+ logger.info "Noting removed item " + doc["id"] + "..."
63
+ removed << doc["id"].to_s
64
+ end
65
+ else
66
+ if(!fedora_item_pids.include?(doc["id"].to_s))
67
+ logger.info "Noting removed item " + doc["id"] + "..."
68
+ removed << doc["id"].to_s
69
+ end
41
70
  end
71
+
42
72
  end
43
73
 
44
74
  start = start + rows
45
75
  results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
46
76
  end
47
-
48
- rsolr.commit
49
-
77
+ return removed
50
78
  end
51
79
 
52
80
  def ingest(options = {})
@@ -63,14 +91,9 @@ module Cul
63
91
 
64
92
  delete = options.delete(:delete_removed) || false
65
93
  overwrite = options.delete(:overwrite) || false
66
- process = options.delete(:process) || nil
67
- skip = options.delete(:skip) || nil
68
-
69
- processed_successfully = 0
94
+ skip = options.delete(:skip) || []
70
95
 
71
- if delete == true
72
- delete_removed(fedora_server)
73
- end
96
+ indexed_count = 0
74
97
 
75
98
  logger.info "Preparing the items for indexing..."
76
99
  collections.each do |collection|
@@ -79,24 +102,26 @@ module Cul
79
102
 
80
103
  items.sort!
81
104
 
82
- to_add = []
83
105
  results = Hash.new { |h,k| h[k] = [] }
84
- errors = {}
106
+ errors = []
107
+
108
+ item_pids = []
109
+ items.each do |item|
110
+ item_pids << item.pid
111
+ end
112
+ if delete == true
113
+ delete_removed(fedora_server, item_pids)
114
+ end
85
115
 
86
116
  logger.info "Preparing to index " + items.length.to_s + " items..."
87
117
 
88
118
  items.each do |i|
89
119
 
90
- if(ignore.index(i.pid).nil? == false)
91
- logger.info "Ignoring " + i.pid + "..."
120
+ if(ignore.index(i.pid).nil? == false || skip.index(i.pid).nil? == false)
121
+ logger.info "Ignoring/skipping " + i.pid + "..."
122
+ results[:skipped] << i.pid
92
123
  next
93
124
  end
94
-
95
- if process && skip && skip > 0
96
- skip -= 1
97
- next
98
- end
99
-
100
125
 
101
126
  if item_exists?(i)
102
127
  unless overwrite == true
@@ -104,45 +129,33 @@ module Cul
104
129
  next
105
130
  end
106
131
  end
107
-
108
132
 
109
133
  logger.info "Indexing " + i.pid + "..."
110
134
 
111
135
  result_hash = i.send("index_for_#{format}", options)
112
136
 
113
- results[result_hash[:status]] << i.pid
137
+ results[result_hash[:status]] << i.pid
114
138
 
115
139
  case result_hash[:status]
116
140
  when :success
117
- to_add << result_hash[:results]
118
- processed_successfully += 1
141
+ begin
142
+ rsolr.add(result_hash[:results])
143
+ indexed_count += 1
144
+ rescue Exception => e
145
+ errors << i.pid
146
+ logger.error e.message
147
+ end
119
148
  when :error
120
- errors[i.pid] = result_hash[:error_message]
121
- end
122
-
123
- if process
124
- process -= 1
125
- break if process <= 0
149
+ errors << i.pid
150
+ logger.error result_hash[:error_message]
126
151
  end
127
152
 
128
- if to_add.length >= 500
129
- logger.info "Adding batch to commit queue..."
130
- rsolr.add(to_add)
131
- to_add.clear
132
- end
133
-
134
- end
135
-
136
- if to_add.length > 0
137
- logger.info "Adding batch to commit queue..."
138
- rsolr.add(to_add)
139
- to_add.clear
140
153
  end
141
154
 
142
155
  logger.info "Committing changes to Solr..."
143
156
  rsolr.commit
144
157
 
145
- return {:results => results, :errors => errors, :processed_successfully => processed_successfully}
158
+ return {:results => results, :errors => errors, :indexed_count => indexed_count}
146
159
 
147
160
  end
148
161
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cul-fedora
3
3
  version: !ruby/object:Gem::Version
4
- hash: 51
5
- prerelease: false
4
+ hash: 23
5
+ prerelease:
6
6
  segments:
7
+ - 1
8
+ - 0
7
9
  - 0
8
- - 8
9
- - 6
10
- version: 0.8.6
10
+ version: 1.0.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - James Stuart
@@ -15,8 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-07-14 00:00:00 -04:00
19
- default_executable:
18
+ date: 2011-09-19 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: shoulda
@@ -175,7 +174,6 @@ files:
175
174
  - test/test_fedora_server.rb
176
175
  - test/test_fedora_solr.rb
177
176
  - test_fedora_item.rb
178
- has_rdoc: true
179
177
  homepage: http://github.com/tastyhat/cul-fedora
180
178
  licenses: []
181
179
 
@@ -205,13 +203,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
205
203
  requirements: []
206
204
 
207
205
  rubyforge_project:
208
- rubygems_version: 1.3.7
206
+ rubygems_version: 1.8.6
209
207
  signing_key:
210
208
  specification_version: 3
211
209
  summary: Columbia University Fedora Hooks
212
- test_files:
213
- - test/helper.rb
214
- - test/test_cul-fedora.rb
215
- - test/test_fedora_item.rb
216
- - test/test_fedora_server.rb
217
- - test/test_fedora_solr.rb
210
+ test_files: []
211
+