cul-fedora 0.8.6 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.6
1
+ 1.0.0
data/cul-fedora.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cul-fedora}
8
- s.version = "0.8.6"
8
+ s.version = "1.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["James Stuart"]
12
- s.date = %q{2011-07-14}
11
+ s.authors = [%q{James Stuart}]
12
+ s.date = %q{2011-09-19}
13
13
  s.description = %q{Columbia-specific Fedora libraries}
14
14
  s.email = %q{tastyhat@jamesstuart.org}
15
15
  s.extra_rdoc_files = [
@@ -62,19 +62,11 @@ Gem::Specification.new do |s|
62
62
  "test_fedora_item.rb"
63
63
  ]
64
64
  s.homepage = %q{http://github.com/tastyhat/cul-fedora}
65
- s.require_paths = ["lib"]
66
- s.rubygems_version = %q{1.3.7}
65
+ s.require_paths = [%q{lib}]
66
+ s.rubygems_version = %q{1.8.6}
67
67
  s.summary = %q{Columbia University Fedora Hooks}
68
- s.test_files = [
69
- "test/helper.rb",
70
- "test/test_cul-fedora.rb",
71
- "test/test_fedora_item.rb",
72
- "test/test_fedora_server.rb",
73
- "test/test_fedora_solr.rb"
74
- ]
75
68
 
76
69
  if s.respond_to? :specification_version then
77
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
78
70
  s.specification_version = 3
79
71
 
80
72
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
data/lib/cul-fedora.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  require "httpclient"
2
2
  require "nokogiri"
3
- require "activesupport"
3
+ begin
4
+ require "active_support"
5
+ rescue
6
+ require "activesupport"
7
+ end
4
8
  require "rsolr"
5
9
  require "rsolr-ext"
6
10
  require "open3"
@@ -1,4 +1,9 @@
1
1
  require "open3"
2
+ begin
3
+ require "active_support/core_ext/array/extract_options"
4
+ rescue
5
+ require "activesupport"
6
+ end
2
7
 
3
8
  module Cul
4
9
  module Fedora
@@ -40,6 +45,7 @@ module Cul
40
45
  request
41
46
  return true
42
47
  rescue Exception => e # we should really do some better checking of error type etc here
48
+ logger.error e.message
43
49
  return false
44
50
  end
45
51
  end
@@ -82,7 +88,8 @@ module Cul
82
88
  i = i + MAX_LIST_MEMBERS_PER_REQUEST
83
89
  end
84
90
  return items
85
- rescue
91
+ rescue Exception => e
92
+ logger.error e.message
86
93
  []
87
94
  end
88
95
  end
@@ -90,8 +97,9 @@ module Cul
90
97
  def getSize()
91
98
  begin
92
99
  request(:method => "/objects", :sdef => "methods/ldpd:sdef.Aggregator", :request => "getSize").to_i
93
- rescue
94
- -1
100
+ rescue Exception => e
101
+ logger.error e.message
102
+ return -1
95
103
  end
96
104
  end
97
105
 
@@ -103,6 +111,7 @@ module Cul
103
111
  @server.item(metadata.attributes["uri"].value)
104
112
  end
105
113
  rescue Exception => e
114
+ logger.error e.message
106
115
  []
107
116
  end
108
117
  end
@@ -113,7 +122,8 @@ module Cul
113
122
  result.xpath("/rdf:RDF/rdf:Description/*[local-name()='memberOf']").collect do |member|
114
123
  @server.item(member.attributes["resource"].value)
115
124
  end
116
- rescue
125
+ rescue Exception => e
126
+ logger.error e.message
117
127
  []
118
128
  end
119
129
  end
@@ -136,6 +146,7 @@ module Cul
136
146
  author_roles = ["author","creator","editor","speaker","moderator","interviewee","interviewer","contributor"]
137
147
  other_name_roles = ["thesis advisor"]
138
148
  corporate_author_roles = ["author"]
149
+ corporate_department_roles = ["originator"]
139
150
 
140
151
  organizations = []
141
152
  departments = []
@@ -182,7 +193,7 @@ module Cul
182
193
  note_org = true
183
194
  all_author_names << fullname
184
195
  if(!name_node["ID"].nil?)
185
- add_field.call("author_id_uni", name_node["ID"])
196
+ add_field.call("author_uni", name_node["ID"])
186
197
  end
187
198
  add_field.call("author_search", fullname.downcase)
188
199
  add_field.call("author_facet", fullname)
@@ -209,7 +220,7 @@ module Cul
209
220
  end
210
221
 
211
222
  mods.css("name[@type='corporate']").each do |corp_name_node|
212
- if(!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator"))
223
+ if((!corp_name_node["ID"].nil? && corp_name_node["ID"].include?("originator")) || corp_name_node.css("role>roleTerm").collect(&:content).any? { |role| corporate_department_roles.include?(role) })
213
224
  name_part = corp_name_node.at_css("namePart").text
214
225
  if(name_part.include?(". "))
215
226
  name_part_split = name_part.split(". ")
@@ -230,13 +241,12 @@ module Cul
230
241
  end
231
242
  end
232
243
 
233
- add_field.call("authors_display",all_author_names.join("; "))
234
- add_field.call("pub_date", mods.at_css("*[@keyDate='yes']"))
244
+ add_field.call("author_display",all_author_names.join("; "))
245
+ add_field.call("pub_date_facet", mods.at_css("*[@keyDate='yes']"))
235
246
 
236
247
  mods.css("genre").each do |genre_node|
237
248
  add_field.call("genre_facet", genre_node)
238
249
  add_field.call("genre_search", genre_node)
239
-
240
250
  end
241
251
 
242
252
 
@@ -247,14 +257,14 @@ module Cul
247
257
  if(subject_node.attributes.count == 0)
248
258
  subject_node.css("topic").each do |topic_node|
249
259
  add_field.call("keyword_search", topic_node.content.downcase)
250
- add_field.call("subject", topic_node)
260
+ add_field.call("subject_facet", topic_node)
251
261
  add_field.call("subject_search", topic_node)
252
262
  end
253
263
  end
254
264
  end
255
265
 
256
266
 
257
- add_field.call("tableOfContents", mods.at_css("tableOfContents"))
267
+ add_field.call("table_of_contents", mods.at_css("tableOfContents"))
258
268
 
259
269
  mods.css("note").each { |note| add_field.call("notes", note) }
260
270
 
@@ -277,7 +287,7 @@ module Cul
277
287
 
278
288
  if(related_series = mods.at_css("relatedItem[@type='series']"))
279
289
  if(related_series.has_attribute?("ID"))
280
- add_field.call("series", related_series.at_css("titleInfo>title"))
290
+ add_field.call("series_facet", related_series.at_css("titleInfo>title"))
281
291
  end
282
292
  end
283
293
 
@@ -290,23 +300,24 @@ module Cul
290
300
 
291
301
  mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
292
302
  mods.css("subject>geographic").each do |geo|
293
- add_field.call("geographic_area", geo)
303
+ add_field.call("geographic_area_display", geo)
294
304
  add_field.call("geographic_area_search", geo)
295
305
  end
296
306
 
297
- add_field.call("export_as_mla_citation_txt","")
307
+ # This is just a placeholder, reminding us that we need to implement citations in some way
308
+ # add_field.call("export_as_mla_citation_txt","")
298
309
 
299
310
  if(organizations.count > 0)
300
311
  organizations = organizations.uniq
301
312
  organizations.each do |organization|
302
- add_field.call("affiliation_organization", organization)
313
+ add_field.call("organization_facet", organization)
303
314
  end
304
315
  end
305
316
 
306
317
  if(departments.count > 0)
307
318
  departments = departments.uniq
308
319
  departments.each do |department|
309
- add_field.call("affiliation_department", department.to_s.sub(", Department of", "").strip)
320
+ add_field.call("department_facet", department.to_s.sub(", Department of", "").strip)
310
321
  end
311
322
  end
312
323
 
@@ -1,3 +1,9 @@
1
+ begin
2
+ require "active_support/core_ext/array/extract_options"
3
+ rescue
4
+ require "activesupport"
5
+ end
6
+
1
7
  module Cul
2
8
  module Fedora
3
9
  class Server
@@ -6,10 +12,10 @@ module Cul
6
12
 
7
13
  def initialize(*args)
8
14
  options = args.extract_options!
9
- @riurl = options[:riurl] || raise(ArgumentError, "Must provide riurl argument")
10
- @riquery = options[:riquery] || raise(ArgumentError, "Must provide riquery argument")
11
- @hc = options[:http_client]
12
- @logger = options[:logger]
15
+ @riurl = options[:riurl] || options["riurl"] || raise(ArgumentError, "Must provide riurl argument")
16
+ @riquery = options[:riquery] || options["riquery"] || raise(ArgumentError, "Must provide riquery argument")
17
+ @hc = options[:http_client] || options["http_client"]
18
+ @logger = options[:logger] || options["logger"]
13
19
  end
14
20
 
15
21
  def logger
@@ -1,12 +1,18 @@
1
+ begin
2
+ require "active_support/core_ext/array/extract_options"
3
+ rescue
4
+ require "activesupport"
5
+ end
6
+
1
7
  module Cul
2
8
  module Fedora
3
9
  class Solr
4
10
 
5
11
  attr_reader :url
6
12
 
7
- def initialize(config = {})
8
- @url = config[:url] || raise(ArgumentError, "must provide url")
9
- @logger = config[:logger]
13
+ def initialize(options = {})
14
+ @url = options[:url] || options["url"] || raise(ArgumentError, "must provide url")
15
+ @logger = options[:logger] || options["logger"]
10
16
  end
11
17
 
12
18
  def logger
@@ -22,31 +28,53 @@ module Cul
22
28
  end
23
29
 
24
30
  def delete_index
31
+ logger.info "Deleting Solr index..."
25
32
  rsolr.delete_by_query("*:*")
26
33
  rsolr.commit
27
34
  end
28
35
 
29
- def delete_removed(fedora_server)
36
+ def delete_removed(fedora_server, fedora_item_pids = nil)
37
+
38
+ removed = identify_removed(fedora_server)
39
+ logger.info "Deleting items removed from Fedora..."
40
+ removed.each do |id|
41
+ logger.info "Deleting " + id + "..."
42
+ rsolr.delete_by_query("id:" + id.to_s.gsub(/:/,'\\:'))
43
+ end
44
+
45
+ rsolr.commit
30
46
 
47
+ end
48
+
49
+ def identify_removed(fedora_server, fedora_item_pids = nil)
31
50
  start = 0
32
51
  rows = 500
52
+ removed = []
33
53
  results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
34
- logger.info "Deleting items removed from Fedora..."
54
+ logger.info "Identifying items removed from Fedora..."
35
55
  while(!results["response"]["docs"].empty?)
36
56
 
57
+ logger.info("Checking Solr index from " + start.to_s + " to " + (start + rows).to_s + "...")
37
58
  results["response"]["docs"].each do |doc|
38
- if(!fedora_server.item(doc["id"]).exists?)
39
- logger.info "Deleting " + doc["id"] + "..."
40
- rsolr.delete_by_query("id:" + doc["id"].to_s.gsub(/:/,'\\:'))
59
+
60
+ if(fedora_item_pids.nil?)
61
+ if(!fedora_server.item(doc["id"]).exists?)
62
+ logger.info "Noting removed item " + doc["id"] + "..."
63
+ removed << doc["id"].to_s
64
+ end
65
+ else
66
+ if(!fedora_item_pids.include?(doc["id"].to_s))
67
+ logger.info "Noting removed item " + doc["id"] + "..."
68
+ removed << doc["id"].to_s
69
+ end
41
70
  end
71
+
42
72
  end
43
73
 
44
74
  start = start + rows
45
75
  results = rsolr.select({:q => "", :fl => "id", :start => start, :rows => rows})
46
76
  end
47
-
48
- rsolr.commit
49
-
77
+ return removed
50
78
  end
51
79
 
52
80
  def ingest(options = {})
@@ -63,14 +91,9 @@ module Cul
63
91
 
64
92
  delete = options.delete(:delete_removed) || false
65
93
  overwrite = options.delete(:overwrite) || false
66
- process = options.delete(:process) || nil
67
- skip = options.delete(:skip) || nil
68
-
69
- processed_successfully = 0
94
+ skip = options.delete(:skip) || []
70
95
 
71
- if delete == true
72
- delete_removed(fedora_server)
73
- end
96
+ indexed_count = 0
74
97
 
75
98
  logger.info "Preparing the items for indexing..."
76
99
  collections.each do |collection|
@@ -79,24 +102,26 @@ module Cul
79
102
 
80
103
  items.sort!
81
104
 
82
- to_add = []
83
105
  results = Hash.new { |h,k| h[k] = [] }
84
- errors = {}
106
+ errors = []
107
+
108
+ item_pids = []
109
+ items.each do |item|
110
+ item_pids << item.pid
111
+ end
112
+ if delete == true
113
+ delete_removed(fedora_server, item_pids)
114
+ end
85
115
 
86
116
  logger.info "Preparing to index " + items.length.to_s + " items..."
87
117
 
88
118
  items.each do |i|
89
119
 
90
- if(ignore.index(i.pid).nil? == false)
91
- logger.info "Ignoring " + i.pid + "..."
120
+ if(ignore.index(i.pid).nil? == false || skip.index(i.pid).nil? == false)
121
+ logger.info "Ignoring/skipping " + i.pid + "..."
122
+ results[:skipped] << i.pid
92
123
  next
93
124
  end
94
-
95
- if process && skip && skip > 0
96
- skip -= 1
97
- next
98
- end
99
-
100
125
 
101
126
  if item_exists?(i)
102
127
  unless overwrite == true
@@ -104,45 +129,33 @@ module Cul
104
129
  next
105
130
  end
106
131
  end
107
-
108
132
 
109
133
  logger.info "Indexing " + i.pid + "..."
110
134
 
111
135
  result_hash = i.send("index_for_#{format}", options)
112
136
 
113
- results[result_hash[:status]] << i.pid
137
+ results[result_hash[:status]] << i.pid
114
138
 
115
139
  case result_hash[:status]
116
140
  when :success
117
- to_add << result_hash[:results]
118
- processed_successfully += 1
141
+ begin
142
+ rsolr.add(result_hash[:results])
143
+ indexed_count += 1
144
+ rescue Exception => e
145
+ errors << i.pid
146
+ logger.error e.message
147
+ end
119
148
  when :error
120
- errors[i.pid] = result_hash[:error_message]
121
- end
122
-
123
- if process
124
- process -= 1
125
- break if process <= 0
149
+ errors << i.pid
150
+ logger.error result_hash[:error_message]
126
151
  end
127
152
 
128
- if to_add.length >= 500
129
- logger.info "Adding batch to commit queue..."
130
- rsolr.add(to_add)
131
- to_add.clear
132
- end
133
-
134
- end
135
-
136
- if to_add.length > 0
137
- logger.info "Adding batch to commit queue..."
138
- rsolr.add(to_add)
139
- to_add.clear
140
153
  end
141
154
 
142
155
  logger.info "Committing changes to Solr..."
143
156
  rsolr.commit
144
157
 
145
- return {:results => results, :errors => errors, :processed_successfully => processed_successfully}
158
+ return {:results => results, :errors => errors, :indexed_count => indexed_count}
146
159
 
147
160
  end
148
161
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cul-fedora
3
3
  version: !ruby/object:Gem::Version
4
- hash: 51
5
- prerelease: false
4
+ hash: 23
5
+ prerelease:
6
6
  segments:
7
+ - 1
8
+ - 0
7
9
  - 0
8
- - 8
9
- - 6
10
- version: 0.8.6
10
+ version: 1.0.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - James Stuart
@@ -15,8 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-07-14 00:00:00 -04:00
19
- default_executable:
18
+ date: 2011-09-19 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: shoulda
@@ -175,7 +174,6 @@ files:
175
174
  - test/test_fedora_server.rb
176
175
  - test/test_fedora_solr.rb
177
176
  - test_fedora_item.rb
178
- has_rdoc: true
179
177
  homepage: http://github.com/tastyhat/cul-fedora
180
178
  licenses: []
181
179
 
@@ -205,13 +203,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
205
203
  requirements: []
206
204
 
207
205
  rubyforge_project:
208
- rubygems_version: 1.3.7
206
+ rubygems_version: 1.8.6
209
207
  signing_key:
210
208
  specification_version: 3
211
209
  summary: Columbia University Fedora Hooks
212
- test_files:
213
- - test/helper.rb
214
- - test/test_cul-fedora.rb
215
- - test/test_fedora_item.rb
216
- - test/test_fedora_server.rb
217
- - test/test_fedora_solr.rb
210
+ test_files: []
211
+