cul-fedora 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.5.2
data/cul-fedora.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cul-fedora}
8
- s.version = "0.5.1"
8
+ s.version = "0.5.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["James Stuart"]
12
- s.date = %q{2010-10-11}
12
+ s.date = %q{2010-10-12}
13
13
  s.description = %q{Columbia-specific Fedora libraries}
14
14
  s.email = %q{tastyhat@jamesstuart.org}
15
15
  s.extra_rdoc_files = [
@@ -47,6 +47,7 @@ Gem::Specification.new do |s|
47
47
  "lib/tika/poi-3.5-beta5.jar",
48
48
  "lib/tika/poi-ooxml-3.5-beta5.jar",
49
49
  "lib/tika/poi-scratchpad-3.5-beta5.jar",
50
+ "lib/tika/scratch/1286827167_3249395",
50
51
  "lib/tika/tika-0.3.jar",
51
52
  "lib/tika/xercesImpl-2.8.1.jar",
52
53
  "lib/tika/xml-apis-1.0.b2.jar",
@@ -1,11 +1,21 @@
1
+ require "open3"
2
+
1
3
  module Cul
2
4
  module Fedora
3
5
  class Item
4
6
  attr_reader :server, :pid
7
+ include Open3
5
8
 
6
9
  URI_TO_PID = 'info:fedora/'
7
10
 
8
11
 
12
+ def <=>(other)
13
+ pid <=> other.pid
14
+ end
15
+
16
+ def pid_escaped
17
+ pid.gsub(/:/,'\\:')
18
+ end
9
19
 
10
20
  def initialize(*args)
11
21
  options = args.extract_options!
@@ -38,7 +48,7 @@ module Cul
38
48
 
39
49
  def risearch_for_members()
40
50
  results = JSON::parse(@server.request(:method => "", :request => "risearch", :format => "json", :lang => "itql", :query => sprintf(@server.riquery, @pid)))["results"]
41
-
51
+
42
52
  results.collect { |r| @server.item(r["member"]) }
43
53
 
44
54
  end
@@ -73,7 +83,14 @@ module Cul
73
83
  end
74
84
  end
75
85
 
76
- def index_for_ac2
86
+ def index_for_ac2(options = {})
87
+ do_fulltext = options[:fulltext] || false
88
+ do_metadata = options[:metadata] || true
89
+
90
+ status = :success
91
+ error_message = ""
92
+
93
+
77
94
  results = Hash.new { |h,k| h[k] = [] }
78
95
  normalize_space = lambda { |s| s.to_s.strip.gsub(/\s{2,}/," ") }
79
96
  search_to_content = lambda { |x| x.kind_of?(Nokogiri::XML::Element) ? x.content : x.to_s }
@@ -83,125 +100,150 @@ module Cul
83
100
 
84
101
  roles = ["Author","author","Creator","Thesis Advisor","Collector","Owner","Speaker","Seminar Chairman","Secretary","Rapporteur","Committee Member","Degree Grantor","Moderator","Editor","Interviewee","Interviewer","Organizer of Meeting","Originator","Teacher"]
85
102
 
86
- collections = self.belongsTo
87
- meta = describedBy.first
88
103
 
89
- meta = Nokogiri::XML(meta.datastream("CONTENT")) if meta
90
- mods = meta.at_css("mods") if meta
104
+ begin
105
+ collections = self.belongsTo
106
+ meta = describedBy.first
107
+
108
+ meta = Nokogiri::XML(meta.datastream("CONTENT")) if meta
109
+ mods = meta.at_css("mods") if meta
110
+
111
+ if mods && do_metadata
112
+ # baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
113
+ add_field.call("id", @pid)
114
+ add_field.call("internal_h", collections.first.to_s + "/")
115
+ add_field.call("pid", @pid)
116
+ collections.each do |collection|
117
+ add_field.call("member_of", collection)
118
+ end
119
+
91
120
 
92
- return {} unless mods
93
- # baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
94
- add_field.call("id", @pid)
95
- add_field.call("internal_h", collections.first.to_s + "/")
96
- add_field.call("pid", @pid)
97
- collections.each do |collection|
98
- add_field.call("member_of", collection)
99
- end
100
121
 
122
+ title = normalize_space.call(mods.css("titleInfo>nonSort,title").collect(&:content).join(" "))
123
+ add_field.call("title_display", title)
124
+ add_field.call("title_search", title)
101
125
 
102
-
103
- title = normalize_space.call(mods.css("titleInfo>nonSort,title").collect(&:content).join(" "))
104
- add_field.call("title_display", title)
105
- add_field.call("title_search", title)
126
+ all_names = []
127
+ mods.css("name[@type='personal']").each do |name_node|
128
+ if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
106
129
 
107
- all_names = []
108
- mods.css("name[@type='personal']").each do |name_node|
109
- if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
130
+ fullname = get_fullname.call(name_node)
110
131
 
111
- fullname = get_fullname.call(name_node)
132
+ all_names << fullname
133
+ add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
134
+ add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
135
+ add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
136
+ add_field.call("author_search", fullname.downcase)
137
+ add_field.call("author_facet", fullname)
112
138
 
113
- all_names << fullname
114
- add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
115
- add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
116
- add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
117
- add_field.call("author_search", fullname.downcase)
118
- add_field.call("author_facet", fullname)
139
+ end
119
140
 
120
141
  end
121
142
 
122
- end
143
+ add_field.call("authors_display",all_names.join("; "))
144
+ add_field.call("date", mods.at_css("*[@keyDate='yes']"))
123
145
 
124
- add_field.call("authors_display",all_names.join("; "))
125
- add_field.call("date", mods.at_css("*[@keyDate='yes']"))
146
+ mods.css("genre").each do |genre_node|
147
+ add_field.call("genre_facet", genre_node)
148
+ add_field.call("genre_search", genre_node)
126
149
 
127
- mods.css("genre").each do |genre_node|
128
- add_field.call("genre_facet", genre_node)
129
- add_field.call("genre_search", genre_node)
150
+ end
130
151
 
131
- end
132
152
 
153
+ add_field.call("abstract", mods.at_css("abstract"))
154
+ add_field.call("handle", mods.at_css("identifier[@type='hdl']"))
133
155
 
134
- add_field.call("abstract", mods.at_css("abstract"))
135
- add_field.call("handle", mods.at_css("identifier[@type='hdl']"))
156
+ mods.css("subject:not([@authority='local'])>topic").each do |topic_node|
157
+ add_field.call("keyword_search", topic_node.content.downcase)
158
+ add_field.call("keyword_facet", topic_node)
159
+ end
136
160
 
137
- mods.css("subject:not([@authority='local'])>topic").each do |topic_node|
138
- add_field.call("keyword_search", topic_node.content.downcase)
139
- add_field.call("keyword_facet", topic_node)
140
- end
161
+ mods.css("subject[@authority='local']>topic").each do |topic_node|
162
+ add_field.call("subject", topic_node)
163
+ add_field.call("subject_search", topic_node)
164
+ end
141
165
 
142
- mods.css("subject[@authority='local']>topic").each do |topic_node|
143
- add_field.call("subject", topic_node)
144
- add_field.call("subject_search", topic_node)
145
- end
146
166
 
167
+ add_field.call("tableOfContents", mods.at_css("tableOfContents"))
168
+
169
+ mods.css("note").each { |note| add_field.call("notes", note) }
147
170
 
148
- add_field.call("tableOfContents", mods.at_css("tableOfContents"))
171
+ if (related_host = mods.at_css("relatedItem[@type='host']"))
172
+ book_journal_title = related_host.at_css("titleInfo>title")
149
173
 
150
- mods.css("note").each { |note| add_field.call("notes", note) }
174
+ if book_journal_title
175
+ book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
151
176
 
152
- if (related_host = mods.at_css("relatedItem[@type='host']"))
153
- book_journal_title = related_host.at_css("titleInfo>title")
177
+ book_journal_title = book_journal_title.content + ": " + book_journal_subtitle.content.to_s if book_journal_subtitle
154
178
 
155
- if book_journal_title
156
- book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
179
+ end
157
180
 
158
- book_journal_title = book_journal_title.content + ": " + book_journal_subtitle.content.to_s if book_journal_subtitle
181
+ add_field.call("book_journal_title", book_journal_title)
159
182
 
183
+ add_field.call("book_author", get_fullname.call(related_host.at_css("name")))
184
+
185
+ add_field.call("issn", related_host.at_css("identifier[@type='issn']"))
160
186
  end
161
187
 
162
- add_field.call("book_journal_title", book_journal_title)
188
+ add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
189
+ add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
190
+ add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
191
+ add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
163
192
 
164
- add_field.call("book_author", get_fullname.call(related_host.at_css("name")))
193
+ mods.css("physicalDescription>internetMediaType").each { |mt| add_field.call("media_type_facet", mt) }
165
194
 
166
- add_field.call("issn", related_host.at_css("identifier[@type='issn']"))
195
+ mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
196
+ mods.css("subject>geographic").each do |geo|
197
+ add_field.call("geographic_area", geo)
198
+ add_field.call("geographic_area_search", geo)
199
+ end
167
200
  end
168
201
 
169
- add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
170
- add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
171
- add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
172
- add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
173
202
 
174
- mods.css("physicalDescription>internetMediaType").each { |mt| add_field.call("media_type_facet", mt) }
203
+ if do_fulltext
204
+ listMembers.each_with_index do |member, i|
205
+ tika_directory = File.expand_path(File.join(File.expand_path(File.dirname(__FILE__)), "..", "tika"))
175
206
 
176
- mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
177
- mods.css("subject>geographic").each do |geo|
178
- add_field.call("geographic_area", geo)
179
- add_field.call("geographic_area_search", geo)
180
- end
207
+ resource_file_name = File.join(tika_directory, "scratch", Time.now.to_i.to_s + "_" + rand(10000000).to_s)
208
+ tika_jar = File.join(tika_directory, "tika-0.3.jar")
209
+
210
+ File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
181
211
 
182
212
 
213
+ tika_result = []
214
+ tika_error = []
183
215
 
184
-
185
- listMembers.each_with_index do |member, i|
186
- tika_directory = File.expand_path(File.join(File.expand_path(File.dirname(__FILE__)), "..", "tika"))
216
+ Open3.popen3("java -jar #{tika_jar} -t #{resource_file_name}") do |stdin, stdout, stderr|
217
+ tika_result = stdout.readlines
218
+ tika_error = stderr.readlines
219
+ end
187
220
 
188
- resource_file_name = File.join(tika_directory, "scratch", Time.now.to_i.to_s + "_" + rand(10000000).to_s)
189
- tika_jar = File.join(tika_directory, "tika-0.3.jar")
221
+ unless tika_error.empty?
222
+ status = :error
223
+ error_message += tika_error.join("\n")
224
+ else
190
225
 
191
- File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
192
226
 
193
-
194
- tika_result = %x[java -jar #{tika_jar} -t #{resource_file_name}]
195
-
227
+ add_field.call("ac.fulltext_#{i}", tika_result)
228
+ end
196
229
 
197
- add_field.call("ac.fulltext_#{i}", tika_result)
198
-
199
- File.delete(resource_file_name)
230
+ File.delete(resource_file_name)
231
+ end
232
+ end
233
+
234
+ rescue Exception => e
235
+ status = :error
236
+ error_message += e.message
200
237
  end
201
238
 
202
- return results
239
+ status = :invalid_format if results.empty?
240
+
241
+ return {:status => status, :error_message => error_message, :results => results}
242
+
203
243
  end
204
244
 
245
+
246
+
205
247
  def to_s
206
248
  @pid
207
249
  end
@@ -7,30 +7,83 @@ module Cul
7
7
 
8
8
  end
9
9
 
10
+ def item_exists?(item)
11
+ !rsolr.find(:filters => {:id => item.pid_escaped})["response"]["docs"].empty?
12
+ end
13
+
10
14
  def rsolr
11
15
  @rsolr ||= RSolr.connect(:url => @url)
12
16
  end
13
17
 
14
18
  def ingest(options = {})
15
19
  format = options.delete(:format) || raise(ArgumentError, "needs format")
20
+
16
21
  items = options.delete(:items) || []
17
22
  items = [items] unless items.kind_of?(Array)
18
-
19
23
  collections = options.delete(:collections) || []
20
24
  collections = [collections] unless collections.kind_of?(Array)
25
+
26
+ overwrite = options.delete(:overwrite) || false
27
+ process = options.delete(:process) || nil
28
+ skip = options.delete(:skip) || nil
29
+
30
+
31
+
21
32
  collections.each do |collection|
22
33
  items |= collection.listMembers
23
34
  end
24
35
 
36
+ items.sort!
37
+
38
+ to_add = []
39
+ results = Hash.new { |h,k| h[k] = [] }
40
+ errors = {}
41
+
42
+ items.each do |i|
43
+ if process && skip && skip > 0
44
+ skip -= 1
45
+ next
46
+ end
47
+
48
+
49
+ if item_exists?(i)
50
+
51
+ unless overwrite
52
+ results[:skipped] << i.pid
53
+ next
54
+ end
55
+ end
56
+
25
57
 
26
58
 
27
- rsolr.add(items.collect { |i| i.send("index_for_#{format}")}.reject { |doc| doc == {}})
28
-
59
+
60
+ result_hash = i.send("index_for_#{format}", options)
61
+
62
+ results[result_hash[:status]] << i.pid
63
+
64
+ case result_hash[:status]
65
+ when :success
66
+ to_add << result_hash[:results]
67
+ when :error
68
+ errors[i.pid] = result_hash[:error_message]
69
+ end
70
+
71
+ if process
72
+ process -= 1
73
+ break if process <= 0
74
+ end
75
+
76
+ end
77
+
78
+ rsolr.add(to_add)
29
79
  rsolr.commit
80
+
81
+ return {:results => results, :errors => errors}
82
+
30
83
  end
31
84
 
32
85
  end
33
-
86
+
34
87
 
35
88
  end
36
89
  end
Binary file
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cul-fedora
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 1
10
- version: 0.5.1
9
+ - 2
10
+ version: 0.5.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - James Stuart
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-10-11 00:00:00 -04:00
18
+ date: 2010-10-12 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -158,6 +158,7 @@ files:
158
158
  - lib/tika/poi-3.5-beta5.jar
159
159
  - lib/tika/poi-ooxml-3.5-beta5.jar
160
160
  - lib/tika/poi-scratchpad-3.5-beta5.jar
161
+ - lib/tika/scratch/1286827167_3249395
161
162
  - lib/tika/tika-0.3.jar
162
163
  - lib/tika/xercesImpl-2.8.1.jar
163
164
  - lib/tika/xml-apis-1.0.b2.jar