cul-fedora 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.5.2
data/cul-fedora.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cul-fedora}
8
- s.version = "0.5.1"
8
+ s.version = "0.5.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["James Stuart"]
12
- s.date = %q{2010-10-11}
12
+ s.date = %q{2010-10-12}
13
13
  s.description = %q{Columbia-specific Fedora libraries}
14
14
  s.email = %q{tastyhat@jamesstuart.org}
15
15
  s.extra_rdoc_files = [
@@ -47,6 +47,7 @@ Gem::Specification.new do |s|
47
47
  "lib/tika/poi-3.5-beta5.jar",
48
48
  "lib/tika/poi-ooxml-3.5-beta5.jar",
49
49
  "lib/tika/poi-scratchpad-3.5-beta5.jar",
50
+ "lib/tika/scratch/1286827167_3249395",
50
51
  "lib/tika/tika-0.3.jar",
51
52
  "lib/tika/xercesImpl-2.8.1.jar",
52
53
  "lib/tika/xml-apis-1.0.b2.jar",
@@ -1,11 +1,21 @@
1
+ require "open3"
2
+
1
3
  module Cul
2
4
  module Fedora
3
5
  class Item
4
6
  attr_reader :server, :pid
7
+ include Open3
5
8
 
6
9
  URI_TO_PID = 'info:fedora/'
7
10
 
8
11
 
12
+ def <=>(other)
13
+ pid <=> other.pid
14
+ end
15
+
16
+ def pid_escaped
17
+ pid.gsub(/:/,'\\:')
18
+ end
9
19
 
10
20
  def initialize(*args)
11
21
  options = args.extract_options!
@@ -38,7 +48,7 @@ module Cul
38
48
 
39
49
  def risearch_for_members()
40
50
  results = JSON::parse(@server.request(:method => "", :request => "risearch", :format => "json", :lang => "itql", :query => sprintf(@server.riquery, @pid)))["results"]
41
-
51
+
42
52
  results.collect { |r| @server.item(r["member"]) }
43
53
 
44
54
  end
@@ -73,7 +83,14 @@ module Cul
73
83
  end
74
84
  end
75
85
 
76
- def index_for_ac2
86
+ def index_for_ac2(options = {})
87
+ do_fulltext = options[:fulltext] || false
88
+ do_metadata = options[:metadata] || true
89
+
90
+ status = :success
91
+ error_message = ""
92
+
93
+
77
94
  results = Hash.new { |h,k| h[k] = [] }
78
95
  normalize_space = lambda { |s| s.to_s.strip.gsub(/\s{2,}/," ") }
79
96
  search_to_content = lambda { |x| x.kind_of?(Nokogiri::XML::Element) ? x.content : x.to_s }
@@ -83,125 +100,150 @@ module Cul
83
100
 
84
101
  roles = ["Author","author","Creator","Thesis Advisor","Collector","Owner","Speaker","Seminar Chairman","Secretary","Rapporteur","Committee Member","Degree Grantor","Moderator","Editor","Interviewee","Interviewer","Organizer of Meeting","Originator","Teacher"]
85
102
 
86
- collections = self.belongsTo
87
- meta = describedBy.first
88
103
 
89
- meta = Nokogiri::XML(meta.datastream("CONTENT")) if meta
90
- mods = meta.at_css("mods") if meta
104
+ begin
105
+ collections = self.belongsTo
106
+ meta = describedBy.first
107
+
108
+ meta = Nokogiri::XML(meta.datastream("CONTENT")) if meta
109
+ mods = meta.at_css("mods") if meta
110
+
111
+ if mods && do_metadata
112
+ # baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
113
+ add_field.call("id", @pid)
114
+ add_field.call("internal_h", collections.first.to_s + "/")
115
+ add_field.call("pid", @pid)
116
+ collections.each do |collection|
117
+ add_field.call("member_of", collection)
118
+ end
119
+
91
120
 
92
- return {} unless mods
93
- # baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
94
- add_field.call("id", @pid)
95
- add_field.call("internal_h", collections.first.to_s + "/")
96
- add_field.call("pid", @pid)
97
- collections.each do |collection|
98
- add_field.call("member_of", collection)
99
- end
100
121
 
122
+ title = normalize_space.call(mods.css("titleInfo>nonSort,title").collect(&:content).join(" "))
123
+ add_field.call("title_display", title)
124
+ add_field.call("title_search", title)
101
125
 
102
-
103
- title = normalize_space.call(mods.css("titleInfo>nonSort,title").collect(&:content).join(" "))
104
- add_field.call("title_display", title)
105
- add_field.call("title_search", title)
126
+ all_names = []
127
+ mods.css("name[@type='personal']").each do |name_node|
128
+ if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
106
129
 
107
- all_names = []
108
- mods.css("name[@type='personal']").each do |name_node|
109
- if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
130
+ fullname = get_fullname.call(name_node)
110
131
 
111
- fullname = get_fullname.call(name_node)
132
+ all_names << fullname
133
+ add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
134
+ add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
135
+ add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
136
+ add_field.call("author_search", fullname.downcase)
137
+ add_field.call("author_facet", fullname)
112
138
 
113
- all_names << fullname
114
- add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
115
- add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
116
- add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
117
- add_field.call("author_search", fullname.downcase)
118
- add_field.call("author_facet", fullname)
139
+ end
119
140
 
120
141
  end
121
142
 
122
- end
143
+ add_field.call("authors_display",all_names.join("; "))
144
+ add_field.call("date", mods.at_css("*[@keyDate='yes']"))
123
145
 
124
- add_field.call("authors_display",all_names.join("; "))
125
- add_field.call("date", mods.at_css("*[@keyDate='yes']"))
146
+ mods.css("genre").each do |genre_node|
147
+ add_field.call("genre_facet", genre_node)
148
+ add_field.call("genre_search", genre_node)
126
149
 
127
- mods.css("genre").each do |genre_node|
128
- add_field.call("genre_facet", genre_node)
129
- add_field.call("genre_search", genre_node)
150
+ end
130
151
 
131
- end
132
152
 
153
+ add_field.call("abstract", mods.at_css("abstract"))
154
+ add_field.call("handle", mods.at_css("identifier[@type='hdl']"))
133
155
 
134
- add_field.call("abstract", mods.at_css("abstract"))
135
- add_field.call("handle", mods.at_css("identifier[@type='hdl']"))
156
+ mods.css("subject:not([@authority='local'])>topic").each do |topic_node|
157
+ add_field.call("keyword_search", topic_node.content.downcase)
158
+ add_field.call("keyword_facet", topic_node)
159
+ end
136
160
 
137
- mods.css("subject:not([@authority='local'])>topic").each do |topic_node|
138
- add_field.call("keyword_search", topic_node.content.downcase)
139
- add_field.call("keyword_facet", topic_node)
140
- end
161
+ mods.css("subject[@authority='local']>topic").each do |topic_node|
162
+ add_field.call("subject", topic_node)
163
+ add_field.call("subject_search", topic_node)
164
+ end
141
165
 
142
- mods.css("subject[@authority='local']>topic").each do |topic_node|
143
- add_field.call("subject", topic_node)
144
- add_field.call("subject_search", topic_node)
145
- end
146
166
 
167
+ add_field.call("tableOfContents", mods.at_css("tableOfContents"))
168
+
169
+ mods.css("note").each { |note| add_field.call("notes", note) }
147
170
 
148
- add_field.call("tableOfContents", mods.at_css("tableOfContents"))
171
+ if (related_host = mods.at_css("relatedItem[@type='host']"))
172
+ book_journal_title = related_host.at_css("titleInfo>title")
149
173
 
150
- mods.css("note").each { |note| add_field.call("notes", note) }
174
+ if book_journal_title
175
+ book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
151
176
 
152
- if (related_host = mods.at_css("relatedItem[@type='host']"))
153
- book_journal_title = related_host.at_css("titleInfo>title")
177
+ book_journal_title = book_journal_title.content + ": " + book_journal_subtitle.content.to_s if book_journal_subtitle
154
178
 
155
- if book_journal_title
156
- book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
179
+ end
157
180
 
158
- book_journal_title = book_journal_title.content + ": " + book_journal_subtitle.content.to_s if book_journal_subtitle
181
+ add_field.call("book_journal_title", book_journal_title)
159
182
 
183
+ add_field.call("book_author", get_fullname.call(related_host.at_css("name")))
184
+
185
+ add_field.call("issn", related_host.at_css("identifier[@type='issn']"))
160
186
  end
161
187
 
162
- add_field.call("book_journal_title", book_journal_title)
188
+ add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
189
+ add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
190
+ add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
191
+ add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
163
192
 
164
- add_field.call("book_author", get_fullname.call(related_host.at_css("name")))
193
+ mods.css("physicalDescription>internetMediaType").each { |mt| add_field.call("media_type_facet", mt) }
165
194
 
166
- add_field.call("issn", related_host.at_css("identifier[@type='issn']"))
195
+ mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
196
+ mods.css("subject>geographic").each do |geo|
197
+ add_field.call("geographic_area", geo)
198
+ add_field.call("geographic_area_search", geo)
199
+ end
167
200
  end
168
201
 
169
- add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
170
- add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
171
- add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
172
- add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
173
202
 
174
- mods.css("physicalDescription>internetMediaType").each { |mt| add_field.call("media_type_facet", mt) }
203
+ if do_fulltext
204
+ listMembers.each_with_index do |member, i|
205
+ tika_directory = File.expand_path(File.join(File.expand_path(File.dirname(__FILE__)), "..", "tika"))
175
206
 
176
- mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
177
- mods.css("subject>geographic").each do |geo|
178
- add_field.call("geographic_area", geo)
179
- add_field.call("geographic_area_search", geo)
180
- end
207
+ resource_file_name = File.join(tika_directory, "scratch", Time.now.to_i.to_s + "_" + rand(10000000).to_s)
208
+ tika_jar = File.join(tika_directory, "tika-0.3.jar")
209
+
210
+ File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
181
211
 
182
212
 
213
+ tika_result = []
214
+ tika_error = []
183
215
 
184
-
185
- listMembers.each_with_index do |member, i|
186
- tika_directory = File.expand_path(File.join(File.expand_path(File.dirname(__FILE__)), "..", "tika"))
216
+ Open3.popen3("java -jar #{tika_jar} -t #{resource_file_name}") do |stdin, stdout, stderr|
217
+ tika_result = stdout.readlines
218
+ tika_error = stderr.readlines
219
+ end
187
220
 
188
- resource_file_name = File.join(tika_directory, "scratch", Time.now.to_i.to_s + "_" + rand(10000000).to_s)
189
- tika_jar = File.join(tika_directory, "tika-0.3.jar")
221
+ unless tika_error.empty?
222
+ status = :error
223
+ error_message += tika_error.join("\n")
224
+ else
190
225
 
191
- File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
192
226
 
193
-
194
- tika_result = %x[java -jar #{tika_jar} -t #{resource_file_name}]
195
-
227
+ add_field.call("ac.fulltext_#{i}", tika_result)
228
+ end
196
229
 
197
- add_field.call("ac.fulltext_#{i}", tika_result)
198
-
199
- File.delete(resource_file_name)
230
+ File.delete(resource_file_name)
231
+ end
232
+ end
233
+
234
+ rescue Exception => e
235
+ status = :error
236
+ error_message += e.message
200
237
  end
201
238
 
202
- return results
239
+ status = :invalid_format if results.empty?
240
+
241
+ return {:status => status, :error_message => error_message, :results => results}
242
+
203
243
  end
204
244
 
245
+
246
+
205
247
  def to_s
206
248
  @pid
207
249
  end
@@ -7,30 +7,83 @@ module Cul
7
7
 
8
8
  end
9
9
 
10
+ def item_exists?(item)
11
+ !rsolr.find(:filters => {:id => item.pid_escaped})["response"]["docs"].empty?
12
+ end
13
+
10
14
  def rsolr
11
15
  @rsolr ||= RSolr.connect(:url => @url)
12
16
  end
13
17
 
14
18
  def ingest(options = {})
15
19
  format = options.delete(:format) || raise(ArgumentError, "needs format")
20
+
16
21
  items = options.delete(:items) || []
17
22
  items = [items] unless items.kind_of?(Array)
18
-
19
23
  collections = options.delete(:collections) || []
20
24
  collections = [collections] unless collections.kind_of?(Array)
25
+
26
+ overwrite = options.delete(:overwrite) || false
27
+ process = options.delete(:process) || nil
28
+ skip = options.delete(:skip) || nil
29
+
30
+
31
+
21
32
  collections.each do |collection|
22
33
  items |= collection.listMembers
23
34
  end
24
35
 
36
+ items.sort!
37
+
38
+ to_add = []
39
+ results = Hash.new { |h,k| h[k] = [] }
40
+ errors = {}
41
+
42
+ items.each do |i|
43
+ if process && skip && skip > 0
44
+ skip -= 1
45
+ next
46
+ end
47
+
48
+
49
+ if item_exists?(i)
50
+
51
+ unless overwrite
52
+ results[:skipped] << i.pid
53
+ next
54
+ end
55
+ end
56
+
25
57
 
26
58
 
27
- rsolr.add(items.collect { |i| i.send("index_for_#{format}")}.reject { |doc| doc == {}})
28
-
59
+
60
+ result_hash = i.send("index_for_#{format}", options)
61
+
62
+ results[result_hash[:status]] << i.pid
63
+
64
+ case result_hash[:status]
65
+ when :success
66
+ to_add << result_hash[:results]
67
+ when :error
68
+ errors[i.pid] = result_hash[:error_message]
69
+ end
70
+
71
+ if process
72
+ process -= 1
73
+ break if process <= 0
74
+ end
75
+
76
+ end
77
+
78
+ rsolr.add(to_add)
29
79
  rsolr.commit
80
+
81
+ return {:results => results, :errors => errors}
82
+
30
83
  end
31
84
 
32
85
  end
33
-
86
+
34
87
 
35
88
  end
36
89
  end
Binary file
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cul-fedora
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 1
10
- version: 0.5.1
9
+ - 2
10
+ version: 0.5.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - James Stuart
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-10-11 00:00:00 -04:00
18
+ date: 2010-10-12 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -158,6 +158,7 @@ files:
158
158
  - lib/tika/poi-3.5-beta5.jar
159
159
  - lib/tika/poi-ooxml-3.5-beta5.jar
160
160
  - lib/tika/poi-scratchpad-3.5-beta5.jar
161
+ - lib/tika/scratch/1286827167_3249395
161
162
  - lib/tika/tika-0.3.jar
162
163
  - lib/tika/xercesImpl-2.8.1.jar
163
164
  - lib/tika/xml-apis-1.0.b2.jar