cul-fedora 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/cul-fedora.gemspec +3 -2
- data/lib/cul-fedora/item.rb +120 -78
- data/lib/cul-fedora/solr.rb +57 -4
- data/lib/tika/scratch/1286827167_3249395 +0 -0
- metadata +5 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.1
|
1
|
+
0.5.2
|
data/cul-fedora.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{cul-fedora}
|
8
|
-
s.version = "0.5.1"
|
8
|
+
s.version = "0.5.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["James Stuart"]
|
12
|
-
s.date = %q{2010-10-
|
12
|
+
s.date = %q{2010-10-12}
|
13
13
|
s.description = %q{Columbia-specific Fedora libraries}
|
14
14
|
s.email = %q{tastyhat@jamesstuart.org}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -47,6 +47,7 @@ Gem::Specification.new do |s|
|
|
47
47
|
"lib/tika/poi-3.5-beta5.jar",
|
48
48
|
"lib/tika/poi-ooxml-3.5-beta5.jar",
|
49
49
|
"lib/tika/poi-scratchpad-3.5-beta5.jar",
|
50
|
+
"lib/tika/scratch/1286827167_3249395",
|
50
51
|
"lib/tika/tika-0.3.jar",
|
51
52
|
"lib/tika/xercesImpl-2.8.1.jar",
|
52
53
|
"lib/tika/xml-apis-1.0.b2.jar",
|
data/lib/cul-fedora/item.rb
CHANGED
@@ -1,11 +1,21 @@
|
|
1
|
+
require "open3"
|
2
|
+
|
1
3
|
module Cul
|
2
4
|
module Fedora
|
3
5
|
class Item
|
4
6
|
attr_reader :server, :pid
|
7
|
+
include Open3
|
5
8
|
|
6
9
|
URI_TO_PID = 'info:fedora/'
|
7
10
|
|
8
11
|
|
12
|
+
def <=>(other)
|
13
|
+
pid <=> other.pid
|
14
|
+
end
|
15
|
+
|
16
|
+
def pid_escaped
|
17
|
+
pid.gsub(/:/,'\\:')
|
18
|
+
end
|
9
19
|
|
10
20
|
def initialize(*args)
|
11
21
|
options = args.extract_options!
|
@@ -38,7 +48,7 @@ module Cul
|
|
38
48
|
|
39
49
|
def risearch_for_members()
|
40
50
|
results = JSON::parse(@server.request(:method => "", :request => "risearch", :format => "json", :lang => "itql", :query => sprintf(@server.riquery, @pid)))["results"]
|
41
|
-
|
51
|
+
|
42
52
|
results.collect { |r| @server.item(r["member"]) }
|
43
53
|
|
44
54
|
end
|
@@ -73,7 +83,14 @@ module Cul
|
|
73
83
|
end
|
74
84
|
end
|
75
85
|
|
76
|
-
def index_for_ac2
|
86
|
+
def index_for_ac2(options = {})
|
87
|
+
do_fulltext = options[:fulltext] || false
|
88
|
+
do_metadata = options[:metadata] || true
|
89
|
+
|
90
|
+
status = :success
|
91
|
+
error_message = ""
|
92
|
+
|
93
|
+
|
77
94
|
results = Hash.new { |h,k| h[k] = [] }
|
78
95
|
normalize_space = lambda { |s| s.to_s.strip.gsub(/\s{2,}/," ") }
|
79
96
|
search_to_content = lambda { |x| x.kind_of?(Nokogiri::XML::Element) ? x.content : x.to_s }
|
@@ -83,125 +100,150 @@ module Cul
|
|
83
100
|
|
84
101
|
roles = ["Author","author","Creator","Thesis Advisor","Collector","Owner","Speaker","Seminar Chairman","Secretary","Rapporteur","Committee Member","Degree Grantor","Moderator","Editor","Interviewee","Interviewer","Organizer of Meeting","Originator","Teacher"]
|
85
102
|
|
86
|
-
collections = self.belongsTo
|
87
|
-
meta = describedBy.first
|
88
103
|
|
89
|
-
|
90
|
-
|
104
|
+
begin
|
105
|
+
collections = self.belongsTo
|
106
|
+
meta = describedBy.first
|
107
|
+
|
108
|
+
meta = Nokogiri::XML(meta.datastream("CONTENT")) if meta
|
109
|
+
mods = meta.at_css("mods") if meta
|
110
|
+
|
111
|
+
if mods && do_metadata
|
112
|
+
# baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
|
113
|
+
add_field.call("id", @pid)
|
114
|
+
add_field.call("internal_h", collections.first.to_s + "/")
|
115
|
+
add_field.call("pid", @pid)
|
116
|
+
collections.each do |collection|
|
117
|
+
add_field.call("member_of", collection)
|
118
|
+
end
|
119
|
+
|
91
120
|
|
92
|
-
return {} unless mods
|
93
|
-
# baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
|
94
|
-
add_field.call("id", @pid)
|
95
|
-
add_field.call("internal_h", collections.first.to_s + "/")
|
96
|
-
add_field.call("pid", @pid)
|
97
|
-
collections.each do |collection|
|
98
|
-
add_field.call("member_of", collection)
|
99
|
-
end
|
100
121
|
|
122
|
+
title = normalize_space.call(mods.css("titleInfo>nonSort,title").collect(&:content).join(" "))
|
123
|
+
add_field.call("title_display", title)
|
124
|
+
add_field.call("title_search", title)
|
101
125
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
add_field.call("title_search", title)
|
126
|
+
all_names = []
|
127
|
+
mods.css("name[@type='personal']").each do |name_node|
|
128
|
+
if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
|
106
129
|
|
107
|
-
|
108
|
-
mods.css("name[@type='personal']").each do |name_node|
|
109
|
-
if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
|
130
|
+
fullname = get_fullname.call(name_node)
|
110
131
|
|
111
|
-
|
132
|
+
all_names << fullname
|
133
|
+
add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
|
134
|
+
add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
|
135
|
+
add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
|
136
|
+
add_field.call("author_search", fullname.downcase)
|
137
|
+
add_field.call("author_facet", fullname)
|
112
138
|
|
113
|
-
|
114
|
-
add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
|
115
|
-
add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
|
116
|
-
add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
|
117
|
-
add_field.call("author_search", fullname.downcase)
|
118
|
-
add_field.call("author_facet", fullname)
|
139
|
+
end
|
119
140
|
|
120
141
|
end
|
121
142
|
|
122
|
-
|
143
|
+
add_field.call("authors_display",all_names.join("; "))
|
144
|
+
add_field.call("date", mods.at_css("*[@keyDate='yes']"))
|
123
145
|
|
124
|
-
|
125
|
-
|
146
|
+
mods.css("genre").each do |genre_node|
|
147
|
+
add_field.call("genre_facet", genre_node)
|
148
|
+
add_field.call("genre_search", genre_node)
|
126
149
|
|
127
|
-
|
128
|
-
add_field.call("genre_facet", genre_node)
|
129
|
-
add_field.call("genre_search", genre_node)
|
150
|
+
end
|
130
151
|
|
131
|
-
end
|
132
152
|
|
153
|
+
add_field.call("abstract", mods.at_css("abstract"))
|
154
|
+
add_field.call("handle", mods.at_css("identifier[@type='hdl']"))
|
133
155
|
|
134
|
-
|
135
|
-
|
156
|
+
mods.css("subject:not([@authority='local'])>topic").each do |topic_node|
|
157
|
+
add_field.call("keyword_search", topic_node.content.downcase)
|
158
|
+
add_field.call("keyword_facet", topic_node)
|
159
|
+
end
|
136
160
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
161
|
+
mods.css("subject[@authority='local']>topic").each do |topic_node|
|
162
|
+
add_field.call("subject", topic_node)
|
163
|
+
add_field.call("subject_search", topic_node)
|
164
|
+
end
|
141
165
|
|
142
|
-
mods.css("subject[@authority='local']>topic").each do |topic_node|
|
143
|
-
add_field.call("subject", topic_node)
|
144
|
-
add_field.call("subject_search", topic_node)
|
145
|
-
end
|
146
166
|
|
167
|
+
add_field.call("tableOfContents", mods.at_css("tableOfContents"))
|
168
|
+
|
169
|
+
mods.css("note").each { |note| add_field.call("notes", note) }
|
147
170
|
|
148
|
-
|
171
|
+
if (related_host = mods.at_css("relatedItem[@type='host']"))
|
172
|
+
book_journal_title = related_host.at_css("titleInfo>title")
|
149
173
|
|
150
|
-
|
174
|
+
if book_journal_title
|
175
|
+
book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
|
151
176
|
|
152
|
-
|
153
|
-
book_journal_title = related_host.at_css("titleInfo>title")
|
177
|
+
book_journal_title = book_journal_title.content + ": " + book_journal_subtitle.content.to_s if book_journal_subtitle
|
154
178
|
|
155
|
-
|
156
|
-
book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
|
179
|
+
end
|
157
180
|
|
158
|
-
|
181
|
+
add_field.call("book_journal_title", book_journal_title)
|
159
182
|
|
183
|
+
add_field.call("book_author", get_fullname.call(related_host.at_css("name")))
|
184
|
+
|
185
|
+
add_field.call("issn", related_host.at_css("identifier[@type='issn']"))
|
160
186
|
end
|
161
187
|
|
162
|
-
add_field.call("
|
188
|
+
add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
|
189
|
+
add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
|
190
|
+
add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
|
191
|
+
add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
|
163
192
|
|
164
|
-
|
193
|
+
mods.css("physicalDescription>internetMediaType").each { |mt| add_field.call("media_type_facet", mt) }
|
165
194
|
|
166
|
-
|
195
|
+
mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
|
196
|
+
mods.css("subject>geographic").each do |geo|
|
197
|
+
add_field.call("geographic_area", geo)
|
198
|
+
add_field.call("geographic_area_search", geo)
|
199
|
+
end
|
167
200
|
end
|
168
201
|
|
169
|
-
add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
|
170
|
-
add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
|
171
|
-
add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
|
172
|
-
add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
|
173
202
|
|
174
|
-
|
203
|
+
if do_fulltext
|
204
|
+
listMembers.each_with_index do |member, i|
|
205
|
+
tika_directory = File.expand_path(File.join(File.expand_path(File.dirname(__FILE__)), "..", "tika"))
|
175
206
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
end
|
207
|
+
resource_file_name = File.join(tika_directory, "scratch", Time.now.to_i.to_s + "_" + rand(10000000).to_s)
|
208
|
+
tika_jar = File.join(tika_directory, "tika-0.3.jar")
|
209
|
+
|
210
|
+
File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
|
181
211
|
|
182
212
|
|
213
|
+
tika_result = []
|
214
|
+
tika_error = []
|
183
215
|
|
184
|
-
|
185
|
-
|
186
|
-
|
216
|
+
Open3.popen3("java -jar #{tika_jar} -t #{resource_file_name}") do |stdin, stdout, stderr|
|
217
|
+
tika_result = stdout.readlines
|
218
|
+
tika_error = stderr.readlines
|
219
|
+
end
|
187
220
|
|
188
|
-
|
189
|
-
|
221
|
+
unless tika_error.empty?
|
222
|
+
status = :error
|
223
|
+
error_message += tika_error.join("\n")
|
224
|
+
else
|
190
225
|
|
191
|
-
File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
|
192
226
|
|
193
|
-
|
194
|
-
|
195
|
-
|
227
|
+
add_field.call("ac.fulltext_#{i}", tika_result)
|
228
|
+
end
|
196
229
|
|
197
|
-
|
198
|
-
|
199
|
-
|
230
|
+
File.delete(resource_file_name)
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
rescue Exception => e
|
235
|
+
status = :error
|
236
|
+
error_message += e.message
|
200
237
|
end
|
201
238
|
|
202
|
-
|
239
|
+
status = :invalid_format if results.empty?
|
240
|
+
|
241
|
+
return {:status => status, :error_message => error_message, :results => results}
|
242
|
+
|
203
243
|
end
|
204
244
|
|
245
|
+
|
246
|
+
|
205
247
|
def to_s
|
206
248
|
@pid
|
207
249
|
end
|
data/lib/cul-fedora/solr.rb
CHANGED
@@ -7,30 +7,83 @@ module Cul
|
|
7
7
|
|
8
8
|
end
|
9
9
|
|
10
|
+
def item_exists?(item)
|
11
|
+
!rsolr.find(:filters => {:id => item.pid_escaped})["response"]["docs"].empty?
|
12
|
+
end
|
13
|
+
|
10
14
|
def rsolr
|
11
15
|
@rsolr ||= RSolr.connect(:url => @url)
|
12
16
|
end
|
13
17
|
|
14
18
|
def ingest(options = {})
|
15
19
|
format = options.delete(:format) || raise(ArgumentError, "needs format")
|
20
|
+
|
16
21
|
items = options.delete(:items) || []
|
17
22
|
items = [items] unless items.kind_of?(Array)
|
18
|
-
|
19
23
|
collections = options.delete(:collections) || []
|
20
24
|
collections = [collections] unless collections.kind_of?(Array)
|
25
|
+
|
26
|
+
overwrite = options.delete(:overwrite) || false
|
27
|
+
process = options.delete(:process) || nil
|
28
|
+
skip = options.delete(:skip) || nil
|
29
|
+
|
30
|
+
|
31
|
+
|
21
32
|
collections.each do |collection|
|
22
33
|
items |= collection.listMembers
|
23
34
|
end
|
24
35
|
|
36
|
+
items.sort!
|
37
|
+
|
38
|
+
to_add = []
|
39
|
+
results = Hash.new { |h,k| h[k] = [] }
|
40
|
+
errors = {}
|
41
|
+
|
42
|
+
items.each do |i|
|
43
|
+
if process && skip && skip > 0
|
44
|
+
skip -= 1
|
45
|
+
next
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
if item_exists?(i)
|
50
|
+
|
51
|
+
unless overwrite
|
52
|
+
results[:skipped] << i.pid
|
53
|
+
next
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
25
57
|
|
26
58
|
|
27
|
-
|
28
|
-
|
59
|
+
|
60
|
+
result_hash = i.send("index_for_#{format}", options)
|
61
|
+
|
62
|
+
results[result_hash[:status]] << i.pid
|
63
|
+
|
64
|
+
case result_hash[:status]
|
65
|
+
when :success
|
66
|
+
to_add << result_hash[:results]
|
67
|
+
when :error
|
68
|
+
errors[i.pid] = result_hash[:error_message]
|
69
|
+
end
|
70
|
+
|
71
|
+
if process
|
72
|
+
process -= 1
|
73
|
+
break if process <= 0
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
rsolr.add(to_add)
|
29
79
|
rsolr.commit
|
80
|
+
|
81
|
+
return {:results => results, :errors => errors}
|
82
|
+
|
30
83
|
end
|
31
84
|
|
32
85
|
end
|
33
|
-
|
86
|
+
|
34
87
|
|
35
88
|
end
|
36
89
|
end
|
Binary file
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cul-fedora
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 5
|
9
|
-
- 1
|
10
|
-
version: 0.5.1
|
9
|
+
- 2
|
10
|
+
version: 0.5.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- James Stuart
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-10-
|
18
|
+
date: 2010-10-12 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -158,6 +158,7 @@ files:
|
|
158
158
|
- lib/tika/poi-3.5-beta5.jar
|
159
159
|
- lib/tika/poi-ooxml-3.5-beta5.jar
|
160
160
|
- lib/tika/poi-scratchpad-3.5-beta5.jar
|
161
|
+
- lib/tika/scratch/1286827167_3249395
|
161
162
|
- lib/tika/tika-0.3.jar
|
162
163
|
- lib/tika/xercesImpl-2.8.1.jar
|
163
164
|
- lib/tika/xml-apis-1.0.b2.jar
|