cul-fedora 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/cul-fedora.gemspec +3 -2
- data/lib/cul-fedora/item.rb +120 -78
- data/lib/cul-fedora/solr.rb +57 -4
- data/lib/tika/scratch/1286827167_3249395 +0 -0
- metadata +5 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.
|
1
|
+
0.5.2
|
data/cul-fedora.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{cul-fedora}
|
8
|
-
s.version = "0.5.
|
8
|
+
s.version = "0.5.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["James Stuart"]
|
12
|
-
s.date = %q{2010-10-
|
12
|
+
s.date = %q{2010-10-12}
|
13
13
|
s.description = %q{Columbia-specific Fedora libraries}
|
14
14
|
s.email = %q{tastyhat@jamesstuart.org}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -47,6 +47,7 @@ Gem::Specification.new do |s|
|
|
47
47
|
"lib/tika/poi-3.5-beta5.jar",
|
48
48
|
"lib/tika/poi-ooxml-3.5-beta5.jar",
|
49
49
|
"lib/tika/poi-scratchpad-3.5-beta5.jar",
|
50
|
+
"lib/tika/scratch/1286827167_3249395",
|
50
51
|
"lib/tika/tika-0.3.jar",
|
51
52
|
"lib/tika/xercesImpl-2.8.1.jar",
|
52
53
|
"lib/tika/xml-apis-1.0.b2.jar",
|
data/lib/cul-fedora/item.rb
CHANGED
@@ -1,11 +1,21 @@
|
|
1
|
+
require "open3"
|
2
|
+
|
1
3
|
module Cul
|
2
4
|
module Fedora
|
3
5
|
class Item
|
4
6
|
attr_reader :server, :pid
|
7
|
+
include Open3
|
5
8
|
|
6
9
|
URI_TO_PID = 'info:fedora/'
|
7
10
|
|
8
11
|
|
12
|
+
def <=>(other)
|
13
|
+
pid <=> other.pid
|
14
|
+
end
|
15
|
+
|
16
|
+
def pid_escaped
|
17
|
+
pid.gsub(/:/,'\\:')
|
18
|
+
end
|
9
19
|
|
10
20
|
def initialize(*args)
|
11
21
|
options = args.extract_options!
|
@@ -38,7 +48,7 @@ module Cul
|
|
38
48
|
|
39
49
|
def risearch_for_members()
|
40
50
|
results = JSON::parse(@server.request(:method => "", :request => "risearch", :format => "json", :lang => "itql", :query => sprintf(@server.riquery, @pid)))["results"]
|
41
|
-
|
51
|
+
|
42
52
|
results.collect { |r| @server.item(r["member"]) }
|
43
53
|
|
44
54
|
end
|
@@ -73,7 +83,14 @@ module Cul
|
|
73
83
|
end
|
74
84
|
end
|
75
85
|
|
76
|
-
def index_for_ac2
|
86
|
+
def index_for_ac2(options = {})
|
87
|
+
do_fulltext = options[:fulltext] || false
|
88
|
+
do_metadata = options[:metadata] || true
|
89
|
+
|
90
|
+
status = :success
|
91
|
+
error_message = ""
|
92
|
+
|
93
|
+
|
77
94
|
results = Hash.new { |h,k| h[k] = [] }
|
78
95
|
normalize_space = lambda { |s| s.to_s.strip.gsub(/\s{2,}/," ") }
|
79
96
|
search_to_content = lambda { |x| x.kind_of?(Nokogiri::XML::Element) ? x.content : x.to_s }
|
@@ -83,125 +100,150 @@ module Cul
|
|
83
100
|
|
84
101
|
roles = ["Author","author","Creator","Thesis Advisor","Collector","Owner","Speaker","Seminar Chairman","Secretary","Rapporteur","Committee Member","Degree Grantor","Moderator","Editor","Interviewee","Interviewer","Organizer of Meeting","Originator","Teacher"]
|
85
102
|
|
86
|
-
collections = self.belongsTo
|
87
|
-
meta = describedBy.first
|
88
103
|
|
89
|
-
|
90
|
-
|
104
|
+
begin
|
105
|
+
collections = self.belongsTo
|
106
|
+
meta = describedBy.first
|
107
|
+
|
108
|
+
meta = Nokogiri::XML(meta.datastream("CONTENT")) if meta
|
109
|
+
mods = meta.at_css("mods") if meta
|
110
|
+
|
111
|
+
if mods && do_metadata
|
112
|
+
# baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
|
113
|
+
add_field.call("id", @pid)
|
114
|
+
add_field.call("internal_h", collections.first.to_s + "/")
|
115
|
+
add_field.call("pid", @pid)
|
116
|
+
collections.each do |collection|
|
117
|
+
add_field.call("member_of", collection)
|
118
|
+
end
|
119
|
+
|
91
120
|
|
92
|
-
return {} unless mods
|
93
|
-
# baseline blacklight fields: id is the unique identifier, format determines by default, what partials get called
|
94
|
-
add_field.call("id", @pid)
|
95
|
-
add_field.call("internal_h", collections.first.to_s + "/")
|
96
|
-
add_field.call("pid", @pid)
|
97
|
-
collections.each do |collection|
|
98
|
-
add_field.call("member_of", collection)
|
99
|
-
end
|
100
121
|
|
122
|
+
title = normalize_space.call(mods.css("titleInfo>nonSort,title").collect(&:content).join(" "))
|
123
|
+
add_field.call("title_display", title)
|
124
|
+
add_field.call("title_search", title)
|
101
125
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
add_field.call("title_search", title)
|
126
|
+
all_names = []
|
127
|
+
mods.css("name[@type='personal']").each do |name_node|
|
128
|
+
if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
|
106
129
|
|
107
|
-
|
108
|
-
mods.css("name[@type='personal']").each do |name_node|
|
109
|
-
if name_node.css("role>roleTerm[@type='text']").collect(&:content).any? { |role| roles.include?(role) }
|
130
|
+
fullname = get_fullname.call(name_node)
|
110
131
|
|
111
|
-
|
132
|
+
all_names << fullname
|
133
|
+
add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
|
134
|
+
add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
|
135
|
+
add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
|
136
|
+
add_field.call("author_search", fullname.downcase)
|
137
|
+
add_field.call("author_facet", fullname)
|
112
138
|
|
113
|
-
|
114
|
-
add_field.call("author_id_uni", name_node.at_css("authorID[@type='institution']"))
|
115
|
-
add_field.call("author_id_repository", name_node.at_css("authorID[@type='repository']"))
|
116
|
-
add_field.call("author_id_naf", name_node.at_css("authorID[@type='naf']"))
|
117
|
-
add_field.call("author_search", fullname.downcase)
|
118
|
-
add_field.call("author_facet", fullname)
|
139
|
+
end
|
119
140
|
|
120
141
|
end
|
121
142
|
|
122
|
-
|
143
|
+
add_field.call("authors_display",all_names.join("; "))
|
144
|
+
add_field.call("date", mods.at_css("*[@keyDate='yes']"))
|
123
145
|
|
124
|
-
|
125
|
-
|
146
|
+
mods.css("genre").each do |genre_node|
|
147
|
+
add_field.call("genre_facet", genre_node)
|
148
|
+
add_field.call("genre_search", genre_node)
|
126
149
|
|
127
|
-
|
128
|
-
add_field.call("genre_facet", genre_node)
|
129
|
-
add_field.call("genre_search", genre_node)
|
150
|
+
end
|
130
151
|
|
131
|
-
end
|
132
152
|
|
153
|
+
add_field.call("abstract", mods.at_css("abstract"))
|
154
|
+
add_field.call("handle", mods.at_css("identifier[@type='hdl']"))
|
133
155
|
|
134
|
-
|
135
|
-
|
156
|
+
mods.css("subject:not([@authority='local'])>topic").each do |topic_node|
|
157
|
+
add_field.call("keyword_search", topic_node.content.downcase)
|
158
|
+
add_field.call("keyword_facet", topic_node)
|
159
|
+
end
|
136
160
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
161
|
+
mods.css("subject[@authority='local']>topic").each do |topic_node|
|
162
|
+
add_field.call("subject", topic_node)
|
163
|
+
add_field.call("subject_search", topic_node)
|
164
|
+
end
|
141
165
|
|
142
|
-
mods.css("subject[@authority='local']>topic").each do |topic_node|
|
143
|
-
add_field.call("subject", topic_node)
|
144
|
-
add_field.call("subject_search", topic_node)
|
145
|
-
end
|
146
166
|
|
167
|
+
add_field.call("tableOfContents", mods.at_css("tableOfContents"))
|
168
|
+
|
169
|
+
mods.css("note").each { |note| add_field.call("notes", note) }
|
147
170
|
|
148
|
-
|
171
|
+
if (related_host = mods.at_css("relatedItem[@type='host']"))
|
172
|
+
book_journal_title = related_host.at_css("titleInfo>title")
|
149
173
|
|
150
|
-
|
174
|
+
if book_journal_title
|
175
|
+
book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
|
151
176
|
|
152
|
-
|
153
|
-
book_journal_title = related_host.at_css("titleInfo>title")
|
177
|
+
book_journal_title = book_journal_title.content + ": " + book_journal_subtitle.content.to_s if book_journal_subtitle
|
154
178
|
|
155
|
-
|
156
|
-
book_journal_subtitle = mods.at_css("name>titleInfo>subTitle")
|
179
|
+
end
|
157
180
|
|
158
|
-
|
181
|
+
add_field.call("book_journal_title", book_journal_title)
|
159
182
|
|
183
|
+
add_field.call("book_author", get_fullname.call(related_host.at_css("name")))
|
184
|
+
|
185
|
+
add_field.call("issn", related_host.at_css("identifier[@type='issn']"))
|
160
186
|
end
|
161
187
|
|
162
|
-
add_field.call("
|
188
|
+
add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
|
189
|
+
add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
|
190
|
+
add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
|
191
|
+
add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
|
163
192
|
|
164
|
-
|
193
|
+
mods.css("physicalDescription>internetMediaType").each { |mt| add_field.call("media_type_facet", mt) }
|
165
194
|
|
166
|
-
|
195
|
+
mods.css("typeOfResource").each { |tr| add_field.call("type_of_resource_facet", tr)}
|
196
|
+
mods.css("subject>geographic").each do |geo|
|
197
|
+
add_field.call("geographic_area", geo)
|
198
|
+
add_field.call("geographic_area_search", geo)
|
199
|
+
end
|
167
200
|
end
|
168
201
|
|
169
|
-
add_field.call("publisher", mods.at_css("relatedItem>originInfo>publisher"))
|
170
|
-
add_field.call("publisher_location", mods.at_css("relatedItem > originInfo>place>placeTerm[@type='text']"))
|
171
|
-
add_field.call("isbn", mods.at_css("relatedItem>identifier[@type='isbn']"))
|
172
|
-
add_field.call("doi", mods.at_css("identifier[@type='doi'][@displayLabel='Published version']"))
|
173
202
|
|
174
|
-
|
203
|
+
if do_fulltext
|
204
|
+
listMembers.each_with_index do |member, i|
|
205
|
+
tika_directory = File.expand_path(File.join(File.expand_path(File.dirname(__FILE__)), "..", "tika"))
|
175
206
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
end
|
207
|
+
resource_file_name = File.join(tika_directory, "scratch", Time.now.to_i.to_s + "_" + rand(10000000).to_s)
|
208
|
+
tika_jar = File.join(tika_directory, "tika-0.3.jar")
|
209
|
+
|
210
|
+
File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
|
181
211
|
|
182
212
|
|
213
|
+
tika_result = []
|
214
|
+
tika_error = []
|
183
215
|
|
184
|
-
|
185
|
-
|
186
|
-
|
216
|
+
Open3.popen3("java -jar #{tika_jar} -t #{resource_file_name}") do |stdin, stdout, stderr|
|
217
|
+
tika_result = stdout.readlines
|
218
|
+
tika_error = stderr.readlines
|
219
|
+
end
|
187
220
|
|
188
|
-
|
189
|
-
|
221
|
+
unless tika_error.empty?
|
222
|
+
status = :error
|
223
|
+
error_message += tika_error.join("\n")
|
224
|
+
else
|
190
225
|
|
191
|
-
File.open(resource_file_name, "w") { |f| f.puts(member.datastream("CONTENT")) }
|
192
226
|
|
193
|
-
|
194
|
-
|
195
|
-
|
227
|
+
add_field.call("ac.fulltext_#{i}", tika_result)
|
228
|
+
end
|
196
229
|
|
197
|
-
|
198
|
-
|
199
|
-
|
230
|
+
File.delete(resource_file_name)
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
rescue Exception => e
|
235
|
+
status = :error
|
236
|
+
error_message += e.message
|
200
237
|
end
|
201
238
|
|
202
|
-
|
239
|
+
status = :invalid_format if results.empty?
|
240
|
+
|
241
|
+
return {:status => status, :error_message => error_message, :results => results}
|
242
|
+
|
203
243
|
end
|
204
244
|
|
245
|
+
|
246
|
+
|
205
247
|
def to_s
|
206
248
|
@pid
|
207
249
|
end
|
data/lib/cul-fedora/solr.rb
CHANGED
@@ -7,30 +7,83 @@ module Cul
|
|
7
7
|
|
8
8
|
end
|
9
9
|
|
10
|
+
def item_exists?(item)
|
11
|
+
!rsolr.find(:filters => {:id => item.pid_escaped})["response"]["docs"].empty?
|
12
|
+
end
|
13
|
+
|
10
14
|
def rsolr
|
11
15
|
@rsolr ||= RSolr.connect(:url => @url)
|
12
16
|
end
|
13
17
|
|
14
18
|
def ingest(options = {})
|
15
19
|
format = options.delete(:format) || raise(ArgumentError, "needs format")
|
20
|
+
|
16
21
|
items = options.delete(:items) || []
|
17
22
|
items = [items] unless items.kind_of?(Array)
|
18
|
-
|
19
23
|
collections = options.delete(:collections) || []
|
20
24
|
collections = [collections] unless collections.kind_of?(Array)
|
25
|
+
|
26
|
+
overwrite = options.delete(:overwrite) || false
|
27
|
+
process = options.delete(:process) || nil
|
28
|
+
skip = options.delete(:skip) || nil
|
29
|
+
|
30
|
+
|
31
|
+
|
21
32
|
collections.each do |collection|
|
22
33
|
items |= collection.listMembers
|
23
34
|
end
|
24
35
|
|
36
|
+
items.sort!
|
37
|
+
|
38
|
+
to_add = []
|
39
|
+
results = Hash.new { |h,k| h[k] = [] }
|
40
|
+
errors = {}
|
41
|
+
|
42
|
+
items.each do |i|
|
43
|
+
if process && skip && skip > 0
|
44
|
+
skip -= 1
|
45
|
+
next
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
if item_exists?(i)
|
50
|
+
|
51
|
+
unless overwrite
|
52
|
+
results[:skipped] << i.pid
|
53
|
+
next
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
25
57
|
|
26
58
|
|
27
|
-
|
28
|
-
|
59
|
+
|
60
|
+
result_hash = i.send("index_for_#{format}", options)
|
61
|
+
|
62
|
+
results[result_hash[:status]] << i.pid
|
63
|
+
|
64
|
+
case result_hash[:status]
|
65
|
+
when :success
|
66
|
+
to_add << result_hash[:results]
|
67
|
+
when :error
|
68
|
+
errors[i.pid] = result_hash[:error_message]
|
69
|
+
end
|
70
|
+
|
71
|
+
if process
|
72
|
+
process -= 1
|
73
|
+
break if process <= 0
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
rsolr.add(to_add)
|
29
79
|
rsolr.commit
|
80
|
+
|
81
|
+
return {:results => results, :errors => errors}
|
82
|
+
|
30
83
|
end
|
31
84
|
|
32
85
|
end
|
33
|
-
|
86
|
+
|
34
87
|
|
35
88
|
end
|
36
89
|
end
|
Binary file
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cul-fedora
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 5
|
9
|
-
-
|
10
|
-
version: 0.5.
|
9
|
+
- 2
|
10
|
+
version: 0.5.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- James Stuart
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-10-
|
18
|
+
date: 2010-10-12 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -158,6 +158,7 @@ files:
|
|
158
158
|
- lib/tika/poi-3.5-beta5.jar
|
159
159
|
- lib/tika/poi-ooxml-3.5-beta5.jar
|
160
160
|
- lib/tika/poi-scratchpad-3.5-beta5.jar
|
161
|
+
- lib/tika/scratch/1286827167_3249395
|
161
162
|
- lib/tika/tika-0.3.jar
|
162
163
|
- lib/tika/xercesImpl-2.8.1.jar
|
163
164
|
- lib/tika/xml-apis-1.0.b2.jar
|