spotlight-dor-resources 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -1
- data/lib/spotlight/dor/indexer.rb +16 -16
- data/lib/spotlight/dor/resources/version.rb +1 -1
- data/spec/lib/spotlight/dor/indexer_spec.rb +6 -2
- data/spec/test_app_templates/gdor.yml +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c130b41c0a1a72df4461aeb827bc66f7959244f6
|
4
|
+
data.tar.gz: 2c91800eb334e092f90e5cb290fa69ace4163b0e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e8ab91d1840f91f56a32a6eefe3d249fb7727fa985aad7324db746d805a3e3c7c2b02ca06e8ac1f49d5fad5ca97645ea107663c9a4cb984e14b51dc03d525050
|
7
|
+
data.tar.gz: 812b4522e38b815d846ce4a240a0792c64cd2b3139895a990d195eb938882828970c6c533ee61f35dc196e591798b8c06dd80466f7fe9a3ff6a34b382f7d73ed
|
data/README.md
CHANGED
@@ -51,8 +51,12 @@ public_xml=purl.resource.public_xml # nokogiri doc with all public XML
|
|
51
51
|
mods=purl.resource.mods # nokogiri doc with just the MODs
|
52
52
|
smods_rec=purl.resource.smods_rec # a Stanford::Mods::Record Object (which inherits from the MODS gem)
|
53
53
|
|
54
|
+
puts purl.resource.bare_druid # will not have the "druid:" prefix
|
55
|
+
puts purl.resource.druid # might have the "druid:" prefix (if returned as the result of an is_collection call)
|
56
|
+
|
54
57
|
purl.resource.indexer.config.solr.url # the configured solr server (should be localhost in development)
|
55
|
-
|
58
|
+
|
59
|
+
purl.save # save to database and index to solr
|
56
60
|
purl.reindex # reindex once saved the first time
|
57
61
|
```
|
58
62
|
|
data/lib/spotlight/dor/indexer.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
# rubocop:disable Metrics/ClassLength
|
2
|
+
# rubocop:disable Metrics/AbcSize
|
3
|
+
|
2
4
|
# external gems
|
3
5
|
require 'gdor/indexer'
|
4
6
|
require 'solrizer'
|
7
|
+
require 'faraday'
|
5
8
|
|
6
9
|
module Spotlight::Dor
|
7
10
|
# Base class to harvest from DOR via harvestdor gem
|
@@ -125,7 +128,7 @@ module Spotlight::Dor
|
|
125
128
|
add_thumbnail_fields(images.first, solr_doc) if images.first
|
126
129
|
|
127
130
|
images.each do |image|
|
128
|
-
add_image_fields(image, solr_doc)
|
131
|
+
add_image_fields(image, solr_doc, sdb.bare_druid)
|
129
132
|
end
|
130
133
|
end
|
131
134
|
|
@@ -140,9 +143,9 @@ module Spotlight::Dor
|
|
140
143
|
Solrizer.insert_field(solr_doc, 'content_metadata_first_image_height', image_data['height'], :displayable)
|
141
144
|
end
|
142
145
|
|
143
|
-
def add_image_fields(node, solr_doc)
|
146
|
+
def add_image_fields(node, solr_doc, bare_druid)
|
144
147
|
file_id = node.attr('id').gsub('.jp2', '')
|
145
|
-
base_url = stacks_iiif_url(
|
148
|
+
base_url = stacks_iiif_url(bare_druid, file_id)
|
146
149
|
|
147
150
|
Solrizer.insert_field(solr_doc, 'content_metadata_image_iiif_info', "#{base_url}/info.json", :displayable)
|
148
151
|
Solrizer.insert_field(solr_doc, 'thumbnail_square_url', "#{base_url}/square/100,100/0/default.jpg", :displayable)
|
@@ -151,8 +154,8 @@ module Spotlight::Dor
|
|
151
154
|
Solrizer.insert_field(solr_doc, 'full_image_url', "#{base_url}/full/full/0/default.jpg", :displayable)
|
152
155
|
end
|
153
156
|
|
154
|
-
def stacks_iiif_url(
|
155
|
-
"#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{
|
157
|
+
def stacks_iiif_url(bare_druid, file_name)
|
158
|
+
"#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{bare_druid}%2F#{file_name}"
|
156
159
|
end
|
157
160
|
end
|
158
161
|
|
@@ -189,19 +192,16 @@ module Spotlight::Dor
|
|
189
192
|
|
190
193
|
# search for configured full text files, and if found, add them to the full text (whole document) solr field
|
191
194
|
def add_object_full_text(sdb, solr_doc)
|
192
|
-
object_level_full_text_urls(sdb)
|
193
|
-
|
194
|
-
|
195
|
-
# because .to_s of nil is empty string
|
196
|
-
solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
|
197
|
-
end
|
195
|
+
full_text_urls = object_level_full_text_urls(sdb)
|
196
|
+
return if full_text_urls.size == 0
|
197
|
+
solr_doc['full_text_tesim'] = full_text_urls.map { |file_url| get_file_content(file_url) }
|
198
198
|
end
|
199
199
|
|
200
200
|
# go grab the supplied file url, grab the file, encode and return
|
201
|
-
# TODO:
|
201
|
+
# TODO: this should also be able to deal with .rtf and .xml files, scrubbing/converting as necessary to get plain text
|
202
202
|
def get_file_content(file_url)
|
203
|
-
response =
|
204
|
-
response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
203
|
+
response = Faraday.get(file_url)
|
204
|
+
response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?').gsub(/\s+/, ' ')
|
205
205
|
rescue
|
206
206
|
logger.warn("Error indexing full text - couldn't load file #{file_url}")
|
207
207
|
nil
|
@@ -213,7 +213,7 @@ module Spotlight::Dor
|
|
213
213
|
files = []
|
214
214
|
object_level_full_text_filenames(sdb).each do |xpath_location|
|
215
215
|
files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
|
216
|
-
"#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.
|
216
|
+
"#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.bare_druid}/#{txt_file['id']}"
|
217
217
|
end
|
218
218
|
end
|
219
219
|
files
|
@@ -223,7 +223,7 @@ module Spotlight::Dor
|
|
223
223
|
# add as many as you need, all will be searched
|
224
224
|
def object_level_full_text_filenames(sdb)
|
225
225
|
[
|
226
|
-
"//contentMetadata/resource/file[@id=\"#{sdb.
|
226
|
+
"//contentMetadata/resource/file[@id=\"#{sdb.bare_druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
|
227
227
|
]
|
228
228
|
end
|
229
229
|
end
|
data/spec/lib/spotlight/dor/indexer_spec.rb
CHANGED
@@ -50,6 +50,7 @@ describe Spotlight::Dor::Indexer do
|
|
50
50
|
describe '#add_content_metadata_fields' do
|
51
51
|
before do
|
52
52
|
allow(r).to receive(:public_xml).and_return(public_xml)
|
53
|
+
allow(sdb).to receive(:bare_druid).and_return(fake_druid)
|
53
54
|
|
54
55
|
# stacks url calculations require the druid
|
55
56
|
solr_doc[:id] = fake_druid
|
@@ -500,6 +501,9 @@ describe Spotlight::Dor::Indexer do
|
|
500
501
|
# rubocop:enable Metrics/LineLength
|
501
502
|
|
502
503
|
describe '#add_object_full_text' do
|
504
|
+
before do
|
505
|
+
allow(sdb).to receive(:bare_druid).and_return(fake_druid)
|
506
|
+
end
|
503
507
|
let!(:expected_text) { 'SOME full text string that is returned from the server' }
|
504
508
|
let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
|
505
509
|
it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
|
@@ -523,7 +527,7 @@ describe Spotlight::Dor::Indexer do
|
|
523
527
|
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
524
528
|
subject.send(:add_object_full_text, sdb, solr_doc)
|
525
529
|
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
|
526
|
-
expect(solr_doc['full_text_tesim']).to eq expected_text
|
530
|
+
expect(solr_doc['full_text_tesim']).to eq [expected_text]
|
527
531
|
end
|
528
532
|
it 'does not index the full text if no recognized pattern is found' do
|
529
533
|
public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
|
@@ -566,7 +570,7 @@ describe Spotlight::Dor::Indexer do
|
|
566
570
|
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
567
571
|
subject.send(:add_object_full_text, sdb, solr_doc)
|
568
572
|
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
|
569
|
-
expect(solr_doc['full_text_tesim']).to eq
|
573
|
+
expect(solr_doc['full_text_tesim']).to eq [expected_text, expected_text] # same file twice in a 2 element array
|
570
574
|
end
|
571
575
|
end # add_object_full_text
|
572
576
|
end
|
data/spec/test_app_templates/gdor.yml
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
dor_fetcher:
|
2
|
+
service_url: http://dorfetcher-url
|
3
|
+
# if skip_heartbeat set to true, this will skip a check that the dorfetcher service is alive before making API calls
|
4
|
+
skip_heartbeat: true
|
5
|
+
|
1
6
|
harvestdor:
|
2
7
|
# log_dir: directory for log file (default logs, relative to harvestdor gem path)
|
3
8
|
log_dir: log
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spotlight-dor-resources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Beer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|