spotlight-dor-resources 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -1
- data/lib/spotlight/dor/indexer.rb +16 -16
- data/lib/spotlight/dor/resources/version.rb +1 -1
- data/spec/lib/spotlight/dor/indexer_spec.rb +6 -2
- data/spec/test_app_templates/gdor.yml +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c130b41c0a1a72df4461aeb827bc66f7959244f6
|
4
|
+
data.tar.gz: 2c91800eb334e092f90e5cb290fa69ace4163b0e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e8ab91d1840f91f56a32a6eefe3d249fb7727fa985aad7324db746d805a3e3c7c2b02ca06e8ac1f49d5fad5ca97645ea107663c9a4cb984e14b51dc03d525050
|
7
|
+
data.tar.gz: 812b4522e38b815d846ce4a240a0792c64cd2b3139895a990d195eb938882828970c6c533ee61f35dc196e591798b8c06dd80466f7fe9a3ff6a34b382f7d73ed
|
data/README.md
CHANGED
@@ -51,8 +51,12 @@ public_xml=purl.resource.public_xml # nokogiri doc with all public XML
|
|
51
51
|
mods=purl.resource.mods # nokogiri doc with just the MODs
|
52
52
|
smods_rec=purl.resource.smods_rec # a Stanford::Mods::Record Object (which inherits from the MODS gem)
|
53
53
|
|
54
|
+
puts purl.resource.bare_druid # will not have the "druid:" prefix
|
55
|
+
puts purl.resource.druid # might have the "druid:" prefix (if returned as the result of an is_collection call)
|
56
|
+
|
54
57
|
purl.resource.indexer.config.solr.url # the configured solr server (should be localhost in development)
|
55
|
-
|
58
|
+
|
59
|
+
purl.save # save to database and index to solr
|
56
60
|
purl.reindex # reindex once saved the first time
|
57
61
|
```
|
58
62
|
|
@@ -1,7 +1,10 @@
|
|
1
1
|
# rubocop:disable Metrics/ClassLength
|
2
|
+
# rubocop:disable Metrics/AbcSize
|
3
|
+
|
2
4
|
# external gems
|
3
5
|
require 'gdor/indexer'
|
4
6
|
require 'solrizer'
|
7
|
+
require 'faraday'
|
5
8
|
|
6
9
|
module Spotlight::Dor
|
7
10
|
# Base class to harvest from DOR via harvestdor gem
|
@@ -125,7 +128,7 @@ module Spotlight::Dor
|
|
125
128
|
add_thumbnail_fields(images.first, solr_doc) if images.first
|
126
129
|
|
127
130
|
images.each do |image|
|
128
|
-
add_image_fields(image, solr_doc)
|
131
|
+
add_image_fields(image, solr_doc, sdb.bare_druid)
|
129
132
|
end
|
130
133
|
end
|
131
134
|
|
@@ -140,9 +143,9 @@ module Spotlight::Dor
|
|
140
143
|
Solrizer.insert_field(solr_doc, 'content_metadata_first_image_height', image_data['height'], :displayable)
|
141
144
|
end
|
142
145
|
|
143
|
-
def add_image_fields(node, solr_doc)
|
146
|
+
def add_image_fields(node, solr_doc, bare_druid)
|
144
147
|
file_id = node.attr('id').gsub('.jp2', '')
|
145
|
-
base_url = stacks_iiif_url(
|
148
|
+
base_url = stacks_iiif_url(bare_druid, file_id)
|
146
149
|
|
147
150
|
Solrizer.insert_field(solr_doc, 'content_metadata_image_iiif_info', "#{base_url}/info.json", :displayable)
|
148
151
|
Solrizer.insert_field(solr_doc, 'thumbnail_square_url', "#{base_url}/square/100,100/0/default.jpg", :displayable)
|
@@ -151,8 +154,8 @@ module Spotlight::Dor
|
|
151
154
|
Solrizer.insert_field(solr_doc, 'full_image_url', "#{base_url}/full/full/0/default.jpg", :displayable)
|
152
155
|
end
|
153
156
|
|
154
|
-
def stacks_iiif_url(
|
155
|
-
"#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{
|
157
|
+
def stacks_iiif_url(bare_druid, file_name)
|
158
|
+
"#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{bare_druid}%2F#{file_name}"
|
156
159
|
end
|
157
160
|
end
|
158
161
|
|
@@ -189,19 +192,16 @@ module Spotlight::Dor
|
|
189
192
|
|
190
193
|
# search for configured full text files, and if found, add them to the full text (whole document) solr field
|
191
194
|
def add_object_full_text(sdb, solr_doc)
|
192
|
-
object_level_full_text_urls(sdb)
|
193
|
-
|
194
|
-
|
195
|
-
# because .to_s of nil is empty string
|
196
|
-
solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
|
197
|
-
end
|
195
|
+
full_text_urls = object_level_full_text_urls(sdb)
|
196
|
+
return if full_text_urls.size == 0
|
197
|
+
solr_doc['full_text_tesim'] = full_text_urls.map { |file_url| get_file_content(file_url) }
|
198
198
|
end
|
199
199
|
|
200
200
|
# go grab the supplied file url, grab the file, encode and return
|
201
|
-
# TODO:
|
201
|
+
# TODO: this should also be able to deal with .rtf and .xml files, scrubbing/converting as necessary to get plain text
|
202
202
|
def get_file_content(file_url)
|
203
|
-
response =
|
204
|
-
response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
203
|
+
response = Faraday.get(file_url)
|
204
|
+
response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?').gsub(/\s+/, ' ')
|
205
205
|
rescue
|
206
206
|
logger.warn("Error indexing full text - couldn't load file #{file_url}")
|
207
207
|
nil
|
@@ -213,7 +213,7 @@ module Spotlight::Dor
|
|
213
213
|
files = []
|
214
214
|
object_level_full_text_filenames(sdb).each do |xpath_location|
|
215
215
|
files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
|
216
|
-
"#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.
|
216
|
+
"#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.bare_druid}/#{txt_file['id']}"
|
217
217
|
end
|
218
218
|
end
|
219
219
|
files
|
@@ -223,7 +223,7 @@ module Spotlight::Dor
|
|
223
223
|
# add as many as you need, all will be searched
|
224
224
|
def object_level_full_text_filenames(sdb)
|
225
225
|
[
|
226
|
-
"//contentMetadata/resource/file[@id=\"#{sdb.
|
226
|
+
"//contentMetadata/resource/file[@id=\"#{sdb.bare_druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
|
227
227
|
]
|
228
228
|
end
|
229
229
|
end
|
@@ -50,6 +50,7 @@ describe Spotlight::Dor::Indexer do
|
|
50
50
|
describe '#add_content_metadata_fields' do
|
51
51
|
before do
|
52
52
|
allow(r).to receive(:public_xml).and_return(public_xml)
|
53
|
+
allow(sdb).to receive(:bare_druid).and_return(fake_druid)
|
53
54
|
|
54
55
|
# stacks url calculations require the druid
|
55
56
|
solr_doc[:id] = fake_druid
|
@@ -500,6 +501,9 @@ describe Spotlight::Dor::Indexer do
|
|
500
501
|
# rubocop:enable Metrics/LineLength
|
501
502
|
|
502
503
|
describe '#add_object_full_text' do
|
504
|
+
before do
|
505
|
+
allow(sdb).to receive(:bare_druid).and_return(fake_druid)
|
506
|
+
end
|
503
507
|
let!(:expected_text) { 'SOME full text string that is returned from the server' }
|
504
508
|
let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
|
505
509
|
it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
|
@@ -523,7 +527,7 @@ describe Spotlight::Dor::Indexer do
|
|
523
527
|
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
524
528
|
subject.send(:add_object_full_text, sdb, solr_doc)
|
525
529
|
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
|
526
|
-
expect(solr_doc['full_text_tesim']).to eq expected_text
|
530
|
+
expect(solr_doc['full_text_tesim']).to eq [expected_text]
|
527
531
|
end
|
528
532
|
it 'does not index the full text if no recognized pattern is found' do
|
529
533
|
public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
|
@@ -566,7 +570,7 @@ describe Spotlight::Dor::Indexer do
|
|
566
570
|
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
567
571
|
subject.send(:add_object_full_text, sdb, solr_doc)
|
568
572
|
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
|
569
|
-
expect(solr_doc['full_text_tesim']).to eq
|
573
|
+
expect(solr_doc['full_text_tesim']).to eq [expected_text, expected_text] # same file twice in a 2 element array
|
570
574
|
end
|
571
575
|
end # add_object_full_text
|
572
576
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
dor_fetcher:
|
2
|
+
service_url: http://dorfetcher-url
|
3
|
+
# if skip_heartbeat set to true, this will skip a check that the dorfetcher service is alive before making API calls
|
4
|
+
skip_heartbeat: true
|
5
|
+
|
1
6
|
harvestdor:
|
2
7
|
# log_dir: directory for log file (default logs, relative to harvestdor gem path)
|
3
8
|
log_dir: log
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spotlight-dor-resources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Beer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|