spotlight-dor-resources 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.rubocop.yml +1 -1
- data/Gemfile +16 -9
- data/README.md +6 -6
- data/lib/spotlight/dor/indexer.rb +46 -0
- data/lib/spotlight/dor/resources/engine.rb +3 -1
- data/lib/spotlight/dor/resources/version.rb +1 -1
- data/spec/lib/spotlight/dor/indexer_spec.rb +71 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/test_app_templates/gdor.yml +1 -1
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c0c448f6276a5bbd515fecaaf0e0d836e533f246
|
4
|
+
data.tar.gz: 13048c1c02a606c0c4d30e07768c79bcd6385834
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 326665ea9d207234a6fa97b8c3993a5cff159a154d78e61d813de6352b39ece8db5d8364a331830fb0857979e97b12299eded8b77d0b9d513e0155e02a1fab71
|
7
|
+
data.tar.gz: 7a9ade62a20f3c233864fcbccf1454ad220f98297a3556a42550e6e2c30fc887a4040cac6346d4af6ee3ce716a8fd8edad5314b31aaca399f3d9e176acbe3682
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/Gemfile
CHANGED
@@ -12,21 +12,28 @@ group :test do
|
|
12
12
|
gem 'pry-byebug', require: false
|
13
13
|
end
|
14
14
|
|
15
|
-
#
|
16
|
-
|
15
|
+
# BEGIN ENGINE_CART BLOCK
|
16
|
+
# engine_cart: 0.8.0
|
17
|
+
# engine_cart stanza: 0.8.0
|
18
|
+
# the below comes from engine_cart, a gem used to test this Rails engine gem in the context of a Rails app.
|
19
|
+
file = File.expand_path("Gemfile", ENV['ENGINE_CART_DESTINATION'] || ENV['RAILS_ROOT'] || File.expand_path(".internal_test_app", File.dirname(__FILE__)))
|
17
20
|
if File.exist?(file)
|
18
|
-
|
19
|
-
|
21
|
+
begin
|
22
|
+
eval_gemfile file
|
23
|
+
rescue Bundler::GemfileError => e
|
24
|
+
Bundler.ui.warn '[EngineCart] Skipping Rails application dependencies:'
|
25
|
+
Bundler.ui.warn e.message
|
26
|
+
end
|
20
27
|
else
|
21
|
-
|
28
|
+
Bundler.ui.warn "[EngineCart] Unable to find test application dependencies in #{file}, using placeholder dependencies"
|
29
|
+
|
30
|
+
gem 'rails', ENV['RAILS_VERSION'] if ENV['RAILS_VERSION']
|
22
31
|
|
23
|
-
# explicitly include sass-rails to get compatible sprocket dependencies
|
24
32
|
if ENV['RAILS_VERSION'].nil? || ENV['RAILS_VERSION'] =~ /^4.2/
|
25
|
-
gem 'coffee-rails', '~> 4.1.0'
|
26
|
-
gem 'sass-rails', '~> 5.0'
|
27
33
|
gem 'responders', "~> 2.0"
|
34
|
+
gem 'sass-rails', ">= 5.0"
|
28
35
|
else
|
29
36
|
gem 'sass-rails', "< 5.0"
|
30
|
-
gem 'coffee-rails', "~> 4.0.0"
|
31
37
|
end
|
32
38
|
end
|
39
|
+
# END ENGINE_CART BLOCK
|
data/README.md
CHANGED
@@ -24,7 +24,7 @@ Within a Spotlight application with `spotlight-dor-resources` installed, you cou
|
|
24
24
|
|
25
25
|
Note that Spotlight:
|
26
26
|
|
27
|
-
* is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at
|
27
|
+
* is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at .internal_test_app.
|
28
28
|
* uses Solr as part of its integration tests. We use [jettywrapper](https://github.com/projecthydra/jettywrapper) to manage the Solr instance used for development and test.
|
29
29
|
|
30
30
|
Our `$ rake ci` task utilizes Solr and the testing rails app, with Spotlight installed, automatically.
|
@@ -36,16 +36,16 @@ For access to a Rails console with the gem loaded up for testing purposes, you c
|
|
36
36
|
$ bundle
|
37
37
|
$ bundle exec rake ci
|
38
38
|
|
39
|
-
This will download a test jetty instance (to run Solr), generate a testing app at ```
|
39
|
+
This will download a test jetty instance (to run Solr), generate a testing app at ```.internal_test_app``` and run the tests.
|
40
40
|
|
41
41
|
### Indexing with the generated test app
|
42
42
|
|
43
|
-
$
|
44
|
-
|
45
|
-
$ bundle exec rails console
|
43
|
+
$ bundle exec rake engine_cart:console
|
46
44
|
|
47
45
|
```ruby
|
48
|
-
|
46
|
+
druid="ty202yt2402" # a feigenbaum druid
|
47
|
+
druid="cx709ty7769" # a revs druid
|
48
|
+
purl=Spotlight::Resources::Purl.new({:url=>"https://purl.stanford.edu/#{druid}"})
|
49
49
|
puts purl.to_solr.first # the solr document as a hash
|
50
50
|
public_xml=purl.resource.public_xml # nokogiri doc with all public XML
|
51
51
|
mods=purl.resource.mods # nokogiri doc with just the MODs
|
@@ -182,6 +182,52 @@ module Spotlight::Dor
|
|
182
182
|
end
|
183
183
|
end
|
184
184
|
|
185
|
+
concerning :FullTextIndexing do
|
186
|
+
included do
|
187
|
+
before_index :add_object_full_text
|
188
|
+
end
|
189
|
+
|
190
|
+
# search for configured full text files, and if found, add them to the full text (whole document) solr field
|
191
|
+
def add_object_full_text(sdb, solr_doc)
|
192
|
+
object_level_full_text_urls(sdb).each do |file_url|
|
193
|
+
# append content from each file to the field, creating field if it doesn't exist yet
|
194
|
+
# ruby note: the construct below allows us to append a string to a variable that starts out as nil
|
195
|
+
# because .to_s of nil is empty string
|
196
|
+
solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
# fetch the file at the supplied url, then scrub and encode its content as UTF-8 and return it
|
201
|
+
# TODO: this should also be able to deal with .rtf and .xml files
|
202
|
+
def get_file_content(file_url)
|
203
|
+
response = Net::HTTP.get_response(URI.parse(file_url))
|
204
|
+
response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
205
|
+
rescue
|
206
|
+
logger.warn("Error indexing full text - couldn't load file #{file_url}")
|
207
|
+
nil
|
208
|
+
end
|
209
|
+
|
210
|
+
# these are the file locations where full text files can be found at the object level
|
211
|
+
# this method returns an array of fully qualified public URLs that can be accessed to find full text content
|
212
|
+
def object_level_full_text_urls(sdb)
|
213
|
+
files = []
|
214
|
+
object_level_full_text_filenames(sdb).each do |xpath_location|
|
215
|
+
files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
|
216
|
+
"#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.druid}/#{txt_file['id']}"
|
217
|
+
end
|
218
|
+
end
|
219
|
+
files
|
220
|
+
end
|
221
|
+
|
222
|
+
# xpaths to locations in the contentMetadata where full text object level files can be found,
|
223
|
+
# add as many as you need, all will be searched
|
224
|
+
def object_level_full_text_filenames(sdb)
|
225
|
+
[
|
226
|
+
"//contentMetadata/resource/file[@id=\"#{sdb.druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
|
227
|
+
]
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
185
231
|
concerning :CartographicIndexing do
|
186
232
|
included do
|
187
233
|
before_index :mods_cartographics_indexing
|
@@ -8,7 +8,9 @@ module Spotlight::Dor::Resources
|
|
8
8
|
Spotlight::Engine.config.resource_providers << Spotlight::Resources::Searchworks
|
9
9
|
Spotlight::Engine.config.resource_providers << Spotlight::Resources::Purl
|
10
10
|
Spotlight::Dor::Resources::Engine.config.parallel_options = { in_threads: 1 }
|
11
|
-
Spotlight::Dor::Resources::Engine.config.
|
11
|
+
Spotlight::Dor::Resources::Engine.config.base_stacks_url = 'https://stacks.stanford.edu'
|
12
|
+
Spotlight::Dor::Resources::Engine.config.stacks_file_url = "#{config.base_stacks_url}/file"
|
13
|
+
Spotlight::Dor::Resources::Engine.config.stacks_iiif_url = "#{config.base_stacks_url}/image/iiif"
|
12
14
|
end
|
13
15
|
end
|
14
16
|
end
|
@@ -498,4 +498,75 @@ describe Spotlight::Dor::Indexer do
|
|
498
498
|
end # each
|
499
499
|
end # add_folder_name
|
500
500
|
# rubocop:enable Metrics/LineLength
|
501
|
+
|
502
|
+
describe '#add_object_full_text' do
|
503
|
+
let!(:expected_text) { 'SOME full text string that is returned from the server' }
|
504
|
+
let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
|
505
|
+
it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
|
506
|
+
public_xml_with_feigenbaum_full_text = Nokogiri::XML <<-EOF
|
507
|
+
<publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
|
508
|
+
<contentMetadata objectId="oo000oo0000" type="book">
|
509
|
+
<resource id="oo000oo0000_4" sequence="4" type="object">
|
510
|
+
<label>Document</label>
|
511
|
+
<file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
|
512
|
+
<file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
|
513
|
+
</resource>
|
514
|
+
<resource id="oo000oo0000_5" sequence="5" type="page">
|
515
|
+
<label>Page 1</label>
|
516
|
+
<file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
|
517
|
+
</resource>
|
518
|
+
</contentMetadata>
|
519
|
+
</publicObject>
|
520
|
+
EOF
|
521
|
+
allow(sdb).to receive(:public_xml).and_return(public_xml_with_feigenbaum_full_text)
|
522
|
+
# don't actually attempt a call to the stacks
|
523
|
+
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
524
|
+
subject.send(:add_object_full_text, sdb, solr_doc)
|
525
|
+
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
|
526
|
+
expect(solr_doc['full_text_tesim']).to eq expected_text
|
527
|
+
end
|
528
|
+
it 'does not index the full text if no recognized pattern is found' do
|
529
|
+
public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
|
530
|
+
<publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
|
531
|
+
<contentMetadata objectId="oo000oo0000" type="book">
|
532
|
+
<resource id="oo000oo0000_4" sequence="4" type="object">
|
533
|
+
<label>Document</label>
|
534
|
+
<file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
|
535
|
+
</resource>
|
536
|
+
<resource id="oo000oo0000_5" sequence="5" type="page">
|
537
|
+
<label>Page 1</label>
|
538
|
+
<file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
|
539
|
+
</resource>
|
540
|
+
</contentMetadata>
|
541
|
+
</publicObject>
|
542
|
+
EOF
|
543
|
+
allow(sdb).to receive(:public_xml).and_return(public_xml_with_no_recognized_full_text)
|
544
|
+
subject.send(:add_object_full_text, sdb, solr_doc)
|
545
|
+
expect(subject.object_level_full_text_urls(sdb)).to eq []
|
546
|
+
expect(solr_doc['full_text_tesim']).to be_nil
|
547
|
+
end
|
548
|
+
it 'indexes the full text from two files if two recognized patterns are found' do
|
549
|
+
public_xml_with_two_recognized_full_text_files = Nokogiri::XML <<-EOF
|
550
|
+
<publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
|
551
|
+
<contentMetadata objectId="oo000oo0000" type="book">
|
552
|
+
<resource id="oo000oo0000_4" sequence="4" type="object">
|
553
|
+
<label>Document</label>
|
554
|
+
<file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
|
555
|
+
<file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
|
556
|
+
</resource>
|
557
|
+
<resource id="oo000oo0000_5" sequence="5" type="page">
|
558
|
+
<label>Page 1</label>
|
559
|
+
<file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
|
560
|
+
<file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
|
561
|
+
</resource>
|
562
|
+
</contentMetadata>
|
563
|
+
</publicObject>
|
564
|
+
EOF
|
565
|
+
allow(sdb).to receive(:public_xml).and_return(public_xml_with_two_recognized_full_text_files)
|
566
|
+
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
567
|
+
subject.send(:add_object_full_text, sdb, solr_doc)
|
568
|
+
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
|
569
|
+
expect(solr_doc['full_text_tesim']).to eq(expected_text + expected_text) # same file twice
|
570
|
+
end
|
571
|
+
end # add_object_full_text
|
501
572
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -10,7 +10,7 @@ EngineCart.load_application!
|
|
10
10
|
require 'rspec/rails'
|
11
11
|
require 'capybara/poltergeist'
|
12
12
|
Capybara.javascript_driver = :poltergeist
|
13
|
-
Capybara.
|
13
|
+
Capybara.default_max_wait_time = 5
|
14
14
|
|
15
15
|
if ENV["COVERAGE"] or ENV["CI"]
|
16
16
|
require 'simplecov'
|
@@ -57,4 +57,4 @@ RSpec.configure do |config|
|
|
57
57
|
config.include Warden::Test::Helpers, type: :feature
|
58
58
|
config.after(:each, type: :feature) { Warden.test_reset! }
|
59
59
|
config.include Capybara::DSL
|
60
|
-
end
|
60
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spotlight-dor-resources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Beer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|
@@ -321,7 +321,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
321
321
|
version: '0'
|
322
322
|
requirements: []
|
323
323
|
rubyforge_project:
|
324
|
-
rubygems_version: 2.4.
|
324
|
+
rubygems_version: 2.4.6
|
325
325
|
signing_key:
|
326
326
|
specification_version: 4
|
327
327
|
summary: Spotlight resource indexer for DOR resources.
|