spotlight-dor-resources 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.rubocop.yml +1 -1
- data/Gemfile +16 -9
- data/README.md +6 -6
- data/lib/spotlight/dor/indexer.rb +46 -0
- data/lib/spotlight/dor/resources/engine.rb +3 -1
- data/lib/spotlight/dor/resources/version.rb +1 -1
- data/spec/lib/spotlight/dor/indexer_spec.rb +71 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/test_app_templates/gdor.yml +1 -1
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c0c448f6276a5bbd515fecaaf0e0d836e533f246
|
4
|
+
data.tar.gz: 13048c1c02a606c0c4d30e07768c79bcd6385834
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 326665ea9d207234a6fa97b8c3993a5cff159a154d78e61d813de6352b39ece8db5d8364a331830fb0857979e97b12299eded8b77d0b9d513e0155e02a1fab71
|
7
|
+
data.tar.gz: 7a9ade62a20f3c233864fcbccf1454ad220f98297a3556a42550e6e2c30fc887a4040cac6346d4af6ee3ce716a8fd8edad5314b31aaca399f3d9e176acbe3682
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/Gemfile
CHANGED
@@ -12,21 +12,28 @@ group :test do
|
|
12
12
|
gem 'pry-byebug', require: false
|
13
13
|
end
|
14
14
|
|
15
|
-
#
|
16
|
-
|
15
|
+
# BEGIN ENGINE_CART BLOCK
|
16
|
+
# engine_cart: 0.8.0
|
17
|
+
# engine_cart stanza: 0.8.0
|
18
|
+
# the below comes from engine_cart, a gem used to test this Rails engine gem in the context of a Rails app.
|
19
|
+
file = File.expand_path("Gemfile", ENV['ENGINE_CART_DESTINATION'] || ENV['RAILS_ROOT'] || File.expand_path(".internal_test_app", File.dirname(__FILE__)))
|
17
20
|
if File.exist?(file)
|
18
|
-
|
19
|
-
|
21
|
+
begin
|
22
|
+
eval_gemfile file
|
23
|
+
rescue Bundler::GemfileError => e
|
24
|
+
Bundler.ui.warn '[EngineCart] Skipping Rails application dependencies:'
|
25
|
+
Bundler.ui.warn e.message
|
26
|
+
end
|
20
27
|
else
|
21
|
-
|
28
|
+
Bundler.ui.warn "[EngineCart] Unable to find test application dependencies in #{file}, using placeholder dependencies"
|
29
|
+
|
30
|
+
gem 'rails', ENV['RAILS_VERSION'] if ENV['RAILS_VERSION']
|
22
31
|
|
23
|
-
# explicitly include sass-rails to get compatible sprocket dependencies
|
24
32
|
if ENV['RAILS_VERSION'].nil? || ENV['RAILS_VERSION'] =~ /^4.2/
|
25
|
-
gem 'coffee-rails', '~> 4.1.0'
|
26
|
-
gem 'sass-rails', '~> 5.0'
|
27
33
|
gem 'responders', "~> 2.0"
|
34
|
+
gem 'sass-rails', ">= 5.0"
|
28
35
|
else
|
29
36
|
gem 'sass-rails', "< 5.0"
|
30
|
-
gem 'coffee-rails', "~> 4.0.0"
|
31
37
|
end
|
32
38
|
end
|
39
|
+
# END ENGINE_CART BLOCK
|
data/README.md
CHANGED
@@ -24,7 +24,7 @@ Within a Spotlight application with `spotlight-dor-resources` installed, you cou
|
|
24
24
|
|
25
25
|
Note that Spotlight:
|
26
26
|
|
27
|
-
* is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at
|
27
|
+
* is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at .internal_test_app.
|
28
28
|
* uses Solr as part of its integration tests. We use [jettywrapper](https://github.com/projecthydra/jettywrapper) to manage the Solr instance used for development and test.
|
29
29
|
|
30
30
|
Our `$ rake ci` task utilizes Solr and the testing rails app, with Spotlight installed, automatically.
|
@@ -36,16 +36,16 @@ For access to a Rails console with the gem loaded up for testing purposes, you c
|
|
36
36
|
$ bundle
|
37
37
|
$ bundle exec rake ci
|
38
38
|
|
39
|
-
This will download a test jetty instance (to run Solr), generate a testing app at ```
|
39
|
+
This will download a test jetty instance (to run Solr), generate a testing app at ```.internal_test_app``` and run the tests.
|
40
40
|
|
41
41
|
### Indexing with the generated test app
|
42
42
|
|
43
|
-
$
|
44
|
-
|
45
|
-
$ bundle exec rails console
|
43
|
+
$ bundle exec rake engine_cart:console
|
46
44
|
|
47
45
|
```ruby
|
48
|
-
|
46
|
+
druid="ty202yt2402" # a feigenbaum druid
|
47
|
+
druid="cx709ty7769" # a revs druid
|
48
|
+
purl=Spotlight::Resources::Purl.new({:url=>"https://purl.stanford.edu/#{druid}"})
|
49
49
|
puts purl.to_solr.first # the solr document as a hash
|
50
50
|
public_xml=purl.resource.public_xml # nokogiri doc with all public XML
|
51
51
|
mods=purl.resource.mods # nokogiri doc with just the MODs
|
@@ -182,6 +182,52 @@ module Spotlight::Dor
|
|
182
182
|
end
|
183
183
|
end
|
184
184
|
|
185
|
+
concerning :FullTextIndexing do
|
186
|
+
included do
|
187
|
+
before_index :add_object_full_text
|
188
|
+
end
|
189
|
+
|
190
|
+
# search for configured full text files, and if found, add them to the full text (whole document) solr field
|
191
|
+
def add_object_full_text(sdb, solr_doc)
|
192
|
+
object_level_full_text_urls(sdb).each do |file_url|
|
193
|
+
# append content from each file to the field, creating field if it doesn't exist yet
|
194
|
+
# ruby note: the construct below allows us to append a string to a variable that starts out as nil
|
195
|
+
# because .to_s of nil is empty string
|
196
|
+
solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
# fetch the file at the supplied url and return its body scrubbed and encoded as UTF-8 (logs a warning and returns nil if the fetch fails)
|
201
|
+
# TODO: this should also be able to deal with .rtf and .xml files
|
202
|
+
def get_file_content(file_url)
|
203
|
+
response = Net::HTTP.get_response(URI.parse(file_url))
|
204
|
+
response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
205
|
+
rescue
|
206
|
+
logger.warn("Error indexing full text - couldn't load file #{file_url}")
|
207
|
+
nil
|
208
|
+
end
|
209
|
+
|
210
|
+
# these are the file locations where full text files can be found at the object level
|
211
|
+
# this method returns an array of fully qualified public URLs that can be accessed to find full text content
|
212
|
+
def object_level_full_text_urls(sdb)
|
213
|
+
files = []
|
214
|
+
object_level_full_text_filenames(sdb).each do |xpath_location|
|
215
|
+
files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
|
216
|
+
"#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.druid}/#{txt_file['id']}"
|
217
|
+
end
|
218
|
+
end
|
219
|
+
files
|
220
|
+
end
|
221
|
+
|
222
|
+
# xpaths to locations in the contentMetadata where full text object level files can be found,
|
223
|
+
# add as many as you need, all will be searched
|
224
|
+
def object_level_full_text_filenames(sdb)
|
225
|
+
[
|
226
|
+
"//contentMetadata/resource/file[@id=\"#{sdb.druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
|
227
|
+
]
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
185
231
|
concerning :CartographicIndexing do
|
186
232
|
included do
|
187
233
|
before_index :mods_cartographics_indexing
|
@@ -8,7 +8,9 @@ module Spotlight::Dor::Resources
|
|
8
8
|
Spotlight::Engine.config.resource_providers << Spotlight::Resources::Searchworks
|
9
9
|
Spotlight::Engine.config.resource_providers << Spotlight::Resources::Purl
|
10
10
|
Spotlight::Dor::Resources::Engine.config.parallel_options = { in_threads: 1 }
|
11
|
-
Spotlight::Dor::Resources::Engine.config.
|
11
|
+
Spotlight::Dor::Resources::Engine.config.base_stacks_url = 'https://stacks.stanford.edu'
|
12
|
+
Spotlight::Dor::Resources::Engine.config.stacks_file_url = "#{config.base_stacks_url}/file"
|
13
|
+
Spotlight::Dor::Resources::Engine.config.stacks_iiif_url = "#{config.base_stacks_url}/image/iiif"
|
12
14
|
end
|
13
15
|
end
|
14
16
|
end
|
@@ -498,4 +498,75 @@ describe Spotlight::Dor::Indexer do
|
|
498
498
|
end # each
|
499
499
|
end # add_folder_name
|
500
500
|
# rubocop:enable Metrics/LineLength
|
501
|
+
|
502
|
+
describe '#add_object_full_text' do
|
503
|
+
let!(:expected_text) { 'SOME full text string that is returned from the server' }
|
504
|
+
let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
|
505
|
+
it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
|
506
|
+
public_xml_with_feigenbaum_full_text = Nokogiri::XML <<-EOF
|
507
|
+
<publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
|
508
|
+
<contentMetadata objectId="oo000oo0000" type="book">
|
509
|
+
<resource id="oo000oo0000_4" sequence="4" type="object">
|
510
|
+
<label>Document</label>
|
511
|
+
<file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
|
512
|
+
<file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
|
513
|
+
</resource>
|
514
|
+
<resource id="oo000oo0000_5" sequence="5" type="page">
|
515
|
+
<label>Page 1</label>
|
516
|
+
<file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
|
517
|
+
</resource>
|
518
|
+
</contentMetadata>
|
519
|
+
</publicObject>
|
520
|
+
EOF
|
521
|
+
allow(sdb).to receive(:public_xml).and_return(public_xml_with_feigenbaum_full_text)
|
522
|
+
# don't actually attempt a call to the stacks
|
523
|
+
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
524
|
+
subject.send(:add_object_full_text, sdb, solr_doc)
|
525
|
+
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
|
526
|
+
expect(solr_doc['full_text_tesim']).to eq expected_text
|
527
|
+
end
|
528
|
+
it 'does not index the full text if no recognized pattern is found' do
|
529
|
+
public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
|
530
|
+
<publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
|
531
|
+
<contentMetadata objectId="oo000oo0000" type="book">
|
532
|
+
<resource id="oo000oo0000_4" sequence="4" type="object">
|
533
|
+
<label>Document</label>
|
534
|
+
<file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
|
535
|
+
</resource>
|
536
|
+
<resource id="oo000oo0000_5" sequence="5" type="page">
|
537
|
+
<label>Page 1</label>
|
538
|
+
<file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
|
539
|
+
</resource>
|
540
|
+
</contentMetadata>
|
541
|
+
</publicObject>
|
542
|
+
EOF
|
543
|
+
allow(sdb).to receive(:public_xml).and_return(public_xml_with_no_recognized_full_text)
|
544
|
+
subject.send(:add_object_full_text, sdb, solr_doc)
|
545
|
+
expect(subject.object_level_full_text_urls(sdb)).to eq []
|
546
|
+
expect(solr_doc['full_text_tesim']).to be_nil
|
547
|
+
end
|
548
|
+
it 'indexes the full text from two files if two recognized patterns are found' do
|
549
|
+
public_xml_with_two_recognized_full_text_files = Nokogiri::XML <<-EOF
|
550
|
+
<publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
|
551
|
+
<contentMetadata objectId="oo000oo0000" type="book">
|
552
|
+
<resource id="oo000oo0000_4" sequence="4" type="object">
|
553
|
+
<label>Document</label>
|
554
|
+
<file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
|
555
|
+
<file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
|
556
|
+
</resource>
|
557
|
+
<resource id="oo000oo0000_5" sequence="5" type="page">
|
558
|
+
<label>Page 1</label>
|
559
|
+
<file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
|
560
|
+
<file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
|
561
|
+
</resource>
|
562
|
+
</contentMetadata>
|
563
|
+
</publicObject>
|
564
|
+
EOF
|
565
|
+
allow(sdb).to receive(:public_xml).and_return(public_xml_with_two_recognized_full_text_files)
|
566
|
+
allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
|
567
|
+
subject.send(:add_object_full_text, sdb, solr_doc)
|
568
|
+
expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
|
569
|
+
expect(solr_doc['full_text_tesim']).to eq(expected_text + expected_text) # same file twice
|
570
|
+
end
|
571
|
+
end # add_object_full_text
|
501
572
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -10,7 +10,7 @@ EngineCart.load_application!
|
|
10
10
|
require 'rspec/rails'
|
11
11
|
require 'capybara/poltergeist'
|
12
12
|
Capybara.javascript_driver = :poltergeist
|
13
|
-
Capybara.
|
13
|
+
Capybara.default_max_wait_time = 5
|
14
14
|
|
15
15
|
if ENV["COVERAGE"] or ENV["CI"]
|
16
16
|
require 'simplecov'
|
@@ -57,4 +57,4 @@ RSpec.configure do |config|
|
|
57
57
|
config.include Warden::Test::Helpers, type: :feature
|
58
58
|
config.after(:each, type: :feature) { Warden.test_reset! }
|
59
59
|
config.include Capybara::DSL
|
60
|
-
end
|
60
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spotlight-dor-resources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Beer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|
@@ -321,7 +321,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
321
321
|
version: '0'
|
322
322
|
requirements: []
|
323
323
|
rubyforge_project:
|
324
|
-
rubygems_version: 2.4.
|
324
|
+
rubygems_version: 2.4.6
|
325
325
|
signing_key:
|
326
326
|
specification_version: 4
|
327
327
|
summary: Spotlight resource indexer for DOR resources.
|