spotlight-dor-resources 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8b1e1f0ddb963f81f8fd302a284ef53592146ef9
4
- data.tar.gz: 03687ee4c19706fa434993ce33ad59beac3b522a
3
+ metadata.gz: c0c448f6276a5bbd515fecaaf0e0d836e533f246
4
+ data.tar.gz: 13048c1c02a606c0c4d30e07768c79bcd6385834
5
5
  SHA512:
6
- metadata.gz: 01970798dc91a7dd7b3f451244e71573d6c1129eb2c59a7f7758b3fdfb4d7d70c3d5bf8e3d99d1a7300b404dbc7142869898b1792729f8ab1099beb1f7113cd1
7
- data.tar.gz: 121941bdd33d142d20f27bcacd25d112327324d5e1f84e0f190896bb9fa0c932562cdaeea2a29dae846d84b940085e3bcdd3b92a4f832854e74c24f46269621d
6
+ metadata.gz: 326665ea9d207234a6fa97b8c3993a5cff159a154d78e61d813de6352b39ece8db5d8364a331830fb0857979e97b12299eded8b77d0b9d513e0155e02a1fab71
7
+ data.tar.gz: 7a9ade62a20f3c233864fcbccf1454ad220f98297a3556a42550e6e2c30fc887a4040cac6346d4af6ee3ce716a8fd8edad5314b31aaca399f3d9e176acbe3682
data/.gitignore CHANGED
@@ -20,5 +20,7 @@ pickle-email-*.html
20
20
  .project
21
21
  config/initializers/secret_token.rb
22
22
  spec/internal
23
+ .internal_test_app
23
24
  Gemfile.lock
24
- jetty
25
+ jetty
26
+ ./spec/internal
data/.rubocop.yml CHANGED
@@ -8,7 +8,7 @@ AllCops:
8
8
  - 'bin/**/*'
9
9
  - 'db/**/*'
10
10
  - 'config/**/*'
11
- - 'spec/internal/**/*'
11
+ - '.internal_test_app/**/*'
12
12
  - 'spec/test_app_templates/**/*'
13
13
  - 'spec/spec_helper.rb'
14
14
  - 'spec/teaspoon_env.rb'
data/Gemfile CHANGED
@@ -12,21 +12,28 @@ group :test do
12
12
  gem 'pry-byebug', require: false
13
13
  end
14
14
 
15
- # the below comes from engine_cart, a gem used to test this Rails engine gem in the context of a Rails app
16
- file = File.expand_path("Gemfile", ENV['ENGINE_CART_DESTINATION'] || ENV['RAILS_ROOT'] || File.expand_path("../spec/internal", __FILE__))
15
+ # BEGIN ENGINE_CART BLOCK
16
+ # engine_cart: 0.8.0
17
+ # engine_cart stanza: 0.8.0
18
+ # the below comes from engine_cart, a gem used to test this Rails engine gem in the context of a Rails app.
19
+ file = File.expand_path("Gemfile", ENV['ENGINE_CART_DESTINATION'] || ENV['RAILS_ROOT'] || File.expand_path(".internal_test_app", File.dirname(__FILE__)))
17
20
  if File.exist?(file)
18
- puts "Loading #{file} ..." if $DEBUG # `ruby -d` or `bundle -v`
19
- instance_eval File.read(file)
21
+ begin
22
+ eval_gemfile file
23
+ rescue Bundler::GemfileError => e
24
+ Bundler.ui.warn '[EngineCart] Skipping Rails application dependencies:'
25
+ Bundler.ui.warn e.message
26
+ end
20
27
  else
21
- gem 'rails', ENV['RAILS_VERSION']
28
+ Bundler.ui.warn "[EngineCart] Unable to find test application dependencies in #{file}, using placeholder dependencies"
29
+
30
+ gem 'rails', ENV['RAILS_VERSION'] if ENV['RAILS_VERSION']
22
31
 
23
- # explicitly include sass-rails to get compatible sprocket dependencies
24
32
  if ENV['RAILS_VERSION'].nil? || ENV['RAILS_VERSION'] =~ /^4.2/
25
- gem 'coffee-rails', '~> 4.1.0'
26
- gem 'sass-rails', '~> 5.0'
27
33
  gem 'responders', "~> 2.0"
34
+ gem 'sass-rails', ">= 5.0"
28
35
  else
29
36
  gem 'sass-rails', "< 5.0"
30
- gem 'coffee-rails', "~> 4.0.0"
31
37
  end
32
38
  end
39
+ # END ENGINE_CART BLOCK
data/README.md CHANGED
@@ -24,7 +24,7 @@ Within a Spotlight application with `spotlight-dor-resources` installed, you cou
24
24
 
25
25
  Note that Spotlight:
26
26
 
27
- * is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at spec/internal.
27
+ * is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at .internal_test_app.
28
28
  * uses Solr as part of its integration tests. We use [jettywrapper](https://github.com/projecthydra/jettywrapper) to manage the Solr instance used for development and test.
29
29
 
30
30
  Our `$ rake ci` task utilizes Solr and the testing rails app, with Spotlight installed, automatically.
@@ -36,16 +36,16 @@ For access to a Rails console with the gem loaded up for testing purposes, you c
36
36
  $ bundle
37
37
  $ bundle exec rake ci
38
38
 
39
- This will download a test jetty instance (to run Solr), generate a testing app at ```spec/internal``` and run the tests.
39
+ This will download a test jetty instance (to run Solr), generate a testing app at ```.internal_test_app``` and run the tests.
40
40
 
41
41
  ### Indexing with the generated test app
42
42
 
43
- $ cd spec/internal
44
-
45
- $ bundle exec rails console
43
+ $ bundle exec rake engine_cart:console
46
44
 
47
45
  ```ruby
48
- purl=Spotlight::Resources::Purl.new({:url=>'https://purl.stanford.edu/cx709ty7769'})
46
+ druid="ty202yt2402" # a feigenbaum druid
47
+ druid="cx709ty7769" # a revs druid
48
+ purl=Spotlight::Resources::Purl.new({:url=>"https://purl.stanford.edu/#{druid}"})
49
49
  puts purl.to_solr.first # the solr document as a hash
50
50
  public_xml=purl.resource.public_xml # nokogiri doc with all public XML
51
51
  mods=purl.resource.mods # nokogiri doc with just the MODs
@@ -182,6 +182,52 @@ module Spotlight::Dor
182
182
  end
183
183
  end
184
184
 
185
+ concerning :FullTextIndexing do
186
+ included do
187
+ before_index :add_object_full_text
188
+ end
189
+
190
+ # search for configured full text files, and if found, add them to the full text (whole document) solr field
191
+ def add_object_full_text(sdb, solr_doc)
192
+ object_level_full_text_urls(sdb).each do |file_url|
193
+ # append content from each file to the field, creating field if it doesn't exist yet
194
+ # ruby note: the construct below allows us to append a string to a variable that starts out as nil
195
+ # because .to_s of nil is empty string
196
+ solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
197
+ end
198
+ end
199
+
200
+ # go grab the supplied file url, grab the file, encode and return
201
+ # TODO: these should also be able to deal with .rtf and .xml files
202
+ def get_file_content(file_url)
203
+ response = Net::HTTP.get_response(URI.parse(file_url))
204
+ response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
205
+ rescue
206
+ logger.warn("Error indexing full text - couldn't load file #{file_url}")
207
+ nil
208
+ end
209
+
210
+ # these are the file locations where full txt files can be found at the object level
211
+ # this method returns an array of fully qualified public URLs that can be accessed to find full text content
212
+ def object_level_full_text_urls(sdb)
213
+ files = []
214
+ object_level_full_text_filenames(sdb).each do |xpath_location|
215
+ files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
216
+ "#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.druid}/#{txt_file['id']}"
217
+ end
218
+ end
219
+ files
220
+ end
221
+
222
+ # xpaths to locations in the contentMetadata where full text object level files can be found,
223
+ # add as many as you need, all will be searched
224
+ def object_level_full_text_filenames(sdb)
225
+ [
226
+ "//contentMetadata/resource/file[@id=\"#{sdb.druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
227
+ ]
228
+ end
229
+ end
230
+
185
231
  concerning :CartographicIndexing do
186
232
  included do
187
233
  before_index :mods_cartographics_indexing
@@ -8,7 +8,9 @@ module Spotlight::Dor::Resources
8
8
  Spotlight::Engine.config.resource_providers << Spotlight::Resources::Searchworks
9
9
  Spotlight::Engine.config.resource_providers << Spotlight::Resources::Purl
10
10
  Spotlight::Dor::Resources::Engine.config.parallel_options = { in_threads: 1 }
11
- Spotlight::Dor::Resources::Engine.config.stacks_iiif_url = 'https://stacks.stanford.edu/image/iiif'
11
+ Spotlight::Dor::Resources::Engine.config.base_stacks_url = 'https://stacks.stanford.edu'
12
+ Spotlight::Dor::Resources::Engine.config.stacks_file_url = "#{config.base_stacks_url}/file"
13
+ Spotlight::Dor::Resources::Engine.config.stacks_iiif_url = "#{config.base_stacks_url}/image/iiif"
12
14
  end
13
15
  end
14
16
  end
@@ -2,7 +2,7 @@ module Spotlight
2
2
  module Dor
3
3
  # :nodoc:
4
4
  module Resources
5
- VERSION = '0.0.4'
5
+ VERSION = '0.0.5'
6
6
  end
7
7
  end
8
8
  end
@@ -498,4 +498,75 @@ describe Spotlight::Dor::Indexer do
498
498
  end # each
499
499
  end # add_folder_name
500
500
  # rubocop:enable Metrics/LineLength
501
+
502
+ describe '#add_object_full_text' do
503
+ let!(:expected_text) { 'SOME full text string that is returned from the server' }
504
+ let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
505
+ it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
506
+ public_xml_with_feigenbaum_full_text = Nokogiri::XML <<-EOF
507
+ <publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
508
+ <contentMetadata objectId="oo000oo0000" type="book">
509
+ <resource id="oo000oo0000_4" sequence="4" type="object">
510
+ <label>Document</label>
511
+ <file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
512
+ <file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
513
+ </resource>
514
+ <resource id="oo000oo0000_5" sequence="5" type="page">
515
+ <label>Page 1</label>
516
+ <file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
517
+ </resource>
518
+ </contentMetadata>
519
+ </publicObject>
520
+ EOF
521
+ allow(sdb).to receive(:public_xml).and_return(public_xml_with_feigenbaum_full_text)
522
+ # don't actually attempt a call to the stacks
523
+ allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
524
+ subject.send(:add_object_full_text, sdb, solr_doc)
525
+ expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
526
+ expect(solr_doc['full_text_tesim']).to eq expected_text
527
+ end
528
+ it 'does not index the full text if no recognized pattern is found' do
529
+ public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
530
+ <publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
531
+ <contentMetadata objectId="oo000oo0000" type="book">
532
+ <resource id="oo000oo0000_4" sequence="4" type="object">
533
+ <label>Document</label>
534
+ <file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
535
+ </resource>
536
+ <resource id="oo000oo0000_5" sequence="5" type="page">
537
+ <label>Page 1</label>
538
+ <file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
539
+ </resource>
540
+ </contentMetadata>
541
+ </publicObject>
542
+ EOF
543
+ allow(sdb).to receive(:public_xml).and_return(public_xml_with_no_recognized_full_text)
544
+ subject.send(:add_object_full_text, sdb, solr_doc)
545
+ expect(subject.object_level_full_text_urls(sdb)).to eq []
546
+ expect(solr_doc['full_text_tesim']).to be_nil
547
+ end
548
+ it 'indexes the full text from two files if two recognized patterns are found' do
549
+ public_xml_with_two_recognized_full_text_files = Nokogiri::XML <<-EOF
550
+ <publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
551
+ <contentMetadata objectId="oo000oo0000" type="book">
552
+ <resource id="oo000oo0000_4" sequence="4" type="object">
553
+ <label>Document</label>
554
+ <file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
555
+ <file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
556
+ </resource>
557
+ <resource id="oo000oo0000_5" sequence="5" type="page">
558
+ <label>Page 1</label>
559
+ <file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
560
+ <file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
561
+ </resource>
562
+ </contentMetadata>
563
+ </publicObject>
564
+ EOF
565
+ allow(sdb).to receive(:public_xml).and_return(public_xml_with_two_recognized_full_text_files)
566
+ allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
567
+ subject.send(:add_object_full_text, sdb, solr_doc)
568
+ expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
569
+ expect(solr_doc['full_text_tesim']).to eq(expected_text + expected_text) # same file twice
570
+ end
571
+ end # add_object_full_text
501
572
  end
data/spec/spec_helper.rb CHANGED
@@ -10,7 +10,7 @@ EngineCart.load_application!
10
10
  require 'rspec/rails'
11
11
  require 'capybara/poltergeist'
12
12
  Capybara.javascript_driver = :poltergeist
13
- Capybara.default_wait_time = 5
13
+ Capybara.default_max_wait_time = 5
14
14
 
15
15
  if ENV["COVERAGE"] or ENV["CI"]
16
16
  require 'simplecov'
@@ -57,4 +57,4 @@ RSpec.configure do |config|
57
57
  config.include Warden::Test::Helpers, type: :feature
58
58
  config.after(:each, type: :feature) { Warden.test_reset! }
59
59
  config.include Capybara::DSL
60
- end
60
+ end
@@ -1,6 +1,6 @@
1
1
  harvestdor:
2
2
  # log_dir: directory for log file (default logs, relative to harvestdor gem path)
3
- log_dir: spec/internal/log/gdor_logs
3
+ log_dir: log
4
4
 
5
5
  # log_name: name of log file (default: harvestdor.log)
6
6
  log_name: gdor.log
@@ -1,7 +1,7 @@
1
1
  require 'rails/generators'
2
2
 
3
3
  class TestAppGenerator < Rails::Generators::Base
4
- source_root "../../spec/test_app_templates"
4
+ source_root "../spec/test_app_templates"
5
5
 
6
6
  def add_gems
7
7
  gem 'blacklight', '~> 5.1'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spotlight-dor-resources
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Beer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-30 00:00:00.000000000 Z
11
+ date: 2015-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: faraday
@@ -321,7 +321,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
321
321
  version: '0'
322
322
  requirements: []
323
323
  rubyforge_project:
324
- rubygems_version: 2.4.8
324
+ rubygems_version: 2.4.6
325
325
  signing_key:
326
326
  specification_version: 4
327
327
  summary: Spotlight resource indexer for DOR resources.