spotlight-dor-resources 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8b1e1f0ddb963f81f8fd302a284ef53592146ef9
4
- data.tar.gz: 03687ee4c19706fa434993ce33ad59beac3b522a
3
+ metadata.gz: c0c448f6276a5bbd515fecaaf0e0d836e533f246
4
+ data.tar.gz: 13048c1c02a606c0c4d30e07768c79bcd6385834
5
5
  SHA512:
6
- metadata.gz: 01970798dc91a7dd7b3f451244e71573d6c1129eb2c59a7f7758b3fdfb4d7d70c3d5bf8e3d99d1a7300b404dbc7142869898b1792729f8ab1099beb1f7113cd1
7
- data.tar.gz: 121941bdd33d142d20f27bcacd25d112327324d5e1f84e0f190896bb9fa0c932562cdaeea2a29dae846d84b940085e3bcdd3b92a4f832854e74c24f46269621d
6
+ metadata.gz: 326665ea9d207234a6fa97b8c3993a5cff159a154d78e61d813de6352b39ece8db5d8364a331830fb0857979e97b12299eded8b77d0b9d513e0155e02a1fab71
7
+ data.tar.gz: 7a9ade62a20f3c233864fcbccf1454ad220f98297a3556a42550e6e2c30fc887a4040cac6346d4af6ee3ce716a8fd8edad5314b31aaca399f3d9e176acbe3682
data/.gitignore CHANGED
@@ -20,5 +20,7 @@ pickle-email-*.html
20
20
  .project
21
21
  config/initializers/secret_token.rb
22
22
  spec/internal
23
+ .internal_test_app
23
24
  Gemfile.lock
24
- jetty
25
+ jetty
26
+ ./spec/internal
data/.rubocop.yml CHANGED
@@ -8,7 +8,7 @@ AllCops:
8
8
  - 'bin/**/*'
9
9
  - 'db/**/*'
10
10
  - 'config/**/*'
11
- - 'spec/internal/**/*'
11
+ - '.internal_test_app/**/*'
12
12
  - 'spec/test_app_templates/**/*'
13
13
  - 'spec/spec_helper.rb'
14
14
  - 'spec/teaspoon_env.rb'
data/Gemfile CHANGED
@@ -12,21 +12,28 @@ group :test do
12
12
  gem 'pry-byebug', require: false
13
13
  end
14
14
 
15
- # the below comes from engine_cart, a gem used to test this Rails engine gem in the context of a Rails app
16
- file = File.expand_path("Gemfile", ENV['ENGINE_CART_DESTINATION'] || ENV['RAILS_ROOT'] || File.expand_path("../spec/internal", __FILE__))
15
+ # BEGIN ENGINE_CART BLOCK
16
+ # engine_cart: 0.8.0
17
+ # engine_cart stanza: 0.8.0
18
+ # the below comes from engine_cart, a gem used to test this Rails engine gem in the context of a Rails app.
19
+ file = File.expand_path("Gemfile", ENV['ENGINE_CART_DESTINATION'] || ENV['RAILS_ROOT'] || File.expand_path(".internal_test_app", File.dirname(__FILE__)))
17
20
  if File.exist?(file)
18
- puts "Loading #{file} ..." if $DEBUG # `ruby -d` or `bundle -v`
19
- instance_eval File.read(file)
21
+ begin
22
+ eval_gemfile file
23
+ rescue Bundler::GemfileError => e
24
+ Bundler.ui.warn '[EngineCart] Skipping Rails application dependencies:'
25
+ Bundler.ui.warn e.message
26
+ end
20
27
  else
21
- gem 'rails', ENV['RAILS_VERSION']
28
+ Bundler.ui.warn "[EngineCart] Unable to find test application dependencies in #{file}, using placeholder dependencies"
29
+
30
+ gem 'rails', ENV['RAILS_VERSION'] if ENV['RAILS_VERSION']
22
31
 
23
- # explicitly include sass-rails to get compatible sprocket dependencies
24
32
  if ENV['RAILS_VERSION'].nil? || ENV['RAILS_VERSION'] =~ /^4.2/
25
- gem 'coffee-rails', '~> 4.1.0'
26
- gem 'sass-rails', '~> 5.0'
27
33
  gem 'responders', "~> 2.0"
34
+ gem 'sass-rails', ">= 5.0"
28
35
  else
29
36
  gem 'sass-rails', "< 5.0"
30
- gem 'coffee-rails', "~> 4.0.0"
31
37
  end
32
38
  end
39
+ # END ENGINE_CART BLOCK
data/README.md CHANGED
@@ -24,7 +24,7 @@ Within a Spotlight application with `spotlight-dor-resources` installed, you cou
24
24
 
25
25
  Note that Spotlight:
26
26
 
27
- * is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at spec/internal.
27
+ * is a Rails engine and needs to be used in the context of a Rails application. We use [engine_cart](https://github.com/cbeer/engine_cart) to create an internal test application at .internal_test_app.
28
28
  * uses Solr as part of its integration tests. We use [jettywrapper](https://github.com/projecthydra/jettywrapper) to manage the Solr instance used for development and test.
29
29
 
30
30
  Our `$ rake ci` task utilizes Solr and the testing rails app, with Spotlight installed, automatically.
@@ -36,16 +36,16 @@ For access to a Rails console with the gem loaded up for testing purposes, you c
36
36
  $ bundle
37
37
  $ bundle exec rake ci
38
38
 
39
- This will download a test jetty instance (to run Solr), generate a testing app at ```spec/internal``` and run the tests.
39
+ This will download a test jetty instance (to run Solr), generate a testing app at ```.internal_test_app``` and run the tests.
40
40
 
41
41
  ### Indexing with the generated test app
42
42
 
43
- $ cd spec/internal
44
-
45
- $ bundle exec rails console
43
+ $ bundle exec rake engine_cart:console
46
44
 
47
45
  ```ruby
48
- purl=Spotlight::Resources::Purl.new({:url=>'https://purl.stanford.edu/cx709ty7769'})
46
+ druid="ty202yt2402" # a feigenbaum druid
47
+ druid="cx709ty7769" # a revs druid
48
+ purl=Spotlight::Resources::Purl.new({:url=>"https://purl.stanford.edu/#{druid}"})
49
49
  puts purl.to_solr.first # the solr document as a hash
50
50
  public_xml=purl.resource.public_xml # nokogiri doc with all public XML
51
51
  mods=purl.resource.mods # nokogiri doc with just the MODs
@@ -182,6 +182,52 @@ module Spotlight::Dor
182
182
  end
183
183
  end
184
184
 
185
+ concerning :FullTextIndexing do
186
+ included do
187
+ before_index :add_object_full_text
188
+ end
189
+
190
+ # search for configured full text files, and if found, add them to the full text (whole document) solr field
191
+ def add_object_full_text(sdb, solr_doc)
192
+ object_level_full_text_urls(sdb).each do |file_url|
193
+ # append content from each file to the field, creating field if it doesn't exist yet
194
+ # ruby note: the construct below allows us to append a string to a variable that starts out as nil
195
+ # because .to_s of nil is empty string
196
+ solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
197
+ end
198
+ end
199
+
200
+ # fetch the file at the supplied url, encode its content as UTF-8 and return it
201
+ # TODO: this should also be able to deal with .rtf and .xml files
202
+ def get_file_content(file_url)
203
+ response = Net::HTTP.get_response(URI.parse(file_url))
204
+ response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
205
+ rescue
206
+ logger.warn("Error indexing full text - couldn't load file #{file_url}")
207
+ nil
208
+ end
209
+
210
+ # these are the file locations where full text files can be found at the object level
211
+ # this method returns an array of fully qualified public URLs that can be accessed to find full text content
212
+ def object_level_full_text_urls(sdb)
213
+ files = []
214
+ object_level_full_text_filenames(sdb).each do |xpath_location|
215
+ files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
216
+ "#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.druid}/#{txt_file['id']}"
217
+ end
218
+ end
219
+ files
220
+ end
221
+
222
+ # xpaths to locations in the contentMetadata where full text object level files can be found,
223
+ # add as many as you need, all will be searched
224
+ def object_level_full_text_filenames(sdb)
225
+ [
226
+ "//contentMetadata/resource/file[@id=\"#{sdb.druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
227
+ ]
228
+ end
229
+ end
230
+
185
231
  concerning :CartographicIndexing do
186
232
  included do
187
233
  before_index :mods_cartographics_indexing
@@ -8,7 +8,9 @@ module Spotlight::Dor::Resources
8
8
  Spotlight::Engine.config.resource_providers << Spotlight::Resources::Searchworks
9
9
  Spotlight::Engine.config.resource_providers << Spotlight::Resources::Purl
10
10
  Spotlight::Dor::Resources::Engine.config.parallel_options = { in_threads: 1 }
11
- Spotlight::Dor::Resources::Engine.config.stacks_iiif_url = 'https://stacks.stanford.edu/image/iiif'
11
+ Spotlight::Dor::Resources::Engine.config.base_stacks_url = 'https://stacks.stanford.edu'
12
+ Spotlight::Dor::Resources::Engine.config.stacks_file_url = "#{config.base_stacks_url}/file"
13
+ Spotlight::Dor::Resources::Engine.config.stacks_iiif_url = "#{config.base_stacks_url}/image/iiif"
12
14
  end
13
15
  end
14
16
  end
@@ -2,7 +2,7 @@ module Spotlight
2
2
  module Dor
3
3
  # :nodoc:
4
4
  module Resources
5
- VERSION = '0.0.4'
5
+ VERSION = '0.0.5'
6
6
  end
7
7
  end
8
8
  end
@@ -498,4 +498,75 @@ describe Spotlight::Dor::Indexer do
498
498
  end # each
499
499
  end # add_folder_name
500
500
  # rubocop:enable Metrics/LineLength
501
+
502
+ describe '#add_object_full_text' do
503
+ let!(:expected_text) { 'SOME full text string that is returned from the server' }
504
+ let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
505
+ it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
506
+ public_xml_with_feigenbaum_full_text = Nokogiri::XML <<-EOF
507
+ <publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
508
+ <contentMetadata objectId="oo000oo0000" type="book">
509
+ <resource id="oo000oo0000_4" sequence="4" type="object">
510
+ <label>Document</label>
511
+ <file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
512
+ <file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
513
+ </resource>
514
+ <resource id="oo000oo0000_5" sequence="5" type="page">
515
+ <label>Page 1</label>
516
+ <file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
517
+ </resource>
518
+ </contentMetadata>
519
+ </publicObject>
520
+ EOF
521
+ allow(sdb).to receive(:public_xml).and_return(public_xml_with_feigenbaum_full_text)
522
+ # don't actually attempt a call to the stacks
523
+ allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
524
+ subject.send(:add_object_full_text, sdb, solr_doc)
525
+ expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
526
+ expect(solr_doc['full_text_tesim']).to eq expected_text
527
+ end
528
+ it 'does not index the full text if no recognized pattern is found' do
529
+ public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
530
+ <publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
531
+ <contentMetadata objectId="oo000oo0000" type="book">
532
+ <resource id="oo000oo0000_4" sequence="4" type="object">
533
+ <label>Document</label>
534
+ <file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
535
+ </resource>
536
+ <resource id="oo000oo0000_5" sequence="5" type="page">
537
+ <label>Page 1</label>
538
+ <file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
539
+ </resource>
540
+ </contentMetadata>
541
+ </publicObject>
542
+ EOF
543
+ allow(sdb).to receive(:public_xml).and_return(public_xml_with_no_recognized_full_text)
544
+ subject.send(:add_object_full_text, sdb, solr_doc)
545
+ expect(subject.object_level_full_text_urls(sdb)).to eq []
546
+ expect(solr_doc['full_text_tesim']).to be_nil
547
+ end
548
+ it 'indexes the full text from two files if two recognized patterns are found' do
549
+ public_xml_with_two_recognized_full_text_files = Nokogiri::XML <<-EOF
550
+ <publicObject id="druid:oo000oo0000" published="2015-10-17T18:24:08-07:00">
551
+ <contentMetadata objectId="oo000oo0000" type="book">
552
+ <resource id="oo000oo0000_4" sequence="4" type="object">
553
+ <label>Document</label>
554
+ <file id="oo000oo0000.pdf" mimetype="application/pdf" size="6801421"></file>
555
+ <file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
556
+ </resource>
557
+ <resource id="oo000oo0000_5" sequence="5" type="page">
558
+ <label>Page 1</label>
559
+ <file id="oo000oo0000_00001.jp2" mimetype="image/jp2" size="1864266"><imageData width="2632" height="3422"/></file>
560
+ <file id="oo000oo0000.txt" mimetype="text/plain" size="23376"></file>
561
+ </resource>
562
+ </contentMetadata>
563
+ </publicObject>
564
+ EOF
565
+ allow(sdb).to receive(:public_xml).and_return(public_xml_with_two_recognized_full_text_files)
566
+ allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
567
+ subject.send(:add_object_full_text, sdb, solr_doc)
568
+ expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
569
+ expect(solr_doc['full_text_tesim']).to eq(expected_text + expected_text) # same file twice
570
+ end
571
+ end # add_object_full_text
501
572
  end
data/spec/spec_helper.rb CHANGED
@@ -10,7 +10,7 @@ EngineCart.load_application!
10
10
  require 'rspec/rails'
11
11
  require 'capybara/poltergeist'
12
12
  Capybara.javascript_driver = :poltergeist
13
- Capybara.default_wait_time = 5
13
+ Capybara.default_max_wait_time = 5
14
14
 
15
15
  if ENV["COVERAGE"] or ENV["CI"]
16
16
  require 'simplecov'
@@ -57,4 +57,4 @@ RSpec.configure do |config|
57
57
  config.include Warden::Test::Helpers, type: :feature
58
58
  config.after(:each, type: :feature) { Warden.test_reset! }
59
59
  config.include Capybara::DSL
60
- end
60
+ end
@@ -1,6 +1,6 @@
1
1
  harvestdor:
2
2
  # log_dir: directory for log file (default logs, relative to harvestdor gem path)
3
- log_dir: spec/internal/log/gdor_logs
3
+ log_dir: log
4
4
 
5
5
  # log_name: name of log file (default: harvestdor.log)
6
6
  log_name: gdor.log
@@ -1,7 +1,7 @@
1
1
  require 'rails/generators'
2
2
 
3
3
  class TestAppGenerator < Rails::Generators::Base
4
- source_root "../../spec/test_app_templates"
4
+ source_root "../spec/test_app_templates"
5
5
 
6
6
  def add_gems
7
7
  gem 'blacklight', '~> 5.1'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spotlight-dor-resources
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Beer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-30 00:00:00.000000000 Z
11
+ date: 2015-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: faraday
@@ -321,7 +321,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
321
321
  version: '0'
322
322
  requirements: []
323
323
  rubyforge_project:
324
- rubygems_version: 2.4.8
324
+ rubygems_version: 2.4.6
325
325
  signing_key:
326
326
  specification_version: 4
327
327
  summary: Spotlight resource indexer for DOR resources.