spotlight-dor-resources 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c0c448f6276a5bbd515fecaaf0e0d836e533f246
4
- data.tar.gz: 13048c1c02a606c0c4d30e07768c79bcd6385834
3
+ metadata.gz: c130b41c0a1a72df4461aeb827bc66f7959244f6
4
+ data.tar.gz: 2c91800eb334e092f90e5cb290fa69ace4163b0e
5
5
  SHA512:
6
- metadata.gz: 326665ea9d207234a6fa97b8c3993a5cff159a154d78e61d813de6352b39ece8db5d8364a331830fb0857979e97b12299eded8b77d0b9d513e0155e02a1fab71
7
- data.tar.gz: 7a9ade62a20f3c233864fcbccf1454ad220f98297a3556a42550e6e2c30fc887a4040cac6346d4af6ee3ce716a8fd8edad5314b31aaca399f3d9e176acbe3682
6
+ metadata.gz: e8ab91d1840f91f56a32a6eefe3d249fb7727fa985aad7324db746d805a3e3c7c2b02ca06e8ac1f49d5fad5ca97645ea107663c9a4cb984e14b51dc03d525050
7
+ data.tar.gz: 812b4522e38b815d846ce4a240a0792c64cd2b3139895a990d195eb938882828970c6c533ee61f35dc196e591798b8c06dd80466f7fe9a3ff6a34b382f7d73ed
data/README.md CHANGED
@@ -51,8 +51,12 @@ public_xml=purl.resource.public_xml # nokogiri doc with all public XML
51
51
  mods=purl.resource.mods # nokogiri doc with just the MODS
52
52
  smods_rec=purl.resource.smods_rec # a Stanford::Mods::Record Object (which inherits from the MODS gem)
53
53
 
54
+ puts purl.resource.bare_druid # will not have the "druid:" prefix
55
+ puts purl.resource.druid # might have the "druid:" prefix (if returned as the result of an is_collection call)
56
+
54
57
  purl.resource.indexer.config.solr.url # the configured solr server (should be localhost in development)
55
- purl.save # save and index to solr
58
+
59
+ purl.save # save to database and index to solr
56
60
  purl.reindex # reindex once saved the first time
57
61
  ```
58
62
 
@@ -1,7 +1,10 @@
1
1
  # rubocop:disable Metrics/ClassLength
2
+ # rubocop:disable Metrics/AbcSize
3
+
2
4
  # external gems
3
5
  require 'gdor/indexer'
4
6
  require 'solrizer'
7
+ require 'faraday'
5
8
 
6
9
  module Spotlight::Dor
7
10
  # Base class to harvest from DOR via harvestdor gem
@@ -125,7 +128,7 @@ module Spotlight::Dor
125
128
  add_thumbnail_fields(images.first, solr_doc) if images.first
126
129
 
127
130
  images.each do |image|
128
- add_image_fields(image, solr_doc)
131
+ add_image_fields(image, solr_doc, sdb.bare_druid)
129
132
  end
130
133
  end
131
134
 
@@ -140,9 +143,9 @@ module Spotlight::Dor
140
143
  Solrizer.insert_field(solr_doc, 'content_metadata_first_image_height', image_data['height'], :displayable)
141
144
  end
142
145
 
143
- def add_image_fields(node, solr_doc)
146
+ def add_image_fields(node, solr_doc, bare_druid)
144
147
  file_id = node.attr('id').gsub('.jp2', '')
145
- base_url = stacks_iiif_url(solr_doc[:id], file_id)
148
+ base_url = stacks_iiif_url(bare_druid, file_id)
146
149
 
147
150
  Solrizer.insert_field(solr_doc, 'content_metadata_image_iiif_info', "#{base_url}/info.json", :displayable)
148
151
  Solrizer.insert_field(solr_doc, 'thumbnail_square_url', "#{base_url}/square/100,100/0/default.jpg", :displayable)
@@ -151,8 +154,8 @@ module Spotlight::Dor
151
154
  Solrizer.insert_field(solr_doc, 'full_image_url', "#{base_url}/full/full/0/default.jpg", :displayable)
152
155
  end
153
156
 
154
- def stacks_iiif_url(druid, file_name)
155
- "#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{druid}%2F#{file_name}"
157
+ def stacks_iiif_url(bare_druid, file_name)
158
+ "#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{bare_druid}%2F#{file_name}"
156
159
  end
157
160
  end
158
161
 
@@ -189,19 +192,16 @@ module Spotlight::Dor
189
192
 
190
193
  # search for configured full text files, and if found, add them to the full text (whole document) solr field
191
194
  def add_object_full_text(sdb, solr_doc)
192
- object_level_full_text_urls(sdb).each do |file_url|
193
- # append content from each file to the field, creating field if it doesn't exist yet
194
- # ruby note: the construct below allows us to append a string to a variable that starts out as nil
195
- # because .to_s of nil is empty string
196
- solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
197
- end
195
+ full_text_urls = object_level_full_text_urls(sdb)
196
+ return if full_text_urls.size == 0
197
+ solr_doc['full_text_tesim'] = full_text_urls.map { |file_url| get_file_content(file_url) }
198
198
  end
199
199
 
200
200
  # fetch the file at the supplied url, scrub and encode its content as UTF-8, and return it
201
- # TODO: thse should also be able to also deal with .rtf and .xml files
201
+ # TODO: this should also be able to deal with .rtf and .xml files, scrubbing/converting as necessary to get plain text
202
202
  def get_file_content(file_url)
203
- response = Net::HTTP.get_response(URI.parse(file_url))
204
- response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
203
+ response = Faraday.get(file_url)
204
+ response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?').gsub(/\s+/, ' ')
205
205
  rescue
206
206
  logger.warn("Error indexing full text - couldn't load file #{file_url}")
207
207
  nil
@@ -213,7 +213,7 @@ module Spotlight::Dor
213
213
  files = []
214
214
  object_level_full_text_filenames(sdb).each do |xpath_location|
215
215
  files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
216
- "#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.druid}/#{txt_file['id']}"
216
+ "#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.bare_druid}/#{txt_file['id']}"
217
217
  end
218
218
  end
219
219
  files
@@ -223,7 +223,7 @@ module Spotlight::Dor
223
223
  # add as many as you need, all will be searched
224
224
  def object_level_full_text_filenames(sdb)
225
225
  [
226
- "//contentMetadata/resource/file[@id=\"#{sdb.druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
226
+ "//contentMetadata/resource/file[@id=\"#{sdb.bare_druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
227
227
  ]
228
228
  end
229
229
  end
@@ -2,7 +2,7 @@ module Spotlight
2
2
  module Dor
3
3
  # :nodoc:
4
4
  module Resources
5
- VERSION = '0.0.5'
5
+ VERSION = '0.0.6'
6
6
  end
7
7
  end
8
8
  end
@@ -50,6 +50,7 @@ describe Spotlight::Dor::Indexer do
50
50
  describe '#add_content_metadata_fields' do
51
51
  before do
52
52
  allow(r).to receive(:public_xml).and_return(public_xml)
53
+ allow(sdb).to receive(:bare_druid).and_return(fake_druid)
53
54
 
54
55
  # stacks url calculations require the druid
55
56
  solr_doc[:id] = fake_druid
@@ -500,6 +501,9 @@ describe Spotlight::Dor::Indexer do
500
501
  # rubocop:enable Metrics/LineLength
501
502
 
502
503
  describe '#add_object_full_text' do
504
+ before do
505
+ allow(sdb).to receive(:bare_druid).and_return(fake_druid)
506
+ end
503
507
  let!(:expected_text) { 'SOME full text string that is returned from the server' }
504
508
  let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
505
509
  it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
@@ -523,7 +527,7 @@ describe Spotlight::Dor::Indexer do
523
527
  allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
524
528
  subject.send(:add_object_full_text, sdb, solr_doc)
525
529
  expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
526
- expect(solr_doc['full_text_tesim']).to eq expected_text
530
+ expect(solr_doc['full_text_tesim']).to eq [expected_text]
527
531
  end
528
532
  it 'does not index the full text if no recognized pattern is found' do
529
533
  public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
@@ -566,7 +570,7 @@ describe Spotlight::Dor::Indexer do
566
570
  allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
567
571
  subject.send(:add_object_full_text, sdb, solr_doc)
568
572
  expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
569
- expect(solr_doc['full_text_tesim']).to eq(expected_text + expected_text) # same file twice
573
+ expect(solr_doc['full_text_tesim']).to eq [expected_text, expected_text] # same file twice in a 2 element array
570
574
  end
571
575
  end # add_object_full_text
572
576
  end
@@ -1,3 +1,8 @@
1
+ dor_fetcher:
2
+ service_url: http://dorfetcher-url
3
+ # if skip_heartbeat is set to true, this will skip the check that the dorfetcher service is alive before making API calls
4
+ skip_heartbeat: true
5
+
1
6
  harvestdor:
2
7
  # log_dir: directory for log file (default logs, relative to harvestdor gem path)
3
8
  log_dir: log
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spotlight-dor-resources
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Beer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-04 00:00:00.000000000 Z
11
+ date: 2015-11-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: faraday