spotlight-dor-resources 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c0c448f6276a5bbd515fecaaf0e0d836e533f246
4
- data.tar.gz: 13048c1c02a606c0c4d30e07768c79bcd6385834
3
+ metadata.gz: c130b41c0a1a72df4461aeb827bc66f7959244f6
4
+ data.tar.gz: 2c91800eb334e092f90e5cb290fa69ace4163b0e
5
5
  SHA512:
6
- metadata.gz: 326665ea9d207234a6fa97b8c3993a5cff159a154d78e61d813de6352b39ece8db5d8364a331830fb0857979e97b12299eded8b77d0b9d513e0155e02a1fab71
7
- data.tar.gz: 7a9ade62a20f3c233864fcbccf1454ad220f98297a3556a42550e6e2c30fc887a4040cac6346d4af6ee3ce716a8fd8edad5314b31aaca399f3d9e176acbe3682
6
+ metadata.gz: e8ab91d1840f91f56a32a6eefe3d249fb7727fa985aad7324db746d805a3e3c7c2b02ca06e8ac1f49d5fad5ca97645ea107663c9a4cb984e14b51dc03d525050
7
+ data.tar.gz: 812b4522e38b815d846ce4a240a0792c64cd2b3139895a990d195eb938882828970c6c533ee61f35dc196e591798b8c06dd80466f7fe9a3ff6a34b382f7d73ed
data/README.md CHANGED
@@ -51,8 +51,12 @@ public_xml=purl.resource.public_xml # nokogiri doc with all public XML
51
51
  mods=purl.resource.mods # nokogiri doc with just the MODs
52
52
  smods_rec=purl.resource.smods_rec # a Stanford::Mods::Record Object (which inherits from the MODS gem)
53
53
 
54
+ puts purl.resource.bare_druid # will not have the "druid:" prefix
55
+ puts purl.resource.druid # might have the "druid:" prefix (if returned as the result of an is_collection call)
56
+
54
57
  purl.resource.indexer.config.solr.url # the configured solr server (should be localhost in development)
55
- purl.save # save and index to solr
58
+
59
+ purl.save # save to database and index to solr
56
60
  purl.reindex # reindex once saved the first time
57
61
  ```
58
62
 
@@ -1,7 +1,10 @@
1
1
  # rubocop:disable Metrics/ClassLength
2
+ # rubocop:disable Metrics/AbcSize
3
+
2
4
  # external gems
3
5
  require 'gdor/indexer'
4
6
  require 'solrizer'
7
+ require 'faraday'
5
8
 
6
9
  module Spotlight::Dor
7
10
  # Base class to harvest from DOR via harvestdor gem
@@ -125,7 +128,7 @@ module Spotlight::Dor
125
128
  add_thumbnail_fields(images.first, solr_doc) if images.first
126
129
 
127
130
  images.each do |image|
128
- add_image_fields(image, solr_doc)
131
+ add_image_fields(image, solr_doc, sdb.bare_druid)
129
132
  end
130
133
  end
131
134
 
@@ -140,9 +143,9 @@ module Spotlight::Dor
140
143
  Solrizer.insert_field(solr_doc, 'content_metadata_first_image_height', image_data['height'], :displayable)
141
144
  end
142
145
 
143
- def add_image_fields(node, solr_doc)
146
+ def add_image_fields(node, solr_doc, bare_druid)
144
147
  file_id = node.attr('id').gsub('.jp2', '')
145
- base_url = stacks_iiif_url(solr_doc[:id], file_id)
148
+ base_url = stacks_iiif_url(bare_druid, file_id)
146
149
 
147
150
  Solrizer.insert_field(solr_doc, 'content_metadata_image_iiif_info', "#{base_url}/info.json", :displayable)
148
151
  Solrizer.insert_field(solr_doc, 'thumbnail_square_url', "#{base_url}/square/100,100/0/default.jpg", :displayable)
@@ -151,8 +154,8 @@ module Spotlight::Dor
151
154
  Solrizer.insert_field(solr_doc, 'full_image_url', "#{base_url}/full/full/0/default.jpg", :displayable)
152
155
  end
153
156
 
154
- def stacks_iiif_url(druid, file_name)
155
- "#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{druid}%2F#{file_name}"
157
+ def stacks_iiif_url(bare_druid, file_name)
158
+ "#{Spotlight::Dor::Resources::Engine.config.stacks_iiif_url}/#{bare_druid}%2F#{file_name}"
156
159
  end
157
160
  end
158
161
 
@@ -189,19 +192,16 @@ module Spotlight::Dor
189
192
 
190
193
  # search for configured full text files, and if found, add them to the full text (whole document) solr field
191
194
  def add_object_full_text(sdb, solr_doc)
192
- object_level_full_text_urls(sdb).each do |file_url|
193
- # append content from each file to the field, creating field if it doesn't exist yet
194
- # ruby note: the construct below allows us to append a string to a variable that starts out as nil
195
- # because .to_s of nil is empty string
196
- solr_doc['full_text_tesim'] = solr_doc['full_text_tesim'].to_s + get_file_content(file_url)
197
- end
195
+ full_text_urls = object_level_full_text_urls(sdb)
196
+ return if full_text_urls.size == 0
197
+ solr_doc['full_text_tesim'] = full_text_urls.map { |file_url| get_file_content(file_url) }
198
198
  end
199
199
 
200
200
  # go grab the supplied file url, grab the file, encode and return
201
- # TODO: thse should also be able to also deal with .rtf and .xml files
201
+ # TODO: this should also be able to deal with .rtf and .xml files, scrubbing/converting as necessary to get plain text
202
202
  def get_file_content(file_url)
203
- response = Net::HTTP.get_response(URI.parse(file_url))
204
- response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
203
+ response = Faraday.get(file_url)
204
+ response.body.scrub.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?').gsub(/\s+/, ' ')
205
205
  rescue
206
206
  logger.warn("Error indexing full text - couldn't load file #{file_url}")
207
207
  nil
@@ -213,7 +213,7 @@ module Spotlight::Dor
213
213
  files = []
214
214
  object_level_full_text_filenames(sdb).each do |xpath_location|
215
215
  files += sdb.public_xml.xpath(xpath_location).map do |txt_file|
216
- "#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.druid}/#{txt_file['id']}"
216
+ "#{Spotlight::Dor::Resources::Engine.config.stacks_file_url}/#{sdb.bare_druid}/#{txt_file['id']}"
217
217
  end
218
218
  end
219
219
  files
@@ -223,7 +223,7 @@ module Spotlight::Dor
223
223
  # add as many as you need, all will be searched
224
224
  def object_level_full_text_filenames(sdb)
225
225
  [
226
- "//contentMetadata/resource/file[@id=\"#{sdb.druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
226
+ "//contentMetadata/resource/file[@id=\"#{sdb.bare_druid}.txt\"]" # feigenbaum style - full text in .txt named for druid
227
227
  ]
228
228
  end
229
229
  end
@@ -2,7 +2,7 @@ module Spotlight
2
2
  module Dor
3
3
  # :nodoc:
4
4
  module Resources
5
- VERSION = '0.0.5'
5
+ VERSION = '0.0.6'
6
6
  end
7
7
  end
8
8
  end
@@ -50,6 +50,7 @@ describe Spotlight::Dor::Indexer do
50
50
  describe '#add_content_metadata_fields' do
51
51
  before do
52
52
  allow(r).to receive(:public_xml).and_return(public_xml)
53
+ allow(sdb).to receive(:bare_druid).and_return(fake_druid)
53
54
 
54
55
  # stacks url calculations require the druid
55
56
  solr_doc[:id] = fake_druid
@@ -500,6 +501,9 @@ describe Spotlight::Dor::Indexer do
500
501
  # rubocop:enable Metrics/LineLength
501
502
 
502
503
  describe '#add_object_full_text' do
504
+ before do
505
+ allow(sdb).to receive(:bare_druid).and_return(fake_druid)
506
+ end
503
507
  let!(:expected_text) { 'SOME full text string that is returned from the server' }
504
508
  let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }
505
509
  it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
@@ -523,7 +527,7 @@ describe Spotlight::Dor::Indexer do
523
527
  allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
524
528
  subject.send(:add_object_full_text, sdb, solr_doc)
525
529
  expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
526
- expect(solr_doc['full_text_tesim']).to eq expected_text
530
+ expect(solr_doc['full_text_tesim']).to eq [expected_text]
527
531
  end
528
532
  it 'does not index the full text if no recognized pattern is found' do
529
533
  public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
@@ -566,7 +570,7 @@ describe Spotlight::Dor::Indexer do
566
570
  allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
567
571
  subject.send(:add_object_full_text, sdb, solr_doc)
568
572
  expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
569
- expect(solr_doc['full_text_tesim']).to eq(expected_text + expected_text) # same file twice
573
+ expect(solr_doc['full_text_tesim']).to eq [expected_text, expected_text] # same file twice in a 2 element array
570
574
  end
571
575
  end # add_object_full_text
572
576
  end
@@ -1,3 +1,8 @@
1
+ dor_fetcher:
2
+ service_url: http://dorfetcher-url
3
+ # if skip_heartbeat set to true, this will skip a check that the dorfetcher service is alive before making API calls
4
+ skip_heartbeat: true
5
+
1
6
  harvestdor:
2
7
  # log_dir: directory for log file (default logs, relative to harvestdor gem path)
3
8
  log_dir: log
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spotlight-dor-resources
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Beer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-04 00:00:00.000000000 Z
11
+ date: 2015-11-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: faraday