cul_hydra 1.9.2 → 1.9.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a7c942afbb5baf9638461f80723af40b30633892a7774d426ad4edabe6dbdab4
4
- data.tar.gz: 549c735df93fed430ebb33708a59829f7f7228e664309d1f19f323f6de7ef4a6
3
+ metadata.gz: 8f3619958e270bc6bec9a1e7452032b1859264350705dc0b87be69492fb1ce75
4
+ data.tar.gz: c8a57028e065ad4dcd862714fe5d1885708ea91705e26f0c11f6f2358d1fd634
5
5
  SHA512:
6
- metadata.gz: 0d5eba1acff06eb1789b529d0f4871a4d40ce48f76d6e1cbd5acff3162495b156fce85ed2910c581fc7f71548f0c42de3551343b5e85aeddbfbe63f6ea8062d1
7
- data.tar.gz: 57d089ae06742089dd3e140307cf3cf787615196e69e8fbbc9a271fab3ff38f0ab4bf97554cab459d9cf1fa7ea66dedb2a17dbb49cb2480706e5712b5e61749b
6
+ metadata.gz: 328b4fc330baf639835cf543bc1994bdd5fff32d5030b657d48a85b14481bbf894fff1ccb121039d0191a22e3c8b5dbaffa8020969067a52dc0c6f00422de2f8
7
+ data.tar.gz: 4c142006b25ade25def658155a10ad8ea5ae4d896647f140c06ed7b16d99027b6d1deec68f56340e2e73bb38fcac530098604b886ff3c5c1d83a6f820206035a
@@ -2,11 +2,13 @@ module Cul
2
2
  module Hydra
3
3
  module Datastreams
4
4
  class EncodedTextDatastream < ::ActiveFedora::Datastream
5
- DEFAULT_PRIORITIES = [ Encoding::UTF_8, Encoding::ISO_8859_1, Encoding::WINDOWS_1252 ]
5
+ DEFAULT_PRIORITIES = [ Encoding::UTF_8, Encoding::WINDOWS_1252, Encoding::ISO_8859_1 ]
6
+
6
7
  def initialize(digital_object=nil, dsid=nil, options={})
7
8
  @encoding_priorities = options.delete(:encodings) || DEFAULT_PRIORITIES
8
9
  super
9
10
  end
11
+
10
12
  def content=(value)
11
13
  super(utf8able!(value).encode!(Encoding::UTF_8))
12
14
  end
@@ -21,6 +23,7 @@ class EncodedTextDatastream < ::ActiveFedora::Datastream
21
23
 
22
24
  def self.utf8able!(data, encoding_priorities = DEFAULT_PRIORITIES)
23
25
  return unless data
26
+ data = data.read if data.is_a? IO
24
27
  content_encoding = encoding_priorities.detect do |enc|
25
28
  begin
26
29
  data.force_encoding(enc).valid_encoding?
@@ -29,7 +32,6 @@ class EncodedTextDatastream < ::ActiveFedora::Datastream
29
32
  end
30
33
  end
31
34
  raise "could not encode text datastream content" unless content_encoding
32
- puts "using encoding #{content_encoding}"
33
35
  data.force_encoding(content_encoding)
34
36
  end
35
37
  end
@@ -68,7 +68,8 @@ class GenericResource < ::ActiveFedora::Base
68
68
  solr_doc["fulltext_tesim"] = []
69
69
  unless self.datastreams["fulltext"].nil?
70
70
  solr_doc["fulltext_tesim"].concat(solr_doc["title_display_ssm"]) unless solr_doc["title_display_ssm"].nil? or solr_doc["title_display_ssm"].length == 0
71
- solr_doc["fulltext_tesim"] << self.datastreams["fulltext"].content
71
+ utf8able = Cul::Hydra::Datastreams::EncodedTextDatastream.utf8able!(self.datastreams["fulltext"].content)
72
+ solr_doc["fulltext_tesim"] << utf8able.encode(Encoding::UTF_8)
72
73
  end
73
74
  relationships(:original_name).each do |original_name|
74
75
  solr_doc["original_name_tesim"] ||= []
@@ -0,0 +1 @@
1
+ In 1917, Harrison founded the first organization (The Liberty League) and the first newspaper (The Voice) of the “New Negro Movement” and he published his first book, The Negro and the Nation. He opposed positions taken by Joel E. Spingarn and W.E.B. Du Bois of the NAACP during the First World War and, along with William Monroe Trotter and others he organized the 1918 Liberty Congress. The Congress, the major Black protest effort during the war, demanded enforcement of the Thirteenth, Fourteenth, and Fifteenth Amendments and federal anti-lynching legislation. Beginning in 1920, he became the principal editor of Marcus Garvey's Negro World, which he reshaped into a leading political and literary publication of the era. In its pages, he discussed history, politics, theater, international affairs, religion, and science. He also created a "Poetry for the People" feature, a “West Indian News Notes” column, and what he described as the first regular book review section by a Black author in “Negro newspaperdom.” In 1920 he also published his second book, When Africa Awakes: The “Inside Story” of the Stirrings and Strivings of the New Negro in the Western World. Later, he would criticize Garvey's methods and actions.
@@ -2,7 +2,9 @@ module Cul::Hydra::Indexer
2
2
 
3
3
  NUM_FEDORA_RETRY_ATTEMPTS = 3
4
4
  DELAY_BETWEEN_FEDORA_RETRY_ATTEMPTS = 5.seconds
5
-
5
+ DEFAULT_INDEX_OPTS = {
6
+ skip_resources: false, verbose_output: false, softcommit: true, reraise: false
7
+ }.freeze
6
8
  def self.descend_from(pid, pids_to_omit=nil, verbose_output=false)
7
9
  if pid.blank?
8
10
  raise 'Please supply a pid (e.g. rake recursively_index_fedora_objects pid=ldpd:123)'
@@ -60,14 +62,37 @@ module Cul::Hydra::Indexer
60
62
  end
61
63
  def self.recursively_index_fedora_objects(top_pid, pids_to_omit=nil, skip_generic_resources=false, verbose_output=false)
62
64
 
65
+ index_opts = { skip_generic_resources: skip_generic_resources, verbose_output: verbose_output }
63
66
  descend_from(top_pid, pids_to_omit, verbose_output) do |pid|
64
- self.index_pid(pid, skip_generic_resources, verbose_output)
67
+ self.index_pid(pid, index_opts)
65
68
  end
69
+ end
66
70
 
71
+ # this is a compatibility method for bridging the previously used positional arguments to
72
+ # keyword arguments by extracting an opts hash from varargs
73
+ # legacy positional opts signature: skip_resources = false, verbose_output = false, softcommit = true
74
+ # keyword defaults are in DEFAULT_INDEX_OPTS
75
+ # @param args [Array] a list of arguments ending with an options hash
76
+ # @return options hash
77
+ def self.extract_index_opts(args)
78
+ args = args.dup # do not modify the original list
79
+ # extract opts hash
80
+ index_opts = (args.last.is_a? Hash) ? args.pop : {}
81
+ # symbolize keys and reverse merge defaults
82
+ index_opts = index_opts.map {|k,v| [k.to_sym, v] }.to_h
83
+ index_opts = DEFAULT_INDEX_OPTS.merge(index_opts)
84
+ # assign any legacy positional arguments, permitting explicit nils
85
+ unless args.empty?
86
+ index_opts[:skip_resources] = args[0] if args.length > 0
87
+ index_opts[:verbose_output] = args[1] if args.length > 1
88
+ index_opts[:softcommit] = args[2] if args.length > 2
89
+ end
90
+ index_opts
67
91
  end
68
92
 
69
- def self.index_pid(pid, skip_generic_resources=false, verbose_output=false, softcommit=true)
93
+ def self.index_pid(pid, *args)
70
94
  # We found an object with the desired PID. Let's reindex it
95
+ index_opts = extract_index_opts(args)
71
96
  begin
72
97
  active_fedora_object = nil
73
98
 
@@ -77,19 +102,19 @@ module Cul::Hydra::Indexer
77
102
  if skip_generic_resources && active_fedora_object.is_a?(GenericResource)
78
103
  puts 'Object was skipped because GenericResources are being skipped and it is a GenericResource.'
79
104
  else
80
- if softcommit
105
+ if index_opts[:softcommit]
81
106
  active_fedora_object.update_index
82
107
  else
83
108
  # Using direct solr query to update document without soft committing
84
109
  ActiveFedora::SolrService.add(active_fedora_object.to_solr)
85
110
  end
86
- puts 'done.' if verbose_output
111
+ puts 'done.' if index_opts[:verbose_output]
87
112
  end
88
113
  break
89
114
  rescue RestClient::RequestTimeout, Errno::EHOSTUNREACH => e
90
115
  remaining_attempts = (NUM_FEDORA_RETRY_ATTEMPTS-1) - i
91
116
  if remaining_attempts == 0
92
- raise e
117
+ raise
93
118
  else
94
119
  Rails.logger.error "Error: Could not connect to fedora. (#{e.class.to_s + ': ' + e.message}). Will retry #{remaining_attempts} more #{remaining_attempts == 1 ? 'time' : 'times'} (after a #{DELAY_BETWEEN_FEDORA_RETRY_ATTEMPTS} second delay)."
95
120
  sleep DELAY_BETWEEN_FEDORA_RETRY_ATTEMPTS
@@ -102,15 +127,18 @@ module Cul::Hydra::Indexer
102
127
  sleep 5
103
128
  else
104
129
  # Other RuntimeErrors should be passed on
105
- raise e
130
+ raise
106
131
  end
107
132
  end
108
133
  end
109
134
  rescue SystemExit, Interrupt => e
110
135
  # Allow system interrupt (ctrl+c)
111
- raise e
136
+ raise
112
137
  rescue Exception => e
113
138
  puts "Encountered problem with #{pid}. Skipping record. Exception class: #{e.class.name}. Message: #{e.message}"
139
+ if index_opts[:reraise]
140
+ raise
141
+ end
114
142
  end
115
143
  end
116
144
  end
@@ -1,6 +1,6 @@
1
1
  module Cul
2
2
  module Hydra
3
- VERSION = '1.9.2'
3
+ VERSION = '1.9.3'
4
4
  def self.version
5
5
  VERSION
6
6
  end
data/lib/tasks/index.rake CHANGED
@@ -91,10 +91,11 @@ namespace :cul_hydra do
91
91
  pool = Thread.pool(thread_pool_size)
92
92
  mutex = Mutex.new
93
93
 
94
+ index_opts = { skip_generic_resources: skip_generic_resources, verbose_output: false }
94
95
  pids.each do |pid|
95
96
  pool.process {
96
97
 
97
- Cul::Hydra::Indexer.index_pid(pid, skip_generic_resources, false)
98
+ Cul::Hydra::Indexer.index_pid(pid, index_opts)
98
99
 
99
100
  mutex.synchronize do
100
101
  counter += 1
@@ -124,7 +125,8 @@ namespace :cul_hydra do
124
125
  next
125
126
  end
126
127
 
127
- skip_generic_resources = (ENV['skip_generic_resources'] == 'true')
128
+ index_opts = { verbose_output: false }
129
+ index_opts[:skip_generic_resources] = (ENV['skip_generic_resources'] == 'true')
128
130
 
129
131
  start_time = Time.now
130
132
  pids = Cul::Hydra::RisearchMembers.get_publish_target_member_pids(publish_target_pid, true)
@@ -133,7 +135,7 @@ namespace :cul_hydra do
133
135
  counter = 0
134
136
 
135
137
  pids.each do |pid|
136
- Cul::Hydra::Indexer.index_pid(pid, skip_generic_resources, false)
138
+ Cul::Hydra::Indexer.index_pid(pid, index_opts)
137
139
  counter += 1
138
140
  puts "Indexed #{counter} of #{total} | #{Time.now - start_time} seconds"
139
141
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cul_hydra
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.2
4
+ version: 1.9.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Armintor
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2020-03-12 00:00:00.000000000 Z
12
+ date: 2020-04-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rails
@@ -451,6 +451,7 @@ files:
451
451
  - fixtures/cmodels/ldpd_sdep.StaticImageCore.xml
452
452
  - fixtures/cmodels/ore_Proxy.xml
453
453
  - fixtures/cmodels/pcdm_Collection.xml
454
+ - fixtures/spec/BLOB/description-cp1252.txt
454
455
  - fixtures/spec/BLOB/description-utf8.txt
455
456
  - fixtures/spec/BLOB/dlc.md
456
457
  - fixtures/spec/BLOB/test001.jpg