cul_hydra 1.9.2 → 1.9.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/app/models/cul/hydra/datastreams/encoded_text_datastream.rb +4 -2
- data/app/models/generic_resource.rb +2 -1
- data/fixtures/spec/BLOB/description-cp1252.txt +1 -0
- data/lib/cul_hydra/indexer.rb +36 -8
- data/lib/cul_hydra/version.rb +1 -1
- data/lib/tasks/index.rake +5 -3
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8f3619958e270bc6bec9a1e7452032b1859264350705dc0b87be69492fb1ce75
|
4
|
+
data.tar.gz: c8a57028e065ad4dcd862714fe5d1885708ea91705e26f0c11f6f2358d1fd634
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 328b4fc330baf639835cf543bc1994bdd5fff32d5030b657d48a85b14481bbf894fff1ccb121039d0191a22e3c8b5dbaffa8020969067a52dc0c6f00422de2f8
|
7
|
+
data.tar.gz: 4c142006b25ade25def658155a10ad8ea5ae4d896647f140c06ed7b16d99027b6d1deec68f56340e2e73bb38fcac530098604b886ff3c5c1d83a6f820206035a
|
@@ -2,11 +2,13 @@ module Cul
|
|
2
2
|
module Hydra
|
3
3
|
module Datastreams
|
4
4
|
class EncodedTextDatastream < ::ActiveFedora::Datastream
|
5
|
-
DEFAULT_PRIORITIES = [ Encoding::UTF_8, Encoding::
|
5
|
+
DEFAULT_PRIORITIES = [ Encoding::UTF_8, Encoding::WINDOWS_1252, Encoding::ISO_8859_1 ]
|
6
|
+
|
6
7
|
def initialize(digital_object=nil, dsid=nil, options={})
|
7
8
|
@encoding_priorities = options.delete(:encodings) || DEFAULT_PRIORITIES
|
8
9
|
super
|
9
10
|
end
|
11
|
+
|
10
12
|
def content=(value)
|
11
13
|
super(utf8able!(value).encode!(Encoding::UTF_8))
|
12
14
|
end
|
@@ -21,6 +23,7 @@ class EncodedTextDatastream < ::ActiveFedora::Datastream
|
|
21
23
|
|
22
24
|
def self.utf8able!(data, encoding_priorities = DEFAULT_PRIORITIES)
|
23
25
|
return unless data
|
26
|
+
data = data.read if data.is_a? IO
|
24
27
|
content_encoding = encoding_priorities.detect do |enc|
|
25
28
|
begin
|
26
29
|
data.force_encoding(enc).valid_encoding?
|
@@ -29,7 +32,6 @@ class EncodedTextDatastream < ::ActiveFedora::Datastream
|
|
29
32
|
end
|
30
33
|
end
|
31
34
|
raise "could not encode text datastream content" unless content_encoding
|
32
|
-
puts "using encoding #{content_encoding}"
|
33
35
|
data.force_encoding(content_encoding)
|
34
36
|
end
|
35
37
|
end
|
@@ -68,7 +68,8 @@ class GenericResource < ::ActiveFedora::Base
|
|
68
68
|
solr_doc["fulltext_tesim"] = []
|
69
69
|
unless self.datastreams["fulltext"].nil?
|
70
70
|
solr_doc["fulltext_tesim"].concat(solr_doc["title_display_ssm"]) unless solr_doc["title_display_ssm"].nil? or solr_doc["title_display_ssm"].length == 0
|
71
|
-
|
71
|
+
utf8able = Cul::Hydra::Datastreams::EncodedTextDatastream.utf8able!(self.datastreams["fulltext"].content)
|
72
|
+
solr_doc["fulltext_tesim"] << utf8able.encode(Encoding::UTF_8)
|
72
73
|
end
|
73
74
|
relationships(:original_name).each do |original_name|
|
74
75
|
solr_doc["original_name_tesim"] ||= []
|
@@ -0,0 +1 @@
|
|
1
|
+
In 1917, Harrison founded the first organization (The Liberty League) and the first newspaper (The Voice) of the “New Negro Movement” and he published his first book, The Negro and the Nation. He opposed positions taken by Joel E. Spingarn and W.E.B. Du Bois of the NAACP during the First World War and, along with William Monroe Trotter and others he organized the 1918 Liberty Congress. The Congress, the major Black protest effort during the war, demanded enforcement of the Thirteenth, Fourteenth, and Fifteenth Amendments and federal anti-lynching legislation. Beginning in 1920, he became the principal editor of Marcus Garvey's Negro World, which he reshaped into a leading political and literary publication of the era. In its pages, he discussed history, politics, theater, international affairs, religion, and science. He also created a "Poetry for the People" feature, a “West Indian News Notes” column, and what he described as the first regular book review section by a Black author in “Negro newspaperdom.” In 1920 he also published his second book, When Africa Awakes: The “Inside Story” of the Stirrings and Strivings of the New Negro in the Western World. Later, he would criticize Garvey's methods and actions.
|
data/lib/cul_hydra/indexer.rb
CHANGED
@@ -2,7 +2,9 @@ module Cul::Hydra::Indexer
|
|
2
2
|
|
3
3
|
NUM_FEDORA_RETRY_ATTEMPTS = 3
|
4
4
|
DELAY_BETWEEN_FEDORA_RETRY_ATTEMPTS = 5.seconds
|
5
|
-
|
5
|
+
DEFAULT_INDEX_OPTS = {
|
6
|
+
skip_resources: false, verbose_output: false, softcommit: true, reraise: false
|
7
|
+
}.freeze
|
6
8
|
def self.descend_from(pid, pids_to_omit=nil, verbose_output=false)
|
7
9
|
if pid.blank?
|
8
10
|
raise 'Please supply a pid (e.g. rake recursively_index_fedora_objects pid=ldpd:123)'
|
@@ -60,14 +62,37 @@ module Cul::Hydra::Indexer
|
|
60
62
|
end
|
61
63
|
def self.recursively_index_fedora_objects(top_pid, pids_to_omit=nil, skip_generic_resources=false, verbose_output=false)
|
62
64
|
|
65
|
+
index_opts = { skip_generic_resources: skip_generic_resources, verbose_output: verbose_output }
|
63
66
|
descend_from(top_pid, pids_to_omit, verbose_output) do |pid|
|
64
|
-
self.index_pid(pid,
|
67
|
+
self.index_pid(pid, index_opts)
|
65
68
|
end
|
69
|
+
end
|
66
70
|
|
71
|
+
# this is a compatibility method for bridging the previously used postional arguments to
|
72
|
+
# keyword arguments by extracting an opts hash from varargs
|
73
|
+
# legacy positional opts signature: skip_resources = false, verbose_output = false, softcommit = true
|
74
|
+
# keyword defaults are in DEFAULT_INDEX_OPTS
|
75
|
+
# @param args [Array] a list of arguments ending with an options hash
|
76
|
+
# @return options hash
|
77
|
+
def self.extract_index_opts(args)
|
78
|
+
args = args.dup # do not modify the original list
|
79
|
+
# extract opts hash
|
80
|
+
index_opts = (args.last.is_a? Hash) ? args.pop : {}
|
81
|
+
# symbolize keys and reverse merge defaults
|
82
|
+
index_opts = index_opts.map {|k,v| [k.to_sym, v] }.to_h
|
83
|
+
index_opts = DEFAULT_INDEX_OPTS.merge(index_opts)
|
84
|
+
# assign any legacy positional arguments, permitting explicit nils
|
85
|
+
unless args.empty?
|
86
|
+
index_opts[:skip_resources] = args[0] if args.length > 0
|
87
|
+
index_opts[:verbose_output] = args[1] if args.length > 1
|
88
|
+
index_opts[:softcommit] = args[2] if args.length > 2
|
89
|
+
end
|
90
|
+
index_opts
|
67
91
|
end
|
68
92
|
|
69
|
-
def self.index_pid(pid,
|
93
|
+
def self.index_pid(pid, *args)
|
70
94
|
# We found an object with the desired PID. Let's reindex it
|
95
|
+
index_opts = extract_index_opts(args)
|
71
96
|
begin
|
72
97
|
active_fedora_object = nil
|
73
98
|
|
@@ -77,19 +102,19 @@ module Cul::Hydra::Indexer
|
|
77
102
|
if skip_generic_resources && active_fedora_object.is_a?(GenericResource)
|
78
103
|
puts 'Object was skipped because GenericResources are being skipped and it is a GenericResource.'
|
79
104
|
else
|
80
|
-
if softcommit
|
105
|
+
if index_opts[:softcommit]
|
81
106
|
active_fedora_object.update_index
|
82
107
|
else
|
83
108
|
# Using direct solr query to update document without soft commiting
|
84
109
|
ActiveFedora::SolrService.add(active_fedora_object.to_solr)
|
85
110
|
end
|
86
|
-
puts 'done.' if verbose_output
|
111
|
+
puts 'done.' if index_opts[:verbose_output]
|
87
112
|
end
|
88
113
|
break
|
89
114
|
rescue RestClient::RequestTimeout, Errno::EHOSTUNREACH => e
|
90
115
|
remaining_attempts = (NUM_FEDORA_RETRY_ATTEMPTS-1) - i
|
91
116
|
if remaining_attempts == 0
|
92
|
-
raise
|
117
|
+
raise
|
93
118
|
else
|
94
119
|
Rails.logger.error "Error: Could not connect to fedora. (#{e.class.to_s + ': ' + e.message}). Will retry #{remaining_attempts} more #{remaining_attempts == 1 ? 'time' : 'times'} (after a #{DELAY_BETWEEN_FEDORA_RETRY_ATTEMPTS} second delay)."
|
95
120
|
sleep DELAY_BETWEEN_FEDORA_RETRY_ATTEMPTS
|
@@ -102,15 +127,18 @@ module Cul::Hydra::Indexer
|
|
102
127
|
sleep 5
|
103
128
|
else
|
104
129
|
# Other RuntimeErrors should be passed on
|
105
|
-
raise
|
130
|
+
raise
|
106
131
|
end
|
107
132
|
end
|
108
133
|
end
|
109
134
|
rescue SystemExit, Interrupt => e
|
110
135
|
# Allow system interrupt (ctrl+c)
|
111
|
-
raise
|
136
|
+
raise
|
112
137
|
rescue Exception => e
|
113
138
|
puts "Encountered problem with #{pid}. Skipping record. Exception class: #{e.class.name}. Message: #{e.message}"
|
139
|
+
if index_opts[:reraise]
|
140
|
+
raise
|
141
|
+
end
|
114
142
|
end
|
115
143
|
end
|
116
144
|
end
|
data/lib/cul_hydra/version.rb
CHANGED
data/lib/tasks/index.rake
CHANGED
@@ -91,10 +91,11 @@ namespace :cul_hydra do
|
|
91
91
|
pool = Thread.pool(thread_pool_size)
|
92
92
|
mutex = Mutex.new
|
93
93
|
|
94
|
+
index_opts = { skip_generic_resources: skip_generic_resources, verbose_output: false }
|
94
95
|
pids.each do |pid|
|
95
96
|
pool.process {
|
96
97
|
|
97
|
-
Cul::Hydra::Indexer.index_pid(pid,
|
98
|
+
Cul::Hydra::Indexer.index_pid(pid, index_opts)
|
98
99
|
|
99
100
|
mutex.synchronize do
|
100
101
|
counter += 1
|
@@ -124,7 +125,8 @@ namespace :cul_hydra do
|
|
124
125
|
next
|
125
126
|
end
|
126
127
|
|
127
|
-
|
128
|
+
index_opts = { verbose_output: false }
|
129
|
+
index_opts[:skip_generic_resources] = (ENV['skip_generic_resources'] == 'true')
|
128
130
|
|
129
131
|
start_time = Time.now
|
130
132
|
pids = Cul::Hydra::RisearchMembers.get_publish_target_member_pids(publish_target_pid, true)
|
@@ -133,7 +135,7 @@ namespace :cul_hydra do
|
|
133
135
|
counter = 0
|
134
136
|
|
135
137
|
pids.each do |pid|
|
136
|
-
Cul::Hydra::Indexer.index_pid(pid,
|
138
|
+
Cul::Hydra::Indexer.index_pid(pid, index_opts)
|
137
139
|
counter += 1
|
138
140
|
puts "Indexed #{counter} of #{total} | #{Time.now - start_time} seconds"
|
139
141
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cul_hydra
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.
|
4
|
+
version: 1.9.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Armintor
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-
|
12
|
+
date: 2020-04-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rails
|
@@ -451,6 +451,7 @@ files:
|
|
451
451
|
- fixtures/cmodels/ldpd_sdep.StaticImageCore.xml
|
452
452
|
- fixtures/cmodels/ore_Proxy.xml
|
453
453
|
- fixtures/cmodels/pcdm_Collection.xml
|
454
|
+
- fixtures/spec/BLOB/description-cp1252.txt
|
454
455
|
- fixtures/spec/BLOB/description-utf8.txt
|
455
456
|
- fixtures/spec/BLOB/dlc.md
|
456
457
|
- fixtures/spec/BLOB/test001.jpg
|