bulk_ops 0.1.23 → 0.2.0
- checksums.yaml +4 -4
- data/db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb +14 -0
- data/lib/bulk_ops.rb +3 -2
- data/lib/bulk_ops/apply_operation_job.rb +8 -0
- data/lib/bulk_ops/create_work_job.rb +1 -1
- data/lib/bulk_ops/github_access.rb +1 -1
- data/lib/bulk_ops/operation.rb +57 -49
- data/lib/bulk_ops/parser.rb +50 -414
- data/lib/bulk_ops/resolve_children_job.rb +14 -0
- data/lib/bulk_ops/solr_service.rb +13 -0
- data/lib/bulk_ops/update_work_job.rb +1 -1
- data/lib/bulk_ops/verification.rb +2 -10
- data/lib/bulk_ops/version.rb +1 -1
- data/lib/bulk_ops/work_job.rb +20 -13
- data/lib/bulk_ops/work_proxy.rb +18 -2
- data/lib/concerns/interpret_controlled_behavior.rb +140 -0
- data/lib/concerns/interpret_files_behavior.rb +82 -0
- data/lib/concerns/interpret_options_behavior.rb +59 -0
- data/lib/concerns/interpret_relationships_behavior.rb +123 -0
- data/lib/concerns/interpret_scalar_behavior.rb +21 -0
- data/lib/concerns/search_builder_behavior.rb +80 -0
- metadata +12 -3
- data/lib/bulk_ops/relationship.rb +0 -117
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f9a35abfb31034307e62c7b7483b215447b848e1bb57f08d7200ae5293924e96
+  data.tar.gz: 77f0dd2a02a8343f945da3cd3f0a19cf9c590556ceff90bb86118032a2ed9192
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5e0490d9f81743bbfbf87848654cff4fba774f2c5b218903535e732e42e72908382598741d452a6de16017cb43a9c19b3283c07d2ff5b03008b16afc8163cda0
+  data.tar.gz: 9123a66a499e37ef944854f70f49aa7d0de8f73af6461efe1949a7dbe75cead327e3b7fd1edd266965254758bb38d89d70cfd212fc90408c54a14da77f506b38
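The digests above cover the metadata.gz and data.tar.gz members inside the packaged .gem (which is a plain tar archive), not the .gem file as a whole. A minimal verification sketch, assuming the gem has already been fetched locally (the path is hypothetical):

require 'digest'
require 'rubygems/package'

# Read the two digested members out of the .gem tarball and hash them.
File.open("bulk_ops-0.2.0.gem", "rb") do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end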
data/db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb
ADDED
@@ -0,0 +1,14 @@
+class RemoveRelationshipsAmmendWorkProxy < ActiveRecord::Migration[5.0]
+  def change
+
+    drop_table :bulk_ops_relationships
+
+    change_table :bulk_ops_work_proxies do |t|
+      t.integer :parent_id
+      t.integer :previous_sibling_id
+    end
+
+    remove_column :bulk_ops_operations, :operation_type
+
+  end
+end
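Note that this migration cannot be rolled back automatically: drop_table with no block and remove_column with no column type both raise ActiveRecord::IrreversibleMigration on db:rollback. A reversible sketch for local use follows; the bulk_ops_relationships column list in down is an assumption inferred from how BulkOps::Relationship was used in the removed parser code, since the dropped table's definition does not appear in this diff:

class RemoveRelationshipsAmmendWorkProxy < ActiveRecord::Migration[5.0]
  def up
    drop_table :bulk_ops_relationships
    change_table :bulk_ops_work_proxies do |t|
      t.integer :parent_id
      t.integer :previous_sibling_id
    end
    remove_column :bulk_ops_operations, :operation_type
  end

  def down
    # Columns assumed from the relationship_parameters hash in the old parser;
    # adjust to the real schema history before relying on this.
    create_table :bulk_ops_relationships do |t|
      t.integer :work_proxy_id
      t.string  :identifier_type, :relationship_type, :object_identifier, :status
      t.integer :previous_sibling
      t.timestamps
    end
    remove_column :bulk_ops_work_proxies, :parent_id
    remove_column :bulk_ops_work_proxies, :previous_sibling_id
    add_column :bulk_ops_operations, :operation_type, :string
  end
end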
data/lib/bulk_ops.rb
CHANGED
@@ -34,8 +34,9 @@ module BulkOps
   OPTIONS_FILENAME = 'configuration.yml'
   ROW_OFFSET = 2
 
-  dirstring = File.join( File.dirname(__FILE__), '
-
+  dirstring = File.join( File.dirname(__FILE__), 'concerns/*.rb')
+  dirstring2 = File.join( File.dirname(__FILE__), 'bulk_ops/**/*.rb')
+  ((Dir[dirstring] || []) + Dir[dirstring2]).uniq.each do |file|
     begin
       require file
     rescue Exception => e
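The new loader eager-requires the concern modules listed in the summary above alongside everything under bulk_ops/. One nit for readers: Dir[] always returns an Array, never nil, so the || [] guard is purely defensive. A minimal equivalent sketch:

# Dir.[] accepts multiple glob patterns, so the two globs can be combined;
# sorting gives a deterministic load order.
base = File.dirname(__FILE__)
Dir[File.join(base, 'concerns/*.rb'), File.join(base, 'bulk_ops/**/*.rb')].sort.each do |file|
  require file
end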
data/lib/bulk_ops/create_work_job.rb
CHANGED
@@ -12,7 +12,7 @@ class BulkOps::CreateWorkJob < BulkOps::WorkJob
   end
 
   def define_work workClass
-    if record_exists?(@work_proxy.work_id)
+    if BulkOps::SolrService.record_exists?(@work_proxy.work_id)
       report_error "trying to ingest a work proxy that already has a work attached. Work id: #{@work_proxy.work_id} Proxy id: #{@work_proxy.id}"
       return false
     end
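Existence checks now go through the new BulkOps::SolrService (data/lib/bulk_ops/solr_service.rb, +13 lines, not shown in this section). A hypothetical sketch of what such a check typically looks like in a Hyrax app; this is an assumption about the new file, not its actual contents:

module BulkOps
  class SolrService
    # Ask Solr whether a document with this id exists, without loading it from Fedora.
    def self.record_exists?(id)
      return false if id.blank?
      ActiveFedora::SolrService.query("id:#{RSolr.solr_escape(id.to_s)}", rows: 1).present?
    rescue RSolr::Error::Http
      false
    end
  end
end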
data/lib/bulk_ops/github_access.rb
CHANGED
@@ -196,7 +196,7 @@ class BulkOps::GithubAccess
 
   def create_pull_request message: false
     begin
-      message ||= "Apply
+      message ||= "Apply operation #{name} through Hyrax browser interface"
       pull = client.create_pull_request(repo, "master", name, message)
       pull["number"]
     rescue Octokit::UnprocessableEntity
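For context, Octokit raises Octokit::UnprocessableEntity on any 422 from the GitHub API, most commonly when a pull request already exists for the branch. A sketch (not the gem's code) of a fallback that reuses the existing PR, assuming repo is an "owner/name" string and the branch name matches the operation name:

# Hypothetical helper: create the PR, or fall back to the one that already exists.
def find_or_create_pull_request(client, repo, branch, message)
  pull = client.create_pull_request(repo, "master", branch, message)
  pull["number"]
rescue Octokit::UnprocessableEntity
  # The head filter takes the "owner:branch" form.
  existing = client.pull_requests(repo, head: "#{repo.split('/').first}:#{branch}").first
  existing && existing["number"]
end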
data/lib/bulk_ops/operation.rb
CHANGED
@@ -38,10 +38,6 @@ module BulkOps
       states
     end
 
-    def type
-      operation_type
-    end
-
     def self.schema
       ScoobySnacks::METADATA_SCHEMA
     end
@@ -62,45 +58,73 @@ module BulkOps
       update(stage: new_stage)
     end
 
-    def
-
-
-
-
-
-
-
-
-
+    def destroy_all_works_and_proxies
+      work_proxies.each do |proxy|
+        if BulkOps::SolrService.record_exists?(proxy.work_id)
+          ActiveFedora::Base.find(proxy.work_id).destroy
+        end
+        proxy.destroy
+      end
+      update(stage: "waiting",
+             status: "reverted changes")
+
+    end
 
-
-
+    def destroy_all_works
+      work_proxies.each do |proxy|
+        if BulkOps::SolrService.record_exists?(proxy.work_id)
+          ActiveFedora::Base.find(proxy.work_id).destroy
+        end
+        proxy.update(status: "destroyed", message: "The work created by this proxy was destroyed by the user")
+      end
+      update(stage: "waiting",
+             status: "reverted changes")
     end
 
-    def
-
-
+    def destroy_all_proxies
+      work_proxies.each do |proxy|
+        proxy.destroy
+      end
+      update(stage: "waiting",
+             status: "reverted changes")
+    end
 
-
-
-
+    def apply!
+      update({stage: "running",
+              status: "OK",
+              message: "Bulk operation initiated by #{user.name || user.email}"})
+      # We should now be on the master branch. Make sure the correct spreadsheet version is loaded
+      final_spreadsheet
 
-
+      # In case this operation has run before, gather all work proxies that are completed and exclude them from the application
+      complete_proxies = work_proxies.select{|proxy| proxy.status == "complete" && proxy.work_id.present?}
+      incomplete_row_numbers = Array(0..@metadata.length-1) - complete_proxies.map(&:row_number)
 
-
+      # Destroy all proxies corresponding to incomplete rows
+      (work_proxies - complete_proxies).each{|proxy| proxy.destroy!}
+
+      # Create a new work proxy for each incomplete row
+      # All the proxies need to exist before parsing in order to correctly recognize relationships
+      incomplete_row_numbers.each do |row_number|
+        values = @metadata[row_number]
+        next if values.to_s.gsub(',','').blank?
+        next if BulkOps::Parser.is_file_set? @metadata, row_number
+        work_proxies.create(status: "new",
                             last_event: DateTime.now,
-
+                            work_type: work_type,
+                            row_number: row_number,
                             visibility: options['visibility'],
                             message: "created during ingest initiated by #{user.name || user.email}")
+
       end
-
-      # make sure the work proxies we just created are loaded in memory
+      # Reload the operation so that it can recognize its new proxies
       reload
-      #
-
+      # Parse each spreadsheet row and create a background job for each proxy we just created
+      incomplete_row_numbers.each do |row_number|
+        values = @metadata[row_number]
         proxy = work_proxies.find_by(row_number: row_number)
         proxy.update(message: "interpreted at #{DateTime.now.strftime("%d/%m/%Y %H:%M")} " + proxy.message)
-        data = BulkOps::Parser.new(proxy, @metadata).interpret_data(raw_row: values)
+        data = BulkOps::Parser.new(proxy, @metadata, options).interpret_data(raw_row: values)
         next unless proxy.proxy_errors.blank?
         BulkOps::WorkJob.perform_later(proxy.work_type || "Work",
                                        user.email,
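The 0.1.x delete_all (removed in the next hunk) is replaced by three teardown methods with different scopes. A rough usage sketch, where op stands for any persisted BulkOps::Operation:

op = BulkOps::Operation.find(op_id)  # op_id assumed

op.destroy_all_proxies             # discard row bookkeeping; leave created works alone
op.destroy_all_works               # destroy created works; keep proxies, marked "destroyed"
op.destroy_all_works_and_proxies   # full revert: destroy works and their proxies

All three finish by resetting the operation to stage "waiting" with status "reverted changes".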
@@ -112,13 +136,6 @@ module BulkOps
       report_errors!
     end
 
-    def delete_all
-      work_proxies.each do |proxy|
-        ActiveFedora::Base.find(proxy.work_id).destroy
-        proxy.update(status: "destroyed", message: "The work created by this proxy was destroyed by the user")
-      end
-    end
-
     def check_if_finished
       return unless stage == "running" && !busy?
 
@@ -208,7 +225,7 @@ module BulkOps
 
     def report_errors!
       error_file_name = BulkOps::Error.write_errors!(accumulated_errors, git)
-      notify!(subject: "Errors initializing bulk
+      notify!(subject: "Errors initializing bulk operation in Hycruz", message: "Hycruz encountered some errors while it was setting up your operation and preparing to begin. For most types of errors, the individual rows of the spreadsheet with errors will be ignored and the rest will proceed. Please consult the operation summary for real time information on the status of the operation. Details about these initialization errors can be seen on Github at the following url: https://github.com/#{git.repo}/blob/#{git.name}/#{git.name}/errors/#{error_file_name}") if error_file_name
     end
 
     def create_pull_request message: false
@@ -222,7 +239,7 @@ module BulkOps
       update(stage: "pending")
     end
 
-    def create_branch(fields: nil, work_ids: nil, options: nil
+    def create_branch(fields: nil, work_ids: nil, options: nil)
       git.create_branch!
       bulk_ops_dir = Gem::Specification.find_by_name("bulk_ops").gem_dir
 
@@ -238,13 +255,12 @@ module BulkOps
         options.each { |option, value| full_options[option] = value }
 
         full_options[name] = name
-        full_options[type] = type
         full_options[status] = status
 
         git.update_options full_options
       end
 
-       create_new_spreadsheet(fields: fields, work_ids: work_ids)
+      create_new_spreadsheet(fields: fields, work_ids: work_ids)
     end
 
     def get_spreadsheet return_headers: false
@@ -298,14 +314,6 @@ module BulkOps
       return false
     end
 
-    def ingest?
-      type == "ingest"
-    end
-
-    def update?
-      type == "update"
-    end
-
     def delete_branch
       git.delete_branch!
     end
data/lib/bulk_ops/parser.rb
CHANGED
@@ -1,4 +1,3 @@
-
 class BulkOps::Parser
   require 'uri'
 
@@ -6,6 +5,27 @@ class BulkOps::Parser
 
   delegate :relationships, :operation, :row_number, :work_id, :visibility, :work_type, :reference_identifier, :order, to: :proxy
 
+  include BulkOps::InterpretRelationshipsBehavior
+  include BulkOps::InterpretFilesBehavior
+  include BulkOps::InterpretScalarBehavior
+  include BulkOps::InterpretOptionsBehavior
+  include BulkOps::InterpretControlledBehavior
+
+  def self.unescape_csv(value)
+    value.gsub(/\\(['";,])/,'\1')
+  end
+
+  def self.split_values value_string
+    # Split values on all un-escaped separator characters (escape character is '\')
+    # Then replace all escaped separator characters with un-escaped versions
+    value_string.split(/(?<!\\)#{BulkOps::SEPARATOR}/).map{|val| val.gsub("\\#{BulkOps::SEPARATOR}",BulkOps::SEPARATOR).strip}
+  end
+
+  def self.normalize_relationship_field_name field
+    normfield = field.to_s.downcase.parameterize.gsub(/[_\s-]/,'')
+    BulkOps::RELATIONSHIP_FIELDS.find{|rel_field| normfield == rel_field }
+  end
+
   def self.is_file_set? metadata, row_number
     return false unless metadata[row_number].present?
     # If the work type is explicitly specified, use that
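Because unescape_csv, split_values, and normalize_relationship_field_name are now class methods, the interpret_* concerns (and tests) can call them without a Parser instance. Assuming BulkOps::SEPARATOR is ';' (the constant's value isn't shown in this diff), they behave like:

BulkOps::Parser.split_values('dogs;cats;mice\; and rats')
#=> ["dogs", "cats", "mice; and rats"]   # escaped separator survives as a literal

BulkOps::Parser.unescape_csv("O\\'Connor")
#=> "O'Connor"                           # backslash stripped from escaped ' " ; ,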
@@ -23,12 +43,32 @@ class BulkOps::Parser
     return true
   end
 
-
+  def self.get_negating_metadata(work_id, metadata={})
+    return false unless BulkOps::SolrService.record_exists?(work_id)
+    work = ActiveFedora::Base.find(work_id)
+    schema = ScoobySnacks::METADATA_SCHEMA
+    schema.all_fields.each do |field|
+      field_key = field.controlled? ? "#{field.name}_attributes" : field.name
+      metadata[field_key] ||= (field.multiple? ? [] : nil)
+      if field.controlled?
+        values = Array(work.send(field.name)).map{|value| {id: value.id, _destroy: true} }
+        if field.multiple?
+          metadata[field_key] += values
+        else
+          metadata[field_key] = values.first
+        end
+      end
+    end
+    return metadata
+  end
+
+  def initialize prx, metadata_sheet=nil, options={}
     @proxy = prx
-    @raw_data = (metadata_sheet ||
+    @raw_data = (metadata_sheet || operation.metadata)
     @raw_row = @raw_data[@proxy.row_number]
     @metadata = {}
     @parsing_errors = []
+    @options = options || operation.options
   end
 
   def interpret_data raw_row: nil, raw_data: nil, proxy: nil
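get_negating_metadata builds an attributes hash that blanks out every schema field on an existing work: plain fields become [] (or nil if single-valued) and controlled fields get a _destroy row for each current value. Illustratively (the work id and field names here are hypothetical, not from the gem's schema):

BulkOps::Parser.get_negating_metadata("qf85nb30d")
#=> {"title" => [],
#    "description" => [],
#    "subject_attributes" => [{id: "http://id.loc.gov/authorities/subjects/sh85101653", _destroy: true}],
#    ...}

interpret_data deep-merges this hash under the new discard_existing_metadata option (next hunk) before the scalar, controlled, and file fields are parsed, so values supplied in the spreadsheet are appended afterwards and any field not re-supplied stays cleared.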
@@ -42,6 +82,9 @@ class BulkOps::Parser
     interpret_relationship_fields
     setMetadataInheritance
     interpret_option_fields
+    if @proxy.work_id.present? && @options['discard_existing_metadata']
+      @metadata.deep_merge!(self.class.get_negating_metadata(@proxy.work_id))
+    end
     interpret_file_fields
     interpret_controlled_fields
     interpret_scalar_fields
@@ -66,7 +109,7 @@ class BulkOps::Parser
   end
 
   def connect_existing_work
-    return unless (column_name =
+    return unless (column_name = @options["update_identifier"])
     return unless (key = @raw_row.to_h.keys.find{|key| key.to_s.parameterize.downcase.gsub("_","") == column_name.to_s.parameterize.downcase.gsub("_","")})
     return unless (value = @raw_row[key]).present?
     return unless (work_id = find_work_id_from_unique_metadata(key, value))
@@ -83,351 +126,6 @@ class BulkOps::Parser
     return response["docs"][0]["id"]
   end
 
-  def interpret_controlled_fields
-
-    # The labels array tracks the contents of columns marked as labels,
-    # which may require special validation
-    labels = {}
-
-    # This hash is populated with relevant data as we loop through the fields
-    controlled_data = {}
-
-    @raw_row.each do |field_name, value|
-      next if value.blank? or field_name.blank?
-      field_name = field_name.to_s
-
-      #If our CSV interpreter is feeding us the headers as a line, ignore it.
-      next if field_name == value
-
-      #check if they are using the 'field_name.authority' syntax
-      authority = nil
-      if ((split=field_name.split('.')).count == 2)
-        authority = split.last
-        field_name = split.first
-      end
-
-      # get the field name, if this column is a metadata field
-      field_name_norm = find_field_name(field_name)
-      field = schema.get_field(field_name_norm)
-
-      # Ignore anything that isn't a controlled field
-      next unless field.present? && field.controlled?
-
-      # Keep track of label fields
-      if field_name.downcase.ends_with?("label")
-        next if operation.options["ignore_labels"]
-        labels[field_name_norm] ||= []
-        labels[field_name_norm] += split_values value
-        next unless operation.options["import_labels"]
-      end
-
-      remove = field_name.downcase.starts_with?("remove") || field_name.downcase.starts_with?("delete")
-
-      # handle multiple values
-      value_array = split_values(value)
-      controlled_data[field_name_norm] ||= [] unless value_array.blank?
-      value_array.each do |value|
-        # Decide of we're dealing with a label or url
-        # It's an ID if it's a URL and the name doesn't end in 'label'
-        value.strip!
-        if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
-          value_id = value
-          # label = WorkIndexer.fetch_remote_label(value)
-          # error_message = "cannot fetch remote label for url: #{value}"
-          # report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
-        else
-          # It's a label, so unescape it and get the id
-          value = unescape_csv(value)
-          value_id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
-          # label = value
-          report_error(:cannot_retrieve_url,
-                       message: "cannot find or create url for controlled vocabulary label: #{value}",
-                       url: value,
-                       row_number: row_number) unless value_id
-        end
-        controlled_data[field_name_norm] << {id: value_id, remove: field_name.downcase.starts_with?("remove")}
-      end
-    end
-
-    # Actually add all the data
-    controlled_data.each do |property_name, data|
-      @metadata["#{property_name}_attributes"] ||= [] unless data.blank?
-      data.uniq.each do |datum|
-        atts = {"id" => datum[:id]}
-        atts["_delete"] = true if datum[:remove]
-        @metadata["#{property_name}_attributes"] << atts
-      end
-    end
-  end
-
-  def interpret_scalar_fields
-    @raw_row.each do |field, values|
-      next if values.blank? or field.nil? or field == values
-      # get the field name, if this column is a metadata field
-      next unless field_name = find_field_name(field.to_s)
-      field = schema.get_field(field_name)
-      # Ignore controlled fields
-      next if field.controlled?
-      split_values(values).each do |value|
-        next if value.blank?
-        value = value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_') unless value.blank?
-        value = unescape_csv(value)
-        (@metadata[field_name] ||= []) << value
-      end
-    end
-  end
-
-  def interpret_file_fields
-    # This method handles file additions and deletions from the spreadsheet
-    # if additional files need to be deleted because the update is set to replace
-    # some or all existing files, those replacement-related deletions are handled
-    # by the BulkOps::Operation.
-    #
-
-    @raw_row.each do |field, value|
-      next if value.blank? or field.blank?
-      field = field.to_s
-      #If our CSV interpreter is feeding us the headers as a line, ignore it.
-      next if field == value
-
-      # Check if this is a file field, and whether we are removing or adding a file
-      next unless (action = BulkOps::Verification.is_file_field?(field))
-
-      # Move on if this field is the name of another property (e.g. masterFilename)
-      next if find_field_name(field)
-
-      # Check if we are removing a file
-      if action == "remove"
-        get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) }
-      else
-        # Add a file
-        operation.get_file_paths(value).each do |filepath|
-          begin
-            uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
-            (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
-          rescue Exception => e
-            report_error(:upload_error,
-                         message: "Error opening file: #{ filepath } -- #{e}",
-                         file: File.join(BulkOps::INGEST_MEDIA_PATH,filename),
-                         row_number: row_number)
-          end
-        end
-      end
-
-      # Check if any of the upcoming rows are child filesets
-      i = 1
-      while self.class.is_file_set?(@metadata,row_number+i)
-        child_row.each do |field,value|
-          next if value.blank?
-          title = value if ["title","label"].include?(field.downcase.strip)
-          if BulkOps::Verification.is_file_field?(field)
-            operation.get_file_paths(value).each do |filepath|
-              uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
-            end
-          end
-        end
-        i+=1
-      end
-
-    end
-  end
-
-  def interpret_option_fields
-    @raw_row.each do |field,value|
-      next if value.blank? or field.blank?
-      field = field.to_s
-      next if value == field
-
-      normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
-      if ["visibility", "public"].include?(normfield)
-        @proxy.update(visibility: format_visibility(value))
-
-      end
-      if ["worktype","model","type"].include?(normfield)
-        @proxy.update(work_type: format_worktype(value) )
-      end
-      if ["referenceidentifier",
-          "referenceid",
-          "refid",
-          "referenceidentifiertype",
-          "referenceidtype",
-          "refidtype",
-          "relationshipidentifier",
-          "relationshipid",
-          "relationshipidentifiertype",
-          "relationshipidtype",
-          "relid",
-          "relidtype"].include?(normfield)
-        @proxy.update(reference_identifier: format_reference_id(value))
-      end
-    end
-  end
-
-  def interpret_relationship_fields
-    @raw_row.each do |field,value|
-      next if value.blank? or field.blank?
-      field = field.to_s
-      value = unescape_csv(value)
-      identifer_type = reference_identifier
-
-      next if value == field
-
-      # Correctly interpret the notation "parent:id", "parent id" etc in a column header
-      if (split = field.split(/[:_\-\s]/)).count == 2
-        identifier_type = split.last
-        relationship_type = split.first.to_s
-      else
-        relationship_type = field
-      end
-
-      relationship_type = self.class.normalize_relationship_field_name(relationship_type)
-      case relationship_type
-      when "order"
-        # If the field specifies the object's order among siblings
-        @proxy.update(order: value.to_f)
-        next
-      when "collection"
-        # If the field specifies the name or ID of a collection,
-        # find or create the collection and update the metadata to match
-        col = find_or_create_collection(value)
-        ( @metadata[:member_of_collection_ids] ||= [] ) << col.id if col
-        next
-      when "parent", "child"
-
-        # correctly interpret the notation "id:a78C2d81"
-        identifier_type, object_identifier = interpret_relationship_value(identifier_type, value)
-
-        relationship_parameters = { work_proxy_id: @proxy.id,
-                                    identifier_type: identifier_type,
-                                    relationship_type: relationship_type,
-                                    object_identifier: object_identifier,
-                                    status: "new"}
-
-        #add previous sibling link if necessary
-        previous_value = @raw_data[row_number-1][field]
-        # Check if this is a parent relationship, and the previous row also has one
-        if previous_value.present? && (relationship_type == "parent")
-          # Check if the previous row has the same parent as this row
-          if object_identifier == interpret_relationship_value(identifier_type, previous_value, field).last
-            # If so, set the previous sibling parameter on the relationshp
-            # to the id for the proxy associated with the previous row
-            relationship_parameters[:previous_sibling] = operation.work_proxies.find_by(row_number: row_number-1).id
-          end
-        end
-        BulkOps::Relationship.create(relationship_parameters)
-      end
-    end
-  end
-
-  def self.normalize_relationship_field_name field
-    normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
-    BulkOps::RELATIONSHIP_FIELDS.find{|rel_field| normfield == rel_field }
-  end
-
-  def find_previous_parent field="parent"
-    #Return the row number of the most recent preceding row that does
-    # not itself have a parent defined
-    i = 1;
-    while (prev_row = raw_data[row_number - i])
-      return (row_number - i) if prev_row[field].blank?
-      i += 1
-    end
-  end
-
-  def interpret_relationship_value id_type, value, field="parent"
-    #Handle "id:20kj4259" syntax if it hasn't already been handled
-    if (split = value.to_s.split(":")).count == 2
-      id_type, value = split.first
-      value = split.last
-    end
-    #Handle special shorthand syntax for refering to relative row numbers
-    if id_type == "row"
-      #if the value is an integer
-      if value =~ /\A[-+]?[0-9]+\z/
-        if value.to_i < 0
-          # if given a negative integer, count backwards from the current row (remember that value.to_i is negative)
-          return [id_type,row_number + value.to_i]
-        elsif value.to_i > 0
-          # if given a positive integer, remove the row offset
-          value = (value.to_i - BulkOps::ROW_OFFSET).to_s
-        end
-      elsif value.to_s.downcase.include?("prev")
-        # if given any variation of the word "previous", get the first preceding row with no parent of its own
-        return [id_type,find_previous_parent(field)]
-      end
-    end
-    return [id_type,value]
-  end
-
-  def unescape_csv(value)
-    value.gsub(/\\(['";,])/,'\1')
-  end
-
-
-  def format_worktype(value)
-    # format the value like a class name
-    type = value.titleize.gsub(/[-_\s]/,'')
-    # reject it if it isn't a defined class
-    type = false unless Object.const_defined? type
-    # fall back to the work type defined by the operation, or a standard "Work"
-    return type ||= work_type || operation.work_type || "Work"
-  end
-
-  def format_visibility(value)
-    case value.downcase
-    when "public", "open", "true"
-      return "open"
-    when "campus", "ucsc", "institution"
-      return "ucsc"
-    when "restricted", "private", "closed", "false"
-      return "restricted"
-    end
-  end
-
-
-  def mintLocalAuthUrl(auth_name, value)
-    value.strip!
-    id = value.parameterize
-    auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
-    entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
-                                           label: value,
-                                           uri: id)
-    return localIdToUrl(id,auth_name)
-  end
-
-  def findAuthUrl(auth, value)
-    value.strip!
-    return nil if auth.nil?
-    return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
-    entries.each do |entry|
-      #require exact match
-      next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
-      url = entry["url"] || entry["id"]
-      # url = localIdToUrl(url,auth) unless url =~ URI::regexp
-      return url
-    end
-    return nil
-  end
-
-  def localIdToUrl(id,auth_name)
-    root_urls = {production: "https://digitalcollections.library.ucsc.edu",
-                 staging: "http://digitalcollections-staging.library.ucsc.edu",
-                 development: "http://#{Socket.gethostname}",
-                 test: "http://#{Socket.gethostname}"}
-    return "#{root_urls[Rails.env.to_sym]}/authorities/show/local/#{auth_name}/#{id}"
-  end
-
-  def getLocalAuth(field_name)
-    field = schema.get_property(field_name)
-    # There is only ever one local authority per field, so just pick the first you find
-    if vocs = field.vocabularies
-      vocs.each do |voc|
-        return voc["subauthority"] if voc["authority"].downcase == "local"
-      end
-    end
-    return nil
-  end
-
   def setAdminSet
     return if @metadata[:admin_set_id]
     asets = AdminSet.where({title: "Bulk Ingest Set"})
@@ -437,7 +135,7 @@ class BulkOps::Parser
 
   def setMetadataInheritance
     return if @metadata[:metadataInheritance].present?
-    @metadata[:metadataInheritance] =
+    @metadata[:metadataInheritance] = @options["metadataInheritance"] unless @options["metadataInheritance"].blank?
   end
 
   def report_error type, message, **args
@@ -447,75 +145,13 @@ class BulkOps::Parser
     (@parsing_errors ||= []) << BulkOps::Error.new(**args)
   end
 
-  def get_removed_filesets(filestring)
-    file_ids = split_values(filestring)
-    file_ids.select{|file_id| record_exists?(file_id)}
-
-    # This part handles filenames in addition to file ids. It doesn't work yet!
-    # file_ids.map do |file_id|
-    # If the filename is the id of an existing record, keep that
-    # next(file_id) if (record_exists?(file_id))
-    # If this is the label (i.e.filename) of an existing fileset, use that fileset id
-    # TODO MAKE THIS WORK!!
-    # next(filename) if (filename_exists?(filename))
-    # File.join(BulkOps::INGEST_MEDIA_PATH, filename_prefix, filename)
-    # end
-  end
-
-  def delete_file_set fileset_id
-    BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
-  end
-
-  def record_exists? id
-    operation.record_exists? id
-  end
-
-  def localAuthUrl(property, value)
-    return value if (auth = getLocalAuth(property)).nil?
-    url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value)
-    return url
-  end
-
-  def find_collection(collection)
-    cols = Collection.where(id: collection)
-    cols += Collection.where(title: collection).select{|col| col.title.first == collection}
-    return cols.last unless cols.empty?
-    return false
-  end
-
-  def find_or_create_collection(collection)
-    col = find_collection(collection)
-    return col if col
-    return false if collection.to_i > 0
-    col = Collection.create(title: [collection.to_s], depositor: operation.user.email, collection_type: Hyrax::CollectionType.find_by(title:"User Collection"))
-  end
-
-  def get_remote_id(value, authority: nil, property: nil)
-    return false
-    #TODO retrieve URL for this value from the specified remote authr
-  end
-
-  def format_param_name(name)
-    name.titleize.gsub(/\s+/, "").camelcase(:lower)
-  end
-
-  def schema
-    ScoobySnacks::METADATA_SCHEMA
-  end
-
   def find_field_name(field)
     operation.find_field_name(field)
   end
 
-  def
-
-    str[0].downcase + str[1..-1]
+  def schema
+    ScoobySnacks::METADATA_SCHEMA
   end
 
-  def split_values value_string
-    # Split values on all un-escaped separator character (escape character is '\')
-    # Then replace all escaped separator charactors with un-escaped versions
-    value_string.split(/(?<!\\)#{BulkOps::SEPARATOR}/).map{|val| val.gsub("\\#{BulkOps::SEPARATOR}",BulkOps::SEPARATOR).strip}
-  end
 
 end