bulk_ops 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bulk_ops.rb +34 -3
- data/lib/bulk_ops/create_spreadsheet_job.rb +1 -1
- data/lib/bulk_ops/github_access.rb +7 -11
- data/lib/bulk_ops/operation.rb +10 -29
- data/lib/bulk_ops/parser.rb +485 -0
- data/lib/bulk_ops/verification.rb +9 -9
- data/lib/bulk_ops/version.rb +1 -1
- data/lib/bulk_ops/work_proxy.rb +0 -459
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fea513373c0ae0267f9302311300b8f4ba03b9fa632db168aec201c2f8359182
+  data.tar.gz: baa0fe9b67bfbe7d2f8283ff7949cb8ec46e268c7e15ef17c7b73b9c3a80ef19
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 33810a935cc44ee6de4448a12e37d4c0889b6a4c7d409011fc5dd9d0bddc18e1a53f0f18337c933ab3dd6903d4112b0a968579f20e7e204d4278220c0dbb0315
+  data.tar.gz: b7ff43aed578a7aba0cb59d0862af6d1ffe7f50eccce6715171063a09e1edf2670e3d23333b4e506ff3d473ff6dbed56f672ff66b2f43209e58e67950706072a
data/lib/bulk_ops.rb
CHANGED
@@ -1,6 +1,39 @@
 require "bulk_ops/version"
 
 module BulkOps
+  OPTION_FIELDS = ['visibility','work type']
+  RELATIONSHIP_FIELDS = ['parent','child','collection','order']
+  REFERENCE_IDENTIFIER_FIELDS = ['Reference Identifier','ref_id','Reference ID','Relationship ID','Relationship Identifier','Reference Identifier Type','Reference ID Type','Ref ID Type','relationship_identifier_type','relationship_id_type']
+  FILE_FIELDS = ['file','files','filename','filenames']
+  FILE_ACTIONS = ['add','upload','remove','delete']
+  SEPARATOR = ';'
+  DEFAULT_ADMIN_SET_TITLE = "Bulk Ingest Set"
+  INGEST_MEDIA_PATH = "/dams_ingest"
+  TEMPLATE_DIR = "lib/bulk_ops/templates"
+  RELATIONSHIP_COLUMNS = ["parent","child","next"]
+  SPECIAL_COLUMNS = ["parent",
+                     "child",
+                     "order",
+                     "next",
+                     "work_type",
+                     "collection",
+                     "collection_title",
+                     "collection_id",
+                     "visibility",
+                     "relationship_identifier_type",
+                     "id",
+                     "filename",
+                     "file"]
+  IGNORED_COLUMNS = ["ignore","offline_notes"]
+  OPTION_REQUIREMENTS = {type: {required: true,
+                                values:[:ingest,:update]},
+                         file_method: {required: :true,
+                                       values: [:replace_some,:add_remove,:replace_all]},
+                         notifications: {required: true}}
+  SPREADSHEET_FILENAME = 'metadata.csv'
+  OPTIONS_FILENAME = 'configuration.yml'
+  ROW_OFFSET = 2
+
   dirstring = File.join( File.dirname(__FILE__), 'bulk_ops/**/*.rb')
   Dir[dirstring].each do |file|
     begin
@@ -9,7 +42,5 @@ module BulkOps
       puts "ERROR LOADING #{File.basename(file)}: #{e}"
     end
   end
-
-  # require 'bulk_ops/verification'
-  # require 'bulk_ops/work_proxy'
+
 end
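Note: the constants that were previously redefined inside BulkOps::Operation, BulkOps::GithubAccess, and BulkOps::WorkProxy now live once at the module level, so every call site reaches them through the BulkOps:: namespace. A minimal sketch of the new lookup style (illustrative only; not code from the release):

    require 'bulk_ops'

    BulkOps::SEPARATOR             #=> ';'
    BulkOps::SPREADSHEET_FILENAME  #=> 'metadata.csv'
    BulkOps::ROW_OFFSET            #=> 2
    # multi-valued spreadsheet cells split on the shared separator:
    'dogs;cats'.split(BulkOps::SEPARATOR)  #=> ["dogs", "cats"]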
data/lib/bulk_ops/create_spreadsheet_job.rb
CHANGED
@@ -36,7 +36,7 @@ class BulkOps::CreateSpreadsheetJob < ActiveJob::Base
         next if value.is_a? DateTime
         value = (label ? WorkIndexer.fetch_remote_label(value.id) : value.id) unless value.is_a? String
         value.gsub("\"","\"\"")
-      end.join(BulkOps::
+      end.join(BulkOps::SEPARATOR).prepend('"').concat('"')
     end.join(',')
   end
 
data/lib/bulk_ops/github_access.rb
CHANGED
@@ -5,10 +5,6 @@ require 'base64'
 
 class BulkOps::GithubAccess
 
-  ROW_OFFSET = 2
-  SPREADSHEET_FILENAME = 'metadata.csv'
-  OPTIONS_FILENAME = 'configuration.yml'
-
   attr_accessor :name
 
   def self.auth_url user
@@ -142,11 +138,11 @@ class BulkOps::GithubAccess
   def add_new_spreadsheet file, message=false
     if file.is_a? Tempfile
       file.close
-      add_file file.path, SPREADSHEET_FILENAME, message: message
+      add_file file.path, BulkOps::SPREADSHEET_FILENAME, message: message
     elsif file.is_a?(String) && File.file?(file)
-      add_file file, SPREADSHEET_FILENAME, message: message
+      add_file file, BulkOps::SPREADSHEET_FILENAME, message: message
     elsif file.is_a? String
-      add_contents(spreadsheet_path, SPREADSHEET_FILENAME, message: message)
+      add_contents(spreadsheet_path, BulkOps::SPREADSHEET_FILENAME, message: message)
     end
   end
 
@@ -218,12 +214,12 @@ class BulkOps::GithubAccess
 
   def get_metadata_row row_number
     @current_metadata ||= load_metadata
-    @current_metadata[row_number - ROW_OFFSET]
+    @current_metadata[row_number - BulkOps::ROW_OFFSET]
   end
 
   def get_past_metadata_row commit_sha, row_number
     past_metadata = Base64.decode64( client.contents(repo, path: filename, ref: commit_sha) )
-    past_metadata[row_number - ROW_OFFSET]
+    past_metadata[row_number - BulkOps::ROW_OFFSET]
   end
 
   def get_file filename
@@ -244,13 +240,13 @@ class BulkOps::GithubAccess
   end
 
   def spreadsheet_path
-    "#{name}/#{SPREADSHEET_FILENAME}"
+    "#{name}/#{BulkOps::SPREADSHEET_FILENAME}"
   end
 
   private
 
   def options_path
-    "#{name}/#{OPTIONS_FILENAME}"
+    "#{name}/#{BulkOps::OPTIONS_FILENAME}"
   end
 
   def current_master_commit_sha
data/lib/bulk_ops/operation.rb
CHANGED
@@ -7,33 +7,10 @@ module BulkOps
 
   include BulkOps::Verification
 
-  attr_accessor :work_type, :visibility, :reference_identifier
+  attr_accessor :work_type, :visibility, :reference_identifier, :metadata
 
   delegate :can_merge?, :merge_pull_request, to: :git
 
-  INGEST_MEDIA_PATH = "/dams_ingest"
-  TEMPLATE_DIR = "lib/bulk_ops/templates"
-  RELATIONSHIP_COLUMNS = ["parent","child","next"]
-  SPECIAL_COLUMNS = ["parent",
-                     "child",
-                     "order",
-                     "next",
-                     "work_type",
-                     "collection",
-                     "collection_title",
-                     "collection_id",
-                     "visibility",
-                     "relationship_identifier_type",
-                     "id",
-                     "filename",
-                     "file"]
-  IGNORED_COLUMNS = ["ignore","offline_notes"]
-  OPTION_REQUIREMENTS = {type: {required: true,
-                                values:[:ingest,:update]},
-                         file_method: {required: :true,
-                                       values: [:replace_some,:add_remove,:replace_all]},
-                         notifications: {required: true}}
-
   def self.unique_name name, user
     while BulkOps::Operation.find_by(name: name) || BulkOps::GithubAccess.list_branch_names(user).include?(name) do
       if ['-','_'].include?(name[-2]) && name[-1].to_i > 0
@@ -119,7 +96,7 @@ module BulkOps
     @metadata.each_with_index do |values,row_number|
       proxy = work_proxies.find_by(row_number: row_number)
       proxy.update(message: "interpreted at #{DateTime.now.strftime("%d/%m/%Y %H:%M")} " + proxy.message)
-      data = proxy.interpret_data values
+      data = BulkOps::Parser.new(proxy, @metadata).interpret_data(raw_row: values)
       next unless proxy.proxy_errors.blank?
       BulkOps::CreateWorkJob.perform_later(proxy.work_type || "Work",
                                            user.email,
@@ -202,7 +179,7 @@ module BulkOps
 
     #loop through the work proxies to create a job for each work
     work_proxies.each do |proxy|
-      data = proxy.interpret_data final_spreadsheet[proxy.row_number]
+      data = BulkOps::Parser.new(proxy,final_spreadsheet).interpret_data(raw_row: final_spreadsheet[proxy.row_number])
       BulkOps::UpdateWorkJob.perform_later(proxy.work_type || "",
                                            user.email,
                                            data,
@@ -238,13 +215,13 @@ module BulkOps
     bulk_ops_dir = Gem::Specification.find_by_name("bulk_ops").gem_dir
 
     #copy template files
-    Dir["#{bulk_ops_dir}/#{TEMPLATE_DIR}/*"].each do |file|
+    Dir["#{bulk_ops_dir}/#{BulkOps::TEMPLATE_DIR}/*"].each do |file|
       git.add_file file
     end
 
     #update configuration options
     unless options.blank?
-      full_options = YAML.load_file(File.join(bulk_ops_dir,TEMPLATE_DIR, BulkOps::
+      full_options = YAML.load_file(File.join(bulk_ops_dir,BulkOps::TEMPLATE_DIR, BulkOps::OPTIONS_FILENAME))
 
       options.each { |option, value| full_options[option] = value }
 
@@ -278,6 +255,10 @@ module BulkOps
     git.update_options(options, message: message)
   end
 
+  def metadata
+    @metadata ||= git.load_metadata
+  end
+
   def options
     return {} if name.nil?
     return @options if @options
@@ -332,7 +313,7 @@ module BulkOps
   end
 
   def ignored_fields
-    (options['ignored headers'] || []) + IGNORED_COLUMNS
+    (options['ignored headers'] || []) + BulkOps::IGNORED_COLUMNS
   end
 
 
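As the operation.rb hunks above show, per-row interpretation moved off of the work proxy and into the new BulkOps::Parser class. A hedged sketch of the new calling convention, reusing the names from the diff (proxy and metadata stand in for a real work proxy and a loaded spreadsheet):

    # each spreadsheet row is interpreted by a parser bound to its proxy;
    # the second argument defaults to proxy.operation.metadata when omitted
    parser = BulkOps::Parser.new(proxy, metadata)
    data   = parser.interpret_data(raw_row: metadata[proxy.row_number])
    # data is the attribute hash handed to CreateWorkJob / UpdateWorkJob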
data/lib/bulk_ops/parser.rb
ADDED
@@ -0,0 +1,485 @@
+class BulkOps::Parser
+  require 'uri'
+
+  attr_accessor :proxy, :raw_data, :raw_row
+
+  delegate :relationships, :operation, :row_number, :work_id, :visibility, :work_type, :reference_identifier, :order, to: :proxy
+
+  def initialize prx, metadata_sheet=nil
+    @proxy = prx
+    @raw_data = (metadata_sheet || proxy.operation.metadata)
+    @raw_row = @raw_data[@proxy.row_number].dup
+    @metadata = {}
+    @parsing_errors = []
+  end
+
+  def interpret_data raw_row: nil, raw_data: nil, proxy: nil
+    @raw_row = raw_row if raw_row.present?
+    @proxy = proxy if proxy.present?
+    @raw_data = raw_data if raw_data.present?
+    setAdminSet
+    setMetadataInheritance
+    interpret_option_fields
+    interpret_relationship_fields
+    disambiguate_columns
+    interpret_file_fields
+    interpret_controlled_fields
+    interpret_scalar_fields
+    @proxy.update(status: "ERROR", message: "error parsing spreadsheet line") if @parsing_errors.present?
+    @proxy.proxy_errors = (@proxy.proxy_errors || []) + @parsing_errors
+    return @metadata
+  end
+
+  def disambiguate_columns
+    #do nothing unless there are columns with the same header
+    return unless (@raw_row.respond_to?(:headers) && (@raw_row.headers.uniq.length < @raw_row.length) )
+    row = {}
+    (0...@raw_row.length).each do |i|
+      header = @raw_row.headers[i]
+      value = @raw_row[i]
+      # separate values in identical columns using the separator
+      row[header] = (Array(row[header]) << value).join(BulkOps::SEPARATOR)
+    end
+    #return a hash with identical columns merged
+    return row
+  end
+
+  def interpret_controlled_fields
+
+    # The labels array tracks the contents of columns marked as labels,
+    # which may require special validation
+    labels = {}
+
+    # This hash is populated with relevant data as we loop through the fields
+    controlled_data = {}
+
+    row = @raw_row.dup
+    @raw_row.each do |field_name, value|
+      next if value.blank? or field_name.blank?
+      field_name = field_name.to_s
+
+      #If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field_name == value
+
+      #check if they are using the 'field_name.authority' syntax
+      authority = nil
+      if ((split=field_name.split('.')).count == 2)
+        authority = split.last
+        field_name = split.first
+      end
+
+      # get the field name, if this column is a metadata field
+      field_name_norm = find_field_name(field_name)
+      field = schema.get_field(field_name_norm)
+
+      # Ignore anything that isn't a controlled field
+      next unless field.present? && field.controlled?
+
+      # Keep track of label fields
+      if field_name.downcase.ends_with?("label")
+        next if operation.options["ignore_labels"]
+        labels[field_name_norm] ||= []
+        labels[field_name_norm] += split_values value
+        next unless operation.options["import_labels"]
+      end
+
+      remove = field_name.downcase.starts_with?("remove") || field_name.downcase.starts_with?("delete")
+
+      # handle multiple values
+      value_array = split_values(value)
+      controlled_data[field_name_norm] ||= [] unless value_array.blank?
+      value_array.each do |value|
+        # Decide of we're dealing with a label or url
+        # It's an ID if it's a URL and the name doesn't end in 'label'
+        value.strip!
+        if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
+          value_id = value
+          # label = WorkIndexer.fetch_remote_label(value)
+          # error_message = "cannot fetch remote label for url: #{value}"
+          # report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
+        else
+          # It's a label, so unescape it and get the id
+          value = unescape_csv(value)
+          value_id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
+          # label = value
+          report_error(:cannot_retrieve_url,
+                       message: "cannot find or create url for controlled vocabulary label: #{value}",
+                       url: value,
+                       row_number: row_number) unless value_id
+        end
+        controlled_data[field_name_norm] << {id: value_id, remove: field_name.downcase.starts_with?("remove")}
+        row.delete(field_name)
+      end
+    end
+    @raw_row = row
+
+    # Actually add all the data
+    controlled_data.each do |property_name, data|
+      @metadata["#{property_name}_attributes"] ||= [] unless data.blank?
+      data.uniq.each do |datum|
+        atts = {"id" => datum[:id]}
+        atts["_delete"] = true if datum[:remove]
+        @metadata["#{property_name}_attributes"] << atts
+      end
+    end
+  end
+
+  def interpret_scalar_fields
+    row = @raw_row.dup
+    @raw_row.each do |field, values|
+      next if values.blank? or field.nil? or field == values
+      # get the field name, if this column is a metadata field
+      next unless field_name = find_field_name(field.to_s)
+      field = schema.get_field(field_name)
+      # Ignore controlled fields
+      next if field.controlled?
+      split_values(values).each do |value|
+        next if value.blank?
+        value = value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_') unless value.blank?
+        value = unescape_csv(value)
+        (@metadata[field_name] ||= []) << value
+        row.delete(field)
+      end
+    end
+    @raw_row = row
+  end
+
+  def interpret_file_fields
+    # This method handles file additions and deletions from the spreadsheet
+    # if additional files need to be deleted because the update is set to replace
+    # some or all existing files, those replacement-related deletions are handled
+    # by the BulkOps::Operation.
+    #
+    # TODO: THIS DOES NOT YET MANAGE THE ORDER OF INGESTED FILESETS
+
+    row = @raw_row.dup
+    @raw_row.each do |field, value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      #If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field == value
+
+
+      # Check if this is a file field, and whether we are removing or adding a file
+      next unless (action = is_file_field?(field))
+
+      # Move on if this field is the name of another property (e.g. masterFilename)
+      next if find_field_name(field)
+
+      # Check if we are removing a file
+      if action == "remove"
+        get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) }
+      else
+        # Add a file
+        operation.get_file_paths(value).each do |filepath|
+          begin
+            uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
+            (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
+            row.delete(field)
+          rescue Exception => e
+            report_error(:upload_error,
+                         message: "Error opening file: #{ filepath } -- #{e}",
+                         file: File.join(BulkOps::INGEST_MEDIA_PATH,filename),
+                         row_number: row_number)
+          end
+        end
+      end
+    end
+    @raw_row = row
+  end
+
+  def interpret_option_fields
+    row = @raw_row.dup
+    @raw_row.each do |field,value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      next if value == field
+
+      normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
+      if ["visibility", "public"].include?(normfield)
+        @proxy.update(visibility: format_visibility(value))
+        row.delete(field)
+      end
+      if ["worktype","model","type"].include?(normfield)
+        @proxy.update(work_type: format_worktype(value) )
+        row.delete(field)
+      end
+      if ["referenceidentifier",
+          "referenceid",
+          "refid",
+          "referenceidentifiertype",
+          "referenceidtype",
+          "refidtype",
+          "relationshipidentifier",
+          "relationshipid",
+          "relationshipidentifiertype",
+          "relationshipidtype",
+          "relid",
+          "relidtype"].include?(normfield)
+        @proxy.update(reference_identifier: format_reference_id(value))
+        row.delete(field)
+      end
+    end
+    @raw_row = row
+  end
+
+  def interpret_relationship_fields
+    row = @raw_row.dup
+    @raw_row.each do |field,value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      value = unescape_csv(value)
+      identifer_type = reference_identifier
+
+      next if value == field
+
+      # Correctly interpret the notation "parent:id", "parent id" etc in a column header
+      if (split = field.split(/[:_\-\s]/)).count == 2
+        identifier_type = split.last
+        relationship_type = split.first.to_s
+      else
+        relationship_type = field
+      end
+
+      relationship_type = normalize_relationship_field_name(relationship_type)
+      case relationship_type
+      when "order"
+        # If the field specifies the object's order among siblings
+        @proxy.update(order: value.to_f)
+        row.delete(field)
+        next
+      when "collection"
+        # If the field specifies the name or ID of a collection,
+        # find or create the collection and update the metadata to match
+        col = find_or_create_collection(value)
+        ( @metadata[:member_of_collection_ids] ||= [] ) << col.id if col
+        row.delete field
+        next
+      when "parent", "child"
+
+        # correctly interpret the notation "id:a78C2d81"
+        identifier_type, object_identifier = interpret_relationship_value(identifier_type, value)
+
+        relationship_parameters = { work_proxy_id: @proxy.id,
+                                    identifier_type: identifier_type,
+                                    relationship_type: relationship_type,
+                                    object_identifier: object_identifier,
+                                    status: "new"}
+
+        #add previous sibling link if necessary
+        previous_value = @raw_data[row_number-1][field]
+        # Check if this is a parent relationship, and the previous row also has one
+        if previous_value.present? && (relationship_type == "parent")
+          # Check if the previous row has the same parent as this row
+          if object_identifier == interpret_relationship_value(identifier_type, previous_value, field).last
+            # If so, set the previous sibling parameter on the relationshp
+            # to the id for the proxy associated with the previous row
+            relationship_parameters[:previous_sibling] = operation.work_proxies.find_by(row_number: row_number-1).id
+          end
+        end
+        BulkOps::Relationship.create(relationship_parameters)
+        row.delete field
+      end
+    end
+    @raw_row = row
+  end
+
+  def normalize_relationship_field_name field
+    normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
+    BulkOps::RELATIONSHIP_FIELDS.find{|rel_field| normfield == rel_field }
+  end
+
+  def find_previous_parent field="parent"
+    #Return the row number of the most recent preceding row that does
+    # not itself have a parent defined
+    i = 1;
+    while (prev_row = raw_data[row_number - i])
+      return (row_number - i) if prev_row[field].blank?
+      i += 1
+    end
+  end
+
+  def interpret_relationship_value id_type, value, field="parent"
+    #Handle "id:20kj4259" syntax if it hasn't already been handled
+    if (split = value.to_s.split(":")).count == 2
+      id_type = split.first
+      value = split.last
+    end
+    #Handle special shorthand syntax for refering to relative row numbers
+    if id_type == "row"
+      if value.to_i < 0
+        # if given a negative integer, count backwards from the current row (remember that value.to_i is negative)
+        return [id_type,row_number + value.to_i]
+      elsif value.to_s.downcase.include?("prev")
+        # if given any variation of the word "previous", get the first preceding row with no parent of its own
+        return [id_type,find_previous_parent(field)]
+      end
+    end
+    return [id_type,value]
+  end
+
+  def unescape_csv(value)
+    value.gsub(/\\(['";,])/,'\1')
+  end
+
+
+  def format_worktype(value)
+    # format the value like a class name
+    type = value.titleize.gsub(/[-_\s]/,'')
+    # reject it if it isn't a defined class
+    type = false unless Object.const_defined? type
+    # fall back to the work type defined by the operation, or a standard "Work"
+    return type ||= work_type || operation.work_type || "Work"
+  end
+
+  def format_visibility(value)
+    case value.downcase
+    when "public", "open", "true"
+      return "open"
+    when "campus", "ucsc", "institution"
+      return "ucsc"
+    when "restricted", "private", "closed", "false"
+      return "restricted"
+    end
+  end
+
+
+  def mintLocalAuthUrl(auth_name, value)
+    value.strip!
+    id = value.parameterize
+    auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
+    entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
+                                           label: value,
+                                           uri: id)
+    return localIdToUrl(id,auth_name)
+  end
+
+  def findAuthUrl(auth, value)
+    value.strip!
+    return nil if auth.nil?
+    return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
+    entries.each do |entry|
+      #require exact match
+      next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
+      url = entry["url"] || entry["id"]
+      # url = localIdToUrl(url,auth) unless url =~ URI::regexp
+      return url
+    end
+    return nil
+  end
+
+  def localIdToUrl(id,auth_name)
+    root_urls = {production: "https://digitalcollections.library.ucsc.edu",
+                 staging: "http://digitalcollections-staging.library.ucsc.edu",
+                 development: "http://#{Socket.gethostname}",
+                 test: "http://#{Socket.gethostname}"}
+    return "#{root_urls[Rails.env.to_sym]}/authorities/show/local/#{auth_name}/#{id}"
+  end
+
+  def getLocalAuth(field_name)
+    field = schema.get_property(field_name)
+    # There is only ever one local authority per field, so just pick the first you find
+    if vocs = field.vocabularies
+      vocs.each do |voc|
+        return voc["subauthority"] if voc["authority"].downcase == "local"
+      end
+    end
+    return nil
+  end
+
+  def setAdminSet
+    return if @metadata[:admin_set_id]
+    asets = AdminSet.where({title: "Bulk Ingest Set"})
+    asets = AdminSet.find('admin_set/default') if asets.blank?
+    @metadata[:admin_set_id] = Array(asets).first.id unless asets.blank?
+  end
+
+  def setMetadataInheritance
+    return if @metadata[:metadataInheritance].present?
+    @metadata[:metadataInheritance] = operation.options["metadataInheritance"] unless operation.options["metadataInheritance"].blank?
+  end
+
+  def report_error type, message, **args
+    puts "ERROR MESSAGE: #{message}"
+    @proxy.update(status: "error", message: message)
+    args[:type]=type
+    (@parsing_errors ||= []) << BulkOps::Error.new(**args)
+  end
+
+  def get_removed_filesets(filestring)
+    file_ids = split_values(filestring)
+    file_ids.select{|file_id| record_exists?(file_id)}
+
+    # This part handles filenames in addition to file ids. It doesn't work yet!
+    # file_ids.map do |file_id|
+    # If the filename is the id of an existing record, keep that
+    # next(file_id) if (record_exists?(file_id))
+    # If this is the label (i.e.filename) of an existing fileset, use that fileset id
+    # TODO MAKE THIS WORK!!
+    # next(filename) if (filename_exists?(filename))
+    # File.join(BulkOps::INGEST_MEDIA_PATH, filename_prefix, filename)
+    # end
+  end
+
+  def delete_file_set fileset_id
+    BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
+  end
+
+
+  def is_file_field? field
+    operation.is_file_field? field
+  end
+
+  def record_exists? id
+    operation.record_exists? id
+  end
+
+  def localAuthUrl(property, value)
+    return value if (auth = getLocalAuth(property)).nil?
+    url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value)
+    return url
+  end
+
+  def find_collection(collection)
+    cols = Collection.where(id: collection)
+    cols += Collection.where(title: collection).select{|col| col.title.first == collection}
+    return cols.last unless cols.empty?
+    return false
+  end
+
+  def find_or_create_collection(collection)
+    col = find_collection(collection)
+    return col if col
+    return false if collection.to_i > 0
+    col = Collection.create(title: [collection.to_s], depositor: operation.user.email, collection_type: Hyrax::CollectionType.find_by(title:"User Collection"))
+  end
+
+  def get_remote_id(value, authority: nil, property: nil)
+    return false
+    #TODO retrieve URL for this value from the specified remote authr
+  end
+
+  def format_param_name(name)
+    name.titleize.gsub(/\s+/, "").camelcase(:lower)
+  end
+
+  def schema
+    ScoobySnacks::METADATA_SCHEMA
+  end
+
+  def find_field_name(field)
+    operation.find_field_name(field)
+  end
+
+  def downcase_first_letter(str)
+    return "" unless str
+    str[0].downcase + str[1..-1]
+  end
+
+  def split_values value_string
+    # Split values on all un-escaped separator character (escape character is '\')
+    # Then replace all escaped separator charactors with un-escaped versions
+    value_string.split(/(?<!\\)#{BulkOps::SEPARATOR}/).map{|val| val.gsub("\\#{BulkOps::SEPARATOR}",BulkOps::SEPARATOR).strip}
+  end
+
+end
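Taken together, the parser's helpers define the spreadsheet conventions the gem accepts. A few illustrative header and cell forms, inferred from the code above (the specific values are hypothetical):

    # 'field_name.authority' headers pick a vocabulary authority:
    #   header 'subject.lcsh'  -> field 'subject', authority 'lcsh'
    # relationship cells accept an explicit identifier type:
    #   'id:a78C2d81'   -> identifier_type 'id', object_identifier 'a78C2d81'
    #   'row:-1'        -> the work defined one row above this one
    #   'row:previous'  -> nearest preceding row with no parent of its own
    # multiple values share one cell, separated by an unescaped ';':
    #   'Dogs;Cats'     -> ["Dogs", "Cats"]   (see split_values)
    #   'AC\;DC'        -> ["AC;DC"]          (escaped separator preserved)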
data/lib/bulk_ops/verification.rb
CHANGED
@@ -35,7 +35,7 @@ module BulkOps
     return false if fieldname.blank?
     return false if schema.get_field(fieldname)
     field_parts = fieldname.underscore.humanize.downcase.gsub(/[-_]/,' ').split(" ")
-    return false unless field_parts.any?{ |field_type| BulkOps::
+    return false unless field_parts.any?{ |field_type| BulkOps::FILE_FIELDS.include?(field_type) }
     return "remove" if field_parts.any?{ |field_type| ['remove','delete'].include?(field_type) }
     return "add"
   end
@@ -46,7 +46,7 @@ module BulkOps
     name.gsub!(/[_\s-]?[lL]abel$/,'')
     name.gsub!(/^[rR]emove[_\s-]?/,'')
     name.gsub!(/^[dD]elete[_\s-]?/,'')
-    possible_fields = Work.attribute_names + schema.all_field_names
+    possible_fields = (Work.attribute_names + schema.all_field_names).uniq
     matching_fields = possible_fields.select{|pfield| pfield.gsub(/[_\s-]/,'').parameterize == name.gsub(/[_\s-]/,'').parameterize }
     return false if matching_fields.blank?
     # raise Exception "Ambiguous metadata fields!" if matching_fields.uniq.count > 1
@@ -55,8 +55,8 @@ module BulkOps
 
   def get_file_paths(filestring)
     return [] if filestring.blank?
-    filenames = filestring.split(BulkOps::
-    filenames.map { |filename| File.join(BulkOps::
+    filenames = filestring.split(BulkOps::SEPARATOR)
+    filenames.map { |filename| File.join(BulkOps::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
   end
 
   def record_exists? id
@@ -85,7 +85,7 @@ module BulkOps
   end
 
   def verify_configuration
-    BulkOps::
+    BulkOps::OPTION_REQUIREMENTS.each do |option_name, option_info|
       # Make sure it's present if required
       if (option_info["required"].to_s == "true") || (option_info["required"].to_s == type)
         if options[option_name].blank?
@@ -120,7 +120,7 @@ module BulkOps
       # Ignore everything marked as a label
       next if column_name_redux.ends_with? "label"
      # Ignore any column names with special meaning in hyrax
-      next if BulkOps::
+      next if BulkOps::SPECIAL_COLUMNS.any?{|col| col.downcase.parameterize.gsub(/[_\s-]/,"") == column_name_redux }
       # Ignore any columns speficied to be ignored in the configuration
       ignored = options["ignored headers"] || []
       next if ignored.any?{|col| col.downcase.parameterize.gsub(/[_\s-]/,"") == column_name_redux }
@@ -131,7 +131,7 @@ module BulkOps
   end
 
   def verify_remote_urls
-    row_offset = BulkOps::
+    row_offset = BulkOps::ROW_OFFSET.present? ? BulkOps::ROW_OFFSET : 2
     get_spreadsheet.each_with_index do |row, row_num|
       update(message: "verifying controlled vocab urls (row number #{row_num})")
       next if row_num.nil?
@@ -173,7 +173,7 @@ module BulkOps
   def get_ref_id row
     row.each do |field,value|
       next if field.blank? or value.blank? or field === value
-      next unless BulkOps::
+      next unless BulkOps::REFERENCE_IDENTIFIER_FIELDS.any?{ |ref_field| normalize_field(ref_field) == normalize_field(field) }
       return value
     end
     # No reference identifier specified in the row. Use the default for the operation.
@@ -190,7 +190,7 @@ module BulkOps
     # This is sketchy. Redo it.
     (metadata = get_spreadsheet).each do |row,row_num|
       ref_id = get_ref_id(row)
-      BulkOps::
+      BulkOps::RELATIONSHIP_COLUMNS.each do |relationship|
         next unless (obj_id = row[relationship])
         if (split = obj_id.split(':')).present? && split.count == 2
           ref_id = split[0].downcase
data/lib/bulk_ops/version.rb
CHANGED
data/lib/bulk_ops/work_proxy.rb
CHANGED
@@ -1,12 +1,5 @@
 class BulkOps::WorkProxy < ActiveRecord::Base
 
-  require 'uri'
-  OPTION_FIELDS = ['visibility','work type']
-  RELATIONSHIP_FIELDS = ['parent','child','collection','order']
-  REFERENCE_IDENTIFIER_FIELDS = ['Reference Identifier','ref_id','Reference ID','Relationship ID','Relationship Identifier','Reference Identifier Type','Reference ID Type','Ref ID Type','relationship_identifier_type','relationship_id_type']
-  FILE_FIELDS = ['file','files','filename','filenames']
-  FILE_ACTIONS = ['add','upload','remove','delete']
-  SEPARATOR = ';'
   self.table_name = "bulk_ops_work_proxies"
   belongs_to :operation, class_name: "BulkOps::Operation", foreign_key: "operation_id"
   has_many :relationships, class_name: "BulkOps::Relationship"
@@ -40,462 +33,10 @@ class BulkOps::WorkProxy < ActiveRecord::Base
     # TODO make it so people can edit the work again
   end
 
-  def interpret_data raw_data
-    admin_set = AdminSet.where(title: "Bulk Ingest Set").first || AdminSet.find(AdminSet.find_or_create_default_admin_set_id)
-    metadata = {admin_set_id: admin_set.id}
-    metadata.merge! interpret_file_fields(raw_data)
-    metadata.merge! interpret_controlled_fields(raw_data)
-    metadata.merge! interpret_scalar_fields(raw_data)
-    metadata.merge! interpret_relationship_fields(raw_data)
-    metadata.merge! interpret_option_fields(raw_data)
-    metadata = setAdminSet(metadata)
-    metadata = setMetadataInheritance(metadata)
-    return metadata
-  end
 
   def proxy_errors
     @proxy_errors ||= []
   end
 
-  private
-
-  def is_file_field? field
-    operation.is_file_field? field
-  end
-
-  def record_exists? id
-    operation.record_exists? id
-  end
-
-  def localAuthUrl(property, value)
-    return value if (auth = getLocalAuth(property)).nil?
-    url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value)
-    return url
-  end
-
-  def find_collection(collection)
-    cols = Collection.where(id: collection)
-    cols += Collection.where(title: collection).select{|col| col.title.first == collection}
-    return cols.last unless cols.empty?
-    return false
-  end
-
-  def find_or_create_collection(collection)
-    col = find_collection(collection)
-    return col if col
-    return false if collection.to_i > 0
-    col = Collection.create(title: [collection.to_s], depositor: operation.user.email, collection_type: Hyrax::CollectionType.find_by(title:"User Collection"))
-  end
-
-  def get_remote_id(value, authority: nil, property: nil)
-    return false
-    #TODO retrieve URL for this value from the specified remote authr
-  end
-
-  def format_param_name(name)
-    name.titleize.gsub(/\s+/, "").camelcase(:lower)
-  end
-
-  def schema
-    ScoobySnacks::METADATA_SCHEMA
-  end
-
-  def find_field_name(field)
-    operation.find_field_name(field)
-  end
-
-  def downcase_first_letter(str)
-    return "" unless str
-    str[0].downcase + str[1..-1]
-  end
-
-  def split_values value_string
-    # Split values on all un-escaped separator character (escape character is '\')
-    # Then replace all escaped separator charactors with un-escaped versions
-    value_string.split(/(?<!\\)#{SEPARATOR}/).map{|val| val.gsub("\\#{SEPARATOR}",SEPARATOR).strip}
-  end
-
-  def interpret_controlled_fields raw_data
-
-    # The labels array tracks the contents of columns marked as labels,
-    # which may require special validation
-    labels = {}
-
-    # This hash is populated with relevant data as we loop through the fields
-    controlled_data = {}
-
-    raw_data.each do |field_name, value|
-      next if value.blank? or field_name.blank?
-      field_name = field_name.to_s
-
-      #If our CSV interpreter is feeding us the headers as a line, ignore it.
-      next if field_name == value
-
-      #check if they are using the 'field_name.authority' syntax
-      authority = nil
-      if ((split=field_name.split('.')).count == 2)
-        authority = split.last
-        field_name = split.first
-      end
-
-      # get the field name, if this column is a metadata field
-      field_name_norm = find_field_name(field_name)
-      field = schema.get_field(field_name_norm)
-
-      # Ignore anything that isn't a controlled field
-      next unless field.present? && field.controlled?
-
-      # Keep track of label fields
-      if field_name.downcase.ends_with?("label")
-        next if operation.options["ignore_labels"]
-        labels[field_name_norm] ||= []
-        labels[field_name_norm] += split_values value
-        next unless operation.options["import_labels"]
-      end
-
-      remove = field_name.downcase.starts_with?("remove") || field_name.downcase.starts_with?("delete")
-
-      # handle multiple values
-      value_array = split_values(value)
-      controlled_data[field_name_norm] ||= [] unless value_array.blank?
-      value_array.each do |value|
-        # Decide of we're dealing with a label or url
-        # It's an ID if it's a URL and the name doesn't end in 'label'
-        value.strip!
-        if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
-          id = value
-          # label = WorkIndexer.fetch_remote_label(value)
-          # error_message = "cannot fetch remote label for url: #{value}"
-          # report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
-        else
-          # It's a label, so unescape it and get the id
-          value = unescape_csv(value)
-          id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
-          # label = value
-          report_error(:cannot_retrieve_url,
-                       message: "cannot find or create url for controlled vocabulary label: #{value}",
-                       url: value,
-                       row_number: row_number) unless id
-        end
-        controlled_data[field_name_norm] << {id: id, remove: field_name.downcase.starts_with?("remove")}
-      end
-    end
-
-    #delete any duplicates (if someone listed a url and also its label, or the same url twice)
-    controlled_data.each{|field_name, values| controlled_data[field_name] = values.uniq }
-
-    # Actually add all the data
-    metadata = {}
-    leftover_data = raw_data.dup.to_hash
-    controlled_data.each do |property_name, data|
-      metadata["#{property_name}_attributes"] ||= [] unless data.blank?
-      data.each do |datum|
-        atts = {"id" => datum[:id]}
-        atts["_delete"] = true if datum[:remove]
-        metadata["#{property_name}_attributes"] << atts
-        leftover_data.except! property_name
-      end
-    end
-    #return [metadata, leftover_data]
-    return metadata
-  end
-
-  def interpret_scalar_fields raw_data
-    metadata = {}
-    raw_data.each do |field, values|
-      next if values.blank? or field.nil? or field == values
-      # get the field name, if this column is a metadata field
-      next unless field_name = find_field_name(field.to_s)
-      field = schema.get_field(field_name)
-      # Ignore controlled fields
-      next if field.controlled?
-      split_values(values).each do |value|
-        next if value.blank?
-        value = value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_') unless value.blank?
-        value = unescape_csv(value)
-        (metadata[field_name] ||= []) << value
-      end
-    end
-    return metadata
-  end
-
-  def interpret_file_fields raw_data
-    # This method handles file additions and deletions from the spreadsheet
-    # if additional files need to be deleted because the update is set to replace
-    # some or all existing files, those replacement-related deletions are handled
-    # by the BulkOps::Operation.
-    #
-    # TODO: THIS DOES NOT YET MANAGE THE ORDER OF INGESTED FILESETS
-
-    metadata = {}
-    raw_data.each do |field, value|
-      next if value.blank? or field.blank?
-      field = field.to_s
-      #If our CSV interpreter is feeding us the headers as a line, ignore it.
-      next if field == value
-
-
-      # Check if this is a file field, and whether we are removing or adding a file
-      next unless (action = is_file_field?(field))
-
-      # Move on if this field is the name of another property (e.g. masterFilename)
-      next if find_field_name(field)
-
-      # Check if we are removing a file
-      if action == "remove"
-        get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) }
-      else
-        # Add a file
-        operation.get_file_paths(value).each do |filepath|
-          begin
-            uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
-            (metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
-          rescue Exception => e
-            report_error(:upload_error,
-                         message: "Error opening file: #{ filepath } -- #{e}",
-                         file: File.join(BulkOps::Operation::INGEST_MEDIA_PATH,filename),
-                         row_number: row_number)
-          end
-        end
-      end
-    end
-    return metadata
-  end
-
-  def interpret_option_fields raw_data
-    raw_data.each do |field,value|
-      next if value.blank? or field.blank?
-      field = field.to_s
-      next if value == field
-
-      normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
-      if ["visibility", "public"].include?(normfield)
-        update(visibility: format_visibility(value))
-      end
-      if ["worktype","model","type"].include?(normfield)
-        update(work_type: format_worktype(value) )
-      end
-      if ["referenceidentifier",
-          "referenceid",
-          "refid",
-          "referenceidentifiertype",
-          "referenceidtype",
-          "refidtype",
-          "relationshipidentifier",
-          "relationshipid",
-          "relationshipidentifiertype",
-          "relationshipidtype",
-          "relid",
-          "relidtype"].include?(normfield)
-        update(reference_identifier: format_reference_id(value))
-      end
-    end
-    return {}
-  end
-
-  def interpret_relationship_fields(raw_data)
-    metadata = {}
-    raw_data.each do |field,value|
-      next if value.blank? or field.blank?
-      field = field.to_s
-      value = unescape_csv(value)
-      identifer_type = reference_identifier
-
-      next if value == field
-
-      if (split = field.split(":")).count == 2
-        identifier_type = split.last
-        relationship_type = split.first.to_s
-      else
-        relationship_type = field
-      end
-
-      relationship_type = normalize_relationship_field_name(relationship_type)
-      case relationship_type
-      when "order"
-        # If the field specifies the object's order among siblings
-        update(order: value.to_f)
-        next
-      when "collection"
-        # If the field specifies the name or ID of a collection,
-        # find or create the collection and update the metadata to match
-        col = find_or_create_collection(value)
-        ( metadata[:member_of_collection_ids] ||= [] ) << col.id if col
-        next
-      when "parent", "child"
-
-        # correctly interpret the notation "id:a78C2d81"
-        identifier_type, object_identifier = interpret_relationship_value(identifier_type, value)
-
-        relationship_parameters = { work_proxy_id: id,
-                                    identifier_type: identifier_type,
-                                    relationship_type: relationship_type,
-                                    object_identifier: object_identifier,
-                                    status: "new"}
-
-        #add previous sibling link if necessary
-        previous_value = operation.final_spreadsheet[row_number-1][field]
-        # Check if this is a parent relationship, and the previous row also has one
-        if previous_value.present? && (relationship_type == "parent")
-          # Check if the previous row has the same parent as this row
-          if object_identifier == interpret_relationship_value(identifier_type, previous_value, field).last
-            # If so, set the previous sibling parameter on the relationshp
-            # to the id for the proxy associated with the previous row
-            relationship_parameters[:previous_sibling] = operation.work_proxies.find_by(row_number: row_number-1).id
-          end
-        end
-        BulkOps::Relationship.create(relationship_parameters)
-      end
-      return metadata
-    end
-  end
-
-  def normalize_relationship_field_name field
-    normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
-    RELATIONSHIP_FIELDS.find{|field| normfield.include?(field) }
-  end
-
-  def find_previous_parent field="parent"
-    #Return the row number of the most recent preceding row that does
-    # not itself have a parent defined
-    i = 0;
-    while (prev_row = operation.final_spreadsheet[row_number - i])
-      return (row_number - i) if prev_row[field].blank?
-    end
-  end
-
-  def interpret_relationship_value id_type, value, field="parent"
-    #Handle "id:20kj4259" syntax if it hasn't already been handled
-    if (split = value.to_s.split(":")).count == 2
-      id_type = split.first
-      value = split.last
-    end
-    #Handle special shorthand syntax for refering to relative row numbers
-    if id_type == "row"
-      if value.to_i < 0
-        # if given a negative integer, count backwards from the current row
-        return [id_type,row_number - value]
-      elsif value.to_s.downcase.include?("prev")
-        # if given any variation of the word "previous", get the first preceding row with no parent of its own
-        return [id_type,find_previous_parent(field)]
-      end
-    end
-    return [id_type,value]
-  end
-
-  def unescape_csv(value)
-    value.gsub(/\\(['";,])/,'\1')
-  end
-
-  def format_worktype(value)
-    # format the value like a class name
-    type = value.titleize.gsub(/[-_\s]/,'')
-    # reject it if it isn't a defined class
-    type = false unless Object.const_defined? type
-    # fall back to the work type defined by the operation, or a standard "Work"
-    return type ||= operation.work_type || "Work"
-  end
-
-  def format_visibility(value)
-    case value.downcase
-    when "public", "open", "true"
-      return "open"
-    when "campus", "ucsc", "institution"
-      return "ucsc"
-    when "restricted", "private", "closed", "false"
-      return "restricted"
-    end
-  end
-
-  def mintLocalAuthUrl(auth_name, value)
-    value.strip!
-    id = value.parameterize
-    auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
-    entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
-                                           label: value,
-                                           uri: id)
-    return localIdToUrl(id,auth_name)
-  end
-
-  def findAuthUrl(auth, value)
-    value.strip!
-    return nil if auth.nil?
-    return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
-    entries.each do |entry|
-      #require exact match
-      next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
-      url = entry["url"] || entry["id"]
-      # url = localIdToUrl(url,auth) unless url =~ URI::regexp
-      return url
-    end
-    return nil
-  end
-
-  def localIdToUrl(id,auth_name)
-    root_urls = {production: "https://digitalcollections.library.ucsc.edu",
-                 staging: "http://digitalcollections-staging.library.ucsc.edu",
-                 development: "http://#{Socket.gethostname}",
-                 test: "http://#{Socket.gethostname}"}
-    return "#{root_urls[Rails.env.to_sym]}/authorities/show/local/#{auth_name}/#{id}"
-  end
-
-  def getLocalAuth(field_name)
-    field = schema.get_property(field_name)
-    # There is only ever one local authority per field, so just pick the first you find
-    if vocs = field.vocabularies
-      vocs.each do |voc|
-        return voc["subauthority"] if voc["authority"].downcase == "local"
-      end
-    end
-    return nil
-  end
-
-  def setAdminSet metadata
-    return metadata if metadata[:admin_set_id]
-    asets = AdminSet.where({title: "Bulk Ingest Set"})
-    asets = AdminSet.find('admin_set/default') if asets.blank?
-    metadata[:admin_set_id] = Array(asets).first.id unless asets.blank?
-    return metadata
-  end
-
-  def setMetadataInheritance metadata
-    return metadata if metadata[:metadataInheritance].present?
-    metadata[:metadataInheritance] = operation.options["metadataInheritance"] unless operation.options["metadataInheritance"].blank?
-    return metadata
-  end
-
-  def report_error type, message, **args
-    puts "ERROR MESSAGE: #{message}"
-    update(status: "error", message: message)
-    args[:type]=type
-    (@proxy_errors ||= []) << BulkOps::Error.new(**args)
-  end
-
-  def filename_prefix
-    @filename_prefix ||= operation.filename_prefix
-  end
-
-  def record_exists?
-    operation.record_exists? work_id
-  end
-
-  def get_removed_filesets(filestring)
-    file_ids = split_values(filestring)
-    file_ids.select{|file_id| record_exists?(file_id)}
-
-    # This part handles filenames in addition to file ids. It doesn't work yet!
-    # file_ids.map do |file_id|
-    # If the filename is the id of an existing record, keep that
-    # next(file_id) if (record_exists?(file_id))
-    # If this is the label (i.e.filename) of an existing fileset, use that fileset id
-    # TODO MAKE THIS WORK!!
-    # next(filename) if (filename_exists?(filename))
-    # File.join(BulkOps::Operation::INGEST_MEDIA_PATH, filename_prefix, filename)
-    # end
-  end
-
-  def delete_file_set fileset_id
-    BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
-  end
 
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk_ops
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.15
 platform: ruby
 authors:
 - Ned Henry, UCSC Library Digital Initiatives
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-10-
+date: 2019-10-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rails
@@ -106,6 +106,7 @@ files:
 - lib/bulk_ops/github_access.rb
 - lib/bulk_ops/github_credential.rb
 - lib/bulk_ops/operation.rb
+- lib/bulk_ops/parser.rb
 - lib/bulk_ops/queue_work_ingests_job.rb
 - lib/bulk_ops/relationship.rb
 - lib/bulk_ops/search_builder_behavior.rb