bulk_ops 0.1.14 → 0.1.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/bulk_ops.rb +34 -3
- data/lib/bulk_ops/create_spreadsheet_job.rb +1 -1
- data/lib/bulk_ops/github_access.rb +7 -11
- data/lib/bulk_ops/operation.rb +10 -29
- data/lib/bulk_ops/parser.rb +485 -0
- data/lib/bulk_ops/verification.rb +9 -9
- data/lib/bulk_ops/version.rb +1 -1
- data/lib/bulk_ops/work_proxy.rb +0 -459
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fea513373c0ae0267f9302311300b8f4ba03b9fa632db168aec201c2f8359182
|
4
|
+
data.tar.gz: baa0fe9b67bfbe7d2f8283ff7949cb8ec46e268c7e15ef17c7b73b9c3a80ef19
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 33810a935cc44ee6de4448a12e37d4c0889b6a4c7d409011fc5dd9d0bddc18e1a53f0f18337c933ab3dd6903d4112b0a968579f20e7e204d4278220c0dbb0315
|
7
|
+
data.tar.gz: b7ff43aed578a7aba0cb59d0862af6d1ffe7f50eccce6715171063a09e1edf2670e3d23333b4e506ff3d473ff6dbed56f672ff66b2f43209e58e67950706072a
|
data/lib/bulk_ops.rb
CHANGED
@@ -1,6 +1,39 @@
|
|
1
1
|
require "bulk_ops/version"
|
2
2
|
|
3
3
|
module BulkOps
|
4
|
+
OPTION_FIELDS = ['visibility','work type']
|
5
|
+
RELATIONSHIP_FIELDS = ['parent','child','collection','order']
|
6
|
+
REFERENCE_IDENTIFIER_FIELDS = ['Reference Identifier','ref_id','Reference ID','Relationship ID','Relationship Identifier','Reference Identifier Type','Reference ID Type','Ref ID Type','relationship_identifier_type','relationship_id_type']
|
7
|
+
FILE_FIELDS = ['file','files','filename','filenames']
|
8
|
+
FILE_ACTIONS = ['add','upload','remove','delete']
|
9
|
+
SEPARATOR = ';'
|
10
|
+
DEFAULT_ADMIN_SET_TITLE = "Bulk Ingest Set"
|
11
|
+
INGEST_MEDIA_PATH = "/dams_ingest"
|
12
|
+
TEMPLATE_DIR = "lib/bulk_ops/templates"
|
13
|
+
RELATIONSHIP_COLUMNS = ["parent","child","next"]
|
14
|
+
SPECIAL_COLUMNS = ["parent",
|
15
|
+
"child",
|
16
|
+
"order",
|
17
|
+
"next",
|
18
|
+
"work_type",
|
19
|
+
"collection",
|
20
|
+
"collection_title",
|
21
|
+
"collection_id",
|
22
|
+
"visibility",
|
23
|
+
"relationship_identifier_type",
|
24
|
+
"id",
|
25
|
+
"filename",
|
26
|
+
"file"]
|
27
|
+
IGNORED_COLUMNS = ["ignore","offline_notes"]
|
28
|
+
OPTION_REQUIREMENTS = {type: {required: true,
|
29
|
+
values:[:ingest,:update]},
|
30
|
+
file_method: {required: :true,
|
31
|
+
values: [:replace_some,:add_remove,:replace_all]},
|
32
|
+
notifications: {required: true}}
|
33
|
+
SPREADSHEET_FILENAME = 'metadata.csv'
|
34
|
+
OPTIONS_FILENAME = 'configuration.yml'
|
35
|
+
ROW_OFFSET = 2
|
36
|
+
|
4
37
|
dirstring = File.join( File.dirname(__FILE__), 'bulk_ops/**/*.rb')
|
5
38
|
Dir[dirstring].each do |file|
|
6
39
|
begin
|
@@ -9,7 +42,5 @@ module BulkOps
|
|
9
42
|
puts "ERROR LOADING #{File.basename(file)}: #{e}"
|
10
43
|
end
|
11
44
|
end
|
12
|
-
|
13
|
-
# require 'bulk_ops/verification'
|
14
|
-
# require 'bulk_ops/work_proxy'
|
45
|
+
|
15
46
|
end
|
@@ -36,7 +36,7 @@ class BulkOps::CreateSpreadsheetJob < ActiveJob::Base
|
|
36
36
|
next if value.is_a? DateTime
|
37
37
|
value = (label ? WorkIndexer.fetch_remote_label(value.id) : value.id) unless value.is_a? String
|
38
38
|
value.gsub("\"","\"\"")
|
39
|
-
end.join(BulkOps::
|
39
|
+
end.join(BulkOps::SEPARATOR).prepend('"').concat('"')
|
40
40
|
end.join(',')
|
41
41
|
end
|
42
42
|
|
@@ -5,10 +5,6 @@ require 'base64'
|
|
5
5
|
|
6
6
|
class BulkOps::GithubAccess
|
7
7
|
|
8
|
-
ROW_OFFSET = 2
|
9
|
-
SPREADSHEET_FILENAME = 'metadata.csv'
|
10
|
-
OPTIONS_FILENAME = 'configuration.yml'
|
11
|
-
|
12
8
|
attr_accessor :name
|
13
9
|
|
14
10
|
def self.auth_url user
|
@@ -142,11 +138,11 @@ class BulkOps::GithubAccess
|
|
142
138
|
def add_new_spreadsheet file, message=false
|
143
139
|
if file.is_a? Tempfile
|
144
140
|
file.close
|
145
|
-
add_file file.path, SPREADSHEET_FILENAME, message: message
|
141
|
+
add_file file.path, BulkOps::SPREADSHEET_FILENAME, message: message
|
146
142
|
elsif file.is_a?(String) && File.file?(file)
|
147
|
-
add_file file, SPREADSHEET_FILENAME, message: message
|
143
|
+
add_file file, BulkOps::SPREADSHEET_FILENAME, message: message
|
148
144
|
elsif file.is_a? String
|
149
|
-
add_contents(spreadsheet_path, SPREADSHEET_FILENAME, message: message)
|
145
|
+
add_contents(spreadsheet_path, BulkOps::SPREADSHEET_FILENAME, message: message)
|
150
146
|
end
|
151
147
|
end
|
152
148
|
|
@@ -218,12 +214,12 @@ class BulkOps::GithubAccess
|
|
218
214
|
|
219
215
|
def get_metadata_row row_number
|
220
216
|
@current_metadata ||= load_metadata
|
221
|
-
@current_metadata[row_number - ROW_OFFSET]
|
217
|
+
@current_metadata[row_number - BulkOps::ROW_OFFSET]
|
222
218
|
end
|
223
219
|
|
224
220
|
def get_past_metadata_row commit_sha, row_number
|
225
221
|
past_metadata = Base64.decode64( client.contents(repo, path: filename, ref: commit_sha) )
|
226
|
-
past_metadata[row_number - ROW_OFFSET]
|
222
|
+
past_metadata[row_number - BulkOps::ROW_OFFSET]
|
227
223
|
end
|
228
224
|
|
229
225
|
def get_file filename
|
@@ -244,13 +240,13 @@ class BulkOps::GithubAccess
|
|
244
240
|
end
|
245
241
|
|
246
242
|
def spreadsheet_path
|
247
|
-
"#{name}/#{SPREADSHEET_FILENAME}"
|
243
|
+
"#{name}/#{BulkOps::SPREADSHEET_FILENAME}"
|
248
244
|
end
|
249
245
|
|
250
246
|
private
|
251
247
|
|
252
248
|
def options_path
|
253
|
-
"#{name}/#{OPTIONS_FILENAME}"
|
249
|
+
"#{name}/#{BulkOps::OPTIONS_FILENAME}"
|
254
250
|
end
|
255
251
|
|
256
252
|
def current_master_commit_sha
|
data/lib/bulk_ops/operation.rb
CHANGED
@@ -7,33 +7,10 @@ module BulkOps
|
|
7
7
|
|
8
8
|
include BulkOps::Verification
|
9
9
|
|
10
|
-
attr_accessor :work_type, :visibility, :reference_identifier
|
10
|
+
attr_accessor :work_type, :visibility, :reference_identifier, :metadata
|
11
11
|
|
12
12
|
delegate :can_merge?, :merge_pull_request, to: :git
|
13
13
|
|
14
|
-
INGEST_MEDIA_PATH = "/dams_ingest"
|
15
|
-
TEMPLATE_DIR = "lib/bulk_ops/templates"
|
16
|
-
RELATIONSHIP_COLUMNS = ["parent","child","next"]
|
17
|
-
SPECIAL_COLUMNS = ["parent",
|
18
|
-
"child",
|
19
|
-
"order",
|
20
|
-
"next",
|
21
|
-
"work_type",
|
22
|
-
"collection",
|
23
|
-
"collection_title",
|
24
|
-
"collection_id",
|
25
|
-
"visibility",
|
26
|
-
"relationship_identifier_type",
|
27
|
-
"id",
|
28
|
-
"filename",
|
29
|
-
"file"]
|
30
|
-
IGNORED_COLUMNS = ["ignore","offline_notes"]
|
31
|
-
OPTION_REQUIREMENTS = {type: {required: true,
|
32
|
-
values:[:ingest,:update]},
|
33
|
-
file_method: {required: :true,
|
34
|
-
values: [:replace_some,:add_remove,:replace_all]},
|
35
|
-
notifications: {required: true}}
|
36
|
-
|
37
14
|
def self.unique_name name, user
|
38
15
|
while BulkOps::Operation.find_by(name: name) || BulkOps::GithubAccess.list_branch_names(user).include?(name) do
|
39
16
|
if ['-','_'].include?(name[-2]) && name[-1].to_i > 0
|
@@ -119,7 +96,7 @@ module BulkOps
|
|
119
96
|
@metadata.each_with_index do |values,row_number|
|
120
97
|
proxy = work_proxies.find_by(row_number: row_number)
|
121
98
|
proxy.update(message: "interpreted at #{DateTime.now.strftime("%d/%m/%Y %H:%M")} " + proxy.message)
|
122
|
-
data = proxy.interpret_data values
|
99
|
+
data = BulkOps::Parser.new(proxy, @metadata).interpret_data(raw_row: values)
|
123
100
|
next unless proxy.proxy_errors.blank?
|
124
101
|
BulkOps::CreateWorkJob.perform_later(proxy.work_type || "Work",
|
125
102
|
user.email,
|
@@ -202,7 +179,7 @@ module BulkOps
|
|
202
179
|
|
203
180
|
#loop through the work proxies to create a job for each work
|
204
181
|
work_proxies.each do |proxy|
|
205
|
-
data = proxy.interpret_data final_spreadsheet[proxy.row_number]
|
182
|
+
data = BulkOps::Parser.new(proxy,final_spreadsheet).interpret_data(raw_row: final_spreadsheet[proxy.row_number])
|
206
183
|
BulkOps::UpdateWorkJob.perform_later(proxy.work_type || "",
|
207
184
|
user.email,
|
208
185
|
data,
|
@@ -238,13 +215,13 @@ module BulkOps
|
|
238
215
|
bulk_ops_dir = Gem::Specification.find_by_name("bulk_ops").gem_dir
|
239
216
|
|
240
217
|
#copy template files
|
241
|
-
Dir["#{bulk_ops_dir}/#{TEMPLATE_DIR}/*"].each do |file|
|
218
|
+
Dir["#{bulk_ops_dir}/#{BulkOps::TEMPLATE_DIR}/*"].each do |file|
|
242
219
|
git.add_file file
|
243
220
|
end
|
244
221
|
|
245
222
|
#update configuration options
|
246
223
|
unless options.blank?
|
247
|
-
full_options = YAML.load_file(File.join(bulk_ops_dir,TEMPLATE_DIR, BulkOps::
|
224
|
+
full_options = YAML.load_file(File.join(bulk_ops_dir,BulkOps::TEMPLATE_DIR, BulkOps::OPTIONS_FILENAME))
|
248
225
|
|
249
226
|
options.each { |option, value| full_options[option] = value }
|
250
227
|
|
@@ -278,6 +255,10 @@ module BulkOps
|
|
278
255
|
git.update_options(options, message: message)
|
279
256
|
end
|
280
257
|
|
258
|
+
def metadata
|
259
|
+
@metadata ||= git.load_metadata
|
260
|
+
end
|
261
|
+
|
281
262
|
def options
|
282
263
|
return {} if name.nil?
|
283
264
|
return @options if @options
|
@@ -332,7 +313,7 @@ module BulkOps
|
|
332
313
|
end
|
333
314
|
|
334
315
|
def ignored_fields
|
335
|
-
(options['ignored headers'] || []) + IGNORED_COLUMNS
|
316
|
+
(options['ignored headers'] || []) + BulkOps::IGNORED_COLUMNS
|
336
317
|
end
|
337
318
|
|
338
319
|
|
@@ -0,0 +1,485 @@
|
|
1
|
+
class BulkOps::Parser
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
attr_accessor :proxy, :raw_data, :raw_row
|
5
|
+
|
6
|
+
delegate :relationships, :operation, :row_number, :work_id, :visibility, :work_type, :reference_identifier, :order, to: :proxy
|
7
|
+
|
8
|
+
# Builds a parser for a single work proxy.
#
# prx            - the work proxy whose spreadsheet row will be parsed
# metadata_sheet - optional sheet of rows; when omitted the sheet is read from
#                  the proxy's operation
def initialize(prx, metadata_sheet = nil)
  @metadata = {}
  @parsing_errors = []
  @proxy = prx
  sheet = metadata_sheet || proxy.operation.metadata
  @raw_data = sheet
  # Work on a copy so the interpretation passes can delete columns freely
  @raw_row = sheet[prx.row_number].dup
end
|
15
|
+
|
16
|
+
# Runs every interpretation pass over the current row and returns the
# accumulated metadata hash.
#
# raw_row  - optional row that replaces the one chosen by the constructor
# raw_data - optional sheet that replaces the one given to the constructor
# proxy    - optional work proxy that replaces the one given to the constructor
#
# Any parsing errors collected along the way are appended to the proxy's
# proxy_errors, and the proxy is flagged with status "ERROR".
def interpret_data raw_row: nil, raw_data: nil, proxy: nil
  @raw_row = raw_row if raw_row.present?
  @proxy = proxy if proxy.present?
  @raw_data = raw_data if raw_data.present?
  setAdminSet
  setMetadataInheritance
  interpret_option_fields
  interpret_relationship_fields
  # disambiguate_columns returns a merged row only when duplicate headers are
  # present (nil otherwise). Its result used to be discarded, which meant
  # duplicate columns were never actually merged.
  merged_row = disambiguate_columns
  @raw_row = merged_row if merged_row
  interpret_file_fields
  interpret_controlled_fields
  interpret_scalar_fields
  @proxy.update(status: "ERROR", message: "error parsing spreadsheet line") if @parsing_errors.present?
  @proxy.proxy_errors = (@proxy.proxy_errors || []) + @parsing_errors
  @metadata
end
|
32
|
+
|
33
|
+
# Merges columns that share a header into one separator-joined value.
#
# Returns nil when the row has no headers or no duplicated headers; otherwise
# returns a Hash of header => joined string. Blank cells are skipped so that a
# partially filled set of duplicate columns does not produce empty entries
# such as "a;" or ";b".
def disambiguate_columns
  # do nothing unless there are columns with the same header
  return unless @raw_row.respond_to?(:headers) && (@raw_row.headers.uniq.length < @raw_row.length)

  cells_by_header = {}
  @raw_row.length.times do |index|
    header = @raw_row.headers[index]
    cell = @raw_row[index]
    cells_by_header[header] ||= []
    cells_by_header[header] << cell unless cell.nil? || cell.to_s.strip.empty?
  end

  # separate values from identical columns using the separator
  cells_by_header.each_with_object({}) do |(header, cells), merged|
    merged[header] = cells.join(BulkOps::SEPARATOR)
  end
end
|
46
|
+
|
47
|
+
# Interprets every column that maps to a controlled-vocabulary field and adds
# the resulting entries to @metadata under "<property>_attributes".
#
# Each value is either a URL (used as the id directly) or a label, which is
# resolved through get_remote_id and then localAuthUrl. Columns that produced
# at least one value are removed from @raw_row.
#
# Fixes relative to the previous version:
# * columns prefixed with "delete" are now flagged for removal, not only
#   those prefixed with "remove"
# * the column is deleted from the row using its original header, so headers
#   written as "field.authority" are actually removed
# * report_error receives its message positionally, matching its signature
# * values are no longer mutated in place
def interpret_controlled_fields
  # Tracks the contents of columns marked as labels, which may require
  # special validation. Nothing in this method reads it back yet.
  labels = {}

  # Populated with relevant data as we loop through the fields
  controlled_data = {}

  row = @raw_row.dup
  @raw_row.each do |header, value|
    next if value.blank? || header.blank?
    field_name = header.to_s

    # If our CSV interpreter is feeding us the headers as a line, ignore it.
    next if field_name == value

    # check if they are using the 'field_name.authority' syntax
    authority = nil
    if (parts = field_name.split('.')).count == 2
      authority = parts.last
      field_name = parts.first
    end

    # get the field name, if this column is a metadata field
    field_name_norm = find_field_name(field_name)
    field = schema.get_field(field_name_norm)

    # Ignore anything that isn't a controlled field
    next unless field.present? && field.controlled?

    lowered_name = field_name.downcase
    is_label = lowered_name.ends_with?("label")

    # Keep track of label fields
    if is_label
      next if operation.options["ignore_labels"]
      labels[field_name_norm] ||= []
      labels[field_name_norm] += split_values value
      next unless operation.options["import_labels"]
    end

    remove = lowered_name.starts_with?("remove") || lowered_name.starts_with?("delete")

    # handle multiple values
    value_array = split_values(value)
    next if value_array.blank?
    controlled_data[field_name_norm] ||= []

    value_array.each do |raw_entry|
      entry = raw_entry.strip
      # It's an ID if it's a URL and the column name doesn't end in 'label'
      if entry =~ /\A#{URI::regexp}\z/ && !is_label
        value_id = entry
      else
        # It's a label, so unescape it and get the id
        entry = unescape_csv(entry)
        value_id = get_remote_id(entry, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, entry)
        unless value_id
          report_error(:cannot_retrieve_url,
                       "cannot find or create url for controlled vocabulary label: #{entry}",
                       url: entry,
                       row_number: row_number)
        end
      end
      controlled_data[field_name_norm] << {id: value_id, remove: remove}
    end
    row.delete(header)
  end
  @raw_row = row

  # Actually add all the data
  controlled_data.each do |property_name, data|
    next if data.blank?
    attributes_key = "#{property_name}_attributes"
    @metadata[attributes_key] ||= []
    data.uniq.each do |datum|
      atts = {"id" => datum[:id]}
      atts["_delete"] = true if datum[:remove]
      @metadata[attributes_key] << atts
    end
  end
end
|
126
|
+
|
127
|
+
# Copies the values of every non-controlled metadata column into @metadata
# (as an array per field name) and removes those columns from @raw_row.
#
# Fixes relative to the previous version:
# * the column is deleted using its header; previously the loop variable had
#   been overwritten with the schema field object, so nothing was deleted
# * a column whose name resolves through find_field_name but has no schema
#   field is skipped instead of raising NoMethodError on nil
#   (NOTE(review): skipping mirrors interpret_controlled_fields; confirm that
#   non-schema attributes should not be imported here)
def interpret_scalar_fields
  row = @raw_row.dup
  @raw_row.each do |header, values|
    next if values.blank? || header.nil? || header == values
    # get the field name, if this column is a metadata field
    next unless (field_name = find_field_name(header.to_s))
    schema_field = schema.get_field(field_name)
    # Ignore unknown and controlled fields
    next if schema_field.nil? || schema_field.controlled?
    split_values(values).each do |value|
      next if value.blank?
      value = value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
      value = unescape_csv(value)
      (@metadata[field_name] ||= []) << value
      row.delete(header)
    end
  end
  @raw_row = row
end
|
146
|
+
|
147
|
+
# Handles file additions and deletions listed in the spreadsheet row.
#
# If additional files need to be deleted because the update is set to replace
# some or all existing files, those replacement-related deletions are handled
# by the BulkOps::Operation.
#
# TODO: THIS DOES NOT YET MANAGE THE ORDER OF INGESTED FILESETS
#
# Fixes relative to the previous version:
# * the removal branch referenced an undefined variable (file_set_id)
# * the rescue branch referenced an undefined variable (filename)
# * the opened file handle is now closed once the upload record is created
# * only StandardError is rescued, so signals and exits are not swallowed
# * report_error receives its message positionally, matching its signature
def interpret_file_fields
  row = @raw_row.dup
  @raw_row.each do |header, value|
    next if value.blank? || header.blank?
    field = header.to_s
    # If our CSV interpreter is feeding us the headers as a line, ignore it.
    next if field == value

    # Check if this is a file field, and whether we are removing or adding a file
    next unless (action = is_file_field?(field))

    # Move on if this field is the name of another property (e.g. masterFilename)
    next if find_field_name(field)

    if action == "remove"
      get_removed_filesets(value).each { |fileset_id| delete_file_set(fileset_id) }
    else
      # Add a file
      operation.get_file_paths(value).each do |filepath|
        begin
          uploaded_file = File.open(filepath) do |io|
            Hyrax::UploadedFile.create(file: io, user: operation.user)
          end
          (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
          row.delete(header)
        rescue StandardError => e
          report_error(:upload_error,
                       "Error opening file: #{ filepath } -- #{e}",
                       file: filepath,
                       row_number: row_number)
        end
      end
    end
  end
  @raw_row = row
end
|
190
|
+
|
191
|
+
# Applies per-row option columns (visibility, work type, reference identifier)
# to the proxy and removes those columns from @raw_row.
#
# Headers are compared after downcasing, parameterizing and dropping
# underscores, whitespace and hyphens.
#
# NOTE(review): format_reference_id is not defined in the visible part of this
# class -- confirm it is provided elsewhere.
def interpret_option_fields
  visibility_headers = ["visibility", "public"]
  worktype_headers = ["worktype", "model", "type"]
  reference_headers = ["referenceidentifier",
                       "referenceid",
                       "refid",
                       "referenceidentifiertype",
                       "referenceidtype",
                       "refidtype",
                       "relationshipidentifier",
                       "relationshipid",
                       "relationshipidentifiertype",
                       "relationshipidtype",
                       "relid",
                       "relidtype"]

  remaining = @raw_row.dup
  @raw_row.each do |field, value|
    next if value.blank? || field.blank?
    field = field.to_s
    next if value == field

    normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
    # The three header lists are disjoint, so at most one branch applies
    if visibility_headers.include?(normfield)
      @proxy.update(visibility: format_visibility(value))
    elsif worktype_headers.include?(normfield)
      @proxy.update(work_type: format_worktype(value))
    elsif reference_headers.include?(normfield)
      @proxy.update(reference_identifier: format_reference_id(value))
    else
      next
    end
    remaining.delete(field)
  end
  @raw_row = remaining
end
|
225
|
+
|
226
|
+
# Interprets the relationship columns of the current row ("order",
# "collection", "parent" and "child") and removes them from @raw_row.
#
# * "order" updates the proxy's order
# * "collection" finds or creates the collection and records its id in
#   @metadata[:member_of_collection_ids]
# * "parent" / "child" create a BulkOps::Relationship; a parent relationship
#   also records the previous row's proxy as previous sibling when both rows
#   point at the same parent
#
# Fixes relative to the previous version:
# * the default identifier type was assigned to a misspelled variable and was
#   therefore never used
# * the previous row is no longer looked up for the first row (a negative
#   index would silently wrap around to the last row of the sheet)
# * a missing proxy for the previous row no longer raises
def interpret_relationship_fields
  row = @raw_row.dup
  @raw_row.each do |header, value|
    next if value.blank? || header.blank?
    field = header.to_s
    value = unescape_csv(value)
    next if value == field

    # Default to the proxy's reference identifier unless the header names one
    identifier_type = reference_identifier

    # Correctly interpret the notation "parent:id", "parent id" etc in a column header
    if (parts = field.split(/[:_\-\s]/)).count == 2
      identifier_type = parts.last
      relationship_type = parts.first.to_s
    else
      relationship_type = field
    end

    relationship_type = normalize_relationship_field_name(relationship_type)
    case relationship_type
    when "order"
      # The field specifies the object's order among siblings
      @proxy.update(order: value.to_f)
      row.delete(header)
    when "collection"
      # The field specifies the name or ID of a collection:
      # find or create the collection and update the metadata to match
      col = find_or_create_collection(value)
      ( @metadata[:member_of_collection_ids] ||= [] ) << col.id if col
      row.delete(header)
    when "parent", "child"
      # correctly interpret the notation "id:a78C2d81"
      identifier_type, object_identifier = interpret_relationship_value(identifier_type, value)

      relationship_parameters = { work_proxy_id: @proxy.id,
                                  identifier_type: identifier_type,
                                  relationship_type: relationship_type,
                                  object_identifier: object_identifier,
                                  status: "new"}

      # add previous sibling link if necessary
      previous_row = row_number > 0 ? @raw_data[row_number - 1] : nil
      previous_value = previous_row && previous_row[field]
      # Check if this is a parent relationship, and the previous row also has one
      if previous_value.present? && (relationship_type == "parent")
        # Check if the previous row has the same parent as this row
        if object_identifier == interpret_relationship_value(identifier_type, previous_value, field).last
          # If so, set the previous sibling parameter on the relationship
          # to the id of the proxy associated with the previous row
          previous_proxy = operation.work_proxies.find_by(row_number: row_number - 1)
          relationship_parameters[:previous_sibling] = previous_proxy.id if previous_proxy
        end
      end
      BulkOps::Relationship.create(relationship_parameters)
      row.delete(header)
    end
  end
  @raw_row = row
end
|
286
|
+
|
287
|
+
# Maps a column header onto one of BulkOps::RELATIONSHIP_FIELDS.
# Returns the matching relationship field name, or nil when none matches.
def normalize_relationship_field_name(field)
  squashed = field.downcase.parameterize.gsub(/[_\s-]/,'')
  BulkOps::RELATIONSHIP_FIELDS.find { |candidate| candidate == squashed }
end
|
291
|
+
|
292
|
+
# Returns the row number of the closest preceding row whose +field+ column is
# blank (i.e. a row that does not itself have a parent), or nil when there is
# no such row.
#
# The search stops at row 0. The previous implementation kept decrementing
# past zero, and because negative Array indices count from the end it could
# inspect rows at the bottom of the sheet and return a negative row number.
def find_previous_parent field="parent"
  (row_number - 1).downto(0) do |index|
    candidate = raw_data[index]
    # A missing row ends the search, as before
    return nil if candidate.nil?
    return index if candidate[field].blank?
  end
  nil
end
|
301
|
+
|
302
|
+
# Resolves a relationship cell into an [identifier_type, identifier] pair.
#
# id_type - identifier type taken from the header or the proxy default
# value   - the cell content; may use the "type:identifier" notation
# field   - column consulted when resolving the "previous" shorthand
#
# The "row" identifier type supports two shorthands: a negative integer counts
# backwards from the current row, and any value containing "prev" resolves to
# the closest preceding row without a parent of its own.
#
# The "row" type is matched case-insensitively and surrounding whitespace is
# ignored, so headers and cells such as "Parent Row" or "row: -1" work.
def interpret_relationship_value id_type, value, field="parent"
  # Handle "id:20kj4259" syntax if it hasn't already been handled
  if (parts = value.to_s.split(":")).count == 2
    id_type = parts.first.strip
    value = parts.last.strip
  end

  # Handle special shorthand syntax for referring to relative row numbers
  if id_type.to_s.strip.casecmp("row").zero?
    id_type = "row"
    if value.to_i < 0
      # value.to_i is negative, so this counts backwards from the current row
      return [id_type, row_number + value.to_i]
    elsif value.to_s.downcase.include?("prev")
      return [id_type, find_previous_parent(field)]
    end
  end
  [id_type, value]
end
|
320
|
+
|
321
|
+
# Removes the backslash in front of an escaped quote, semicolon or comma.
def unescape_csv(value)
  value.gsub(/\\(['";,])/) { Regexp.last_match(1) }
end
|
324
|
+
|
325
|
+
|
326
|
+
# Turns a spreadsheet value into a work type (class name).
#
# The value is formatted like a class name and accepted only when a constant
# of that name is defined. Otherwise the proxy's work type, then the
# operation's work type, then "Work" is returned.
#
# Object.const_defined? raises NameError for strings that are not valid
# constant names (blank values, names starting with a digit, ...), so the
# name is validated before the lookup.
def format_worktype(value)
  # format the value like a class name
  class_name = value.to_s.titleize.gsub(/[-_\s]/,'')
  well_formed = class_name =~ /\A[A-Z]\w*(::[A-Z]\w*)*\z/
  # reject it if it isn't a defined class
  return class_name if well_formed && Object.const_defined?(class_name)
  # fall back to the work type defined by the operation, or a standard "Work"
  work_type || operation.work_type || "Work"
end
|
334
|
+
|
335
|
+
# Normalizes a spreadsheet visibility value to "open", "ucsc" or "restricted".
# Returns nil for values that are not recognized.
#
# Surrounding whitespace and letter case are ignored, and non-String values
# (e.g. a boolean) are converted with to_s instead of raising.
def format_visibility(value)
  case value.to_s.strip.downcase
  when "public", "open", "true"
    "open"
  when "campus", "ucsc", "institution"
    "ucsc"
  when "restricted", "private", "closed", "false"
    "restricted"
  end
end
|
345
|
+
|
346
|
+
|
347
|
+
# Creates a local authority entry for +value+ under the authority named
# +auth_name+ (creating the authority when needed) and returns the entry's URL
# as built by localIdToUrl.
#
# The label is stripped on a copy: the caller's string is no longer modified
# in place, which also avoids a FrozenError for frozen strings.
def mintLocalAuthUrl(auth_name, value)
  label = value.to_s.strip
  id = label.parameterize
  auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
  Qa::LocalAuthorityEntry.create(local_authority: auth,
                                 label: label,
                                 uri: id)
  localIdToUrl(id, auth_name)
end
|
356
|
+
|
357
|
+
# Looks +value+ up in the local sub-authority +auth+ and returns the "url"
# (or, failing that, the "id") of the first entry whose label matches exactly.
# Returns nil when +auth+ is nil, the search yields nothing, or no label
# matches.
#
# Neither the argument nor the entry labels are modified: stripping and
# re-encoding happen on copies. Entries without a label are skipped.
def findAuthUrl(auth, value)
  return nil if auth.nil?
  wanted = value.to_s.strip.force_encoding('UTF-8')
  return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(wanted))
  entries.each do |entry|
    label = entry["label"]
    next if label.nil?
    # require exact match
    next unless label.dup.force_encoding('UTF-8') == wanted
    return entry["url"] || entry["id"]
  end
  nil
end
|
370
|
+
|
371
|
+
# Builds the URL of a local authority entry from its id and authority name.
# The root URL is chosen by the current Rails environment; development and
# test use this machine's hostname.
def localIdToUrl(id,auth_name)
  local_root = "http://#{Socket.gethostname}"
  root_by_env = {
    production: "https://digitalcollections.library.ucsc.edu",
    staging: "http://digitalcollections-staging.library.ucsc.edu",
    development: local_root,
    test: local_root
  }
  root = root_by_env[Rails.env.to_sym]
  "#{root}/authorities/show/local/#{auth_name}/#{id}"
end
|
378
|
+
|
379
|
+
# Returns the sub-authority name of the local vocabulary configured for
# +field_name+, or nil when the field is unknown, has no vocabularies, or none
# of them is local.
#
# A missing field and a vocabulary without an "authority" entry used to raise
# NoMethodError; both are now treated as "no local authority".
def getLocalAuth(field_name)
  field = schema.get_property(field_name)
  return nil if field.nil?
  # There is only ever one local authority per field, so just pick the first you find
  if (vocs = field.vocabularies)
    vocs.each do |voc|
      return voc["subauthority"] if voc["authority"].to_s.downcase == "local"
    end
  end
  nil
end
|
389
|
+
|
390
|
+
# Sets @metadata[:admin_set_id] unless it is already present.
#
# Uses the admin set titled BulkOps::DEFAULT_ADMIN_SET_TITLE when one exists,
# otherwise the default admin set. The default set is obtained through
# AdminSet.find_or_create_default_admin_set_id, so a missing default set is
# created instead of making the lookup raise.
def setAdminSet
  return if @metadata[:admin_set_id]
  admin_set = AdminSet.where(title: BulkOps::DEFAULT_ADMIN_SET_TITLE).first ||
              AdminSet.find(AdminSet.find_or_create_default_admin_set_id)
  @metadata[:admin_set_id] = admin_set.id if admin_set
end
|
396
|
+
|
397
|
+
# Copies the operation's "metadataInheritance" option into @metadata, unless
# @metadata already has a value for it or the option is blank.
def setMetadataInheritance
  return if @metadata[:metadataInheritance].present?
  inherited = operation.options["metadataInheritance"]
  @metadata[:metadataInheritance] = inherited unless inherited.blank?
end
|
401
|
+
|
402
|
+
# Records a parsing error for the current row.
#
# type    - error type, stored on the BulkOps::Error as :type
# message - text printed and stored on the proxy; may be passed positionally
#           or as a message: keyword
# args    - remaining keywords, passed through to BulkOps::Error.new
#
# Callers in this class pass message: as a keyword, which the previous
# signature (a required positional +message+) did not accept. The keyword is
# removed from +args+ so BulkOps::Error receives the same attributes as with a
# positional message.
def report_error type, message = nil, **args
  message ||= args[:message]
  error_attributes = args.reject { |key, _| key == :message }
  puts "ERROR MESSAGE: #{message}"
  @proxy.update(status: "error", message: message)
  error_attributes[:type] = type
  (@parsing_errors ||= []) << BulkOps::Error.new(**error_attributes)
end
|
408
|
+
|
409
|
+
# Splits +filestring+ into ids and returns those for which a record exists.
#
# TODO: filenames (labels of existing filesets) are not supported yet; only
# ids of existing records are returned.
def get_removed_filesets(filestring)
  split_values(filestring).select { |candidate| record_exists?(candidate) }
end
|
423
|
+
|
424
|
+
# Enqueues a BulkOps::DeleteFileSetJob for the fileset, on behalf of the
# operation's user.
def delete_file_set(fileset_id)
  requester = operation.user.email
  BulkOps::DeleteFileSetJob.perform_later(fileset_id, requester)
end
|
427
|
+
|
428
|
+
|
429
|
+
# Delegates to the operation's is_file_field? check for this header.
def is_file_field?(field)
  operation.is_file_field?(field)
end
|
432
|
+
|
433
|
+
# Delegates to the operation's record_exists? check for this id.
def record_exists?(id)
  operation.record_exists?(id)
end
|
436
|
+
|
437
|
+
# Returns the local authority URL for +value+ on +property+: an existing
# entry's URL when one is found, otherwise a newly minted one. When the
# property has no local authority the value itself is returned.
def localAuthUrl(property, value)
  auth = getLocalAuth(property)
  return value if auth.nil?
  findAuthUrl(auth, value) || mintLocalAuthUrl(auth, value)
end
|
442
|
+
|
443
|
+
# Finds a collection by id or by exact first title.
# Returns the last match, or false when nothing matches.
def find_collection(collection)
  by_id = Collection.where(id: collection)
  by_title = Collection.where(title: collection).select{|col| col.title.first == collection}
  matches = by_id + by_title
  matches.empty? ? false : matches.last
end
|
449
|
+
|
450
|
+
# Returns the collection matching +collection+ (id or title), creating a
# "User Collection" with that title when none exists. Returns false, without
# creating anything, when nothing matches and the value converts to a positive
# integer.
def find_or_create_collection(collection)
  existing = find_collection(collection)
  return existing if existing
  return false if collection.to_i > 0
  depositor = operation.user.email
  user_collection_type = Hyrax::CollectionType.find_by(title:"User Collection")
  Collection.create(title: [collection.to_s], depositor: depositor, collection_type: user_collection_type)
end
|
456
|
+
|
457
|
+
# Placeholder: always returns false.
# TODO retrieve the URL for this value from the specified remote authority
def get_remote_id(value, authority: nil, property: nil)
  false
end
|
461
|
+
|
462
|
+
# Formats a name as a lower-camel-case parameter name: titleize, drop
# whitespace, then camelcase with a lowercase first letter.
def format_param_name(name)
  squeezed = name.titleize.gsub(/\s+/, "")
  squeezed.camelcase(:lower)
end
|
465
|
+
|
466
|
+
# The metadata schema consulted for field lookups.
def schema
  ScoobySnacks::METADATA_SCHEMA
end
|
469
|
+
|
470
|
+
# Delegates to the operation's find_field_name lookup for this header.
def find_field_name(field)
  operation.find_field_name(field)
end
|
473
|
+
|
474
|
+
# Returns +str+ with its first character downcased.
# Returns "" for nil, false and the empty string (an empty string used to
# raise NoMethodError, because ""[0] is nil).
def downcase_first_letter(str)
  return "" unless str
  return "" if str.empty?
  str[0].downcase + str[1..-1]
end
|
478
|
+
|
479
|
+
# Splits a cell into individual values on every un-escaped separator (the
# escape character is '\'), then un-escapes escaped separators and strips
# each value.
#
# The separator is escaped before it is interpolated into the regular
# expression, so a separator that is a regex metacharacter is matched
# literally. nil input yields an empty array instead of raising.
def split_values value_string
  separator = BulkOps::SEPARATOR
  unescaped_separator = /(?<!\\)#{Regexp.escape(separator)}/
  value_string.to_s.split(unescaped_separator).map do |val|
    val.gsub("\\#{separator}", separator).strip
  end
end
|
484
|
+
|
485
|
+
end
|
@@ -35,7 +35,7 @@ module BulkOps
|
|
35
35
|
return false if fieldname.blank?
|
36
36
|
return false if schema.get_field(fieldname)
|
37
37
|
field_parts = fieldname.underscore.humanize.downcase.gsub(/[-_]/,' ').split(" ")
|
38
|
-
return false unless field_parts.any?{ |field_type| BulkOps::
|
38
|
+
return false unless field_parts.any?{ |field_type| BulkOps::FILE_FIELDS.include?(field_type) }
|
39
39
|
return "remove" if field_parts.any?{ |field_type| ['remove','delete'].include?(field_type) }
|
40
40
|
return "add"
|
41
41
|
end
|
@@ -46,7 +46,7 @@ module BulkOps
|
|
46
46
|
name.gsub!(/[_\s-]?[lL]abel$/,'')
|
47
47
|
name.gsub!(/^[rR]emove[_\s-]?/,'')
|
48
48
|
name.gsub!(/^[dD]elete[_\s-]?/,'')
|
49
|
-
possible_fields = Work.attribute_names + schema.all_field_names
|
49
|
+
possible_fields = (Work.attribute_names + schema.all_field_names).uniq
|
50
50
|
matching_fields = possible_fields.select{|pfield| pfield.gsub(/[_\s-]/,'').parameterize == name.gsub(/[_\s-]/,'').parameterize }
|
51
51
|
return false if matching_fields.blank?
|
52
52
|
# raise Exception "Ambiguous metadata fields!" if matching_fields.uniq.count > 1
|
@@ -55,8 +55,8 @@ module BulkOps
|
|
55
55
|
|
56
56
|
def get_file_paths(filestring)
|
57
57
|
return [] if filestring.blank?
|
58
|
-
filenames = filestring.split(BulkOps::
|
59
|
-
filenames.map { |filename| File.join(BulkOps::
|
58
|
+
filenames = filestring.split(BulkOps::SEPARATOR)
|
59
|
+
filenames.map { |filename| File.join(BulkOps::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
|
60
60
|
end
|
61
61
|
|
62
62
|
def record_exists? id
|
@@ -85,7 +85,7 @@ module BulkOps
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def verify_configuration
|
88
|
-
BulkOps::
|
88
|
+
BulkOps::OPTION_REQUIREMENTS.each do |option_name, option_info|
|
89
89
|
# Make sure it's present if required
|
90
90
|
if (option_info["required"].to_s == "true") || (option_info["required"].to_s == type)
|
91
91
|
if options[option_name].blank?
|
@@ -120,7 +120,7 @@ module BulkOps
|
|
120
120
|
# Ignore everything marked as a label
|
121
121
|
next if column_name_redux.ends_with? "label"
|
122
122
|
# Ignore any column names with special meaning in hyrax
|
123
|
-
next if BulkOps::
|
123
|
+
next if BulkOps::SPECIAL_COLUMNS.any?{|col| col.downcase.parameterize.gsub(/[_\s-]/,"") == column_name_redux }
|
124
124
|
# Ignore any columns speficied to be ignored in the configuration
|
125
125
|
ignored = options["ignored headers"] || []
|
126
126
|
next if ignored.any?{|col| col.downcase.parameterize.gsub(/[_\s-]/,"") == column_name_redux }
|
@@ -131,7 +131,7 @@ module BulkOps
|
|
131
131
|
end
|
132
132
|
|
133
133
|
def verify_remote_urls
|
134
|
-
row_offset = BulkOps::
|
134
|
+
row_offset = BulkOps::ROW_OFFSET.present? ? BulkOps::ROW_OFFSET : 2
|
135
135
|
get_spreadsheet.each_with_index do |row, row_num|
|
136
136
|
update(message: "verifying controlled vocab urls (row number #{row_num})")
|
137
137
|
next if row_num.nil?
|
@@ -173,7 +173,7 @@ module BulkOps
|
|
173
173
|
def get_ref_id row
|
174
174
|
row.each do |field,value|
|
175
175
|
next if field.blank? or value.blank? or field === value
|
176
|
-
next unless BulkOps::
|
176
|
+
next unless BulkOps::REFERENCE_IDENTIFIER_FIELDS.any?{ |ref_field| normalize_field(ref_field) == normalize_field(field) }
|
177
177
|
return value
|
178
178
|
end
|
179
179
|
# No reference identifier specified in the row. Use the default for the operation.
|
@@ -190,7 +190,7 @@ module BulkOps
|
|
190
190
|
# This is sketchy. Redo it.
|
191
191
|
(metadata = get_spreadsheet).each do |row,row_num|
|
192
192
|
ref_id = get_ref_id(row)
|
193
|
-
BulkOps::
|
193
|
+
BulkOps::RELATIONSHIP_COLUMNS.each do |relationship|
|
194
194
|
next unless (obj_id = row[relationship])
|
195
195
|
if (split = obj_id.split(':')).present? && split.count == 2
|
196
196
|
ref_id = split[0].downcase
|
data/lib/bulk_ops/version.rb
CHANGED
data/lib/bulk_ops/work_proxy.rb
CHANGED
@@ -1,12 +1,5 @@
|
|
1
1
|
class BulkOps::WorkProxy < ActiveRecord::Base
|
2
2
|
|
3
|
-
require 'uri'
|
4
|
-
OPTION_FIELDS = ['visibility','work type']
|
5
|
-
RELATIONSHIP_FIELDS = ['parent','child','collection','order']
|
6
|
-
REFERENCE_IDENTIFIER_FIELDS = ['Reference Identifier','ref_id','Reference ID','Relationship ID','Relationship Identifier','Reference Identifier Type','Reference ID Type','Ref ID Type','relationship_identifier_type','relationship_id_type']
|
7
|
-
FILE_FIELDS = ['file','files','filename','filenames']
|
8
|
-
FILE_ACTIONS = ['add','upload','remove','delete']
|
9
|
-
SEPARATOR = ';'
|
10
3
|
self.table_name = "bulk_ops_work_proxies"
|
11
4
|
belongs_to :operation, class_name: "BulkOps::Operation", foreign_key: "operation_id"
|
12
5
|
has_many :relationships, class_name: "BulkOps::Relationship"
|
@@ -40,462 +33,10 @@ class BulkOps::WorkProxy < ActiveRecord::Base
|
|
40
33
|
# TODO make it so people can edit the work again
|
41
34
|
end
|
42
35
|
|
43
|
-
def interpret_data raw_data
|
44
|
-
admin_set = AdminSet.where(title: "Bulk Ingest Set").first || AdminSet.find(AdminSet.find_or_create_default_admin_set_id)
|
45
|
-
metadata = {admin_set_id: admin_set.id}
|
46
|
-
metadata.merge! interpret_file_fields(raw_data)
|
47
|
-
metadata.merge! interpret_controlled_fields(raw_data)
|
48
|
-
metadata.merge! interpret_scalar_fields(raw_data)
|
49
|
-
metadata.merge! interpret_relationship_fields(raw_data)
|
50
|
-
metadata.merge! interpret_option_fields(raw_data)
|
51
|
-
metadata = setAdminSet(metadata)
|
52
|
-
metadata = setMetadataInheritance(metadata)
|
53
|
-
return metadata
|
54
|
-
end
|
55
36
|
|
56
37
|
def proxy_errors
|
57
38
|
@proxy_errors ||= []
|
58
39
|
end
|
59
40
|
|
60
|
-
private
|
61
|
-
|
62
|
-
def is_file_field? field
|
63
|
-
operation.is_file_field? field
|
64
|
-
end
|
65
|
-
|
66
|
-
def record_exists? id
|
67
|
-
operation.record_exists? id
|
68
|
-
end
|
69
|
-
|
70
|
-
def localAuthUrl(property, value)
|
71
|
-
return value if (auth = getLocalAuth(property)).nil?
|
72
|
-
url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value)
|
73
|
-
return url
|
74
|
-
end
|
75
|
-
|
76
|
-
def find_collection(collection)
|
77
|
-
cols = Collection.where(id: collection)
|
78
|
-
cols += Collection.where(title: collection).select{|col| col.title.first == collection}
|
79
|
-
return cols.last unless cols.empty?
|
80
|
-
return false
|
81
|
-
end
|
82
|
-
|
83
|
-
def find_or_create_collection(collection)
|
84
|
-
col = find_collection(collection)
|
85
|
-
return col if col
|
86
|
-
return false if collection.to_i > 0
|
87
|
-
col = Collection.create(title: [collection.to_s], depositor: operation.user.email, collection_type: Hyrax::CollectionType.find_by(title:"User Collection"))
|
88
|
-
end
|
89
|
-
|
90
|
-
def get_remote_id(value, authority: nil, property: nil)
|
91
|
-
return false
|
92
|
-
#TODO retrieve URL for this value from the specified remote authority
|
93
|
-
end
|
94
|
-
|
95
|
-
def format_param_name(name)
|
96
|
-
name.titleize.gsub(/\s+/, "").camelcase(:lower)
|
97
|
-
end
|
98
|
-
|
99
|
-
def schema
|
100
|
-
ScoobySnacks::METADATA_SCHEMA
|
101
|
-
end
|
102
|
-
|
103
|
-
def find_field_name(field)
|
104
|
-
operation.find_field_name(field)
|
105
|
-
end
|
106
|
-
|
107
|
-
def downcase_first_letter(str)
|
108
|
-
return "" unless str
|
109
|
-
str[0].downcase + str[1..-1]
|
110
|
-
end
|
111
|
-
|
112
|
-
def split_values value_string
|
113
|
-
# Split values on all un-escaped separator character (escape character is '\')
|
114
|
-
# Then replace all escaped separator characters with un-escaped versions
|
115
|
-
value_string.split(/(?<!\\)#{SEPARATOR}/).map{|val| val.gsub("\\#{SEPARATOR}",SEPARATOR).strip}
|
116
|
-
end
|
117
|
-
|
118
|
-
def interpret_controlled_fields raw_data
|
119
|
-
|
120
|
-
# The labels hash tracks the contents of columns marked as labels,
|
121
|
-
# which may require special validation
|
122
|
-
labels = {}
|
123
|
-
|
124
|
-
# This hash is populated with relevant data as we loop through the fields
|
125
|
-
controlled_data = {}
|
126
|
-
|
127
|
-
raw_data.each do |field_name, value|
|
128
|
-
next if value.blank? or field_name.blank?
|
129
|
-
field_name = field_name.to_s
|
130
|
-
|
131
|
-
#If our CSV interpreter is feeding us the headers as a line, ignore it.
|
132
|
-
next if field_name == value
|
133
|
-
|
134
|
-
#check if they are using the 'field_name.authority' syntax
|
135
|
-
authority = nil
|
136
|
-
if ((split=field_name.split('.')).count == 2)
|
137
|
-
authority = split.last
|
138
|
-
field_name = split.first
|
139
|
-
end
|
140
|
-
|
141
|
-
# get the field name, if this column is a metadata field
|
142
|
-
field_name_norm = find_field_name(field_name)
|
143
|
-
field = schema.get_field(field_name_norm)
|
144
|
-
|
145
|
-
# Ignore anything that isn't a controlled field
|
146
|
-
next unless field.present? && field.controlled?
|
147
|
-
|
148
|
-
# Keep track of label fields
|
149
|
-
if field_name.downcase.ends_with?("label")
|
150
|
-
next if operation.options["ignore_labels"]
|
151
|
-
labels[field_name_norm] ||= []
|
152
|
-
labels[field_name_norm] += split_values value
|
153
|
-
next unless operation.options["import_labels"]
|
154
|
-
end
|
155
|
-
|
156
|
-
remove = field_name.downcase.starts_with?("remove") || field_name.downcase.starts_with?("delete")
|
157
|
-
|
158
|
-
# handle multiple values
|
159
|
-
value_array = split_values(value)
|
160
|
-
controlled_data[field_name_norm] ||= [] unless value_array.blank?
|
161
|
-
value_array.each do |value|
|
162
|
-
# Decide if we're dealing with a label or url
|
163
|
-
# It's an ID if it's a URL and the name doesn't end in 'label'
|
164
|
-
value.strip!
|
165
|
-
if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
|
166
|
-
id = value
|
167
|
-
# label = WorkIndexer.fetch_remote_label(value)
|
168
|
-
# error_message = "cannot fetch remote label for url: #{value}"
|
169
|
-
# report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
|
170
|
-
else
|
171
|
-
# It's a label, so unescape it and get the id
|
172
|
-
value = unescape_csv(value)
|
173
|
-
id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
|
174
|
-
# label = value
|
175
|
-
report_error(:cannot_retrieve_url,
|
176
|
-
message: "cannot find or create url for controlled vocabulary label: #{value}",
|
177
|
-
url: value,
|
178
|
-
row_number: row_number) unless id
|
179
|
-
end
|
180
|
-
controlled_data[field_name_norm] << {id: id, remove: field_name.downcase.starts_with?("remove")}
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
#delete any duplicates (if someone listed a url and also its label, or the same url twice)
|
185
|
-
controlled_data.each{|field_name, values| controlled_data[field_name] = values.uniq }
|
186
|
-
|
187
|
-
# Actually add all the data
|
188
|
-
metadata = {}
|
189
|
-
leftover_data = raw_data.dup.to_hash
|
190
|
-
controlled_data.each do |property_name, data|
|
191
|
-
metadata["#{property_name}_attributes"] ||= [] unless data.blank?
|
192
|
-
data.each do |datum|
|
193
|
-
atts = {"id" => datum[:id]}
|
194
|
-
atts["_delete"] = true if datum[:remove]
|
195
|
-
metadata["#{property_name}_attributes"] << atts
|
196
|
-
leftover_data.except! property_name
|
197
|
-
end
|
198
|
-
end
|
199
|
-
#return [metadata, leftover_data]
|
200
|
-
return metadata
|
201
|
-
end
|
202
|
-
|
203
|
-
def interpret_scalar_fields raw_data
|
204
|
-
metadata = {}
|
205
|
-
raw_data.each do |field, values|
|
206
|
-
next if values.blank? or field.nil? or field == values
|
207
|
-
# get the field name, if this column is a metadata field
|
208
|
-
next unless field_name = find_field_name(field.to_s)
|
209
|
-
field = schema.get_field(field_name)
|
210
|
-
# Ignore controlled fields
|
211
|
-
next if field.controlled?
|
212
|
-
split_values(values).each do |value|
|
213
|
-
next if value.blank?
|
214
|
-
value = value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_') unless value.blank?
|
215
|
-
value = unescape_csv(value)
|
216
|
-
(metadata[field_name] ||= []) << value
|
217
|
-
end
|
218
|
-
end
|
219
|
-
return metadata
|
220
|
-
end
|
221
|
-
|
222
|
-
def interpret_file_fields raw_data
|
223
|
-
# This method handles file additions and deletions from the spreadsheet
|
224
|
-
# if additional files need to be deleted because the update is set to replace
|
225
|
-
# some or all existing files, those replacement-related deletions are handled
|
226
|
-
# by the BulkOps::Operation.
|
227
|
-
#
|
228
|
-
# TODO: THIS DOES NOT YET MANAGE THE ORDER OF INGESTED FILESETS
|
229
|
-
|
230
|
-
metadata = {}
|
231
|
-
raw_data.each do |field, value|
|
232
|
-
next if value.blank? or field.blank?
|
233
|
-
field = field.to_s
|
234
|
-
#If our CSV interpreter is feeding us the headers as a line, ignore it.
|
235
|
-
next if field == value
|
236
|
-
|
237
|
-
|
238
|
-
# Check if this is a file field, and whether we are removing or adding a file
|
239
|
-
next unless (action = is_file_field?(field))
|
240
|
-
|
241
|
-
# Move on if this field is the name of another property (e.g. masterFilename)
|
242
|
-
next if find_field_name(field)
|
243
|
-
|
244
|
-
# Check if we are removing a file
|
245
|
-
if action == "remove"
|
246
|
-
get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) }
|
247
|
-
else
|
248
|
-
# Add a file
|
249
|
-
operation.get_file_paths(value).each do |filepath|
|
250
|
-
begin
|
251
|
-
uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
|
252
|
-
(metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
|
253
|
-
rescue Exception => e
|
254
|
-
report_error(:upload_error,
|
255
|
-
message: "Error opening file: #{ filepath } -- #{e}",
|
256
|
-
file: File.join(BulkOps::Operation::INGEST_MEDIA_PATH,filename),
|
257
|
-
row_number: row_number)
|
258
|
-
end
|
259
|
-
end
|
260
|
-
end
|
261
|
-
end
|
262
|
-
return metadata
|
263
|
-
end
|
264
|
-
|
265
|
-
def interpret_option_fields raw_data
|
266
|
-
raw_data.each do |field,value|
|
267
|
-
next if value.blank? or field.blank?
|
268
|
-
field = field.to_s
|
269
|
-
next if value == field
|
270
|
-
|
271
|
-
normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
|
272
|
-
if ["visibility", "public"].include?(normfield)
|
273
|
-
update(visibility: format_visibility(value))
|
274
|
-
end
|
275
|
-
if ["worktype","model","type"].include?(normfield)
|
276
|
-
update(work_type: format_worktype(value) )
|
277
|
-
end
|
278
|
-
if ["referenceidentifier",
|
279
|
-
"referenceid",
|
280
|
-
"refid",
|
281
|
-
"referenceidentifiertype",
|
282
|
-
"referenceidtype",
|
283
|
-
"refidtype",
|
284
|
-
"relationshipidentifier",
|
285
|
-
"relationshipid",
|
286
|
-
"relationshipidentifiertype",
|
287
|
-
"relationshipidtype",
|
288
|
-
"relid",
|
289
|
-
"relidtype"].include?(normfield)
|
290
|
-
update(reference_identifier: format_reference_id(value))
|
291
|
-
end
|
292
|
-
end
|
293
|
-
return {}
|
294
|
-
end
|
295
|
-
|
296
|
-
def interpret_relationship_fields(raw_data)
|
297
|
-
metadata = {}
|
298
|
-
raw_data.each do |field,value|
|
299
|
-
next if value.blank? or field.blank?
|
300
|
-
field = field.to_s
|
301
|
-
value = unescape_csv(value)
|
302
|
-
identifer_type = reference_identifier
|
303
|
-
|
304
|
-
next if value == field
|
305
|
-
|
306
|
-
if (split = field.split(":")).count == 2
|
307
|
-
identifier_type = split.last
|
308
|
-
relationship_type = split.first.to_s
|
309
|
-
else
|
310
|
-
relationship_type = field
|
311
|
-
end
|
312
|
-
|
313
|
-
relationship_type = normalize_relationship_field_name(relationship_type)
|
314
|
-
case relationship_type
|
315
|
-
when "order"
|
316
|
-
# If the field specifies the object's order among siblings
|
317
|
-
update(order: value.to_f)
|
318
|
-
next
|
319
|
-
when "collection"
|
320
|
-
# If the field specifies the name or ID of a collection,
|
321
|
-
# find or create the collection and update the metadata to match
|
322
|
-
col = find_or_create_collection(value)
|
323
|
-
( metadata[:member_of_collection_ids] ||= [] ) << col.id if col
|
324
|
-
next
|
325
|
-
when "parent", "child"
|
326
|
-
|
327
|
-
# correctly interpret the notation "id:a78C2d81"
|
328
|
-
identifier_type, object_identifier = interpret_relationship_value(identifier_type, value)
|
329
|
-
|
330
|
-
relationship_parameters = { work_proxy_id: id,
|
331
|
-
identifier_type: identifier_type,
|
332
|
-
relationship_type: relationship_type,
|
333
|
-
object_identifier: object_identifier,
|
334
|
-
status: "new"}
|
335
|
-
|
336
|
-
#add previous sibling link if necessary
|
337
|
-
previous_value = operation.final_spreadsheet[row_number-1][field]
|
338
|
-
# Check if this is a parent relationship, and the previous row also has one
|
339
|
-
if previous_value.present? && (relationship_type == "parent")
|
340
|
-
# Check if the previous row has the same parent as this row
|
341
|
-
if object_identifier == interpret_relationship_value(identifier_type, previous_value, field).last
|
342
|
-
# If so, set the previous sibling parameter on the relationship
|
343
|
-
# to the id for the proxy associated with the previous row
|
344
|
-
relationship_parameters[:previous_sibling] = operation.work_proxies.find_by(row_number: row_number-1).id
|
345
|
-
end
|
346
|
-
end
|
347
|
-
BulkOps::Relationship.create(relationship_parameters)
|
348
|
-
end
|
349
|
-
return metadata
|
350
|
-
end
|
351
|
-
end
|
352
|
-
|
353
|
-
def normalize_relationship_field_name field
|
354
|
-
normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
|
355
|
-
RELATIONSHIP_FIELDS.find{|field| normfield.include?(field) }
|
356
|
-
end
|
357
|
-
|
358
|
-
def find_previous_parent field="parent"
|
359
|
-
#Return the row number of the most recent preceding row that does
|
360
|
-
# not itself have a parent defined
|
361
|
-
i = 0;
|
362
|
-
while (prev_row = operation.final_spreadsheet[row_number - i])
|
363
|
-
return (row_number - i) if prev_row[field].blank?
|
364
|
-
end
|
365
|
-
end
|
366
|
-
|
367
|
-
def interpret_relationship_value id_type, value, field="parent"
|
368
|
-
#Handle "id:20kj4259" syntax if it hasn't already been handled
|
369
|
-
if (split = value.to_s.split(":")).count == 2
|
370
|
-
id_type = split.first
|
371
|
-
value = split.last
|
372
|
-
end
|
373
|
-
#Handle special shorthand syntax for referring to relative row numbers
|
374
|
-
if id_type == "row"
|
375
|
-
if value.to_i < 0
|
376
|
-
# if given a negative integer, count backwards from the current row
|
377
|
-
return [id_type,row_number - value]
|
378
|
-
elsif value.to_s.downcase.include?("prev")
|
379
|
-
# if given any variation of the word "previous", get the first preceding row with no parent of its own
|
380
|
-
return [id_type,find_previous_parent(field)]
|
381
|
-
end
|
382
|
-
end
|
383
|
-
return [id_type,value]
|
384
|
-
end
|
385
|
-
|
386
|
-
def unescape_csv(value)
|
387
|
-
value.gsub(/\\(['";,])/,'\1')
|
388
|
-
end
|
389
|
-
|
390
|
-
def format_worktype(value)
|
391
|
-
# format the value like a class name
|
392
|
-
type = value.titleize.gsub(/[-_\s]/,'')
|
393
|
-
# reject it if it isn't a defined class
|
394
|
-
type = false unless Object.const_defined? type
|
395
|
-
# fall back to the work type defined by the operation, or a standard "Work"
|
396
|
-
return type ||= operation.work_type || "Work"
|
397
|
-
end
|
398
|
-
|
399
|
-
def format_visibility(value)
|
400
|
-
case value.downcase
|
401
|
-
when "public", "open", "true"
|
402
|
-
return "open"
|
403
|
-
when "campus", "ucsc", "institution"
|
404
|
-
return "ucsc"
|
405
|
-
when "restricted", "private", "closed", "false"
|
406
|
-
return "restricted"
|
407
|
-
end
|
408
|
-
end
|
409
|
-
|
410
|
-
def mintLocalAuthUrl(auth_name, value)
|
411
|
-
value.strip!
|
412
|
-
id = value.parameterize
|
413
|
-
auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
|
414
|
-
entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
|
415
|
-
label: value,
|
416
|
-
uri: id)
|
417
|
-
return localIdToUrl(id,auth_name)
|
418
|
-
end
|
419
|
-
|
420
|
-
def findAuthUrl(auth, value)
|
421
|
-
value.strip!
|
422
|
-
return nil if auth.nil?
|
423
|
-
return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
|
424
|
-
entries.each do |entry|
|
425
|
-
#require exact match
|
426
|
-
next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
|
427
|
-
url = entry["url"] || entry["id"]
|
428
|
-
# url = localIdToUrl(url,auth) unless url =~ URI::regexp
|
429
|
-
return url
|
430
|
-
end
|
431
|
-
return nil
|
432
|
-
end
|
433
|
-
|
434
|
-
def localIdToUrl(id,auth_name)
|
435
|
-
root_urls = {production: "https://digitalcollections.library.ucsc.edu",
|
436
|
-
staging: "http://digitalcollections-staging.library.ucsc.edu",
|
437
|
-
development: "http://#{Socket.gethostname}",
|
438
|
-
test: "http://#{Socket.gethostname}"}
|
439
|
-
return "#{root_urls[Rails.env.to_sym]}/authorities/show/local/#{auth_name}/#{id}"
|
440
|
-
end
|
441
|
-
|
442
|
-
def getLocalAuth(field_name)
|
443
|
-
field = schema.get_property(field_name)
|
444
|
-
# There is only ever one local authority per field, so just pick the first you find
|
445
|
-
if vocs = field.vocabularies
|
446
|
-
vocs.each do |voc|
|
447
|
-
return voc["subauthority"] if voc["authority"].downcase == "local"
|
448
|
-
end
|
449
|
-
end
|
450
|
-
return nil
|
451
|
-
end
|
452
|
-
|
453
|
-
def setAdminSet metadata
|
454
|
-
return metadata if metadata[:admin_set_id]
|
455
|
-
asets = AdminSet.where({title: "Bulk Ingest Set"})
|
456
|
-
asets = AdminSet.find('admin_set/default') if asets.blank?
|
457
|
-
metadata[:admin_set_id] = Array(asets).first.id unless asets.blank?
|
458
|
-
return metadata
|
459
|
-
end
|
460
|
-
|
461
|
-
def setMetadataInheritance metadata
|
462
|
-
return metadata if metadata[:metadataInheritance].present?
|
463
|
-
metadata[:metadataInheritance] = operation.options["metadataInheritance"] unless operation.options["metadataInheritance"].blank?
|
464
|
-
return metadata
|
465
|
-
end
|
466
|
-
|
467
|
-
def report_error type, message, **args
|
468
|
-
puts "ERROR MESSAGE: #{message}"
|
469
|
-
update(status: "error", message: message)
|
470
|
-
args[:type]=type
|
471
|
-
(@proxy_errors ||= []) << BulkOps::Error.new(**args)
|
472
|
-
end
|
473
|
-
|
474
|
-
def filename_prefix
|
475
|
-
@filename_prefix ||= operation.filename_prefix
|
476
|
-
end
|
477
|
-
|
478
|
-
def record_exists?
|
479
|
-
operation.record_exists? work_id
|
480
|
-
end
|
481
|
-
|
482
|
-
def get_removed_filesets(filestring)
|
483
|
-
file_ids = split_values(filestring)
|
484
|
-
file_ids.select{|file_id| record_exists?(file_id)}
|
485
|
-
|
486
|
-
# This part handles filenames in addition to file ids. It doesn't work yet!
|
487
|
-
# file_ids.map do |file_id|
|
488
|
-
# If the filename is the id of an existing record, keep that
|
489
|
-
# next(file_id) if (record_exists?(file_id))
|
490
|
-
# If this is the label (i.e.filename) of an existing fileset, use that fileset id
|
491
|
-
# TODO MAKE THIS WORK!!
|
492
|
-
# next(filename) if (filename_exists?(filename))
|
493
|
-
# File.join(BulkOps::Operation::INGEST_MEDIA_PATH, filename_prefix, filename)
|
494
|
-
# end
|
495
|
-
end
|
496
|
-
|
497
|
-
def delete_file_set fileset_id
|
498
|
-
BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
|
499
|
-
end
|
500
41
|
|
501
42
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulk_ops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ned Henry, UCSC Library Digital Initiatives
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -106,6 +106,7 @@ files:
|
|
106
106
|
- lib/bulk_ops/github_access.rb
|
107
107
|
- lib/bulk_ops/github_credential.rb
|
108
108
|
- lib/bulk_ops/operation.rb
|
109
|
+
- lib/bulk_ops/parser.rb
|
109
110
|
- lib/bulk_ops/queue_work_ingests_job.rb
|
110
111
|
- lib/bulk_ops/relationship.rb
|
111
112
|
- lib/bulk_ops/search_builder_behavior.rb
|