bulk_ops 0.1.23 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb +14 -0
- data/lib/bulk_ops.rb +3 -2
- data/lib/bulk_ops/apply_operation_job.rb +8 -0
- data/lib/bulk_ops/create_work_job.rb +1 -1
- data/lib/bulk_ops/github_access.rb +1 -1
- data/lib/bulk_ops/operation.rb +57 -49
- data/lib/bulk_ops/parser.rb +50 -414
- data/lib/bulk_ops/resolve_children_job.rb +14 -0
- data/lib/bulk_ops/solr_service.rb +13 -0
- data/lib/bulk_ops/update_work_job.rb +1 -1
- data/lib/bulk_ops/verification.rb +2 -10
- data/lib/bulk_ops/version.rb +1 -1
- data/lib/bulk_ops/work_job.rb +20 -13
- data/lib/bulk_ops/work_proxy.rb +18 -2
- data/lib/concerns/interpret_controlled_behavior.rb +140 -0
- data/lib/concerns/interpret_files_behavior.rb +82 -0
- data/lib/concerns/interpret_options_behavior.rb +59 -0
- data/lib/concerns/interpret_relationships_behavior.rb +123 -0
- data/lib/concerns/interpret_scalar_behavior.rb +21 -0
- data/lib/concerns/search_builder_behavior.rb +80 -0
- metadata +12 -3
- data/lib/bulk_ops/relationship.rb +0 -117
@@ -0,0 +1,14 @@
|
|
1
|
+
class BulkOps::ResolveChildrenJob < ActiveJob::Base
|
2
|
+
|
3
|
+
def perform(proxy_id)
|
4
|
+
proxy = BulkOps::WorkProxy.find(proxy_id)
|
5
|
+
if proxy.ordered_children.all?{|child| child.work_id.present?}
|
6
|
+
work = ActiveFedora::Base.find(proxy.work_id)
|
7
|
+
work.ordered_member_ids = proxy.ordered_children.map(&:work_id)
|
8
|
+
work.save
|
9
|
+
else
|
10
|
+
BulkOps::ResolveChildrenJob.set(wait: 30.minutes).perform_later(proxy_id)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
@@ -13,7 +13,7 @@ class BulkOps::UpdateWorkJob < BulkOps::WorkJob
|
|
13
13
|
|
14
14
|
def define_work workClass=nil
|
15
15
|
# report an error if we can't find the work in solr
|
16
|
-
unless record_exists?(@work_proxy.work_id)
|
16
|
+
unless BulkOps::SolrService.record_exists?(@work_proxy.work_id)
|
17
17
|
report_error "Could not find work to update with id: #{@work_proxy.work_id} referenced by work proxy: #{@work_proxy.id}"
|
18
18
|
return false
|
19
19
|
end
|
@@ -59,14 +59,6 @@ module BulkOps
|
|
59
59
|
filenames.map { |filename| File.join(BulkOps::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
|
60
60
|
end
|
61
61
|
|
62
|
-
def record_exists? id
|
63
|
-
begin
|
64
|
-
return true if SolrDocument.find(id)
|
65
|
-
rescue Blacklight::Exceptions::RecordNotFound
|
66
|
-
return false
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
62
|
private
|
71
63
|
|
72
64
|
def verify_files
|
@@ -165,7 +157,7 @@ module BulkOps
|
|
165
157
|
get_spreadsheet.each_with_index do |row, row_num|
|
166
158
|
id = get_ref_id(row)
|
167
159
|
#TODO: find by other field. for now just id
|
168
|
-
unless (record_exists(id))
|
160
|
+
unless (BulkOps::SolrService.record_exists?(id))
|
169
161
|
@verification_errors << BulkOps::Error.new(type: :cannot_find_work, id: id)
|
170
162
|
end
|
171
163
|
end
|
@@ -205,7 +197,7 @@ module BulkOps
|
|
205
197
|
end
|
206
198
|
elsif ref_id.include?("id")
|
207
199
|
# This is a hydra id reference. It should correspond to an object already in the repo
|
208
|
-
unless record_exists?(obj_id)
|
200
|
+
unless BulkOps::SolrService.record_exists?(obj_id)
|
209
201
|
@verification_errors << BulkOps::Error.new({type: :bad_object_reference, object_id: obj_id, row_number: row_num + BulkOps:: ROW_OFFSET})
|
210
202
|
end
|
211
203
|
end
|
data/lib/bulk_ops/version.rb
CHANGED
data/lib/bulk_ops/work_job.rb
CHANGED
@@ -13,11 +13,25 @@ class BulkOps::WorkJob < ActiveJob::Base
|
|
13
13
|
update_status "error"
|
14
14
|
else
|
15
15
|
@work_proxy.work_id = @work.id
|
16
|
-
|
16
|
+
|
17
|
+
# If this work has a parent outside of the current operation,
|
18
|
+
# and this is the first sibling (we only need to do this once per parent),
|
19
|
+
# queue a job to resolve that work's new children
|
20
|
+
if @work_proxy.parent_id.present? && (parent_proxy = BulkOps::WorkProxy.find(parent_id))
|
21
|
+
if parent_proxy.operation_id != @work_proxy.operation_id
|
22
|
+
if @work_proxy.previous_sibling.nil?
|
23
|
+
BulkOps::ResolveChildrenJob.set(wait: 10.minutes).perform_later(parent_proxy.id)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
17
27
|
|
18
|
-
#
|
19
|
-
|
20
|
-
|
28
|
+
# Set up jobs to link child works (once they are ingested)
|
29
|
+
# or mark as complete otherwise
|
30
|
+
if (children = @work_proxy.ordered_children)
|
31
|
+
BulkOps::ResolveChildrenJob.perform_later(@work_proxy.id)
|
32
|
+
update_status "awaiting_children"
|
33
|
+
else
|
34
|
+
update_status "complete"
|
21
35
|
end
|
22
36
|
|
23
37
|
# Delete any UploadedFiles. These take up tons of unnecessary disk space.
|
@@ -52,6 +66,7 @@ class BulkOps::WorkJob < ActiveJob::Base
|
|
52
66
|
report_error("Cannot find work proxy with id: #{work_proxy_id}")
|
53
67
|
return
|
54
68
|
end
|
69
|
+
return if @work_proxy.status == "complete"
|
55
70
|
|
56
71
|
return unless (work_action = define_work(workClass))
|
57
72
|
|
@@ -66,7 +81,7 @@ class BulkOps::WorkJob < ActiveJob::Base
|
|
66
81
|
|
67
82
|
|
68
83
|
def define_work(workClass="Work")
|
69
|
-
if (@work_proxy.present? && @work_proxy.work_id.present? && record_exists?(@work_proxy.work_id))
|
84
|
+
if (@work_proxy.present? && @work_proxy.work_id.present? && BulkOps::SolrService.record_exists?(@work_proxy.work_id))
|
70
85
|
begin
|
71
86
|
@work = ActiveFedora::Base.find(@work_proxy.work_id)
|
72
87
|
return :update
|
@@ -80,14 +95,6 @@ class BulkOps::WorkJob < ActiveJob::Base
|
|
80
95
|
end
|
81
96
|
end
|
82
97
|
|
83
|
-
def record_exists? id
|
84
|
-
begin
|
85
|
-
return true if SolrDocument.find(id)
|
86
|
-
rescue Blacklight::Exceptions::RecordNotFound
|
87
|
-
return false
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
98
|
def report_error message=nil
|
92
99
|
update_status "job_error", message: message
|
93
100
|
end
|
data/lib/bulk_ops/work_proxy.rb
CHANGED
@@ -2,7 +2,6 @@ class BulkOps::WorkProxy < ActiveRecord::Base
|
|
2
2
|
|
3
3
|
self.table_name = "bulk_ops_work_proxies"
|
4
4
|
belongs_to :operation, class_name: "BulkOps::Operation", foreign_key: "operation_id"
|
5
|
-
has_many :relationships, class_name: "BulkOps::Relationship"
|
6
5
|
|
7
6
|
attr_accessor :proxy_errors
|
8
7
|
|
@@ -38,5 +37,22 @@ class BulkOps::WorkProxy < ActiveRecord::Base
|
|
38
37
|
@proxy_errors ||= []
|
39
38
|
end
|
40
39
|
|
41
|
-
|
40
|
+
def ordered_siblings
|
41
|
+
return nil unless (parent = BulkOps::WorkProxy.find(parent_id))
|
42
|
+
parent.ordered_children - self
|
43
|
+
end
|
44
|
+
|
45
|
+
def ordered_children
|
46
|
+
children = BulkOps::WorkProxy.where(parent_id: id)
|
47
|
+
ordered_kids = []
|
48
|
+
previous_id = nil
|
49
|
+
while ordered_kids.length < children.length do
|
50
|
+
next_child = children.find{|child| child.previous_sibling_id == previous_id}
|
51
|
+
break if (next_child.nil? or ordered_kids.include?(next_child))
|
52
|
+
previous_id = next_child.id
|
53
|
+
ordered_kids << next_child
|
54
|
+
end
|
55
|
+
ordered_kids = ordered_kids + (children - ordered_kids) if (children.length > ordered_kids.length)
|
56
|
+
ordered_kids
|
57
|
+
end
|
42
58
|
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
module BulkOps::InterpretControlledBehavior
|
2
|
+
extend ActiveSupport::Concern
|
3
|
+
|
4
|
+
def interpret_controlled_fields
|
5
|
+
|
6
|
+
# The labels array tracks the contents of columns marked as labels,
|
7
|
+
# which may require special validation
|
8
|
+
labels = {}
|
9
|
+
|
10
|
+
# This hash is populated with relevant data as we loop through the fields
|
11
|
+
controlled_data = {}
|
12
|
+
|
13
|
+
@raw_row.each do |field_name, value|
|
14
|
+
next if value.blank? or field_name.blank?
|
15
|
+
field_name = field_name.to_s
|
16
|
+
|
17
|
+
#If our CSV interpreter is feeding us the headers as a line, ignore it.
|
18
|
+
next if field_name == value
|
19
|
+
|
20
|
+
#check if they are using the 'field_name.authority' syntax
|
21
|
+
authority = nil
|
22
|
+
if ((split=field_name.split('.')).count == 2)
|
23
|
+
authority = split.last
|
24
|
+
field_name = split.first
|
25
|
+
end
|
26
|
+
|
27
|
+
# get the field name, if this column is a metadata field
|
28
|
+
field_name_norm = find_field_name(field_name)
|
29
|
+
field = schema.get_field(field_name_norm)
|
30
|
+
|
31
|
+
# Ignore anything that isn't a controlled field
|
32
|
+
next unless field.present? && field.controlled?
|
33
|
+
|
34
|
+
# Keep track of label fields
|
35
|
+
if field_name.downcase.ends_with?("label")
|
36
|
+
next if @options["ignore_labels"]
|
37
|
+
labels[field_name_norm] ||= []
|
38
|
+
labels[field_name_norm] += BulkOps::Parser.split_values value
|
39
|
+
next unless @options["import_labels"]
|
40
|
+
end
|
41
|
+
|
42
|
+
# handle multiple values
|
43
|
+
value_array = BulkOps::Parser.split_values(value)
|
44
|
+
controlled_data[field_name_norm] ||= [] unless value_array.blank?
|
45
|
+
value_array.each do |value|
|
46
|
+
# Decide of we're dealing with a label or url
|
47
|
+
# It's an ID if it's a URL and the name doesn't end in 'label'
|
48
|
+
value.strip!
|
49
|
+
if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
|
50
|
+
value_id = value
|
51
|
+
# label = WorkIndexer.fetch_remote_label(value)
|
52
|
+
# error_message = "cannot fetch remote label for url: #{value}"
|
53
|
+
# report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
|
54
|
+
else
|
55
|
+
# It's a label, so unescape it and get the id
|
56
|
+
value = BulkOps::Parser.unescape_csv(value)
|
57
|
+
value_id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
|
58
|
+
# label = value
|
59
|
+
report_error(:cannot_retrieve_url,
|
60
|
+
message: "cannot find or create url for controlled vocabulary label: #{value}",
|
61
|
+
url: value,
|
62
|
+
row_number: row_number) unless value_id
|
63
|
+
end
|
64
|
+
destroy? =
|
65
|
+
atts = {id: value_id}
|
66
|
+
atts[:_destroy] = true if (field_name.downcase.starts_with?("remove") or field_name.downcase.starts_with?("delete"))
|
67
|
+
controlled_data[field_name_norm] << atts
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# Actually add all the data
|
72
|
+
controlled_data.each do |property_name, data|
|
73
|
+
@metadata["#{property_name}_attributes"] ||= [] unless data.blank?
|
74
|
+
data.uniq.each do |datum|
|
75
|
+
@metadata["#{property_name}_attributes"].reject!{|val| val[:id] == datum[:id]}
|
76
|
+
@metadata["#{property_name}_attributes"] << datum
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
private
|
82
|
+
|
83
|
+
def localAuthUrl(property, value)
|
84
|
+
return value if (auth = getLocalAuth(property)).nil?
|
85
|
+
url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value)
|
86
|
+
return url
|
87
|
+
end
|
88
|
+
|
89
|
+
def findAuthUrl(auth, value)
|
90
|
+
value.strip!
|
91
|
+
return nil if auth.nil?
|
92
|
+
return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
|
93
|
+
entries.each do |entry|
|
94
|
+
#require exact match
|
95
|
+
next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
|
96
|
+
url = entry["url"] || entry["id"]
|
97
|
+
# url = localIdToUrl(url,auth) unless url =~ URI::regexp
|
98
|
+
return url
|
99
|
+
end
|
100
|
+
return nil
|
101
|
+
end
|
102
|
+
|
103
|
+
def localIdToUrl(id,auth_name)
|
104
|
+
root_urls = {'production' => "https://digitalcollections.library.ucsc.edu",
|
105
|
+
'staging' => "http://digitalcollections-staging.library.ucsc.edu",
|
106
|
+
'sandbox' => "http://digitalcollections-staging-sandbox.library.ucsc.edu",
|
107
|
+
'development' => "http://#{Socket.gethostname}",
|
108
|
+
'test' => "http://#{Socket.gethostname}"}
|
109
|
+
return "#{root_urls[Rails.env.to_s]}/authorities/show/local/#{auth_name}/#{id}"
|
110
|
+
end
|
111
|
+
|
112
|
+
|
113
|
+
def mintLocalAuthUrl(auth_name, value)
|
114
|
+
value.strip!
|
115
|
+
id = value.parameterize
|
116
|
+
auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
|
117
|
+
entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
|
118
|
+
label: value,
|
119
|
+
uri: id)
|
120
|
+
return localIdToUrl(id,auth_name)
|
121
|
+
end
|
122
|
+
|
123
|
+
def getLocalAuth(field_name)
|
124
|
+
field = schema.get_property(field_name)
|
125
|
+
# There is only ever one local authority per field, so just pick the first you find
|
126
|
+
if vocs = field.vocabularies
|
127
|
+
vocs.each do |voc|
|
128
|
+
return voc["subauthority"] if voc["authority"].downcase == "local"
|
129
|
+
end
|
130
|
+
end
|
131
|
+
return nil
|
132
|
+
end
|
133
|
+
|
134
|
+
|
135
|
+
def get_remote_id(value, authority: nil, property: nil)
|
136
|
+
return false
|
137
|
+
#TODO retrieve URL for this value from the specified remote authr
|
138
|
+
end
|
139
|
+
|
140
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module BulkOps::InterpretFilesBehavior
|
2
|
+
extend ActiveSupport::Concern
|
3
|
+
|
4
|
+
|
5
|
+
def interpret_file_fields
|
6
|
+
# This method handles file additions and deletions from the spreadsheet
|
7
|
+
# if additional files need to be deleted because the update is set to replace
|
8
|
+
# some or all existing files, those replacement-related deletions are handled
|
9
|
+
# by the BulkOps::Operation.
|
10
|
+
#
|
11
|
+
|
12
|
+
@raw_row.each do |field, value|
|
13
|
+
next if value.blank? or field.blank?
|
14
|
+
field = field.to_s
|
15
|
+
#If our CSV interpreter is feeding us the headers as a line, ignore it.
|
16
|
+
next if field == value
|
17
|
+
|
18
|
+
# Check if this is a file field, and whether we are removing or adding a file
|
19
|
+
next unless (action = BulkOps::Verification.is_file_field?(field))
|
20
|
+
|
21
|
+
# Move on if this field is the name of another property (e.g. masterFilename)
|
22
|
+
next if find_field_name(field)
|
23
|
+
|
24
|
+
# Check if we are removing a file
|
25
|
+
if action == "remove"
|
26
|
+
get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) }
|
27
|
+
else
|
28
|
+
# Add a file
|
29
|
+
operation.get_file_paths(value).each do |filepath|
|
30
|
+
begin
|
31
|
+
uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
|
32
|
+
(@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
|
33
|
+
rescue Exception => e
|
34
|
+
report_error(:upload_error,
|
35
|
+
message: "Error opening file: #{ filepath } -- #{e}",
|
36
|
+
file: File.join(BulkOps::INGEST_MEDIA_PATH,filename),
|
37
|
+
row_number: row_number)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Check if any of the upcoming rows are child filesets
|
43
|
+
i = 1
|
44
|
+
while self.class.is_file_set?(@metadata,row_number+i)
|
45
|
+
child_row.each do |field,value|
|
46
|
+
next if value.blank?
|
47
|
+
title = value if ["title","label"].include?(field.downcase.strip)
|
48
|
+
if BulkOps::Verification.is_file_field?(field)
|
49
|
+
operation.get_file_paths(value).each do |filepath|
|
50
|
+
uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
i+=1
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
def get_removed_filesets(filestring)
|
63
|
+
file_ids = BulkOps::Parser.split_values(filestring)
|
64
|
+
file_ids.select{|file_id| BulkOps::SolrService.record_exists?(file_id)}
|
65
|
+
|
66
|
+
# This part handles filenames in addition to file ids. It doesn't work yet!
|
67
|
+
# file_ids.map do |file_id|
|
68
|
+
# If the filename is the id of an existing record, keep that
|
69
|
+
# next(file_id) if (BulkOps::SolrService.record_exists?(file_id))
|
70
|
+
# If this is the label (i.e.filename) of an existing fileset, use that fileset id
|
71
|
+
# TODO MAKE THIS WORK!!
|
72
|
+
# next(filename) if (filename_exists?(filename))
|
73
|
+
# File.join(BulkOps::INGEST_MEDIA_PATH, filename_prefix, filename)
|
74
|
+
# end
|
75
|
+
end
|
76
|
+
|
77
|
+
def delete_file_set fileset_id
|
78
|
+
BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module BulkOps::InterpretOptionsBehavior
|
2
|
+
extend ActiveSupport::Concern
|
3
|
+
|
4
|
+
|
5
|
+
def interpret_option_fields
|
6
|
+
@raw_row.each do |field,value|
|
7
|
+
next if value.blank? or field.blank?
|
8
|
+
field = field.to_s
|
9
|
+
next if value == field
|
10
|
+
|
11
|
+
normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
|
12
|
+
if ["visibility", "public"].include?(normfield)
|
13
|
+
@proxy.update(visibility: format_visibility(value))
|
14
|
+
|
15
|
+
end
|
16
|
+
if ["worktype","model","type"].include?(normfield)
|
17
|
+
@proxy.update(work_type: format_worktype(value) )
|
18
|
+
end
|
19
|
+
if ["referenceidentifier",
|
20
|
+
"referenceid",
|
21
|
+
"refid",
|
22
|
+
"referenceidentifiertype",
|
23
|
+
"referenceidtype",
|
24
|
+
"refidtype",
|
25
|
+
"relationshipidentifier",
|
26
|
+
"relationshipid",
|
27
|
+
"relationshipidentifiertype",
|
28
|
+
"relationshipidtype",
|
29
|
+
"relid",
|
30
|
+
"relidtype"].include?(normfield)
|
31
|
+
@proxy.update(reference_identifier: format_reference_id(value))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def format_worktype(value)
|
40
|
+
# format the value like a class name
|
41
|
+
type = value.titleize.gsub(/[-_\s]/,'')
|
42
|
+
# reject it if it isn't a defined class
|
43
|
+
type = false unless Object.const_defined? type
|
44
|
+
# fall back to the work type defined by the operation, or a standard "Work"
|
45
|
+
return type ||= work_type || operation.work_type || "Work"
|
46
|
+
end
|
47
|
+
|
48
|
+
def format_visibility(value)
|
49
|
+
case value.downcase
|
50
|
+
when "public", "open", "true"
|
51
|
+
return "open"
|
52
|
+
when "campus", "ucsc", "institution"
|
53
|
+
return "ucsc"
|
54
|
+
when "restricted", "private", "closed", "false"
|
55
|
+
return "restricted"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|