bulk_ops 0.1.23 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb +14 -0
- data/lib/bulk_ops.rb +3 -2
- data/lib/bulk_ops/apply_operation_job.rb +8 -0
- data/lib/bulk_ops/create_work_job.rb +1 -1
- data/lib/bulk_ops/github_access.rb +1 -1
- data/lib/bulk_ops/operation.rb +57 -49
- data/lib/bulk_ops/parser.rb +50 -414
- data/lib/bulk_ops/resolve_children_job.rb +14 -0
- data/lib/bulk_ops/solr_service.rb +13 -0
- data/lib/bulk_ops/update_work_job.rb +1 -1
- data/lib/bulk_ops/verification.rb +2 -10
- data/lib/bulk_ops/version.rb +1 -1
- data/lib/bulk_ops/work_job.rb +20 -13
- data/lib/bulk_ops/work_proxy.rb +18 -2
- data/lib/concerns/interpret_controlled_behavior.rb +140 -0
- data/lib/concerns/interpret_files_behavior.rb +82 -0
- data/lib/concerns/interpret_options_behavior.rb +59 -0
- data/lib/concerns/interpret_relationships_behavior.rb +123 -0
- data/lib/concerns/interpret_scalar_behavior.rb +21 -0
- data/lib/concerns/search_builder_behavior.rb +80 -0
- metadata +12 -3
- data/lib/bulk_ops/relationship.rb +0 -117
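The common thread in this release is that the private record_exists? helpers previously duplicated in verification.rb and work_job.rb are removed in favor of a single shared lookup in the new data/lib/bulk_ops/solr_service.rb. That file's 13 lines are not expanded in this section, so the following is only a sketch of what the centralized helper presumably looks like, assuming it simply mirrors the removed implementations:

module BulkOps
  class SolrService
    # Hypothetical reconstruction -- the real solr_service.rb is not shown in this diff.
    # Mirrors the record_exists? helpers deleted from verification.rb and work_job.rb below.
    def self.record_exists?(id)
      return true if SolrDocument.find(id)
    rescue Blacklight::Exceptions::RecordNotFound
      false
    end
  end
end

Every former call site now goes through BulkOps::SolrService.record_exists?, as the hunks below show.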
data/lib/bulk_ops/resolve_children_job.rb
ADDED
@@ -0,0 +1,14 @@
+class BulkOps::ResolveChildrenJob < ActiveJob::Base
+
+  def perform(proxy_id)
+    proxy = BulkOps::WorkProxy.find(proxy_id)
+    if proxy.ordered_children.all?{|child| child.work_id.present?}
+      work = ActiveFedora::Base.find(proxy.work_id)
+      work.ordered_member_ids = proxy.ordered_children.map(&:work_id)
+      work.save
+    else
+      BulkOps::ResolveChildrenJob.set(wait: 30.minutes).perform_later(proxy_id)
+    end
+  end
+
+end
data/lib/bulk_ops/update_work_job.rb
CHANGED
@@ -13,7 +13,7 @@ class BulkOps::UpdateWorkJob < BulkOps::WorkJob
 
   def define_work workClass=nil
     # report an error if we can't find the work in solr
-    unless record_exists?(@work_proxy.work_id)
+    unless BulkOps::SolrService.record_exists?(@work_proxy.work_id)
      report_error "Could not find work to update with id: #{@work_proxy.work_id} referenced by work proxy: #{@work_proxy.id}"
      return false
    end
data/lib/bulk_ops/verification.rb
CHANGED
@@ -59,14 +59,6 @@ module BulkOps
     filenames.map { |filename| File.join(BulkOps::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
   end
 
-  def record_exists? id
-    begin
-      return true if SolrDocument.find(id)
-    rescue Blacklight::Exceptions::RecordNotFound
-      return false
-    end
-  end
-
   private
 
   def verify_files
@@ -165,7 +157,7 @@ module BulkOps
     get_spreadsheet.each_with_index do |row, row_num|
       id = get_ref_id(row)
       #TODO: find by other field. for now just id
-      unless (record_exists(id))
+      unless (BulkOps::SolrService.record_exists?(id))
        @verification_errors << BulkOps::Error.new(type: :cannot_find_work, id: id)
      end
    end
@@ -205,7 +197,7 @@ module BulkOps
       end
     elsif ref_id.include?("id")
       # This is a hydra id reference. It should correspond to an object already in the repo
-      unless record_exists?(obj_id)
+      unless BulkOps::SolrService.record_exists?(obj_id)
        @verification_errors << BulkOps::Error.new({type: :bad_object_reference, object_id: obj_id, row_number: row_num + BulkOps::ROW_OFFSET})
      end
    end
data/lib/bulk_ops/version.rb
CHANGED
data/lib/bulk_ops/work_job.rb
CHANGED
@@ -13,11 +13,25 @@ class BulkOps::WorkJob < ActiveJob::Base
       update_status "error"
     else
       @work_proxy.work_id = @work.id
-
+
+      # If this work has a parent outside of the current operation,
+      # and this is the first sibling (we only need to do this once per parent),
+      # queue a job to resolve that work's new children
+      if @work_proxy.parent_id.present? && (parent_proxy = BulkOps::WorkProxy.find(parent_id))
+        if parent_proxy.operation_id != @work_proxy.operation_id
+          if @work_proxy.previous_sibling.nil?
+            BulkOps::ResolveChildrenJob.set(wait: 10.minutes).perform_later(parent_proxy.id)
+          end
+        end
+      end
 
-      #
-
-
+      # Set up jobs to link child works (once they are ingested)
+      # or mark as complete otherwise
+      if (children = @work_proxy.ordered_children)
+        BulkOps::ResolveChildrenJob.perform_later(@work_proxy.id)
+        update_status "awaiting_children"
+      else
+        update_status "complete"
       end
 
       # Delete any UploadedFiles. These take up tons of unnecessary disk space.
@@ -52,6 +66,7 @@ class BulkOps::WorkJob < ActiveJob::Base
       report_error("Cannot find work proxy with id: #{work_proxy_id}")
       return
     end
+    return if @work_proxy.status == "complete"
 
     return unless (work_action = define_work(workClass))
 
@@ -66,7 +81,7 @@ class BulkOps::WorkJob < ActiveJob::Base
 
 
   def define_work(workClass="Work")
-    if (@work_proxy.present? && @work_proxy.work_id.present? && record_exists?(@work_proxy.work_id))
+    if (@work_proxy.present? && @work_proxy.work_id.present? && BulkOps::SolrService.record_exists?(@work_proxy.work_id))
       begin
         @work = ActiveFedora::Base.find(@work_proxy.work_id)
         return :update
@@ -80,14 +95,6 @@ class BulkOps::WorkJob < ActiveJob::Base
     end
   end
 
-  def record_exists? id
-    begin
-      return true if SolrDocument.find(id)
-    rescue Blacklight::Exceptions::RecordNotFound
-      return false
-    end
-  end
-
   def report_error message=nil
     update_status "job_error", message: message
   end
data/lib/bulk_ops/work_proxy.rb
CHANGED
@@ -2,7 +2,6 @@ class BulkOps::WorkProxy < ActiveRecord::Base
 
   self.table_name = "bulk_ops_work_proxies"
   belongs_to :operation, class_name: "BulkOps::Operation", foreign_key: "operation_id"
-  has_many :relationships, class_name: "BulkOps::Relationship"
 
   attr_accessor :proxy_errors
 
@@ -38,5 +37,22 @@ class BulkOps::WorkProxy < ActiveRecord::Base
     @proxy_errors ||= []
   end
 
-
+  def ordered_siblings
+    return nil unless (parent = BulkOps::WorkProxy.find(parent_id))
+    parent.ordered_children - self
+  end
+
+  def ordered_children
+    children = BulkOps::WorkProxy.where(parent_id: id)
+    ordered_kids = []
+    previous_id = nil
+    while ordered_kids.length < children.length do
+      next_child = children.find{|child| child.previous_sibling_id == previous_id}
+      break if (next_child.nil? or ordered_kids.include?(next_child))
+      previous_id = next_child.id
+      ordered_kids << next_child
+    end
+    ordered_kids = ordered_kids + (children - ordered_kids) if (children.length > ordered_kids.length)
+    ordered_kids
+  end
 end
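The new ordered_children walks a sibling chain: each child proxy stores the id of the proxy that precedes it, and the order is recovered by repeatedly finding the proxy whose previous_sibling_id matches the last one placed. A small illustration, with hypothetical ids and assuming the parent_id and previous_sibling_id columns referenced above:

# Hypothetical data: three child proxies of parent proxy 7, chained by previous_sibling_id.
#   id: 12, parent_id: 7, previous_sibling_id: nil
#   id: 15, parent_id: 7, previous_sibling_id: 12
#   id: 13, parent_id: 7, previous_sibling_id: 15
parent = BulkOps::WorkProxy.find(7)
parent.ordered_children.map(&:id)   #=> [12, 15, 13], regardless of database row order

Any children whose chain cannot be resolved are simply appended, unordered, at the end.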
data/lib/concerns/interpret_controlled_behavior.rb
ADDED
@@ -0,0 +1,140 @@
+module BulkOps::InterpretControlledBehavior
+  extend ActiveSupport::Concern
+
+  def interpret_controlled_fields
+
+    # The labels array tracks the contents of columns marked as labels,
+    # which may require special validation
+    labels = {}
+
+    # This hash is populated with relevant data as we loop through the fields
+    controlled_data = {}
+
+    @raw_row.each do |field_name, value|
+      next if value.blank? or field_name.blank?
+      field_name = field_name.to_s
+
+      #If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field_name == value
+
+      #check if they are using the 'field_name.authority' syntax
+      authority = nil
+      if ((split=field_name.split('.')).count == 2)
+        authority = split.last
+        field_name = split.first
+      end
+
+      # get the field name, if this column is a metadata field
+      field_name_norm = find_field_name(field_name)
+      field = schema.get_field(field_name_norm)
+
+      # Ignore anything that isn't a controlled field
+      next unless field.present? && field.controlled?
+
+      # Keep track of label fields
+      if field_name.downcase.ends_with?("label")
+        next if @options["ignore_labels"]
+        labels[field_name_norm] ||= []
+        labels[field_name_norm] += BulkOps::Parser.split_values value
+        next unless @options["import_labels"]
+      end
+
+      # handle multiple values
+      value_array = BulkOps::Parser.split_values(value)
+      controlled_data[field_name_norm] ||= [] unless value_array.blank?
+      value_array.each do |value|
+        # Decide of we're dealing with a label or url
+        # It's an ID if it's a URL and the name doesn't end in 'label'
+        value.strip!
+        if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
+          value_id = value
+          # label = WorkIndexer.fetch_remote_label(value)
+          # error_message = "cannot fetch remote label for url: #{value}"
+          # report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
+        else
+          # It's a label, so unescape it and get the id
+          value = BulkOps::Parser.unescape_csv(value)
+          value_id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
+          # label = value
+          report_error(:cannot_retrieve_url,
+                       message: "cannot find or create url for controlled vocabulary label: #{value}",
+                       url: value,
+                       row_number: row_number) unless value_id
+        end
+        destroy? =
+        atts = {id: value_id}
+        atts[:_destroy] = true if (field_name.downcase.starts_with?("remove") or field_name.downcase.starts_with?("delete"))
+        controlled_data[field_name_norm] << atts
+      end
+    end
+
+    # Actually add all the data
+    controlled_data.each do |property_name, data|
+      @metadata["#{property_name}_attributes"] ||= [] unless data.blank?
+      data.uniq.each do |datum|
+        @metadata["#{property_name}_attributes"].reject!{|val| val[:id] == datum[:id]}
+        @metadata["#{property_name}_attributes"] << datum
+      end
+    end
+  end
+
+  private
+
+  def localAuthUrl(property, value)
+    return value if (auth = getLocalAuth(property)).nil?
+    url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value)
+    return url
+  end
+
+  def findAuthUrl(auth, value)
+    value.strip!
+    return nil if auth.nil?
+    return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
+    entries.each do |entry|
+      #require exact match
+      next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
+      url = entry["url"] || entry["id"]
+      # url = localIdToUrl(url,auth) unless url =~ URI::regexp
+      return url
+    end
+    return nil
+  end
+
+  def localIdToUrl(id,auth_name)
+    root_urls = {'production' => "https://digitalcollections.library.ucsc.edu",
+                 'staging' => "http://digitalcollections-staging.library.ucsc.edu",
+                 'sandbox' => "http://digitalcollections-staging-sandbox.library.ucsc.edu",
+                 'development' => "http://#{Socket.gethostname}",
+                 'test' => "http://#{Socket.gethostname}"}
+    return "#{root_urls[Rails.env.to_s]}/authorities/show/local/#{auth_name}/#{id}"
+  end
+
+
+  def mintLocalAuthUrl(auth_name, value)
+    value.strip!
+    id = value.parameterize
+    auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
+    entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
+                                           label: value,
+                                           uri: id)
+    return localIdToUrl(id,auth_name)
+  end
+
+  def getLocalAuth(field_name)
+    field = schema.get_property(field_name)
+    # There is only ever one local authority per field, so just pick the first you find
+    if vocs = field.vocabularies
+      vocs.each do |voc|
+        return voc["subauthority"] if voc["authority"].downcase == "local"
+      end
+    end
+    return nil
+  end
+
+
+  def get_remote_id(value, authority: nil, property: nil)
+    return false
+    #TODO retrieve URL for this value from the specified remote authr
+  end
+
+end
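For context, interpret_controlled_fields accumulates Hyrax-style nested attributes keyed as "<property>_attributes": each value becomes {id: <vocabulary URL>}, with _destroy: true added when the column name starts with "remove" or "delete". An illustrative result for a subject column (values and URLs are hypothetical):

# Illustrative only -- column contents and URLs are made up for this example.
@metadata["subject_attributes"] = [
  { id: "https://digitalcollections.library.ucsc.edu/authorities/show/local/subjects/surfing" },
  { id: "http://vocab.getty.edu/aat/300015637", _destroy: true }   # came from a "remove subject" column
]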
data/lib/concerns/interpret_files_behavior.rb
ADDED
@@ -0,0 +1,82 @@
+module BulkOps::InterpretFilesBehavior
+  extend ActiveSupport::Concern
+
+
+  def interpret_file_fields
+    # This method handles file additions and deletions from the spreadsheet
+    # if additional files need to be deleted because the update is set to replace
+    # some or all existing files, those replacement-related deletions are handled
+    # by the BulkOps::Operation.
+    #
+
+    @raw_row.each do |field, value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      #If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field == value
+
+      # Check if this is a file field, and whether we are removing or adding a file
+      next unless (action = BulkOps::Verification.is_file_field?(field))
+
+      # Move on if this field is the name of another property (e.g. masterFilename)
+      next if find_field_name(field)
+
+      # Check if we are removing a file
+      if action == "remove"
+        get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) }
+      else
+        # Add a file
+        operation.get_file_paths(value).each do |filepath|
+          begin
+            uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
+            (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
+          rescue Exception => e
+            report_error(:upload_error,
+                         message: "Error opening file: #{ filepath } -- #{e}",
+                         file: File.join(BulkOps::INGEST_MEDIA_PATH,filename),
+                         row_number: row_number)
+          end
+        end
+      end
+
+      # Check if any of the upcoming rows are child filesets
+      i = 1
+      while self.class.is_file_set?(@metadata,row_number+i)
+        child_row.each do |field,value|
+          next if value.blank?
+          title = value if ["title","label"].include?(field.downcase.strip)
+          if BulkOps::Verification.is_file_field?(field)
+            operation.get_file_paths(value).each do |filepath|
+              uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
+            end
+          end
+        end
+        i+=1
+      end
+
+    end
+  end
+
+  private
+
+  def get_removed_filesets(filestring)
+    file_ids = BulkOps::Parser.split_values(filestring)
+    file_ids.select{|file_id| BulkOps::SolrService.record_exists?(file_id)}
+
+    # This part handles filenames in addition to file ids. It doesn't work yet!
+    # file_ids.map do |file_id|
+    # If the filename is the id of an existing record, keep that
+    # next(file_id) if (BulkOps::SolrService.record_exists?(file_id))
+    # If this is the label (i.e.filename) of an existing fileset, use that fileset id
+    # TODO MAKE THIS WORK!!
+    # next(filename) if (filename_exists?(filename))
+    # File.join(BulkOps::INGEST_MEDIA_PATH, filename_prefix, filename)
+    # end
+  end
+
+  def delete_file_set fileset_id
+    BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
+  end
+
+
+end
data/lib/concerns/interpret_options_behavior.rb
ADDED
@@ -0,0 +1,59 @@
+module BulkOps::InterpretOptionsBehavior
+  extend ActiveSupport::Concern
+
+
+  def interpret_option_fields
+    @raw_row.each do |field,value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      next if value == field
+
+      normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
+      if ["visibility", "public"].include?(normfield)
+        @proxy.update(visibility: format_visibility(value))
+
+      end
+      if ["worktype","model","type"].include?(normfield)
+        @proxy.update(work_type: format_worktype(value) )
+      end
+      if ["referenceidentifier",
+          "referenceid",
+          "refid",
+          "referenceidentifiertype",
+          "referenceidtype",
+          "refidtype",
+          "relationshipidentifier",
+          "relationshipid",
+          "relationshipidentifiertype",
+          "relationshipidtype",
+          "relid",
+          "relidtype"].include?(normfield)
+        @proxy.update(reference_identifier: format_reference_id(value))
+      end
+    end
+  end
+
+
+  private
+
+  def format_worktype(value)
+    # format the value like a class name
+    type = value.titleize.gsub(/[-_\s]/,'')
+    # reject it if it isn't a defined class
+    type = false unless Object.const_defined? type
+    # fall back to the work type defined by the operation, or a standard "Work"
+    return type ||= work_type || operation.work_type || "Work"
+  end
+
+  def format_visibility(value)
+    case value.downcase
+    when "public", "open", "true"
+      return "open"
+    when "campus", "ucsc", "institution"
+      return "ucsc"
+    when "restricted", "private", "closed", "false"
+      return "restricted"
+    end
+  end
+
+end