bulk_ops 0.1.23 → 0.2.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -0,0 +1,14 @@
+class BulkOps::ResolveChildrenJob < ActiveJob::Base
+
+  def perform(proxy_id)
+    proxy = BulkOps::WorkProxy.find(proxy_id)
+    if proxy.ordered_children.all?{|child| child.work_id.present?}
+      work = ActiveFedora::Base.find(proxy.work_id)
+      work.ordered_member_ids = proxy.ordered_children.map(&:work_id)
+      work.save
+    else
+      BulkOps::ResolveChildrenJob.set(wait: 30.minutes).perform_later(proxy_id)
+    end
+  end
+
+end
@@ -0,0 +1,13 @@
+class SolrService
+
+  def self.record_exists? id
+    begin
+      return true if SolrDocument.find(id)
+    rescue Blacklight::Exceptions::RecordNotFound
+      return false
+    end
+    return false
+  end
+
+
+end
@@ -13,7 +13,7 @@ class BulkOps::UpdateWorkJob < BulkOps::WorkJob
 
   def define_work workClass=nil
     # report an error if we can't find the work in solr
-    unless record_exists?(@work_proxy.work_id)
+    unless BulkOps::SolrService.record_exists?(@work_proxy.work_id)
       report_error "Could not find work to update with id: #{@work_proxy.work_id} referenced by work proxy: #{@work_proxy.id}"
       return false
     end
@@ -59,14 +59,6 @@ module BulkOps
     filenames.map { |filename| File.join(BulkOps::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
   end
 
-  def record_exists? id
-    begin
-      return true if SolrDocument.find(id)
-    rescue Blacklight::Exceptions::RecordNotFound
-      return false
-    end
-  end
-
   private
 
   def verify_files
@@ -165,7 +157,7 @@ module BulkOps
     get_spreadsheet.each_with_index do |row, row_num|
       id = get_ref_id(row)
       #TODO: find by other field. for now just id
-      unless (record_exists(id))
+      unless (BulkOps::SolrService.record_exists?(id))
        @verification_errors << BulkOps::Error.new(type: :cannot_find_work, id: id)
       end
     end
@@ -205,7 +197,7 @@ module BulkOps
         end
       elsif ref_id.include?("id")
         # This is a hydra id reference. It should correspond to an object already in the repo
-        unless record_exists?(obj_id)
+        unless BulkOps::SolrService.record_exists?(obj_id)
          @verification_errors << BulkOps::Error.new({type: :bad_object_reference, object_id: obj_id, row_number: row_num + BulkOps:: ROW_OFFSET})
         end
       end
@@ -1,3 +1,3 @@
 module BulkOps
-  VERSION = "0.1.23"
+  VERSION = "0.2.0"
 end
@@ -13,11 +13,25 @@ class BulkOps::WorkJob < ActiveJob::Base
       update_status "error"
     else
       @work_proxy.work_id = @work.id
-      update_status "complete"
+
+      # If this work has a parent outside of the current operation,
+      # and this is the first sibling (we only need to do this once per parent),
+      # queue a job to resolve that work's new children
+      if @work_proxy.parent_id.present? && (parent_proxy = BulkOps::WorkProxy.find(parent_id))
+        if parent_proxy.operation_id != @work_proxy.operation_id
+          if @work_proxy.previous_sibling.nil?
+            BulkOps::ResolveChildrenJob.set(wait: 10.minutes).perform_later(parent_proxy.id)
+          end
+        end
+      end
 
-      # Attempt to resolve all of the relationships defined in this row
-      @work_proxy.relationships.each do |relationship|
-        relationship.resolve!
+      # Set up jobs to link child works (once they are ingested)
+      # or mark as complete otherwise
+      if (children = @work_proxy.ordered_children)
+        BulkOps::ResolveChildrenJob.perform_later(@work_proxy.id)
+        update_status "awaiting_children"
+      else
+        update_status "complete"
       end
 
       # Delete any UploadedFiles. These take up tons of unnecessary disk space.
@@ -52,6 +66,7 @@ class BulkOps::WorkJob < ActiveJob::Base
       report_error("Cannot find work proxy with id: #{work_proxy_id}")
       return
     end
+    return if @work_proxy.status == "complete"
 
     return unless (work_action = define_work(workClass))
 
@@ -66,7 +81,7 @@ class BulkOps::WorkJob < ActiveJob::Base
 
 
   def define_work(workClass="Work")
-    if (@work_proxy.present? && @work_proxy.work_id.present? && record_exists?(@work_proxy.work_id))
+    if (@work_proxy.present? && @work_proxy.work_id.present? && BulkOps::SolrService.record_exists?(@work_proxy.work_id))
       begin
         @work = ActiveFedora::Base.find(@work_proxy.work_id)
         return :update
@@ -80,14 +95,6 @@ class BulkOps::WorkJob < ActiveJob::Base
     end
   end
 
-  def record_exists? id
-    begin
-      return true if SolrDocument.find(id)
-    rescue Blacklight::Exceptions::RecordNotFound
-      return false
-    end
-  end
-
   def report_error message=nil
     update_status "job_error", message: message
   end
@@ -2,7 +2,6 @@ class BulkOps::WorkProxy < ActiveRecord::Base
 
   self.table_name = "bulk_ops_work_proxies"
   belongs_to :operation, class_name: "BulkOps::Operation", foreign_key: "operation_id"
-  has_many :relationships, class_name: "BulkOps::Relationship"
 
   attr_accessor :proxy_errors
 
@@ -38,5 +37,22 @@ class BulkOps::WorkProxy < ActiveRecord::Base
     @proxy_errors ||= []
   end
 
-
+  def ordered_siblings
+    return nil unless (parent = BulkOps::WorkProxy.find(parent_id))
+    parent.ordered_children - self
+  end
+
+  def ordered_children
+    children = BulkOps::WorkProxy.where(parent_id: id)
+    ordered_kids = []
+    previous_id = nil
+    while ordered_kids.length < children.length do
+      next_child = children.find{|child| child.previous_sibling_id == previous_id}
+      break if (next_child.nil? or ordered_kids.include?(next_child))
+      previous_id = next_child.id
+      ordered_kids << next_child
+    end
+    ordered_kids = ordered_kids + (children - ordered_kids) if (children.length > ordered_kids.length)
+    ordered_kids
+  end
 end
@@ -0,0 +1,140 @@
+module BulkOps::InterpretControlledBehavior
+  extend ActiveSupport::Concern
+
+  def interpret_controlled_fields
+
+    # The labels array tracks the contents of columns marked as labels,
+    # which may require special validation
+    labels = {}
+
+    # This hash is populated with relevant data as we loop through the fields
+    controlled_data = {}
+
+    @raw_row.each do |field_name, value|
+      next if value.blank? or field_name.blank?
+      field_name = field_name.to_s
+
+      #If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field_name == value
+
+      #check if they are using the 'field_name.authority' syntax
+      authority = nil
+      if ((split=field_name.split('.')).count == 2)
+        authority = split.last
+        field_name = split.first
+      end
+
+      # get the field name, if this column is a metadata field
+      field_name_norm = find_field_name(field_name)
+      field = schema.get_field(field_name_norm)
+
+      # Ignore anything that isn't a controlled field
+      next unless field.present? && field.controlled?
+
+      # Keep track of label fields
+      if field_name.downcase.ends_with?("label")
+        next if @options["ignore_labels"]
+        labels[field_name_norm] ||= []
+        labels[field_name_norm] += BulkOps::Parser.split_values value
+        next unless @options["import_labels"]
+      end
+
+      # handle multiple values
+      value_array = BulkOps::Parser.split_values(value)
+      controlled_data[field_name_norm] ||= [] unless value_array.blank?
+      value_array.each do |value|
+        # Decide of we're dealing with a label or url
+        # It's an ID if it's a URL and the name doesn't end in 'label'
+        value.strip!
+        if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
+          value_id = value
+          # label = WorkIndexer.fetch_remote_label(value)
+          # error_message = "cannot fetch remote label for url: #{value}"
+          # report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
+        else
+          # It's a label, so unescape it and get the id
+          value = BulkOps::Parser.unescape_csv(value)
+          value_id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
+          # label = value
+          report_error(:cannot_retrieve_url,
+                       message: "cannot find or create url for controlled vocabulary label: #{value}",
+                       url: value,
+                       row_number: row_number) unless value_id
+        end
+        destroy? =
+        atts = {id: value_id}
+        atts[:_destroy] = true if (field_name.downcase.starts_with?("remove") or field_name.downcase.starts_with?("delete"))
+        controlled_data[field_name_norm] << atts
+      end
+    end
+
+    # Actually add all the data
+    controlled_data.each do |property_name, data|
+      @metadata["#{property_name}_attributes"] ||= [] unless data.blank?
+      data.uniq.each do |datum|
+        @metadata["#{property_name}_attributes"].reject!{|val| val[:id] == datum[:id]}
+        @metadata["#{property_name}_attributes"] << datum
+      end
+    end
+  end
+
+  private
+
+  def localAuthUrl(property, value)
+    return value if (auth = getLocalAuth(property)).nil?
+    url = findAuthUrl(auth, value) || mintLocalAuthUrl(auth,value)
+    return url
+  end
+
+  def findAuthUrl(auth, value)
+    value.strip!
+    return nil if auth.nil?
+    return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
+    entries.each do |entry|
+      #require exact match
+      next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
+      url = entry["url"] || entry["id"]
+      # url = localIdToUrl(url,auth) unless url =~ URI::regexp
+      return url
+    end
+    return nil
+  end
+
+  def localIdToUrl(id,auth_name)
+    root_urls = {'production' => "https://digitalcollections.library.ucsc.edu",
+                 'staging' => "http://digitalcollections-staging.library.ucsc.edu",
+                 'sandbox' => "http://digitalcollections-staging-sandbox.library.ucsc.edu",
+                 'development' => "http://#{Socket.gethostname}",
+                 'test' => "http://#{Socket.gethostname}"}
+    return "#{root_urls[Rails.env.to_s]}/authorities/show/local/#{auth_name}/#{id}"
+  end
+
+
+  def mintLocalAuthUrl(auth_name, value)
+    value.strip!
+    id = value.parameterize
+    auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
+    entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
+                                           label: value,
+                                           uri: id)
+    return localIdToUrl(id,auth_name)
+  end
+
+  def getLocalAuth(field_name)
+    field = schema.get_property(field_name)
+    # There is only ever one local authority per field, so just pick the first you find
+    if vocs = field.vocabularies
+      vocs.each do |voc|
+        return voc["subauthority"] if voc["authority"].downcase == "local"
+      end
+    end
+    return nil
+  end
+
+
+  def get_remote_id(value, authority: nil, property: nil)
+    return false
+    #TODO retrieve URL for this value from the specified remote authr
+  end
+
+end
@@ -0,0 +1,82 @@
+module BulkOps::InterpretFilesBehavior
+  extend ActiveSupport::Concern
+
+
+  def interpret_file_fields
+    # This method handles file additions and deletions from the spreadsheet
+    # if additional files need to be deleted because the update is set to replace
+    # some or all existing files, those replacement-related deletions are handled
+    # by the BulkOps::Operation.
+    #
+
+    @raw_row.each do |field, value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      #If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field == value
+
+      # Check if this is a file field, and whether we are removing or adding a file
+      next unless (action = BulkOps::Verification.is_file_field?(field))
+
+      # Move on if this field is the name of another property (e.g. masterFilename)
+      next if find_field_name(field)
+
+      # Check if we are removing a file
+      if action == "remove"
+        get_removed_filesets(value).each { |fileset_id| delete_file_set(file_set_id) }
+      else
+        # Add a file
+        operation.get_file_paths(value).each do |filepath|
+          begin
+            uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
+            (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
+          rescue Exception => e
+            report_error(:upload_error,
+                         message: "Error opening file: #{ filepath } -- #{e}",
+                         file: File.join(BulkOps::INGEST_MEDIA_PATH,filename),
+                         row_number: row_number)
+          end
+        end
+      end
+
+      # Check if any of the upcoming rows are child filesets
+      i = 1
+      while self.class.is_file_set?(@metadata,row_number+i)
+        child_row.each do |field,value|
+          next if value.blank?
+          title = value if ["title","label"].include?(field.downcase.strip)
+          if BulkOps::Verification.is_file_field?(field)
+            operation.get_file_paths(value).each do |filepath|
+              uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
+            end
+          end
+        end
+        i+=1
+      end
+
+    end
+  end
+
+  private
+
+  def get_removed_filesets(filestring)
+    file_ids = BulkOps::Parser.split_values(filestring)
+    file_ids.select{|file_id| BulkOps::SolrService.record_exists?(file_id)}
+
+    # This part handles filenames in addition to file ids. It doesn't work yet!
+    # file_ids.map do |file_id|
+    #   If the filename is the id of an existing record, keep that
+    #   next(file_id) if (BulkOps::SolrService.record_exists?(file_id))
+    #   If this is the label (i.e.filename) of an existing fileset, use that fileset id
+    #   TODO MAKE THIS WORK!!
+    #   next(filename) if (filename_exists?(filename))
+    #   File.join(BulkOps::INGEST_MEDIA_PATH, filename_prefix, filename)
+    # end
+  end
+
+  def delete_file_set fileset_id
+    BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
+  end
+
+
+end
@@ -0,0 +1,59 @@
+module BulkOps::InterpretOptionsBehavior
+  extend ActiveSupport::Concern
+
+
+  def interpret_option_fields
+    @raw_row.each do |field,value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      next if value == field
+
+      normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
+      if ["visibility", "public"].include?(normfield)
+        @proxy.update(visibility: format_visibility(value))
+
+      end
+      if ["worktype","model","type"].include?(normfield)
+        @proxy.update(work_type: format_worktype(value) )
+      end
+      if ["referenceidentifier",
+          "referenceid",
+          "refid",
+          "referenceidentifiertype",
+          "referenceidtype",
+          "refidtype",
+          "relationshipidentifier",
+          "relationshipid",
+          "relationshipidentifiertype",
+          "relationshipidtype",
+          "relid",
+          "relidtype"].include?(normfield)
+        @proxy.update(reference_identifier: format_reference_id(value))
+      end
+    end
+  end
+
+
+  private
+
+  def format_worktype(value)
+    # format the value like a class name
+    type = value.titleize.gsub(/[-_\s]/,'')
+    # reject it if it isn't a defined class
+    type = false unless Object.const_defined? type
+    # fall back to the work type defined by the operation, or a standard "Work"
+    return type ||= work_type || operation.work_type || "Work"
+  end
+
+  def format_visibility(value)
+    case value.downcase
+    when "public", "open", "true"
+      return "open"
+    when "campus", "ucsc", "institution"
+      return "ucsc"
+    when "restricted", "private", "closed", "false"
+      return "restricted"
+    end
+  end
+
+end