bulk_ops 0.1.23 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
class BulkOps::ResolveChildrenJob < ActiveJob::Base

  # How long to wait before re-checking whether all children have ingested.
  RETRY_WAIT = 30.minutes

  # Attach a proxy's child works to its work in proxy-defined order.
  #
  # If every child proxy already has a work_id, the parent work's
  # ordered_member_ids is rewritten to match and saved. Otherwise the
  # job re-enqueues itself to try again later.
  #
  # @param proxy_id [Integer] id of the parent BulkOps::WorkProxy
  def perform(proxy_id)
    proxy = BulkOps::WorkProxy.find(proxy_id)
    # Fetch the ordered children once; the old code ran this (a DB-backed
    # traversal) twice — once for the check and once for the ids.
    children = proxy.ordered_children
    if children.all? { |child| child.work_id.present? }
      work = ActiveFedora::Base.find(proxy.work_id)
      work.ordered_member_ids = children.map(&:work_id)
      work.save
    else
      # Some children haven't finished ingesting yet; check again later.
      self.class.set(wait: RETRY_WAIT).perform_later(proxy_id)
    end
  end

end
@@ -0,0 +1,13 @@
1
module BulkOps
  # Thin wrapper around Solr lookups used by the bulk_ops jobs.
  #
  # NOTE: this class was previously defined at the top level as
  # `SolrService`, but every caller references it as
  # `BulkOps::SolrService`, which raises NameError on Ruby >= 2.5.
  # Defining it inside the BulkOps namespace fixes that.
  class SolrService

    # Whether a Solr document with the given id exists.
    #
    # Unlike SolrDocument.find, this never raises on a missing record:
    # a Blacklight RecordNotFound is translated into +false+.
    #
    # @param id [String] the document id to look up
    # @return [Boolean]
    def self.record_exists?(id)
      !!SolrDocument.find(id)
    rescue Blacklight::Exceptions::RecordNotFound
      false
    end

  end
end
@@ -13,7 +13,7 @@ class BulkOps::UpdateWorkJob < BulkOps::WorkJob
13
13
 
14
14
  def define_work workClass=nil
15
15
  # report an error if we can't find the work in solr
16
- unless record_exists?(@work_proxy.work_id)
16
+ unless BulkOps::SolrService.record_exists?(@work_proxy.work_id)
17
17
  report_error "Could not find work to update with id: #{@work_proxy.work_id} referenced by work proxy: #{@work_proxy.id}"
18
18
  return false
19
19
  end
@@ -59,14 +59,6 @@ module BulkOps
59
59
  filenames.map { |filename| File.join(BulkOps::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
60
60
  end
61
61
 
62
- def record_exists? id
63
- begin
64
- return true if SolrDocument.find(id)
65
- rescue Blacklight::Exceptions::RecordNotFound
66
- return false
67
- end
68
- end
69
-
70
62
  private
71
63
 
72
64
  def verify_files
@@ -165,7 +157,7 @@ module BulkOps
165
157
  get_spreadsheet.each_with_index do |row, row_num|
166
158
  id = get_ref_id(row)
167
159
  #TODO: find by other field. for now just id
168
- unless (record_exists(id))
160
+ unless (BulkOps::SolrService.record_exists?(id))
169
161
  @verification_errors << BulkOps::Error.new(type: :cannot_find_work, id: id)
170
162
  end
171
163
  end
@@ -205,7 +197,7 @@ module BulkOps
205
197
  end
206
198
  elsif ref_id.include?("id")
207
199
  # This is a hydra id reference. It should correspond to an object already in the repo
208
- unless record_exists?(obj_id)
200
+ unless BulkOps::SolrService.record_exists?(obj_id)
209
201
  @verification_errors << BulkOps::Error.new({type: :bad_object_reference, object_id: obj_id, row_number: row_num + BulkOps:: ROW_OFFSET})
210
202
  end
211
203
  end
@@ -1,3 +1,3 @@
1
1
module BulkOps
  # Gem version. Frozen so the shared string literal cannot be mutated.
  VERSION = "0.2.0".freeze
end
@@ -13,11 +13,25 @@ class BulkOps::WorkJob < ActiveJob::Base
13
13
  update_status "error"
14
14
  else
15
15
  @work_proxy.work_id = @work.id
16
- update_status "complete"
16
+
17
+ # If this work has a parent outside of the current operation,
18
+ # and this is the first sibling (we only need to do this once per parent),
19
+ # queue a job to resolve that work's new children
20
+ if @work_proxy.parent_id.present? && (parent_proxy = BulkOps::WorkProxy.find(parent_id))
21
+ if parent_proxy.operation_id != @work_proxy.operation_id
22
+ if @work_proxy.previous_sibling.nil?
23
+ BulkOps::ResolveChildrenJob.set(wait: 10.minutes).perform_later(parent_proxy.id)
24
+ end
25
+ end
26
+ end
17
27
 
18
- # Attempt to resolve all of the relationships defined in this row
19
- @work_proxy.relationships.each do |relationship|
20
- relationship.resolve!
28
+ # Set up jobs to link child works (once they are ingested)
29
+ # or mark as complete otherwise
30
+ if (children = @work_proxy.ordered_children)
31
+ BulkOps::ResolveChildrenJob.perform_later(@work_proxy.id)
32
+ update_status "awaiting_children"
33
+ else
34
+ update_status "complete"
21
35
  end
22
36
 
23
37
  # Delete any UploadedFiles. These take up tons of unnecessary disk space.
@@ -52,6 +66,7 @@ class BulkOps::WorkJob < ActiveJob::Base
52
66
  report_error("Cannot find work proxy with id: #{work_proxy_id}")
53
67
  return
54
68
  end
69
+ return if @work_proxy.status == "complete"
55
70
 
56
71
  return unless (work_action = define_work(workClass))
57
72
 
@@ -66,7 +81,7 @@ class BulkOps::WorkJob < ActiveJob::Base
66
81
 
67
82
 
68
83
  def define_work(workClass="Work")
69
- if (@work_proxy.present? && @work_proxy.work_id.present? && record_exists?(@work_proxy.work_id))
84
+ if (@work_proxy.present? && @work_proxy.work_id.present? && BulkOps::SolrService.record_exists?(@work_proxy.work_id))
70
85
  begin
71
86
  @work = ActiveFedora::Base.find(@work_proxy.work_id)
72
87
  return :update
@@ -80,14 +95,6 @@ class BulkOps::WorkJob < ActiveJob::Base
80
95
  end
81
96
  end
82
97
 
83
- def record_exists? id
84
- begin
85
- return true if SolrDocument.find(id)
86
- rescue Blacklight::Exceptions::RecordNotFound
87
- return false
88
- end
89
- end
90
-
91
98
  def report_error message=nil
92
99
  update_status "job_error", message: message
93
100
  end
@@ -2,7 +2,6 @@ class BulkOps::WorkProxy < ActiveRecord::Base
2
2
 
3
3
  self.table_name = "bulk_ops_work_proxies"
4
4
  belongs_to :operation, class_name: "BulkOps::Operation", foreign_key: "operation_id"
5
- has_many :relationships, class_name: "BulkOps::Relationship"
6
5
 
7
6
  attr_accessor :proxy_errors
8
7
 
@@ -38,5 +37,22 @@ class BulkOps::WorkProxy < ActiveRecord::Base
38
37
  @proxy_errors ||= []
39
38
  end
40
39
 
41
-
40
+ def ordered_siblings
41
+ return nil unless (parent = BulkOps::WorkProxy.find(parent_id))
42
+ parent.ordered_children - self
43
+ end
44
+
45
+ def ordered_children
46
+ children = BulkOps::WorkProxy.where(parent_id: id)
47
+ ordered_kids = []
48
+ previous_id = nil
49
+ while ordered_kids.length < children.length do
50
+ next_child = children.find{|child| child.previous_sibling_id == previous_id}
51
+ break if (next_child.nil? or ordered_kids.include?(next_child))
52
+ previous_id = next_child.id
53
+ ordered_kids << next_child
54
+ end
55
+ ordered_kids = ordered_kids + (children - ordered_kids) if (children.length > ordered_kids.length)
56
+ ordered_kids
57
+ end
42
58
  end
@@ -0,0 +1,140 @@
1
module BulkOps::InterpretControlledBehavior
  extend ActiveSupport::Concern

  # Parse every controlled-vocabulary column of @raw_row and merge the
  # resulting {id:, _destroy:} attribute hashes into @metadata under
  # "<property>_attributes" keys.
  #
  # Fixes a syntax error in the previous version: a dangling `destroy? =`
  # token before the `atts` assignment (`?`-suffixed identifiers cannot
  # be assignment targets in Ruby).
  def interpret_controlled_fields

    # The labels hash tracks the contents of columns marked as labels,
    # which may require special validation.
    # NOTE(review): `labels` is collected but not consumed within this
    # method — confirm whether downstream code relies on it.
    labels = {}

    # Populated with {field => [attribute hashes]} as we loop through the row.
    controlled_data = {}

    @raw_row.each do |field_name, value|
      next if value.blank? or field_name.blank?
      field_name = field_name.to_s

      # If our CSV interpreter is feeding us the headers as a line, ignore it.
      next if field_name == value

      # Check if they are using the 'field_name.authority' syntax.
      authority = nil
      if ((split = field_name.split('.')).count == 2)
        authority = split.last
        field_name = split.first
      end

      # Get the field name, if this column is a metadata field.
      field_name_norm = find_field_name(field_name)
      field = schema.get_field(field_name_norm)

      # Ignore anything that isn't a controlled field.
      next unless field.present? && field.controlled?

      # Keep track of label fields.
      if field_name.downcase.ends_with?("label")
        next if @options["ignore_labels"]
        labels[field_name_norm] ||= []
        labels[field_name_norm] += BulkOps::Parser.split_values value
        next unless @options["import_labels"]
      end

      # Handle multiple values in one cell.
      value_array = BulkOps::Parser.split_values(value)
      controlled_data[field_name_norm] ||= [] unless value_array.blank?
      # `val` (not `value`) avoids shadowing the outer block parameter.
      value_array.each do |val|
        # Decide if we're dealing with a label or a url: it's an ID when
        # it's a URL and the column name doesn't end in 'label'.
        val.strip!
        if val =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
          value_id = val
        else
          # It's a label, so unescape it and resolve it to an id, minting
          # a local authority entry if no remote/local match exists.
          val = BulkOps::Parser.unescape_csv(val)
          value_id = get_remote_id(val, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, val)
          report_error(:cannot_retrieve_url,
                       message: "cannot find or create url for controlled vocabulary label: #{val}",
                       url: val,
                       row_number: row_number) unless value_id
        end
        atts = { id: value_id }
        # "remove foo" / "delete foo" columns mark the value for destruction.
        atts[:_destroy] = true if (field_name.downcase.starts_with?("remove") or field_name.downcase.starts_with?("delete"))
        controlled_data[field_name_norm] << atts
      end
    end

    # Actually add all the data to @metadata.
    controlled_data.each do |property_name, data|
      @metadata["#{property_name}_attributes"] ||= [] unless data.blank?
      data.uniq.each do |datum|
        # Drop any pre-existing entry with the same id so the last one wins.
        @metadata["#{property_name}_attributes"].reject! { |val| val[:id] == datum[:id] }
        @metadata["#{property_name}_attributes"] << datum
      end
    end
  end

  private

  # Resolve a label to a local-authority URL, minting a new entry when
  # none matches. Returns the label unchanged when the field has no
  # local authority configured.
  def localAuthUrl(property, value)
    return value if (auth = getLocalAuth(property)).nil?
    findAuthUrl(auth, value) || mintLocalAuthUrl(auth, value)
  end

  # Look for an exact-match label in the given local QA subauthority.
  # Returns the matching entry's url (or id), or nil when none matches.
  def findAuthUrl(auth, value)
    value.strip!
    return nil if auth.nil?
    return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
    entries.each do |entry|
      # Require an exact (encoding-normalized) match.
      next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
      return entry["url"] || entry["id"]
    end
    nil
  end

  # Build the public "show" URL for a local authority entry id,
  # environment-dependent host included.
  def localIdToUrl(id, auth_name)
    root_urls = { 'production'  => "https://digitalcollections.library.ucsc.edu",
                  'staging'     => "http://digitalcollections-staging.library.ucsc.edu",
                  'sandbox'     => "http://digitalcollections-staging-sandbox.library.ucsc.edu",
                  'development' => "http://#{Socket.gethostname}",
                  'test'        => "http://#{Socket.gethostname}" }
    "#{root_urls[Rails.env.to_s]}/authorities/show/local/#{auth_name}/#{id}"
  end

  # Create a local authority entry for this label and return its URL.
  def mintLocalAuthUrl(auth_name, value)
    value.strip!
    id = value.parameterize
    auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
    Qa::LocalAuthorityEntry.create(local_authority: auth,
                                   label: value,
                                   uri: id)
    localIdToUrl(id, auth_name)
  end

  # Return the first local subauthority configured for this field, or nil.
  # There is only ever one local authority per field, so the first hit wins.
  def getLocalAuth(field_name)
    field = schema.get_property(field_name)
    if (vocs = field.vocabularies)
      vocs.each do |voc|
        return voc["subauthority"] if voc["authority"].downcase == "local"
      end
    end
    nil
  end

  # TODO: retrieve the URL for this value from the specified remote authority.
  # Currently always returns false so callers fall back to local authorities.
  def get_remote_id(value, authority: nil, property: nil)
    false
  end

end
@@ -0,0 +1,82 @@
1
module BulkOps::InterpretFilesBehavior
  extend ActiveSupport::Concern

  # Handle file additions and deletions from the spreadsheet row.
  #
  # If additional files need to be deleted because the update is set to
  # replace some or all existing files, those replacement-related
  # deletions are handled by the BulkOps::Operation.
  #
  # Fixes from the previous version:
  # * the "remove" branch called delete_file_set(file_set_id) while the
  #   block parameter was named fileset_id — a guaranteed NameError;
  # * the upload rescue referenced an undefined `filename` variable,
  #   raising NameError from inside the error handler itself;
  # * `rescue Exception` narrowed to StandardError so signals and
  #   SystemExit are no longer swallowed.
  def interpret_file_fields
    @raw_row.each do |field, value|
      next if value.blank? or field.blank?
      field = field.to_s
      # If our CSV interpreter is feeding us the headers as a line, ignore it.
      next if field == value

      # Check if this is a file field, and whether we are removing or adding a file.
      next unless (action = BulkOps::Verification.is_file_field?(field))

      # Move on if this field is the name of another property (e.g. masterFilename).
      next if find_field_name(field)

      if action == "remove"
        get_removed_filesets(value).each { |fileset_id| delete_file_set(fileset_id) }
      else
        # Add a file.
        operation.get_file_paths(value).each do |filepath|
          begin
            uploaded_file = Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
            (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
          rescue StandardError => e
            report_error(:upload_error,
                         message: "Error opening file: #{filepath} -- #{e}",
                         file: filepath,
                         row_number: row_number)
          end
        end
      end

      # Check if any of the upcoming rows are child filesets.
      i = 1
      while self.class.is_file_set?(@metadata, row_number + i)
        # NOTE(review): `child_row` is not defined anywhere in this module,
        # so this loop raises NameError as soon as a child fileset row is
        # found. It presumably should fetch the row at row_number + i —
        # confirm against the caller and fix.
        child_row.each do |child_field, child_value|
          next if child_value.blank?
          title = child_value if ["title", "label"].include?(child_field.downcase.strip)
          if BulkOps::Verification.is_file_field?(child_field)
            operation.get_file_paths(child_value).each do |filepath|
              Hyrax::UploadedFile.create(file: File.open(filepath), user: operation.user)
            end
          end
        end
        i += 1
      end

    end
  end

  private

  # Filter a delimited cell of fileset ids down to those that actually
  # exist in Solr.
  # TODO: also accept filenames (resolving a fileset's label to its id);
  # the previous version carried a non-working sketch of that feature.
  def get_removed_filesets(filestring)
    file_ids = BulkOps::Parser.split_values(filestring)
    file_ids.select { |file_id| BulkOps::SolrService.record_exists?(file_id) }
  end

  # Queue an async deletion of a single fileset on behalf of the
  # operation's user.
  def delete_file_set(fileset_id)
    BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email)
  end

end
@@ -0,0 +1,59 @@
1
module BulkOps::InterpretOptionsBehavior
  extend ActiveSupport::Concern

  # Normalized column headers that designate a reference-identifier option.
  REFERENCE_ID_FIELDS = %w[referenceidentifier referenceid refid
                           referenceidentifiertype referenceidtype refidtype
                           relationshipidentifier relationshipid
                           relationshipidentifiertype relationshipidtype
                           relid relidtype].freeze

  # Scan @raw_row for option columns (visibility, work type, reference
  # identifier) and persist any recognized values onto @proxy.
  def interpret_option_fields
    @raw_row.each do |field, value|
      next if value.blank? or field.blank?
      field = field.to_s
      # If our CSV interpreter is feeding us the headers as a line, ignore it.
      next if value == field

      # Normalize the header: lowercase, no separators.
      normfield = field.downcase.parameterize.gsub(/[_\s-]/, '')
      case normfield
      when "visibility", "public"
        @proxy.update(visibility: format_visibility(value))
      when "worktype", "model", "type"
        @proxy.update(work_type: format_worktype(value))
      when *REFERENCE_ID_FIELDS
        @proxy.update(reference_identifier: format_reference_id(value))
      end
    end
  end

  private

  # Normalize a spreadsheet value into a work class name, falling back
  # to the row/operation work type (or plain "Work") when the value is
  # not the name of a defined class.
  def format_worktype(value)
    # Format the value like a class name.
    type = value.titleize.gsub(/[-_\s]/, '')
    # Reject it unless it is a well-formed, defined constant name.
    # (Object.const_defined? RAISES NameError on strings that aren't
    # constant-shaped — e.g. "3dObject" — so guard the shape first.)
    type = false unless type.match?(/\A[A-Z]\w*\z/) && Object.const_defined?(type)
    # Fall back to the work type defined by the operation, or a standard "Work".
    type || work_type || operation.work_type || "Work"
  end

  # Map free-form visibility strings onto visibility keys.
  # Returns nil for unrecognized values (matching previous behavior).
  def format_visibility(value)
    case value.downcase
    when "public", "open", "true"
      "open"
    when "campus", "ucsc", "institution"
      "ucsc"
    when "restricted", "private", "closed", "false"
      "restricted"
    end
  end

end
+ end