bulk_ops 0.1.23 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb +14 -0
- data/lib/bulk_ops.rb +3 -2
- data/lib/bulk_ops/apply_operation_job.rb +8 -0
- data/lib/bulk_ops/create_work_job.rb +1 -1
- data/lib/bulk_ops/github_access.rb +1 -1
- data/lib/bulk_ops/operation.rb +57 -49
- data/lib/bulk_ops/parser.rb +50 -414
- data/lib/bulk_ops/resolve_children_job.rb +14 -0
- data/lib/bulk_ops/solr_service.rb +13 -0
- data/lib/bulk_ops/update_work_job.rb +1 -1
- data/lib/bulk_ops/verification.rb +2 -10
- data/lib/bulk_ops/version.rb +1 -1
- data/lib/bulk_ops/work_job.rb +20 -13
- data/lib/bulk_ops/work_proxy.rb +18 -2
- data/lib/concerns/interpret_controlled_behavior.rb +140 -0
- data/lib/concerns/interpret_files_behavior.rb +82 -0
- data/lib/concerns/interpret_options_behavior.rb +59 -0
- data/lib/concerns/interpret_relationships_behavior.rb +123 -0
- data/lib/concerns/interpret_scalar_behavior.rb +21 -0
- data/lib/concerns/search_builder_behavior.rb +80 -0
- metadata +12 -3
- data/lib/bulk_ops/relationship.rb +0 -117
@@ -0,0 +1,123 @@
|
|
1
|
+
module BulkOps::InterpretRelationshipsBehavior
|
2
|
+
extend ActiveSupport::Concern
|
3
|
+
|
4
|
+
# Interprets the relationship columns of the current spreadsheet row.
#
# Walks every header/cell pair in @raw_row. Headers that the class-level
# normalize_relationship_field_name recognises are handled as follows:
#   "order"      - the cell, converted to a float, is saved as the proxy's order
#   "collection" - the collection is found or created and its id is appended
#                  to @metadata[:member_of_collection_ids]
#   "parent"     - the parent proxy is resolved and saved as parent_id on
#                  @proxy, together with previous_sibling_id when the parent
#                  already has ordered children and none was set yet
# Any other relationship type is ignored by this method.
#
# @return the result of @raw_row.each
def interpret_relationship_fields
  @raw_row.each do |field, value|
    next if value.blank? || field.blank? || value == field

    # The default identifier type is the reference identifier of the proxy
    id_type = reference_identifier

    # Correctly interpret the notation "parent:id", "parent id" etc in a column header
    relation_name = field
    header_parts = field.split(/[:_\-\s]/)
    if header_parts.count == 2
      relation_name = header_parts.first
      id_type = header_parts.last
    end

    # Skip to the next field unless it's a known relationship field
    relationship_type = self.class.normalize_relationship_field_name(relation_name)
    next unless relationship_type

    case relationship_type
    when "order"
      # The field specifies the object's order among siblings
      @proxy.update(order: value.to_f)
    when "collection"
      # The field specifies the name or ID of a collection:
      # find or create the collection and update the metadata to match
      col = find_or_create_collection(value)
      (@metadata[:member_of_collection_ids] ||= []) << col.id if col
    when "parent"
      # Correctly interpret the notation "row:349", "row 349", "id:s8df4j32w" etc in a cell.
      # The separator class must be [:_\s]; a doubled backslash would match a
      # literal backslash and the letter "s" rather than whitespace.
      cell_parts = value.split(/[:_\s]/)
      if cell_parts.count == 2
        id_type = cell_parts.first
        value = cell_parts.last
      end

      # NOTE(review): the shortened header (e.g. "parent" from "parent:id") is
      # passed on, and find_previous_parent_row uses it as a key into the raw
      # rows - confirm that this is the key the raw rows actually carry.
      parent = find_parent_proxy(value, relation_name, id_type)
      # find_parent_proxy answers false/nil when the parent cannot be resolved;
      # there is nothing to attach to in that case.
      next unless parent

      proxy_updates = { parent_id: parent.id }
      siblings = parent.ordered_children
      if siblings.present? && @proxy.previous_sibling_id.nil?
        proxy_updates[:previous_sibling_id] = siblings.last.id
      end
      @proxy.update(proxy_updates)
    end
  end
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
# Returns the row number of the most recent preceding row that does not
# itself have a value in the given parent column.
#
# The search walks backwards from the row just before row_number and stops at
# index 0 or at the first missing row. It never follows negative indices,
# which would otherwise wrap around to the end of an Array and could yield a
# negative "row number".
#
# @param field [String] key of the parent column in each raw row
# @return [Integer, nil] the matching row index, or nil when there is none
def find_previous_parent_row(field = "parent")
  (row_number - 1).downto(0) do |index|
    prev_row = raw_data[index]
    # Stop at the first gap, as the original loop did
    break unless prev_row
    return index if prev_row[field].blank?
  end
  nil
end
|
59
|
+
|
60
|
+
# Resolves the work proxy that a "parent" cell refers to.
#
# The id_type determines what kind of identifier is expected in parent_id:
#   "id"       - id of an existing work. Answers false when the work is not in
#                Solr. Otherwise answers that work's proxy, preferring one from
#                the current operation, and creates a placeholder proxy when
#                none exists.
#   "proxy_id" - primary key of a BulkOps::WorkProxy
#   "row"      - spreadsheet row: a negative integer counts back from the
#                current row, a positive integer is an absolute row (minus
#                BulkOps::ROW_OFFSET), and any text containing "prev" means
#                the closest preceding row that has no parent of its own
# Lookup by title or by identifier is not supported.
#
# @param parent_id [String, Integer] identifier taken from the cell
# @param field [String] parent column name, used for the "prev" lookup
# @param id_type [String] one of the identifier types above (case-insensitive)
# @return the parent proxy; false or nil when it cannot be resolved
def find_parent_proxy(parent_id, field, id_type)
  case id_type.to_s.downcase
  when "id"
    work_id = parent_id.to_s
    # Expect a reference to an existing work in the DAMS
    return false unless BulkOps::SolrService.record_exists?(work_id)

    # Pull the work proxy for that work, if it exists
    parent_proxy = BulkOps::WorkProxy.find_by(work_id: work_id, operation_id: @proxy.operation.id) ||
                   BulkOps::WorkProxy.find_by(work_id: work_id)
    return parent_proxy if parent_proxy

    # No work proxy exists for this work: create one just to keep track of this task
    BulkOps::WorkProxy.create(status: "awaiting_children",
                              operation_id: 0,
                              last_event: DateTime.now,
                              work_id: work_id)
  when "proxy_id"
    BulkOps::WorkProxy.find(parent_id)
  when "row"
    if parent_id.to_s.match?(/\A[-+]?[0-9]+\z/)
      row = parent_id.to_i
      if row < 0
        # A negative integer counts backwards from the current row (row is negative here)
        parent_id = @proxy.row_number.to_i + row
      elsif row > 0
        # A positive integer is an absolute row; just remove the row offset
        parent_id = row - BulkOps::ROW_OFFSET
      end
    elsif parent_id.to_s.downcase.include?("prev")
      # Any variation of the word "previous": the first preceding row with no parent of its own
      parent_id = find_previous_parent_row(field)
    end

    BulkOps::WorkProxy.find_by(operation_id: @proxy.operation_id,
                               row_number: parent_id.to_i)
  end
end
|
107
|
+
|
108
|
+
# Looks up an existing collection by title or by id.
#
# Candidates are gathered in increasing order of preference - title matches,
# then title matches whose first title equals the argument exactly, then id
# matches - and the last candidate wins.
#
# @param collection [String] collection title or id
# @return the matching collection, or false when nothing matches
def find_collection(collection)
  title_matches = Collection.where(title: collection).to_a
  exact_matches = title_matches.select { |col| col.title.first == collection }
  id_matches = Collection.where(id: collection).to_a
  (title_matches + exact_matches + id_matches).last || false
end
|
117
|
+
|
118
|
+
# Answers the collection matching the given title or id, creating a new
# collection when find_collection comes back empty-handed.
#
# A new collection is titled with the string form of the argument, deposited
# under the operation user's email, and given the collection type titled
# "User Collection".
#
# @param collection [String] collection title or id
# @return the existing or newly created collection
def find_or_create_collection(collection)
  existing = find_collection(collection)
  return existing if existing

  attributes = {
    title: [collection.to_s],
    depositor: operation.user.email,
    collection_type: Hyrax::CollectionType.find_by(title:"User Collection")
  }
  Collection.create(attributes)
end
|
121
|
+
|
122
|
+
|
123
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module BulkOps::InterpretScalarBehavior
|
2
|
+
extend ActiveSupport::Concern
|
3
|
+
|
4
|
+
# Copies the plain (non-controlled) metadata columns of the current row into
# @metadata.
#
# For each header/cell pair in @raw_row the header is resolved to a metadata
# field name; columns that do not resolve, or whose schema field is
# controlled, are skipped. The cell is split into individual values, and each
# non-blank value is stripped, re-encoded as UTF-8 (invalid or undefined
# characters become "_"), CSV-unescaped and appended to @metadata[field_name].
def interpret_scalar_fields
  @raw_row.each do |column, cell|
    next if cell.blank? || column.nil? || column == cell

    # Get the field name, if this column is a metadata field
    field_name = find_field_name(column.to_s)
    next unless field_name

    # Ignore controlled fields
    next if schema.get_field(field_name).controlled?

    BulkOps::Parser.split_values(cell).each do |raw_value|
      next if raw_value.blank?

      cleaned = raw_value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
      unescaped = BulkOps::Parser.unescape_csv(cleaned)
      (@metadata[field_name] ||= []) << unescaped
    end
  end
end
|
20
|
+
|
21
|
+
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module BulkOps::SearchBuilderBehavior
|
2
|
+
extend ActiveSupport::Concern
|
3
|
+
included do
  # Filter values exposed to callers
  attr_reader :collection, :admin_set, :workflow_state

  # Solr field names used by the filter methods, overridable per class.
  # workflow_state_id_field is declared but given no default here.
  class_attribute :collection_field, :collection_id_field,
                  :admin_set_field, :admin_set_id_field,
                  :workflow_state_field, :workflow_state_id_field,
                  :keyword_field

  self.collection_field = 'member_of_collections_ssim'
  self.collection_id_field = 'member_of_collection_ids_ssim'
  self.admin_set_field = 'admin_set_tesim'
  self.admin_set_id_field = 'isPartOf_ssim'
  self.workflow_state_field = 'workflow_state_name_ssim'
  self.keyword_field = 'all_fields'

  # Append this module's filters to the host builder's processor chain
  self.default_processor_chain += %i[member_of_collection
                                     member_of_admin_set
                                     in_workflow_state
                                     with_keyword_query]
end
|
26
|
+
|
27
|
+
# Builds the search builder and remembers the requested filters.
#
# Each filter is stored in the instance variable of the same name, but only
# when a non-blank value was given; the scope is handed on to the superclass.
#
# @param scope typically the controller object
# @param collection value matched against collection_field
# @param collection_id value matched against collection_id_field
# @param admin_set value matched against admin_set_field
# @param admin_set_id value matched against admin_set_id_field
# @param workflow_state value matched against workflow_state_field
# @param keyword_query free-text query appended to the Solr q parameter
def initialize(scope: {},
               collection: nil,
               collection_id: nil,
               admin_set: nil,
               admin_set_id: nil,
               workflow_state: nil,
               keyword_query: nil)
  requested_filters = {
    collection: collection,
    admin_set: admin_set,
    admin_set_id: admin_set_id,
    workflow_state: workflow_state,
    collection_id: collection_id,
    keyword_query: keyword_query
  }
  requested_filters.each do |name, filter_value|
    instance_variable_set("@#{name}", filter_value) unless filter_value.blank?
  end
  super(scope)
end
|
45
|
+
|
46
|
+
# Model classes this builder works with.
#
# @return [Array<Class>] Work, Course and Lecture
def models
  [Work, Course, Lecture]
end
|
49
|
+
|
50
|
+
# Adds filter queries that restrict results to members of the requested
# collection, by collection value and/or by collection id, whichever was given.
#
# @param solr_parameters [Hash] Solr parameters being assembled; :fq is
#   created when absent and appended to in place
def member_of_collection(solr_parameters)
  filter_queries = (solr_parameters[:fq] ||= [])
  filter_queries << "#{collection_field}:#{@collection}" if @collection
  filter_queries << "#{collection_id_field}:#{@collection_id}" if @collection_id
end
|
56
|
+
|
57
|
+
# Adds filter queries that restrict results to the requested admin set,
# by admin set value and/or by admin set id, whichever was given.
#
# @param solr_parameters [Hash] Solr parameters being assembled; :fq is
#   created when absent and appended to in place
def member_of_admin_set(solr_parameters)
  filter_queries = (solr_parameters[:fq] ||= [])
  filter_queries << "#{admin_set_field}:#{@admin_set}" if @admin_set
  filter_queries << "#{admin_set_id_field}:#{@admin_set_id}" if @admin_set_id
end
|
63
|
+
|
64
|
+
# Adds a filter query that restricts results to the requested workflow state,
# when one was given.
#
# @param solr_parameters [Hash] Solr parameters being assembled; :fq is
#   created when absent and appended to in place
def in_workflow_state(solr_parameters)
  filter_queries = (solr_parameters[:fq] ||= [])
  filter_queries << "#{workflow_state_field}:#{@workflow_state}" if @workflow_state
end
|
69
|
+
|
70
|
+
# Applies the keyword query, when one was given.
#
# The keyword is appended to the :q parameter (created as an empty array when
# absent) and :qf is set to the fixed list of fields searched. keyword_field
# is not consulted here.
#
# @param solr_parameters [Hash] Solr parameters being assembled, changed in place
# @return [Hash] the same solr_parameters object
def with_keyword_query(solr_parameters)
  return solr_parameters unless @keyword_query

  solr_parameters[:q] ||= []
  solr_parameters[:q] << @keyword_query
  solr_parameters[:qf] = "title_tesim titleAlternative_tesim subseries_tesim creator_label_tesim contributor_label_tesim originalPublisher_tesim publisher_tesim publisherHomepage_tesim resourceType_label_tesim rightsHolder_label_tesim scale_tesim series_tesim source_tesim staffNote_tesim coordinates_tesim subjectName_label_tesim subjectPlace_label_tesim subjectTemporal_label_tesim subjectTopic_label_tesim dateCreated_tesim dateCreatedDisplay_tesim dateDigitized_tesim datePublished_tesim description_tesim physicalFormat_label_tesim keyword_tesim language_label_tesim license_tesim masterFilename_tesim physicalDescription_tesim accessRights_tesim itemCallNumber_tesim collectionCallNumber_tesim donorProvenance_tesim genre_label_tesim boxFolder_tesim subject_label_tesim file_format_tesim all_text_timv"
  solr_parameters
end
|
79
|
+
|
80
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulk_ops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ned Henry, UCSC Library Digital Initiatives
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -97,7 +97,9 @@ files:
|
|
97
97
|
- config/routes.rb
|
98
98
|
- db/migrate/20180926190757_create_github_credentials.rb
|
99
99
|
- db/migrate/20181017180436_create_bulk_ops_tables.rb
|
100
|
+
- db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb
|
100
101
|
- lib/bulk_ops.rb
|
102
|
+
- lib/bulk_ops/apply_operation_job.rb
|
101
103
|
- lib/bulk_ops/create_spreadsheet_job.rb
|
102
104
|
- lib/bulk_ops/create_work_job.rb
|
103
105
|
- lib/bulk_ops/delete_file_set_job.rb
|
@@ -108,8 +110,9 @@ files:
|
|
108
110
|
- lib/bulk_ops/operation.rb
|
109
111
|
- lib/bulk_ops/parser.rb
|
110
112
|
- lib/bulk_ops/queue_work_ingests_job.rb
|
111
|
-
- lib/bulk_ops/
|
113
|
+
- lib/bulk_ops/resolve_children_job.rb
|
112
114
|
- lib/bulk_ops/search_builder_behavior.rb
|
115
|
+
- lib/bulk_ops/solr_service.rb
|
113
116
|
- lib/bulk_ops/templates/configuration.yml
|
114
117
|
- lib/bulk_ops/templates/readme.md
|
115
118
|
- lib/bulk_ops/update_work_job.rb
|
@@ -118,6 +121,12 @@ files:
|
|
118
121
|
- lib/bulk_ops/version.rb
|
119
122
|
- lib/bulk_ops/work_job.rb
|
120
123
|
- lib/bulk_ops/work_proxy.rb
|
124
|
+
- lib/concerns/interpret_controlled_behavior.rb
|
125
|
+
- lib/concerns/interpret_files_behavior.rb
|
126
|
+
- lib/concerns/interpret_options_behavior.rb
|
127
|
+
- lib/concerns/interpret_relationships_behavior.rb
|
128
|
+
- lib/concerns/interpret_scalar_behavior.rb
|
129
|
+
- lib/concerns/search_builder_behavior.rb
|
121
130
|
- lib/generators/bulk_ops/install/install_generator.rb
|
122
131
|
- lib/generators/bulk_ops/install/templates/config/github.yml.example
|
123
132
|
homepage: http://UCSCLibrary.github.org
|
@@ -1,117 +0,0 @@
|
|
1
|
-
class BulkOps::Relationship < ActiveRecord::Base
|
2
|
-
RELATIONSHIP_FIELDS = ['parent','child','order','next','collection']
|
3
|
-
|
4
|
-
self.table_name = "bulk_ops_relationships"
|
5
|
-
belongs_to :work_proxy, class_name: "BulkOps::WorkProxy", foreign_key: "work_proxy_id"
|
6
|
-
delegate :operation, :operation_id, to: :work_proxy
|
7
|
-
|
8
|
-
def initialize *args
|
9
|
-
super *args
|
10
|
-
|
11
|
-
# Attempt to resolve the relationship immediately
|
12
|
-
# which might work in the case of updates
|
13
|
-
# resolve!
|
14
|
-
end
|
15
|
-
|
16
|
-
def findObject
|
17
|
-
case (identifier_type || "").downcase
|
18
|
-
when "id"
|
19
|
-
begin
|
20
|
-
object = ActiveFedora::Base.find(object_identifier)
|
21
|
-
rescue Ldp::Gone
|
22
|
-
return false
|
23
|
-
end
|
24
|
-
return object || false
|
25
|
-
when "title"
|
26
|
-
# TODO clean up solr query and add work type to it
|
27
|
-
query = "{!field f=title_tesim}#{object_identifier}"
|
28
|
-
objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,
|
29
|
-
params: { fq: query, rows: 100})["response"]["docs"]
|
30
|
-
if objects.present?
|
31
|
-
return ActiveFedora::Base.find(objects.first["id"])
|
32
|
-
elsif (relationship_type || "").downcase == "collection"
|
33
|
-
return Collection.create(title: [object_identifier])
|
34
|
-
else
|
35
|
-
return false
|
36
|
-
end
|
37
|
-
when "identifier"
|
38
|
-
query = "{!field f=identifier_tesim}#{object_identifier}"
|
39
|
-
objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,params: { fq: query, rows: 100})["response"]["docs"]
|
40
|
-
return false if objects.blank?
|
41
|
-
return ActiveFedora::Base.find(objects.first["id"])
|
42
|
-
when "row"
|
43
|
-
object_proxy = BulkOps::WorkProxy.find_by(operation_id: work_proxy.operation_id,
|
44
|
-
row_number: (object_identifier.to_i))
|
45
|
-
ActiveFedora::Base.find(object_proxy.work_id)
|
46
|
-
when "proxy_id"
|
47
|
-
return false unless (proxy = BulkOps::WorkProxy.find(proxy_id))
|
48
|
-
return false unless proxy.work_id.present?
|
49
|
-
ActiveFedora::Base.find(proxy.work_id)
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
def resolve!
|
54
|
-
unless subject = work_proxy.work and object = self.findObject
|
55
|
-
wait!
|
56
|
-
return
|
57
|
-
end
|
58
|
-
implement_relationship! relationship_type, subject, object
|
59
|
-
end
|
60
|
-
|
61
|
-
def insert_among_children(object,new_member)
|
62
|
-
return nil unless ["parent"].include?((relationship_type || "").downcase)
|
63
|
-
prev_sib_id = previous_sibling
|
64
|
-
# This is the id of the WorkProxy associate with the most recent sibling work
|
65
|
-
# that might be fully ingested. If is it not fully ingested, we will move on
|
66
|
-
# to the preceding sibling.
|
67
|
-
while prev_sib_id.present?
|
68
|
-
prev_sib_proxy = BulkOps::WorkProxy.find(prev_sib_id)
|
69
|
-
# Check if the previous sibling is fully ingested
|
70
|
-
# and get its index among its siblings (if it has been successfully attached to the parent)
|
71
|
-
prev_sib_index = object.ordered_member_ids.index(prev_sib_proxy.work_id) if prev_sib_proxy.work_id.present?
|
72
|
-
# Insert the new member among its siblings if we found the right place
|
73
|
-
return object.ordered_members.to_a.insert(prev_sib_index+1, new_member) if prev_sib_index.present?
|
74
|
-
# Otherwise, pull up the sibling's relationship field to check if it sibling has a sibling before it
|
75
|
-
sib_relationship = prev_sib_proxy.relationships.find{|rel| rel.findObject.id == object.id }
|
76
|
-
# If we can't find an ingested sibling among the ordered members,
|
77
|
-
# break this loop and make this work the first member.
|
78
|
-
break unless sib_relationship.present?
|
79
|
-
prev_sib_id = sib_relationship.previous_sibling
|
80
|
-
end
|
81
|
-
#If we never found an existing previous sibling already attached, put this one at the front
|
82
|
-
return [new_member]+object.ordered_members.to_a
|
83
|
-
end
|
84
|
-
|
85
|
-
def implement_relationship!(type,subject,object)
|
86
|
-
case (type || "").downcase
|
87
|
-
when "parent"
|
88
|
-
unless object.member_ids.include? subject.id
|
89
|
-
object.reload
|
90
|
-
object.save
|
91
|
-
object.ordered_members = insert_among_children(object, subject)
|
92
|
-
object.save
|
93
|
-
end
|
94
|
-
when "child"
|
95
|
-
#CAVEAT ordering not fully implemented in this case
|
96
|
-
unless subject.member_ids.include? object.id
|
97
|
-
subject.ordered_members << object
|
98
|
-
subject.save
|
99
|
-
end
|
100
|
-
when "order"
|
101
|
-
#TODO - implement this - related to ordering of filesets
|
102
|
-
|
103
|
-
end
|
104
|
-
update(status: "complete")
|
105
|
-
end
|
106
|
-
|
107
|
-
private
|
108
|
-
|
109
|
-
def fail!
|
110
|
-
update(status: "failed")
|
111
|
-
end
|
112
|
-
|
113
|
-
def wait!
|
114
|
-
update(status: "pending")
|
115
|
-
end
|
116
|
-
|
117
|
-
end
|