bulkrax 1.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/app/controllers/bulkrax/exporters_controller.rb +12 -4
- data/app/controllers/bulkrax/importers_controller.rb +22 -17
- data/app/factories/bulkrax/object_factory.rb +44 -61
- data/app/jobs/bulkrax/create_relationships_job.rb +187 -0
- data/app/jobs/bulkrax/delete_work_job.rb +6 -2
- data/app/jobs/bulkrax/export_work_job.rb +3 -1
- data/app/jobs/bulkrax/exporter_job.rb +1 -0
- data/app/jobs/bulkrax/{import_work_collection_job.rb → import_collection_job.rb} +2 -2
- data/app/jobs/bulkrax/importer_job.rb +16 -1
- data/app/matchers/bulkrax/application_matcher.rb +9 -6
- data/app/models/bulkrax/csv_collection_entry.rb +8 -6
- data/app/models/bulkrax/csv_entry.rb +139 -45
- data/app/models/bulkrax/entry.rb +19 -8
- data/app/models/bulkrax/exporter.rb +12 -5
- data/app/models/bulkrax/importer.rb +22 -5
- data/app/models/bulkrax/oai_entry.rb +5 -1
- data/app/models/bulkrax/rdf_entry.rb +16 -7
- data/app/models/bulkrax/xml_entry.rb +4 -0
- data/app/models/concerns/bulkrax/export_behavior.rb +2 -2
- data/app/models/concerns/bulkrax/file_factory.rb +2 -1
- data/app/models/concerns/bulkrax/has_matchers.rb +59 -16
- data/app/models/concerns/bulkrax/import_behavior.rb +35 -5
- data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +19 -0
- data/app/models/concerns/bulkrax/status_info.rb +4 -4
- data/app/parsers/bulkrax/application_parser.rb +59 -84
- data/app/parsers/bulkrax/bagit_parser.rb +12 -3
- data/app/parsers/bulkrax/csv_parser.rb +117 -62
- data/app/parsers/bulkrax/oai_dc_parser.rb +5 -2
- data/app/parsers/bulkrax/xml_parser.rb +5 -0
- data/app/views/bulkrax/exporters/_form.html.erb +1 -1
- data/app/views/bulkrax/exporters/show.html.erb +13 -1
- data/app/views/bulkrax/importers/_edit_form_buttons.html.erb +45 -14
- data/app/views/bulkrax/importers/edit.html.erb +2 -0
- data/app/views/bulkrax/importers/index.html.erb +15 -17
- data/app/views/bulkrax/importers/show.html.erb +6 -2
- data/config/locales/bulkrax.en.yml +1 -0
- data/db/migrate/20190731114016_change_importer_and_exporter_to_polymorphic.rb +5 -1
- data/db/migrate/20211004170708_change_bulkrax_statuses_error_message_column_type_to_text.rb +5 -0
- data/db/migrate/20211203195233_rename_children_counters_to_relationships.rb +6 -0
- data/lib/bulkrax/engine.rb +1 -1
- data/lib/bulkrax/version.rb +1 -1
- data/lib/bulkrax.rb +9 -17
- data/lib/generators/bulkrax/templates/bin/importer +17 -11
- data/lib/generators/bulkrax/templates/config/bulkrax_api.yml +3 -1
- data/lib/generators/bulkrax/templates/config/initializers/bulkrax.rb +7 -12
- metadata +13 -7
- data/app/jobs/bulkrax/child_relationships_job.rb +0 -128
@@ -12,6 +12,8 @@ module Bulkrax
|
|
12
12
|
raise CollectionsCreatedError unless collections_created?
|
13
13
|
@item = factory.run!
|
14
14
|
end
|
15
|
+
parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
|
16
|
+
child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
|
15
17
|
rescue RSolr::Error::Http, CollectionsCreatedError => e
|
16
18
|
raise e
|
17
19
|
rescue StandardError => e
|
@@ -22,7 +24,19 @@ module Bulkrax
|
|
22
24
|
return @item
|
23
25
|
end
|
24
26
|
|
25
|
-
def
|
27
|
+
def parent_jobs
|
28
|
+
self.parsed_metadata[related_parents_parsed_mapping].each do |parent_identifier|
|
29
|
+
CreateRelationshipsJob.perform_later(entry_identifier: self.identifier, parent_identifier: parent_identifier, importer_run: self.last_run)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def child_jobs
|
34
|
+
self.parsed_metadata[related_children_parsed_mapping].each do |child_identifier|
|
35
|
+
CreateRelationshipsJob.perform_later(entry_identifier: self.identifier, child_identifier: child_identifier, importer_run: self.last_run)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def find_collection_ids
|
26
40
|
self.collection_ids
|
27
41
|
end
|
28
42
|
|
@@ -57,15 +71,27 @@ module Bulkrax
|
|
57
71
|
end
|
58
72
|
|
59
73
|
def add_collections
|
60
|
-
return if
|
61
|
-
|
62
|
-
|
74
|
+
return if find_collection_ids.blank?
|
75
|
+
|
76
|
+
ActiveSupport::Deprecation.warn(
|
77
|
+
'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
|
78
|
+
' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
|
79
|
+
)
|
80
|
+
self.parsed_metadata['member_of_collections_attributes'] = {}
|
81
|
+
find_collection_ids.each_with_index do |c, i|
|
82
|
+
self.parsed_metadata['member_of_collections_attributes'][i.to_s] = { id: c }
|
83
|
+
end
|
63
84
|
end
|
64
85
|
|
65
86
|
def factory
|
87
|
+
ActiveSupport::Deprecation.warn(
|
88
|
+
'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
|
89
|
+
' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
|
90
|
+
)
|
66
91
|
@factory ||= Bulkrax::ObjectFactory.new(attributes: self.parsed_metadata,
|
67
92
|
source_identifier_value: identifier,
|
68
93
|
work_identifier: parser.work_identifier,
|
94
|
+
collection_field_mapping: parser.collection_field_mapping,
|
69
95
|
replace_files: replace_files,
|
70
96
|
user: user,
|
71
97
|
klass: factory_class,
|
@@ -80,7 +106,11 @@ module Bulkrax
|
|
80
106
|
else
|
81
107
|
Bulkrax.default_work_type
|
82
108
|
end
|
83
|
-
|
109
|
+
|
110
|
+
# return the name of the collection or work
|
111
|
+
fc.tr!(' ', '_')
|
112
|
+
fc.downcase! if fc.match?(/[-_]/)
|
113
|
+
fc.camelcase.constantize
|
84
114
|
rescue NameError
|
85
115
|
nil
|
86
116
|
rescue
|
@@ -25,10 +25,29 @@ module Bulkrax
|
|
25
25
|
if collection
|
26
26
|
current_run.total_collection_entries = index + 1 unless parser.collections_total.positive?
|
27
27
|
else
|
28
|
+
# TODO: differentiate between work and collection counts for exporters
|
28
29
|
current_run.total_work_entries = index + 1 unless limit.to_i.positive? || parser.total.positive?
|
29
30
|
end
|
30
31
|
current_run.enqueued_records = index + 1
|
31
32
|
current_run.save!
|
32
33
|
end
|
34
|
+
|
35
|
+
def keys_without_numbers(keys)
|
36
|
+
keys.map { |key| key_without_numbers(key) }
|
37
|
+
end
|
38
|
+
|
39
|
+
def key_without_numbers(key)
|
40
|
+
key.gsub(/_\d+/, '').sub(/^\d+_/, '')
|
41
|
+
end
|
42
|
+
|
43
|
+
# Is this a file?
|
44
|
+
def file?
|
45
|
+
parser_fields&.[]('import_file_path') && File.file?(parser_fields['import_file_path'])
|
46
|
+
end
|
47
|
+
|
48
|
+
# Is this a zip file?
|
49
|
+
def zip?
|
50
|
+
parser_fields&.[]('import_file_path') && MIME::Types.type_for(parser_fields['import_file_path']).include?('application/zip')
|
51
|
+
end
|
33
52
|
end
|
34
53
|
end
|
@@ -33,13 +33,13 @@ module Bulkrax
|
|
33
33
|
current_status&.created_at
|
34
34
|
end
|
35
35
|
|
36
|
-
def status_info(e = nil)
|
36
|
+
def status_info(e = nil, current_run = nil)
|
37
37
|
if e.nil?
|
38
|
-
self.statuses.create!(status_message: 'Complete', runnable: last_run)
|
38
|
+
self.statuses.create!(status_message: 'Complete', runnable: current_run || last_run)
|
39
39
|
elsif e.is_a?(String)
|
40
|
-
self.statuses.create!(status_message: e, runnable: last_run)
|
40
|
+
self.statuses.create!(status_message: e, runnable: current_run || last_run)
|
41
41
|
else
|
42
|
-
self.statuses.create!(status_message: 'Failed', runnable: last_run, error_class: e.class.to_s, error_message: e.message, error_backtrace: e.backtrace)
|
42
|
+
self.statuses.create!(status_message: 'Failed', runnable: current_run || last_run, error_class: e.class.to_s, error_message: e.message, error_backtrace: e.backtrace)
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -1,15 +1,15 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Bulkrax
|
4
|
-
class ApplicationParser
|
5
|
-
attr_accessor :importerexporter
|
4
|
+
class ApplicationParser # rubocop:disable Metrics/ClassLength
|
5
|
+
attr_accessor :importerexporter, :headers
|
6
6
|
alias importer importerexporter
|
7
7
|
alias exporter importerexporter
|
8
|
-
delegate :only_updates, :limit, :current_run, :errors,
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
delegate :only_updates, :limit, :current_run, :errors, :mapping,
|
9
|
+
:seen, :increment_counters, :parser_fields, :user, :keys_without_numbers,
|
10
|
+
:key_without_numbers, :status, :status_info, :status_at,
|
11
|
+
:exporter_export_path, :exporter_export_zip_path, :importer_unzip_path, :validate_only,
|
12
|
+
to: :importerexporter
|
13
13
|
|
14
14
|
def self.parser_fields
|
15
15
|
{}
|
@@ -25,6 +25,7 @@ module Bulkrax
|
|
25
25
|
|
26
26
|
def initialize(importerexporter)
|
27
27
|
@importerexporter = importerexporter
|
28
|
+
@headers = []
|
28
29
|
end
|
29
30
|
|
30
31
|
# @api
|
@@ -43,20 +44,54 @@ module Bulkrax
|
|
43
44
|
end
|
44
45
|
|
45
46
|
def source_identifier
|
46
|
-
@source_identifier ||=
|
47
|
+
@source_identifier ||= get_field_mapping_hash_for('source_identifier')&.values&.first&.[]('from')&.first&.to_sym || :source_identifier
|
47
48
|
end
|
48
49
|
|
49
50
|
def work_identifier
|
50
|
-
@work_identifier ||=
|
51
|
+
@work_identifier ||= get_field_mapping_hash_for('source_identifier')&.keys&.first&.to_sym || :source
|
51
52
|
end
|
52
53
|
|
53
|
-
def
|
54
|
-
@
|
55
|
-
|
56
|
-
|
57
|
-
|
54
|
+
def related_parents_raw_mapping
|
55
|
+
@related_parents_raw_mapping ||= get_field_mapping_hash_for('related_parents_field_mapping')&.values&.first&.[]('from')&.first
|
56
|
+
end
|
57
|
+
|
58
|
+
def related_parents_parsed_mapping
|
59
|
+
@related_parents_parsed_mapping ||= get_field_mapping_hash_for('related_parents_field_mapping')&.keys&.first
|
60
|
+
end
|
61
|
+
|
62
|
+
def related_children_raw_mapping
|
63
|
+
@related_children_raw_mapping ||= get_field_mapping_hash_for('related_children_field_mapping')&.values&.first&.[]('from')&.first
|
64
|
+
end
|
65
|
+
|
66
|
+
def related_children_parsed_mapping
|
67
|
+
@related_children_parsed_mapping ||= get_field_mapping_hash_for('related_children_field_mapping')&.keys&.first
|
68
|
+
end
|
69
|
+
|
70
|
+
def get_field_mapping_hash_for(key)
|
71
|
+
return instance_variable_get("@#{key}_hash") if instance_variable_get("@#{key}_hash").present?
|
72
|
+
|
73
|
+
instance_variable_set(
|
74
|
+
"@#{key}_hash",
|
75
|
+
importerexporter.mapping.with_indifferent_access.select { |_, h| h.key?(key) }
|
76
|
+
)
|
77
|
+
raise StandardError, "more than one #{key} declared: #{instance_variable_get("@#{key}_hash").keys.join(', ')}" if instance_variable_get("@#{key}_hash").length > 1
|
58
78
|
|
59
|
-
|
79
|
+
instance_variable_get("@#{key}_hash")
|
80
|
+
end
|
81
|
+
|
82
|
+
def collection_field_mapping
|
83
|
+
ActiveSupport::Deprecation.warn(
|
84
|
+
'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
|
85
|
+
' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
|
86
|
+
)
|
87
|
+
Bulkrax.collection_field_mapping[self.entry_class.to_s]&.to_sym || :collection
|
88
|
+
end
|
89
|
+
|
90
|
+
def model_field_mappings
|
91
|
+
model_mappings = Bulkrax.field_mappings[self.class.to_s]&.dig('model', :from) || []
|
92
|
+
model_mappings |= ['model']
|
93
|
+
|
94
|
+
model_mappings
|
60
95
|
end
|
61
96
|
|
62
97
|
def perform_method
|
@@ -91,76 +126,19 @@ module Bulkrax
|
|
91
126
|
path
|
92
127
|
end
|
93
128
|
|
129
|
+
# Base path for imported and exported files
|
130
|
+
def base_path(type = 'import')
|
131
|
+
ENV['HYKU_MULTITENANT'] ? File.join(Bulkrax.send("#{type}_path"), Site.instance.account.name) : Bulkrax.send("#{type}_path")
|
132
|
+
end
|
133
|
+
|
94
134
|
# Path where we'll store the import metadata and files
|
95
135
|
# this is used for uploaded and cloud files
|
96
136
|
def path_for_import
|
97
|
-
@path_for_import = File.join(
|
137
|
+
@path_for_import = File.join(base_path, importerexporter.path_string)
|
98
138
|
FileUtils.mkdir_p(@path_for_import) unless File.exist?(@path_for_import)
|
99
139
|
@path_for_import
|
100
140
|
end
|
101
141
|
|
102
|
-
# Optional, only used by certain parsers
|
103
|
-
# Other parsers should override with a custom or empty method
|
104
|
-
# Will be skipped unless the #record is a Hash
|
105
|
-
def create_parent_child_relationships
|
106
|
-
parents.each do |key, value|
|
107
|
-
parent = entry_class.where(
|
108
|
-
identifier: key,
|
109
|
-
importerexporter_id: importerexporter.id,
|
110
|
-
importerexporter_type: 'Bulkrax::Importer'
|
111
|
-
).first
|
112
|
-
|
113
|
-
# not finding the entries here indicates that the given identifiers are incorrect
|
114
|
-
# in that case we should log that
|
115
|
-
children = value.map do |child|
|
116
|
-
entry_class.where(
|
117
|
-
identifier: child,
|
118
|
-
importerexporter_id: importerexporter.id,
|
119
|
-
importerexporter_type: 'Bulkrax::Importer'
|
120
|
-
).first
|
121
|
-
end.compact.uniq
|
122
|
-
|
123
|
-
if parent.present? && (children.length != value.length)
|
124
|
-
# Increment the failures for the number we couldn't find
|
125
|
-
# Because all of our entries have been created by now, if we can't find them, the data is wrong
|
126
|
-
Rails.logger.error("Expected #{value.length} children for parent entry #{parent.id}, found #{children.length}")
|
127
|
-
break if children.empty?
|
128
|
-
Rails.logger.warn("Adding #{children.length} children to parent entry #{parent.id} (expected #{value.length})")
|
129
|
-
end
|
130
|
-
parent_id = parent.id
|
131
|
-
child_entry_ids = children.map(&:id)
|
132
|
-
ChildRelationshipsJob.perform_later(parent_id, child_entry_ids, current_run.id)
|
133
|
-
end
|
134
|
-
rescue StandardError => e
|
135
|
-
status_info(e)
|
136
|
-
end
|
137
|
-
|
138
|
-
def parents
|
139
|
-
@parents ||= setup_parents
|
140
|
-
end
|
141
|
-
|
142
|
-
def setup_parents
|
143
|
-
pts = []
|
144
|
-
records.each do |record|
|
145
|
-
r = if record.respond_to?(:to_h)
|
146
|
-
record.to_h
|
147
|
-
else
|
148
|
-
record
|
149
|
-
end
|
150
|
-
next unless r.is_a?(Hash)
|
151
|
-
children = if r[:children].is_a?(String)
|
152
|
-
r[:children].split(/\s*[:;|]\s*/)
|
153
|
-
else
|
154
|
-
r[:children]
|
155
|
-
end
|
156
|
-
next if children.blank?
|
157
|
-
pts << {
|
158
|
-
r[source_identifier] => children
|
159
|
-
}
|
160
|
-
end
|
161
|
-
pts.blank? ? pts : pts.inject(:merge)
|
162
|
-
end
|
163
|
-
|
164
142
|
def setup_export_file
|
165
143
|
raise StandardError, 'must be defined' if exporter?
|
166
144
|
end
|
@@ -288,12 +266,9 @@ module Bulkrax
|
|
288
266
|
private
|
289
267
|
|
290
268
|
def real_import_file_path
|
291
|
-
if file? && zip?
|
292
|
-
|
293
|
-
|
294
|
-
else
|
295
|
-
parser_fields['import_file_path']
|
296
|
-
end
|
269
|
+
return importer_unzip_path if file? && zip?
|
270
|
+
|
271
|
+
parser_fields['import_file_path']
|
297
272
|
end
|
298
273
|
end
|
299
274
|
end
|
@@ -40,7 +40,7 @@ module Bulkrax
|
|
40
40
|
raise StandardError, 'No metadata files were found' if path.blank?
|
41
41
|
data = entry_class.read_data(path)
|
42
42
|
data = entry_class.data_for_entry(data, source_identifier)
|
43
|
-
data[:file] = bag.bag_files.join('|')
|
43
|
+
data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
|
44
44
|
data
|
45
45
|
end
|
46
46
|
end
|
@@ -58,7 +58,7 @@ module Bulkrax
|
|
58
58
|
collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
|
59
59
|
}
|
60
60
|
new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
|
61
|
-
|
61
|
+
ImportCollectionJob.perform_now(new_entry.id, current_run.id)
|
62
62
|
increment_counters(index, true)
|
63
63
|
end
|
64
64
|
end
|
@@ -83,13 +83,22 @@ module Bulkrax
|
|
83
83
|
end
|
84
84
|
|
85
85
|
def collections
|
86
|
-
|
86
|
+
ActiveSupport::Deprecation.warn(
|
87
|
+
'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
|
88
|
+
' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
|
89
|
+
)
|
90
|
+
records.map { |r| r[collection_field_mapping].split(/\s*[;|]\s*/) if r[collection_field_mapping].present? }.flatten.compact.uniq
|
87
91
|
end
|
88
92
|
|
89
93
|
def collections_total
|
90
94
|
collections.size
|
91
95
|
end
|
92
96
|
|
97
|
+
# TODO: change to differentiate between collection and work records when adding ability to import collection metadata
|
98
|
+
def works_total
|
99
|
+
total
|
100
|
+
end
|
101
|
+
|
93
102
|
def total
|
94
103
|
metadata_paths.count
|
95
104
|
end
|
@@ -2,31 +2,47 @@
|
|
2
2
|
|
3
3
|
require 'csv'
|
4
4
|
module Bulkrax
|
5
|
-
class CsvParser < ApplicationParser
|
5
|
+
class CsvParser < ApplicationParser # rubocop:disable Metrics/ClassLength
|
6
6
|
include ErroredEntries
|
7
7
|
def self.export_supported?
|
8
8
|
true
|
9
9
|
end
|
10
10
|
|
11
|
-
def
|
12
|
-
|
11
|
+
def records(_opts = {})
|
12
|
+
file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
|
13
|
+
# data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
|
14
|
+
csv_data = entry_class.read_data(file_for_import)
|
15
|
+
importer.parser_fields['total'] = csv_data.count
|
16
|
+
importer.save
|
17
|
+
@records ||= csv_data.map { |record_data| entry_class.data_for_entry(record_data, nil) }
|
13
18
|
end
|
14
19
|
|
15
20
|
def collections
|
16
|
-
|
17
|
-
|
21
|
+
ActiveSupport::Deprecation.warn(
|
22
|
+
'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
|
23
|
+
' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
|
24
|
+
)
|
18
25
|
# retrieve a list of unique collections
|
19
|
-
records.map
|
26
|
+
records.map do |r|
|
27
|
+
collections = []
|
28
|
+
r[collection_field_mapping].split(/\s*[;|]\s*/).each { |title| collections << { title: title } } if r[collection_field_mapping].present?
|
29
|
+
model_field_mappings.each do |model_mapping|
|
30
|
+
collections << r if r[model_mapping.to_sym]&.downcase == 'collection'
|
31
|
+
end
|
32
|
+
collections
|
33
|
+
end.flatten.compact.uniq
|
20
34
|
end
|
21
35
|
|
22
36
|
def collections_total
|
23
37
|
collections.size
|
24
38
|
end
|
25
39
|
|
26
|
-
def
|
27
|
-
|
28
|
-
|
29
|
-
|
40
|
+
def works
|
41
|
+
records - collections
|
42
|
+
end
|
43
|
+
|
44
|
+
def works_total
|
45
|
+
works.size
|
30
46
|
end
|
31
47
|
|
32
48
|
# We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data
|
@@ -44,8 +60,9 @@ module Bulkrax
|
|
44
60
|
end
|
45
61
|
|
46
62
|
def valid_import?
|
47
|
-
|
48
|
-
|
63
|
+
import_strings = keys_without_numbers(import_fields.map(&:to_s))
|
64
|
+
error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(import_strings).join(', ')}"
|
65
|
+
raise StandardError, error_alert unless required_elements?(import_strings)
|
49
66
|
|
50
67
|
file_paths.is_a?(Array)
|
51
68
|
rescue StandardError => e
|
@@ -56,26 +73,26 @@ module Bulkrax
|
|
56
73
|
def create_collections
|
57
74
|
collections.each_with_index do |collection, index|
|
58
75
|
next if collection.blank?
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
}
|
65
|
-
new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
|
66
|
-
ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
|
76
|
+
break if records.find_index(collection).present? && limit_reached?(limit, records.find_index(collection))
|
77
|
+
|
78
|
+
new_entry = find_or_create_entry(collection_entry_class, unique_collection_identifier(collection), 'Bulkrax::Importer', collection.to_h)
|
79
|
+
# TODO: add support for :delete option
|
80
|
+
ImportCollectionJob.perform_now(new_entry.id, current_run.id)
|
67
81
|
increment_counters(index, true)
|
68
82
|
end
|
83
|
+
importer.record_status
|
84
|
+
rescue StandardError => e
|
85
|
+
status_info(e)
|
69
86
|
end
|
70
87
|
|
71
88
|
def create_works
|
72
|
-
|
73
|
-
next unless record_has_source_identifier(
|
74
|
-
break if limit_reached?(limit,
|
89
|
+
works.each_with_index do |work, index|
|
90
|
+
next unless record_has_source_identifier(work, records.find_index(work))
|
91
|
+
break if limit_reached?(limit, records.find_index(work))
|
75
92
|
|
76
|
-
seen[
|
77
|
-
new_entry = find_or_create_entry(entry_class,
|
78
|
-
if
|
93
|
+
seen[work[source_identifier]] = true
|
94
|
+
new_entry = find_or_create_entry(entry_class, work[source_identifier], 'Bulkrax::Importer', work.to_h)
|
95
|
+
if work[:delete].present?
|
79
96
|
DeleteWorkJob.send(perform_method, new_entry, current_run)
|
80
97
|
else
|
81
98
|
ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
|
@@ -99,10 +116,6 @@ module Bulkrax
|
|
99
116
|
path
|
100
117
|
end
|
101
118
|
|
102
|
-
def create_parent_child_relationships
|
103
|
-
super
|
104
|
-
end
|
105
|
-
|
106
119
|
def extra_filters
|
107
120
|
output = ""
|
108
121
|
if importerexporter.start_date.present?
|
@@ -117,6 +130,8 @@ module Bulkrax
|
|
117
130
|
|
118
131
|
def current_work_ids
|
119
132
|
case importerexporter.export_from
|
133
|
+
when 'all'
|
134
|
+
ActiveFedora::SolrService.query("has_model_ssim:(#{Hyrax.config.curation_concerns.join(' OR ')}) #{extra_filters}", rows: 2_147_483_647).map(&:id)
|
120
135
|
when 'collection'
|
121
136
|
ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
|
122
137
|
when 'worktype'
|
@@ -126,9 +141,16 @@ module Bulkrax
|
|
126
141
|
complete_statuses = Bulkrax::Status.latest_by_statusable
|
127
142
|
.includes(:statusable)
|
128
143
|
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
|
129
|
-
complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier }
|
130
144
|
|
131
|
-
|
145
|
+
complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
|
146
|
+
extra_filters = extra_filters.presence || '*:*'
|
147
|
+
|
148
|
+
ActiveFedora::SolrService.get(
|
149
|
+
extra_filters.to_s,
|
150
|
+
fq: "#{work_identifier}_sim:(#{complete_entry_identifiers.join(' OR ')})",
|
151
|
+
fl: 'id',
|
152
|
+
rows: 2_000_000_000
|
153
|
+
)['response']['docs'].map { |obj| obj['id'] }
|
132
154
|
end
|
133
155
|
end
|
134
156
|
|
@@ -136,12 +158,18 @@ module Bulkrax
|
|
136
158
|
current_work_ids.each_with_index do |wid, index|
|
137
159
|
break if limit_reached?(limit, index)
|
138
160
|
new_entry = find_or_create_entry(entry_class, wid, 'Bulkrax::Exporter')
|
139
|
-
|
161
|
+
begin
|
162
|
+
entry = Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
|
163
|
+
rescue => e
|
164
|
+
Rails.logger.info("#{e.message} was detected during export")
|
165
|
+
end
|
166
|
+
self.headers |= entry.parsed_metadata.keys if entry
|
140
167
|
end
|
141
168
|
end
|
142
169
|
alias create_from_collection create_new_entries
|
143
170
|
alias create_from_importer create_new_entries
|
144
171
|
alias create_from_worktype create_new_entries
|
172
|
+
alias create_from_all create_new_entries
|
145
173
|
|
146
174
|
def entry_class
|
147
175
|
CsvEntry
|
@@ -154,19 +182,11 @@ module Bulkrax
|
|
154
182
|
# See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
|
155
183
|
# Changed to grep as wc -l counts blank lines, and ignores the final unescaped line (which may or may not contain data)
|
156
184
|
def total
|
157
|
-
if importer?
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
@total = `grep -vc ^$ #{real_import_file_path}`.to_i - 1 if @total < 1
|
163
|
-
elsif exporter?
|
164
|
-
@total = importerexporter.entries.count
|
165
|
-
else
|
166
|
-
@total = 0
|
167
|
-
end
|
168
|
-
return @total
|
169
|
-
rescue StandardErrorr
|
185
|
+
@total = importer.parser_fields['total'] || 0 if importer?
|
186
|
+
@total = importerexporter.entries.count if exporter?
|
187
|
+
|
188
|
+
return @total || 0
|
189
|
+
rescue StandardError
|
170
190
|
@total = 0
|
171
191
|
end
|
172
192
|
|
@@ -201,31 +221,58 @@ module Bulkrax
|
|
201
221
|
end
|
202
222
|
end
|
203
223
|
|
204
|
-
def
|
205
|
-
|
206
|
-
new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
|
224
|
+
def export_key_allowed(key)
|
225
|
+
new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
|
207
226
|
key != source_identifier.to_s
|
208
227
|
end
|
209
228
|
|
210
229
|
# All possible column names
|
211
230
|
def export_headers
|
212
|
-
headers =
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
231
|
+
headers = sort_headers(self.headers)
|
232
|
+
|
233
|
+
# we don't want access_control_id exported and we want file at the end
|
234
|
+
headers.delete('access_control_id') if headers.include?('access_control_id')
|
235
|
+
|
236
|
+
# add the headers below at the beginning or end to maintain the preexisting export behavior
|
237
|
+
headers.prepend('model')
|
238
|
+
headers.prepend(source_identifier.to_s)
|
239
|
+
headers.prepend('id')
|
240
|
+
|
217
241
|
headers.uniq
|
218
242
|
end
|
219
243
|
|
244
|
+
def object_names
|
245
|
+
return @object_names if @object_names
|
246
|
+
|
247
|
+
@object_names = mapping.values.map { |value| value['object'] }
|
248
|
+
@object_names.uniq!.delete(nil)
|
249
|
+
|
250
|
+
@object_names
|
251
|
+
end
|
252
|
+
|
253
|
+
def sort_headers(headers)
|
254
|
+
# converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
|
255
|
+
# while keeping objects grouped together
|
256
|
+
headers.sort_by do |item|
|
257
|
+
number = item.match(/\d+/)&.[](0) || 0.to_s
|
258
|
+
sort_number = number.rjust(4, "0")
|
259
|
+
object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
|
260
|
+
remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
|
261
|
+
"#{object_prefix}_#{sort_number}_#{remainder}"
|
262
|
+
end
|
263
|
+
end
|
264
|
+
|
220
265
|
# in the parser as it is specific to the format
|
221
266
|
def setup_export_file
|
222
|
-
File.join(importerexporter.exporter_export_path,
|
267
|
+
File.join(importerexporter.exporter_export_path, "export_#{importerexporter.export_source}_from_#{importerexporter.export_from}.csv")
|
223
268
|
end
|
224
269
|
|
225
270
|
# Retrieve file paths for [:file] mapping in records
|
226
271
|
# and check all listed files exist.
|
227
272
|
def file_paths
|
228
273
|
raise StandardError, 'No records were found' if records.blank?
|
274
|
+
return [] if importerexporter.metadata_only?
|
275
|
+
|
229
276
|
@file_paths ||= records.map do |r|
|
230
277
|
file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
|
231
278
|
next if r[file_mapping].blank?
|
@@ -244,23 +291,31 @@ module Bulkrax
|
|
244
291
|
# Retrieve the path where we expect to find the files
|
245
292
|
def path_to_files
|
246
293
|
@path_to_files ||= File.join(
|
247
|
-
|
294
|
+
zip? ? importer_unzip_path : File.dirname(import_file_path),
|
248
295
|
'files'
|
249
296
|
)
|
250
297
|
end
|
251
298
|
|
252
299
|
private
|
253
300
|
|
301
|
+
def unique_collection_identifier(collection_hash)
|
302
|
+
entry_uid = collection_hash[source_identifier]
|
303
|
+
entry_uid ||= if Bulkrax.fill_in_blank_source_identifiers.present?
|
304
|
+
Bulkrax.fill_in_blank_source_identifiers.call(self, records.find_index(collection_hash))
|
305
|
+
else
|
306
|
+
collection_hash[:title].split(/\s*[;|]\s*/).first
|
307
|
+
end
|
308
|
+
|
309
|
+
entry_uid
|
310
|
+
end
|
311
|
+
|
254
312
|
# Override to return the first CSV in the path, if a zip file is supplied
|
255
313
|
# We expect a single CSV at the top level of the zip in the CSVParser
|
256
314
|
# but we are willing to go look for it if need be
|
257
315
|
def real_import_file_path
|
258
|
-
if file? && zip?
|
259
|
-
|
260
|
-
|
261
|
-
else
|
262
|
-
parser_fields['import_file_path']
|
263
|
-
end
|
316
|
+
return Dir["#{importer_unzip_path}/**/*.csv"].first if file? && zip?
|
317
|
+
|
318
|
+
parser_fields['import_file_path']
|
264
319
|
end
|
265
320
|
end
|
266
321
|
end
|
@@ -75,7 +75,7 @@ module Bulkrax
|
|
75
75
|
|
76
76
|
new_entry = collection_entry_class.where(importerexporter: importerexporter, identifier: unique_collection_identifier, raw_metadata: metadata).first_or_create!
|
77
77
|
# perform now to ensure this gets created before work imports start
|
78
|
-
|
78
|
+
ImportCollectionJob.perform_now(new_entry.id, importerexporter.current_run.id)
|
79
79
|
increment_counters(index, true)
|
80
80
|
end
|
81
81
|
end
|
@@ -119,7 +119,10 @@ module Bulkrax
|
|
119
119
|
end
|
120
120
|
end
|
121
121
|
|
122
|
-
|
122
|
+
# TODO: change to differentiate between collection and work records when adding ability to import collection metadata
|
123
|
+
def works_total
|
124
|
+
total
|
125
|
+
end
|
123
126
|
|
124
127
|
def total
|
125
128
|
@total ||= records(quick: true).doc.find(".//resumptionToken").to_a.first.attributes["completeListSize"].to_i
|