bulkrax 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/jobs/bulkrax/create_relationships_job.rb +4 -2
- data/app/models/concerns/bulkrax/dynamic_record_lookup.rb +7 -8
- data/app/models/concerns/bulkrax/file_set_entry_behavior.rb +5 -1
- data/app/models/concerns/bulkrax/import_behavior.rb +2 -2
- data/app/parsers/bulkrax/bagit_parser.rb +34 -147
- data/app/parsers/bulkrax/csv_parser.rb +6 -5
- data/app/views/bulkrax/exporters/_form.html.erb +3 -0
- data/lib/bulkrax/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0a80674a9f582c3b8e83f442318908edb6ca9f0b615c970d09b17d941cc8027d
|
4
|
+
data.tar.gz: a2a53116ef49e03dde1aa1df14d8259a2b4abf06a82cff63a9d4ba622ba6600a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af3d75fb03105e37f7374f3a7f863c545d9cc9c95ab2f18bbbf7b4692024e09811f103a372327b4724c836568bad176ed0ad0b7be929ed556259aa9b0793fce6
|
7
|
+
data.tar.gz: 1117a185fbab2bae0746187f464bebea855759a5ecccf0d34f098ac55ad7a2952e663268372262ba8f97820c8c1f02bd29c74a388cfd8ea9cfed84a46dad94cf
|
@@ -42,10 +42,12 @@ module Bulkrax
|
|
42
42
|
pending_relationships.each do |rel|
|
43
43
|
raise ::StandardError, %("#{rel}" needs either a child or a parent to create a relationship) if rel.child_id.nil? || rel.parent_id.nil?
|
44
44
|
@child_entry, child_record = find_record(rel.child_id, importer_run_id)
|
45
|
-
|
45
|
+
if child_record
|
46
|
+
child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
|
47
|
+
end
|
46
48
|
end
|
47
49
|
|
48
|
-
if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.blank?
|
50
|
+
if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.nil?
|
49
51
|
reschedule({ parent_identifier: parent_identifier, importer_run_id: importer_run_id })
|
50
52
|
return false # stop current job from continuing to run after rescheduling
|
51
53
|
end
|
@@ -12,15 +12,14 @@ module Bulkrax
|
|
12
12
|
# check for our entry in our current importer first
|
13
13
|
importer_id = ImporterRun.find(importer_run_id).importer_id
|
14
14
|
default_scope = { identifier: identifier, importerexporter_type: 'Bulkrax::Importer' }
|
15
|
-
record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
|
16
15
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
begin
|
17
|
+
# the identifier parameter can be a :source_identifier or the id of an object
|
18
|
+
record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
|
19
|
+
record ||= ActiveFedora::Base.find(identifier)
|
20
|
+
# NameError for if ActiveFedora isn't installed
|
21
|
+
rescue NameError, ActiveFedora::ObjectNotFoundError
|
22
|
+
record = nil
|
24
23
|
end
|
25
24
|
|
26
25
|
# return the found entry here instead of searching for it again in the CreateRelationshipsJob
|
@@ -8,10 +8,14 @@ module Bulkrax
|
|
8
8
|
|
9
9
|
def add_path_to_file
|
10
10
|
parsed_metadata['file'].each_with_index do |filename, i|
|
11
|
-
|
11
|
+
next if filename.blank?
|
12
|
+
|
13
|
+
path_to_file = parser.path_to_files(filename: filename)
|
12
14
|
|
13
15
|
parsed_metadata['file'][i] = path_to_file
|
14
16
|
end
|
17
|
+
parsed_metadata['file'].delete('')
|
18
|
+
|
15
19
|
raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
|
16
20
|
|
17
21
|
parsed_metadata['file']
|
@@ -12,8 +12,8 @@ module Bulkrax
|
|
12
12
|
raise CollectionsCreatedError unless collections_created?
|
13
13
|
@item = factory.run!
|
14
14
|
add_user_to_permission_templates! if self.class.to_s.include?("Collection")
|
15
|
-
parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
|
16
|
-
child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
|
15
|
+
parent_jobs if self.parsed_metadata[related_parents_parsed_mapping]&.join.present?
|
16
|
+
child_jobs if self.parsed_metadata[related_children_parsed_mapping]&.join.present?
|
17
17
|
end
|
18
18
|
rescue RSolr::Error::Http, CollectionsCreatedError => e
|
19
19
|
raise e
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Bulkrax
|
4
|
-
class BagitParser < ApplicationParser
|
4
|
+
class BagitParser < CsvParser # rubocop:disable Metrics/ClassLength
|
5
5
|
include ExportBehavior
|
6
6
|
|
7
7
|
def self.export_supported?
|
@@ -20,12 +20,8 @@ module Bulkrax
|
|
20
20
|
rdf_format ? RdfEntry : CsvEntry
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
def file_set_entry_class
|
28
|
-
CsvFileSetEntry
|
23
|
+
def path_to_files(filename:)
|
24
|
+
@path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
|
29
25
|
end
|
30
26
|
|
31
27
|
# Take a random sample of 10 metadata_paths and work out the import fields from that
|
@@ -36,39 +32,41 @@ module Bulkrax
|
|
36
32
|
end.flatten.compact.uniq
|
37
33
|
end
|
38
34
|
|
39
|
-
#
|
40
|
-
# Create an Array of all metadata records, one per file
|
35
|
+
# Create an Array of all metadata records
|
41
36
|
def records(_opts = {})
|
42
37
|
raise StandardError, 'No BagIt records were found' if bags.blank?
|
43
38
|
@records ||= bags.map do |bag|
|
44
39
|
path = metadata_path(bag)
|
45
40
|
raise StandardError, 'No metadata files were found' if path.blank?
|
46
41
|
data = entry_class.read_data(path)
|
47
|
-
|
48
|
-
data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
|
49
|
-
data
|
42
|
+
get_data(bag, data)
|
50
43
|
end
|
44
|
+
|
45
|
+
@records = @records.flatten
|
51
46
|
end
|
52
47
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
|
66
|
-
ImportCollectionJob.perform_now(new_entry.id, current_run.id)
|
67
|
-
increment_counters(index, collection: true)
|
48
|
+
def get_data(bag, data)
|
49
|
+
if entry_class == CsvEntry
|
50
|
+
data = data.map do |data_row|
|
51
|
+
record_data = entry_class.data_for_entry(data_row, source_identifier, self)
|
52
|
+
next record_data if importerexporter.metadata_only?
|
53
|
+
|
54
|
+
record_data[:file] = bag.bag_files.join('|') if ::Hyrax.config.curation_concerns.include? record_data[:model]&.constantize
|
55
|
+
record_data
|
56
|
+
end
|
57
|
+
else
|
58
|
+
data = entry_class.data_for_entry(data, source_identifier, self)
|
59
|
+
data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
|
68
60
|
end
|
61
|
+
|
62
|
+
data
|
69
63
|
end
|
70
64
|
|
71
65
|
def create_works
|
66
|
+
entry_class == CsvEntry ? super : create_rdf_works
|
67
|
+
end
|
68
|
+
|
69
|
+
def create_rdf_works
|
72
70
|
records.each_with_index do |record, index|
|
73
71
|
next unless record_has_source_identifier(record, index)
|
74
72
|
break if limit_reached?(limit, index)
|
@@ -87,19 +85,6 @@ module Bulkrax
|
|
87
85
|
status_info(e)
|
88
86
|
end
|
89
87
|
|
90
|
-
def collections
|
91
|
-
records.map { |r| r[related_parents_parsed_mapping].split(/\s*[;|]\s*/) if r[related_parents_parsed_mapping].present? }.flatten.compact.uniq
|
92
|
-
end
|
93
|
-
|
94
|
-
def collections_total
|
95
|
-
collections.size
|
96
|
-
end
|
97
|
-
|
98
|
-
# TODO: change to differentiate between collection and work records when adding ability to import collection metadata
|
99
|
-
def works_total
|
100
|
-
total
|
101
|
-
end
|
102
|
-
|
103
88
|
def total
|
104
89
|
@total = importer.parser_fields['total'] || 0 if importer?
|
105
90
|
|
@@ -112,18 +97,6 @@ module Bulkrax
|
|
112
97
|
@total = 0
|
113
98
|
end
|
114
99
|
|
115
|
-
def extra_filters
|
116
|
-
output = ""
|
117
|
-
if importerexporter.start_date.present?
|
118
|
-
start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
|
119
|
-
finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
|
120
|
-
output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
|
121
|
-
end
|
122
|
-
output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
|
123
|
-
output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
|
124
|
-
output
|
125
|
-
end
|
126
|
-
|
127
100
|
def current_record_ids
|
128
101
|
@work_ids = []
|
129
102
|
@collection_ids = []
|
@@ -143,70 +116,19 @@ module Bulkrax
|
|
143
116
|
@work_ids + @collection_ids + @file_set_ids
|
144
117
|
end
|
145
118
|
|
146
|
-
# Set the following instance variables: @work_ids, @collection_ids, @file_set_ids
|
147
|
-
# @see #current_record_ids
|
148
|
-
def set_ids_for_exporting_from_importer
|
149
|
-
entry_ids = Importer.find(importerexporter.export_source).entries.pluck(:id)
|
150
|
-
complete_statuses = Status.latest_by_statusable
|
151
|
-
.includes(:statusable)
|
152
|
-
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
|
153
|
-
|
154
|
-
complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
|
155
|
-
extra_filters = extra_filters.presence || '*:*'
|
156
|
-
|
157
|
-
{ :@work_ids => ::Hyrax.config.curation_concerns, :@collection_ids => [::Collection], :@file_set_ids => [::FileSet] }.each do |instance_var, models_to_search|
|
158
|
-
instance_variable_set(instance_var, ActiveFedora::SolrService.post(
|
159
|
-
extra_filters.to_s,
|
160
|
-
fq: [
|
161
|
-
%(#{::Solrizer.solr_name(work_identifier)}:("#{complete_entry_identifiers.join('" OR "')}")),
|
162
|
-
"has_model_ssim:(#{models_to_search.join(' OR ')})"
|
163
|
-
],
|
164
|
-
fl: 'id',
|
165
|
-
rows: 2_000_000_000
|
166
|
-
)['response']['docs'].map { |obj| obj['id'] })
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
119
|
# export methods
|
171
120
|
|
172
|
-
def create_new_entries
|
173
|
-
current_record_ids.each_with_index do |id, index|
|
174
|
-
break if limit_reached?(limit, index)
|
175
|
-
|
176
|
-
this_entry_class = if @collection_ids.include?(id)
|
177
|
-
collection_entry_class
|
178
|
-
elsif @file_set_ids.include?(id)
|
179
|
-
file_set_entry_class
|
180
|
-
else
|
181
|
-
entry_class
|
182
|
-
end
|
183
|
-
new_entry = find_or_create_entry(this_entry_class, id, 'Bulkrax::Exporter')
|
184
|
-
|
185
|
-
begin
|
186
|
-
entry = ExportWorkJob.perform_now(new_entry.id, current_run.id)
|
187
|
-
rescue => e
|
188
|
-
Rails.logger.info("#{e.message} was detected during export")
|
189
|
-
end
|
190
|
-
|
191
|
-
self.headers |= entry.parsed_metadata.keys if entry
|
192
|
-
end
|
193
|
-
end
|
194
|
-
alias create_from_collection create_new_entries
|
195
|
-
alias create_from_importer create_new_entries
|
196
|
-
alias create_from_worktype create_new_entries
|
197
|
-
alias create_from_all create_new_entries
|
198
|
-
|
199
121
|
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
200
122
|
def write_files
|
201
123
|
require 'open-uri'
|
202
124
|
require 'socket'
|
203
125
|
importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].each do |entry|
|
204
|
-
|
205
|
-
next unless Hyrax.config.curation_concerns.include?(
|
126
|
+
record = ActiveFedora::Base.find(entry.identifier)
|
127
|
+
next unless Hyrax.config.curation_concerns.include?(record.class)
|
206
128
|
bag = BagIt::Bag.new setup_bagit_folder(entry.identifier)
|
207
129
|
bag_entries = [entry]
|
208
130
|
|
209
|
-
|
131
|
+
record.file_sets.each do |fs|
|
210
132
|
if @file_set_ids.present?
|
211
133
|
file_set_entry = Bulkrax::CsvFileSetEntry.where("parsed_metadata LIKE '%#{fs.id}%'").first
|
212
134
|
bag_entries << file_set_entry unless file_set_entry.nil?
|
@@ -245,42 +167,6 @@ module Bulkrax
|
|
245
167
|
key != source_identifier.to_s
|
246
168
|
end
|
247
169
|
|
248
|
-
# All possible column names
|
249
|
-
def export_headers
|
250
|
-
headers = sort_headers(self.headers)
|
251
|
-
|
252
|
-
# we don't want access_control_id exported and we want file at the end
|
253
|
-
headers.delete('access_control_id') if headers.include?('access_control_id')
|
254
|
-
|
255
|
-
# add the headers below at the beginning or end to maintain the preexisting export behavior
|
256
|
-
headers.prepend('model')
|
257
|
-
headers.prepend(source_identifier.to_s)
|
258
|
-
headers.prepend('id')
|
259
|
-
|
260
|
-
headers.uniq
|
261
|
-
end
|
262
|
-
|
263
|
-
def object_names
|
264
|
-
return @object_names if @object_names
|
265
|
-
|
266
|
-
@object_names = mapping.values.map { |value| value['object'] }
|
267
|
-
@object_names.uniq!.delete(nil)
|
268
|
-
|
269
|
-
@object_names
|
270
|
-
end
|
271
|
-
|
272
|
-
def sort_headers(headers)
|
273
|
-
# converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
|
274
|
-
# while keeping objects grouped together
|
275
|
-
headers.sort_by do |item|
|
276
|
-
number = item.match(/\d+/)&.[](0) || 0.to_s
|
277
|
-
sort_number = number.rjust(4, "0")
|
278
|
-
object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
|
279
|
-
remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
|
280
|
-
"#{object_prefix}_#{sort_number}_#{remainder}"
|
281
|
-
end
|
282
|
-
end
|
283
|
-
|
284
170
|
def setup_triple_metadata_export_file(id)
|
285
171
|
File.join(importerexporter.exporter_export_path, id, 'metadata.nt')
|
286
172
|
end
|
@@ -300,11 +186,6 @@ module Bulkrax
|
|
300
186
|
end
|
301
187
|
end
|
302
188
|
|
303
|
-
def required_elements?(keys)
|
304
|
-
return if keys.blank?
|
305
|
-
!required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
|
306
|
-
end
|
307
|
-
|
308
189
|
# @todo - investigate getting directory structure
|
309
190
|
# @todo - investigate using perform_later, and having the importer check for
|
310
191
|
# DownloadCloudFileJob before it starts
|
@@ -355,5 +236,11 @@ module Bulkrax
|
|
355
236
|
return nil unless bag.valid?
|
356
237
|
bag
|
357
238
|
end
|
239
|
+
|
240
|
+
# use the version of this method from the application parser instead
|
241
|
+
def real_import_file_path
|
242
|
+
return importer_unzip_path if file? && zip?
|
243
|
+
parser_fields['import_file_path']
|
244
|
+
end
|
358
245
|
end
|
359
246
|
end
|
@@ -272,8 +272,8 @@ module Bulkrax
|
|
272
272
|
CsvFileSetEntry
|
273
273
|
end
|
274
274
|
|
275
|
-
#
|
276
|
-
#
|
275
|
+
# TODO: figure out why using the version of this method that's in the bagit parser
|
276
|
+
# breaks specs for the "if importer?" line
|
277
277
|
def total
|
278
278
|
@total = importer.parser_fields['total'] || 0 if importer?
|
279
279
|
@total = limit || current_record_ids.count if exporter?
|
@@ -382,10 +382,11 @@ module Bulkrax
|
|
382
382
|
end
|
383
383
|
|
384
384
|
# Retrieve the path where we expect to find the files
|
385
|
-
def path_to_files
|
385
|
+
def path_to_files(**args)
|
386
|
+
filename = args.fetch(:filename, '')
|
387
|
+
|
386
388
|
@path_to_files ||= File.join(
|
387
|
-
zip? ? importer_unzip_path : File.dirname(import_file_path),
|
388
|
-
'files'
|
389
|
+
zip? ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
|
389
390
|
)
|
390
391
|
end
|
391
392
|
|
@@ -29,6 +29,7 @@
|
|
29
29
|
|
30
30
|
<%= form.input :export_source_importer,
|
31
31
|
label: t('bulkrax.exporter.labels.importer'),
|
32
|
+
required: true,
|
32
33
|
prompt: 'Select from the list',
|
33
34
|
label_html: { class: 'importer export-source-option hidden' },
|
34
35
|
input_html: { class: 'importer export-source-option hidden' },
|
@@ -37,6 +38,7 @@
|
|
37
38
|
<%= form.input :export_source_collection,
|
38
39
|
prompt: 'Start typing ...',
|
39
40
|
label: t('bulkrax.exporter.labels.collection'),
|
41
|
+
required: true,
|
40
42
|
placeholder: @collection&.title&.first,
|
41
43
|
label_html: { class: 'collection export-source-option hidden' },
|
42
44
|
input_html: {
|
@@ -50,6 +52,7 @@
|
|
50
52
|
|
51
53
|
<%= form.input :export_source_worktype,
|
52
54
|
label: t('bulkrax.exporter.labels.worktype'),
|
55
|
+
required: true,
|
53
56
|
prompt: 'Select from the list',
|
54
57
|
label_html: { class: 'worktype export-source-option hidden' },
|
55
58
|
input_html: { class: 'worktype export-source-option hidden' },
|
data/lib/bulkrax/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulkrax
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.4.0
|
4
|
+
version: 3.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-06-
|
11
|
+
date: 2022-06-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -404,7 +404,7 @@ homepage: https://github.com/samvera-labs/bulkrax
|
|
404
404
|
licenses:
|
405
405
|
- Apache-2.0
|
406
406
|
metadata: {}
|
407
|
-
post_install_message:
|
407
|
+
post_install_message:
|
408
408
|
rdoc_options: []
|
409
409
|
require_paths:
|
410
410
|
- lib
|
@@ -419,8 +419,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
419
419
|
- !ruby/object:Gem::Version
|
420
420
|
version: '0'
|
421
421
|
requirements: []
|
422
|
-
rubygems_version: 3.
|
423
|
-
signing_key:
|
422
|
+
rubygems_version: 3.0.3
|
423
|
+
signing_key:
|
424
424
|
specification_version: 4
|
425
425
|
summary: Import and export tool for Hyrax and Hyku
|
426
426
|
test_files: []
|