bulkrax 3.4.0 → 3.5.0
- checksums.yaml +4 -4
- data/app/jobs/bulkrax/create_relationships_job.rb +4 -2
- data/app/models/concerns/bulkrax/dynamic_record_lookup.rb +7 -8
- data/app/models/concerns/bulkrax/file_set_entry_behavior.rb +5 -1
- data/app/models/concerns/bulkrax/import_behavior.rb +2 -2
- data/app/parsers/bulkrax/bagit_parser.rb +34 -147
- data/app/parsers/bulkrax/csv_parser.rb +6 -5
- data/app/views/bulkrax/exporters/_form.html.erb +3 -0
- data/lib/bulkrax/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0a80674a9f582c3b8e83f442318908edb6ca9f0b615c970d09b17d941cc8027d
+  data.tar.gz: a2a53116ef49e03dde1aa1df14d8259a2b4abf06a82cff63a9d4ba622ba6600a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: af3d75fb03105e37f7374f3a7f863c545d9cc9c95ab2f18bbbf7b4692024e09811f103a372327b4724c836568bad176ed0ad0b7be929ed556259aa9b0793fce6
+  data.tar.gz: 1117a185fbab2bae0746187f464bebea855759a5ecccf0d34f098ac55ad7a2952e663268372262ba8f97820c8c1f02bd29c74a388cfd8ea9cfed84a46dad94cf
data/app/jobs/bulkrax/create_relationships_job.rb
CHANGED
@@ -42,10 +42,12 @@ module Bulkrax
       pending_relationships.each do |rel|
         raise ::StandardError, %("#{rel}" needs either a child or a parent to create a relationship) if rel.child_id.nil? || rel.parent_id.nil?
         @child_entry, child_record = find_record(rel.child_id, importer_run_id)
-        child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
+        if child_record
+          child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
+        end
       end

-      if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.
+      if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.nil?
         reschedule({ parent_identifier: parent_identifier, importer_run_id: importer_run_id })
         return false # stop current job from continuing to run after rescheduling
       end
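For context, a minimal plain-Ruby sketch of what the new nil guard changes when grouping child records; Collection and Work below are stand-in classes, not Bulkrax's ActiveFedora models, and the hash is a local variable rather than the job's state.

# Illustrative sketch only -- stand-in classes, not the gem's code.
Collection = Class.new
Work = Class.new

child_records = { collections: [], works: [] }

[Collection.new, Work.new, nil].each do |child_record|
  # find_record can now return nil; before this change the ternary ran unguarded
  # and a nil child_record raised NoMethodError.
  if child_record
    child_record.is_a?(Collection) ? child_records[:collections] << child_record : child_records[:works] << child_record
  end
end

p child_records[:collections].size # => 1
p child_records[:works].size       # => 1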
data/app/models/concerns/bulkrax/dynamic_record_lookup.rb
CHANGED
@@ -12,15 +12,14 @@ module Bulkrax
       # check for our entry in our current importer first
       importer_id = ImporterRun.find(importer_run_id).importer_id
       default_scope = { identifier: identifier, importerexporter_type: 'Bulkrax::Importer' }
-      record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)

-
-
-
-
-
-
-
+      begin
+        # the identifier parameter can be a :source_identifier or the id of an object
+        record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
+        record ||= ActiveFedora::Base.find(identifier)
+      # NameError for if ActiveFedora isn't installed
+      rescue NameError, ActiveFedora::ObjectNotFoundError
+        record = nil
       end

       # return the found entry here instead of searching for it again in the CreateRelationshipsJob
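A rough sketch of the lookup fallback introduced above, with lambdas standing in for Bulkrax::Entry and ActiveFedora::Base (the names and the sample identifier are illustrative assumptions, not the gem's API):

# Illustrative sketch only: Entry lookup first, then the repository, with the
# rescue treating a missing ActiveFedora constant the same as "not found".
find_entry     = ->(_identifier) { nil }                                        # pretend no Entry matches
find_in_fedora = ->(_identifier) { raise NameError, 'ActiveFedora not loaded' } # pretend the backend is absent

record =
  begin
    find_entry.call('work/abc123') || find_in_fedora.call('work/abc123')
  rescue NameError
    nil
  end

p record # => nil, so callers must handle a missing record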
data/app/models/concerns/bulkrax/file_set_entry_behavior.rb
CHANGED
@@ -8,10 +8,14 @@ module Bulkrax

     def add_path_to_file
       parsed_metadata['file'].each_with_index do |filename, i|
-
+        next if filename.blank?
+
+        path_to_file = parser.path_to_files(filename: filename)

         parsed_metadata['file'][i] = path_to_file
       end
+      parsed_metadata['file'].delete('')
+
       raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?

       parsed_metadata['file']
data/app/models/concerns/bulkrax/import_behavior.rb
CHANGED
@@ -12,8 +12,8 @@ module Bulkrax
       raise CollectionsCreatedError unless collections_created?
       @item = factory.run!
       add_user_to_permission_templates! if self.class.to_s.include?("Collection")
-      parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
-      child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
+      parent_jobs if self.parsed_metadata[related_parents_parsed_mapping]&.join.present?
+      child_jobs if self.parsed_metadata[related_children_parsed_mapping]&.join.present?
     end
   rescue RSolr::Error::Http, CollectionsCreatedError => e
     raise e
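A small sketch of why the `&.join.present?` form behaves differently from a bare `.present?` check (this is plain Ruby plus ActiveSupport's blank?/present? extension; the values are made-up examples):

# Illustrative sketch only -- requires ActiveSupport's core extension for present?.
require 'active_support/core_ext/object/blank'

p [''].present?        # => true  -- an array containing an empty string still counts as present
p ['']&.join.present?  # => false -- joining collapses it to "", so the relationship jobs are skipped
p nil&.join.present?   # => false -- safe navigation also covers a missing mapping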
data/app/parsers/bulkrax/bagit_parser.rb
CHANGED
@@ -1,7 +1,7 @@
 # frozen_string_literal: true

 module Bulkrax
-  class BagitParser <
+  class BagitParser < CsvParser # rubocop:disable Metrics/ClassLength
     include ExportBehavior

     def self.export_supported?
@@ -20,12 +20,8 @@ module Bulkrax
       rdf_format ? RdfEntry : CsvEntry
     end

-    def
-
-    end
-
-    def file_set_entry_class
-      CsvFileSetEntry
+    def path_to_files(filename:)
+      @path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
     end

     # Take a random sample of 10 metadata_paths and work out the import fields from that
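A sketch of what the glob above resolves to; the directory layout and filename are hypothetical examples of an unpacked BagIt import, not paths the gem requires.

# Illustrative sketch only: find the first payload file under any bag's data/ directory.
import_file_path = '/imports/bags' # hypothetical unpacked-bag root
puts Dir.glob(File.join(import_file_path, '**/data', 'photo.tif')).first.inspect
# => "/imports/bags/bag_1/data/photo.tif" when such a file exists, otherwise nil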
@@ -36,39 +32,41 @@ module Bulkrax
       end.flatten.compact.uniq
     end

-    #
-    # Create an Array of all metadata records, one per file
+    # Create an Array of all metadata records
     def records(_opts = {})
       raise StandardError, 'No BagIt records were found' if bags.blank?
       @records ||= bags.map do |bag|
         path = metadata_path(bag)
         raise StandardError, 'No metadata files were found' if path.blank?
         data = entry_class.read_data(path)
-
-        data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
-        data
+        get_data(bag, data)
       end
+
+      @records = @records.flatten
     end

-
-
-
-
-
-
-
-
-
-
-
-
-      new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
-      ImportCollectionJob.perform_now(new_entry.id, current_run.id)
-      increment_counters(index, collection: true)
+    def get_data(bag, data)
+      if entry_class == CsvEntry
+        data = data.map do |data_row|
+          record_data = entry_class.data_for_entry(data_row, source_identifier, self)
+          next record_data if importerexporter.metadata_only?
+
+          record_data[:file] = bag.bag_files.join('|') if ::Hyrax.config.curation_concerns.include? record_data[:model]&.constantize
+          record_data
+        end
+      else
+        data = entry_class.data_for_entry(data, source_identifier, self)
+        data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
       end
+
+      data
     end

     def create_works
+      entry_class == CsvEntry ? super : create_rdf_works
+    end
+
+    def create_rdf_works
       records.each_with_index do |record, index|
         next unless record_has_source_identifier(record, index)
         break if limit_reached?(limit, index)
@@ -87,19 +85,6 @@ module Bulkrax
       status_info(e)
     end

-    def collections
-      records.map { |r| r[related_parents_parsed_mapping].split(/\s*[;|]\s*/) if r[related_parents_parsed_mapping].present? }.flatten.compact.uniq
-    end
-
-    def collections_total
-      collections.size
-    end
-
-    # TODO: change to differentiate between collection and work records when adding ability to import collection metadata
-    def works_total
-      total
-    end
-
     def total
       @total = importer.parser_fields['total'] || 0 if importer?

@@ -112,18 +97,6 @@ module Bulkrax
       @total = 0
     end

-    def extra_filters
-      output = ""
-      if importerexporter.start_date.present?
-        start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
-        finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
-        output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
-      end
-      output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
-      output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
-      output
-    end
-
     def current_record_ids
       @work_ids = []
       @collection_ids = []
@@ -143,70 +116,19 @@ module Bulkrax
       @work_ids + @collection_ids + @file_set_ids
     end

-    # Set the following instance variables: @work_ids, @collection_ids, @file_set_ids
-    # @see #current_record_ids
-    def set_ids_for_exporting_from_importer
-      entry_ids = Importer.find(importerexporter.export_source).entries.pluck(:id)
-      complete_statuses = Status.latest_by_statusable
-                                .includes(:statusable)
-                                .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
-
-      complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
-      extra_filters = extra_filters.presence || '*:*'
-
-      { :@work_ids => ::Hyrax.config.curation_concerns, :@collection_ids => [::Collection], :@file_set_ids => [::FileSet] }.each do |instance_var, models_to_search|
-        instance_variable_set(instance_var, ActiveFedora::SolrService.post(
-          extra_filters.to_s,
-          fq: [
-            %(#{::Solrizer.solr_name(work_identifier)}:("#{complete_entry_identifiers.join('" OR "')}")),
-            "has_model_ssim:(#{models_to_search.join(' OR ')})"
-          ],
-          fl: 'id',
-          rows: 2_000_000_000
-        )['response']['docs'].map { |obj| obj['id'] })
-      end
-    end
-
     # export methods

-    def create_new_entries
-      current_record_ids.each_with_index do |id, index|
-        break if limit_reached?(limit, index)
-
-        this_entry_class = if @collection_ids.include?(id)
-                             collection_entry_class
-                           elsif @file_set_ids.include?(id)
-                             file_set_entry_class
-                           else
-                             entry_class
-                           end
-        new_entry = find_or_create_entry(this_entry_class, id, 'Bulkrax::Exporter')
-
-        begin
-          entry = ExportWorkJob.perform_now(new_entry.id, current_run.id)
-        rescue => e
-          Rails.logger.info("#{e.message} was detected during export")
-        end
-
-        self.headers |= entry.parsed_metadata.keys if entry
-      end
-    end
-    alias create_from_collection create_new_entries
-    alias create_from_importer create_new_entries
-    alias create_from_worktype create_new_entries
-    alias create_from_all create_new_entries
-
     # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
     def write_files
       require 'open-uri'
       require 'socket'
       importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].each do |entry|
-
-        next unless Hyrax.config.curation_concerns.include?(
+        record = ActiveFedora::Base.find(entry.identifier)
+        next unless Hyrax.config.curation_concerns.include?(record.class)
         bag = BagIt::Bag.new setup_bagit_folder(entry.identifier)
         bag_entries = [entry]

-
+        record.file_sets.each do |fs|
         if @file_set_ids.present?
           file_set_entry = Bulkrax::CsvFileSetEntry.where("parsed_metadata LIKE '%#{fs.id}%'").first
           bag_entries << file_set_entry unless file_set_entry.nil?
@@ -245,42 +167,6 @@ module Bulkrax
       key != source_identifier.to_s
     end

-    # All possible column names
-    def export_headers
-      headers = sort_headers(self.headers)
-
-      # we don't want access_control_id exported and we want file at the end
-      headers.delete('access_control_id') if headers.include?('access_control_id')
-
-      # add the headers below at the beginning or end to maintain the preexisting export behavior
-      headers.prepend('model')
-      headers.prepend(source_identifier.to_s)
-      headers.prepend('id')
-
-      headers.uniq
-    end
-
-    def object_names
-      return @object_names if @object_names
-
-      @object_names = mapping.values.map { |value| value['object'] }
-      @object_names.uniq!.delete(nil)
-
-      @object_names
-    end
-
-    def sort_headers(headers)
-      # converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
-      # while keeping objects grouped together
-      headers.sort_by do |item|
-        number = item.match(/\d+/)&.[](0) || 0.to_s
-        sort_number = number.rjust(4, "0")
-        object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
-        remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
-        "#{object_prefix}_#{sort_number}_#{remainder}"
-      end
-    end
-
     def setup_triple_metadata_export_file(id)
       File.join(importerexporter.exporter_export_path, id, 'metadata.nt')
     end
@@ -300,11 +186,6 @@ module Bulkrax
       end
     end

-    def required_elements?(keys)
-      return if keys.blank?
-      !required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
-    end
-
     # @todo - investigate getting directory structure
     # @todo - investigate using perform_later, and having the importer check for
     # DownloadCloudFileJob before it starts
@@ -355,5 +236,11 @@ module Bulkrax
       return nil unless bag.valid?
       bag
     end
+
+    # use the version of this method from the application parser instead
+    def real_import_file_path
+      return importer_unzip_path if file? && zip?
+      parser_fields['import_file_path']
+    end
   end
 end
data/app/parsers/bulkrax/csv_parser.rb
CHANGED
@@ -272,8 +272,8 @@ module Bulkrax
       CsvFileSetEntry
     end

-    #
-    #
+    # TODO: figure out why using the version of this method that's in the bagit parser
+    # breaks specs for the "if importer?" line
     def total
       @total = importer.parser_fields['total'] || 0 if importer?
       @total = limit || current_record_ids.count if exporter?
@@ -382,10 +382,11 @@ module Bulkrax
     end

     # Retrieve the path where we expect to find the files
-    def path_to_files
+    def path_to_files(**args)
+      filename = args.fetch(:filename, '')
+
       @path_to_files ||= File.join(
-        zip? ? importer_unzip_path : File.dirname(import_file_path),
-        'files'
+        zip? ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
       )
     end

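A small sketch of how the reworked path building behaves for an unzipped import; the paths below are made-up examples, and only the File.join logic mirrors the change above.

# Illustrative sketch only.
import_file_path = '/imports/batch1/metadata.csv' # hypothetical example path
files_dir        = File.dirname(import_file_path)

puts File.join(files_dir, 'files', 'photo.tif') # => /imports/batch1/files/photo.tif
puts File.join(files_dir, 'files', '')          # => /imports/batch1/files/ (default when no :filename is passed)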
data/app/views/bulkrax/exporters/_form.html.erb
CHANGED
@@ -29,6 +29,7 @@

   <%= form.input :export_source_importer,
                  label: t('bulkrax.exporter.labels.importer'),
+                 required: true,
                  prompt: 'Select from the list',
                  label_html: { class: 'importer export-source-option hidden' },
                  input_html: { class: 'importer export-source-option hidden' },
@@ -37,6 +38,7 @@
   <%= form.input :export_source_collection,
                  prompt: 'Start typing ...',
                  label: t('bulkrax.exporter.labels.collection'),
+                 required: true,
                  placeholder: @collection&.title&.first,
                  label_html: { class: 'collection export-source-option hidden' },
                  input_html: {
@@ -50,6 +52,7 @@

   <%= form.input :export_source_worktype,
                  label: t('bulkrax.exporter.labels.worktype'),
+                 required: true,
                  prompt: 'Select from the list',
                  label_html: { class: 'worktype export-source-option hidden' },
                  input_html: { class: 'worktype export-source-option hidden' },
data/lib/bulkrax/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulkrax
 version: !ruby/object:Gem::Version
-  version: 3.4.0
+  version: 3.5.0
 platform: ruby
 authors:
 - Rob Kaufman
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-06-
+date: 2022-06-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rails
@@ -404,7 +404,7 @@ homepage: https://github.com/samvera-labs/bulkrax
 licenses:
 - Apache-2.0
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -419,8 +419,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: Import and export tool for Hyrax and Hyku
 test_files: []