bulkrax 3.4.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -5
- data/app/controllers/bulkrax/exporters_controller.rb +1 -1
- data/app/jobs/bulkrax/create_relationships_job.rb +4 -2
- data/app/models/bulkrax/entry.rb +0 -2
- data/app/models/bulkrax/exporter.rb +15 -2
- data/app/models/concerns/bulkrax/dynamic_record_lookup.rb +7 -8
- data/app/models/concerns/bulkrax/export_behavior.rb +0 -22
- data/app/models/concerns/bulkrax/file_set_entry_behavior.rb +5 -1
- data/app/models/concerns/bulkrax/import_behavior.rb +2 -2
- data/app/parsers/bulkrax/application_parser.rb +6 -25
- data/app/parsers/bulkrax/bagit_parser.rb +69 -160
- data/app/parsers/bulkrax/csv_parser.rb +54 -10
- data/app/views/bulkrax/exporters/_downloads.html.erb +8 -0
- data/app/views/bulkrax/exporters/_form.html.erb +3 -0
- data/app/views/bulkrax/exporters/index.html.erb +5 -2
- data/app/views/bulkrax/exporters/show.html.erb +4 -7
- data/lib/bulkrax/version.rb +1 -1
- data/lib/tasks/bulkrax_tasks.rake +28 -4
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eb56d86ee90ae9e1cf0628504694e1301ab8f2d6b24ffa8fd323f8953a8ee956
|
4
|
+
data.tar.gz: 71056b077e300f27eee3bcccd9d7e2bee2fc7bdf2fc6ba9248b69a29f3994f9c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05ea49e6f2c5e73cbddacf35dcaf9de499760d7093e3ae8f3ce4ea5ab28e25d065b7877607436fcbe02d21e17c2df940b0224f1ea7d638a602486ce807d99981
|
7
|
+
data.tar.gz: ecdda29924e09793e62684f16ebcd79cd90ab9e8204d011b89a577ca667a644709ff2df5b525ac6c32355426038c05d9fc3d74c6efb20ee7cfab653d9b89b67a
|
data/README.md
CHANGED
@@ -70,7 +70,7 @@ Bulkrax.setup do |config|
|
|
70
70
|
end
|
71
71
|
```
|
72
72
|
|
73
|
-
The [configuration guide](https://github.com/samvera-labs/bulkrax/wiki/
|
73
|
+
The [configuration guide](https://github.com/samvera-labs/bulkrax/wiki/Configuring-Bulkrax) provides detailed instructions on the various available configurations.
|
74
74
|
|
75
75
|
Example:
|
76
76
|
|
@@ -120,7 +120,7 @@ It's unlikely that the incoming import data has fields that exactly match those
|
|
120
120
|
|
121
121
|
By default, a mapping for the OAI parser has been added to map standard oai_dc fields to Hyrax basic_metadata. The other parsers have no default mapping, and will map any incoming fields to Hyrax properties with the same name. Configurations can be added in `config/initializers/bulkrax.rb`.
|
122
122
|
|
123
|
-
Configuring field mappings is documented in the [Bulkrax Configuration Guide](https://github.com/samvera-labs/bulkrax/wiki/
|
123
|
+
Configuring field mappings is documented in the [Bulkrax Configuration Guide](https://github.com/samvera-labs/bulkrax/wiki/Configuring-Bulkrax).
|
124
124
|
|
125
125
|
## Importing Files
|
126
126
|
|
@@ -151,7 +151,7 @@ end
|
|
151
151
|
|
152
152
|
## Customizing Bulkrax
|
153
153
|
|
154
|
-
For further information on how to extend and customize Bulkrax, please see the [Bulkrax Customization Guide](https://github.com/samvera-labs/bulkrax/wiki/Customizing).
|
154
|
+
For further information on how to extend and customize Bulkrax, please see the [Bulkrax Customization Guide](https://github.com/samvera-labs/bulkrax/wiki/Customizing-Bulkrax).
|
155
155
|
|
156
156
|
## How it Works
|
157
157
|
Once you have Bulkrax installed, you will have access to an easy-to-use interface with which you are able to create, edit, delete, run, and re-run imports and exports.
|
@@ -191,8 +191,6 @@ We encourage everyone to help improve this project. Bug reports and pull reques
|
|
191
191
|
|
192
192
|
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](https://contributor-covenant.org) code of conduct.
|
193
193
|
|
194
|
-
All Contributors should have signed the Samvera Contributor License Agreement (CLA)
|
195
|
-
|
196
194
|
## Questions
|
197
195
|
Questions can be sent to support@notch8.com. Please make sure to include "Bulkrax" in the subject line of your email.
|
198
196
|
|
@@ -42,10 +42,12 @@ module Bulkrax
|
|
42
42
|
pending_relationships.each do |rel|
|
43
43
|
raise ::StandardError, %("#{rel}" needs either a child or a parent to create a relationship) if rel.child_id.nil? || rel.parent_id.nil?
|
44
44
|
@child_entry, child_record = find_record(rel.child_id, importer_run_id)
|
45
|
-
|
45
|
+
if child_record
|
46
|
+
child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
|
47
|
+
end
|
46
48
|
end
|
47
49
|
|
48
|
-
if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.
|
50
|
+
if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.nil?
|
49
51
|
reschedule({ parent_identifier: parent_identifier, importer_run_id: importer_run_id })
|
50
52
|
return false # stop current job from continuing to run after rescheduling
|
51
53
|
end
|
data/app/models/bulkrax/entry.rb
CHANGED
@@ -4,8 +4,6 @@ module Bulkrax
|
|
4
4
|
# Custom error class for collections_created?
|
5
5
|
class CollectionsCreatedError < RuntimeError; end
|
6
6
|
class OAIError < RuntimeError; end
|
7
|
-
# TODO: remove when ApplicationParser#bagit_zip_file_size_check is removed
|
8
|
-
class BagitZipError < RuntimeError; end
|
9
7
|
class Entry < ApplicationRecord
|
10
8
|
include Bulkrax::HasMatchers
|
11
9
|
include Bulkrax::ImportBehavior
|
@@ -124,9 +124,13 @@ module Bulkrax
|
|
124
124
|
end
|
125
125
|
|
126
126
|
def exporter_export_zip_path
|
127
|
-
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_#{self.exporter_runs.last.id}
|
127
|
+
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_#{self.exporter_runs.last.id}")
|
128
128
|
rescue
|
129
|
-
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_0
|
129
|
+
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_0")
|
130
|
+
end
|
131
|
+
|
132
|
+
def exporter_export_zip_files
|
133
|
+
@exporter_export_zip_files ||= Dir["#{exporter_export_zip_path}/**"].map { |zip| Array(zip.split('/').last) }
|
130
134
|
end
|
131
135
|
|
132
136
|
def export_properties
|
@@ -137,5 +141,14 @@ module Bulkrax
|
|
137
141
|
def metadata_only?
|
138
142
|
export_type == 'metadata'
|
139
143
|
end
|
144
|
+
|
145
|
+
def sort_zip_files(zip_files)
|
146
|
+
zip_files.sort_by do |item|
|
147
|
+
number = item.split('_').last.match(/\d+/)&.[](0) || 0.to_s
|
148
|
+
sort_number = number.rjust(4, "0")
|
149
|
+
|
150
|
+
sort_number
|
151
|
+
end
|
152
|
+
end
|
140
153
|
end
|
141
154
|
end
|
@@ -12,15 +12,14 @@ module Bulkrax
|
|
12
12
|
# check for our entry in our current importer first
|
13
13
|
importer_id = ImporterRun.find(importer_run_id).importer_id
|
14
14
|
default_scope = { identifier: identifier, importerexporter_type: 'Bulkrax::Importer' }
|
15
|
-
record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
|
16
15
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
begin
|
17
|
+
# the identifier parameter can be a :source_identifier or the id of an object
|
18
|
+
record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
|
19
|
+
record ||= ActiveFedora::Base.find(identifier)
|
20
|
+
# NameError for if ActiveFedora isn't installed
|
21
|
+
rescue NameError, ActiveFedora::ObjectNotFoundError
|
22
|
+
record = nil
|
24
23
|
end
|
25
24
|
|
26
25
|
# return the found entry here instead of searching for it again in the CreateRelationshipsJob
|
@@ -7,9 +7,6 @@ module Bulkrax
|
|
7
7
|
|
8
8
|
def build_for_exporter
|
9
9
|
build_export_metadata
|
10
|
-
# TODO(alishaevn): determine if the line below is still necessary
|
11
|
-
# the csv and bagit parsers also have write_files methods
|
12
|
-
write_files if export_type == 'full' && !importerexporter.parser_klass.include?('Bagit')
|
13
10
|
rescue RSolr::Error::Http, CollectionsCreatedError => e
|
14
11
|
raise e
|
15
12
|
rescue StandardError => e
|
@@ -26,25 +23,6 @@ module Bulkrax
|
|
26
23
|
@hyrax_record ||= ActiveFedora::Base.find(self.identifier)
|
27
24
|
end
|
28
25
|
|
29
|
-
def write_files
|
30
|
-
return if hyrax_record.is_a?(Collection)
|
31
|
-
|
32
|
-
file_sets = hyrax_record.file_set? ? Array.wrap(hyrax_record) : hyrax_record.file_sets
|
33
|
-
file_sets << hyrax_record.thumbnail if hyrax_record.thumbnail.present? && hyrax_record.work? && exporter.include_thumbnails
|
34
|
-
file_sets.each do |fs|
|
35
|
-
path = File.join(exporter_export_path, 'files')
|
36
|
-
FileUtils.mkdir_p(path)
|
37
|
-
file = filename(fs)
|
38
|
-
require 'open-uri'
|
39
|
-
io = open(fs.original_file.uri)
|
40
|
-
next if file.blank?
|
41
|
-
File.open(File.join(path, file), 'wb') do |f|
|
42
|
-
f.write(io.read)
|
43
|
-
f.close
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
26
|
# Prepend the file_set id to ensure a unique filename and also one that is not longer than 255 characters
|
49
27
|
def filename(file_set)
|
50
28
|
return if file_set.original_file.blank?
|
@@ -8,10 +8,14 @@ module Bulkrax
|
|
8
8
|
|
9
9
|
def add_path_to_file
|
10
10
|
parsed_metadata['file'].each_with_index do |filename, i|
|
11
|
-
|
11
|
+
next if filename.blank?
|
12
|
+
|
13
|
+
path_to_file = parser.path_to_files(filename: filename)
|
12
14
|
|
13
15
|
parsed_metadata['file'][i] = path_to_file
|
14
16
|
end
|
17
|
+
parsed_metadata['file'].delete('')
|
18
|
+
|
15
19
|
raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
|
16
20
|
|
17
21
|
parsed_metadata['file']
|
@@ -12,8 +12,8 @@ module Bulkrax
|
|
12
12
|
raise CollectionsCreatedError unless collections_created?
|
13
13
|
@item = factory.run!
|
14
14
|
add_user_to_permission_templates! if self.class.to_s.include?("Collection")
|
15
|
-
parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
|
16
|
-
child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
|
15
|
+
parent_jobs if self.parsed_metadata[related_parents_parsed_mapping]&.join.present?
|
16
|
+
child_jobs if self.parsed_metadata[related_children_parsed_mapping]&.join.present?
|
17
17
|
end
|
18
18
|
rescue RSolr::Error::Http, CollectionsCreatedError => e
|
19
19
|
raise e
|
@@ -247,8 +247,6 @@ module Bulkrax
|
|
247
247
|
def write
|
248
248
|
write_files
|
249
249
|
zip
|
250
|
-
# uncomment next line to debug for faulty zipping during bagit export
|
251
|
-
bagit_zip_file_size_check if importerexporter.parser_klass.include?('Bagit')
|
252
250
|
end
|
253
251
|
|
254
252
|
def unzip(file_to_unzip)
|
@@ -262,30 +260,13 @@ module Bulkrax
|
|
262
260
|
end
|
263
261
|
|
264
262
|
def zip
|
265
|
-
FileUtils.
|
266
|
-
Zip::File.open(exporter_export_zip_path, create: true) do |zip_file|
|
267
|
-
Dir["#{exporter_export_path}/**/**"].each do |file|
|
268
|
-
zip_file.add(file.sub("#{exporter_export_path}/", ''), file)
|
269
|
-
end
|
270
|
-
end
|
271
|
-
end
|
263
|
+
FileUtils.mkdir_p(exporter_export_zip_path)
|
272
264
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
begin
|
279
|
-
raise BagitZipError, "Invalid Bag, file size mismatch for #{file.sub("#{exporter_export_path}/", '')}" if File.size(file) != zipped_file.size
|
280
|
-
rescue BagitZipError => e
|
281
|
-
matched_entry_ids = importerexporter.entry_ids.select do |id|
|
282
|
-
Bulkrax::Entry.find(id).identifier.include?(zipped_file.name.split('/').first)
|
283
|
-
end
|
284
|
-
matched_entry_ids.each do |entry_id|
|
285
|
-
Bulkrax::Entry.find(entry_id).status_info(e)
|
286
|
-
status_info('Complete (with failures)')
|
287
|
-
end
|
288
|
-
end
|
265
|
+
Dir["#{exporter_export_path}/**"].each do |folder|
|
266
|
+
zip_path = "#{exporter_export_zip_path.split('/').last}_#{folder.split('/').last}.zip"
|
267
|
+
Zip::File.open(File.join("#{exporter_export_zip_path}/#{zip_path}"), create: true) do |zip_file|
|
268
|
+
Dir["#{folder}/**/**"].each do |file|
|
269
|
+
zip_file.add(file.sub("#{folder}/", ''), file)
|
289
270
|
end
|
290
271
|
end
|
291
272
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Bulkrax
|
4
|
-
class BagitParser <
|
4
|
+
class BagitParser < CsvParser # rubocop:disable Metrics/ClassLength
|
5
5
|
include ExportBehavior
|
6
6
|
|
7
7
|
def self.export_supported?
|
@@ -20,12 +20,8 @@ module Bulkrax
|
|
20
20
|
rdf_format ? RdfEntry : CsvEntry
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
def file_set_entry_class
|
28
|
-
CsvFileSetEntry
|
23
|
+
def path_to_files(filename:)
|
24
|
+
@path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
|
29
25
|
end
|
30
26
|
|
31
27
|
# Take a random sample of 10 metadata_paths and work out the import fields from that
|
@@ -36,39 +32,41 @@ module Bulkrax
|
|
36
32
|
end.flatten.compact.uniq
|
37
33
|
end
|
38
34
|
|
39
|
-
#
|
40
|
-
# Create an Array of all metadata records, one per file
|
35
|
+
# Create an Array of all metadata records
|
41
36
|
def records(_opts = {})
|
42
37
|
raise StandardError, 'No BagIt records were found' if bags.blank?
|
43
38
|
@records ||= bags.map do |bag|
|
44
39
|
path = metadata_path(bag)
|
45
40
|
raise StandardError, 'No metadata files were found' if path.blank?
|
46
41
|
data = entry_class.read_data(path)
|
47
|
-
|
48
|
-
data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
|
49
|
-
data
|
42
|
+
get_data(bag, data)
|
50
43
|
end
|
44
|
+
|
45
|
+
@records = @records.flatten
|
51
46
|
end
|
52
47
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
|
66
|
-
ImportCollectionJob.perform_now(new_entry.id, current_run.id)
|
67
|
-
increment_counters(index, collection: true)
|
48
|
+
def get_data(bag, data)
|
49
|
+
if entry_class == CsvEntry
|
50
|
+
data = data.map do |data_row|
|
51
|
+
record_data = entry_class.data_for_entry(data_row, source_identifier, self)
|
52
|
+
next record_data if importerexporter.metadata_only?
|
53
|
+
|
54
|
+
record_data[:file] = bag.bag_files.join('|') if ::Hyrax.config.curation_concerns.include? record_data[:model]&.constantize
|
55
|
+
record_data
|
56
|
+
end
|
57
|
+
else
|
58
|
+
data = entry_class.data_for_entry(data, source_identifier, self)
|
59
|
+
data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
|
68
60
|
end
|
61
|
+
|
62
|
+
data
|
69
63
|
end
|
70
64
|
|
71
65
|
def create_works
|
66
|
+
entry_class == CsvEntry ? super : create_rdf_works
|
67
|
+
end
|
68
|
+
|
69
|
+
def create_rdf_works
|
72
70
|
records.each_with_index do |record, index|
|
73
71
|
next unless record_has_source_identifier(record, index)
|
74
72
|
break if limit_reached?(limit, index)
|
@@ -87,19 +85,6 @@ module Bulkrax
|
|
87
85
|
status_info(e)
|
88
86
|
end
|
89
87
|
|
90
|
-
def collections
|
91
|
-
records.map { |r| r[related_parents_parsed_mapping].split(/\s*[;|]\s*/) if r[related_parents_parsed_mapping].present? }.flatten.compact.uniq
|
92
|
-
end
|
93
|
-
|
94
|
-
def collections_total
|
95
|
-
collections.size
|
96
|
-
end
|
97
|
-
|
98
|
-
# TODO: change to differentiate between collection and work records when adding ability to import collection metadata
|
99
|
-
def works_total
|
100
|
-
total
|
101
|
-
end
|
102
|
-
|
103
88
|
def total
|
104
89
|
@total = importer.parser_fields['total'] || 0 if importer?
|
105
90
|
|
@@ -112,18 +97,6 @@ module Bulkrax
|
|
112
97
|
@total = 0
|
113
98
|
end
|
114
99
|
|
115
|
-
def extra_filters
|
116
|
-
output = ""
|
117
|
-
if importerexporter.start_date.present?
|
118
|
-
start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
|
119
|
-
finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
|
120
|
-
output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
|
121
|
-
end
|
122
|
-
output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
|
123
|
-
output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
|
124
|
-
output
|
125
|
-
end
|
126
|
-
|
127
100
|
def current_record_ids
|
128
101
|
@work_ids = []
|
129
102
|
@collection_ids = []
|
@@ -140,78 +113,39 @@ module Bulkrax
|
|
140
113
|
when 'importer'
|
141
114
|
set_ids_for_exporting_from_importer
|
142
115
|
end
|
143
|
-
@work_ids + @collection_ids + @file_set_ids
|
144
|
-
end
|
145
116
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
entry_ids = Importer.find(importerexporter.export_source).entries.pluck(:id)
|
150
|
-
complete_statuses = Status.latest_by_statusable
|
151
|
-
.includes(:statusable)
|
152
|
-
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
|
153
|
-
|
154
|
-
complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
|
155
|
-
extra_filters = extra_filters.presence || '*:*'
|
156
|
-
|
157
|
-
{ :@work_ids => ::Hyrax.config.curation_concerns, :@collection_ids => [::Collection], :@file_set_ids => [::FileSet] }.each do |instance_var, models_to_search|
|
158
|
-
instance_variable_set(instance_var, ActiveFedora::SolrService.post(
|
159
|
-
extra_filters.to_s,
|
160
|
-
fq: [
|
161
|
-
%(#{::Solrizer.solr_name(work_identifier)}:("#{complete_entry_identifiers.join('" OR "')}")),
|
162
|
-
"has_model_ssim:(#{models_to_search.join(' OR ')})"
|
163
|
-
],
|
164
|
-
fl: 'id',
|
165
|
-
rows: 2_000_000_000
|
166
|
-
)['response']['docs'].map { |obj| obj['id'] })
|
167
|
-
end
|
117
|
+
find_child_file_sets(@work_ids) if importerexporter.export_from == 'collection' || importerexporter.export_from == 'worktype'
|
118
|
+
|
119
|
+
@work_ids + @collection_ids + @file_set_ids
|
168
120
|
end
|
169
121
|
|
170
122
|
# export methods
|
171
123
|
|
172
|
-
def create_new_entries
|
173
|
-
current_record_ids.each_with_index do |id, index|
|
174
|
-
break if limit_reached?(limit, index)
|
175
|
-
|
176
|
-
this_entry_class = if @collection_ids.include?(id)
|
177
|
-
collection_entry_class
|
178
|
-
elsif @file_set_ids.include?(id)
|
179
|
-
file_set_entry_class
|
180
|
-
else
|
181
|
-
entry_class
|
182
|
-
end
|
183
|
-
new_entry = find_or_create_entry(this_entry_class, id, 'Bulkrax::Exporter')
|
184
|
-
|
185
|
-
begin
|
186
|
-
entry = ExportWorkJob.perform_now(new_entry.id, current_run.id)
|
187
|
-
rescue => e
|
188
|
-
Rails.logger.info("#{e.message} was detected during export")
|
189
|
-
end
|
190
|
-
|
191
|
-
self.headers |= entry.parsed_metadata.keys if entry
|
192
|
-
end
|
193
|
-
end
|
194
|
-
alias create_from_collection create_new_entries
|
195
|
-
alias create_from_importer create_new_entries
|
196
|
-
alias create_from_worktype create_new_entries
|
197
|
-
alias create_from_all create_new_entries
|
198
|
-
|
199
124
|
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
200
125
|
def write_files
|
201
126
|
require 'open-uri'
|
202
127
|
require 'socket'
|
128
|
+
|
129
|
+
folder_count = 1
|
130
|
+
records_in_folder = 0
|
131
|
+
|
203
132
|
importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].each do |entry|
|
204
|
-
|
205
|
-
next unless Hyrax.config.curation_concerns.include?(
|
206
|
-
|
133
|
+
record = ActiveFedora::Base.find(entry.identifier)
|
134
|
+
next unless Hyrax.config.curation_concerns.include?(record.class)
|
135
|
+
|
207
136
|
bag_entries = [entry]
|
137
|
+
file_set_entries = Bulkrax::CsvFileSetEntry.where(importerexporter_id: importerexporter.id).where("parsed_metadata LIKE '%#{record.id}%'")
|
138
|
+
file_set_entries.each { |fse| bag_entries << fse }
|
208
139
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
140
|
+
records_in_folder += bag_entries.count
|
141
|
+
if records_in_folder > records_split_count
|
142
|
+
folder_count += 1
|
143
|
+
records_in_folder = bag_entries.count
|
144
|
+
end
|
145
|
+
|
146
|
+
bag ||= BagIt::Bag.new setup_bagit_folder(folder_count, entry.identifier)
|
214
147
|
|
148
|
+
record.file_sets.each do |fs|
|
215
149
|
file_name = filename(fs)
|
216
150
|
next if file_name.blank?
|
217
151
|
io = open(fs.original_file.uri)
|
@@ -226,17 +160,21 @@ module Bulkrax
|
|
226
160
|
end
|
227
161
|
end
|
228
162
|
|
229
|
-
CSV.open(setup_csv_metadata_export_file(entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
|
163
|
+
CSV.open(setup_csv_metadata_export_file(folder_count, entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
|
230
164
|
bag_entries.each { |csv_entry| csv << csv_entry.parsed_metadata }
|
231
165
|
end
|
232
|
-
|
166
|
+
|
167
|
+
write_triples(folder_count, entry)
|
233
168
|
bag.manifest!(algo: 'sha256')
|
234
169
|
end
|
235
170
|
end
|
236
171
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
237
172
|
|
238
|
-
def setup_csv_metadata_export_file(id)
|
239
|
-
File.join(importerexporter.exporter_export_path,
|
173
|
+
def setup_csv_metadata_export_file(folder_count, id)
|
174
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
175
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
176
|
+
|
177
|
+
File.join(path, id, 'metadata.csv')
|
240
178
|
end
|
241
179
|
|
242
180
|
def key_allowed(key)
|
@@ -245,66 +183,31 @@ module Bulkrax
|
|
245
183
|
key != source_identifier.to_s
|
246
184
|
end
|
247
185
|
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
# we don't want access_control_id exported and we want file at the end
|
253
|
-
headers.delete('access_control_id') if headers.include?('access_control_id')
|
186
|
+
def setup_triple_metadata_export_file(folder_count, id)
|
187
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
188
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
254
189
|
|
255
|
-
|
256
|
-
headers.prepend('model')
|
257
|
-
headers.prepend(source_identifier.to_s)
|
258
|
-
headers.prepend('id')
|
259
|
-
|
260
|
-
headers.uniq
|
190
|
+
File.join(path, id, 'metadata.nt')
|
261
191
|
end
|
262
192
|
|
263
|
-
def
|
264
|
-
|
265
|
-
|
266
|
-
@object_names = mapping.values.map { |value| value['object'] }
|
267
|
-
@object_names.uniq!.delete(nil)
|
268
|
-
|
269
|
-
@object_names
|
270
|
-
end
|
271
|
-
|
272
|
-
def sort_headers(headers)
|
273
|
-
# converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
|
274
|
-
# while keeping objects grouped together
|
275
|
-
headers.sort_by do |item|
|
276
|
-
number = item.match(/\d+/)&.[](0) || 0.to_s
|
277
|
-
sort_number = number.rjust(4, "0")
|
278
|
-
object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
|
279
|
-
remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
|
280
|
-
"#{object_prefix}_#{sort_number}_#{remainder}"
|
281
|
-
end
|
282
|
-
end
|
193
|
+
def setup_bagit_folder(folder_count, id)
|
194
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
195
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
283
196
|
|
284
|
-
|
285
|
-
File.join(importerexporter.exporter_export_path, id, 'metadata.nt')
|
197
|
+
File.join(path, id)
|
286
198
|
end
|
287
199
|
|
288
|
-
def
|
289
|
-
File.join(importerexporter.exporter_export_path, id)
|
290
|
-
end
|
291
|
-
|
292
|
-
def write_triples(e)
|
200
|
+
def write_triples(folder_count, e)
|
293
201
|
sd = SolrDocument.find(e.identifier)
|
294
202
|
return if sd.nil?
|
295
203
|
|
296
204
|
req = ActionDispatch::Request.new({ 'HTTP_HOST' => Socket.gethostname })
|
297
205
|
rdf = Hyrax::GraphExporter.new(sd, req).fetch.dump(:ntriples)
|
298
|
-
File.open(setup_triple_metadata_export_file(e.identifier), "w") do |triples|
|
206
|
+
File.open(setup_triple_metadata_export_file(folder_count, e.identifier), "w") do |triples|
|
299
207
|
triples.write(rdf)
|
300
208
|
end
|
301
209
|
end
|
302
210
|
|
303
|
-
def required_elements?(keys)
|
304
|
-
return if keys.blank?
|
305
|
-
!required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
|
306
|
-
end
|
307
|
-
|
308
211
|
# @todo - investigate getting directory structure
|
309
212
|
# @todo - investigate using perform_later, and having the importer check for
|
310
213
|
# DownloadCloudFileJob before it starts
|
@@ -355,5 +258,11 @@ module Bulkrax
|
|
355
258
|
return nil unless bag.valid?
|
356
259
|
bag
|
357
260
|
end
|
261
|
+
|
262
|
+
# use the version of this method from the application parser instead
|
263
|
+
def real_import_file_path
|
264
|
+
return importer_unzip_path if file? && zip?
|
265
|
+
parser_fields['import_file_path']
|
266
|
+
end
|
358
267
|
end
|
359
268
|
end
|
@@ -4,6 +4,7 @@ require 'csv'
|
|
4
4
|
module Bulkrax
|
5
5
|
class CsvParser < ApplicationParser # rubocop:disable Metrics/ClassLength
|
6
6
|
include ErroredEntries
|
7
|
+
include ExportBehavior
|
7
8
|
attr_writer :collections, :file_sets, :works
|
8
9
|
|
9
10
|
def self.export_supported?
|
@@ -207,6 +208,13 @@ module Bulkrax
|
|
207
208
|
@work_ids + @collection_ids + @file_set_ids
|
208
209
|
end
|
209
210
|
|
211
|
+
# find the related file set ids so entries can be made for export
|
212
|
+
def find_child_file_sets(work_ids)
|
213
|
+
work_ids.each do |id|
|
214
|
+
ActiveFedora::Base.find(id).file_set_ids.each { |fs_id| @file_set_ids << fs_id }
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
210
218
|
# Set the following instance variables: @work_ids, @collection_ids, @file_set_ids
|
211
219
|
# @see #current_record_ids
|
212
220
|
def set_ids_for_exporting_from_importer
|
@@ -272,8 +280,8 @@ module Bulkrax
|
|
272
280
|
CsvFileSetEntry
|
273
281
|
end
|
274
282
|
|
275
|
-
#
|
276
|
-
#
|
283
|
+
# TODO: figure out why using the version of this method that's in the bagit parser
|
284
|
+
# breaks specs for the "if importer?" line
|
277
285
|
def total
|
278
286
|
@total = importer.parser_fields['total'] || 0 if importer?
|
279
287
|
@total = limit || current_record_ids.count if exporter?
|
@@ -283,6 +291,10 @@ module Bulkrax
|
|
283
291
|
@total = 0
|
284
292
|
end
|
285
293
|
|
294
|
+
def records_split_count
|
295
|
+
1000
|
296
|
+
end
|
297
|
+
|
286
298
|
# @todo - investigate getting directory structure
|
287
299
|
# @todo - investigate using perform_later, and having the importer check for
|
288
300
|
# DownloadCloudFileJob before it starts
|
@@ -307,9 +319,37 @@ module Bulkrax
|
|
307
319
|
# export methods
|
308
320
|
|
309
321
|
def write_files
|
310
|
-
|
311
|
-
|
312
|
-
|
322
|
+
require 'open-uri'
|
323
|
+
folder_count = 0
|
324
|
+
|
325
|
+
importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].in_groups_of(records_split_count, false) do |group|
|
326
|
+
folder_count += 1
|
327
|
+
|
328
|
+
CSV.open(setup_export_file(folder_count), "w", headers: export_headers, write_headers: true) do |csv|
|
329
|
+
group.each do |entry|
|
330
|
+
csv << entry.parsed_metadata
|
331
|
+
next if importerexporter.metadata_only? || entry.type == 'Bulkrax::CsvCollectionEntry'
|
332
|
+
|
333
|
+
store_files(entry.identifier, folder_count.to_s)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
339
|
+
def store_files(identifier, folder_count)
|
340
|
+
record = ActiveFedora::Base.find(identifier)
|
341
|
+
file_sets = record.file_set? ? Array.wrap(record) : record.file_sets
|
342
|
+
file_sets << record.thumbnail if exporter.include_thumbnails && record.thumbnail.present? && record.work?
|
343
|
+
file_sets.each do |fs|
|
344
|
+
path = File.join(exporter_export_path, folder_count, 'files')
|
345
|
+
FileUtils.mkdir_p(path) unless File.exist? path
|
346
|
+
file = filename(fs)
|
347
|
+
io = open(fs.original_file.uri)
|
348
|
+
next if file.blank?
|
349
|
+
|
350
|
+
File.open(File.join(path, file), 'wb') do |f|
|
351
|
+
f.write(io.read)
|
352
|
+
f.close
|
313
353
|
end
|
314
354
|
end
|
315
355
|
end
|
@@ -356,8 +396,11 @@ module Bulkrax
|
|
356
396
|
end
|
357
397
|
|
358
398
|
# in the parser as it is specific to the format
|
359
|
-
def setup_export_file
|
360
|
-
File.join(importerexporter.exporter_export_path,
|
399
|
+
def setup_export_file(folder_count)
|
400
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
401
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
402
|
+
|
403
|
+
File.join(path, "export_#{importerexporter.export_source}_from_#{importerexporter.export_from}_#{folder_count}.csv")
|
361
404
|
end
|
362
405
|
|
363
406
|
# Retrieve file paths for [:file] mapping in records
|
@@ -382,10 +425,11 @@ module Bulkrax
|
|
382
425
|
end
|
383
426
|
|
384
427
|
# Retrieve the path where we expect to find the files
|
385
|
-
def path_to_files
|
428
|
+
def path_to_files(**args)
|
429
|
+
filename = args.fetch(:filename, '')
|
430
|
+
|
386
431
|
@path_to_files ||= File.join(
|
387
|
-
zip? ? importer_unzip_path : File.dirname(import_file_path),
|
388
|
-
'files'
|
432
|
+
zip? ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
|
389
433
|
)
|
390
434
|
end
|
391
435
|
|
@@ -29,6 +29,7 @@
|
|
29
29
|
|
30
30
|
<%= form.input :export_source_importer,
|
31
31
|
label: t('bulkrax.exporter.labels.importer'),
|
32
|
+
# required: true,
|
32
33
|
prompt: 'Select from the list',
|
33
34
|
label_html: { class: 'importer export-source-option hidden' },
|
34
35
|
input_html: { class: 'importer export-source-option hidden' },
|
@@ -37,6 +38,7 @@
|
|
37
38
|
<%= form.input :export_source_collection,
|
38
39
|
prompt: 'Start typing ...',
|
39
40
|
label: t('bulkrax.exporter.labels.collection'),
|
41
|
+
# required: true,
|
40
42
|
placeholder: @collection&.title&.first,
|
41
43
|
label_html: { class: 'collection export-source-option hidden' },
|
42
44
|
input_html: {
|
@@ -50,6 +52,7 @@
|
|
50
52
|
|
51
53
|
<%= form.input :export_source_worktype,
|
52
54
|
label: t('bulkrax.exporter.labels.worktype'),
|
55
|
+
# required: true,
|
53
56
|
prompt: 'Select from the list',
|
54
57
|
label_html: { class: 'worktype export-source-option hidden' },
|
55
58
|
input_html: { class: 'worktype export-source-option hidden' },
|
@@ -21,7 +21,7 @@
|
|
21
21
|
<th scope="col">Name</th>
|
22
22
|
<th scope="col">Status</th>
|
23
23
|
<th scope="col">Date Exported</th>
|
24
|
-
<th scope="col"
|
24
|
+
<th scope="col">Downloadable Files</th>
|
25
25
|
<th scope="col"></th>
|
26
26
|
<th scope="col"></th>
|
27
27
|
<th scope="col"></th>
|
@@ -35,7 +35,10 @@
|
|
35
35
|
<td><%= exporter.created_at %></td>
|
36
36
|
<td>
|
37
37
|
<% if File.exist?(exporter.exporter_export_zip_path) %>
|
38
|
-
<%=
|
38
|
+
<%= simple_form_for(exporter, method: :get, url: exporter_download_path(exporter)) do |form| %>
|
39
|
+
<%= render 'downloads', exporter: exporter, form: form %>
|
40
|
+
<%= form.button :submit, value: 'Download', data: { disable_with: false } %>
|
41
|
+
<% end %>
|
39
42
|
<% end%>
|
40
43
|
</td>
|
41
44
|
<td><%= link_to raw('<span class="glyphicon glyphicon-info-sign"></span>'), exporter_path(exporter) %></td>
|
@@ -8,10 +8,11 @@
|
|
8
8
|
<div class='panel-body'>
|
9
9
|
|
10
10
|
<% if File.exist?(@exporter.exporter_export_zip_path) %>
|
11
|
-
|
11
|
+
<%= simple_form_for @exporter, method: :get, url: exporter_download_path(@exporter), html: { class: 'form-inline bulkrax-p-align' } do |form| %>
|
12
12
|
<strong>Download:</strong>
|
13
|
-
<%=
|
14
|
-
|
13
|
+
<%= render 'downloads', exporter: @exporter, form: form %>
|
14
|
+
<%= form.button :submit, value: 'Download', data: { disable_with: false } %>
|
15
|
+
<% end %>
|
15
16
|
<% end %>
|
16
17
|
|
17
18
|
<p class='bulkrax-p-align'>
|
@@ -135,10 +136,6 @@
|
|
135
136
|
<%= page_entries_info(@work_entries) %><br>
|
136
137
|
<%= paginate(@work_entries, param_name: :work_entries_page) %>
|
137
138
|
<br>
|
138
|
-
<% if File.exist?(@exporter.exporter_export_zip_path) %>
|
139
|
-
<%= link_to 'Download', exporter_download_path(@exporter) %>
|
140
|
-
|
|
141
|
-
<% end %>
|
142
139
|
<%= link_to 'Edit', edit_exporter_path(@exporter) %>
|
143
140
|
|
|
144
141
|
<%= link_to 'Back', exporters_path %>
|
data/lib/bulkrax/version.rb
CHANGED
@@ -1,6 +1,30 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
namespace :bulkrax do
|
4
|
+
desc "Remove old exported zips and create new ones with the new file structure"
|
5
|
+
task rerun_all_exporters: :environment do
|
6
|
+
if defined?(::Hyku)
|
7
|
+
Account.find_each do |account|
|
8
|
+
puts "=============== updating #{account.name} ============"
|
9
|
+
next if account.name == "search"
|
10
|
+
switch!(account)
|
11
|
+
|
12
|
+
rerun_exporters_and_delete_zips
|
13
|
+
|
14
|
+
puts "=============== finished updating #{account.name} ============"
|
15
|
+
end
|
16
|
+
else
|
17
|
+
rerun_exporters_and_delete_zips
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def rerun_exporters_and_delete_zips
|
22
|
+
begin
|
23
|
+
Bulkrax::Exporter.all.each { |e| Bulkrax::ExporterJob.perform_later(e.id) }
|
24
|
+
rescue => e
|
25
|
+
puts "(#{e.message})"
|
26
|
+
end
|
27
|
+
|
28
|
+
Dir["tmp/exports/**.zip"].each { |zip_path| FileUtils.rm_rf(zip_path) }
|
29
|
+
end
|
30
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulkrax
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -331,6 +331,7 @@ files:
|
|
331
331
|
- app/views/bulkrax/entries/_parsed_metadata.html.erb
|
332
332
|
- app/views/bulkrax/entries/_raw_metadata.html.erb
|
333
333
|
- app/views/bulkrax/entries/show.html.erb
|
334
|
+
- app/views/bulkrax/exporters/_downloads.html.erb
|
334
335
|
- app/views/bulkrax/exporters/_form.html.erb
|
335
336
|
- app/views/bulkrax/exporters/edit.html.erb
|
336
337
|
- app/views/bulkrax/exporters/index.html.erb
|
@@ -404,7 +405,7 @@ homepage: https://github.com/samvera-labs/bulkrax
|
|
404
405
|
licenses:
|
405
406
|
- Apache-2.0
|
406
407
|
metadata: {}
|
407
|
-
post_install_message:
|
408
|
+
post_install_message:
|
408
409
|
rdoc_options: []
|
409
410
|
require_paths:
|
410
411
|
- lib
|
@@ -419,8 +420,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
419
420
|
- !ruby/object:Gem::Version
|
420
421
|
version: '0'
|
421
422
|
requirements: []
|
422
|
-
rubygems_version: 3.
|
423
|
-
signing_key:
|
423
|
+
rubygems_version: 3.0.3
|
424
|
+
signing_key:
|
424
425
|
specification_version: 4
|
425
426
|
summary: Import and export tool for Hyrax and Hyku
|
426
427
|
test_files: []
|