bulkrax 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +205 -0
- data/README.md +202 -0
- data/Rakefile +42 -0
- data/app/assets/config/bulkrax_manifest.js +2 -0
- data/app/assets/javascripts/bulkrax/application.js +14 -0
- data/app/assets/javascripts/bulkrax/bulkrax.js +11 -0
- data/app/assets/javascripts/bulkrax/entries.js +15 -0
- data/app/assets/javascripts/bulkrax/exporters.js +60 -0
- data/app/assets/javascripts/bulkrax/importers.js.erb +166 -0
- data/app/assets/stylesheets/bulkrax/accordion.scss +40 -0
- data/app/assets/stylesheets/bulkrax/application.css +15 -0
- data/app/assets/stylesheets/bulkrax/coderay.scss +264 -0
- data/app/assets/stylesheets/bulkrax/import_export.scss +37 -0
- data/app/controllers/bulkrax/application_controller.rb +8 -0
- data/app/controllers/bulkrax/entries_controller.rb +44 -0
- data/app/controllers/bulkrax/exporters_controller.rb +125 -0
- data/app/controllers/bulkrax/importers_controller.rb +315 -0
- data/app/controllers/concerns/bulkrax/api.rb +29 -0
- data/app/factories/bulkrax/object_factory.rb +230 -0
- data/app/helpers/bulkrax/application_helper.rb +15 -0
- data/app/helpers/bulkrax/exporters_helper.rb +6 -0
- data/app/helpers/bulkrax/importers_helper.rb +13 -0
- data/app/helpers/bulkrax/validation_helper.rb +153 -0
- data/app/jobs/bulkrax/application_job.rb +6 -0
- data/app/jobs/bulkrax/child_relationships_job.rb +128 -0
- data/app/jobs/bulkrax/delete_work_job.rb +16 -0
- data/app/jobs/bulkrax/download_cloud_file_job.rb +18 -0
- data/app/jobs/bulkrax/export_work_job.rb +37 -0
- data/app/jobs/bulkrax/exporter_job.rb +14 -0
- data/app/jobs/bulkrax/import_work_collection_job.rb +41 -0
- data/app/jobs/bulkrax/import_work_job.rb +32 -0
- data/app/jobs/bulkrax/importer_job.rb +26 -0
- data/app/mailers/bulkrax/application_mailer.rb +8 -0
- data/app/matchers/bulkrax/application_matcher.rb +113 -0
- data/app/matchers/bulkrax/bagit_matcher.rb +6 -0
- data/app/matchers/bulkrax/csv_matcher.rb +6 -0
- data/app/matchers/bulkrax/oai_matcher.rb +6 -0
- data/app/models/bulkrax/application_record.rb +7 -0
- data/app/models/bulkrax/csv_collection_entry.rb +19 -0
- data/app/models/bulkrax/csv_entry.rb +163 -0
- data/app/models/bulkrax/entry.rb +104 -0
- data/app/models/bulkrax/exporter.rb +122 -0
- data/app/models/bulkrax/exporter_run.rb +7 -0
- data/app/models/bulkrax/import_failed.rb +13 -0
- data/app/models/bulkrax/importer.rb +155 -0
- data/app/models/bulkrax/importer_run.rb +8 -0
- data/app/models/bulkrax/oai_dc_entry.rb +6 -0
- data/app/models/bulkrax/oai_entry.rb +74 -0
- data/app/models/bulkrax/oai_qualified_dc_entry.rb +6 -0
- data/app/models/bulkrax/oai_set_entry.rb +19 -0
- data/app/models/bulkrax/rdf_collection_entry.rb +19 -0
- data/app/models/bulkrax/rdf_entry.rb +90 -0
- data/app/models/bulkrax/status.rb +25 -0
- data/app/models/bulkrax/xml_entry.rb +73 -0
- data/app/models/concerns/bulkrax/download_behavior.rb +61 -0
- data/app/models/concerns/bulkrax/errored_entries.rb +45 -0
- data/app/models/concerns/bulkrax/export_behavior.rb +58 -0
- data/app/models/concerns/bulkrax/file_factory.rb +140 -0
- data/app/models/concerns/bulkrax/has_local_processing.rb +7 -0
- data/app/models/concerns/bulkrax/has_matchers.rb +155 -0
- data/app/models/concerns/bulkrax/import_behavior.rb +90 -0
- data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +34 -0
- data/app/models/concerns/bulkrax/status_info.rb +56 -0
- data/app/parsers/bulkrax/application_parser.rb +299 -0
- data/app/parsers/bulkrax/bagit_parser.rb +157 -0
- data/app/parsers/bulkrax/csv_parser.rb +266 -0
- data/app/parsers/bulkrax/oai_dc_parser.rb +130 -0
- data/app/parsers/bulkrax/oai_qualified_dc_parser.rb +9 -0
- data/app/parsers/bulkrax/xml_parser.rb +103 -0
- data/app/views/bulkrax/entries/_parsed_metadata.html.erb +19 -0
- data/app/views/bulkrax/entries/_raw_metadata.html.erb +19 -0
- data/app/views/bulkrax/entries/show.html.erb +63 -0
- data/app/views/bulkrax/exporters/_form.html.erb +120 -0
- data/app/views/bulkrax/exporters/edit.html.erb +23 -0
- data/app/views/bulkrax/exporters/index.html.erb +67 -0
- data/app/views/bulkrax/exporters/new.html.erb +23 -0
- data/app/views/bulkrax/exporters/show.html.erb +124 -0
- data/app/views/bulkrax/importers/_bagit_fields.html.erb +54 -0
- data/app/views/bulkrax/importers/_browse_everything.html.erb +12 -0
- data/app/views/bulkrax/importers/_csv_fields.html.erb +39 -0
- data/app/views/bulkrax/importers/_edit_form_buttons.html.erb +16 -0
- data/app/views/bulkrax/importers/_form.html.erb +35 -0
- data/app/views/bulkrax/importers/_oai_fields.html.erb +42 -0
- data/app/views/bulkrax/importers/_xml_fields.html.erb +60 -0
- data/app/views/bulkrax/importers/edit.html.erb +20 -0
- data/app/views/bulkrax/importers/index.html.erb +77 -0
- data/app/views/bulkrax/importers/new.html.erb +25 -0
- data/app/views/bulkrax/importers/show.html.erb +175 -0
- data/app/views/bulkrax/importers/upload_corrected_entries.html.erb +37 -0
- data/app/views/bulkrax/shared/_bulkrax_errors.html.erb +52 -0
- data/app/views/bulkrax/shared/_bulkrax_field_mapping.html.erb +39 -0
- data/app/views/hyrax/dashboard/sidebar/_bulkrax_sidebar_additions.html.erb +6 -0
- data/app/views/hyrax/dashboard/sidebar/_repository_content.html.erb +19 -0
- data/app/views/layouts/bulkrax/application.html.erb +14 -0
- data/config/locales/bulkrax.en.yml +36 -0
- data/config/routes.rb +18 -0
- data/db/migrate/20181011230201_create_bulkrax_importers.rb +18 -0
- data/db/migrate/20181011230228_create_bulkrax_importer_runs.rb +16 -0
- data/db/migrate/20190325183136_create_bulkrax_entries.rb +16 -0
- data/db/migrate/20190601221109_add_status_to_entry.rb +9 -0
- data/db/migrate/20190715161939_add_collections_to_importer_runs.rb +6 -0
- data/db/migrate/20190715162044_change_collection_ids_on_entries.rb +5 -0
- data/db/migrate/20190729124607_create_bulkrax_exporters.rb +19 -0
- data/db/migrate/20190729134158_create_bulkrax_exporter_runs.rb +14 -0
- data/db/migrate/20190731114016_change_importer_and_exporter_to_polymorphic.rb +12 -0
- data/db/migrate/20191203225129_add_total_collection_records_to_importer_runs.rb +5 -0
- data/db/migrate/20191204191623_add_children_to_importer_runs.rb +6 -0
- data/db/migrate/20191204223857_change_total_records_to_total_work_entries.rb +6 -0
- data/db/migrate/20191212155530_change_entry_last_error.rb +19 -0
- data/db/migrate/20200108194557_add_validate_only_to_bulkrax_importers.rb +5 -0
- data/db/migrate/20200301232856_add_status_to_importers.rb +9 -0
- data/db/migrate/20200312190638_remove_foreign_key_from_bulkrax_entries.rb +5 -0
- data/db/migrate/20200326235838_add_status_to_exporters.rb +7 -0
- data/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb +5 -0
- data/db/migrate/20200818055819_create_bulkrax_statuses.rb +18 -0
- data/db/migrate/20200819054016_move_to_statuses.rb +30 -0
- data/db/migrate/20201106014204_add_date_filter_and_status_to_bulkrax_exporters.rb +7 -0
- data/db/migrate/20201117220007_add_workflow_status_to_bulkrax_exporter.rb +5 -0
- data/db/migrate/20210806044408_remove_unused_last_error.rb +7 -0
- data/db/migrate/20210806065737_increase_text_sizes.rb +12 -0
- data/lib/bulkrax.rb +161 -0
- data/lib/bulkrax/engine.rb +37 -0
- data/lib/bulkrax/version.rb +5 -0
- data/lib/generators/bulkrax/install_generator.rb +80 -0
- data/lib/generators/bulkrax/templates/README +3 -0
- data/lib/generators/bulkrax/templates/app/assets/images/bulkrax/removed.png +0 -0
- data/lib/generators/bulkrax/templates/app/models/concerns/bulkrax/has_local_processing.rb +8 -0
- data/lib/generators/bulkrax/templates/bin/importer +140 -0
- data/lib/generators/bulkrax/templates/config/bulkrax_api.yml +84 -0
- data/lib/generators/bulkrax/templates/config/initializers/bulkrax.rb +72 -0
- data/lib/tasks/bulkrax_tasks.rake +6 -0
- metadata +388 -0
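
The three file diffs below correspond to `data/app/parsers/bulkrax/bagit_parser.rb` (+157), `data/app/parsers/bulkrax/csv_parser.rb` (+266), and `data/app/parsers/bulkrax/oai_dc_parser.rb` (+130) in the list above. For orientation, here is a minimal, hypothetical setup sketch; it is not part of this diff, and the generator invocation is inferred from `data/lib/generators/bulkrax/install_generator.rb` above.

```ruby
# Hypothetical setup sketch, not part of this diff.
# In the host application's Gemfile:
gem 'bulkrax', '1.0.0'

# Then, from the application root (generator and migrations
# correspond to files in the list above; commands are assumed):
#   bundle install
#   rails generate bulkrax:install
#   rails db:migrate
```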
data/app/parsers/bulkrax/bagit_parser.rb
@@ -0,0 +1,157 @@
+# frozen_string_literal: true
+
+module Bulkrax
+  class BagitParser < ApplicationParser
+    def self.export_supported?
+      false # @todo will be supported
+    end
+
+    def valid_import?
+      return true if import_fields.present?
+    rescue => e
+      status_info(e)
+      false
+    end
+
+    def entry_class
+      parser_fields['metadata_format'].constantize
+    end
+
+    def collection_entry_class
+      parser_fields['metadata_format'].gsub('Entry', 'CollectionEntry').constantize
+    rescue
+      Entry
+    end
+
+    # Take a random sample of 10 metadata_paths and work out the import fields from that
+    def import_fields
+      raise StandardError, 'No metadata files were found' if metadata_paths.blank?
+      @import_fields ||= metadata_paths.sample(10).map do |path|
+        entry_class.fields_from_data(entry_class.read_data(path))
+      end.flatten.compact.uniq
+    end
+
+    # Assume a single metadata record per path
+    # Create an Array of all metadata records, one per file
+    def records(_opts = {})
+      raise StandardError, 'No BagIt records were found' if bags.blank?
+      @records ||= bags.map do |bag|
+        path = metadata_path(bag)
+        raise StandardError, 'No metadata files were found' if path.blank?
+        data = entry_class.read_data(path)
+        data = entry_class.data_for_entry(data, source_identifier)
+        data[:file] = bag.bag_files.join('|')
+        data
+      end
+    end
+
+    # Find or create collections referenced by works
+    # If the import data also contains records for these works, they will be updated
+    # during create works
+    def create_collections
+      collections.each_with_index do |collection, index|
+        next if collection.blank?
+        metadata = {
+          title: [collection],
+          work_identifier => [collection],
+          visibility: 'open',
+          collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
+        }
+        new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
+        ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
+        increment_counters(index, true)
+      end
+    end
+
+    def create_works
+      records.each_with_index do |record, index|
+        next unless record_has_source_identifier(record, index)
+        break if limit_reached?(limit, index)
+
+        seen[record[source_identifier]] = true
+        new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record)
+        if record[:delete].present?
+          DeleteWorkJob.send(perform_method, new_entry, current_run)
+        else
+          ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
+        end
+        increment_counters(index)
+      end
+      importer.record_status
+    rescue StandardError => e
+      status_info(e)
+    end
+
+    def collections
+      records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
+    end
+
+    def collections_total
+      collections.size
+    end
+
+    def total
+      metadata_paths.count
+    end
+
+    def required_elements?(keys)
+      return if keys.blank?
+      !required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
+    end
+
+    # @todo - investigate getting directory structure
+    # @todo - investigate using perform_later, and having the importer check for
+    # DownloadCloudFileJob before it starts
+    def retrieve_cloud_files(files)
+      # There should only be one zip file for Bagit, take the first
+      return if files['0'].blank?
+      target_file = File.join(path_for_import, files['0']['file_name'].tr(' ', '_'))
+      # Now because we want the files in place before the importer runs
+      Bulkrax::DownloadCloudFileJob.perform_now(files['0'], target_file)
+      return target_file
+    end
+
+    private
+
+    def bags
+      return @bags if @bags.present?
+      new_bag = bag(import_file_path)
+      @bags = if new_bag
+                [new_bag]
+              else
+                Dir.glob("#{import_file_path}/**/*").map { |d| bag(d) }
+              end
+      @bags.delete(nil)
+      raise StandardError, 'No valid bags found' if @bags.blank?
+      return @bags
+    end
+
+    # Gather the paths to all bags; skip any stray files
+    def bag_paths
+      bags.map(&:bag_dir)
+    end
+
+    def metadata_file_name
+      raise StandardError, 'The metadata file name must be specified' if parser_fields['metadata_file_name'].blank?
+      parser_fields['metadata_file_name']
+    end
+
+    # Gather the paths to all metadata files matching the metadata_file_name
+    def metadata_paths
+      @metadata_paths ||= bag_paths.map do |b|
+        Dir.glob("#{b}/**/*").select { |f| File.file?(f) && f.ends_with?(metadata_file_name) }
+      end.flatten.compact
+    end
+
+    def metadata_path(bag)
+      Dir.glob("#{bag.bag_dir}/**/*").detect { |f| File.file?(f) && f.ends_with?(metadata_file_name) }
+    end
+
+    def bag(path)
+      return nil unless path && File.exist?(File.join(path, 'bagit.txt'))
+      bag = BagIt::Bag.new(path)
+      return nil unless bag.valid?
+      bag
+    end
+  end
+end
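
A hedged usage sketch, not taken from this diff: the parser above is driven entirely by `parser_fields`. The three keys shown below are the ones the code actually reads (`#bags` via `import_file_path`, `#entry_class` via `metadata_format`, `#metadata_paths` via `metadata_file_name`); the remaining attribute values, and the job call at the end, are illustrative assumptions.

```ruby
# Hypothetical sketch: an importer configured for BagitParser.
# Only the parser_fields keys come from the code above.
importer = Bulkrax::Importer.new(
  name: 'BagIt sample import',
  user: User.first,                  # any user permitted to import
  frequency: 'PT0S',                 # assumed: ISO 8601 duration, run once
  parser_klass: 'Bulkrax::BagitParser',
  parser_fields: {
    'import_file_path'   => '/data/bags',        # one bag, or a directory of bags
    'metadata_format'    => 'Bulkrax::CsvEntry', # constantized by #entry_class
    'metadata_file_name' => 'metadata.csv'       # matched by #metadata_paths
  }
)
# ImporterJob is listed above; its exact signature is assumed here.
importer.save && Bulkrax::ImporterJob.perform_later(importer.id)
```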
data/app/parsers/bulkrax/csv_parser.rb
@@ -0,0 +1,266 @@
+# frozen_string_literal: true
+
+require 'csv'
+module Bulkrax
+  class CsvParser < ApplicationParser
+    include ErroredEntries
+    def self.export_supported?
+      true
+    end
+
+    def initialize(importerexporter)
+      @importerexporter = importerexporter
+    end
+
+    def collections
+      # does the CSV contain a collection column?
+      return [] unless import_fields.include?(:collection)
+      # retrieve a list of unique collections
+      records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
+    end
+
+    def collections_total
+      collections.size
+    end
+
+    def records(_opts = {})
+      file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
+      # data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
+      @records ||= entry_class.read_data(file_for_import).map { |record_data| entry_class.data_for_entry(record_data, nil) }
+    end
+
+    # We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data
+    def import_fields
+      @import_fields ||= records.inject(:merge).keys.compact.uniq
+    end
+
+    def required_elements?(keys)
+      return if keys.blank?
+      missing_elements(keys).blank?
+    end
+
+    def missing_elements(keys)
+      required_elements.map(&:to_s) - keys.map(&:to_s)
+    end
+
+    def valid_import?
+      error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(import_fields).join(', ')}"
+      raise StandardError, error_alert unless required_elements?(import_fields)
+
+      file_paths.is_a?(Array)
+    rescue StandardError => e
+      status_info(e)
+      false
+    end
+
+    def create_collections
+      collections.each_with_index do |collection, index|
+        next if collection.blank?
+        metadata = {
+          title: [collection],
+          work_identifier => [collection],
+          visibility: 'open',
+          collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
+        }
+        new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
+        ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
+        increment_counters(index, true)
+      end
+    end
+
+    def create_works
+      records.each_with_index do |record, index|
+        next unless record_has_source_identifier(record, index)
+        break if limit_reached?(limit, index)
+
+        seen[record[source_identifier]] = true
+        new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record.to_h.compact)
+        if record[:delete].present?
+          DeleteWorkJob.send(perform_method, new_entry, current_run)
+        else
+          ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
+        end
+        increment_counters(index)
+      end
+      importer.record_status
+    rescue StandardError => e
+      status_info(e)
+    end
+
+    def write_partial_import_file(file)
+      import_filename = import_file_path.split('/').last
+      partial_import_filename = "#{File.basename(import_filename, '.csv')}_corrected_entries.csv"
+
+      path = File.join(path_for_import, partial_import_filename)
+      FileUtils.mv(
+        file.path,
+        path
+      )
+      path
+    end
+
+    def create_parent_child_relationships
+      super
+    end
+
+    def extra_filters
+      output = ""
+      if importerexporter.start_date.present?
+        start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
+        finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
+        output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
+      end
+      output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
+      output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
+      output
+    end
+
+    def current_work_ids
+      case importerexporter.export_from
+      when 'collection'
+        ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
+      when 'worktype'
+        ActiveFedora::SolrService.query("has_model_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
+      when 'importer'
+        entry_ids = Bulkrax::Importer.find(importerexporter.export_source).entries.pluck(:id)
+        complete_statuses = Bulkrax::Status.latest_by_statusable
+                                           .includes(:statusable)
+                                           .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
+        complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier }
+
+        ActiveFedora::SolrService.query("#{work_identifier}_tesim:(#{complete_entry_identifiers.join(' OR ')})#{extra_filters}", rows: 2_000_000_000).map(&:id)
+      end
+    end
+
+    def create_new_entries
+      current_work_ids.each_with_index do |wid, index|
+        break if limit_reached?(limit, index)
+        new_entry = find_or_create_entry(entry_class, wid, 'Bulkrax::Exporter')
+        Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
+      end
+    end
+    alias create_from_collection create_new_entries
+    alias create_from_importer create_new_entries
+    alias create_from_worktype create_new_entries
+
+    def entry_class
+      CsvEntry
+    end
+
+    def collection_entry_class
+      CsvCollectionEntry
+    end
+
+    # See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
+    # Changed to grep as wc -l counts blank lines, and ignores the final unescaped line (which may or may not contain data)
+    def total
+      if importer?
+        return @total if @total&.positive?
+        # windows encoded
+        @total = `grep -c ^M #{real_import_file_path}`.to_i - 1
+        # unix encoded
+        @total = `grep -vc ^$ #{real_import_file_path}`.to_i - 1 if @total < 1
+      elsif exporter?
+        @total = importerexporter.entries.count
+      else
+        @total = 0
+      end
+      return @total
+    rescue StandardError
+      @total = 0
+    end
+
+    # @todo - investigate getting directory structure
+    # @todo - investigate using perform_later, and having the importer check for
+    # DownloadCloudFileJob before it starts
+    def retrieve_cloud_files(files)
+      files_path = File.join(path_for_import, 'files')
+      FileUtils.mkdir_p(files_path) unless File.exist?(files_path)
+      files.each_pair do |_key, file|
+        # fixes bug where auth headers do not get attached properly
+        if file['auth_header'].present?
+          file['headers'] ||= {}
+          file['headers'].merge!(file['auth_header'])
+        end
+        # this only works for uniquely named files
+        target_file = File.join(files_path, file['file_name'].tr(' ', '_'))
+        # Now because we want the files in place before the importer runs
+        # Problematic for a large upload
+        Bulkrax::DownloadCloudFileJob.perform_now(file, target_file)
+      end
+      return nil
+    end
+
+    # export methods
+
+    def write_files
+      CSV.open(setup_export_file, "w", headers: export_headers, write_headers: true) do |csv|
+        importerexporter.entries.where(identifier: current_work_ids)[0..limit || total].each do |e|
+          csv << e.parsed_metadata
+        end
+      end
+    end
+
+    def key_allowed(key)
+      !Bulkrax.reserved_properties.include?(key) &&
+        new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
+        key != source_identifier.to_s
+    end
+
+    # All possible column names
+    def export_headers
+      headers = ['id']
+      headers << source_identifier.to_s
+      headers << 'model'
+      importerexporter.mapping.each_key { |key| headers << key if key_allowed(key) }
+      headers << 'file'
+      headers.uniq
+    end
+
+    # in the parser as it is specific to the format
+    def setup_export_file
+      File.join(importerexporter.exporter_export_path, 'export.csv')
+    end
+
+    # Retrieve file paths for [:file] mapping in records
+    # and check all listed files exist.
+    def file_paths
+      raise StandardError, 'No records were found' if records.blank?
+      @file_paths ||= records.map do |r|
+        file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
+        next if r[file_mapping].blank?
+
+        r[file_mapping].split(/\s*[:;|]\s*/).map do |f|
+          file = File.join(path_to_files, f.tr(' ', '_'))
+          if File.exist?(file) # rubocop:disable Style/GuardClause
+            file
+          else
+            raise "File #{file} does not exist"
+          end
+        end
+      end.flatten.compact.uniq
+    end
+
+    # Retrieve the path where we expect to find the files
+    def path_to_files
+      @path_to_files ||= File.join(
+        File.file?(import_file_path) ? File.dirname(import_file_path) : import_file_path,
+        'files'
+      )
+    end
+
+    private
+
+    # Override to return the first CSV in the path, if a zip file is supplied
+    # We expect a single CSV at the top level of the zip in the CSVParser
+    # but we are willing to go look for it if need be
+    def real_import_file_path
+      if file? && zip?
+        unzip(parser_fields['import_file_path'])
+        return Dir["#{importer_unzip_path}/**/*.csv"].first
+      else
+        parser_fields['import_file_path']
+      end
+    end
+  end
+end
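
Two hedged sketches follow; neither is part of this diff. First, the CSV shape the import side consumes: `#collections` splits the `collection` column on `;` or `|`, and `#file_paths` splits the `file` column on `:`, `;`, or `|`, resolving each name under a `files/` directory next to the CSV (`#path_to_files`). The identifier column name is configurable in Bulkrax, so these headers are illustrative.

```ruby
# Hypothetical input: the kind of CSV #records and #file_paths consume.
# Header names are illustrative; the identifier column is configurable.
csv = <<~CSV
  source_identifier,title,collection,file
  work_001,First Work,Collection A;Collection B,image1.jpg|image2.jpg
  work_002,Second Work,,report.pdf
CSV
```

Second, the export side: `#current_work_ids` selects works from Solr according to `export_from`, and `#extra_filters` narrows the query by `start_date`/`finish_date`, `work_visibility`, and `workflow_status` when those are set.

```ruby
# Hypothetical sketch: a metadata export driven by CsvParser.
# Attribute names come from the methods above; anything not shown,
# and the ExporterJob call signature, is assumed.
exporter = Bulkrax::Exporter.new(
  name: 'Collection metadata export',
  user: User.first,
  parser_klass: 'Bulkrax::CsvParser',
  export_from: 'collection',   # 'worktype' and 'importer' are also handled
  export_source: 'abc123'      # collection id matched against member_of_collection_ids_ssim
)
exporter.save && Bulkrax::ExporterJob.perform_later(exporter.id)
```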
data/app/parsers/bulkrax/oai_dc_parser.rb
@@ -0,0 +1,130 @@
+# frozen_string_literal: true
+
+module Bulkrax
+  class OaiDcParser < ApplicationParser
+    attr_accessor :headers
+    delegate :list_sets, to: :client
+
+    def initialize(importerexporter)
+      super
+      @headers = { from: importerexporter.user.email }
+    end
+
+    def client
+      @client ||= OAI::Client.new(importerexporter.parser_fields['base_url'],
+                                  headers: headers,
+                                  parser: 'libxml',
+                                  metadata_prefix: importerexporter.parser_fields['metadata_prefix'])
+    rescue StandardError
+      raise OAIError
+    end
+
+    def collection_name
+      @collection_name ||= parser_fields['set'] || 'all'
+    end
+
+    def entry_class
+      OaiDcEntry
+    end
+
+    def collection_entry_class
+      OaiSetEntry
+    end
+
+    def records(opts = {})
+      opts[:set] = collection_name unless collection_name == 'all'
+
+      opts[:from] = importerexporter&.last_imported_at&.strftime("%Y-%m-%d") if importerexporter.last_imported_at && only_updates
+
+      if opts[:quick]
+        opts.delete(:quick)
+        begin
+          @short_records = client.list_identifiers(opts)
+        rescue OAI::Exception => e
+          return @short_records = [] if e.code == "noRecordsMatch"
+          raise e
+        end
+      else
+        begin
+          @records ||= client.list_records(opts.merge(metadata_prefix: parser_fields['metadata_prefix']))
+        rescue OAI::Exception => e
+          return @records = [] if e.code == "noRecordsMatch"
+          raise e
+        end
+      end
+    end
+
+    # the set of fields available in the import data
+    def import_fields
+      ['contributor', 'coverage', 'creator', 'date', 'description', 'format', 'identifier', 'language', 'publisher', 'relation', 'rights', 'source', 'subject', 'title', 'type']
+    end
+
+    delegate :list_sets, to: :client
+
+    def create_collections
+      metadata = {
+        visibility: 'open',
+        collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
+      }
+
+      collections.each_with_index do |set, index|
+        next unless collection_name == 'all' || collection_name == set.spec
+        unique_collection_identifier = importerexporter.unique_collection_identifier(set.spec)
+        metadata[:title] = [set.name]
+        metadata[work_identifier] = [unique_collection_identifier]
+
+        new_entry = collection_entry_class.where(importerexporter: importerexporter, identifier: unique_collection_identifier, raw_metadata: metadata).first_or_create!
+        # perform now to ensure this gets created before work imports start
+        ImportWorkCollectionJob.perform_now(new_entry.id, importerexporter.current_run.id)
+        increment_counters(index, true)
+      end
+    end
+
+    def create_works
+      results = self.records(quick: true)
+      return if results.blank?
+      results.full.each_with_index do |record, index|
+        identifier = record.send(source_identifier)
+        if identifier.blank?
+          if Bulkrax.fill_in_blank_source_identifiers.present?
+            identifier = Bulkrax.fill_in_blank_source_identifiers.call(self, index)
+          else
+            invalid_record("Missing #{source_identifier} for #{record.to_h}\n")
+            next
+          end
+        end
+
+        break if limit_reached?(limit, index)
+        seen[identifier] = true
+        new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: identifier).first_or_create!
+        if record.deleted?
+          DeleteWorkJob.send(perform_method, new_entry, importerexporter.current_run)
+        else
+          ImportWorkJob.send(perform_method, new_entry.id, importerexporter.current_run.id)
+        end
+        increment_counters(index)
+      end
+      importer.record_status
+    end
+
+    def collections
+      @collections ||= list_sets
+    end
+
+    def collections_total
+      if collection_name == 'all'
+        collections.count
+      else
+        1
+      end
+    end
+
+    def create_parent_child_relationships; end
+
+    def total
+      @total ||= records(quick: true).doc.find(".//resumptionToken").to_a.first.attributes["completeListSize"].to_i
+    rescue
+      @total = 0
+    end
+  end
+end
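
Finally, a hedged sketch for the OAI parser above, not taken from this diff: `#client` reads `base_url` and `metadata_prefix` from `parser_fields`, and `#collection_name` falls back to `'all'` when no `set` is given. The endpoint URL and the non-`parser_fields` attribute values are illustrative assumptions.

```ruby
# Hypothetical sketch: an importer configured for OaiDcParser.
# parser_fields keys are the ones read by #client and #collection_name.
importer = Bulkrax::Importer.new(
  name: 'OAI-DC sample import',
  user: User.first,        # #initialize sends this user's email as the From: header
  frequency: 'PT0S',       # assumed: ISO 8601 duration, run once
  parser_klass: 'Bulkrax::OaiDcParser',
  parser_fields: {
    'base_url'        => 'https://example.org/oai',
    'metadata_prefix' => 'oai_dc',
    'set'             => 'community:1'   # omit to harvest all sets ('all')
  }
)
```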