bulkrax 9.3.5 → 9.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -1
- data/app/assets/javascripts/bulkrax/application.js +2 -1
- data/app/assets/javascripts/bulkrax/bulkrax.js +13 -4
- data/app/assets/javascripts/bulkrax/bulkrax_utils.js +96 -0
- data/app/assets/javascripts/bulkrax/datatables.js +1 -0
- data/app/assets/javascripts/bulkrax/entries.js +17 -10
- data/app/assets/javascripts/bulkrax/importers.js.erb +9 -2
- data/app/assets/javascripts/bulkrax/importers_stepper.js +2420 -0
- data/app/assets/stylesheets/bulkrax/application.css +1 -1
- data/app/assets/stylesheets/bulkrax/stepper/_header.scss +83 -0
- data/app/assets/stylesheets/bulkrax/stepper/_mixins.scss +26 -0
- data/app/assets/stylesheets/bulkrax/stepper/_navigation.scss +103 -0
- data/app/assets/stylesheets/bulkrax/stepper/_responsive.scss +46 -0
- data/app/assets/stylesheets/bulkrax/stepper/_review.scss +92 -0
- data/app/assets/stylesheets/bulkrax/stepper/_settings.scss +106 -0
- data/app/assets/stylesheets/bulkrax/stepper/_success.scss +26 -0
- data/app/assets/stylesheets/bulkrax/stepper/_summary.scss +171 -0
- data/app/assets/stylesheets/bulkrax/stepper/_upload.scss +339 -0
- data/app/assets/stylesheets/bulkrax/stepper/_validation.scss +237 -0
- data/app/assets/stylesheets/bulkrax/stepper/_variables.scss +46 -0
- data/app/assets/stylesheets/bulkrax/stepper.scss +32 -0
- data/app/controllers/bulkrax/guided_imports_controller.rb +175 -0
- data/app/controllers/bulkrax/importers_controller.rb +28 -31
- data/app/controllers/concerns/bulkrax/guided_import_demo_scenarios.rb +201 -0
- data/app/controllers/concerns/bulkrax/importer_file_handler.rb +212 -0
- data/app/errors/bulkrax/unzip_error.rb +16 -0
- data/app/factories/bulkrax/object_factory.rb +3 -2
- data/app/factories/bulkrax/valkyrie_object_factory.rb +61 -17
- data/app/jobs/bulkrax/importer_job.rb +42 -4
- data/app/models/bulkrax/csv_entry.rb +27 -7
- data/app/models/bulkrax/entry.rb +4 -0
- data/app/models/bulkrax/importer.rb +27 -10
- data/app/models/concerns/bulkrax/has_matchers.rb +2 -2
- data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +6 -5
- data/app/parsers/bulkrax/application_parser.rb +63 -20
- data/app/parsers/bulkrax/bagit_parser.rb +12 -0
- data/app/parsers/bulkrax/csv_parser.rb +168 -25
- data/app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb +73 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +133 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +282 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb +96 -0
- data/app/services/bulkrax/csv_template/column_builder.rb +60 -0
- data/app/services/bulkrax/csv_template/column_descriptor.rb +58 -0
- data/app/services/bulkrax/csv_template/csv_builder.rb +83 -0
- data/app/services/bulkrax/csv_template/explanation_builder.rb +57 -0
- data/app/services/bulkrax/csv_template/field_analyzer.rb +56 -0
- data/app/services/bulkrax/csv_template/file_path_generator.rb +47 -0
- data/app/services/bulkrax/csv_template/file_validator.rb +68 -0
- data/app/services/bulkrax/csv_template/mapping_manager.rb +55 -0
- data/app/services/bulkrax/csv_template/model_loader.rb +50 -0
- data/app/services/bulkrax/csv_template/row_builder.rb +35 -0
- data/app/services/bulkrax/csv_template/schema_analyzer.rb +70 -0
- data/app/services/bulkrax/csv_template/split_formatter.rb +44 -0
- data/app/services/bulkrax/csv_template/value_determiner.rb +68 -0
- data/app/services/bulkrax/stepper_response_formatter.rb +347 -0
- data/app/services/bulkrax/validation_error_csv_builder.rb +99 -0
- data/app/validators/bulkrax/csv_row/child_reference.rb +56 -0
- data/app/validators/bulkrax/csv_row/circular_reference.rb +71 -0
- data/app/validators/bulkrax/csv_row/controlled_vocabulary.rb +74 -0
- data/app/validators/bulkrax/csv_row/duplicate_identifier.rb +63 -0
- data/app/validators/bulkrax/csv_row/missing_source_identifier.rb +31 -0
- data/app/validators/bulkrax/csv_row/parent_reference.rb +59 -0
- data/app/validators/bulkrax/csv_row/required_values.rb +64 -0
- data/app/views/bulkrax/guided_imports/new.html.erb +567 -0
- data/app/views/bulkrax/importers/index.html.erb +6 -1
- data/app/views/bulkrax/importers/new.html.erb +1 -1
- data/app/views/bulkrax/importers/show.html.erb +17 -1
- data/config/i18n-tasks.yml +195 -0
- data/config/locales/bulkrax.de.yml +508 -0
- data/config/locales/bulkrax.en.yml +463 -233
- data/config/locales/bulkrax.es.yml +508 -0
- data/config/locales/bulkrax.fr.yml +508 -0
- data/config/locales/bulkrax.it.yml +508 -0
- data/config/locales/bulkrax.pt-BR.yml +508 -0
- data/config/locales/bulkrax.zh.yml +507 -0
- data/config/routes.rb +10 -1
- data/lib/bulkrax/data/demo_scenarios.json +2235 -0
- data/lib/bulkrax/version.rb +1 -1
- data/lib/bulkrax.rb +31 -0
- metadata +56 -16
- data/app/services/bulkrax/sample_csv_service/column_builder.rb +0 -58
- data/app/services/bulkrax/sample_csv_service/column_descriptor.rb +0 -56
- data/app/services/bulkrax/sample_csv_service/csv_builder.rb +0 -82
- data/app/services/bulkrax/sample_csv_service/explanation_builder.rb +0 -51
- data/app/services/bulkrax/sample_csv_service/field_analyzer.rb +0 -54
- data/app/services/bulkrax/sample_csv_service/file_path_generator.rb +0 -16
- data/app/services/bulkrax/sample_csv_service/mapping_manager.rb +0 -36
- data/app/services/bulkrax/sample_csv_service/model_loader.rb +0 -40
- data/app/services/bulkrax/sample_csv_service/row_builder.rb +0 -33
- data/app/services/bulkrax/sample_csv_service/schema_analyzer.rb +0 -69
- data/app/services/bulkrax/sample_csv_service/split_formatter.rb +0 -42
- data/app/services/bulkrax/sample_csv_service/value_determiner.rb +0 -67
- data/app/services/bulkrax/sample_csv_service.rb +0 -78
- /data/{app/services → lib}/wings/custom_queries/find_by_source_identifier.rb +0 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
class CsvParser < ApplicationParser
|
|
5
|
+
# Private helper methods for CsvValidation.
|
|
6
|
+
module CsvValidationHelpers # rubocop:disable Metrics/ModuleLength
|
|
7
|
+
include CsvValidationHierarchy
|
|
8
|
+
|
|
9
|
+
# Resolve a symbol key from the field mappings for use as a record hash key.
# Falls back to +default+ when the mapping manager yields no column name,
# matching the parser's symbol-keyed record hashes.
def resolve_validation_key(mapping_manager, key: nil, flag: nil, default:)
  resolved = mapping_manager.resolve_column_name(key: key, flag: flag, default: default.to_s)
  first_match = resolved.first
  first_match ? first_match.to_sym : default
end
|
|
15
|
+
|
|
16
|
+
# Normalise CsvEntry.read_data rows (CSV::Row objects with symbol headers)
# into the canonical record shape consumed by the row validators. Blank rows
# are already filtered upstream by CsvWrapper. Returns [] if parsing raises.
def parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
  records = raw_csv.map do |row|
    # CSV::Row#to_h yields symbol keys; raw_row is the string-keyed variant
    stringified = row.to_h.transform_keys(&:to_s)
    { source_identifier: row[source_id_key],
      model: row[:model],
      parent: row[parent_key],
      children: row[children_key],
      file: row[file_key],
      raw_row: stringified }
  end
  records
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error parsing rows – #{e.message}")
  []
end
|
|
36
|
+
|
|
37
|
+
# Collects, per model, the schema metadata the validators need:
# all properties, the required terms, and the controlled-vocabulary terms.
# Missing entries in the analyzer's field list default to empty arrays.
def build_validation_field_metadata(all_models, field_analyzer)
  all_models.to_h do |model|
    listing = field_analyzer.find_or_create_field_list_for(model_name: model)
    meta = %w[properties required_terms controlled_vocab_terms].to_h do |term_key|
      [term_key.to_sym, listing.dig(model, term_key) || []]
    end
    [model, meta]
  end
end
|
|
47
|
+
|
|
48
|
+
# Computes the full set of acceptable CSV header names for the given models by
# reusing the template ColumnBuilder, minus the internal/ignored properties.
# On any failure, falls back to the standard bulkrax columns plus every mapped
# model property gathered from +field_metadata+.
#
# @return [Array<String>] valid CSV column names
def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata)
  svc = Bulkrax::CsvParser::ValidationContext.new(
    mapping_manager: mapping_manager,
    field_analyzer: field_analyzer,
    all_models: all_models,
    mappings: mappings
  )
  all_cols = CsvTemplate::ColumnBuilder.new(svc).all_columns
  all_cols - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error building valid headers – #{e.message}")
  # Fallback path: standard columns plus mapped model properties.
  standard = %w[model source_identifier parents children file]
  model_fields = field_metadata.values.flat_map { |m| m[:properties] }
                               .map { |prop| mapping_manager.key_to_mapped_column(prop) }
  (standard + model_fields).uniq
end
|
|
64
|
+
|
|
65
|
+
# Returns [{ model:, field: }] for every required term of every model that has
# no corresponding CSV header. Numbered header suffixes (title_2) count as the
# base field.
def find_missing_required_headers(headers, field_metadata, mapping_manager)
  # Normalise headers back to mapping keys, stripping numbered suffixes
  present_keys = headers.map { |header| mapping_manager.mapped_to_key(header).sub(/_\d+\z/, '') }.uniq
  field_metadata.flat_map do |model, meta|
    required = meta[:required_terms] || []
    required.reject { |field| present_keys.include?(field) }
            .map { |field| { model: model, field: field } }
  end.uniq
end
|
|
75
|
+
|
|
76
|
+
# Maps each header that is neither blank nor a valid header (with or without a
# numbered suffix) to its best spelling suggestion, or nil when none is close.
def find_unrecognized_validation_headers(headers, valid_headers)
  speller = DidYouMean::SpellChecker.new(dictionary: valid_headers)
  unknown = headers.reject do |header|
    header.blank? ||
      valid_headers.include?(header) ||
      valid_headers.include?(header.sub(/_\d+\z/, ''))
  end
  unknown.each_with_object({}) { |header, map| map[header] = speller.correct(header).first }
end
|
|
82
|
+
|
|
83
|
+
# Returns the 1-based positions of columns that have no header but still
# contain data in at least one row — a sign of a malformed CSV.
def find_empty_column_positions(headers, raw_csv)
  positions = []
  headers.each_with_index do |header, index|
    next if header.present?

    positions << index + 1 if raw_csv.any? { |row| row.fields[index].present? }
  end
  positions
end
|
|
90
|
+
|
|
91
|
+
# Records a missing source_identifier error for every model when the column is
# absent from the CSV and the app is not configured to auto-fill blank source
# identifiers.
def append_missing_source_id!(missing_required, headers, source_id_key, all_models)
  wanted = source_id_key.to_s
  column_present = headers.any? { |header| header.to_s == wanted }
  return if column_present || Bulkrax.fill_in_blank_source_identifiers.present?

  all_models.each do |model|
    missing_required << { model: model, field: wanted }
  end
end
|
|
99
|
+
|
|
100
|
+
# Adds a file-level notice when the model column is absent or every row has a blank
# model value, indicating that the default work type will be used for all rows.
# When this notice is present the per-row default_work_type_used warnings are
# suppressed in the formatter — no need to repeat the same message for every row.
def append_missing_model_notice!(notices, headers, csv_data)
  default_model = Bulkrax.default_work_type
  # Nothing to announce when no default work type is configured.
  return if default_model.blank?

  model_column_present = headers.map(&:to_s).include?('model')
  all_rows_blank = model_column_present && csv_data.all? { |r| r[:model].blank? }

  # A present model column with at least one populated row needs no notice.
  return if model_column_present && !all_rows_blank

  # Distinguish "column exists but is empty" from "column missing entirely"
  # so the UI can phrase the message and suggestion accordingly.
  key_suffix = all_rows_blank ? 'column_empty' : 'column_missing'
  base_key = 'bulkrax.importer.guided_import.validation.default_work_type_notice'
  notices << {
    field: 'model',
    default_work_type: default_model,
    message: I18n.t("#{base_key}.message_#{key_suffix}", default_work_type: default_model),
    suggestion: I18n.t("#{base_key}.suggestion_#{key_suffix}")
  }
end
|
|
122
|
+
|
|
123
|
+
# Downgrades a failing validation to a warning when the ONLY missing required
# field is rights_statement (the importer can supply a default one), provided
# headers parsed successfully and no referenced files are missing.
def apply_rights_statement_validation_override!(result, missing_required)
  return if result[:isValid]
  return if result[:headers].blank?
  return if result[:missingFiles]&.any?

  rights_only = missing_required.present? &&
                missing_required.all? { |entry| entry[:field].to_s == 'rights_statement' }
  return unless rights_only

  result[:isValid] = true
  result[:hasWarnings] = true
end
|
|
133
|
+
|
|
134
|
+
# Assembles the final result hash returned to the guided import UI.
# Hard errors (missing required fields, no headers, empty CSV, missing files,
# error-severity row issues) make the result invalid; unrecognized or empty
# columns, possibly-missing files, warning-severity row issues, and notices
# only set the warning flag.
def assemble_result(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, collections:, works:, file_sets:, notices: []) # rubocop:disable Metrics/ParameterLists
  row_error_entries = row_errors.select { |e| e[:severity] == 'error' }
  row_warning_entries = row_errors.select { |e| e[:severity] == 'warning' }
  has_errors = missing_required.any? || headers.blank? || csv_data.empty? ||
               file_validator.missing_files.any? || row_error_entries.any?
  has_warnings = header_issues[:unrecognized].any? || header_issues[:empty_columns].any? ||
                 file_validator.possible_missing_files? || row_warning_entries.any? || notices.any?

  # Keys are camelCased for direct consumption by the JavaScript stepper.
  {
    headers: headers,
    missingRequired: missing_required,
    notices: notices,
    unrecognized: header_issues[:unrecognized],
    emptyColumns: header_issues[:empty_columns],
    rowCount: csv_data.length,
    isValid: !has_errors,
    hasWarnings: has_warnings,
    rowErrors: row_errors,
    collections: collections,
    works: works,
    fileSets: file_sets,
    totalItems: csv_data.length,
    fileReferences: file_validator.count_references,
    missingFiles: file_validator.missing_files,
    foundFiles: file_validator.found_files_count,
    zipIncluded: file_validator.zip_included?
  }
end
|
|
163
|
+
|
|
164
|
+
# Builds the find_record lambda used by row validators and hierarchy extraction.
# Derives the source-identifier mapping name and its Solr search field from the
# parser's configured field mappings, defaulting to 'source' / 'source_sim'.
#
# @return [Proc] ->(id) { Boolean } — whether a repository record exists for id
def build_find_record
  all_mappings = Bulkrax.field_mappings['Bulkrax::CsvParser'] || {}
  work_identifier = all_mappings.find { |_k, v| v['source_identifier'] == true }&.first || 'source'
  work_identifier_search = Array.wrap(all_mappings.dig(work_identifier, 'search_field')).first&.to_s ||
                           "#{work_identifier}_sim"
  ->(id) { find_record_by_source_identifier(id, work_identifier, work_identifier_search) }
end
|
|
172
|
+
|
|
173
|
+
# Attempt to locate an existing repository record by its identifier.
# The identifier may be a repository object ID or a source_identifier property value.
# Checks the repository directly (by ID, then by Solr property search) — a Bulkrax
# Entry record alone is not sufficient, as the object may never have been created.
#
# @param identifier [String]
# @param work_identifier [String] the source_identifier property name (e.g. "source")
# @param work_identifier_search [String] the Solr field for source_identifier (e.g. "source_sim")
# @return [Boolean] true if a matching repository object is found
def find_record_by_source_identifier(identifier, work_identifier, work_identifier_search)
  return false if identifier.blank?

  # Fast path: identifier is a repository object ID.
  return true if Bulkrax.object_factory.find_or_nil(identifier).present?

  # Otherwise search every registered model class by the identifier property.
  [Bulkrax.collection_model_class, *Bulkrax.curation_concerns].any? do |klass|
    Bulkrax.object_factory.search_by_property(
      value: identifier,
      klass: klass,
      search_field: work_identifier_search,
      name_field: work_identifier
    ).present?
  end
rescue StandardError
  # Lookup is best-effort; treat any backend failure as "not found".
  false
end
|
|
198
|
+
|
|
199
|
+
# Returns the raw CSV column name (String) for a relationship field.
# Looks for the mapping entry flagged with +flag+ and returns its first
# "from" value, falling back to +default+ when none is found.
def resolve_relationship_column(mappings, flag, default)
  _name, config = mappings.find { |_key, cfg| cfg.is_a?(Hash) && cfg[flag] }
  from_values = config && config.dig('from')
  (from_values && from_values.first) || default
end
|
|
206
|
+
|
|
207
|
+
# Resolves the "split" setting for the parents mapping into a usable pattern:
# nil when unset/blank, the library default when simply enabled (true),
# otherwise the configured pattern itself (String or Regexp).
def resolve_parent_split_pattern(mappings)
  resolve_split_pattern(mappings, :parents)
end

# Same semantics as #resolve_parent_split_pattern, for the children mapping.
def resolve_children_split_pattern(mappings)
  resolve_split_pattern(mappings, :children)
end

# Shared lookup for the two wrappers above. The mapping may be keyed by
# String or Symbol, so both are tried (mirroring the original duplicated
# per-field implementations, now consolidated).
def resolve_split_pattern(mappings, field)
  split_val = mappings.dig(field.to_s, 'split') || mappings.dig(field, :split)
  return nil if split_val.blank?
  return Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON if split_val == true

  split_val
end
|
|
222
|
+
|
|
223
|
+
# Builds a graph of { source_identifier => [parent_ids] } from all CSV records.
# Used by CircularReference validator to detect cycles across the whole CSV.
#
# Parent edges are collected from both directions:
#  - explicit parent declarations (parents / parents_N columns)
#  - inverted child declarations (children / children_N columns), mirroring
#    the normalisation done in importers_stepper.js#normalizeRelationships
def build_relationship_graph(csv_data, mappings)
  parent_column = resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents')
  children_column = resolve_relationship_column(mappings, 'related_children_field_mapping', 'children')
  # Numbered column variants (parents_1, children_2, ...) also contribute edges.
  parent_suffix = /\A#{Regexp.escape(parent_column)}_\d+\z/
  children_suffix = /\A#{Regexp.escape(children_column)}_\d+\z/

  graph = build_parent_edges(csv_data, parent_suffix, resolve_parent_split_pattern(mappings))
  invert_child_edges(graph, csv_data, children_suffix, resolve_children_split_pattern(mappings))
  graph
end
|
|
240
|
+
|
|
241
|
+
# Maps each source_identifier to its declared parent ids, merging the base
# parents cell (optionally split) with any numbered parents_N columns.
def build_parent_edges(csv_data, suffix_pattern, split_pattern)
  graph = {}
  csv_data.each do |record|
    identifier = record[:source_identifier]
    next if identifier.blank?

    declared = split_or_single(record[:parent], split_pattern) +
               suffixed_values(record[:raw_row], suffix_pattern)
    graph[identifier] = declared.uniq
  end
  graph
end
|
|
251
|
+
|
|
252
|
+
# Folds child declarations back into the parent graph: when row A declares
# child B, record A as a parent of B (creating B's node when absent).
# Mutates +graph+ in place.
def invert_child_edges(graph, csv_data, suffix_pattern, split_pattern)
  csv_data.each do |record|
    parent_id = record[:source_identifier]
    next if parent_id.blank?

    declared_children = split_or_single(record[:children], split_pattern) +
                        suffixed_values(record[:raw_row], suffix_pattern)
    declared_children.each do |child_id|
      edges = (graph[child_id] ||= [])
      edges << parent_id unless edges.include?(parent_id)
    end
  end
end
|
|
265
|
+
|
|
266
|
+
# Splits +value+ on +split_pattern+ when one is configured; otherwise wraps
# the stripped value in a one-element array, or returns [] for blank input.
def split_or_single(value, split_pattern)
  return value.to_s.split(split_pattern).map(&:strip).reject(&:blank?) if split_pattern
  return [] unless value.present?

  [value.to_s.strip]
end
|
|
275
|
+
|
|
276
|
+
# Collects non-blank, stripped values from numbered columns (e.g. parents_1,
# parents_2) whose names match +suffix_pattern+.
def suffixed_values(raw_row, suffix_pattern)
  raw_row.filter_map do |column, value|
    next unless column.to_s.match?(suffix_pattern)

    cleaned = value.to_s.strip
    cleaned unless cleaned.empty?
  end
end
|
|
280
|
+
end
|
|
281
|
+
end
|
|
282
|
+
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
class CsvParser < ApplicationParser
|
|
5
|
+
# Hierarchy-building helpers for CsvValidation. Handles extracting and
|
|
6
|
+
# categorising items from parsed CSV data for the guided import tree view.
|
|
7
|
+
module CsvValidationHierarchy
|
|
8
|
+
# Partitions parsed CSV rows into [collections, works, file_sets] for the
# guided-import tree view, wiring up parent/child links as it goes.
#
# @param csv_data [Array<Hash>] canonical records (see parse_validation_rows)
# @param all_ids [Set] every source_identifier present in the CSV
# @param find_record [Proc, nil] lambda checking the repository for an id
# @param parent_split_pattern [String, Regexp, nil] multi-value split for parents
# @param child_split_pattern [String, Regexp] multi-value split for children
# @return [Array(Array, Array, Array)] collections, works, file_sets
def extract_validation_items(csv_data, all_ids = Set.new, find_record = nil, parent_split_pattern: nil, child_split_pattern: '|')
  # Child→parent links declared from the parent side (children columns) must
  # be visible while categorising each row, so build the map up front.
  child_to_parents = build_child_to_parents_map(csv_data, child_split_pattern: child_split_pattern)
  collections = []
  works = []
  file_sets = []

  csv_data.each do |item|
    categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets, find_record,
                               parent_split_pattern: parent_split_pattern, child_split_pattern: child_split_pattern)
  end

  [collections, works, file_sets]
end
|
|
21
|
+
|
|
22
|
+
# Builds { child_id => [parent_ids] } from every row's children declarations.
# The returned hash auto-vivifies empty arrays, so any id can be indexed.
def build_child_to_parents_map(csv_data, child_split_pattern: '|')
  csv_data.each_with_object(Hash.new { |hash, key| hash[key] = [] }) do |item, map|
    parent_id = item[:source_identifier]
    next if parent_id.blank?

    child_ids = collect_relationship_ids(item[:children], item[:raw_row], 'children', split_pattern: child_split_pattern)
    child_ids.each { |child_id| map[child_id] << parent_id }
  end
end
|
|
33
|
+
|
|
34
|
+
# Routes one parsed row into the collections / works / file_sets bucket based
# on its model value (case-insensitive). Anything that is not a collection or
# file set is treated as a work.
def categorise_validation_item(item, child_to_parents, all_ids, collections, works, file_sets, find_record = nil, parent_split_pattern: nil, child_split_pattern: '|') # rubocop:disable Metrics/ParameterLists
  item_id = item[:source_identifier]
  model_str = item[:model].to_s

  opts = { type: nil, find_record: find_record, parent: parent_split_pattern, child: child_split_pattern }
  if model_str.casecmp('collection').zero? || model_str.casecmp('collectionresource').zero?
    collections << build_item_hash(item, child_to_parents, all_ids, opts.merge(type: 'collection'))
  elsif model_str.casecmp('fileset').zero? || model_str.casecmp('hyrax::fileset').zero?
    # File sets carry no relationship metadata of their own in the tree view.
    file_sets << { id: item_id, title: item[:raw_row]['title'] || item_id, type: 'file_set' }
  else
    works << build_item_hash(item, child_to_parents, all_ids, opts.merge(type: 'work'))
  end
end
|
|
47
|
+
|
|
48
|
+
# Builds the tree-view node hash for a collection or work row.
# parentIds/childIds reference rows within this CSV; existingParentIds /
# existingChildIds are referenced ids absent from the CSV but found in the
# repository via the find_record lambda (empty when no lambda is given).
def build_item_hash(item, child_to_parents, all_ids, opts = {}) # rubocop:disable Metrics/MethodLength
  type = opts[:type]
  find_record = opts[:find_record]
  item_id = item[:source_identifier]
  title = item[:raw_row]['title'] || item_id
  parents = collect_relationship_ids(item[:parent], item[:raw_row], 'parents', split_pattern: opts[:parent])
  children = collect_relationship_ids(item[:children], item[:raw_row], 'children', split_pattern: opts[:child] || '|')

  {
    id: item_id,
    title: title,
    type: type,
    existing: find_record&.call(item_id) || false,
    # Parents declared on this row plus parents inferred from other rows'
    # children columns.
    parentIds: (resolvable_ids(parents, all_ids) + resolvable_ids(child_to_parents[item_id] || [], all_ids)).uniq,
    childIds: resolvable_ids(children, all_ids),
    existingParentIds: external_ids(parents, all_ids, find_record),
    existingChildIds: external_ids(children, all_ids, find_record)
  }
end
|
|
67
|
+
|
|
68
|
+
# Splits a raw relationship cell into trimmed, non-blank identifiers.
def parse_relationship_field(value, split_pattern: '|')
  return [] if value.blank?

  pieces = value.to_s.split(split_pattern)
  pieces.map(&:strip).reject(&:blank?)
end
|
|
72
|
+
|
|
73
|
+
# Gathers identifiers for +column+ from the base cell plus numbered variants
# (column_1, column_2, ...) present in the raw row, de-duplicated.
def collect_relationship_ids(base_value, raw_row, column, split_pattern: '|')
  numbered = /\A#{Regexp.escape(column)}_\d+\z/
  from_suffixes = raw_row.filter_map do |key, value|
    next unless key.to_s.match?(numbered)

    trimmed = value.to_s.strip
    trimmed unless trimmed.blank?
  end
  (parse_relationship_field(base_value, split_pattern: split_pattern) + from_suffixes).uniq
end
|
|
82
|
+
|
|
83
|
+
# Keeps only the ids that resolve to a row within this CSV (order preserved,
# duplicates retained).
def resolvable_ids(ids, all_ids)
  ids.select { |candidate| all_ids.include?(candidate) }
end
|
|
86
|
+
|
|
87
|
+
# Returns ids from the list that are NOT in the CSV but exist in the
# repository (checked via +find_record+). Empty when no lookup is supplied.
def external_ids(ids, all_ids, find_record)
  return [] if find_record.nil?

  outside_csv = ids.reject { |candidate| all_ids.include?(candidate) }
  outside_csv.select { |candidate| find_record.call(candidate) }
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
module CsvTemplate
|
|
5
|
+
# Builds column headers for CSV
|
|
6
|
+
# Assembles the ordered list of CSV template column headers for the models
# handled by the template service: required columns first, then the
# alphabetised model-property columns.
class ColumnBuilder
  def initialize(service)
    @service = service
    @descriptor = CsvTemplate::ColumnDescriptor.new
  end

  def all_columns
    required_columns + property_columns
  end

  # Core + relationship + file columns, each translated through the mapping.
  def required_columns
    mapped_core_columns + relationship_columns + file_columns
  end

  private

  def mapped_core_columns
    @descriptor.core_columns.map { |column| @service.mapping_manager.key_to_mapped_column(column) }
  end

  # Union of every model's schema properties, mapped to CSV column names and
  # excluding anything already present in the required set.
  def property_columns
    raw_properties = @service.all_models.flat_map do |model|
      field_list = @service.field_analyzer.find_or_create_field_list_for(model_name: model)
      field_list.values.flat_map { |config| config["properties"] || [] }
    end
    mapped = raw_properties.uniq
                           .map { |property| @service.mapping_manager.key_to_mapped_column(property) }
                           .uniq
    (mapped - required_columns).sort
  end

  def relationship_columns
    [
      @service.mapping_manager.find_by_flag("related_children_field_mapping", 'children'),
      @service.mapping_manager.find_by_flag("related_parents_field_mapping", 'parents')
    ]
  end

  def file_columns
    CsvTemplate::ColumnDescriptor::COLUMN_DESCRIPTIONS[:files].flat_map do |property_hash|
      property_hash.keys.map { |key| @service.mapping_manager.key_to_mapped_column(key) }
    end
  end
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
module CsvTemplate
|
|
5
|
+
# Manages column descriptions and metadata
|
|
6
|
+
# Manages column descriptions and metadata for the CSV template: a static
# registry of human-readable help text keyed by column name, grouped by the
# section of the template each column belongs to.
class ColumnDescriptor
  # NOTE(review): the :include_first "model" text interpolates
  # Bulkrax.default_work_type at class-load time, so it reflects the
  # configuration present when this class was first loaded.
  COLUMN_DESCRIPTIONS = {
    include_first: [
      { "model" => "The work types configured in your repository are listed below.\nIf left blank, your default work type, #{Bulkrax.default_work_type}, is used." },
      { "source_identifier" => "This must be a unique identifier.\nIt can be alphanumeric with some special characters (e.g. hyphens, colons), and URLs are also supported." },
      { "id" => "This column would optionally be included only if it is a re-import, i.e. for updating or deleting records.\nThis is a key identifier used by the system, which you wouldn't have for new imports." },
      { "rights_statement" => "Rights statement URI for the work.\nIf not included, uses the value specified on the bulk import configuration screen." }
    ],
    visibility: [
      { "visibility" => "Uses the value specified on the bulk import configuration screen if not added here.\nValid options: open, authenticated, restricted, embargo, lease" },
      { "embargo_release_date" => "Required for embargo (yyyy-mm-dd)" },
      { "visibility_during_embargo" => "Required for embargo" },
      { "visibility_after_embargo" => "Required for embargo" },
      { "lease_expiration_date" => "Required for lease (yyyy-mm-dd)" },
      { "visibility_during_lease" => "Required for lease" },
      { "visibility_after_lease" => "Required for lease" }
    ],
    files: [
      { "file" => "Use filenames exactly matching those in your files folder.\nZip your CSV and files folder together and attach this to your importer." },
      { "remote_files" => "Use the URLs to remote files to be attached to the work." }
    ],
    relationships: [
      { "parents" => "The source_identifier or id of work or collection to be attached as parent." },
      { "children" => "The source_identifier or id of work or file to be attached as child." }
    ],
    other: [
      { "hide_from_catalog_search" => "Set to 1 to hide the collection from catalog search results." },
      { "show_pdf_download_button" => "Set to 1 to show a PDF download link on the work's page." },
      { "show_pdf_viewer" => "Set to 1 to show a PDF viewer on the work's page." },
      { "video_embed" => "A valid URL to a hosted video that can appear in an iframe, beginning with 'http://' or 'https://'." }
    ]
  }.freeze

  # Column names that lead every template: the include_first group followed
  # by the visibility group.
  def core_columns
    extract_column_names(:include_first) + extract_column_names(:visibility)
  end

  # Returns the description for +column+, searching every group in order;
  # nil when the column has no registered description.
  def find_description_for(column)
    COLUMN_DESCRIPTIONS.each_value do |group|
      prop = group.find { |hash| hash.key?(column) }
      return prop[column] if prop
    end
    nil
  end

  private

  # Extracts just the column names (each group entry is a one-key hash).
  def extract_column_names(group)
    COLUMN_DESCRIPTIONS[group].map { |hash| hash.keys.first }
  end
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
module CsvTemplate
|
|
5
|
+
# Builds CSV content
|
|
6
|
+
# Builds CSV template content (header row, explanation row, per-model sample
# rows) and prunes columns that are neither required nor populated.
class CsvBuilder
  # Internal/system properties that never belong in a user-facing template.
  IGNORED_PROPERTIES = %w[
    admin_set_id alternate_ids
    bulkrax_identifier
    collection_type_gid contexts created_at
    date date_modified date_uploaded depositor
    embargo embargo_id
    file_ids
    has_model head
    internal_resource is_child
    lease lease_id
    member_ids member_of_collection_ids modified_date
    new_record
    on_behalf_of owner proxy_depositor
    rendering_ids representative_id
    schema_version split_from_pdf_id state tail
    thumbnail_id
    updated_at
  ].freeze

  def initialize(service)
    @service = service
    @column_builder = CsvTemplate::ColumnBuilder.new(service)
    @row_builder = CsvTemplate::RowBuilder.new(service)
    @header_row = nil
    @required_headings = []
  end

  # Writes the template CSV to +file_path+, creating parent directories.
  def write_to_file(file_path)
    FileUtils.mkdir_p(File.dirname(file_path))
    CSV.open(file_path, "w") { |csv| write_rows(csv) }
  end

  # Returns the template CSV content as a String.
  def generate_string
    CSV.generate { |csv| write_rows(csv) }
  end

  private

  def write_rows(csv)
    csv_rows.each { |row| csv << row }
  end

  def csv_rows
    @header_row = fill_header_row
    rows = [
      @header_row,
      @row_builder.build_explanation_row(@header_row),
      *@row_builder.build_model_rows(@header_row)
    ]
    remove_empty_columns(rows)
  end

  # Computes the final header row and caches which of its columns are
  # required, so pruning keeps them even when they carry no sample data.
  # (The original assigned @required_headings twice — the first assignment
  # was a dead store and duplicated the required_columns call.)
  def fill_header_row
    filtered = @column_builder.all_columns - IGNORED_PROPERTIES
    @required_headings = @column_builder.required_columns & filtered
    filtered
  end

  def remove_empty_columns(rows)
    return rows if rows.empty?

    columns = rows.transpose
    columns.select { |col| keep_column?(col) }.transpose
  end

  # Keeps required columns unconditionally; otherwise keeps a column only when
  # some data row (beyond the header and explanation rows) holds a real value.
  def keep_column?(column)
    heading = column[0]
    return true if @required_headings.include?(heading)

    column[2..-1].any? { |value| !value.nil? && value != "" && value != "---" }
  end
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
module CsvTemplate
|
|
5
|
+
# Builds explanations for CSV columns
|
|
6
|
+
# Builds the per-column explanation row for the CSV template: a human-readable
# description, a controlled-vocabulary note, and a multi-value split hint,
# joined with newlines.
class ExplanationBuilder
  def initialize(service)
    @service = service
    @descriptor = CsvTemplate::ColumnDescriptor.new
    @split_formatter = CsvTemplate::SplitFormatter.new
  end

  # @param header_row [Array<String>] final CSV header columns
  # @return [Array<Hash>] one { column => explanation } hash per column
  def build_explanations(header_row)
    header_row.map { |column| { column => build_explanation(column) } }
  end

  private

  def build_explanation(column)
    # Resolve the mapping key once and share it with the helpers below
    # (previously source_identifier_description re-derived it from the column).
    mapping_key = @service.mapping_manager.mapped_to_key(column)

    column_description = source_identifier_description(mapping_key) || @descriptor.find_description_for(column)
    controlled_vocab_info = controlled_vocab_text(mapping_key)
    split_info = split_text(mapping_key, controlled_vocab_info)

    [column_description, controlled_vocab_info, split_info].compact.join("\n")
  end

  # Extra note for the source_identifier column, shown only when the app is
  # configured to auto-fill blank source identifiers.
  def source_identifier_description(mapping_key)
    return unless mapping_key == 'source_identifier'
    return if Bulkrax.fill_in_blank_source_identifiers.blank?

    "Will be auto-generated if left blank.\nProviding one allows round-tripping and deduplication across imports."
  end

  def controlled_vocab_text(field_name)
    vocab_terms = @service.field_analyzer.controlled_vocab_terms
    # based_near is vocabulary-backed even when the analyzer does not list it
    return unless vocab_terms.include?(field_name) || field_name == 'based_near'

    'This property uses a controlled vocabulary.'
  end

  # Split hints are suppressed for controlled-vocab fields, except location
  # and resource_type which are multi-valued in practice.
  def split_text(mapping_key, controlled_vocab_info)
    return nil if controlled_vocab_info.present? && !mapping_key.in?(%w[location resource_type])

    split_value = @service.mapping_manager.split_value_for(mapping_key)
    return nil unless split_value

    @split_formatter.format(split_value)
  end
end
|
|
56
|
+
end
|
|
57
|
+
end
|