bulkrax 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f0ee151bc10b7485eb716463b2c4895165d6df9d73c3dd60813d2eb4de8161d1
-  data.tar.gz: f4e5ddfb5ac602eb20a85850f8ec5a9286a9a668f8c83548dd049db4d91a2a0e
+  metadata.gz: 0a80674a9f582c3b8e83f442318908edb6ca9f0b615c970d09b17d941cc8027d
+  data.tar.gz: a2a53116ef49e03dde1aa1df14d8259a2b4abf06a82cff63a9d4ba622ba6600a
 SHA512:
-  metadata.gz: a5b029da7feaee11c8a3eb58e0c7150abcb06b6ca08f9d102134d5e1c9eef3049ae85b0084d74934b7504688da7d4958a0dc425c705b42101c1a3ce62a57d0c7
-  data.tar.gz: c807b68265c0d88b9e7faea4f6efdf94b3bcf90e965f5cc97cc259202bc55db976944eea8e4bf99876723f881d29d2fa7004dbd9985f7f8164648df517722133
+  metadata.gz: af3d75fb03105e37f7374f3a7f863c545d9cc9c95ab2f18bbbf7b4692024e09811f103a372327b4724c836568bad176ed0ad0b7be929ed556259aa9b0793fce6
+  data.tar.gz: 1117a185fbab2bae0746187f464bebea855759a5ecccf0d34f098ac55ad7a2952e663268372262ba8f97820c8c1f02bd29c74a388cfd8ea9cfed84a46dad94cf
@@ -42,10 +42,12 @@ module Bulkrax
       pending_relationships.each do |rel|
         raise ::StandardError, %("#{rel}" needs either a child or a parent to create a relationship) if rel.child_id.nil? || rel.parent_id.nil?
         @child_entry, child_record = find_record(rel.child_id, importer_run_id)
-        child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
+        if child_record
+          child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
+        end
       end

-      if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.blank?
+      if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.nil?
         reschedule({ parent_identifier: parent_identifier, importer_run_id: importer_run_id })
         return false # stop current job from continuing to run after rescheduling
       end
@@ -12,15 +12,14 @@ module Bulkrax
       # check for our entry in our current importer first
       importer_id = ImporterRun.find(importer_run_id).importer_id
       default_scope = { identifier: identifier, importerexporter_type: 'Bulkrax::Importer' }
-      record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)

-      # TODO(alishaevn): discuss whether we are only looking for Collection models here
-      # use ActiveFedora::Base.find(identifier) instead?
-      record ||= ::Collection.where(id: identifier).first # rubocop:disable Rails/FindBy
-      if record.blank?
-        available_work_types.each do |work_type|
-          record ||= work_type.where(id: identifier).first # rubocop:disable Rails/FindBy
-        end
+      begin
+        # the identifier parameter can be a :source_identifier or the id of an object
+        record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
+        record ||= ActiveFedora::Base.find(identifier)
+      # NameError for if ActiveFedora isn't installed
+      rescue NameError, ActiveFedora::ObjectNotFoundError
+        record = nil
       end

       # return the found entry here instead of searching for it again in the CreateRelationshipsJob
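Note on the hunk above: per the inline comment, NameError is rescued so the lookup degrades to nil in applications that do not load ActiveFedora at all, while ActiveFedora::ObjectNotFoundError covers an unknown id. A minimal sketch of the same pattern (the method name is illustrative, not from the gem):

    # Sketch only: return nil when ActiveFedora is absent or the id is unknown.
    def lookup_repository_object(identifier)
      ActiveFedora::Base.find(identifier)
    rescue NameError, ActiveFedora::ObjectNotFoundError
      # Referencing the undefined ActiveFedora constant raises NameError,
      # which is listed first, so the lookup falls back to nil instead of crashing.
      nil
    end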
@@ -8,10 +8,14 @@ module Bulkrax

     def add_path_to_file
       parsed_metadata['file'].each_with_index do |filename, i|
-        path_to_file = ::File.join(parser.path_to_files, filename)
+        next if filename.blank?
+
+        path_to_file = parser.path_to_files(filename: filename)

         parsed_metadata['file'][i] = path_to_file
       end
+      parsed_metadata['file'].delete('')
+
       raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?

       parsed_metadata['file']
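Note on the hunk above: add_path_to_file now skips blank filenames and delegates path resolution to the parser via a filename: keyword; the parser-side changes appear in the @@ -20,12 and @@ -382,10 hunks below. A hypothetical resolution for a non-zipped CSV import, assuming import_file_path is /tmp/import/export_1/metadata.csv:

    # ApplicationParser#path_to_files joins the files/ directory with the filename.
    parser.path_to_files(filename: 'image_01.tif')
    # => "/tmp/import/export_1/files/image_01.tif"
    # Note: both parser implementations memoize the first result in @path_to_files.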
@@ -12,8 +12,8 @@ module Bulkrax
         raise CollectionsCreatedError unless collections_created?
         @item = factory.run!
         add_user_to_permission_templates! if self.class.to_s.include?("Collection")
-        parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
-        child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
+        parent_jobs if self.parsed_metadata[related_parents_parsed_mapping]&.join.present?
+        child_jobs if self.parsed_metadata[related_children_parsed_mapping]&.join.present?
       end
     rescue RSolr::Error::Http, CollectionsCreatedError => e
       raise e
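Note on the hunk above: under ActiveSupport an array is present? whenever it is non-empty, even if every element is blank, so the old guard could fire parent/child jobs for mappings that held only empty strings. Joining first collapses those values, and safe navigation keeps a missing mapping from raising:

    # ActiveSupport semantics (Object#present? / Object#blank?):
    [""].present?        # => true  -- non-empty array, though its only element is blank
    [""].join.present?   # => false -- join yields "", which is blank
    nil&.join.present?   # => false -- a nil mapping short-circuits to nil, and nil is blank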
@@ -1,7 +1,7 @@
 # frozen_string_literal: true

 module Bulkrax
-  class BagitParser < ApplicationParser # rubocop:disable Metrics/ClassLength
+  class BagitParser < CsvParser # rubocop:disable Metrics/ClassLength
     include ExportBehavior

     def self.export_supported?
@@ -20,12 +20,8 @@ module Bulkrax
       rdf_format ? RdfEntry : CsvEntry
     end

-    def collection_entry_class
-      CsvCollectionEntry
-    end
-
-    def file_set_entry_class
-      CsvFileSetEntry
+    def path_to_files(filename:)
+      @path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
     end

     # Take a random sample of 10 metadata_paths and work out the import fields from that
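Note on the hunk above: BagitParser#path_to_files globs for the named file under any bag's data/ payload directory and memoizes the first match. A hypothetical layout (paths are illustrative):

    # Given bags unpacked under /tmp/import:
    #   /tmp/import/bag_1/data/image_01.tif
    #   /tmp/import/bag_2/data/image_02.tif
    Dir.glob(File.join('/tmp/import', '**/data', 'image_01.tif')).first
    # => "/tmp/import/bag_1/data/image_01.tif" (nil when no bag contains the file)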
@@ -36,39 +32,41 @@ module Bulkrax
       end.flatten.compact.uniq
     end

-    # Assume a single metadata record per path
-    # Create an Array of all metadata records, one per file
+    # Create an Array of all metadata records
     def records(_opts = {})
       raise StandardError, 'No BagIt records were found' if bags.blank?
       @records ||= bags.map do |bag|
         path = metadata_path(bag)
         raise StandardError, 'No metadata files were found' if path.blank?
         data = entry_class.read_data(path)
-        data = entry_class.data_for_entry(data, source_identifier, self)
-        data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
-        data
+        get_data(bag, data)
       end
+
+      @records = @records.flatten
     end

-    # Find or create collections referenced by works
-    # If the import data also contains records for these works, they will be updated
-    # during create works
-    def create_collections
-      collections.each_with_index do |collection, index|
-        next if collection.blank?
-        metadata = {
-          title: [collection],
-          work_identifier => [collection],
-          visibility: 'open',
-          collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
-        }
-        new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
-        ImportCollectionJob.perform_now(new_entry.id, current_run.id)
-        increment_counters(index, collection: true)
+    def get_data(bag, data)
+      if entry_class == CsvEntry
+        data = data.map do |data_row|
+          record_data = entry_class.data_for_entry(data_row, source_identifier, self)
+          next record_data if importerexporter.metadata_only?
+
+          record_data[:file] = bag.bag_files.join('|') if ::Hyrax.config.curation_concerns.include? record_data[:model]&.constantize
+          record_data
+        end
+      else
+        data = entry_class.data_for_entry(data, source_identifier, self)
+        data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
       end
+
+      data
     end

     def create_works
+      entry_class == CsvEntry ? super : create_rdf_works
+    end
+
+    def create_rdf_works
       records.each_with_index do |record, index|
         next unless record_has_source_identifier(record, index)
         break if limit_reached?(limit, index)
@@ -87,19 +85,6 @@ module Bulkrax
       status_info(e)
     end

-    def collections
-      records.map { |r| r[related_parents_parsed_mapping].split(/\s*[;|]\s*/) if r[related_parents_parsed_mapping].present? }.flatten.compact.uniq
-    end
-
-    def collections_total
-      collections.size
-    end
-
-    # TODO: change to differentiate between collection and work records when adding ability to import collection metadata
-    def works_total
-      total
-    end
-
     def total
       @total = importer.parser_fields['total'] || 0 if importer?

@@ -112,18 +97,6 @@ module Bulkrax
       @total = 0
     end

-    def extra_filters
-      output = ""
-      if importerexporter.start_date.present?
-        start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
-        finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
-        output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
-      end
-      output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
-      output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
-      output
-    end
-
     def current_record_ids
       @work_ids = []
       @collection_ids = []
@@ -143,70 +116,19 @@ module Bulkrax
       @work_ids + @collection_ids + @file_set_ids
     end

-    # Set the following instance variables: @work_ids, @collection_ids, @file_set_ids
-    # @see #current_record_ids
-    def set_ids_for_exporting_from_importer
-      entry_ids = Importer.find(importerexporter.export_source).entries.pluck(:id)
-      complete_statuses = Status.latest_by_statusable
-                                .includes(:statusable)
-                                .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
-
-      complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
-      extra_filters = extra_filters.presence || '*:*'
-
-      { :@work_ids => ::Hyrax.config.curation_concerns, :@collection_ids => [::Collection], :@file_set_ids => [::FileSet] }.each do |instance_var, models_to_search|
-        instance_variable_set(instance_var, ActiveFedora::SolrService.post(
-          extra_filters.to_s,
-          fq: [
-            %(#{::Solrizer.solr_name(work_identifier)}:("#{complete_entry_identifiers.join('" OR "')}")),
-            "has_model_ssim:(#{models_to_search.join(' OR ')})"
-          ],
-          fl: 'id',
-          rows: 2_000_000_000
-        )['response']['docs'].map { |obj| obj['id'] })
-      end
-    end
-
     # export methods

-    def create_new_entries
-      current_record_ids.each_with_index do |id, index|
-        break if limit_reached?(limit, index)
-
-        this_entry_class = if @collection_ids.include?(id)
-                             collection_entry_class
-                           elsif @file_set_ids.include?(id)
-                             file_set_entry_class
-                           else
-                             entry_class
-                           end
-        new_entry = find_or_create_entry(this_entry_class, id, 'Bulkrax::Exporter')
-
-        begin
-          entry = ExportWorkJob.perform_now(new_entry.id, current_run.id)
-        rescue => e
-          Rails.logger.info("#{e.message} was detected during export")
-        end
-
-        self.headers |= entry.parsed_metadata.keys if entry
-      end
-    end
-    alias create_from_collection create_new_entries
-    alias create_from_importer create_new_entries
-    alias create_from_worktype create_new_entries
-    alias create_from_all create_new_entries
-
     # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
     def write_files
       require 'open-uri'
       require 'socket'
       importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].each do |entry|
-        work = ActiveFedora::Base.find(entry.identifier)
-        next unless Hyrax.config.curation_concerns.include?(work.class)
+        record = ActiveFedora::Base.find(entry.identifier)
+        next unless Hyrax.config.curation_concerns.include?(record.class)
         bag = BagIt::Bag.new setup_bagit_folder(entry.identifier)
         bag_entries = [entry]

-        work.file_sets.each do |fs|
+        record.file_sets.each do |fs|
           if @file_set_ids.present?
             file_set_entry = Bulkrax::CsvFileSetEntry.where("parsed_metadata LIKE '%#{fs.id}%'").first
             bag_entries << file_set_entry unless file_set_entry.nil?
@@ -245,42 +167,6 @@ module Bulkrax
       key != source_identifier.to_s
     end

-    # All possible column names
-    def export_headers
-      headers = sort_headers(self.headers)
-
-      # we don't want access_control_id exported and we want file at the end
-      headers.delete('access_control_id') if headers.include?('access_control_id')
-
-      # add the headers below at the beginning or end to maintain the preexisting export behavior
-      headers.prepend('model')
-      headers.prepend(source_identifier.to_s)
-      headers.prepend('id')
-
-      headers.uniq
-    end
-
-    def object_names
-      return @object_names if @object_names
-
-      @object_names = mapping.values.map { |value| value['object'] }
-      @object_names.uniq!.delete(nil)
-
-      @object_names
-    end
-
-    def sort_headers(headers)
-      # converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
-      # while keeping objects grouped together
-      headers.sort_by do |item|
-        number = item.match(/\d+/)&.[](0) || 0.to_s
-        sort_number = number.rjust(4, "0")
-        object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
-        remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
-        "#{object_prefix}_#{sort_number}_#{remainder}"
-      end
-    end
-
     def setup_triple_metadata_export_file(id)
       File.join(importerexporter.exporter_export_path, id, 'metadata.nt')
     end
@@ -300,11 +186,6 @@ module Bulkrax
       end
     end

-    def required_elements?(keys)
-      return if keys.blank?
-      !required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
-    end
-
     # @todo - investigate getting directory structure
     # @todo - investigate using perform_later, and having the importer check for
     # DownloadCloudFileJob before it starts
@@ -355,5 +236,11 @@ module Bulkrax
       return nil unless bag.valid?
       bag
     end
+
+    # use the version of this method from the application parser instead
+    def real_import_file_path
+      return importer_unzip_path if file? && zip?
+      parser_fields['import_file_path']
+    end
   end
 end
@@ -272,8 +272,8 @@ module Bulkrax
       CsvFileSetEntry
     end

-    # See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
-    # Changed to grep as wc -l counts blank lines, and ignores the final unescaped line (which may or may not contain data)
+    # TODO: figure out why using the version of this method that's in the bagit parser
+    # breaks specs for the "if importer?" line
     def total
       @total = importer.parser_fields['total'] || 0 if importer?
       @total = limit || current_record_ids.count if exporter?
@@ -382,10 +382,11 @@ module Bulkrax
     end

     # Retrieve the path where we expect to find the files
-    def path_to_files
+    def path_to_files(**args)
+      filename = args.fetch(:filename, '')
+
       @path_to_files ||= File.join(
-        zip? ? importer_unzip_path : File.dirname(import_file_path),
-        'files'
+        zip? ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
       )
     end

@@ -29,6 +29,7 @@

   <%= form.input :export_source_importer,
                  label: t('bulkrax.exporter.labels.importer'),
+                 required: true,
                  prompt: 'Select from the list',
                  label_html: { class: 'importer export-source-option hidden' },
                  input_html: { class: 'importer export-source-option hidden' },
@@ -37,6 +38,7 @@
   <%= form.input :export_source_collection,
                  prompt: 'Start typing ...',
                  label: t('bulkrax.exporter.labels.collection'),
+                 required: true,
                  placeholder: @collection&.title&.first,
                  label_html: { class: 'collection export-source-option hidden' },
                  input_html: {
@@ -50,6 +52,7 @@

   <%= form.input :export_source_worktype,
                  label: t('bulkrax.exporter.labels.worktype'),
+                 required: true,
                  prompt: 'Select from the list',
                  label_html: { class: 'worktype export-source-option hidden' },
                  input_html: { class: 'worktype export-source-option hidden' },
@@ -1,5 +1,5 @@
 # frozen_string_literal: true

 module Bulkrax
-  VERSION = '3.4.0'
+  VERSION = '3.5.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulkrax
 version: !ruby/object:Gem::Version
-  version: 3.4.0
+  version: 3.5.0
 platform: ruby
 authors:
 - Rob Kaufman
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-06-22 00:00:00.000000000 Z
+date: 2022-06-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rails
@@ -404,7 +404,7 @@ homepage: https://github.com/samvera-labs/bulkrax
 licenses:
 - Apache-2.0
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -419,8 +419,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.1.4
-signing_key:
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: Import and export tool for Hyrax and Hyku
 test_files: []