bulkrax 3.4.0 → 3.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f0ee151bc10b7485eb716463b2c4895165d6df9d73c3dd60813d2eb4de8161d1
4
- data.tar.gz: f4e5ddfb5ac602eb20a85850f8ec5a9286a9a668f8c83548dd049db4d91a2a0e
3
+ metadata.gz: 0a80674a9f582c3b8e83f442318908edb6ca9f0b615c970d09b17d941cc8027d
4
+ data.tar.gz: a2a53116ef49e03dde1aa1df14d8259a2b4abf06a82cff63a9d4ba622ba6600a
5
5
  SHA512:
6
- metadata.gz: a5b029da7feaee11c8a3eb58e0c7150abcb06b6ca08f9d102134d5e1c9eef3049ae85b0084d74934b7504688da7d4958a0dc425c705b42101c1a3ce62a57d0c7
7
- data.tar.gz: c807b68265c0d88b9e7faea4f6efdf94b3bcf90e965f5cc97cc259202bc55db976944eea8e4bf99876723f881d29d2fa7004dbd9985f7f8164648df517722133
6
+ metadata.gz: af3d75fb03105e37f7374f3a7f863c545d9cc9c95ab2f18bbbf7b4692024e09811f103a372327b4724c836568bad176ed0ad0b7be929ed556259aa9b0793fce6
7
+ data.tar.gz: 1117a185fbab2bae0746187f464bebea855759a5ecccf0d34f098ac55ad7a2952e663268372262ba8f97820c8c1f02bd29c74a388cfd8ea9cfed84a46dad94cf
@@ -42,10 +42,12 @@ module Bulkrax
42
42
  pending_relationships.each do |rel|
43
43
  raise ::StandardError, %("#{rel}" needs either a child or a parent to create a relationship) if rel.child_id.nil? || rel.parent_id.nil?
44
44
  @child_entry, child_record = find_record(rel.child_id, importer_run_id)
45
- child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
45
+ if child_record
46
+ child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
47
+ end
46
48
  end
47
49
 
48
- if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.blank?
50
+ if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.nil?
49
51
  reschedule({ parent_identifier: parent_identifier, importer_run_id: importer_run_id })
50
52
  return false # stop current job from continuing to run after rescheduling
51
53
  end
@@ -12,15 +12,14 @@ module Bulkrax
12
12
  # check for our entry in our current importer first
13
13
  importer_id = ImporterRun.find(importer_run_id).importer_id
14
14
  default_scope = { identifier: identifier, importerexporter_type: 'Bulkrax::Importer' }
15
- record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
16
15
 
17
- # TODO(alishaevn): discuss whether we are only looking for Collection models here
18
- # use ActiveFedora::Base.find(identifier) instead?
19
- record ||= ::Collection.where(id: identifier).first # rubocop:disable Rails/FindBy
20
- if record.blank?
21
- available_work_types.each do |work_type|
22
- record ||= work_type.where(id: identifier).first # rubocop:disable Rails/FindBy
23
- end
16
+ begin
17
+ # the identifier parameter can be a :source_identifier or the id of an object
18
+ record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
19
+ record ||= ActiveFedora::Base.find(identifier)
20
+ # NameError for if ActiveFedora isn't installed
21
+ rescue NameError, ActiveFedora::ObjectNotFoundError
22
+ record = nil
24
23
  end
25
24
 
26
25
  # return the found entry here instead of searching for it again in the CreateRelationshipsJob
@@ -8,10 +8,14 @@ module Bulkrax
8
8
 
9
9
  def add_path_to_file
10
10
  parsed_metadata['file'].each_with_index do |filename, i|
11
- path_to_file = ::File.join(parser.path_to_files, filename)
11
+ next if filename.blank?
12
+
13
+ path_to_file = parser.path_to_files(filename: filename)
12
14
 
13
15
  parsed_metadata['file'][i] = path_to_file
14
16
  end
17
+ parsed_metadata['file'].delete('')
18
+
15
19
  raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
16
20
 
17
21
  parsed_metadata['file']
@@ -12,8 +12,8 @@ module Bulkrax
12
12
  raise CollectionsCreatedError unless collections_created?
13
13
  @item = factory.run!
14
14
  add_user_to_permission_templates! if self.class.to_s.include?("Collection")
15
- parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
16
- child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
15
+ parent_jobs if self.parsed_metadata[related_parents_parsed_mapping]&.join.present?
16
+ child_jobs if self.parsed_metadata[related_children_parsed_mapping]&.join.present?
17
17
  end
18
18
  rescue RSolr::Error::Http, CollectionsCreatedError => e
19
19
  raise e
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Bulkrax
4
- class BagitParser < ApplicationParser # rubocop:disable Metrics/ClassLength
4
+ class BagitParser < CsvParser # rubocop:disable Metrics/ClassLength
5
5
  include ExportBehavior
6
6
 
7
7
  def self.export_supported?
@@ -20,12 +20,8 @@ module Bulkrax
20
20
  rdf_format ? RdfEntry : CsvEntry
21
21
  end
22
22
 
23
- def collection_entry_class
24
- CsvCollectionEntry
25
- end
26
-
27
- def file_set_entry_class
28
- CsvFileSetEntry
23
+ def path_to_files(filename:)
24
+ @path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
29
25
  end
30
26
 
31
27
  # Take a random sample of 10 metadata_paths and work out the import fields from that
@@ -36,39 +32,41 @@ module Bulkrax
36
32
  end.flatten.compact.uniq
37
33
  end
38
34
 
39
- # Assume a single metadata record per path
40
- # Create an Array of all metadata records, one per file
35
+ # Create an Array of all metadata records
41
36
  def records(_opts = {})
42
37
  raise StandardError, 'No BagIt records were found' if bags.blank?
43
38
  @records ||= bags.map do |bag|
44
39
  path = metadata_path(bag)
45
40
  raise StandardError, 'No metadata files were found' if path.blank?
46
41
  data = entry_class.read_data(path)
47
- data = entry_class.data_for_entry(data, source_identifier, self)
48
- data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
49
- data
42
+ get_data(bag, data)
50
43
  end
44
+
45
+ @records = @records.flatten
51
46
  end
52
47
 
53
- # Find or create collections referenced by works
54
- # If the import data also contains records for these works, they will be updated
55
- # during create works
56
- def create_collections
57
- collections.each_with_index do |collection, index|
58
- next if collection.blank?
59
- metadata = {
60
- title: [collection],
61
- work_identifier => [collection],
62
- visibility: 'open',
63
- collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
64
- }
65
- new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
66
- ImportCollectionJob.perform_now(new_entry.id, current_run.id)
67
- increment_counters(index, collection: true)
48
+ def get_data(bag, data)
49
+ if entry_class == CsvEntry
50
+ data = data.map do |data_row|
51
+ record_data = entry_class.data_for_entry(data_row, source_identifier, self)
52
+ next record_data if importerexporter.metadata_only?
53
+
54
+ record_data[:file] = bag.bag_files.join('|') if ::Hyrax.config.curation_concerns.include? record_data[:model]&.constantize
55
+ record_data
56
+ end
57
+ else
58
+ data = entry_class.data_for_entry(data, source_identifier, self)
59
+ data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
68
60
  end
61
+
62
+ data
69
63
  end
70
64
 
71
65
  def create_works
66
+ entry_class == CsvEntry ? super : create_rdf_works
67
+ end
68
+
69
+ def create_rdf_works
72
70
  records.each_with_index do |record, index|
73
71
  next unless record_has_source_identifier(record, index)
74
72
  break if limit_reached?(limit, index)
@@ -87,19 +85,6 @@ module Bulkrax
87
85
  status_info(e)
88
86
  end
89
87
 
90
- def collections
91
- records.map { |r| r[related_parents_parsed_mapping].split(/\s*[;|]\s*/) if r[related_parents_parsed_mapping].present? }.flatten.compact.uniq
92
- end
93
-
94
- def collections_total
95
- collections.size
96
- end
97
-
98
- # TODO: change to differentiate between collection and work records when adding ability to import collection metadata
99
- def works_total
100
- total
101
- end
102
-
103
88
  def total
104
89
  @total = importer.parser_fields['total'] || 0 if importer?
105
90
 
@@ -112,18 +97,6 @@ module Bulkrax
112
97
  @total = 0
113
98
  end
114
99
 
115
- def extra_filters
116
- output = ""
117
- if importerexporter.start_date.present?
118
- start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
119
- finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
120
- output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
121
- end
122
- output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
123
- output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
124
- output
125
- end
126
-
127
100
  def current_record_ids
128
101
  @work_ids = []
129
102
  @collection_ids = []
@@ -143,70 +116,19 @@ module Bulkrax
143
116
  @work_ids + @collection_ids + @file_set_ids
144
117
  end
145
118
 
146
- # Set the following instance variables: @work_ids, @collection_ids, @file_set_ids
147
- # @see #current_record_ids
148
- def set_ids_for_exporting_from_importer
149
- entry_ids = Importer.find(importerexporter.export_source).entries.pluck(:id)
150
- complete_statuses = Status.latest_by_statusable
151
- .includes(:statusable)
152
- .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
153
-
154
- complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
155
- extra_filters = extra_filters.presence || '*:*'
156
-
157
- { :@work_ids => ::Hyrax.config.curation_concerns, :@collection_ids => [::Collection], :@file_set_ids => [::FileSet] }.each do |instance_var, models_to_search|
158
- instance_variable_set(instance_var, ActiveFedora::SolrService.post(
159
- extra_filters.to_s,
160
- fq: [
161
- %(#{::Solrizer.solr_name(work_identifier)}:("#{complete_entry_identifiers.join('" OR "')}")),
162
- "has_model_ssim:(#{models_to_search.join(' OR ')})"
163
- ],
164
- fl: 'id',
165
- rows: 2_000_000_000
166
- )['response']['docs'].map { |obj| obj['id'] })
167
- end
168
- end
169
-
170
119
  # export methods
171
120
 
172
- def create_new_entries
173
- current_record_ids.each_with_index do |id, index|
174
- break if limit_reached?(limit, index)
175
-
176
- this_entry_class = if @collection_ids.include?(id)
177
- collection_entry_class
178
- elsif @file_set_ids.include?(id)
179
- file_set_entry_class
180
- else
181
- entry_class
182
- end
183
- new_entry = find_or_create_entry(this_entry_class, id, 'Bulkrax::Exporter')
184
-
185
- begin
186
- entry = ExportWorkJob.perform_now(new_entry.id, current_run.id)
187
- rescue => e
188
- Rails.logger.info("#{e.message} was detected during export")
189
- end
190
-
191
- self.headers |= entry.parsed_metadata.keys if entry
192
- end
193
- end
194
- alias create_from_collection create_new_entries
195
- alias create_from_importer create_new_entries
196
- alias create_from_worktype create_new_entries
197
- alias create_from_all create_new_entries
198
-
199
121
  # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
200
122
  def write_files
201
123
  require 'open-uri'
202
124
  require 'socket'
203
125
  importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].each do |entry|
204
- work = ActiveFedora::Base.find(entry.identifier)
205
- next unless Hyrax.config.curation_concerns.include?(work.class)
126
+ record = ActiveFedora::Base.find(entry.identifier)
127
+ next unless Hyrax.config.curation_concerns.include?(record.class)
206
128
  bag = BagIt::Bag.new setup_bagit_folder(entry.identifier)
207
129
  bag_entries = [entry]
208
130
 
209
- work.file_sets.each do |fs|
131
+ record.file_sets.each do |fs|
210
132
  if @file_set_ids.present?
211
133
  file_set_entry = Bulkrax::CsvFileSetEntry.where("parsed_metadata LIKE '%#{fs.id}%'").first
212
134
  bag_entries << file_set_entry unless file_set_entry.nil?
@@ -245,42 +167,6 @@ module Bulkrax
245
167
  key != source_identifier.to_s
246
168
  end
247
169
 
248
- # All possible column names
249
- def export_headers
250
- headers = sort_headers(self.headers)
251
-
252
- # we don't want access_control_id exported and we want file at the end
253
- headers.delete('access_control_id') if headers.include?('access_control_id')
254
-
255
- # add the headers below at the beginning or end to maintain the preexisting export behavior
256
- headers.prepend('model')
257
- headers.prepend(source_identifier.to_s)
258
- headers.prepend('id')
259
-
260
- headers.uniq
261
- end
262
-
263
- def object_names
264
- return @object_names if @object_names
265
-
266
- @object_names = mapping.values.map { |value| value['object'] }
267
- @object_names.uniq!.delete(nil)
268
-
269
- @object_names
270
- end
271
-
272
- def sort_headers(headers)
273
- # converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
274
- # while keeping objects grouped together
275
- headers.sort_by do |item|
276
- number = item.match(/\d+/)&.[](0) || 0.to_s
277
- sort_number = number.rjust(4, "0")
278
- object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
279
- remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
280
- "#{object_prefix}_#{sort_number}_#{remainder}"
281
- end
282
- end
283
-
284
170
  def setup_triple_metadata_export_file(id)
285
171
  File.join(importerexporter.exporter_export_path, id, 'metadata.nt')
286
172
  end
@@ -300,11 +186,6 @@ module Bulkrax
300
186
  end
301
187
  end
302
188
 
303
- def required_elements?(keys)
304
- return if keys.blank?
305
- !required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
306
- end
307
-
308
189
  # @todo - investigate getting directory structure
309
190
  # @todo - investigate using perform_later, and having the importer check for
310
191
  # DownloadCloudFileJob before it starts
@@ -355,5 +236,11 @@ module Bulkrax
355
236
  return nil unless bag.valid?
356
237
  bag
357
238
  end
239
+
240
+ # use the version of this method from the application parser instead
241
+ def real_import_file_path
242
+ return importer_unzip_path if file? && zip?
243
+ parser_fields['import_file_path']
244
+ end
358
245
  end
359
246
  end
@@ -272,8 +272,8 @@ module Bulkrax
272
272
  CsvFileSetEntry
273
273
  end
274
274
 
275
- # See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
276
- # Changed to grep as wc -l counts blank lines, and ignores the final unescaped line (which may or may not contain data)
275
+ # TODO: figure out why using the version of this method that's in the bagit parser
276
+ # breaks specs for the "if importer?" line
277
277
  def total
278
278
  @total = importer.parser_fields['total'] || 0 if importer?
279
279
  @total = limit || current_record_ids.count if exporter?
@@ -382,10 +382,11 @@ module Bulkrax
382
382
  end
383
383
 
384
384
  # Retrieve the path where we expect to find the files
385
- def path_to_files
385
+ def path_to_files(**args)
386
+ filename = args.fetch(:filename, '')
387
+
386
388
  @path_to_files ||= File.join(
387
- zip? ? importer_unzip_path : File.dirname(import_file_path),
388
- 'files'
389
+ zip? ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
389
390
  )
390
391
  end
391
392
 
@@ -29,6 +29,7 @@
29
29
 
30
30
  <%= form.input :export_source_importer,
31
31
  label: t('bulkrax.exporter.labels.importer'),
32
+ required: true,
32
33
  prompt: 'Select from the list',
33
34
  label_html: { class: 'importer export-source-option hidden' },
34
35
  input_html: { class: 'importer export-source-option hidden' },
@@ -37,6 +38,7 @@
37
38
  <%= form.input :export_source_collection,
38
39
  prompt: 'Start typing ...',
39
40
  label: t('bulkrax.exporter.labels.collection'),
41
+ required: true,
40
42
  placeholder: @collection&.title&.first,
41
43
  label_html: { class: 'collection export-source-option hidden' },
42
44
  input_html: {
@@ -50,6 +52,7 @@
50
52
 
51
53
  <%= form.input :export_source_worktype,
52
54
  label: t('bulkrax.exporter.labels.worktype'),
55
+ required: true,
53
56
  prompt: 'Select from the list',
54
57
  label_html: { class: 'worktype export-source-option hidden' },
55
58
  input_html: { class: 'worktype export-source-option hidden' },
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Bulkrax
4
- VERSION = '3.4.0'
4
+ VERSION = '3.5.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bulkrax
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.0
4
+ version: 3.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-22 00:00:00.000000000 Z
11
+ date: 2022-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rails
@@ -404,7 +404,7 @@ homepage: https://github.com/samvera-labs/bulkrax
404
404
  licenses:
405
405
  - Apache-2.0
406
406
  metadata: {}
407
- post_install_message:
407
+ post_install_message:
408
408
  rdoc_options: []
409
409
  require_paths:
410
410
  - lib
@@ -419,8 +419,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
419
419
  - !ruby/object:Gem::Version
420
420
  version: '0'
421
421
  requirements: []
422
- rubygems_version: 3.1.4
423
- signing_key:
422
+ rubygems_version: 3.0.3
423
+ signing_key:
424
424
  specification_version: 4
425
425
  summary: Import and export tool for Hyrax and Hyku
426
426
  test_files: []