hyrax-ingest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +52 -0
  3. data/config/routes.rb +2 -0
  4. data/lib/hyrax/ingest.rb +12 -0
  5. data/lib/hyrax/ingest/batch_runner.rb +130 -0
  6. data/lib/hyrax/ingest/configuration.rb +54 -0
  7. data/lib/hyrax/ingest/engine.rb +6 -0
  8. data/lib/hyrax/ingest/errors.rb +186 -0
  9. data/lib/hyrax/ingest/fetcher.rb +55 -0
  10. data/lib/hyrax/ingest/fetcher/base.rb +78 -0
  11. data/lib/hyrax/ingest/fetcher/csv_file.rb +89 -0
  12. data/lib/hyrax/ingest/fetcher/date_time.rb +15 -0
  13. data/lib/hyrax/ingest/fetcher/literal.rb +24 -0
  14. data/lib/hyrax/ingest/fetcher/premis_event_type.rb +28 -0
  15. data/lib/hyrax/ingest/fetcher/rdf_uri.rb +21 -0
  16. data/lib/hyrax/ingest/fetcher/xml_file.rb +54 -0
  17. data/lib/hyrax/ingest/fetcher/yaml_file.rb +36 -0
  18. data/lib/hyrax/ingest/has_depositor.rb +13 -0
  19. data/lib/hyrax/ingest/has_iteration.rb +18 -0
  20. data/lib/hyrax/ingest/has_logger.rb +29 -0
  21. data/lib/hyrax/ingest/has_report.rb +17 -0
  22. data/lib/hyrax/ingest/has_shared_sip.rb +20 -0
  23. data/lib/hyrax/ingest/has_sip.rb +20 -0
  24. data/lib/hyrax/ingest/ingester.rb +75 -0
  25. data/lib/hyrax/ingest/ingester/active_fedora_base_ingester.rb +136 -0
  26. data/lib/hyrax/ingest/ingester/active_fedora_file_ingester.rb +17 -0
  27. data/lib/hyrax/ingest/ingester/active_fedora_property_assigner.rb +67 -0
  28. data/lib/hyrax/ingest/ingester/base.rb +28 -0
  29. data/lib/hyrax/ingest/ingester/file_set_ingester.rb +68 -0
  30. data/lib/hyrax/ingest/ingester/preservation_event_ingester.rb +27 -0
  31. data/lib/hyrax/ingest/ingester/work_ingester.rb +55 -0
  32. data/lib/hyrax/ingest/reporting.rb +13 -0
  33. data/lib/hyrax/ingest/reporting/configuration.rb +22 -0
  34. data/lib/hyrax/ingest/reporting/report.rb +79 -0
  35. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.html.erb +77 -0
  36. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.txt.erb +21 -0
  37. data/lib/hyrax/ingest/runner.rb +103 -0
  38. data/lib/hyrax/ingest/sip.rb +92 -0
  39. data/lib/hyrax/ingest/transformer.rb +42 -0
  40. data/lib/hyrax/ingest/transformer/base.rb +12 -0
  41. data/lib/hyrax/ingest/transformer/to_date.rb +33 -0
  42. data/lib/hyrax/ingest/version.rb +5 -0
  43. data/lib/tasks/ingest_tasks.rake +22 -0
  44. metadata +330 -0
@@ -0,0 +1,27 @@
1
+ require 'hyrax/ingest/ingester/active_fedora_base_ingester'
2
+ require 'hyrax/ingest/ingester/active_fedora_file_ingester'
3
+
4
+ module Hyrax
5
+ module Ingest
6
+ module Ingester
7
# Ingester that builds and saves a preservation event record and links it
# to a related object supplied through the ingester configuration.
#
# The helpers #assign_properties!, #save_model! and #af_model are not
# defined here; they are expected to come from ActiveFedoraBaseIngester.
class PreservationEventIngester < ActiveFedoraBaseIngester

  # The object the new event is linked to (taken from the
  # :premis_event_related_object config option; nil when not given).
  attr_reader :premis_event_related_object

  # @param config [Hash] ingester options. :af_model_class_name defaults
  #   to 'Hyrax::Preservation::Event'; :premis_event_related_object is
  #   consumed here and not forwarded to the superclass.
  def initialize(config={})
    # Work on a shallow copy so the caller's hash (which may be shared,
    # e.g. a parsed configuration reused elsewhere) is not modified by
    # the default assignment and the delete below.
    config = config.dup
    config[:af_model_class_name] ||= 'Hyrax::Preservation::Event'
    @premis_event_related_object = config.delete(:premis_event_related_object)
    super(config)
  end

  # Assigns the configured properties, links the related object, and
  # saves the model.
  #
  # @return the new instance of the ActiveFedora model
  def run!
    assign_properties!
    af_model.premis_event_related_object = premis_event_related_object
    save_model!
    af_model
  end
end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,55 @@
1
+ require 'hyrax/ingest/ingester/active_fedora_base_ingester'
2
+ require 'hyrax/ingest/ingester/file_set_ingester'
3
+
4
+ module Hyrax
5
+ module Ingest
6
+ module Ingester
7
# Ingester for a work model together with the file sets nested under it
# in the ingest configuration.
#
# The helpers #assign_properties!, #save_model!, #af_model, #sip,
# #shared_sip, #iteration and #depositor are not defined here; they are
# expected to come from ActiveFedoraBaseIngester and its mixins.
class WorkIngester < ActiveFedoraBaseIngester
  # List of configurations for the nested file sets (empty when the
  # :FileSets option is absent).
  attr_reader :file_sets_config

  # @param config [Hash] ingester options. :type supplies the model class
  #   name unless :af_model_class_name is already present; :FileSets holds
  #   the nested file set configurations. Both are consumed here.
  def initialize(config={})
    # Work on a shallow copy so the caller's hash is not modified by the
    # deletes below (the same config object may be reused by the caller).
    config = config.dup
    # TODO: Throw a useful custom error when :type option is missing.
    config[:af_model_class_name] ||= config.delete(:type)
    @file_sets_config = config.delete(:FileSets) || []
    super(config)
  end

  # Assigns properties to the work and to its file sets, applies the
  # depositor, saves the work, then attaches the file sets as members.
  #
  # @return the new instance of the ActiveFedora model
  def run!
    assign_properties!
    assign_related_file_set_properties!
    apply_depositor_metadata!
    save_model!
    # File sets are attached only after the work itself has been saved.
    assign_file_sets_to_work!
    af_model
  end

  private

  # Applies depositor metadata to the model, but only when a depositor
  # has been set.
  def apply_depositor_metadata!
    af_model.apply_depositor_metadata(depositor) if depositor
  end

  # Runs property assignment on every nested file set ingester.
  def assign_related_file_set_properties!
    file_set_ingesters.each { |file_set_ingester| file_set_ingester.assign_properties! }
  end

  # Appends each file set model to the work's members.
  def assign_file_sets_to_work!
    file_set_ingesters.each do |file_set_ingester|
      af_model.members += [file_set_ingester.af_model]
    end
  end

  # Builds (once) one FileSetIngester per nested file set configuration
  # and hands it this ingester's SIP, shared SIP and iteration.
  #
  # NOTE(review): logger, report and depositor are not handed down to the
  # nested ingesters here -- confirm whether that is intentional.
  def file_set_ingesters
    @file_set_ingesters ||= file_sets_config.map do |file_set_config|
      Hyrax::Ingest::Ingester::FileSetIngester.new(file_set_config).tap do |file_set_ingester|
        file_set_ingester.sip = sip if file_set_ingester.respond_to?(:sip=)
        file_set_ingester.shared_sip = shared_sip if file_set_ingester.respond_to?(:shared_sip=)
        file_set_ingester.iteration = iteration if file_set_ingester.respond_to?(:iteration=)
      end
    end
  end
end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,13 @@
1
+ require 'hyrax/ingest/reporting/configuration'
2
+
3
+ module Hyrax
4
+ module Ingest
5
module Reporting
  # Returns the reporting configuration, creating it on first use and
  # handing back the same instance on every later call.
  #
  # @return [Hyrax::Ingest::Reporting::Configuration]
  def self.config
    @config = Hyrax::Ingest::Reporting::Configuration.new unless @config
    @config
  end
end
12
+ end
13
+ end
@@ -0,0 +1,22 @@
1
+ require 'hyrax/ingest/errors'
2
+
3
+ module Hyrax
4
+ module Ingest
5
+ module Reporting
6
# Holds the defaults used when rendering and writing ingest reports.
class Configuration
  # Path of the ERB template used when no template is given explicitly.
  attr_reader :default_template_path
  # Path of the file a report is written to when no filename is given.
  attr_accessor :default_output_file

  # Sets the defaults: the HTML template under views/ next to this file,
  # and 'hyrax_ingest_report.html' resolved against the current working
  # directory.
  def initialize
    @default_template_path = File.expand_path('../views/hyrax_ingest_report.html.erb', __FILE__)
    @default_output_file = File.expand_path('hyrax_ingest_report.html')
  end

  # Changes the default template, refusing paths that do not exist.
  #
  # @param path [String] path to an existing template file
  # @raise [Hyrax::Ingest::Errors::ConfigurationError] when the path does
  #   not exist (including a nil path)
  def default_template_path=(path)
    # path.to_s keeps a nil path from escaping as a raw TypeError out of
    # File.exist?; it is reported as a configuration error like any other
    # missing path.
    raise Hyrax::Ingest::Errors::ConfigurationError, "\"#{path}\" does not exist" unless File.exist?(path.to_s)
    @default_template_path = path
  end
end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,79 @@
1
+ require 'erb'
2
+ require 'functional_hash'
3
+ require 'hyrax/ingest/reporting/configuration'
4
+
5
+ module Hyrax
6
+ module Ingest
7
+ module Reporting
8
# Collects statistics and errors for an ingest and renders them through
# an ERB template.
class Report
  # The stat tracker built by #default_stat.
  attr_reader :stat

  def initialize
    @stat = default_stat
  end

  # Builds a fresh stat tracker (a FunctionalHash populated like a plain
  # Hash), stores it in @stat and returns it.
  def default_stat
    tracker = FunctionalHash.new

    # All SIP paths.
    tracker[:sip_paths] = []

    # All files from SIPs that are part of the ingest.
    tracker[:files] = []

    # Keys are Fetcher classes; values hold the occurrences of missing
    # required values for that class.
    tracker[:missing_required_values] = {}

    # Total number of occurrences across all fetcher classes.
    tracker[:total_missing_required_values] = proc do |s|
      tracker[:missing_required_values].inject(0) do |running_total, (_fetcher_class, occurrences)|
        running_total + occurrences.count
      end
    end

    # Filters the :missing_required_values hash to those for XML files.
    # tracker[:xml_files_missing_required_values] = proc do |s|
    #   s[:missing_required_values].select { |fetcher_class, params| fetcher_class.to_s =~ /XMLFile$/ }
    # end

    # Functional value giving the count of whatever is stored under the
    # given key; 0 when that value cannot be counted.
    tracker[:count] = proc do |s, key_to_count|
      if s[key_to_count].respond_to?(:count)
        s[key_to_count].count
      else
        0
      end
    end

    tracker[:models_saved] = []
    tracker[:models_failed] = []

    @stat = tracker
  end

  # Renders the report through an ERB template evaluated against this
  # object's binding.
  #
  # @param template_path [String, nil] template to use; falls back to the
  #   configured default template when nil
  # @return [String] the rendered report
  def render(template_path: nil)
    template_path ||= Reporting.config.default_template_path
    template_content = File.read(File.expand_path(template_path))
    ERB.new(template_content).result(binding)
  end

  # Renders the report and writes it to a file.
  #
  # @param filename [String, nil] output file; falls back to the
  #   configured default output file when nil
  # @param template_path [String, nil] passed through to #render
  def write_to_file(filename: nil, template_path: nil)
    target = filename || Reporting.config.default_output_file
    File.write(target, render(template_path: template_path))
  end

  # Records an error that caused the ingest to fail.
  def failed_with(error)
    errors.push(error)
  end

  # @return [Array] the errors recorded so far
  def errors
    @errors = [] unless @errors
    @errors
  end

  # @return [Boolean] true once at least one error has been recorded
  def failed?
    errors.size > 0
  end
end
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,77 @@
1
+ <style>
2
+
3
+ dl.inline dd {
4
+ display: inline;
5
+ margin: 0;
6
+ }
7
+
8
+ dl.inline dd:after {
9
+ display: block;
10
+ content: '';
11
+ }
12
+
13
+ dl.inline dt {
14
+ display: inline-block;
15
+ /*min-width: 100px;*/
16
+ width: 200px;
17
+ }
18
+
19
+ h3 {
20
+ margin-left: 20px;
21
+ }
22
+
23
+ </style>
24
+
25
+ <h1>Hyrax Ingest Report Summary</h1>
26
+
27
+ <% if failed? %>
28
+ <h2><strong>Ingest Failed!</strong></h2>
29
+ <p>The following errors caused the ingest to fail:</p>
30
+ <ol>
31
+ <% errors.each do |error| %>
32
+ <li><%= error %></li>
33
+ <% end %>
34
+ </ol>
35
+ <% end %>
36
+
37
+
38
+ <h2>Ingest Details:</h2>
39
+ <dl class="inline">
40
+ <dt>Ingest configuration file:</dt>
41
+ <dd><%= stat[:config_file_path] %></dd>
42
+ <dt>Started at:</dt>
43
+ <dd><%= stat[:datetime_started] %></dd>
44
+ <dt>Completed at:</dt>
45
+ <dd><%= stat[:datetime_completed] %></dd>
46
+ <dt>Batch size:</dt>
47
+ <dd><%= stat[:batch_size] %></dd>
48
+ <dt>Total # of source files used:</dt>
49
+ <dd><%= stat[:count, :files] %></dd>
50
+ <dt>Records ingested:</dt>
51
+ <dd><%= stat[:count, :models_saved] %></dd>
52
+ <dt>Records failed:</dt>
53
+ <dd><%= stat[:count, :models_failed] %></dd>
54
+ </dl>
55
+
56
+ <h2>Missing Required Values: <%= stat[:total_missing_required_values] %> total</h2>
57
+ <% if stat[:total_missing_required_values] > 0 %>
58
+ <% stat[:missing_required_values].each do |fetcher_class, list_of_param_hashes| %>
59
+ <% fetcher_class_short_name = fetcher_class.to_s.gsub(/.*\:\:/, '') %>
60
+ <h3>Missing required values from <%= fetcher_class_short_name %>: <%= list_of_param_hashes.count %></h3>
61
+ <ol>
62
+ <% list_of_param_hashes.each do |param_hash| %>
63
+ <li>
64
+ <% if param_hash.empty? %>
65
+ <em>No additional information</em>
66
+ <% else %>
67
+ <dl class="inline">
68
+ <% param_hash.each do |key, val| %>
69
+ <dt><%= key%>:</dt><dd><%= val %></dd>
70
+ <% end %>
71
+ </dl>
72
+ <% end %>
73
+ </li>
74
+ <% end%>
75
+ </ol>
76
+ <% end %>
77
+ <% end %>
@@ -0,0 +1,21 @@
1
=============================================
======== Hyrax Ingest Report Summary ========
=============================================

Ingest configuration file: <%= stat[:config_file_path] %>
Started at: <%= stat[:datetime_started] %>
Completed at: <%= stat[:datetime_completed] %>
Batch size: <%= stat[:batch_size] %>
Total # of source files used: <%= stat[:count, :files] %>
Records ingested: <%= stat[:count, :models_saved] %>
Records failed: <%= stat[:count, :models_failed] %>

<%# stat[:count, key] yields 0 when nothing countable is stored under key, and 0 is truthy in Ruby, so the count must be compared explicitly. %>
<%# NOTE(review): the default stat tracker does not define :xml_files_missing_required_values (its definition is commented out), so this section only renders when a caller sets that stat. %>
<% if stat[:count, :xml_files_missing_required_values] > 0 %>
XML Files missing required values...
<% stat[:xml_files_missing_required_values].each do |filename, xpaths| %>
Values missing from: <%= filename %>
<% xpaths.each do |xpath| %>
- <%= xpath %>
<% end %>
<% end %>
<% end %>
@@ -0,0 +1,103 @@
1
+ require 'hyrax/ingest/configuration'
2
+ require 'hyrax/ingest/ingester'
3
+ require 'hyrax/ingest/reporting'
4
+ require 'hyrax/ingest/has_sip'
5
+ require 'hyrax/ingest/has_shared_sip'
6
+ require 'hyrax/ingest/has_iteration'
7
+ require 'hyrax/ingest/has_logger'
8
+ require 'hyrax/ingest/has_report'
9
+ require 'hyrax/ingest/has_depositor'
10
+
11
+
12
+ module Hyrax
13
+ module Ingest
14
# Runs the ingesters described by an ingest configuration file for one
# iteration, feeding statistics into the report around each run.
class Runner
  include Reporting
  include Interloper
  include HasSIP
  include HasSharedSIP
  include HasIteration
  include HasReport
  include HasLogger
  include HasDepositor

  # The Hyrax::Ingest::Configuration built from the config file path.
  attr_reader :config

  # Hook registered to run before #run! (presumably provided by
  # Interloper): logs the start and records start time, batch size,
  # source files and config path. The ||= assignments keep values that
  # were already set on the report.
  before(:run!) do
    logger.info "Ingest iteration #{iteration+1} started."
    report.stat[:datetime_started] ||= DateTime.now
    report.stat[:batch_size] ||= 1
    report.stat[:files] += sip.file_paths if sip
    report.stat[:files] += shared_sip.file_paths if shared_sip
    report.stat[:config_file_path] = config.config_file_path
  end

  # Hook registered to run after #run!: logs completion and records the
  # completion time unless one is already set.
  after(:run!) do
    logger.info "Ingest iteration #{iteration+1} complete."
    report.stat[:datetime_completed] ||= DateTime.now
  end

  # @param config_file_path [String] path to the ingest configuration
  # @param sip_path [String, nil] path to the SIP for this run
  # @param shared_sip_path [String, nil] path to a SIP shared across runs
  # @param iteration [#to_i] iteration number (logged as iteration + 1)
  # @param depositor [Object, nil] depositor handed to the ingesters
  def initialize(config_file_path:, sip_path: nil, shared_sip_path: nil, iteration: 0, depositor: nil)
    self.sip = SIP.new(path: sip_path) if sip_path
    # The shared SIP setter is always called, with nil when no path is given.
    self.shared_sip = shared_sip_path.nil? ? nil : SIP.new(path: shared_sip_path)
    self.iteration = iteration.to_i
    self.depositor = depositor
    @config = Hyrax::Ingest::Configuration.new(config_file_path: config_file_path)
  end

  # Runs every configured ingester.
  #
  # @return [Array] the return value of each ingester's #run!
  def run!
    ingesters.map { |ingester| ingester.run! }
  end

  # TODO: Does not yet return IDs of associated objects that were ingested
  # as associated objects (i.e. objects that are nested under other
  # objects in the ingest configuration). It only returns IDs for objects that
  # are ingested per the top-level of ingest configuration.
  #
  # @return [Hash] model class => list of ids, for every ingester that
  #   exposes an af_model
  def ingested_ids_by_type
    ingesters.each_with_object({}) do |ingester, ids_by_type|
      next unless ingester.respond_to? :af_model
      ids_by_type[ingester.af_model.class] ||= []
      ids_by_type[ingester.af_model.class] << ingester.af_model.id
    end
  end

  private

  # Builds (once) one ingester per entry of config.ingester_configs and
  # passes this runner's SIPs, iteration, logger, report and depositor to
  # every ingester that accepts them.
  #
  # @raise [Hyrax::Ingest::Errors::InvalidConfig] when an entry does not
  #   respond to #keys
  def ingesters
    @ingesters ||= config.ingester_configs.map do |ingester_config|
      # TODO: Better way to handle invalid config than throwing big
      # error msgs from here.
      raise Hyrax::Ingest::Errors::InvalidConfig.new('Ingester config must be a single key value pair, where the key is the name of the ingester, and the value is the ingester configuration.') unless ingester_config.respond_to? :keys
      ingester_name = ingester_config.keys.first
      ingester_options = ingester_config.values.first
      Hyrax::Ingest::Ingester.factory(ingester_name, ingester_options).tap do |ingester|
        ingester.sip = sip if ingester.respond_to? :sip=
        ingester.shared_sip = shared_sip if ingester.respond_to? :shared_sip=
        ingester.iteration = iteration if ingester.respond_to? :iteration=
        ingester.logger = logger if ingester.respond_to? :logger=
        ingester.report = report if ingester.respond_to? :report=
        ingester.depositor = depositor if ingester.respond_to? :depositor=
      end
    end
  end
end
102
+ end
103
+ end
@@ -0,0 +1,92 @@
1
+ require 'hyrax/ingest/errors'
2
+ require 'minitar'
3
+
4
+ module Hyrax
5
+ module Ingest
6
# A model for reading Submission Information Packages (SIPs) from a
# filesystem. A SIP may be a single file, a directory tree, or a tarball.
#
# @attr_reader [String] path The path to the SIP on the filesystem.
class SIP
  attr_reader :path

  # @param [String] path The path to the SIP on the filesystem.
  # @raise [Hyrax::Ingest::Errors::InvalidSIPPath] when nothing exists at
  #   the path
  def initialize(path:)
    # File.exist? rather than File.exists?: the latter was deprecated and
    # has been removed from current Ruby versions.
    raise Hyrax::Ingest::Errors::InvalidSIPPath.new(path.to_s) unless File.exist?(path.to_s)
    @path = path
  end

  # Lists (and memoizes) the paths of the files that make up the SIP.
  # Sources are tried in order: single file, directory, tarball.
  #
  # NOTE(review): a tarball is itself a regular file, so the single-file
  # case appears to win before the tarball case is ever tried -- confirm
  # the intended precedence.
  #
  # @return [Array<String>] paths of the files that are part of the SIP;
  #   empty when the path is none of the supported kinds
  def file_paths
    @file_paths ||= single_file_path || file_paths_from_dir || file_paths_from_tarball || []
  end

  # @param [String, Regexp] basename_or_regex A file basename, a Regexp,
  #   or a string representation of a regex (wrapped in forward slashes)
  # @return [String] The path of the first file in the SIP that matches.
  # @raise [Hyrax::Ingest::Errors::FileNotFoundInSIP] when no file matches
  def find_file_path(basename_or_regex)
    file_path = file_path_from_regex(basename_or_regex) || file_path_from_basename(basename_or_regex)
    raise Hyrax::Ingest::Errors::FileNotFoundInSIP.new(path, basename_or_regex) unless file_path
    file_path
  end

  # Reads the whole content of a file from the SIP.
  #
  # @param [String, Regexp] basename_or_regex See #find_file_path
  # @return [String] The contents of the matched file
  def read_file(basename_or_regex)
    File.read(find_file_path(basename_or_regex))
  end

  private

  # @param [String, Regexp] regex Either a Regexp object or a string
  #   beginning and ending in forward slashes, that can be converted to
  #   a regex.
  # @return [String, nil] The path of the first file whose basename
  #   matches; nil if no file matches 'regex', or if 'regex' cannot be
  #   used as a regular expression.
  def file_path_from_regex(regex)
    # If 'regex' is a string beginning and ending in slash, convert it to
    # a Regexp (dropping the surrounding slashes).
    regex = Regexp.new(regex.to_s[1..-2]) if regex.to_s =~ /^\/.*\/$/
    return unless regex.is_a? Regexp
    file_paths.find { |file| File.basename(file) =~ regex }
  end

  # @param [String] filename The basename of the file within the SIP you
  #   want to return.
  # @return [String, nil] The path of the first file whose basename equals
  #   'filename'; nil if no file matches.
  def file_path_from_basename(filename)
    file_paths.find { |file| File.basename(file) == filename }
  end

  # @return [Array, nil] An Array containing the one and only file pointed
  #   to by #path; nil when #path is not a regular file.
  def single_file_path
    Array(path) if File.file? path
  end

  # @return [Array<String>, nil] All regular files below #path (searched
  #   recursively); nil when #path is not a directory.
  def file_paths_from_dir
    return unless File.directory? path
    Dir.glob("#{path}/**/*").select { |entry| File.file? entry }
  end

  # Unpacks the tarball into a temporary directory and lists its files.
  #
  # @return [Array<String>, nil] Regular files unpacked from the tarball;
  #   nil when #path does not look like a tar archive.
  def file_paths_from_tarball
    return unless tarball?
    Minitar.unpack(path, tmp_dir_for_unpacked_tarball)
    # Keep regular files only, matching what #file_paths_from_dir returns.
    Dir.glob("#{tmp_dir_for_unpacked_tarball}/**/*").select { |entry| File.file? entry }
  end

  # TODO: this is the best test I could find for reliably determining
  # whether a file was a TAR archive or not, but it seems finicky, as
  # it probably depends on your operating system, or what kind of tarball
  # it is. Find something better?
  #
  # @return [Integer, nil] truthy when the `file` utility describes #path
  #   as a tar archive
  def tarball?
    # The path is passed as a separate argv element (no shell involved),
    # so quotes or other shell metacharacters in it cannot break the call.
    IO.popen(['file', path.to_s], &:read).to_s =~ /tar archive/
  end

  # @return [String] Directory the tarball is unpacked into.
  def tmp_dir_for_unpacked_tarball
    # Dir.tmpdir is defined by the 'tmpdir' standard library; it is loaded
    # lazily here because only the tarball case needs it.
    require 'tmpdir'
    "#{Dir.tmpdir}/#{File.basename(path)}.unpacked"
  end
end
91
+ end
92
+ end