hyrax-ingest 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +52 -0
  3. data/config/routes.rb +2 -0
  4. data/lib/hyrax/ingest.rb +12 -0
  5. data/lib/hyrax/ingest/batch_runner.rb +130 -0
  6. data/lib/hyrax/ingest/configuration.rb +54 -0
  7. data/lib/hyrax/ingest/engine.rb +6 -0
  8. data/lib/hyrax/ingest/errors.rb +186 -0
  9. data/lib/hyrax/ingest/fetcher.rb +55 -0
  10. data/lib/hyrax/ingest/fetcher/base.rb +78 -0
  11. data/lib/hyrax/ingest/fetcher/csv_file.rb +89 -0
  12. data/lib/hyrax/ingest/fetcher/date_time.rb +15 -0
  13. data/lib/hyrax/ingest/fetcher/literal.rb +24 -0
  14. data/lib/hyrax/ingest/fetcher/premis_event_type.rb +28 -0
  15. data/lib/hyrax/ingest/fetcher/rdf_uri.rb +21 -0
  16. data/lib/hyrax/ingest/fetcher/xml_file.rb +54 -0
  17. data/lib/hyrax/ingest/fetcher/yaml_file.rb +36 -0
  18. data/lib/hyrax/ingest/has_depositor.rb +13 -0
  19. data/lib/hyrax/ingest/has_iteration.rb +18 -0
  20. data/lib/hyrax/ingest/has_logger.rb +29 -0
  21. data/lib/hyrax/ingest/has_report.rb +17 -0
  22. data/lib/hyrax/ingest/has_shared_sip.rb +20 -0
  23. data/lib/hyrax/ingest/has_sip.rb +20 -0
  24. data/lib/hyrax/ingest/ingester.rb +75 -0
  25. data/lib/hyrax/ingest/ingester/active_fedora_base_ingester.rb +136 -0
  26. data/lib/hyrax/ingest/ingester/active_fedora_file_ingester.rb +17 -0
  27. data/lib/hyrax/ingest/ingester/active_fedora_property_assigner.rb +67 -0
  28. data/lib/hyrax/ingest/ingester/base.rb +28 -0
  29. data/lib/hyrax/ingest/ingester/file_set_ingester.rb +68 -0
  30. data/lib/hyrax/ingest/ingester/preservation_event_ingester.rb +27 -0
  31. data/lib/hyrax/ingest/ingester/work_ingester.rb +55 -0
  32. data/lib/hyrax/ingest/reporting.rb +13 -0
  33. data/lib/hyrax/ingest/reporting/configuration.rb +22 -0
  34. data/lib/hyrax/ingest/reporting/report.rb +79 -0
  35. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.html.erb +77 -0
  36. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.txt.erb +21 -0
  37. data/lib/hyrax/ingest/runner.rb +103 -0
  38. data/lib/hyrax/ingest/sip.rb +92 -0
  39. data/lib/hyrax/ingest/transformer.rb +42 -0
  40. data/lib/hyrax/ingest/transformer/base.rb +12 -0
  41. data/lib/hyrax/ingest/transformer/to_date.rb +33 -0
  42. data/lib/hyrax/ingest/version.rb +5 -0
  43. data/lib/tasks/ingest_tasks.rake +22 -0
  44. metadata +330 -0
@@ -0,0 +1,27 @@
1
+ require 'hyrax/ingest/ingester/active_fedora_base_ingester'
2
+ require 'hyrax/ingest/ingester/active_fedora_file_ingester'
3
+
4
+ module Hyrax
5
+ module Ingest
6
+ module Ingester
7
# Ingester that builds a preservation event model, attaches the configured
# related object to it, and saves it. The model class defaults to
# 'Hyrax::Preservation::Event' unless the config names another one.
class PreservationEventIngester < ActiveFedoraBaseIngester

  # The value of the :premis_event_related_object config option.
  attr_reader :premis_event_related_object

  # @param [Hash] config Ingester options. :af_model_class_name defaults to
  #   'Hyrax::Preservation::Event'. :premis_event_related_object is consumed
  #   here and is not forwarded to the superclass.
  def initialize(config={})
    # Work on a copy: the default assignment and the delete below would
    # otherwise modify the Hash object owned by the caller.
    config = config.dup
    config[:af_model_class_name] ||= 'Hyrax::Preservation::Event'
    @premis_event_related_object = config.delete(:premis_event_related_object)
    super(config)
  end

  # Calls assign_properties!, sets the related object on the model, then
  # calls save_model!.
  # @return the ActiveFedora model instance (af_model)
  def run!
    assign_properties!
    af_model.premis_event_related_object = premis_event_related_object
    save_model!
    # return the new instance of the ActiveFedora model
    af_model
  end
end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,55 @@
1
+ require 'hyrax/ingest/ingester/active_fedora_base_ingester'
2
+ require 'hyrax/ingest/ingester/file_set_ingester'
3
+
4
+ module Hyrax
5
+ module Ingest
6
+ module Ingester
7
# Ingester for a work and the file sets nested under it in the ingest
# configuration (the :FileSets option).
class WorkIngester < ActiveFedoraBaseIngester
  # The list of file set configurations taken from the :FileSets option
  # (an empty Array when the option is absent).
  attr_reader :file_sets_config

  # @param [Hash] config Ingester options. :type is used as the model class
  #   name unless :af_model_class_name is already present; :FileSets is
  #   consumed here and is not forwarded to the superclass.
  def initialize(config={})
    # Work on a copy: the deletes below would otherwise strip :type and
    # :FileSets from the Hash object owned by the caller, so a second
    # ingester built from the same Hash would silently lose its file sets.
    config = config.dup
    # TODO: Throw a useful custom error when :type option is missing.
    config[:af_model_class_name] ||= config.delete(:type)
    @file_sets_config = config.delete(:FileSets) || []
    super(config)
  end

  # Assigns properties to the work and to each nested file set, applies
  # depositor metadata, calls save_model!, and then adds the file set models
  # to the work's members.
  # @return the ActiveFedora model instance (af_model)
  def run!
    assign_properties!
    assign_related_file_set_properties!
    apply_depositor_metadata!
    save_model!
    assign_file_sets_to_work!
    # return the new instance of the ActiveFedora model
    af_model
  end

  private

    # Applies depositor metadata to the model; skipped when no depositor
    # is set.
    def apply_depositor_metadata!
      af_model.apply_depositor_metadata(depositor) if depositor
    end

    # Runs assign_properties! on every nested file set ingester.
    def assign_related_file_set_properties!
      file_set_ingesters.each { |file_set_ingester| file_set_ingester.assign_properties! }
    end

    # Appends each file set ingester's model to the work's members.
    def assign_file_sets_to_work!
      file_set_ingesters.each do |file_set_ingester|
        af_model.members += [file_set_ingester.af_model]
      end
    end

    # Builds (once) one FileSetIngester per entry in file_sets_config and
    # hands each the sip, shared_sip and iteration of this ingester when it
    # exposes the matching writer.
    # @return [Array<Hyrax::Ingest::Ingester::FileSetIngester>]
    def file_set_ingesters
      @file_set_ingesters ||= file_sets_config.map do |file_set_config|
        Hyrax::Ingest::Ingester::FileSetIngester.new(file_set_config).tap do |file_set_ingester|
          file_set_ingester.sip = sip if file_set_ingester.respond_to?(:sip=)
          file_set_ingester.shared_sip = shared_sip if file_set_ingester.respond_to?(:shared_sip=)
          file_set_ingester.iteration = iteration if file_set_ingester.respond_to?(:iteration=)
        end
      end
    end
end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,13 @@
1
+ require 'hyrax/ingest/reporting/configuration'
2
+
3
+ module Hyrax
4
+ module Ingest
5
# Holds the reporting configuration shared at the module level.
module Reporting
  # @return [Hyrax::Ingest::Reporting::Configuration] the configuration
  #   object; it is created on the first call and reused afterwards.
  def self.config
    @config = Hyrax::Ingest::Reporting::Configuration.new unless @config
    @config
  end
end
12
+ end
13
+ end
@@ -0,0 +1,22 @@
1
+ require 'hyrax/ingest/errors'
2
+
3
+ module Hyrax
4
+ module Ingest
5
+ module Reporting
6
# Default settings used when rendering and writing ingest reports.
class Configuration
  # Path of the ERB template used when no template is given explicitly.
  attr_reader :default_template_path
  # Path of the file a report is written to when no filename is given.
  attr_accessor :default_output_file

  # Defaults: the bundled HTML template next to this file, and
  # 'hyrax_ingest_report.html' expanded against the current working
  # directory at the time the configuration is created.
  def initialize
    @default_template_path = File.expand_path('../views/hyrax_ingest_report.html.erb', __FILE__)
    @default_output_file = File.expand_path('hyrax_ingest_report.html')
  end

  # @param path The template path to use by default.
  # @raise [Hyrax::Ingest::Errors::ConfigurationError] if +path+ is nil or
  #   nothing exists at +path+.
  def default_template_path=(path)
    # nil is checked first because File.exist?(nil) raises a TypeError,
    # which would bypass the configuration error callers are told about.
    if path.nil? || !File.exist?(path)
      raise Hyrax::Ingest::Errors::ConfigurationError, "\"#{path}\" does not exist"
    end
    @default_template_path = path
  end
end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,79 @@
1
+ require 'erb'
2
+ require 'functional_hash'
3
+ require 'hyrax/ingest/reporting/configuration'
4
+
5
+ module Hyrax
6
+ module Ingest
7
+ module Reporting
8
# Collects statistics and errors during an ingest and renders them through
# an ERB template.
class Report
  # The FunctionalHash that tracks ingest statistics.
  attr_reader :stat

  def initialize
    @stat = default_stat
  end

  # Builds a fresh stat tracker, stores it in @stat, and returns it.
  # @return [FunctionalHash]
  def default_stat
    tracker = FunctionalHash.new

    # All SIP paths.
    tracker[:sip_paths] = []

    # All files from SIPs that are part of the ingest.
    tracker[:files] = []

    # Keys are Fetcher classes; each value lists the occurrences of
    # missing required values for that class.
    tracker[:missing_required_values] = {}

    # Total number of occurrences across every Fetcher class.
    tracker[:total_missing_required_values] = Proc.new do |_s|
      tracker[:missing_required_values].values.map(&:count).reduce(0, :+)
    end

    # Filters the :missing_required_values hash to those for XML files.
    # tracker[:xml_files_missing_required_values] = Proc.new do |s|
    #   s[:missing_required_values].select { |fetcher_class, params| fetcher_class.to_s =~ /XMLFile$/ }
    # end

    # Functional value: the count of whatever is stored under the given
    # key, or 0 when that value cannot be counted.
    tracker[:count] = Proc.new do |s, key_to_count|
      countable = s[key_to_count]
      countable.respond_to?(:count) ? countable.count : 0
    end

    tracker[:models_saved] = []
    tracker[:models_failed] = []

    @stat = tracker
  end

  # Renders the report. The template is evaluated with this object's
  # binding, so it can call #stat, #errors and #failed? directly.
  # @param [String] template_path ERB template to use; falls back to
  #   Reporting.config.default_template_path.
  # @return [String] the rendered report
  def render(template_path: nil)
    template_path ||= Reporting.config.default_template_path
    template_content = File.read(File.expand_path(template_path))
    erb = ERB.new(template_content)
    erb.result(binding)
  end

  # Renders the report and writes it to a file.
  # @param [String] filename Output file; falls back to
  #   Reporting.config.default_output_file.
  # @param [String] template_path Passed through to #render.
  def write_to_file(filename: nil, template_path: nil)
    target = filename || Reporting.config.default_output_file
    File.write(target, render(template_path: template_path))
  end

  # Records an error that caused the ingest to fail.
  def failed_with(error)
    errors.push(error)
  end

  # @return [Array] the errors recorded so far
  def errors
    @errors ||= []
  end

  # @return [Boolean] true once at least one error has been recorded
  def failed?
    !errors.empty?
  end
end
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,77 @@
1
<%# HTML ingest report. Rendered with a binding that provides stat, errors and
    failed?. Free-text values are passed through ERB::Util.html_escape so that
    characters such as < and & in error messages, paths or fetched values do
    not break the markup. %>
<style>

  dl.inline dd {
    display: inline;
    margin: 0;
  }

  dl.inline dd:after {
    display: block;
    content: '';
  }

  dl.inline dt {
    display: inline-block;
    /*min-width: 100px;*/
    width: 200px;
  }

  h3 {
    margin-left: 20px;
  }

</style>

<h1>Hyrax Ingest Report Summary</h1>

<% if failed? %>
  <h2><strong>Ingest Failed!</strong></h2>
  <p>The following errors caused the ingest to fail:</p>
  <ol>
    <% errors.each do |error| %>
      <li><%= ERB::Util.html_escape(error) %></li>
    <% end %>
  </ol>
<% end %>


<h2>Ingest Details:</h2>
<dl class="inline">
  <dt>Ingest configuration file:</dt>
  <dd><%= ERB::Util.html_escape(stat[:config_file_path]) %></dd>
  <dt>Started at:</dt>
  <dd><%= stat[:datetime_started] %></dd>
  <dt>Completed at:</dt>
  <dd><%= stat[:datetime_completed] %></dd>
  <dt>Batch size:</dt>
  <dd><%= stat[:batch_size] %></dd>
  <dt>Total # of source files used:</dt>
  <dd><%= stat[:count, :files] %></dd>
  <dt>Records ingested:</dt>
  <dd><%= stat[:count, :models_saved] %></dd>
  <dt>Records failed:</dt>
  <dd><%= stat[:count, :models_failed] %></dd>
</dl>

<h2>Missing Required Values: <%= stat[:total_missing_required_values] %> total</h2>
<% if stat[:total_missing_required_values] > 0 %>
  <% stat[:missing_required_values].each do |fetcher_class, list_of_param_hashes| %>
    <%# Strip the namespace so only the class's own name is shown. %>
    <% fetcher_class_short_name = fetcher_class.to_s.gsub(/.*\:\:/, '') %>
    <h3>Missing required values from <%= ERB::Util.html_escape(fetcher_class_short_name) %>: <%= list_of_param_hashes.count %></h3>
    <ol>
      <% list_of_param_hashes.each do |param_hash| %>
        <li>
          <% if param_hash.empty? %>
            <em>No additional information</em>
          <% else %>
            <dl class="inline">
              <% param_hash.each do |key, val| %>
                <dt><%= ERB::Util.html_escape(key) %>:</dt><dd><%= ERB::Util.html_escape(val) %></dd>
              <% end %>
            </dl>
          <% end %>
        </li>
      <% end %>
    </ol>
  <% end %>
<% end %>
@@ -0,0 +1,21 @@
1
<%# Plain-text ingest report. Rendered with a binding that provides stat.
    The missing-values section reads :missing_required_values and
    :total_missing_required_values, the stat keys the report object defines. %>
=============================================
======== Hyrax Ingest Report Summary ========
=============================================

Ingest configuration file: <%= stat[:config_file_path] %>
Started at: <%= stat[:datetime_started] %>
Completed at: <%= stat[:datetime_completed] %>
Batch size: <%= stat[:batch_size] %>
Total # of source files used: <%= stat[:count, :files] %>
Records ingested: <%= stat[:count, :models_saved] %>
Records failed: <%= stat[:count, :models_failed] %>

Missing required values: <%= stat[:total_missing_required_values] %> total
<%# Compare against 0 explicitly: 0 is truthy in Ruby, so a bare count can
    never be used as the condition. %>
<% if stat[:total_missing_required_values] > 0 %>
<% stat[:missing_required_values].each do |fetcher_class, list_of_param_hashes| %>
Values missing from <%= fetcher_class.to_s.gsub(/.*\:\:/, '') %>: <%= list_of_param_hashes.count %>
<% list_of_param_hashes.each do |param_hash| %>
<% if param_hash.empty? %>
  - No additional information
<% else %>
  - <%= param_hash.map { |key, val| "#{key}: #{val}" }.join(', ') %>
<% end %>
<% end %>
<% end %>
<% end %>
@@ -0,0 +1,103 @@
1
+ require 'hyrax/ingest/configuration'
2
+ require 'hyrax/ingest/ingester'
3
+ require 'hyrax/ingest/reporting'
4
+ require 'hyrax/ingest/has_sip'
5
+ require 'hyrax/ingest/has_shared_sip'
6
+ require 'hyrax/ingest/has_iteration'
7
+ require 'hyrax/ingest/has_logger'
8
+ require 'hyrax/ingest/has_report'
9
+ require 'hyrax/ingest/has_depositor'
10
+
11
+
12
+ module Hyrax
13
+ module Ingest
14
# Runs a single ingest iteration: builds the ingesters listed in the ingest
# configuration, hands each one the SIP(s), iteration, logger, report and
# depositor of this runner, and runs them.
class Runner
  include Reporting
  include Interloper
  include HasSIP
  include HasSharedSIP
  include HasIteration
  include HasReport
  include HasLogger
  include HasDepositor

  # The Hyrax::Ingest::Configuration built from the config file path.
  attr_reader :config

  # Registered as a callback for #run! (before/after are not defined in
  # this class; presumably they come from Interloper).
  before(:run!) do
    logger.info "Ingest iteration #{iteration+1} started."
    # ||= keeps a value that was already recorded on the report.
    report.stat[:datetime_started] ||= DateTime.now
    report.stat[:batch_size] ||= 1
    report.stat[:files] += sip.file_paths if sip
    report.stat[:files] += shared_sip.file_paths if shared_sip
    report.stat[:config_file_path] = config.config_file_path
  end

  after(:run!) do
    logger.info "Ingest iteration #{iteration+1} complete."
    report.stat[:datetime_completed] ||= DateTime.now
  end

  # @param [String] config_file_path Path to the ingest configuration file.
  # @param [String] sip_path Path to the SIP for this iteration; no SIP is
  #   assigned when omitted.
  # @param [String] shared_sip_path Path to a shared SIP; shared_sip is set
  #   to nil when omitted.
  # @param iteration Converted with #to_i; logged as iteration + 1.
  # @param depositor Handed to every ingester that accepts a depositor.
  def initialize(config_file_path:, sip_path: nil, shared_sip_path: nil, iteration: 0, depositor: nil)
    self.sip = SIP.new(path: sip_path) if sip_path
    self.shared_sip = shared_sip_path.nil? ? nil : SIP.new(path: shared_sip_path)
    self.iteration = iteration.to_i
    self.depositor = depositor
    @config = Hyrax::Ingest::Configuration.new(config_file_path: config_file_path)
  end

  # Runs every top-level ingester.
  # @return [Array] the return value of each ingester's #run!
  def run!
    ingesters.map { |ingester| ingester.run! }
  end

  # TODO: Does not yet return IDs of objects that were ingested as
  # associated objects (i.e. objects that are nested under other objects in
  # the ingest configuration). It only returns IDs for objects that are
  # ingested per the top-level of ingest configuration.
  #
  # @return [Hash] model class => Array of the ids of that class's models;
  #   ingesters that do not respond to #af_model are skipped.
  def ingested_ids_by_type
    ingesters.each_with_object({}) do |ingester, ids_by_type|
      next unless ingester.respond_to? :af_model
      model = ingester.af_model
      (ids_by_type[model.class] ||= []) << model.id
    end
  end

  private

    # Builds (once) one ingester per entry of config.ingester_configs and
    # passes along whichever of sip, shared_sip, iteration, logger, report
    # and depositor the ingester has a writer for.
    # @raise [Hyrax::Ingest::Errors::InvalidConfig] if an entry does not
    #   respond to #keys.
    def ingesters
      @ingesters ||= config.ingester_configs.map do |ingester_config|
        # TODO: Better way to handle invalid config than throwing big
        # error msgs from here.
        raise Hyrax::Ingest::Errors::InvalidConfig.new('Ingester config must be a single key value pair, where the key is the name of the ingester, and the value is the ingester configuration.') unless ingester_config.respond_to? :keys
        ingester_name = ingester_config.keys.first
        ingester_options = ingester_config.values.first
        Hyrax::Ingest::Ingester.factory(ingester_name, ingester_options).tap do |ingester|
          ingester.sip = sip if ingester.respond_to? :sip=
          ingester.shared_sip = shared_sip if ingester.respond_to? :shared_sip=
          ingester.iteration = iteration if ingester.respond_to? :iteration=
          ingester.logger = logger if ingester.respond_to? :logger=
          ingester.report = report if ingester.respond_to? :report=
          ingester.depositor = depositor if ingester.respond_to? :depositor=
        end
      end
    end
end
102
+ end
103
+ end
@@ -0,0 +1,92 @@
1
+ require 'hyrax/ingest/errors'
2
+ require 'minitar'
3
+
4
+ module Hyrax
5
+ module Ingest
6
# A model for reading Submission Information Packages (SIPs) from a filesystem.
#
# @attr_reader [String] path The path to the SIP, as passed to #initialize.
class SIP
  attr_reader :path

  # @param [String] path The path to the SIP on the filesystem.
  # @raise [Hyrax::Ingest::Errors::InvalidSIPPath] if nothing exists at +path+.
  def initialize(path:)
    # File.exist? rather than File.exists?: the plural alias was deprecated
    # and is no longer defined as of Ruby 3.2.
    raise Hyrax::Ingest::Errors::InvalidSIPPath.new(path.to_s) unless File.exist? path.to_s
    @path = path
  end

  # The first of these that applies wins: #path itself when it is a regular
  # file, every file below #path when it is a directory, the unpacked
  # contents when `file` reports a tar archive, otherwise nothing.
  # @return [Array] The paths of the files that are part of the SIP.
  def file_paths
    @file_paths ||= single_file_path || file_paths_from_dir || file_paths_from_tarball || []
  end

  # @param [String, Regexp] basename_or_regex A file name, a Regexp, or a
  #   string representation of a regex (beginning and ending in a forward
  #   slash). A regex match is tried first, then an exact basename match.
  # @return [String] The path of the file from the SIP that matches the param.
  # @raise [Hyrax::Ingest::Errors::FileNotFoundInSIP] if no file matches.
  def find_file_path(basename_or_regex)
    file_path = file_path_from_regex(basename_or_regex) || file_path_from_basename(basename_or_regex)
    raise Hyrax::Ingest::Errors::FileNotFoundInSIP.new(path, basename_or_regex) unless file_path
    file_path
  end

  # Reads the whole content of a file from the SIP. Each call reads the
  # file from disk again.
  # @param [String, Regexp] basename_or_regex See #find_file_path.
  # @return [String] The contents of the matched file
  def read_file(basename_or_regex)
    File.read(find_file_path(basename_or_regex))
  end

  private

    # @param [String, Regexp] regex Either a Regexp object or a string
    #   beginning and ending in forward slashes, that can be converted to
    #   a regex.
    # @return [String] The path of the first file whose basename matches
    #   'regex'; nil if no file matches, or if 'regex' is neither a Regexp
    #   nor a slash-delimited string.
    def file_path_from_regex(regex)
      # If 'regex' is a string beginning and ending in slash, convert it to
      # a Regexp built from the text between the slashes.
      regex = Regexp.new(regex.to_s[1..-2]) if regex.to_s =~ /\A\/.*\/\z/
      file_paths.find { |file| File.basename(file) =~ regex } if regex.is_a? Regexp
    end

    # @param [String] filename The name of the file within the SIP you want
    #   to return.
    # @return [String] The path of the file whose basename equals
    #   'filename'; nil if no file matches.
    def file_path_from_basename(filename)
      file_paths.find { |file| File.basename(file) == filename }
    end

    # @return [Array] An Array containing the one and only file pointed to
    #   by #path; nil if #path is not a regular file.
    def single_file_path
      Array(path) if File.file? path
    end

    # @return [Array] Every regular file below #path (recursively); nil if
    #   #path is not a directory.
    def file_paths_from_dir
      if File.directory? path
        Dir.glob("#{path}/**/*").select { |entry| File.file? entry }
      end
    end

    # @return [Array] The files unpacked from #path when the `file` utility
    #   describes it as a tar archive; nil otherwise.
    def file_paths_from_tarball
      # NOTE(review): #initialize guarantees that #path exists and
      # #single_file_path claims every regular file, so a tarball on disk is
      # returned as a single file and never reaches this method. Confirm
      # whether tarballs were meant to be unpacked before single files.
      #
      # TODO: this is the best test I could find for reliably determining
      # whether a file was a TAR archive or not, but it seems finicky, as
      # it probably depends on your operating system, or what kind of tarball
      # it is. Find something better?
      #
      # The command is given as an argument list so that #path never passes
      # through a shell, and -b keeps the file name out of the output so a
      # name containing "tar archive" cannot cause a false match.
      description = IO.popen(['file', '-b', '--', path.to_s], &:read)
      if description =~ /tar archive/
        Minitar.unpack(path, tmp_dir_for_unpacked_tarball)
        Dir.glob("#{tmp_dir_for_unpacked_tarball}/**/*").select { |entry| File.file? entry }
      end
    end

    # @return [String] The directory a tarball at #path is unpacked into.
    def tmp_dir_for_unpacked_tarball
      # Dir.tmpdir is provided by the 'tmpdir' standard library, which
      # nothing else in this file loads.
      require 'tmpdir'
      "#{Dir.tmpdir}/#{File.basename(path)}.unpacked"
    end
end
91
+ end
92
+ end