hyrax-ingest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +52 -0
  3. data/config/routes.rb +2 -0
  4. data/lib/hyrax/ingest.rb +12 -0
  5. data/lib/hyrax/ingest/batch_runner.rb +130 -0
  6. data/lib/hyrax/ingest/configuration.rb +54 -0
  7. data/lib/hyrax/ingest/engine.rb +6 -0
  8. data/lib/hyrax/ingest/errors.rb +186 -0
  9. data/lib/hyrax/ingest/fetcher.rb +55 -0
  10. data/lib/hyrax/ingest/fetcher/base.rb +78 -0
  11. data/lib/hyrax/ingest/fetcher/csv_file.rb +89 -0
  12. data/lib/hyrax/ingest/fetcher/date_time.rb +15 -0
  13. data/lib/hyrax/ingest/fetcher/literal.rb +24 -0
  14. data/lib/hyrax/ingest/fetcher/premis_event_type.rb +28 -0
  15. data/lib/hyrax/ingest/fetcher/rdf_uri.rb +21 -0
  16. data/lib/hyrax/ingest/fetcher/xml_file.rb +54 -0
  17. data/lib/hyrax/ingest/fetcher/yaml_file.rb +36 -0
  18. data/lib/hyrax/ingest/has_depositor.rb +13 -0
  19. data/lib/hyrax/ingest/has_iteration.rb +18 -0
  20. data/lib/hyrax/ingest/has_logger.rb +29 -0
  21. data/lib/hyrax/ingest/has_report.rb +17 -0
  22. data/lib/hyrax/ingest/has_shared_sip.rb +20 -0
  23. data/lib/hyrax/ingest/has_sip.rb +20 -0
  24. data/lib/hyrax/ingest/ingester.rb +75 -0
  25. data/lib/hyrax/ingest/ingester/active_fedora_base_ingester.rb +136 -0
  26. data/lib/hyrax/ingest/ingester/active_fedora_file_ingester.rb +17 -0
  27. data/lib/hyrax/ingest/ingester/active_fedora_property_assigner.rb +67 -0
  28. data/lib/hyrax/ingest/ingester/base.rb +28 -0
  29. data/lib/hyrax/ingest/ingester/file_set_ingester.rb +68 -0
  30. data/lib/hyrax/ingest/ingester/preservation_event_ingester.rb +27 -0
  31. data/lib/hyrax/ingest/ingester/work_ingester.rb +55 -0
  32. data/lib/hyrax/ingest/reporting.rb +13 -0
  33. data/lib/hyrax/ingest/reporting/configuration.rb +22 -0
  34. data/lib/hyrax/ingest/reporting/report.rb +79 -0
  35. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.html.erb +77 -0
  36. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.txt.erb +21 -0
  37. data/lib/hyrax/ingest/runner.rb +103 -0
  38. data/lib/hyrax/ingest/sip.rb +92 -0
  39. data/lib/hyrax/ingest/transformer.rb +42 -0
  40. data/lib/hyrax/ingest/transformer/base.rb +12 -0
  41. data/lib/hyrax/ingest/transformer/to_date.rb +33 -0
  42. data/lib/hyrax/ingest/version.rb +5 -0
  43. data/lib/tasks/ingest_tasks.rake +22 -0
  44. metadata +330 -0
@@ -0,0 +1,55 @@
1
+ require 'hyrax/ingest/fetcher/xml_file'
2
+ require 'hyrax/ingest/fetcher/yaml_file'
3
+ require 'hyrax/ingest/fetcher/csv_file'
4
+ require 'hyrax/ingest/fetcher/literal'
5
+ require 'hyrax/ingest/fetcher/premis_event_type'
6
+ require 'hyrax/ingest/fetcher/rdf_uri'
7
+ require 'hyrax/ingest/fetcher/date_time'
8
+ require 'hyrax/ingest/errors'
9
+
10
module Hyrax
  module Ingest
    # Registry of the known fetcher classes plus a small factory for
    # instantiating them by (possibly un-namespaced) class name.
    module Fetcher
      class << self
        # @return [Set<Class>] The set of all fetcher classes.
        def all_classes
          @all_classes ||= Set.new(
            [
              Hyrax::Ingest::Fetcher::XMLFile,
              Hyrax::Ingest::Fetcher::YAMLFile,
              Hyrax::Ingest::Fetcher::CSVFile,
              Hyrax::Ingest::Fetcher::Literal,
              Hyrax::Ingest::Fetcher::PremisEventType,
              Hyrax::Ingest::Fetcher::RdfUri,
              Hyrax::Ingest::Fetcher::DateTime
            ]
          )
        end

        # Instantiates the fetcher class identified by `class_name`.
        # @param [String] class_name The fetcher class name, with or without
        #   namespaces (see #find_class_by_name).
        # @param [Hash, Object, nil] options The value passed to the
        #   constructor of the fetcher class; nil is replaced by an empty hash.
        # @raise [Hyrax::Ingest::Errors::UnknownFetcherClass] See #find_class_by_name.
        # @raise [Hyrax::Ingest::Errors::AmbiguousFetcherClass] See #find_class_by_name.
        # @return An instance of the fetcher class.
        def factory(class_name, options = {})
          find_class_by_name(class_name).new(options || {})
        end

        # Looks up a fetcher class by its fully qualified name or by any
        # trailing portion of it that starts on a namespace boundary, e.g.
        # "XMLFile", "Fetcher::XMLFile" or "Hyrax::Ingest::Fetcher::XMLFile".
        # The name is compared literally (no regular expression), so partial
        # names such as "XML" and names containing regexp metacharacters do
        # not match by accident.
        # @param [String] class_name The stringified class name, with or
        #   without namespaces.
        # @raise [Hyrax::Ingest::Errors::UnknownFetcherClass] When there is no
        #   corresponding fetcher class for the given value of the `class_name`
        #   param.
        # @raise [Hyrax::Ingest::Errors::AmbiguousFetcherClass] When the value of
        #   `class_name` param is insufficient in determining a fetcher class.
        # @return [Class] The appropriate fetcher class.
        def find_class_by_name(class_name)
          name = class_name.to_s
          namespaced_suffix = "::#{name}"
          found_classes = all_classes.select do |class_const|
            const_name = class_const.to_s
            const_name == name || const_name.end_with?(namespaced_suffix)
          end
          raise Hyrax::Ingest::Errors::UnknownFetcherClass.new(class_name, all_classes) if found_classes.count == 0
          raise Hyrax::Ingest::Errors::AmbiguousFetcherClass.new(class_name, found_classes) if found_classes.count > 1
          found_classes.first
        end
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'hyrax/ingest/errors'
2
+ require 'hyrax/ingest/sip'
3
+ require 'interloper'
4
+ require 'hyrax/ingest/has_logger'
5
+ require 'hyrax/ingest/has_report'
6
+
7
+
8
+
9
module Hyrax
  module Ingest
    module Fetcher
      # Base class for fetchers. It stores the `:required` flag, exposes the
      # #fetch interface, and records required values that came back empty.
      class Base
        include Interloper
        include HasReport
        include HasLogger

        # Callback to log missing values that have been marked as required.
        after(:fetch) do
          # Use a flag to only report the missing value once.
          unless @after_fetch_run_once
            report_missing_required_value if required? && fetched_value_is_empty?
            @after_fetch_run_once = true
          end
        end

        # @param [Hash] options
        # @option options [Boolean] :required Whether an empty fetched value
        #   should be reported as missing.
        def initialize(options = {})
          # Read rather than delete, so the caller's hash is left untouched.
          @required = options[:required]
        end

        # Boolean reader for @required.
        # @return [Boolean]
        def required?
          @required ? true : false
        end

        # Subclasses should override this method with the logic required to
        # fetch values from a SIP. The overridden method should set
        # @fetched_value, and return it.
        # @see Hyrax::Ingest::Fetcher::XMLFile#fetch
        # @abstract
        def fetch
          @fetched_value
        end

        protected

        # Determines whether the value that was fetched by #fetch is empty
        # or not. By default, the fetched value is empty if it is nil, an
        # empty string, an empty array, an empty hash, or an array
        # containing any combination of those. Overwrite this method in
        # subclasses to change the definition of empty in those contexts.
        # The return value is used for reporting which values are missing,
        # but required.
        # @return [Boolean] True if @fetched_value is considered to be empty; false otherwise.
        def fetched_value_is_empty?
          Array(@fetched_value).all? do |val|
            val.nil? || (val.respond_to?(:empty?) && val.empty?)
          end
        end

        # Reports occurrences of missing required values: logs a warning and
        # appends `params` to the report's :missing_required_values stat,
        # keyed by the fetcher class.
        # Subclasses should override this method to provide further detail
        # by passing an options hash that will be available for reporting.
        # @see Hyrax::Ingest::Fetcher::XMLFile#report_missing_required_value
        # @param [Hash] params Extra detail to record alongside the occurrence.
        # @example
        #   # Provide additional info to the report.
        #   def report_missing_required_value
        #     super(foo: "bar")
        #   end
        def report_missing_required_value(params = {})
          short_class_name = self.class.to_s.split('::').last
          logger.warn "Missing required value from #{short_class_name} with params = #{params}"
          (report.stat[:missing_required_values][self.class] ||= []) << params
        end
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
+ require 'hyrax/ingest/fetcher/base'
2
+ require 'hyrax/ingest/has_sip'
3
+ require 'hyrax/ingest/has_iteration'
4
+ require 'roo'
5
+
6
module Hyrax
  module Ingest
    module Fetcher
      # Fetches the value of a single cell from a CSV file found in the SIP.
      # The column is given either as a header name or as a column number;
      # the row is given either as a number or as the keyword 'next', which
      # is resolved from the current iteration.
      #
      # NOTE(review): unlike Literal and YAMLFile, this class does not call
      # `inherit_callbacks_for :fetch` — confirm whether the required-value
      # reporting defined on Base is meant to apply here.
      class CSVFile < Base
        # `row_offset` is not listed here: the private #row_offset below is
        # the effective reader and adds the header-row fallback.
        attr_reader :filename, :column, :row, :header_row_number

        include HasSIP
        include HasIteration

        # @param [Hash] options
        # @option options [String] :filename (required) Name of the CSV file in the SIP.
        # @option options [String, Integer] :column (required) Header name or column number.
        # @option options [String, Integer] :row (required) Row number or 'next'.
        # @option options [Integer] :header_row_number (1) Row holding the headers.
        # @option options [Integer] :row_offset Added to the resolved row;
        #   falls back to the header row number when not given.
        # @raise [ArgumentError] When a required option is missing.
        def initialize(options={})
          raise ArgumentError, "Required option :filename is missing" unless options.key?(:filename)
          raise ArgumentError, "Required option :column is missing" unless options.key?(:column)
          raise ArgumentError, "Required option :row is missing" unless options.key?(:row)

          @filename = options[:filename]
          @column = options[:column].to_s.strip.downcase
          @row = options[:row].to_s.strip.downcase
          @header_row_number = options.fetch(:header_row_number, 1)
          @row_offset = options[:row_offset]
          super
        end

        # @return The value of the configured cell (memoized).
        def fetch
          @fetched_value ||= cell_value
        end

        private

        # @return [Roo::CSV] The spreadsheet reader for the CSV file (memoized).
        def roo
          @roo ||= Roo::CSV.new(sip.find_file_path(filename))
        end

        def cell_value
          roo.cell(row_number, column_number)
        end

        # @return The column number from which to retrieve the cell.
        # @raise [ArgumentError] When `column` is neither a known header nor a number.
        def column_number
          @column_number ||= column_number_from_header || specific_column_number
          # TODO: custom error
          raise ArgumentError, "Value for column: option must be a number or a column header; '#{column}' was given." if @column_number.nil?
          @column_number
        end

        # @return [Integer, nil] 1-based position of `column` among the
        #   downcased headers, or nil when it is not a header.
        def column_number_from_header
          header_index = headers.index(column)
          header_index + 1 if header_index
        end

        # @return [Integer, nil] `column` as an integer, or nil when it is not numeric.
        def specific_column_number
          column.to_i if string_is_integer?(column)
        end

        # @return [Integer] The resolved row number plus #row_offset.
        # @raise [ArgumentError] When `row` is neither a number nor 'next'.
        def row_number
          @row_number ||= next_row || specific_row_number
          # TODO: custom error
          raise ArgumentError, "Value for row: option must be a number or the keyword 'next'; #{@row} was given." if @row_number.nil?
          @row_number + row_offset
        end

        # @return [Integer, nil] iteration + 1 when `row` is 'next'; nil otherwise.
        def next_row
          iteration + 1 if @row == 'next'
        end

        # @return [Integer, nil] `row` as an integer, or nil when it is not numeric.
        def specific_row_number
          row.to_i if string_is_integer?(row)
        end

        # @return [Integer] The explicit :row_offset option, or the header
        #   row number when none was given.
        def row_offset
          @row_offset || header_row_number
        end

        # @return [Array<String>] Downcased header cells of the header row (memoized).
        def headers
          @headers ||= roo.row(header_row_number).map(&:to_s).map(&:downcase)
        end

        # Tells whether `str` is the decimal representation of an integer,
        # ignoring leading zeros. The argument is not modified, so the
        # original option value remains intact for error messages.
        # @param [String] str
        # @return [Boolean]
        def string_is_integer?(str)
          # remove leading zeros first (on a copy)
          without_leading_zeros = str.to_s.sub(/\A0+/, '')
          without_leading_zeros.to_i.to_s == without_leading_zeros
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'hyrax/ingest/errors'
2
+ require 'hyrax/ingest/sip'
3
+ require 'date'
4
+
5
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher whose value is the current date and time, captured on the
      # first call to #fetch.
      class DateTime < Base
        # @return [::DateTime] The timestamp taken on the first call; later
        #   calls return the same object.
        def fetch
          return @fetched_value if @fetched_value

          @fetched_value = ::DateTime.now
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'hyrax/ingest/errors'
2
+ require 'hyrax/ingest/sip'
3
+
4
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that returns a fixed value given at construction time.
      class Literal < Base
        attr_reader :literal_value

        inherit_callbacks_for :fetch

        # @param [Hash, Object] options Either a hash with a `:value` key
        #   (plus any options understood by Base), or the literal value itself.
        def initialize(options={})
          # Work on a copy so the caller's hash is never modified.
          options = options.is_a?(Hash) ? options.dup : { value: options }
          @literal_value = options[:value]
          super(options)
        end

        # @return The literal value passed to the constructor.
        def fetch
          @fetched_value = literal_value
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # This class adheres to the Hyrax::Ingest::Fetcher::Base interface to work with other classes within the Hyrax::Ingest
2
+ # gem. It effectively maps PREMIS Event Type abbreviations to the URI that is then associated with a
3
+ # Hyrax::Preservation::Event model.
4
+
5
+ require 'hyrax/ingest/errors'
6
+ require 'hyrax/ingest/sip'
7
+
8
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that resolves a PREMIS event type abbreviation to the URI of
      # the matching Hyrax::Preservation::PremisEventType.
      class PremisEventType < Base
        inherit_callbacks_for :fetch

        attr_reader :abbr

        # @param [Hash, Object] options Either a hash with an `:abbr` key
        #   (plus any options understood by Base), or the abbreviation itself.
        def initialize(options={})
          # Work on a copy so the caller's hash is never modified.
          options = options.is_a?(Hash) ? options.dup : { abbr: options }
          @abbr = options[:abbr]
          super(options)
        end

        # @return The `uri` of the event type found for `abbr` (memoized).
        # NOTE(review): if find_by_abbr can return nil for an unknown
        # abbreviation, this raises NoMethodError — confirm and consider a
        # dedicated error.
        def fetch
          @fetched_value ||= Hyrax::Preservation::PremisEventType.find_by_abbr(@abbr).uri
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'rdf'
2
+
3
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that wraps a URI string in an RDF::URI.
      #
      # NOTE(review): unlike Literal and YAMLFile, this class does not call
      # `inherit_callbacks_for :fetch` — confirm whether that is intended.
      class RdfUri < Base
        attr_reader :uri_str

        # @param [Hash, Object] options Either a hash with a `:uri_str` key
        #   (plus any options understood by Base), or the URI string itself.
        def initialize(options={})
          # Work on a copy so the caller's hash is never modified.
          options = options.is_a?(Hash) ? options.dup : { uri_str: options }
          @uri_str = options[:uri_str]
          super(options)
        end

        # @return [RDF::URI] The URI built from `uri_str` (memoized).
        def fetch
          @fetched_value ||= ::RDF::URI.new(uri_str)
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'hyrax/ingest/fetcher/base'
2
+ require 'hyrax/ingest/has_sip'
3
+ require 'nokogiri'
4
+ require 'hyrax/ingest/reporting'
5
+ require 'interloper'
6
+
7
module Hyrax
  module Ingest
    module Fetcher
      # Fetches the text of every node matching an XPath expression in an
      # XML file read from the SIP. Namespaces are stripped before querying.
      class XMLFile < Base
        include HasSIP

        attr_reader :filename, :xpath, :default, :fetched_value

        inherit_callbacks_after :fetch

        # @param [Hash] options
        # @option options [String] :filename (required) Name of the XML file in the SIP.
        # @option options [String] :xpath (required) XPath expression to evaluate.
        # @option options :default ([]) Value returned when the XPath matches nothing.
        # @raise [ArgumentError] When :filename or :xpath is missing.
        def initialize(options={})
          unless options.key?(:filename)
            raise ArgumentError, "Required option :filename is missing"
          end
          unless options.key?(:xpath)
            raise ArgumentError, "Required option :xpath is missing"
          end

          @filename = options[:filename]
          @xpath = options[:xpath]
          @default = options[:default] || []
          super
        end

        # Overrides Hyrax::Ingest::Fetcher::Base#fetch
        # @return [Array<String>, Object] The text of each matching node, or
        #   `default` when no node matched. The result is memoized.
        def fetch
          return @fetched_value if @fetched_value

          node_texts = noko.xpath(xpath).map(&:text)
          @fetched_value = node_texts.empty? ? default : node_texts
        end

        protected

        # Overrides
        # Hyrax::Ingest::Fetcher::Base#report_missing_required_value,
        # passing filename and xpath information. The report summary has
        # specific logic to look for these.
        def report_missing_required_value
          super(filename: filename, xpath: xpath)
        end

        # @return The parsed XML document with namespaces removed (memoized).
        def noko
          @noko ||= begin
            document = Nokogiri::XML(sip.read_file(filename))
            # TODO: allow using namespaces instead of blindly removing them.
            document.remove_namespaces!
            document
          end
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'hyrax/ingest/fetcher/base'
2
+ require 'hyrax/ingest/has_sip'
3
+ require 'yaml'
4
+
5
module Hyrax
  module Ingest
    module Fetcher
      # Fetches a value from a YAML file read from the SIP by digging into
      # the parsed structure along `yaml_path`.
      class YAMLFile < Base
        include HasSIP

        inherit_callbacks_for :fetch

        attr_reader :filename, :yaml_path

        # @param [Hash] options
        # @option options [String] :filename (required) Name of the YAML file in the SIP.
        # @option options :yaml_path (required) Key(s) passed to `dig` on the parsed YAML.
        # @raise [ArgumentError] When :filename or :yaml_path is missing.
        def initialize(options={})
          unless options.key?(:filename)
            raise ArgumentError, "Required option :filename is missing"
          end
          unless options.key?(:yaml_path)
            raise ArgumentError, "Required option :yaml_path is missing"
          end

          @filename = options[:filename]
          @yaml_path = options[:yaml_path]
          super
        end

        # @return The value found at `yaml_path` in the parsed YAML (memoized).
        def fetch
          return @fetched_value if @fetched_value

          @fetched_value = yaml.dig(*yaml_path)
        end

        private

        # NOTE(review): YAML.load is used on file content coming from the
        # SIP; if SIPs can come from untrusted sources, consider
        # YAML.safe_load.
        # @return [Hash, Array, String] The YAML parsed into Ruby objects.
        def yaml
          @yaml ||= YAML.load(sip.read_file(filename))
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # Simple interface for injecting a depositor dependency.
2
+ require 'hyrax/ingest/sip'
3
+ require 'hyrax/ingest/errors'
4
+
5
module Hyrax
  module Ingest
    # Mixin that gives the including class a `depositor` reader and writer.
    module HasDepositor
      # Declared once on the module itself, so every includer gets the
      # accessor through the mixin; no `included` hook is needed.
      attr_accessor :depositor
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # Simple interface for setting and reading the current iteration number.
2
+
3
+ require 'hyrax/ingest/sip'
4
+ require 'hyrax/ingest/errors'
5
+
6
module Hyrax
  module Ingest
    # Mixin that stores an integer iteration counter on the including object.
    module HasIteration
      # @return [Integer] The current iteration; initialized to 0 when unset.
      def iteration
        @iteration = 0 unless @iteration
        @iteration
      end

      # @param iteration The new iteration; it is converted with `to_i`.
      def iteration=(iteration)
        @iteration = iteration.to_i
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'logger'
2
+
3
module Hyrax
  module Ingest
    # Mixin that gives the including object a lazily created Logger, plus a
    # module-level configuration for where default loggers write to.
    module HasLogger
      # Holds the destination used when building a default logger.
      class Configuration
        attr_accessor :default_log_to

        def initialize
          @default_log_to = STDOUT
        end
      end

      # @return [Configuration] The shared configuration (memoized on the module).
      def self.config
        @config ||= Configuration.new
      end

      # @return [Logger] The assigned logger, or a new one writing to the
      #   configured default destination (memoized).
      def logger
        @logger ||= Logger.new(HasLogger.config.default_log_to)
      end

      # @param [Logger] new_logger
      # @raise [Hyrax::Ingest::Errors::InvalidLogger] When the argument is not a Logger.
      def logger=(new_logger)
        unless new_logger.is_a?(Logger)
          raise Hyrax::Ingest::Errors::InvalidLogger
        end

        @logger = new_logger
      end
    end
  end
end