hyrax-ingest 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +52 -0
- data/config/routes.rb +2 -0
- data/lib/hyrax/ingest.rb +12 -0
- data/lib/hyrax/ingest/batch_runner.rb +130 -0
- data/lib/hyrax/ingest/configuration.rb +54 -0
- data/lib/hyrax/ingest/engine.rb +6 -0
- data/lib/hyrax/ingest/errors.rb +186 -0
- data/lib/hyrax/ingest/fetcher.rb +55 -0
- data/lib/hyrax/ingest/fetcher/base.rb +78 -0
- data/lib/hyrax/ingest/fetcher/csv_file.rb +89 -0
- data/lib/hyrax/ingest/fetcher/date_time.rb +15 -0
- data/lib/hyrax/ingest/fetcher/literal.rb +24 -0
- data/lib/hyrax/ingest/fetcher/premis_event_type.rb +28 -0
- data/lib/hyrax/ingest/fetcher/rdf_uri.rb +21 -0
- data/lib/hyrax/ingest/fetcher/xml_file.rb +54 -0
- data/lib/hyrax/ingest/fetcher/yaml_file.rb +36 -0
- data/lib/hyrax/ingest/has_depositor.rb +13 -0
- data/lib/hyrax/ingest/has_iteration.rb +18 -0
- data/lib/hyrax/ingest/has_logger.rb +29 -0
- data/lib/hyrax/ingest/has_report.rb +17 -0
- data/lib/hyrax/ingest/has_shared_sip.rb +20 -0
- data/lib/hyrax/ingest/has_sip.rb +20 -0
- data/lib/hyrax/ingest/ingester.rb +75 -0
- data/lib/hyrax/ingest/ingester/active_fedora_base_ingester.rb +136 -0
- data/lib/hyrax/ingest/ingester/active_fedora_file_ingester.rb +17 -0
- data/lib/hyrax/ingest/ingester/active_fedora_property_assigner.rb +67 -0
- data/lib/hyrax/ingest/ingester/base.rb +28 -0
- data/lib/hyrax/ingest/ingester/file_set_ingester.rb +68 -0
- data/lib/hyrax/ingest/ingester/preservation_event_ingester.rb +27 -0
- data/lib/hyrax/ingest/ingester/work_ingester.rb +55 -0
- data/lib/hyrax/ingest/reporting.rb +13 -0
- data/lib/hyrax/ingest/reporting/configuration.rb +22 -0
- data/lib/hyrax/ingest/reporting/report.rb +79 -0
- data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.html.erb +77 -0
- data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.txt.erb +21 -0
- data/lib/hyrax/ingest/runner.rb +103 -0
- data/lib/hyrax/ingest/sip.rb +92 -0
- data/lib/hyrax/ingest/transformer.rb +42 -0
- data/lib/hyrax/ingest/transformer/base.rb +12 -0
- data/lib/hyrax/ingest/transformer/to_date.rb +33 -0
- data/lib/hyrax/ingest/version.rb +5 -0
- data/lib/tasks/ingest_tasks.rake +22 -0
- metadata +330 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/xml_file'
|
2
|
+
require 'hyrax/ingest/fetcher/yaml_file'
|
3
|
+
require 'hyrax/ingest/fetcher/csv_file'
|
4
|
+
require 'hyrax/ingest/fetcher/literal'
|
5
|
+
require 'hyrax/ingest/fetcher/premis_event_type'
|
6
|
+
require 'hyrax/ingest/fetcher/rdf_uri'
|
7
|
+
require 'hyrax/ingest/fetcher/date_time'
|
8
|
+
require 'hyrax/ingest/errors'
|
9
|
+
|
10
|
+
require 'set' # Set is only loaded automatically on newer Rubies.

module Hyrax
  module Ingest
    # Registry and factory for the fetcher classes known to the gem.
    module Fetcher
      class << self
        # @return [Set<Class>] The set of all fetcher classes.
        def all_classes
          @all_classes ||= Set[
            Hyrax::Ingest::Fetcher::XMLFile,
            Hyrax::Ingest::Fetcher::YAMLFile,
            Hyrax::Ingest::Fetcher::CSVFile,
            Hyrax::Ingest::Fetcher::Literal,
            Hyrax::Ingest::Fetcher::PremisEventType,
            Hyrax::Ingest::Fetcher::RdfUri,
            Hyrax::Ingest::Fetcher::DateTime
          ]
        end

        # Builds a fetcher instance from a class name and constructor options.
        #
        # @param [String] class_name The fetcher class name, with or without
        #   namespaces (see #find_class_by_name).
        # @param [Object] options The value passed to the constructor of the
        #   fetcher class. nil is treated as an empty hash.
        # @return An instance of the fetcher class.
        def factory(class_name, options={})
          options ||= {}
          find_class_by_name(class_name).new(options)
        end

        # Looks up a fetcher class by name.
        #
        # Exact matches win: either the fully qualified class name, or a
        # trailing portion of it that starts at a namespace boundary (e.g.
        # "XMLFile" or "Fetcher::XMLFile"). Only when there is no exact match
        # does the lookup fall back to the looser "appears after a `::`"
        # match, so a class whose short name is a prefix of another class's
        # name can still be resolved unambiguously.
        #
        # @param [String] class_name The stringified class name, with or
        #   without namespaces.
        # @raise [Hyrax::Ingest::Errors::UnknownFetcherClass] When there is no
        #   corresponding fetcher class for the given value of the `class_name`
        #   param.
        # @raise [Hyrax::Ingest::Errors::AmbiguousFetcherClass] When the value of
        #   `class_name` param is insufficient in determining a fetcher class.
        # @return [Class] The appropriate fetcher class.
        def find_class_by_name(class_name)
          name = class_name.to_s
          # Escape the name so regexp metacharacters are matched literally
          # instead of raising RegexpError or matching unrelated classes.
          escaped = Regexp.escape(name)

          found_classes = all_classes.select do |class_const|
            (class_const.to_s == name) || (class_const.to_s =~ /::#{escaped}\z/)
          end

          if found_classes.empty?
            found_classes = all_classes.select do |class_const|
              class_const.to_s =~ /::#{escaped}/
            end
          end

          raise Hyrax::Ingest::Errors::UnknownFetcherClass.new(class_name, all_classes) if found_classes.count == 0
          raise Hyrax::Ingest::Errors::AmbiguousFetcherClass.new(class_name, found_classes) if found_classes.count > 1
          found_classes.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'hyrax/ingest/errors'
|
2
|
+
require 'hyrax/ingest/sip'
|
3
|
+
require 'interloper'
|
4
|
+
require 'hyrax/ingest/has_logger'
|
5
|
+
require 'hyrax/ingest/has_report'
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Common behavior for all fetchers: remembers whether the fetched value
      # is required and, after #fetch runs, reports a missing required value.
      class Base
        include Interloper
        include HasReport
        include HasLogger

        # Callback to log missing values that have been marked as required.
        after(:fetch) do
          # Use a flag to only report the missing value once.
          unless @after_fetch_run_once
            report_missing_required_value if required? && fetched_value_is_empty?
            @after_fetch_run_once = true
          end
        end

        # @param [Hash] options
        # @option options [Boolean] :required Whether a value must be fetched.
        def initialize(options={})
          # Read the option instead of deleting it so the hash supplied by
          # the caller is left unmodified.
          @required = options[:required]
        end

        # Boolean reader for @required.
        def required?; !!@required; end

        # Subclasses should override this method with the logic required to
        # fetch values from a SIP. The overridden method should set
        # @fetched_value, and return it.
        # @see Hyrax::Ingest::Fetcher::XMLFile#fetch
        # @abstract
        def fetch
          @fetched_value
        end

        protected

          # Determines whether the value that was fetched by #fetch is empty
          # or not. By default, the fetched value is empty if it is nil, an
          # empty string, an empty array, an empty hash, or an array
          # containing any combination of those. Overwrite this method in
          # subclasses to change the definition of empty in those contexts.
          # The return value is used for reporting which values are missing,
          # but required.
          # @return [Boolean] True if @fetched_value is considered to be empty; false otherwise.
          def fetched_value_is_empty?
            Array(@fetched_value).all? do |val|
              val.nil? || (val.respond_to?(:empty?) && val.empty?)
            end
          end

          # Reports occurrences of missing required values: logs a warning
          # and appends `params` to the report's :missing_required_values
          # stat, keyed by this fetcher's class.
          # Subclasses should override this method to provide further detail
          # by passing an options hash that will be available for reporting.
          # @see Hyrax::Ingest::Fetcher::XMLFile#report_missing_required_value
          # @example
          #   # Provide additional info to the report.
          #   def report_missing_required_value
          #     super(foo: "bar")
          #   end
          # @param [Hash] params Extra detail to log and record in the report.
          def report_missing_required_value(params={})
            # Strip everything up to and including the last namespace separator.
            short_class_name = self.class.to_s.sub(/.*::/, '')
            logger.warn "Missing required value from #{short_class_name} with params = #{params}"
            report.stat[:missing_required_values][self.class] ||= []
            report.stat[:missing_required_values][self.class] << params
          end
      end
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/base'
|
2
|
+
require 'hyrax/ingest/has_sip'
|
3
|
+
require 'hyrax/ingest/has_iteration'
|
4
|
+
require 'roo'
|
5
|
+
|
6
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetches the value of a single cell from a CSV file located through
      # the SIP.
      #
      # The column is given either as a header name or as a column number.
      # The row is given either as a number or as the keyword 'next', in
      # which case it is derived from the current iteration.
      #
      # NOTE(review): sibling fetchers (Literal, YAMLFile, PremisEventType)
      # call `inherit_callbacks_for :fetch`; this class does not. Confirm
      # whether Base's after(:fetch) callback still runs for CSVFile#fetch.
      class CSVFile < Base
        # :row_offset is deliberately not listed: it is read through the
        # private #row_offset method below, which applies the fallback to
        # the header row number.
        attr_reader :filename, :column, :row, :header_row_number

        include HasSIP
        include HasIteration

        # @param [Hash] options
        # @option options [String] :filename (required) Name of the CSV file.
        # @option options [String, Integer] :column (required) Header name or
        #   column number.
        # @option options [String, Integer] :row (required) Row number, or the
        #   keyword 'next'.
        # @option options [Integer] :header_row_number (1) Row holding the headers.
        # @option options [Integer] :row_offset Added to the row number;
        #   defaults to the header row number.
        # @raise [ArgumentError] When a required option is missing.
        def initialize(options={})
          raise ArgumentError, "Required option :filename is missing" unless options.key?(:filename)
          raise ArgumentError, "Required option :column is missing" unless options.key?(:column)
          raise ArgumentError, "Required option :row is missing" unless options.key?(:row)

          @filename = options.fetch(:filename)
          @column = options.fetch(:column, '').to_s.strip.downcase
          @row = options.fetch(:row, '').to_s.strip.downcase
          @header_row_number = options.fetch(:header_row_number, 1)
          @row_offset = options[:row_offset]
          super
        end

        # @return The value of the configured cell (memoized once truthy).
        def fetch
          @fetched_value ||= cell_value
        end

        private

          def roo
            @roo ||= Roo::CSV.new(sip.find_file_path(filename))
          end

          def cell_value
            roo.cell(row_number, column_number)
          end

          # @return The column number from which to retrieve the cell.
          # @raise [ArgumentError] When :column is neither a known header nor
          #   a number.
          def column_number
            @column_number ||= column_number_from_header || specific_column_number
            # TODO: custom error
            raise ArgumentError, "Value for column: option must be a number or a column header; '#{column}' was given." if @column_number.nil?
            @column_number
          end

          # @return [Integer, nil] 1-based position of the header matching
          #   :column, or nil when no header matches.
          def column_number_from_header
            header_index = headers.index(column)
            header_index + 1 if header_index
          end

          def specific_column_number
            column.to_i if string_is_integer?(column)
          end

          # @return [Integer] The configured row plus the row offset.
          # @raise [ArgumentError] When :row is neither a number nor 'next'.
          def row_number
            @row_number ||= next_row || specific_row_number
            # TODO: custom error
            raise ArgumentError, "Value for row: option must be a number or the keyword 'next'; #{@row} was given." if @row_number.nil?
            @row_number + row_offset
          end

          def next_row
            iteration + 1 if @row == 'next'
          end

          def specific_row_number
            row.to_i if string_is_integer?(row)
          end

          # Falls back to the header row number when no explicit offset was
          # given.
          def row_offset
            @row_offset || header_row_number
          end

          # @return [Array<String>] The downcased cells of the header row.
          def headers
            @headers ||= roo.row(header_row_number).map(&:to_s).map(&:downcase)
          end

          # @return [Boolean] True when `str`, ignoring leading zeros, is the
          #   canonical string form of an integer.
          def string_is_integer?(str)
            # Work on a copy: the argument is @row or @column, which must
            # keep its original text for later use in error messages.
            without_leading_zeros = str.to_s.sub(/\A0+/, '')
            without_leading_zeros.to_i.to_s == without_leading_zeros
          end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'hyrax/ingest/errors'
|
2
|
+
require 'hyrax/ingest/sip'
|
3
|
+
|
4
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher whose "fetched" value is simply the literal value it was
      # configured with.
      class Literal < Base
        attr_reader :literal_value

        inherit_callbacks_for :fetch

        # @param [Hash, Object] options Either a hash whose :value key holds
        #   the literal, or the literal itself (any non-Hash value).
        def initialize(options={})
          # Work on a private copy so that removing :value (and anything the
          # superclass removes) never alters the hash owned by the caller.
          options = options.is_a?(Hash) ? options.dup : { value: options }
          @literal_value = options.delete(:value)
          super(options)
        end

        # @return The literal value given at construction.
        def fetch
          @fetched_value = literal_value
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# This class adheres to the Hyrax::Ingest::Fetcher::Base interface to work with other classes within the Hyrax::Ingest
|
2
|
+
# gem. It effectively maps PREMIS Event Type abbreviations to the URI that is then associated with a
|
3
|
+
# Hyrax::Preservation::Event model.
|
4
|
+
|
5
|
+
require 'hyrax/ingest/errors'
|
6
|
+
require 'hyrax/ingest/sip'
|
7
|
+
|
8
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that resolves a PREMIS event type abbreviation to the URI of
      # the matching Hyrax::Preservation::PremisEventType.
      class PremisEventType < Base
        inherit_callbacks_for :fetch

        attr_reader :abbr

        # @param [Hash, Object] options Either a hash whose :abbr key holds
        #   the abbreviation, or the abbreviation itself (any non-Hash value).
        def initialize(options={})
          # Work on a private copy so that removing :abbr (and anything the
          # superclass removes) never alters the hash owned by the caller.
          options = options.is_a?(Hash) ? options.dup : { abbr: options }
          @abbr = options.delete(:abbr)
          super(options)
        end

        # NOTE(review): if find_by_abbr can return nil for an unknown
        # abbreviation, this raises NoMethodError -- confirm the lookup's
        # contract.
        # @return The `uri` of the event type found for the abbreviation.
        def fetch
          @fetched_value ||= Hyrax::Preservation::PremisEventType.find_by_abbr(@abbr).uri
        end
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
|
3
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that wraps a configured URI string in an RDF::URI.
      #
      # NOTE(review): sibling fetchers (Literal, YAMLFile, PremisEventType)
      # call `inherit_callbacks_for :fetch`; this class does not. Confirm
      # whether Base's after(:fetch) callback still runs for RdfUri#fetch.
      class RdfUri < Base
        attr_reader :uri_str

        # @param [Hash, Object] options Either a hash whose :uri_str key holds
        #   the URI string, or the URI string itself (any non-Hash value).
        def initialize(options={})
          # Work on a private copy so that removing :uri_str (and anything
          # the superclass removes) never alters the hash owned by the caller.
          options = options.is_a?(Hash) ? options.dup : { uri_str: options }
          @uri_str = options.delete(:uri_str)
          super(options)
        end

        # @return [RDF::URI] The URI built from the configured string.
        def fetch
          @fetched_value ||= ::RDF::URI.new(uri_str)
        end
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/base'
|
2
|
+
require 'hyrax/ingest/has_sip'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'hyrax/ingest/reporting'
|
5
|
+
require 'interloper'
|
6
|
+
|
7
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that evaluates an XPath expression against an XML file read
      # through the SIP and returns the text of every matching node.
      class XMLFile < Base
        include HasSIP

        attr_reader :filename, :xpath, :default, :fetched_value

        inherit_callbacks_after :fetch

        # @param [Hash] options
        # @option options [String] :filename (required) Name of the XML file.
        # @option options [String] :xpath (required) XPath expression to evaluate.
        # @option options :default ([]) Value used when the XPath matches nothing.
        # @raise [ArgumentError] When :filename or :xpath is missing.
        def initialize(options={})
          [:filename, :xpath].each do |required_key|
            next if options.key?(required_key)
            raise ArgumentError, "Required option :#{required_key} is missing"
          end

          @filename = options[:filename]
          @xpath = options[:xpath]
          @default = options[:default] || []
          super
        end

        # Overrides Hyrax::Ingest::Fetcher::Base#fetch
        # @return [Array<String>, Object] The text of each node matched in
        #   the XML file, or the configured default when nothing matched.
        def fetch
          return @fetched_value if @fetched_value

          node_texts = noko.xpath(xpath).map(&:text)
          @fetched_value = node_texts.empty? ? default : node_texts
        end

        protected

          # Overrides
          # Hyrax::Ingest::Fetcher::Base#report_missing_required_value,
          # passing filename and xpath information. The report summary has
          # specific logic to look for these.
          def report_missing_required_value
            super(filename: filename, xpath: xpath)
          end

          # @return The parsed XML document with namespaces removed (memoized).
          def noko
            return @noko if @noko

            document = Nokogiri::XML(sip.read_file(filename))
            # TODO: allow using namespaces instead of blindly removing them.
            document.remove_namespaces!
            @noko = document
          end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/base'
|
2
|
+
require 'hyrax/ingest/has_sip'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that reads a YAML file through the SIP and digs a value out
      # of the parsed structure.
      class YAMLFile < Base
        include HasSIP

        inherit_callbacks_for :fetch

        attr_reader :filename, :yaml_path

        # @param [Hash] options
        # @option options [String] :filename (required) Name of the YAML file.
        # @option options :yaml_path (required) Key, or list of keys/indexes,
        #   passed to #dig on the parsed YAML.
        # @raise [ArgumentError] When :filename or :yaml_path is missing.
        def initialize(options={})
          raise ArgumentError, "Required option :filename is missing" unless options.key?(:filename)
          raise ArgumentError, "Required option :yaml_path is missing" unless options.key?(:yaml_path)
          @filename = options[:filename]
          @yaml_path = options[:yaml_path]
          super
        end

        # @return The value found at yaml_path, or nil when the path does not
        #   exist or the document has nothing to dig into (e.g. an empty file
        #   or a bare scalar).
        def fetch
          @fetched_value ||= (yaml.dig(*yaml_path) if yaml.respond_to?(:dig))
        end

        private

          # SECURITY NOTE(review): YAML.load can instantiate arbitrary Ruby
          # objects on Psych versions older than 4. If SIP files may come
          # from an untrusted source, consider YAML.safe_load here.
          # @return [Hash, Array, String] The YAML parsed into Ruby objects.
          def yaml
            @yaml ||= YAML.load sip.read_file(filename)
          end
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Simple interface for injecting a depositor dependency.
|
2
|
+
require 'hyrax/ingest/sip'
|
3
|
+
require 'hyrax/ingest/errors'
|
4
|
+
|
5
|
+
module Hyrax
  module Ingest
    # Mixin that adds a readable and writable `depositor` attribute.
    #
    # The accessor is declared directly on the module, so it is available
    # however the module is mixed in (include or extend) and is defined once
    # when the module is loaded, not as a side effect of the first include.
    module HasDepositor
      # @!attribute depositor
      #   @return The depositor assigned to this object, or nil if unset.
      attr_accessor :depositor
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Simple interface for getting and setting an iteration counter.
|
2
|
+
|
3
|
+
require 'hyrax/ingest/sip'
|
4
|
+
require 'hyrax/ingest/errors'
|
5
|
+
|
6
|
+
module Hyrax
  module Ingest
    # Mixin that gives an object an integer iteration counter.
    module HasIteration
      # Reads the counter, initializing it to 0 on first access.
      # @return [Integer]
      def iteration
        @iteration = 0 unless @iteration
        @iteration
      end

      # Stores the given value after coercing it with #to_i.
      # @param iteration [#to_i]
      def iteration=(iteration)
        coerced = iteration.to_i
        @iteration = coerced
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Hyrax
  module Ingest
    # Mixin that gives an object a lazily created, replaceable logger.
    module HasLogger
      # Severity methods an object must respond to in order to be accepted
      # as a logger when it is not a ::Logger instance.
      LOGGER_INTERFACE = %i[debug info warn error fatal].freeze

      # @return The assigned logger, or a new Logger writing to the
      #   configured default destination.
      def logger
        @logger ||= Logger.new(HasLogger.config.default_log_to)
      end

      # Assigns the logger. Accepts any ::Logger, and also any object that
      # responds to all of the severity methods in LOGGER_INTERFACE, so that
      # logger-compatible objects which do not subclass ::Logger can be used.
      # @raise [Hyrax::Ingest::Errors::InvalidLogger] When the given object is
      #   not usable as a logger.
      def logger=(logger)
        usable = logger.is_a?(Logger) ||
                 LOGGER_INTERFACE.all? { |severity| logger.respond_to?(severity) }
        raise Hyrax::Ingest::Errors::InvalidLogger unless usable
        @logger = logger
      end

      class << self
        # @return [Configuration] The shared, memoized configuration.
        def config
          @config ||= Configuration.new
        end
      end

      # Holds the destination used for loggers created by default.
      class Configuration
        attr_accessor :default_log_to

        def initialize
          @default_log_to = STDOUT
        end
      end
    end
  end
end
|