fedora-migrate 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +1 -0
- data/LICENSE +14 -0
- data/README.md +201 -6
- data/config/jetty.yml +2 -2
- data/fedora-migrate.gemspec +7 -6
- data/lib/fedora-migrate.rb +7 -24
- data/lib/fedora_migrate/content_mover.rb +49 -0
- data/lib/fedora_migrate/datastream_mover.rb +19 -34
- data/lib/fedora_migrate/datastream_verification.rb +36 -0
- data/lib/fedora_migrate/dates_mover.rb +14 -0
- data/lib/fedora_migrate/hooks.rb +23 -0
- data/lib/fedora_migrate/migration_options.rb +18 -0
- data/lib/fedora_migrate/mover.rb +12 -0
- data/lib/fedora_migrate/object_mover.rb +20 -9
- data/lib/fedora_migrate/rdf_datastream_mover.rb +31 -14
- data/lib/fedora_migrate/rels_ext_datastream_mover.rb +28 -62
- data/lib/fedora_migrate/repository_migrator.rb +30 -25
- data/lib/fedora_migrate/rubydora_connection.rb +0 -2
- data/lib/fedora_migrate/target_constructor.rb +39 -0
- data/lib/fedora_migrate/version.rb +1 -1
- data/spec/fixtures/objects/scholarsphere_5712mc568.xml +7284 -0
- data/spec/fixtures/objects/scholarsphere_7d279232g.xml +20120 -0
- data/spec/fixtures/objects/scholarsphere_sf2686078.xml +8823 -0
- data/spec/fixtures/objects/scholarsphere_x346dj04v.xml +188 -0
- data/spec/fixtures/objects/scholarsphere_x346dj06d.xml +255 -0
- data/spec/fixtures/objects/scholarsphere_x346dj08z.xml +1242 -0
- data/spec/fixtures/objects/sufia_5m60qr94g.xml +68 -0
- data/spec/fixtures/objects/sufia_5m60qr95r.xml +133 -0
- data/spec/fixtures/objects/sufia_5m60qr961.xml +133 -0
- data/spec/fixtures/objects/sufia_5m60qr979.xml +118 -0
- data/spec/integration/content_versions_spec.rb +24 -1
- data/spec/integration/missing_relationships_spec.rb +30 -0
- data/spec/integration/object_migration_spec.rb +49 -5
- data/spec/integration/rdf_migration_spec.rb +38 -13
- data/spec/integration/relationship_migration_spec.rb +10 -9
- data/spec/integration/repository_migration_spec.rb +46 -19
- data/spec/integration/versions_spec.rb +32 -0
- data/spec/spec_helper.rb +8 -1
- data/spec/support/example_model.rb +56 -0
- data/spec/unit/content_mover_spec.rb +78 -0
- data/spec/unit/datastream_verification_spec.rb +60 -0
- data/spec/unit/dates_mover_spec.rb +33 -0
- data/spec/unit/migration_options_spec.rb +61 -0
- data/spec/unit/mover_spec.rb +35 -1
- data/spec/unit/object_mover_spec.rb +1 -3
- data/spec/unit/rels_ext_datastream_mover_spec.rb +28 -18
- data/spec/unit/repository_migrator_spec.rb +16 -5
- data/spec/unit/target_constructor_spec.rb +34 -0
- data/tasks/dev.rake +1 -1
- metadata +80 -38
- data/LICENSE.txt +0 -22
- data/lib/fedora_migrate/rdf_datastream_parser.rb +0 -29
- data/lib/fedora_migrate/triple_converter.rb +0 -39
- data/spec/fixtures/datastreams/rdf_ntriples_datastream.txt +0 -2
- data/spec/unit/rdf_datastream_mover_spec.rb +0 -8
- data/spec/unit/rdf_datastream_parser_spec.rb +0 -38
- data/spec/unit/triple_converter_spec.rb +0 -35
@@ -0,0 +1,36 @@
|
|
1
|
+
module FedoraMigrate::DatastreamVerification
|
2
|
+
|
3
|
+
attr_accessor :datastream
|
4
|
+
|
5
|
+
def valid? datastream=nil
|
6
|
+
@datastream = datastream || @source
|
7
|
+
check = has_matching_checksums? || has_matching_nokogiri_checksums?
|
8
|
+
FedoraMigrate::Logger.warn "#{@datastream.pid} datastream #{@datastream.dsid} validation failed" unless check
|
9
|
+
check
|
10
|
+
end
|
11
|
+
|
12
|
+
def has_matching_checksums?
|
13
|
+
datastream.checksum == target_checksum || checksum(datastream.content) == target_checksum
|
14
|
+
end
|
15
|
+
|
16
|
+
def has_matching_nokogiri_checksums?
|
17
|
+
return false unless datastream.mimeType == "text/xml"
|
18
|
+
checksum(Nokogiri::XML(datastream.content).to_xml) == checksum(Nokogiri::XML(target_content).to_xml)
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def target_checksum
|
24
|
+
target.digest.first.to_s.split(/:/).last
|
25
|
+
end
|
26
|
+
|
27
|
+
# In some cases, the data is in ldp_source but target.content is empty, so we check both places
|
28
|
+
def target_content
|
29
|
+
target.content.empty? ? target.ldp_source.content : target.content
|
30
|
+
end
|
31
|
+
|
32
|
+
def checksum content
|
33
|
+
Digest::SHA1.hexdigest(content)
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module FedoraMigrate
|
2
|
+
class DatesMover < Mover
|
3
|
+
|
4
|
+
def migrate
|
5
|
+
if source.respond_to?(:createdDate) && target.respond_to?(:date_uploaded)
|
6
|
+
target.date_uploaded = source.createdDate
|
7
|
+
end
|
8
|
+
if source.respond_to?(:lastModifiedDate) && target.respond_to?(:date_modified)
|
9
|
+
target.date_modified = source.lastModifiedDate
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
end
|
data/lib/fedora_migrate/hooks.rb
CHANGED
@@ -1,11 +1,34 @@
|
|
1
|
+
# Override these methods to perform additional actions before and after
|
2
|
+
# migration of objects and datastreams.
|
3
|
+
#
|
4
|
+
# To do so, simply define a FedoraMigrate::Hooks module anywhere in
|
5
|
+
# your application and substitute methods for the ones listed below
|
1
6
|
module FedoraMigrate
|
2
7
|
module Hooks
|
3
8
|
|
9
|
+
# Called from FedoraMigrate::ObjectMover
|
4
10
|
def before_object_migration
|
5
11
|
end
|
6
12
|
|
13
|
+
# Called from FedoraMigrate::ObjectMover
|
7
14
|
def after_object_migration
|
8
15
|
end
|
9
16
|
|
17
|
+
# Called from FedoraMigrate::RDFDatastreamMover
|
18
|
+
def before_rdf_datastream_migration
|
19
|
+
end
|
20
|
+
|
21
|
+
# Called from FedoraMigrate::RDFDatastreamMover
|
22
|
+
def after_rdf_datastream_migration
|
23
|
+
end
|
24
|
+
|
25
|
+
# Called from FedoraMigrate::DatastreamMover
|
26
|
+
def before_datastream_migration
|
27
|
+
end
|
28
|
+
|
29
|
+
# Called from FedoraMigrate::DatastreamMover
|
30
|
+
def after_datastream_migration
|
31
|
+
end
|
32
|
+
|
10
33
|
end
|
11
34
|
end
|
@@ -7,5 +7,23 @@ module FedoraMigrate
|
|
7
7
|
self.conversions = options.nil? ? [] : [options[:convert]].flatten
|
8
8
|
end
|
9
9
|
|
10
|
+
def forced?
|
11
|
+
option_true?(:force)
|
12
|
+
end
|
13
|
+
|
14
|
+
def not_forced?
|
15
|
+
!forced?
|
16
|
+
end
|
17
|
+
|
18
|
+
def application_creates_versions?
|
19
|
+
option_true?(:application_creates_versions)
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def option_true?(name)
|
25
|
+
!!(options && options[name])
|
26
|
+
end
|
27
|
+
|
10
28
|
end
|
11
29
|
end
|
data/lib/fedora_migrate/mover.rb
CHANGED
@@ -39,6 +39,18 @@ module FedoraMigrate
|
|
39
39
|
target.inspect
|
40
40
|
end
|
41
41
|
end
|
42
|
+
|
43
|
+
def id_component object=nil
|
44
|
+
object ||= source
|
45
|
+
raise FedoraMigrate::Errors::MigrationError, "can't get the id component without an object" if object.nil?
|
46
|
+
self.class.id_component(object)
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.id_component object
|
50
|
+
return object.pid.split(/:/).last if object.kind_of?(Rubydora::DigitalObject)
|
51
|
+
return object.to_s.split(/:/).last if object.respond_to?(:to_s)
|
52
|
+
nil
|
53
|
+
end
|
42
54
|
|
43
55
|
end
|
44
56
|
end
|
@@ -5,9 +5,10 @@ module FedoraMigrate
|
|
5
5
|
|
6
6
|
def migrate
|
7
7
|
prepare_target
|
8
|
-
migrate_content_datastreams
|
9
8
|
conversions.collect { |ds| convert_rdf_datastream(ds) }
|
9
|
+
migrate_content_datastreams
|
10
10
|
migrate_permissions
|
11
|
+
migrate_dates
|
11
12
|
complete_target
|
12
13
|
end
|
13
14
|
|
@@ -19,7 +20,6 @@ module FedoraMigrate
|
|
19
20
|
def prepare_target
|
20
21
|
Logger.info "running before_object_migration hooks"
|
21
22
|
before_object_migration
|
22
|
-
save
|
23
23
|
end
|
24
24
|
|
25
25
|
def complete_target
|
@@ -30,20 +30,28 @@ module FedoraMigrate
|
|
30
30
|
|
31
31
|
private
|
32
32
|
|
33
|
+
# We have to call save before migrating content datastreams, otherwise versions aren't recorded
|
34
|
+
# TODO: this will fail if required fields are defined in a descMetadata datastream that is not
|
35
|
+
# converted to RDF (issue #8)
|
33
36
|
def migrate_content_datastreams
|
37
|
+
save
|
34
38
|
target.attached_files.keys.each do |ds|
|
35
|
-
mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s])
|
39
|
+
mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s], options)
|
36
40
|
mover.migrate
|
37
41
|
end
|
38
42
|
end
|
39
43
|
|
40
44
|
def convert_rdf_datastream ds
|
41
|
-
if source.datastreams.
|
42
|
-
mover = FedoraMigrate::RDFDatastreamMover.new(
|
45
|
+
if source.datastreams.key?(ds)
|
46
|
+
mover = FedoraMigrate::RDFDatastreamMover.new(datastream_content(ds), target)
|
43
47
|
mover.migrate
|
44
48
|
end
|
45
49
|
end
|
46
50
|
|
51
|
+
def datastream_content(dsid)
|
52
|
+
source.datastreams[dsid.to_s]
|
53
|
+
end
|
54
|
+
|
47
55
|
def migrate_permissions
|
48
56
|
if source.datastreams.keys.include?(RIGHTS_DATASTREAM) && target.respond_to?(:permissions)
|
49
57
|
mover = FedoraMigrate::PermissionsMover.new(source.datastreams[RIGHTS_DATASTREAM], target)
|
@@ -51,12 +59,15 @@ module FedoraMigrate
|
|
51
59
|
end
|
52
60
|
end
|
53
61
|
|
62
|
+
def migrate_dates
|
63
|
+
FedoraMigrate::DatesMover.new(source, target).migrate
|
64
|
+
end
|
65
|
+
|
54
66
|
def create_target_model
|
55
|
-
|
56
|
-
|
57
|
-
@target =
|
67
|
+
builder = FedoraMigrate::TargetConstructor.new(source.models).build
|
68
|
+
raise FedoraMigrate::Errors::MigrationError, "No qualified targets found in #{source.pid}" if builder.target.nil?
|
69
|
+
@target = builder.target.new(id: id_component)
|
58
70
|
end
|
59
71
|
|
60
72
|
end
|
61
|
-
|
62
73
|
end
|
@@ -1,28 +1,45 @@
|
|
1
|
+
require 'rchardet'
|
2
|
+
|
1
3
|
module FedoraMigrate
|
2
4
|
class RDFDatastreamMover < Mover
|
3
5
|
|
4
6
|
def migrate
|
5
7
|
Logger.info "converting datastream '#{source.dsid}' to RDF"
|
6
|
-
|
7
|
-
|
8
|
+
before_rdf_datastream_migration
|
9
|
+
migrate_rdf_triples
|
10
|
+
after_rdf_datastream_migration
|
8
11
|
save
|
9
12
|
end
|
10
13
|
|
11
|
-
def
|
12
|
-
|
13
|
-
parser.parse
|
14
|
-
parser.statements.each do |statement|
|
15
|
-
target.resource << statement
|
16
|
-
end
|
14
|
+
def migrate_rdf_triples
|
15
|
+
target.resource << updated_graph
|
17
16
|
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
18
|
+
private
|
19
|
+
|
20
|
+
def updated_graph
|
21
|
+
reader.new(updated_datastream_content)
|
22
|
+
end
|
23
|
+
|
24
|
+
def updated_datastream_content
|
25
|
+
correct_encoding(datastream_content).gsub(/<.+#{source.pid}>/,"<#{target.uri}>")
|
24
26
|
end
|
25
|
-
end
|
26
27
|
|
28
|
+
def datastream_content
|
29
|
+
source.content
|
30
|
+
end
|
31
|
+
|
32
|
+
# Scholarsphere has some ISO-8859 encoded data, which violates the NTriples spec.
|
33
|
+
# Here we correct that.
|
34
|
+
def correct_encoding(input)
|
35
|
+
input.encode!(Encoding::UTF_8)
|
36
|
+
rescue Encoding::UndefinedConversionError
|
37
|
+
cd = ::CharDet.detect(input)
|
38
|
+
input.force_encoding(Encoding.find(cd["encoding"].upcase)).encode!(Encoding::UTF_8)
|
39
|
+
end
|
40
|
+
|
41
|
+
def reader
|
42
|
+
RDF::Reader.for(:ntriples)
|
43
|
+
end
|
27
44
|
end
|
28
45
|
end
|
@@ -1,89 +1,55 @@
|
|
1
|
-
require 'rubydora'
|
2
1
|
module FedoraMigrate
|
3
2
|
class RelsExtDatastreamMover < Mover
|
4
3
|
|
5
|
-
attr_accessor :relationships, :ng_xml, :subject
|
6
|
-
|
7
|
-
RELS_EXT = Rubydora::RelationshipsMixin::RELS_EXT
|
8
4
|
RELS_EXT_DATASTREAM = "RELS-EXT".freeze
|
9
5
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
parse_relationships if has_relationships?
|
15
|
-
end
|
16
|
-
|
17
|
-
def has_relationships?
|
18
|
-
source.datastreams.keys.include?(RELS_EXT_DATASTREAM)
|
6
|
+
def migrate
|
7
|
+
migrate_statements
|
8
|
+
target.ldp_source.update
|
9
|
+
update_index
|
19
10
|
end
|
20
11
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
objects.collect { |object| migrate_incomming_relationship(predicate, object) }
|
26
|
-
else
|
27
|
-
migrate_outgoing_relationship(predicate, objects)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
12
|
+
def post_initialize
|
13
|
+
@target ||= ActiveFedora::Base.find(id_component)
|
14
|
+
rescue ActiveFedora::ObjectNotFoundError
|
15
|
+
raise FedoraMigrate::Errors::MigrationError, "Target object was not found in Fedora 4. Did you migrate it?"
|
31
16
|
end
|
32
17
|
|
33
18
|
private
|
34
19
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
query = "//ns0:"+RELS_EXT[key].split(/#/).last
|
39
|
-
relationships[key.to_sym] = query_results(query)
|
20
|
+
def migrate_statements
|
21
|
+
statements.each do |statement|
|
22
|
+
target.ldp_source.graph << [target.rdf_subject, migrate_predicate(statement.predicate), migrate_object(statement.object)]
|
40
23
|
end
|
41
24
|
end
|
42
25
|
|
43
|
-
def
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
return results
|
26
|
+
def update_index
|
27
|
+
target.reload
|
28
|
+
target.update_index
|
48
29
|
end
|
49
30
|
|
50
|
-
def
|
51
|
-
@
|
52
|
-
rescue ActiveFedora::ObjectNotFoundError
|
53
|
-
raise FedoraMigrate::Errors::MigrationError, "Source was not found in Fedora4. Did you migrated it?"
|
31
|
+
def graph
|
32
|
+
@graph ||= RDF::Graph.new { |g| g.from_rdfxml(source.datastreams[RELS_EXT_DATASTREAM].content) }
|
54
33
|
end
|
55
34
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
raise FedoraMigrate::Errors::MigrationError, "Could not find object with id #{id}"
|
35
|
+
# Override this if any predicate transformation is needed
|
36
|
+
def migrate_predicate(fc3_uri)
|
37
|
+
fc3_uri
|
60
38
|
end
|
61
39
|
|
62
|
-
|
63
|
-
|
64
|
-
Logger.info "adding #{subject.id} to #{object.id} with predicate #{predicate.to_s}"
|
65
|
-
object.reflections.each do |key, association|
|
66
|
-
unless association.predicate.to_s.split(/#/).empty?
|
67
|
-
if association.predicate.to_s.split(/#/).last.gsub(/is/,"").underscore == predicate.to_s
|
68
|
-
object.send(key.to_s) << subject
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
40
|
+
def migrate_object(fc3_uri)
|
41
|
+
RDF::URI.new(ActiveFedora::Base.id_to_uri(id_component(fc3_uri)))
|
72
42
|
end
|
73
43
|
|
74
|
-
|
75
|
-
|
76
|
-
Logger.
|
77
|
-
|
78
|
-
if key.to_s.match(/_ids$/)
|
79
|
-
subject.send(key.to_s+"=", objects.collect { |o| o.id })
|
80
|
-
subject.save
|
81
|
-
end
|
82
|
-
end
|
44
|
+
def has_missing_object?(statement)
|
45
|
+
return false if ActiveFedora::Base.exists?(id_component(statement.object))
|
46
|
+
Logger.warn "#{source.pid} could not migrate relationship #{statement.predicate} because #{statement.object} doesn't exist in Fedora 4"
|
47
|
+
true
|
83
48
|
end
|
84
49
|
|
85
|
-
|
86
|
-
|
50
|
+
# All the graph statements except hasModel and those with missing objects
|
51
|
+
def statements
|
52
|
+
graph.statements.reject { |stmt| stmt.predicate == ActiveFedora::RDF::Fcrepo::Model.hasModel || has_missing_object?(stmt) }
|
87
53
|
end
|
88
54
|
|
89
55
|
end
|
@@ -3,50 +3,55 @@ module FedoraMigrate
|
|
3
3
|
|
4
4
|
include MigrationOptions
|
5
5
|
|
6
|
-
attr_accessor :source_objects, :
|
6
|
+
attr_accessor :source_objects, :namespace, :failed
|
7
7
|
|
8
8
|
def initialize namespace = nil, options = {}
|
9
9
|
@namespace = namespace || repository_namespace
|
10
10
|
@options = options
|
11
|
+
@failed = 0
|
11
12
|
@source_objects = get_source_objects
|
12
|
-
@results = []
|
13
13
|
conversion_options
|
14
14
|
end
|
15
15
|
|
16
|
+
# TODO: need a reporting mechanism for results (issue #4)
|
16
17
|
def migrate_objects
|
17
|
-
source_objects.each
|
18
|
-
|
19
|
-
begin
|
20
|
-
results << { source.pid => [FedoraMigrate::ObjectMover.new(source, nil, options).migrate] }
|
21
|
-
rescue NameError => e
|
22
|
-
results << { source.pid => e.to_s }
|
23
|
-
rescue FedoraMigrate::Errors::MigrationError => e
|
24
|
-
results << { source.pid => e.to_s }
|
25
|
-
end
|
26
|
-
end
|
18
|
+
source_objects.each { |source| migrate_object(source) }
|
19
|
+
@failed == 0
|
27
20
|
end
|
28
21
|
|
29
22
|
# TODO: need a reporting mechanism for results (issue #4)
|
30
23
|
def migrate_relationships
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
results << { source.pid => e.to_s }
|
37
|
-
rescue ActiveFedora::AssociationTypeMismatch => e
|
38
|
-
results << { source.pid => e.to_s }
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
# TODO: page through all the objects (issue #6)
|
24
|
+
return "Relationship migration halted because #{failed.to_s} objects didn't migrate successfully." if failed > 0 && not_forced?
|
25
|
+
source_objects.each { |source| migrate_relationship(source) }
|
26
|
+
@failed == 0
|
27
|
+
end
|
28
|
+
|
44
29
|
def get_source_objects
|
45
30
|
FedoraMigrate.source.connection.search(nil).collect { |o| qualifying_object(o) }.compact
|
46
31
|
end
|
47
32
|
|
48
33
|
private
|
49
34
|
|
35
|
+
def migrate_object source
|
36
|
+
Logger.info "Migrating source object #{source.pid}"
|
37
|
+
FedoraMigrate::ObjectMover.new(source, nil, options).migrate
|
38
|
+
rescue StandardError => e
|
39
|
+
Logger.warn "#{source.pid} failed.\n#{error_message(e)}"
|
40
|
+
@failed = @failed + 1
|
41
|
+
end
|
42
|
+
|
43
|
+
def migrate_relationship source
|
44
|
+
Logger.info "Migrating relationships for source object #{source.pid}"
|
45
|
+
FedoraMigrate::RelsExtDatastreamMover.new(source).migrate
|
46
|
+
rescue StandardError => e
|
47
|
+
Logger.warn "#{source.pid} relationship migration failed.\n#{error_message(e)}"
|
48
|
+
@failed = @failed + 1
|
49
|
+
end
|
50
|
+
|
51
|
+
def error_message e
|
52
|
+
[e.inspect, e.backtrace.join("\n\t")].join("\n\t")
|
53
|
+
end
|
54
|
+
|
50
55
|
def repository_namespace
|
51
56
|
FedoraMigrate.source.connection.repository_profile["repositoryPID"]["repositoryPID"].split(/:/).first.strip
|
52
57
|
end
|