fedora-migrate 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +1 -0
- data/LICENSE +14 -0
- data/README.md +201 -6
- data/config/jetty.yml +2 -2
- data/fedora-migrate.gemspec +7 -6
- data/lib/fedora-migrate.rb +7 -24
- data/lib/fedora_migrate/content_mover.rb +49 -0
- data/lib/fedora_migrate/datastream_mover.rb +19 -34
- data/lib/fedora_migrate/datastream_verification.rb +36 -0
- data/lib/fedora_migrate/dates_mover.rb +14 -0
- data/lib/fedora_migrate/hooks.rb +23 -0
- data/lib/fedora_migrate/migration_options.rb +18 -0
- data/lib/fedora_migrate/mover.rb +12 -0
- data/lib/fedora_migrate/object_mover.rb +20 -9
- data/lib/fedora_migrate/rdf_datastream_mover.rb +31 -14
- data/lib/fedora_migrate/rels_ext_datastream_mover.rb +28 -62
- data/lib/fedora_migrate/repository_migrator.rb +30 -25
- data/lib/fedora_migrate/rubydora_connection.rb +0 -2
- data/lib/fedora_migrate/target_constructor.rb +39 -0
- data/lib/fedora_migrate/version.rb +1 -1
- data/spec/fixtures/objects/scholarsphere_5712mc568.xml +7284 -0
- data/spec/fixtures/objects/scholarsphere_7d279232g.xml +20120 -0
- data/spec/fixtures/objects/scholarsphere_sf2686078.xml +8823 -0
- data/spec/fixtures/objects/scholarsphere_x346dj04v.xml +188 -0
- data/spec/fixtures/objects/scholarsphere_x346dj06d.xml +255 -0
- data/spec/fixtures/objects/scholarsphere_x346dj08z.xml +1242 -0
- data/spec/fixtures/objects/sufia_5m60qr94g.xml +68 -0
- data/spec/fixtures/objects/sufia_5m60qr95r.xml +133 -0
- data/spec/fixtures/objects/sufia_5m60qr961.xml +133 -0
- data/spec/fixtures/objects/sufia_5m60qr979.xml +118 -0
- data/spec/integration/content_versions_spec.rb +24 -1
- data/spec/integration/missing_relationships_spec.rb +30 -0
- data/spec/integration/object_migration_spec.rb +49 -5
- data/spec/integration/rdf_migration_spec.rb +38 -13
- data/spec/integration/relationship_migration_spec.rb +10 -9
- data/spec/integration/repository_migration_spec.rb +46 -19
- data/spec/integration/versions_spec.rb +32 -0
- data/spec/spec_helper.rb +8 -1
- data/spec/support/example_model.rb +56 -0
- data/spec/unit/content_mover_spec.rb +78 -0
- data/spec/unit/datastream_verification_spec.rb +60 -0
- data/spec/unit/dates_mover_spec.rb +33 -0
- data/spec/unit/migration_options_spec.rb +61 -0
- data/spec/unit/mover_spec.rb +35 -1
- data/spec/unit/object_mover_spec.rb +1 -3
- data/spec/unit/rels_ext_datastream_mover_spec.rb +28 -18
- data/spec/unit/repository_migrator_spec.rb +16 -5
- data/spec/unit/target_constructor_spec.rb +34 -0
- data/tasks/dev.rake +1 -1
- metadata +80 -38
- data/LICENSE.txt +0 -22
- data/lib/fedora_migrate/rdf_datastream_parser.rb +0 -29
- data/lib/fedora_migrate/triple_converter.rb +0 -39
- data/spec/fixtures/datastreams/rdf_ntriples_datastream.txt +0 -2
- data/spec/unit/rdf_datastream_mover_spec.rb +0 -8
- data/spec/unit/rdf_datastream_parser_spec.rb +0 -38
- data/spec/unit/triple_converter_spec.rb +0 -35
@@ -0,0 +1,36 @@
|
|
1
|
+
module FedoraMigrate::DatastreamVerification
|
2
|
+
|
3
|
+
attr_accessor :datastream
|
4
|
+
|
5
|
+
def valid? datastream=nil
|
6
|
+
@datastream = datastream || @source
|
7
|
+
check = has_matching_checksums? || has_matching_nokogiri_checksums?
|
8
|
+
FedoraMigrate::Logger.warn "#{@datastream.pid} datastream #{@datastream.dsid} validation failed" unless check
|
9
|
+
check
|
10
|
+
end
|
11
|
+
|
12
|
+
def has_matching_checksums?
|
13
|
+
datastream.checksum == target_checksum || checksum(datastream.content) == target_checksum
|
14
|
+
end
|
15
|
+
|
16
|
+
def has_matching_nokogiri_checksums?
|
17
|
+
return false unless datastream.mimeType == "text/xml"
|
18
|
+
checksum(Nokogiri::XML(datastream.content).to_xml) == checksum(Nokogiri::XML(target_content).to_xml)
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def target_checksum
|
24
|
+
target.digest.first.to_s.split(/:/).last
|
25
|
+
end
|
26
|
+
|
27
|
+
# In some cases, the data is in ldp_source but target.content is empty, so we check both places
|
28
|
+
def target_content
|
29
|
+
target.content.empty? ? target.ldp_source.content : target.content
|
30
|
+
end
|
31
|
+
|
32
|
+
def checksum content
|
33
|
+
Digest::SHA1.hexdigest(content)
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module FedoraMigrate
|
2
|
+
class DatesMover < Mover
|
3
|
+
|
4
|
+
def migrate
|
5
|
+
if source.respond_to?(:createdDate) && target.respond_to?(:date_uploaded)
|
6
|
+
target.date_uploaded = source.createdDate
|
7
|
+
end
|
8
|
+
if source.respond_to?(:lastModifiedDate) && target.respond_to?(:date_modified)
|
9
|
+
target.date_modified = source.lastModifiedDate
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
end
|
data/lib/fedora_migrate/hooks.rb
CHANGED
@@ -1,11 +1,34 @@
|
|
1
|
+
# Override these methods to perform additional actions before and after
|
2
|
+
# migration of objects and datastreams.
|
3
|
+
#
|
4
|
+
# To do so, simply define a FedoraMigrate::Hooks module anywhere in
|
5
|
+
# your application and substitute methods for the ones listed below
|
1
6
|
module FedoraMigrate
|
2
7
|
module Hooks
|
3
8
|
|
9
|
+
# Called from FedoraMigrate::ObjectMover
|
4
10
|
def before_object_migration
|
5
11
|
end
|
6
12
|
|
13
|
+
# Called from FedoraMigrate::ObjectMover
|
7
14
|
def after_object_migration
|
8
15
|
end
|
9
16
|
|
17
|
+
# Called from FedoraMigrate::RDFDatastreamMover
|
18
|
+
def before_rdf_datastream_migration
|
19
|
+
end
|
20
|
+
|
21
|
+
# Called from FedoraMigrate::RDFDatastreamMover
|
22
|
+
def after_rdf_datastream_migration
|
23
|
+
end
|
24
|
+
|
25
|
+
# Called from FedoraMigrate::DatastreamMover
|
26
|
+
def before_datastream_migration
|
27
|
+
end
|
28
|
+
|
29
|
+
# Called from FedoraMigrate::DatastreamMover
|
30
|
+
def after_datastream_migration
|
31
|
+
end
|
32
|
+
|
10
33
|
end
|
11
34
|
end
|
@@ -7,5 +7,23 @@ module FedoraMigrate
|
|
7
7
|
self.conversions = options.nil? ? [] : [options[:convert]].flatten
|
8
8
|
end
|
9
9
|
|
10
|
+
def forced?
|
11
|
+
option_true?(:force)
|
12
|
+
end
|
13
|
+
|
14
|
+
def not_forced?
|
15
|
+
!forced?
|
16
|
+
end
|
17
|
+
|
18
|
+
def application_creates_versions?
|
19
|
+
option_true?(:application_creates_versions)
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def option_true?(name)
|
25
|
+
!!(options && options[name])
|
26
|
+
end
|
27
|
+
|
10
28
|
end
|
11
29
|
end
|
data/lib/fedora_migrate/mover.rb
CHANGED
@@ -39,6 +39,18 @@ module FedoraMigrate
|
|
39
39
|
target.inspect
|
40
40
|
end
|
41
41
|
end
|
42
|
+
|
43
|
+
def id_component object=nil
|
44
|
+
object ||= source
|
45
|
+
raise FedoraMigrate::Errors::MigrationError, "can't get the id component without an object" if object.nil?
|
46
|
+
self.class.id_component(object)
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.id_component object
|
50
|
+
return object.pid.split(/:/).last if object.kind_of?(Rubydora::DigitalObject)
|
51
|
+
return object.to_s.split(/:/).last if object.respond_to?(:to_s)
|
52
|
+
nil
|
53
|
+
end
|
42
54
|
|
43
55
|
end
|
44
56
|
end
|
@@ -5,9 +5,10 @@ module FedoraMigrate
|
|
5
5
|
|
6
6
|
def migrate
|
7
7
|
prepare_target
|
8
|
-
migrate_content_datastreams
|
9
8
|
conversions.collect { |ds| convert_rdf_datastream(ds) }
|
9
|
+
migrate_content_datastreams
|
10
10
|
migrate_permissions
|
11
|
+
migrate_dates
|
11
12
|
complete_target
|
12
13
|
end
|
13
14
|
|
@@ -19,7 +20,6 @@ module FedoraMigrate
|
|
19
20
|
def prepare_target
|
20
21
|
Logger.info "running before_object_migration hooks"
|
21
22
|
before_object_migration
|
22
|
-
save
|
23
23
|
end
|
24
24
|
|
25
25
|
def complete_target
|
@@ -30,20 +30,28 @@ module FedoraMigrate
|
|
30
30
|
|
31
31
|
private
|
32
32
|
|
33
|
+
# We have to call save before migrating content datastreams, otherwise versions aren't recorded
|
34
|
+
# TODO: this will fail if required fields are defined in a descMetadata datastream that is not
|
35
|
+
# converted to RDF (issue #8)
|
33
36
|
def migrate_content_datastreams
|
37
|
+
save
|
34
38
|
target.attached_files.keys.each do |ds|
|
35
|
-
mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s])
|
39
|
+
mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s], options)
|
36
40
|
mover.migrate
|
37
41
|
end
|
38
42
|
end
|
39
43
|
|
40
44
|
def convert_rdf_datastream ds
|
41
|
-
if source.datastreams.
|
42
|
-
mover = FedoraMigrate::RDFDatastreamMover.new(
|
45
|
+
if source.datastreams.key?(ds)
|
46
|
+
mover = FedoraMigrate::RDFDatastreamMover.new(datastream_content(ds), target)
|
43
47
|
mover.migrate
|
44
48
|
end
|
45
49
|
end
|
46
50
|
|
51
|
+
def datastream_content(dsid)
|
52
|
+
source.datastreams[dsid.to_s]
|
53
|
+
end
|
54
|
+
|
47
55
|
def migrate_permissions
|
48
56
|
if source.datastreams.keys.include?(RIGHTS_DATASTREAM) && target.respond_to?(:permissions)
|
49
57
|
mover = FedoraMigrate::PermissionsMover.new(source.datastreams[RIGHTS_DATASTREAM], target)
|
@@ -51,12 +59,15 @@ module FedoraMigrate
|
|
51
59
|
end
|
52
60
|
end
|
53
61
|
|
62
|
+
def migrate_dates
|
63
|
+
FedoraMigrate::DatesMover.new(source, target).migrate
|
64
|
+
end
|
65
|
+
|
54
66
|
def create_target_model
|
55
|
-
|
56
|
-
|
57
|
-
@target =
|
67
|
+
builder = FedoraMigrate::TargetConstructor.new(source.models).build
|
68
|
+
raise FedoraMigrate::Errors::MigrationError, "No qualified targets found in #{source.pid}" if builder.target.nil?
|
69
|
+
@target = builder.target.new(id: id_component)
|
58
70
|
end
|
59
71
|
|
60
72
|
end
|
61
|
-
|
62
73
|
end
|
@@ -1,28 +1,45 @@
|
|
1
|
+
require 'rchardet'
|
2
|
+
|
1
3
|
module FedoraMigrate
|
2
4
|
class RDFDatastreamMover < Mover
|
3
5
|
|
4
6
|
def migrate
|
5
7
|
Logger.info "converting datastream '#{source.dsid}' to RDF"
|
6
|
-
|
7
|
-
|
8
|
+
before_rdf_datastream_migration
|
9
|
+
migrate_rdf_triples
|
10
|
+
after_rdf_datastream_migration
|
8
11
|
save
|
9
12
|
end
|
10
13
|
|
11
|
-
def
|
12
|
-
|
13
|
-
parser.parse
|
14
|
-
parser.statements.each do |statement|
|
15
|
-
target.resource << statement
|
16
|
-
end
|
14
|
+
def migrate_rdf_triples
|
15
|
+
target.resource << updated_graph
|
17
16
|
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
18
|
+
private
|
19
|
+
|
20
|
+
def updated_graph
|
21
|
+
reader.new(updated_datastream_content)
|
22
|
+
end
|
23
|
+
|
24
|
+
def updated_datastream_content
|
25
|
+
correct_encoding(datastream_content).gsub(/<.+#{source.pid}>/,"<#{target.uri}>")
|
24
26
|
end
|
25
|
-
end
|
26
27
|
|
28
|
+
def datastream_content
|
29
|
+
source.content
|
30
|
+
end
|
31
|
+
|
32
|
+
# Scholarsphere has some ISO-8859 encoded data, which violates the NTriples spec.
|
33
|
+
# Here we correct that.
|
34
|
+
def correct_encoding(input)
|
35
|
+
input.encode!(Encoding::UTF_8)
|
36
|
+
rescue Encoding::UndefinedConversionError
|
37
|
+
cd = ::CharDet.detect(input)
|
38
|
+
input.force_encoding(Encoding.find(cd["encoding"].upcase)).encode!(Encoding::UTF_8)
|
39
|
+
end
|
40
|
+
|
41
|
+
def reader
|
42
|
+
RDF::Reader.for(:ntriples)
|
43
|
+
end
|
27
44
|
end
|
28
45
|
end
|
@@ -1,89 +1,55 @@
|
|
1
|
-
require 'rubydora'
|
2
1
|
module FedoraMigrate
|
3
2
|
class RelsExtDatastreamMover < Mover
|
4
3
|
|
5
|
-
attr_accessor :relationships, :ng_xml, :subject
|
6
|
-
|
7
|
-
RELS_EXT = Rubydora::RelationshipsMixin::RELS_EXT
|
8
4
|
RELS_EXT_DATASTREAM = "RELS-EXT".freeze
|
9
5
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
parse_relationships if has_relationships?
|
15
|
-
end
|
16
|
-
|
17
|
-
def has_relationships?
|
18
|
-
source.datastreams.keys.include?(RELS_EXT_DATASTREAM)
|
6
|
+
def migrate
|
7
|
+
migrate_statements
|
8
|
+
target.ldp_source.update
|
9
|
+
update_index
|
19
10
|
end
|
20
11
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
objects.collect { |object| migrate_incomming_relationship(predicate, object) }
|
26
|
-
else
|
27
|
-
migrate_outgoing_relationship(predicate, objects)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
12
|
+
def post_initialize
|
13
|
+
@target ||= ActiveFedora::Base.find(id_component)
|
14
|
+
rescue ActiveFedora::ObjectNotFoundError
|
15
|
+
raise FedoraMigrate::Errors::MigrationError, "Target object was not found in Fedora 4. Did you migrate it?"
|
31
16
|
end
|
32
17
|
|
33
18
|
private
|
34
19
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
query = "//ns0:"+RELS_EXT[key].split(/#/).last
|
39
|
-
relationships[key.to_sym] = query_results(query)
|
20
|
+
def migrate_statements
|
21
|
+
statements.each do |statement|
|
22
|
+
target.ldp_source.graph << [target.rdf_subject, migrate_predicate(statement.predicate), migrate_object(statement.object)]
|
40
23
|
end
|
41
24
|
end
|
42
25
|
|
43
|
-
def
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
return results
|
26
|
+
def update_index
|
27
|
+
target.reload
|
28
|
+
target.update_index
|
48
29
|
end
|
49
30
|
|
50
|
-
def
|
51
|
-
@
|
52
|
-
rescue ActiveFedora::ObjectNotFoundError
|
53
|
-
raise FedoraMigrate::Errors::MigrationError, "Source was not found in Fedora4. Did you migrated it?"
|
31
|
+
def graph
|
32
|
+
@graph ||= RDF::Graph.new { |g| g.from_rdfxml(source.datastreams[RELS_EXT_DATASTREAM].content) }
|
54
33
|
end
|
55
34
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
raise FedoraMigrate::Errors::MigrationError, "Could not find object with id #{id}"
|
35
|
+
# Override this if any predicate transformation is needed
|
36
|
+
def migrate_predicate(fc3_uri)
|
37
|
+
fc3_uri
|
60
38
|
end
|
61
39
|
|
62
|
-
|
63
|
-
|
64
|
-
Logger.info "adding #{subject.id} to #{object.id} with predicate #{predicate.to_s}"
|
65
|
-
object.reflections.each do |key, association|
|
66
|
-
unless association.predicate.to_s.split(/#/).empty?
|
67
|
-
if association.predicate.to_s.split(/#/).last.gsub(/is/,"").underscore == predicate.to_s
|
68
|
-
object.send(key.to_s) << subject
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
40
|
+
def migrate_object(fc3_uri)
|
41
|
+
RDF::URI.new(ActiveFedora::Base.id_to_uri(id_component(fc3_uri)))
|
72
42
|
end
|
73
43
|
|
74
|
-
|
75
|
-
|
76
|
-
Logger.
|
77
|
-
|
78
|
-
if key.to_s.match(/_ids$/)
|
79
|
-
subject.send(key.to_s+"=", objects.collect { |o| o.id })
|
80
|
-
subject.save
|
81
|
-
end
|
82
|
-
end
|
44
|
+
def has_missing_object?(statement)
|
45
|
+
return false if ActiveFedora::Base.exists?(id_component(statement.object))
|
46
|
+
Logger.warn "#{source.pid} could not migrate relationship #{statement.predicate} because #{statement.object} doesn't exist in Fedora 4"
|
47
|
+
true
|
83
48
|
end
|
84
49
|
|
85
|
-
|
86
|
-
|
50
|
+
# All the graph statements except hasModel and those with missing objects
|
51
|
+
def statements
|
52
|
+
graph.statements.reject { |stmt| stmt.predicate == ActiveFedora::RDF::Fcrepo::Model.hasModel || has_missing_object?(stmt) }
|
87
53
|
end
|
88
54
|
|
89
55
|
end
|
@@ -3,50 +3,55 @@ module FedoraMigrate
|
|
3
3
|
|
4
4
|
include MigrationOptions
|
5
5
|
|
6
|
-
attr_accessor :source_objects, :
|
6
|
+
attr_accessor :source_objects, :namespace, :failed
|
7
7
|
|
8
8
|
def initialize namespace = nil, options = {}
|
9
9
|
@namespace = namespace || repository_namespace
|
10
10
|
@options = options
|
11
|
+
@failed = 0
|
11
12
|
@source_objects = get_source_objects
|
12
|
-
@results = []
|
13
13
|
conversion_options
|
14
14
|
end
|
15
15
|
|
16
|
+
# TODO: need a reporting mechanism for results (issue #4)
|
16
17
|
def migrate_objects
|
17
|
-
source_objects.each
|
18
|
-
|
19
|
-
begin
|
20
|
-
results << { source.pid => [FedoraMigrate::ObjectMover.new(source, nil, options).migrate] }
|
21
|
-
rescue NameError => e
|
22
|
-
results << { source.pid => e.to_s }
|
23
|
-
rescue FedoraMigrate::Errors::MigrationError => e
|
24
|
-
results << { source.pid => e.to_s }
|
25
|
-
end
|
26
|
-
end
|
18
|
+
source_objects.each { |source| migrate_object(source) }
|
19
|
+
@failed == 0
|
27
20
|
end
|
28
21
|
|
29
22
|
# TODO: need a reporting mechanism for results (issue #4)
|
30
23
|
def migrate_relationships
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
results << { source.pid => e.to_s }
|
37
|
-
rescue ActiveFedora::AssociationTypeMismatch => e
|
38
|
-
results << { source.pid => e.to_s }
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
# TODO: page through all the objects (issue #6)
|
24
|
+
return "Relationship migration halted because #{failed.to_s} objects didn't migrate successfully." if failed > 0 && not_forced?
|
25
|
+
source_objects.each { |source| migrate_relationship(source) }
|
26
|
+
@failed == 0
|
27
|
+
end
|
28
|
+
|
44
29
|
def get_source_objects
|
45
30
|
FedoraMigrate.source.connection.search(nil).collect { |o| qualifying_object(o) }.compact
|
46
31
|
end
|
47
32
|
|
48
33
|
private
|
49
34
|
|
35
|
+
def migrate_object source
|
36
|
+
Logger.info "Migrating source object #{source.pid}"
|
37
|
+
FedoraMigrate::ObjectMover.new(source, nil, options).migrate
|
38
|
+
rescue StandardError => e
|
39
|
+
Logger.warn "#{source.pid} failed.\n#{error_message(e)}"
|
40
|
+
@failed = @failed + 1
|
41
|
+
end
|
42
|
+
|
43
|
+
def migrate_relationship source
|
44
|
+
Logger.info "Migrating relationships for source object #{source.pid}"
|
45
|
+
FedoraMigrate::RelsExtDatastreamMover.new(source).migrate
|
46
|
+
rescue StandardError => e
|
47
|
+
Logger.warn "#{source.pid} relationship migration failed.\n#{error_message(e)}"
|
48
|
+
@failed = @failed + 1
|
49
|
+
end
|
50
|
+
|
51
|
+
def error_message e
|
52
|
+
[e.inspect, e.backtrace.join("\n\t")].join("\n\t")
|
53
|
+
end
|
54
|
+
|
50
55
|
def repository_namespace
|
51
56
|
FedoraMigrate.source.connection.repository_profile["repositoryPID"]["repositoryPID"].split(/:/).first.strip
|
52
57
|
end
|