fedora-migrate 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/Gemfile +1 -0
  4. data/LICENSE +14 -0
  5. data/README.md +201 -6
  6. data/config/jetty.yml +2 -2
  7. data/fedora-migrate.gemspec +7 -6
  8. data/lib/fedora-migrate.rb +7 -24
  9. data/lib/fedora_migrate/content_mover.rb +49 -0
  10. data/lib/fedora_migrate/datastream_mover.rb +19 -34
  11. data/lib/fedora_migrate/datastream_verification.rb +36 -0
  12. data/lib/fedora_migrate/dates_mover.rb +14 -0
  13. data/lib/fedora_migrate/hooks.rb +23 -0
  14. data/lib/fedora_migrate/migration_options.rb +18 -0
  15. data/lib/fedora_migrate/mover.rb +12 -0
  16. data/lib/fedora_migrate/object_mover.rb +20 -9
  17. data/lib/fedora_migrate/rdf_datastream_mover.rb +31 -14
  18. data/lib/fedora_migrate/rels_ext_datastream_mover.rb +28 -62
  19. data/lib/fedora_migrate/repository_migrator.rb +30 -25
  20. data/lib/fedora_migrate/rubydora_connection.rb +0 -2
  21. data/lib/fedora_migrate/target_constructor.rb +39 -0
  22. data/lib/fedora_migrate/version.rb +1 -1
  23. data/spec/fixtures/objects/scholarsphere_5712mc568.xml +7284 -0
  24. data/spec/fixtures/objects/scholarsphere_7d279232g.xml +20120 -0
  25. data/spec/fixtures/objects/scholarsphere_sf2686078.xml +8823 -0
  26. data/spec/fixtures/objects/scholarsphere_x346dj04v.xml +188 -0
  27. data/spec/fixtures/objects/scholarsphere_x346dj06d.xml +255 -0
  28. data/spec/fixtures/objects/scholarsphere_x346dj08z.xml +1242 -0
  29. data/spec/fixtures/objects/sufia_5m60qr94g.xml +68 -0
  30. data/spec/fixtures/objects/sufia_5m60qr95r.xml +133 -0
  31. data/spec/fixtures/objects/sufia_5m60qr961.xml +133 -0
  32. data/spec/fixtures/objects/sufia_5m60qr979.xml +118 -0
  33. data/spec/integration/content_versions_spec.rb +24 -1
  34. data/spec/integration/missing_relationships_spec.rb +30 -0
  35. data/spec/integration/object_migration_spec.rb +49 -5
  36. data/spec/integration/rdf_migration_spec.rb +38 -13
  37. data/spec/integration/relationship_migration_spec.rb +10 -9
  38. data/spec/integration/repository_migration_spec.rb +46 -19
  39. data/spec/integration/versions_spec.rb +32 -0
  40. data/spec/spec_helper.rb +8 -1
  41. data/spec/support/example_model.rb +56 -0
  42. data/spec/unit/content_mover_spec.rb +78 -0
  43. data/spec/unit/datastream_verification_spec.rb +60 -0
  44. data/spec/unit/dates_mover_spec.rb +33 -0
  45. data/spec/unit/migration_options_spec.rb +61 -0
  46. data/spec/unit/mover_spec.rb +35 -1
  47. data/spec/unit/object_mover_spec.rb +1 -3
  48. data/spec/unit/rels_ext_datastream_mover_spec.rb +28 -18
  49. data/spec/unit/repository_migrator_spec.rb +16 -5
  50. data/spec/unit/target_constructor_spec.rb +34 -0
  51. data/tasks/dev.rake +1 -1
  52. metadata +80 -38
  53. data/LICENSE.txt +0 -22
  54. data/lib/fedora_migrate/rdf_datastream_parser.rb +0 -29
  55. data/lib/fedora_migrate/triple_converter.rb +0 -39
  56. data/spec/fixtures/datastreams/rdf_ntriples_datastream.txt +0 -2
  57. data/spec/unit/rdf_datastream_mover_spec.rb +0 -8
  58. data/spec/unit/rdf_datastream_parser_spec.rb +0 -38
  59. data/spec/unit/triple_converter_spec.rb +0 -35
@@ -0,0 +1,36 @@
1
+ module FedoraMigrate::DatastreamVerification
2
+
3
+ attr_accessor :datastream
4
+
5
+ def valid? datastream=nil
6
+ @datastream = datastream || @source
7
+ check = has_matching_checksums? || has_matching_nokogiri_checksums?
8
+ FedoraMigrate::Logger.warn "#{@datastream.pid} datastream #{@datastream.dsid} validation failed" unless check
9
+ check
10
+ end
11
+
12
+ def has_matching_checksums?
13
+ datastream.checksum == target_checksum || checksum(datastream.content) == target_checksum
14
+ end
15
+
16
+ def has_matching_nokogiri_checksums?
17
+ return false unless datastream.mimeType == "text/xml"
18
+ checksum(Nokogiri::XML(datastream.content).to_xml) == checksum(Nokogiri::XML(target_content).to_xml)
19
+ end
20
+
21
+ private
22
+
23
+ def target_checksum
24
+ target.digest.first.to_s.split(/:/).last
25
+ end
26
+
27
+ # In some cases, the data is in ldp_source but target.content is empty, so we check both places
28
+ def target_content
29
+ target.content.empty? ? target.ldp_source.content : target.content
30
+ end
31
+
32
+ def checksum content
33
+ Digest::SHA1.hexdigest(content)
34
+ end
35
+
36
+ end
@@ -0,0 +1,14 @@
1
+ module FedoraMigrate
2
+ class DatesMover < Mover
3
+
4
+ def migrate
5
+ if source.respond_to?(:createdDate) && target.respond_to?(:date_uploaded)
6
+ target.date_uploaded = source.createdDate
7
+ end
8
+ if source.respond_to?(:lastModifiedDate) && target.respond_to?(:date_modified)
9
+ target.date_modified = source.lastModifiedDate
10
+ end
11
+ end
12
+
13
+ end
14
+ end
@@ -1,11 +1,34 @@
1
+ # Override these methods to perform additional actions before and after
2
+ # migration of objects and datastreams.
3
+ #
4
+ # To do so, simply define a FedoraMigrate::Hooks module anywhere in
5
+ # your application and substitute methods for the ones listed below
1
6
  module FedoraMigrate
2
7
  module Hooks
3
8
 
9
+ # Called from FedoraMigrate::ObjectMover
4
10
  def before_object_migration
5
11
  end
6
12
 
13
+ # Called from FedoraMigrate::ObjectMover
7
14
  def after_object_migration
8
15
  end
9
16
 
17
+ # Called from FedoraMigrate::RDFDatastreamMover
18
+ def before_rdf_datastream_migration
19
+ end
20
+
21
+ # Called from FedoraMigrate::RDFDatastreamMover
22
+ def after_rdf_datastream_migration
23
+ end
24
+
25
+ # Called from FedoraMigrate::DatastreamMover
26
+ def before_datastream_migration
27
+ end
28
+
29
+ # Called from FedoraMigrate::DatastreamMover
30
+ def after_datastream_migration
31
+ end
32
+
10
33
  end
11
34
  end
@@ -7,5 +7,23 @@ module FedoraMigrate
7
7
  self.conversions = options.nil? ? [] : [options[:convert]].flatten
8
8
  end
9
9
 
10
+ def forced?
11
+ option_true?(:force)
12
+ end
13
+
14
+ def not_forced?
15
+ !forced?
16
+ end
17
+
18
+ def application_creates_versions?
19
+ option_true?(:application_creates_versions)
20
+ end
21
+
22
+ private
23
+
24
+ def option_true?(name)
25
+ !!(options && options[name])
26
+ end
27
+
10
28
  end
11
29
  end
@@ -39,6 +39,18 @@ module FedoraMigrate
39
39
  target.inspect
40
40
  end
41
41
  end
42
+
43
+ def id_component object=nil
44
+ object ||= source
45
+ raise FedoraMigrate::Errors::MigrationError, "can't get the id component without an object" if object.nil?
46
+ self.class.id_component(object)
47
+ end
48
+
49
+ def self.id_component object
50
+ return object.pid.split(/:/).last if object.kind_of?(Rubydora::DigitalObject)
51
+ return object.to_s.split(/:/).last if object.respond_to?(:to_s)
52
+ nil
53
+ end
42
54
 
43
55
  end
44
56
  end
@@ -5,9 +5,10 @@ module FedoraMigrate
5
5
 
6
6
  def migrate
7
7
  prepare_target
8
- migrate_content_datastreams
9
8
  conversions.collect { |ds| convert_rdf_datastream(ds) }
9
+ migrate_content_datastreams
10
10
  migrate_permissions
11
+ migrate_dates
11
12
  complete_target
12
13
  end
13
14
 
@@ -19,7 +20,6 @@ module FedoraMigrate
19
20
  def prepare_target
20
21
  Logger.info "running before_object_migration hooks"
21
22
  before_object_migration
22
- save
23
23
  end
24
24
 
25
25
  def complete_target
@@ -30,20 +30,28 @@ module FedoraMigrate
30
30
 
31
31
  private
32
32
 
33
+ # We have to call save before migrating content datastreams, otherwise versions aren't recorded
34
+ # TODO: this will fail if required fields are defined in a descMetadata datastream that is not
35
+ # converted to RDF (issue #8)
33
36
  def migrate_content_datastreams
37
+ save
34
38
  target.attached_files.keys.each do |ds|
35
- mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s])
39
+ mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s], options)
36
40
  mover.migrate
37
41
  end
38
42
  end
39
43
 
40
44
  def convert_rdf_datastream ds
41
- if source.datastreams.keys.include?(ds)
42
- mover = FedoraMigrate::RDFDatastreamMover.new(source.datastreams[ds.to_s], target)
45
+ if source.datastreams.key?(ds)
46
+ mover = FedoraMigrate::RDFDatastreamMover.new(datastream_content(ds), target)
43
47
  mover.migrate
44
48
  end
45
49
  end
46
50
 
51
+ def datastream_content(dsid)
52
+ source.datastreams[dsid.to_s]
53
+ end
54
+
47
55
  def migrate_permissions
48
56
  if source.datastreams.keys.include?(RIGHTS_DATASTREAM) && target.respond_to?(:permissions)
49
57
  mover = FedoraMigrate::PermissionsMover.new(source.datastreams[RIGHTS_DATASTREAM], target)
@@ -51,12 +59,15 @@ module FedoraMigrate
51
59
  end
52
60
  end
53
61
 
62
+ def migrate_dates
63
+ FedoraMigrate::DatesMover.new(source, target).migrate
64
+ end
65
+
54
66
  def create_target_model
55
- afmodel = source.models.map { |m| m if m.match(/afmodel/) }.compact.first.split(/:/).last
56
- Logger.info "found #{afmodel} in source object #{source.pid}"
57
- @target = afmodel.constantize.new(id: source.pid.split(/:/).last)
67
+ builder = FedoraMigrate::TargetConstructor.new(source.models).build
68
+ raise FedoraMigrate::Errors::MigrationError, "No qualified targets found in #{source.pid}" if builder.target.nil?
69
+ @target = builder.target.new(id: id_component)
58
70
  end
59
71
 
60
72
  end
61
-
62
73
  end
@@ -1,28 +1,45 @@
1
+ require 'rchardet'
2
+
1
3
  module FedoraMigrate
2
4
  class RDFDatastreamMover < Mover
3
5
 
4
6
  def migrate
5
7
  Logger.info "converting datastream '#{source.dsid}' to RDF"
6
- parse_rdf_triples
7
- force_attribute_change
8
+ before_rdf_datastream_migration
9
+ migrate_rdf_triples
10
+ after_rdf_datastream_migration
8
11
  save
9
12
  end
10
13
 
11
- def parse_rdf_triples
12
- parser = FedoraMigrate::RDFDatastreamParser.new(target.uri, source.content)
13
- parser.parse
14
- parser.statements.each do |statement|
15
- target.resource << statement
16
- end
14
+ def migrate_rdf_triples
15
+ target.resource << updated_graph
17
16
  end
18
17
 
19
- # See projecthydra/active_fedora#540
20
- # Forcibly setting each attribute's changed status to true
21
- def force_attribute_change
22
- target.class.delegated_attributes.keys.each do |term|
23
- target.send(term+"_will_change!")
18
+ private
19
+
20
+ def updated_graph
21
+ reader.new(updated_datastream_content)
22
+ end
23
+
24
+ def updated_datastream_content
25
+ correct_encoding(datastream_content).gsub(/<.+#{source.pid}>/,"<#{target.uri}>")
24
26
  end
25
- end
26
27
 
28
+ def datastream_content
29
+ source.content
30
+ end
31
+
32
+ # Scholarsphere has some ISO-8859 encoded data, which violates the NTriples spec.
33
+ # Here we correct that.
34
+ def correct_encoding(input)
35
+ input.encode!(Encoding::UTF_8)
36
+ rescue Encoding::UndefinedConversionError
37
+ cd = ::CharDet.detect(input)
38
+ input.force_encoding(Encoding.find(cd["encoding"].upcase)).encode!(Encoding::UTF_8)
39
+ end
40
+
41
+ def reader
42
+ RDF::Reader.for(:ntriples)
43
+ end
27
44
  end
28
45
  end
@@ -1,89 +1,55 @@
1
- require 'rubydora'
2
1
  module FedoraMigrate
3
2
  class RelsExtDatastreamMover < Mover
4
3
 
5
- attr_accessor :relationships, :ng_xml, :subject
6
-
7
- RELS_EXT = Rubydora::RelationshipsMixin::RELS_EXT
8
4
  RELS_EXT_DATASTREAM = "RELS-EXT".freeze
9
5
 
10
- def post_initialize
11
- retrieve_subject
12
- @relationships ||= {}
13
- @ng_xml = Nokogiri::XML(source.datastreams[RELS_EXT_DATASTREAM].content)
14
- parse_relationships if has_relationships?
15
- end
16
-
17
- def has_relationships?
18
- source.datastreams.keys.include?(RELS_EXT_DATASTREAM)
6
+ def migrate
7
+ migrate_statements
8
+ target.ldp_source.update
9
+ update_index
19
10
  end
20
11
 
21
- def migrate
22
- relationships.each do |predicate, objects|
23
- unless objects.empty?
24
- if is_singular?(predicate.to_s)
25
- objects.collect { |object| migrate_incomming_relationship(predicate, object) }
26
- else
27
- migrate_outgoing_relationship(predicate, objects)
28
- end
29
- end
30
- end
12
+ def post_initialize
13
+ @target ||= ActiveFedora::Base.find(id_component)
14
+ rescue ActiveFedora::ObjectNotFoundError
15
+ raise FedoraMigrate::Errors::MigrationError, "Target object was not found in Fedora 4. Did you migrate it?"
31
16
  end
32
17
 
33
18
  private
34
19
 
35
- # because of projecthydra/rubydora#90
36
- def parse_relationships
37
- RELS_EXT.keys.each do |key|
38
- query = "//ns0:"+RELS_EXT[key].split(/#/).last
39
- relationships[key.to_sym] = query_results(query)
20
+ def migrate_statements
21
+ statements.each do |statement|
22
+ target.ldp_source.graph << [target.rdf_subject, migrate_predicate(statement.predicate), migrate_object(statement.object)]
40
23
  end
41
24
  end
42
25
 
43
- def query_results query, results = Array.new
44
- ng_xml.xpath(query).each do |predicate|
45
- results << retrieve_object(predicate.attribute("resource").text.split(/:/).last)
46
- end
47
- return results
26
+ def update_index
27
+ target.reload
28
+ target.update_index
48
29
  end
49
30
 
50
- def retrieve_subject
51
- @subject = ActiveFedora::Base.find(source.pid.split(/:/).last)
52
- rescue ActiveFedora::ObjectNotFoundError
53
- raise FedoraMigrate::Errors::MigrationError, "Source was not found in Fedora4. Did you migrated it?"
31
+ def graph
32
+ @graph ||= RDF::Graph.new { |g| g.from_rdfxml(source.datastreams[RELS_EXT_DATASTREAM].content) }
54
33
  end
55
34
 
56
- def retrieve_object id
57
- object = ActiveFedora::Base.find(id)
58
- rescue ActiveFedora::ObjectNotFoundError
59
- raise FedoraMigrate::Errors::MigrationError, "Could not find object with id #{id}"
35
+ # Override this if any predicate transformation is needed
36
+ def migrate_predicate(fc3_uri)
37
+ fc3_uri
60
38
  end
61
39
 
62
- # TODO: This is problematic and may not work in all situations (issue #7)
63
- def migrate_incomming_relationship predicate, object
64
- Logger.info "adding #{subject.id} to #{object.id} with predicate #{predicate.to_s}"
65
- object.reflections.each do |key, association|
66
- unless association.predicate.to_s.split(/#/).empty?
67
- if association.predicate.to_s.split(/#/).last.gsub(/is/,"").underscore == predicate.to_s
68
- object.send(key.to_s) << subject
69
- end
70
- end
71
- end
40
+ def migrate_object(fc3_uri)
41
+ RDF::URI.new(ActiveFedora::Base.id_to_uri(id_component(fc3_uri)))
72
42
  end
73
43
 
74
- # TODO: Very stinky... needs a different approach (issue #7)
75
- def migrate_outgoing_relationship predicate, objects
76
- Logger.info "adding #{objects.count.to_s} members to #{subject.id} with predicate #{predicate.to_s}"
77
- subject.reflections.each do |key, association|
78
- if key.to_s.match(/_ids$/)
79
- subject.send(key.to_s+"=", objects.collect { |o| o.id })
80
- subject.save
81
- end
82
- end
44
+ def has_missing_object?(statement)
45
+ return false if ActiveFedora::Base.exists?(id_component(statement.object))
46
+ Logger.warn "#{source.pid} could not migrate relationship #{statement.predicate} because #{statement.object} doesn't exist in Fedora 4"
47
+ true
83
48
  end
84
49
 
85
- def is_singular?(str)
86
- str.singularize == str
50
+ # All the graph statements except hasModel and those with missing objects
51
+ def statements
52
+ graph.statements.reject { |stmt| stmt.predicate == ActiveFedora::RDF::Fcrepo::Model.hasModel || has_missing_object?(stmt) }
87
53
  end
88
54
 
89
55
  end
@@ -3,50 +3,55 @@ module FedoraMigrate
3
3
 
4
4
  include MigrationOptions
5
5
 
6
- attr_accessor :source_objects, :results, :namespace
6
+ attr_accessor :source_objects, :namespace, :failed
7
7
 
8
8
  def initialize namespace = nil, options = {}
9
9
  @namespace = namespace || repository_namespace
10
10
  @options = options
11
+ @failed = 0
11
12
  @source_objects = get_source_objects
12
- @results = []
13
13
  conversion_options
14
14
  end
15
15
 
16
+ # TODO: need a reporting mechanism for results (issue #4)
16
17
  def migrate_objects
17
- source_objects.each do |source|
18
- Logger.info "Migrating source object #{source.pid}"
19
- begin
20
- results << { source.pid => [FedoraMigrate::ObjectMover.new(source, nil, options).migrate] }
21
- rescue NameError => e
22
- results << { source.pid => e.to_s }
23
- rescue FedoraMigrate::Errors::MigrationError => e
24
- results << { source.pid => e.to_s }
25
- end
26
- end
18
+ source_objects.each { |source| migrate_object(source) }
19
+ @failed == 0
27
20
  end
28
21
 
29
22
  # TODO: need a reporting mechanism for results (issue #4)
30
23
  def migrate_relationships
31
- source_objects.each do |source|
32
- Logger.info "Migrating relationships for source object #{source.pid}"
33
- begin
34
- FedoraMigrate::RelsExtDatastreamMover.new(source).migrate
35
- rescue FedoraMigrate::Errors::MigrationError => e
36
- results << { source.pid => e.to_s }
37
- rescue ActiveFedora::AssociationTypeMismatch => e
38
- results << { source.pid => e.to_s }
39
- end
40
- end
41
- end
42
-
43
- # TODO: page through all the objects (issue #6)
24
+ return "Relationship migration halted because #{failed.to_s} objects didn't migrate successfully." if failed > 0 && not_forced?
25
+ source_objects.each { |source| migrate_relationship(source) }
26
+ @failed == 0
27
+ end
28
+
44
29
  def get_source_objects
45
30
  FedoraMigrate.source.connection.search(nil).collect { |o| qualifying_object(o) }.compact
46
31
  end
47
32
 
48
33
  private
49
34
 
35
+ def migrate_object source
36
+ Logger.info "Migrating source object #{source.pid}"
37
+ FedoraMigrate::ObjectMover.new(source, nil, options).migrate
38
+ rescue StandardError => e
39
+ Logger.warn "#{source.pid} failed.\n#{error_message(e)}"
40
+ @failed = @failed + 1
41
+ end
42
+
43
+ def migrate_relationship source
44
+ Logger.info "Migrating relationships for source object #{source.pid}"
45
+ FedoraMigrate::RelsExtDatastreamMover.new(source).migrate
46
+ rescue StandardError => e
47
+ Logger.warn "#{source.pid} relationship migration failed.\n#{error_message(e)}"
48
+ @failed = @failed + 1
49
+ end
50
+
51
+ def error_message e
52
+ [e.inspect, e.backtrace.join("\n\t")].join("\n\t")
53
+ end
54
+
50
55
  def repository_namespace
51
56
  FedoraMigrate.source.connection.repository_profile["repositoryPID"]["repositoryPID"].split(/:/).first.strip
52
57
  end