fedora-migrate 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/Gemfile +1 -0
  4. data/LICENSE +14 -0
  5. data/README.md +201 -6
  6. data/config/jetty.yml +2 -2
  7. data/fedora-migrate.gemspec +7 -6
  8. data/lib/fedora-migrate.rb +7 -24
  9. data/lib/fedora_migrate/content_mover.rb +49 -0
  10. data/lib/fedora_migrate/datastream_mover.rb +19 -34
  11. data/lib/fedora_migrate/datastream_verification.rb +36 -0
  12. data/lib/fedora_migrate/dates_mover.rb +14 -0
  13. data/lib/fedora_migrate/hooks.rb +23 -0
  14. data/lib/fedora_migrate/migration_options.rb +18 -0
  15. data/lib/fedora_migrate/mover.rb +12 -0
  16. data/lib/fedora_migrate/object_mover.rb +20 -9
  17. data/lib/fedora_migrate/rdf_datastream_mover.rb +31 -14
  18. data/lib/fedora_migrate/rels_ext_datastream_mover.rb +28 -62
  19. data/lib/fedora_migrate/repository_migrator.rb +30 -25
  20. data/lib/fedora_migrate/rubydora_connection.rb +0 -2
  21. data/lib/fedora_migrate/target_constructor.rb +39 -0
  22. data/lib/fedora_migrate/version.rb +1 -1
  23. data/spec/fixtures/objects/scholarsphere_5712mc568.xml +7284 -0
  24. data/spec/fixtures/objects/scholarsphere_7d279232g.xml +20120 -0
  25. data/spec/fixtures/objects/scholarsphere_sf2686078.xml +8823 -0
  26. data/spec/fixtures/objects/scholarsphere_x346dj04v.xml +188 -0
  27. data/spec/fixtures/objects/scholarsphere_x346dj06d.xml +255 -0
  28. data/spec/fixtures/objects/scholarsphere_x346dj08z.xml +1242 -0
  29. data/spec/fixtures/objects/sufia_5m60qr94g.xml +68 -0
  30. data/spec/fixtures/objects/sufia_5m60qr95r.xml +133 -0
  31. data/spec/fixtures/objects/sufia_5m60qr961.xml +133 -0
  32. data/spec/fixtures/objects/sufia_5m60qr979.xml +118 -0
  33. data/spec/integration/content_versions_spec.rb +24 -1
  34. data/spec/integration/missing_relationships_spec.rb +30 -0
  35. data/spec/integration/object_migration_spec.rb +49 -5
  36. data/spec/integration/rdf_migration_spec.rb +38 -13
  37. data/spec/integration/relationship_migration_spec.rb +10 -9
  38. data/spec/integration/repository_migration_spec.rb +46 -19
  39. data/spec/integration/versions_spec.rb +32 -0
  40. data/spec/spec_helper.rb +8 -1
  41. data/spec/support/example_model.rb +56 -0
  42. data/spec/unit/content_mover_spec.rb +78 -0
  43. data/spec/unit/datastream_verification_spec.rb +60 -0
  44. data/spec/unit/dates_mover_spec.rb +33 -0
  45. data/spec/unit/migration_options_spec.rb +61 -0
  46. data/spec/unit/mover_spec.rb +35 -1
  47. data/spec/unit/object_mover_spec.rb +1 -3
  48. data/spec/unit/rels_ext_datastream_mover_spec.rb +28 -18
  49. data/spec/unit/repository_migrator_spec.rb +16 -5
  50. data/spec/unit/target_constructor_spec.rb +34 -0
  51. data/tasks/dev.rake +1 -1
  52. metadata +80 -38
  53. data/LICENSE.txt +0 -22
  54. data/lib/fedora_migrate/rdf_datastream_parser.rb +0 -29
  55. data/lib/fedora_migrate/triple_converter.rb +0 -39
  56. data/spec/fixtures/datastreams/rdf_ntriples_datastream.txt +0 -2
  57. data/spec/unit/rdf_datastream_mover_spec.rb +0 -8
  58. data/spec/unit/rdf_datastream_parser_spec.rb +0 -38
  59. data/spec/unit/triple_converter_spec.rb +0 -35
@@ -0,0 +1,36 @@
1
+ module FedoraMigrate::DatastreamVerification
2
+
3
+ attr_accessor :datastream
4
+
5
+ def valid? datastream=nil
6
+ @datastream = datastream || @source
7
+ check = has_matching_checksums? || has_matching_nokogiri_checksums?
8
+ FedoraMigrate::Logger.warn "#{@datastream.pid} datastream #{@datastream.dsid} validation failed" unless check
9
+ check
10
+ end
11
+
12
+ def has_matching_checksums?
13
+ datastream.checksum == target_checksum || checksum(datastream.content) == target_checksum
14
+ end
15
+
16
+ def has_matching_nokogiri_checksums?
17
+ return false unless datastream.mimeType == "text/xml"
18
+ checksum(Nokogiri::XML(datastream.content).to_xml) == checksum(Nokogiri::XML(target_content).to_xml)
19
+ end
20
+
21
+ private
22
+
23
+ def target_checksum
24
+ target.digest.first.to_s.split(/:/).last
25
+ end
26
+
27
+ # In some cases, the data is in ldp_source but target.content is empty, so we check both places
28
+ def target_content
29
+ target.content.empty? ? target.ldp_source.content : target.content
30
+ end
31
+
32
+ def checksum content
33
+ Digest::SHA1.hexdigest(content)
34
+ end
35
+
36
+ end
@@ -0,0 +1,14 @@
1
+ module FedoraMigrate
2
+ class DatesMover < Mover
3
+
4
+ def migrate
5
+ if source.respond_to?(:createdDate) && target.respond_to?(:date_uploaded)
6
+ target.date_uploaded = source.createdDate
7
+ end
8
+ if source.respond_to?(:lastModifiedDate) && target.respond_to?(:date_modified)
9
+ target.date_modified = source.lastModifiedDate
10
+ end
11
+ end
12
+
13
+ end
14
+ end
@@ -1,11 +1,34 @@
1
+ # Override these methods to perform additional actions before and after
2
+ # migration of objects and datastreams.
3
+ #
4
+ # To do so, simply define a FedoraMigrate::Hooks module anywhere in
5
+ # your application and substitute methods for the ones listed below
1
6
  module FedoraMigrate
2
7
  module Hooks
3
8
 
9
+ # Called from FedoraMigrate::ObjectMover
4
10
  def before_object_migration
5
11
  end
6
12
 
13
+ # Called from FedoraMigrate::ObjectMover
7
14
  def after_object_migration
8
15
  end
9
16
 
17
+ # Called from FedoraMigrate::RDFDatastreamMover
18
+ def before_rdf_datastream_migration
19
+ end
20
+
21
+ # Called from FedoraMigrate::RDFDatastreamMover
22
+ def after_rdf_datastream_migration
23
+ end
24
+
25
+ # Called from FedoraMigrate::DatastreamMover
26
+ def before_datastream_migration
27
+ end
28
+
29
+ # Called from FedoraMigrate::DatastreamMover
30
+ def after_datastream_migration
31
+ end
32
+
10
33
  end
11
34
  end
@@ -7,5 +7,23 @@ module FedoraMigrate
7
7
  self.conversions = options.nil? ? [] : [options[:convert]].flatten
8
8
  end
9
9
 
10
+ def forced?
11
+ option_true?(:force)
12
+ end
13
+
14
+ def not_forced?
15
+ !forced?
16
+ end
17
+
18
+ def application_creates_versions?
19
+ option_true?(:application_creates_versions)
20
+ end
21
+
22
+ private
23
+
24
+ def option_true?(name)
25
+ !!(options && options[name])
26
+ end
27
+
10
28
  end
11
29
  end
@@ -39,6 +39,18 @@ module FedoraMigrate
39
39
  target.inspect
40
40
  end
41
41
  end
42
+
43
+ def id_component object=nil
44
+ object ||= source
45
+ raise FedoraMigrate::Errors::MigrationError, "can't get the id component without an object" if object.nil?
46
+ self.class.id_component(object)
47
+ end
48
+
49
+ def self.id_component object
50
+ return object.pid.split(/:/).last if object.kind_of?(Rubydora::DigitalObject)
51
+ return object.to_s.split(/:/).last if object.respond_to?(:to_s)
52
+ nil
53
+ end
42
54
 
43
55
  end
44
56
  end
@@ -5,9 +5,10 @@ module FedoraMigrate
5
5
 
6
6
  def migrate
7
7
  prepare_target
8
- migrate_content_datastreams
9
8
  conversions.collect { |ds| convert_rdf_datastream(ds) }
9
+ migrate_content_datastreams
10
10
  migrate_permissions
11
+ migrate_dates
11
12
  complete_target
12
13
  end
13
14
 
@@ -19,7 +20,6 @@ module FedoraMigrate
19
20
  def prepare_target
20
21
  Logger.info "running before_object_migration hooks"
21
22
  before_object_migration
22
- save
23
23
  end
24
24
 
25
25
  def complete_target
@@ -30,20 +30,28 @@ module FedoraMigrate
30
30
 
31
31
  private
32
32
 
33
+ # We have to call save before migrating content datastreams, otherwise versions aren't recorded
34
+ # TODO: this will fail if required fields are defined in a descMetadata datastream that is not
35
+ # converted to RDF (issue #8)
33
36
  def migrate_content_datastreams
37
+ save
34
38
  target.attached_files.keys.each do |ds|
35
- mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s])
39
+ mover = FedoraMigrate::DatastreamMover.new(source.datastreams[ds.to_s], target.attached_files[ds.to_s], options)
36
40
  mover.migrate
37
41
  end
38
42
  end
39
43
 
40
44
  def convert_rdf_datastream ds
41
- if source.datastreams.keys.include?(ds)
42
- mover = FedoraMigrate::RDFDatastreamMover.new(source.datastreams[ds.to_s], target)
45
+ if source.datastreams.key?(ds)
46
+ mover = FedoraMigrate::RDFDatastreamMover.new(datastream_content(ds), target)
43
47
  mover.migrate
44
48
  end
45
49
  end
46
50
 
51
+ def datastream_content(dsid)
52
+ source.datastreams[dsid.to_s]
53
+ end
54
+
47
55
  def migrate_permissions
48
56
  if source.datastreams.keys.include?(RIGHTS_DATASTREAM) && target.respond_to?(:permissions)
49
57
  mover = FedoraMigrate::PermissionsMover.new(source.datastreams[RIGHTS_DATASTREAM], target)
@@ -51,12 +59,15 @@ module FedoraMigrate
51
59
  end
52
60
  end
53
61
 
62
+ def migrate_dates
63
+ FedoraMigrate::DatesMover.new(source, target).migrate
64
+ end
65
+
54
66
  def create_target_model
55
- afmodel = source.models.map { |m| m if m.match(/afmodel/) }.compact.first.split(/:/).last
56
- Logger.info "found #{afmodel} in source object #{source.pid}"
57
- @target = afmodel.constantize.new(id: source.pid.split(/:/).last)
67
+ builder = FedoraMigrate::TargetConstructor.new(source.models).build
68
+ raise FedoraMigrate::Errors::MigrationError, "No qualified targets found in #{source.pid}" if builder.target.nil?
69
+ @target = builder.target.new(id: id_component)
58
70
  end
59
71
 
60
72
  end
61
-
62
73
  end
@@ -1,28 +1,45 @@
1
+ require 'rchardet'
2
+
1
3
  module FedoraMigrate
2
4
  class RDFDatastreamMover < Mover
3
5
 
4
6
  def migrate
5
7
  Logger.info "converting datastream '#{source.dsid}' to RDF"
6
- parse_rdf_triples
7
- force_attribute_change
8
+ before_rdf_datastream_migration
9
+ migrate_rdf_triples
10
+ after_rdf_datastream_migration
8
11
  save
9
12
  end
10
13
 
11
- def parse_rdf_triples
12
- parser = FedoraMigrate::RDFDatastreamParser.new(target.uri, source.content)
13
- parser.parse
14
- parser.statements.each do |statement|
15
- target.resource << statement
16
- end
14
+ def migrate_rdf_triples
15
+ target.resource << updated_graph
17
16
  end
18
17
 
19
- # See projecthydra/active_fedora#540
20
- # Forcibly setting each attribute's changed status to true
21
- def force_attribute_change
22
- target.class.delegated_attributes.keys.each do |term|
23
- target.send(term+"_will_change!")
18
+ private
19
+
20
+ def updated_graph
21
+ reader.new(updated_datastream_content)
22
+ end
23
+
24
+ def updated_datastream_content
25
+ correct_encoding(datastream_content).gsub(/<.+#{source.pid}>/,"<#{target.uri}>")
24
26
  end
25
- end
26
27
 
28
+ def datastream_content
29
+ source.content
30
+ end
31
+
32
+ # Scholarsphere has some ISO-8859 encoded data, which violates the NTriples spec.
33
+ # Here we correct that.
34
+ def correct_encoding(input)
35
+ input.encode!(Encoding::UTF_8)
36
+ rescue Encoding::UndefinedConversionError
37
+ cd = ::CharDet.detect(input)
38
+ input.force_encoding(Encoding.find(cd["encoding"].upcase)).encode!(Encoding::UTF_8)
39
+ end
40
+
41
+ def reader
42
+ RDF::Reader.for(:ntriples)
43
+ end
27
44
  end
28
45
  end
@@ -1,89 +1,55 @@
1
- require 'rubydora'
2
1
  module FedoraMigrate
3
2
  class RelsExtDatastreamMover < Mover
4
3
 
5
- attr_accessor :relationships, :ng_xml, :subject
6
-
7
- RELS_EXT = Rubydora::RelationshipsMixin::RELS_EXT
8
4
  RELS_EXT_DATASTREAM = "RELS-EXT".freeze
9
5
 
10
- def post_initialize
11
- retrieve_subject
12
- @relationships ||= {}
13
- @ng_xml = Nokogiri::XML(source.datastreams[RELS_EXT_DATASTREAM].content)
14
- parse_relationships if has_relationships?
15
- end
16
-
17
- def has_relationships?
18
- source.datastreams.keys.include?(RELS_EXT_DATASTREAM)
6
+ def migrate
7
+ migrate_statements
8
+ target.ldp_source.update
9
+ update_index
19
10
  end
20
11
 
21
- def migrate
22
- relationships.each do |predicate, objects|
23
- unless objects.empty?
24
- if is_singular?(predicate.to_s)
25
- objects.collect { |object| migrate_incomming_relationship(predicate, object) }
26
- else
27
- migrate_outgoing_relationship(predicate, objects)
28
- end
29
- end
30
- end
12
+ def post_initialize
13
+ @target ||= ActiveFedora::Base.find(id_component)
14
+ rescue ActiveFedora::ObjectNotFoundError
15
+ raise FedoraMigrate::Errors::MigrationError, "Target object was not found in Fedora 4. Did you migrate it?"
31
16
  end
32
17
 
33
18
  private
34
19
 
35
- # because of projecthydra/rubydora#90
36
- def parse_relationships
37
- RELS_EXT.keys.each do |key|
38
- query = "//ns0:"+RELS_EXT[key].split(/#/).last
39
- relationships[key.to_sym] = query_results(query)
20
+ def migrate_statements
21
+ statements.each do |statement|
22
+ target.ldp_source.graph << [target.rdf_subject, migrate_predicate(statement.predicate), migrate_object(statement.object)]
40
23
  end
41
24
  end
42
25
 
43
- def query_results query, results = Array.new
44
- ng_xml.xpath(query).each do |predicate|
45
- results << retrieve_object(predicate.attribute("resource").text.split(/:/).last)
46
- end
47
- return results
26
+ def update_index
27
+ target.reload
28
+ target.update_index
48
29
  end
49
30
 
50
- def retrieve_subject
51
- @subject = ActiveFedora::Base.find(source.pid.split(/:/).last)
52
- rescue ActiveFedora::ObjectNotFoundError
53
- raise FedoraMigrate::Errors::MigrationError, "Source was not found in Fedora4. Did you migrated it?"
31
+ def graph
32
+ @graph ||= RDF::Graph.new { |g| g.from_rdfxml(source.datastreams[RELS_EXT_DATASTREAM].content) }
54
33
  end
55
34
 
56
- def retrieve_object id
57
- object = ActiveFedora::Base.find(id)
58
- rescue ActiveFedora::ObjectNotFoundError
59
- raise FedoraMigrate::Errors::MigrationError, "Could not find object with id #{id}"
35
+ # Override this if any predicate transformation is needed
36
+ def migrate_predicate(fc3_uri)
37
+ fc3_uri
60
38
  end
61
39
 
62
- # TODO: This is problematic and may not work in all situations (issue #7)
63
- def migrate_incomming_relationship predicate, object
64
- Logger.info "adding #{subject.id} to #{object.id} with predicate #{predicate.to_s}"
65
- object.reflections.each do |key, association|
66
- unless association.predicate.to_s.split(/#/).empty?
67
- if association.predicate.to_s.split(/#/).last.gsub(/is/,"").underscore == predicate.to_s
68
- object.send(key.to_s) << subject
69
- end
70
- end
71
- end
40
+ def migrate_object(fc3_uri)
41
+ RDF::URI.new(ActiveFedora::Base.id_to_uri(id_component(fc3_uri)))
72
42
  end
73
43
 
74
- # TODO: Very stinky... needs a different approach (issue #7)
75
- def migrate_outgoing_relationship predicate, objects
76
- Logger.info "adding #{objects.count.to_s} members to #{subject.id} with predicate #{predicate.to_s}"
77
- subject.reflections.each do |key, association|
78
- if key.to_s.match(/_ids$/)
79
- subject.send(key.to_s+"=", objects.collect { |o| o.id })
80
- subject.save
81
- end
82
- end
44
+ def has_missing_object?(statement)
45
+ return false if ActiveFedora::Base.exists?(id_component(statement.object))
46
+ Logger.warn "#{source.pid} could not migrate relationship #{statement.predicate} because #{statement.object} doesn't exist in Fedora 4"
47
+ true
83
48
  end
84
49
 
85
- def is_singular?(str)
86
- str.singularize == str
50
+ # All the graph statements except hasModel and those with missing objects
51
+ def statements
52
+ graph.statements.reject { |stmt| stmt.predicate == ActiveFedora::RDF::Fcrepo::Model.hasModel || has_missing_object?(stmt) }
87
53
  end
88
54
 
89
55
  end
@@ -3,50 +3,55 @@ module FedoraMigrate
3
3
 
4
4
  include MigrationOptions
5
5
 
6
- attr_accessor :source_objects, :results, :namespace
6
+ attr_accessor :source_objects, :namespace, :failed
7
7
 
8
8
  def initialize namespace = nil, options = {}
9
9
  @namespace = namespace || repository_namespace
10
10
  @options = options
11
+ @failed = 0
11
12
  @source_objects = get_source_objects
12
- @results = []
13
13
  conversion_options
14
14
  end
15
15
 
16
+ # TODO: need a reporting mechanism for results (issue #4)
16
17
  def migrate_objects
17
- source_objects.each do |source|
18
- Logger.info "Migrating source object #{source.pid}"
19
- begin
20
- results << { source.pid => [FedoraMigrate::ObjectMover.new(source, nil, options).migrate] }
21
- rescue NameError => e
22
- results << { source.pid => e.to_s }
23
- rescue FedoraMigrate::Errors::MigrationError => e
24
- results << { source.pid => e.to_s }
25
- end
26
- end
18
+ source_objects.each { |source| migrate_object(source) }
19
+ @failed == 0
27
20
  end
28
21
 
29
22
  # TODO: need a reporting mechanism for results (issue #4)
30
23
  def migrate_relationships
31
- source_objects.each do |source|
32
- Logger.info "Migrating relationships for source object #{source.pid}"
33
- begin
34
- FedoraMigrate::RelsExtDatastreamMover.new(source).migrate
35
- rescue FedoraMigrate::Errors::MigrationError => e
36
- results << { source.pid => e.to_s }
37
- rescue ActiveFedora::AssociationTypeMismatch => e
38
- results << { source.pid => e.to_s }
39
- end
40
- end
41
- end
42
-
43
- # TODO: page through all the objects (issue #6)
24
+ return "Relationship migration halted because #{failed.to_s} objects didn't migrate successfully." if failed > 0 && not_forced?
25
+ source_objects.each { |source| migrate_relationship(source) }
26
+ @failed == 0
27
+ end
28
+
44
29
  def get_source_objects
45
30
  FedoraMigrate.source.connection.search(nil).collect { |o| qualifying_object(o) }.compact
46
31
  end
47
32
 
48
33
  private
49
34
 
35
+ def migrate_object source
36
+ Logger.info "Migrating source object #{source.pid}"
37
+ FedoraMigrate::ObjectMover.new(source, nil, options).migrate
38
+ rescue StandardError => e
39
+ Logger.warn "#{source.pid} failed.\n#{error_message(e)}"
40
+ @failed = @failed + 1
41
+ end
42
+
43
+ def migrate_relationship source
44
+ Logger.info "Migrating relationships for source object #{source.pid}"
45
+ FedoraMigrate::RelsExtDatastreamMover.new(source).migrate
46
+ rescue StandardError => e
47
+ Logger.warn "#{source.pid} relationship migration failed.\n#{error_message(e)}"
48
+ @failed = @failed + 1
49
+ end
50
+
51
+ def error_message e
52
+ [e.inspect, e.backtrace.join("\n\t")].join("\n\t")
53
+ end
54
+
50
55
  def repository_namespace
51
56
  FedoraMigrate.source.connection.repository_profile["repositoryPID"]["repositoryPID"].split(/:/).first.strip
52
57
  end