dor-services 2.2.4 → 4.4.10

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +15 -0
  2. data/bin/dor-indexer +108 -0
  3. data/bin/dor-indexerd +73 -0
  4. data/bin/nokogiri +19 -0
  5. data/bin/rake +19 -0
  6. data/bin/ruby_noexec_wrapper +14 -0
  7. data/bin/solrizer +19 -0
  8. data/bin/solrizerd +19 -0
  9. data/config/certs/README +1 -0
  10. data/config/config_defaults.yml +62 -0
  11. data/config/dev_console_env.rb.example +67 -0
  12. data/config/predicate_mappings.yml +55 -0
  13. data/lib/dor-services.rb +152 -19
  14. data/lib/dor/config.rb +133 -35
  15. data/lib/dor/datastreams/administrative_metadata_ds.rb +84 -0
  16. data/lib/dor/datastreams/content_metadata_ds.rb +337 -0
  17. data/lib/dor/datastreams/datastream_spec_solrizer.rb +18 -0
  18. data/lib/dor/datastreams/default_object_rights_ds.rb +52 -0
  19. data/lib/dor/datastreams/desc_metadata_ds.rb +39 -0
  20. data/lib/{datastreams → dor/datastreams}/embargo_metadata_ds.rb +25 -20
  21. data/lib/{datastreams → dor/datastreams}/events_ds.rb +14 -9
  22. data/lib/dor/datastreams/identity.xsl +8 -0
  23. data/lib/dor/datastreams/identity_metadata_ds.rb +112 -0
  24. data/lib/dor/datastreams/role_metadata_ds.rb +51 -0
  25. data/lib/dor/datastreams/simple_dublin_core_ds.rb +45 -0
  26. data/lib/dor/datastreams/version_metadata_ds.rb +214 -0
  27. data/lib/dor/datastreams/workflow_definition_ds.rb +113 -0
  28. data/lib/dor/datastreams/workflow_ds.rb +103 -0
  29. data/lib/dor/exceptions.rb +0 -1
  30. data/lib/dor/migrations/content_metadata_ds/change_content_type.rb +7 -0
  31. data/lib/dor/migrations/identifiable/assert_adminPolicy.rb +9 -0
  32. data/lib/dor/migrations/identifiable/fix_model_assertions.rb +13 -0
  33. data/lib/dor/migrations/identifiable/record_remediation.rb +18 -0
  34. data/lib/dor/migrations/identifiable/uriify_augmented_contentlocation_refs.rb +18 -0
  35. data/lib/dor/migrations/identifiable/uriify_contentlocation_refs.rb +18 -0
  36. data/lib/dor/migrations/processable/unify_workflows.rb +17 -0
  37. data/lib/dor/migrations/versionable/add_missing_version_md.rb +9 -0
  38. data/lib/dor/models/admin_policy_object.rb +16 -0
  39. data/lib/dor/models/assembleable.rb +14 -0
  40. data/lib/dor/models/collection.rb +14 -0
  41. data/lib/dor/models/contentable.rb +227 -0
  42. data/lib/dor/models/describable.rb +194 -0
  43. data/lib/dor/models/discoverable.rb +66 -0
  44. data/lib/dor/models/editable.rb +267 -0
  45. data/lib/dor/models/embargoable.rb +97 -0
  46. data/lib/dor/models/eventable.rb +12 -0
  47. data/lib/dor/models/governable.rb +162 -0
  48. data/lib/dor/models/identifiable.rb +211 -0
  49. data/lib/dor/models/item.rb +44 -0
  50. data/lib/dor/models/itemizable.rb +66 -0
  51. data/lib/dor/{mods2dc.xslt → models/mods2dc.xslt} +39 -12
  52. data/lib/dor/models/preservable.rb +50 -0
  53. data/lib/dor/models/processable.rb +229 -0
  54. data/lib/dor/models/publishable.rb +74 -0
  55. data/lib/dor/models/set.rb +12 -0
  56. data/lib/dor/models/shelvable.rb +27 -0
  57. data/lib/dor/models/upgradable.rb +74 -0
  58. data/lib/dor/models/versionable.rb +94 -0
  59. data/lib/dor/models/workflow_object.rb +54 -0
  60. data/lib/dor/services/cleanup_service.rb +47 -0
  61. data/lib/dor/services/digital_stacks_service.rb +55 -0
  62. data/lib/dor/services/merge_service.rb +96 -0
  63. data/lib/dor/{metadata_handlers → services/metadata_handlers}/catalog_handler.rb +0 -2
  64. data/lib/dor/{metadata_handlers → services/metadata_handlers}/mdtoolkit_handler.rb +0 -2
  65. data/lib/dor/{metadata_service.rb → services/metadata_service.rb} +1 -3
  66. data/lib/dor/services/registration_service.rb +181 -0
  67. data/lib/dor/services/sdr_ingest_service.rb +181 -0
  68. data/lib/dor/services/search_service.rb +131 -0
  69. data/lib/dor/services/suri_service.rb +32 -0
  70. data/lib/dor/services/technical_metadata_service.rb +226 -0
  71. data/lib/dor/{tei2dc.xslt → services/tei2dc.xslt} +0 -0
  72. data/lib/dor/utils/ng_tidy.rb +37 -0
  73. data/lib/dor/utils/predicate_patch.rb +23 -0
  74. data/lib/dor/utils/solr_doc_helper.rb +9 -0
  75. data/lib/dor/utils/utc_date_field_mapper.rb +7 -0
  76. data/lib/dor/version.rb +3 -0
  77. data/lib/dor/workflow/document.rb +131 -0
  78. data/lib/dor/workflow/graph.rb +166 -0
  79. data/lib/dor/workflow/process.rb +99 -0
  80. data/lib/gsearch/demoFoxmlToSolr.xslt +340 -122
  81. data/lib/tasks/dor.rake +39 -0
  82. metadata +494 -384
  83. data/lib/datastreams/content_metadata_ds.rb +0 -12
  84. data/lib/datastreams/identity_metadata_ds.rb +0 -28
  85. data/lib/datastreams/ng_tidy.rb +0 -19
  86. data/lib/datastreams/simple_dublin_core_ds.rb +0 -23
  87. data/lib/datastreams/workflow_definition_ds.rb +0 -105
  88. data/lib/datastreams/workflow_ds.rb +0 -16
  89. data/lib/dor/admin_policy_object.rb +0 -11
  90. data/lib/dor/base.rb +0 -81
  91. data/lib/dor/cleanup_service.rb +0 -32
  92. data/lib/dor/digital_stacks_service.rb +0 -82
  93. data/lib/dor/druid_utils.rb +0 -41
  94. data/lib/dor/embargo.rb +0 -41
  95. data/lib/dor/item.rb +0 -141
  96. data/lib/dor/provenance_metadata_service.rb +0 -65
  97. data/lib/dor/registration_service.rb +0 -87
  98. data/lib/dor/rsolr.rb +0 -27
  99. data/lib/dor/sdr_ingest_service.rb +0 -117
  100. data/lib/dor/search_service.rb +0 -86
  101. data/lib/dor/suri_service.rb +0 -37
  102. data/lib/dor/workflow_object.rb +0 -13
  103. data/lib/dor/workflow_service.rb +0 -111
  104. data/lib/xml_models/foxml.rb +0 -261
  105. data/lib/xml_models/identity_metadata/dublin_core.rb +0 -119
  106. data/lib/xml_models/identity_metadata/identity_metadata.rb +0 -288
@@ -0,0 +1,181 @@
1
+ require 'rubygems'
2
+ require 'lyber-utils'
3
+ require 'moab_stanford'
4
+ require 'dor-services'
5
+
6
+ module Dor
7
+ class SdrIngestService
8
+
9
# Export the next version of a digital object for SDR ingest: extract the
# metadata datastreams, build the Moab version inventory, write the new
# content and the metadata into a BagIt bag, and start the SDR workflow.
# @param [Dor::Item] dor_item The representation of the digital object
# @param [String] agreement_id deprecated and unused; kept for backward compatibility with common-accessioning
# @return [void]
# @raise [LyberCore::Exceptions::ItemError] wrapping any StandardError raised while exporting
def self.transfer(dor_item, agreement_id=nil)
  druid = dor_item.pid
  workspace = DruidTools::Druid.new(druid, Dor::Config.sdr.local_workspace_root)
  signature_catalog = get_signature_catalog(druid)
  # The version being deposited is always one past what the catalog reports
  new_version_id = signature_catalog.version_id + 1
  metadata_dir = extract_datastreams(dor_item, workspace)
  verify_version_metadata(metadata_dir, new_version_id)
  version_inventory = get_version_inventory(metadata_dir, druid, new_version_id)
  version_additions = signature_catalog.version_additions(version_inventory)
  content_additions = version_additions.group('content')
  if content_additions.nil? || content_additions.files.empty?
    # Nothing new to deposit, so there is no content directory to locate
    content_dir = nil
  else
    content_dir = workspace.find_filelist_parent('content', content_additions.path_list)
  end
  content_group = version_inventory.group('content')
  unless content_group.nil? || content_group.files.empty?
    signature_catalog.normalize_group_signatures(content_group, content_dir)
  end
  # Build the bag under the configured local export home, named by the bare druid
  bag_dir = Pathname(Dor::Config.sdr.local_export_home).join(druid.sub('druid:', ''))
  bagger = Moab::Bagger.new(version_inventory, signature_catalog, bag_dir)
  bagger.reset_bag
  bagger.create_bag_inventory(:depositor)
  bagger.deposit_group('content', content_dir)
  bagger.deposit_group('metadata', metadata_dir)
  bagger.create_tagfiles
  verify_bag_structure(bag_dir)
  # Now bootstrap SDR workflow, but do not create the workflows datastream
  dor_item.initialize_workflow('sdrIngestWF', 'sdr', false)
rescue StandardError => e
  # StandardError rather than Exception, so signals, exits and memory
  # errors are not repackaged as item-level export failures.
  raise LyberCore::Exceptions::ItemError.new(druid, "Export failure", e)
end
46
+
47
# Fetch the signature catalog of everything SDR already holds for an object.
# @param [String] druid The object identifier
# @return [Moab::SignatureCatalog] the catalog of all files previously ingested;
#   an empty version-0 catalog when SDR answers "not found" for the object
# @raise [StandardError] any other failure (network error, server error, parse
#   error) is propagated instead of being mistaken for a never-ingested object
def self.get_signature_catalog(druid)
  sdr_client = Dor::Config.sdr.rest_client
  url = "objects/#{druid}/manifest/signatureCatalog.xml"
  response = sdr_client[url].get
  Moab::SignatureCatalog.parse(response)
rescue RestClient::ResourceNotFound
  # Only a 404 means the object is new to SDR; start it at version 0
  Moab::SignatureCatalog.new(:digital_object_id => druid, :version_id => 0)
end
57
+
58
# Write every datastream named in Config.sdr.datastreams into the workspace's
# metadata directory as "<name>.xml", overwriting any existing file.
# Datastreams for which no content is returned are skipped.
# @param [Dor::Item] dor_item The representation of the digital object
# @param [DruidTools::Druid] workspace The representation of the item's work area
# @return [Pathname] the metadata directory that was populated
def self.extract_datastreams(dor_item, workspace)
  # Second argument asks the workspace to create the directory
  target_dir = Pathname.new(workspace.path('metadata', true))
  Config.sdr.datastreams.to_hash.each_pair do |name, requirement|
    name = name.to_s
    xml = self.get_datastream_content(dor_item, name, requirement)
    next unless xml
    target_dir.join("#{name}.xml").open('w') { |out| out << xml }
  end
  target_dir
end
72
+
73
+ # @param [Dor::Item] dor_item The representation of the digital object
74
+ # @param [String] ds_name The name of the desired Fedora datastream
75
+ # @param [String] required Enumeration: one of ['required', 'optional']
76
+ # @return [String] return the xml text of the specified datastream if it exists.
77
+ # If not found, return nil unless it is a required datastream in which case raise exception
78
+ def self.get_datastream_content(dor_item, ds_name, required)
79
+ ds = (ds_name == 'relationshipMetadata' ? 'RELS-EXT' : ds_name)
80
+ if dor_item.datastreams.keys.include?(ds) and not dor_item.datastreams[ds].new?
81
+ return dor_item.datastreams[ds].content
82
+ elsif (required == 'optional')
83
+ return nil
84
+ else
85
+ raise "required datastream #{ds_name} not found in DOR"
86
+ end
87
+ end
88
+
89
# Check that versionMetadata.xml in the metadata directory carries the
# expected version number.
# @param [Pathname] metadata_dir the location of the metadata directory in the workspace
# @param [Integer] expected the version identifier expected to be used in the versionMetadata
# @return [Boolean] true when the check passes; otherwise the helper it calls raises
def self.verify_version_metadata(metadata_dir, expected)
  version_file = metadata_dir.join("versionMetadata.xml")
  found = vmfile_version_id(version_file)
  verify_version_id(version_file, expected, found)
  true
end
96
+
97
# Compare the version number found in a file with the one expected.
# @param [Pathname] pathname The location of the file containing a version number
# @param [Integer] expected The version number that should be in the file
# @param [Integer] found The version number that is actually in the file
# @return [Boolean] true when the two numbers are equal
# @raise [RuntimeError] when they differ
def self.verify_version_id(pathname, expected, found)
  return true if expected == found
  raise "Version mismatch in #{pathname}, expected #{expected}, found #{found}"
end
104
+
105
# Read the versionId attribute of the last <version> element in a
# versionMetadata file.
# @param [Pathname] pathname the location of the versionMetadata file
# @return [Integer, nil] the versionId found in the last version element, or
#   nil if the file has no version element or the attribute is missing
# @raise [RuntimeError] (from verify_pathname) when the file does not exist
def self.vmfile_version_id(pathname)
  verify_pathname(pathname)
  # Block form guarantees the file handle is closed once parsing is done
  doc = File.open(pathname.to_s) { |file| Nokogiri::XML(file) }
  last_version = doc.xpath("/versionMetadata/version").last
  # An empty nodeset yields nil here; report "missing" instead of crashing
  return nil if last_version.nil?
  version_id = last_version['versionId']
  version_id.nil? ? nil : version_id.to_i
end
114
+
115
# Build the inventory for a version: the content inventory plus a metadata
# file group appended to its groups.
# @param [Pathname] metadata_dir The location of the object's metadata files
# @param [String] druid The object identifier
# @param [Integer] version_id The version number
# @return [Moab::FileInventory] the version inventory for the object
def self.get_version_inventory(metadata_dir, druid, version_id)
  get_content_inventory(metadata_dir, druid, version_id).tap do |inventory|
    inventory.groups << get_metadata_file_group(metadata_dir)
  end
end
124
+
125
# Build the content part of a version inventory from contentMetadata.xml.
# @param [Pathname] metadata_dir The location of the object's metadata files
# @param [String] druid The object identifier
# @param [Integer] version_id The version number
# @return [Moab::FileInventory] an inventory generated from the contentMetadata
#   (subset 'preserve') when that file exists, otherwise a new "version"
#   inventory created with only the druid and version id
def self.get_content_inventory(metadata_dir, druid, version_id)
  content_metadata = get_content_metadata(metadata_dir)
  if content_metadata
    Stanford::ContentInventory.new.inventory_from_cm(content_metadata, druid, 'preserve', version_id)
  else
    # Fully qualified so the lookup does not depend on Moab being mixed into Object
    Moab::FileInventory.new(:type => "version", :digital_object_id => druid, :version_id => version_id)
  end
end
138
+
139
# Read contentMetadata.xml from the metadata directory, if it is there.
# @param [Pathname] metadata_dir The location of the object's metadata files
# @return [String, nil] the contents of contentMetadata.xml, or nil when the file does not exist
def self.get_content_metadata(metadata_dir)
  cm_file = metadata_dir.join('contentMetadata.xml')
  cm_file.exist? ? cm_file.read : nil
end
149
+
150
# Build the 'metadata' file group from the files in the metadata directory.
# @param [Pathname] metadata_dir The location of the object's metadata files
# @return [Moab::FileGroup] the result of group_from_directory on a group with id 'metadata'
def self.get_metadata_file_group(metadata_dir)
  # Fully qualified so the lookup does not depend on Moab being mixed into Object
  Moab::FileGroup.new(:group_id => 'metadata').group_from_directory(metadata_dir)
end
156
+
157
# Confirm that the bag directory and each required entry inside it exist.
# Entries are checked in a fixed order; the first missing one raises.
# @param [Pathname] bag_dir the location of the bag to be verified
# @return [Boolean] true if all required files exist, raises exception if not
def self.verify_bag_structure(bag_dir)
  verify_pathname(bag_dir)
  top_level_entries = %w[
    data
    bagit.txt
    bag-info.txt
    manifest-sha256.txt
    tagmanifest-sha256.txt
    versionAdditions.xml
    versionInventory.xml
  ]
  top_level_entries.each { |entry| verify_pathname(bag_dir.join(entry)) }
  verify_pathname(bag_dir.join('data', 'metadata', 'versionMetadata.xml'))
  true
end
171
+
172
# Require that a path exists.
# @param [Pathname] pathname The file whose existence should be verified
# @return [Boolean] true if file exists
# @raise [RuntimeError] naming the basename and full path when it does not exist
def self.verify_pathname(pathname)
  return true if pathname.exist?
  raise "#{pathname.basename} not found at #{pathname}"
end
178
+
179
+ end
180
+
181
+ end
@@ -0,0 +1,131 @@
1
+ require 'json'
2
+ require 'active_support/core_ext'
3
+
4
+ module Dor
5
+
6
+ class SearchService
7
+
8
# NOTE(review): presumably supplies the solr_name helper used by query_by_id -- confirm
+ include Solrizer::FieldNameMapper
9
# Query template with a '%s' placeholder for a dc:identifier value.
# Not referenced by the methods visible in this section.
+ RISEARCH_TEMPLATE = "select $object from <#ri> where $object <dc:identifier> '%s'"
10
# Cache for the value computed by index_version; nil until first computed.
+ @@index_version = nil
11
+
12
+ class << self
13
+
14
# Read the INDEXVERSION variable from the gsearch stylesheet
# (gsearch/demoFoxmlToSolr.xslt, three directories up from this file).
# The value is computed once and cached in @@index_version.
# @return [String] the text of the INDEXVERSION xsl:variable
def index_version
  return @@index_version unless @@index_version.nil?
  xslt_path = File.expand_path('../../../gsearch/demoFoxmlToSolr.xslt',__FILE__)
  stylesheet = Nokogiri::XML(File.read(xslt_path))
  @@index_version = stylesheet.at_xpath('/xsl:stylesheet/xsl:variable[@name="INDEXVERSION"]/text()').to_s
end
21
+
22
# Ask gsearch to update its index for each pid, working in batches of 20.
# @param [Array<String>] pids the identifiers to reindex
# @yield [Array<String>] each batch, after every pid in it has been submitted
# @return [Array<String>] the pids that were passed in
def reindex(*pids)
  gsearch = Config.gsearch.rest_client
  pids.each_slice(20) do |batch|
    batch.each do |pid|
      gsearch["?operation=updateIndex&action=fromPid&value=#{pid}"].get
    end
    yield batch if block_given?
  end
  pids
end
30
+
31
# Run a query against the Fedora resource index and return the matching pids.
# @param [String] query the query text, sent with :lang => 'itql'
# @param [Hash] opts extra request parameters merged over the defaults;
#   :timeout is removed from them and set on the client instead.
#   The caller's hash is not modified.
# @return [Array<String>] one entry per result row after the first line of the
#   response, with any leading "info:fedora/" removed; empty when the
#   response has no rows
def risearch(query, opts = {})
  # Work on a copy so removing :timeout does not alter the caller's hash
  opts = opts.dup
  client = Config.fedora.client['risearch']
  client.options[:timeout] = opts.delete(:timeout)
  query_params = {
    :type => 'tuples',
    :lang => 'itql',
    :format => 'CSV',
    :limit => '1000',
    :stream => 'on',
    :query => query
  }.merge(opts)
  result = client.post(query_params)
  # First line is skipped; drop(1) also copes with an empty response,
  # where [1..-1] would have produced nil
  result.split(/\n/).drop(1).collect { |pid| pid.chomp.sub(/^info:fedora\//,'') }
end
45
+
46
# Page through a resource-index query, yielding the pids it returns.
# Each page is fetched with "<query> limit <size> offset <start>"; paging stops
# at the first empty page.
# @param [Hash] opts
#   :query        the base query (defaults to selecting every object that has a label)
#   :in_groups_of the page size (default 100)
#   :mode         :single (default) yields one pid at a time; any other value
#                 yields each page as an array
#   The caller's hash is not modified.
# @yield [String, Array<String>] a pid, or a page of pids, depending on :mode
# @return [nil]
def iterate_over_pids(opts = {}, &block)
  query = opts[:query] || "select $object from <#ri> where $object <info:fedora/fedora-system:def/model#label> $label"
  group_size = opts[:in_groups_of] || 100
  mode = opts[:mode] || :single
  start = 0
  while true
    pids = Dor::SearchService.risearch("#{query} limit #{group_size} offset #{start}")
    break if pids.nil? || pids.empty?
    if mode == :single
      pids.each { |pid| yield pid }
    else
      yield pids
    end
    start += pids.length
  end
  nil
end
62
+
63
# Send a select request through the gsearch client and parse the JSON reply.
# @param [Hash] params query parameters; an Array value produces one
#   key=value pair per element. :wt is always forced to 'json'.
# @return [Object] the parsed JSON response
def gsearch(params)
  client = Config.gsearch.client
  # encode_www_form escapes reserved characters such as & and = inside
  # values and expands Array values into repeated keys
  query_string = URI.encode_www_form(params.merge(:wt => 'json'))
  JSON.parse(client["select?#{query_string}"].get)
end
75
+
76
# Run a Solr query, optionally paging through every result set.
# Without a block the first response is returned. With a block, each
# response is yielded in turn, advancing :start by :rows (or by the size of
# the first page when :rows was not given), until a page has no docs or the
# block returns a falsy value.
# @param [String] query the Solr query string, sent as :q
# @param [Hash] args additional Solr parameters (:start defaults to 0)
# @yield [resp] each page of results; return a truthy value to continue
# @return [Object, nil] the response when no block is given, otherwise nil
def query(query, args = {})
  params = args.merge({ :q => query })
  params[:start] ||= 0
  resp = solr.find params
  return resp unless block_given?
  while resp.docs.length > 0
    # Stop right away when the block declines to continue, so no further
    # page is requested only to be discarded
    break unless yield(resp)
    params[:rows] ||= resp.docs.length
    params[:start] += params[:rows]
    resp = solr.find params
  end
  nil
end
92
+
93
# Find the ids of all Solr documents whose identifier field matches.
# @param [Hash, Array, String] id the identifier, given as
#   a single-entry Hash  { :google => 'STANFORD_0123456789' },
#   a two-element Array  [ 'google', 'STANFORD_0123456789' ],
#   or anything else, which is used as-is.
#   Hash and Array forms are joined with ':'.
# @return [Array] the 'id' value of every matching document
def query_by_id(id)
  lookup = case id
           when Hash then id.collect { |pair| pair.join(':') }.first
           when Array then id.join(':')
           else id
           end
  q = %{#{solr_name 'identifier', :string}:"#{lookup}"}
  found = []
  query(q, :fl => 'id', :rows => 1000) do |page|
    found += page.docs.collect { |doc| doc['id'] }
    true # keep paging until the results run out
  end
  found
end
107
+
108
# The Solr connection used by this service, cached in @@solr.
# ActiveFedora's connection is used when it is an RSolr::Client; otherwise
# one is created with Dor::Config.make_solr_connection.
# @return [Object] the cached connection
def solr
  @@solr ||= if ActiveFedora.solr.conn.is_a?(RSolr::Client)
               ActiveFedora.solr.conn
             else
               Dor::Config.make_solr_connection
             end
end
111
+
112
# Cached lookup of the SDR Graveyard APO. Because only a truthy value is
# kept, a nil result is looked up again on the next call.
# @return [String, nil] druid of the SDR Graveyard APO;
#   nil if APO does not exist in the currently configured environment
def sdr_graveyard_apo_druid
  cached = defined?(@@sdr_graveyard_apo) && @@sdr_graveyard_apo
  return cached if cached
  @@sdr_graveyard_apo = find_sdr_graveyard_apo_druid
end
117
+
118
# Query Solr for the object titled "SDR Graveyard".
# @return [Object, nil] the :id of the first matching document, or nil when nothing matches
def find_sdr_graveyard_apo_druid
  response = Dor::SearchService.query('dc_title_t:"SDR Graveyard"', :fl => 'id')
  response.docs.empty? ? nil : response.docs.first[:id]
end
126
+
127
+ end
128
+
129
+ end
130
+
131
+ end
@@ -0,0 +1,32 @@
1
+ require 'rest-client'
2
+ require 'active_fedora'
3
+
4
+ module Dor
5
+ class SuriService
6
# Mint one or more new object identifiers.
# When Config.suri.mint_ids is set, ids are requested from the SURI service
# (an empty-body POST) and each is prefixed with "<Config.suri.id_namespace>:".
# Otherwise they are taken from the Fedora repository's next_pid call.
# Errors from either service are not rescued here.
# @param [Numeric, nil] quantity how many ids to mint; nil means one
# @return [Array<String>, String] an Array when a numeric quantity was given,
#   otherwise the single first id
def self.mint_id(quantity = nil)
  want_array = quantity.is_a?(Numeric)
  quantity = 1 if quantity.nil?
  ids =
    if Config.suri.mint_ids
      # Post with no body
      resource = RestClient::Resource.new("#{Config.suri.url}/suri2/namespaces/#{Config.suri.id_namespace}",
                                          :user => Config.suri.user, :password => Config.suri.pass)
      minted = resource["identifiers?quantity=#{quantity}"].post('').chomp.split(/\n/)
      minted.collect { |id| "#{Config.suri.id_namespace}:#{id.strip}" }
    else
      repo = if ActiveFedora::Base.respond_to?(:connection_for_pid)
               ActiveFedora::Base.connection_for_pid(0)
             else
               ActiveFedora.fedora.connection
             end
      resp = Nokogiri::XML(repo.next_pid(:numPIDs => quantity))
      resp.xpath('/pidList/pid').collect { |node| node.text }
    end
  want_array ? ids : ids.first
end
29
+
30
+
31
+ end
32
+ end
@@ -0,0 +1,226 @@
1
+ require 'rubygems'
2
+ require 'moab_stanford'
3
+ require 'jhove_service'
4
+ require 'dor-services'
5
+
6
+ module Dor
7
+
8
+ class TechnicalMetadataService
9
+
10
# Create or refresh the item's technicalMetadata datastream.
# With no earlier technical metadata, the metadata for the new files is
# stored as-is. With earlier metadata and no content differences, nothing
# is written. Otherwise old and new file nodes are merged and stored.
# @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
# @return [Boolean] True if technical metadata is correctly added or updated
def self.add_update_technical_metadata(dor_item)
  test_jhove_service
  druid = dor_item.pid
  content_group_diff = get_content_group_diff(dor_item)
  deltas = get_file_deltas(content_group_diff)
  new_files = get_new_files(deltas)
  old_techmd = get_old_technical_metadata(dor_item)
  new_techmd = get_new_technical_metadata(druid, new_files)
  final_techmd =
    if old_techmd.nil?
      # this is version 1 or previous technical metadata was not saved
      new_techmd
    else
      # there have been no changes to content files from previous version
      return true if content_group_diff.difference_count == 0
      build_technical_metadata(druid, merge_file_nodes(old_techmd, new_techmd, deltas))
    end
  datastream = dor_item.datastreams["technicalMetadata"]
  datastream.dsLabel = 'Technical Metadata'
  datastream.content = final_techmd
  datastream.save
  true
end
36
+
37
# Make sure that the jhove-service gem is loaded, requiring it on demand.
# @return [Boolean, nil] nil when ::JhoveService is already defined,
#   otherwise the result of the require
# @raise [RuntimeError] when the gem cannot be loaded (the LoadError is printed first)
def self.test_jhove_service
  return if defined? ::JhoveService
  require 'jhove_service'
rescue LoadError => e
  puts e.inspect
  raise "jhove-service dependency gem was not found. Please add it to your Gemfile and run bundle install"
end
48
+
49
# Obtain the item's content diff for subset 'all' and pull out the
# differences for the "content" group.
# @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
# @return [FileGroupDifference] The differences between two versions of a group of files
def self.get_content_group_diff(dor_item)
  diff_xml = dor_item.get_content_diff('all')
  Moab::FileInventoryDifference.parse(diff_xml).group_difference("content")
end
57
+
58
# Expose the file deltas of a content group difference.
# @param [FileGroupDifference] content_group_diff
# @return [Hash<Symbol,Array>] Sets of filenames grouped by change type for use in performing file or metadata operations
def self.get_file_deltas(content_group_diff)
  content_group_diff.file_deltas
end
64
+
65
# List the files that need fresh technical metadata: added files followed
# by modified files.
# @param [Hash<Symbol,Array>] deltas Sets of filenames grouped by change type for use in performing file or metadata operations
# @return [Array<String>] The list of filenames for files that are either added or modified since the previous version
def self.get_new_files(deltas)
  added, modified = deltas.values_at(:added, :modified)
  added + modified
end
70
+
71
# Find the previous version's technical metadata, preferring the copy held
# by SDR and falling back to the one in DOR.
# @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
# @return [String, nil] The technicalMetadata datastream from the previous version of the digital object
def self.get_old_technical_metadata(dor_item)
  from_sdr = get_sdr_technical_metadata(dor_item.pid)
  from_sdr.nil? ? get_dor_technical_metadata(dor_item) : from_sdr
end
78
+
79
# Fetch the previous version's technicalMetadata from SDR storage.
# Text containing "<technicalMetadata" is returned unchanged; text containing
# "<jhove" is passed through JhoveService#upgrade_technical_metadata.
# @param [String] druid The identifier of the digital object being processed by the technical metadata robot
# @return [String, nil] The technicalMetadata datastream from the previous
#   version of the digital object; nil when SDR reports it is not found or
#   the text is in neither recognized format
def self.get_sdr_technical_metadata(druid)
  sdr_techmd =
    begin
      get_sdr_metadata(druid, "technicalMetadata")
    rescue RestClient::ResourceNotFound
      return nil
    end
  case sdr_techmd
  when /<technicalMetadata/ then sdr_techmd
  when /<jhove/ then ::JhoveService.new.upgrade_technical_metadata(sdr_techmd)
  end
end
96
+
97
+ # @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
98
+ # @return [String] The technicalMetadata datastream from the previous version of the digital object (fetched from DOR fedora).
99
+ # The data is updated to the latest format.
100
+ def self.get_dor_technical_metadata(dor_item)
101
+ ds = "technicalMetadata"
102
+ if dor_item.datastreams.keys.include?(ds) and not dor_item.datastreams[ds].new?
103
+ dor_techmd = dor_item.datastreams[ds].content
104
+ else
105
+ return nil
106
+ end
107
+ if dor_techmd =~ /<technicalMetadata/
108
+ return dor_techmd
109
+ elsif dor_techmd =~ /<jhove/
110
+ return ::JhoveService.new.upgrade_technical_metadata(dor_techmd)
111
+ else
112
+ return nil
113
+ end
114
+ end
115
+
116
# GET "objects/<druid>/metadata/<dsname>.xml" through the configured SDR REST client.
# @param [String] druid The identifier of the digital object being processed by the technical metadata robot
# @param [String] dsname The identifier of the metadata datastream
# @return [String] The datastream contents from the previous version of the digital object (fetched from SDR storage)
def self.get_sdr_metadata(druid, dsname)
  Dor::Config.sdr.rest_client["objects/#{druid}/metadata/#{dsname}.xml"].get
end
125
+
126
# Run JHOVE over the new files and return the technical metadata it produces.
# The list of files is written to a fileset file in the workspace temp
# directory, and JHOVE is run against the content directory holding them.
# @param [String] druid The identifier of the digital object
# @param [Array<String>] new_files The list of filenames for files that are either added or modified since the previous version
# @return [String, nil] The technicalMetadata datastream for the new files of
#   the new digital object version; nil when there are no new files
def self.get_new_technical_metadata(druid, new_files)
  return nil if new_files.nil? || new_files.empty?
  workspace = DruidTools::Druid.new(druid, Dor::Config.sdr.local_workspace_root)
  content_dir = workspace.find_filelist_parent('content', new_files)
  temp_dir = workspace.temp_dir
  jhove = ::JhoveService.new(temp_dir)
  jhove.digital_object_id = druid
  fileset_file = write_fileset(temp_dir, new_files)
  jhove_output = jhove.run_jhove(content_dir, fileset_file)
  IO.read(jhove.create_technical_metadata(jhove_output))
end
141
+
142
# Save the list of new files, one name per line, as jhove_fileset.txt in the
# temp directory.
# @param [Pathname] temp_dir The pathname of the temp folder in the object's workspace area
# @param [Array<String>] new_files The list of filenames for files that are either added or modified since the previous version
# @return [Pathname] the file that was written
def self.write_fileset(temp_dir, new_files)
  Pathname(temp_dir).join('jhove_fileset.txt').tap do |fileset|
    fileset.open('w') { |out| out.puts(new_files) }
  end
end
150
+
151
# Combine file nodes from the old and new technical metadata according to
# how each file changed:
#   identical           -> node from the old metadata
#   modified, added     -> node from the new metadata
#   renamed, copyadded  -> copy of the old node with its opening <file> tag
#                          rewritten to carry the new path as its id
# @param [String] old_techmd The technicalMetadata datastream from the previous version of the digital object
# @param [String] new_techmd The technicalMetadata datastream for the new files of the new digital object version
# @param [Hash<Symbol,Array>] deltas Sets of filenames grouped by change type
#   (:renamed and :copyadded hold old-path/new-path pairs)
# @return [Hash<String,String>] The complete set of technicalMetadata file nodes (as text) for the digital object, indexed by filename
def self.merge_file_nodes(old_techmd, new_techmd, deltas)
  old_nodes = get_file_nodes(old_techmd)
  new_nodes = get_file_nodes(new_techmd)
  merged = {}
  deltas[:identical].each { |path| merged[path] = old_nodes[path] }
  [:modified, :added].each do |change|
    deltas[change].each { |path| merged[path] = new_nodes[path] }
  end
  [:renamed, :copyadded].each do |change|
    deltas[change].each do |oldpath, newpath|
      # Work on a copy so the entry for the old path is left untouched
      node = old_nodes[oldpath].clone
      node.sub!(/<file\s*id.*?["'].*?["'].*?>/, "<file id='#{newpath}'>")
      merged[newpath] = node
    end
  end
  merged
end
180
+
181
# Split technicalMetadata text into its <file> nodes with a line-based scan.
# A node starts at a line that opens a <file ...> tag and ends at the next
# line that starts with </file>; it is keyed by the last quoted value on
# the opening line.
# @param [String] technical_metadata A technicalMetadata datastream contents
# @return [Hash<String,String>] The text of each file node from a technicalMetadata datastream, indexed by filename
def self.get_file_nodes(technical_metadata)
  nodes_by_path = {}
  return nodes_by_path if technical_metadata.nil?
  buffer = []
  current_path = nil
  inside_node = false
  technical_metadata.each_line do |line|
    opening = line.match(/^\s*<file.*["'](.*?)["']/)
    if opening
      buffer << line
      current_path = opening[1]
      inside_node = true
    elsif line =~ /^\s*<\/file>/
      buffer << line
      nodes_by_path[current_path] = buffer.join
      buffer = []
      current_path = nil
      inside_node = false
    elsif inside_node
      buffer << line
    end
  end
  nodes_by_path
end
206
+
207
# Assemble the final technicalMetadata document: a root element stamped with
# the druid and the current UTC time, followed by the file nodes in
# filename order, then the closing tag.
# @param [String] druid The identifier of the digital object being processed by the technical metadata robot
# @param [Hash<String,String>] merged_nodes The complete set of technicalMetadata file nodes for the digital object, indexed by filename
# @return [String] The finalized technicalMetadata datastream contents for the new object version
def self.build_technical_metadata(druid, merged_nodes)
  xml = <<-EOF
<technicalMetadata objectId='#{druid}' datetime='#{Time.now.utc.iso8601}'
xmlns:jhove='http://hul.harvard.edu/ois/xml/ns/jhove'
xmlns:mix='http://www.loc.gov/mix/v10'
xmlns:textmd='info:lc/xmlns/textMD-v3'>
  EOF
  merged_nodes.keys.sort.each do |path|
    xml << merged_nodes[path]
  end
  xml << "</technicalMetadata>"
end
222
+
223
+ end
224
+
225
+ end
226
+