dor-services 2.2.4 → 4.4.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/bin/dor-indexer +108 -0
- data/bin/dor-indexerd +73 -0
- data/bin/nokogiri +19 -0
- data/bin/rake +19 -0
- data/bin/ruby_noexec_wrapper +14 -0
- data/bin/solrizer +19 -0
- data/bin/solrizerd +19 -0
- data/config/certs/README +1 -0
- data/config/config_defaults.yml +62 -0
- data/config/dev_console_env.rb.example +67 -0
- data/config/predicate_mappings.yml +55 -0
- data/lib/dor-services.rb +152 -19
- data/lib/dor/config.rb +133 -35
- data/lib/dor/datastreams/administrative_metadata_ds.rb +84 -0
- data/lib/dor/datastreams/content_metadata_ds.rb +337 -0
- data/lib/dor/datastreams/datastream_spec_solrizer.rb +18 -0
- data/lib/dor/datastreams/default_object_rights_ds.rb +52 -0
- data/lib/dor/datastreams/desc_metadata_ds.rb +39 -0
- data/lib/{datastreams → dor/datastreams}/embargo_metadata_ds.rb +25 -20
- data/lib/{datastreams → dor/datastreams}/events_ds.rb +14 -9
- data/lib/dor/datastreams/identity.xsl +8 -0
- data/lib/dor/datastreams/identity_metadata_ds.rb +112 -0
- data/lib/dor/datastreams/role_metadata_ds.rb +51 -0
- data/lib/dor/datastreams/simple_dublin_core_ds.rb +45 -0
- data/lib/dor/datastreams/version_metadata_ds.rb +214 -0
- data/lib/dor/datastreams/workflow_definition_ds.rb +113 -0
- data/lib/dor/datastreams/workflow_ds.rb +103 -0
- data/lib/dor/exceptions.rb +0 -1
- data/lib/dor/migrations/content_metadata_ds/change_content_type.rb +7 -0
- data/lib/dor/migrations/identifiable/assert_adminPolicy.rb +9 -0
- data/lib/dor/migrations/identifiable/fix_model_assertions.rb +13 -0
- data/lib/dor/migrations/identifiable/record_remediation.rb +18 -0
- data/lib/dor/migrations/identifiable/uriify_augmented_contentlocation_refs.rb +18 -0
- data/lib/dor/migrations/identifiable/uriify_contentlocation_refs.rb +18 -0
- data/lib/dor/migrations/processable/unify_workflows.rb +17 -0
- data/lib/dor/migrations/versionable/add_missing_version_md.rb +9 -0
- data/lib/dor/models/admin_policy_object.rb +16 -0
- data/lib/dor/models/assembleable.rb +14 -0
- data/lib/dor/models/collection.rb +14 -0
- data/lib/dor/models/contentable.rb +227 -0
- data/lib/dor/models/describable.rb +194 -0
- data/lib/dor/models/discoverable.rb +66 -0
- data/lib/dor/models/editable.rb +267 -0
- data/lib/dor/models/embargoable.rb +97 -0
- data/lib/dor/models/eventable.rb +12 -0
- data/lib/dor/models/governable.rb +162 -0
- data/lib/dor/models/identifiable.rb +211 -0
- data/lib/dor/models/item.rb +44 -0
- data/lib/dor/models/itemizable.rb +66 -0
- data/lib/dor/{mods2dc.xslt → models/mods2dc.xslt} +39 -12
- data/lib/dor/models/preservable.rb +50 -0
- data/lib/dor/models/processable.rb +229 -0
- data/lib/dor/models/publishable.rb +74 -0
- data/lib/dor/models/set.rb +12 -0
- data/lib/dor/models/shelvable.rb +27 -0
- data/lib/dor/models/upgradable.rb +74 -0
- data/lib/dor/models/versionable.rb +94 -0
- data/lib/dor/models/workflow_object.rb +54 -0
- data/lib/dor/services/cleanup_service.rb +47 -0
- data/lib/dor/services/digital_stacks_service.rb +55 -0
- data/lib/dor/services/merge_service.rb +96 -0
- data/lib/dor/{metadata_handlers → services/metadata_handlers}/catalog_handler.rb +0 -2
- data/lib/dor/{metadata_handlers → services/metadata_handlers}/mdtoolkit_handler.rb +0 -2
- data/lib/dor/{metadata_service.rb → services/metadata_service.rb} +1 -3
- data/lib/dor/services/registration_service.rb +181 -0
- data/lib/dor/services/sdr_ingest_service.rb +181 -0
- data/lib/dor/services/search_service.rb +131 -0
- data/lib/dor/services/suri_service.rb +32 -0
- data/lib/dor/services/technical_metadata_service.rb +226 -0
- data/lib/dor/{tei2dc.xslt → services/tei2dc.xslt} +0 -0
- data/lib/dor/utils/ng_tidy.rb +37 -0
- data/lib/dor/utils/predicate_patch.rb +23 -0
- data/lib/dor/utils/solr_doc_helper.rb +9 -0
- data/lib/dor/utils/utc_date_field_mapper.rb +7 -0
- data/lib/dor/version.rb +3 -0
- data/lib/dor/workflow/document.rb +131 -0
- data/lib/dor/workflow/graph.rb +166 -0
- data/lib/dor/workflow/process.rb +99 -0
- data/lib/gsearch/demoFoxmlToSolr.xslt +340 -122
- data/lib/tasks/dor.rake +39 -0
- metadata +494 -384
- data/lib/datastreams/content_metadata_ds.rb +0 -12
- data/lib/datastreams/identity_metadata_ds.rb +0 -28
- data/lib/datastreams/ng_tidy.rb +0 -19
- data/lib/datastreams/simple_dublin_core_ds.rb +0 -23
- data/lib/datastreams/workflow_definition_ds.rb +0 -105
- data/lib/datastreams/workflow_ds.rb +0 -16
- data/lib/dor/admin_policy_object.rb +0 -11
- data/lib/dor/base.rb +0 -81
- data/lib/dor/cleanup_service.rb +0 -32
- data/lib/dor/digital_stacks_service.rb +0 -82
- data/lib/dor/druid_utils.rb +0 -41
- data/lib/dor/embargo.rb +0 -41
- data/lib/dor/item.rb +0 -141
- data/lib/dor/provenance_metadata_service.rb +0 -65
- data/lib/dor/registration_service.rb +0 -87
- data/lib/dor/rsolr.rb +0 -27
- data/lib/dor/sdr_ingest_service.rb +0 -117
- data/lib/dor/search_service.rb +0 -86
- data/lib/dor/suri_service.rb +0 -37
- data/lib/dor/workflow_object.rb +0 -13
- data/lib/dor/workflow_service.rb +0 -111
- data/lib/xml_models/foxml.rb +0 -261
- data/lib/xml_models/identity_metadata/dublin_core.rb +0 -119
- data/lib/xml_models/identity_metadata/identity_metadata.rb +0 -288
@@ -0,0 +1,181 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'lyber-utils'
|
3
|
+
require 'moab_stanford'
|
4
|
+
require 'dor-services'
|
5
|
+
|
6
|
+
module Dor
|
7
|
+
class SdrIngestService
|
8
|
+
|
9
|
+
# @param [Dor::Item] dor_item The representation of the digital object
|
10
|
+
# @param [String] agreement_id deprecated, included for backward compatibility with common-accessioning
|
11
|
+
# @return [void] Create the moab manifests, export data to a BagIt bag, kick off the SDR ingest workflow
|
12
|
+
def self.transfer(dor_item, agreement_id=nil)
  # Builds the Moab manifests for the next version of the object, exports the
  # content and metadata into a BagIt bag, verifies the bag, and starts the
  # SDR ingest workflow. Any StandardError raised along the way is re-raised
  # as a LyberCore::Exceptions::ItemError carrying the druid and the cause.
  #
  # agreement_id is never read; it stays in the signature so existing callers
  # keep working.
  druid = dor_item.pid
  workspace = DruidTools::Druid.new(druid, Dor::Config.sdr.local_workspace_root)
  signature_catalog = get_signature_catalog(druid)
  new_version_id = signature_catalog.version_id + 1
  metadata_dir = extract_datastreams(dor_item, workspace)
  verify_version_metadata(metadata_dir, new_version_id)
  version_inventory = get_version_inventory(metadata_dir, druid, new_version_id)
  version_additions = signature_catalog.version_additions(version_inventory)
  content_additions = version_additions.group('content')
  if content_additions.nil? || content_additions.files.empty?
    # nothing new to copy, so there is no content directory to locate
    content_dir = nil
  else
    new_file_list = content_additions.path_list
    content_dir = workspace.find_filelist_parent('content', new_file_list)
  end
  content_group = version_inventory.group('content')
  unless content_group.nil? || content_group.files.empty?
    signature_catalog.normalize_group_signatures(content_group, content_dir)
  end
  # export the bag (in tar format)
  bag_dir = Pathname(Dor::Config.sdr.local_export_home).join(druid.sub('druid:', ''))
  bagger = Moab::Bagger.new(version_inventory, signature_catalog, bag_dir)
  bagger.reset_bag
  bagger.create_bag_inventory(:depositor)
  bagger.deposit_group('content', content_dir)
  bagger.deposit_group('metadata', metadata_dir)
  bagger.create_tagfiles
  verify_bag_structure(bag_dir)
  # Now bootstrap SDR workflow, but do not create the workflows datastream
  dor_item.initialize_workflow('sdrIngestWF', 'sdr', false)
rescue StandardError => e
  # StandardError, not Exception: Interrupt, SystemExit and NoMemoryError must
  # reach the process untouched instead of being reported as an export failure.
  raise LyberCore::Exceptions::ItemError.new(druid, "Export failure", e)
end
|
46
|
+
|
47
|
+
# @param [String] druid The object identifier
|
48
|
+
# @return [Moab::SignatureCatalog] the catalog of all files previously ingested
|
49
|
+
def self.get_signature_catalog(druid)
  # Fetches the object's signature catalog from the SDR REST service and
  # parses it. Only a 404 (object never ingested) is treated as "start from an
  # empty catalog at version 0"; any other failure propagates to the caller
  # instead of being silently mistaken for a brand-new object.
  sdr_client = Dor::Config.sdr.rest_client
  url = "objects/#{druid}/manifest/signatureCatalog.xml"
  response = sdr_client[url].get
  Moab::SignatureCatalog.parse(response)
rescue RestClient::ResourceNotFound
  Moab::SignatureCatalog.new(:digital_object_id => druid, :version_id => 0)
end
|
57
|
+
|
58
|
+
# @param [Dor::Item] dor_item The representation of the digital object
|
59
|
+
# @param [DruidTools::Druid] workspace The representation of the item's work area
|
60
|
+
# @return [Pathname] Pull all the datastreams specified in the configuration file
|
61
|
+
# into the workspace's metadata directory, overwriting existing file if present
|
62
|
+
def self.extract_datastreams(dor_item, workspace)
  # Writes one "<name>.xml" file per configured datastream into the
  # workspace's metadata directory and returns that directory.
  # The second argument to workspace.path was named `create` in the original
  # call; presumably it asks the workspace to create the directory.
  target_dir = Pathname.new(workspace.path('metadata', true))
  Config.sdr.datastreams.to_hash.each_pair do |name, requirement|
    ds_id = name.to_s
    xml = get_datastream_content(dor_item, ds_id, requirement)
    # nothing is written when no content came back for this datastream
    next unless xml
    target_dir.join("#{ds_id}.xml").open('w') { |io| io << xml }
  end
  target_dir
end
|
72
|
+
|
73
|
+
# @param [Dor::Item] dor_item The representation of the digital object
|
74
|
+
# @param [String] ds_name The name of the desired Fedora datastream
|
75
|
+
# @param [String] required Enumeration: one of ['required', 'optional']
|
76
|
+
# @return [String] return the xml text of the specified datastream if it exists.
|
77
|
+
# If not found, return nil unless it is a required datastream in which case raise exception
|
78
|
+
def self.get_datastream_content(dor_item, ds_name, required)
  # 'relationshipMetadata' is stored under the datastream id 'RELS-EXT'
  fedora_id = ds_name == 'relationshipMetadata' ? 'RELS-EXT' : ds_name
  streams = dor_item.datastreams
  # a datastream counts only if it is present and not flagged new?
  persisted = streams.keys.include?(fedora_id) && !streams[fedora_id].new?
  return streams[fedora_id].content if persisted
  return nil if required == 'optional'
  raise "required datastream #{ds_name} not found in DOR"
end
|
88
|
+
|
89
|
+
# @param [Pathname] metadata_dir the location of the metadata directory in the workspace
|
90
|
+
# @param [Integer] expected the version identifier expected to be used in the versionMetadata
|
91
|
+
def self.verify_version_metadata(metadata_dir, expected)
  # Reads the version id recorded in versionMetadata.xml and checks it against
  # the expected one; the check raises on mismatch, otherwise true is returned.
  version_file = metadata_dir.join("versionMetadata.xml")
  found = vmfile_version_id(version_file)
  verify_version_id(version_file, expected, found)
  true
end
|
96
|
+
|
97
|
+
# @param [Pathname] pathname The location of the file containing a version number
|
98
|
+
# @param [Integer] expected The version number that should be in the file
|
99
|
+
# @param [Integer] found The version number that is actually in the file
|
100
|
+
def self.verify_version_id(pathname, expected, found)
  # Returns true when both version numbers agree; otherwise raises a
  # RuntimeError naming the file and both values.
  if expected != found
    raise "Version mismatch in #{pathname}, expected #{expected}, found #{found}"
  end
  true
end
|
104
|
+
|
105
|
+
# @param [Pathname] pathname the location of the versionMetadata file
|
106
|
+
# @return [Integer] the versionId found in the last version element, or nil if missing
|
107
|
+
def self.vmfile_version_id(pathname)
  # Returns the versionId attribute of the last /versionMetadata/version
  # element as an Integer, or nil when the file has no version element or the
  # attribute is absent. Raises (via verify_pathname) if the file is missing.
  verify_pathname(pathname)
  # block form so the file handle is closed as soon as parsing is done
  doc = File.open(pathname.to_s) { |io| Nokogiri::XML(io) }
  last_version = doc.xpath("/versionMetadata/version").last
  # no <version> elements at all: report nil rather than fail on nil[]
  return nil if last_version.nil?
  version_id = last_version['versionId']
  version_id.nil? ? nil : version_id.to_i
end
|
114
|
+
|
115
|
+
# @param [Pathname] metadata_dir The location of the object's metadata files
|
116
|
+
# @param [String] druid The object identifier
|
117
|
+
# @param [Integer] version_id The version number
|
118
|
+
# @return [Moab::FileInventory] Generate and return a version inventory for the object
|
119
|
+
def self.get_version_inventory(metadata_dir, druid, version_id)
  # Starts from the content inventory and appends the metadata file group to
  # its groups, returning the combined inventory.
  get_content_inventory(metadata_dir, druid, version_id).tap do |inventory|
    inventory.groups << get_metadata_file_group(metadata_dir)
  end
end
|
124
|
+
|
125
|
+
# @param [Pathname] metadata_dir The location of the object's metadata files
|
126
|
+
# @param [String] druid The object identifier
|
127
|
+
# @param [Integer] version_id The version number
|
128
|
+
# @return [Moab::FileInventory] Parse the contentMetadata
|
129
|
+
# and generate a new version inventory object containing a content group
|
130
|
+
def self.get_content_inventory(metadata_dir, druid, version_id)
  # When contentMetadata.xml exists, builds the inventory from it using the
  # 'preserve' subset; otherwise returns an empty "version" inventory for the
  # object.
  content_metadata = get_content_metadata(metadata_dir)
  if content_metadata
    Stanford::ContentInventory.new.inventory_from_cm(content_metadata, druid, 'preserve', version_id)
  else
    # Fully qualified like the other Moab classes in this service, so the
    # lookup does not depend on Moab having been included at the top level.
    Moab::FileInventory.new(:type => "version", :digital_object_id => druid, :version_id => version_id)
  end
end
|
138
|
+
|
139
|
+
# @param [Pathname] metadata_dir The location of the object's metadata files
|
140
|
+
# @return [String] Return the contents of the contentMetadata.xml file from the metadata directory, or nil if the file does not exist
|
141
|
+
def self.get_content_metadata(metadata_dir)
  # Returns the text of contentMetadata.xml, or nil when the file is absent.
  cm_file = metadata_dir.join('contentMetadata.xml')
  cm_file.exist? ? cm_file.read : nil
end
|
149
|
+
|
150
|
+
# @param [Pathname] metadata_dir The location of the object's metadata files
|
151
|
+
# @return [Moab::FileGroup] Traverse the metadata directory and generate a metadata group
|
152
|
+
def self.get_metadata_file_group(metadata_dir)
  # Builds the 'metadata' file group from the files found in metadata_dir.
  # Moab::FileGroup is fully qualified, as the other Moab classes in this
  # service are, so it resolves without a top-level `include Moab`.
  Moab::FileGroup.new(:group_id => 'metadata').group_from_directory(metadata_dir)
end
|
156
|
+
|
157
|
+
# @param [Pathname] bag_dir the location of the bag to be verified
|
158
|
+
# @return [Boolean] true if all required files exist, raises exception if not
|
159
|
+
def self.verify_bag_structure(bag_dir)
  # Checks the bag directory itself and then each required entry, in order;
  # verify_pathname raises on the first one that is missing.
  verify_pathname(bag_dir)
  required_entries = [
    ['data'],
    ['bagit.txt'],
    ['bag-info.txt'],
    ['manifest-sha256.txt'],
    ['tagmanifest-sha256.txt'],
    ['versionAdditions.xml'],
    ['versionInventory.xml'],
    ['data', 'metadata', 'versionMetadata.xml']
  ]
  required_entries.each { |parts| verify_pathname(bag_dir.join(*parts)) }
  true
end
|
171
|
+
|
172
|
+
# @param [Pathname] pathname The file whose existence should be verified
|
173
|
+
# @return [Boolean] true if file exists, raises exception if not
|
174
|
+
def self.verify_pathname(pathname)
  # true when the path exists; otherwise a RuntimeError naming the missing
  # entry and its full location.
  return true if pathname.exist?
  raise "#{pathname.basename} not found at #{pathname}"
end
|
178
|
+
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'active_support/core_ext'
|
3
|
+
|
4
|
+
module Dor
|
5
|
+
|
6
|
+
class SearchService
|
7
|
+
|
8
|
+
include Solrizer::FieldNameMapper
|
9
|
+
RISEARCH_TEMPLATE = "select $object from <#ri> where $object <dc:identifier> '%s'"
|
10
|
+
@@index_version = nil
|
11
|
+
|
12
|
+
class << self
|
13
|
+
|
14
|
+
def index_version
  # Memoized: the stylesheet is read only while the cached value is nil.
  return @@index_version unless @@index_version.nil?
  xslt_path = File.expand_path('../../../gsearch/demoFoxmlToSolr.xslt', __FILE__)
  stylesheet = Nokogiri::XML(File.read(xslt_path))
  version_node = stylesheet.at_xpath('/xsl:stylesheet/xsl:variable[@name="INDEXVERSION"]/text()')
  # to_s turns a missing node (nil) into "", so the cache is always a String
  @@index_version = version_node.to_s
end
|
21
|
+
|
22
|
+
def reindex(*pids)
  # Issues one gsearch updateIndex request per pid, working through the pids
  # in batches of 20. After each batch the batch is yielded to the optional
  # block. Returns the full list of pids.
  gsearch_client = Config.gsearch.rest_client
  pids.each_slice(20) do |batch|
    batch.each do |pid|
      gsearch_client["?operation=updateIndex&action=fromPid&value=#{pid}"].get
    end
    yield batch if block_given?
  end
  pids
end
|
30
|
+
|
31
|
+
def risearch(query, opts = {})
  # Posts an iTQL tuples query to the Fedora resource index and returns the
  # matching pids with the "info:fedora/" prefix removed.
  # opts may carry :timeout (applied to the client) plus any request
  # parameters that should override the defaults below.
  overrides = opts.dup # work on a copy so the caller's hash is left untouched
  client = Config.fedora.client['risearch']
  client.options[:timeout] = overrides.delete(:timeout)
  query_params = {
    :type => 'tuples',
    :lang => 'itql',
    :format => 'CSV',
    :limit => '1000',
    :stream => 'on',
    :query => query
  }.merge(overrides)
  result = client.post(query_params)
  # The first CSV line is the column header. drop(1) yields [] for an empty
  # body, where the previous [1..-1] slice returned nil and then failed.
  result.split(/\n/).drop(1).collect { |pid| pid.chomp.sub(/^info:fedora\//, '') }
end
|
45
|
+
|
46
|
+
def iterate_over_pids(opts = {}, &block)
  # Pages through a resource-index query, yielding pids one at a time
  # (:mode => :single, the default) or a page at a time (any other mode).
  # Missing :query, :in_groups_of and :mode entries are filled in on opts.
  opts[:query] ||= "select $object from <#ri> where $object <info:fedora/fedora-system:def/model#label> $label"
  opts[:in_groups_of] ||= 100
  opts[:mode] ||= :single
  fetch_page = lambda do |offset|
    Dor::SearchService.risearch("#{opts[:query]} limit #{opts[:in_groups_of]} offset #{offset}")
  end
  position = 0
  page = fetch_page.call(position)
  while page.present?
    if opts[:mode] == :single
      page.each { |pid| yield pid }
    else
      yield page
    end
    # advance by what was actually returned, then ask for the next page
    position += page.length
    page = fetch_page.call(position)
  end
end
|
62
|
+
|
63
|
+
def gsearch(params)
  # Runs a Solr select through the gsearch client and returns the parsed JSON
  # response. :wt => 'json' is always added to params. An Array value is
  # sent as one "key=value" pair per element.
  client = Config.gsearch.client
  query_params = params.merge(:wt => 'json')
  query_string = query_params.collect { |key, value|
    values = value.is_a?(Array) ? value : [value]
    # encode_www_form_component escapes '&', '=', '+' and friends, which the
    # obsolete URI.encode left intact and which corrupt the query string.
    values.collect { |v| "#{key}=#{URI.encode_www_form_component(v.to_s)}" }.join('&')
  }.join('&')
  JSON.parse(client["select?#{query_string}"].get)
end
|
75
|
+
|
76
|
+
def query(query, args = {})
  # Without a block: runs the Solr query once and returns the response.
  # With a block: yields each page of results, fetching the next page for as
  # long as the block returns a truthy value and the page has documents;
  # returns nil in that case.
  params = args.merge(:q => query)
  params[:start] ||= 0
  resp = solr.find(params)
  return resp unless block_given?

  keep_going = true
  while keep_going && resp.docs.length > 0
    keep_going = yield(resp)
    # when the caller gave no :rows, the first page's size becomes the step
    params[:rows] ||= resp.docs.length
    params[:start] += params[:rows]
    resp = solr.find(params)
  end
end
|
92
|
+
|
93
|
+
def query_by_id(id)
  # Accepts "source:value", ['source', 'value'] or { :source => 'value' }
  # (first pair only) and returns the ids of all Solr documents whose
  # identifier field matches.
  id = case id
       when Hash then id.collect { |*pair| pair.join(':') }.first
       when Array then id.join(':')
       else id
       end
  solr_query = %{#{solr_name 'identifier', :string}:"#{id}"}
  found = []
  query(solr_query, :fl => 'id', :rows => 1000) do |page|
    found += page.docs.collect { |doc| doc['id'] }
    true # keep paging until query runs out of documents
  end
  found
end
|
107
|
+
|
108
|
+
def solr
  # Memoized Solr connection: ActiveFedora's connection is reused when it is
  # an RSolr::Client, otherwise a new one is built from Dor::Config.
  @@solr ||= begin
    active_fedora_conn = ActiveFedora.solr.conn
    if active_fedora_conn.is_a?(RSolr::Client)
      active_fedora_conn
    else
      Dor::Config.make_solr_connection
    end
  end
end
|
111
|
+
|
112
|
+
# @return [String] druid of the SDR Graveyard APO,
|
113
|
+
# nil if APO does not exist in the currently configured environment
|
114
|
+
def sdr_graveyard_apo_druid
  # Memoizes the lookup. A nil result is not kept, so the search is repeated
  # on the next call until a druid is found.
  cached = defined?(@@sdr_graveyard_apo) && @@sdr_graveyard_apo
  return cached if cached
  @@sdr_graveyard_apo = find_sdr_graveyard_apo_druid
end
|
117
|
+
|
118
|
+
def find_sdr_graveyard_apo_druid
  # Searches Solr for the object titled "SDR Graveyard" and returns the :id
  # of the first hit, or nil when nothing matches.
  docs = Dor::SearchService.query('dc_title_t:"SDR Graveyard"', :fl => 'id').docs
  docs.empty? ? nil : docs.first[:id]
end
|
126
|
+
|
127
|
+
end
|
128
|
+
|
129
|
+
end
|
130
|
+
|
131
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'rest-client'
|
2
|
+
require 'active_fedora'
|
3
|
+
|
4
|
+
module Dor
|
5
|
+
class SuriService
|
6
|
+
# If Dor::Config.suri.mint_ids is set to true, then this method
|
7
|
+
# returns Config.suri.id_namespace:id_from_suri
|
8
|
+
# Throws an exception if there were any problems
|
9
|
+
def self.mint_id(quantity=nil)
  # With a numeric quantity an Array of ids is returned; with no argument a
  # single id is returned. Ids come from the SURI service when
  # Config.suri.mint_ids is set, otherwise from Fedora's next_pid.
  # Nothing is rescued here, so failures propagate to the caller.
  want_array = quantity.is_a?(Numeric)
  quantity = 1 if quantity.nil?
  ids =
    if Config.suri.mint_ids
      # Post with no body
      resource = RestClient::Resource.new("#{Config.suri.url}/suri2/namespaces/#{Config.suri.id_namespace}",
                                          :user => Config.suri.user, :password => Config.suri.pass)
      minted = resource["identifiers?quantity=#{quantity}"].post('')
      # one identifier per response line, each prefixed with the namespace
      minted.chomp.split(/\n/).collect { |id| "#{Config.suri.id_namespace}:#{id.strip}" }
    else
      repo = ActiveFedora::Base.respond_to?(:connection_for_pid) ? ActiveFedora::Base.connection_for_pid(0) : ActiveFedora.fedora.connection
      pid_list = Nokogiri::XML(repo.next_pid(:numPIDs => quantity))
      pid_list.xpath('/pidList/pid').collect { |node| node.text }
    end
  want_array ? ids : ids.first
end
|
29
|
+
|
30
|
+
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,226 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'moab_stanford'
|
3
|
+
require 'jhove_service'
|
4
|
+
require 'dor-services'
|
5
|
+
|
6
|
+
module Dor
|
7
|
+
|
8
|
+
class TechnicalMetadataService
|
9
|
+
|
10
|
+
# @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
|
11
|
+
# @return [Boolean] True if technical metadata is correctly added or updated
|
12
|
+
def self.add_update_technical_metadata(dor_item)
  # Generates technical metadata for new or changed content files, merges it
  # with the previous version's metadata when there is one, and saves the
  # result to the item's technicalMetadata datastream. Returns true.
  test_jhove_service
  druid = dor_item.pid
  content_group_diff = get_content_group_diff(dor_item)
  deltas = get_file_deltas(content_group_diff)
  changed_files = get_new_files(deltas)
  previous_techmd = get_old_technical_metadata(dor_item)
  fresh_techmd = get_new_technical_metadata(druid, changed_files)

  if previous_techmd.nil?
    # this is version 1 or previous technical metadata was not saved
    final_techmd = fresh_techmd
  else
    # there have been no changes to content files from previous version
    return true if content_group_diff.difference_count == 0
    merged_nodes = merge_file_nodes(previous_techmd, fresh_techmd, deltas)
    final_techmd = build_technical_metadata(druid, merged_nodes)
  end

  datastream = dor_item.datastreams["technicalMetadata"]
  datastream.dsLabel = 'Technical Metadata'
  datastream.content = final_techmd
  datastream.save
  true
end
|
36
|
+
|
37
|
+
# @return [Boolean] Make sure that the jhove-service gem is loaded
|
38
|
+
def self.test_jhove_service
  # Nothing to do when ::JhoveService is already defined. Otherwise try to
  # load the gem; a LoadError is printed and replaced by a RuntimeError
  # explaining how to install it.
  return if defined?(::JhoveService)
  begin
    require 'jhove_service'
  rescue LoadError => e
    puts e.inspect
    raise "jhove-service dependency gem was not found.  Please add it to your Gemfile and run bundle install"
  end
end
|
48
|
+
|
49
|
+
# @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
|
50
|
+
# @return [FileGroupDifference] The differences between two versions of a group of files
|
51
|
+
def self.get_content_group_diff(dor_item)
  # Asks the item for its content diff over the 'all' subset, parses the XML
  # into an inventory difference, and returns the "content" group's part.
  diff_xml = dor_item.get_content_diff('all')
  Moab::FileInventoryDifference.parse(diff_xml).group_difference("content")
end
|
57
|
+
|
58
|
+
# @param [FileGroupDifference] content_group_diff
|
59
|
+
# @return [Hash<Symbol,Array>] Sets of filenames grouped by change type for use in performing file or metadata operations
|
60
|
+
def self.get_file_deltas(content_group_diff)
  # Thin accessor: hands back the diff's file_deltas unchanged.
  content_group_diff.file_deltas
end
|
64
|
+
|
65
|
+
# @param [Hash<Symbol,Array>] deltas Sets of filenames grouped by change type for use in performing file or metadata operations
|
66
|
+
# @return [Array<String>] The list of filenames for files that are either added or modified since the previous version
|
67
|
+
def self.get_new_files(deltas)
  # Added files first, then modified ones, in a new array.
  added = deltas[:added]
  modified = deltas[:modified]
  added + modified
end
|
70
|
+
|
71
|
+
# @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
|
72
|
+
# @return [String] The technicalMetadata datastream from the previous version of the digital object
|
73
|
+
def self.get_old_technical_metadata(dor_item)
  # Prefers the copy held by SDR; the DOR datastream is consulted only when
  # SDR returned nil.
  from_sdr = get_sdr_technical_metadata(dor_item.pid)
  from_sdr.nil? ? get_dor_technical_metadata(dor_item) : from_sdr
end
|
78
|
+
|
79
|
+
# @param [String] druid The identifier of the digital object being processed by the technical metadata robot
|
80
|
+
# @return [String] The technicalMetadata datastream from the previous version of the digital object (fetched from SDR storage)
|
81
|
+
# The data is updated to the latest format.
|
82
|
+
def self.get_sdr_technical_metadata(druid)
  # Returns SDR's technicalMetadata for the druid: as-is when it already
  # contains a <technicalMetadata element, upgraded through JhoveService when
  # it is raw <jhove output, and nil when SDR has none or the content is
  # neither of those.
  stored =
    begin
      get_sdr_metadata(druid, "technicalMetadata")
    rescue RestClient::ResourceNotFound
      return nil
    end

  if stored =~ /<technicalMetadata/
    stored
  elsif stored =~ /<jhove/
    ::JhoveService.new.upgrade_technical_metadata(stored)
  end
end
|
96
|
+
|
97
|
+
# @param [Dor::Item] dor_item The DOR item being processed by the technical metadata robot
|
98
|
+
# @return [String] The technicalMetadata datastream from the previous version of the digital object (fetched from DOR fedora).
|
99
|
+
# The data is updated to the latest format.
|
100
|
+
def self.get_dor_technical_metadata(dor_item)
  # Same contract as the SDR variant, but reads the item's own
  # technicalMetadata datastream. nil when the datastream is absent, flagged
  # new?, or holds neither recognised format.
  ds_id = "technicalMetadata"
  streams = dor_item.datastreams
  persisted = streams.keys.include?(ds_id) && !streams[ds_id].new?
  return nil unless persisted

  stored = streams[ds_id].content
  if stored =~ /<technicalMetadata/
    stored
  elsif stored =~ /<jhove/
    ::JhoveService.new.upgrade_technical_metadata(stored)
  end
end
|
115
|
+
|
116
|
+
# @param [String] druid The identifier of the digital object being processed by the technical metadata robot
|
117
|
+
# @param [String] dsname The identifier of the metadata datastream
|
118
|
+
# @return [String] The datastream contents from the previous version of the digital object (fetched from SDR storage)
|
119
|
+
def self.get_sdr_metadata(druid, dsname)
  # GETs objects/<druid>/metadata/<dsname>.xml from the configured SDR REST
  # client and returns the response.
  resource_path = "objects/#{druid}/metadata/#{dsname}.xml"
  Dor::Config.sdr.rest_client[resource_path].get
end
|
125
|
+
|
126
|
+
# @param [String] druid The identifier of the digital object; also used to locate the object's workspace
|
127
|
+
# @param [Array<String>] new_files The list of filenames for files that are either added or modified since the previous version
|
128
|
+
# @return [String] The technicalMetadata datastream for the new files of the new digital object version
|
129
|
+
def self.get_new_technical_metadata(druid, new_files)
  # Runs JHOVE over the listed files in the object's workspace and returns
  # the resulting technical metadata text; nil when there are no files.
  return nil if new_files.nil? || new_files.empty?
  druid_workspace = DruidTools::Druid.new(druid, Dor::Config.sdr.local_workspace_root)
  content_dir = druid_workspace.find_filelist_parent('content', new_files)
  scratch_dir = druid_workspace.temp_dir
  jhove = ::JhoveService.new(scratch_dir)
  jhove.digital_object_id = druid
  # JHOVE is pointed at a text file listing the files to examine
  fileset_file = write_fileset(scratch_dir, new_files)
  jhove_output = jhove.run_jhove(content_dir, fileset_file)
  IO.read(jhove.create_technical_metadata(jhove_output))
end
|
141
|
+
|
142
|
+
# @param [Pathname] temp_dir The pathname of the temp folder in the object's workspace area
|
143
|
+
# @param [Array<String>] new_files The list of filenames for files that are either added or modified since the previous version
|
144
|
+
# @return [Pathname] Save the new_files list to a text file and return that file's name
|
145
|
+
def self.write_fileset(temp_dir, new_files)
  # Writes the filenames, one per line, to jhove_fileset.txt inside temp_dir
  # and returns that file's Pathname.
  list_file = Pathname(temp_dir).join('jhove_fileset.txt')
  File.open(list_file.to_s, 'w') { |io| io.puts(new_files) }
  list_file
end
|
150
|
+
|
151
|
+
# @param [String] old_techmd The technicalMetadata datastream from the previous version of the digital object
|
152
|
+
# @param [String] new_techmd The technicalMetadata datastream for the new files of the new digital object version
|
153
|
+
# @param [Hash<Symbol,Array>] deltas Sets of filenames grouped by change type (:identical, :modified, :added, :renamed, :copyadded)
|
154
|
+
# @return [Hash<String,String>] The complete set of technicalMetadata file nodes (as XML text) for the digital object, indexed by filename
|
155
|
+
def self.merge_file_nodes(old_techmd, new_techmd, deltas)
  # Assembles the per-file XML for the new version:
  #   identical        -> node taken from the old metadata
  #   modified, added  -> node taken from the new metadata
  #   renamed, copyadded -> copy of the old node with its opening <file> tag
  #                         rewritten to carry the new path
  # Returns a Hash of filename => XML text.
  old_file_nodes = get_file_nodes(old_techmd)
  new_file_nodes = get_file_nodes(new_techmd)
  merged_nodes = {}
  deltas[:identical].each do |path|
    merged_nodes[path] = old_file_nodes[path]
  end
  [:modified, :added].each do |change|
    deltas[change].each { |path| merged_nodes[path] = new_file_nodes[path] }
  end
  [:renamed, :copyadded].each do |change|
    deltas[change].each do |oldpath, newpath|
      # Block form of sub: the replacement is inserted literally, so a
      # backslash sequence in the filename (e.g. "\0") is not expanded as a
      # regexp back-reference. sub (not sub!) leaves the old node untouched.
      # NOTE(review): newpath is inserted without XML escaping, as before.
      merged_nodes[newpath] = old_file_nodes[oldpath].sub(/<file\s*id.*?["'].*?["'].*?>/) { "<file id='#{newpath}'>" }
    end
  end
  merged_nodes
end
|
180
|
+
|
181
|
+
# @param [String] technical_metadata A technicalMetadata datastream contents
|
182
|
+
# @return [Hash<String,String>] The set of file nodes (as XML text) from a technicalMetadata datastream, indexed by filename
|
183
|
+
def self.get_file_nodes(technical_metadata)
  # Splits a technicalMetadata document into its <file>...</file> sections by
  # scanning line by line, and returns them as a Hash keyed by the value of
  # each section's id attribute. Returns an empty Hash for nil input.
  # Assumes each opening <file ...> tag and each </file> tag starts its own line.
  file_hash = {}
  return file_hash if technical_metadata.nil?
  current_file = []
  path = nil
  in_file = false
  technical_metadata.each_line do |line|
    # Capture the id attribute specifically, closed by the same quote that
    # opened it. The previous pattern took the last quoted value on the line,
    # which was wrong when another attribute followed the id or when a
    # double-quoted id contained an apostrophe.
    if line =~ /^\s*<file\s[^>]*?\bid\s*=\s*(["'])(.*?)\1/
      current_file << line
      path = Regexp.last_match(2)
      in_file = true
    elsif line =~ /^\s*<\/file>/
      # a closing tag with no recognised opening tag is ignored rather than
      # stored under a nil key
      if in_file
        current_file << line
        file_hash[path] = current_file.join
      end
      current_file = []
      path = nil
      in_file = false
    elsif in_file
      current_file << line
    end
  end
  file_hash
end
|
206
|
+
|
207
|
+
# @param [String] druid The identifier of the digital object being processed by the technical metadata robot
|
208
|
+
# @param [Hash<String,String>] merged_nodes The complete set of technicalMetadata file nodes (as XML text) for the digital object, indexed by filename
|
209
|
+
# @return [String] The finalized technicalMetadata datastream contents for the new object version
|
210
|
+
def self.build_technical_metadata(druid, merged_nodes)
  # Wraps the file nodes, ordered by filename, in a <technicalMetadata> root
  # element stamped with the druid and the current UTC time.
  xml = <<-EOF
<technicalMetadata objectId='#{druid}' datetime='#{Time.now.utc.iso8601}'
xmlns:jhove='http://hul.harvard.edu/ois/xml/ns/jhove'
xmlns:mix='http://www.loc.gov/mix/v10'
xmlns:textmd='info:lc/xmlns/textMD-v3'>
  EOF
  merged_nodes.sort_by { |path, _node| path }.each { |_path, node| xml << node }
  xml << "</technicalMetadata>"
  xml
end
|
222
|
+
|
223
|
+
end
|
224
|
+
|
225
|
+
end
|
226
|
+
|