jhove-service 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.rdoc +23 -0
- data/README.rdoc +36 -0
- data/bin/aiff-hul-1.6.1-RC1.jar +0 -0
- data/bin/ascii-hul-1.4.1.jar +0 -0
- data/bin/cache/xhtml-attribs-1.xsd +73 -0
- data/bin/cache/xhtml-base-1.xsd +36 -0
- data/bin/cache/xhtml-bdo-1.xsd +71 -0
- data/bin/cache/xhtml-blkphras-1.xsd +160 -0
- data/bin/cache/xhtml-blkpres-1.xsd +37 -0
- data/bin/cache/xhtml-blkstruct-1.xsd +49 -0
- data/bin/cache/xhtml-csismap-1.xsd +96 -0
- data/bin/cache/xhtml-datatypes-1.xsd +242 -0
- data/bin/cache/xhtml-edit-1.xsd +39 -0
- data/bin/cache/xhtml-events-1.xsd +130 -0
- data/bin/cache/xhtml-form-1.xsd +327 -0
- data/bin/cache/xhtml-framework-1.xsd +66 -0
- data/bin/cache/xhtml-hypertext-1.xsd +47 -0
- data/bin/cache/xhtml-image-1.xsd +46 -0
- data/bin/cache/xhtml-inlphras-1.xsd +163 -0
- data/bin/cache/xhtml-inlpres-1.xsd +39 -0
- data/bin/cache/xhtml-inlstruct-1.xsd +50 -0
- data/bin/cache/xhtml-inlstyle-1.xsd +27 -0
- data/bin/cache/xhtml-link-1.xsd +45 -0
- data/bin/cache/xhtml-list-1.xsd +99 -0
- data/bin/cache/xhtml-meta-1.xsd +54 -0
- data/bin/cache/xhtml-object-1.xsd +76 -0
- data/bin/cache/xhtml-param-1.xsd +51 -0
- data/bin/cache/xhtml-pres-1.xsd +51 -0
- data/bin/cache/xhtml-ruby-1.xsd +170 -0
- data/bin/cache/xhtml-script-1.xsd +71 -0
- data/bin/cache/xhtml-ssismap-1.xsd +43 -0
- data/bin/cache/xhtml-struct-1.xsd +130 -0
- data/bin/cache/xhtml-style-1.xsd +53 -0
- data/bin/cache/xhtml-table-1.xsd +272 -0
- data/bin/cache/xhtml-target-1.xsd +49 -0
- data/bin/cache/xhtml-text-1.xsd +67 -0
- data/bin/cache/xhtml11-model-1.xsd +716 -0
- data/bin/cache/xhtml11-modules-1.xsd +605 -0
- data/bin/cache/xhtml11.xsd +104 -0
- data/bin/cache/xml.xsd +287 -0
- data/bin/console +9 -0
- data/bin/extension-mimetype.conf +209 -0
- data/bin/gif-hul-1.4.2-RC1.jar +0 -0
- data/bin/html-hul-1.4.1.jar +0 -0
- data/bin/jhove-README-1st.txt +2 -0
- data/bin/jhove-apps-1.24.0-RC1.jar +0 -0
- data/bin/jhove-ext-modules-1.24.0-RC1.jar +0 -0
- data/bin/jhove.conf +98 -0
- data/bin/jhoveToolkit.sh +87 -0
- data/bin/jpeg-hul-1.5.2-RC1.jar +0 -0
- data/bin/jpeg2000-hul-1.4.2-RC1.jar +0 -0
- data/bin/pdf-hul-1.12.2-RC1.jar +0 -0
- data/bin/tiff-hul-1.9.2-RC1.jar +0 -0
- data/bin/utf8-hul-1.7.1.jar +0 -0
- data/bin/wave-hul-1.8.1-RC1.jar +0 -0
- data/bin/xml-hul-1.5.1.jar +0 -0
- data/lib/jhove_service.rb +127 -0
- data/lib/jhove_technical_metadata.rb +284 -0
- metadata +172 -0
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/bin/jhove.conf
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<jhoveConfig version="1.0"
|
3
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
4
|
+
xmlns="http://hul.harvard.edu/ois/xml/ns/jhove/jhoveConfig"
|
5
|
+
xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/jhove/jhoveConfig
|
6
|
+
http://cosimo.stanford.edu/standards/jhove/v1/jhoveConfig.xsd">
|
7
|
+
<jhoveHome></jhoveHome>
|
8
|
+
<defaultEncoding>utf-8</defaultEncoding>
|
9
|
+
<tempDirectory>/tmp</tempDirectory>
|
10
|
+
<bufferSize>131072</bufferSize>
|
11
|
+
<mixVersion>1.0</mixVersion>
|
12
|
+
<sigBytes>1024</sigBytes>
|
13
|
+
<module>
|
14
|
+
<class>edu.harvard.hul.ois.jhove.module.AiffModule</class>
|
15
|
+
</module>
|
16
|
+
<module>
|
17
|
+
<class>edu.harvard.hul.ois.jhove.module.WaveModule</class>
|
18
|
+
</module>
|
19
|
+
<module>
|
20
|
+
<class>edu.harvard.hul.ois.jhove.module.PdfModule</class>
|
21
|
+
<param>p</param>
|
22
|
+
</module>
|
23
|
+
<module>
|
24
|
+
<class>com.mcgath.jhove.module.PngModule</class>
|
25
|
+
</module>
|
26
|
+
<module>
|
27
|
+
<class>edu.harvard.hul.ois.jhove.module.Jpeg2000Module</class>
|
28
|
+
</module>
|
29
|
+
<module>
|
30
|
+
<class>edu.harvard.hul.ois.jhove.module.JpegModule</class>
|
31
|
+
</module>
|
32
|
+
<module>
|
33
|
+
<class>edu.harvard.hul.ois.jhove.module.GifModule</class>
|
34
|
+
</module>
|
35
|
+
<module>
|
36
|
+
<class>edu.harvard.hul.ois.jhove.module.TiffModule</class>
|
37
|
+
</module>
|
38
|
+
<module>
|
39
|
+
<class>edu.harvard.hul.ois.jhove.module.XmlModule</class>
|
40
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml11.xsd;cache/xhtml11.xsd</param>
|
41
|
+
<param>schema=http://www.w3.org/2001/xml.xsd;cache/xml.xsd</param>
|
42
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml11-model-1.xsd;cache/xhtml11-model-1.xsd</param>
|
43
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml11-model-1.xsd;cache/xhtml11-model-1.xsd</param>
|
44
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml11-modules-1.xsd;cache/xhtml11-modules-1.xsd</param>
|
45
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml11-modules-1.xsd;cache/xhtml11-modules-1.xsd</param>
|
46
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-attribs-1.xsd;cache/xhtml-attribs-1.xsd</param>
|
47
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-attribs-1.xsd;cache/xhtml-attribs-1.xsd</param>
|
48
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-base-1.xsd;cache/xhtml-base-1.xsd</param>
|
49
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-blkphras-1.xsd;cache/xhtml-blkphras-1.xsd</param>
|
50
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-blkpres-1.xsd;cache/xhtml-blkpres-1.xsd</param>
|
51
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-blkstruct-1.xsd;cache/xhtml-blkstruct-1.xsd</param>
|
52
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-bdo-1.xsd;cache/xhtml-bdo-1.xsd</param>
|
53
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-csismap-1.xsd;cache/xhtml-csismap-1.xsd</param>
|
54
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-datatypes-1.xsd;cache/xhtml-datatypes-1.xsd</param>
|
55
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-edit-1.xsd;cache/xhtml-edit-1.xsd</param>
|
56
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-events-1.xsd;cache/xhtml-events-1.xsd</param>
|
57
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-form-1.xsd;cache/xhtml-form-1.xsd</param>
|
58
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-framework-1.xsd;cache/xhtml-framework-1.xsd</param>
|
59
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-hypertext-1.xsd;cache/xhtml-hypertext-1.xsd</param>
|
60
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-image-1.xsd;cache/xhtml-image-1.xsd</param>
|
61
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-inlphras-1.xsd;cache/xhtml-inlphras-1.xsd</param>
|
62
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-inlpres-1.xsd;cache/xhtml-inlpres-1.xsd</param>
|
63
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-inlstruct-1.xsd;cache/xhtml-inlstruct-1.xsd</param>
|
64
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-inlstyle-1.xsd;cache/xhtml-inlstyle-1.xsd</param>
|
65
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-link-1.xsd;cache/xhtml-link-1.xsd</param>
|
66
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-list-1.xsd;cache/xhtml-list-1.xsd</param>
|
67
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-meta-1.xsd;cache/xhtml-meta-1.xsd</param>
|
68
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-object-1.xsd;cache/xhtml-object-1.xsd</param>
|
69
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-param-1.xsd;cache/xhtml-param-1.xsd</param>
|
70
|
+
<param>schema=https://www.w3.org/markUp/schema/xhtml-param-1.xsd;cache/xhtml-param-1.xsd</param>
|
71
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-pres-1.xsd;cache/xhtml-pres-1.xsd</param>
|
72
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-ruby-1.xsd;cache/xhtml-ruby-1.xsd</param>
|
73
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-script-1.xsd;cache/xhtml-script-1.xsd</param>
|
74
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-ssismap-1.xsd;cache/xhtml-ssismap-1.xsd</param>
|
75
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-struct-1.xsd;cache/xhtml-struct-1.xsd</param>
|
76
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-style-1.xsd;cache/xhtml-style-1.xsd</param>
|
77
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-table-1.xsd;cache/xhtml-table-1.xsd</param>
|
78
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-target-1.xsd;cache/xhtml-target-1.xsd</param>
|
79
|
+
<param>schema=http://www.w3.org/markUp/schema/xhtml-text-1.xsd;cache/xhtml-text-1.xsd</param> </module>
|
80
|
+
<module>
|
81
|
+
<class>edu.harvard.hul.ois.jhove.module.HtmlModule</class>
|
82
|
+
</module>
|
83
|
+
<module>
|
84
|
+
<class>edu.harvard.hul.ois.jhove.module.WarcModule</class>
|
85
|
+
</module>
|
86
|
+
<module>
|
87
|
+
<class>edu.harvard.hul.ois.jhove.module.GzipModule</class>
|
88
|
+
</module>
|
89
|
+
<module>
|
90
|
+
<class>edu.harvard.hul.ois.jhove.module.AsciiModule</class>
|
91
|
+
</module>
|
92
|
+
<module>
|
93
|
+
<class>edu.harvard.hul.ois.jhove.module.Utf8Module</class>
|
94
|
+
</module>
|
95
|
+
<outputHandler>
|
96
|
+
<class>edu.harvard.hul.ois.jhove.handler.XmlHandler</class>
|
97
|
+
</outputHandler>
|
98
|
+
</jhoveConfig>
|
data/bin/jhoveToolkit.sh
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
|
3
|
+
########################################################################
|
4
|
+
# JHOVE - JSTOR/Harvard Object Validation Environment
|
5
|
+
# Copyright 2003-2005 by JSTOR and the President and Fellows of Harvard College
|
6
|
+
# JHOVE is made available under the GNU General Public License (see the
|
7
|
+
# file LICENSE for details)
|
8
|
+
#
|
9
|
+
# Usage: jhove [-c config] [-m module] [-h handler] [-e encoding] [-H handler]
|
10
|
+
# [-o output] [-x saxclass] [-t tempdir] [-b bufsize]
|
11
|
+
# [-l loglevel] [[-krs] dir-file-or-uri [...]]
|
12
|
+
#
|
13
|
+
# where -c config Configuration file pathname
|
14
|
+
# -m module Module name
|
15
|
+
# -h handler Output handler name (defaults to TEXT)
|
16
|
+
# -e encoding Character encoding of output handler (defaults to UTF-8)
|
17
|
+
# -H handler About handler name
|
18
|
+
# -o output Output file pathname (defaults to standard output)
|
19
|
+
# -x saxclass SAX parser class (defaults to J2SE 1.4 default)
|
20
|
+
# -t tempdir Temporary directory in which to create temporary files
|
21
|
+
# -b bufsize Buffer size for buffered I/O (defaults to J2SE 1.4 default)
|
22
|
+
# -k Calculate CRC32, MD5, and SHA-1 checksums
|
23
|
+
# -r Display raw data flags, not textual equivalents
|
24
|
+
# -s Format identification based on internal signatures only
|
25
|
+
# dir-file-or-uri Directory, file pathname or URI of formatted content
|
26
|
+
#
|
27
|
+
# CHANGE for JHOVE 1.8:
|
28
|
+
# You no longer have to figure out where JAVA_HOME is; that's the
|
29
|
+
# operating system's job. If the OS tells you it can't find Java,
|
30
|
+
# adjust your shell's path or revert to the old way (commented out).
|
31
|
+
# Configuration constants:
|
32
|
+
|
33
|
+
# Infer JHOVE_HOME from script location
SCRIPT="${0}"

# Diagnostic only. Sent to stderr because stdout is JHOVE's default output
# destination (see the -o option in the usage notes) and must stay clean.
echo "${SCRIPT}" >&2

# Resolve absolute and relative symlinks
while [ -h "${SCRIPT}" ]; do
  LS=$( ls -ld "${SCRIPT}" )
  LINK=$( expr "${LS}" : '.*-> \(.*\)$' )
  if expr "${LINK}" : '/.*' > /dev/null; then
    SCRIPT="${LINK}"
  else
    SCRIPT="$( dirname "${SCRIPT}" )/${LINK}"
  fi
done

# Store absolute location
CWD="$( pwd )"
JHOVE_HOME="$( cd "$(dirname "${SCRIPT}" )" && pwd )"
cd "${CWD}" || exit
export JHOVE_HOME

JHOVE_VERSION=1.24.0-RC1
# NOTE(review): JAVA_HOME is assigned but never exported or read below; kept as-is.
JAVA_HOME=/etc/alternatives/jre
JAVA=/usr/bin/java

# Build the classpath from the jars that sit next to this script.
CP=${JHOVE_HOME}/jhove-apps-${JHOVE_VERSION}.jar:${JHOVE_HOME}/jhove-ext-modules-${JHOVE_VERSION}.jar
for JAR in \
  aiff-hul-1.6.1-RC1.jar \
  ascii-hul-1.4.1.jar \
  gif-hul-1.4.2-RC1.jar \
  html-hul-1.4.1.jar \
  jpeg-hul-1.5.2-RC1.jar \
  jpeg2000-hul-1.4.2-RC1.jar \
  pdf-hul-1.12.2-RC1.jar \
  tiff-hul-1.9.2-RC1.jar \
  utf8-hul-1.7.1.jar \
  wave-hul-1.8.1-RC1.jar \
  xml-hul-1.5.1.jar
do
  CP=${CP}:${JHOVE_HOME}/${JAR}
done

# Diagnostic only; see the note on stderr above.
echo "${JHOVE_HOME}" >&2

# Set the CLASSPATH and invoke the Java loader.
# The command-line arguments are forwarded with "$@" so each one reaches JHOVE
# as the single word the caller supplied. Flattening them into one string and
# expanding it unquoted re-split any argument containing whitespace. The
# classpath and config path are quoted for the same reason.
"${JAVA}" -Xms128M -Xmx6000M -classpath "${CP}" Jhove -c "${JHOVE_HOME}/jhove.conf" "$@"
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'pathname'
|
3
|
+
require 'jhove_technical_metadata'
|
4
|
+
require 'stringio'
|
5
|
+
require 'uri'
|
6
|
+
require 'shellwords'
|
7
|
+
require 'open3'
|
8
|
+
|
9
|
+
class JhoveService
|
10
|
+
|
11
|
+
# @return [Pathname] The directory in which program files are located
|
12
|
+
attr_accessor :bin_pathname
|
13
|
+
|
14
|
+
# @return [Pathname] The directory in which output should be generated
|
15
|
+
attr_accessor :target_pathname
|
16
|
+
|
17
|
+
# @return [String] The druid of the object, which gets inserted in the root element of the output
|
18
|
+
attr_accessor :digital_object_id
|
19
|
+
|
20
|
+
# @param [String] target_dir The directory into which output should be generated
|
21
|
+
def initialize(target_dir = nil)
  # The JHOVE launcher script, jars and config are looked up in ../bin relative to this file.
  bin_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'bin'))
  @bin_pathname = Pathname.new(bin_dir)
  # The target directory is optional; @target_pathname stays unset when none is given.
  @target_pathname = Pathname.new(target_dir) unless target_dir.nil?
end
|
25
|
+
|
26
|
+
# @return [String] The output file from the JHOVE run
|
27
|
+
def jhove_output
  # Raw JHOVE report, stored inside the target directory.
  @target_pathname + 'jhove_output.xml'
end
|
30
|
+
|
31
|
+
# @return [String] The technicalMetadata.xml output file path
|
32
|
+
def tech_md_output
  # Transformed technical metadata, stored inside the target directory.
  @target_pathname + 'technicalMetadata.xml'
end
|
35
|
+
|
36
|
+
# @param content_dir [Pathname,String] the directory path containing the files to be analyzed by JHOVE
|
37
|
+
# @param fileset_file [Pathname,String] the pathname of the file listing which files should be processed. If nil, process all files.
|
38
|
+
# @return [String] Run JHOVE to characterize all content files, returning the output file path
|
39
|
+
def run_jhove(content_dir, fileset_file = nil)
  raise "Content #{content_dir} not found" unless File.directory? content_dir

  if fileset_file.nil?
    # No file list: hand the whole directory to JHOVE in a single invocation.
    exec_command(get_jhove_command(content_dir))
    jhove_output_xml_ng = File.open(jhove_output) { |f| Nokogiri::XML(f) }
  else
    # JHOVE is run once per listed file and the per-file <repInfo> elements
    # are then assembled into a single XML document.
    # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+.
    raise "File list #{fileset_file} not found" unless File.exist? fileset_file

    # File.readlines closes the handle (File.new(...).readlines left it open).
    # Blank lines are dropped: an empty name would resolve to content_dir
    # itself and make JHOVE scan the whole directory.
    files = File.readlines(fileset_file).map(&:strip).reject(&:empty?)
    raise "File list #{fileset_file} empty" if files.empty?

    combined_xml_output = ''
    jhove_output_xml_ng = Nokogiri::XML('')
    files.each_with_index do |filename, i|
      # Each file gets its own temporary report, which is deleted once harvested.
      full_path_to_file = File.join(content_dir, filename)
      output_file = @target_pathname.join("jhove_output_#{i}.xml")
      exec_command(get_jhove_command(full_path_to_file, output_file))
      jhove_output_xml_ng = File.open(output_file) { |f| Nokogiri::XML(f) }
      combined_xml_output += jhove_output_xml_ng.css("//repInfo").to_xml
      output_file.delete
    end
    # Reuse the last per-file document as the shell: strip its children to
    # leave the bare root node, then attach the combined XML for all files.
    jhove_output_xml_ng.root.children.each { |n| n.remove }
    jhove_output_xml_ng.root << combined_xml_output
  end

  remove_path_from_file_nodes(jhove_output_xml_ng, content_dir)
  File.write(jhove_output, jhove_output_xml_ng.to_xml)
  jhove_output.to_s
end
|
65
|
+
|
66
|
+
# @param command [String] the command to execute on the command line
|
67
|
+
# @raise [RuntimeError] if there is a problem running the command
|
68
|
+
def exec_command(command)
  # The command runs with @bin_pathname as its working directory; only stderr and the exit status are used.
  _captured_out, captured_err, exit_status = Open3.capture3(command, chdir: @bin_pathname)
  return if exit_status.success?

  raise "Error when running JHOVE #{command}:\n#{captured_err}"
end
|
72
|
+
|
73
|
+
# @param input_path [Pathname,String] the directory path or filename containing the folder or file to be analyzed by JHOVE
|
74
|
+
# @param output_file [Pathname,String] the output file to write the XML to, defaults to filename specified in jhove_output
|
75
|
+
# @return [String] The jhove-toolkit command to be executed in a system call
|
76
|
+
def get_jhove_command(input_path, output_file = jhove_output)
  # Shell-escape the input path so special characters survive the shell.
  escaped_input = Shellwords.escape(input_path)
  # NOTE(review): the input argument is prefixed with a backslash-escaped
  # double quote and has no closing quote; reproduced exactly as found.
  "./jhoveToolkit.sh -h xml -o \"#{output_file}\" \\\"#{escaped_input}"
end
|
83
|
+
|
84
|
+
# @param jhove_output_xml_ng [ng_xml_obj] the nokogiri xml output from jhove
|
85
|
+
# @param path [String] the shared path that will be removed from each file name to ensure the file nodes are relative
|
86
|
+
def remove_path_from_file_nodes(jhove_output_xml_ng, path)
  # Rewrites the uri attribute of every <repInfo> element in place: the shared
  # path and one leading '/' are removed, then percent-escapes are decoded.
  jhove_ns = { 'jhove' => 'http://schema.openpreservation.org/ois/xml/ns/jhove' }
  jhove_output_xml_ng.xpath('//jhove:repInfo', jhove_ns).each do |filename_node|
    uri_attr = filename_node.attributes['uri']
    next if uri_attr.nil? # nothing to rewrite on a node without a uri

    relative = uri_attr.value.gsub(path.to_s, '').sub(/^\//, '')
    # URI.decode was removed in Ruby 3.0; it delegated to this parser's
    # unescape, so the decoding result is unchanged.
    uri_attr.value = URI::DEFAULT_PARSER.unescape(relative)
  end
end
|
91
|
+
|
92
|
+
# @param [Pathname,String] jhove_pathname The full path of the file containing JHOVE output to be transformed to technical metadata
|
93
|
+
# @return [String] Convert the JHOVE output to technicalMetadata, returning the output file path
|
94
|
+
def create_technical_metadata(jhove_pathname = jhove_output)
  jhove_pathname = Pathname.new(jhove_pathname)

  # The SAX handler writes its result to the technicalMetadata.xml path.
  jhovetm = JhoveTechnicalMetadata.new
  jhovetm.digital_object_id = digital_object_id
  jhovetm.output_file = tech_md_output

  parser = Nokogiri::XML::SAX::Parser.new(jhovetm)
  # Block form closes the input when parsing finishes or raises; the handle
  # was previously opened and never closed.
  jhove_pathname.open('rb') { |input| parser.parse(input) }

  tech_md_output.to_s
end
|
105
|
+
|
106
|
+
# @param [String] old_tm the old techMD xml to be transformed to new technical metadata format
|
107
|
+
# @return [String] Convert old techMD data to the new technicalMetadata format
|
108
|
+
def upgrade_technical_metadata(old_tm)
  # The handler writes into an in-memory buffer instead of a file.
  buffer = StringIO.new
  handler = JhoveTechnicalMetadata.new
  handler.digital_object_id = digital_object_id
  handler.ios = buffer

  # Run the old techMD through the same SAX handler used for JHOVE output.
  Nokogiri::XML::SAX::Parser.new(handler).parse(old_tm)
  buffer.string
end
|
119
|
+
|
120
|
+
|
121
|
+
# @return [void] Cleanup the temporary workspace used to hold the metadata outputs
|
122
|
+
def cleanup
  # Delete each generated file only when it is actually present.
  raw_report = jhove_output
  raw_report.delete if raw_report.exist?

  tech_md = tech_md_output
  tech_md.delete if tech_md.exist?
end
|
126
|
+
|
127
|
+
end
|
@@ -0,0 +1,284 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'time'
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
# A SAX handler for filtering JHOVE output to create a technicalMetadata datastream
|
7
|
+
# The previous mechanism (using XSLT transformation) was causing out of memory errors,
|
8
|
+
# due to XSLT's behavior of loading both the input and output objects into memory.
|
9
|
+
class JhoveTechnicalMetadata < Nokogiri::XML::SAX::Document
|
10
|
+
|
11
|
+
# @return [IO] the output stream for the result
|
12
|
+
attr_accessor :ios
|
13
|
+
|
14
|
+
# @return [String] The druid of the object, which gets inserted in the root element of the output
|
15
|
+
attr_accessor :digital_object_id
|
16
|
+
|
17
|
+
def initialize
  # Output goes to standard output until #ios= or #output_file= replaces the stream.
  @ios = STDOUT #File.open(STDOUT, 'w')
  # Current indentation depth, in spaces, applied by #output.
  @indent = 0
end
|
21
|
+
|
22
|
+
# @param [Pathname] pathname the location of the technicalMetadata.xml file to be created
|
23
|
+
# @return [void] Opens the output stream pointing to the specified file
|
24
|
+
def output_file=(pathname)
  # Opening with 'w' creates or truncates the target; the stream stays open for later #output calls.
  writable = pathname.open('w')
  @ios = writable
end
|
27
|
+
|
28
|
+
# @param [String] string The character string to be appended to the output
|
29
|
+
# @return [void] Append the specified string to the output stream
|
30
|
+
def output(string)
  # Prefix the text with @indent spaces, then write it as one line.
  padding = ' ' * @indent
  @ios.puts(padding + string)
end
|
33
|
+
|
34
|
+
# @param [String] tag the name of the XML element from the parsed input
|
35
|
+
# @param [Hash] attrs the XML attributes of the element
|
36
|
+
# @return [void] this method is called by the sax parser at the beginning of an element
|
37
|
+
def start_element(tag, attrs = [])
  if tag == 'jhove'
    # <jhove> is the root element of the input
    root_open(attrs)
  elsif tag == 'repInfo'
    # each <repInfo> element carries the data for one file
    file_wrapper_open(attrs)
  elsif tag == 'properties'
    # <properties> holds the variable data for the file
    properties_open
  elsif tag.start_with?('mix')
    # MIX-format technical metadata is handed to the mix handlers
    mix_open(tag)
  elsif @in_jhove
    # a JHOVE element inside <repInfo> that is copied automatically
    jhove_open(tag, attrs)
  elsif @in_properties
    # inside <properties> only the LineEndings property is of interest
    linebreak_open(tag)
  end
end
|
61
|
+
|
62
|
+
# @param [String] string the value of a text node found in the parsed XML
|
63
|
+
# @return [void] this method is called by the sax parser when a text node is encountered
|
64
|
+
def characters(string)
  # A SAX parser may deliver a single text node in several calls, so append to
  # the text collected since the last reset instead of overwriting it;
  # overwriting kept only the final chunk.
  # Non-mutating concatenation is deliberate: other handlers store the same
  # String object (@format = @text, @linebreak = @text) and it must not change
  # under them.
  @text = @text ? @text + string : string
end
|
67
|
+
|
68
|
+
# @param [String] tag the name of the XML element from the parsed input
|
69
|
+
# @return [void] this method is called by the sax parser at the end of an element
|
70
|
+
def end_element(tag)
  # Mirrors start_element: the structural elements first, then the mix prefix,
  # then whichever section (JHOVE elements or properties) is currently active.
  if tag == 'jhove'
    root_close
  elsif tag == 'repInfo'
    file_wrapper_close
  elsif tag == 'properties'
    properties_close
  elsif tag.start_with?('mix')
    mix_close(tag)
  elsif @in_jhove
    jhove_close(tag)
  elsif @in_properties
    linebreak_close(tag)
  end
end
|
88
|
+
|
89
|
+
# @param [Hash] attrs the attributes of the <jhove> element in the XML input
|
90
|
+
# @return [void] create the <technicalMetadata> root element of the XML output and include namespace declarations
|
91
|
+
def root_open(attrs)
  # The objectId attribute is written only when a digital object id is set.
  object_id_attr = @digital_object_id ? " objectId='#{@digital_object_id}'" : ''
  output "<technicalMetadata#{object_id_attr} datetime='#{Time.now.utc.iso8601}'"

  # Namespace declarations are indented two deeper than the opening line;
  # the last one also closes the start tag.
  @indent += 2
  [
    "xmlns:jhove='http://hul.harvard.edu/ois/xml/ns/jhove'",
    "xmlns:mix='http://www.loc.gov/mix/v10'",
    "xmlns:textmd='info:lc/xmlns/textMD-v3' >"
  ].each { |declaration| output declaration }

  # Net effect: content under the root is indented by one.
  @indent -= 1
end
|
103
|
+
|
104
|
+
# @return [void] add the closing element of the output document
|
105
|
+
def root_close
  # Step back one level so the closing tag lines up with the opening one.
  @indent = @indent - 1
  output('</technicalMetadata>')
  # The document is complete, so close the output stream.
  @ios.close
end
|
110
|
+
|
111
|
+
# @param [Hash] attrs the attributes of the <repInfo> element in the XML input
|
112
|
+
# @return [void] Append a <file> element to the output, setting the id attribute to the file path
|
113
|
+
def file_wrapper_open(attrs)
  # The file path is the value of the element's 'uri' attribute
  # (attrs is iterated as name/value pairs).
  filepath = nil
  attrs.each { |attr| filepath = attr[1] if attr[0] == 'uri' }

  # The path is written into a single-quoted XML attribute, so markup
  # characters and apostrophes must be escaped; written raw, a name such as
  # "Tom & Jerry's.txt" produced malformed XML.
  xml_escapes = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', "'" => '&apos;' }
  safe_id = filepath.to_s.gsub(/[&<>']/, xml_escapes)

  output "<file id='#{safe_id}'>"
  @indent += 1
  # From here until <properties>, child elements are copied as jhove:* elements.
  @in_jhove = true
end
|
120
|
+
|
121
|
+
# @return [void] Append a </file> tag to close the file data,
|
122
|
+
# but first insert a textMD stanza if the file has a text format
|
123
|
+
def file_wrapper_close
  # Text formats get a textMD stanza before the file element is closed.
  output_textmd(@linebreak) if ['ASCII', 'HTML', 'TEXT', 'UTF-8'].include?(@format)

  @indent -= 1
  # properties_close emits no tag, so the closing tag is written here.
  output " </jhove:properties>" if @in_properties

  output "</file>"
  @in_jhove = false
  @in_properties = false
  # Forget this file's format. Without the reset, a later <repInfo> with no
  # <format> element inherited the previous file's value and received a
  # spurious textMD stanza.
  @format = nil
end
|
135
|
+
|
136
|
+
# @param [String] tag the name of the XML element from the parsed input
|
137
|
+
# @param [Hash] attrs the attributes of the <jhove> element in the XML input
|
138
|
+
# @return [void] Copy this jhove element tag and its attributes verbatim
|
139
|
+
def jhove_open(tag, attrs)
  if @jhove_tag # saved previously
    # A new element started before the cached one closed, so the cached
    # element has children: write its start tag now.
    output "<jhove:#{@jhove_tag}#{@jhove_attrs}>"
    @indent += 1
  end

  # Cache the element name and attributes. Writing is deferred so that
  # jhove_close can choose between <x>text</x> and <x/>.
  @jhove_tag = tag
  # Values go into single-quoted attributes, so escape markup characters and
  # apostrophes; copied raw they could yield malformed XML.
  xml_escapes = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', "'" => '&apos;' }
  @jhove_attrs = attrs.map do |attr|
    " #{attr[0]}='#{attr[1].to_s.gsub(/[&<>']/, xml_escapes)}'"
  end.join

  @text = nil
  # Every JHOVE element resets the line-break value to 'LF'.
  @linebreak = 'LF'
end
|
154
|
+
|
155
|
+
# @param [String] tag the name of the XML element from the parsed input
|
156
|
+
# @return [void] Output a closing tag, preceded by cached data, if such exists
|
157
|
+
def jhove_close(tag)
  if tag == @jhove_tag
    # The cached element had no children: write it on a single line.
    if @text
      # The collected text is re-emitted as XML content, so escape markup
      # characters; copied raw, '&' or '<' made the output malformed.
      escaped = @text.gsub(/[&<>]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;')
      output "<jhove:#{@jhove_tag}#{@jhove_attrs}>#{escaped}</jhove:#{tag}>"
    else
      output "<jhove:#{@jhove_tag}#{@jhove_attrs}/>"
    end
  else
    # The start tag was already written by jhove_open; only close it here.
    @indent -= 1
    output "</jhove:#{tag}>"
  end

  # Remember the file format (unescaped); file_wrapper_close uses it to decide
  # whether a textMD stanza is needed.
  @format = @text if tag == 'format'
  @text = nil
  @jhove_tag = nil
  @jhove_attrs = ""
end
|
171
|
+
|
172
|
+
# @return [void] Output a <properties> element if one was encountered in the input,
|
173
|
+
# then ignore most input data from within the properties element, except mix and LineBreaks
|
174
|
+
def properties_open
  output('<jhove:properties>')
  @indent = @indent + 1
  # Switch modes: stop copying JHOVE elements and start watching the properties section.
  @in_properties = true
  @in_jhove = false
end
|
180
|
+
|
181
|
+
# @return [void] Appending of a closing tag is handled elsewhere
|
182
|
+
def properties_close
  # Only the indentation changes here; the closing tag is written by file_wrapper_close.
  @indent = @indent - 1
end
|
185
|
+
|
186
|
+
# @param [String] tag the name of the XML element from the parsed input
|
187
|
+
# @return [void] Copy any MIX data verbatim
|
188
|
+
def mix_open(tag)
  pending = @mix_tag
  if pending
    # A nested element started, so the cached element has children: write its start tag now.
    output("<#{pending}>")
    @indent = @indent + 1
  end

  # Cache this element's name and discard any text collected so far.
  @mix_tag = tag
  @text = nil
end
|
198
|
+
|
199
|
+
# @param [String] tag the name of the XML element from the parsed input
|
200
|
+
# @return [void] Output a closing tag, preceded by cached data, if such exists
|
201
|
+
def mix_close(tag)
  if tag == @mix_tag
    # The cached element had no children: write it on a single line.
    if @text
      # The collected text is re-emitted as XML content, so escape markup
      # characters; copied raw, '&' or '<' made the output malformed.
      escaped = @text.gsub(/[&<>]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;')
      output "<#{tag}>#{escaped}</#{tag}>"
    else
      output "<#{tag}/>"
    end
  else
    # The start tag was already written by mix_open; only close it here.
    @indent -= 1
    output "</#{tag}>"
  end
  @text = nil
  @mix_tag = nil
end
|
213
|
+
|
214
|
+
# @param [String] tag the name of the XML element from the parsed input
|
215
|
+
# @return [void] Keep clearing the text cache any time a new element is encountered
|
216
|
+
def linebreak_open(tag)
  # Drop any text collected before this element so it cannot be mistaken for this element's own text.
  @text &&= nil
end
|
219
|
+
|
220
|
+
# @param [String] tag the name of the XML element from the parsed input
|
221
|
+
# @return [void] Look for the LineEndings name/value pair, which is spread across multiple elements
|
222
|
+
def linebreak_close(tag)
  if tag == 'name'
    # A <name> of 'LineEndings' marks the <value> that follows as the one to capture.
    @in_line_endings = (@text == 'LineEndings')
  elsif tag == 'value'
    @linebreak = @text if @in_line_endings
    # The marker applies to a single <value> only.
    @in_line_endings = false
  end
end
|
232
|
+
|
233
|
+
# @param [Object] linebreak the CRLF or LF value found in the JHOVE output (default is LF)
|
234
|
+
# @return [void] Output a textMD section within the properties element
|
235
|
+
def output_textmd(linebreak)
  # The stanza is written at indent 0; the caller's indent is restored afterwards.
  saved_indent = @indent
  @indent = 0

  # NOTE(review): the stanza lines are reproduced with the leading whitespace
  # seen in the reviewed text (none) - confirm against the repository copy.
  stanza = <<-EOF
<textmd:textMD>
<textmd:character_info>
<textmd:byte_order>big</textmd:byte_order>
<textmd:byte_size>8</textmd:byte_size>
<textmd:character_size>1</textmd:character_size>
<textmd:linebreak>#{linebreak}</textmd:linebreak>
</textmd:character_info>
</textmd:textMD>
  EOF

  # Inside <properties> the enclosing tags come from other code; otherwise the
  # input had no properties element and the wrapper is supplied here.
  stanza = "<jhove:properties>\n#{stanza}</jhove:properties>\n" unless @in_properties
  output stanza

  @indent = saved_indent
end
|
267
|
+
|
268
|
+
end
|
269
|
+
|
270
|
+
|
271
|
+
# Below is the equivalent of a java main method.
|
272
|
+
# For this to work OK, the module/class being invoked
|
273
|
+
# must have already have been loaded by the Ruby interpreter.
|
274
|
+
|
275
|
+
if __FILE__ == $0
  # Command-line entry point. Arguments, as read below:
  #   ARGV[0] digital object id
  #   ARGV[1] path of the XML input to parse
  #   ARGV[2] path of the output file to create

  # Create a handler
  jhovetm = JhoveTechnicalMetadata.new
  jhovetm.digital_object_id = ARGV[0]
  # Was misspelled "Pahtname", which raised NameError before anything ran.
  jhovetm.output_file = Pathname.new(ARGV[2])
  # Create a SAX parser
  parser = Nokogiri::XML::SAX::Parser.new(jhovetm)
  # Feed the parser some XML; block form closes the input when parsing ends.
  File.open(ARGV[1], 'rb') { |input| parser.parse(input) }
end
|