jhove-service 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.rdoc +23 -0
  3. data/README.rdoc +36 -0
  4. data/bin/aiff-hul-1.6.1-RC1.jar +0 -0
  5. data/bin/ascii-hul-1.4.1.jar +0 -0
  6. data/bin/cache/xhtml-attribs-1.xsd +73 -0
  7. data/bin/cache/xhtml-base-1.xsd +36 -0
  8. data/bin/cache/xhtml-bdo-1.xsd +71 -0
  9. data/bin/cache/xhtml-blkphras-1.xsd +160 -0
  10. data/bin/cache/xhtml-blkpres-1.xsd +37 -0
  11. data/bin/cache/xhtml-blkstruct-1.xsd +49 -0
  12. data/bin/cache/xhtml-csismap-1.xsd +96 -0
  13. data/bin/cache/xhtml-datatypes-1.xsd +242 -0
  14. data/bin/cache/xhtml-edit-1.xsd +39 -0
  15. data/bin/cache/xhtml-events-1.xsd +130 -0
  16. data/bin/cache/xhtml-form-1.xsd +327 -0
  17. data/bin/cache/xhtml-framework-1.xsd +66 -0
  18. data/bin/cache/xhtml-hypertext-1.xsd +47 -0
  19. data/bin/cache/xhtml-image-1.xsd +46 -0
  20. data/bin/cache/xhtml-inlphras-1.xsd +163 -0
  21. data/bin/cache/xhtml-inlpres-1.xsd +39 -0
  22. data/bin/cache/xhtml-inlstruct-1.xsd +50 -0
  23. data/bin/cache/xhtml-inlstyle-1.xsd +27 -0
  24. data/bin/cache/xhtml-link-1.xsd +45 -0
  25. data/bin/cache/xhtml-list-1.xsd +99 -0
  26. data/bin/cache/xhtml-meta-1.xsd +54 -0
  27. data/bin/cache/xhtml-object-1.xsd +76 -0
  28. data/bin/cache/xhtml-param-1.xsd +51 -0
  29. data/bin/cache/xhtml-pres-1.xsd +51 -0
  30. data/bin/cache/xhtml-ruby-1.xsd +170 -0
  31. data/bin/cache/xhtml-script-1.xsd +71 -0
  32. data/bin/cache/xhtml-ssismap-1.xsd +43 -0
  33. data/bin/cache/xhtml-struct-1.xsd +130 -0
  34. data/bin/cache/xhtml-style-1.xsd +53 -0
  35. data/bin/cache/xhtml-table-1.xsd +272 -0
  36. data/bin/cache/xhtml-target-1.xsd +49 -0
  37. data/bin/cache/xhtml-text-1.xsd +67 -0
  38. data/bin/cache/xhtml11-model-1.xsd +716 -0
  39. data/bin/cache/xhtml11-modules-1.xsd +605 -0
  40. data/bin/cache/xhtml11.xsd +104 -0
  41. data/bin/cache/xml.xsd +287 -0
  42. data/bin/console +9 -0
  43. data/bin/extension-mimetype.conf +209 -0
  44. data/bin/gif-hul-1.4.2-RC1.jar +0 -0
  45. data/bin/html-hul-1.4.1.jar +0 -0
  46. data/bin/jhove-README-1st.txt +2 -0
  47. data/bin/jhove-apps-1.24.0-RC1.jar +0 -0
  48. data/bin/jhove-ext-modules-1.24.0-RC1.jar +0 -0
  49. data/bin/jhove.conf +98 -0
  50. data/bin/jhoveToolkit.sh +87 -0
  51. data/bin/jpeg-hul-1.5.2-RC1.jar +0 -0
  52. data/bin/jpeg2000-hul-1.4.2-RC1.jar +0 -0
  53. data/bin/pdf-hul-1.12.2-RC1.jar +0 -0
  54. data/bin/tiff-hul-1.9.2-RC1.jar +0 -0
  55. data/bin/utf8-hul-1.7.1.jar +0 -0
  56. data/bin/wave-hul-1.8.1-RC1.jar +0 -0
  57. data/bin/xml-hul-1.5.1.jar +0 -0
  58. data/lib/jhove_service.rb +127 -0
  59. data/lib/jhove_technical_metadata.rb +284 -0
  60. metadata +172 -0
Binary file
Binary file
@@ -0,0 +1,2 @@
1
+ http://jhove.openpreservation.org/getting-started/
2
+ http://jhove.openpreservation.org/documentation/
@@ -0,0 +1,98 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <jhoveConfig version="1.0"
3
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4
+ xmlns="http://hul.harvard.edu/ois/xml/ns/jhove/jhoveConfig"
5
+ xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/jhove/jhoveConfig
6
+ http://cosimo.stanford.edu/standards/jhove/v1/jhoveConfig.xsd">
7
+ <jhoveHome></jhoveHome>
8
+ <defaultEncoding>utf-8</defaultEncoding>
9
+ <tempDirectory>/tmp</tempDirectory>
10
+ <bufferSize>131072</bufferSize>
11
+ <mixVersion>1.0</mixVersion>
12
+ <sigBytes>1024</sigBytes>
13
+ <module>
14
+ <class>edu.harvard.hul.ois.jhove.module.AiffModule</class>
15
+ </module>
16
+ <module>
17
+ <class>edu.harvard.hul.ois.jhove.module.WaveModule</class>
18
+ </module>
19
+ <module>
20
+ <class>edu.harvard.hul.ois.jhove.module.PdfModule</class>
21
+ <param>p</param>
22
+ </module>
23
+ <module>
24
+ <class>com.mcgath.jhove.module.PngModule</class>
25
+ </module>
26
+ <module>
27
+ <class>edu.harvard.hul.ois.jhove.module.Jpeg2000Module</class>
28
+ </module>
29
+ <module>
30
+ <class>edu.harvard.hul.ois.jhove.module.JpegModule</class>
31
+ </module>
32
+ <module>
33
+ <class>edu.harvard.hul.ois.jhove.module.GifModule</class>
34
+ </module>
35
+ <module>
36
+ <class>edu.harvard.hul.ois.jhove.module.TiffModule</class>
37
+ </module>
38
+ <module>
39
+ <class>edu.harvard.hul.ois.jhove.module.XmlModule</class>
40
+ <param>schema=http://www.w3.org/markUp/schema/xhtml11.xsd;cache/xhtml11.xsd</param>
41
+ <param>schema=http://www.w3.org/2001/xml.xsd;cache/xml.xsd</param>
42
+ <param>schema=http://www.w3.org/markUp/schema/xhtml11-model-1.xsd;cache/xhtml11-model-1.xsd</param>
43
+ <param>schema=https://www.w3.org/markUp/schema/xhtml11-model-1.xsd;cache/xhtml11-model-1.xsd</param>
44
+ <param>schema=http://www.w3.org/markUp/schema/xhtml11-modules-1.xsd;cache/xhtml11-modules-1.xsd</param>
45
+ <param>schema=https://www.w3.org/markUp/schema/xhtml11-modules-1.xsd;cache/xhtml11-modules-1.xsd</param>
46
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-attribs-1.xsd;cache/xhtml-attribs-1.xsd</param>
47
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-attribs-1.xsd;cache/xhtml-attribs-1.xsd</param>
48
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-base-1.xsd;cache/xhtml-base-1.xsd</param>
49
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-blkphras-1.xsd;cache/xhtml-blkphras-1.xsd</param>
50
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-blkpres-1.xsd;cache/xhtml-blkpres-1.xsd</param>
51
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-blkstruct-1.xsd;cache/xhtml-blkstruct-1.xsd</param>
52
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-bdo-1.xsd;cache/xhtml-bdo-1.xsd</param>
53
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-csismap-1.xsd;cache/xhtml-csismap-1.xsd</param>
54
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-datatypes-1.xsd;cache/xhtml-datatypes-1.xsd</param>
55
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-edit-1.xsd;cache/xhtml-edit-1.xsd</param>
56
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-events-1.xsd;cache/xhtml-events-1.xsd</param>
57
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-form-1.xsd;cache/xhtml-form-1.xsd</param>
58
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-framework-1.xsd;cache/xhtml-framework-1.xsd</param>
59
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-hypertext-1.xsd;cache/xhtml-hypertext-1.xsd</param>
60
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-image-1.xsd;cache/xhtml-image-1.xsd</param>
61
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-inlphras-1.xsd;cache/xhtml-inlphras-1.xsd</param>
62
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-inlpres-1.xsd;cache/xhtml-inlpres-1.xsd</param>
63
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-inlstruct-1.xsd;cache/xhtml-inlstruct-1.xsd</param>
64
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-inlstyle-1.xsd;cache/xhtml-inlstyle-1.xsd</param>
65
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-link-1.xsd;cache/xhtml-link-1.xsd</param>
66
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-list-1.xsd;cache/xhtml-list-1.xsd</param>
67
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-meta-1.xsd;cache/xhtml-meta-1.xsd</param>
68
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-object-1.xsd;cache/xhtml-object-1.xsd</param>
69
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-param-1.xsd;cache/xhtml-param-1.xsd</param>
70
+ <param>schema=https://www.w3.org/markUp/schema/xhtml-param-1.xsd;cache/xhtml-param-1.xsd</param>
71
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-pres-1.xsd;cache/xhtml-pres-1.xsd</param>
72
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-ruby-1.xsd;cache/xhtml-ruby-1.xsd</param>
73
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-script-1.xsd;cache/xhtml-script-1.xsd</param>
74
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-ssismap-1.xsd;cache/xhtml-ssismap-1.xsd</param>
75
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-struct-1.xsd;cache/xhtml-struct-1.xsd</param>
76
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-style-1.xsd;cache/xhtml-style-1.xsd</param>
77
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-table-1.xsd;cache/xhtml-table-1.xsd</param>
78
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-target-1.xsd;cache/xhtml-target-1.xsd</param>
79
+ <param>schema=http://www.w3.org/markUp/schema/xhtml-text-1.xsd;cache/xhtml-text-1.xsd</param> </module>
80
+ <module>
81
+ <class>edu.harvard.hul.ois.jhove.module.HtmlModule</class>
82
+ </module>
83
+ <module>
84
+ <class>edu.harvard.hul.ois.jhove.module.WarcModule</class>
85
+ </module>
86
+ <module>
87
+ <class>edu.harvard.hul.ois.jhove.module.GzipModule</class>
88
+ </module>
89
+ <module>
90
+ <class>edu.harvard.hul.ois.jhove.module.AsciiModule</class>
91
+ </module>
92
+ <module>
93
+ <class>edu.harvard.hul.ois.jhove.module.Utf8Module</class>
94
+ </module>
95
+ <outputHandler>
96
+ <class>edu.harvard.hul.ois.jhove.handler.XmlHandler</class>
97
+ </outputHandler>
98
+ </jhoveConfig>
@@ -0,0 +1,87 @@
1
+ #!/bin/sh
2
+
3
+ ########################################################################
4
+ # JHOVE - JSTOR/Harvard Object Validation Environment
5
+ # Copyright 2003-2005 by JSTOR and the President and Fellows of Harvard College
6
+ # JHOVE is made available under the GNU General Public License (see the
7
+ # file LICENSE for details)
8
+ #
9
+ # Usage: jhove [-c config] [-m module] [-h handler] [-e encoding] [-H handler]
10
+ # [-o output] [-x saxclass] [-t tempdir] [-b bufsize]
11
+ # [-l loglevel] [[-krs] dir-file-or-uri [...]]
12
+ #
13
+ # where -c config Configuration file pathname
14
+ # -m module Module name
15
+ # -h handler Output handler name (defaults to TEXT)
16
+ # -e encoding Character encoding of output handler (defaults to UTF-8)
17
+ # -H handler About handler name
18
+ # -o output Output file pathname (defaults to standard output)
19
+ # -x saxclass SAX parser class (defaults to J2SE 1.4 default)
20
+ # -t tempdir Temporary directory in which to create temporary files
21
+ # -b bufsize Buffer size for buffered I/O (defaults to J2SE 1.4 default)
22
+ # -k Calculate CRC32, MD5, and SHA-1 checksums
23
+ # -r Display raw data flags, not textual equivalents
24
+ # -s Format identification based on internal signatures only
25
+ # dir-file-or-uri Directory, file pathname or URI of formatted content
26
+ #
27
+ # CHANGE for JHOVE 1.8:
28
+ # You no longer have to figure out where JAVA_HOME is; that's the
29
+ # operating system's job. If the OS tells you it can't find Java,
30
+ # adjust your shell's path or revert to the old way (commented out).
31
+ # Configuration constants:
32
+
33
+ # Infer JHOVE_HOME from script location
34
+ SCRIPT="${0}"
35
+
36
+ echo ${SCRIPT}
37
+
38
+ # Resolve absolute and relative symlinks
39
+ while [ -h "${SCRIPT}" ]; do
40
+ LS=$( ls -ld "${SCRIPT}" )
41
+ LINK=$( expr "${LS}" : '.*-> \(.*\)$' )
42
+ if expr "${LINK}" : '/.*' > /dev/null; then
43
+ SCRIPT="${LINK}"
44
+ else
45
+ SCRIPT="$( dirname "${SCRIPT}" )/${LINK}"
46
+ fi
47
+ done
48
+
49
+ # Store absolute location
50
+ CWD="$( pwd )"
51
+ JHOVE_HOME="$( cd "$(dirname "${SCRIPT}" )" && pwd )"
52
+ cd "${CWD}" || exit
53
+ export JHOVE_HOME
54
+
55
+ JHOVE_VERSION=1.24.0-RC1
56
+ JAVA_HOME=/etc/alternatives/jre
57
+ JAVA=/usr/bin/java
58
+
59
+ CP=${JHOVE_HOME}/jhove-apps-${JHOVE_VERSION}.jar:${JHOVE_HOME}/jhove-ext-modules-${JHOVE_VERSION}.jar
60
+ CP=${CP}:${JHOVE_HOME}/aiff-hul-1.6.1-RC1.jar
61
+ CP=${CP}:${JHOVE_HOME}/ascii-hul-1.4.1.jar
62
+ CP=${CP}:${JHOVE_HOME}/gif-hul-1.4.2-RC1.jar
63
+ CP=${CP}:${JHOVE_HOME}/html-hul-1.4.1.jar
64
+ CP=${CP}:${JHOVE_HOME}/jpeg-hul-1.5.2-RC1.jar
65
+ CP=${CP}:${JHOVE_HOME}/jpeg2000-hul-1.4.2-RC1.jar
66
+ CP=${CP}:${JHOVE_HOME}/pdf-hul-1.12.2-RC1.jar
67
+ CP=${CP}:${JHOVE_HOME}/tiff-hul-1.9.2-RC1.jar
68
+ CP=${CP}:${JHOVE_HOME}/utf8-hul-1.7.1.jar
69
+ CP=${CP}:${JHOVE_HOME}/wave-hul-1.8.1-RC1.jar
70
+ CP=${CP}:${JHOVE_HOME}/xml-hul-1.5.1.jar
71
+
72
+ # Retrieve a copy of all command line arguments to pass to the application.
73
+ # Since looping over the positional parameters is such a common thing to do in scripts,
74
+ # for arg
75
+ # defaults to
76
+ # for arg in "$@".
77
+ # The double-quoted "$@" is special magic that causes each parameter to be used as a single word
78
+
79
+ ARGS="-c ${JHOVE_HOME}/jhove.conf"
80
+ for ARG do
81
+ ARGS="$ARGS $ARG"
82
+ done
83
+
84
+ echo $JHOVE_HOME
85
+
86
+ # Set the CLASSPATH and invoke the Java loader.
87
+ ${JAVA} -Xms128M -Xmx6000M -classpath $CP Jhove $ARGS
Binary file
Binary file
@@ -0,0 +1,127 @@
1
+ require 'nokogiri'
2
+ require 'pathname'
3
+ require 'jhove_technical_metadata'
4
+ require 'stringio'
5
+ require 'uri'
6
+ require 'shellwords'
7
+ require 'open3'
8
+
9
+ class JhoveService
10
+
11
+ # @return [Pathname] The directory in which program files are located
12
+ attr_accessor :bin_pathname
13
+
14
+ # @return [Pathname] The directory in which output should be generated
15
+ attr_accessor :target_pathname
16
+
17
+ # @return [String] The druid of the object, which gets inserted in the root element of the output
18
+ attr_accessor :digital_object_id
19
+
20
# Record the output directory (when given) and locate the bundled program directory.
#
# @param [String] target_dir The directory into which output should be generated;
#   when nil, target_pathname is left unassigned
def initialize(target_dir=nil)
  @target_pathname = Pathname.new(target_dir) unless target_dir.nil?
  # the program files are looked up in the bin directory one level above this file's directory
  here = File.dirname(__FILE__)
  @bin_pathname = Pathname.new(File.expand_path('../bin', here))
end
25
+
26
# @return [Pathname] The location of the JHOVE output file (jhove_output.xml) inside the target directory
def jhove_output
  @target_pathname + 'jhove_output.xml'
end
30
+
31
# @return [Pathname] The location of the technicalMetadata.xml output file inside the target directory
def tech_md_output
  @target_pathname + 'technicalMetadata.xml'
end
35
+
36
# Run JHOVE to characterize content files and write the combined report to jhove_output.
#
# @param content_dir [Pathname,String] the directory path containing the files to be analyzed by JHOVE
# @param fileset_file [Pathname,String] the pathname of the file listing which files should be processed
#   (one name per line, joined onto content_dir). If nil, process all files.
# @return [String] the path of the JHOVE output file
# @raise [RuntimeError] if content_dir is not a directory, the file list is missing or names no files,
#   or the JHOVE command fails
def run_jhove(content_dir, fileset_file=nil)
  raise "Content #{content_dir} not found" unless File.directory? content_dir
  if fileset_file.nil? # a simple directory gets called directly
    exec_command(get_jhove_command(content_dir))
    jhove_output_xml_ng = File.open(jhove_output) { |f| Nokogiri::XML(f) }
  else
    # jhove cannot process a file list out of the box, so run it file by file
    # and assemble the results ourselves into a single XML
    raise "File list #{fileset_file} not found" unless File.exist? fileset_file
    # File.readlines closes the list when done; a blank line is not a file name and would
    # otherwise make JHOVE run against content_dir itself
    files = File.readlines(fileset_file).map(&:strip).reject(&:empty?)
    raise "File list #{fileset_file} empty" if files.empty?
    combined_xml_output = ""
    jhove_output_xml_ng = nil
    files.each_with_index do |filename, i| # generate jhove output for each file in a separate xml file
      full_path_to_file = File.join(content_dir, filename)
      output_file = @target_pathname.join("jhove_output_#{i}.xml")
      exec_command(get_jhove_command(full_path_to_file, output_file))
      jhove_output_xml_ng = File.open(output_file) { |f| Nokogiri::XML(f) }
      combined_xml_output += jhove_output_xml_ng.css("//repInfo").to_xml # build up an XML string with all output
      output_file.delete
    end
    # reuse the last per-file document: strip its children to keep only the root jhove node ...
    jhove_output_xml_ng.root.children.each { |n| n.remove }
    # ... then add the combined xml for all files
    jhove_output_xml_ng.root << combined_xml_output
  end
  remove_path_from_file_nodes(jhove_output_xml_ng, content_dir)
  File.write(jhove_output, jhove_output_xml_ng.to_xml)
  jhove_output.to_s
end
65
+
66
# Run a command line from within the bin directory and fail loudly when it does not succeed.
#
# @param command [String] the command to execute on the command line
# @return [nil]
# @raise [RuntimeError] if the command exits unsuccessfully; the message carries the command and its stderr
def exec_command(command)
  _out, err, result = Open3.capture3(command, chdir: @bin_pathname)
  return if result.success?

  raise "Error when running JHOVE #{command}:\n#{err}"
end
72
+
73
# Build the command line that runs the bundled JHOVE wrapper script on one input.
#
# @param input_path [Pathname,String] the directory or file to be analyzed by JHOVE
# @param output_file [Pathname,String] the file the XML report is written to, defaults to jhove_output
# @return [String] The jhove-toolkit command to be executed in a system call
def get_jhove_command(input_path,output_file = jhove_output)
  escaped_input = Shellwords.escape(input_path) # escape any special characters in the path
  # NOTE(review): an escaped, unbalanced double quote is emitted right before the input path;
  # presumably the wrapper script / JHOVE argument handling relies on it -- confirm before changing
  options = "-h xml -o \"#{output_file}\" \\\"#{escaped_input}"
  "./jhoveToolkit.sh #{options}"
end
83
+
84
# Rewrite the uri attribute of every repInfo node: remove the shared path and any
# leading slash, then percent-decode what is left.
#
# @param jhove_output_xml_ng [ng_xml_obj] the nokogiri xml output from jhove
# @param path [String] the shared path that will be removed from each file name to ensure the file nodes are relative
def remove_path_from_file_nodes(jhove_output_xml_ng,path)
  jhove_ns = { 'jhove' => 'http://schema.openpreservation.org/ois/xml/ns/jhove' }
  jhove_output_xml_ng.xpath('//jhove:repInfo', jhove_ns).each do |filename_node|
    uri_attr = filename_node.attributes['uri']
    # the path is removed from the still-encoded value, exactly as before
    relative = uri_attr.value.gsub(path.to_s, '').sub(/^\//, '')
    # URI.decode is gone from Ruby 3.0 onwards; the default parser's unescape does the same percent-decoding
    uri_attr.value = URI::DEFAULT_PARSER.unescape(relative)
  end
end
91
+
92
# Convert JHOVE output to technicalMetadata by streaming it through the JhoveTechnicalMetadata SAX handler.
#
# @param [Pathname,String] jhove_pathname The full path of the file containing JHOVE output to be transformed to technical metadata
# @return [String] the path of the technicalMetadata output file
def create_technical_metadata(jhove_pathname=jhove_output)
  jhove_pathname = Pathname.new(jhove_pathname)
  jhovetm = JhoveTechnicalMetadata.new
  jhovetm.digital_object_id = digital_object_id
  jhovetm.output_file = tech_md_output
  # Create a SAX parser
  parser = Nokogiri::XML::SAX::Parser.new(jhovetm)
  # Feed the parser some XML; the block form closes the input file even when parsing raises
  jhove_pathname.open('rb') { |io| parser.parse(io) }
  tech_md_output.to_s
end
105
+
106
# Convert old techMD data to the new technicalMetadata format, entirely in memory.
#
# @param [String] old_tm the old techMD xml to be transformed to new technical metadata format
# @return [String] the converted technicalMetadata xml
def upgrade_technical_metadata(old_tm)
  converted = StringIO.new
  handler = JhoveTechnicalMetadata.new
  handler.digital_object_id = digital_object_id
  # the handler writes into the in-memory buffer instead of a file
  handler.ios = converted
  Nokogiri::XML::SAX::Parser.new(handler).parse(old_tm)
  converted.string
end
119
+
120
+
121
# @return [void] Delete the JHOVE output and technicalMetadata files, whichever of them exist
def cleanup
  jhove_file = jhove_output
  jhove_file.delete if jhove_file.exist?
  tech_md_file = tech_md_output
  tech_md_file.delete if tech_md_file.exist?
end
126
+
127
+ end
@@ -0,0 +1,284 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'time'
4
+ require 'pathname'
5
+
6
+ # A SAX handler for filtering JHOVE output to create a technicalMetadata datastream
7
+ # The previous mechanism (using XSLT transformation) was causing out of memory errors,
8
+ # due to XSLT's behavior of loading both the input and output objects into memory.
9
+ class JhoveTechnicalMetadata < Nokogiri::XML::SAX::Document
10
+
11
+ # @return [IO] the output stream for the result
12
+ attr_accessor :ios
13
+
14
+ # @return [String] The druid of the object, which gets inserted in the root element of the output
15
+ attr_accessor :digital_object_id
16
+
17
# Start with no indentation and write to standard output until another stream is assigned.
def initialize
  @ios = STDOUT
  @indent = 0
end
21
+
22
# @param [Pathname] pathname the location of the technicalMetadata.xml file to be created
# @return [void] Opens the given file for writing and makes it the output stream
def output_file=(pathname)
  stream = pathname.open('w')
  @ios = stream
end
27
+
28
# @param [String] string The character string to be appended to the output
# @return [void] Write the string to the output stream, prefixed by one space per indent level
def output(string)
  padding = ' ' * @indent
  @ios.puts(padding + string)
end
33
+
34
# @param [String] tag the name of the XML element from the parsed input
# @param [Array] attrs the XML attributes of the element, as [name, value] pairs
# @return [void] this method is called by the sax parser at the beginning of an element
def start_element(tag, attrs = [])
  if tag == 'jhove'
    # <jhove> is the root element of the input
    root_open(attrs)
  elsif tag == 'repInfo'
    # A <repInfo> element contains the data for each file
    file_wrapper_open(attrs)
  elsif tag == 'properties'
    # A <properties> element contains the variable data for the file
    properties_open
  elsif tag.start_with?('mix')
    # JHOVE output for image files contains tech md in MIX format that we copy verbatim to output
    mix_open(tag)
  elsif @in_jhove
    # one of the JHOVE elements that we want to automatically copy
    jhove_open(tag, attrs)
  elsif @in_properties
    # inside properties we only look for the LineEndings property
    linebreak_open(tag)
  end
end
61
+
62
# Collect character data for the element currently being read.
#
# The parser may hand over one contiguous run of text in several calls (for example
# around an entity reference), so chunks are appended rather than overwriting each
# other; the element-open handlers reset @text to nil before a new element's text starts.
# The markup characters &, < and > are stored escaped because the close handlers
# write @text into the output as-is.
#
# @param [String] string a run of character data found in the parsed XML
# @return [void] this method is called by the sax parser when a text node is encountered
def characters(string)
  escaped = string.gsub(/[&<>]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;')
  @text = @text ? @text + escaped : escaped
end
67
+
68
# @param [String] tag the name of the XML element from the parsed input
# @return [void] this method is called by the sax parser at the end of an element;
#   it dispatches to the close handler matching the one chosen when the element opened
def end_element(tag)
  if tag == 'jhove'
    root_close
  elsif tag == 'repInfo'
    file_wrapper_close
  elsif tag == 'properties'
    properties_close
  elsif tag.start_with?('mix')
    mix_close(tag)
  elsif @in_jhove
    jhove_close(tag)
  elsif @in_properties
    linebreak_close(tag)
  end
end
88
+
89
# @param [Array] attrs the attributes of the <jhove> element in the XML input (not used)
# @return [void] create the <technicalMetadata> root element of the XML output and include namespace declarations
def root_open(attrs)
  timestamp = Time.now.utc.iso8601
  # the objectId attribute is only written when a digital object id has been assigned
  id_attr = @digital_object_id ? " objectId='#{@digital_object_id}'" : ''
  output "<technicalMetadata#{id_attr} datetime='#{timestamp}'"
  @indent += 2
  [
    "xmlns:jhove='http://hul.harvard.edu/ois/xml/ns/jhove'",
    "xmlns:mix='http://www.loc.gov/mix/v10'",
    "xmlns:textmd='info:lc/xmlns/textMD-v3' >"
  ].each { |declaration| output declaration }
  @indent -= 1
end
103
+
104
# @return [void] write the closing tag of the output document one indent level out, then close the output stream
def root_close
  @indent = @indent - 1
  output('</technicalMetadata>')
  @ios.close
end
110
+
111
# @param [Array] attrs the attributes of the <repInfo> element in the XML input, as [name, value] pairs
# @return [void] Append a <file> element to the output, setting the id attribute to the file path
def file_wrapper_open(attrs)
  filepath = nil
  attrs.each { |name, value| filepath = value if name == 'uri' }
  # the parser hands over decoded attribute values, so markup characters and quotes
  # must be escaped again or a name such as "it's.txt" breaks the single-quoted attribute
  escaped = filepath.to_s.gsub(/[&<>'"]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;',
                                          "'" => '&apos;', '"' => '&quot;')
  output "<file id='#{escaped}'>"
  @indent += 1
  @in_jhove = true
  # forget the previous file's format so it cannot trigger a textMD stanza for this file
  @format = nil
end
120
+
121
# @return [void] Append a </file> tag to close the file data,
#   but first insert a textMD stanza if the file has a text format,
#   and write the properties closing tag if a properties element was opened for this file
def file_wrapper_close
  output_textmd(@linebreak) if ['ASCII', 'HTML', 'TEXT', 'UTF-8'].include?(@format)
  @indent -= 1
  if @in_properties
    output " </jhove:properties>"
  end
  output "</file>"
  # both section flags are cleared ready for the next file
  @in_jhove = false
  @in_properties = false
end
135
+
136
# Cache a JHOVE element so it can be written once we know whether it has children or text.
#
# @param [String] tag the name of the XML element from the parsed input
# @param [Array] attrs the attributes of the element in the XML input, as [name, value] pairs
# @return [void] Copy this jhove element tag and its attributes verbatim
def jhove_open(tag, attrs)
  if @jhove_tag # saved previously
    # we encountered a new element so output what was previously cached
    output "<jhove:#{@jhove_tag}#{@jhove_attrs}>"
    @indent += 1
  end
  # cache the element name and its attributes
  @jhove_tag = tag
  @jhove_attrs = attrs.map do |name, value|
    # attribute values arrive decoded; escape them again so quotes and markup
    # characters cannot break the single-quoted attribute in the output
    escaped = value.to_s.gsub(/[&<>'"]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;',
                                         "'" => '&apos;', '"' => '&quot;')
    " #{name}='#{escaped}'"
  end.join
  @text = nil
  # the line break value goes back to its LF default whenever a JHOVE element opens
  @linebreak = 'LF'
end
154
+
155
# @param [String] tag the name of the XML element from the parsed input
# @return [void] Output a closing tag, preceded by cached data, if such exists
def jhove_close(tag)
  if tag != @jhove_tag
    # the opening tag was already written when a child element arrived
    @indent -= 1
    output "</jhove:#{tag}>"
  elsif @text
    # cached leaf element with text: write it on a single line
    output "<jhove:#{@jhove_tag}#{@jhove_attrs}>#{@text}</jhove:#{tag}>"
  else
    # cached leaf element without text: write it self-closed
    output "<jhove:#{@jhove_tag}#{@jhove_attrs}/>"
  end
  # remember the file format; it decides later whether a textMD stanza is added
  @format = @text if tag == 'format'
  @text = nil
  @jhove_tag = nil
  @jhove_attrs = ""
end
171
+
172
# @return [void] Output a <properties> element if one was encountered in the input,
#   then switch from copying JHOVE elements to the properties mode, in which most
#   input is ignored except mix and LineEndings
def properties_open
  output('<jhove:properties>')
  @indent = @indent + 1
  @in_jhove = false
  @in_properties = true
end
180
+
181
# @return [void] Only steps the indent back one level; the closing tag itself is
#   written when the enclosing file element closes
def properties_close
  @indent = @indent - 1
end
185
+
186
# @param [String] tag the name of the XML element from the parsed input
# @return [void] Copy any Mix data verbatim: the element is cached until we know
#   whether it has children or text
def mix_open(tag)
  unless @mix_tag.nil? || @mix_tag == false
    # a new element arrived, so the cached one is a container: write its opening tag
    output "<#{@mix_tag}>"
    @indent += 1
  end
  # cache the element name and start with no text
  @mix_tag = tag
  @text = nil
end
198
+
199
# @param [String] tag the name of the XML element from the parsed input
# @return [void] Output a closing tag, preceded by cached data, if such exists
def mix_close(tag)
  if tag != @mix_tag
    # the opening tag was already written when a child element arrived
    @indent -= 1
    output "</#{tag}>"
  elsif @text
    # cached leaf element with text: write it on a single line
    output "<#{tag}>#{@text}</#{tag}>"
  else
    # cached leaf element without text: write it self-closed
    output "<#{tag}/>"
  end
  @text = nil
  @mix_tag = nil
end
213
+
214
# @param [String] tag the name of the XML element from the parsed input (not used)
# @return [void] Keep clearing the text cache any time a new element is encountered
def linebreak_open(tag)
  @text &&= nil
end
219
+
220
# @param [String] tag the name of the XML element from the parsed input
# @return [void] Look for the LineEndings name/value pair, which is spread across multiple elements
def linebreak_close(tag)
  if tag == 'name'
    # a <name> holding LineEndings announces that the next <value> is the line break
    @in_line_endings = (@text == 'LineEndings')
  elsif tag == 'value'
    @linebreak = @text if @in_line_endings
    @in_line_endings = false
  end
end
232
+
233
+ # Write a fixed textMD stanza; only the linebreak value varies.
+ # The current indent is saved, set to 0 while the heredoc is written, and restored afterwards.
+ # @param [String] linebreak the line-ending value (e.g. CRLF or LF) found in the JHOVE output (default is LF)
234
+ # @return [void] Output a textMD section within the properties element
235
+ def output_textmd(linebreak)
236
+ indent = @indent
237
+ @indent = 0
238
+ if @in_properties
239
+ # a properties element is already open; its tags are written by other code
240
+ output <<-EOF
241
+ <textmd:textMD>
242
+ <textmd:character_info>
243
+ <textmd:byte_order>big</textmd:byte_order>
244
+ <textmd:byte_size>8</textmd:byte_size>
245
+ <textmd:character_size>1</textmd:character_size>
246
+ <textmd:linebreak>#{linebreak}</textmd:linebreak>
247
+ </textmd:character_info>
248
+ </textmd:textMD>
249
+ EOF
250
+ else
251
+ # there was no properties element in the input, so we must supply the wrapper tags ourselves
252
+ output <<-EOF
253
+ <jhove:properties>
254
+ <textmd:textMD>
255
+ <textmd:character_info>
256
+ <textmd:byte_order>big</textmd:byte_order>
257
+ <textmd:byte_size>8</textmd:byte_size>
258
+ <textmd:character_size>1</textmd:character_size>
259
+ <textmd:linebreak>#{linebreak}</textmd:linebreak>
260
+ </textmd:character_info>
261
+ </textmd:textMD>
262
+ </jhove:properties>
263
+ EOF
264
+ end
265
+ @indent = indent
266
+ end
267
+
268
+ end
269
+
270
+
271
# Below is the equivalent of a java main method.
# For this to work OK, the module/class being invoked
# must already have been loaded by the Ruby interpreter.
#
# Arguments: the object id, the JHOVE output file to read, the technicalMetadata file to write.

if __FILE__ == $0
  abort "Usage: #{$0} object_id jhove_output_file technical_metadata_file" if ARGV.size < 3
  # Create a handler
  jhovetm = JhoveTechnicalMetadata.new
  jhovetm.digital_object_id = ARGV[0]
  jhovetm.output_file = Pathname.new(ARGV[2])
  # Create a SAX parser
  parser = Nokogiri::XML::SAX::Parser.new(jhovetm)
  # Feed the parser some XML; the block form closes the input file when parsing ends
  File.open(ARGV[1], 'rb') { |input| parser.parse(input) }
end