libis-format 0.9.32 → 0.9.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data/types.yml +30 -16
- data/lib/libis/format/config.rb +7 -18
- data/lib/libis/format/converter/image_converter.rb +6 -0
- data/lib/libis/format/droid.rb +82 -25
- data/lib/libis/format/extension_identification.rb +55 -0
- data/lib/libis/format/fido.rb +57 -72
- data/lib/libis/format/file_tool.rb +76 -0
- data/lib/libis/format/identification_tool.rb +174 -0
- data/lib/libis/format/identifier.rb +129 -117
- data/lib/libis/format/type_database.rb +36 -5
- data/lib/libis/format/version.rb +1 -1
- data/lib/libis/format.rb +3 -0
- data/libis-format.gemspec +2 -1
- data/spec/converter_spec.rb +6 -4
- data/spec/identifier_spec.rb +125 -34
- metadata +21 -126
- data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
- data/tools/droid/container-signature-20170330.xml +0 -3584
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +0 -152
- data/tools/droid/droid.sh +0 -152
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-help-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -50
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
- data/tools/fido/conf/container-signature-20170330.xml +0 -3584
- data/tools/fido/conf/dc.xsd +0 -119
- data/tools/fido/conf/dcmitype.xsd +0 -53
- data/tools/fido/conf/dcterms.xsd +0 -383
- data/tools/fido/conf/fido-formats.xsd +0 -173
- data/tools/fido/conf/format_extension_template.xml +0 -105
- data/tools/fido/conf/format_extensions.xml +0 -484
- data/tools/fido/conf/formats-v90.xml +0 -48877
- data/tools/fido/conf/pronom-xml-v90.zip +0 -0
- data/tools/fido/conf/versions.xml +0 -8
- data/tools/fido/fido.bat +0 -4
- data/tools/fido/fido.py +0 -884
- data/tools/fido/fido.sh +0 -5
- data/tools/fido/package.py +0 -96
- data/tools/fido/prepare.py +0 -645
- data/tools/fido/pronomutils.py +0 -200
- data/tools/fido/toxml.py +0 -60
- data/tools/fido/update_signatures.py +0 -183
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
require 'csv'
|
|
2
|
+
require 'tmpdir'
|
|
3
|
+
|
|
4
|
+
require 'singleton'
|
|
5
|
+
require 'libis/tools/extend/string'
|
|
6
|
+
require 'libis/tools/logger'
|
|
7
|
+
require 'libis/tools/command'
|
|
8
|
+
|
|
9
|
+
require 'libis/format/config'
|
|
10
|
+
require 'libis/format/type_database'
|
|
11
|
+
|
|
12
|
+
module Libis
|
|
13
|
+
module Format
|
|
14
|
+
|
|
15
|
+
class IdentificationTool
|
|
16
|
+
include Singleton
|
|
17
|
+
include ::Libis::Tools::Logger
|
|
18
|
+
|
|
19
|
+
def self.bad_mimetype(mimetype)
|
|
20
|
+
self.instance.bad_mimetype(mimetype)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def self.run(file, recursive = false)
|
|
24
|
+
if file.is_a?(Array)
|
|
25
|
+
return run_list file
|
|
26
|
+
elsif file.is_a?(String) && File.exists?(file) && File.readable?(file)
|
|
27
|
+
if File.directory?(file)
|
|
28
|
+
return run_dir(file, recursive)
|
|
29
|
+
elsif File.file?(file)
|
|
30
|
+
return self.instance.run(file)
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
raise ArgumentError,
|
|
34
|
+
'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def self.run_dir(file, recursive = true)
|
|
38
|
+
self.instance.run_dir file, recursive
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def self.run_list(filelist)
|
|
42
|
+
self.instance.run_list filelist
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
protected
|
|
46
|
+
|
|
47
|
+
def create_list_file(filelist)
|
|
48
|
+
list_file = Dir::Tmpname.make_tmpname(%w'file .list', nil)
|
|
49
|
+
File.open(list_file, 'w') do |f|
|
|
50
|
+
filelist.each do |fname|
|
|
51
|
+
f.write "#{fname}\n"
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
yield(list_file)
|
|
55
|
+
ensure
|
|
56
|
+
File.delete(list_file)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def find_files(dir, recurse = true)
|
|
60
|
+
args = []
|
|
61
|
+
args << '-L'
|
|
62
|
+
args << dir.escape_for_string
|
|
63
|
+
args << '-maxdepth' << '1' unless recurse
|
|
64
|
+
args << '-type' << 'f'
|
|
65
|
+
args << '-print'
|
|
66
|
+
output = ::Libis::Tools::Command.run('find', *args)
|
|
67
|
+
warn "Find command errors: #{output[:err].join("\n")}" unless output[:err].empty?
|
|
68
|
+
output[:out]
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# Reformat output to make it easier to post-process and decide on the preferred format
|
|
72
|
+
#
|
|
73
|
+
# input format:
|
|
74
|
+
# [
|
|
75
|
+
# { filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }
|
|
76
|
+
# ]
|
|
77
|
+
#
|
|
78
|
+
# output format:
|
|
79
|
+
# { <filename> => [<result>, ...], ... }
|
|
80
|
+
#
|
|
81
|
+
# <result> is the enchanced Hash output of the identification tool:
|
|
82
|
+
# { mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
|
|
83
|
+
#
|
|
84
|
+
def process_output(output)
|
|
85
|
+
output.reduce({}) do |results, x|
|
|
86
|
+
filepath = x.delete(:filepath)
|
|
87
|
+
results[filepath] ||= []
|
|
88
|
+
results[filepath.freeze] << annotate(x)
|
|
89
|
+
results
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Enhance the output with mimetype and score
|
|
94
|
+
def annotate(result)
|
|
95
|
+
# Enhance result with mimetype if needed
|
|
96
|
+
if bad_mimetypes.include?(result[:mimetype]) && !bad_puids.include?(result[:puid])
|
|
97
|
+
result[:mimetype] = get_mimetype(result[:puid])
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# Normalize the mimetype
|
|
101
|
+
Libis::Format::TypeDatabase.normalize(result, PUID: :puid, MIME: :mimetype)
|
|
102
|
+
|
|
103
|
+
# Default score is 5
|
|
104
|
+
result[:score] = 5
|
|
105
|
+
|
|
106
|
+
# Weak detection score is 1
|
|
107
|
+
result[:score] = 1 if bad_mimetypes.include? result[:mimetype]
|
|
108
|
+
|
|
109
|
+
# freeze all strings
|
|
110
|
+
result.each {|_, v| v.freeze if v.is_a?(String)}
|
|
111
|
+
|
|
112
|
+
# Adapt score based on matchtype
|
|
113
|
+
result[:matchtype] = result[:matchtype].to_s.downcase
|
|
114
|
+
case result[:matchtype]
|
|
115
|
+
|
|
116
|
+
# Signature match increases score with 2
|
|
117
|
+
when 'signature'
|
|
118
|
+
result[:score] += 2
|
|
119
|
+
# typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
|
|
120
|
+
# ext = File.extname(result[:filename])
|
|
121
|
+
# result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
|
|
122
|
+
|
|
123
|
+
# Container match increases score with 4
|
|
124
|
+
when 'container'
|
|
125
|
+
result[:score] += 4
|
|
126
|
+
# typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
|
|
127
|
+
# ext = File.extname(result[:filename])
|
|
128
|
+
# result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
|
|
129
|
+
|
|
130
|
+
# Extension match is the weakest identification; score is lowered by 2 points
|
|
131
|
+
when 'extension'
|
|
132
|
+
result[:score] -= 2
|
|
133
|
+
|
|
134
|
+
# Magic code (file tool) is to be trused even less
|
|
135
|
+
when 'magic'
|
|
136
|
+
result[:score] -= 3
|
|
137
|
+
|
|
138
|
+
# Or no change otherwise
|
|
139
|
+
else
|
|
140
|
+
# do nothing
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# Detecting a zip file should decrease the score as it may hide one of the many zip-based formats (e.g. epub,
|
|
144
|
+
# Office OpenXML, OpenDocument, jar, maff, svx)
|
|
145
|
+
if result[:mimetype] == 'application/zip'
|
|
146
|
+
result[:score] -= 2
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
# Return result enhanced with mimetype and score fields
|
|
150
|
+
result
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def get_mimetype(puid)
|
|
154
|
+
::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def get_puid(mimetype)
|
|
158
|
+
::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
attr_accessor :bad_mimetypes, :bad_puids
|
|
162
|
+
|
|
163
|
+
def initialize
|
|
164
|
+
@bad_mimetypes = [nil, '', 'None', 'application/octet-stream']
|
|
165
|
+
@bad_puids = [nil, 'fmt/unknown']
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def bad_mimetype(mimetype)
|
|
169
|
+
@bad_mimetypes << mimetype
|
|
170
|
+
end
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
end
|
|
174
|
+
end
|
|
@@ -3,13 +3,17 @@
|
|
|
3
3
|
require 'singleton'
|
|
4
4
|
|
|
5
5
|
require 'libis-tools'
|
|
6
|
+
require 'libis/tools/extend/hash'
|
|
6
7
|
require 'libis/tools/extend/string'
|
|
7
8
|
require 'libis/tools/extend/empty'
|
|
8
9
|
|
|
9
10
|
require 'libis/format/type_database'
|
|
10
11
|
|
|
12
|
+
require_relative 'config'
|
|
11
13
|
require_relative 'fido'
|
|
12
14
|
require_relative 'droid'
|
|
15
|
+
require_relative 'file_tool'
|
|
16
|
+
require_relative 'extension_identification'
|
|
13
17
|
|
|
14
18
|
module Libis
|
|
15
19
|
module Format
|
|
@@ -18,39 +22,8 @@ module Libis
|
|
|
18
22
|
include ::Libis::Tools::Logger
|
|
19
23
|
include Singleton
|
|
20
24
|
|
|
21
|
-
RETRY_MIMETYPES = %w(application/zip) + ::Libis::Format::Fido::BAD_MIMETYPES
|
|
22
|
-
FIDO_FAILURES = %w(application/vnd.oasis.opendocument.text application/vnd.oasis.opendocument.spreadsheet)
|
|
23
|
-
|
|
24
|
-
attr_reader :xml_validations
|
|
25
|
-
|
|
26
|
-
protected
|
|
27
|
-
|
|
28
|
-
def initialize
|
|
29
|
-
data_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', '..', '..', 'data'))
|
|
30
|
-
@fido_formats = [(File.join(data_dir, 'lias_formats.xml'))]
|
|
31
|
-
# noinspection RubyStringKeysInHashInspection
|
|
32
|
-
@xml_validations = {'archive/ead' => File.join(data_dir, 'ead.xsd')}
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
def result_ok?(result, who_is_asking = nil)
|
|
36
|
-
result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
|
|
37
|
-
return false if result.empty?
|
|
38
|
-
return true unless result[:TYPE].empty?
|
|
39
|
-
return false if RETRY_MIMETYPES.include? result[:mimetype]
|
|
40
|
-
return false if FIDO_FAILURES.include? result[:mimetype] and who_is_asking == :DROID
|
|
41
|
-
!(result[:mimetype].empty? and result[:puid].empty?)
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
def get_puid(mimetype)
|
|
45
|
-
::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
|
|
46
|
-
end
|
|
47
|
-
|
|
48
25
|
public
|
|
49
26
|
|
|
50
|
-
def self.add_fido_format(f)
|
|
51
|
-
::Libis::Format::Fido.add_format f
|
|
52
|
-
end
|
|
53
|
-
|
|
54
27
|
def self.add_xml_validation(mimetype, xsd_file)
|
|
55
28
|
instance.xml_validations[mimetype] = xsd_file
|
|
56
29
|
end
|
|
@@ -59,134 +32,173 @@ module Libis
|
|
|
59
32
|
instance.xml_validations
|
|
60
33
|
end
|
|
61
34
|
|
|
62
|
-
def self.get(
|
|
63
|
-
instance.get
|
|
35
|
+
def self.get(file, options = {})
|
|
36
|
+
instance.get file, options
|
|
64
37
|
end
|
|
65
38
|
|
|
66
|
-
|
|
39
|
+
attr_reader :xml_validations
|
|
67
40
|
|
|
68
|
-
|
|
69
|
-
error 'File %s cannot be found.', file
|
|
70
|
-
return nil
|
|
71
|
-
end
|
|
72
|
-
if File.directory? file
|
|
73
|
-
error '%s is a directory.', file
|
|
74
|
-
return nil
|
|
75
|
-
end
|
|
41
|
+
def get(file, options = {})
|
|
76
42
|
|
|
77
|
-
options
|
|
43
|
+
options[:droid] = true unless options[:tool] and options[:tool] != :droid
|
|
44
|
+
options[:fido] = true unless options[:tool] and options[:tool] != :fido
|
|
45
|
+
options[:file] = true unless options[:tool] and options[:tool] != :file
|
|
78
46
|
|
|
79
|
-
result = {messages: []}
|
|
47
|
+
result = {messages: [], output: {}, formats: {}}
|
|
80
48
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
49
|
+
begin
|
|
50
|
+
get_droid_identification(file, options[:recursive], result) if options[:droid]
|
|
51
|
+
rescue => e
|
|
52
|
+
log_msg(result, :error, "Error running Droid: #{e.message} @ #{e.backtrace.first}")
|
|
53
|
+
end
|
|
85
54
|
|
|
86
|
-
|
|
87
|
-
|
|
55
|
+
begin
|
|
56
|
+
get_fido_identification(file, options[:recursive], result) if options[:fido]
|
|
57
|
+
rescue => e
|
|
58
|
+
log_msg(result, :error, "Error running Fido: #{e.message} @ #{e.backtrace.first}")
|
|
59
|
+
end
|
|
88
60
|
|
|
89
|
-
|
|
90
|
-
|
|
61
|
+
begin
|
|
62
|
+
get_file_identification(file, options[:recursive], result) if options[:file]
|
|
63
|
+
rescue => e
|
|
64
|
+
log_msg(result, :error, "Error running File: #{e.message} @ #{e.backtrace.first}")
|
|
65
|
+
end
|
|
91
66
|
|
|
92
|
-
#
|
|
93
|
-
result = get_extension_identification(file, result)
|
|
67
|
+
# get_extension_identification(file, options[:recursive], result)
|
|
94
68
|
|
|
95
69
|
# determine XML type. Add custom types at runtime with
|
|
96
70
|
# Libis::Tools::Format::Identifier.add_xml_validation('my_type', '/path/to/my_type.xsd')
|
|
97
|
-
|
|
71
|
+
validate_against_xml_schema(result)
|
|
98
72
|
|
|
99
|
-
result
|
|
100
|
-
log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
|
|
101
|
-
log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
|
|
102
|
-
end
|
|
73
|
+
process_results(result)
|
|
103
74
|
|
|
104
|
-
|
|
105
|
-
|
|
75
|
+
# result[:mimetype] ?
|
|
76
|
+
# log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
|
|
77
|
+
# log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
|
|
106
78
|
|
|
107
|
-
|
|
79
|
+
result
|
|
108
80
|
|
|
109
|
-
|
|
81
|
+
end
|
|
110
82
|
|
|
111
|
-
|
|
112
|
-
result[:method] = 'fido'
|
|
83
|
+
protected
|
|
113
84
|
|
|
114
|
-
|
|
85
|
+
def initialize
|
|
86
|
+
@xml_validations = Libis::Format::Config[:xml_validations].to_h
|
|
115
87
|
end
|
|
116
88
|
|
|
117
|
-
def
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
result.clear
|
|
123
|
-
droid_output = droid_output.first
|
|
124
|
-
result[:mimetype] = droid_output[:mime_type].to_s.split(/[\s,]+/).find { |x| x =~ /.*\/.*/ }
|
|
125
|
-
result[:matchtype] = droid_output[:method]
|
|
126
|
-
result[:puid] = droid_output[:puid]
|
|
127
|
-
result[:format_name] = droid_output[:format_name]
|
|
128
|
-
result[:format_version] = droid_output[:format_version]
|
|
129
|
-
result[:method] = 'droid'
|
|
89
|
+
def get_file_identification(file, recursive, result)
|
|
90
|
+
output = ::Libis::Format::FileTool.run(file, recursive)
|
|
91
|
+
process_tool_output(output, result)
|
|
92
|
+
output
|
|
93
|
+
end
|
|
130
94
|
|
|
131
|
-
|
|
95
|
+
def get_fido_identification(file, recursive, result)
|
|
96
|
+
output = ::Libis::Format::Fido.run(file, recursive)
|
|
97
|
+
process_tool_output(output, result)
|
|
98
|
+
output
|
|
132
99
|
end
|
|
133
100
|
|
|
134
|
-
def
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
result[:method] = 'file'
|
|
145
|
-
rescue Exception
|
|
146
|
-
# ignored
|
|
147
|
-
end
|
|
148
|
-
result
|
|
101
|
+
def get_droid_identification(file, recursive, result)
|
|
102
|
+
output = ::Libis::Format::Droid.run(file, recursive)
|
|
103
|
+
process_tool_output(output, result)
|
|
104
|
+
output
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def get_extension_identification(file, recursive, result)
|
|
108
|
+
output = ::Libis::Format::ExtensionIdentification.run(file, recursive)
|
|
109
|
+
process_tool_output(output, result)
|
|
110
|
+
output
|
|
149
111
|
end
|
|
150
112
|
|
|
151
|
-
def
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
result[:mimetype] = info[:MIME].first rescue nil
|
|
157
|
-
result[:puid] = info[:PUID].first rescue nil
|
|
113
|
+
def validate_against_xml_schema(result)
|
|
114
|
+
result[:output].each do |file, file_results|
|
|
115
|
+
file_results.each do |file_result|
|
|
116
|
+
xml_validate(file, file_result, result)
|
|
117
|
+
end
|
|
158
118
|
end
|
|
159
|
-
result[:method] = 'extension'
|
|
160
|
-
result
|
|
161
119
|
end
|
|
162
120
|
|
|
163
|
-
def
|
|
164
|
-
return
|
|
121
|
+
def xml_validate(file, file_result, result)
|
|
122
|
+
return unless file_result[:mimetype] =~ /^(text|application)\/xml$/
|
|
165
123
|
doc = ::Libis::Tools::XmlDocument.open file
|
|
166
124
|
xml_validations.each do |mime, xsd_file|
|
|
167
125
|
next unless xsd_file
|
|
168
126
|
begin
|
|
169
127
|
if doc.validates_against?(xsd_file)
|
|
170
128
|
log_msg result, :debug, "XML file validated against XML Schema: #{xsd_file}"
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
|
|
129
|
+
info = {mimetype: mime, tool: file_result[:source], source: :xsd_validation, match_type: 'xsd_validation', format_version: '', }
|
|
130
|
+
file_result.merge! Libis::Format::TypeDatabase.enrich(info, PUID: :puid, MIME: :mimetype, NAME: :format_name)
|
|
174
131
|
end
|
|
175
|
-
rescue
|
|
176
|
-
# Do nothing - probably Nokogiri chrashed during validation.
|
|
177
|
-
#
|
|
178
|
-
# so we
|
|
132
|
+
rescue => e
|
|
133
|
+
# Do nothing - probably Nokogiri chrashed during validation. Could have many causes
|
|
134
|
+
# (remote schema (firewall, network, link rot, ...), schema syntax error, corrupt XML,...)
|
|
135
|
+
# so we log and continue.
|
|
136
|
+
log_msg(result, :warn, "Error during XML validation: #{e.message}")
|
|
179
137
|
end
|
|
180
138
|
end
|
|
181
|
-
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def process_results(result)
|
|
142
|
+
result[:output].map do |file, output|
|
|
143
|
+
file_result = result[:formats][file] = {}
|
|
144
|
+
if output.empty?
|
|
145
|
+
file_result = {
|
|
146
|
+
mimetype: 'application/octet-stream',
|
|
147
|
+
puid: 'fmt/unknown',
|
|
148
|
+
score: 0,
|
|
149
|
+
source: nil
|
|
150
|
+
}
|
|
151
|
+
else
|
|
152
|
+
format_matches = output.group_by {|x| [x[:mimetype], x[:puid]]}
|
|
153
|
+
format_matches.each do |match, group|
|
|
154
|
+
format_matches[match] = group.group_by {|x| x[:score]}.sort.reverse.to_h
|
|
155
|
+
end
|
|
156
|
+
case format_matches.count
|
|
157
|
+
when 0
|
|
158
|
+
# No this really cannot happen. If there are not hits, there will be at least a format [nil,nil]
|
|
159
|
+
when 1
|
|
160
|
+
# only one match, that's easy. The first of the highest score will be used
|
|
161
|
+
file_result.merge!(get_best_result(output))
|
|
162
|
+
else
|
|
163
|
+
process_multiple_formats(file_result, format_matches, output)
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
def process_multiple_formats(file_result, format_matches, output)
|
|
170
|
+
# multiple matches. Let's select the highest score matches
|
|
171
|
+
file_result.merge!(get_best_result(output))
|
|
172
|
+
file_result[:alternatives] = []
|
|
173
|
+
format_matches.keys.each do |mime, puid|
|
|
174
|
+
next if file_result[:mimetype] == mime && puid.nil?
|
|
175
|
+
selection = output.select {|x| x[:mimetype] == mime && x[:puid] == puid}
|
|
176
|
+
file_result[:alternatives] << get_best_result(selection)
|
|
177
|
+
end
|
|
178
|
+
file_result[:alternatives] = file_result[:alternatives].sort_by {|x| x[:score]}.reverse
|
|
179
|
+
file_result.delete(:alternatives) if file_result[:alternatives].size <= 1
|
|
182
180
|
end
|
|
183
181
|
|
|
184
182
|
private
|
|
185
183
|
|
|
184
|
+
def process_tool_output(output, result)
|
|
185
|
+
output.each do |file, file_output|
|
|
186
|
+
result[:output][file] ||= []
|
|
187
|
+
result[:output][file] += file_output
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
|
|
186
191
|
def log_msg(result, severity, text)
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
192
|
+
result[:messages] << [severity, text]
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
def get_mimetype(puid)
|
|
196
|
+
::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def get_best_result(results)
|
|
200
|
+
score = results.map {|x| x[:score]}.max
|
|
201
|
+
results.select {|x| x[:score] == score}.reduce(:apply_defaults)
|
|
190
202
|
end
|
|
191
203
|
|
|
192
204
|
end
|
|
@@ -7,6 +7,8 @@ require 'backports/rails/hash'
|
|
|
7
7
|
require 'libis/tools/logger'
|
|
8
8
|
require 'libis/tools/extend/string'
|
|
9
9
|
|
|
10
|
+
require_relative 'config'
|
|
11
|
+
|
|
10
12
|
module Libis
|
|
11
13
|
module Format
|
|
12
14
|
|
|
@@ -19,6 +21,25 @@ module Libis
|
|
|
19
21
|
end
|
|
20
22
|
|
|
21
23
|
def self.enrich(info, map_keys = {})
|
|
24
|
+
return {} unless info.is_a? Hash
|
|
25
|
+
mapper = Hash.new {|hash,key| hash[key] = key}
|
|
26
|
+
mapper.merge! map_keys
|
|
27
|
+
unless (puid = info[mapper[:PUID]]).blank?
|
|
28
|
+
info[mapper[:TYPE]] ||= puid_infos(puid).first[:TYPE] rescue nil
|
|
29
|
+
end
|
|
30
|
+
unless (mime = info[mapper[:MIME]]).blank?
|
|
31
|
+
info[mapper[:TYPE]] ||= mime_infos(mime).first[:TYPE] rescue nil
|
|
32
|
+
end
|
|
33
|
+
unless (type_name = info[mapper[:TYPE]]).nil?
|
|
34
|
+
mapper.keys.each do |key|
|
|
35
|
+
info[mapper[key]] = get(type_name, key) || info[mapper[key]]
|
|
36
|
+
end
|
|
37
|
+
info[mapper[:GROUP]] = self.type_group(type_name)
|
|
38
|
+
end
|
|
39
|
+
info
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def self.normalize(info, map_keys = {})
|
|
22
43
|
return {} unless info.is_a? Hash
|
|
23
44
|
mapper = Hash.new {|hash,key| hash[key] = key}
|
|
24
45
|
mapper.merge! map_keys
|
|
@@ -29,14 +50,25 @@ module Libis
|
|
|
29
50
|
info[mapper[:TYPE]] ||= self.mime_infos(mime).first[:TYPE] rescue nil
|
|
30
51
|
end
|
|
31
52
|
unless (type_name = info[mapper[:TYPE]]).nil?
|
|
32
|
-
info[mapper[:MIME]] = self.type_mimetypes(type_name).first if
|
|
33
|
-
info[mapper[:PUID]] = self.type_puids(type_name).first if info[mapper[:PUID]].blank?
|
|
34
|
-
info[mapper[:EXTENSIONS]] = self.type_extentions(type_name)
|
|
53
|
+
info[mapper[:MIME]] = self.type_mimetypes(type_name).first if self.type_mimetypes(type_name).first
|
|
35
54
|
info[mapper[:GROUP]] = self.type_group(type_name)
|
|
36
55
|
end
|
|
37
56
|
info
|
|
38
57
|
end
|
|
39
58
|
|
|
59
|
+
def self.get(type_name, key)
|
|
60
|
+
case key
|
|
61
|
+
when :MIME
|
|
62
|
+
type_mimetypes(type_name).first
|
|
63
|
+
when :PUID
|
|
64
|
+
type_puids(type_name).first
|
|
65
|
+
when :EXTENSION
|
|
66
|
+
type_extentions(type_name).first
|
|
67
|
+
else
|
|
68
|
+
self.typeinfo(type_name)[key]
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
40
72
|
def self.type_group(t)
|
|
41
73
|
typeinfo(t)[:GROUP]
|
|
42
74
|
end
|
|
@@ -159,8 +191,7 @@ module Libis
|
|
|
159
191
|
|
|
160
192
|
def initialize
|
|
161
193
|
@types = Hash.new
|
|
162
|
-
|
|
163
|
-
type_database = File.join(data_dir, 'types.yml')
|
|
194
|
+
type_database = Libis::Format::Config[:type_database]
|
|
164
195
|
load_types(type_database)
|
|
165
196
|
end
|
|
166
197
|
|
data/lib/libis/format/version.rb
CHANGED
data/lib/libis/format.rb
CHANGED
|
@@ -5,9 +5,12 @@ module Libis
|
|
|
5
5
|
autoload :Config, 'libis/format/config'
|
|
6
6
|
autoload :TypeDatabase, 'libis/format/type_database'
|
|
7
7
|
autoload :Identifier, 'libis/format/identifier'
|
|
8
|
+
autoload :Identifier, 'libis/format/identifier'
|
|
8
9
|
|
|
10
|
+
autoload :FileTool, 'libis/format/file_tool'
|
|
9
11
|
autoload :Fido, 'libis/format/fido'
|
|
10
12
|
autoload :Droid, 'libis/format/droid'
|
|
13
|
+
autoload :ExtensionIdentification, 'libis/format/extension_identification'
|
|
11
14
|
|
|
12
15
|
autoload :OfficeToPdf, 'libis/format/office_to_pdf'
|
|
13
16
|
autoload :PdfCopy, 'libis/format/pdf_copy'
|
data/libis-format.gemspec
CHANGED
|
@@ -26,8 +26,9 @@ Gem::Specification.new do |spec|
|
|
|
26
26
|
spec.add_development_dependency 'rake', '~> 10.3'
|
|
27
27
|
spec.add_development_dependency 'rspec', '~> 3.1'
|
|
28
28
|
spec.add_development_dependency 'simplecov', '~> 0.9'
|
|
29
|
+
spec.add_development_dependency 'awesome_print'
|
|
29
30
|
|
|
30
|
-
spec.add_runtime_dependency 'libis-tools', '~> 0.9'
|
|
31
|
+
spec.add_runtime_dependency 'libis-tools', '~> 0.9.52'
|
|
31
32
|
spec.add_runtime_dependency 'os', '= 0.9.6'
|
|
32
33
|
spec.add_runtime_dependency 'mini_magick', '~> 4.3'
|
|
33
34
|
spec.add_runtime_dependency 'deep_dive', '~> 0.3'
|
data/spec/converter_spec.rb
CHANGED
|
@@ -23,6 +23,8 @@ describe 'Converters' do
|
|
|
23
23
|
|
|
24
24
|
before(:all) {
|
|
25
25
|
Libis::Tools::Config.logger.level = :WARN
|
|
26
|
+
::Libis::Format::Config[:droid_path] = '/opt/droid/droid.sh'
|
|
27
|
+
::Libis::Format::Config[:fido_path] = '/usr/local/bin/fido'
|
|
26
28
|
}
|
|
27
29
|
|
|
28
30
|
context 'Repository' do
|
|
@@ -114,7 +116,7 @@ describe 'Converters' do
|
|
|
114
116
|
expect(result).to eq tgt_file
|
|
115
117
|
compare = MiniMagick::Tool::Compare.new
|
|
116
118
|
compare << ref_file << tgt_file
|
|
117
|
-
compare.metric << '
|
|
119
|
+
compare.metric << 'MAE'
|
|
118
120
|
compare.fuzz << '1%'
|
|
119
121
|
compare << diff_file
|
|
120
122
|
compare.call {|_, _, status| expect(status).to be 0}
|
|
@@ -132,7 +134,7 @@ describe 'Converters' do
|
|
|
132
134
|
expect(result).to eq tgt_file
|
|
133
135
|
compare = MiniMagick::Tool::Compare.new
|
|
134
136
|
compare << ref_file << tgt_file
|
|
135
|
-
compare.metric << '
|
|
137
|
+
compare.metric << 'MAE'
|
|
136
138
|
compare << diff_file
|
|
137
139
|
compare.call {|_, _, status| expect(status).to be 0}
|
|
138
140
|
FileUtils.rm tgt_file, force: true
|
|
@@ -167,7 +169,7 @@ describe 'Converters' do
|
|
|
167
169
|
compare = MiniMagick::Tool::Compare.new
|
|
168
170
|
compare << ref_file << tgt_file
|
|
169
171
|
compare.metric << 'AE'
|
|
170
|
-
compare.fuzz << '
|
|
172
|
+
compare.fuzz << '100%'
|
|
171
173
|
compare << diff_file
|
|
172
174
|
compare.call do |_stdin, _stdout, status|
|
|
173
175
|
expect(status).to be 0
|
|
@@ -189,7 +191,7 @@ describe 'Converters' do
|
|
|
189
191
|
expect(File.exist?(tgt_file)).to be_truthy
|
|
190
192
|
compare = MiniMagick::Tool::Compare.new
|
|
191
193
|
compare << ref_file << tgt_file
|
|
192
|
-
compare.metric << '
|
|
194
|
+
compare.metric << 'MAE'
|
|
193
195
|
compare.fuzz << '10%'
|
|
194
196
|
compare << diff_file
|
|
195
197
|
compare.call {|_,_,status| expect(status).to be 0}
|