libis-format 0.9.32 → 0.9.33
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data/types.yml +30 -16
- data/lib/libis/format/config.rb +7 -18
- data/lib/libis/format/converter/image_converter.rb +6 -0
- data/lib/libis/format/droid.rb +82 -25
- data/lib/libis/format/extension_identification.rb +55 -0
- data/lib/libis/format/fido.rb +57 -72
- data/lib/libis/format/file_tool.rb +76 -0
- data/lib/libis/format/identification_tool.rb +174 -0
- data/lib/libis/format/identifier.rb +129 -117
- data/lib/libis/format/type_database.rb +36 -5
- data/lib/libis/format/version.rb +1 -1
- data/lib/libis/format.rb +3 -0
- data/libis-format.gemspec +2 -1
- data/spec/converter_spec.rb +6 -4
- data/spec/identifier_spec.rb +125 -34
- metadata +21 -126
- data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
- data/tools/droid/container-signature-20170330.xml +0 -3584
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +0 -152
- data/tools/droid/droid.sh +0 -152
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-help-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -50
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
- data/tools/fido/conf/container-signature-20170330.xml +0 -3584
- data/tools/fido/conf/dc.xsd +0 -119
- data/tools/fido/conf/dcmitype.xsd +0 -53
- data/tools/fido/conf/dcterms.xsd +0 -383
- data/tools/fido/conf/fido-formats.xsd +0 -173
- data/tools/fido/conf/format_extension_template.xml +0 -105
- data/tools/fido/conf/format_extensions.xml +0 -484
- data/tools/fido/conf/formats-v90.xml +0 -48877
- data/tools/fido/conf/pronom-xml-v90.zip +0 -0
- data/tools/fido/conf/versions.xml +0 -8
- data/tools/fido/fido.bat +0 -4
- data/tools/fido/fido.py +0 -884
- data/tools/fido/fido.sh +0 -5
- data/tools/fido/package.py +0 -96
- data/tools/fido/prepare.py +0 -645
- data/tools/fido/pronomutils.py +0 -200
- data/tools/fido/toxml.py +0 -60
- data/tools/fido/update_signatures.py +0 -183
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
require 'singleton'
|
5
|
+
require 'libis/tools/extend/string'
|
6
|
+
require 'libis/tools/logger'
|
7
|
+
require 'libis/tools/command'
|
8
|
+
|
9
|
+
require 'libis/format/config'
|
10
|
+
require 'libis/format/type_database'
|
11
|
+
|
12
|
+
module Libis
|
13
|
+
module Format
|
14
|
+
|
15
|
+
class IdentificationTool
|
16
|
+
include Singleton
|
17
|
+
include ::Libis::Tools::Logger
|
18
|
+
|
19
|
+
def self.bad_mimetype(mimetype)
|
20
|
+
self.instance.bad_mimetype(mimetype)
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.run(file, recursive = false)
|
24
|
+
if file.is_a?(Array)
|
25
|
+
return run_list file
|
26
|
+
elsif file.is_a?(String) && File.exists?(file) && File.readable?(file)
|
27
|
+
if File.directory?(file)
|
28
|
+
return run_dir(file, recursive)
|
29
|
+
elsif File.file?(file)
|
30
|
+
return self.instance.run(file)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
raise ArgumentError,
|
34
|
+
'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.run_dir(file, recursive = true)
|
38
|
+
self.instance.run_dir file, recursive
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.run_list(filelist)
|
42
|
+
self.instance.run_list filelist
|
43
|
+
end
|
44
|
+
|
45
|
+
protected
|
46
|
+
|
47
|
+
def create_list_file(filelist)
|
48
|
+
list_file = Dir::Tmpname.make_tmpname(%w'file .list', nil)
|
49
|
+
File.open(list_file, 'w') do |f|
|
50
|
+
filelist.each do |fname|
|
51
|
+
f.write "#{fname}\n"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
yield(list_file)
|
55
|
+
ensure
|
56
|
+
File.delete(list_file)
|
57
|
+
end
|
58
|
+
|
59
|
+
def find_files(dir, recurse = true)
|
60
|
+
args = []
|
61
|
+
args << '-L'
|
62
|
+
args << dir.escape_for_string
|
63
|
+
args << '-maxdepth' << '1' unless recurse
|
64
|
+
args << '-type' << 'f'
|
65
|
+
args << '-print'
|
66
|
+
output = ::Libis::Tools::Command.run('find', *args)
|
67
|
+
warn "Find command errors: #{output[:err].join("\n")}" unless output[:err].empty?
|
68
|
+
output[:out]
|
69
|
+
end
|
70
|
+
|
71
|
+
# Reformat output to make it easier to post-process and decide on the preferred format
|
72
|
+
#
|
73
|
+
# input format:
|
74
|
+
# [
|
75
|
+
# { filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }
|
76
|
+
# ]
|
77
|
+
#
|
78
|
+
# output format:
|
79
|
+
# { <filename> => [<result>, ...], ... }
|
80
|
+
#
|
81
|
+
# <result> is the enhanced Hash output of the identification tool:
|
82
|
+
# { mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
|
83
|
+
#
|
84
|
+
def process_output(output)
|
85
|
+
output.reduce({}) do |results, x|
|
86
|
+
filepath = x.delete(:filepath)
|
87
|
+
results[filepath] ||= []
|
88
|
+
results[filepath.freeze] << annotate(x)
|
89
|
+
results
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Enhance the output with mimetype and score
|
94
|
+
def annotate(result)
|
95
|
+
# Enhance result with mimetype if needed
|
96
|
+
if bad_mimetypes.include?(result[:mimetype]) && !bad_puids.include?(result[:puid])
|
97
|
+
result[:mimetype] = get_mimetype(result[:puid])
|
98
|
+
end
|
99
|
+
|
100
|
+
# Normalize the mimetype
|
101
|
+
Libis::Format::TypeDatabase.normalize(result, PUID: :puid, MIME: :mimetype)
|
102
|
+
|
103
|
+
# Default score is 5
|
104
|
+
result[:score] = 5
|
105
|
+
|
106
|
+
# Weak detection score is 1
|
107
|
+
result[:score] = 1 if bad_mimetypes.include? result[:mimetype]
|
108
|
+
|
109
|
+
# freeze all strings
|
110
|
+
result.each {|_, v| v.freeze if v.is_a?(String)}
|
111
|
+
|
112
|
+
# Adapt score based on matchtype
|
113
|
+
result[:matchtype] = result[:matchtype].to_s.downcase
|
114
|
+
case result[:matchtype]
|
115
|
+
|
116
|
+
# Signature match increases score by 2
|
117
|
+
when 'signature'
|
118
|
+
result[:score] += 2
|
119
|
+
# typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
|
120
|
+
# ext = File.extname(result[:filename])
|
121
|
+
# result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
|
122
|
+
|
123
|
+
# Container match increases score by 4
|
124
|
+
when 'container'
|
125
|
+
result[:score] += 4
|
126
|
+
# typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
|
127
|
+
# ext = File.extname(result[:filename])
|
128
|
+
# result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
|
129
|
+
|
130
|
+
# Extension match is the weakest identification; score is lowered by 2 points
|
131
|
+
when 'extension'
|
132
|
+
result[:score] -= 2
|
133
|
+
|
134
|
+
# Magic code (file tool) is to be trusted even less
|
135
|
+
when 'magic'
|
136
|
+
result[:score] -= 3
|
137
|
+
|
138
|
+
# Or no change otherwise
|
139
|
+
else
|
140
|
+
# do nothing
|
141
|
+
end
|
142
|
+
|
143
|
+
# Detecting a zip file should decrease the score as it may hide one of the many zip-based formats (e.g. epub,
|
144
|
+
# Office OpenXML, OpenDocument, jar, maff, svx)
|
145
|
+
if result[:mimetype] == 'application/zip'
|
146
|
+
result[:score] -= 2
|
147
|
+
end
|
148
|
+
|
149
|
+
# Return result enhanced with mimetype and score fields
|
150
|
+
result
|
151
|
+
end
|
152
|
+
|
153
|
+
def get_mimetype(puid)
|
154
|
+
::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
|
155
|
+
end
|
156
|
+
|
157
|
+
def get_puid(mimetype)
|
158
|
+
::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
|
159
|
+
end
|
160
|
+
|
161
|
+
attr_accessor :bad_mimetypes, :bad_puids
|
162
|
+
|
163
|
+
def initialize
|
164
|
+
@bad_mimetypes = [nil, '', 'None', 'application/octet-stream']
|
165
|
+
@bad_puids = [nil, 'fmt/unknown']
|
166
|
+
end
|
167
|
+
|
168
|
+
def bad_mimetype(mimetype)
|
169
|
+
@bad_mimetypes << mimetype
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
end
|
174
|
+
end
|
@@ -3,13 +3,17 @@
|
|
3
3
|
require 'singleton'
|
4
4
|
|
5
5
|
require 'libis-tools'
|
6
|
+
require 'libis/tools/extend/hash'
|
6
7
|
require 'libis/tools/extend/string'
|
7
8
|
require 'libis/tools/extend/empty'
|
8
9
|
|
9
10
|
require 'libis/format/type_database'
|
10
11
|
|
12
|
+
require_relative 'config'
|
11
13
|
require_relative 'fido'
|
12
14
|
require_relative 'droid'
|
15
|
+
require_relative 'file_tool'
|
16
|
+
require_relative 'extension_identification'
|
13
17
|
|
14
18
|
module Libis
|
15
19
|
module Format
|
@@ -18,39 +22,8 @@ module Libis
|
|
18
22
|
include ::Libis::Tools::Logger
|
19
23
|
include Singleton
|
20
24
|
|
21
|
-
RETRY_MIMETYPES = %w(application/zip) + ::Libis::Format::Fido::BAD_MIMETYPES
|
22
|
-
FIDO_FAILURES = %w(application/vnd.oasis.opendocument.text application/vnd.oasis.opendocument.spreadsheet)
|
23
|
-
|
24
|
-
attr_reader :xml_validations
|
25
|
-
|
26
|
-
protected
|
27
|
-
|
28
|
-
def initialize
|
29
|
-
data_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', '..', '..', 'data'))
|
30
|
-
@fido_formats = [(File.join(data_dir, 'lias_formats.xml'))]
|
31
|
-
# noinspection RubyStringKeysInHashInspection
|
32
|
-
@xml_validations = {'archive/ead' => File.join(data_dir, 'ead.xsd')}
|
33
|
-
end
|
34
|
-
|
35
|
-
def result_ok?(result, who_is_asking = nil)
|
36
|
-
result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
|
37
|
-
return false if result.empty?
|
38
|
-
return true unless result[:TYPE].empty?
|
39
|
-
return false if RETRY_MIMETYPES.include? result[:mimetype]
|
40
|
-
return false if FIDO_FAILURES.include? result[:mimetype] and who_is_asking == :DROID
|
41
|
-
!(result[:mimetype].empty? and result[:puid].empty?)
|
42
|
-
end
|
43
|
-
|
44
|
-
def get_puid(mimetype)
|
45
|
-
::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
|
46
|
-
end
|
47
|
-
|
48
25
|
public
|
49
26
|
|
50
|
-
def self.add_fido_format(f)
|
51
|
-
::Libis::Format::Fido.add_format f
|
52
|
-
end
|
53
|
-
|
54
27
|
def self.add_xml_validation(mimetype, xsd_file)
|
55
28
|
instance.xml_validations[mimetype] = xsd_file
|
56
29
|
end
|
@@ -59,134 +32,173 @@ module Libis
|
|
59
32
|
instance.xml_validations
|
60
33
|
end
|
61
34
|
|
62
|
-
def self.get(
|
63
|
-
instance.get
|
35
|
+
def self.get(file, options = {})
|
36
|
+
instance.get file, options
|
64
37
|
end
|
65
38
|
|
66
|
-
|
39
|
+
attr_reader :xml_validations
|
67
40
|
|
68
|
-
|
69
|
-
error 'File %s cannot be found.', file
|
70
|
-
return nil
|
71
|
-
end
|
72
|
-
if File.directory? file
|
73
|
-
error '%s is a directory.', file
|
74
|
-
return nil
|
75
|
-
end
|
41
|
+
def get(file, options = {})
|
76
42
|
|
77
|
-
options
|
43
|
+
options[:droid] = true unless options[:tool] and options[:tool] != :droid
|
44
|
+
options[:fido] = true unless options[:tool] and options[:tool] != :fido
|
45
|
+
options[:file] = true unless options[:tool] and options[:tool] != :file
|
78
46
|
|
79
|
-
result = {messages: []}
|
47
|
+
result = {messages: [], output: {}, formats: {}}
|
80
48
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
49
|
+
begin
|
50
|
+
get_droid_identification(file, options[:recursive], result) if options[:droid]
|
51
|
+
rescue => e
|
52
|
+
log_msg(result, :error, "Error running Droid: #{e.message} @ #{e.backtrace.first}")
|
53
|
+
end
|
85
54
|
|
86
|
-
|
87
|
-
|
55
|
+
begin
|
56
|
+
get_fido_identification(file, options[:recursive], result) if options[:fido]
|
57
|
+
rescue => e
|
58
|
+
log_msg(result, :error, "Error running Fido: #{e.message} @ #{e.backtrace.first}")
|
59
|
+
end
|
88
60
|
|
89
|
-
|
90
|
-
|
61
|
+
begin
|
62
|
+
get_file_identification(file, options[:recursive], result) if options[:file]
|
63
|
+
rescue => e
|
64
|
+
log_msg(result, :error, "Error running File: #{e.message} @ #{e.backtrace.first}")
|
65
|
+
end
|
91
66
|
|
92
|
-
#
|
93
|
-
result = get_extension_identification(file, result)
|
67
|
+
# get_extension_identification(file, options[:recursive], result)
|
94
68
|
|
95
69
|
# determine XML type. Add custom types at runtime with
|
96
70
|
# Libis::Format::Identifier.add_xml_validation('my_type', '/path/to/my_type.xsd')
|
97
|
-
|
71
|
+
validate_against_xml_schema(result)
|
98
72
|
|
99
|
-
result
|
100
|
-
log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
|
101
|
-
log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
|
102
|
-
end
|
73
|
+
process_results(result)
|
103
74
|
|
104
|
-
|
105
|
-
|
75
|
+
# result[:mimetype] ?
|
76
|
+
# log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
|
77
|
+
# log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
|
106
78
|
|
107
|
-
|
79
|
+
result
|
108
80
|
|
109
|
-
|
81
|
+
end
|
110
82
|
|
111
|
-
|
112
|
-
result[:method] = 'fido'
|
83
|
+
protected
|
113
84
|
|
114
|
-
|
85
|
+
def initialize
|
86
|
+
@xml_validations = Libis::Format::Config[:xml_validations].to_h
|
115
87
|
end
|
116
88
|
|
117
|
-
def
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
result.clear
|
123
|
-
droid_output = droid_output.first
|
124
|
-
result[:mimetype] = droid_output[:mime_type].to_s.split(/[\s,]+/).find { |x| x =~ /.*\/.*/ }
|
125
|
-
result[:matchtype] = droid_output[:method]
|
126
|
-
result[:puid] = droid_output[:puid]
|
127
|
-
result[:format_name] = droid_output[:format_name]
|
128
|
-
result[:format_version] = droid_output[:format_version]
|
129
|
-
result[:method] = 'droid'
|
89
|
+
def get_file_identification(file, recursive, result)
|
90
|
+
output = ::Libis::Format::FileTool.run(file, recursive)
|
91
|
+
process_tool_output(output, result)
|
92
|
+
output
|
93
|
+
end
|
130
94
|
|
131
|
-
|
95
|
+
def get_fido_identification(file, recursive, result)
|
96
|
+
output = ::Libis::Format::Fido.run(file, recursive)
|
97
|
+
process_tool_output(output, result)
|
98
|
+
output
|
132
99
|
end
|
133
100
|
|
134
|
-
def
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
result[:method] = 'file'
|
145
|
-
rescue Exception
|
146
|
-
# ignored
|
147
|
-
end
|
148
|
-
result
|
101
|
+
def get_droid_identification(file, recursive, result)
|
102
|
+
output = ::Libis::Format::Droid.run(file, recursive)
|
103
|
+
process_tool_output(output, result)
|
104
|
+
output
|
105
|
+
end
|
106
|
+
|
107
|
+
def get_extension_identification(file, recursive, result)
|
108
|
+
output = ::Libis::Format::ExtensionIdentification.run(file, recursive)
|
109
|
+
process_tool_output(output, result)
|
110
|
+
output
|
149
111
|
end
|
150
112
|
|
151
|
-
def
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
result[:mimetype] = info[:MIME].first rescue nil
|
157
|
-
result[:puid] = info[:PUID].first rescue nil
|
113
|
+
def validate_against_xml_schema(result)
|
114
|
+
result[:output].each do |file, file_results|
|
115
|
+
file_results.each do |file_result|
|
116
|
+
xml_validate(file, file_result, result)
|
117
|
+
end
|
158
118
|
end
|
159
|
-
result[:method] = 'extension'
|
160
|
-
result
|
161
119
|
end
|
162
120
|
|
163
|
-
def
|
164
|
-
return
|
121
|
+
def xml_validate(file, file_result, result)
|
122
|
+
return unless file_result[:mimetype] =~ /^(text|application)\/xml$/
|
165
123
|
doc = ::Libis::Tools::XmlDocument.open file
|
166
124
|
xml_validations.each do |mime, xsd_file|
|
167
125
|
next unless xsd_file
|
168
126
|
begin
|
169
127
|
if doc.validates_against?(xsd_file)
|
170
128
|
log_msg result, :debug, "XML file validated against XML Schema: #{xsd_file}"
|
171
|
-
|
172
|
-
|
173
|
-
result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
|
129
|
+
info = {mimetype: mime, tool: file_result[:source], source: :xsd_validation, match_type: 'xsd_validation', format_version: '', }
|
130
|
+
file_result.merge! Libis::Format::TypeDatabase.enrich(info, PUID: :puid, MIME: :mimetype, NAME: :format_name)
|
174
131
|
end
|
175
|
-
rescue
|
176
|
-
# Do nothing - probably Nokogiri chrashed during validation.
|
177
|
-
#
|
178
|
-
# so we
|
132
|
+
rescue => e
|
133
|
+
# Do nothing - probably Nokogiri crashed during validation. Could have many causes
|
134
|
+
# (remote schema (firewall, network, link rot, ...), schema syntax error, corrupt XML,...)
|
135
|
+
# so we log and continue.
|
136
|
+
log_msg(result, :warn, "Error during XML validation: #{e.message}")
|
179
137
|
end
|
180
138
|
end
|
181
|
-
|
139
|
+
end
|
140
|
+
|
141
|
+
def process_results(result)
|
142
|
+
result[:output].map do |file, output|
|
143
|
+
file_result = result[:formats][file] = {}
|
144
|
+
if output.empty?
|
145
|
+
file_result = {
|
146
|
+
mimetype: 'application/octet-stream',
|
147
|
+
puid: 'fmt/unknown',
|
148
|
+
score: 0,
|
149
|
+
source: nil
|
150
|
+
}
|
151
|
+
else
|
152
|
+
format_matches = output.group_by {|x| [x[:mimetype], x[:puid]]}
|
153
|
+
format_matches.each do |match, group|
|
154
|
+
format_matches[match] = group.group_by {|x| x[:score]}.sort.reverse.to_h
|
155
|
+
end
|
156
|
+
case format_matches.count
|
157
|
+
when 0
|
158
|
+
# No, this really cannot happen. If there are no hits, there will be at least a format [nil,nil]
|
159
|
+
when 1
|
160
|
+
# only one match, that's easy. The first of the highest score will be used
|
161
|
+
file_result.merge!(get_best_result(output))
|
162
|
+
else
|
163
|
+
process_multiple_formats(file_result, format_matches, output)
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def process_multiple_formats(file_result, format_matches, output)
|
170
|
+
# multiple matches. Let's select the highest score matches
|
171
|
+
file_result.merge!(get_best_result(output))
|
172
|
+
file_result[:alternatives] = []
|
173
|
+
format_matches.keys.each do |mime, puid|
|
174
|
+
next if file_result[:mimetype] == mime && puid.nil?
|
175
|
+
selection = output.select {|x| x[:mimetype] == mime && x[:puid] == puid}
|
176
|
+
file_result[:alternatives] << get_best_result(selection)
|
177
|
+
end
|
178
|
+
file_result[:alternatives] = file_result[:alternatives].sort_by {|x| x[:score]}.reverse
|
179
|
+
file_result.delete(:alternatives) if file_result[:alternatives].size <= 1
|
182
180
|
end
|
183
181
|
|
184
182
|
private
|
185
183
|
|
184
|
+
def process_tool_output(output, result)
|
185
|
+
output.each do |file, file_output|
|
186
|
+
result[:output][file] ||= []
|
187
|
+
result[:output][file] += file_output
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
186
191
|
def log_msg(result, severity, text)
|
187
|
-
|
188
|
-
|
189
|
-
|
192
|
+
result[:messages] << [severity, text]
|
193
|
+
end
|
194
|
+
|
195
|
+
def get_mimetype(puid)
|
196
|
+
::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
|
197
|
+
end
|
198
|
+
|
199
|
+
def get_best_result(results)
|
200
|
+
score = results.map {|x| x[:score]}.max
|
201
|
+
results.select {|x| x[:score] == score}.reduce(:apply_defaults)
|
190
202
|
end
|
191
203
|
|
192
204
|
end
|
@@ -7,6 +7,8 @@ require 'backports/rails/hash'
|
|
7
7
|
require 'libis/tools/logger'
|
8
8
|
require 'libis/tools/extend/string'
|
9
9
|
|
10
|
+
require_relative 'config'
|
11
|
+
|
10
12
|
module Libis
|
11
13
|
module Format
|
12
14
|
|
@@ -19,6 +21,25 @@ module Libis
|
|
19
21
|
end
|
20
22
|
|
21
23
|
def self.enrich(info, map_keys = {})
|
24
|
+
return {} unless info.is_a? Hash
|
25
|
+
mapper = Hash.new {|hash,key| hash[key] = key}
|
26
|
+
mapper.merge! map_keys
|
27
|
+
unless (puid = info[mapper[:PUID]]).blank?
|
28
|
+
info[mapper[:TYPE]] ||= puid_infos(puid).first[:TYPE] rescue nil
|
29
|
+
end
|
30
|
+
unless (mime = info[mapper[:MIME]]).blank?
|
31
|
+
info[mapper[:TYPE]] ||= mime_infos(mime).first[:TYPE] rescue nil
|
32
|
+
end
|
33
|
+
unless (type_name = info[mapper[:TYPE]]).nil?
|
34
|
+
mapper.keys.each do |key|
|
35
|
+
info[mapper[key]] = get(type_name, key) || info[mapper[key]]
|
36
|
+
end
|
37
|
+
info[mapper[:GROUP]] = self.type_group(type_name)
|
38
|
+
end
|
39
|
+
info
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.normalize(info, map_keys = {})
|
22
43
|
return {} unless info.is_a? Hash
|
23
44
|
mapper = Hash.new {|hash,key| hash[key] = key}
|
24
45
|
mapper.merge! map_keys
|
@@ -29,14 +50,25 @@ module Libis
|
|
29
50
|
info[mapper[:TYPE]] ||= self.mime_infos(mime).first[:TYPE] rescue nil
|
30
51
|
end
|
31
52
|
unless (type_name = info[mapper[:TYPE]]).nil?
|
32
|
-
info[mapper[:MIME]] = self.type_mimetypes(type_name).first if
|
33
|
-
info[mapper[:PUID]] = self.type_puids(type_name).first if info[mapper[:PUID]].blank?
|
34
|
-
info[mapper[:EXTENSIONS]] = self.type_extentions(type_name)
|
53
|
+
info[mapper[:MIME]] = self.type_mimetypes(type_name).first if self.type_mimetypes(type_name).first
|
35
54
|
info[mapper[:GROUP]] = self.type_group(type_name)
|
36
55
|
end
|
37
56
|
info
|
38
57
|
end
|
39
58
|
|
59
|
+
def self.get(type_name, key)
|
60
|
+
case key
|
61
|
+
when :MIME
|
62
|
+
type_mimetypes(type_name).first
|
63
|
+
when :PUID
|
64
|
+
type_puids(type_name).first
|
65
|
+
when :EXTENSION
|
66
|
+
type_extentions(type_name).first
|
67
|
+
else
|
68
|
+
self.typeinfo(type_name)[key]
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
40
72
|
def self.type_group(t)
|
41
73
|
typeinfo(t)[:GROUP]
|
42
74
|
end
|
@@ -159,8 +191,7 @@ module Libis
|
|
159
191
|
|
160
192
|
def initialize
|
161
193
|
@types = Hash.new
|
162
|
-
|
163
|
-
type_database = File.join(data_dir, 'types.yml')
|
194
|
+
type_database = Libis::Format::Config[:type_database]
|
164
195
|
load_types(type_database)
|
165
196
|
end
|
166
197
|
|
data/lib/libis/format/version.rb
CHANGED
data/lib/libis/format.rb
CHANGED
@@ -5,9 +5,12 @@ module Libis
|
|
5
5
|
autoload :Config, 'libis/format/config'
|
6
6
|
autoload :TypeDatabase, 'libis/format/type_database'
|
7
7
|
autoload :Identifier, 'libis/format/identifier'
|
8
|
+
autoload :Identifier, 'libis/format/identifier'
|
8
9
|
|
10
|
+
autoload :FileTool, 'libis/format/file_tool'
|
9
11
|
autoload :Fido, 'libis/format/fido'
|
10
12
|
autoload :Droid, 'libis/format/droid'
|
13
|
+
autoload :ExtensionIdentification, 'libis/format/extension_identification'
|
11
14
|
|
12
15
|
autoload :OfficeToPdf, 'libis/format/office_to_pdf'
|
13
16
|
autoload :PdfCopy, 'libis/format/pdf_copy'
|
data/libis-format.gemspec
CHANGED
@@ -26,8 +26,9 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency 'rake', '~> 10.3'
|
27
27
|
spec.add_development_dependency 'rspec', '~> 3.1'
|
28
28
|
spec.add_development_dependency 'simplecov', '~> 0.9'
|
29
|
+
spec.add_development_dependency 'awesome_print'
|
29
30
|
|
30
|
-
spec.add_runtime_dependency 'libis-tools', '~> 0.9'
|
31
|
+
spec.add_runtime_dependency 'libis-tools', '~> 0.9.52'
|
31
32
|
spec.add_runtime_dependency 'os', '= 0.9.6'
|
32
33
|
spec.add_runtime_dependency 'mini_magick', '~> 4.3'
|
33
34
|
spec.add_runtime_dependency 'deep_dive', '~> 0.3'
|
data/spec/converter_spec.rb
CHANGED
@@ -23,6 +23,8 @@ describe 'Converters' do
|
|
23
23
|
|
24
24
|
before(:all) {
|
25
25
|
Libis::Tools::Config.logger.level = :WARN
|
26
|
+
::Libis::Format::Config[:droid_path] = '/opt/droid/droid.sh'
|
27
|
+
::Libis::Format::Config[:fido_path] = '/usr/local/bin/fido'
|
26
28
|
}
|
27
29
|
|
28
30
|
context 'Repository' do
|
@@ -114,7 +116,7 @@ describe 'Converters' do
|
|
114
116
|
expect(result).to eq tgt_file
|
115
117
|
compare = MiniMagick::Tool::Compare.new
|
116
118
|
compare << ref_file << tgt_file
|
117
|
-
compare.metric << '
|
119
|
+
compare.metric << 'MAE'
|
118
120
|
compare.fuzz << '1%'
|
119
121
|
compare << diff_file
|
120
122
|
compare.call {|_, _, status| expect(status).to be 0}
|
@@ -132,7 +134,7 @@ describe 'Converters' do
|
|
132
134
|
expect(result).to eq tgt_file
|
133
135
|
compare = MiniMagick::Tool::Compare.new
|
134
136
|
compare << ref_file << tgt_file
|
135
|
-
compare.metric << '
|
137
|
+
compare.metric << 'MAE'
|
136
138
|
compare << diff_file
|
137
139
|
compare.call {|_, _, status| expect(status).to be 0}
|
138
140
|
FileUtils.rm tgt_file, force: true
|
@@ -167,7 +169,7 @@ describe 'Converters' do
|
|
167
169
|
compare = MiniMagick::Tool::Compare.new
|
168
170
|
compare << ref_file << tgt_file
|
169
171
|
compare.metric << 'AE'
|
170
|
-
compare.fuzz << '
|
172
|
+
compare.fuzz << '100%'
|
171
173
|
compare << diff_file
|
172
174
|
compare.call do |_stdin, _stdout, status|
|
173
175
|
expect(status).to be 0
|
@@ -189,7 +191,7 @@ describe 'Converters' do
|
|
189
191
|
expect(File.exist?(tgt_file)).to be_truthy
|
190
192
|
compare = MiniMagick::Tool::Compare.new
|
191
193
|
compare << ref_file << tgt_file
|
192
|
-
compare.metric << '
|
194
|
+
compare.metric << 'MAE'
|
193
195
|
compare.fuzz << '10%'
|
194
196
|
compare << diff_file
|
195
197
|
compare.call {|_,_,status| expect(status).to be 0}
|