libis-format 0.9.32 → 0.9.33

Sign up to get free protection for your applications and to get access to all the features.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/data/types.yml +30 -16
  3. data/lib/libis/format/config.rb +7 -18
  4. data/lib/libis/format/converter/image_converter.rb +6 -0
  5. data/lib/libis/format/droid.rb +82 -25
  6. data/lib/libis/format/extension_identification.rb +55 -0
  7. data/lib/libis/format/fido.rb +57 -72
  8. data/lib/libis/format/file_tool.rb +76 -0
  9. data/lib/libis/format/identification_tool.rb +174 -0
  10. data/lib/libis/format/identifier.rb +129 -117
  11. data/lib/libis/format/type_database.rb +36 -5
  12. data/lib/libis/format/version.rb +1 -1
  13. data/lib/libis/format.rb +3 -0
  14. data/libis-format.gemspec +2 -1
  15. data/spec/converter_spec.rb +6 -4
  16. data/spec/identifier_spec.rb +125 -34
  17. metadata +21 -126
  18. data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
  19. data/tools/droid/container-signature-20170330.xml +0 -3584
  20. data/tools/droid/droid-command-line-6.3.jar +0 -0
  21. data/tools/droid/droid.bat +0 -152
  22. data/tools/droid/droid.sh +0 -152
  23. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  24. data/tools/droid/lib/activation-1.1.jar +0 -0
  25. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  26. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  27. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  28. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  29. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  30. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  31. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  32. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  33. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  34. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  35. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  36. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  37. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  38. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  39. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  40. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  41. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  42. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  43. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  44. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  45. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  46. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  47. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  48. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  49. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  50. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  51. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  52. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  53. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  54. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  55. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  56. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  57. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  58. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  59. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  60. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  61. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  62. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  63. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  64. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  65. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  66. data/tools/droid/lib/droid-help-6.3.jar +0 -0
  67. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  68. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  69. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  70. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  71. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  72. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  73. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  74. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  75. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  76. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  77. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  78. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  79. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  80. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  81. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  82. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  83. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  84. data/tools/droid/lib/jta-1.1.jar +0 -0
  85. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  86. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  87. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  88. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  89. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  90. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  91. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  92. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  93. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  94. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  95. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  96. data/tools/droid/lib/poi-3.13.jar +0 -0
  97. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  98. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  99. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  100. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  101. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  102. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  103. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  104. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  105. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  106. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  107. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  108. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  109. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  110. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  111. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  112. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  113. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  114. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  115. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  116. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  117. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  118. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  119. data/tools/droid/lib/xz-1.0.jar +0 -0
  120. data/tools/fido/__init__.py +0 -50
  121. data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
  122. data/tools/fido/conf/container-signature-20170330.xml +0 -3584
  123. data/tools/fido/conf/dc.xsd +0 -119
  124. data/tools/fido/conf/dcmitype.xsd +0 -53
  125. data/tools/fido/conf/dcterms.xsd +0 -383
  126. data/tools/fido/conf/fido-formats.xsd +0 -173
  127. data/tools/fido/conf/format_extension_template.xml +0 -105
  128. data/tools/fido/conf/format_extensions.xml +0 -484
  129. data/tools/fido/conf/formats-v90.xml +0 -48877
  130. data/tools/fido/conf/pronom-xml-v90.zip +0 -0
  131. data/tools/fido/conf/versions.xml +0 -8
  132. data/tools/fido/fido.bat +0 -4
  133. data/tools/fido/fido.py +0 -884
  134. data/tools/fido/fido.sh +0 -5
  135. data/tools/fido/package.py +0 -96
  136. data/tools/fido/prepare.py +0 -645
  137. data/tools/fido/pronomutils.py +0 -200
  138. data/tools/fido/toxml.py +0 -60
  139. data/tools/fido/update_signatures.py +0 -183
@@ -0,0 +1,174 @@
1
+ require 'csv'
2
+ require 'tmpdir'
3
+
4
+ require 'singleton'
5
+ require 'libis/tools/extend/string'
6
+ require 'libis/tools/logger'
7
+ require 'libis/tools/command'
8
+
9
+ require 'libis/format/config'
10
+ require 'libis/format/type_database'
11
+
12
+ module Libis
13
+ module Format
14
+
15
+ class IdentificationTool
16
+ include Singleton
17
+ include ::Libis::Tools::Logger
18
+
19
+ def self.bad_mimetype(mimetype)
20
+ self.instance.bad_mimetype(mimetype)
21
+ end
22
+
23
+ def self.run(file, recursive = false)
24
+ if file.is_a?(Array)
25
+ return run_list file
26
+ elsif file.is_a?(String) && File.exists?(file) && File.readable?(file)
27
+ if File.directory?(file)
28
+ return run_dir(file, recursive)
29
+ elsif File.file?(file)
30
+ return self.instance.run(file)
31
+ end
32
+ end
33
+ raise ArgumentError,
34
+ 'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
35
+ end
36
+
37
+ def self.run_dir(file, recursive = true)
38
+ self.instance.run_dir file, recursive
39
+ end
40
+
41
+ def self.run_list(filelist)
42
+ self.instance.run_list filelist
43
+ end
44
+
45
+ protected
46
+
47
+ def create_list_file(filelist)
48
+ list_file = Dir::Tmpname.make_tmpname(%w'file .list', nil)
49
+ File.open(list_file, 'w') do |f|
50
+ filelist.each do |fname|
51
+ f.write "#{fname}\n"
52
+ end
53
+ end
54
+ yield(list_file)
55
+ ensure
56
+ File.delete(list_file)
57
+ end
58
+
59
+ def find_files(dir, recurse = true)
60
+ args = []
61
+ args << '-L'
62
+ args << dir.escape_for_string
63
+ args << '-maxdepth' << '1' unless recurse
64
+ args << '-type' << 'f'
65
+ args << '-print'
66
+ output = ::Libis::Tools::Command.run('find', *args)
67
+ warn "Find command errors: #{output[:err].join("\n")}" unless output[:err].empty?
68
+ output[:out]
69
+ end
70
+
71
+ # Reformat output to make it easier to post-process and decide on the preferred format
72
+ #
73
+ # input format:
74
+ # [
75
+ # { filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }
76
+ # ]
77
+ #
78
+ # output format:
79
+ # { <filename> => [<result>, ...], ... }
80
+ #
81
+ # <result> is the enhanced Hash output of the identification tool:
82
+ # { mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
83
+ #
84
+ def process_output(output)
85
+ output.reduce({}) do |results, x|
86
+ filepath = x.delete(:filepath)
87
+ results[filepath] ||= []
88
+ results[filepath.freeze] << annotate(x)
89
+ results
90
+ end
91
+ end
92
+
93
+ # Enhance the output with mimetype and score
94
+ def annotate(result)
95
+ # Enhance result with mimetype if needed
96
+ if bad_mimetypes.include?(result[:mimetype]) && !bad_puids.include?(result[:puid])
97
+ result[:mimetype] = get_mimetype(result[:puid])
98
+ end
99
+
100
+ # Normalize the mimetype
101
+ Libis::Format::TypeDatabase.normalize(result, PUID: :puid, MIME: :mimetype)
102
+
103
+ # Default score is 5
104
+ result[:score] = 5
105
+
106
+ # Weak detection score is 1
107
+ result[:score] = 1 if bad_mimetypes.include? result[:mimetype]
108
+
109
+ # freeze all strings
110
+ result.each {|_, v| v.freeze if v.is_a?(String)}
111
+
112
+ # Adapt score based on matchtype
113
+ result[:matchtype] = result[:matchtype].to_s.downcase
114
+ case result[:matchtype]
115
+
116
+ # Signature match increases score by 2
117
+ when 'signature'
118
+ result[:score] += 2
119
+ # typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
120
+ # ext = File.extname(result[:filename])
121
+ # result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
122
+
123
+ # Container match increases score by 4
124
+ when 'container'
125
+ result[:score] += 4
126
+ # typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
127
+ # ext = File.extname(result[:filename])
128
+ # result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
129
+
130
+ # Extension match is the weakest identification; score is lowered by 2 points
131
+ when 'extension'
132
+ result[:score] -= 2
133
+
134
+ # Magic code (file tool) is to be trusted even less
135
+ when 'magic'
136
+ result[:score] -= 3
137
+
138
+ # Or no change otherwise
139
+ else
140
+ # do nothing
141
+ end
142
+
143
+ # Detecting a zip file should decrease the score as it may hide one of the many zip-based formats (e.g. epub,
144
+ # Office OpenXML, OpenDocument, jar, maff, svx)
145
+ if result[:mimetype] == 'application/zip'
146
+ result[:score] -= 2
147
+ end
148
+
149
+ # Return result enhanced with mimetype and score fields
150
+ result
151
+ end
152
+
153
+ def get_mimetype(puid)
154
+ ::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
155
+ end
156
+
157
+ def get_puid(mimetype)
158
+ ::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
159
+ end
160
+
161
+ attr_accessor :bad_mimetypes, :bad_puids
162
+
163
+ def initialize
164
+ @bad_mimetypes = [nil, '', 'None', 'application/octet-stream']
165
+ @bad_puids = [nil, 'fmt/unknown']
166
+ end
167
+
168
+ def bad_mimetype(mimetype)
169
+ @bad_mimetypes << mimetype
170
+ end
171
+ end
172
+
173
+ end
174
+ end
@@ -3,13 +3,17 @@
3
3
  require 'singleton'
4
4
 
5
5
  require 'libis-tools'
6
+ require 'libis/tools/extend/hash'
6
7
  require 'libis/tools/extend/string'
7
8
  require 'libis/tools/extend/empty'
8
9
 
9
10
  require 'libis/format/type_database'
10
11
 
12
+ require_relative 'config'
11
13
  require_relative 'fido'
12
14
  require_relative 'droid'
15
+ require_relative 'file_tool'
16
+ require_relative 'extension_identification'
13
17
 
14
18
  module Libis
15
19
  module Format
@@ -18,39 +22,8 @@ module Libis
18
22
  include ::Libis::Tools::Logger
19
23
  include Singleton
20
24
 
21
- RETRY_MIMETYPES = %w(application/zip) + ::Libis::Format::Fido::BAD_MIMETYPES
22
- FIDO_FAILURES = %w(application/vnd.oasis.opendocument.text application/vnd.oasis.opendocument.spreadsheet)
23
-
24
- attr_reader :xml_validations
25
-
26
- protected
27
-
28
- def initialize
29
- data_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', '..', '..', 'data'))
30
- @fido_formats = [(File.join(data_dir, 'lias_formats.xml'))]
31
- # noinspection RubyStringKeysInHashInspection
32
- @xml_validations = {'archive/ead' => File.join(data_dir, 'ead.xsd')}
33
- end
34
-
35
- def result_ok?(result, who_is_asking = nil)
36
- result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
37
- return false if result.empty?
38
- return true unless result[:TYPE].empty?
39
- return false if RETRY_MIMETYPES.include? result[:mimetype]
40
- return false if FIDO_FAILURES.include? result[:mimetype] and who_is_asking == :DROID
41
- !(result[:mimetype].empty? and result[:puid].empty?)
42
- end
43
-
44
- def get_puid(mimetype)
45
- ::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
46
- end
47
-
48
25
  public
49
26
 
50
- def self.add_fido_format(f)
51
- ::Libis::Format::Fido.add_format f
52
- end
53
-
54
27
  def self.add_xml_validation(mimetype, xsd_file)
55
28
  instance.xml_validations[mimetype] = xsd_file
56
29
  end
@@ -59,134 +32,173 @@ module Libis
59
32
  instance.xml_validations
60
33
  end
61
34
 
62
- def self.get(file_path, options = nil)
63
- instance.get file_path, options
35
+ def self.get(file, options = {})
36
+ instance.get file, options
64
37
  end
65
38
 
66
- def get(file, options = nil)
39
+ attr_reader :xml_validations
67
40
 
68
- unless File.exists? file
69
- error 'File %s cannot be found.', file
70
- return nil
71
- end
72
- if File.directory? file
73
- error '%s is a directory.', file
74
- return nil
75
- end
41
+ def get(file, options = {})
76
42
 
77
- options ||= {}
43
+ options[:droid] = true unless options[:tool] and options[:tool] != :droid
44
+ options[:fido] = true unless options[:tool] and options[:tool] != :fido
45
+ options[:file] = true unless options[:tool] and options[:tool] != :file
78
46
 
79
- result = {messages: []}
47
+ result = {messages: [], output: {}, formats: {}}
80
48
 
81
- # use FIDO
82
- # Note: FIDO does not always do a good job, mainly due to lacking container inspection.
83
- # FIDO misses should be registered in
84
- result = get_fido_identification(file, result, options[:formats]) unless options[:droid]
49
+ begin
50
+ get_droid_identification(file, options[:recursive], result) if options[:droid]
51
+ rescue => e
52
+ log_msg(result, :error, "Error running Droid: #{e.message} @ #{e.backtrace.first}")
53
+ end
85
54
 
86
- # use DROID
87
- result = get_droid_identification file, result
55
+ begin
56
+ get_fido_identification(file, options[:recursive], result) if options[:fido]
57
+ rescue => e
58
+ log_msg(result, :error, "Error running Fido: #{e.message} @ #{e.backtrace.first}")
59
+ end
88
60
 
89
- # use FILE
90
- result = get_file_identification(file, result)
61
+ begin
62
+ get_file_identification(file, options[:recursive], result) if options[:file]
63
+ rescue => e
64
+ log_msg(result, :error, "Error running File: #{e.message} @ #{e.backtrace.first}")
65
+ end
91
66
 
92
- # Try file extension
93
- result = get_extension_identification(file, result)
67
+ # get_extension_identification(file, options[:recursive], result)
94
68
 
95
69
  # determine XML type. Add custom types at runtime with
96
70
  # Libis::Tools::Format::Identifier.add_xml_validation('my_type', '/path/to/my_type.xsd')
97
- result = validate_against_xml_schema(file, result)
71
+ validate_against_xml_schema(result)
98
72
 
99
- result[:mimetype] ?
100
- log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
101
- log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
102
- end
73
+ process_results(result)
103
74
 
104
- def get_fido_identification(file, result = {}, xtra_formats = nil)
105
- return result if result_ok? result
75
+ # result[:mimetype] ?
76
+ # log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
77
+ # log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
106
78
 
107
- fido_result = ::Libis::Format::Fido.run(file, xtra_formats)
79
+ result
108
80
 
109
- return result unless fido_result.is_a? Hash
81
+ end
110
82
 
111
- result.merge! fido_result
112
- result[:method] = 'fido'
83
+ protected
113
84
 
114
- log_msg(result, :debug, "Fido MIME-type: #{result[:mimetype]} (PRONOM UID: #{result[:puid]})")
85
+ def initialize
86
+ @xml_validations = Libis::Format::Config[:xml_validations].to_h
115
87
  end
116
88
 
117
- def get_droid_identification(file, result = {})
118
- return result if result_ok? result, :DROID
119
- droid_output = ::Libis::Format::Droid.run file
120
- result[:messages] << [:debug, "DROID: #{droid_output}"]
121
- warn 'Droid found multiple matches; using first match only' if droid_output.size > 1
122
- result.clear
123
- droid_output = droid_output.first
124
- result[:mimetype] = droid_output[:mime_type].to_s.split(/[\s,]+/).find { |x| x =~ /.*\/.*/ }
125
- result[:matchtype] = droid_output[:method]
126
- result[:puid] = droid_output[:puid]
127
- result[:format_name] = droid_output[:format_name]
128
- result[:format_version] = droid_output[:format_version]
129
- result[:method] = 'droid'
89
+ def get_file_identification(file, recursive, result)
90
+ output = ::Libis::Format::FileTool.run(file, recursive)
91
+ process_tool_output(output, result)
92
+ output
93
+ end
130
94
 
131
- log_msg(result, :debug, "Droid MIME-type: #{result[:mimetype]} (PRONOM UID: #{result[:puid]})")
95
+ def get_fido_identification(file, recursive, result)
96
+ output = ::Libis::Format::Fido.run(file, recursive)
97
+ process_tool_output(output, result)
98
+ output
132
99
  end
133
100
 
134
- def get_file_identification(file, result = nil)
135
- return result if result_ok? result
136
- begin
137
- output = ::Libis::Tools::Command.run('file', '-b', '--mime-type', "\"#{file.escape_for_string}\"")[:err]
138
- mimetype = output.strip.split
139
- if mimetype
140
- log_msg(result, :debug, "File result: '#{mimetype}'")
141
- result[:mimetype] = mimetype
142
- result[:puid] = get_puid(mimetype)
143
- end
144
- result[:method] = 'file'
145
- rescue Exception
146
- # ignored
147
- end
148
- result
101
+ def get_droid_identification(file, recursive, result)
102
+ output = ::Libis::Format::Droid.run(file, recursive)
103
+ process_tool_output(output, result)
104
+ output
105
+ end
106
+
107
+ def get_extension_identification(file, recursive, result)
108
+ output = ::Libis::Format::ExtensionIdentification.run(file, recursive)
109
+ process_tool_output(output, result)
110
+ output
149
111
  end
150
112
 
151
- def get_extension_identification(file, result = nil)
152
- return result if result_ok? result
153
- info = ::Libis::Format::TypeDatabase.ext_infos(File.extname(file)).first
154
- log_msg result, :debug, "File extension info: #{info}"
155
- if info
156
- result[:mimetype] = info[:MIME].first rescue nil
157
- result[:puid] = info[:PUID].first rescue nil
113
+ def validate_against_xml_schema(result)
114
+ result[:output].each do |file, file_results|
115
+ file_results.each do |file_result|
116
+ xml_validate(file, file_result, result)
117
+ end
158
118
  end
159
- result[:method] = 'extension'
160
- result
161
119
  end
162
120
 
163
- def validate_against_xml_schema(file, result)
164
- return result unless result[:mimetype] =~ /^(text|application)\/xml$/
121
+ def xml_validate(file, file_result, result)
122
+ return unless file_result[:mimetype] =~ /^(text|application)\/xml$/
165
123
  doc = ::Libis::Tools::XmlDocument.open file
166
124
  xml_validations.each do |mime, xsd_file|
167
125
  next unless xsd_file
168
126
  begin
169
127
  if doc.validates_against?(xsd_file)
170
128
  log_msg result, :debug, "XML file validated against XML Schema: #{xsd_file}"
171
- result[:mimetype] = mime
172
- result[:puid] = nil
173
- result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
129
+ info = {mimetype: mime, tool: file_result[:source], source: :xsd_validation, match_type: 'xsd_validation', format_version: '', }
130
+ file_result.merge! Libis::Format::TypeDatabase.enrich(info, PUID: :puid, MIME: :mimetype, NAME: :format_name)
174
131
  end
175
- rescue
176
- # Do nothing - probably Nokogiri chrashed during validation.
177
- # Could have many causes (remote schema: firewall, network, link rot, ...; schema syntax error; ...)
178
- # so we just ignore and continue.
132
+ rescue => e
133
+ # Non-fatal - probably Nokogiri crashed during validation. Could have many causes
134
+ # (remote schema (firewall, network, link rot, ...), schema syntax error, corrupt XML,...)
135
+ # so we log and continue.
136
+ log_msg(result, :warn, "Error during XML validation: #{e.message}")
179
137
  end
180
138
  end
181
- result
139
+ end
140
+
141
+ def process_results(result)
142
+ result[:output].map do |file, output|
143
+ file_result = result[:formats][file] = {}
144
+ if output.empty?
145
+ file_result = {
146
+ mimetype: 'application/octet-stream',
147
+ puid: 'fmt/unknown',
148
+ score: 0,
149
+ source: nil
150
+ }
151
+ else
152
+ format_matches = output.group_by {|x| [x[:mimetype], x[:puid]]}
153
+ format_matches.each do |match, group|
154
+ format_matches[match] = group.group_by {|x| x[:score]}.sort.reverse.to_h
155
+ end
156
+ case format_matches.count
157
+ when 0
158
+ # No, this really cannot happen. If there are no hits, there will be at least a format [nil, nil]
159
+ when 1
160
+ # only one match, that's easy. The first of the highest score will be used
161
+ file_result.merge!(get_best_result(output))
162
+ else
163
+ process_multiple_formats(file_result, format_matches, output)
164
+ end
165
+ end
166
+ end
167
+ end
168
+
169
+ def process_multiple_formats(file_result, format_matches, output)
170
+ # multiple matches. Let's select the highest score matches
171
+ file_result.merge!(get_best_result(output))
172
+ file_result[:alternatives] = []
173
+ format_matches.keys.each do |mime, puid|
174
+ next if file_result[:mimetype] == mime && puid.nil?
175
+ selection = output.select {|x| x[:mimetype] == mime && x[:puid] == puid}
176
+ file_result[:alternatives] << get_best_result(selection)
177
+ end
178
+ file_result[:alternatives] = file_result[:alternatives].sort_by {|x| x[:score]}.reverse
179
+ file_result.delete(:alternatives) if file_result[:alternatives].size <= 1
182
180
  end
183
181
 
184
182
  private
185
183
 
184
+ def process_tool_output(output, result)
185
+ output.each do |file, file_output|
186
+ result[:output][file] ||= []
187
+ result[:output][file] += file_output
188
+ end
189
+ end
190
+
186
191
  def log_msg(result, severity, text)
187
- return {} unless result.is_a?(Hash)
188
- (result[:messages] ||= []) << [severity, text]
189
- result
192
+ result[:messages] << [severity, text]
193
+ end
194
+
195
+ def get_mimetype(puid)
196
+ ::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
197
+ end
198
+
199
+ def get_best_result(results)
200
+ score = results.map {|x| x[:score]}.max
201
+ results.select {|x| x[:score] == score}.reduce(:apply_defaults)
190
202
  end
191
203
 
192
204
  end
@@ -7,6 +7,8 @@ require 'backports/rails/hash'
7
7
  require 'libis/tools/logger'
8
8
  require 'libis/tools/extend/string'
9
9
 
10
+ require_relative 'config'
11
+
10
12
  module Libis
11
13
  module Format
12
14
 
@@ -19,6 +21,25 @@ module Libis
19
21
  end
20
22
 
21
23
  def self.enrich(info, map_keys = {})
24
+ return {} unless info.is_a? Hash
25
+ mapper = Hash.new {|hash,key| hash[key] = key}
26
+ mapper.merge! map_keys
27
+ unless (puid = info[mapper[:PUID]]).blank?
28
+ info[mapper[:TYPE]] ||= puid_infos(puid).first[:TYPE] rescue nil
29
+ end
30
+ unless (mime = info[mapper[:MIME]]).blank?
31
+ info[mapper[:TYPE]] ||= mime_infos(mime).first[:TYPE] rescue nil
32
+ end
33
+ unless (type_name = info[mapper[:TYPE]]).nil?
34
+ mapper.keys.each do |key|
35
+ info[mapper[key]] = get(type_name, key) || info[mapper[key]]
36
+ end
37
+ info[mapper[:GROUP]] = self.type_group(type_name)
38
+ end
39
+ info
40
+ end
41
+
42
+ def self.normalize(info, map_keys = {})
22
43
  return {} unless info.is_a? Hash
23
44
  mapper = Hash.new {|hash,key| hash[key] = key}
24
45
  mapper.merge! map_keys
@@ -29,14 +50,25 @@ module Libis
29
50
  info[mapper[:TYPE]] ||= self.mime_infos(mime).first[:TYPE] rescue nil
30
51
  end
31
52
  unless (type_name = info[mapper[:TYPE]]).nil?
32
- info[mapper[:MIME]] = self.type_mimetypes(type_name).first if info[mapper[:MIME]].blank?
33
- info[mapper[:PUID]] = self.type_puids(type_name).first if info[mapper[:PUID]].blank?
34
- info[mapper[:EXTENSIONS]] = self.type_extentions(type_name)
53
+ info[mapper[:MIME]] = self.type_mimetypes(type_name).first if self.type_mimetypes(type_name).first
35
54
  info[mapper[:GROUP]] = self.type_group(type_name)
36
55
  end
37
56
  info
38
57
  end
39
58
 
59
+ def self.get(type_name, key)
60
+ case key
61
+ when :MIME
62
+ type_mimetypes(type_name).first
63
+ when :PUID
64
+ type_puids(type_name).first
65
+ when :EXTENSION
66
+ type_extentions(type_name).first
67
+ else
68
+ self.typeinfo(type_name)[key]
69
+ end
70
+ end
71
+
40
72
  def self.type_group(t)
41
73
  typeinfo(t)[:GROUP]
42
74
  end
@@ -159,8 +191,7 @@ module Libis
159
191
 
160
192
  def initialize
161
193
  @types = Hash.new
162
- data_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', '..', '..', 'data'))
163
- type_database = File.join(data_dir, 'types.yml')
194
+ type_database = Libis::Format::Config[:type_database]
164
195
  load_types(type_database)
165
196
  end
166
197
 
@@ -1,5 +1,5 @@
1
1
  module Libis
2
2
  module Format
3
- VERSION = '0.9.32'
3
+ VERSION = '0.9.33'
4
4
  end
5
5
  end
data/lib/libis/format.rb CHANGED
@@ -5,9 +5,12 @@ module Libis
5
5
  autoload :Config, 'libis/format/config'
6
6
  autoload :TypeDatabase, 'libis/format/type_database'
7
7
  autoload :Identifier, 'libis/format/identifier'
8
+ autoload :Identifier, 'libis/format/identifier'
8
9
 
10
+ autoload :FileTool, 'libis/format/file_tool'
9
11
  autoload :Fido, 'libis/format/fido'
10
12
  autoload :Droid, 'libis/format/droid'
13
+ autoload :ExtensionIdentification, 'libis/format/extension_identification'
11
14
 
12
15
  autoload :OfficeToPdf, 'libis/format/office_to_pdf'
13
16
  autoload :PdfCopy, 'libis/format/pdf_copy'
data/libis-format.gemspec CHANGED
@@ -26,8 +26,9 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency 'rake', '~> 10.3'
27
27
  spec.add_development_dependency 'rspec', '~> 3.1'
28
28
  spec.add_development_dependency 'simplecov', '~> 0.9'
29
+ spec.add_development_dependency 'awesome_print'
29
30
 
30
- spec.add_runtime_dependency 'libis-tools', '~> 0.9'
31
+ spec.add_runtime_dependency 'libis-tools', '~> 0.9.52'
31
32
  spec.add_runtime_dependency 'os', '= 0.9.6'
32
33
  spec.add_runtime_dependency 'mini_magick', '~> 4.3'
33
34
  spec.add_runtime_dependency 'deep_dive', '~> 0.3'
@@ -23,6 +23,8 @@ describe 'Converters' do
23
23
 
24
24
  before(:all) {
25
25
  Libis::Tools::Config.logger.level = :WARN
26
+ ::Libis::Format::Config[:droid_path] = '/opt/droid/droid.sh'
27
+ ::Libis::Format::Config[:fido_path] = '/usr/local/bin/fido'
26
28
  }
27
29
 
28
30
  context 'Repository' do
@@ -114,7 +116,7 @@ describe 'Converters' do
114
116
  expect(result).to eq tgt_file
115
117
  compare = MiniMagick::Tool::Compare.new
116
118
  compare << ref_file << tgt_file
117
- compare.metric << 'AE'
119
+ compare.metric << 'MAE'
118
120
  compare.fuzz << '1%'
119
121
  compare << diff_file
120
122
  compare.call {|_, _, status| expect(status).to be 0}
@@ -132,7 +134,7 @@ describe 'Converters' do
132
134
  expect(result).to eq tgt_file
133
135
  compare = MiniMagick::Tool::Compare.new
134
136
  compare << ref_file << tgt_file
135
- compare.metric << 'AE'
137
+ compare.metric << 'MAE'
136
138
  compare << diff_file
137
139
  compare.call {|_, _, status| expect(status).to be 0}
138
140
  FileUtils.rm tgt_file, force: true
@@ -167,7 +169,7 @@ describe 'Converters' do
167
169
  compare = MiniMagick::Tool::Compare.new
168
170
  compare << ref_file << tgt_file
169
171
  compare.metric << 'AE'
170
- compare.fuzz << '1%'
172
+ compare.fuzz << '100%'
171
173
  compare << diff_file
172
174
  compare.call do |_stdin, _stdout, status|
173
175
  expect(status).to be 0
@@ -189,7 +191,7 @@ describe 'Converters' do
189
191
  expect(File.exist?(tgt_file)).to be_truthy
190
192
  compare = MiniMagick::Tool::Compare.new
191
193
  compare << ref_file << tgt_file
192
- compare.metric << 'AE'
194
+ compare.metric << 'MAE'
193
195
  compare.fuzz << '10%'
194
196
  compare << diff_file
195
197
  compare.call {|_,_,status| expect(status).to be 0}