libis-format 0.9.32 → 0.9.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/data/types.yml +30 -16
  3. data/lib/libis/format/config.rb +7 -18
  4. data/lib/libis/format/converter/image_converter.rb +6 -0
  5. data/lib/libis/format/droid.rb +82 -25
  6. data/lib/libis/format/extension_identification.rb +55 -0
  7. data/lib/libis/format/fido.rb +57 -72
  8. data/lib/libis/format/file_tool.rb +76 -0
  9. data/lib/libis/format/identification_tool.rb +174 -0
  10. data/lib/libis/format/identifier.rb +129 -117
  11. data/lib/libis/format/type_database.rb +36 -5
  12. data/lib/libis/format/version.rb +1 -1
  13. data/lib/libis/format.rb +3 -0
  14. data/libis-format.gemspec +2 -1
  15. data/spec/converter_spec.rb +6 -4
  16. data/spec/identifier_spec.rb +125 -34
  17. metadata +21 -126
  18. data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
  19. data/tools/droid/container-signature-20170330.xml +0 -3584
  20. data/tools/droid/droid-command-line-6.3.jar +0 -0
  21. data/tools/droid/droid.bat +0 -152
  22. data/tools/droid/droid.sh +0 -152
  23. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  24. data/tools/droid/lib/activation-1.1.jar +0 -0
  25. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  26. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  27. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  28. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  29. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  30. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  31. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  32. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  33. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  34. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  35. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  36. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  37. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  38. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  39. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  40. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  41. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  42. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  43. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  44. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  45. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  46. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  47. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  48. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  49. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  50. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  51. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  52. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  53. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  54. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  55. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  56. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  57. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  58. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  59. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  60. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  61. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  62. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  63. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  64. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  65. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  66. data/tools/droid/lib/droid-help-6.3.jar +0 -0
  67. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  68. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  69. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  70. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  71. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  72. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  73. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  74. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  75. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  76. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  77. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  78. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  79. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  80. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  81. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  82. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  83. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  84. data/tools/droid/lib/jta-1.1.jar +0 -0
  85. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  86. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  87. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  88. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  89. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  90. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  91. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  92. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  93. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  94. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  95. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  96. data/tools/droid/lib/poi-3.13.jar +0 -0
  97. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  98. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  99. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  100. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  101. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  102. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  103. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  104. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  105. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  106. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  107. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  108. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  109. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  110. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  111. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  112. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  113. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  114. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  115. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  116. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  117. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  118. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  119. data/tools/droid/lib/xz-1.0.jar +0 -0
  120. data/tools/fido/__init__.py +0 -50
  121. data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
  122. data/tools/fido/conf/container-signature-20170330.xml +0 -3584
  123. data/tools/fido/conf/dc.xsd +0 -119
  124. data/tools/fido/conf/dcmitype.xsd +0 -53
  125. data/tools/fido/conf/dcterms.xsd +0 -383
  126. data/tools/fido/conf/fido-formats.xsd +0 -173
  127. data/tools/fido/conf/format_extension_template.xml +0 -105
  128. data/tools/fido/conf/format_extensions.xml +0 -484
  129. data/tools/fido/conf/formats-v90.xml +0 -48877
  130. data/tools/fido/conf/pronom-xml-v90.zip +0 -0
  131. data/tools/fido/conf/versions.xml +0 -8
  132. data/tools/fido/fido.bat +0 -4
  133. data/tools/fido/fido.py +0 -884
  134. data/tools/fido/fido.sh +0 -5
  135. data/tools/fido/package.py +0 -96
  136. data/tools/fido/prepare.py +0 -645
  137. data/tools/fido/pronomutils.py +0 -200
  138. data/tools/fido/toxml.py +0 -60
  139. data/tools/fido/update_signatures.py +0 -183
@@ -0,0 +1,174 @@
1
+ require 'csv'
2
+ require 'tmpdir'
3
+
4
+ require 'singleton'
5
+ require 'libis/tools/extend/string'
6
+ require 'libis/tools/logger'
7
+ require 'libis/tools/command'
8
+
9
+ require 'libis/format/config'
10
+ require 'libis/format/type_database'
11
+
12
+ module Libis
13
+ module Format
14
+
15
+ class IdentificationTool
16
+ include Singleton
17
+ include ::Libis::Tools::Logger
18
+
19
+ def self.bad_mimetype(mimetype)
20
+ self.instance.bad_mimetype(mimetype)
21
+ end
22
+
23
+ def self.run(file, recursive = false)
24
+ if file.is_a?(Array)
25
+ return run_list file
26
+ elsif file.is_a?(String) && File.exists?(file) && File.readable?(file)
27
+ if File.directory?(file)
28
+ return run_dir(file, recursive)
29
+ elsif File.file?(file)
30
+ return self.instance.run(file)
31
+ end
32
+ end
33
+ raise ArgumentError,
34
+ 'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
35
+ end
36
+
37
+ def self.run_dir(file, recursive = true)
38
+ self.instance.run_dir file, recursive
39
+ end
40
+
41
+ def self.run_list(filelist)
42
+ self.instance.run_list filelist
43
+ end
44
+
45
+ protected
46
+
47
+ def create_list_file(filelist)
48
+ list_file = Dir::Tmpname.make_tmpname(%w'file .list', nil)
49
+ File.open(list_file, 'w') do |f|
50
+ filelist.each do |fname|
51
+ f.write "#{fname}\n"
52
+ end
53
+ end
54
+ yield(list_file)
55
+ ensure
56
+ File.delete(list_file)
57
+ end
58
+
59
+ def find_files(dir, recurse = true)
60
+ args = []
61
+ args << '-L'
62
+ args << dir.escape_for_string
63
+ args << '-maxdepth' << '1' unless recurse
64
+ args << '-type' << 'f'
65
+ args << '-print'
66
+ output = ::Libis::Tools::Command.run('find', *args)
67
+ warn "Find command errors: #{output[:err].join("\n")}" unless output[:err].empty?
68
+ output[:out]
69
+ end
70
+
71
+ # Reformat output to make it easier to post-process and decide on the preferred format
72
+ #
73
+ # input format:
74
+ # [
75
+ # { filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }
76
+ # ]
77
+ #
78
+ # output format:
79
+ # { <filename> => [<result>, ...], ... }
80
+ #
81
+ # <result> is the enhanced Hash output of the identification tool:
82
+ # { mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
83
+ #
84
+ def process_output(output)
85
+ output.reduce({}) do |results, x|
86
+ filepath = x.delete(:filepath)
87
+ results[filepath] ||= []
88
+ results[filepath.freeze] << annotate(x)
89
+ results
90
+ end
91
+ end
92
+
93
+ # Enhance the output with mimetype and score
94
+ def annotate(result)
95
+ # Enhance result with mimetype if needed
96
+ if bad_mimetypes.include?(result[:mimetype]) && !bad_puids.include?(result[:puid])
97
+ result[:mimetype] = get_mimetype(result[:puid])
98
+ end
99
+
100
+ # Normalize the mimetype
101
+ Libis::Format::TypeDatabase.normalize(result, PUID: :puid, MIME: :mimetype)
102
+
103
+ # Default score is 5
104
+ result[:score] = 5
105
+
106
+ # Weak detection score is 1
107
+ result[:score] = 1 if bad_mimetypes.include? result[:mimetype]
108
+
109
+ # freeze all strings
110
+ result.each {|_, v| v.freeze if v.is_a?(String)}
111
+
112
+ # Adapt score based on matchtype
113
+ result[:matchtype] = result[:matchtype].to_s.downcase
114
+ case result[:matchtype]
115
+
116
+ # Signature match increases score by 2
117
+ when 'signature'
118
+ result[:score] += 2
119
+ # typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
120
+ # ext = File.extname(result[:filename])
121
+ # result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
122
+
123
+ # Container match increases score by 4
124
+ when 'container'
125
+ result[:score] += 4
126
+ # typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
127
+ # ext = File.extname(result[:filename])
128
+ # result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
129
+
130
+ # Extension match is the weakest identification; score is lowered by 2 points
131
+ when 'extension'
132
+ result[:score] -= 2
133
+
134
+ # Magic code (file tool) is to be trusted even less
135
+ when 'magic'
136
+ result[:score] -= 3
137
+
138
+ # Or no change otherwise
139
+ else
140
+ # do nothing
141
+ end
142
+
143
+ # Detecting a zip file should decrease the score as it may hide one of the many zip-based formats (e.g. epub,
144
+ # Office OpenXML, OpenDocument, jar, maff, svx)
145
+ if result[:mimetype] == 'application/zip'
146
+ result[:score] -= 2
147
+ end
148
+
149
+ # Return result enhanced with mimetype and score fields
150
+ result
151
+ end
152
+
153
+ def get_mimetype(puid)
154
+ ::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
155
+ end
156
+
157
+ def get_puid(mimetype)
158
+ ::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
159
+ end
160
+
161
+ attr_accessor :bad_mimetypes, :bad_puids
162
+
163
+ def initialize
164
+ @bad_mimetypes = [nil, '', 'None', 'application/octet-stream']
165
+ @bad_puids = [nil, 'fmt/unknown']
166
+ end
167
+
168
+ def bad_mimetype(mimetype)
169
+ @bad_mimetypes << mimetype
170
+ end
171
+ end
172
+
173
+ end
174
+ end
@@ -3,13 +3,17 @@
3
3
  require 'singleton'
4
4
 
5
5
  require 'libis-tools'
6
+ require 'libis/tools/extend/hash'
6
7
  require 'libis/tools/extend/string'
7
8
  require 'libis/tools/extend/empty'
8
9
 
9
10
  require 'libis/format/type_database'
10
11
 
12
+ require_relative 'config'
11
13
  require_relative 'fido'
12
14
  require_relative 'droid'
15
+ require_relative 'file_tool'
16
+ require_relative 'extension_identification'
13
17
 
14
18
  module Libis
15
19
  module Format
@@ -18,39 +22,8 @@ module Libis
18
22
  include ::Libis::Tools::Logger
19
23
  include Singleton
20
24
 
21
- RETRY_MIMETYPES = %w(application/zip) + ::Libis::Format::Fido::BAD_MIMETYPES
22
- FIDO_FAILURES = %w(application/vnd.oasis.opendocument.text application/vnd.oasis.opendocument.spreadsheet)
23
-
24
- attr_reader :xml_validations
25
-
26
- protected
27
-
28
- def initialize
29
- data_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', '..', '..', 'data'))
30
- @fido_formats = [(File.join(data_dir, 'lias_formats.xml'))]
31
- # noinspection RubyStringKeysInHashInspection
32
- @xml_validations = {'archive/ead' => File.join(data_dir, 'ead.xsd')}
33
- end
34
-
35
- def result_ok?(result, who_is_asking = nil)
36
- result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
37
- return false if result.empty?
38
- return true unless result[:TYPE].empty?
39
- return false if RETRY_MIMETYPES.include? result[:mimetype]
40
- return false if FIDO_FAILURES.include? result[:mimetype] and who_is_asking == :DROID
41
- !(result[:mimetype].empty? and result[:puid].empty?)
42
- end
43
-
44
- def get_puid(mimetype)
45
- ::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
46
- end
47
-
48
25
  public
49
26
 
50
- def self.add_fido_format(f)
51
- ::Libis::Format::Fido.add_format f
52
- end
53
-
54
27
  def self.add_xml_validation(mimetype, xsd_file)
55
28
  instance.xml_validations[mimetype] = xsd_file
56
29
  end
@@ -59,134 +32,173 @@ module Libis
59
32
  instance.xml_validations
60
33
  end
61
34
 
62
- def self.get(file_path, options = nil)
63
- instance.get file_path, options
35
+ def self.get(file, options = {})
36
+ instance.get file, options
64
37
  end
65
38
 
66
- def get(file, options = nil)
39
+ attr_reader :xml_validations
67
40
 
68
- unless File.exists? file
69
- error 'File %s cannot be found.', file
70
- return nil
71
- end
72
- if File.directory? file
73
- error '%s is a directory.', file
74
- return nil
75
- end
41
+ def get(file, options = {})
76
42
 
77
- options ||= {}
43
+ options[:droid] = true unless options[:tool] and options[:tool] != :droid
44
+ options[:fido] = true unless options[:tool] and options[:tool] != :fido
45
+ options[:file] = true unless options[:tool] and options[:tool] != :file
78
46
 
79
- result = {messages: []}
47
+ result = {messages: [], output: {}, formats: {}}
80
48
 
81
- # use FIDO
82
- # Note: FIDO does not always do a good job, mainly due to lacking container inspection.
83
- # FIDO misses should be registered in
84
- result = get_fido_identification(file, result, options[:formats]) unless options[:droid]
49
+ begin
50
+ get_droid_identification(file, options[:recursive], result) if options[:droid]
51
+ rescue => e
52
+ log_msg(result, :error, "Error running Droid: #{e.message} @ #{e.backtrace.first}")
53
+ end
85
54
 
86
- # use DROID
87
- result = get_droid_identification file, result
55
+ begin
56
+ get_fido_identification(file, options[:recursive], result) if options[:fido]
57
+ rescue => e
58
+ log_msg(result, :error, "Error running Fido: #{e.message} @ #{e.backtrace.first}")
59
+ end
88
60
 
89
- # use FILE
90
- result = get_file_identification(file, result)
61
+ begin
62
+ get_file_identification(file, options[:recursive], result) if options[:file]
63
+ rescue => e
64
+ log_msg(result, :error, "Error running File: #{e.message} @ #{e.backtrace.first}")
65
+ end
91
66
 
92
- # Try file extension
93
- result = get_extension_identification(file, result)
67
+ # get_extension_identification(file, options[:recursive], result)
94
68
 
95
69
  # determine XML type. Add custom types at runtime with
96
70
  # Libis::Tools::Format::Identifier.add_xml_validation('my_type', '/path/to/my_type.xsd')
97
- result = validate_against_xml_schema(file, result)
71
+ validate_against_xml_schema(result)
98
72
 
99
- result[:mimetype] ?
100
- log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
101
- log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
102
- end
73
+ process_results(result)
103
74
 
104
- def get_fido_identification(file, result = {}, xtra_formats = nil)
105
- return result if result_ok? result
75
+ # result[:mimetype] ?
76
+ # log_msg(result, :info, "Identification of '#{file}': '#{result}'") :
77
+ # log_msg(result, :warn, "Could not identify MIME type of '#{file}'")
106
78
 
107
- fido_result = ::Libis::Format::Fido.run(file, xtra_formats)
79
+ result
108
80
 
109
- return result unless fido_result.is_a? Hash
81
+ end
110
82
 
111
- result.merge! fido_result
112
- result[:method] = 'fido'
83
+ protected
113
84
 
114
- log_msg(result, :debug, "Fido MIME-type: #{result[:mimetype]} (PRONOM UID: #{result[:puid]})")
85
+ def initialize
86
+ @xml_validations = Libis::Format::Config[:xml_validations].to_h
115
87
  end
116
88
 
117
- def get_droid_identification(file, result = {})
118
- return result if result_ok? result, :DROID
119
- droid_output = ::Libis::Format::Droid.run file
120
- result[:messages] << [:debug, "DROID: #{droid_output}"]
121
- warn 'Droid found multiple matches; using first match only' if droid_output.size > 1
122
- result.clear
123
- droid_output = droid_output.first
124
- result[:mimetype] = droid_output[:mime_type].to_s.split(/[\s,]+/).find { |x| x =~ /.*\/.*/ }
125
- result[:matchtype] = droid_output[:method]
126
- result[:puid] = droid_output[:puid]
127
- result[:format_name] = droid_output[:format_name]
128
- result[:format_version] = droid_output[:format_version]
129
- result[:method] = 'droid'
89
+ def get_file_identification(file, recursive, result)
90
+ output = ::Libis::Format::FileTool.run(file, recursive)
91
+ process_tool_output(output, result)
92
+ output
93
+ end
130
94
 
131
- log_msg(result, :debug, "Droid MIME-type: #{result[:mimetype]} (PRONOM UID: #{result[:puid]})")
95
+ def get_fido_identification(file, recursive, result)
96
+ output = ::Libis::Format::Fido.run(file, recursive)
97
+ process_tool_output(output, result)
98
+ output
132
99
  end
133
100
 
134
- def get_file_identification(file, result = nil)
135
- return result if result_ok? result
136
- begin
137
- output = ::Libis::Tools::Command.run('file', '-b', '--mime-type', "\"#{file.escape_for_string}\"")[:err]
138
- mimetype = output.strip.split
139
- if mimetype
140
- log_msg(result, :debug, "File result: '#{mimetype}'")
141
- result[:mimetype] = mimetype
142
- result[:puid] = get_puid(mimetype)
143
- end
144
- result[:method] = 'file'
145
- rescue Exception
146
- # ignored
147
- end
148
- result
101
+ def get_droid_identification(file, recursive, result)
102
+ output = ::Libis::Format::Droid.run(file, recursive)
103
+ process_tool_output(output, result)
104
+ output
105
+ end
106
+
107
+ def get_extension_identification(file, recursive, result)
108
+ output = ::Libis::Format::ExtensionIdentification.run(file, recursive)
109
+ process_tool_output(output, result)
110
+ output
149
111
  end
150
112
 
151
- def get_extension_identification(file, result = nil)
152
- return result if result_ok? result
153
- info = ::Libis::Format::TypeDatabase.ext_infos(File.extname(file)).first
154
- log_msg result, :debug, "File extension info: #{info}"
155
- if info
156
- result[:mimetype] = info[:MIME].first rescue nil
157
- result[:puid] = info[:PUID].first rescue nil
113
+ def validate_against_xml_schema(result)
114
+ result[:output].each do |file, file_results|
115
+ file_results.each do |file_result|
116
+ xml_validate(file, file_result, result)
117
+ end
158
118
  end
159
- result[:method] = 'extension'
160
- result
161
119
  end
162
120
 
163
- def validate_against_xml_schema(file, result)
164
- return result unless result[:mimetype] =~ /^(text|application)\/xml$/
121
+ def xml_validate(file, file_result, result)
122
+ return unless file_result[:mimetype] =~ /^(text|application)\/xml$/
165
123
  doc = ::Libis::Tools::XmlDocument.open file
166
124
  xml_validations.each do |mime, xsd_file|
167
125
  next unless xsd_file
168
126
  begin
169
127
  if doc.validates_against?(xsd_file)
170
128
  log_msg result, :debug, "XML file validated against XML Schema: #{xsd_file}"
171
- result[:mimetype] = mime
172
- result[:puid] = nil
173
- result = ::Libis::Format::TypeDatabase.enrich(result, PUID: :puid, MIME: :mimetype)
129
+ info = {mimetype: mime, tool: file_result[:source], source: :xsd_validation, match_type: 'xsd_validation', format_version: '', }
130
+ file_result.merge! Libis::Format::TypeDatabase.enrich(info, PUID: :puid, MIME: :mimetype, NAME: :format_name)
174
131
  end
175
- rescue
176
- # Do nothing - probably Nokogiri chrashed during validation.
177
- # Could have many causes (remote schema: firewall, network, link rot, ...; schema syntax error; ...)
178
- # so we just ignore and continue.
132
+ rescue => e
133
+ # Do not abort - probably Nokogiri crashed during validation. Could have many causes
134
+ # (remote schema (firewall, network, link rot, ...), schema syntax error, corrupt XML,...)
135
+ # so we log and continue.
136
+ log_msg(result, :warn, "Error during XML validation: #{e.message}")
179
137
  end
180
138
  end
181
- result
139
+ end
140
+
141
+ def process_results(result)
142
+ result[:output].map do |file, output|
143
+ file_result = result[:formats][file] = {}
144
+ if output.empty?
145
+ file_result = {
146
+ mimetype: 'application/octet-stream',
147
+ puid: 'fmt/unknown',
148
+ score: 0,
149
+ source: nil
150
+ }
151
+ else
152
+ format_matches = output.group_by {|x| [x[:mimetype], x[:puid]]}
153
+ format_matches.each do |match, group|
154
+ format_matches[match] = group.group_by {|x| x[:score]}.sort.reverse.to_h
155
+ end
156
+ case format_matches.count
157
+ when 0
158
+ # No, this really cannot happen. If there are no hits, there will be at least a format [nil,nil]
159
+ when 1
160
+ # only one match, that's easy. The first of the highest score will be used
161
+ file_result.merge!(get_best_result(output))
162
+ else
163
+ process_multiple_formats(file_result, format_matches, output)
164
+ end
165
+ end
166
+ end
167
+ end
168
+
169
+ def process_multiple_formats(file_result, format_matches, output)
170
+ # multiple matches. Let's select the highest score matches
171
+ file_result.merge!(get_best_result(output))
172
+ file_result[:alternatives] = []
173
+ format_matches.keys.each do |mime, puid|
174
+ next if file_result[:mimetype] == mime && puid.nil?
175
+ selection = output.select {|x| x[:mimetype] == mime && x[:puid] == puid}
176
+ file_result[:alternatives] << get_best_result(selection)
177
+ end
178
+ file_result[:alternatives] = file_result[:alternatives].sort_by {|x| x[:score]}.reverse
179
+ file_result.delete(:alternatives) if file_result[:alternatives].size <= 1
182
180
  end
183
181
 
184
182
  private
185
183
 
184
+ def process_tool_output(output, result)
185
+ output.each do |file, file_output|
186
+ result[:output][file] ||= []
187
+ result[:output][file] += file_output
188
+ end
189
+ end
190
+
186
191
  def log_msg(result, severity, text)
187
- return {} unless result.is_a?(Hash)
188
- (result[:messages] ||= []) << [severity, text]
189
- result
192
+ result[:messages] << [severity, text]
193
+ end
194
+
195
+ def get_mimetype(puid)
196
+ ::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
197
+ end
198
+
199
+ def get_best_result(results)
200
+ score = results.map {|x| x[:score]}.max
201
+ results.select {|x| x[:score] == score}.reduce(:apply_defaults)
190
202
  end
191
203
 
192
204
  end
@@ -7,6 +7,8 @@ require 'backports/rails/hash'
7
7
  require 'libis/tools/logger'
8
8
  require 'libis/tools/extend/string'
9
9
 
10
+ require_relative 'config'
11
+
10
12
  module Libis
11
13
  module Format
12
14
 
@@ -19,6 +21,25 @@ module Libis
19
21
  end
20
22
 
21
23
  def self.enrich(info, map_keys = {})
24
+ return {} unless info.is_a? Hash
25
+ mapper = Hash.new {|hash,key| hash[key] = key}
26
+ mapper.merge! map_keys
27
+ unless (puid = info[mapper[:PUID]]).blank?
28
+ info[mapper[:TYPE]] ||= puid_infos(puid).first[:TYPE] rescue nil
29
+ end
30
+ unless (mime = info[mapper[:MIME]]).blank?
31
+ info[mapper[:TYPE]] ||= mime_infos(mime).first[:TYPE] rescue nil
32
+ end
33
+ unless (type_name = info[mapper[:TYPE]]).nil?
34
+ mapper.keys.each do |key|
35
+ info[mapper[key]] = get(type_name, key) || info[mapper[key]]
36
+ end
37
+ info[mapper[:GROUP]] = self.type_group(type_name)
38
+ end
39
+ info
40
+ end
41
+
42
+ def self.normalize(info, map_keys = {})
22
43
  return {} unless info.is_a? Hash
23
44
  mapper = Hash.new {|hash,key| hash[key] = key}
24
45
  mapper.merge! map_keys
@@ -29,14 +50,25 @@ module Libis
29
50
  info[mapper[:TYPE]] ||= self.mime_infos(mime).first[:TYPE] rescue nil
30
51
  end
31
52
  unless (type_name = info[mapper[:TYPE]]).nil?
32
- info[mapper[:MIME]] = self.type_mimetypes(type_name).first if info[mapper[:MIME]].blank?
33
- info[mapper[:PUID]] = self.type_puids(type_name).first if info[mapper[:PUID]].blank?
34
- info[mapper[:EXTENSIONS]] = self.type_extentions(type_name)
53
+ info[mapper[:MIME]] = self.type_mimetypes(type_name).first if self.type_mimetypes(type_name).first
35
54
  info[mapper[:GROUP]] = self.type_group(type_name)
36
55
  end
37
56
  info
38
57
  end
39
58
 
59
+ def self.get(type_name, key)
60
+ case key
61
+ when :MIME
62
+ type_mimetypes(type_name).first
63
+ when :PUID
64
+ type_puids(type_name).first
65
+ when :EXTENSION
66
+ type_extentions(type_name).first
67
+ else
68
+ self.typeinfo(type_name)[key]
69
+ end
70
+ end
71
+
40
72
  def self.type_group(t)
41
73
  typeinfo(t)[:GROUP]
42
74
  end
@@ -159,8 +191,7 @@ module Libis
159
191
 
160
192
  def initialize
161
193
  @types = Hash.new
162
- data_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', '..', '..', 'data'))
163
- type_database = File.join(data_dir, 'types.yml')
194
+ type_database = Libis::Format::Config[:type_database]
164
195
  load_types(type_database)
165
196
  end
166
197
 
@@ -1,5 +1,5 @@
1
1
  module Libis
2
2
  module Format
3
- VERSION = '0.9.32'
3
+ VERSION = '0.9.33'
4
4
  end
5
5
  end
data/lib/libis/format.rb CHANGED
@@ -5,9 +5,12 @@ module Libis
5
5
  autoload :Config, 'libis/format/config'
6
6
  autoload :TypeDatabase, 'libis/format/type_database'
7
7
  autoload :Identifier, 'libis/format/identifier'
8
+ autoload :Identifier, 'libis/format/identifier'
8
9
 
10
+ autoload :FileTool, 'libis/format/file_tool'
9
11
  autoload :Fido, 'libis/format/fido'
10
12
  autoload :Droid, 'libis/format/droid'
13
+ autoload :ExtensionIdentification, 'libis/format/extension_identification'
11
14
 
12
15
  autoload :OfficeToPdf, 'libis/format/office_to_pdf'
13
16
  autoload :PdfCopy, 'libis/format/pdf_copy'
data/libis-format.gemspec CHANGED
@@ -26,8 +26,9 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency 'rake', '~> 10.3'
27
27
  spec.add_development_dependency 'rspec', '~> 3.1'
28
28
  spec.add_development_dependency 'simplecov', '~> 0.9'
29
+ spec.add_development_dependency 'awesome_print'
29
30
 
30
- spec.add_runtime_dependency 'libis-tools', '~> 0.9'
31
+ spec.add_runtime_dependency 'libis-tools', '~> 0.9.52'
31
32
  spec.add_runtime_dependency 'os', '= 0.9.6'
32
33
  spec.add_runtime_dependency 'mini_magick', '~> 4.3'
33
34
  spec.add_runtime_dependency 'deep_dive', '~> 0.3'
@@ -23,6 +23,8 @@ describe 'Converters' do
23
23
 
24
24
  before(:all) {
25
25
  Libis::Tools::Config.logger.level = :WARN
26
+ ::Libis::Format::Config[:droid_path] = '/opt/droid/droid.sh'
27
+ ::Libis::Format::Config[:fido_path] = '/usr/local/bin/fido'
26
28
  }
27
29
 
28
30
  context 'Repository' do
@@ -114,7 +116,7 @@ describe 'Converters' do
114
116
  expect(result).to eq tgt_file
115
117
  compare = MiniMagick::Tool::Compare.new
116
118
  compare << ref_file << tgt_file
117
- compare.metric << 'AE'
119
+ compare.metric << 'MAE'
118
120
  compare.fuzz << '1%'
119
121
  compare << diff_file
120
122
  compare.call {|_, _, status| expect(status).to be 0}
@@ -132,7 +134,7 @@ describe 'Converters' do
132
134
  expect(result).to eq tgt_file
133
135
  compare = MiniMagick::Tool::Compare.new
134
136
  compare << ref_file << tgt_file
135
- compare.metric << 'AE'
137
+ compare.metric << 'MAE'
136
138
  compare << diff_file
137
139
  compare.call {|_, _, status| expect(status).to be 0}
138
140
  FileUtils.rm tgt_file, force: true
@@ -167,7 +169,7 @@ describe 'Converters' do
167
169
  compare = MiniMagick::Tool::Compare.new
168
170
  compare << ref_file << tgt_file
169
171
  compare.metric << 'AE'
170
- compare.fuzz << '1%'
172
+ compare.fuzz << '100%'
171
173
  compare << diff_file
172
174
  compare.call do |_stdin, _stdout, status|
173
175
  expect(status).to be 0
@@ -189,7 +191,7 @@ describe 'Converters' do
189
191
  expect(File.exist?(tgt_file)).to be_truthy
190
192
  compare = MiniMagick::Tool::Compare.new
191
193
  compare << ref_file << tgt_file
192
- compare.metric << 'AE'
194
+ compare.metric << 'MAE'
193
195
  compare.fuzz << '10%'
194
196
  compare << diff_file
195
197
  compare.call {|_,_,status| expect(status).to be 0}