libis-format 0.9.41 → 0.9.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +365 -0
- data/bin/droid +1 -1
- data/bin/fido +1 -1
- data/bin/pdf_copy +1 -1
- data/lib/libis/format/config.rb +1 -0
- data/lib/libis/format/converter/audio_converter.rb +1 -1
- data/lib/libis/format/converter/base.rb +2 -1
- data/lib/libis/format/converter/office_converter.rb +2 -2
- data/lib/libis/format/converter/pdf_converter.rb +6 -6
- data/lib/libis/format/converter/video_converter.rb +96 -2
- data/lib/libis/format/identifier.rb +12 -12
- data/lib/libis/format/tool/droid.rb +108 -0
- data/lib/libis/format/tool/extension_identification.rb +58 -0
- data/lib/libis/format/tool/ffmpeg.rb +43 -0
- data/lib/libis/format/tool/fido.rb +91 -0
- data/lib/libis/format/tool/file_tool.rb +78 -0
- data/lib/libis/format/tool/identification_tool.rb +175 -0
- data/lib/libis/format/tool/office_to_pdf.rb +54 -0
- data/lib/libis/format/tool/pdf_copy.rb +42 -0
- data/lib/libis/format/tool/pdf_merge.rb +43 -0
- data/lib/libis/format/tool/pdf_optimizer.rb +38 -0
- data/lib/libis/format/tool/pdf_split.rb +41 -0
- data/lib/libis/format/tool/pdf_to_pdfa.rb +78 -0
- data/lib/libis/format/tool/pdfa_validator.rb +63 -0
- data/lib/libis/format/tool.rb +23 -0
- data/lib/libis/format/version.rb +1 -1
- data/lib/libis/format.rb +1 -15
- data/libis-format.gemspec +1 -2
- data/spec/converter_audio_spec.rb +66 -0
- data/spec/converter_image_spec.rb +166 -0
- data/spec/converter_office_spec.rb +84 -0
- data/spec/converter_pdf_spec.rb +30 -0
- data/spec/converter_repository_spec.rb +91 -0
- data/spec/converter_video_spec.rb +97 -0
- data/spec/data/video/copyright.png +0 -0
- data/spec/identifier_spec.rb +3 -15
- metadata +32 -33
- data/lib/libis/format/droid.rb +0 -106
- data/lib/libis/format/extension_identification.rb +0 -55
- data/lib/libis/format/ffmpeg.rb +0 -41
- data/lib/libis/format/fido.rb +0 -89
- data/lib/libis/format/file_tool.rb +0 -76
- data/lib/libis/format/identification_tool.rb +0 -174
- data/lib/libis/format/office_to_pdf.rb +0 -52
- data/lib/libis/format/pdf_copy.rb +0 -40
- data/lib/libis/format/pdf_merge.rb +0 -41
- data/lib/libis/format/pdf_optimizer.rb +0 -36
- data/lib/libis/format/pdf_split.rb +0 -39
- data/lib/libis/format/pdf_to_pdfa.rb +0 -74
- data/lib/libis/format/pdfa_validator.rb +0 -61
- data/spec/converter_spec.rb +0 -433
@@ -12,10 +12,10 @@ require 'nori/core_ext/object'
|
|
12
12
|
require 'libis/format/type_database'
|
13
13
|
|
14
14
|
require_relative 'config'
|
15
|
-
require_relative 'fido'
|
16
|
-
require_relative 'droid'
|
17
|
-
require_relative 'file_tool'
|
18
|
-
require_relative 'extension_identification'
|
15
|
+
require_relative 'tool/fido'
|
16
|
+
require_relative 'tool/droid'
|
17
|
+
require_relative 'tool/file_tool'
|
18
|
+
require_relative 'tool/extension_identification'
|
19
19
|
|
20
20
|
module Libis
|
21
21
|
module Format
|
@@ -78,7 +78,7 @@ module Libis
|
|
78
78
|
log_msg(result, :error, "Error validating XML files: #{e.message} @ #{e.backtrace.first}")
|
79
79
|
end
|
80
80
|
|
81
|
-
process_results(result)
|
81
|
+
process_results(result, !options[:keep_output])
|
82
82
|
|
83
83
|
result
|
84
84
|
|
@@ -91,25 +91,25 @@ module Libis
|
|
91
91
|
end
|
92
92
|
|
93
93
|
def get_file_identification(file, result, options)
|
94
|
-
output = ::Libis::Format::FileTool.run(file, options[:recursive])
|
94
|
+
output = ::Libis::Format::Tool::FileTool.run(file, options[:recursive])
|
95
95
|
process_tool_output(output, result, options[:base_dir])
|
96
96
|
output
|
97
97
|
end
|
98
98
|
|
99
99
|
def get_fido_identification(file, result, options)
|
100
|
-
output = ::Libis::Format::Fido.run(file, options[:recursive])
|
100
|
+
output = ::Libis::Format::Tool::Fido.run(file, options[:recursive])
|
101
101
|
process_tool_output(output, result, options[:base_dir])
|
102
102
|
output
|
103
103
|
end
|
104
104
|
|
105
105
|
def get_droid_identification(file, result, options)
|
106
|
-
output = ::Libis::Format::Droid.run(file, options[:recursive])
|
106
|
+
output = ::Libis::Format::Tool::Droid.run(file, options[:recursive])
|
107
107
|
process_tool_output(output, result, options[:base_dir])
|
108
108
|
output
|
109
109
|
end
|
110
110
|
|
111
111
|
def get_extension_identification(file, result, options)
|
112
|
-
output = ::Libis::Format::ExtensionIdentification.run(file, options[:recursive])
|
112
|
+
output = ::Libis::Format::Tool::ExtensionIdentification.run(file, options[:recursive])
|
113
113
|
process_tool_output(output, result, options[:base_dir])
|
114
114
|
output
|
115
115
|
end
|
@@ -149,9 +149,9 @@ module Libis
|
|
149
149
|
log_msg(result, :warn, "Error parsing XML file #{file}: #{e.message} @ #{e.backtrace.first}")
|
150
150
|
end
|
151
151
|
|
152
|
-
def process_results(result)
|
152
|
+
def process_results(result, delete_output = true)
|
153
153
|
result[:output].keys.each do |file|
|
154
|
-
output = result[:output]
|
154
|
+
output = result[:output][file]
|
155
155
|
file_result = result[:formats][file] = {}
|
156
156
|
if output.empty?
|
157
157
|
log_msg(result, :warn, "Could not identify format of '#{file}'.")
|
@@ -177,7 +177,7 @@ module Libis
|
|
177
177
|
end
|
178
178
|
end
|
179
179
|
end
|
180
|
-
result.delete(:output)
|
180
|
+
result.delete(:output) if delete_output
|
181
181
|
end
|
182
182
|
|
183
183
|
def process_multiple_formats(file_result, format_matches, output)
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
require 'tempfile'
|
4
|
+
require 'csv'
|
5
|
+
|
6
|
+
require 'libis/format/config'
|
7
|
+
|
8
|
+
unless CSV::HeaderConverters.has_key?(:droid_headers)
|
9
|
+
CSV::HeaderConverters[:droid_headers] = lambda {|h|
|
10
|
+
h.encode(ConverterEncoding).downcase.strip.
|
11
|
+
gsub(/\W+/, "").to_sym
|
12
|
+
}
|
13
|
+
end
|
14
|
+
|
15
|
+
require_relative 'identification_tool'
|
16
|
+
|
17
|
+
module Libis
|
18
|
+
module Format
|
19
|
+
module Tool
|
20
|
+
|
21
|
+
class Droid < Libis::Format::Tool::IdentificationTool
|
22
|
+
|
23
|
+
def run_list(filelist)
|
24
|
+
runner(filelist)
|
25
|
+
end
|
26
|
+
|
27
|
+
def run_dir(dir, recursive = true)
|
28
|
+
profile = profile_file_name
|
29
|
+
report = result_file_name
|
30
|
+
create_profile(dir, profile, recursive)
|
31
|
+
create_report(profile, report)
|
32
|
+
parse_report(report)
|
33
|
+
end
|
34
|
+
|
35
|
+
def run(file)
|
36
|
+
runner(file)
|
37
|
+
end
|
38
|
+
|
39
|
+
protected
|
40
|
+
|
41
|
+
def runner(file_or_list)
|
42
|
+
profile = profile_file_name
|
43
|
+
report = result_file_name
|
44
|
+
create_profile(file_or_list, profile)
|
45
|
+
create_report(profile, report)
|
46
|
+
parse_report(report)
|
47
|
+
end
|
48
|
+
|
49
|
+
def parse_report(report)
|
50
|
+
keys = [
|
51
|
+
:id, :parent_id, :uri, :filepath, :filename, :matchtype, :status, :filesize, :type, :extension,
|
52
|
+
:mod_time, :ext_mismatch, :hash, :format_count, :puid, :mimetype, :format_name, :format_version]
|
53
|
+
result = CSV.parse(File.readlines(report).join)
|
54
|
+
.map {|a| Hash[keys.zip(a)]}
|
55
|
+
.select {|a| a[:type] == 'File'}
|
56
|
+
# File.delete report
|
57
|
+
result.each do |r|
|
58
|
+
r.delete(:id)
|
59
|
+
r.delete(:parent_id)
|
60
|
+
r.delete(:uri)
|
61
|
+
r.delete(:filename)
|
62
|
+
r.delete(:status)
|
63
|
+
r.delete(:filesize)
|
64
|
+
r.delete(:type)
|
65
|
+
r.delete(:extension)
|
66
|
+
r.delete(:mod_time)
|
67
|
+
r.delete(:hash)
|
68
|
+
r.delete(:format_count)
|
69
|
+
r[:source] = :droid
|
70
|
+
end
|
71
|
+
File.delete report
|
72
|
+
process_output(result)
|
73
|
+
end
|
74
|
+
|
75
|
+
def create_report(profile, report)
|
76
|
+
args = [
|
77
|
+
'-e', report,
|
78
|
+
'-p', profile,
|
79
|
+
'-q'
|
80
|
+
]
|
81
|
+
result = Libis::Tools::Command.run(Libis::Format::Config[:droid_path], *args)
|
82
|
+
raise RuntimeError, "DROID report errors: #{result[:err].join("\n")}" unless result[:status] == 0
|
83
|
+
File.delete profile
|
84
|
+
end
|
85
|
+
|
86
|
+
def create_profile(file_or_list, profile, recursive = false)
|
87
|
+
args = []
|
88
|
+
files = (file_or_list.is_a?(Array)) ? file_or_list.map(&:escape_for_string) : [file_or_list.escape_for_string]
|
89
|
+
files.each {|file| args << '-a' << file}
|
90
|
+
args << '-p' << profile << '-q'
|
91
|
+
args << '-R' if recursive
|
92
|
+
result = Libis::Tools::Command.run(Libis::Format::Config[:droid_path], *args)
|
93
|
+
raise RuntimeError, "DROID profile errors: #{result[:err].join("\n")}" unless result[:status] == 0
|
94
|
+
end
|
95
|
+
|
96
|
+
def profile_file_name
|
97
|
+
Tools::TempFile.name('droid', '.profile')
|
98
|
+
end
|
99
|
+
|
100
|
+
def result_file_name
|
101
|
+
Tools::TempFile.name('droid', '.csv')
|
102
|
+
end
|
103
|
+
|
104
|
+
end
|
105
|
+
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require_relative 'identification_tool'
|
2
|
+
|
3
|
+
module Libis
|
4
|
+
module Format
|
5
|
+
module Tool
|
6
|
+
|
7
|
+
class ExtensionIdentification < Libis::Format::Tool::IdentificationTool
|
8
|
+
|
9
|
+
def run_list(filelist)
|
10
|
+
|
11
|
+
output = runner(nil, filelist)
|
12
|
+
|
13
|
+
process_output(output)
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
def run_dir(dir, recursive = true)
|
18
|
+
|
19
|
+
filelist = find_files(dir, recursive)
|
20
|
+
|
21
|
+
output = runner(nil, filelist)
|
22
|
+
|
23
|
+
process_output(output)
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
def run(file)
|
28
|
+
|
29
|
+
output = runner(file)
|
30
|
+
|
31
|
+
process_output(output)
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
protected
|
36
|
+
|
37
|
+
def runner(*args)
|
38
|
+
|
39
|
+
args.map do |file|
|
40
|
+
info = ::Libis::Format::TypeDatabase.ext_infos(File.extname(file)).first
|
41
|
+
if info
|
42
|
+
{
|
43
|
+
filepath: file,
|
44
|
+
mimetype: (info[:MIME].first rescue nil),
|
45
|
+
puid: (info[:PUID].first rescue nil),
|
46
|
+
matchtype: 'extension',
|
47
|
+
source: :type_database
|
48
|
+
}
|
49
|
+
end
|
50
|
+
end.cleanup
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'libis/tools/extend/string'
|
2
|
+
require 'libis/tools/extend/empty'
|
3
|
+
require 'libis/tools/command'
|
4
|
+
|
5
|
+
require 'csv'
|
6
|
+
require 'libis/format/config'
|
7
|
+
|
8
|
+
module Libis
|
9
|
+
module Format
|
10
|
+
module Tool
|
11
|
+
|
12
|
+
class FFMpeg
|
13
|
+
include Singleton
|
14
|
+
include ::Libis::Tools::Logger
|
15
|
+
|
16
|
+
def self.run(source, target, options = {})
|
17
|
+
self.instance.run source, target, options
|
18
|
+
end
|
19
|
+
|
20
|
+
def run(source, target, options = {})
|
21
|
+
opts = []
|
22
|
+
opts += options[:global] unless options[:global].empty?
|
23
|
+
opts += options[:input] unless options[:input].empty?
|
24
|
+
opts << '-i' << source
|
25
|
+
opts += options[:filter] unless options[:filter].empty?
|
26
|
+
opts += options[:output] unless options[:output].empty?
|
27
|
+
opts << target
|
28
|
+
result = Libis::Tools::Command.run(Libis::Format::Config[:ffmpeg_path], *opts)
|
29
|
+
|
30
|
+
unless result[:status] == 0
|
31
|
+
error "FFMpeg errors: #{(result[:err] + result[:out]).join("\n")}"
|
32
|
+
return false
|
33
|
+
end
|
34
|
+
warn "FFMpeg warnings: #{(result[:err] + result[:out]).join("\n")}" unless result[:err].empty?
|
35
|
+
|
36
|
+
result[:out]
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'libis/tools/extend/string'
|
2
|
+
require 'libis/tools/command'
|
3
|
+
|
4
|
+
require 'csv'
|
5
|
+
require 'libis/format/config'
|
6
|
+
|
7
|
+
require_relative 'identification_tool'
|
8
|
+
|
9
|
+
module Libis
|
10
|
+
module Format
|
11
|
+
module Tool
|
12
|
+
|
13
|
+
class Fido < Libis::Format::Tool::IdentificationTool
|
14
|
+
|
15
|
+
def self.add_formats(formats_file)
|
16
|
+
self.instance.formats << formats_file unless self.instance.formats.include?(formats_file)
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.del_formats(formats_file)
|
20
|
+
self.instance.formats.delete(formats_file)
|
21
|
+
end
|
22
|
+
|
23
|
+
attr_reader :formats
|
24
|
+
|
25
|
+
def run_list(filelist)
|
26
|
+
create_list_file(filelist) do |list_file|
|
27
|
+
output = runner(nil, '-input', list_file.escape_for_string)
|
28
|
+
process_output(output)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def run_dir(dir, recursive = true)
|
33
|
+
args = []
|
34
|
+
args << '-recurse' if recursive
|
35
|
+
output = runner(dir, *args)
|
36
|
+
process_output(output)
|
37
|
+
end
|
38
|
+
|
39
|
+
def run(file)
|
40
|
+
output = runner(file)
|
41
|
+
process_output(output)
|
42
|
+
end
|
43
|
+
|
44
|
+
protected
|
45
|
+
|
46
|
+
def initialize
|
47
|
+
super
|
48
|
+
@formats = Libis::Format::Config[:fido_formats].dup
|
49
|
+
bad_mimetype('application/vnd.oasis.opendocument.text')
|
50
|
+
bad_mimetype('application/vnd.oasis.opendocument.spreadsheet')
|
51
|
+
end
|
52
|
+
|
53
|
+
attr_writer :formats
|
54
|
+
|
55
|
+
def runner(filename, *args)
|
56
|
+
# Load custom format definitions if present
|
57
|
+
args << '-loadformats' << "#{formats.join(',')}" unless formats.empty?
|
58
|
+
|
59
|
+
# Workaround for Fido performance bug
|
60
|
+
args << '-bufsize' << '1000'
|
61
|
+
|
62
|
+
# Add filename to argument list (optional)
|
63
|
+
args << "#{filename.escape_for_string}" if filename
|
64
|
+
|
65
|
+
# No header output
|
66
|
+
args << '-q'
|
67
|
+
|
68
|
+
# Run command and capture results
|
69
|
+
fido = ::Libis::Tools::Command.run(Libis::Format::Config[:fido_path], *args)
|
70
|
+
|
71
|
+
# Log warning if needed
|
72
|
+
raise RuntimeError, "Fido errors: #{fido[:err].join("\n")}" unless fido[:err].empty?
|
73
|
+
|
74
|
+
# Parse output (CSV) text into array and return result
|
75
|
+
keys = [:status, :time, :puid, :format_name, :format_version, :filesize, :filepath, :mimetype, :matchtype]
|
76
|
+
result = CSV.parse(fido[:out].join("\n"))
|
77
|
+
.map {|a| Hash[keys.zip(a)]}
|
78
|
+
.select {|a| a[:status] == 'OK'}
|
79
|
+
result.each do |r|
|
80
|
+
r.delete(:time)
|
81
|
+
r.delete(:status)
|
82
|
+
r.delete(:filesize)
|
83
|
+
r[:source] = :fido
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require_relative 'identification_tool'
|
2
|
+
|
3
|
+
module Libis
|
4
|
+
module Format
|
5
|
+
module Tool
|
6
|
+
|
7
|
+
class FileTool < Libis::Format::Tool::IdentificationTool
|
8
|
+
|
9
|
+
def run_list(filelist)
|
10
|
+
|
11
|
+
create_list_file(filelist) do |list_file|
|
12
|
+
|
13
|
+
output = runnerIdentificationTool(nil, '--files-from', list_file)
|
14
|
+
|
15
|
+
process_output(output)
|
16
|
+
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
def run_dir(dir, recursive = true)
|
22
|
+
|
23
|
+
filelist = find_files(dir, recursive)
|
24
|
+
|
25
|
+
create_list_file(filelist) do |list_file|
|
26
|
+
|
27
|
+
output = runner(nil, '--files-from', list_file)
|
28
|
+
|
29
|
+
process_output(output)
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
def run(file)
|
36
|
+
|
37
|
+
output = runner(file)
|
38
|
+
|
39
|
+
process_output(output)
|
40
|
+
|
41
|
+
end
|
42
|
+
|
43
|
+
protected
|
44
|
+
|
45
|
+
def runner(filename, *args)
|
46
|
+
|
47
|
+
# Create new argument list
|
48
|
+
opts = []
|
49
|
+
|
50
|
+
# Add fixed options
|
51
|
+
# -L : follow symlinks
|
52
|
+
# --mime-type : only print MIME type
|
53
|
+
opts << '-L' << '--mime-type'
|
54
|
+
|
55
|
+
# Append passed arguments
|
56
|
+
opts += args
|
57
|
+
|
58
|
+
# Finally add the filename to process
|
59
|
+
opts << filename.escape_for_string if filename
|
60
|
+
|
61
|
+
# Run the UNIX file command and capture the results
|
62
|
+
file_tool = ::Libis::Tools::Command.run('file', *opts)
|
63
|
+
|
64
|
+
raise RuntimeError, "File command errors: #{file_tool[:err].join("\n")}" unless file_tool[:err].empty?
|
65
|
+
|
66
|
+
|
67
|
+
# Parse output text into array and return result
|
68
|
+
file_tool[:out].map do |line|
|
69
|
+
r = line.split(/:\s+/)
|
70
|
+
{filepath: r[0], mimetype: r[1], matchtype: 'magic', source: :file}
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,175 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
require 'singleton'
|
5
|
+
require 'libis/tools/extend/string'
|
6
|
+
require 'libis/tools/logger'
|
7
|
+
require 'libis/tools/command'
|
8
|
+
|
9
|
+
require 'libis/format/config'
|
10
|
+
require 'libis/format/type_database'
|
11
|
+
|
12
|
+
module Libis
|
13
|
+
module Format
|
14
|
+
module Tool
|
15
|
+
|
16
|
+
class IdentificationTool
|
17
|
+
include Singleton
|
18
|
+
include ::Libis::Tools::Logger
|
19
|
+
|
20
|
+
def self.bad_mimetype(mimetype)
|
21
|
+
self.instance.bad_mimetype(mimetype)
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.run(file, recursive = false)
|
25
|
+
if file.is_a?(Array)
|
26
|
+
return run_list file
|
27
|
+
elsif file.is_a?(String) && File.exists?(file) && File.readable?(file)
|
28
|
+
if File.directory?(file)
|
29
|
+
return run_dir(file, recursive)
|
30
|
+
elsif File.file?(file)
|
31
|
+
return self.instance.run(file)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
raise ArgumentError,
|
35
|
+
'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
|
36
|
+
end
|
37
|
+
|
38
|
+
def self.run_dir(file, recursive = true)
|
39
|
+
self.instance.run_dir file, recursive
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.run_list(filelist)
|
43
|
+
self.instance.run_list filelist
|
44
|
+
end
|
45
|
+
|
46
|
+
protected
|
47
|
+
|
48
|
+
def create_list_file(filelist)
|
49
|
+
list_file = Tempfile.new(%w'file .list')
|
50
|
+
filelist.each do |fname|
|
51
|
+
list_file.write "#{fname}\n"
|
52
|
+
end
|
53
|
+
list_file.close
|
54
|
+
yield(list_file.path)
|
55
|
+
ensure
|
56
|
+
list_file.unlink
|
57
|
+
end
|
58
|
+
|
59
|
+
def find_files(dir, recurse = true)
|
60
|
+
args = []
|
61
|
+
args << '-L'
|
62
|
+
args << dir.escape_for_string
|
63
|
+
args << '-maxdepth' << '1' unless recurse
|
64
|
+
args << '-type' << 'f'
|
65
|
+
args << '-print'
|
66
|
+
output = ::Libis::Tools::Command.run('find', *args)
|
67
|
+
warn "Find command errors: #{output[:err].join("\n")}" unless output[:err].empty?
|
68
|
+
output[:out]
|
69
|
+
end
|
70
|
+
|
71
|
+
# Reformat output to make it easier to post-process and decide on the preferred format
|
72
|
+
#
|
73
|
+
# input format:
|
74
|
+
# [
|
75
|
+
# { filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }
|
76
|
+
# ]
|
77
|
+
#
|
78
|
+
# output format:
|
79
|
+
# { <filename> => [<result>, ...], ... }
|
80
|
+
#
|
81
|
+
# <result> is the enhanced Hash output of the identification tool:
|
82
|
+
# { mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
|
83
|
+
#
|
84
|
+
def process_output(output)
|
85
|
+
output.reduce({}) do |results, x|
|
86
|
+
filepath = x.delete(:filepath)
|
87
|
+
results[filepath] ||= []
|
88
|
+
results[filepath.freeze] << annotate(x)
|
89
|
+
results
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Enhance the output with mimetype and score
|
94
|
+
def annotate(result)
|
95
|
+
# Enhance result with mimetype if needed
|
96
|
+
if bad_mimetypes.include?(result[:mimetype]) && !bad_puids.include?(result[:puid])
|
97
|
+
result[:mimetype] = get_mimetype(result[:puid])
|
98
|
+
end
|
99
|
+
|
100
|
+
# Normalize the mimetype
|
101
|
+
Libis::Format::TypeDatabase.normalize(result, PUID: :puid, MIME: :mimetype)
|
102
|
+
|
103
|
+
# Default score is 5
|
104
|
+
result[:score] = 5
|
105
|
+
|
106
|
+
# Weak detection score is 1
|
107
|
+
result[:score] = 1 if bad_mimetypes.include? result[:mimetype]
|
108
|
+
|
109
|
+
# freeze all strings
|
110
|
+
result.each {|_, v| v.freeze if v.is_a?(String)}
|
111
|
+
|
112
|
+
# Adapt score based on matchtype
|
113
|
+
result[:matchtype] = result[:matchtype].to_s.downcase
|
114
|
+
case result[:matchtype]
|
115
|
+
|
116
|
+
# Signature match increases score with 2
|
117
|
+
when 'signature'
|
118
|
+
result[:score] += 2
|
119
|
+
# typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
|
120
|
+
# ext = File.extname(result[:filename])
|
121
|
+
# result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
|
122
|
+
|
123
|
+
# Container match increases score with 4
|
124
|
+
when 'container'
|
125
|
+
result[:score] += 4
|
126
|
+
# typeinfo = ::Libis::Format::TypeDatabase.puid_typeinfo(result[:puid])
|
127
|
+
# ext = File.extname(result[:filename])
|
128
|
+
# result[:score] += 1 if typeinfo and typeinfo[:EXTENSIONS].include?(ext)
|
129
|
+
|
130
|
+
# Extension match is the weakest identification; score is lowered by 2 points
|
131
|
+
when 'extension'
|
132
|
+
result[:score] -= 2
|
133
|
+
|
134
|
+
# Magic code (file tool) is to be trusted even less
|
135
|
+
when 'magic'
|
136
|
+
result[:score] -= 3
|
137
|
+
|
138
|
+
# Or no change otherwise
|
139
|
+
else
|
140
|
+
# do nothing
|
141
|
+
end
|
142
|
+
|
143
|
+
# Detecting a zip file should decrease the score as it may hide one of the many zip-based formats (e.g. epub,
|
144
|
+
# Office OpenXML, OpenDocument, jar, maff, svx)
|
145
|
+
if result[:mimetype] == 'application/zip'
|
146
|
+
result[:score] -= 2
|
147
|
+
end
|
148
|
+
|
149
|
+
# Return result enhanced with mimetype and score fields
|
150
|
+
result
|
151
|
+
end
|
152
|
+
|
153
|
+
def get_mimetype(puid)
|
154
|
+
::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first rescue nil
|
155
|
+
end
|
156
|
+
|
157
|
+
def get_puid(mimetype)
|
158
|
+
::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first rescue nil
|
159
|
+
end
|
160
|
+
|
161
|
+
attr_accessor :bad_mimetypes, :bad_puids
|
162
|
+
|
163
|
+
def initialize
|
164
|
+
@bad_mimetypes = [nil, '', 'None', 'application/octet-stream']
|
165
|
+
@bad_puids = [nil, 'fmt/unknown']
|
166
|
+
end
|
167
|
+
|
168
|
+
def bad_mimetype(mimetype)
|
169
|
+
@bad_mimetypes << mimetype
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
require 'libis/tools/extend/string'
|
4
|
+
require 'libis/tools/logger'
|
5
|
+
require 'libis/tools/command'
|
6
|
+
|
7
|
+
require 'libis/format/config'
|
8
|
+
|
9
|
+
module Libis
|
10
|
+
module Format
|
11
|
+
module Tool
|
12
|
+
|
13
|
+
class OfficeToPdf
|
14
|
+
include ::Libis::Tools::Logger
|
15
|
+
|
16
|
+
def self.run(source, target, options = {})
|
17
|
+
self.new.run source, target, options
|
18
|
+
end
|
19
|
+
|
20
|
+
def run(source, target, options = {})
|
21
|
+
workdir = '/...'
|
22
|
+
workdir = Dir.tmpdir unless Dir.exist? workdir
|
23
|
+
|
24
|
+
workdir = File.join(workdir, rand(1000000).to_s)
|
25
|
+
FileUtils.mkpath(workdir)
|
26
|
+
|
27
|
+
src_file = File.join(workdir, File.basename(source))
|
28
|
+
FileUtils.symlink source, src_file
|
29
|
+
|
30
|
+
tgt_file = File.join(workdir, File.basename(source, '.*') + '.pdf')
|
31
|
+
|
32
|
+
export_filter = options[:export_filter] || 'pdf'
|
33
|
+
|
34
|
+
result = Libis::Tools::Command.run(
|
35
|
+
Libis::Format::Config[:soffice_path], '--headless',
|
36
|
+
'--convert-to', export_filter,
|
37
|
+
'--outdir', workdir, src_file
|
38
|
+
)
|
39
|
+
|
40
|
+
unless result[:status] == 0
|
41
|
+
warn "PdfConvert errors: #{(result[:err] + result[:out]).join("\n")}"
|
42
|
+
return false
|
43
|
+
end
|
44
|
+
|
45
|
+
FileUtils.copy tgt_file, target, preserve: true
|
46
|
+
FileUtils.rmtree workdir
|
47
|
+
|
48
|
+
result[:out]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|