libis-format 0.9.41 → 0.9.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/README.md +365 -0
  3. data/bin/droid +1 -1
  4. data/bin/fido +1 -1
  5. data/bin/pdf_copy +1 -1
  6. data/lib/libis/format/config.rb +1 -0
  7. data/lib/libis/format/converter/audio_converter.rb +1 -1
  8. data/lib/libis/format/converter/base.rb +2 -1
  9. data/lib/libis/format/converter/office_converter.rb +2 -2
  10. data/lib/libis/format/converter/pdf_converter.rb +6 -6
  11. data/lib/libis/format/converter/video_converter.rb +96 -2
  12. data/lib/libis/format/identifier.rb +12 -12
  13. data/lib/libis/format/tool/droid.rb +108 -0
  14. data/lib/libis/format/tool/extension_identification.rb +58 -0
  15. data/lib/libis/format/tool/ffmpeg.rb +43 -0
  16. data/lib/libis/format/tool/fido.rb +91 -0
  17. data/lib/libis/format/tool/file_tool.rb +78 -0
  18. data/lib/libis/format/tool/identification_tool.rb +175 -0
  19. data/lib/libis/format/tool/office_to_pdf.rb +54 -0
  20. data/lib/libis/format/tool/pdf_copy.rb +42 -0
  21. data/lib/libis/format/tool/pdf_merge.rb +43 -0
  22. data/lib/libis/format/tool/pdf_optimizer.rb +38 -0
  23. data/lib/libis/format/tool/pdf_split.rb +41 -0
  24. data/lib/libis/format/tool/pdf_to_pdfa.rb +78 -0
  25. data/lib/libis/format/tool/pdfa_validator.rb +63 -0
  26. data/lib/libis/format/tool.rb +23 -0
  27. data/lib/libis/format/version.rb +1 -1
  28. data/lib/libis/format.rb +1 -15
  29. data/libis-format.gemspec +1 -2
  30. data/spec/converter_audio_spec.rb +66 -0
  31. data/spec/converter_image_spec.rb +166 -0
  32. data/spec/converter_office_spec.rb +84 -0
  33. data/spec/converter_pdf_spec.rb +30 -0
  34. data/spec/converter_repository_spec.rb +91 -0
  35. data/spec/converter_video_spec.rb +97 -0
  36. data/spec/data/video/copyright.png +0 -0
  37. data/spec/identifier_spec.rb +3 -15
  38. metadata +32 -33
  39. data/lib/libis/format/droid.rb +0 -106
  40. data/lib/libis/format/extension_identification.rb +0 -55
  41. data/lib/libis/format/ffmpeg.rb +0 -41
  42. data/lib/libis/format/fido.rb +0 -89
  43. data/lib/libis/format/file_tool.rb +0 -76
  44. data/lib/libis/format/identification_tool.rb +0 -174
  45. data/lib/libis/format/office_to_pdf.rb +0 -52
  46. data/lib/libis/format/pdf_copy.rb +0 -40
  47. data/lib/libis/format/pdf_merge.rb +0 -41
  48. data/lib/libis/format/pdf_optimizer.rb +0 -36
  49. data/lib/libis/format/pdf_split.rb +0 -39
  50. data/lib/libis/format/pdf_to_pdfa.rb +0 -74
  51. data/lib/libis/format/pdfa_validator.rb +0 -61
  52. data/spec/converter_spec.rb +0 -433
@@ -1,89 +0,0 @@
1
- require 'libis/tools/extend/string'
2
- require 'libis/tools/command'
3
-
4
- require 'csv'
5
- require 'libis/format/config'
6
-
7
- require_relative 'identification_tool'
8
-
9
module Libis
  module Format

    # Identification tool that delegates to the external Fido command and
    # turns its CSV output into identification result hashes.
    class Fido < Libis::Format::IdentificationTool

      # Register an extra format definition file, unless it is already registered.
      def self.add_formats(formats_file)
        registered = instance.formats
        registered << formats_file unless registered.include?(formats_file)
      end

      # Unregister a previously added format definition file.
      def self.del_formats(formats_file)
        instance.formats.delete(formats_file)
      end

      attr_reader :formats

      # Identify every file named in +filelist+ by handing Fido a list file.
      def run_list(filelist)
        create_list_file(filelist) do |list_file|
          process_output(runner(nil, '-input', list_file.escape_for_string))
        end
      end

      # Identify the files in +dir+; passes '-recurse' to Fido when +recursive+ is set.
      def run_dir(dir, recursive = true)
        extra_args = recursive ? ['-recurse'] : []
        process_output(runner(dir, *extra_args))
      end

      # Identify a single file.
      def run(file)
        process_output(runner(file))
      end

      protected

      # Start from the configured format definition files and flag two
      # OpenDocument MIME types as unreliable.
      def initialize
        super
        @formats = Libis::Format::Config[:fido_formats].dup
        [
          'application/vnd.oasis.opendocument.text',
          'application/vnd.oasis.opendocument.spreadsheet'
        ].each { |mimetype| bad_mimetype(mimetype) }
      end

      attr_writer :formats

      # Build the Fido command line, execute it and parse the CSV output.
      #
      # Returns an array of hashes (one per line with status 'OK') with the
      # keys :puid, :format_name, :format_version, :filepath, :mimetype,
      # :matchtype and :source. Raises RuntimeError when Fido wrote anything
      # to its error stream.
      def runner(filename, *args)
        # Custom format definitions, if any
        args << '-loadformats' << formats.join(',').to_s unless formats.empty?

        # Workaround for Fido performance bug
        args << '-bufsize' << '1000'

        # The file or directory to inspect is optional (list mode passes nil)
        args << filename.escape_for_string.to_s if filename

        # Suppress header output
        args << '-q'

        fido = ::Libis::Tools::Command.run(Libis::Format::Config[:fido_path], *args)
        raise RuntimeError, "Fido errors: #{fido[:err].join("\n")}" unless fido[:err].empty?

        keys = [:status, :time, :puid, :format_name, :format_version, :filesize, :filepath, :mimetype, :matchtype]
        CSV.parse(fido[:out].join("\n")).each_with_object([]) do |row, matches|
          record = Hash[keys.zip(row)]
          next unless record[:status] == 'OK'

          # Bookkeeping columns are of no use to the caller
          [:time, :status, :filesize].each { |key| record.delete(key) }
          record[:source] = :fido
          matches << record
        end
      end

    end

  end
end
@@ -1,76 +0,0 @@
1
- require_relative 'identification_tool'
2
-
3
module Libis
  module Format

    # Identification tool that runs the UNIX `file` command in MIME-type mode.
    class FileTool < Libis::Format::IdentificationTool

      # Identify every file named in +filelist+ via a temporary list file.
      def run_list(filelist)
        create_list_file(filelist) do |list_file|
          process_output(runner(nil, '--files-from', list_file))
        end
      end

      # Identify the regular files found in +dir+ (optionally recursing).
      def run_dir(dir, recursive = true)
        run_list(find_files(dir, recursive))
      end

      # Identify a single file.
      def run(file)
        process_output(runner(file))
      end

      protected

      # Run `file` and parse its output.
      #
      # @param filename [String, nil] file to inspect; nil when the files are
      #   supplied through extra arguments such as '--files-from'
      # @param args [Array<String>] extra command-line arguments
      # @return [Array<Hash>] one hash per output line with the keys
      #   :filepath, :mimetype, :matchtype ('magic') and :source (:file)
      # @raise [RuntimeError] when the command wrote to its error stream
      def runner(filename, *args)
        # -L follows symlinks, --mime-type prints only the MIME type
        opts = ['-L', '--mime-type', *args]
        opts << filename.escape_for_string if filename

        file_tool = ::Libis::Tools::Command.run('file', *opts)
        raise RuntimeError, "File command errors: #{file_tool[:err].join("\n")}" unless file_tool[:err].empty?

        file_tool[:out].map { |line| parse_line(line) }
      end

      # Split one "<path>: <mimetype>" output line into a result hash.
      #
      # The split is done on the LAST colon-plus-whitespace separator, so a
      # path that itself contains ": " is kept intact instead of being
      # truncated with the rest of the path reported as the MIME type.
      def parse_line(line)
        filepath, separator, mimetype = line.rpartition(/:\s+/)
        if separator.empty?
          # No separator at all: treat the whole line as the path
          filepath = line
          mimetype = nil
        elsif mimetype.empty?
          mimetype = nil
        end
        {filepath: filepath, mimetype: mimetype, matchtype: 'magic', source: :file}
      end

    end

  end
end
@@ -1,174 +0,0 @@
1
- require 'csv'
2
- require 'tmpdir'
3
-
4
- require 'singleton'
5
- require 'libis/tools/extend/string'
6
- require 'libis/tools/logger'
7
- require 'libis/tools/command'
8
-
9
- require 'libis/format/config'
10
- require 'libis/format/type_database'
11
-
12
module Libis
  module Format

    # Common base for the format identification tools. Each tool is a
    # Singleton; the class-level methods forward to the single instance.
    # Subclasses are expected to provide the instance methods #run,
    # #run_dir and #run_list.
    class IdentificationTool
      include Singleton
      include ::Libis::Tools::Logger

      # Mark +mimetype+ as unreliable for this tool.
      def self.bad_mimetype(mimetype)
        # The instance method is protected and cannot be called with an
        # explicit receiver from class scope, hence the #send.
        instance.send(:bad_mimetype, mimetype)
      end

      # Identify a file, a directory or a list of files.
      #
      # @param file [String, Array<String>] path or list of paths
      # @param recursive [Boolean] recurse into sub-directories (directories only)
      # @raise [ArgumentError] when +file+ is neither an Array nor a readable
      #   existing file or directory
      def self.run(file, recursive = false)
        return run_list(file) if file.is_a?(Array)

        # File.exists? was removed in Ruby 3.2; File.exist? works on all versions
        if file.is_a?(String) && File.exist?(file) && File.readable?(file)
          return run_dir(file, recursive) if File.directory?(file)
          return instance.run(file) if File.file?(file)
        end

        raise ArgumentError,
              'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
      end

      def self.run_dir(file, recursive = true)
        instance.run_dir file, recursive
      end

      def self.run_list(filelist)
        instance.run_list filelist
      end

      protected

      # Write +filelist+ (one name per line) to a temporary list file and
      # yield its path. Returns the value of the block.
      #
      # The list file is created inside a private temporary directory that
      # Dir.mktmpdir removes when the block finishes or raises, so nothing is
      # left behind and no file is created in the current working directory.
      def create_list_file(filelist)
        Dir.mktmpdir('libis-format') do |tmp_dir|
          list_file = File.join(tmp_dir, 'file.list')
          File.open(list_file, 'w') do |f|
            filelist.each { |fname| f.write "#{fname}\n" }
          end
          yield(list_file)
        end
      end

      # List the regular files below +dir+ with the `find` command
      # (symlinks are followed; depth is limited to 1 unless +recurse+).
      # Anything written to the error stream is reported with #warn.
      def find_files(dir, recurse = true)
        args = ['-L', dir.escape_for_string]
        args << '-maxdepth' << '1' unless recurse
        args << '-type' << 'f' << '-print'
        output = ::Libis::Tools::Command.run('find', *args)
        warn "Find command errors: #{output[:err].join("\n")}" unless output[:err].empty?
        output[:out]
      end

      # Reformat output to make it easier to post-process and decide on the preferred format
      #
      # input format:
      #   [
      #     { filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }
      #   ]
      #
      # output format:
      #   { <filename> => [<result>, ...], ... }
      #
      # <result> is the enhanced Hash output of the identification tool:
      #   { mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
      #
      # Note that the :filepath key is removed from the input hashes.
      def process_output(output)
        output.each_with_object({}) do |entry, results|
          filepath = entry.delete(:filepath).freeze
          (results[filepath] ||= []) << annotate(entry)
        end
      end

      # Enhance the result with a (normalized) mimetype and a score.
      #
      # Scoring: 5 by default, 1 when the mimetype is on the bad list; then
      # +2 for a signature match, +4 for a container match, -2 for an
      # extension match, -3 for a magic (file tool) match, and a further -2
      # for 'application/zip' because it may hide one of the many zip-based
      # formats (e.g. epub, Office OpenXML, OpenDocument, jar, maff, svx).
      def annotate(result)
        # Replace an unreliable mimetype by the one registered for the PUID
        if bad_mimetypes.include?(result[:mimetype]) && !bad_puids.include?(result[:puid])
          result[:mimetype] = get_mimetype(result[:puid])
        end

        Libis::Format::TypeDatabase.normalize(result, PUID: :puid, MIME: :mimetype)

        result[:score] = bad_mimetypes.include?(result[:mimetype]) ? 1 : 5

        # Freeze all strings
        result.each_value { |v| v.freeze if v.is_a?(String) }

        result[:matchtype] = result[:matchtype].to_s.downcase
        case result[:matchtype]
        when 'signature' then result[:score] += 2
        when 'container' then result[:score] += 4
        when 'extension' then result[:score] -= 2
        when 'magic' then result[:score] -= 3
        end

        result[:score] -= 2 if result[:mimetype] == 'application/zip'

        result
      end

      # First MIME type registered for +puid+, or nil when the lookup fails.
      def get_mimetype(puid)
        ::Libis::Format::TypeDatabase.puid_typeinfo(puid)[:MIME].first
      rescue StandardError
        nil
      end

      # First PUID registered for +mimetype+, or nil when the lookup fails.
      def get_puid(mimetype)
        ::Libis::Format::TypeDatabase.mime_infos(mimetype).first[:PUID].first
      rescue StandardError
        nil
      end

      attr_accessor :bad_mimetypes, :bad_puids

      def initialize
        @bad_mimetypes = [nil, '', 'None', 'application/octet-stream']
        @bad_puids = [nil, 'fmt/unknown']
      end

      # Add +mimetype+ to the list of unreliable mimetypes.
      def bad_mimetype(mimetype)
        @bad_mimetypes << mimetype
      end
    end

  end
end
@@ -1,52 +0,0 @@
1
- require 'fileutils'
2
-
3
- require 'libis/tools/extend/string'
4
- require 'libis/tools/logger'
5
- require 'libis/tools/command'
6
-
7
- require 'libis/format/config'
8
-
9
module Libis
  module Format

    # Converts an office document to PDF by running the configured
    # soffice executable in headless mode.
    class OfficeToPdf
      include ::Libis::Tools::Logger

      def self.run(source, target, options = {})
        self.new.run source, target, options
      end

      # Convert +source+ and copy the result to +target+.
      #
      # @param source [String] path of the document to convert
      # @param target [String] path the converted file is copied to
      # @param options [Hash] :export_filter selects the soffice
      #   '--convert-to' filter (defaults to 'pdf')
      # @return [Array<String>, false] the command's output lines, or false
      #   when soffice exited with a non-zero status
      def run(source, target, options = {})
        # Preferred base directory; fall back to the system temp dir when absent
        workdir = '/...'
        workdir = Dir.tmpdir unless Dir.exist? workdir

        workdir = File.join(workdir, rand(1000000).to_s)
        FileUtils.mkpath(workdir)

        begin
          # The link target must be absolute: a relative path would be
          # resolved against the work directory and leave a dangling link.
          src_file = File.join(workdir, File.basename(source))
          FileUtils.symlink File.absolute_path(source), src_file

          tgt_file = File.join(workdir, File.basename(source, '.*') + '.pdf')

          export_filter = options[:export_filter] || 'pdf'

          result = Libis::Tools::Command.run(
              Libis::Format::Config[:soffice_path], '--headless',
              '--convert-to', export_filter,
              '--outdir', workdir, src_file
          )

          unless result[:status] == 0
            warn "PdfConvert errors: #{(result[:err] + result[:out]).join("\n")}"
            return false
          end

          FileUtils.copy tgt_file, target, preserve: true

          result[:out]
        ensure
          # Always remove the work directory, also on failure or exception
          FileUtils.rmtree workdir
        end
      end
    end

  end
end
@@ -1,40 +0,0 @@
1
- require 'os'
2
-
3
- require 'libis/tools/extend/string'
4
- require 'libis/tools/logger'
5
- require 'libis/tools/command'
6
-
7
- require 'libis/format/config'
8
-
9
module Libis
  module Format

    # Runs the 'CopyPdf' entry point of the bundled PdfTool.jar.
    class PdfCopy
      include ::Libis::Tools::Logger

      def self.run(source, target, options = [])
        new.run source, target, options
      end

      # Invoke CopyPdf with +source+ as input and +target+ as output;
      # +options+ are appended to the command line unchanged.
      # Returns whatever Libis::Tools::Command.run returns.
      def run(source, target, options = [])
        # The tools directory sits three levels above this file
        tool_dir = File.absolute_path('../../../tools', File.dirname(__FILE__))
        jar_file = File.join(tool_dir, 'PdfTool.jar')

        if OS.java?
          # TODO: import library and execute in current VM. For now do exactly as in MRI.
        end

        java_args = [
          '-cp', jar_file,
          'CopyPdf',
          '--file_input', source,
          '--file_output', target
        ]
        java_args.concat(options)

        Libis::Tools::Command.run(Libis::Format::Config[:java_path], *java_args)
      end
    end

  end
end
@@ -1,41 +0,0 @@
1
- require 'os'
2
-
3
- require 'libis/tools/extend/string'
4
- require 'libis/tools/logger'
5
- require 'libis/tools/command'
6
-
7
- require 'libis/format/config'
8
-
9
module Libis
  module Format

    # Runs the 'MergePdf' entry point of the bundled PdfTool.jar.
    class PdfMerge
      include ::Libis::Tools::Logger

      def self.run(source, target, options = [])
        new.run source, target, options
      end

      # Invoke MergePdf writing to +target+. +source+ may be a single path
      # or an Array of paths; +options+ are placed before the input files.
      # Returns whatever Libis::Tools::Command.run returns.
      def run(source, target, options = [])
        # The tools directory sits three levels above this file
        tool_dir = File.absolute_path('../../../tools', File.dirname(__FILE__))
        jar_file = File.join(tool_dir, 'PdfTool.jar')
        inputs = source.is_a?(Array) ? source : [source]

        if OS.java?
          # TODO: import library and execute in current VM. For now do exactly as in MRI.
        end

        java_args = ['-cp', jar_file, 'MergePdf', '--file_output', target]
        java_args.concat(options)
        java_args.concat(inputs)

        Libis::Tools::Command.run(Libis::Format::Config[:java_path], *java_args)
      end
    end

  end
end
@@ -1,36 +0,0 @@
1
- require 'os'
2
-
3
- require 'libis/tools/extend/string'
4
- require 'libis/tools/logger'
5
- require 'libis/tools/command'
6
-
7
- require 'libis/format/config'
8
-
9
module Libis
  module Format

    # Rewrites a PDF through Ghostscript's pdfwrite device.
    class PdfOptimizer
      include ::Libis::Tools::Logger

      def self.run(source, target, quality)
        self.new.run source, target, quality
      end

      # Run Ghostscript on +source+, writing the result to +target+.
      #
      # @param source [String] input PDF path
      # @param target [String] output PDF path
      # @param quality [#to_s] value substituted into '-dPDFSETTINGS=/<quality>'
      # @return whatever Libis::Tools::Command.run returns
      def run(source, target, quality)
        # Use the configured Ghostscript executable like the PDF/A converter
        # does; fall back to a plain 'gs' lookup when nothing is configured.
        ghostscript = Libis::Format::Config[:ghostscript_path] || 'gs'

        Libis::Tools::Command.run(
            ghostscript,
            '-sDEVICE=pdfwrite',
            '-dCompatibilityLevel=1.4',
            "-dPDFSETTINGS=/#{quality}",
            '-dNOPAUSE',
            '-dBATCH',
            "-sOutputFile=#{target}",
            source.to_s
        )
      end
    end

  end
end
@@ -1,39 +0,0 @@
1
- require 'os'
2
-
3
- require 'libis/tools/extend/string'
4
- require 'libis/tools/logger'
5
- require 'libis/tools/command'
6
-
7
- require 'libis/format/config'
8
-
9
module Libis
  module Format

    # Runs the 'SplitPdf' entry point of the bundled PdfTool.jar.
    class PdfSplit
      include ::Libis::Tools::Logger

      def self.run(source, target, options = [])
        new.run source, target, options
      end

      # Invoke SplitPdf with +source+ as input and +target+ as output;
      # +options+ are appended to the command line unchanged.
      # Returns whatever Libis::Tools::Command.run returns.
      def run(source, target, options = [])
        # The tools directory sits three levels above this file
        tool_dir = File.absolute_path('../../../tools', File.dirname(__FILE__))
        jar_file = File.join(tool_dir, 'PdfTool.jar')

        if OS.java?
          # TODO: import library and execute in current VM. For now do exactly as in MRI.
        end

        java_args = [
          '-cp', jar_file,
          'SplitPdf',
          '--file_input', source,
          '--file_output', target
        ]
        java_args.concat(options)

        Libis::Tools::Command.run(Libis::Format::Config[:java_path], *java_args)
      end
    end

  end
end
@@ -1,74 +0,0 @@
1
- require 'tempfile'
2
- require 'csv'
3
- require 'fileutils'
4
-
5
- require 'libis/tools/extend/string'
6
- require 'libis/tools/logger'
7
- require 'libis/tools/command'
8
-
9
- require 'libis/format'
10
-
11
module Libis
  module Format

    # Converts a PDF to PDF/A with Ghostscript and validates the result.
    class PdfToPdfa
      include ::Libis::Tools::Logger

      def self.run(source, target = nil, options = {})
        self.new.run source, target, options
      end

      # Convert +source+ to PDF/A.
      #
      # @param source [String] input PDF path
      # @param target [String, nil] output path; when nil a randomly named
      #   '.pdf' file in the system temp directory is used
      # @param options [Hash, nil] :colorspace selects the ICC profile
      #   ('cmyk', anything else means RGB)
      # @return the Ghostscript command result; when validation of the
      #   generated file fails its :status is set to -999 and a message is
      #   appended to :err
      def run(source, target = nil, options = nil)
        # The default used to be dereferenced as a Hash and crashed on nil
        options ||= {}

        target ||= File.join(Dir.tmpdir, "#{File.basename(source, '.*')}#{random_suffix}.pdf")

        icc_info = icc_options(options[:colorspace])

        icc_file = File.join(Dir.tmpdir, "#{icc_info[:icc_name]}#{random_suffix}.icc")
        def_filename = File.join(Dir.tmpdir, "PDFA_def_#{random_suffix}.ps")

        begin
          # Ghostscript gets a private copy of the ICC profile
          FileUtils.cp(File.join(Libis::Format::DATA_DIR, "#{icc_info[:icc_name]}.icc"), icc_file)

          # Fill the placeholders of the PDF/A definition template
          File.open(def_filename, 'w') do |f|
            f.puts File.read(File.join(Libis::Format::DATA_DIR, 'PDFA_def.ps')).
                gsub('[** Fill in ICC profile location **]', icc_file).
                gsub('[** Fill in ICC reference name **]', icc_info[:icc_ref])
          end

          result = Libis::Tools::Command.run(
              Libis::Format::Config[:ghostscript_path],
              '-dBATCH', '-dNOPAUSE', '-dNOOUTERSAVE',
              '-sColorConversionStrategy=/UseDeviceIndependentColor',
              "-sProcessColorModel=#{icc_info[:device]}",
              '-sDEVICE=pdfwrite', '-dPDFA', '-dPDFACompatibilityPolicy=1',
              "-sOutputICCProfile=#{icc_file}",
              '-o', File.absolute_path(target),
              def_filename,
              source
          )
        ensure
          # Remove the temporary files even when the conversion raised
          FileUtils.rm [icc_file, def_filename], force: true
        end

        unless PdfaValidator.run(target)
          result[:status] = -999
          result[:err] << 'Failed to validate generated PDF/A file.'
        end

        result
      end

      private

      # ICC profile name, ICC reference name and Ghostscript color model
      # for the requested colorspace ('cmyk' or, by default, RGB).
      def icc_options(colorspace)
        case colorspace.to_s.downcase
        when 'cmyk'
          {icc_name: 'ISOcoated_v2_eci', icc_ref: 'FOGRA39L', device: 'DeviceCMYK'}
        else
          {icc_name: 'eciRGB_v2', icc_ref: 'sRGB', device: 'DeviceRGB'}
        end
      end

      # 24 hexadecimal characters used to make temporary file names unique.
      def random_suffix
        Random.new.bytes(12).unpack('H*').first
      end

    end

  end
end
@@ -1,61 +0,0 @@
1
- require 'fileutils'
2
-
3
- require 'libis/tools/extend/string'
4
- require 'libis/tools/logger'
5
- require 'libis/tools/command'
6
-
7
- require 'libis/format/config'
8
-
9
module Libis
  module Format

    # Validates a PDF against PDF/A-1B constraints, either with the tool
    # configured as :pdfa_path or, when none is configured, with the bundled
    # PDFBox preflight jar.
    class PdfaValidator
      include ::Libis::Tools::Logger

      def self.run(source)
        self.new.run source
      end

      # @param source [String] path of the PDF file to validate
      # @return [Boolean] true when the file validated; false otherwise
      #   (the validator output is reported with #warn)
      def run(source)
        # Resolve before changing directory so relative paths keep working
        src_file = File.absolute_path(source)

        if (pdfa = Libis::Format::Config[:pdfa_path])
          # Keep it clean: tool generates fontconfig/ cache dir in current working dir.
          # The block form of Dir.chdir restores the previous directory even
          # when the command raises.
          result = Dir.chdir(Dir.tmpdir) do
            Libis::Tools::Command.run(
                pdfa,
                '--noxml',
                '--level', 'B',
                '--verb', '0',
                src_file
            )
          end
          passed = result[:out].any? { |line| line =~ /^VLD-\[PASS\]/ }
        else
          jar = File.join(Libis::Format::ROOT_DIR, 'tools', 'pdfbox', 'preflight-app-1.8.10.jar')
          result = Libis::Tools::Command.run(
              Libis::Format::Config[:java_path],
              '-jar', jar,
              src_file
          )
          passed = result[:status] == 0
        end

        return true if passed

        warn "Validator failed to validate the PDF file '%s' against PDF/A-1B constraints:\n%s", source,
             result[:out].join("\n")
        false
      end
    end

  end
end