format_parser 0.3.5 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/archive.rb +36 -0
- data/lib/attributes_json.rb +9 -5
- data/lib/document.rb +1 -0
- data/lib/format_parser.rb +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/pdf_parser.rb +76 -0
- data/lib/parsers/zip_parser.rb +39 -0
- data/lib/parsers/zip_parser/file_reader.rb +485 -0
- data/spec/attributes_json_spec.rb +19 -1
- data/spec/parsers/pdf_parser_spec.rb +68 -0
- data/spec/parsers/zip_parser_spec.rb +68 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
|
4
|
+
data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
|
7
|
+
data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
|
data/lib/archive.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'ks'
|
2
|
+
|
3
|
+
module FormatParser
  # File information container for archive formats. Holds the recognized
  # format and the list of entries found in the archive.
  class Archive
    include FormatParser::AttributesJSON

    # A single item inside the archive, described by its type, size and filename.
    class Entry < Ks.strict(:type, :size, :filename)
      # Serializes the entry as the JSON form of its attribute hash.
      def to_json(*a)
        to_h.to_json(*a)
      end
    end

    NATURE = :archive

    # What filetype was recognized? Will contain a non-ambiguous symbol
    # referring to the file format. The symbol can be used as a filename
    # extension safely
    attr_accessor :format

    # Array of Entry structs
    attr_accessor :entries

    # If a parser wants to provide any extra information to the caller
    # it can be placed here
    attr_accessor :intrinsics

    # Only permits assignments via defined accessors: every keyword is sent
    # to the matching public writer, so an unknown key raises NoMethodError.
    #
    # @param attributes[Hash] attribute names mapped to their values
    def initialize(**attributes)
      # `each` rather than `map`: the writers are called for their side
      # effect only, there is no result worth collecting.
      attributes.each { |name, value| public_send("#{name}=", value) }
    end

    # @return [Symbol] the nature of the parsed file, always NATURE
    def nature
      NATURE
    end
  end
end
|
data/lib/attributes_json.rb
CHANGED
@@ -14,7 +14,7 @@ module FormatParser::AttributesJSON
|
|
14
14
|
|
15
15
|
# Implements a sane default `as_json` for an object
|
16
16
|
# that has accessors defined
|
17
|
-
def as_json(
|
17
|
+
def as_json(root: false)
|
18
18
|
h = {}
|
19
19
|
h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
|
20
20
|
methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
|
@@ -24,11 +24,15 @@ module FormatParser::AttributesJSON
|
|
24
24
|
# by the caller
|
25
25
|
h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
|
26
26
|
end
|
27
|
+
if root
|
28
|
+
{'format_parser_file_info' => h}
|
29
|
+
else
|
30
|
+
h
|
31
|
+
end
|
27
32
|
end
|
28
33
|
|
29
|
-
# Implements to_json with sane defaults
|
30
|
-
|
31
|
-
|
32
|
-
generator_state.generate(as_json)
|
34
|
+
# Implements to_json with sane defaults, with or without arguments
|
35
|
+
def to_json(*maybe_generator_state)
|
36
|
+
as_json(root: false).to_json(*maybe_generator_state)
|
33
37
|
end
|
34
38
|
end
|
data/lib/document.rb
CHANGED
data/lib/format_parser.rb
CHANGED
@@ -0,0 +1,76 @@
|
|
1
|
+
class FormatParser::PDFParser
|
2
|
+
include FormatParser::IOUtils
|
3
|
+
|
4
|
+
# First 9 bytes of a PDF should be in this format, according to:
|
5
|
+
#
|
6
|
+
# https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
|
7
|
+
#
|
8
|
+
# There are however exceptions, which are left out for now.
|
9
|
+
#
|
10
|
+
PDF_MARKER = /%PDF-1\.[0-8]{1}/
|
11
|
+
|
12
|
+
# Page counts have different markers depending on
|
13
|
+
# the PDF type. There is not a single common way of solving
|
14
|
+
# this. The only way of solving this correctly is by adding
|
15
|
+
# different types of PDF's in the specs.
|
16
|
+
#
|
17
|
+
COUNT_MARKERS = ['Count ']
|
18
|
+
EOF_MARKER = '%EOF'
|
19
|
+
|
20
|
+
def call(io)
|
21
|
+
io = FormatParser::IOConstraint.new(io)
|
22
|
+
|
23
|
+
return unless safe_read(io, 9) =~ PDF_MARKER
|
24
|
+
|
25
|
+
attributes = scan_for_attributes(io)
|
26
|
+
|
27
|
+
FormatParser::Document.new(
|
28
|
+
format: :pdf,
|
29
|
+
page_count: attributes[:page_count]
|
30
|
+
)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
# Read ahead bytes until one of % or / is reached.
|
36
|
+
# A header in a PDF always starts with a /
|
37
|
+
# The % is to detect the EOF
|
38
|
+
#
|
39
|
+
def scan_for_attributes(io)
|
40
|
+
result = {}
|
41
|
+
|
42
|
+
while read = safe_read(io, 1)
|
43
|
+
case read
|
44
|
+
when '%'
|
45
|
+
break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
|
46
|
+
when '/'
|
47
|
+
find_page_count(io, result)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
result
|
52
|
+
end
|
53
|
+
|
54
|
+
def find_page_count(io, result)
|
55
|
+
COUNT_MARKERS.each do |marker|
|
56
|
+
if safe_read(io, marker.size) == marker
|
57
|
+
result[:page_count] = read_numbers(io)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# Read ahead bytes until no more numbers are found
|
63
|
+
# This assumes that the position of io starts at a
|
64
|
+
# number
|
65
|
+
def read_numbers(io)
|
66
|
+
numbers = ''
|
67
|
+
|
68
|
+
while c = safe_read(io, 1)
|
69
|
+
c =~ /\d+/ ? numbers << c : break
|
70
|
+
end
|
71
|
+
|
72
|
+
numbers.to_i
|
73
|
+
end
|
74
|
+
|
75
|
+
FormatParser.register_parser self, natures: :document, formats: :pdf
|
76
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Recognizes ZIP archives and reports their entries (type, uncompressed size
# and filename) as a FormatParser::Archive.
class FormatParser::ZIPParser
  require_relative 'zip_parser/file_reader'

  # Bit 11 of the general-purpose flags. When it is set, the entry
  # filename can be treated as UTF-8.
  EFS_FLAG = 0b100000000000
  private_constant :EFS_FLAG

  # Reads the ZIP structure from the given IO.
  #
  # @param io the IO to parse; it gets wrapped in a FormatParser::IOConstraint
  # @return [FormatParser::Archive, nil] the archive information, or nil when
  #   the reader raised a FileReader::Error (not a ZIP, or a broken ZIP)
  def call(io)
    reader = FileReader.new
    entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))

    entries_archive = entries.map do |ze|
      FormatParser::Archive::Entry.new(
        type: directory?(ze) ? :directory : :file,
        size: ze.uncompressed_size,
        filename: decode_filename(ze)
      )
    end

    FormatParser::Archive.new(format: :zip, entries: entries_archive)
  rescue FileReader::Error
    # This is not a ZIP, or a broken ZIP.
    nil
  end

  # Tells whether the entry is a directory.
  #
  # We can do a lap dance here and parse out the individual bit fields
  # from the external attributes, check the OS type that is in the entry
  # to see if it can be interpreted as UNIX or not, and generally have
  # heaps of fun. Instead, we will be frugal and only look for a trailing slash.
  def directory?(zip_entry)
    zip_entry.filename.end_with?('/')
  end

  # Returns the entry filename as a UTF-8 string.
  #
  # Bytes that cannot be represented are replaced instead of raising, so
  # that a mangled filename cannot abort parsing with an exception
  # that `call` does not rescue.
  def decode_filename(zip_entry)
    raw_name = zip_entry.filename
    if zip_entry.gp_flags & EFS_FLAG == EFS_FLAG
      # Retag the bytes as UTF-8 and replace malformed sequences.
      # `unpack('U*')` would raise ArgumentError on those.
      raw_name.dup.force_encoding(Encoding::UTF_8).scrub
    else
      raw_name.encode(Encoding::UTF_8, undefined: :replace)
    end
  end

  FormatParser.register_parser self, natures: [:archive, :document], formats: :zip
end
|
@@ -0,0 +1,485 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
# A very barebones ZIP file reader
|
6
|
+
class FormatParser::ZIPParser::FileReader
|
7
|
+
Error = Class.new(StandardError)
|
8
|
+
ReadError = Class.new(Error)
|
9
|
+
UnsupportedFeature = Class.new(Error)
|
10
|
+
InvalidStructure = Class.new(Error)
|
11
|
+
LocalHeaderPending = Class.new(Error) do
|
12
|
+
def message
|
13
|
+
'The compressed data offset is not available (local header has not been read)'
|
14
|
+
end
|
15
|
+
end
|
16
|
+
MissingEOCD = Class.new(Error) do
|
17
|
+
def message
|
18
|
+
'Could not find the EOCD signature in the buffer - maybe a malformed ZIP file'
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
C_UINT32LE = 'V'
|
23
|
+
C_UINT16LE = 'v'
|
24
|
+
C_UINT64LE = 'Q<'
|
25
|
+
|
26
|
+
# To prevent too many tiny reads, read the maximum possible size of end of
|
27
|
+
# central directory record upfront (all the fixed fields + at most 0xFFFF
|
28
|
+
# bytes of the archive comment)
|
29
|
+
MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
|
30
|
+
begin
|
31
|
+
4 + # Offset of the start of central directory
|
32
|
+
4 + # Size of the central directory
|
33
|
+
2 + # Number of files in the cdir
|
34
|
+
4 + # End-of-central-directory signature
|
35
|
+
2 + # Number of this disk
|
36
|
+
2 + # Number of disk with the start of cdir
|
37
|
+
2 + # Number of files in the cdir of this disk
|
38
|
+
2 + # The comment size
|
39
|
+
0xFFFF # Maximum comment size
|
40
|
+
end
|
41
|
+
|
42
|
+
# To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
|
43
|
+
# The maximum size is all the usual items, plus the maximum size
|
44
|
+
# of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
|
45
|
+
MAX_LOCAL_HEADER_SIZE =
|
46
|
+
begin
|
47
|
+
4 + # signature
|
48
|
+
2 + # Version needed to extract
|
49
|
+
2 + # gp flags
|
50
|
+
2 + # storage mode
|
51
|
+
2 + # dos time
|
52
|
+
2 + # dos date
|
53
|
+
4 + # CRC32
|
54
|
+
4 + # Comp size
|
55
|
+
4 + # Uncomp size
|
56
|
+
2 + # Filename size
|
57
|
+
2 + # Extra fields size
|
58
|
+
0xFFFF + # Maximum filename size
|
59
|
+
0xFFFF # Maximum extra fields size
|
60
|
+
end
|
61
|
+
|
62
|
+
SIZE_OF_USABLE_EOCD_RECORD =
|
63
|
+
begin
|
64
|
+
4 + # Signature
|
65
|
+
2 + # Number of this disk
|
66
|
+
2 + # Number of the disk with the EOCD record
|
67
|
+
2 + # Number of entries in the central directory of this disk
|
68
|
+
2 + # Number of entries in the central directory total
|
69
|
+
4 + # Size of the central directory
|
70
|
+
4 # Start of the central directory offset
|
71
|
+
end
|
72
|
+
|
73
|
+
private_constant :C_UINT32LE, :C_UINT16LE, :C_UINT64LE, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
|
74
|
+
:MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
|
75
|
+
|
76
|
+
# Represents a file within the ZIP archive being read
|
77
|
+
class ZipEntry
|
78
|
+
include FormatParser::AttributesJSON
|
79
|
+
|
80
|
+
# @return [Fixnum] bit-packed version signature of the program that made the archive
|
81
|
+
attr_accessor :made_by
|
82
|
+
|
83
|
+
# @return [Fixnum] ZIP version support needed to extract this file
|
84
|
+
attr_accessor :version_needed_to_extract
|
85
|
+
|
86
|
+
# @return [Fixnum] bit-packed general purpose flags
|
87
|
+
attr_accessor :gp_flags
|
88
|
+
|
89
|
+
# @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
|
90
|
+
attr_accessor :storage_mode
|
91
|
+
|
92
|
+
# @return [Fixnum] the bit-packed DOS time
|
93
|
+
attr_accessor :dos_time
|
94
|
+
|
95
|
+
# @return [Fixnum] the bit-packed DOS date
|
96
|
+
attr_accessor :dos_date
|
97
|
+
|
98
|
+
# @return [Fixnum] the CRC32 checksum of this file
|
99
|
+
attr_accessor :crc32
|
100
|
+
|
101
|
+
# @return [Fixnum] size of compressed file data in the ZIP
|
102
|
+
attr_accessor :compressed_size
|
103
|
+
|
104
|
+
# @return [Fixnum] size of the file once uncompressed
|
105
|
+
attr_accessor :uncompressed_size
|
106
|
+
|
107
|
+
# @return [String] the filename
|
108
|
+
attr_accessor :filename
|
109
|
+
|
110
|
+
# @return [Fixnum] disk number where this file starts
|
111
|
+
attr_accessor :disk_number_start
|
112
|
+
|
113
|
+
# @return [Fixnum] internal attributes of the file
|
114
|
+
attr_accessor :internal_attrs
|
115
|
+
|
116
|
+
# @return [Fixnum] external attributes of the file
|
117
|
+
attr_accessor :external_attrs
|
118
|
+
|
119
|
+
# @return [Fixnum] at what offset the local file header starts
|
120
|
+
# in your original IO object
|
121
|
+
attr_accessor :local_file_header_offset
|
122
|
+
|
123
|
+
# @return [String] the file comment
|
124
|
+
attr_accessor :comment
|
125
|
+
|
126
|
+
# @return [Fixnum] at what offset you should start reading
|
127
|
+
# for the compressed data in your original IO object
|
128
|
+
def compressed_data_offset
|
129
|
+
@compressed_data_offset || raise(LocalHeaderPending)
|
130
|
+
end
|
131
|
+
|
132
|
+
# Tells whether the compressed data offset is already known for this entry
|
133
|
+
# @return [Boolean]
|
134
|
+
def known_offset?
|
135
|
+
!@compressed_data_offset.nil?
|
136
|
+
end
|
137
|
+
|
138
|
+
# Tells whether the entry uses a data descriptor (this is defined
|
139
|
+
# by bit 3 in the GP flags).
|
140
|
+
def uses_data_descriptor?
|
141
|
+
(gp_flags & 0x0008) == 0x0008
|
142
|
+
end
|
143
|
+
|
144
|
+
# Sets the offset at which the compressed data for this file starts in the ZIP.
|
145
|
+
# By default, the value will be set by the Reader for you. If you use delayed
|
146
|
+
# reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
|
147
|
+
#
|
148
|
+
# entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
|
149
|
+
# local_file_header_offset: entry.local_header_offset)
|
150
|
+
def compressed_data_offset=(offset)
|
151
|
+
@compressed_data_offset = offset.to_i
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
# Parse an IO handle to a ZIP archive into an array of Entry objects.
|
156
|
+
#
|
157
|
+
# @param io[#tell, #seek, #read, #size] an IO-ish object
|
158
|
+
# @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
|
159
|
+
# Locates the central directory of the ZIP (via the Zip64 EOCD record when
# its locator is found, via the regular EOCD record otherwise) and reads
# one ZipEntry per file the directory declares.
#
# @param io[#tell, #seek, #read, #size] an IO-ish object
# @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
# @raise [ReadError] if the central directory cannot be read
def read_zip_structure(io:)
  zip_file_size = io.size
  eocd_offset = get_eocd_offset(io, zip_file_size)

  zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
  num_files, cdir_location, cdir_size =
    if zip64_end_of_cdir_location
      num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
    else
      num_files_and_central_directory_offset(io, eocd_offset)
    end

  log { format('Located the central directory start at %d', cdir_location) }
  seek(io, cdir_location)

  # In zip_tricks we read the entire central directory _and_ anything behind it.
  # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
  # BUT! in format_parser we avoid unbounded reads, as a matter of fact they are forbidden.
  # So we will again limit ourselves to cdir_size, and we will take a cushion of 1 KB.
  central_directory_str = io.read(cdir_size + 1024)

  # A read at the end of the IO gives nil. Report that as one of our own
  # errors instead of letting StringIO.new(nil) raise a TypeError.
  if central_directory_str.nil?
    raise ReadError, "Expected to read the central directory at offset #{cdir_location}, but the IO was at the end"
  end

  central_directory_io = StringIO.new(central_directory_str)
  log do
    format(
      'Read %d bytes with central directory + EOCD record and locator',
      central_directory_str.bytesize)
  end

  # Iterate over a range instead of preallocating an array: num_files
  # comes from the file itself.
  (0...num_files).map do |entry_n|
    offset_location = cdir_location + central_directory_io.pos
    log do
      format(
        'Reading the central directory entry %d starting at offset %d',
        entry_n, offset_location)
    end
    read_cdir_entry(central_directory_io)
  end
end
|
198
|
+
|
199
|
+
private
|
200
|
+
|
201
|
+
def skip_ahead_2(io)
|
202
|
+
skip_ahead_n(io, 2)
|
203
|
+
end
|
204
|
+
|
205
|
+
def skip_ahead_4(io)
|
206
|
+
skip_ahead_n(io, 4)
|
207
|
+
end
|
208
|
+
|
209
|
+
def skip_ahead_8(io)
|
210
|
+
skip_ahead_n(io, 8)
|
211
|
+
end
|
212
|
+
|
213
|
+
def seek(io, absolute_pos)
|
214
|
+
io.seek(absolute_pos)
|
215
|
+
unless absolute_pos == io.pos
|
216
|
+
raise ReadError,
|
217
|
+
"Expected to seek to #{absolute_pos} but only got to #{io.pos}"
|
218
|
+
end
|
219
|
+
nil
|
220
|
+
end
|
221
|
+
|
222
|
+
def assert_signature(io, signature_magic_number)
|
223
|
+
readback = read_4b(io)
|
224
|
+
if readback != signature_magic_number
|
225
|
+
expected = '0x0' + signature_magic_number.to_s(16)
|
226
|
+
actual = '0x0' + readback.to_s(16)
|
227
|
+
raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
def skip_ahead_n(io, n)
|
232
|
+
pos_before = io.pos
|
233
|
+
io.seek(io.pos + n)
|
234
|
+
pos_after = io.pos
|
235
|
+
delta = pos_after - pos_before
|
236
|
+
unless delta == n
|
237
|
+
raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead"
|
238
|
+
end
|
239
|
+
nil
|
240
|
+
end
|
241
|
+
|
242
|
+
def read_n(io, n_bytes)
|
243
|
+
io.read(n_bytes).tap do |d|
|
244
|
+
raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
|
245
|
+
unless d.bytesize == n_bytes
|
246
|
+
raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
|
247
|
+
end
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
def read_2b(io)
|
252
|
+
read_n(io, 2).unpack(C_UINT16LE).shift
|
253
|
+
end
|
254
|
+
|
255
|
+
def read_4b(io)
|
256
|
+
read_n(io, 4).unpack(C_UINT32LE).shift
|
257
|
+
end
|
258
|
+
|
259
|
+
def read_8b(io)
|
260
|
+
read_n(io, 8).unpack(C_UINT64LE).shift
|
261
|
+
end
|
262
|
+
|
263
|
+
# Reads a single central directory file header from the current position
# of the given IO and returns it as a ZipEntry.
#
# @param io[#read] IO positioned at the start of a central directory entry
# @return [ZipEntry] the entry, with Zip64 values applied where present
# @raise [InvalidStructure] if the entry signature does not match
# @raise [ReadError] if any of the fields cannot be read in full
def read_cdir_entry(io)
  assert_signature(io, 0x02014b50)
  ZipEntry.new.tap do |e|
    e.made_by = read_2b(io)
    e.version_needed_to_extract = read_2b(io)
    e.gp_flags = read_2b(io)
    e.storage_mode = read_2b(io)
    e.dos_time = read_2b(io)
    e.dos_date = read_2b(io)
    e.crc32 = read_4b(io)
    e.compressed_size = read_4b(io)
    e.uncompressed_size = read_4b(io)
    filename_size = read_2b(io)
    extra_size = read_2b(io)
    comment_len = read_2b(io)
    e.disk_number_start = read_2b(io)
    e.internal_attrs = read_2b(io)
    e.external_attrs = read_4b(io)
    e.local_file_header_offset = read_4b(io)
    e.filename = read_n(io, filename_size)

    # Extra fields
    extras = read_n(io, extra_size)
    # Comment
    e.comment = read_n(io, comment_len)

    # Parse out the extra fields, of which we really only need
    # the Zip64 extra (stored under header ID 1)
    zip64_extra_contents = parse_out_extra_fields(extras)[1]

    if zip64_extra_contents
      # If the Zip64 extra is present, we let it override all
      # the values fetched from the conventional header
      zip64_extra = StringIO.new(zip64_extra_contents)
      log do
        format(
          'Will read Zip64 extra data for %s, %d bytes',
          e.filename, zip64_extra.size)
      end
      # Now here be dragons. The APPNOTE specifies that
      #
      # > The order of the fields in the ZIP64 extended
      # > information record is fixed, but the fields will
      # > only appear if the corresponding Local or Central
      # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
      #
      # It means that before we read this stuff we need to check if the previously-read
      # values are at overflow, and only _then_ proceed to read them. Bah.
      e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
      e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
      e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
      # Disk number comes last and we can skip it anyway, since we do
      # not support multi-disk archives
    end
  end
end
|
325
|
+
|
326
|
+
def get_eocd_offset(file_io, zip_file_size)
|
327
|
+
# Start reading from the _comment_ of the zip file (from the very end).
|
328
|
+
# The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
|
329
|
+
implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
|
330
|
+
implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
|
331
|
+
|
332
|
+
# Use a soft seek (we might not be able to get as far behind in the IO as we want)
|
333
|
+
# and a soft read (we might not be able to read as many bytes as we want)
|
334
|
+
file_io.seek(implied_position_of_eocd_record)
|
335
|
+
str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
|
336
|
+
raise MissingEOCD unless str_containing_eocd_record
|
337
|
+
|
338
|
+
eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
|
339
|
+
|
340
|
+
raise MissingEOCD unless eocd_idx_in_buf
|
341
|
+
|
342
|
+
eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
|
343
|
+
log { format('Found EOCD signature at offset %d', eocd_offset) }
|
344
|
+
|
345
|
+
eocd_offset
|
346
|
+
end
|
347
|
+
|
348
|
+
# This is tricky. Essentially, we have to scan the maximum possible number
|
349
|
+
# of bytes (that the EOCD can theoretically occupy including the comment),
|
350
|
+
# and we have to find a combination of:
|
351
|
+
# [EOCD signature, <some ZIP metadata>, comment byte size, the comment of
|
352
|
+
# that size, eof].
|
353
|
+
# The only way I could find to do this was with a sliding window, but
|
354
|
+
# there probably is a better way.
|
355
|
+
# Finds the byte offset of the EOCD record within the given string.
#
# A candidate is accepted when it starts with the EOCD signature and its
# comment size field accounts for exactly the bytes between the end of the
# fixed part of the record and the end of the string.
#
# @param in_str[String] the tail of the ZIP file
# @return [Integer, nil] byte offset of the EOCD record in in_str, or nil if
#   no candidate satisfies the check
def locate_eocd_signature(in_str)
  eocd_signature = 0x06054b50
  # The very minimum size the EOCD record can have (up to and including the comment size)
  minimum_record_size = 22
  total_bytes = in_str.bytesize

  # We have to scan from the _very_ tail, sliding a window of the minimum
  # record size back by one byte at a time. All offsets are byte offsets, so
  # slice with byteslice: String#[] counts characters, which disagrees with
  # bytesize as soon as the string is not tagged as binary.
  # If the string is shorter than one record the range is empty and we fall
  # through to nil.
  (total_bytes - minimum_record_size).downto(0) do |window_location|
    window = in_str.byteslice(window_location, minimum_record_size)
    signature, *_middle_fields, comment_size = window.unpack('VvvvvVVv')
    next unless signature == eocd_signature

    # If we found the signature, pick up the comment size, and check if the size of the window
    # plus that comment size is where we are in the string. If we are - bingo.
    return window_location if window_location + minimum_record_size + comment_size == total_bytes
  end

  nil
end
|
382
|
+
|
383
|
+
# Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
|
384
|
+
# EOCD record in the archive by fixed offsets
|
385
|
+
def get_zip64_eocd_location(file_io, eocd_offset)
|
386
|
+
zip64_eocd_loc_offset = eocd_offset
|
387
|
+
zip64_eocd_loc_offset -= 4 # The signature
|
388
|
+
zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
|
389
|
+
zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
|
390
|
+
zip64_eocd_loc_offset -= 4 # Total number of disks
|
391
|
+
|
392
|
+
log do
|
393
|
+
format(
|
394
|
+
'Will look for the Zip64 EOCD locator signature at offset %d',
|
395
|
+
zip64_eocd_loc_offset)
|
396
|
+
end
|
397
|
+
|
398
|
+
# If the offset is negative there is certainly no Zip64 EOCD locator here
|
399
|
+
return unless zip64_eocd_loc_offset >= 0
|
400
|
+
|
401
|
+
file_io.seek(zip64_eocd_loc_offset)
|
402
|
+
assert_signature(file_io, 0x07064b50)
|
403
|
+
|
404
|
+
log { format('Found Zip64 EOCD locator at offset %d', zip64_eocd_loc_offset) }
|
405
|
+
|
406
|
+
disk_num = read_4b(file_io) # number of the disk
|
407
|
+
raise UnsupportedFeature, 'The archive spans multiple disks' if disk_num != 0
|
408
|
+
read_8b(file_io)
|
409
|
+
rescue ReadError, InvalidStructure
|
410
|
+
nil
|
411
|
+
end
|
412
|
+
|
413
|
+
# rubocop: Assignment Branch Condition size for num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
|
414
|
+
# Reads the Zip64 end-of-central-directory record found at the given offset
# and extracts the central directory coordinates from it.
#
# @param io[#seek, #pos, #read] the IO of the ZIP archive
# @param zip64_end_of_cdir_location[Integer] absolute offset of the Zip64 EOCD record
# @return [Array<Integer>] number of files, central directory offset and
#   central directory size, in that order
# @raise [InvalidStructure] if the signature does not match, or the record
#   declares a size too small to hold its fixed fields
# @raise [ReadError] if the record cannot be read in full
# @raise [UnsupportedFeature] if the archive spans multiple disks
def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
  seek(io, zip64_end_of_cdir_location)

  assert_signature(io, 0x06064b50)

  # Everything we consume below: 2 + 2 + 4 + 4 + 8 + 8 + 8 + 8 bytes
  fixed_fields_size = 44

  zip64_eocdr_size = read_8b(io)
  if zip64_eocdr_size < fixed_fields_size
    raise InvalidStructure,
      "Zip64 EOCD record is #{zip64_eocdr_size} bytes, expected at least #{fixed_fields_size}"
  end

  # Reading in bulk is cheaper, but the declared size comes from the file
  # and can be anything up to 2**64 - 1. Only read the part we actually use.
  zip64_eocdr = StringIO.new(read_n(io, fixed_fields_size))
  skip_ahead_2(zip64_eocdr) # version made by
  skip_ahead_2(zip64_eocdr) # version needed to extract

  disk_n = read_4b(zip64_eocdr) # number of this disk
  disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
  raise UnsupportedFeature, 'The archive spans multiple disks' if disk_n != disk_n_with_eocdr

  num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
  num_files_total = read_8b(zip64_eocdr) # files total in the central directory
  raise UnsupportedFeature, 'The archive spans multiple disks' if num_files_this_disk != num_files_total

  log do
    format(
      'Zip64 EOCD record states there are %d files in the archive',
      num_files_total)
  end

  central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
  central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts

  [num_files_total, central_dir_offset, central_dir_size]
end
|
449
|
+
|
450
|
+
def num_files_and_central_directory_offset(file_io, eocd_offset)
|
451
|
+
seek(file_io, eocd_offset)
|
452
|
+
|
453
|
+
# The size of the EOCD record is known upfront, so use a strict read
|
454
|
+
eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
|
455
|
+
io = StringIO.new(eocd_record_str)
|
456
|
+
|
457
|
+
assert_signature(io, 0x06054b50)
|
458
|
+
skip_ahead_2(io) # number_of_this_disk
|
459
|
+
skip_ahead_2(io) # number of the disk with the EOCD record
|
460
|
+
skip_ahead_2(io) # number of entries in the central directory of this disk
|
461
|
+
num_files = read_2b(io) # number of entries in the central directory total
|
462
|
+
cdir_size = read_4b(io) # size of the central directory
|
463
|
+
cdir_offset = read_4b(io) # start of central directorty offset
|
464
|
+
[num_files, cdir_offset, cdir_size]
|
465
|
+
end
|
466
|
+
|
467
|
+
# Is provided as a stub to be overridden in a subclass if you need it. Will report
|
468
|
+
# during various stages of reading. The log message is contained in the return value
|
469
|
+
# of `yield` in the method (the log messages are lazy-evaluated).
|
470
|
+
def log
|
471
|
+
# $stderr.puts(yield)
|
472
|
+
end
|
473
|
+
|
474
|
+
def parse_out_extra_fields(extra_fields_str)
|
475
|
+
extra_table = {}
|
476
|
+
extras_buf = StringIO.new(extra_fields_str)
|
477
|
+
until extras_buf.eof?
|
478
|
+
extra_id = read_2b(extras_buf)
|
479
|
+
extra_size = read_2b(extras_buf)
|
480
|
+
extra_contents = read_n(extras_buf, extra_size)
|
481
|
+
extra_table[extra_id] = extra_contents
|
482
|
+
end
|
483
|
+
extra_table
|
484
|
+
end
|
485
|
+
end
|
@@ -13,7 +13,7 @@ describe FormatParser::AttributesJSON do
|
|
13
13
|
instance.foo = 42
|
14
14
|
instance.bar = 'abcdef'
|
15
15
|
expect(instance.as_json).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
|
16
|
-
expect(instance.as_json(root: true)).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
|
16
|
+
expect(instance.as_json(root: true)).to eq('format_parser_file_info' => {'nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil})
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'is included into file information types' do
|
@@ -49,4 +49,22 @@ describe FormatParser::AttributesJSON do
|
|
49
49
|
standard_output = JSON.dump(instance)
|
50
50
|
expect(pretty_output).not_to eq(standard_output)
|
51
51
|
end
|
52
|
+
|
53
|
+
it 'provides to_json without arguments' do
|
54
|
+
anon_class = Class.new do
|
55
|
+
include FormatParser::AttributesJSON
|
56
|
+
attr_accessor :foo, :bar, :baz
|
57
|
+
def nature
|
58
|
+
'good'
|
59
|
+
end
|
60
|
+
end
|
61
|
+
instance = anon_class.new
|
62
|
+
instance.foo = 42
|
63
|
+
instance.bar = 'abcdef'
|
64
|
+
|
65
|
+
output = instance.to_json
|
66
|
+
readback = JSON.parse(output, symbolize_names: true)
|
67
|
+
|
68
|
+
expect(readback).to have_key(:nature)
|
69
|
+
end
|
52
70
|
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::PDFParser do
|
4
|
+
let(:parsed_pdf) {
|
5
|
+
subject.call(
|
6
|
+
File.open(
|
7
|
+
Pathname.new(fixtures_dir).join('PDF').join(pdf_file),
|
8
|
+
'rb'
|
9
|
+
)
|
10
|
+
)
|
11
|
+
}
|
12
|
+
|
13
|
+
shared_examples :behave_like_pdf do |hash|
|
14
|
+
let(:pdf_file) { hash.fetch(:file) }
|
15
|
+
|
16
|
+
it 'acts as a pdf' do
|
17
|
+
expect(parsed_pdf).not_to be_nil
|
18
|
+
expect(parsed_pdf.nature).to eq(:document)
|
19
|
+
expect(parsed_pdf.format).to eq(:pdf)
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'has a correct page count' do
|
23
|
+
expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe 'a PDF file with a missing version header' do
|
28
|
+
let(:pdf_file) { 'not_a.pdf' }
|
29
|
+
|
30
|
+
it 'does not parse succesfully' do
|
31
|
+
expect(parsed_pdf).to be_nil
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
describe 'a PDF file with a correct header but no valid content' do
|
36
|
+
let(:pdf_file) { 'broken.pdf' }
|
37
|
+
|
38
|
+
pending 'does not parse succesfully'
|
39
|
+
end
|
40
|
+
|
41
|
+
describe 'exceeding the PDF read limit' do
|
42
|
+
let(:pdf_file) { 'read_limit.pdf' }
|
43
|
+
|
44
|
+
pending 'does not parse succesfully'
|
45
|
+
end
|
46
|
+
|
47
|
+
describe 'a PDF file with a missing COUNT_HEADER' do
|
48
|
+
let(:pdf_file) { 'missing_page_count.pdf' }
|
49
|
+
|
50
|
+
it 'does not return a page count' do
|
51
|
+
expect(parsed_pdf.page_count).to eq(nil)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
describe 'parses a PDF file' do
|
56
|
+
describe 'a single page file' do
|
57
|
+
include_examples :behave_like_pdf, file: '1_page.pdf', page_count: 1
|
58
|
+
end
|
59
|
+
|
60
|
+
describe 'a multi page pdf file' do
|
61
|
+
include_examples :behave_like_pdf, file: '2_pages.pdf', page_count: 2
|
62
|
+
end
|
63
|
+
|
64
|
+
describe 'a multi page pdf file with content' do
|
65
|
+
include_examples :behave_like_pdf, file: '10_pages.pdf', page_count: 10
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::ZIPParser do
|
4
|
+
it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
|
5
|
+
fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
|
6
|
+
fi_io = File.open(fixture_path, 'rb')
|
7
|
+
|
8
|
+
result = subject.call(fi_io)
|
9
|
+
expect(result).not_to be_nil
|
10
|
+
|
11
|
+
expect(result.format).to eq(:zip)
|
12
|
+
expect(result.nature).to eq(:archive)
|
13
|
+
expect(result.entries.length).to eq(0xFFFF + 1)
|
14
|
+
|
15
|
+
entry = result.entries.fetch(5674)
|
16
|
+
expect(entry.type).to eq(:file)
|
17
|
+
expect(entry.size).to eq(47)
|
18
|
+
expect(entry.filename).to eq('file-0005674.txt')
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'parses a ZIP archive with a few files' do
|
22
|
+
fixture_path = fixtures_dir + '/ZIP/arch_few_entries.zip'
|
23
|
+
fi_io = File.open(fixture_path, 'rb')
|
24
|
+
|
25
|
+
result = subject.call(fi_io)
|
26
|
+
expect(result).not_to be_nil
|
27
|
+
|
28
|
+
expect(result.format).to eq(:zip)
|
29
|
+
expect(result.nature).to eq(:archive)
|
30
|
+
expect(result.entries.length).to eq(3)
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'correctly identifies an empty directory' do
|
34
|
+
fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
|
35
|
+
fi_io = File.open(fixture_path, 'rb')
|
36
|
+
|
37
|
+
result = subject.call(fi_io)
|
38
|
+
expect(result).not_to be_nil
|
39
|
+
|
40
|
+
expect(result.format).to eq(:zip)
|
41
|
+
expect(result.nature).to eq(:archive)
|
42
|
+
expect(result.entries.length).to eq(3)
|
43
|
+
|
44
|
+
dir_entry = result.entries.last
|
45
|
+
expect(dir_entry.filename).to eq('папочка/')
|
46
|
+
expect(dir_entry.type).to eq(:directory)
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'returns a result that has a usable JSON representation' do
|
50
|
+
fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
|
51
|
+
fi_io = File.open(fixture_path, 'rb')
|
52
|
+
|
53
|
+
result = subject.call(fi_io)
|
54
|
+
json_repr = JSON.pretty_generate(result)
|
55
|
+
|
56
|
+
json_parsed_repr = JSON.parse(json_repr, symbolize_names: true)
|
57
|
+
expect(json_parsed_repr[:nature]).to eq('archive')
|
58
|
+
expect(json_parsed_repr[:format]).to eq('zip')
|
59
|
+
expect(json_parsed_repr[:entries]).to be_kind_of(Array)
|
60
|
+
expect(json_parsed_repr[:entries].length).to eq(3)
|
61
|
+
|
62
|
+
json_parsed_repr[:entries].each do |e|
|
63
|
+
expect(e[:filename]).to be_kind_of(String)
|
64
|
+
expect(e[:size]).to be_kind_of(Integer)
|
65
|
+
expect(e[:type]).to be_kind_of(String)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-03-
|
12
|
+
date: 2018-03-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|
@@ -159,6 +159,7 @@ files:
|
|
159
159
|
- README.md
|
160
160
|
- Rakefile
|
161
161
|
- format_parser.gemspec
|
162
|
+
- lib/archive.rb
|
162
163
|
- lib/attributes_json.rb
|
163
164
|
- lib/audio.rb
|
164
165
|
- lib/care.rb
|
@@ -180,10 +181,13 @@ files:
|
|
180
181
|
- lib/parsers/mp3_parser.rb
|
181
182
|
- lib/parsers/mp3_parser/id3_v1.rb
|
182
183
|
- lib/parsers/mp3_parser/id3_v2.rb
|
184
|
+
- lib/parsers/pdf_parser.rb
|
183
185
|
- lib/parsers/png_parser.rb
|
184
186
|
- lib/parsers/psd_parser.rb
|
185
187
|
- lib/parsers/tiff_parser.rb
|
186
188
|
- lib/parsers/wav_parser.rb
|
189
|
+
- lib/parsers/zip_parser.rb
|
190
|
+
- lib/parsers/zip_parser/file_reader.rb
|
187
191
|
- lib/read_limiter.rb
|
188
192
|
- lib/remote_io.rb
|
189
193
|
- lib/video.rb
|
@@ -201,10 +205,12 @@ files:
|
|
201
205
|
- spec/parsers/jpeg_parser_spec.rb
|
202
206
|
- spec/parsers/moov_parser_spec.rb
|
203
207
|
- spec/parsers/mp3_parser_spec.rb
|
208
|
+
- spec/parsers/pdf_parser_spec.rb
|
204
209
|
- spec/parsers/png_parser_spec.rb
|
205
210
|
- spec/parsers/psd_parser_spec.rb
|
206
211
|
- spec/parsers/tiff_parser_spec.rb
|
207
212
|
- spec/parsers/wav_parser_spec.rb
|
213
|
+
- spec/parsers/zip_parser_spec.rb
|
208
214
|
- spec/read_limiter_spec.rb
|
209
215
|
- spec/remote_fetching_spec.rb
|
210
216
|
- spec/remote_io_spec.rb
|