format_parser 0.3.5 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/archive.rb +36 -0
- data/lib/attributes_json.rb +9 -5
- data/lib/document.rb +1 -0
- data/lib/format_parser.rb +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/pdf_parser.rb +76 -0
- data/lib/parsers/zip_parser.rb +39 -0
- data/lib/parsers/zip_parser/file_reader.rb +485 -0
- data/spec/attributes_json_spec.rb +19 -1
- data/spec/parsers/pdf_parser_spec.rb +68 -0
- data/spec/parsers/zip_parser_spec.rb +68 -0
- metadata +8 -2
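
The new parsers surface through the gem's top-level API. A minimal sketch, assuming the FormatParser.parse entry point from earlier releases (the file names below are made up for illustration):

    require 'format_parser'

    # PDF files now come back as :document results with a page count
    File.open('report.pdf', 'rb') do |f|
      info = FormatParser.parse(f)
      info.nature       #=> :document
      info.format       #=> :pdf
      info.page_count   #=> e.g. 10
    end

    # ZIP files come back as :archive results with an entry listing
    File.open('bundle.zip', 'rb') do |f|
      info = FormatParser.parse(f)
      info.nature                   #=> :archive
      info.entries.map(&:filename)  #=> filenames stored in the archive
    end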
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
+  data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
+  data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
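
For reference, a sketch (not part of the gem) of recomputing these digests from a locally downloaded format_parser-0.4.0.gem, which is a plain tar archive containing metadata.gz and data.tar.gz:

    require 'digest'
    require 'rubygems/package'

    File.open('format_parser-0.4.0.gem', 'rb') do |gem_io|
      Gem::Package::TarReader.new(gem_io) do |tar|
        tar.each do |entry|
          next unless ['metadata.gz', 'data.tar.gz'].include?(entry.full_name)
          # Compare against the SHA512 values published above
          puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
        end
      end
    end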
data/lib/archive.rb
ADDED
@@ -0,0 +1,36 @@
+require 'ks'
+
+module FormatParser
+  class Archive
+    include FormatParser::AttributesJSON
+
+    class Entry < Ks.strict(:type, :size, :filename)
+      def to_json(*a)
+        to_h.to_json(*a)
+      end
+    end
+
+    NATURE = :archive
+
+    # What filetype was recognized? Will contain a non-ambiguous symbol
+    # referring to the file format. The symbol can be used as a filename
+    # extension safely
+    attr_accessor :format
+
+    # Array of Entry structs
+    attr_accessor :entries
+
+    # If a parser wants to provide any extra information to the caller
+    # it can be placed here
+    attr_accessor :intrinsics
+
+    # Only permits assignments via defined accessors
+    def initialize(**attributes)
+      attributes.map { |(k, v)| public_send("#{k}=", v) }
+    end
+
+    def nature
+      NATURE
+    end
+  end
+end
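
A minimal sketch of the new value object in isolation (the entries below are made up for illustration); serialization goes through the AttributesJSON module changed in the next file:

    require 'format_parser'
    require 'json'

    archive = FormatParser::Archive.new(
      format: :zip,
      entries: [
        FormatParser::Archive::Entry.new(type: :file, size: 47, filename: 'file-0005674.txt'),
        FormatParser::Archive::Entry.new(type: :directory, size: 0, filename: 'docs/')
      ]
    )

    archive.nature                 #=> :archive
    JSON.pretty_generate(archive)  # each Entry serializes via its to_h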
data/lib/attributes_json.rb
CHANGED
@@ -14,7 +14,7 @@ module FormatParser::AttributesJSON
 
   # Implements a sane default `as_json` for an object
   # that accessors defined
-  def as_json(
+  def as_json(root: false)
     h = {}
     h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
     methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
@@ -24,11 +24,15 @@ module FormatParser::AttributesJSON
       # by the caller
      h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
    end
+    if root
+      {'format_parser_file_info' => h}
+    else
+      h
+    end
   end
 
-  # Implements to_json with sane defaults
-
-
-    generator_state.generate(as_json)
+  # Implements to_json with sane defaults, with or without arguments
+  def to_json(*maybe_generator_state)
+    as_json(root: false).to_json(*maybe_generator_state)
   end
 end
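
A rough sketch of the behaviour added here, mirroring the updated specs further down: as_json(root: true) wraps the hash under a 'format_parser_file_info' key, and to_json now also works without a generator-state argument.

    require 'format_parser'
    require 'json'

    class ExampleInfo
      include FormatParser::AttributesJSON
      attr_accessor :foo

      def nature
        'good'
      end
    end

    info = ExampleInfo.new
    info.foo = 42

    info.as_json              #=> {"nature"=>"good", "foo"=>42}
    info.as_json(root: true)  #=> {"format_parser_file_info"=>{"nature"=>"good", "foo"=>42}}
    info.to_json              #=> '{"nature":"good","foo":42}'
    JSON.dump(info)           # still works; JSON passes its generator state through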
data/lib/document.rb
CHANGED
data/lib/format_parser.rb
CHANGED
data/lib/parsers/pdf_parser.rb
ADDED
@@ -0,0 +1,76 @@
+class FormatParser::PDFParser
+  include FormatParser::IOUtils
+
+  # First 9 bytes of a PDF should be in this format, according to:
+  #
+  # https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
+  #
+  # There are however exceptions, which are left out for now.
+  #
+  PDF_MARKER = /%PDF-1\.[0-8]{1}/
+
+  # Page counts have different markers depending on
+  # the PDF type. There is not a single common way of solving
+  # this. The only way of solving this correctly is by adding
+  # different types of PDF's in the specs.
+  #
+  COUNT_MARKERS = ['Count ']
+  EOF_MARKER = '%EOF'
+
+  def call(io)
+    io = FormatParser::IOConstraint.new(io)
+
+    return unless safe_read(io, 9) =~ PDF_MARKER
+
+    attributes = scan_for_attributes(io)
+
+    FormatParser::Document.new(
+      format: :pdf,
+      page_count: attributes[:page_count]
+    )
+  end
+
+  private
+
+  # Read ahead bytes until one of % or / is reached.
+  # A header in a PDF always starts with a /
+  # The % is to detect the EOF
+  #
+  def scan_for_attributes(io)
+    result = {}
+
+    while read = safe_read(io, 1)
+      case read
+      when '%'
+        break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
+      when '/'
+        find_page_count(io, result)
+      end
+    end
+
+    result
+  end
+
+  def find_page_count(io, result)
+    COUNT_MARKERS.each do |marker|
+      if safe_read(io, marker.size) == marker
+        result[:page_count] = read_numbers(io)
+      end
+    end
+  end
+
+  # Read ahead bytes until no more numbers are found
+  # This assumes that the position of io starts at a
+  # number
+  def read_numbers(io)
+    numbers = ''
+
+    while c = safe_read(io, 1)
+      c =~ /\d+/ ? numbers << c : break
+    end
+
+    numbers.to_i
+  end
+
+  FormatParser.register_parser self, natures: :document, formats: :pdf
+end
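
A minimal sketch of invoking the new parser directly, the way the spec further down does (the fixture path is hypothetical):

    parser = FormatParser::PDFParser.new
    result = File.open('fixtures/PDF/10_pages.pdf', 'rb') { |f| parser.call(f) }

    result.nature      #=> :document
    result.format      #=> :pdf
    result.page_count  #=> 10, when a 'Count ' marker was found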
data/lib/parsers/zip_parser.rb
ADDED
@@ -0,0 +1,39 @@
+class FormatParser::ZIPParser
+  require_relative 'zip_parser/file_reader'
+
+  def call(io)
+    reader = FileReader.new
+    entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
+
+    entries_archive = entries.map do |ze|
+      ft = directory?(ze) ? :directory : :file
+      decoded_filename = decode_filename(ze)
+      FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
+    end
+
+    FormatParser::Archive.new(format: :zip, entries: entries_archive)
+  rescue FileReader::Error
+    # This is not a ZIP, or a broken ZIP.
+    return
+  end
+
+  def directory?(zip_entry)
+    # We can do a lap dance here and parse out the individual bit fields
+    # from the external attributes, check the OS type that is in the entry
+    # to see if it can be interpreted as UNIX or not, and generally have
+    # heaps of fun. Instead, we will be frugal.
+    zip_entry.filename.end_with?('/')
+  end
+
+  def decode_filename(zip_entry)
+    # Check for the EFS bit in the general-purpose flags. If it is set,
+    # the entry filename can be treated as UTF-8
+    if zip_entry.gp_flags & 0b100000000000 == 0b100000000000
+      zip_entry.filename.unpack('U*').pack('U*')
+    else
+      zip_entry.filename.encode(Encoding::UTF_8, undefined: :replace)
+    end
+  end
+
+  FormatParser.register_parser self, natures: [:archive, :document], formats: :zip
+end
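
A minimal sketch of using the ZIP parser on its own, mirroring the specs further down (the archive path is hypothetical):

    parser = FormatParser::ZIPParser.new
    result = File.open('some_archive.zip', 'rb') { |f| parser.call(f) }

    if result # nil when the input is not a ZIP, or is a broken ZIP
      result.format  #=> :zip
      result.entries.each do |entry|
        puts format('%-9s %8d %s', entry.type, entry.size, entry.filename)
      end
    end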
data/lib/parsers/zip_parser/file_reader.rb
ADDED
@@ -0,0 +1,485 @@
+# frozen_string_literal: true
+
+require 'stringio'
+
+# A very barebones ZIP file reader
+class FormatParser::ZIPParser::FileReader
+  Error = Class.new(StandardError)
+  ReadError = Class.new(Error)
+  UnsupportedFeature = Class.new(Error)
+  InvalidStructure = Class.new(Error)
+  LocalHeaderPending = Class.new(Error) do
+    def message
+      'The compressed data offset is not available (local header has not been read)'
+    end
+  end
+  MissingEOCD = Class.new(Error) do
+    def message
+      'Could not find the EOCD signature in the buffer - maybe a malformed ZIP file'
+    end
+  end
+
+  C_UINT32LE = 'V'
+  C_UINT16LE = 'v'
+  C_UINT64LE = 'Q<'
+
+  # To prevent too many tiny reads, read the maximum possible size of end of
+  # central directory record upfront (all the fixed fields + at most 0xFFFF
+  # bytes of the archive comment)
+  MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
+    begin
+      4 + # Offset of the start of central directory
+      4 + # Size of the central directory
+      2 + # Number of files in the cdir
+      4 + # End-of-central-directory signature
+      2 + # Number of this disk
+      2 + # Number of disk with the start of cdir
+      2 + # Number of files in the cdir of this disk
+      2 + # The comment size
+      0xFFFF # Maximum comment size
+    end
+
+  # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
+  # The maximum size is all the usual items, plus the maximum size
+  # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
+  MAX_LOCAL_HEADER_SIZE =
+    begin
+      4 + # signature
+      2 + # Version needed to extract
+      2 + # gp flags
+      2 + # storage mode
+      2 + # dos time
+      2 + # dos date
+      4 + # CRC32
+      4 + # Comp size
+      4 + # Uncomp size
+      2 + # Filename size
+      2 + # Extra fields size
+      0xFFFF + # Maximum filename size
+      0xFFFF # Maximum extra fields size
+    end
+
+  SIZE_OF_USABLE_EOCD_RECORD =
+    begin
+      4 + # Signature
+      2 + # Number of this disk
+      2 + # Number of the disk with the EOCD record
+      2 + # Number of entries in the central directory of this disk
+      2 + # Number of entries in the central directory total
+      4 + # Size of the central directory
+      4 # Start of the central directory offset
+    end
+
+  private_constant :C_UINT32LE, :C_UINT16LE, :C_UINT64LE, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
+                   :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
+
+  # Represents a file within the ZIP archive being read
+  class ZipEntry
+    include FormatParser::AttributesJSON
+
+    # @return [Fixnum] bit-packed version signature of the program that made the archive
+    attr_accessor :made_by
+
+    # @return [Fixnum] ZIP version support needed to extract this file
+    attr_accessor :version_needed_to_extract
+
+    # @return [Fixnum] bit-packed general purpose flags
+    attr_accessor :gp_flags
+
+    # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
+    attr_accessor :storage_mode
+
+    # @return [Fixnum] the bit-packed DOS time
+    attr_accessor :dos_time
+
+    # @return [Fixnum] the bit-packed DOS date
+    attr_accessor :dos_date
+
+    # @return [Fixnum] the CRC32 checksum of this file
+    attr_accessor :crc32
+
+    # @return [Fixnum] size of compressed file data in the ZIP
+    attr_accessor :compressed_size
+
+    # @return [Fixnum] size of the file once uncompressed
+    attr_accessor :uncompressed_size
+
+    # @return [String] the filename
+    attr_accessor :filename
+
+    # @return [Fixnum] disk number where this file starts
+    attr_accessor :disk_number_start
+
+    # @return [Fixnum] internal attributes of the file
+    attr_accessor :internal_attrs
+
+    # @return [Fixnum] external attributes of the file
+    attr_accessor :external_attrs
+
+    # @return [Fixnum] at what offset the local file header starts
+    # in your original IO object
+    attr_accessor :local_file_header_offset
+
+    # @return [String] the file comment
+    attr_accessor :comment
+
+    # @return [Fixnum] at what offset you should start reading
+    # for the compressed data in your original IO object
+    def compressed_data_offset
+      @compressed_data_offset || raise(LocalHeaderPending)
+    end
+
+    # Tells whether the compressed data offset is already known for this entry
+    # @return [Boolean]
+    def known_offset?
+      !@compressed_data_offset.nil?
+    end
+
+    # Tells whether the entry uses a data descriptor (this is defined
+    # by bit 3 in the GP flags).
+    def uses_data_descriptor?
+      (gp_flags & 0x0008) == 0x0008
+    end
+
+    # Sets the offset at which the compressed data for this file starts in the ZIP.
+    # By default, the value will be set by the Reader for you. If you use delayed
+    # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
+    #
+    #   entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
+    #     local_file_header_offset: entry.local_header_offset)
+    def compressed_data_offset=(offset)
+      @compressed_data_offset = offset.to_i
+    end
+  end
+
+  # Parse an IO handle to a ZIP archive into an array of Entry objects.
+  #
+  # @param io[#tell, #seek, #read, #size] an IO-ish object
+  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
+  def read_zip_structure(io:)
+    zip_file_size = io.size
+    eocd_offset = get_eocd_offset(io, zip_file_size)
+
+    zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
+    num_files, cdir_location, cdir_size =
+      if zip64_end_of_cdir_location
+        num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
+      else
+        num_files_and_central_directory_offset(io, eocd_offset)
+      end
+
+    log { format('Located the central directory start at %d', cdir_location) }
+    seek(io, cdir_location)
+
+    # In zip_tricks we read the entire central directory _and_ enything behind it.
+    # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
+    # BUT! in format_parser we avoid unbounded reads, as a matter of fact they are forbidden.
+    # So we will again limit ouselves to cdir_size, and we will take cushion of 1 KB.
+    central_directory_str = io.read(cdir_size + 1024)
+    central_directory_io = StringIO.new(central_directory_str)
+    log do
+      format(
+        'Read %d bytes with central directory + EOCD record and locator',
+        central_directory_str.bytesize)
+    end
+
+    entries = (0...num_files).map do |entry_n|
+      offset_location = cdir_location + central_directory_io.pos
+      log do
+        format(
+          'Reading the central directory entry %d starting at offset %d',
+          entry_n, offset_location)
+      end
+      read_cdir_entry(central_directory_io)
+    end
+
+    entries
+  end
+
+  private
+
+  def skip_ahead_2(io)
+    skip_ahead_n(io, 2)
+  end
+
+  def skip_ahead_4(io)
+    skip_ahead_n(io, 4)
+  end
+
+  def skip_ahead_8(io)
+    skip_ahead_n(io, 8)
+  end
+
+  def seek(io, absolute_pos)
+    io.seek(absolute_pos)
+    unless absolute_pos == io.pos
+      raise ReadError,
+            "Expected to seek to #{absolute_pos} but only got to #{io.pos}"
+    end
+    nil
+  end
+
+  def assert_signature(io, signature_magic_number)
+    readback = read_4b(io)
+    if readback != signature_magic_number
+      expected = '0x0' + signature_magic_number.to_s(16)
+      actual = '0x0' + readback.to_s(16)
+      raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
+    end
+  end
+
+  def skip_ahead_n(io, n)
+    pos_before = io.pos
+    io.seek(io.pos + n)
+    pos_after = io.pos
+    delta = pos_after - pos_before
+    unless delta == n
+      raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead"
+    end
+    nil
+  end
+
+  def read_n(io, n_bytes)
+    io.read(n_bytes).tap do |d|
+      raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
+      unless d.bytesize == n_bytes
+        raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
+      end
+    end
+  end
+
+  def read_2b(io)
+    read_n(io, 2).unpack(C_UINT16LE).shift
+  end
+
+  def read_4b(io)
+    read_n(io, 4).unpack(C_UINT32LE).shift
+  end
+
+  def read_8b(io)
+    read_n(io, 8).unpack(C_UINT64LE).shift
+  end
+
+  def read_cdir_entry(io)
+    assert_signature(io, 0x02014b50)
+    ZipEntry.new.tap do |e|
+      e.made_by = read_2b(io)
+      e.version_needed_to_extract = read_2b(io)
+      e.gp_flags = read_2b(io)
+      e.storage_mode = read_2b(io)
+      e.dos_time = read_2b(io)
+      e.dos_date = read_2b(io)
+      e.crc32 = read_4b(io)
+      e.compressed_size = read_4b(io)
+      e.uncompressed_size = read_4b(io)
+      filename_size = read_2b(io)
+      extra_size = read_2b(io)
+      comment_len = read_2b(io)
+      e.disk_number_start = read_2b(io)
+      e.internal_attrs = read_2b(io)
+      e.external_attrs = read_4b(io)
+      e.local_file_header_offset = read_4b(io)
+      e.filename = read_n(io, filename_size)
+
+      # Extra fields
+      extras = read_n(io, extra_size)
+      # Comment
+      e.comment = read_n(io, comment_len)
+
+      # Parse out the extra fields
+      extra_table = parse_out_extra_fields(extras)
+
+      # ...of which we really only need the Zip64 extra
+      if zip64_extra_contents ||= extra_table[1]
+        # If the Zip64 extra is present, we let it override all
+        # the values fetched from the conventional header
+        zip64_extra = StringIO.new(zip64_extra_contents)
+        log do
+          format(
+            'Will read Zip64 extra data for %s, %d bytes',
+            e.filename, zip64_extra.size)
+        end
+        # Now here be dragons. The APPNOTE specifies that
+        #
+        # > The order of the fields in the ZIP64 extended
+        # > information record is fixed, but the fields will
+        # > only appear if the corresponding Local or Central
+        # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
+        #
+        # It means that before we read this stuff we need to check if the previously-read
+        # values are at overflow, and only _then_ proceed to read them. Bah.
+        if e.uncompressed_size == 0xFFFFFFFF
+          e.uncompressed_size = read_8b(zip64_extra)
+        end
+        if e.compressed_size == 0xFFFFFFFF
+          e.compressed_size = read_8b(zip64_extra)
+        end
+        if e.local_file_header_offset == 0xFFFFFFFF
+          e.local_file_header_offset = read_8b(zip64_extra)
+        end
+        # Disk number comes last and we can skip it anyway, since we do
+        # not support multi-disk archives
+      end
+    end
+  end
+
+  def get_eocd_offset(file_io, zip_file_size)
+    # Start reading from the _comment_ of the zip file (from the very end).
+    # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
+    implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
+    implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
+
+    # Use a soft seek (we might not be able to get as far behind in the IO as we want)
+    # and a soft read (we might not be able to read as many bytes as we want)
+    file_io.seek(implied_position_of_eocd_record)
+    str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
+    raise MissingEOCD unless str_containing_eocd_record
+
+    eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
+
+    raise MissingEOCD unless eocd_idx_in_buf
+
+    eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
+    log { format('Found EOCD signature at offset %d', eocd_offset) }
+
+    eocd_offset
+  end
+
+  # This is tricky. Essentially, we have to scan the maximum possible number
+  # of bytes (that the EOCD can theoretically occupy including the comment),
+  # and we have to find a combination of:
+  #   [EOCD signature, <some ZIP medatata>, comment byte size, the comment of
+  #   that size, eof].
+  # The only way I could find to do this was with a sliding window, but
+  # there probably is a better way.
+  def locate_eocd_signature(in_str)
+    # We have to scan from the _very_ tail. We read the very minimum size
+    # the EOCD record can have (up to and including the comment size), using
+    # a sliding window. Once our end offset matches the comment size we found our
+    # EOCD marker.
+    unpack_pattern = 'VvvvvVVv'
+    minimum_record_size = 22
+    end_location = minimum_record_size * -1
+    loop do
+      # If the window is nil, we have rolled off the start of the string, nothing to do here.
+      # We use negative values because if we used positive slice indices
+      # we would have to detect the rollover ourselves
+      break unless window = in_str[end_location, minimum_record_size]
+
+      window_location = in_str.bytesize + end_location
+      unpacked = window.unpack(unpack_pattern)
+      # If we found the signarue, pick up the comment size, and check if the size of the window
+      # plus that comment size is where we are in the string. If we are - bingo.
+      if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
+        assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
+        # if the comment size is where we should be at - we found our EOCD
+        return assumed_eocd_location if assumed_eocd_location == window_location
+      end
+
+      end_location -= 1 # Shift the window back, by one byte, and try again.
+    end
+  end
+
+  # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
+  # EOCD record in the archive by fixed offsets
+  def get_zip64_eocd_location(file_io, eocd_offset)
+    zip64_eocd_loc_offset = eocd_offset
+    zip64_eocd_loc_offset -= 4 # The signature
+    zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
+    zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
+    zip64_eocd_loc_offset -= 4 # Total number of disks
+
+    log do
+      format(
+        'Will look for the Zip64 EOCD locator signature at offset %d',
+        zip64_eocd_loc_offset)
+    end
+
+    # If the offset is negative there is certainly no Zip64 EOCD locator here
+    return unless zip64_eocd_loc_offset >= 0
+
+    file_io.seek(zip64_eocd_loc_offset)
+    assert_signature(file_io, 0x07064b50)
+
+    log { format('Found Zip64 EOCD locator at offset %d', zip64_eocd_loc_offset) }
+
+    disk_num = read_4b(file_io) # number of the disk
+    raise UnsupportedFeature, 'The archive spans multiple disks' if disk_num != 0
+    read_8b(file_io)
+  rescue ReadError, InvalidStructure
+    nil
+  end
+
+  # num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
+  def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
+    seek(io, zip64_end_of_cdir_location)
+
+    assert_signature(io, 0x06064b50)
+
+    zip64_eocdr_size = read_8b(io)
+    zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
+    zip64_eocdr = StringIO.new(zip64_eocdr)
+    skip_ahead_2(zip64_eocdr) # version made by
+    skip_ahead_2(zip64_eocdr) # version needed to extract
+
+    disk_n = read_4b(zip64_eocdr) # number of this disk
+    disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
+    if disk_n != disk_n_with_eocdr
+      raise UnsupportedFeature, 'The archive spans multiple disks'
+    end
+
+    num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
+    num_files_total = read_8b(zip64_eocdr) # files total in the central directory
+
+    if num_files_this_disk != num_files_total
+      raise UnsupportedFeature, 'The archive spans multiple disks'
+    end
+
+    log do
+      format(
+        'Zip64 EOCD record states there are %d files in the archive',
+        num_files_total)
+    end
+
+    central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
+    central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
+
+    [num_files_total, central_dir_offset, central_dir_size]
+  end
+
+  def num_files_and_central_directory_offset(file_io, eocd_offset)
+    seek(file_io, eocd_offset)
+
+    # The size of the EOCD record is known upfront, so use a strict read
+    eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
+    io = StringIO.new(eocd_record_str)
+
+    assert_signature(io, 0x06054b50)
+    skip_ahead_2(io) # number_of_this_disk
+    skip_ahead_2(io) # number of the disk with the EOCD record
+    skip_ahead_2(io) # number of entries in the central directory of this disk
+    num_files = read_2b(io) # number of entries in the central directory total
+    cdir_size = read_4b(io) # size of the central directory
+    cdir_offset = read_4b(io) # start of central directorty offset
+    [num_files, cdir_offset, cdir_size]
+  end
+
+  # Is provided as a stub to be overridden in a subclass if you need it. Will report
+  # during various stages of reading. The log message is contained in the return value
+  # of `yield` in the method (the log messages are lazy-evaluated).
+  def log
+    # $stderr.puts(yield)
+  end
+
+  def parse_out_extra_fields(extra_fields_str)
+    extra_table = {}
+    extras_buf = StringIO.new(extra_fields_str)
+    until extras_buf.eof?
+      extra_id = read_2b(extras_buf)
+      extra_size = read_2b(extras_buf)
+      extra_contents = read_n(extras_buf, extra_size)
+      extra_table[extra_id] = extra_contents
+    end
+    extra_table
+  end
+end
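
A minimal sketch of driving the reader directly; the ZIPParser above wraps the IO in FormatParser::IOConstraint so that all reads stay bounded, and the same is assumed here (the archive path is hypothetical):

    reader = FormatParser::ZIPParser::FileReader.new
    entries = File.open('some_archive.zip', 'rb') do |f|
      reader.read_zip_structure(io: FormatParser::IOConstraint.new(f))
    end

    entries.each do |entry|
      puts format('%s: %d bytes compressed, %d bytes uncompressed',
                  entry.filename, entry.compressed_size, entry.uncompressed_size)
    end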
data/spec/attributes_json_spec.rb
CHANGED
@@ -13,7 +13,7 @@ describe FormatParser::AttributesJSON do
     instance.foo = 42
     instance.bar = 'abcdef'
     expect(instance.as_json).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
-    expect(instance.as_json(root: true)).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
+    expect(instance.as_json(root: true)).to eq('format_parser_file_info' => {'nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil})
   end
 
   it 'is included into file information types' do
@@ -49,4 +49,22 @@ describe FormatParser::AttributesJSON do
     standard_output = JSON.dump(instance)
     expect(pretty_output).not_to eq(standard_output)
   end
+
+  it 'provides to_json without arguments' do
+    anon_class = Class.new do
+      include FormatParser::AttributesJSON
+      attr_accessor :foo, :bar, :baz
+      def nature
+        'good'
+      end
+    end
+    instance = anon_class.new
+    instance.foo = 42
+    instance.bar = 'abcdef'
+
+    output = instance.to_json
+    readback = JSON.parse(output, symbolize_names: true)
+
+    expect(readback).to have_key(:nature)
+  end
 end
data/spec/parsers/pdf_parser_spec.rb
ADDED
@@ -0,0 +1,68 @@
+require 'spec_helper'
+
+describe FormatParser::PDFParser do
+  let(:parsed_pdf) {
+    subject.call(
+      File.open(
+        Pathname.new(fixtures_dir).join('PDF').join(pdf_file),
+        'rb'
+      )
+    )
+  }
+
+  shared_examples :behave_like_pdf do |hash|
+    let(:pdf_file) { hash.fetch(:file) }
+
+    it 'acts as a pdf' do
+      expect(parsed_pdf).not_to be_nil
+      expect(parsed_pdf.nature).to eq(:document)
+      expect(parsed_pdf.format).to eq(:pdf)
+    end
+
+    it 'has a correct page count' do
+      expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
+    end
+  end
+
+  describe 'a PDF file with a missing version header' do
+    let(:pdf_file) { 'not_a.pdf' }
+
+    it 'does not parse succesfully' do
+      expect(parsed_pdf).to be_nil
+    end
+  end
+
+  describe 'a PDF file with a correct header but no valid content' do
+    let(:pdf_file) { 'broken.pdf' }
+
+    pending 'does not parse succesfully'
+  end
+
+  describe 'exceeding the PDF read limit' do
+    let(:pdf_file) { 'read_limit.pdf' }
+
+    pending 'does not parse succesfully'
+  end
+
+  describe 'a PDF file with a missing COUNT_HEADER' do
+    let(:pdf_file) { 'missing_page_count.pdf' }
+
+    it 'does not return a page count' do
+      expect(parsed_pdf.page_count).to eq(nil)
+    end
+  end
+
+  describe 'parses a PDF file' do
+    describe 'a single page file' do
+      include_examples :behave_like_pdf, file: '1_page.pdf', page_count: 1
+    end
+
+    describe 'a multi page pdf file' do
+      include_examples :behave_like_pdf, file: '2_pages.pdf', page_count: 2
+    end
+
+    describe 'a multi page pdf file with content' do
+      include_examples :behave_like_pdf, file: '10_pages.pdf', page_count: 10
+    end
+  end
+end
data/spec/parsers/zip_parser_spec.rb
ADDED
@@ -0,0 +1,68 @@
+require 'spec_helper'
+
+describe FormatParser::ZIPParser do
+  it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
+    fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    expect(result).not_to be_nil
+
+    expect(result.format).to eq(:zip)
+    expect(result.nature).to eq(:archive)
+    expect(result.entries.length).to eq(0xFFFF + 1)
+
+    entry = result.entries.fetch(5674)
+    expect(entry.type).to eq(:file)
+    expect(entry.size).to eq(47)
+    expect(entry.filename).to eq('file-0005674.txt')
+  end
+
+  it 'parses a ZIP archive with a few files' do
+    fixture_path = fixtures_dir + '/ZIP/arch_few_entries.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    expect(result).not_to be_nil
+
+    expect(result.format).to eq(:zip)
+    expect(result.nature).to eq(:archive)
+    expect(result.entries.length).to eq(3)
+  end
+
+  it 'correctly identifies an empty directory' do
+    fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    expect(result).not_to be_nil
+
+    expect(result.format).to eq(:zip)
+    expect(result.nature).to eq(:archive)
+    expect(result.entries.length).to eq(3)
+
+    dir_entry = result.entries.last
+    expect(dir_entry.filename).to eq('папочка/')
+    expect(dir_entry.type).to eq(:directory)
+  end
+
+  it 'returns a result that has a usable JSON representation' do
+    fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    json_repr = JSON.pretty_generate(result)
+
+    json_parsed_repr = JSON.parse(json_repr, symbolize_names: true)
+    expect(json_parsed_repr[:nature]).to eq('archive')
+    expect(json_parsed_repr[:format]).to eq('zip')
+    expect(json_parsed_repr[:entries]).to be_kind_of(Array)
+    expect(json_parsed_repr[:entries].length).to eq(3)
+
+    json_parsed_repr[:entries].each do |e|
+      expect(e[:filename]).to be_kind_of(String)
+      expect(e[:size]).to be_kind_of(Integer)
+      expect(e[:type]).to be_kind_of(String)
+    end
+  end
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: format_parser
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Noah Berman
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-03-
+date: 2018-03-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ks
@@ -159,6 +159,7 @@ files:
 - README.md
 - Rakefile
 - format_parser.gemspec
+- lib/archive.rb
 - lib/attributes_json.rb
 - lib/audio.rb
 - lib/care.rb
@@ -180,10 +181,13 @@ files:
 - lib/parsers/mp3_parser.rb
 - lib/parsers/mp3_parser/id3_v1.rb
 - lib/parsers/mp3_parser/id3_v2.rb
+- lib/parsers/pdf_parser.rb
 - lib/parsers/png_parser.rb
 - lib/parsers/psd_parser.rb
 - lib/parsers/tiff_parser.rb
 - lib/parsers/wav_parser.rb
+- lib/parsers/zip_parser.rb
+- lib/parsers/zip_parser/file_reader.rb
 - lib/read_limiter.rb
 - lib/remote_io.rb
 - lib/video.rb
@@ -201,10 +205,12 @@ files:
 - spec/parsers/jpeg_parser_spec.rb
 - spec/parsers/moov_parser_spec.rb
 - spec/parsers/mp3_parser_spec.rb
+- spec/parsers/pdf_parser_spec.rb
 - spec/parsers/png_parser_spec.rb
 - spec/parsers/psd_parser_spec.rb
 - spec/parsers/tiff_parser_spec.rb
 - spec/parsers/wav_parser_spec.rb
+- spec/parsers/zip_parser_spec.rb
 - spec/read_limiter_spec.rb
 - spec/remote_fetching_spec.rb
 - spec/remote_io_spec.rb