format_parser 0.3.5 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4bc81ce8d64a13fe43d93e7fe24b17ea484e64a4
4
- data.tar.gz: 8d559f3fdd9f0a814b479ade6997a3d6bd7dd4a5
3
+ metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
4
+ data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
5
5
  SHA512:
6
- metadata.gz: d555a6eb131261f3c9bb2418e1c7acf12dfb07de4d3b2af65a8135247b8900ddb8e1c60bc6c0d6985ac3c2bfdfc81c0902396e21436a4b4b7a5fd6eee7dcf4d7
7
- data.tar.gz: c955511e375429fb93068220a1b7d70875a7e71760f14652fd191ad8d1a8b5c20ff20258cda030468acfc53cde070436b8515fe9af7e0f757dbe2c2d66722a03
6
+ metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
7
+ data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
data/lib/archive.rb ADDED
@@ -0,0 +1,36 @@
1
+ require 'ks'
2
+
3
+ module FormatParser
4
+ class Archive
5
+ include FormatParser::AttributesJSON
6
+
7
+ class Entry < Ks.strict(:type, :size, :filename)
8
+ def to_json(*a)
9
+ to_h.to_json(*a)
10
+ end
11
+ end
12
+
13
+ NATURE = :archive
14
+
15
+ # What filetype was recognized? Will contain a non-ambiguous symbol
16
+ # referring to the file format. The symbol can be used as a filename
17
+ # extension safely
18
+ attr_accessor :format
19
+
20
+ # Array of Entry structs
21
+ attr_accessor :entries
22
+
23
+ # If a parser wants to provide any extra information to the caller
24
+ # it can be placed here
25
+ attr_accessor :intrinsics
26
+
27
+ # Only permits assignments via defined accessors
28
+ def initialize(**attributes)
29
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
30
+ end
31
+
32
+ def nature
33
+ NATURE
34
+ end
35
+ end
36
+ end
@@ -14,7 +14,7 @@ module FormatParser::AttributesJSON
14
14
 
15
15
  # Implements a sane default `as_json` for an object
16
16
  # that accessors defined
17
- def as_json(*_maybe_root_option)
17
+ def as_json(root: false)
18
18
  h = {}
19
19
  h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
20
20
  methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
@@ -24,11 +24,15 @@ module FormatParser::AttributesJSON
24
24
  # by the caller
25
25
  h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
26
26
  end
27
+ if root
28
+ {'format_parser_file_info' => h}
29
+ else
30
+ h
31
+ end
27
32
  end
28
33
 
29
- # Implements to_json with sane defaults - like
30
- # support for `JSON.pretty_generate` vs. `JSON.dump`
31
- def to_json(generator_state)
32
- generator_state.generate(as_json)
34
+ # Implements to_json with sane defaults, with or without arguments
35
+ def to_json(*maybe_generator_state)
36
+ as_json(root: false).to_json(*maybe_generator_state)
33
37
  end
34
38
  end
data/lib/document.rb CHANGED
@@ -6,6 +6,7 @@ module FormatParser
6
6
 
7
7
  attr_accessor :format
8
8
  attr_accessor :document_type
9
+ attr_accessor :page_count
9
10
 
10
11
  # Only permits assignments via defined accessors
11
12
  def initialize(**attributes)
data/lib/format_parser.rb CHANGED
@@ -6,6 +6,7 @@ module FormatParser
6
6
  require_relative 'audio'
7
7
  require_relative 'document'
8
8
  require_relative 'video'
9
+ require_relative 'archive'
9
10
  require_relative 'io_utils'
10
11
  require_relative 'read_limiter'
11
12
  require_relative 'remote_io'
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.3.5'
2
+ VERSION = '0.4.0'
3
3
  end
@@ -0,0 +1,76 @@
1
+ class FormatParser::PDFParser
2
+ include FormatParser::IOUtils
3
+
4
+ # First 9 bytes of a PDF should be in this format, according to:
5
+ #
6
+ # https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
7
+ #
8
+ # There are however exceptions, which are left out for now.
9
+ #
10
+ PDF_MARKER = /%PDF-1\.[0-8]{1}/
11
+
12
+ # Page counts have different markers depending on
13
+ # the PDF type. There is not a single common way of solving
14
+ # this. The only way of solving this correctly is by adding
15
+ # different types of PDF's in the specs.
16
+ #
17
+ COUNT_MARKERS = ['Count ']
18
+ EOF_MARKER = '%EOF'
19
+
20
+ def call(io)
21
+ io = FormatParser::IOConstraint.new(io)
22
+
23
+ return unless safe_read(io, 9) =~ PDF_MARKER
24
+
25
+ attributes = scan_for_attributes(io)
26
+
27
+ FormatParser::Document.new(
28
+ format: :pdf,
29
+ page_count: attributes[:page_count]
30
+ )
31
+ end
32
+
33
+ private
34
+
35
+ # Read ahead bytes until one of % or / is reached.
36
+ # A header in a PDF always starts with a /
37
+ # The % is to detect the EOF
38
+ #
39
+ def scan_for_attributes(io)
40
+ result = {}
41
+
42
+ while read = safe_read(io, 1)
43
+ case read
44
+ when '%'
45
+ break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
46
+ when '/'
47
+ find_page_count(io, result)
48
+ end
49
+ end
50
+
51
+ result
52
+ end
53
+
54
+ def find_page_count(io, result)
55
+ COUNT_MARKERS.each do |marker|
56
+ if safe_read(io, marker.size) == marker
57
+ result[:page_count] = read_numbers(io)
58
+ end
59
+ end
60
+ end
61
+
62
+ # Read ahead bytes until no more numbers are found
63
+ # This assumes that the position of io starts at a
64
+ # number
65
+ def read_numbers(io)
66
+ numbers = ''
67
+
68
+ while c = safe_read(io, 1)
69
+ c =~ /\d+/ ? numbers << c : break
70
+ end
71
+
72
+ numbers.to_i
73
+ end
74
+
75
+ FormatParser.register_parser self, natures: :document, formats: :pdf
76
+ end
@@ -0,0 +1,39 @@
1
+ class FormatParser::ZIPParser
2
+ require_relative 'zip_parser/file_reader'
3
+
4
+ def call(io)
5
+ reader = FileReader.new
6
+ entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
7
+
8
+ entries_archive = entries.map do |ze|
9
+ ft = directory?(ze) ? :directory : :file
10
+ decoded_filename = decode_filename(ze)
11
+ FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
12
+ end
13
+
14
+ FormatParser::Archive.new(format: :zip, entries: entries_archive)
15
+ rescue FileReader::Error
16
+ # This is not a ZIP, or a broken ZIP.
17
+ return
18
+ end
19
+
20
+ def directory?(zip_entry)
21
+ # We can do a lap dance here and parse out the individual bit fields
22
+ # from the external attributes, check the OS type that is in the entry
23
+ # to see if it can be interpreted as UNIX or not, and generally have
24
+ # heaps of fun. Instead, we will be frugal.
25
+ zip_entry.filename.end_with?('/')
26
+ end
27
+
28
+ def decode_filename(zip_entry)
29
+ # Check for the EFS bit in the general-purpose flags. If it is set,
30
+ # the entry filename can be treated as UTF-8
31
+ if zip_entry.gp_flags & 0b100000000000 == 0b100000000000
32
+ zip_entry.filename.unpack('U*').pack('U*')
33
+ else
34
+ zip_entry.filename.encode(Encoding::UTF_8, undefined: :replace)
35
+ end
36
+ end
37
+
38
+ FormatParser.register_parser self, natures: [:archive, :document], formats: :zip
39
+ end
@@ -0,0 +1,485 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ # A very barebones ZIP file reader
6
+ class FormatParser::ZIPParser::FileReader
7
+ Error = Class.new(StandardError)
8
+ ReadError = Class.new(Error)
9
+ UnsupportedFeature = Class.new(Error)
10
+ InvalidStructure = Class.new(Error)
11
+ LocalHeaderPending = Class.new(Error) do
12
+ def message
13
+ 'The compressed data offset is not available (local header has not been read)'
14
+ end
15
+ end
16
+ MissingEOCD = Class.new(Error) do
17
+ def message
18
+ 'Could not find the EOCD signature in the buffer - maybe a malformed ZIP file'
19
+ end
20
+ end
21
+
22
+ C_UINT32LE = 'V'
23
+ C_UINT16LE = 'v'
24
+ C_UINT64LE = 'Q<'
25
+
26
+ # To prevent too many tiny reads, read the maximum possible size of end of
27
+ # central directory record upfront (all the fixed fields + at most 0xFFFF
28
+ # bytes of the archive comment)
29
+ MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
30
+ begin
31
+ 4 + # Offset of the start of central directory
32
+ 4 + # Size of the central directory
33
+ 2 + # Number of files in the cdir
34
+ 4 + # End-of-central-directory signature
35
+ 2 + # Number of this disk
36
+ 2 + # Number of disk with the start of cdir
37
+ 2 + # Number of files in the cdir of this disk
38
+ 2 + # The comment size
39
+ 0xFFFF # Maximum comment size
40
+ end
41
+
42
+ # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
43
+ # The maximum size is all the usual items, plus the maximum size
44
+ # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
45
+ MAX_LOCAL_HEADER_SIZE =
46
+ begin
47
+ 4 + # signature
48
+ 2 + # Version needed to extract
49
+ 2 + # gp flags
50
+ 2 + # storage mode
51
+ 2 + # dos time
52
+ 2 + # dos date
53
+ 4 + # CRC32
54
+ 4 + # Comp size
55
+ 4 + # Uncomp size
56
+ 2 + # Filename size
57
+ 2 + # Extra fields size
58
+ 0xFFFF + # Maximum filename size
59
+ 0xFFFF # Maximum extra fields size
60
+ end
61
+
62
+ SIZE_OF_USABLE_EOCD_RECORD =
63
+ begin
64
+ 4 + # Signature
65
+ 2 + # Number of this disk
66
+ 2 + # Number of the disk with the EOCD record
67
+ 2 + # Number of entries in the central directory of this disk
68
+ 2 + # Number of entries in the central directory total
69
+ 4 + # Size of the central directory
70
+ 4 # Start of the central directory offset
71
+ end
72
+
73
+ private_constant :C_UINT32LE, :C_UINT16LE, :C_UINT64LE, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
74
+ :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
75
+
76
+ # Represents a file within the ZIP archive being read
77
+ class ZipEntry
78
+ include FormatParser::AttributesJSON
79
+
80
+ # @return [Fixnum] bit-packed version signature of the program that made the archive
81
+ attr_accessor :made_by
82
+
83
+ # @return [Fixnum] ZIP version support needed to extract this file
84
+ attr_accessor :version_needed_to_extract
85
+
86
+ # @return [Fixnum] bit-packed general purpose flags
87
+ attr_accessor :gp_flags
88
+
89
+ # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
90
+ attr_accessor :storage_mode
91
+
92
+ # @return [Fixnum] the bit-packed DOS time
93
+ attr_accessor :dos_time
94
+
95
+ # @return [Fixnum] the bit-packed DOS date
96
+ attr_accessor :dos_date
97
+
98
+ # @return [Fixnum] the CRC32 checksum of this file
99
+ attr_accessor :crc32
100
+
101
+ # @return [Fixnum] size of compressed file data in the ZIP
102
+ attr_accessor :compressed_size
103
+
104
+ # @return [Fixnum] size of the file once uncompressed
105
+ attr_accessor :uncompressed_size
106
+
107
+ # @return [String] the filename
108
+ attr_accessor :filename
109
+
110
+ # @return [Fixnum] disk number where this file starts
111
+ attr_accessor :disk_number_start
112
+
113
+ # @return [Fixnum] internal attributes of the file
114
+ attr_accessor :internal_attrs
115
+
116
+ # @return [Fixnum] external attributes of the file
117
+ attr_accessor :external_attrs
118
+
119
+ # @return [Fixnum] at what offset the local file header starts
120
+ # in your original IO object
121
+ attr_accessor :local_file_header_offset
122
+
123
+ # @return [String] the file comment
124
+ attr_accessor :comment
125
+
126
+ # @return [Fixnum] at what offset you should start reading
127
+ # for the compressed data in your original IO object
128
+ def compressed_data_offset
129
+ @compressed_data_offset || raise(LocalHeaderPending)
130
+ end
131
+
132
+ # Tells whether the compressed data offset is already known for this entry
133
+ # @return [Boolean]
134
+ def known_offset?
135
+ !@compressed_data_offset.nil?
136
+ end
137
+
138
+ # Tells whether the entry uses a data descriptor (this is defined
139
+ # by bit 3 in the GP flags).
140
+ def uses_data_descriptor?
141
+ (gp_flags & 0x0008) == 0x0008
142
+ end
143
+
144
+ # Sets the offset at which the compressed data for this file starts in the ZIP.
145
+ # By default, the value will be set by the Reader for you. If you use delayed
146
+ # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
147
+ #
148
+ # entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
149
+ # local_file_header_offset: entry.local_header_offset)
150
+ def compressed_data_offset=(offset)
151
+ @compressed_data_offset = offset.to_i
152
+ end
153
+ end
154
+
155
+ # Parse an IO handle to a ZIP archive into an array of Entry objects.
156
+ #
157
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
158
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
159
+ def read_zip_structure(io:)
160
+ zip_file_size = io.size
161
+ eocd_offset = get_eocd_offset(io, zip_file_size)
162
+
163
+ zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
164
+ num_files, cdir_location, cdir_size =
165
+ if zip64_end_of_cdir_location
166
+ num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
167
+ else
168
+ num_files_and_central_directory_offset(io, eocd_offset)
169
+ end
170
+
171
+ log { format('Located the central directory start at %d', cdir_location) }
172
+ seek(io, cdir_location)
173
+
174
+ # In zip_tricks we read the entire central directory _and_ anything behind it.
175
+ # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
176
+ # BUT! in format_parser we avoid unbounded reads, as a matter of fact they are forbidden.
177
+ # So we will again limit ourselves to cdir_size, and we will take a cushion of 1 KB.
178
+ central_directory_str = io.read(cdir_size + 1024)
179
+ central_directory_io = StringIO.new(central_directory_str)
180
+ log do
181
+ format(
182
+ 'Read %d bytes with central directory + EOCD record and locator',
183
+ central_directory_str.bytesize)
184
+ end
185
+
186
+ entries = (0...num_files).map do |entry_n|
187
+ offset_location = cdir_location + central_directory_io.pos
188
+ log do
189
+ format(
190
+ 'Reading the central directory entry %d starting at offset %d',
191
+ entry_n, offset_location)
192
+ end
193
+ read_cdir_entry(central_directory_io)
194
+ end
195
+
196
+ entries
197
+ end
198
+
199
+ private
200
+
201
+ def skip_ahead_2(io)
202
+ skip_ahead_n(io, 2)
203
+ end
204
+
205
+ def skip_ahead_4(io)
206
+ skip_ahead_n(io, 4)
207
+ end
208
+
209
+ def skip_ahead_8(io)
210
+ skip_ahead_n(io, 8)
211
+ end
212
+
213
+ def seek(io, absolute_pos)
214
+ io.seek(absolute_pos)
215
+ unless absolute_pos == io.pos
216
+ raise ReadError,
217
+ "Expected to seek to #{absolute_pos} but only got to #{io.pos}"
218
+ end
219
+ nil
220
+ end
221
+
222
+ def assert_signature(io, signature_magic_number)
223
+ readback = read_4b(io)
224
+ if readback != signature_magic_number
225
+ expected = '0x0' + signature_magic_number.to_s(16)
226
+ actual = '0x0' + readback.to_s(16)
227
+ raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
228
+ end
229
+ end
230
+
231
+ def skip_ahead_n(io, n)
232
+ pos_before = io.pos
233
+ io.seek(io.pos + n)
234
+ pos_after = io.pos
235
+ delta = pos_after - pos_before
236
+ unless delta == n
237
+ raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead"
238
+ end
239
+ nil
240
+ end
241
+
242
+ def read_n(io, n_bytes)
243
+ io.read(n_bytes).tap do |d|
244
+ raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
245
+ unless d.bytesize == n_bytes
246
+ raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
247
+ end
248
+ end
249
+ end
250
+
251
+ def read_2b(io)
252
+ read_n(io, 2).unpack(C_UINT16LE).shift
253
+ end
254
+
255
+ def read_4b(io)
256
+ read_n(io, 4).unpack(C_UINT32LE).shift
257
+ end
258
+
259
+ def read_8b(io)
260
+ read_n(io, 8).unpack(C_UINT64LE).shift
261
+ end
262
+
263
+ def read_cdir_entry(io)
264
+ assert_signature(io, 0x02014b50)
265
+ ZipEntry.new.tap do |e|
266
+ e.made_by = read_2b(io)
267
+ e.version_needed_to_extract = read_2b(io)
268
+ e.gp_flags = read_2b(io)
269
+ e.storage_mode = read_2b(io)
270
+ e.dos_time = read_2b(io)
271
+ e.dos_date = read_2b(io)
272
+ e.crc32 = read_4b(io)
273
+ e.compressed_size = read_4b(io)
274
+ e.uncompressed_size = read_4b(io)
275
+ filename_size = read_2b(io)
276
+ extra_size = read_2b(io)
277
+ comment_len = read_2b(io)
278
+ e.disk_number_start = read_2b(io)
279
+ e.internal_attrs = read_2b(io)
280
+ e.external_attrs = read_4b(io)
281
+ e.local_file_header_offset = read_4b(io)
282
+ e.filename = read_n(io, filename_size)
283
+
284
+ # Extra fields
285
+ extras = read_n(io, extra_size)
286
+ # Comment
287
+ e.comment = read_n(io, comment_len)
288
+
289
+ # Parse out the extra fields
290
+ extra_table = parse_out_extra_fields(extras)
291
+
292
+ # ...of which we really only need the Zip64 extra
293
+ if zip64_extra_contents ||= extra_table[1]
294
+ # If the Zip64 extra is present, we let it override all
295
+ # the values fetched from the conventional header
296
+ zip64_extra = StringIO.new(zip64_extra_contents)
297
+ log do
298
+ format(
299
+ 'Will read Zip64 extra data for %s, %d bytes',
300
+ e.filename, zip64_extra.size)
301
+ end
302
+ # Now here be dragons. The APPNOTE specifies that
303
+ #
304
+ # > The order of the fields in the ZIP64 extended
305
+ # > information record is fixed, but the fields will
306
+ # > only appear if the corresponding Local or Central
307
+ # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
308
+ #
309
+ # It means that before we read this stuff we need to check if the previously-read
310
+ # values are at overflow, and only _then_ proceed to read them. Bah.
311
+ if e.uncompressed_size == 0xFFFFFFFF
312
+ e.uncompressed_size = read_8b(zip64_extra)
313
+ end
314
+ if e.compressed_size == 0xFFFFFFFF
315
+ e.compressed_size = read_8b(zip64_extra)
316
+ end
317
+ if e.local_file_header_offset == 0xFFFFFFFF
318
+ e.local_file_header_offset = read_8b(zip64_extra)
319
+ end
320
+ # Disk number comes last and we can skip it anyway, since we do
321
+ # not support multi-disk archives
322
+ end
323
+ end
324
+ end
325
+
326
+ def get_eocd_offset(file_io, zip_file_size)
327
+ # Start reading from the _comment_ of the zip file (from the very end).
328
+ # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
329
+ implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
330
+ implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
331
+
332
+ # Use a soft seek (we might not be able to get as far behind in the IO as we want)
333
+ # and a soft read (we might not be able to read as many bytes as we want)
334
+ file_io.seek(implied_position_of_eocd_record)
335
+ str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
336
+ raise MissingEOCD unless str_containing_eocd_record
337
+
338
+ eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
339
+
340
+ raise MissingEOCD unless eocd_idx_in_buf
341
+
342
+ eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
343
+ log { format('Found EOCD signature at offset %d', eocd_offset) }
344
+
345
+ eocd_offset
346
+ end
347
+
348
+ # This is tricky. Essentially, we have to scan the maximum possible number
349
+ # of bytes (that the EOCD can theoretically occupy including the comment),
350
+ # and we have to find a combination of:
351
+ # [EOCD signature, <some ZIP metadata>, comment byte size, the comment of
352
+ # that size, eof].
353
+ # The only way I could find to do this was with a sliding window, but
354
+ # there probably is a better way.
355
+ def locate_eocd_signature(in_str)
356
+ # We have to scan from the _very_ tail. We read the very minimum size
357
+ # the EOCD record can have (up to and including the comment size), using
358
+ # a sliding window. Once our end offset matches the comment size we found our
359
+ # EOCD marker.
360
+ unpack_pattern = 'VvvvvVVv'
361
+ minimum_record_size = 22
362
+ end_location = minimum_record_size * -1
363
+ loop do
364
+ # If the window is nil, we have rolled off the start of the string, nothing to do here.
365
+ # We use negative values because if we used positive slice indices
366
+ # we would have to detect the rollover ourselves
367
+ break unless window = in_str[end_location, minimum_record_size]
368
+
369
+ window_location = in_str.bytesize + end_location
370
+ unpacked = window.unpack(unpack_pattern)
371
+ # If we found the signature, pick up the comment size, and check if the size of the window
372
+ # plus that comment size is where we are in the string. If we are - bingo.
373
+ if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
374
+ assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
375
+ # if the comment size is where we should be at - we found our EOCD
376
+ return assumed_eocd_location if assumed_eocd_location == window_location
377
+ end
378
+
379
+ end_location -= 1 # Shift the window back, by one byte, and try again.
380
+ end
381
+ end
382
+
383
+ # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
384
+ # EOCD record in the archive by fixed offsets
385
+ def get_zip64_eocd_location(file_io, eocd_offset)
386
+ zip64_eocd_loc_offset = eocd_offset
387
+ zip64_eocd_loc_offset -= 4 # The signature
388
+ zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
389
+ zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
390
+ zip64_eocd_loc_offset -= 4 # Total number of disks
391
+
392
+ log do
393
+ format(
394
+ 'Will look for the Zip64 EOCD locator signature at offset %d',
395
+ zip64_eocd_loc_offset)
396
+ end
397
+
398
+ # If the offset is negative there is certainly no Zip64 EOCD locator here
399
+ return unless zip64_eocd_loc_offset >= 0
400
+
401
+ file_io.seek(zip64_eocd_loc_offset)
402
+ assert_signature(file_io, 0x07064b50)
403
+
404
+ log { format('Found Zip64 EOCD locator at offset %d', zip64_eocd_loc_offset) }
405
+
406
+ disk_num = read_4b(file_io) # number of the disk
407
+ raise UnsupportedFeature, 'The archive spans multiple disks' if disk_num != 0
408
+ read_8b(file_io)
409
+ rescue ReadError, InvalidStructure
410
+ nil
411
+ end
412
+
413
+ # num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
414
+ def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
415
+ seek(io, zip64_end_of_cdir_location)
416
+
417
+ assert_signature(io, 0x06064b50)
418
+
419
+ zip64_eocdr_size = read_8b(io)
420
+ zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
421
+ zip64_eocdr = StringIO.new(zip64_eocdr)
422
+ skip_ahead_2(zip64_eocdr) # version made by
423
+ skip_ahead_2(zip64_eocdr) # version needed to extract
424
+
425
+ disk_n = read_4b(zip64_eocdr) # number of this disk
426
+ disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
427
+ if disk_n != disk_n_with_eocdr
428
+ raise UnsupportedFeature, 'The archive spans multiple disks'
429
+ end
430
+
431
+ num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
432
+ num_files_total = read_8b(zip64_eocdr) # files total in the central directory
433
+
434
+ if num_files_this_disk != num_files_total
435
+ raise UnsupportedFeature, 'The archive spans multiple disks'
436
+ end
437
+
438
+ log do
439
+ format(
440
+ 'Zip64 EOCD record states there are %d files in the archive',
441
+ num_files_total)
442
+ end
443
+
444
+ central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
445
+ central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
446
+
447
+ [num_files_total, central_dir_offset, central_dir_size]
448
+ end
449
+
450
+ def num_files_and_central_directory_offset(file_io, eocd_offset)
451
+ seek(file_io, eocd_offset)
452
+
453
+ # The size of the EOCD record is known upfront, so use a strict read
454
+ eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
455
+ io = StringIO.new(eocd_record_str)
456
+
457
+ assert_signature(io, 0x06054b50)
458
+ skip_ahead_2(io) # number_of_this_disk
459
+ skip_ahead_2(io) # number of the disk with the EOCD record
460
+ skip_ahead_2(io) # number of entries in the central directory of this disk
461
+ num_files = read_2b(io) # number of entries in the central directory total
462
+ cdir_size = read_4b(io) # size of the central directory
463
+ cdir_offset = read_4b(io) # start of central directory offset
464
+ [num_files, cdir_offset, cdir_size]
465
+ end
466
+
467
+ # Is provided as a stub to be overridden in a subclass if you need it. Will report
468
+ # during various stages of reading. The log message is contained in the return value
469
+ # of `yield` in the method (the log messages are lazy-evaluated).
470
+ def log
471
+ # $stderr.puts(yield)
472
+ end
473
+
474
+ def parse_out_extra_fields(extra_fields_str)
475
+ extra_table = {}
476
+ extras_buf = StringIO.new(extra_fields_str)
477
+ until extras_buf.eof?
478
+ extra_id = read_2b(extras_buf)
479
+ extra_size = read_2b(extras_buf)
480
+ extra_contents = read_n(extras_buf, extra_size)
481
+ extra_table[extra_id] = extra_contents
482
+ end
483
+ extra_table
484
+ end
485
+ end
@@ -13,7 +13,7 @@ describe FormatParser::AttributesJSON do
13
13
  instance.foo = 42
14
14
  instance.bar = 'abcdef'
15
15
  expect(instance.as_json).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
16
- expect(instance.as_json(root: true)).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
16
+ expect(instance.as_json(root: true)).to eq('format_parser_file_info' => {'nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil})
17
17
  end
18
18
 
19
19
  it 'is included into file information types' do
@@ -49,4 +49,22 @@ describe FormatParser::AttributesJSON do
49
49
  standard_output = JSON.dump(instance)
50
50
  expect(pretty_output).not_to eq(standard_output)
51
51
  end
52
+
53
+ it 'provides to_json without arguments' do
54
+ anon_class = Class.new do
55
+ include FormatParser::AttributesJSON
56
+ attr_accessor :foo, :bar, :baz
57
+ def nature
58
+ 'good'
59
+ end
60
+ end
61
+ instance = anon_class.new
62
+ instance.foo = 42
63
+ instance.bar = 'abcdef'
64
+
65
+ output = instance.to_json
66
+ readback = JSON.parse(output, symbolize_names: true)
67
+
68
+ expect(readback).to have_key(:nature)
69
+ end
52
70
  end
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::PDFParser do
4
+ let(:parsed_pdf) {
5
+ subject.call(
6
+ File.open(
7
+ Pathname.new(fixtures_dir).join('PDF').join(pdf_file),
8
+ 'rb'
9
+ )
10
+ )
11
+ }
12
+
13
+ shared_examples :behave_like_pdf do |hash|
14
+ let(:pdf_file) { hash.fetch(:file) }
15
+
16
+ it 'acts as a pdf' do
17
+ expect(parsed_pdf).not_to be_nil
18
+ expect(parsed_pdf.nature).to eq(:document)
19
+ expect(parsed_pdf.format).to eq(:pdf)
20
+ end
21
+
22
+ it 'has a correct page count' do
23
+ expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
24
+ end
25
+ end
26
+
27
+ describe 'a PDF file with a missing version header' do
28
+ let(:pdf_file) { 'not_a.pdf' }
29
+
30
+ it 'does not parse succesfully' do
31
+ expect(parsed_pdf).to be_nil
32
+ end
33
+ end
34
+
35
+ describe 'a PDF file with a correct header but no valid content' do
36
+ let(:pdf_file) { 'broken.pdf' }
37
+
38
+ pending 'does not parse succesfully'
39
+ end
40
+
41
+ describe 'exceeding the PDF read limit' do
42
+ let(:pdf_file) { 'read_limit.pdf' }
43
+
44
+ pending 'does not parse succesfully'
45
+ end
46
+
47
+ describe 'a PDF file with a missing COUNT_HEADER' do
48
+ let(:pdf_file) { 'missing_page_count.pdf' }
49
+
50
+ it 'does not return a page count' do
51
+ expect(parsed_pdf.page_count).to eq(nil)
52
+ end
53
+ end
54
+
55
+ describe 'parses a PDF file' do
56
+ describe 'a single page file' do
57
+ include_examples :behave_like_pdf, file: '1_page.pdf', page_count: 1
58
+ end
59
+
60
+ describe 'a multi page pdf file' do
61
+ include_examples :behave_like_pdf, file: '2_pages.pdf', page_count: 2
62
+ end
63
+
64
+ describe 'a multi page pdf file with content' do
65
+ include_examples :behave_like_pdf, file: '10_pages.pdf', page_count: 10
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::ZIPParser do
4
+ it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
5
+ fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
6
+ fi_io = File.open(fixture_path, 'rb')
7
+
8
+ result = subject.call(fi_io)
9
+ expect(result).not_to be_nil
10
+
11
+ expect(result.format).to eq(:zip)
12
+ expect(result.nature).to eq(:archive)
13
+ expect(result.entries.length).to eq(0xFFFF + 1)
14
+
15
+ entry = result.entries.fetch(5674)
16
+ expect(entry.type).to eq(:file)
17
+ expect(entry.size).to eq(47)
18
+ expect(entry.filename).to eq('file-0005674.txt')
19
+ end
20
+
21
+ it 'parses a ZIP archive with a few files' do
22
+ fixture_path = fixtures_dir + '/ZIP/arch_few_entries.zip'
23
+ fi_io = File.open(fixture_path, 'rb')
24
+
25
+ result = subject.call(fi_io)
26
+ expect(result).not_to be_nil
27
+
28
+ expect(result.format).to eq(:zip)
29
+ expect(result.nature).to eq(:archive)
30
+ expect(result.entries.length).to eq(3)
31
+ end
32
+
33
+ it 'correctly identifies an empty directory' do
34
+ fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
35
+ fi_io = File.open(fixture_path, 'rb')
36
+
37
+ result = subject.call(fi_io)
38
+ expect(result).not_to be_nil
39
+
40
+ expect(result.format).to eq(:zip)
41
+ expect(result.nature).to eq(:archive)
42
+ expect(result.entries.length).to eq(3)
43
+
44
+ dir_entry = result.entries.last
45
+ expect(dir_entry.filename).to eq('папочка/')
46
+ expect(dir_entry.type).to eq(:directory)
47
+ end
48
+
49
+ it 'returns a result that has a usable JSON representation' do
50
+ fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
51
+ fi_io = File.open(fixture_path, 'rb')
52
+
53
+ result = subject.call(fi_io)
54
+ json_repr = JSON.pretty_generate(result)
55
+
56
+ json_parsed_repr = JSON.parse(json_repr, symbolize_names: true)
57
+ expect(json_parsed_repr[:nature]).to eq('archive')
58
+ expect(json_parsed_repr[:format]).to eq('zip')
59
+ expect(json_parsed_repr[:entries]).to be_kind_of(Array)
60
+ expect(json_parsed_repr[:entries].length).to eq(3)
61
+
62
+ json_parsed_repr[:entries].each do |e|
63
+ expect(e[:filename]).to be_kind_of(String)
64
+ expect(e[:size]).to be_kind_of(Integer)
65
+ expect(e[:type]).to be_kind_of(String)
66
+ end
67
+ end
68
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-03-13 00:00:00.000000000 Z
12
+ date: 2018-03-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -159,6 +159,7 @@ files:
159
159
  - README.md
160
160
  - Rakefile
161
161
  - format_parser.gemspec
162
+ - lib/archive.rb
162
163
  - lib/attributes_json.rb
163
164
  - lib/audio.rb
164
165
  - lib/care.rb
@@ -180,10 +181,13 @@ files:
180
181
  - lib/parsers/mp3_parser.rb
181
182
  - lib/parsers/mp3_parser/id3_v1.rb
182
183
  - lib/parsers/mp3_parser/id3_v2.rb
184
+ - lib/parsers/pdf_parser.rb
183
185
  - lib/parsers/png_parser.rb
184
186
  - lib/parsers/psd_parser.rb
185
187
  - lib/parsers/tiff_parser.rb
186
188
  - lib/parsers/wav_parser.rb
189
+ - lib/parsers/zip_parser.rb
190
+ - lib/parsers/zip_parser/file_reader.rb
187
191
  - lib/read_limiter.rb
188
192
  - lib/remote_io.rb
189
193
  - lib/video.rb
@@ -201,10 +205,12 @@ files:
201
205
  - spec/parsers/jpeg_parser_spec.rb
202
206
  - spec/parsers/moov_parser_spec.rb
203
207
  - spec/parsers/mp3_parser_spec.rb
208
+ - spec/parsers/pdf_parser_spec.rb
204
209
  - spec/parsers/png_parser_spec.rb
205
210
  - spec/parsers/psd_parser_spec.rb
206
211
  - spec/parsers/tiff_parser_spec.rb
207
212
  - spec/parsers/wav_parser_spec.rb
213
+ - spec/parsers/zip_parser_spec.rb
208
214
  - spec/read_limiter_spec.rb
209
215
  - spec/remote_fetching_spec.rb
210
216
  - spec/remote_io_spec.rb