format_parser 0.3.5 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4bc81ce8d64a13fe43d93e7fe24b17ea484e64a4
4
- data.tar.gz: 8d559f3fdd9f0a814b479ade6997a3d6bd7dd4a5
3
+ metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
4
+ data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
5
5
  SHA512:
6
- metadata.gz: d555a6eb131261f3c9bb2418e1c7acf12dfb07de4d3b2af65a8135247b8900ddb8e1c60bc6c0d6985ac3c2bfdfc81c0902396e21436a4b4b7a5fd6eee7dcf4d7
7
- data.tar.gz: c955511e375429fb93068220a1b7d70875a7e71760f14652fd191ad8d1a8b5c20ff20258cda030468acfc53cde070436b8515fe9af7e0f757dbe2c2d66722a03
6
+ metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
7
+ data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
data/lib/archive.rb ADDED
@@ -0,0 +1,36 @@
1
+ require 'ks'
2
+
3
+ module FormatParser
4
+ class Archive
5
+ include FormatParser::AttributesJSON
6
+
7
+ class Entry < Ks.strict(:type, :size, :filename)
8
+ def to_json(*a)
9
+ to_h.to_json(*a)
10
+ end
11
+ end
12
+
13
+ NATURE = :archive
14
+
15
+ # What filetype was recognized? Will contain an unambiguous symbol
16
+ # referring to the file format. The symbol can be used as a filename
17
+ # extension safely
18
+ attr_accessor :format
19
+
20
+ # Array of Entry structs
21
+ attr_accessor :entries
22
+
23
+ # If a parser wants to provide any extra information to the caller
24
+ # it can be placed here
25
+ attr_accessor :intrinsics
26
+
27
+ # Only permits assignments via defined accessors
28
+ def initialize(**attributes)
29
+ attributes.map { |(k, v)| public_send("#{k}=", v) }
30
+ end
31
+
32
+ def nature
33
+ NATURE
34
+ end
35
+ end
36
+ end
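
Below is a minimal usage sketch (not part of the diff) of the new Archive value object and its JSON representation, assuming format_parser 0.4.0 is installed; the entry values are illustrative.

require 'format_parser'
require 'json'

# Build an archive result by hand; parsers such as the new ZIPParser do this internally.
entry   = FormatParser::Archive::Entry.new(type: :file, size: 47, filename: 'file-0005674.txt')
archive = FormatParser::Archive.new(format: :zip, entries: [entry])

archive.nature # => :archive
archive.to_json
# => '{"nature":"archive","format":"zip","entries":[{"type":"file","size":47,"filename":"file-0005674.txt"}],"intrinsics":null}'
# (key order may vary)
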
data/lib/attributes_json.rb CHANGED
@@ -14,7 +14,7 @@ module FormatParser::AttributesJSON
14
14
 
15
15
  # Implements a sane default `as_json` for an object
16
16
  # that has accessors defined
17
- def as_json(*_maybe_root_option)
17
+ def as_json(root: false)
18
18
  h = {}
19
19
  h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
20
20
  methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
@@ -24,11 +24,15 @@ module FormatParser::AttributesJSON
24
24
  # by the caller
25
25
  h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
26
26
  end
27
+ if root
28
+ {'format_parser_file_info' => h}
29
+ else
30
+ h
31
+ end
27
32
  end
28
33
 
29
- # Implements to_json with sane defaults - like
30
- # support for `JSON.pretty_generate` vs. `JSON.dump`
31
- def to_json(generator_state)
32
- generator_state.generate(as_json)
34
+ # Implements to_json with sane defaults, with or without arguments
35
+ def to_json(*maybe_generator_state)
36
+ as_json(root: false).to_json(*maybe_generator_state)
33
37
  end
34
38
  end
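
A hedged sketch (not part of the diff) of the new root: option and the argument-tolerant to_json, using an anonymous includer like the one in the updated specs.

require 'format_parser'
require 'json'

klass = Class.new do
  include FormatParser::AttributesJSON
  attr_accessor :foo
  def nature
    'good'
  end
end

instance = klass.new
instance.foo = 42

instance.as_json               # => {"nature"=>"good", "foo"=>42}
instance.as_json(root: true)   # => {"format_parser_file_info"=>{"nature"=>"good", "foo"=>42}}
instance.to_json               # => '{"nature":"good","foo":42}' -- no generator state needed anymore
JSON.pretty_generate(instance) # still works; the generator state is forwarded to the underlying Hash
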
data/lib/document.rb CHANGED
@@ -6,6 +6,7 @@ module FormatParser
6
6
 
7
7
  attr_accessor :format
8
8
  attr_accessor :document_type
9
+ attr_accessor :page_count
9
10
 
10
11
  # Only permits assignments via defined accessors
11
12
  def initialize(**attributes)
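
Illustrative only (not part of the diff): the new page_count attribute behaves like the existing accessors, assuming the gem is loaded.

require 'format_parser'

doc = FormatParser::Document.new(format: :pdf, page_count: 2)
doc.page_count # => 2
doc.nature     # => :document
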
data/lib/format_parser.rb CHANGED
@@ -6,6 +6,7 @@ module FormatParser
6
6
  require_relative 'audio'
7
7
  require_relative 'document'
8
8
  require_relative 'video'
9
+ require_relative 'archive'
9
10
  require_relative 'io_utils'
10
11
  require_relative 'read_limiter'
11
12
  require_relative 'remote_io'
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.3.5'
2
+ VERSION = '0.4.0'
3
3
  end
data/lib/parsers/pdf_parser.rb ADDED
@@ -0,0 +1,76 @@
1
+ class FormatParser::PDFParser
2
+ include FormatParser::IOUtils
3
+
4
+ # First 9 bytes of a PDF should be in this format, according to:
5
+ #
6
+ # https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
7
+ #
8
+ # There are however exceptions, which are left out for now.
9
+ #
10
+ PDF_MARKER = /%PDF-1\.[0-8]{1}/
11
+
12
+ # Page counts have different markers depending on
13
+ # the PDF type. There is not a single common way of solving
14
+ # this. The only way of solving this correctly is by adding
15
+ # different types of PDFs in the specs.
16
+ #
17
+ COUNT_MARKERS = ['Count ']
18
+ EOF_MARKER = '%EOF'
19
+
20
+ def call(io)
21
+ io = FormatParser::IOConstraint.new(io)
22
+
23
+ return unless safe_read(io, 9) =~ PDF_MARKER
24
+
25
+ attributes = scan_for_attributes(io)
26
+
27
+ FormatParser::Document.new(
28
+ format: :pdf,
29
+ page_count: attributes[:page_count]
30
+ )
31
+ end
32
+
33
+ private
34
+
35
+ # Read ahead bytes until one of % or / is reached.
36
+ # A header in a PDF always starts with a /
37
+ # The % is to detect the EOF
38
+ #
39
+ def scan_for_attributes(io)
40
+ result = {}
41
+
42
+ while read = safe_read(io, 1)
43
+ case read
44
+ when '%'
45
+ break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
46
+ when '/'
47
+ find_page_count(io, result)
48
+ end
49
+ end
50
+
51
+ result
52
+ end
53
+
54
+ def find_page_count(io, result)
55
+ COUNT_MARKERS.each do |marker|
56
+ if safe_read(io, marker.size) == marker
57
+ result[:page_count] = read_numbers(io)
58
+ end
59
+ end
60
+ end
61
+
62
+ # Read ahead bytes until no more numbers are found
63
+ # This assumes that the position of io starts at a
64
+ # number
65
+ def read_numbers(io)
66
+ numbers = ''
67
+
68
+ while c = safe_read(io, 1)
69
+ c =~ /\d+/ ? numbers << c : break
70
+ end
71
+
72
+ numbers.to_i
73
+ end
74
+
75
+ FormatParser.register_parser self, natures: :document, formats: :pdf
76
+ end
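
A hedged usage sketch (not part of the diff) of the new PDF parser, mirroring the specs further below; the file path is hypothetical.

require 'format_parser'

# The parser accepts any binary IO; it wraps it in an IOConstraint itself.
result = FormatParser::PDFParser.new.call(File.open('/tmp/example.pdf', 'rb'))

# result is nil when the first 9 bytes do not match the %PDF-1.x marker.
if result
  result.format     # => :pdf
  result.nature     # => :document
  result.page_count # => e.g. 10, or nil when no "Count " marker was found
end
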
data/lib/parsers/zip_parser.rb ADDED
@@ -0,0 +1,39 @@
1
+ class FormatParser::ZIPParser
2
+ require_relative 'zip_parser/file_reader'
3
+
4
+ def call(io)
5
+ reader = FileReader.new
6
+ entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
7
+
8
+ entries_archive = entries.map do |ze|
9
+ ft = directory?(ze) ? :directory : :file
10
+ decoded_filename = decode_filename(ze)
11
+ FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
12
+ end
13
+
14
+ FormatParser::Archive.new(format: :zip, entries: entries_archive)
15
+ rescue FileReader::Error
16
+ # This is not a ZIP, or a broken ZIP.
17
+ return
18
+ end
19
+
20
+ def directory?(zip_entry)
21
+ # We can do a lap dance here and parse out the individual bit fields
22
+ # from the external attributes, check the OS type that is in the entry
23
+ # to see if it can be interpreted as UNIX or not, and generally have
24
+ # heaps of fun. Instead, we will be frugal.
25
+ zip_entry.filename.end_with?('/')
26
+ end
27
+
28
+ def decode_filename(zip_entry)
29
+ # Check for the EFS bit in the general-purpose flags. If it is set,
30
+ # the entry filename can be treated as UTF-8
31
+ if zip_entry.gp_flags & 0b100000000000 == 0b100000000000
32
+ zip_entry.filename.unpack('U*').pack('U*')
33
+ else
34
+ zip_entry.filename.encode(Encoding::UTF_8, undefined: :replace)
35
+ end
36
+ end
37
+
38
+ FormatParser.register_parser self, natures: [:archive, :document], formats: :zip
39
+ end
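
A hedged usage sketch (not part of the diff) of the new ZIP parser, invoked directly as in the specs; the path is hypothetical.

require 'format_parser'

archive = FormatParser::ZIPParser.new.call(File.open('/tmp/example.zip', 'rb'))

# nil is returned for IOs that cannot be read as a ZIP (FileReader::Error is rescued).
if archive
  archive.format # => :zip
  archive.nature # => :archive
  archive.entries.each do |entry|
    # entry.type is :file or :directory; filenames are decoded to UTF-8 when the EFS bit is set
    puts format('%s %s (%d bytes)', entry.type, entry.filename, entry.size)
  end
end
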
data/lib/parsers/zip_parser/file_reader.rb ADDED
@@ -0,0 +1,485 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ # A very barebones ZIP file reader
6
+ class FormatParser::ZIPParser::FileReader
7
+ Error = Class.new(StandardError)
8
+ ReadError = Class.new(Error)
9
+ UnsupportedFeature = Class.new(Error)
10
+ InvalidStructure = Class.new(Error)
11
+ LocalHeaderPending = Class.new(Error) do
12
+ def message
13
+ 'The compressed data offset is not available (local header has not been read)'
14
+ end
15
+ end
16
+ MissingEOCD = Class.new(Error) do
17
+ def message
18
+ 'Could not find the EOCD signature in the buffer - maybe a malformed ZIP file'
19
+ end
20
+ end
21
+
22
+ C_UINT32LE = 'V'
23
+ C_UINT16LE = 'v'
24
+ C_UINT64LE = 'Q<'
25
+
26
+ # To prevent too many tiny reads, read the maximum possible size of end of
27
+ # central directory record upfront (all the fixed fields + at most 0xFFFF
28
+ # bytes of the archive comment)
29
+ MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
30
+ begin
31
+ 4 + # Offset of the start of central directory
32
+ 4 + # Size of the central directory
33
+ 2 + # Number of files in the cdir
34
+ 4 + # End-of-central-directory signature
35
+ 2 + # Number of this disk
36
+ 2 + # Number of disk with the start of cdir
37
+ 2 + # Number of files in the cdir of this disk
38
+ 2 + # The comment size
39
+ 0xFFFF # Maximum comment size
40
+ end
41
+
42
+ # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
43
+ # The maximum size is all the usual items, plus the maximum size
44
+ # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
45
+ MAX_LOCAL_HEADER_SIZE =
46
+ begin
47
+ 4 + # signature
48
+ 2 + # Version needed to extract
49
+ 2 + # gp flags
50
+ 2 + # storage mode
51
+ 2 + # dos time
52
+ 2 + # dos date
53
+ 4 + # CRC32
54
+ 4 + # Comp size
55
+ 4 + # Uncomp size
56
+ 2 + # Filename size
57
+ 2 + # Extra fields size
58
+ 0xFFFF + # Maximum filename size
59
+ 0xFFFF # Maximum extra fields size
60
+ end
61
+
62
+ SIZE_OF_USABLE_EOCD_RECORD =
63
+ begin
64
+ 4 + # Signature
65
+ 2 + # Number of this disk
66
+ 2 + # Number of the disk with the EOCD record
67
+ 2 + # Number of entries in the central directory of this disk
68
+ 2 + # Number of entries in the central directory total
69
+ 4 + # Size of the central directory
70
+ 4 # Start of the central directory offset
71
+ end
72
+
73
+ private_constant :C_UINT32LE, :C_UINT16LE, :C_UINT64LE, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
74
+ :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
75
+
76
+ # Represents a file within the ZIP archive being read
77
+ class ZipEntry
78
+ include FormatParser::AttributesJSON
79
+
80
+ # @return [Fixnum] bit-packed version signature of the program that made the archive
81
+ attr_accessor :made_by
82
+
83
+ # @return [Fixnum] ZIP version support needed to extract this file
84
+ attr_accessor :version_needed_to_extract
85
+
86
+ # @return [Fixnum] bit-packed general purpose flags
87
+ attr_accessor :gp_flags
88
+
89
+ # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
90
+ attr_accessor :storage_mode
91
+
92
+ # @return [Fixnum] the bit-packed DOS time
93
+ attr_accessor :dos_time
94
+
95
+ # @return [Fixnum] the bit-packed DOS date
96
+ attr_accessor :dos_date
97
+
98
+ # @return [Fixnum] the CRC32 checksum of this file
99
+ attr_accessor :crc32
100
+
101
+ # @return [Fixnum] size of compressed file data in the ZIP
102
+ attr_accessor :compressed_size
103
+
104
+ # @return [Fixnum] size of the file once uncompressed
105
+ attr_accessor :uncompressed_size
106
+
107
+ # @return [String] the filename
108
+ attr_accessor :filename
109
+
110
+ # @return [Fixnum] disk number where this file starts
111
+ attr_accessor :disk_number_start
112
+
113
+ # @return [Fixnum] internal attributes of the file
114
+ attr_accessor :internal_attrs
115
+
116
+ # @return [Fixnum] external attributes of the file
117
+ attr_accessor :external_attrs
118
+
119
+ # @return [Fixnum] at what offset the local file header starts
120
+ # in your original IO object
121
+ attr_accessor :local_file_header_offset
122
+
123
+ # @return [String] the file comment
124
+ attr_accessor :comment
125
+
126
+ # @return [Fixnum] at what offset you should start reading
127
+ # for the compressed data in your original IO object
128
+ def compressed_data_offset
129
+ @compressed_data_offset || raise(LocalHeaderPending)
130
+ end
131
+
132
+ # Tells whether the compressed data offset is already known for this entry
133
+ # @return [Boolean]
134
+ def known_offset?
135
+ !@compressed_data_offset.nil?
136
+ end
137
+
138
+ # Tells whether the entry uses a data descriptor (this is defined
139
+ # by bit 3 in the GP flags).
140
+ def uses_data_descriptor?
141
+ (gp_flags & 0x0008) == 0x0008
142
+ end
143
+
144
+ # Sets the offset at which the compressed data for this file starts in the ZIP.
145
+ # By default, the value will be set by the Reader for you. If you use delayed
146
+ # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
147
+ #
148
+ # entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
149
+ # local_file_header_offset: entry.local_header_offset)
150
+ def compressed_data_offset=(offset)
151
+ @compressed_data_offset = offset.to_i
152
+ end
153
+ end
154
+
155
+ # Parse an IO handle to a ZIP archive into an array of Entry objects.
156
+ #
157
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
158
+ # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
159
+ def read_zip_structure(io:)
160
+ zip_file_size = io.size
161
+ eocd_offset = get_eocd_offset(io, zip_file_size)
162
+
163
+ zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
164
+ num_files, cdir_location, cdir_size =
165
+ if zip64_end_of_cdir_location
166
+ num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
167
+ else
168
+ num_files_and_central_directory_offset(io, eocd_offset)
169
+ end
170
+
171
+ log { format('Located the central directory start at %d', cdir_location) }
172
+ seek(io, cdir_location)
173
+
174
+ # In zip_tricks we read the entire central directory _and_ anything behind it.
175
+ # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
176
+ # BUT! in format_parser we avoid unbounded reads, as a matter of fact they are forbidden.
177
+ # So we will again limit ourselves to cdir_size, and we will take a cushion of 1 KB.
178
+ central_directory_str = io.read(cdir_size + 1024)
179
+ central_directory_io = StringIO.new(central_directory_str)
180
+ log do
181
+ format(
182
+ 'Read %d bytes with central directory + EOCD record and locator',
183
+ central_directory_str.bytesize)
184
+ end
185
+
186
+ entries = (0...num_files).map do |entry_n|
187
+ offset_location = cdir_location + central_directory_io.pos
188
+ log do
189
+ format(
190
+ 'Reading the central directory entry %d starting at offset %d',
191
+ entry_n, offset_location)
192
+ end
193
+ read_cdir_entry(central_directory_io)
194
+ end
195
+
196
+ entries
197
+ end
198
+
199
+ private
200
+
201
+ def skip_ahead_2(io)
202
+ skip_ahead_n(io, 2)
203
+ end
204
+
205
+ def skip_ahead_4(io)
206
+ skip_ahead_n(io, 4)
207
+ end
208
+
209
+ def skip_ahead_8(io)
210
+ skip_ahead_n(io, 8)
211
+ end
212
+
213
+ def seek(io, absolute_pos)
214
+ io.seek(absolute_pos)
215
+ unless absolute_pos == io.pos
216
+ raise ReadError,
217
+ "Expected to seek to #{absolute_pos} but only got to #{io.pos}"
218
+ end
219
+ nil
220
+ end
221
+
222
+ def assert_signature(io, signature_magic_number)
223
+ readback = read_4b(io)
224
+ if readback != signature_magic_number
225
+ expected = '0x0' + signature_magic_number.to_s(16)
226
+ actual = '0x0' + readback.to_s(16)
227
+ raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
228
+ end
229
+ end
230
+
231
+ def skip_ahead_n(io, n)
232
+ pos_before = io.pos
233
+ io.seek(io.pos + n)
234
+ pos_after = io.pos
235
+ delta = pos_after - pos_before
236
+ unless delta == n
237
+ raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead"
238
+ end
239
+ nil
240
+ end
241
+
242
+ def read_n(io, n_bytes)
243
+ io.read(n_bytes).tap do |d|
244
+ raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
245
+ unless d.bytesize == n_bytes
246
+ raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
247
+ end
248
+ end
249
+ end
250
+
251
+ def read_2b(io)
252
+ read_n(io, 2).unpack(C_UINT16LE).shift
253
+ end
254
+
255
+ def read_4b(io)
256
+ read_n(io, 4).unpack(C_UINT32LE).shift
257
+ end
258
+
259
+ def read_8b(io)
260
+ read_n(io, 8).unpack(C_UINT64LE).shift
261
+ end
262
+
263
+ def read_cdir_entry(io)
264
+ assert_signature(io, 0x02014b50)
265
+ ZipEntry.new.tap do |e|
266
+ e.made_by = read_2b(io)
267
+ e.version_needed_to_extract = read_2b(io)
268
+ e.gp_flags = read_2b(io)
269
+ e.storage_mode = read_2b(io)
270
+ e.dos_time = read_2b(io)
271
+ e.dos_date = read_2b(io)
272
+ e.crc32 = read_4b(io)
273
+ e.compressed_size = read_4b(io)
274
+ e.uncompressed_size = read_4b(io)
275
+ filename_size = read_2b(io)
276
+ extra_size = read_2b(io)
277
+ comment_len = read_2b(io)
278
+ e.disk_number_start = read_2b(io)
279
+ e.internal_attrs = read_2b(io)
280
+ e.external_attrs = read_4b(io)
281
+ e.local_file_header_offset = read_4b(io)
282
+ e.filename = read_n(io, filename_size)
283
+
284
+ # Extra fields
285
+ extras = read_n(io, extra_size)
286
+ # Comment
287
+ e.comment = read_n(io, comment_len)
288
+
289
+ # Parse out the extra fields
290
+ extra_table = parse_out_extra_fields(extras)
291
+
292
+ # ...of which we really only need the Zip64 extra
293
+ if zip64_extra_contents ||= extra_table[1]
294
+ # If the Zip64 extra is present, we let it override all
295
+ # the values fetched from the conventional header
296
+ zip64_extra = StringIO.new(zip64_extra_contents)
297
+ log do
298
+ format(
299
+ 'Will read Zip64 extra data for %s, %d bytes',
300
+ e.filename, zip64_extra.size)
301
+ end
302
+ # Now here be dragons. The APPNOTE specifies that
303
+ #
304
+ # > The order of the fields in the ZIP64 extended
305
+ # > information record is fixed, but the fields will
306
+ # > only appear if the corresponding Local or Central
307
+ # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
308
+ #
309
+ # It means that before we read this stuff we need to check if the previously-read
310
+ # values are at overflow, and only _then_ proceed to read them. Bah.
311
+ if e.uncompressed_size == 0xFFFFFFFF
312
+ e.uncompressed_size = read_8b(zip64_extra)
313
+ end
314
+ if e.compressed_size == 0xFFFFFFFF
315
+ e.compressed_size = read_8b(zip64_extra)
316
+ end
317
+ if e.local_file_header_offset == 0xFFFFFFFF
318
+ e.local_file_header_offset = read_8b(zip64_extra)
319
+ end
320
+ # Disk number comes last and we can skip it anyway, since we do
321
+ # not support multi-disk archives
322
+ end
323
+ end
324
+ end
325
+
326
+ def get_eocd_offset(file_io, zip_file_size)
327
+ # Start reading from the _comment_ of the zip file (from the very end).
328
+ # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
329
+ implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
330
+ implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
331
+
332
+ # Use a soft seek (we might not be able to get as far behind in the IO as we want)
333
+ # and a soft read (we might not be able to read as many bytes as we want)
334
+ file_io.seek(implied_position_of_eocd_record)
335
+ str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
336
+ raise MissingEOCD unless str_containing_eocd_record
337
+
338
+ eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
339
+
340
+ raise MissingEOCD unless eocd_idx_in_buf
341
+
342
+ eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
343
+ log { format('Found EOCD signature at offset %d', eocd_offset) }
344
+
345
+ eocd_offset
346
+ end
347
+
348
+ # This is tricky. Essentially, we have to scan the maximum possible number
349
+ # of bytes (that the EOCD can theoretically occupy including the comment),
350
+ # and we have to find a combination of:
351
+ # [EOCD signature, <some ZIP metadata>, comment byte size, the comment of
352
+ # that size, eof].
353
+ # The only way I could find to do this was with a sliding window, but
354
+ # there probably is a better way.
355
+ def locate_eocd_signature(in_str)
356
+ # We have to scan from the _very_ tail. We read the very minimum size
357
+ # the EOCD record can have (up to and including the comment size), using
358
+ # a sliding window. Once our end offset matches the comment size we found our
359
+ # EOCD marker.
360
+ unpack_pattern = 'VvvvvVVv'
361
+ minimum_record_size = 22
362
+ end_location = minimum_record_size * -1
363
+ loop do
364
+ # If the window is nil, we have rolled off the start of the string, nothing to do here.
365
+ # We use negative values because if we used positive slice indices
366
+ # we would have to detect the rollover ourselves
367
+ break unless window = in_str[end_location, minimum_record_size]
368
+
369
+ window_location = in_str.bytesize + end_location
370
+ unpacked = window.unpack(unpack_pattern)
371
+ # If we found the signature, pick up the comment size, and check if the size of the window
372
+ # plus that comment size is where we are in the string. If we are - bingo.
373
+ if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
374
+ assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
375
+ # if the comment size is where we should be at - we found our EOCD
376
+ return assumed_eocd_location if assumed_eocd_location == window_location
377
+ end
378
+
379
+ end_location -= 1 # Shift the window back, by one byte, and try again.
380
+ end
381
+ end
382
+
383
+ # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
384
+ # EOCD record in the archive by fixed offsets
385
+ def get_zip64_eocd_location(file_io, eocd_offset)
386
+ zip64_eocd_loc_offset = eocd_offset
387
+ zip64_eocd_loc_offset -= 4 # The signature
388
+ zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
389
+ zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
390
+ zip64_eocd_loc_offset -= 4 # Total number of disks
391
+
392
+ log do
393
+ format(
394
+ 'Will look for the Zip64 EOCD locator signature at offset %d',
395
+ zip64_eocd_loc_offset)
396
+ end
397
+
398
+ # If the offset is negative there is certainly no Zip64 EOCD locator here
399
+ return unless zip64_eocd_loc_offset >= 0
400
+
401
+ file_io.seek(zip64_eocd_loc_offset)
402
+ assert_signature(file_io, 0x07064b50)
403
+
404
+ log { format('Found Zip64 EOCD locator at offset %d', zip64_eocd_loc_offset) }
405
+
406
+ disk_num = read_4b(file_io) # number of the disk
407
+ raise UnsupportedFeature, 'The archive spans multiple disks' if disk_num != 0
408
+ read_8b(file_io)
409
+ rescue ReadError, InvalidStructure
410
+ nil
411
+ end
412
+
413
+ # Rubocop: Assignment Branch Condition size for num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
414
+ def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
415
+ seek(io, zip64_end_of_cdir_location)
416
+
417
+ assert_signature(io, 0x06064b50)
418
+
419
+ zip64_eocdr_size = read_8b(io)
420
+ zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
421
+ zip64_eocdr = StringIO.new(zip64_eocdr)
422
+ skip_ahead_2(zip64_eocdr) # version made by
423
+ skip_ahead_2(zip64_eocdr) # version needed to extract
424
+
425
+ disk_n = read_4b(zip64_eocdr) # number of this disk
426
+ disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
427
+ if disk_n != disk_n_with_eocdr
428
+ raise UnsupportedFeature, 'The archive spans multiple disks'
429
+ end
430
+
431
+ num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
432
+ num_files_total = read_8b(zip64_eocdr) # files total in the central directory
433
+
434
+ if num_files_this_disk != num_files_total
435
+ raise UnsupportedFeature, 'The archive spans multiple disks'
436
+ end
437
+
438
+ log do
439
+ format(
440
+ 'Zip64 EOCD record states there are %d files in the archive',
441
+ num_files_total)
442
+ end
443
+
444
+ central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
445
+ central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
446
+
447
+ [num_files_total, central_dir_offset, central_dir_size]
448
+ end
449
+
450
+ def num_files_and_central_directory_offset(file_io, eocd_offset)
451
+ seek(file_io, eocd_offset)
452
+
453
+ # The size of the EOCD record is known upfront, so use a strict read
454
+ eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
455
+ io = StringIO.new(eocd_record_str)
456
+
457
+ assert_signature(io, 0x06054b50)
458
+ skip_ahead_2(io) # number_of_this_disk
459
+ skip_ahead_2(io) # number of the disk with the EOCD record
460
+ skip_ahead_2(io) # number of entries in the central directory of this disk
461
+ num_files = read_2b(io) # number of entries in the central directory total
462
+ cdir_size = read_4b(io) # size of the central directory
463
+ cdir_offset = read_4b(io) # start of central directory offset
464
+ [num_files, cdir_offset, cdir_size]
465
+ end
466
+
467
+ # Is provided as a stub to be overridden in a subclass if you need it. Will report
468
+ # during various stages of reading. The log message is contained in the return value
469
+ # of `yield` in the method (the log messages are lazy-evaluated).
470
+ def log
471
+ # $stderr.puts(yield)
472
+ end
473
+
474
+ def parse_out_extra_fields(extra_fields_str)
475
+ extra_table = {}
476
+ extras_buf = StringIO.new(extra_fields_str)
477
+ until extras_buf.eof?
478
+ extra_id = read_2b(extras_buf)
479
+ extra_size = read_2b(extras_buf)
480
+ extra_contents = read_n(extras_buf, extra_size)
481
+ extra_table[extra_id] = extra_contents
482
+ end
483
+ extra_table
484
+ end
485
+ end
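
For completeness, a hedged sketch (not part of the diff) of driving the low-level reader directly; any IO responding to #seek, #read, #pos and #size will do, so a plain File works. The path is hypothetical.

require 'format_parser'

reader = FormatParser::ZIPParser::FileReader.new
File.open('/tmp/example.zip', 'rb') do |io|
  entries = reader.read_zip_structure(io: io)
  entries.each do |e|
    puts format('%s: %d -> %d bytes, mode %d', e.filename, e.compressed_size, e.uncompressed_size, e.storage_mode)
  end
end
# A non-ZIP input raises FileReader::MissingEOCD (a subclass of FileReader::Error),
# which is what the ZIPParser above rescues in order to return nil.
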
@@ -13,7 +13,7 @@ describe FormatParser::AttributesJSON do
13
13
  instance.foo = 42
14
14
  instance.bar = 'abcdef'
15
15
  expect(instance.as_json).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
16
- expect(instance.as_json(root: true)).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
16
+ expect(instance.as_json(root: true)).to eq('format_parser_file_info' => {'nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil})
17
17
  end
18
18
 
19
19
  it 'is included into file information types' do
@@ -49,4 +49,22 @@ describe FormatParser::AttributesJSON do
49
49
  standard_output = JSON.dump(instance)
50
50
  expect(pretty_output).not_to eq(standard_output)
51
51
  end
52
+
53
+ it 'provides to_json without arguments' do
54
+ anon_class = Class.new do
55
+ include FormatParser::AttributesJSON
56
+ attr_accessor :foo, :bar, :baz
57
+ def nature
58
+ 'good'
59
+ end
60
+ end
61
+ instance = anon_class.new
62
+ instance.foo = 42
63
+ instance.bar = 'abcdef'
64
+
65
+ output = instance.to_json
66
+ readback = JSON.parse(output, symbolize_names: true)
67
+
68
+ expect(readback).to have_key(:nature)
69
+ end
52
70
  end
data/spec/parsers/pdf_parser_spec.rb ADDED
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::PDFParser do
4
+ let(:parsed_pdf) {
5
+ subject.call(
6
+ File.open(
7
+ Pathname.new(fixtures_dir).join('PDF').join(pdf_file),
8
+ 'rb'
9
+ )
10
+ )
11
+ }
12
+
13
+ shared_examples :behave_like_pdf do |hash|
14
+ let(:pdf_file) { hash.fetch(:file) }
15
+
16
+ it 'acts as a pdf' do
17
+ expect(parsed_pdf).not_to be_nil
18
+ expect(parsed_pdf.nature).to eq(:document)
19
+ expect(parsed_pdf.format).to eq(:pdf)
20
+ end
21
+
22
+ it 'has a correct page count' do
23
+ expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
24
+ end
25
+ end
26
+
27
+ describe 'a PDF file with a missing version header' do
28
+ let(:pdf_file) { 'not_a.pdf' }
29
+
30
+ it 'does not parse successfully' do
31
+ expect(parsed_pdf).to be_nil
32
+ end
33
+ end
34
+
35
+ describe 'a PDF file with a correct header but no valid content' do
36
+ let(:pdf_file) { 'broken.pdf' }
37
+
38
+ pending 'does not parse successfully'
39
+ end
40
+
41
+ describe 'exceeding the PDF read limit' do
42
+ let(:pdf_file) { 'read_limit.pdf' }
43
+
44
+ pending 'does not parse successfully'
45
+ end
46
+
47
+ describe 'a PDF file with a missing COUNT_HEADER' do
48
+ let(:pdf_file) { 'missing_page_count.pdf' }
49
+
50
+ it 'does not return a page count' do
51
+ expect(parsed_pdf.page_count).to eq(nil)
52
+ end
53
+ end
54
+
55
+ describe 'parses a PDF file' do
56
+ describe 'a single page file' do
57
+ include_examples :behave_like_pdf, file: '1_page.pdf', page_count: 1
58
+ end
59
+
60
+ describe 'a multi page pdf file' do
61
+ include_examples :behave_like_pdf, file: '2_pages.pdf', page_count: 2
62
+ end
63
+
64
+ describe 'a multi page pdf file with content' do
65
+ include_examples :behave_like_pdf, file: '10_pages.pdf', page_count: 10
66
+ end
67
+ end
68
+ end
data/spec/parsers/zip_parser_spec.rb ADDED
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::ZIPParser do
4
+ it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
5
+ fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
6
+ fi_io = File.open(fixture_path, 'rb')
7
+
8
+ result = subject.call(fi_io)
9
+ expect(result).not_to be_nil
10
+
11
+ expect(result.format).to eq(:zip)
12
+ expect(result.nature).to eq(:archive)
13
+ expect(result.entries.length).to eq(0xFFFF + 1)
14
+
15
+ entry = result.entries.fetch(5674)
16
+ expect(entry.type).to eq(:file)
17
+ expect(entry.size).to eq(47)
18
+ expect(entry.filename).to eq('file-0005674.txt')
19
+ end
20
+
21
+ it 'parses a ZIP archive with a few files' do
22
+ fixture_path = fixtures_dir + '/ZIP/arch_few_entries.zip'
23
+ fi_io = File.open(fixture_path, 'rb')
24
+
25
+ result = subject.call(fi_io)
26
+ expect(result).not_to be_nil
27
+
28
+ expect(result.format).to eq(:zip)
29
+ expect(result.nature).to eq(:archive)
30
+ expect(result.entries.length).to eq(3)
31
+ end
32
+
33
+ it 'correctly identifies an empty directory' do
34
+ fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
35
+ fi_io = File.open(fixture_path, 'rb')
36
+
37
+ result = subject.call(fi_io)
38
+ expect(result).not_to be_nil
39
+
40
+ expect(result.format).to eq(:zip)
41
+ expect(result.nature).to eq(:archive)
42
+ expect(result.entries.length).to eq(3)
43
+
44
+ dir_entry = result.entries.last
45
+ expect(dir_entry.filename).to eq('папочка/')
46
+ expect(dir_entry.type).to eq(:directory)
47
+ end
48
+
49
+ it 'returns a result that has a usable JSON representation' do
50
+ fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
51
+ fi_io = File.open(fixture_path, 'rb')
52
+
53
+ result = subject.call(fi_io)
54
+ json_repr = JSON.pretty_generate(result)
55
+
56
+ json_parsed_repr = JSON.parse(json_repr, symbolize_names: true)
57
+ expect(json_parsed_repr[:nature]).to eq('archive')
58
+ expect(json_parsed_repr[:format]).to eq('zip')
59
+ expect(json_parsed_repr[:entries]).to be_kind_of(Array)
60
+ expect(json_parsed_repr[:entries].length).to eq(3)
61
+
62
+ json_parsed_repr[:entries].each do |e|
63
+ expect(e[:filename]).to be_kind_of(String)
64
+ expect(e[:size]).to be_kind_of(Integer)
65
+ expect(e[:type]).to be_kind_of(String)
66
+ end
67
+ end
68
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-03-13 00:00:00.000000000 Z
12
+ date: 2018-03-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -159,6 +159,7 @@ files:
159
159
  - README.md
160
160
  - Rakefile
161
161
  - format_parser.gemspec
162
+ - lib/archive.rb
162
163
  - lib/attributes_json.rb
163
164
  - lib/audio.rb
164
165
  - lib/care.rb
@@ -180,10 +181,13 @@ files:
180
181
  - lib/parsers/mp3_parser.rb
181
182
  - lib/parsers/mp3_parser/id3_v1.rb
182
183
  - lib/parsers/mp3_parser/id3_v2.rb
184
+ - lib/parsers/pdf_parser.rb
183
185
  - lib/parsers/png_parser.rb
184
186
  - lib/parsers/psd_parser.rb
185
187
  - lib/parsers/tiff_parser.rb
186
188
  - lib/parsers/wav_parser.rb
189
+ - lib/parsers/zip_parser.rb
190
+ - lib/parsers/zip_parser/file_reader.rb
187
191
  - lib/read_limiter.rb
188
192
  - lib/remote_io.rb
189
193
  - lib/video.rb
@@ -201,10 +205,12 @@ files:
201
205
  - spec/parsers/jpeg_parser_spec.rb
202
206
  - spec/parsers/moov_parser_spec.rb
203
207
  - spec/parsers/mp3_parser_spec.rb
208
+ - spec/parsers/pdf_parser_spec.rb
204
209
  - spec/parsers/png_parser_spec.rb
205
210
  - spec/parsers/psd_parser_spec.rb
206
211
  - spec/parsers/tiff_parser_spec.rb
207
212
  - spec/parsers/wav_parser_spec.rb
213
+ - spec/parsers/zip_parser_spec.rb
208
214
  - spec/read_limiter_spec.rb
209
215
  - spec/remote_fetching_spec.rb
210
216
  - spec/remote_io_spec.rb