format_parser 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -1
- data/README.md +15 -2
- data/lib/archive.rb +3 -5
- data/lib/format_parser.rb +4 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/io_utils.rb +3 -0
- data/lib/parsers/flac_parser.rb +75 -0
- data/lib/parsers/moov_parser.rb +17 -8
- data/lib/parsers/moov_parser/decoder.rb +10 -1
- data/lib/parsers/mp3_parser.rb +9 -0
- data/lib/parsers/zip_parser.rb +11 -1
- data/lib/parsers/zip_parser/file_reader.rb +24 -29
- data/lib/parsers/zip_parser/office_formats.rb +51 -0
- data/spec/esoteric_formats_spec.rb +10 -0
- data/spec/parsers/flac_parser_spec.rb +63 -0
- data/spec/parsers/moov_parser_spec.rb +39 -4
- data/spec/parsers/zip_parser_spec.rb +24 -0
- metadata +7 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 7ff294de8e6759d2705cabe93f548ffb6733a121410e1b3c9dc929de52356745
|
|
4
|
+
data.tar.gz: 663240675efd9e8e5425f27333098e8661ef5e8dbbbd38bab91fff1e605bbefc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ed7ea153adf28d2efb9352a880d9f4a146ebdad53e282f75d36c2dee2887740d727c25373d33c83dc8ab863e680e12175bb19109cbecf2347950a6fc0233c385
|
|
7
|
+
data.tar.gz: cf6d750264fdfe5a9520c3bb9a8a02b856ba825b709435661b3fcd614f118fc3709d3ba88a25bff0a142f1d03d4fc2389f63740606ff5df4e2db837977a66488
|
data/.gitignore
CHANGED
data/README.md
CHANGED
|
@@ -12,7 +12,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
|
|
|
12
12
|
|
|
13
13
|
## Currently supported filetypes:
|
|
14
14
|
|
|
15
|
-
`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, DPX, AIFF, WAV, FDX, MOV, MP4`
|
|
15
|
+
`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, PDF, DPX, AIFF, WAV, FDX, MOV, MP4, M4A, FLAC, DOCX, PPTX, XLSX`
|
|
16
16
|
|
|
17
17
|
...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
|
|
18
18
|
|
|
@@ -101,7 +101,7 @@ class MyParser
|
|
|
101
101
|
def call(io)
|
|
102
102
|
# ... do some parsing with `io`
|
|
103
103
|
magic_bytes = io.read(4)
|
|
104
|
-
return unless magic_bytes
|
|
104
|
+
return unless magic_bytes == 'XBMP'
|
|
105
105
|
# ... more parsing code
|
|
106
106
|
# ...and return the FileInformation::Image object with the metadata.
|
|
107
107
|
FormatParser::Image.new(
|
|
@@ -171,3 +171,16 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
|
171
171
|
|
|
172
172
|
### CR2
|
|
173
173
|
- CR2 examples are downloaded from http://www.rawsamples.ch/ and are Creative Commons licensed.
|
|
174
|
+
|
|
175
|
+
### FLAC
|
|
176
|
+
- atc_fixture_vbr.flac is a converted version of the MP3 with the same name
|
|
177
|
+
- c_11k16bitpcm.flac is a converted version of the WAV with the same name
|
|
178
|
+
|
|
179
|
+
### M4A
|
|
180
|
+
- fixture.m4a was created by one of the project maintainers and is MIT licensed
|
|
181
|
+
|
|
182
|
+
### ZIP
|
|
183
|
+
- The .zip fixture files have been created by the project maintainers
|
|
184
|
+
|
|
185
|
+
### .docx
|
|
186
|
+
- The .docx files were generated by the project maintainers
|
data/lib/archive.rb
CHANGED
|
@@ -10,7 +10,9 @@ module FormatParser
|
|
|
10
10
|
end
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
# Lots of Office and LibreOffice documents are in fact packaged into
|
|
14
|
+
# ZIPs, as are .epub files. We make `nature` customisable for this occasion
|
|
15
|
+
attr_accessor :nature
|
|
14
16
|
|
|
15
17
|
# What filetype was recognized? Will contain a non-ambiguous symbol
|
|
16
18
|
# referring to the file format. The symbol can be used as a filename
|
|
@@ -28,9 +30,5 @@ module FormatParser
|
|
|
28
30
|
def initialize(**attributes)
|
|
29
31
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
|
30
32
|
end
|
|
31
|
-
|
|
32
|
-
def nature
|
|
33
|
-
NATURE
|
|
34
|
-
end
|
|
35
33
|
end
|
|
36
34
|
end
|
data/lib/format_parser.rb
CHANGED
|
@@ -93,6 +93,10 @@ module FormatParser
|
|
|
93
93
|
rescue IOUtils::InvalidRead
|
|
94
94
|
# There was not enough data for this parser to work on,
|
|
95
95
|
# and it triggered an error
|
|
96
|
+
rescue IOUtils::MalformedFile
|
|
97
|
+
# Unexpected input was encountered during the parsing of
|
|
98
|
+
# a file. This might indicate either a malicious or a
|
|
99
|
+
# corrupted file.
|
|
96
100
|
rescue ReadLimiter::BudgetExceeded
|
|
97
101
|
# The parser tried to read too much - most likely the file structure
|
|
98
102
|
# caused the parser to go off-track. Strictly speaking we should log this
|
data/lib/io_utils.rb
CHANGED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
class FormatParser::FLACParser
|
|
2
|
+
include FormatParser::IOUtils
|
|
3
|
+
|
|
4
|
+
MAGIC_BYTES = 4
|
|
5
|
+
MAGIC_BYTE_STRING = 'fLaC'
|
|
6
|
+
BLOCK_HEADER_BYTES = 4
|
|
7
|
+
|
|
8
|
+
def bytestring_to_int(s)
|
|
9
|
+
s.unpack('B*')[0].to_i(2)
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def call(io)
|
|
13
|
+
magic_bytes = safe_read(io, MAGIC_BYTES)
|
|
14
|
+
|
|
15
|
+
return unless magic_bytes == MAGIC_BYTE_STRING
|
|
16
|
+
|
|
17
|
+
# Skip info we don't need
|
|
18
|
+
safe_skip(io, BLOCK_HEADER_BYTES)
|
|
19
|
+
|
|
20
|
+
minimum_block_size = bytestring_to_int(safe_read(io, 2))
|
|
21
|
+
|
|
22
|
+
if minimum_block_size < 16
|
|
23
|
+
raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
maximum_block_size = bytestring_to_int(safe_read(io, 2))
|
|
27
|
+
|
|
28
|
+
if maximum_block_size < minimum_block_size
|
|
29
|
+
raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
minimum_frame_size = bytestring_to_int(safe_read(io, 3))
|
|
33
|
+
maximum_frame_size = bytestring_to_int(safe_read(io, 3))
|
|
34
|
+
|
|
35
|
+
# Audio info comes in irregularly sized (i.e. not 8-bit) chunks,
|
|
36
|
+
# so read total as bitstring and parse separately
|
|
37
|
+
audio_info = safe_read(io, 8).unpack('B*')[0]
|
|
38
|
+
|
|
39
|
+
# sample rate is 20 bits
|
|
40
|
+
sample_rate = audio_info.slice!(0..19).to_i(2)
|
|
41
|
+
|
|
42
|
+
raise MalformedFile, 'FLAC file sample rate must be larger than 0' unless sample_rate > 0
|
|
43
|
+
|
|
44
|
+
# Number of channels is 3 bits
|
|
45
|
+
# Header contains number of channels minus one, so add one
|
|
46
|
+
num_channels = audio_info.slice!(0..2).to_i(2) + 1
|
|
47
|
+
|
|
48
|
+
# Bits per sample is 5 bits
|
|
49
|
+
# Header contains number of bits per sample minus one, so add one
|
|
50
|
+
bits_per_sample = audio_info.slice!(0..4).to_i(2) + 1
|
|
51
|
+
|
|
52
|
+
# Total samples is 36 bits
|
|
53
|
+
total_samples = audio_info.slice!(0..35).to_i(2)
|
|
54
|
+
|
|
55
|
+
# Division is safe due to check above
|
|
56
|
+
duration = total_samples.to_f / sample_rate
|
|
57
|
+
|
|
58
|
+
FormatParser::Audio.new(
|
|
59
|
+
format: :flac,
|
|
60
|
+
num_audio_channels: num_channels,
|
|
61
|
+
audio_sample_rate_hz: sample_rate,
|
|
62
|
+
media_duration_seconds: duration,
|
|
63
|
+
media_duration_frames: total_samples,
|
|
64
|
+
intrinsics: {
|
|
65
|
+
bits_per_sample: bits_per_sample,
|
|
66
|
+
minimum_frame_size: minimum_frame_size,
|
|
67
|
+
maximum_frame_size: maximum_frame_size,
|
|
68
|
+
minimum_block_size: minimum_block_size,
|
|
69
|
+
maximum_block_size: maximum_block_size
|
|
70
|
+
}
|
|
71
|
+
)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
FormatParser.register_parser self, natures: :audio, formats: :flac
|
|
75
|
+
end
|
data/lib/parsers/moov_parser.rb
CHANGED
|
@@ -52,19 +52,28 @@ class FormatParser::MOOVParser
|
|
|
52
52
|
media_duration_s = duration / timescale.to_f
|
|
53
53
|
end
|
|
54
54
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
55
|
+
# M4A only contains audio, while MP4 and friends can contain video.
|
|
56
|
+
if format_from_moov_type(file_type) == :m4a
|
|
57
|
+
FormatParser::Audio.new(
|
|
58
|
+
format: format_from_moov_type(file_type),
|
|
59
|
+
media_duration_seconds: media_duration_s,
|
|
60
|
+
intrinsics: atom_tree,
|
|
61
|
+
)
|
|
62
|
+
else
|
|
63
|
+
FormatParser::Video.new(
|
|
64
|
+
format: format_from_moov_type(file_type),
|
|
65
|
+
width_px: width,
|
|
66
|
+
height_px: height,
|
|
67
|
+
media_duration_seconds: media_duration_s,
|
|
68
|
+
intrinsics: atom_tree,
|
|
69
|
+
)
|
|
70
|
+
end
|
|
62
71
|
end
|
|
63
72
|
|
|
64
73
|
private
|
|
65
74
|
|
|
66
75
|
def format_from_moov_type(file_type)
|
|
67
|
-
FTYP_MAP.fetch(file_type, :mov)
|
|
76
|
+
FTYP_MAP.fetch(file_type.downcase, :mov)
|
|
68
77
|
end
|
|
69
78
|
|
|
70
79
|
# An MPEG4/MOV/M4A will start with the "ftyp" atom. The atom must have a length
|
|
@@ -181,6 +181,10 @@ class FormatParser::MOOVParser::Decoder
|
|
|
181
181
|
}
|
|
182
182
|
end
|
|
183
183
|
|
|
184
|
+
def parse_meta_atom(io, atom_size)
|
|
185
|
+
parse_hdlr_atom(io, atom_size)
|
|
186
|
+
end
|
|
187
|
+
|
|
184
188
|
def parse_atom_fields_per_type(io, atom_size, atom_type)
|
|
185
189
|
if respond_to?("parse_#{atom_type}_atom", true)
|
|
186
190
|
send("parse_#{atom_type}_atom", io, atom_size)
|
|
@@ -189,6 +193,11 @@ class FormatParser::MOOVParser::Decoder
|
|
|
189
193
|
end
|
|
190
194
|
end
|
|
191
195
|
|
|
196
|
+
def parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
|
|
197
|
+
parse_atom_fields_per_type(io, atom_size_sans_header, atom_type)
|
|
198
|
+
extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type])
|
|
199
|
+
end
|
|
200
|
+
|
|
192
201
|
# Recursive descent parser - will drill down to atoms which
|
|
193
202
|
# we know are permitted to have leaf/branch atoms within itself,
|
|
194
203
|
# and will attempt to recover the data fields for leaf atoms
|
|
@@ -215,7 +224,7 @@ class FormatParser::MOOVParser::Decoder
|
|
|
215
224
|
atom_size_sans_header = atom_size - size_of_atom_type_and_size
|
|
216
225
|
|
|
217
226
|
children, fields = if KNOWN_BRANCH_AND_LEAF_ATOM_TYPES.include?(atom_type)
|
|
218
|
-
parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type)
|
|
227
|
+
parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
|
|
219
228
|
elsif KNOWN_BRANCH_ATOM_TYPES.include?(atom_type)
|
|
220
229
|
[extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type]), nil]
|
|
221
230
|
else # Assume leaf atom
|
data/lib/parsers/mp3_parser.rb
CHANGED
|
@@ -23,7 +23,16 @@ class FormatParser::MP3Parser
|
|
|
23
23
|
# Default frame size for mp3
|
|
24
24
|
SAMPLES_PER_FRAME = 1152
|
|
25
25
|
|
|
26
|
+
# For some edge cases
|
|
27
|
+
ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
|
|
28
|
+
|
|
26
29
|
def call(io)
|
|
30
|
+
# Special case: some ZIPs (Office documents) were detected as MP3s.
|
|
31
|
+
# To avoid having that happen, we check for the PKZIP signature -
|
|
32
|
+
# local entry header signature - at the very start of the file
|
|
33
|
+
return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
|
|
34
|
+
io.seek(0)
|
|
35
|
+
|
|
27
36
|
# Read the last 128 bytes which might contain ID3v1
|
|
28
37
|
id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
|
|
29
38
|
# Read the header bytes that might contain ID3v1
|
data/lib/parsers/zip_parser.rb
CHANGED
|
@@ -1,17 +1,27 @@
|
|
|
1
1
|
class FormatParser::ZIPParser
|
|
2
2
|
require_relative 'zip_parser/file_reader'
|
|
3
|
+
require_relative 'zip_parser/office_formats'
|
|
4
|
+
|
|
5
|
+
include OfficeFormats
|
|
3
6
|
|
|
4
7
|
def call(io)
|
|
5
8
|
reader = FileReader.new
|
|
6
9
|
entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
|
|
7
10
|
|
|
11
|
+
filenames_set = Set.new
|
|
8
12
|
entries_archive = entries.map do |ze|
|
|
9
13
|
ft = directory?(ze) ? :directory : :file
|
|
10
14
|
decoded_filename = decode_filename(ze)
|
|
15
|
+
filenames_set << decoded_filename
|
|
11
16
|
FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
|
|
12
17
|
end
|
|
13
18
|
|
|
14
|
-
|
|
19
|
+
if office_document?(filenames_set)
|
|
20
|
+
office_format = office_file_format_from_entry_set(filenames_set)
|
|
21
|
+
FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
|
|
22
|
+
else
|
|
23
|
+
FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
|
|
24
|
+
end
|
|
15
25
|
rescue FileReader::Error
|
|
16
26
|
# This is not a ZIP, or a broken ZIP.
|
|
17
27
|
return
|
|
@@ -159,7 +159,6 @@ class FormatParser::ZIPParser::FileReader
|
|
|
159
159
|
def read_zip_structure(io:)
|
|
160
160
|
zip_file_size = io.size
|
|
161
161
|
eocd_offset = get_eocd_offset(io, zip_file_size)
|
|
162
|
-
|
|
163
162
|
zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
|
|
164
163
|
num_files, cdir_location, cdir_size =
|
|
165
164
|
if zip64_end_of_cdir_location
|
|
@@ -345,39 +344,35 @@ class FormatParser::ZIPParser::FileReader
|
|
|
345
344
|
eocd_offset
|
|
346
345
|
end
|
|
347
346
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
347
|
+
def all_indices_of_substr_in_str(of_substring, in_string)
|
|
348
|
+
last_i = 0
|
|
349
|
+
found_at_indices = []
|
|
350
|
+
while last_i = in_string.index(of_substring, last_i)
|
|
351
|
+
found_at_indices << last_i
|
|
352
|
+
last_i += of_substring.bytesize
|
|
353
|
+
end
|
|
354
|
+
found_at_indices
|
|
355
|
+
end
|
|
356
|
+
|
|
355
357
|
def locate_eocd_signature(in_str)
|
|
356
|
-
|
|
357
|
-
# the EOCD record can have (up to and including the comment size), using
|
|
358
|
-
# a sliding window. Once our end offset matches the comment size we found our
|
|
359
|
-
# EOCD marker.
|
|
358
|
+
eocd_signature = [0x06054b50].pack('V')
|
|
360
359
|
unpack_pattern = 'VvvvvVVv'
|
|
361
360
|
minimum_record_size = 22
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
#
|
|
367
|
-
break
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
|
|
374
|
-
assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
|
|
375
|
-
# if the comment size is where we should be at - we found our EOCD
|
|
376
|
-
return assumed_eocd_location if assumed_eocd_location == window_location
|
|
361
|
+
str_size = in_str.bytesize
|
|
362
|
+
indices = all_indices_of_substr_in_str(eocd_signature, in_str)
|
|
363
|
+
indices.each do |check_at|
|
|
364
|
+
maybe_record = in_str[check_at..str_size]
|
|
365
|
+
# If the record is smaller than the minimum - we will never recover anything
|
|
366
|
+
break if maybe_record.bytesize < minimum_record_size
|
|
367
|
+
signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
|
|
368
|
+
|
|
369
|
+
# Check the only condition for the match
|
|
370
|
+
if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
|
|
371
|
+
return check_at # Found the EOCD marker location
|
|
377
372
|
end
|
|
378
|
-
|
|
379
|
-
end_location -= 1 # Shift the window back, by one byte, and try again.
|
|
380
373
|
end
|
|
374
|
+
# If we haven't caught anything, return nil deliberately instead of returning the last statement
|
|
375
|
+
nil
|
|
381
376
|
end
|
|
382
377
|
|
|
383
378
|
# Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Based on an unscientific sample of 63 documents I could find on my hard drive,
|
|
2
|
+
# all docx/pptx/xlsx files contain, at the minimum, the following files:
|
|
3
|
+
#
|
|
4
|
+
# [Content_Types].xml
|
|
5
|
+
# _rels/.rels
|
|
6
|
+
# docProps/core.xml
|
|
7
|
+
# docProps/app.xml
|
|
8
|
+
#
|
|
9
|
+
# Additionally, per file type, they contain the following:
|
|
10
|
+
#
|
|
11
|
+
# word/document.xml
|
|
12
|
+
# xl/workbook.xml
|
|
13
|
+
# ppt/presentation.xml
|
|
14
|
+
#
|
|
15
|
+
# These are sufficient to say with certainty that a ZIP is in fact an Office document.
|
|
16
|
+
# Also that unscientific sample revealed that I came to dislike MS Office so much as to
|
|
17
|
+
# only have 63 documents on my entire workstation.
|
|
18
|
+
#
|
|
19
|
+
# We do not perform the actual _decoding_ of the Office documents here, because to read
|
|
20
|
+
# their contents we need to:
|
|
21
|
+
#
|
|
22
|
+
# * inflate the compressed part files (potential for deflate bombs)
|
|
23
|
+
# * parse the document XML (potential for XML parser exploitation)
|
|
24
|
+
#
|
|
25
|
+
# which are real threats and require adequate mitigation. For our purposes the
|
|
26
|
+
# token detection of specific filenames should be enough to say with certainty
|
|
27
|
+
# that a document _is_ an Office document, and not just a ZIP.
|
|
28
|
+
module FormatParser::ZIPParser::OfficeFormats
|
|
29
|
+
OFFICE_MARKER_FILES = Set.new([
|
|
30
|
+
'[Content_Types].xml',
|
|
31
|
+
'_rels/.rels',
|
|
32
|
+
'docProps/core.xml',
|
|
33
|
+
'docProps/app.xml',
|
|
34
|
+
])
|
|
35
|
+
|
|
36
|
+
def office_document?(filenames_set)
|
|
37
|
+
OFFICE_MARKER_FILES.subset?(filenames_set)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def office_file_format_from_entry_set(filenames_set)
|
|
41
|
+
if filenames_set.include?('word/document.xml')
|
|
42
|
+
:docx
|
|
43
|
+
elsif filenames_set.include?('xl/workbook.xml')
|
|
44
|
+
:xlsx
|
|
45
|
+
elsif filenames_set.include?('ppt/presentation.xml')
|
|
46
|
+
:pptx
|
|
47
|
+
else
|
|
48
|
+
:unknown
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe 'Parsing esoteric files and files causing ambiguous detection' do
|
|
4
|
+
it 'correctly parses the test .docx files as Office docs' do
|
|
5
|
+
docx_path = fixtures_dir + '/ZIP/10.docx'
|
|
6
|
+
result = FormatParser.parse(File.open(docx_path, 'rb'))
|
|
7
|
+
expect(result).not_to be_nil
|
|
8
|
+
expect(result.nature).to eq(:document)
|
|
9
|
+
end
|
|
10
|
+
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe FormatParser::FLACParser do
|
|
4
|
+
it 'decodes and estimates duration for the atc_fixture_vbr FLAC File' do
|
|
5
|
+
fpath = fixtures_dir + 'FLAC/atc_fixture_vbr.flac'
|
|
6
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
|
7
|
+
|
|
8
|
+
expect(parsed).not_to be_nil
|
|
9
|
+
|
|
10
|
+
expect(parsed.nature).to eq(:audio)
|
|
11
|
+
expect(parsed.format).to eq(:flac)
|
|
12
|
+
expect(parsed.num_audio_channels).to eq(2)
|
|
13
|
+
expect(parsed.audio_sample_rate_hz).to eq(44100)
|
|
14
|
+
expect(parsed.intrinsics).not_to be_nil
|
|
15
|
+
expect(parsed.media_duration_frames).to eq(33810)
|
|
16
|
+
expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'decodes and estimates duration for the 16bit FLAC File' do
|
|
20
|
+
fpath = fixtures_dir + 'FLAC/c_11k16bitpcm.flac'
|
|
21
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
|
22
|
+
|
|
23
|
+
expect(parsed).not_to be_nil
|
|
24
|
+
|
|
25
|
+
expect(parsed.nature).to eq(:audio)
|
|
26
|
+
expect(parsed.format).to eq(:flac)
|
|
27
|
+
expect(parsed.intrinsics[:bits_per_sample]).to eq(16)
|
|
28
|
+
expect(parsed.num_audio_channels).to eq(1)
|
|
29
|
+
expect(parsed.audio_sample_rate_hz).to eq(11025)
|
|
30
|
+
expect(parsed.media_duration_frames).to eq(152267)
|
|
31
|
+
expect(parsed.media_duration_seconds).to be_within(0.01).of(13.81)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it 'raises error on parsing an invalid file' do
|
|
35
|
+
fpath = fixtures_dir + 'FLAC/invalid.flac'
|
|
36
|
+
|
|
37
|
+
expect {
|
|
38
|
+
subject.call(File.open(fpath, 'rb'))
|
|
39
|
+
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
it 'raises error on parsing a file with an invalid block size' do
|
|
43
|
+
fpath = fixtures_dir + 'FLAC/invalid_minimum_block_size.flac'
|
|
44
|
+
|
|
45
|
+
expect {
|
|
46
|
+
subject.call(File.open(fpath, 'rb'))
|
|
47
|
+
}.to raise_error(FormatParser::IOUtils::MalformedFile)
|
|
48
|
+
|
|
49
|
+
fpath = fixtures_dir + 'FLAC/invalid_maximum_block_size.flac'
|
|
50
|
+
|
|
51
|
+
expect {
|
|
52
|
+
subject.call(File.open(fpath, 'rb'))
|
|
53
|
+
}.to raise_error(FormatParser::IOUtils::MalformedFile)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it 'raises an error when sample rate is 0' do
|
|
57
|
+
fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
|
|
58
|
+
|
|
59
|
+
expect {
|
|
60
|
+
subject.call(File.open(fpath, 'rb'))
|
|
61
|
+
}.to raise_error(FormatParser::IOUtils::MalformedFile)
|
|
62
|
+
end
|
|
63
|
+
end
|
|
@@ -29,9 +29,37 @@ describe FormatParser::MOOVParser do
|
|
|
29
29
|
end
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
-
Dir.glob(fixtures_dir + '/MOOV
|
|
33
|
-
it "is able to parse #{File.basename(
|
|
34
|
-
result = subject.call(File.open(
|
|
32
|
+
Dir.glob(fixtures_dir + '/MOOV/**/*.m4a').sort.each do |m4a_path|
|
|
33
|
+
it "is able to parse #{File.basename(m4a_path)}" do
|
|
34
|
+
result = subject.call(File.open(m4a_path, 'rb'))
|
|
35
|
+
|
|
36
|
+
expect(result).not_to be_nil
|
|
37
|
+
expect(result.nature).to eq(:audio)
|
|
38
|
+
expect(result.media_duration_seconds).to be_kind_of(Float)
|
|
39
|
+
expect(result.media_duration_seconds).to be > 0
|
|
40
|
+
|
|
41
|
+
expect(result.intrinsics).not_to be_nil
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
Dir.glob(fixtures_dir + '/MOOV/**/*.mov').sort.each do |mov_path|
|
|
46
|
+
it "is able to parse #{File.basename(mov_path)}" do
|
|
47
|
+
result = subject.call(File.open(mov_path, 'rb'))
|
|
48
|
+
|
|
49
|
+
expect(result).not_to be_nil
|
|
50
|
+
expect(result.nature).to eq(:video)
|
|
51
|
+
expect(result.width_px).to be > 0
|
|
52
|
+
expect(result.height_px).to be > 0
|
|
53
|
+
expect(result.media_duration_seconds).to be_kind_of(Float)
|
|
54
|
+
expect(result.media_duration_seconds).to be > 0
|
|
55
|
+
|
|
56
|
+
expect(result.intrinsics).not_to be_nil
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
Dir.glob(fixtures_dir + '/MOOV/**/*.mp4').sort.each do |mp4_path|
|
|
61
|
+
it "is able to parse #{File.basename(mp4_path)}" do
|
|
62
|
+
result = subject.call(File.open(mp4_path, 'rb'))
|
|
35
63
|
|
|
36
64
|
expect(result).not_to be_nil
|
|
37
65
|
expect(result.nature).to eq(:video)
|
|
@@ -44,7 +72,14 @@ describe FormatParser::MOOVParser do
|
|
|
44
72
|
end
|
|
45
73
|
end
|
|
46
74
|
|
|
47
|
-
it 'parses an M4A file and provides the necessary metadata'
|
|
75
|
+
it 'parses an M4A file and provides the necessary metadata' do
|
|
76
|
+
m4a_path = fixtures_dir + '/MOOV/M4A/fixture.m4a'
|
|
77
|
+
|
|
78
|
+
result = subject.call(File.open(m4a_path, 'rb'))
|
|
79
|
+
expect(result).not_to be_nil
|
|
80
|
+
expect(result.nature).to eq(:audio)
|
|
81
|
+
expect(result.format).to eq(:m4a)
|
|
82
|
+
end
|
|
48
83
|
|
|
49
84
|
it 'parses a MOV file and provides the necessary metadata' do
|
|
50
85
|
mov_path = fixtures_dir + '/MOOV/MOV/Test_Circular_ProRes422.mov'
|
|
@@ -46,6 +46,30 @@ describe FormatParser::ZIPParser do
|
|
|
46
46
|
expect(dir_entry.type).to eq(:directory)
|
|
47
47
|
end
|
|
48
48
|
|
|
49
|
+
it 'correctly identifies Word documents' do
|
|
50
|
+
fixture_path = fixtures_dir + '/ZIP/10.docx'
|
|
51
|
+
fi_io = File.open(fixture_path, 'rb')
|
|
52
|
+
|
|
53
|
+
result = subject.call(fi_io)
|
|
54
|
+
expect(result.nature).to eq(:document)
|
|
55
|
+
expect(result.format).to eq(:docx)
|
|
56
|
+
|
|
57
|
+
fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
|
|
58
|
+
fi_io = File.open(fixture_path, 'rb')
|
|
59
|
+
|
|
60
|
+
result = subject.call(fi_io)
|
|
61
|
+
expect(result.nature).to eq(:document)
|
|
62
|
+
expect(result.format).to eq(:docx)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
it 'is able to handle specific fuzzed input' do
|
|
66
|
+
r = Random.new(354)
|
|
67
|
+
1024.times do
|
|
68
|
+
random_blob = StringIO.new(r.bytes(512 * 1024))
|
|
69
|
+
subject.call(random_blob) # If there is an error in one of the parsers the example will raise too
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
49
73
|
it 'returns a result that has a usable JSON representation' do
|
|
50
74
|
fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
|
|
51
75
|
fi_io = File.open(fixture_path, 'rb')
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: format_parser
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Noah Berman
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: exe
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2018-03
|
|
12
|
+
date: 2018-04-03 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: ks
|
|
@@ -174,6 +174,7 @@ files:
|
|
|
174
174
|
- lib/parsers/dpx_parser.rb
|
|
175
175
|
- lib/parsers/exif_parser.rb
|
|
176
176
|
- lib/parsers/fdx_parser.rb
|
|
177
|
+
- lib/parsers/flac_parser.rb
|
|
177
178
|
- lib/parsers/gif_parser.rb
|
|
178
179
|
- lib/parsers/jpeg_parser.rb
|
|
179
180
|
- lib/parsers/moov_parser.rb
|
|
@@ -188,11 +189,13 @@ files:
|
|
|
188
189
|
- lib/parsers/wav_parser.rb
|
|
189
190
|
- lib/parsers/zip_parser.rb
|
|
190
191
|
- lib/parsers/zip_parser/file_reader.rb
|
|
192
|
+
- lib/parsers/zip_parser/office_formats.rb
|
|
191
193
|
- lib/read_limiter.rb
|
|
192
194
|
- lib/remote_io.rb
|
|
193
195
|
- lib/video.rb
|
|
194
196
|
- spec/attributes_json_spec.rb
|
|
195
197
|
- spec/care_spec.rb
|
|
198
|
+
- spec/esoteric_formats_spec.rb
|
|
196
199
|
- spec/file_information_spec.rb
|
|
197
200
|
- spec/format_parser_spec.rb
|
|
198
201
|
- spec/io_utils_spec.rb
|
|
@@ -201,6 +204,7 @@ files:
|
|
|
201
204
|
- spec/parsers/dpx_parser_spec.rb
|
|
202
205
|
- spec/parsers/exif_parser_spec.rb
|
|
203
206
|
- spec/parsers/fdx_parser_spec.rb
|
|
207
|
+
- spec/parsers/flac_parser_spec.rb
|
|
204
208
|
- spec/parsers/gif_parser_spec.rb
|
|
205
209
|
- spec/parsers/jpeg_parser_spec.rb
|
|
206
210
|
- spec/parsers/moov_parser_spec.rb
|
|
@@ -236,7 +240,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
236
240
|
version: '0'
|
|
237
241
|
requirements: []
|
|
238
242
|
rubyforge_project:
|
|
239
|
-
rubygems_version: 2.
|
|
243
|
+
rubygems_version: 2.7.3
|
|
240
244
|
signing_key:
|
|
241
245
|
specification_version: 4
|
|
242
246
|
summary: A library for efficient parsing of file metadata
|