format_parser 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -1
- data/README.md +15 -2
- data/lib/archive.rb +3 -5
- data/lib/format_parser.rb +4 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/io_utils.rb +3 -0
- data/lib/parsers/flac_parser.rb +75 -0
- data/lib/parsers/moov_parser.rb +17 -8
- data/lib/parsers/moov_parser/decoder.rb +10 -1
- data/lib/parsers/mp3_parser.rb +9 -0
- data/lib/parsers/zip_parser.rb +11 -1
- data/lib/parsers/zip_parser/file_reader.rb +24 -29
- data/lib/parsers/zip_parser/office_formats.rb +51 -0
- data/spec/esoteric_formats_spec.rb +10 -0
- data/spec/parsers/flac_parser_spec.rb +63 -0
- data/spec/parsers/moov_parser_spec.rb +39 -4
- data/spec/parsers/zip_parser_spec.rb +24 -0
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7ff294de8e6759d2705cabe93f548ffb6733a121410e1b3c9dc929de52356745
|
4
|
+
data.tar.gz: 663240675efd9e8e5425f27333098e8661ef5e8dbbbd38bab91fff1e605bbefc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed7ea153adf28d2efb9352a880d9f4a146ebdad53e282f75d36c2dee2887740d727c25373d33c83dc8ab863e680e12175bb19109cbecf2347950a6fc0233c385
|
7
|
+
data.tar.gz: cf6d750264fdfe5a9520c3bb9a8a02b856ba825b709435661b3fcd614f118fc3709d3ba88a25bff0a142f1d03d4fc2389f63740606ff5df4e2db837977a66488
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -12,7 +12,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
|
|
12
12
|
|
13
13
|
## Currently supported filetypes:
|
14
14
|
|
15
|
-
`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, DPX, AIFF, WAV, FDX, MOV, MP4`
|
15
|
+
`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, PDF, DPX, AIFF, WAV, FDX, MOV, MP4, M4A, FLAC, DOCX, PPTX, XLSX`
|
16
16
|
|
17
17
|
...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
|
18
18
|
|
@@ -101,7 +101,7 @@ class MyParser
|
|
101
101
|
def call(io)
|
102
102
|
# ... do some parsing with `io`
|
103
103
|
magic_bytes = io.read(4)
|
104
|
-
return unless magic_bytes
|
104
|
+
return unless magic_bytes == 'XBMP'
|
105
105
|
# ... more parsing code
|
106
106
|
# ...and return the FileInformation::Image object with the metadata.
|
107
107
|
FormatParser::Image.new(
|
@@ -171,3 +171,16 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
171
171
|
|
172
172
|
### CR2
|
173
173
|
- CR2 examples are downloaded from http://www.rawsamples.ch/ and are Creative Commons licensed.
|
174
|
+
|
175
|
+
### FLAC
|
176
|
+
- atc_fixture_vbr.flac is a converted version of the MP3 with the same name
|
177
|
+
- c_11k16bitpcm.flac is a converted version of the WAV with the same name
|
178
|
+
|
179
|
+
### M4A
|
180
|
+
- fixture.m4a was created by one of the project maintainers and is MIT licensed
|
181
|
+
|
182
|
+
### ZIP
|
183
|
+
- The .zip fixture files have been created by the project maintainers
|
184
|
+
|
185
|
+
### .docx
|
186
|
+
- The .docx files were generated by the project maintainers
|
data/lib/archive.rb
CHANGED
@@ -10,7 +10,9 @@ module FormatParser
|
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
13
|
-
|
13
|
+
# Lots of Office and LibreOffice documents are in fact packaged into
|
14
|
+
# ZIPs, as are .epub files. We make `nature` customisable for this occasion
|
15
|
+
attr_accessor :nature
|
14
16
|
|
15
17
|
# What filetype was recognized? Will contain a non-ambiguous symbol
|
16
18
|
# referring to the file format. The symbol can be used as a filename
|
@@ -28,9 +30,5 @@ module FormatParser
|
|
28
30
|
def initialize(**attributes)
|
29
31
|
attributes.map { |(k, v)| public_send("#{k}=", v) }
|
30
32
|
end
|
31
|
-
|
32
|
-
def nature
|
33
|
-
NATURE
|
34
|
-
end
|
35
33
|
end
|
36
34
|
end
|
data/lib/format_parser.rb
CHANGED
@@ -93,6 +93,10 @@ module FormatParser
|
|
93
93
|
rescue IOUtils::InvalidRead
|
94
94
|
# There was not enough data for this parser to work on,
|
95
95
|
# and it triggered an error
|
96
|
+
rescue IOUtils::MalformedFile
|
97
|
+
# Unexpected input was encountered during the parsing of
|
98
|
+
# a file. This might indicate either a malicious or a
|
99
|
+
# corrupted file.
|
96
100
|
rescue ReadLimiter::BudgetExceeded
|
97
101
|
# The parser tried to read too much - most likely the file structure
|
98
102
|
# caused the parser to go off-track. Strictly speaking we should log this
|
data/lib/io_utils.rb
CHANGED
@@ -0,0 +1,75 @@
|
|
1
|
+
class FormatParser::FLACParser
|
2
|
+
include FormatParser::IOUtils
|
3
|
+
|
4
|
+
MAGIC_BYTES = 4
|
5
|
+
MAGIC_BYTE_STRING = 'fLaC'
|
6
|
+
BLOCK_HEADER_BYTES = 4
|
7
|
+
|
8
|
+
def bytestring_to_int(s)
|
9
|
+
s.unpack('B*')[0].to_i(2)
|
10
|
+
end
|
11
|
+
|
12
|
+
def call(io)
|
13
|
+
magic_bytes = safe_read(io, MAGIC_BYTES)
|
14
|
+
|
15
|
+
return unless magic_bytes == MAGIC_BYTE_STRING
|
16
|
+
|
17
|
+
# Skip info we don't need
|
18
|
+
safe_skip(io, BLOCK_HEADER_BYTES)
|
19
|
+
|
20
|
+
minimum_block_size = bytestring_to_int(safe_read(io, 2))
|
21
|
+
|
22
|
+
if minimum_block_size < 16
|
23
|
+
raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
|
24
|
+
end
|
25
|
+
|
26
|
+
maximum_block_size = bytestring_to_int(safe_read(io, 2))
|
27
|
+
|
28
|
+
if maximum_block_size < minimum_block_size
|
29
|
+
raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
|
30
|
+
end
|
31
|
+
|
32
|
+
minimum_frame_size = bytestring_to_int(safe_read(io, 3))
|
33
|
+
maximum_frame_size = bytestring_to_int(safe_read(io, 3))
|
34
|
+
|
35
|
+
# Audio info comes in irregularly sized (i.e. not 8-bit) chunks,
|
36
|
+
# so read total as bitstring and parse separately
|
37
|
+
audio_info = safe_read(io, 8).unpack('B*')[0]
|
38
|
+
|
39
|
+
# sample rate is 20 bits
|
40
|
+
sample_rate = audio_info.slice!(0..19).to_i(2)
|
41
|
+
|
42
|
+
raise MalformedFile, 'FLAC file sample rate must be larger than 0' unless sample_rate > 0
|
43
|
+
|
44
|
+
# Number of channels is 3 bits
|
45
|
+
# Header contains number of channels minus one, so add one
|
46
|
+
num_channels = audio_info.slice!(0..2).to_i(2) + 1
|
47
|
+
|
48
|
+
# Bits per sample is 5 bits
|
49
|
+
# Header contains number of bits per sample minus one, so add one
|
50
|
+
bits_per_sample = audio_info.slice!(0..4).to_i(2) + 1
|
51
|
+
|
52
|
+
# Total samples is 36 bits
|
53
|
+
total_samples = audio_info.slice!(0..35).to_i(2)
|
54
|
+
|
55
|
+
# Division is safe due to check above
|
56
|
+
duration = total_samples.to_f / sample_rate
|
57
|
+
|
58
|
+
FormatParser::Audio.new(
|
59
|
+
format: :flac,
|
60
|
+
num_audio_channels: num_channels,
|
61
|
+
audio_sample_rate_hz: sample_rate,
|
62
|
+
media_duration_seconds: duration,
|
63
|
+
media_duration_frames: total_samples,
|
64
|
+
intrinsics: {
|
65
|
+
bits_per_sample: bits_per_sample,
|
66
|
+
minimum_frame_size: minimum_frame_size,
|
67
|
+
maximum_frame_size: maximum_frame_size,
|
68
|
+
minimum_block_size: minimum_block_size,
|
69
|
+
maximum_block_size: maximum_block_size
|
70
|
+
}
|
71
|
+
)
|
72
|
+
end
|
73
|
+
|
74
|
+
FormatParser.register_parser self, natures: :audio, formats: :flac
|
75
|
+
end
|
data/lib/parsers/moov_parser.rb
CHANGED
@@ -52,19 +52,28 @@ class FormatParser::MOOVParser
|
|
52
52
|
media_duration_s = duration / timescale.to_f
|
53
53
|
end
|
54
54
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
55
|
+
# M4A only contains audio, while MP4 and friends can contain video.
|
56
|
+
if format_from_moov_type(file_type) == :m4a
|
57
|
+
FormatParser::Audio.new(
|
58
|
+
format: format_from_moov_type(file_type),
|
59
|
+
media_duration_seconds: media_duration_s,
|
60
|
+
intrinsics: atom_tree,
|
61
|
+
)
|
62
|
+
else
|
63
|
+
FormatParser::Video.new(
|
64
|
+
format: format_from_moov_type(file_type),
|
65
|
+
width_px: width,
|
66
|
+
height_px: height,
|
67
|
+
media_duration_seconds: media_duration_s,
|
68
|
+
intrinsics: atom_tree,
|
69
|
+
)
|
70
|
+
end
|
62
71
|
end
|
63
72
|
|
64
73
|
private
|
65
74
|
|
66
75
|
def format_from_moov_type(file_type)
|
67
|
-
FTYP_MAP.fetch(file_type, :mov)
|
76
|
+
FTYP_MAP.fetch(file_type.downcase, :mov)
|
68
77
|
end
|
69
78
|
|
70
79
|
# An MPEG4/MOV/M4A will start with the "ftyp" atom. The atom must have a length
|
@@ -181,6 +181,10 @@ class FormatParser::MOOVParser::Decoder
|
|
181
181
|
}
|
182
182
|
end
|
183
183
|
|
184
|
+
def parse_meta_atom(io, atom_size)
|
185
|
+
parse_hdlr_atom(io, atom_size)
|
186
|
+
end
|
187
|
+
|
184
188
|
def parse_atom_fields_per_type(io, atom_size, atom_type)
|
185
189
|
if respond_to?("parse_#{atom_type}_atom", true)
|
186
190
|
send("parse_#{atom_type}_atom", io, atom_size)
|
@@ -189,6 +193,11 @@ class FormatParser::MOOVParser::Decoder
|
|
189
193
|
end
|
190
194
|
end
|
191
195
|
|
196
|
+
def parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
|
197
|
+
parse_atom_fields_per_type(io, atom_size_sans_header, atom_type)
|
198
|
+
extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type])
|
199
|
+
end
|
200
|
+
|
192
201
|
# Recursive descent parser - will drill down to atoms which
|
193
202
|
# we know are permitted to have leaf/branch atoms within itself,
|
194
203
|
# and will attempt to recover the data fields for leaf atoms
|
@@ -215,7 +224,7 @@ class FormatParser::MOOVParser::Decoder
|
|
215
224
|
atom_size_sans_header = atom_size - size_of_atom_type_and_size
|
216
225
|
|
217
226
|
children, fields = if KNOWN_BRANCH_AND_LEAF_ATOM_TYPES.include?(atom_type)
|
218
|
-
parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type)
|
227
|
+
parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
|
219
228
|
elsif KNOWN_BRANCH_ATOM_TYPES.include?(atom_type)
|
220
229
|
[extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type]), nil]
|
221
230
|
else # Assume leaf atom
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -23,7 +23,16 @@ class FormatParser::MP3Parser
|
|
23
23
|
# Default frame size for mp3
|
24
24
|
SAMPLES_PER_FRAME = 1152
|
25
25
|
|
26
|
+
# For some edge cases
|
27
|
+
ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
|
28
|
+
|
26
29
|
def call(io)
|
30
|
+
# Special case: some ZIPs (Office documents) were being detected as MP3s.
|
31
|
+
# To avoid having that happen, we check for the PKZIP signature -
|
32
|
+
# local entry header signature - at the very start of the file
|
33
|
+
return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
|
34
|
+
io.seek(0)
|
35
|
+
|
27
36
|
# Read the last 128 bytes which might contain ID3v1
|
28
37
|
id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
|
29
38
|
# Read the header bytes that might contain ID3v2
|
data/lib/parsers/zip_parser.rb
CHANGED
@@ -1,17 +1,27 @@
|
|
1
1
|
class FormatParser::ZIPParser
|
2
2
|
require_relative 'zip_parser/file_reader'
|
3
|
+
require_relative 'zip_parser/office_formats'
|
4
|
+
|
5
|
+
include OfficeFormats
|
3
6
|
|
4
7
|
def call(io)
|
5
8
|
reader = FileReader.new
|
6
9
|
entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
|
7
10
|
|
11
|
+
filenames_set = Set.new
|
8
12
|
entries_archive = entries.map do |ze|
|
9
13
|
ft = directory?(ze) ? :directory : :file
|
10
14
|
decoded_filename = decode_filename(ze)
|
15
|
+
filenames_set << decoded_filename
|
11
16
|
FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
|
12
17
|
end
|
13
18
|
|
14
|
-
|
19
|
+
if office_document?(filenames_set)
|
20
|
+
office_format = office_file_format_from_entry_set(filenames_set)
|
21
|
+
FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
|
22
|
+
else
|
23
|
+
FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
|
24
|
+
end
|
15
25
|
rescue FileReader::Error
|
16
26
|
# This is not a ZIP, or a broken ZIP.
|
17
27
|
return
|
@@ -159,7 +159,6 @@ class FormatParser::ZIPParser::FileReader
|
|
159
159
|
def read_zip_structure(io:)
|
160
160
|
zip_file_size = io.size
|
161
161
|
eocd_offset = get_eocd_offset(io, zip_file_size)
|
162
|
-
|
163
162
|
zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
|
164
163
|
num_files, cdir_location, cdir_size =
|
165
164
|
if zip64_end_of_cdir_location
|
@@ -345,39 +344,35 @@ class FormatParser::ZIPParser::FileReader
|
|
345
344
|
eocd_offset
|
346
345
|
end
|
347
346
|
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
347
|
+
def all_indices_of_substr_in_str(of_substring, in_string)
|
348
|
+
last_i = 0
|
349
|
+
found_at_indices = []
|
350
|
+
while last_i = in_string.index(of_substring, last_i)
|
351
|
+
found_at_indices << last_i
|
352
|
+
last_i += of_substring.bytesize
|
353
|
+
end
|
354
|
+
found_at_indices
|
355
|
+
end
|
356
|
+
|
355
357
|
def locate_eocd_signature(in_str)
|
356
|
-
|
357
|
-
# the EOCD record can have (up to and including the comment size), using
|
358
|
-
# a sliding window. Once our end offset matches the comment size we found our
|
359
|
-
# EOCD marker.
|
358
|
+
eocd_signature = [0x06054b50].pack('V')
|
360
359
|
unpack_pattern = 'VvvvvVVv'
|
361
360
|
minimum_record_size = 22
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
#
|
367
|
-
break
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
|
374
|
-
assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
|
375
|
-
# if the comment size is where we should be at - we found our EOCD
|
376
|
-
return assumed_eocd_location if assumed_eocd_location == window_location
|
361
|
+
str_size = in_str.bytesize
|
362
|
+
indices = all_indices_of_substr_in_str(eocd_signature, in_str)
|
363
|
+
indices.each do |check_at|
|
364
|
+
maybe_record = in_str[check_at..str_size]
|
365
|
+
# If the record is smaller than the minimum - we will never recover anything
|
366
|
+
break if maybe_record.bytesize < minimum_record_size
|
367
|
+
signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
|
368
|
+
|
369
|
+
# Check the only condition for the match
|
370
|
+
if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
|
371
|
+
return check_at # Found the EOCD marker location
|
377
372
|
end
|
378
|
-
|
379
|
-
end_location -= 1 # Shift the window back, by one byte, and try again.
|
380
373
|
end
|
374
|
+
# If we haven't caught anything, return nil deliberately instead of returning the last statement
|
375
|
+
nil
|
381
376
|
end
|
382
377
|
|
383
378
|
# Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Based on an unscientific sample of 63 documents I could find on my hard drive,
|
2
|
+
# all docx/pptx/xlsx files contain, at the minimum, the following files:
|
3
|
+
#
|
4
|
+
# [Content_Types].xml
|
5
|
+
# _rels/.rels
|
6
|
+
# docProps/core.xml
|
7
|
+
# docProps/app.xml
|
8
|
+
#
|
9
|
+
# Additionally, per file type, they contain the following:
|
10
|
+
#
|
11
|
+
# word/document.xml
|
12
|
+
# xl/workbook.xml
|
13
|
+
# ppt/presentation.xml
|
14
|
+
#
|
15
|
+
# These are sufficient to say with certainty that a ZIP is in fact an Office document.
|
16
|
+
# Also that unscientific sample revealed that I came to dislike MS Office so much as to
|
17
|
+
# only have 63 documents on my entire workstation.
|
18
|
+
#
|
19
|
+
# We do not perform the actual _decoding_ of the Office documents here, because to read
|
20
|
+
# their contents we need to:
|
21
|
+
#
|
22
|
+
# * inflate the compressed part files (potential for deflate bombs)
|
23
|
+
# * parse the document XML (potential for XML parser exploitation)
|
24
|
+
#
|
25
|
+
# which are real threats and require adequate mitigation. For our purposes the
|
26
|
+
# token detection of specific filenames should be enough to say with certainty
|
27
|
+
# that a document _is_ an Office document, and not just a ZIP.
|
28
|
+
module FormatParser::ZIPParser::OfficeFormats
|
29
|
+
OFFICE_MARKER_FILES = Set.new([
|
30
|
+
'[Content_Types].xml',
|
31
|
+
'_rels/.rels',
|
32
|
+
'docProps/core.xml',
|
33
|
+
'docProps/app.xml',
|
34
|
+
])
|
35
|
+
|
36
|
+
def office_document?(filenames_set)
|
37
|
+
OFFICE_MARKER_FILES.subset?(filenames_set)
|
38
|
+
end
|
39
|
+
|
40
|
+
def office_file_format_from_entry_set(filenames_set)
|
41
|
+
if filenames_set.include?('word/document.xml')
|
42
|
+
:docx
|
43
|
+
elsif filenames_set.include?('xl/workbook.xml')
|
44
|
+
:xlsx
|
45
|
+
elsif filenames_set.include?('ppt/presentation.xml')
|
46
|
+
:pptx
|
47
|
+
else
|
48
|
+
:unknown
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Parsing esoteric files and files causing ambiguous detection' do
|
4
|
+
it 'correctly parses the test .docx files as Office docs' do
|
5
|
+
docx_path = fixtures_dir + '/ZIP/10.docx'
|
6
|
+
result = FormatParser.parse(File.open(docx_path, 'rb'))
|
7
|
+
expect(result).not_to be_nil
|
8
|
+
expect(result.nature).to eq(:document)
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::FLACParser do
|
4
|
+
it 'decodes and estimates duration for the atc_fixture_vbr FLAC File' do
|
5
|
+
fpath = fixtures_dir + 'FLAC/atc_fixture_vbr.flac'
|
6
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
7
|
+
|
8
|
+
expect(parsed).not_to be_nil
|
9
|
+
|
10
|
+
expect(parsed.nature).to eq(:audio)
|
11
|
+
expect(parsed.format).to eq(:flac)
|
12
|
+
expect(parsed.num_audio_channels).to eq(2)
|
13
|
+
expect(parsed.audio_sample_rate_hz).to eq(44100)
|
14
|
+
expect(parsed.intrinsics).not_to be_nil
|
15
|
+
expect(parsed.media_duration_frames).to eq(33810)
|
16
|
+
expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'decodes and estimates duration for the 16bit FLAC File' do
|
20
|
+
fpath = fixtures_dir + 'FLAC/c_11k16bitpcm.flac'
|
21
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
22
|
+
|
23
|
+
expect(parsed).not_to be_nil
|
24
|
+
|
25
|
+
expect(parsed.nature).to eq(:audio)
|
26
|
+
expect(parsed.format).to eq(:flac)
|
27
|
+
expect(parsed.intrinsics[:bits_per_sample]).to eq(16)
|
28
|
+
expect(parsed.num_audio_channels).to eq(1)
|
29
|
+
expect(parsed.audio_sample_rate_hz).to eq(11025)
|
30
|
+
expect(parsed.media_duration_frames).to eq(152267)
|
31
|
+
expect(parsed.media_duration_seconds).to be_within(0.01).of(13.81)
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'raises error on parsing an invalid file' do
|
35
|
+
fpath = fixtures_dir + 'FLAC/invalid.flac'
|
36
|
+
|
37
|
+
expect {
|
38
|
+
subject.call(File.open(fpath, 'rb'))
|
39
|
+
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'raises error on parsing a file with an invalid block size' do
|
43
|
+
fpath = fixtures_dir + 'FLAC/invalid_minimum_block_size.flac'
|
44
|
+
|
45
|
+
expect {
|
46
|
+
subject.call(File.open(fpath, 'rb'))
|
47
|
+
}.to raise_error(FormatParser::IOUtils::MalformedFile)
|
48
|
+
|
49
|
+
fpath = fixtures_dir + 'FLAC/invalid_maximum_block_size.flac'
|
50
|
+
|
51
|
+
expect {
|
52
|
+
subject.call(File.open(fpath, 'rb'))
|
53
|
+
}.to raise_error(FormatParser::IOUtils::MalformedFile)
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'raises an error when sample rate is 0' do
|
57
|
+
fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
|
58
|
+
|
59
|
+
expect {
|
60
|
+
subject.call(File.open(fpath, 'rb'))
|
61
|
+
}.to raise_error(FormatParser::IOUtils::MalformedFile)
|
62
|
+
end
|
63
|
+
end
|
@@ -29,9 +29,37 @@ describe FormatParser::MOOVParser do
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
Dir.glob(fixtures_dir + '/MOOV
|
33
|
-
it "is able to parse #{File.basename(
|
34
|
-
result = subject.call(File.open(
|
32
|
+
Dir.glob(fixtures_dir + '/MOOV/**/*.m4a').sort.each do |m4a_path|
|
33
|
+
it "is able to parse #{File.basename(m4a_path)}" do
|
34
|
+
result = subject.call(File.open(m4a_path, 'rb'))
|
35
|
+
|
36
|
+
expect(result).not_to be_nil
|
37
|
+
expect(result.nature).to eq(:audio)
|
38
|
+
expect(result.media_duration_seconds).to be_kind_of(Float)
|
39
|
+
expect(result.media_duration_seconds).to be > 0
|
40
|
+
|
41
|
+
expect(result.intrinsics).not_to be_nil
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
Dir.glob(fixtures_dir + '/MOOV/**/*.mov').sort.each do |mov_path|
|
46
|
+
it "is able to parse #{File.basename(mov_path)}" do
|
47
|
+
result = subject.call(File.open(mov_path, 'rb'))
|
48
|
+
|
49
|
+
expect(result).not_to be_nil
|
50
|
+
expect(result.nature).to eq(:video)
|
51
|
+
expect(result.width_px).to be > 0
|
52
|
+
expect(result.height_px).to be > 0
|
53
|
+
expect(result.media_duration_seconds).to be_kind_of(Float)
|
54
|
+
expect(result.media_duration_seconds).to be > 0
|
55
|
+
|
56
|
+
expect(result.intrinsics).not_to be_nil
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
Dir.glob(fixtures_dir + '/MOOV/**/*.mp4').sort.each do |mp4_path|
|
61
|
+
it "is able to parse #{File.basename(mp4_path)}" do
|
62
|
+
result = subject.call(File.open(mp4_path, 'rb'))
|
35
63
|
|
36
64
|
expect(result).not_to be_nil
|
37
65
|
expect(result.nature).to eq(:video)
|
@@ -44,7 +72,14 @@ describe FormatParser::MOOVParser do
|
|
44
72
|
end
|
45
73
|
end
|
46
74
|
|
47
|
-
it 'parses an M4A file and provides the necessary metadata'
|
75
|
+
it 'parses an M4A file and provides the necessary metadata' do
|
76
|
+
m4a_path = fixtures_dir + '/MOOV/M4A/fixture.m4a'
|
77
|
+
|
78
|
+
result = subject.call(File.open(m4a_path, 'rb'))
|
79
|
+
expect(result).not_to be_nil
|
80
|
+
expect(result.nature).to eq(:audio)
|
81
|
+
expect(result.format).to eq(:m4a)
|
82
|
+
end
|
48
83
|
|
49
84
|
it 'parses a MOV file and provides the necessary metadata' do
|
50
85
|
mov_path = fixtures_dir + '/MOOV/MOV/Test_Circular_ProRes422.mov'
|
@@ -46,6 +46,30 @@ describe FormatParser::ZIPParser do
|
|
46
46
|
expect(dir_entry.type).to eq(:directory)
|
47
47
|
end
|
48
48
|
|
49
|
+
it 'correctly identifies Word documents' do
|
50
|
+
fixture_path = fixtures_dir + '/ZIP/10.docx'
|
51
|
+
fi_io = File.open(fixture_path, 'rb')
|
52
|
+
|
53
|
+
result = subject.call(fi_io)
|
54
|
+
expect(result.nature).to eq(:document)
|
55
|
+
expect(result.format).to eq(:docx)
|
56
|
+
|
57
|
+
fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
|
58
|
+
fi_io = File.open(fixture_path, 'rb')
|
59
|
+
|
60
|
+
result = subject.call(fi_io)
|
61
|
+
expect(result.nature).to eq(:document)
|
62
|
+
expect(result.format).to eq(:docx)
|
63
|
+
end
|
64
|
+
|
65
|
+
it 'is able to handle specific fuzzed input' do
|
66
|
+
r = Random.new(354)
|
67
|
+
1024.times do
|
68
|
+
random_blob = StringIO.new(r.bytes(512 * 1024))
|
69
|
+
subject.call(random_blob) # If there is an error in one of the parsers the example will raise too
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
49
73
|
it 'returns a result that has a usable JSON representation' do
|
50
74
|
fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
|
51
75
|
fi_io = File.open(fixture_path, 'rb')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-03
|
12
|
+
date: 2018-04-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|
@@ -174,6 +174,7 @@ files:
|
|
174
174
|
- lib/parsers/dpx_parser.rb
|
175
175
|
- lib/parsers/exif_parser.rb
|
176
176
|
- lib/parsers/fdx_parser.rb
|
177
|
+
- lib/parsers/flac_parser.rb
|
177
178
|
- lib/parsers/gif_parser.rb
|
178
179
|
- lib/parsers/jpeg_parser.rb
|
179
180
|
- lib/parsers/moov_parser.rb
|
@@ -188,11 +189,13 @@ files:
|
|
188
189
|
- lib/parsers/wav_parser.rb
|
189
190
|
- lib/parsers/zip_parser.rb
|
190
191
|
- lib/parsers/zip_parser/file_reader.rb
|
192
|
+
- lib/parsers/zip_parser/office_formats.rb
|
191
193
|
- lib/read_limiter.rb
|
192
194
|
- lib/remote_io.rb
|
193
195
|
- lib/video.rb
|
194
196
|
- spec/attributes_json_spec.rb
|
195
197
|
- spec/care_spec.rb
|
198
|
+
- spec/esoteric_formats_spec.rb
|
196
199
|
- spec/file_information_spec.rb
|
197
200
|
- spec/format_parser_spec.rb
|
198
201
|
- spec/io_utils_spec.rb
|
@@ -201,6 +204,7 @@ files:
|
|
201
204
|
- spec/parsers/dpx_parser_spec.rb
|
202
205
|
- spec/parsers/exif_parser_spec.rb
|
203
206
|
- spec/parsers/fdx_parser_spec.rb
|
207
|
+
- spec/parsers/flac_parser_spec.rb
|
204
208
|
- spec/parsers/gif_parser_spec.rb
|
205
209
|
- spec/parsers/jpeg_parser_spec.rb
|
206
210
|
- spec/parsers/moov_parser_spec.rb
|
@@ -236,7 +240,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
236
240
|
version: '0'
|
237
241
|
requirements: []
|
238
242
|
rubyforge_project:
|
239
|
-
rubygems_version: 2.
|
243
|
+
rubygems_version: 2.7.3
|
240
244
|
signing_key:
|
241
245
|
specification_version: 4
|
242
246
|
summary: A library for efficient parsing of file metadata
|