format_parser 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
4
- data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
2
+ SHA256:
3
+ metadata.gz: 7ff294de8e6759d2705cabe93f548ffb6733a121410e1b3c9dc929de52356745
4
+ data.tar.gz: 663240675efd9e8e5425f27333098e8661ef5e8dbbbd38bab91fff1e605bbefc
5
5
  SHA512:
6
- metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
7
- data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
6
+ metadata.gz: ed7ea153adf28d2efb9352a880d9f4a146ebdad53e282f75d36c2dee2887740d727c25373d33c83dc8ab863e680e12175bb19109cbecf2347950a6fc0233c385
7
+ data.tar.gz: cf6d750264fdfe5a9520c3bb9a8a02b856ba825b709435661b3fcd614f118fc3709d3ba88a25bff0a142f1d03d4fc2389f63740606ff5df4e2db837977a66488
data/.gitignore CHANGED
@@ -10,4 +10,4 @@
10
10
  *.gem
11
11
 
12
12
  # rspec failure tracking
13
- .rspec_status
13
+ .rspec_status
data/README.md CHANGED
@@ -12,7 +12,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
12
12
 
13
13
  ## Currently supported filetypes:
14
14
 
15
- `TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, DPX, AIFF, WAV, FDX, MOV, MP4`
15
+ `TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, PDF, DPX, AIFF, WAV, FDX, MOV, MP4, M4A, FLAC, DOCX, PPTX, XLSX`
16
16
 
17
17
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
18
18
 
@@ -101,7 +101,7 @@ class MyParser
101
101
  def call(io)
102
102
  # ... do some parsing with `io`
103
103
  magic_bytes = io.read(4)
104
- return unless magic_bytes != 'XBMP'
104
+ return unless magic_bytes == 'XBMP'
105
105
  # ... more parsing code
106
106
  # ...and return the FileInformation::Image object with the metadata.
107
107
  FormatParser::Image.new(
@@ -171,3 +171,16 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
171
171
 
172
172
  ### CR2
173
173
  - CR2 examples are downloaded from http://www.rawsamples.ch/ and are Creative Common Licensed.
174
+
175
+ ### FLAC
176
+ - atc_fixture_vbr.flac is a converted version of the MP3 with the same name
177
+ - c_11k16bitpcm.flac is a converted version of the WAV with the same name
178
+
179
+ ### M4A
180
+ - fixture.m4a was created by one of the project maintainers and is MIT licensed
181
+
182
+ ### ZIP
183
+ - The .zip fixture files have been created by the project maintainers
184
+
185
+ ### .docx
186
+ - The .docx files were generated by the project maintainers
data/lib/archive.rb CHANGED
@@ -10,7 +10,9 @@ module FormatParser
10
10
  end
11
11
  end
12
12
 
13
- NATURE = :archive
13
+ # Lots of Office and LibreOffice documents are in fact packaged into
14
+ # ZIPs, as are .epub files. We make `nature` customisable for this occasion
15
+ attr_accessor :nature
14
16
 
15
17
  # What filetype was recognized? Will contain a non-ambiguous symbol
16
18
  # referring to the file format. The symbol can be used as a filename
@@ -28,9 +30,5 @@ module FormatParser
28
30
  def initialize(**attributes)
29
31
  attributes.map { |(k, v)| public_send("#{k}=", v) }
30
32
  end
31
-
32
- def nature
33
- NATURE
34
- end
35
33
  end
36
34
  end
data/lib/format_parser.rb CHANGED
@@ -93,6 +93,10 @@ module FormatParser
93
93
  rescue IOUtils::InvalidRead
94
94
  # There was not enough data for this parser to work on,
95
95
  # and it triggered an error
96
+ rescue IOUtils::MalformedFile
97
+ # Unexpected input was encountered during the parsing of
98
+ # a file. This might indicate either a malicious or a
99
+ # corrupted file.
96
100
  rescue ReadLimiter::BudgetExceeded
97
101
  # The parser tried to read too much - most likely the file structure
98
102
  # caused the parser to go off-track. Strictly speaking we should log this
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.4.0'
2
+ VERSION = '0.5.0'
3
3
  end
data/lib/io_utils.rb CHANGED
@@ -2,6 +2,9 @@ module FormatParser::IOUtils
2
2
  class InvalidRead < ArgumentError
3
3
  end
4
4
 
5
+ class MalformedFile < ArgumentError
6
+ end
7
+
5
8
  def safe_read(io, n)
6
9
  raise ArgumentError, 'Unbounded reads are not supported' if n.nil?
7
10
  buf = io.read(n)
@@ -0,0 +1,75 @@
1
+ class FormatParser::FLACParser
2
+ include FormatParser::IOUtils
3
+
4
+ MAGIC_BYTES = 4
5
+ MAGIC_BYTE_STRING = 'fLaC'
6
+ BLOCK_HEADER_BYTES = 4
7
+
8
+ def bytestring_to_int(s)
9
+ s.unpack('B*')[0].to_i(2)
10
+ end
11
+
12
+ def call(io)
13
+ magic_bytes = safe_read(io, MAGIC_BYTES)
14
+
15
+ return unless magic_bytes == MAGIC_BYTE_STRING
16
+
17
+ # Skip info we don't need
18
+ safe_skip(io, BLOCK_HEADER_BYTES)
19
+
20
+ minimum_block_size = bytestring_to_int(safe_read(io, 2))
21
+
22
+ if minimum_block_size < 16
23
+ raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
24
+ end
25
+
26
+ maximum_block_size = bytestring_to_int(safe_read(io, 2))
27
+
28
+ if maximum_block_size < minimum_block_size
29
+ raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
30
+ end
31
+
32
+ minimum_frame_size = bytestring_to_int(safe_read(io, 3))
33
+ maximum_frame_size = bytestring_to_int(safe_read(io, 3))
34
+
35
+ # Audio info comes in irregularly sized (i.e. not 8-bit) chunks,
36
+ # so read total as bitstring and parse separately
37
+ audio_info = safe_read(io, 8).unpack('B*')[0]
38
+
39
+ # sample rate is 20 bits
40
+ sample_rate = audio_info.slice!(0..19).to_i(2)
41
+
42
+ raise MalformedFile, 'FLAC file sample rate must be larger than 0' unless sample_rate > 0
43
+
44
+ # Number of channels is 3 bits
45
+ # Header contains number of channels minus one, so add one
46
+ num_channels = audio_info.slice!(0..2).to_i(2) + 1
47
+
48
+ # Bits per sample is 5 bits
49
+ # Header contains number of bits per sample minus one, so add one
50
+ bits_per_sample = audio_info.slice!(0..4).to_i(2) + 1
51
+
52
+ # Total samples is 36 bits
53
+ total_samples = audio_info.slice!(0..35).to_i(2)
54
+
55
+ # Division is safe due to check above
56
+ duration = total_samples.to_f / sample_rate
57
+
58
+ FormatParser::Audio.new(
59
+ format: :flac,
60
+ num_audio_channels: num_channels,
61
+ audio_sample_rate_hz: sample_rate,
62
+ media_duration_seconds: duration,
63
+ media_duration_frames: total_samples,
64
+ intrinsics: {
65
+ bits_per_sample: bits_per_sample,
66
+ minimum_frame_size: minimum_frame_size,
67
+ maximum_frame_size: maximum_frame_size,
68
+ minimum_block_size: minimum_block_size,
69
+ maximum_block_size: maximum_block_size
70
+ }
71
+ )
72
+ end
73
+
74
+ FormatParser.register_parser self, natures: :audio, formats: :flac
75
+ end
@@ -52,19 +52,28 @@ class FormatParser::MOOVParser
52
52
  media_duration_s = duration / timescale.to_f
53
53
  end
54
54
 
55
- FormatParser::Video.new(
56
- format: format_from_moov_type(file_type),
57
- width_px: width,
58
- height_px: height,
59
- media_duration_seconds: media_duration_s,
60
- intrinsics: atom_tree,
61
- )
55
+ # M4A only contains audio, while MP4 and friends can contain video.
56
+ if format_from_moov_type(file_type) == :m4a
57
+ FormatParser::Audio.new(
58
+ format: format_from_moov_type(file_type),
59
+ media_duration_seconds: media_duration_s,
60
+ intrinsics: atom_tree,
61
+ )
62
+ else
63
+ FormatParser::Video.new(
64
+ format: format_from_moov_type(file_type),
65
+ width_px: width,
66
+ height_px: height,
67
+ media_duration_seconds: media_duration_s,
68
+ intrinsics: atom_tree,
69
+ )
70
+ end
62
71
  end
63
72
 
64
73
  private
65
74
 
66
75
  def format_from_moov_type(file_type)
67
- FTYP_MAP.fetch(file_type, :mov)
76
+ FTYP_MAP.fetch(file_type.downcase, :mov)
68
77
  end
69
78
 
70
79
  # An MPEG4/MOV/M4A will start with the "ftyp" atom. The atom must have a length
@@ -181,6 +181,10 @@ class FormatParser::MOOVParser::Decoder
181
181
  }
182
182
  end
183
183
 
184
+ def parse_meta_atom(io, atom_size)
185
+ parse_hdlr_atom(io, atom_size)
186
+ end
187
+
184
188
  def parse_atom_fields_per_type(io, atom_size, atom_type)
185
189
  if respond_to?("parse_#{atom_type}_atom", true)
186
190
  send("parse_#{atom_type}_atom", io, atom_size)
@@ -189,6 +193,11 @@ class FormatParser::MOOVParser::Decoder
189
193
  end
190
194
  end
191
195
 
196
+ def parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
197
+ parse_atom_fields_per_type(io, atom_size_sans_header, atom_type)
198
+ extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type])
199
+ end
200
+
192
201
  # Recursive descent parser - will drill down to atoms which
193
202
  # we know are permitted to have leaf/branch atoms within itself,
194
203
  # and will attempt to recover the data fields for leaf atoms
@@ -215,7 +224,7 @@ class FormatParser::MOOVParser::Decoder
215
224
  atom_size_sans_header = atom_size - size_of_atom_type_and_size
216
225
 
217
226
  children, fields = if KNOWN_BRANCH_AND_LEAF_ATOM_TYPES.include?(atom_type)
218
- parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type)
227
+ parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
219
228
  elsif KNOWN_BRANCH_ATOM_TYPES.include?(atom_type)
220
229
  [extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type]), nil]
221
230
  else # Assume leaf atom
@@ -23,7 +23,16 @@ class FormatParser::MP3Parser
23
23
  # Default frame size for mp3
24
24
  SAMPLES_PER_FRAME = 1152
25
25
 
26
+ # For some edge cases
27
+ ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
28
+
26
29
  def call(io)
30
+ # Special case: some ZIPs (Office documents) were detected as MP3s.
31
+ # To avoid having that happen, we check for the PKZIP signature -
32
+ # local entry header signature - at the very start of the file
33
+ return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
34
+ io.seek(0)
35
+
27
36
  # Read the last 128 bytes which might contain ID3v1
28
37
  id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
29
38
  # Read the header bytes that might contain ID3v1
@@ -1,17 +1,27 @@
1
1
  class FormatParser::ZIPParser
2
2
  require_relative 'zip_parser/file_reader'
3
+ require_relative 'zip_parser/office_formats'
4
+
5
+ include OfficeFormats
3
6
 
4
7
  def call(io)
5
8
  reader = FileReader.new
6
9
  entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
7
10
 
11
+ filenames_set = Set.new
8
12
  entries_archive = entries.map do |ze|
9
13
  ft = directory?(ze) ? :directory : :file
10
14
  decoded_filename = decode_filename(ze)
15
+ filenames_set << decoded_filename
11
16
  FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
12
17
  end
13
18
 
14
- FormatParser::Archive.new(format: :zip, entries: entries_archive)
19
+ if office_document?(filenames_set)
20
+ office_format = office_file_format_from_entry_set(filenames_set)
21
+ FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
22
+ else
23
+ FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
24
+ end
15
25
  rescue FileReader::Error
16
26
  # This is not a ZIP, or a broken ZIP.
17
27
  return
@@ -159,7 +159,6 @@ class FormatParser::ZIPParser::FileReader
159
159
  def read_zip_structure(io:)
160
160
  zip_file_size = io.size
161
161
  eocd_offset = get_eocd_offset(io, zip_file_size)
162
-
163
162
  zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
164
163
  num_files, cdir_location, cdir_size =
165
164
  if zip64_end_of_cdir_location
@@ -345,39 +344,35 @@ class FormatParser::ZIPParser::FileReader
345
344
  eocd_offset
346
345
  end
347
346
 
348
- # This is tricky. Essentially, we have to scan the maximum possible number
349
- # of bytes (that the EOCD can theoretically occupy including the comment),
350
- # and we have to find a combination of:
351
- # [EOCD signature, <some ZIP medatata>, comment byte size, the comment of
352
- # that size, eof].
353
- # The only way I could find to do this was with a sliding window, but
354
- # there probably is a better way.
347
+ def all_indices_of_substr_in_str(of_substring, in_string)
348
+ last_i = 0
349
+ found_at_indices = []
350
+ while last_i = in_string.index(of_substring, last_i)
351
+ found_at_indices << last_i
352
+ last_i += of_substring.bytesize
353
+ end
354
+ found_at_indices
355
+ end
356
+
355
357
  def locate_eocd_signature(in_str)
356
- # We have to scan from the _very_ tail. We read the very minimum size
357
- # the EOCD record can have (up to and including the comment size), using
358
- # a sliding window. Once our end offset matches the comment size we found our
359
- # EOCD marker.
358
+ eocd_signature = [0x06054b50].pack('V')
360
359
  unpack_pattern = 'VvvvvVVv'
361
360
  minimum_record_size = 22
362
- end_location = minimum_record_size * -1
363
- loop do
364
- # If the window is nil, we have rolled off the start of the string, nothing to do here.
365
- # We use negative values because if we used positive slice indices
366
- # we would have to detect the rollover ourselves
367
- break unless window = in_str[end_location, minimum_record_size]
368
-
369
- window_location = in_str.bytesize + end_location
370
- unpacked = window.unpack(unpack_pattern)
371
- # If we found the signarue, pick up the comment size, and check if the size of the window
372
- # plus that comment size is where we are in the string. If we are - bingo.
373
- if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
374
- assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
375
- # if the comment size is where we should be at - we found our EOCD
376
- return assumed_eocd_location if assumed_eocd_location == window_location
361
+ str_size = in_str.bytesize
362
+ indices = all_indices_of_substr_in_str(eocd_signature, in_str)
363
+ indices.each do |check_at|
364
+ maybe_record = in_str[check_at..str_size]
365
+ # If the record is smaller than the minimum - we will never recover anything
366
+ break if maybe_record.bytesize < minimum_record_size
367
+ signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
368
+
369
+ # Check the only condition for the match
370
+ if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
371
+ return check_at # Found the EOCD marker location
377
372
  end
378
-
379
- end_location -= 1 # Shift the window back, by one byte, and try again.
380
373
  end
374
+ # If we haven't caught anything, return nil deliberately instead of returning the last statement
375
+ nil
381
376
  end
382
377
 
383
378
  # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
@@ -0,0 +1,51 @@
1
+ # Based on an unscientific sample of 63 documents I could find on my hard drive,
2
+ # all docx/pptx/xlsx files contain, at the minimum, the following files:
3
+ #
4
+ # [Content_Types].xml
5
+ # _rels/.rels
6
+ # docProps/core.xml
7
+ # docProps/app.xml
8
+ #
9
+ # Additionally, per file type, they contain the following:
10
+ #
11
+ # word/document.xml
12
+ # xl/workbook.xml
13
+ # ppt/presentation.xml
14
+ #
15
+ # These are sufficient to say with certainty that a ZIP is in fact an Office document.
16
+ # Also that unscientific sample revealed that I came to dislike MS Office so much as to
17
+ # only have 63 documents on my entire workstation.
18
+ #
19
+ # We do not perform the actual _decoding_ of the Office documents here, because to read
20
+ # their contents we need to:
21
+ #
22
+ # * inflate the compressed part files (potential for deflate bombs)
23
+ # * parse the document XML (potential for XML parser exploitation)
24
+ #
25
+ # which are real threats and require adequate mitigation. For our purposes the
26
+ # token detection of specific filenames should be enough to say with certainty
27
+ # that a document _is_ an Office document, and not just a ZIP.
28
+ module FormatParser::ZIPParser::OfficeFormats
29
+ OFFICE_MARKER_FILES = Set.new([
30
+ '[Content_Types].xml',
31
+ '_rels/.rels',
32
+ 'docProps/core.xml',
33
+ 'docProps/app.xml',
34
+ ])
35
+
36
+ def office_document?(filenames_set)
37
+ OFFICE_MARKER_FILES.subset?(filenames_set)
38
+ end
39
+
40
+ def office_file_format_from_entry_set(filenames_set)
41
+ if filenames_set.include?('word/document.xml')
42
+ :docx
43
+ elsif filenames_set.include?('xl/workbook.xml')
44
+ :xlsx
45
+ elsif filenames_set.include?('ppt/presentation.xml')
46
+ :pptx
47
+ else
48
+ :unknown
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,10 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Parsing esoteric files and files causing ambiguous detection' do
4
+ it 'correctly parses the test .docx files as Office docs' do
5
+ docx_path = fixtures_dir + '/ZIP/10.docx'
6
+ result = FormatParser.parse(File.open(docx_path, 'rb'))
7
+ expect(result).not_to be_nil
8
+ expect(result.nature).to eq(:document)
9
+ end
10
+ end
@@ -0,0 +1,63 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::FLACParser do
4
+ it 'decodes and estimates duration for the atc_fixture_vbr FLAC File' do
5
+ fpath = fixtures_dir + 'FLAC/atc_fixture_vbr.flac'
6
+ parsed = subject.call(File.open(fpath, 'rb'))
7
+
8
+ expect(parsed).not_to be_nil
9
+
10
+ expect(parsed.nature).to eq(:audio)
11
+ expect(parsed.format).to eq(:flac)
12
+ expect(parsed.num_audio_channels).to eq(2)
13
+ expect(parsed.audio_sample_rate_hz).to eq(44100)
14
+ expect(parsed.intrinsics).not_to be_nil
15
+ expect(parsed.media_duration_frames).to eq(33810)
16
+ expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
17
+ end
18
+
19
+ it 'decodes and estimates duration for the 16bit FLAC File' do
20
+ fpath = fixtures_dir + 'FLAC/c_11k16bitpcm.flac'
21
+ parsed = subject.call(File.open(fpath, 'rb'))
22
+
23
+ expect(parsed).not_to be_nil
24
+
25
+ expect(parsed.nature).to eq(:audio)
26
+ expect(parsed.format).to eq(:flac)
27
+ expect(parsed.intrinsics[:bits_per_sample]).to eq(16)
28
+ expect(parsed.num_audio_channels).to eq(1)
29
+ expect(parsed.audio_sample_rate_hz).to eq(11025)
30
+ expect(parsed.media_duration_frames).to eq(152267)
31
+ expect(parsed.media_duration_seconds).to be_within(0.01).of(13.81)
32
+ end
33
+
34
+ it 'raises error on parsing an invalid file' do
35
+ fpath = fixtures_dir + 'FLAC/invalid.flac'
36
+
37
+ expect {
38
+ subject.call(File.open(fpath, 'rb'))
39
+ }.to raise_error(FormatParser::IOUtils::InvalidRead)
40
+ end
41
+
42
+ it 'raises error on parsing a file with an invalid block size' do
43
+ fpath = fixtures_dir + 'FLAC/invalid_minimum_block_size.flac'
44
+
45
+ expect {
46
+ subject.call(File.open(fpath, 'rb'))
47
+ }.to raise_error(FormatParser::IOUtils::MalformedFile)
48
+
49
+ fpath = fixtures_dir + 'FLAC/invalid_maximum_block_size.flac'
50
+
51
+ expect {
52
+ subject.call(File.open(fpath, 'rb'))
53
+ }.to raise_error(FormatParser::IOUtils::MalformedFile)
54
+ end
55
+
56
+ it 'raises an error when sample rate is 0' do
57
+ fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
58
+
59
+ expect {
60
+ subject.call(File.open(fpath, 'rb'))
61
+ }.to raise_error(FormatParser::IOUtils::MalformedFile)
62
+ end
63
+ end
@@ -29,9 +29,37 @@ describe FormatParser::MOOVParser do
29
29
  end
30
30
  end
31
31
 
32
- Dir.glob(fixtures_dir + '/MOOV/**/*.*').sort.each do |moov_path|
33
- it "is able to parse #{File.basename(moov_path)}" do
34
- result = subject.call(File.open(moov_path, 'rb'))
32
+ Dir.glob(fixtures_dir + '/MOOV/**/*.m4a').sort.each do |m4a_path|
33
+ it "is able to parse #{File.basename(m4a_path)}" do
34
+ result = subject.call(File.open(m4a_path, 'rb'))
35
+
36
+ expect(result).not_to be_nil
37
+ expect(result.nature).to eq(:audio)
38
+ expect(result.media_duration_seconds).to be_kind_of(Float)
39
+ expect(result.media_duration_seconds).to be > 0
40
+
41
+ expect(result.intrinsics).not_to be_nil
42
+ end
43
+ end
44
+
45
+ Dir.glob(fixtures_dir + '/MOOV/**/*.mov').sort.each do |mov_path|
46
+ it "is able to parse #{File.basename(mov_path)}" do
47
+ result = subject.call(File.open(mov_path, 'rb'))
48
+
49
+ expect(result).not_to be_nil
50
+ expect(result.nature).to eq(:video)
51
+ expect(result.width_px).to be > 0
52
+ expect(result.height_px).to be > 0
53
+ expect(result.media_duration_seconds).to be_kind_of(Float)
54
+ expect(result.media_duration_seconds).to be > 0
55
+
56
+ expect(result.intrinsics).not_to be_nil
57
+ end
58
+ end
59
+
60
+ Dir.glob(fixtures_dir + '/MOOV/**/*.mp4').sort.each do |mp4_path|
61
+ it "is able to parse #{File.basename(mp4_path)}" do
62
+ result = subject.call(File.open(mp4_path, 'rb'))
35
63
 
36
64
  expect(result).not_to be_nil
37
65
  expect(result.nature).to eq(:video)
@@ -44,7 +72,14 @@ describe FormatParser::MOOVParser do
44
72
  end
45
73
  end
46
74
 
47
- it 'parses an M4A file and provides the necessary metadata'
75
+ it 'parses an M4A file and provides the necessary metadata' do
76
+ m4a_path = fixtures_dir + '/MOOV/M4A/fixture.m4a'
77
+
78
+ result = subject.call(File.open(m4a_path, 'rb'))
79
+ expect(result).not_to be_nil
80
+ expect(result.nature).to eq(:audio)
81
+ expect(result.format).to eq(:m4a)
82
+ end
48
83
 
49
84
  it 'parses a MOV file and provides the necessary metadata' do
50
85
  mov_path = fixtures_dir + '/MOOV/MOV/Test_Circular_ProRes422.mov'
@@ -46,6 +46,30 @@ describe FormatParser::ZIPParser do
46
46
  expect(dir_entry.type).to eq(:directory)
47
47
  end
48
48
 
49
+ it 'correctly identifies Word documents' do
50
+ fixture_path = fixtures_dir + '/ZIP/10.docx'
51
+ fi_io = File.open(fixture_path, 'rb')
52
+
53
+ result = subject.call(fi_io)
54
+ expect(result.nature).to eq(:document)
55
+ expect(result.format).to eq(:docx)
56
+
57
+ fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
58
+ fi_io = File.open(fixture_path, 'rb')
59
+
60
+ result = subject.call(fi_io)
61
+ expect(result.nature).to eq(:document)
62
+ expect(result.format).to eq(:docx)
63
+ end
64
+
65
+ it 'is able to handle specific fuzzed input' do
66
+ r = Random.new(354)
67
+ 1024.times do
68
+ random_blob = StringIO.new(r.bytes(512 * 1024))
69
+ subject.call(random_blob) # If there is an error in one of the parsers the example will raise too
70
+ end
71
+ end
72
+
49
73
  it 'returns a result that has a usable JSON representation' do
50
74
  fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
51
75
  fi_io = File.open(fixture_path, 'rb')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-03-30 00:00:00.000000000 Z
12
+ date: 2018-04-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -174,6 +174,7 @@ files:
174
174
  - lib/parsers/dpx_parser.rb
175
175
  - lib/parsers/exif_parser.rb
176
176
  - lib/parsers/fdx_parser.rb
177
+ - lib/parsers/flac_parser.rb
177
178
  - lib/parsers/gif_parser.rb
178
179
  - lib/parsers/jpeg_parser.rb
179
180
  - lib/parsers/moov_parser.rb
@@ -188,11 +189,13 @@ files:
188
189
  - lib/parsers/wav_parser.rb
189
190
  - lib/parsers/zip_parser.rb
190
191
  - lib/parsers/zip_parser/file_reader.rb
192
+ - lib/parsers/zip_parser/office_formats.rb
191
193
  - lib/read_limiter.rb
192
194
  - lib/remote_io.rb
193
195
  - lib/video.rb
194
196
  - spec/attributes_json_spec.rb
195
197
  - spec/care_spec.rb
198
+ - spec/esoteric_formats_spec.rb
196
199
  - spec/file_information_spec.rb
197
200
  - spec/format_parser_spec.rb
198
201
  - spec/io_utils_spec.rb
@@ -201,6 +204,7 @@ files:
201
204
  - spec/parsers/dpx_parser_spec.rb
202
205
  - spec/parsers/exif_parser_spec.rb
203
206
  - spec/parsers/fdx_parser_spec.rb
207
+ - spec/parsers/flac_parser_spec.rb
204
208
  - spec/parsers/gif_parser_spec.rb
205
209
  - spec/parsers/jpeg_parser_spec.rb
206
210
  - spec/parsers/moov_parser_spec.rb
@@ -236,7 +240,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
236
240
  version: '0'
237
241
  requirements: []
238
242
  rubyforge_project:
239
- rubygems_version: 2.5.2
243
+ rubygems_version: 2.7.3
240
244
  signing_key:
241
245
  specification_version: 4
242
246
  summary: A library for efficient parsing of file metadata