format_parser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
4
- data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
2
+ SHA256:
3
+ metadata.gz: 7ff294de8e6759d2705cabe93f548ffb6733a121410e1b3c9dc929de52356745
4
+ data.tar.gz: 663240675efd9e8e5425f27333098e8661ef5e8dbbbd38bab91fff1e605bbefc
5
5
  SHA512:
6
- metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
7
- data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
6
+ metadata.gz: ed7ea153adf28d2efb9352a880d9f4a146ebdad53e282f75d36c2dee2887740d727c25373d33c83dc8ab863e680e12175bb19109cbecf2347950a6fc0233c385
7
+ data.tar.gz: cf6d750264fdfe5a9520c3bb9a8a02b856ba825b709435661b3fcd614f118fc3709d3ba88a25bff0a142f1d03d4fc2389f63740606ff5df4e2db837977a66488
data/.gitignore CHANGED
@@ -10,4 +10,4 @@
10
10
  *.gem
11
11
 
12
12
  # rspec failure tracking
13
- .rspec_status
13
+ .rspec_status
data/README.md CHANGED
@@ -12,7 +12,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
12
12
 
13
13
  ## Currently supported filetypes:
14
14
 
15
- `TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, DPX, AIFF, WAV, FDX, MOV, MP4`
15
+ `TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, PDF, DPX, AIFF, WAV, FDX, MOV, MP4, M4A, FLAC, DOCX, PPTX, XLSX`
16
16
 
17
17
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
18
18
 
@@ -101,7 +101,7 @@ class MyParser
101
101
  def call(io)
102
102
  # ... do some parsing with `io`
103
103
  magic_bytes = io.read(4)
104
- return unless magic_bytes != 'XBMP'
104
+ return unless magic_bytes == 'XBMP'
105
105
  # ... more parsing code
106
106
  # ...and return the FileInformation::Image object with the metadata.
107
107
  FormatParser::Image.new(
@@ -171,3 +171,16 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
171
171
 
172
172
  ### CR2
173
173
  - CR2 examples are downloaded from http://www.rawsamples.ch/ and are Creative Commons licensed.
174
+
175
+ ### FLAC
176
+ - atc_fixture_vbr.flac is a converted version of the MP3 with the same name
177
+ - c_11k16bitpcm.flac is a converted version of the WAV with the same name
178
+
179
+ ### M4A
180
+ - fixture.m4a was created by one of the project maintainers and is MIT licensed
181
+
182
+ ### ZIP
183
+ - The .zip fixture files have been created by the project maintainers
184
+
185
+ ### .docx
186
+ - The .docx files were generated by the project maintainers
data/lib/archive.rb CHANGED
@@ -10,7 +10,9 @@ module FormatParser
10
10
  end
11
11
  end
12
12
 
13
- NATURE = :archive
13
+ # Lots of Office and LibreOffice documents are in fact packaged into
14
+ # ZIPs, as are .epub files. We make `nature` customisable for this occasion
15
+ attr_accessor :nature
14
16
 
15
17
  # What filetype was recognized? Will contain a non-ambiguous symbol
16
18
  # referring to the file format. The symbol can be used as a filename
@@ -28,9 +30,5 @@ module FormatParser
28
30
  def initialize(**attributes)
29
31
  attributes.map { |(k, v)| public_send("#{k}=", v) }
30
32
  end
31
-
32
- def nature
33
- NATURE
34
- end
35
33
  end
36
34
  end
data/lib/format_parser.rb CHANGED
@@ -93,6 +93,10 @@ module FormatParser
93
93
  rescue IOUtils::InvalidRead
94
94
  # There was not enough data for this parser to work on,
95
95
  # and it triggered an error
96
+ rescue IOUtils::MalformedFile
97
+ # Unexpected input was encountered during the parsing of
98
+ # a file. This might indicate either a malicious or a
99
+ # corrupted file.
96
100
  rescue ReadLimiter::BudgetExceeded
97
101
  # The parser tried to read too much - most likely the file structure
98
102
  # caused the parser to go off-track. Strictly speaking we should log this
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.4.0'
2
+ VERSION = '0.5.0'
3
3
  end
data/lib/io_utils.rb CHANGED
@@ -2,6 +2,9 @@ module FormatParser::IOUtils
2
2
  class InvalidRead < ArgumentError
3
3
  end
4
4
 
5
+ class MalformedFile < ArgumentError
6
+ end
7
+
5
8
  def safe_read(io, n)
6
9
  raise ArgumentError, 'Unbounded reads are not supported' if n.nil?
7
10
  buf = io.read(n)
@@ -0,0 +1,75 @@
1
+ class FormatParser::FLACParser
2
+ include FormatParser::IOUtils
3
+
4
+ MAGIC_BYTES = 4
5
+ MAGIC_BYTE_STRING = 'fLaC'
6
+ BLOCK_HEADER_BYTES = 4
7
+
8
+ def bytestring_to_int(s)
9
+ s.unpack('B*')[0].to_i(2)
10
+ end
11
+
12
+ def call(io)
13
+ magic_bytes = safe_read(io, MAGIC_BYTES)
14
+
15
+ return unless magic_bytes == MAGIC_BYTE_STRING
16
+
17
+ # Skip info we don't need
18
+ safe_skip(io, BLOCK_HEADER_BYTES)
19
+
20
+ minimum_block_size = bytestring_to_int(safe_read(io, 2))
21
+
22
+ if minimum_block_size < 16
23
+ raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
24
+ end
25
+
26
+ maximum_block_size = bytestring_to_int(safe_read(io, 2))
27
+
28
+ if maximum_block_size < minimum_block_size
29
+ raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
30
+ end
31
+
32
+ minimum_frame_size = bytestring_to_int(safe_read(io, 3))
33
+ maximum_frame_size = bytestring_to_int(safe_read(io, 3))
34
+
35
+ # Audio info comes in irregularly sized (i.e. not 8-bit) chunks,
36
+ # so read total as bitstring and parse separately
37
+ audio_info = safe_read(io, 8).unpack('B*')[0]
38
+
39
+ # sample rate is 20 bits
40
+ sample_rate = audio_info.slice!(0..19).to_i(2)
41
+
42
+ raise MalformedFile, 'FLAC file sample rate must be larger than 0' unless sample_rate > 0
43
+
44
+ # Number of channels is 3 bits
45
+ # Header contains number of channels minus one, so add one
46
+ num_channels = audio_info.slice!(0..2).to_i(2) + 1
47
+
48
+ # Bits per sample is 5 bits
49
+ # Header contains number of bits per sample minus one, so add one
50
+ bits_per_sample = audio_info.slice!(0..4).to_i(2) + 1
51
+
52
+ # Total samples is 36 bits
53
+ total_samples = audio_info.slice!(0..35).to_i(2)
54
+
55
+ # Division is safe due to check above
56
+ duration = total_samples.to_f / sample_rate
57
+
58
+ FormatParser::Audio.new(
59
+ format: :flac,
60
+ num_audio_channels: num_channels,
61
+ audio_sample_rate_hz: sample_rate,
62
+ media_duration_seconds: duration,
63
+ media_duration_frames: total_samples,
64
+ intrinsics: {
65
+ bits_per_sample: bits_per_sample,
66
+ minimum_frame_size: minimum_frame_size,
67
+ maximum_frame_size: maximum_frame_size,
68
+ minimum_block_size: minimum_block_size,
69
+ maximum_block_size: maximum_block_size
70
+ }
71
+ )
72
+ end
73
+
74
+ FormatParser.register_parser self, natures: :audio, formats: :flac
75
+ end
@@ -52,19 +52,28 @@ class FormatParser::MOOVParser
52
52
  media_duration_s = duration / timescale.to_f
53
53
  end
54
54
 
55
- FormatParser::Video.new(
56
- format: format_from_moov_type(file_type),
57
- width_px: width,
58
- height_px: height,
59
- media_duration_seconds: media_duration_s,
60
- intrinsics: atom_tree,
61
- )
55
+ # M4A only contains audio, while MP4 and friends can contain video.
56
+ if format_from_moov_type(file_type) == :m4a
57
+ FormatParser::Audio.new(
58
+ format: format_from_moov_type(file_type),
59
+ media_duration_seconds: media_duration_s,
60
+ intrinsics: atom_tree,
61
+ )
62
+ else
63
+ FormatParser::Video.new(
64
+ format: format_from_moov_type(file_type),
65
+ width_px: width,
66
+ height_px: height,
67
+ media_duration_seconds: media_duration_s,
68
+ intrinsics: atom_tree,
69
+ )
70
+ end
62
71
  end
63
72
 
64
73
  private
65
74
 
66
75
  def format_from_moov_type(file_type)
67
- FTYP_MAP.fetch(file_type, :mov)
76
+ FTYP_MAP.fetch(file_type.downcase, :mov)
68
77
  end
69
78
 
70
79
  # An MPEG4/MOV/M4A will start with the "ftyp" atom. The atom must have a length
@@ -181,6 +181,10 @@ class FormatParser::MOOVParser::Decoder
181
181
  }
182
182
  end
183
183
 
184
+ def parse_meta_atom(io, atom_size)
185
+ parse_hdlr_atom(io, atom_size)
186
+ end
187
+
184
188
  def parse_atom_fields_per_type(io, atom_size, atom_type)
185
189
  if respond_to?("parse_#{atom_type}_atom", true)
186
190
  send("parse_#{atom_type}_atom", io, atom_size)
@@ -189,6 +193,11 @@ class FormatParser::MOOVParser::Decoder
189
193
  end
190
194
  end
191
195
 
196
+ def parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
197
+ parse_atom_fields_per_type(io, atom_size_sans_header, atom_type)
198
+ extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type])
199
+ end
200
+
192
201
  # Recursive descent parser - will drill down to atoms which
193
202
  # we know are permitted to have leaf/branch atoms within itself,
194
203
  # and will attempt to recover the data fields for leaf atoms
@@ -215,7 +224,7 @@ class FormatParser::MOOVParser::Decoder
215
224
  atom_size_sans_header = atom_size - size_of_atom_type_and_size
216
225
 
217
226
  children, fields = if KNOWN_BRANCH_AND_LEAF_ATOM_TYPES.include?(atom_type)
218
- parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type)
227
+ parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
219
228
  elsif KNOWN_BRANCH_ATOM_TYPES.include?(atom_type)
220
229
  [extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type]), nil]
221
230
  else # Assume leaf atom
@@ -23,7 +23,16 @@ class FormatParser::MP3Parser
23
23
  # Default frame size for mp3
24
24
  SAMPLES_PER_FRAME = 1152
25
25
 
26
+ # For some edge cases
27
+ ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
28
+
26
29
  def call(io)
30
+ # Special case: some ZIPs (Office documents) did detect as MP3s.
31
+ # To avoid having that happen, we check for the PKZIP signature -
32
+ # local entry header signature - at the very start of the file
33
+ return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
34
+ io.seek(0)
35
+
27
36
  # Read the last 128 bytes which might contain ID3v1
28
37
  id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
29
38
  # Read the header bytes that might contain ID3v1
@@ -1,17 +1,27 @@
1
1
  class FormatParser::ZIPParser
2
2
  require_relative 'zip_parser/file_reader'
3
+ require_relative 'zip_parser/office_formats'
4
+
5
+ include OfficeFormats
3
6
 
4
7
  def call(io)
5
8
  reader = FileReader.new
6
9
  entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
7
10
 
11
+ filenames_set = Set.new
8
12
  entries_archive = entries.map do |ze|
9
13
  ft = directory?(ze) ? :directory : :file
10
14
  decoded_filename = decode_filename(ze)
15
+ filenames_set << decoded_filename
11
16
  FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
12
17
  end
13
18
 
14
- FormatParser::Archive.new(format: :zip, entries: entries_archive)
19
+ if office_document?(filenames_set)
20
+ office_format = office_file_format_from_entry_set(filenames_set)
21
+ FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
22
+ else
23
+ FormatParser::Archive.new(nature: :archive, format: :zip, entries: entries_archive)
24
+ end
15
25
  rescue FileReader::Error
16
26
  # This is not a ZIP, or a broken ZIP.
17
27
  return
@@ -159,7 +159,6 @@ class FormatParser::ZIPParser::FileReader
159
159
  def read_zip_structure(io:)
160
160
  zip_file_size = io.size
161
161
  eocd_offset = get_eocd_offset(io, zip_file_size)
162
-
163
162
  zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
164
163
  num_files, cdir_location, cdir_size =
165
164
  if zip64_end_of_cdir_location
@@ -345,39 +344,35 @@ class FormatParser::ZIPParser::FileReader
345
344
  eocd_offset
346
345
  end
347
346
 
348
- # This is tricky. Essentially, we have to scan the maximum possible number
349
- # of bytes (that the EOCD can theoretically occupy including the comment),
350
- # and we have to find a combination of:
351
- # [EOCD signature, <some ZIP medatata>, comment byte size, the comment of
352
- # that size, eof].
353
- # The only way I could find to do this was with a sliding window, but
354
- # there probably is a better way.
347
+ def all_indices_of_substr_in_str(of_substring, in_string)
348
+ last_i = 0
349
+ found_at_indices = []
350
+ while last_i = in_string.index(of_substring, last_i)
351
+ found_at_indices << last_i
352
+ last_i += of_substring.bytesize
353
+ end
354
+ found_at_indices
355
+ end
356
+
355
357
  def locate_eocd_signature(in_str)
356
- # We have to scan from the _very_ tail. We read the very minimum size
357
- # the EOCD record can have (up to and including the comment size), using
358
- # a sliding window. Once our end offset matches the comment size we found our
359
- # EOCD marker.
358
+ eocd_signature = [0x06054b50].pack('V')
360
359
  unpack_pattern = 'VvvvvVVv'
361
360
  minimum_record_size = 22
362
- end_location = minimum_record_size * -1
363
- loop do
364
- # If the window is nil, we have rolled off the start of the string, nothing to do here.
365
- # We use negative values because if we used positive slice indices
366
- # we would have to detect the rollover ourselves
367
- break unless window = in_str[end_location, minimum_record_size]
368
-
369
- window_location = in_str.bytesize + end_location
370
- unpacked = window.unpack(unpack_pattern)
371
- # If we found the signarue, pick up the comment size, and check if the size of the window
372
- # plus that comment size is where we are in the string. If we are - bingo.
373
- if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
374
- assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
375
- # if the comment size is where we should be at - we found our EOCD
376
- return assumed_eocd_location if assumed_eocd_location == window_location
361
+ str_size = in_str.bytesize
362
+ indices = all_indices_of_substr_in_str(eocd_signature, in_str)
363
+ indices.each do |check_at|
364
+ maybe_record = in_str[check_at..str_size]
365
+ # If the record is smaller than the minimum - we will never recover anything
366
+ break if maybe_record.bytesize < minimum_record_size
367
+ signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
368
+
369
+ # Check the only condition for the match
370
+ if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
371
+ return check_at # Found the EOCD marker location
377
372
  end
378
-
379
- end_location -= 1 # Shift the window back, by one byte, and try again.
380
373
  end
374
+ # If we haven't caught anything, return nil deliberately instead of returning the last statement
375
+ nil
381
376
  end
382
377
 
383
378
  # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
@@ -0,0 +1,51 @@
1
+ # Based on an unscientific sample of 63 documents I could find on my hard drive,
2
+ # all docx/pptx/xlsx files contain, at the minimum, the following files:
3
+ #
4
+ # [Content_Types].xml
5
+ # _rels/.rels
6
+ # docProps/core.xml
7
+ # docPropx/app.xml
8
+ #
9
+ # Additionally, per file type, they contain the following:
10
+ #
11
+ # word/document.xml
12
+ # xl/workbook.xml
13
+ # ppt/presentation.xml
14
+ #
15
+ # These are sufficient to say with certainty that a ZIP is in fact an Office document.
16
+ # Also that unscientific sample revealed that I came to dislike MS Office so much as to
17
+ # only have 63 documents on my entire workstation.
18
+ #
19
+ # We do not perform the actual _decoding_ of the Office documents here, because to read
20
+ # their contents we need to:
21
+ #
22
+ # * inflate the compressed part files (potential for deflate bombs)
23
+ # * parse the document XML (potential for XML parser exploitation)
24
+ #
25
+ # which are real threats and require adequate mitigation. For our purposes the
26
+ # token detection of specific filenames should be enough to say with certainty
27
+ # that a document _is_ an Office document, and not just a ZIP.
28
+ module FormatParser::ZIPParser::OfficeFormats
29
+ OFFICE_MARKER_FILES = Set.new([
30
+ '[Content_Types].xml',
31
+ '_rels/.rels',
32
+ 'docProps/core.xml',
33
+ 'docProps/app.xml',
34
+ ])
35
+
36
+ def office_document?(filenames_set)
37
+ OFFICE_MARKER_FILES.subset?(filenames_set)
38
+ end
39
+
40
+ def office_file_format_from_entry_set(filenames_set)
41
+ if filenames_set.include?('word/document.xml')
42
+ :docx
43
+ elsif filenames_set.include?('xl/workbook.xml')
44
+ :xlsx
45
+ elsif filenames_set.include?('ppt/presentation.xml')
46
+ :pptx
47
+ else
48
+ :unknown
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,10 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'Parsing esoteric files and files causing ambiguous detection' do
4
+ it 'correctly parses the test .docx files as Office docs' do
5
+ docx_path = fixtures_dir + '/ZIP/10.docx'
6
+ result = FormatParser.parse(File.open(docx_path, 'rb'))
7
+ expect(result).not_to be_nil
8
+ expect(result.nature).to eq(:document)
9
+ end
10
+ end
@@ -0,0 +1,63 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::FLACParser do
4
+ it 'decodes and estimates duration for the atc_fixture_vbr FLAC File' do
5
+ fpath = fixtures_dir + 'FLAC/atc_fixture_vbr.flac'
6
+ parsed = subject.call(File.open(fpath, 'rb'))
7
+
8
+ expect(parsed).not_to be_nil
9
+
10
+ expect(parsed.nature).to eq(:audio)
11
+ expect(parsed.format).to eq(:flac)
12
+ expect(parsed.num_audio_channels).to eq(2)
13
+ expect(parsed.audio_sample_rate_hz).to eq(44100)
14
+ expect(parsed.intrinsics).not_to be_nil
15
+ expect(parsed.media_duration_frames).to eq(33810)
16
+ expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
17
+ end
18
+
19
+ it 'decodes and estimates duration for the 16bit FLAC File' do
20
+ fpath = fixtures_dir + 'FLAC/c_11k16bitpcm.flac'
21
+ parsed = subject.call(File.open(fpath, 'rb'))
22
+
23
+ expect(parsed).not_to be_nil
24
+
25
+ expect(parsed.nature).to eq(:audio)
26
+ expect(parsed.format).to eq(:flac)
27
+ expect(parsed.intrinsics[:bits_per_sample]).to eq(16)
28
+ expect(parsed.num_audio_channels).to eq(1)
29
+ expect(parsed.audio_sample_rate_hz).to eq(11025)
30
+ expect(parsed.media_duration_frames).to eq(152267)
31
+ expect(parsed.media_duration_seconds).to be_within(0.01).of(13.81)
32
+ end
33
+
34
+ it 'raises error on parsing an invalid file' do
35
+ fpath = fixtures_dir + 'FLAC/invalid.flac'
36
+
37
+ expect {
38
+ subject.call(File.open(fpath, 'rb'))
39
+ }.to raise_error(FormatParser::IOUtils::InvalidRead)
40
+ end
41
+
42
+ it 'raises error on parsing a file with an invalid block size' do
43
+ fpath = fixtures_dir + 'FLAC/invalid_minimum_block_size.flac'
44
+
45
+ expect {
46
+ subject.call(File.open(fpath, 'rb'))
47
+ }.to raise_error(FormatParser::IOUtils::MalformedFile)
48
+
49
+ fpath = fixtures_dir + 'FLAC/invalid_maximum_block_size.flac'
50
+
51
+ expect {
52
+ subject.call(File.open(fpath, 'rb'))
53
+ }.to raise_error(FormatParser::IOUtils::MalformedFile)
54
+ end
55
+
56
+ it 'raises an error when sample rate is 0' do
57
+ fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
58
+
59
+ expect {
60
+ subject.call(File.open(fpath, 'rb'))
61
+ }.to raise_error(FormatParser::IOUtils::MalformedFile)
62
+ end
63
+ end
@@ -29,9 +29,37 @@ describe FormatParser::MOOVParser do
29
29
  end
30
30
  end
31
31
 
32
- Dir.glob(fixtures_dir + '/MOOV/**/*.*').sort.each do |moov_path|
33
- it "is able to parse #{File.basename(moov_path)}" do
34
- result = subject.call(File.open(moov_path, 'rb'))
32
+ Dir.glob(fixtures_dir + '/MOOV/**/*.m4a').sort.each do |m4a_path|
33
+ it "is able to parse #{File.basename(m4a_path)}" do
34
+ result = subject.call(File.open(m4a_path, 'rb'))
35
+
36
+ expect(result).not_to be_nil
37
+ expect(result.nature).to eq(:audio)
38
+ expect(result.media_duration_seconds).to be_kind_of(Float)
39
+ expect(result.media_duration_seconds).to be > 0
40
+
41
+ expect(result.intrinsics).not_to be_nil
42
+ end
43
+ end
44
+
45
+ Dir.glob(fixtures_dir + '/MOOV/**/*.mov').sort.each do |mov_path|
46
+ it "is able to parse #{File.basename(mov_path)}" do
47
+ result = subject.call(File.open(mov_path, 'rb'))
48
+
49
+ expect(result).not_to be_nil
50
+ expect(result.nature).to eq(:video)
51
+ expect(result.width_px).to be > 0
52
+ expect(result.height_px).to be > 0
53
+ expect(result.media_duration_seconds).to be_kind_of(Float)
54
+ expect(result.media_duration_seconds).to be > 0
55
+
56
+ expect(result.intrinsics).not_to be_nil
57
+ end
58
+ end
59
+
60
+ Dir.glob(fixtures_dir + '/MOOV/**/*.mp4').sort.each do |mp4_path|
61
+ it "is able to parse #{File.basename(mp4_path)}" do
62
+ result = subject.call(File.open(mp4_path, 'rb'))
35
63
 
36
64
  expect(result).not_to be_nil
37
65
  expect(result.nature).to eq(:video)
@@ -44,7 +72,14 @@ describe FormatParser::MOOVParser do
44
72
  end
45
73
  end
46
74
 
47
- it 'parses an M4A file and provides the necessary metadata'
75
+ it 'parses an M4A file and provides the necessary metadata' do
76
+ m4a_path = fixtures_dir + '/MOOV/M4A/fixture.m4a'
77
+
78
+ result = subject.call(File.open(m4a_path, 'rb'))
79
+ expect(result).not_to be_nil
80
+ expect(result.nature).to eq(:audio)
81
+ expect(result.format).to eq(:m4a)
82
+ end
48
83
 
49
84
  it 'parses a MOV file and provides the necessary metadata' do
50
85
  mov_path = fixtures_dir + '/MOOV/MOV/Test_Circular_ProRes422.mov'
@@ -46,6 +46,30 @@ describe FormatParser::ZIPParser do
46
46
  expect(dir_entry.type).to eq(:directory)
47
47
  end
48
48
 
49
+ it 'correctly identifies Word documents' do
50
+ fixture_path = fixtures_dir + '/ZIP/10.docx'
51
+ fi_io = File.open(fixture_path, 'rb')
52
+
53
+ result = subject.call(fi_io)
54
+ expect(result.nature).to eq(:document)
55
+ expect(result.format).to eq(:docx)
56
+
57
+ fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
58
+ fi_io = File.open(fixture_path, 'rb')
59
+
60
+ result = subject.call(fi_io)
61
+ expect(result.nature).to eq(:document)
62
+ expect(result.format).to eq(:docx)
63
+ end
64
+
65
+ it 'is able to handle specific fuzzed input' do
66
+ r = Random.new(354)
67
+ 1024.times do
68
+ random_blob = StringIO.new(r.bytes(512 * 1024))
69
+ subject.call(random_blob) # If there is an error in one of the parsers the example will raise too
70
+ end
71
+ end
72
+
49
73
  it 'returns a result that has a usable JSON representation' do
50
74
  fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
51
75
  fi_io = File.open(fixture_path, 'rb')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-03-30 00:00:00.000000000 Z
12
+ date: 2018-04-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -174,6 +174,7 @@ files:
174
174
  - lib/parsers/dpx_parser.rb
175
175
  - lib/parsers/exif_parser.rb
176
176
  - lib/parsers/fdx_parser.rb
177
+ - lib/parsers/flac_parser.rb
177
178
  - lib/parsers/gif_parser.rb
178
179
  - lib/parsers/jpeg_parser.rb
179
180
  - lib/parsers/moov_parser.rb
@@ -188,11 +189,13 @@ files:
188
189
  - lib/parsers/wav_parser.rb
189
190
  - lib/parsers/zip_parser.rb
190
191
  - lib/parsers/zip_parser/file_reader.rb
192
+ - lib/parsers/zip_parser/office_formats.rb
191
193
  - lib/read_limiter.rb
192
194
  - lib/remote_io.rb
193
195
  - lib/video.rb
194
196
  - spec/attributes_json_spec.rb
195
197
  - spec/care_spec.rb
198
+ - spec/esoteric_formats_spec.rb
196
199
  - spec/file_information_spec.rb
197
200
  - spec/format_parser_spec.rb
198
201
  - spec/io_utils_spec.rb
@@ -201,6 +204,7 @@ files:
201
204
  - spec/parsers/dpx_parser_spec.rb
202
205
  - spec/parsers/exif_parser_spec.rb
203
206
  - spec/parsers/fdx_parser_spec.rb
207
+ - spec/parsers/flac_parser_spec.rb
204
208
  - spec/parsers/gif_parser_spec.rb
205
209
  - spec/parsers/jpeg_parser_spec.rb
206
210
  - spec/parsers/moov_parser_spec.rb
@@ -236,7 +240,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
236
240
  version: '0'
237
241
  requirements: []
238
242
  rubyforge_project:
239
- rubygems_version: 2.5.2
243
+ rubygems_version: 2.7.3
240
244
  signing_key:
241
245
  specification_version: 4
242
246
  summary: A library for efficient parsing of file metadata