format_parser 0.9.0 → 0.9.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/CONTRIBUTING.md +76 -0
- data/README.md +21 -65
- data/lib/attributes_json.rb +1 -0
- data/lib/format_parser.rb +2 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/jpeg_parser.rb +49 -31
- data/lib/parsers/mp3_parser.rb +4 -0
- data/lib/read_limiter.rb +2 -0
- data/lib/read_limits_config.rb +1 -1
- data/spec/attributes_json_spec.rb +15 -0
- data/spec/esoteric_formats_spec.rb +7 -0
- data/spec/format_parser_spec.rb +3 -2
- data/spec/parsers/jpeg_parser_spec.rb +38 -0
- data/spec/parsers/mp3_parser_spec.rb +7 -0
- data/spec/read_limiter_spec.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd4a3b56391cebff09efc933b5fd48188c67f913adeec46c77a832ff067fd870
|
4
|
+
data.tar.gz: 159c14df0b5740f627a99915f05750bf753c89620017733eab067c6e865e972d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 559b0f5709bd8fc23cb20468eed5c3840a4087a444a581591b530490278532dff8700a54652e7e9cc8ab82b89ac2db910bf8e79620c7de5cf5a172cd7285ade3
|
7
|
+
data.tar.gz: d5baa5e7b5aea3cce2acf2513509b4dac8ad40dceb29a69875a10e4322dd136214ae650e82254416d1d39907a33cb18ed0d40634af0fa974a68ce2b0dc99b7dd
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
## 0.9.3
|
2
|
+
* Fix a JPEG parsing regression introduced in 0.9.1
|
3
|
+
|
4
|
+
## 0.9.2 (yanked)
|
5
|
+
* Make sure MP3 parser returns `nil` when encountering infinite duration
|
6
|
+
* Do not read JPEG APP1 markers that contain no EXIF data
|
7
|
+
* Explicitly replace `Float::INFINITY` values in `AttributesJSON` with `nil` as per JSON convention
|
8
|
+
* Make sure the cached pages in `Care` are explicitly deleted after each `parse` call (should help GC)
|
9
|
+
* Raise the pagefaults restriction to 16 to cope with "too many useless markers in JPEGs" scenario once more
|
10
|
+
|
11
|
+
## 0.9.1 (yanked)
|
12
|
+
* Perf: Make JPEG parser bail out earlier if no marker is found while scanning through 1024 bytes of data
|
13
|
+
|
1
14
|
## 0.9.0
|
2
15
|
* Add a parser for the BMP image file format
|
3
16
|
|
data/CONTRIBUTING.md
CHANGED
@@ -87,6 +87,82 @@ or no result as soon as possible (once you know the file is not fit for your spe
|
|
87
87
|
Bear in mind that we enforce read budgets per-parser, so you will not be allowed to perform
|
88
88
|
too many reads, or perform reads which are too large.
|
89
89
|
|
90
|
+
In order to create new parsers, it is recommended to make a well-named class with an instance method `call`.
|
91
|
+
|
92
|
+
`call` accepts the IO-ish object as an argument, parses data that it reads from it,
|
93
|
+
and then returns the metadata for the file (if it could recover any) or `nil` if it couldn't. All files pass
|
94
|
+
through all parsers by default, so if you are dealing with a file that is not "your" format - return `nil` from
|
95
|
+
your method or `break` your Proc as early as possible. A blank `return` works fine too.
|
96
|
+
|
97
|
+
The IO will at the minimum support the subset of the IO API defined in `IOConstraint`
|
98
|
+
|
99
|
+
Your parser has to be registered using `FormatParser.register_parser` with the information on the formats
|
100
|
+
and file natures it provides.
|
101
|
+
|
102
|
+
Down below you can find the most basic parser implementation:
|
103
|
+
|
104
|
+
```ruby
|
105
|
+
MyParser = ->(io) {
|
106
|
+
# ... do some parsing with `io`
|
107
|
+
magic_bytes = io.read(4)
|
108
|
+
# breaking the block returns `nil` to the caller signaling "no match"
|
109
|
+
break if magic_bytes != 'IMGA'
|
110
|
+
|
111
|
+
parsed_width, parsed_height = io.read(8).unpack('VV')
|
112
|
+
# ...and return the FileInformation::Image object with the metadata.
|
113
|
+
FormatParser::Image.new(
|
114
|
+
format: :imga,
|
115
|
+
width_px: parsed_width,
|
116
|
+
height_px: parsed_height,
|
117
|
+
)
|
118
|
+
}
|
119
|
+
|
120
|
+
# Register the parser with the module, so that it will be applied to any
|
121
|
+
# document given to `FormatParser.parse()`. The supported natures are currently
|
122
|
+
# - :audio
|
123
|
+
# - :document
|
124
|
+
# - :image
|
125
|
+
# - :video
|
126
|
+
# - :archive
|
127
|
+
FormatParser.register_parser MyParser, natures: :image, formats: :imga
|
128
|
+
```
|
129
|
+
|
130
|
+
If you are using a class, this is the skeleton to use:
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
class MyParser
|
134
|
+
def call(io)
|
135
|
+
# ... do some parsing with `io`
|
136
|
+
# The instance will be discarded after parsing, so using instance variables
|
137
|
+
# is permitted - they are not shared between calls to `call`
|
138
|
+
@magic_bytes = io.read(4)
|
139
|
+
return if @magic_bytes != 'IMGA'
|
140
|
+
parsed_width, parsed_height = io.read(8).unpack('VV')
|
141
|
+
FormatParser::Image.new(
|
142
|
+
format: :imga,
|
143
|
+
width_px: parsed_width,
|
144
|
+
height_px: parsed_height,
|
145
|
+
)
|
146
|
+
end
|
147
|
+
|
148
|
+
FormatParser.register_parser self, natures: :image, formats: :imga
|
149
|
+
end
|
150
|
+
```
|
151
|
+
|
152
|
+
### Calling convention for preparing parsers
|
153
|
+
|
154
|
+
A parser that gets registered using `register_parser` must be either:
|
155
|
+
|
156
|
+
1) An object that can be `call()`-ed itself, with an argument that conforms to `IOConstraint`
|
157
|
+
2) An object that responds to `new` and returns something that can be `call()`-ed with an argument that conforms to `IOConstraint`.
|
158
|
+
|
159
|
+
The second option is recommended for most cases.
|
160
|
+
|
161
|
+
FormatParser is made to be used in threaded environments, and if you use instance variables
|
162
|
+
you need your parser to be isolated from its siblings in other threads - therefore you can pass
|
163
|
+
a Class on registration to have your parser instantiated for each `call()`, anew.
|
164
|
+
|
165
|
+
|
90
166
|
## Pull requests
|
91
167
|
|
92
168
|
Good pull requests-patches, improvements, new features-are a fantastic
|
data/README.md
CHANGED
@@ -12,7 +12,24 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
|
|
12
12
|
|
13
13
|
## Currently supported filetypes:
|
14
14
|
|
15
|
-
|
15
|
+
* TIFF
|
16
|
+
* CR2
|
17
|
+
* PSD
|
18
|
+
* PNG
|
19
|
+
* MP3
|
20
|
+
* JPEG
|
21
|
+
* GIF
|
22
|
+
* PDF
|
23
|
+
* DPX
|
24
|
+
* AIFF
|
25
|
+
* WAV
|
26
|
+
* FLAC
|
27
|
+
* FDX
|
28
|
+
* MOV
|
29
|
+
* MP4
|
30
|
+
* M4A
|
31
|
+
* ZIP
|
32
|
+
* DOCX, PPTX, XLSX
|
16
33
|
|
17
34
|
...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
|
18
35
|
|
@@ -32,7 +49,7 @@ match.orientation #=> :top_left
|
|
32
49
|
If you would rather receive all potential results from the gem, call the gem as follows:
|
33
50
|
|
34
51
|
```ruby
|
35
|
-
FormatParser.parse(File.open("myimage.jpg", "rb"), results: :all)
|
52
|
+
array_of_results = FormatParser.parse(File.open("myimage.jpg", "rb"), results: :all)
|
36
53
|
```
|
37
54
|
|
38
55
|
You can also optimize the metadata extraction by providing hints to the gem:
|
@@ -50,69 +67,7 @@ JSON.pretty_generate(img_info) #=> ...
|
|
50
67
|
|
51
68
|
## Creating your own parsers
|
52
69
|
|
53
|
-
|
54
|
-
parsing, and then returns the metadata for the file (if it could recover any) or `nil` if it couldn't. All files pass
|
55
|
-
through all parsers by default, so if you are dealing with a file that is not "your" format - return `nil` from
|
56
|
-
your method or `break` your Proc as early as possible. A blank `return` works fine too.
|
57
|
-
|
58
|
-
The IO will at the minimum support the subset of the IO API defined in `IOConstraint`
|
59
|
-
|
60
|
-
Strictly, a parser should be one of the two things:
|
61
|
-
|
62
|
-
1) An object that can be `call()`-ed itself, with an argument that conforms to `IOConstraint`
|
63
|
-
2) An object that responds to `new` and returns something that can be `call()`-ed with the same convention.
|
64
|
-
|
65
|
-
The second opton is useful for parsers that are stateful and non-reentrant. FormatParser is made to be used in
|
66
|
-
threaded environments, and if you use instance variables you need your parser to be isolated from it's siblings in
|
67
|
-
other threads - therefore you can pass a Class on registration to have your parser instantiated for each `call()`,
|
68
|
-
anew.
|
69
|
-
|
70
|
-
Your parser has to be registered using `FormatParser.register_parser` with the information on the formats
|
71
|
-
and file natures it provides.
|
72
|
-
|
73
|
-
Down below you can find a basic parser implementation:
|
74
|
-
|
75
|
-
```ruby
|
76
|
-
MyParser = ->(io) {
|
77
|
-
# ... do some parsing with `io`
|
78
|
-
magic_bytes = io.read(4)
|
79
|
-
break if magic_bytes != 'XBMP'
|
80
|
-
# ... more parsing code
|
81
|
-
# ...and return the FileInformation::Image object with the metadata.
|
82
|
-
FormatParser::Image.new(
|
83
|
-
width_px: parsed_width,
|
84
|
-
height_px: parsed_height,
|
85
|
-
)
|
86
|
-
}
|
87
|
-
|
88
|
-
# Register the parser with the module, so that it will be applied to any
|
89
|
-
# document given to `FormatParser.parse()`. The supported natures are currently
|
90
|
-
# - :audio
|
91
|
-
# - :document
|
92
|
-
# - :image
|
93
|
-
# - :video
|
94
|
-
FormatParser.register_parser MyParser, natures: :image, formats: :bmp
|
95
|
-
```
|
96
|
-
|
97
|
-
If you are using a class, this is the skeleton to use:
|
98
|
-
|
99
|
-
```ruby
|
100
|
-
class MyParser
|
101
|
-
def call(io)
|
102
|
-
# ... do some parsing with `io`
|
103
|
-
magic_bytes = io.read(4)
|
104
|
-
return unless magic_bytes == 'XBMP'
|
105
|
-
# ... more parsing code
|
106
|
-
# ...and return the FileInformation::Image object with the metadata.
|
107
|
-
FormatParser::Image.new(
|
108
|
-
width_px: parsed_width,
|
109
|
-
height_px: parsed_height,
|
110
|
-
)
|
111
|
-
end
|
112
|
-
|
113
|
-
FormatParser.register_parser self, natures: :image, formats: :bmp
|
114
|
-
end
|
115
|
-
```
|
70
|
+
See the [section on writing parsers in CONTRIBUTING.md](CONTRIBUTING.md#so-you-want-to-contribute-a-new-parser)
|
116
71
|
|
117
72
|
## Design rationale
|
118
73
|
|
@@ -151,6 +106,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
151
106
|
### JPEG
|
152
107
|
- `divergent_pixel_dimensions_exif.jpg` is used with permission from LiveKom GmbH
|
153
108
|
- `extended_reads.jpg` has kindly been made available by Raphaelle Pellerin for use exclusively with format_parser
|
109
|
+
- `too_many_APP1_markers_surrogate.jpg` was created by the project maintainers
|
154
110
|
|
155
111
|
### AIFF
|
156
112
|
- fixture.aiff was created by one of the project maintainers and is MIT licensed
|
data/lib/attributes_json.rb
CHANGED
@@ -20,6 +20,7 @@ module FormatParser::AttributesJSON
|
|
20
20
|
methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
|
21
21
|
reader_method_name = attr_writer_method_name.to_s.gsub(/\=$/, '')
|
22
22
|
value = public_send(reader_method_name)
|
23
|
+
value = nil if value == Float::INFINITY
|
23
24
|
# When calling as_json on our members there is no need to pass the root: option given to us
|
24
25
|
# by the caller
|
25
26
|
h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
|
data/lib/format_parser.rb
CHANGED
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -10,6 +10,7 @@ class FormatParser::JPEGParser
|
|
10
10
|
SOS_MARKER = 0xDA # start of stream
|
11
11
|
APP1_MARKER = 0xE1 # maybe EXIF
|
12
12
|
EXIF_MAGIC_STRING = "Exif\0\0".b
|
13
|
+
MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
|
13
14
|
|
14
15
|
def call(io)
|
15
16
|
@buf = FormatParser::IOConstraint.new(io)
|
@@ -61,6 +62,8 @@ class FormatParser::JPEGParser
|
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
65
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_read_until_capture', @buf.pos)
|
66
|
+
|
64
67
|
# Return at the earliest possible opportunity
|
65
68
|
if @width && @height
|
66
69
|
return FormatParser::Image.new(
|
@@ -78,11 +81,18 @@ class FormatParser::JPEGParser
|
|
78
81
|
end
|
79
82
|
|
80
83
|
# Read a byte, if it is 0xFF then skip bytes as long as they are also 0xFF (byte stuffing)
|
81
|
-
# and return the first byte scanned that is not 0xFF
|
84
|
+
# and return the first byte scanned that is not 0xFF. Also applies limits so that we do not
|
85
|
+
# read for an inordinate amount of time should we encounter a file where we _do_ have a SOI
|
86
|
+
# marker at the start and then no markers for a _very_ long time (happened with some PSDs)
|
82
87
|
def read_next_marker
|
83
|
-
|
84
|
-
|
85
|
-
|
88
|
+
# We need to find a sequence of two bytes - the first one is 0xFF, the other is anything but 0xFF
|
89
|
+
a = read_char
|
90
|
+
(MUST_FIND_NEXT_MARKER_WITHIN_BYTES - 1).times do
|
91
|
+
b = read_char
|
92
|
+
return b if a == 0xFF && b != 0xFF # Caught the marker
|
93
|
+
a = b # Shift the tuple one byte forward
|
94
|
+
end
|
95
|
+
nil # Nothing found
|
86
96
|
end
|
87
97
|
|
88
98
|
def scan_start_of_frame
|
@@ -107,34 +117,42 @@ class FormatParser::JPEGParser
|
|
107
117
|
# pry it out of the APP1 frame and parse it as the TIFF segment - which is what EXIFR
|
108
118
|
# does under the hood.
|
109
119
|
app1_frame_content_length = read_short - 2
|
110
|
-
|
111
|
-
|
112
|
-
maybe_exif_magic_str =
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
@exif_output = scanner.exif_data
|
120
|
-
@orientation = scanner.orientation unless scanner.orientation.nil?
|
121
|
-
@intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
|
122
|
-
@intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
|
123
|
-
# Save these two for later, when we decide to provide display width /
|
124
|
-
# display height in addition to pixel buffer width / height. These two
|
125
|
-
# are _different concepts_. Imagine you have an image shot with a camera
|
126
|
-
# in portrait orientation, and the camera has an anamorphic lens. That
|
127
|
-
# anamorpohic lens is a smart lens, and therefore transmits pixel aspect
|
128
|
-
# ratio to the camera, and the camera encodes that aspect ratio into the
|
129
|
-
# image metadata. If we want to know what size our _pixel buffer_ will be,
|
130
|
-
# and how to _read_ the pixel data (stride/interleaving) - we need the
|
131
|
-
# pixel buffer dimensions. If we want to know what aspect and dimensions
|
132
|
-
# our file is going to have _once displayed_ and _once pixels have been
|
133
|
-
# brought to the right orientation_ we need to work with **display dimensions**
|
134
|
-
# which can be remarkably different from the pixel buffer dimensions.
|
135
|
-
@exif_width = scanner.width
|
136
|
-
@exif_height = scanner.height
|
120
|
+
|
121
|
+
# Peek whether the contents of the marker starts with Exif\0
|
122
|
+
maybe_exif_magic_str = safe_read(@buf, EXIF_MAGIC_STRING.bytesize)
|
123
|
+
|
124
|
+
# If we could not find the magic Exif\0 string at the start of the marker,
|
125
|
+
# seek to the start of the next marker and return
|
126
|
+
unless maybe_exif_magic_str == EXIF_MAGIC_STRING
|
127
|
+
safe_skip(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize)
|
128
|
+
return
|
137
129
|
end
|
130
|
+
|
131
|
+
# ...and only then read the marker contents and parse it as EXIF
|
132
|
+
exif_data = safe_read(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize)
|
133
|
+
|
134
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', exif_data.bytesize)
|
135
|
+
scanner = FormatParser::EXIFParser.new(StringIO.new(exif_data))
|
136
|
+
scanner.scan_image_tiff
|
137
|
+
|
138
|
+
@exif_output = scanner.exif_data
|
139
|
+
@orientation = scanner.orientation unless scanner.orientation.nil?
|
140
|
+
@intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
|
141
|
+
@intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
|
142
|
+
# Save these two for later, when we decide to provide display width /
|
143
|
+
# display height in addition to pixel buffer width / height. These two
|
144
|
+
# are _different concepts_. Imagine you have an image shot with a camera
|
145
|
+
# in portrait orientation, and the camera has an anamorphic lens. That
|
146
|
+
# anamorphic lens is a smart lens, and therefore transmits pixel aspect
|
147
|
+
# ratio to the camera, and the camera encodes that aspect ratio into the
|
148
|
+
# image metadata. If we want to know what size our _pixel buffer_ will be,
|
149
|
+
# and how to _read_ the pixel data (stride/interleaving) - we need the
|
150
|
+
# pixel buffer dimensions. If we want to know what aspect and dimensions
|
151
|
+
# our file is going to have _once displayed_ and _once pixels have been
|
152
|
+
# brought to the right orientation_ we need to work with **display dimensions**
|
153
|
+
# which can be remarkably different from the pixel buffer dimensions.
|
154
|
+
@exif_width = scanner.width
|
155
|
+
@exif_height = scanner.height
|
138
156
|
rescue EXIFR::MalformedTIFF
|
139
157
|
# Not a JPEG or the Exif headers contain invalid data, or
|
140
158
|
# an APP1 marker was detected in a file that is not a JPEG
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -83,6 +83,10 @@ class FormatParser::MP3Parser
|
|
83
83
|
est_samples = est_frame_count * SAMPLES_PER_FRAME
|
84
84
|
est_duration_seconds = est_samples / avg_sample_rate
|
85
85
|
|
86
|
+
# Safeguard for e.g. some JPEGs being recognized as MP3
|
87
|
+
# to prevent ambiguous recognition
|
88
|
+
return if est_duration_seconds == Float::INFINITY
|
89
|
+
|
86
90
|
file_info.media_duration_seconds = est_duration_seconds
|
87
91
|
file_info
|
88
92
|
end
|
data/lib/read_limiter.rb
CHANGED
data/lib/read_limits_config.rb
CHANGED
@@ -27,6 +27,21 @@ describe FormatParser::AttributesJSON do
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
+
it 'converts Float::INFINITY to nil' do
|
31
|
+
anon_class = Class.new do
|
32
|
+
include FormatParser::AttributesJSON
|
33
|
+
attr_accessor :some_infinity
|
34
|
+
def some_infinity
|
35
|
+
Float::INFINITY
|
36
|
+
end
|
37
|
+
end
|
38
|
+
instance = anon_class.new
|
39
|
+
output = JSON.dump(instance)
|
40
|
+
readback = JSON.parse(output, symbolize_names: true)
|
41
|
+
expect(readback).to have_key(:some_infinity)
|
42
|
+
expect(readback[:some_infinity]).to be_nil
|
43
|
+
end
|
44
|
+
|
30
45
|
it 'provides a default implementation of to_json as well' do
|
31
46
|
anon_class = Class.new do
|
32
47
|
include FormatParser::AttributesJSON
|
@@ -13,4 +13,11 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
|
|
13
13
|
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
14
14
|
expect(result.nature).to eq(:archive)
|
15
15
|
end
|
16
|
+
|
17
|
+
it 'returns a result for JPEG file that causes many reads due to too many APP1 markers' do
|
18
|
+
jpeg_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
19
|
+
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
20
|
+
expect(result).not_to be_nil
|
21
|
+
expect(result.nature).to eq(:image)
|
22
|
+
end
|
16
23
|
end
|
data/spec/format_parser_spec.rb
CHANGED
@@ -32,10 +32,11 @@ describe FormatParser do
|
|
32
32
|
FormatParser.register_parser exploit, natures: :document, formats: :exploit
|
33
33
|
|
34
34
|
sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 8))
|
35
|
-
|
36
|
-
expect(sample_io).to receive(:read).at_most(8).times.and_call_original
|
35
|
+
allow(sample_io).to receive(:read).and_call_original
|
37
36
|
|
38
37
|
result = FormatParser.parse(sample_io, formats: [:exploit])
|
38
|
+
|
39
|
+
expect(sample_io).to have_received(:read).at_most(16).times
|
39
40
|
expect(result).to be_nil
|
40
41
|
|
41
42
|
FormatParser.deregister_parser(exploit)
|
@@ -49,9 +49,47 @@ describe FormatParser::JPEGParser do
|
|
49
49
|
expect(result.intrinsics).to eq(exif_pixel_x_dimension: 8214, exif_pixel_y_dimension: 5476)
|
50
50
|
end
|
51
51
|
|
52
|
+
it 'reads an example with many APP1 markers at the beginning of which none are EXIF' do
|
53
|
+
fixture_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
54
|
+
io = FormatParser::ReadLimiter.new(File.open(fixture_path, 'rb'))
|
55
|
+
|
56
|
+
result = subject.call(io)
|
57
|
+
|
58
|
+
expect(result).not_to be_nil
|
59
|
+
expect(result.width_px).to eq(1920)
|
60
|
+
expect(result.height_px).to eq(1200)
|
61
|
+
|
62
|
+
expect(io.bytes).to be < (128 * 1024)
|
63
|
+
expect(io.reads).to be < (1024 * 4)
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'does not continue parsing for inordinate amount of time if the file contains no 0xFF bytes' do
|
67
|
+
# Create a large fuzzed input that consists of any bytes except 0xFF,
|
68
|
+
# so that the marker detector has nothing to latch on to
|
69
|
+
bytes_except_byte_255 = 0x0..0xFE
|
70
|
+
|
71
|
+
# Start the blob with the usual SOI marker - 0xFF 0xD8, so that the parser does not
|
72
|
+
# bail out too early and actually "bites" into the blob
|
73
|
+
no_markers = ([0xFF, 0xD8] + (16 * 1024).times.map { rand(bytes_except_byte_255) }).pack('C*')
|
74
|
+
|
75
|
+
# Yes, assertions on a private method - but we want to ensure we do not read more
|
76
|
+
# single bytes than the restriction stipulates we may. At the same time we check that
|
77
|
+
# the method does indeed, get triggered
|
78
|
+
allow(subject).to receive(:read_char).and_call_original
|
79
|
+
result = subject.call(StringIO.new(no_markers))
|
80
|
+
expect(result).to be_nil
|
81
|
+
expect(subject).to have_received(:read_char).at_most(1026).times
|
82
|
+
end
|
83
|
+
|
52
84
|
it 'does not return a result for a Keynote document' do
|
53
85
|
key_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
|
54
86
|
result = subject.call(File.open(key_path, 'rb'))
|
55
87
|
expect(result).to be_nil
|
56
88
|
end
|
89
|
+
|
90
|
+
it 'parses the the marker structure correctly when marker bytes cannot be read in groups of 2' do
|
91
|
+
kitten_path = fixtures_dir + '/JPEG/off-cadence-markers.jpg'
|
92
|
+
result = subject.call(File.open(kitten_path, 'rb'))
|
93
|
+
expect(result).not_to be_nil
|
94
|
+
end
|
57
95
|
end
|
@@ -44,4 +44,11 @@ describe FormatParser::MP3Parser do
|
|
44
44
|
|
45
45
|
expect(parsed.intrinsics).not_to be_nil
|
46
46
|
end
|
47
|
+
|
48
|
+
it 'avoids returning a result when the parsed duration is infinite' do
|
49
|
+
fpath = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
50
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
51
|
+
|
52
|
+
expect(parsed).to be_nil
|
53
|
+
end
|
47
54
|
end
|
data/spec/read_limiter_spec.rb
CHANGED
@@ -18,6 +18,18 @@ describe FormatParser::ReadLimiter do
|
|
18
18
|
expect(reader.pos).to eq(2)
|
19
19
|
end
|
20
20
|
|
21
|
+
it 'exposes #reads, #seeks, #bytes' do
|
22
|
+
reader = FormatParser::ReadLimiter.new(io)
|
23
|
+
expect(reader.pos).to eq(0)
|
24
|
+
reader.read(2)
|
25
|
+
reader.seek(3)
|
26
|
+
reader.seek(4)
|
27
|
+
|
28
|
+
expect(reader.reads).to eq(1)
|
29
|
+
expect(reader.bytes).to eq(2)
|
30
|
+
expect(reader.seeks).to eq(2)
|
31
|
+
end
|
32
|
+
|
21
33
|
it 'enforces the number of seeks' do
|
22
34
|
reader = FormatParser::ReadLimiter.new(io, max_seeks: 4)
|
23
35
|
4.times { reader.seek(1) }
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-04-
|
12
|
+
date: 2018-04-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|