format_parser 0.9.0 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/CONTRIBUTING.md +76 -0
- data/README.md +21 -65
- data/lib/attributes_json.rb +1 -0
- data/lib/format_parser.rb +2 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/jpeg_parser.rb +49 -31
- data/lib/parsers/mp3_parser.rb +4 -0
- data/lib/read_limiter.rb +2 -0
- data/lib/read_limits_config.rb +1 -1
- data/spec/attributes_json_spec.rb +15 -0
- data/spec/esoteric_formats_spec.rb +7 -0
- data/spec/format_parser_spec.rb +3 -2
- data/spec/parsers/jpeg_parser_spec.rb +38 -0
- data/spec/parsers/mp3_parser_spec.rb +7 -0
- data/spec/read_limiter_spec.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd4a3b56391cebff09efc933b5fd48188c67f913adeec46c77a832ff067fd870
|
4
|
+
data.tar.gz: 159c14df0b5740f627a99915f05750bf753c89620017733eab067c6e865e972d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 559b0f5709bd8fc23cb20468eed5c3840a4087a444a581591b530490278532dff8700a54652e7e9cc8ab82b89ac2db910bf8e79620c7de5cf5a172cd7285ade3
|
7
|
+
data.tar.gz: d5baa5e7b5aea3cce2acf2513509b4dac8ad40dceb29a69875a10e4322dd136214ae650e82254416d1d39907a33cb18ed0d40634af0fa974a68ce2b0dc99b7dd
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
## 0.9.3
|
2
|
+
* Fix a JPEG parsing regression introduced in 0.9.1
|
3
|
+
|
4
|
+
## 0.9.2 (yanked)
|
5
|
+
* Make sure MP3 parser returns `nil` when encountering infinite duration
|
6
|
+
* Do not read JPEG APP1 markers that contain no EXIF data
|
7
|
+
* Explicitly replace `Float::INFINITY` values in `AttributesJSON` with `nil` as per JSON convention
|
8
|
+
* Make sure the cached pages in `Care` are explicitly deleted after each `parse` call (should help GC)
|
9
|
+
* Raise the pagefaults restriction to 16 to cope with "too many useless markers in JPEGs" scenario once more
|
10
|
+
|
11
|
+
## 0.9.1 (yanked)
|
12
|
+
* Perf: Make JPEG parser bail out earlier if no marker is found while scanning through 1024 bytes of data
|
13
|
+
|
1
14
|
## 0.9.0
|
2
15
|
* Add a parser for the BMP image file format
|
3
16
|
|
data/CONTRIBUTING.md
CHANGED
@@ -87,6 +87,82 @@ or no result as soon as possible (once you know the file is not fit for your spe
|
|
87
87
|
Bear in mind that we enforce read budgets per-parser, so you will not be allowed to perform
|
88
88
|
too many reads, or perform reads which are too large.
|
89
89
|
|
90
|
+
In order to create new parsers, it is recommended to make a well-named class with an instance method `call`.
|
91
|
+
|
92
|
+
`call` accepts the IO-ish object as an argument, parses data that it reads from it,
|
93
|
+
and then returns the metadata for the file (if it could recover any) or `nil` if it couldn't. All files pass
|
94
|
+
through all parsers by default, so if you are dealing with a file that is not "your" format - return `nil` from
|
95
|
+
your method or `break` your Proc as early as possible. A blank `return` works fine too.
|
96
|
+
|
97
|
+
The IO will at the minimum support the subset of the IO API defined in `IOConstraint`
|
98
|
+
|
99
|
+
Your parser has to be registered using `FormatParser.register_parser` with the information on the formats
|
100
|
+
and file natures it provides.
|
101
|
+
|
102
|
+
Down below you can find the most basic parser implementation:
|
103
|
+
|
104
|
+
```ruby
|
105
|
+
MyParser = ->(io) {
|
106
|
+
# ... do some parsing with `io`
|
107
|
+
magic_bytes = io.read(4)
|
108
|
+
# breaking the block returns `nil` to the caller signaling "no match"
|
109
|
+
break if magic_bytes != 'IMGA'
|
110
|
+
|
111
|
+
parsed_width, parsed_height = io.read(8).unpack('VV')
|
112
|
+
# ...and return the FileInformation::Image object with the metadata.
|
113
|
+
FormatParser::Image.new(
|
114
|
+
format: :imga,
|
115
|
+
width_px: parsed_width,
|
116
|
+
height_px: parsed_height,
|
117
|
+
)
|
118
|
+
}
|
119
|
+
|
120
|
+
# Register the parser with the module, so that it will be applied to any
|
121
|
+
# document given to `FormatParser.parse()`. The supported natures are currently
|
122
|
+
# - :audio
|
123
|
+
# - :document
|
124
|
+
# - :image
|
125
|
+
# - :video
|
126
|
+
# - :archive
|
127
|
+
FormatParser.register_parser MyParser, natures: :image, formats: :imga
|
128
|
+
```
|
129
|
+
|
130
|
+
If you are using a class, this is the skeleton to use:
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
class MyParser
|
134
|
+
def call(io)
|
135
|
+
# ... do some parsing with `io`
|
136
|
+
# The instance will be discarded after parsing, so using instance variables
|
137
|
+
# is permitted - they are not shared between calls to `call`
|
138
|
+
@magic_bytes = io.read(4)
|
139
|
+
return if @magic_bytes != 'IMGA'
|
140
|
+
parsed_width, parsed_height = io.read(8).unpack('VV')
|
141
|
+
FormatParser::Image.new(
|
142
|
+
format: :imga,
|
143
|
+
width_px: parsed_width,
|
144
|
+
height_px: parsed_height,
|
145
|
+
)
|
146
|
+
end
|
147
|
+
|
148
|
+
FormatParser.register_parser self, natures: :image, formats: :imga
|
149
|
+
end
|
150
|
+
```
|
151
|
+
|
152
|
+
### Calling convention for preparing parsers
|
153
|
+
|
154
|
+
A parser that gets registered using `register_parser` must be either:
|
155
|
+
|
156
|
+
1) An object that can be `call()`-ed itself, with an argument that conforms to `IOConstraint`
|
157
|
+
2) An object that responds to `new` and returns something that can be `call()`-ed with an argument that conforms to `IOConstraint`.
|
158
|
+
|
159
|
+
The second option is recommended for most cases.
|
160
|
+
|
161
|
+
FormatParser is made to be used in threaded environments, and if you use instance variables
|
162
|
+
you need your parser to be isolated from its siblings in other threads - therefore you can pass
|
163
|
+
a Class on registration to have your parser instantiated for each `call()`, anew.
|
164
|
+
|
165
|
+
|
90
166
|
## Pull requests
|
91
167
|
|
92
168
|
Good pull requests-patches, improvements, new features-are a fantastic
|
data/README.md
CHANGED
@@ -12,7 +12,24 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
|
|
12
12
|
|
13
13
|
## Currently supported filetypes:
|
14
14
|
|
15
|
-
|
15
|
+
* TIFF
|
16
|
+
* CR2
|
17
|
+
* PSD
|
18
|
+
* PNG
|
19
|
+
* MP3
|
20
|
+
* JPEG
|
21
|
+
* GIF
|
22
|
+
* PDF
|
23
|
+
* DPX
|
24
|
+
* AIFF
|
25
|
+
* WAV
|
26
|
+
* FLAC
|
27
|
+
* FDX
|
28
|
+
* MOV
|
29
|
+
* MP4
|
30
|
+
* M4A
|
31
|
+
* ZIP
|
32
|
+
* DOCX, PPTX, XLSX
|
16
33
|
|
17
34
|
...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
|
18
35
|
|
@@ -32,7 +49,7 @@ match.orientation #=> :top_left
|
|
32
49
|
If you would rather receive all potential results from the gem, call the gem as follows:
|
33
50
|
|
34
51
|
```ruby
|
35
|
-
FormatParser.parse(File.open("myimage.jpg", "rb"), results: :all)
|
52
|
+
array_of_results = FormatParser.parse(File.open("myimage.jpg", "rb"), results: :all)
|
36
53
|
```
|
37
54
|
|
38
55
|
You can also optimize the metadata extraction by providing hints to the gem:
|
@@ -50,69 +67,7 @@ JSON.pretty_generate(img_info) #=> ...
|
|
50
67
|
|
51
68
|
## Creating your own parsers
|
52
69
|
|
53
|
-
|
54
|
-
parsing, and then returns the metadata for the file (if it could recover any) or `nil` if it couldn't. All files pass
|
55
|
-
through all parsers by default, so if you are dealing with a file that is not "your" format - return `nil` from
|
56
|
-
your method or `break` your Proc as early as possible. A blank `return` works fine too.
|
57
|
-
|
58
|
-
The IO will at the minimum support the subset of the IO API defined in `IOConstraint`
|
59
|
-
|
60
|
-
Strictly, a parser should be one of the two things:
|
61
|
-
|
62
|
-
1) An object that can be `call()`-ed itself, with an argument that conforms to `IOConstraint`
|
63
|
-
2) An object that responds to `new` and returns something that can be `call()`-ed with the same convention.
|
64
|
-
|
65
|
-
The second opton is useful for parsers that are stateful and non-reentrant. FormatParser is made to be used in
|
66
|
-
threaded environments, and if you use instance variables you need your parser to be isolated from it's siblings in
|
67
|
-
other threads - therefore you can pass a Class on registration to have your parser instantiated for each `call()`,
|
68
|
-
anew.
|
69
|
-
|
70
|
-
Your parser has to be registered using `FormatParser.register_parser` with the information on the formats
|
71
|
-
and file natures it provides.
|
72
|
-
|
73
|
-
Down below you can find a basic parser implementation:
|
74
|
-
|
75
|
-
```ruby
|
76
|
-
MyParser = ->(io) {
|
77
|
-
# ... do some parsing with `io`
|
78
|
-
magic_bytes = io.read(4)
|
79
|
-
break if magic_bytes != 'XBMP'
|
80
|
-
# ... more parsing code
|
81
|
-
# ...and return the FileInformation::Image object with the metadata.
|
82
|
-
FormatParser::Image.new(
|
83
|
-
width_px: parsed_width,
|
84
|
-
height_px: parsed_height,
|
85
|
-
)
|
86
|
-
}
|
87
|
-
|
88
|
-
# Register the parser with the module, so that it will be applied to any
|
89
|
-
# document given to `FormatParser.parse()`. The supported natures are currently
|
90
|
-
# - :audio
|
91
|
-
# - :document
|
92
|
-
# - :image
|
93
|
-
# - :video
|
94
|
-
FormatParser.register_parser MyParser, natures: :image, formats: :bmp
|
95
|
-
```
|
96
|
-
|
97
|
-
If you are using a class, this is the skeleton to use:
|
98
|
-
|
99
|
-
```ruby
|
100
|
-
class MyParser
|
101
|
-
def call(io)
|
102
|
-
# ... do some parsing with `io`
|
103
|
-
magic_bytes = io.read(4)
|
104
|
-
return unless magic_bytes == 'XBMP'
|
105
|
-
# ... more parsing code
|
106
|
-
# ...and return the FileInformation::Image object with the metadata.
|
107
|
-
FormatParser::Image.new(
|
108
|
-
width_px: parsed_width,
|
109
|
-
height_px: parsed_height,
|
110
|
-
)
|
111
|
-
end
|
112
|
-
|
113
|
-
FormatParser.register_parser self, natures: :image, formats: :bmp
|
114
|
-
end
|
115
|
-
```
|
70
|
+
See the [section on writing parsers in CONTRIBUTING.md](CONTRIBUTING.md#so-you-want-to-contribute-a-new-parser)
|
116
71
|
|
117
72
|
## Design rationale
|
118
73
|
|
@@ -151,6 +106,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
151
106
|
### JPEG
|
152
107
|
- `divergent_pixel_dimensions_exif.jpg` is used with permission from LiveKom GmbH
|
153
108
|
- `extended_reads.jpg` has kindly been made available by Raphaelle Pellerin for use exclusively with format_parser
|
109
|
+
- `too_many_APP1_markers_surrogate.jpg` was created by the project maintainers
|
154
110
|
|
155
111
|
### AIFF
|
156
112
|
- fixture.aiff was created by one of the project maintainers and is MIT licensed
|
data/lib/attributes_json.rb
CHANGED
@@ -20,6 +20,7 @@ module FormatParser::AttributesJSON
|
|
20
20
|
methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
|
21
21
|
reader_method_name = attr_writer_method_name.to_s.gsub(/\=$/, '')
|
22
22
|
value = public_send(reader_method_name)
|
23
|
+
value = nil if value == Float::INFINITY
|
23
24
|
# When calling as_json on our members there is no need to pass the root: option given to us
|
24
25
|
# by the caller
|
25
26
|
h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
|
data/lib/format_parser.rb
CHANGED
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -10,6 +10,7 @@ class FormatParser::JPEGParser
|
|
10
10
|
SOS_MARKER = 0xDA # start of stream
|
11
11
|
APP1_MARKER = 0xE1 # maybe EXIF
|
12
12
|
EXIF_MAGIC_STRING = "Exif\0\0".b
|
13
|
+
MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
|
13
14
|
|
14
15
|
def call(io)
|
15
16
|
@buf = FormatParser::IOConstraint.new(io)
|
@@ -61,6 +62,8 @@ class FormatParser::JPEGParser
|
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
65
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_read_until_capture', @buf.pos)
|
66
|
+
|
64
67
|
# Return at the earliest possible opportunity
|
65
68
|
if @width && @height
|
66
69
|
return FormatParser::Image.new(
|
@@ -78,11 +81,18 @@ class FormatParser::JPEGParser
|
|
78
81
|
end
|
79
82
|
|
80
83
|
# Read a byte, if it is 0xFF then skip bytes as long as they are also 0xFF (byte stuffing)
|
81
|
-
# and return the first byte scanned that is not 0xFF
|
84
|
+
# and return the first byte scanned that is not 0xFF. Also applies limits so that we do not
|
85
|
+
# read for an inordinate amount of time should we encounter a file where we _do_ have an SOI
|
86
|
+
# marker at the start and then no markers for a _very_ long time (happened with some PSDs)
|
82
87
|
def read_next_marker
|
83
|
-
|
84
|
-
|
85
|
-
|
88
|
+
# We need to find a sequence of two bytes - the first one is 0xFF, the other is anything but 0xFF
|
89
|
+
a = read_char
|
90
|
+
(MUST_FIND_NEXT_MARKER_WITHIN_BYTES - 1).times do
|
91
|
+
b = read_char
|
92
|
+
return b if a == 0xFF && b != 0xFF # Caught the marker
|
93
|
+
a = b # Shift the tuple one byte forward
|
94
|
+
end
|
95
|
+
nil # Nothing found
|
86
96
|
end
|
87
97
|
|
88
98
|
def scan_start_of_frame
|
@@ -107,34 +117,42 @@ class FormatParser::JPEGParser
|
|
107
117
|
# pry it out of the APP1 frame and parse it as the TIFF segment - which is what EXIFR
|
108
118
|
# does under the hood.
|
109
119
|
app1_frame_content_length = read_short - 2
|
110
|
-
|
111
|
-
|
112
|
-
maybe_exif_magic_str =
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
@exif_output = scanner.exif_data
|
120
|
-
@orientation = scanner.orientation unless scanner.orientation.nil?
|
121
|
-
@intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
|
122
|
-
@intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
|
123
|
-
# Save these two for later, when we decide to provide display width /
|
124
|
-
# display height in addition to pixel buffer width / height. These two
|
125
|
-
# are _different concepts_. Imagine you have an image shot with a camera
|
126
|
-
# in portrait orientation, and the camera has an anamorphic lens. That
|
127
|
-
# anamorpohic lens is a smart lens, and therefore transmits pixel aspect
|
128
|
-
# ratio to the camera, and the camera encodes that aspect ratio into the
|
129
|
-
# image metadata. If we want to know what size our _pixel buffer_ will be,
|
130
|
-
# and how to _read_ the pixel data (stride/interleaving) - we need the
|
131
|
-
# pixel buffer dimensions. If we want to know what aspect and dimensions
|
132
|
-
# our file is going to have _once displayed_ and _once pixels have been
|
133
|
-
# brought to the right orientation_ we need to work with **display dimensions**
|
134
|
-
# which can be remarkably different from the pixel buffer dimensions.
|
135
|
-
@exif_width = scanner.width
|
136
|
-
@exif_height = scanner.height
|
120
|
+
|
121
|
+
# Peek whether the contents of the marker starts with Exif\0
|
122
|
+
maybe_exif_magic_str = safe_read(@buf, EXIF_MAGIC_STRING.bytesize)
|
123
|
+
|
124
|
+
# If we could not find the magic Exif\0 string at the start of the marker,
|
125
|
+
# seek to the start of the next marker and return
|
126
|
+
unless maybe_exif_magic_str == EXIF_MAGIC_STRING
|
127
|
+
safe_skip(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize)
|
128
|
+
return
|
137
129
|
end
|
130
|
+
|
131
|
+
# ...and only then read the marker contents and parse it as EXIF
|
132
|
+
exif_data = safe_read(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize)
|
133
|
+
|
134
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', exif_data.bytesize)
|
135
|
+
scanner = FormatParser::EXIFParser.new(StringIO.new(exif_data))
|
136
|
+
scanner.scan_image_tiff
|
137
|
+
|
138
|
+
@exif_output = scanner.exif_data
|
139
|
+
@orientation = scanner.orientation unless scanner.orientation.nil?
|
140
|
+
@intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
|
141
|
+
@intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
|
142
|
+
# Save these two for later, when we decide to provide display width /
|
143
|
+
# display height in addition to pixel buffer width / height. These two
|
144
|
+
# are _different concepts_. Imagine you have an image shot with a camera
|
145
|
+
# in portrait orientation, and the camera has an anamorphic lens. That
|
146
|
+
# anamorphic lens is a smart lens, and therefore transmits pixel aspect
|
147
|
+
# ratio to the camera, and the camera encodes that aspect ratio into the
|
148
|
+
# image metadata. If we want to know what size our _pixel buffer_ will be,
|
149
|
+
# and how to _read_ the pixel data (stride/interleaving) - we need the
|
150
|
+
# pixel buffer dimensions. If we want to know what aspect and dimensions
|
151
|
+
# our file is going to have _once displayed_ and _once pixels have been
|
152
|
+
# brought to the right orientation_ we need to work with **display dimensions**
|
153
|
+
# which can be remarkably different from the pixel buffer dimensions.
|
154
|
+
@exif_width = scanner.width
|
155
|
+
@exif_height = scanner.height
|
138
156
|
rescue EXIFR::MalformedTIFF
|
139
157
|
# Not a JPEG or the Exif headers contain invalid data, or
|
140
158
|
# an APP1 marker was detected in a file that is not a JPEG
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -83,6 +83,10 @@ class FormatParser::MP3Parser
|
|
83
83
|
est_samples = est_frame_count * SAMPLES_PER_FRAME
|
84
84
|
est_duration_seconds = est_samples / avg_sample_rate
|
85
85
|
|
86
|
+
# Safeguard for e.g. some JPEGs being recognized as MP3
|
87
|
+
# to prevent ambiguous recognition
|
88
|
+
return if est_duration_seconds == Float::INFINITY
|
89
|
+
|
86
90
|
file_info.media_duration_seconds = est_duration_seconds
|
87
91
|
file_info
|
88
92
|
end
|
data/lib/read_limiter.rb
CHANGED
data/lib/read_limits_config.rb
CHANGED
@@ -27,6 +27,21 @@ describe FormatParser::AttributesJSON do
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
+
it 'converts Float::INFINITY to nil' do
|
31
|
+
anon_class = Class.new do
|
32
|
+
include FormatParser::AttributesJSON
|
33
|
+
attr_accessor :some_infinity
|
34
|
+
def some_infinity
|
35
|
+
Float::INFINITY
|
36
|
+
end
|
37
|
+
end
|
38
|
+
instance = anon_class.new
|
39
|
+
output = JSON.dump(instance)
|
40
|
+
readback = JSON.parse(output, symbolize_names: true)
|
41
|
+
expect(readback).to have_key(:some_infinity)
|
42
|
+
expect(readback[:some_infinity]).to be_nil
|
43
|
+
end
|
44
|
+
|
30
45
|
it 'provides a default implementation of to_json as well' do
|
31
46
|
anon_class = Class.new do
|
32
47
|
include FormatParser::AttributesJSON
|
@@ -13,4 +13,11 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
|
|
13
13
|
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
14
14
|
expect(result.nature).to eq(:archive)
|
15
15
|
end
|
16
|
+
|
17
|
+
it 'returns a result for JPEG file that causes many reads due to too many APP1 markers' do
|
18
|
+
jpeg_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
19
|
+
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
20
|
+
expect(result).not_to be_nil
|
21
|
+
expect(result.nature).to eq(:image)
|
22
|
+
end
|
16
23
|
end
|
data/spec/format_parser_spec.rb
CHANGED
@@ -32,10 +32,11 @@ describe FormatParser do
|
|
32
32
|
FormatParser.register_parser exploit, natures: :document, formats: :exploit
|
33
33
|
|
34
34
|
sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 8))
|
35
|
-
|
36
|
-
expect(sample_io).to receive(:read).at_most(8).times.and_call_original
|
35
|
+
allow(sample_io).to receive(:read).and_call_original
|
37
36
|
|
38
37
|
result = FormatParser.parse(sample_io, formats: [:exploit])
|
38
|
+
|
39
|
+
expect(sample_io).to have_received(:read).at_most(16).times
|
39
40
|
expect(result).to be_nil
|
40
41
|
|
41
42
|
FormatParser.deregister_parser(exploit)
|
@@ -49,9 +49,47 @@ describe FormatParser::JPEGParser do
|
|
49
49
|
expect(result.intrinsics).to eq(exif_pixel_x_dimension: 8214, exif_pixel_y_dimension: 5476)
|
50
50
|
end
|
51
51
|
|
52
|
+
it 'reads an example with many APP1 markers at the beginning of which none are EXIF' do
|
53
|
+
fixture_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
54
|
+
io = FormatParser::ReadLimiter.new(File.open(fixture_path, 'rb'))
|
55
|
+
|
56
|
+
result = subject.call(io)
|
57
|
+
|
58
|
+
expect(result).not_to be_nil
|
59
|
+
expect(result.width_px).to eq(1920)
|
60
|
+
expect(result.height_px).to eq(1200)
|
61
|
+
|
62
|
+
expect(io.bytes).to be < (128 * 1024)
|
63
|
+
expect(io.reads).to be < (1024 * 4)
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'does not continue parsing for inordinate amount of time if the file contains no 0xFF bytes' do
|
67
|
+
# Create a large fuzzed input that consists of any bytes except 0xFF,
|
68
|
+
# so that the marker detector has nothing to latch on to
|
69
|
+
bytes_except_byte_255 = 0x0..0xFE
|
70
|
+
|
71
|
+
# Start the blob with the usual SOI marker - 0xFF 0xD8, so that the parser does not
|
72
|
+
# bail out too early and actually "bites" into the blob
|
73
|
+
no_markers = ([0xFF, 0xD8] + (16 * 1024).times.map { rand(bytes_except_byte_255) }).pack('C*')
|
74
|
+
|
75
|
+
# Yes, assertions on a private method - but we want to ensure we do not read more
|
76
|
+
# single bytes than the restriction stipulates we may. At the same time we check that
|
77
|
+
# the method does indeed, get triggered
|
78
|
+
allow(subject).to receive(:read_char).and_call_original
|
79
|
+
result = subject.call(StringIO.new(no_markers))
|
80
|
+
expect(result).to be_nil
|
81
|
+
expect(subject).to have_received(:read_char).at_most(1026).times
|
82
|
+
end
|
83
|
+
|
52
84
|
it 'does not return a result for a Keynote document' do
|
53
85
|
key_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
|
54
86
|
result = subject.call(File.open(key_path, 'rb'))
|
55
87
|
expect(result).to be_nil
|
56
88
|
end
|
89
|
+
|
90
|
+
it 'parses the the marker structure correctly when marker bytes cannot be read in groups of 2' do
|
91
|
+
kitten_path = fixtures_dir + '/JPEG/off-cadence-markers.jpg'
|
92
|
+
result = subject.call(File.open(kitten_path, 'rb'))
|
93
|
+
expect(result).not_to be_nil
|
94
|
+
end
|
57
95
|
end
|
@@ -44,4 +44,11 @@ describe FormatParser::MP3Parser do
|
|
44
44
|
|
45
45
|
expect(parsed.intrinsics).not_to be_nil
|
46
46
|
end
|
47
|
+
|
48
|
+
it 'avoids returning a result when the parsed duration is infinite' do
|
49
|
+
fpath = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
50
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
51
|
+
|
52
|
+
expect(parsed).to be_nil
|
53
|
+
end
|
47
54
|
end
|
data/spec/read_limiter_spec.rb
CHANGED
@@ -18,6 +18,18 @@ describe FormatParser::ReadLimiter do
|
|
18
18
|
expect(reader.pos).to eq(2)
|
19
19
|
end
|
20
20
|
|
21
|
+
it 'exposes #reads, #seeks, #bytes' do
|
22
|
+
reader = FormatParser::ReadLimiter.new(io)
|
23
|
+
expect(reader.pos).to eq(0)
|
24
|
+
reader.read(2)
|
25
|
+
reader.seek(3)
|
26
|
+
reader.seek(4)
|
27
|
+
|
28
|
+
expect(reader.reads).to eq(1)
|
29
|
+
expect(reader.bytes).to eq(2)
|
30
|
+
expect(reader.seeks).to eq(2)
|
31
|
+
end
|
32
|
+
|
21
33
|
it 'enforces the number of seeks' do
|
22
34
|
reader = FormatParser::ReadLimiter.new(io, max_seeks: 4)
|
23
35
|
4.times { reader.seek(1) }
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-04-
|
12
|
+
date: 2018-04-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|