format_parser 0.12.2 → 0.12.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/lib/attributes_json.rb +34 -5
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/jpeg_parser.rb +8 -4
- data/spec/attributes_json_spec.rb +43 -0
- data/spec/esoteric_formats_spec.rb +7 -0
- data/spec/parsers/zip_parser_spec.rb +0 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03325dd0dc412571fd66d0c94226b9b88d7ae88b9bb6a985fe7c29f226c87b64
|
4
|
+
data.tar.gz: cbb1e33a3bffa36e832a064d9f28c6f8ebb2985fee397560d3ce19d971c36b96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 95c9d34a4469670ee69b718dac997fc3d1cc471071979ceb9df0010bff191d8b4c39e02f34669a13d831e40d918ad79889a6ff3b3ff0272df40b6160ee63bf8c
|
7
|
+
data.tar.gz: 417dbdcd4ea63939e060df8a8dd6e4927b4d9ec2de01cfab7d7b9ecf9701f56c5a93494cae7fb30361178d2b05b840f6c55928fffed73742829374acc8e24aff
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
## 0.12.4
|
2
|
+
* Ensure JPEG recognition only runs when the JPEG SOI marker is detected **at the start** of the file. Previously
|
3
|
+
the JPEG parser would scan for the marker, sometimes finding it (appropriately) in places like... MP3 album
|
4
|
+
artwork inside ID3 tags. Or Keynote documents. Or whatnot - lots of things have JPEG thumbnails embedded.
|
5
|
+
|
6
|
+
## 0.12.3
|
7
|
+
* Make sure all strings going to the JSON representations of parse results are encoded as UTF-8 or escaped
|
8
|
+
|
1
9
|
## 0.12.2
|
2
10
|
* Make sure the `VERSION` constant is available in the loaded gem. Previously the constant would be made
|
3
11
|
available by Bundler when developing the library - since it loads the `.gemspec` which, in turn, requires the
|
data/lib/attributes_json.rb
CHANGED
@@ -11,6 +11,8 @@
|
|
11
11
|
# the_foo.number_of_bars = 42
|
12
12
|
# the_foo.as_json #=> {:number_of_bars => 42}
|
13
13
|
module FormatParser::AttributesJSON
|
14
|
+
UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
|
15
|
+
MAXIMUM_JSON_NESTING_WHEN_SANITIZING = 256
|
14
16
|
|
15
17
|
# Implements a sane default `as_json` for an object
|
16
18
|
# that has accessors defined
|
@@ -19,11 +21,12 @@ module FormatParser::AttributesJSON
|
|
19
21
|
h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
|
20
22
|
methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
|
21
23
|
reader_method_name = attr_writer_method_name.to_s.gsub(/\=$/, '')
|
22
|
-
|
23
|
-
|
24
|
-
#
|
25
|
-
|
26
|
-
|
24
|
+
attribute_value = public_send(reader_method_name)
|
25
|
+
# When calling as_json on our members there is no need to pass
|
26
|
+
# the root: option given to us by the caller
|
27
|
+
unwrapped_attribute_value = attribute_value.respond_to?(:as_json) ? attribute_value.as_json : attribute_value
|
28
|
+
sanitized_value = _sanitize_json_value(unwrapped_attribute_value)
|
29
|
+
h[reader_method_name] = sanitized_value
|
27
30
|
end
|
28
31
|
if root
|
29
32
|
{'format_parser_file_info' => h}
|
@@ -32,6 +35,32 @@ module FormatParser::AttributesJSON
|
|
32
35
|
end
|
33
36
|
end
|
34
37
|
|
38
|
+
# Used for sanitizing values that are sourced to `JSON::Generator::State#generate`
|
39
|
+
# The reason we need to do this is as follows: `JSON.generate / JSON.dump / JSON.pretty_generate`
|
40
|
+
# use a totally different code path than `"foo".to_json(generator_state)`. We cannot predict
|
41
|
+
# which one of these two ways our users will be using, and at the same time we need to prevent
|
42
|
+
# invalid Strings (ones which cannot be encoded into UTF-8) as well as Float::INFINITY values
|
43
|
+
# from being passed to the JSON encoder. Since we cannot override the JSON generator with
|
44
|
+
# these additions, instead we will deep-convert the entire object being output to make sure
|
45
|
+
# it is up to snuff.
|
46
|
+
def _sanitize_json_value(value, nesting = 0)
|
47
|
+
raise ArgumentError, 'Nested JSON-ish structure too deep' if nesting > MAXIMUM_JSON_NESTING_WHEN_SANITIZING
|
48
|
+
case value
|
49
|
+
when Float::INFINITY
|
50
|
+
nil
|
51
|
+
when String
|
52
|
+
value.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
|
53
|
+
when Hash
|
54
|
+
Hash[value.map { |k, v| [_sanitize_json_value(k, nesting + 1), _sanitize_json_value(v, nesting + 1)] }]
|
55
|
+
when Array
|
56
|
+
value.map { |v| _sanitize_json_value(v, nesting + 1) }
|
57
|
+
when Struct
|
58
|
+
_sanitize_json_value(value.to_h, nesting + 1)
|
59
|
+
else
|
60
|
+
value
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
35
64
|
# Implements to_json with sane defaults, with or without arguments
|
36
65
|
def to_json(*maybe_generator_state)
|
37
66
|
as_json(root: false).to_json(*maybe_generator_state)
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -5,7 +5,7 @@ class FormatParser::JPEGParser
|
|
5
5
|
class InvalidStructure < StandardError
|
6
6
|
end
|
7
7
|
|
8
|
-
|
8
|
+
JPEG_SOI_MARKER_HEAD = [0xFF, 0xD8].pack('C2')
|
9
9
|
SOF_MARKERS = [0xC0..0xC3, 0xC5..0xC7, 0xC9..0xCB, 0xCD..0xCF]
|
10
10
|
EOI_MARKER = 0xD9 # end of image
|
11
11
|
SOS_MARKER = 0xDA # start of stream
|
@@ -32,9 +32,13 @@ class FormatParser::JPEGParser
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def scan
|
35
|
-
#
|
36
|
-
|
37
|
-
|
35
|
+
# Most JPEG images start with the 0xFF 0xD8 SOI marker.
|
36
|
+
# We _can_ search for that marker, but we will then
|
37
|
+
# ambiguously capture things like JPEGs embedded in ID3
|
38
|
+
# tags of MP3s - these _are_ JPEGs but we care much
|
39
|
+
# more about the top-level "wrapper" file, not about
|
40
|
+
# its bits and bobs
|
41
|
+
return unless safe_read(@buf, 2) == JPEG_SOI_MARKER_HEAD
|
38
42
|
|
39
43
|
markers_start_at = @buf.pos
|
40
44
|
|
@@ -97,4 +97,47 @@ describe FormatParser::AttributesJSON do
|
|
97
97
|
|
98
98
|
expect(readback).to have_key(:nature)
|
99
99
|
end
|
100
|
+
|
101
|
+
it 'converts purely-binary String objects deeply nested in the struct to escapes and question marks' do
|
102
|
+
nasty_hash = {
|
103
|
+
id: 'TIT2',
|
104
|
+
size: 37,
|
105
|
+
flags: "\x00\x00",
|
106
|
+
struct: Struct.new(:key).new('Value'),
|
107
|
+
content: "\x01\xFF\xFEb\x00i\x00r\x00d\x00s\x00 \x005\x00 \x00m\x00o\x00r\x00e\x00 \x00c\x00o\x00m\x00p\x00".b
|
108
|
+
}
|
109
|
+
expect {
|
110
|
+
JSON.pretty_generate(nasty_hash) # Raises on the raw hash, since its binary strings were not sanitized
|
111
|
+
}.to raise_error(Encoding::UndefinedConversionError)
|
112
|
+
|
113
|
+
anon_class = Struct.new(:evil)
|
114
|
+
anon_class.include FormatParser::AttributesJSON
|
115
|
+
|
116
|
+
object_with_attributes_module = anon_class.new(nasty_hash)
|
117
|
+
output = JSON.pretty_generate(object_with_attributes_module)
|
118
|
+
|
119
|
+
parsed_output = JSON.parse(output, symbolize_names: true)
|
120
|
+
|
121
|
+
expect(parsed_output[:evil][:struct]).to eq(key: 'Value')
|
122
|
+
expect(parsed_output[:evil][:id]).to eq('TIT2')
|
123
|
+
expect(parsed_output[:evil][:flags]).to be_kind_of(String)
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'prevents traversals of data structures which are too deep with an exception' do
|
127
|
+
fractal_hash = {}
|
128
|
+
current = fractal_hash
|
129
|
+
1024.times do
|
130
|
+
current[:leaf] = {}
|
131
|
+
current = current[:leaf]
|
132
|
+
end
|
133
|
+
|
134
|
+
anon_class = Struct.new(:evil)
|
135
|
+
anon_class.include FormatParser::AttributesJSON
|
136
|
+
|
137
|
+
object_with_attributes_module = anon_class.new(fractal_hash)
|
138
|
+
|
139
|
+
expect {
|
140
|
+
JSON.pretty_generate(object_with_attributes_module)
|
141
|
+
}.to raise_error(/structure too deep/)
|
142
|
+
end
|
100
143
|
end
|
@@ -14,6 +14,13 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
|
|
14
14
|
expect(result.nature).to eq(:archive)
|
15
15
|
end
|
16
16
|
|
17
|
+
it 'does not pick up JPG album art within an MP3 as a JPEG file' do
|
18
|
+
jpeg_path = fixtures_dir + '/MP3/ATC Fixture With Album Art.mp3'
|
19
|
+
results = FormatParser.parse(File.open(jpeg_path, 'rb'), results: :all)
|
20
|
+
expect(results).to be_one
|
21
|
+
expect(results.first.nature).to eq(:audio)
|
22
|
+
end
|
23
|
+
|
17
24
|
it 'returns a result for JPEG file that causes many reads due to too many APP1 markers' do
|
18
25
|
jpeg_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
19
26
|
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
@@ -82,7 +82,6 @@ describe FormatParser::ZIPParser do
|
|
82
82
|
expect(json_parsed_repr[:format]).to eq('zip')
|
83
83
|
expect(json_parsed_repr[:entries]).to be_kind_of(Array)
|
84
84
|
expect(json_parsed_repr[:entries].length).to eq(3)
|
85
|
-
|
86
85
|
json_parsed_repr[:entries].each do |e|
|
87
86
|
expect(e[:filename]).to be_kind_of(String)
|
88
87
|
expect(e[:size]).to be_kind_of(Integer)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.2
|
4
|
+
version: 0.12.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-05-
|
12
|
+
date: 2018-05-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|