format_parser 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/lib/attributes_json.rb +34 -5
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/jpeg_parser.rb +8 -4
- data/spec/attributes_json_spec.rb +43 -0
- data/spec/esoteric_formats_spec.rb +7 -0
- data/spec/parsers/zip_parser_spec.rb +0 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03325dd0dc412571fd66d0c94226b9b88d7ae88b9bb6a985fe7c29f226c87b64
|
4
|
+
data.tar.gz: cbb1e33a3bffa36e832a064d9f28c6f8ebb2985fee397560d3ce19d971c36b96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 95c9d34a4469670ee69b718dac997fc3d1cc471071979ceb9df0010bff191d8b4c39e02f34669a13d831e40d918ad79889a6ff3b3ff0272df40b6160ee63bf8c
|
7
|
+
data.tar.gz: 417dbdcd4ea63939e060df8a8dd6e4927b4d9ec2de01cfab7d7b9ecf9701f56c5a93494cae7fb30361178d2b05b840f6c55928fffed73742829374acc8e24aff
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
## 0.12.4
|
2
|
+
* Ensure JPEG recognition only runs when the JPEG SOI marker is detected **at the start** of the file. Previously
|
3
|
+
the JPEG parser would scan for the marker, sometimes finding it (appropriately) in places like... MP3 album
|
4
|
+
artwork inside ID3 tags. Or Keynote documents. Or whatnot - lots of things have JPEG thumbnails embedded.
|
5
|
+
|
6
|
+
## 0.12.3
|
7
|
+
* Make sure all strings going to the JSON representations of parse results are encoded as UTF-8 or escaped
|
8
|
+
|
1
9
|
## 0.12.2
|
2
10
|
* Make sure the `VERSION` constant is available in the loaded gem. Previously the constant would be made
|
3
11
|
available by Bundler when developing the library - since it loads the `.gemspec` which, in turn, requires the
|
data/lib/attributes_json.rb
CHANGED
@@ -11,6 +11,8 @@
|
|
11
11
|
# the_foo.number_of_bars = 42
|
12
12
|
# the_foo.as_json #=> {:number_of_bars => 42}
|
13
13
|
module FormatParser::AttributesJSON
|
14
|
+
UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
|
15
|
+
MAXIMUM_JSON_NESTING_WHEN_SANITIZING = 256
|
14
16
|
|
15
17
|
# Implements a sane default `as_json` for an object
|
16
18
|
# that has accessors defined
|
@@ -19,11 +21,12 @@ module FormatParser::AttributesJSON
|
|
19
21
|
h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
|
20
22
|
methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
|
21
23
|
reader_method_name = attr_writer_method_name.to_s.gsub(/\=$/, '')
|
22
|
-
|
23
|
-
|
24
|
-
#
|
25
|
-
|
26
|
-
|
24
|
+
attribute_value = public_send(reader_method_name)
|
25
|
+
# When calling as_json on our members there is no need to pass
|
26
|
+
# the root: option given to us by the caller
|
27
|
+
unwrapped_attribute_value = attribute_value.respond_to?(:as_json) ? attribute_value.as_json : attribute_value
|
28
|
+
sanitized_value = _sanitize_json_value(unwrapped_attribute_value)
|
29
|
+
h[reader_method_name] = sanitized_value
|
27
30
|
end
|
28
31
|
if root
|
29
32
|
{'format_parser_file_info' => h}
|
@@ -32,6 +35,32 @@ module FormatParser::AttributesJSON
|
|
32
35
|
end
|
33
36
|
end
|
34
37
|
|
38
|
+
# Used for sanitizing values that are sourced to `JSON::Generator::State#generate`
|
39
|
+
# The reason we need to do this is as follows: `JSON.generate / JSON.dump / JSON.pretty_generate`
|
40
|
+
# use a totally different code path than `"foo".to_json(generator_state)`. We cannot predict
|
41
|
+
# which one of these two ways our users will be using, and at the same time we need to prevent
|
42
|
+
# invalid Strings (ones which cannot be encoded into UTF-8) as well as Float::INFINITY values
|
43
|
+
# from being passed to the JSON encoder. Since we cannot override the JSON generator with
|
44
|
+
# these additions, instead we will deep-convert the entire object being output to make sure
|
45
|
+
# it is up to snuff.
|
46
|
+
def _sanitize_json_value(value, nesting = 0)
|
47
|
+
raise ArgumentError, 'Nested JSON-ish structure too deep' if nesting > MAXIMUM_JSON_NESTING_WHEN_SANITIZING
|
48
|
+
case value
|
49
|
+
when Float::INFINITY
|
50
|
+
nil
|
51
|
+
when String
|
52
|
+
value.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
|
53
|
+
when Hash
|
54
|
+
Hash[value.map { |k, v| [_sanitize_json_value(k, nesting + 1), _sanitize_json_value(v, nesting + 1)] }]
|
55
|
+
when Array
|
56
|
+
value.map { |v| _sanitize_json_value(v, nesting + 1) }
|
57
|
+
when Struct
|
58
|
+
_sanitize_json_value(value.to_h, nesting + 1)
|
59
|
+
else
|
60
|
+
value
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
35
64
|
# Implements to_json with sane defaults, with or without arguments
|
36
65
|
def to_json(*maybe_generator_state)
|
37
66
|
as_json(root: false).to_json(*maybe_generator_state)
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -5,7 +5,7 @@ class FormatParser::JPEGParser
|
|
5
5
|
class InvalidStructure < StandardError
|
6
6
|
end
|
7
7
|
|
8
|
-
|
8
|
+
JPEG_SOI_MARKER_HEAD = [0xFF, 0xD8].pack('C2')
|
9
9
|
SOF_MARKERS = [0xC0..0xC3, 0xC5..0xC7, 0xC9..0xCB, 0xCD..0xCF]
|
10
10
|
EOI_MARKER = 0xD9 # end of image
|
11
11
|
SOS_MARKER = 0xDA # start of stream
|
@@ -32,9 +32,13 @@ class FormatParser::JPEGParser
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def scan
|
35
|
-
#
|
36
|
-
|
37
|
-
|
35
|
+
# Most JPEG images start with the 0xFF0xD8 SOI marker.
|
36
|
+
# We _can_ search for that marker, but we will then
|
37
|
+
# ambiguously capture things like JPEGs embedded in ID3
|
38
|
+
# tags of MP3s - these _are_ JPEGs but we care much
|
39
|
+
# more about the top-level "wrapper" file, not about
|
40
|
+
# its bits and bobs
|
41
|
+
return unless safe_read(@buf, 2) == JPEG_SOI_MARKER_HEAD
|
38
42
|
|
39
43
|
markers_start_at = @buf.pos
|
40
44
|
|
@@ -97,4 +97,47 @@ describe FormatParser::AttributesJSON do
|
|
97
97
|
|
98
98
|
expect(readback).to have_key(:nature)
|
99
99
|
end
|
100
|
+
|
101
|
+
it 'converts purely-binary String objects deeply nested in the struct to escapes and question marks' do
|
102
|
+
nasty_hash = {
|
103
|
+
id: 'TIT2',
|
104
|
+
size: 37,
|
105
|
+
flags: "\x00\x00",
|
106
|
+
struct: Struct.new(:key).new('Value'),
|
107
|
+
content: "\x01\xFF\xFEb\x00i\x00r\x00d\x00s\x00 \x005\x00 \x00m\x00o\x00r\x00e\x00 \x00c\x00o\x00m\x00p\x00".b
|
108
|
+
}
|
109
|
+
expect {
|
110
|
+
JSON.pretty_generate(nasty_hash) # Raises: the unsanitized hash holds a binary String that cannot be converted to UTF-8
|
111
|
+
}.to raise_error(Encoding::UndefinedConversionError)
|
112
|
+
|
113
|
+
anon_class = Struct.new(:evil)
|
114
|
+
anon_class.include FormatParser::AttributesJSON
|
115
|
+
|
116
|
+
object_with_attributes_module = anon_class.new(nasty_hash)
|
117
|
+
output = JSON.pretty_generate(object_with_attributes_module)
|
118
|
+
|
119
|
+
parsed_output = JSON.parse(output, symbolize_names: true)
|
120
|
+
|
121
|
+
expect(parsed_output[:evil][:struct]).to eq(key: 'Value')
|
122
|
+
expect(parsed_output[:evil][:id]).to eq('TIT2')
|
123
|
+
expect(parsed_output[:evil][:flags]).to be_kind_of(String)
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'prevents traversals of data structures which are too deep with an exception' do
|
127
|
+
fractal_hash = {}
|
128
|
+
current = fractal_hash
|
129
|
+
1024.times do
|
130
|
+
current[:leaf] = {}
|
131
|
+
current = current[:leaf]
|
132
|
+
end
|
133
|
+
|
134
|
+
anon_class = Struct.new(:evil)
|
135
|
+
anon_class.include FormatParser::AttributesJSON
|
136
|
+
|
137
|
+
object_with_attributes_module = anon_class.new(fractal_hash)
|
138
|
+
|
139
|
+
expect {
|
140
|
+
JSON.pretty_generate(object_with_attributes_module)
|
141
|
+
}.to raise_error(/structure too deep/)
|
142
|
+
end
|
100
143
|
end
|
@@ -14,6 +14,13 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
|
|
14
14
|
expect(result.nature).to eq(:archive)
|
15
15
|
end
|
16
16
|
|
17
|
+
it 'does not pick up JPG album art within an MP3 as a JPEG file' do
|
18
|
+
jpeg_path = fixtures_dir + '/MP3/ATC Fixture With Album Art.mp3'
|
19
|
+
results = FormatParser.parse(File.open(jpeg_path, 'rb'), results: :all)
|
20
|
+
expect(results).to be_one
|
21
|
+
expect(results.first.nature).to eq(:audio)
|
22
|
+
end
|
23
|
+
|
17
24
|
it 'returns a result for JPEG file that causes many reads due to too many APP1 markers' do
|
18
25
|
jpeg_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
|
19
26
|
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
@@ -82,7 +82,6 @@ describe FormatParser::ZIPParser do
|
|
82
82
|
expect(json_parsed_repr[:format]).to eq('zip')
|
83
83
|
expect(json_parsed_repr[:entries]).to be_kind_of(Array)
|
84
84
|
expect(json_parsed_repr[:entries].length).to eq(3)
|
85
|
-
|
86
85
|
json_parsed_repr[:entries].each do |e|
|
87
86
|
expect(e[:filename]).to be_kind_of(String)
|
88
87
|
expect(e[:size]).to be_kind_of(Integer)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.2
|
4
|
+
version: 0.12.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-05-
|
12
|
+
date: 2018-05-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|