format_parser 0.12.2 → 0.12.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7dce2d0bca62e45c867866920dabc1b302f3ec3823e9a2602b1898699213694f
4
- data.tar.gz: 8e4e2826467614c21d687648123cae1326976bcc4f6f344a40d35a3bac16f988
3
+ metadata.gz: 03325dd0dc412571fd66d0c94226b9b88d7ae88b9bb6a985fe7c29f226c87b64
4
+ data.tar.gz: cbb1e33a3bffa36e832a064d9f28c6f8ebb2985fee397560d3ce19d971c36b96
5
5
  SHA512:
6
- metadata.gz: dbe1cf517017ee9d0ec8ada4cde56f0a61dac1ae638d1e0059f5f717b0230b9fb6eb521a89e37fddfc3f441b2b457d534939acf0fa2d1dd0491d16a30a4975ce
7
- data.tar.gz: f4c53aa7f9b8608c7878607081768342880254963de4ae5b0eeebe2d0dd2901a2d8acd6f260a3743eca0cd3fdabb825cba94884856a1824453e35c88e3f1f733
6
+ metadata.gz: 95c9d34a4469670ee69b718dac997fc3d1cc471071979ceb9df0010bff191d8b4c39e02f34669a13d831e40d918ad79889a6ff3b3ff0272df40b6160ee63bf8c
7
+ data.tar.gz: 417dbdcd4ea63939e060df8a8dd6e4927b4d9ec2de01cfab7d7b9ecf9701f56c5a93494cae7fb30361178d2b05b840f6c55928fffed73742829374acc8e24aff
@@ -1,3 +1,11 @@
1
+ ## 0.12.4
2
+ * Ensure JPEG recognition only runs when the JPEG SOI marker is detected **at the start** of the file. Previously
3
+ the JPEG parser would scan for the marker, sometimes finding it (appropriately) in places like... MP3 album
4
+ artwork inside ID3 tags. Or Keynote documents. Or whatnot - lots of things have JPEG thumbnails embedded.
5
+
6
+ ## 0.12.3
7
+ * Make sure all strings going to the JSON representations of parse results are encoded as UTF-8 or escaped
8
+
1
9
  ## 0.12.2
2
10
  * Make sure the `VERSION` constant is available in the loaded gem. Previously the constant would be made
3
11
  available by Bundler when developing the library - since it loads the `.gemspec` which, in turn, requires the
@@ -11,6 +11,8 @@
11
11
  # the_foo.number_of_bars = 42
12
12
  # the_foo.as_json #=> {:number_of_bars => 42}
13
13
  module FormatParser::AttributesJSON
14
+ UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
15
+ MAXIMUM_JSON_NESTING_WHEN_SANITIZING = 256
14
16
 
15
17
  # Implements a sane default `as_json` for an object
16
18
  # that has accessors defined
@@ -19,11 +21,12 @@ module FormatParser::AttributesJSON
19
21
  h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
20
22
  methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
21
23
  reader_method_name = attr_writer_method_name.to_s.gsub(/\=$/, '')
22
- value = public_send(reader_method_name)
23
- value = nil if value == Float::INFINITY
24
- # When calling as_json on our members there is no need to pass the root: option given to us
25
- # by the caller
26
- h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
24
+ attribute_value = public_send(reader_method_name)
25
+ # When calling as_json on our members there is no need to pass
26
+ # the root: option given to us by the caller
27
+ unwrapped_attribute_value = attribute_value.respond_to?(:as_json) ? attribute_value.as_json : attribute_value
28
+ sanitized_value = _sanitize_json_value(unwrapped_attribute_value)
29
+ h[reader_method_name] = sanitized_value
27
30
  end
28
31
  if root
29
32
  {'format_parser_file_info' => h}
@@ -32,6 +35,32 @@ module FormatParser::AttributesJSON
32
35
  end
33
36
  end
34
37
 
38
+ # Used for sanitizing values that are passed to `JSON::Generator::State#generate`
39
+ # The reason we need to do this is as follows: `JSON.generate / JSON.dump / JSON.pretty_generate`
40
+ # use a totally different code path than `"foo".to_json(generator_state)`. We cannot predict
41
+ # which one of these two ways our users will be using, and at the same time we need to prevent
42
+ # invalid Strings (ones which cannot be encoded into UTF-8) as well as Float::INFINITY values
43
+ # from being passed to the JSON encoder. Since we cannot override the JSON generator with
44
+ # these additions, instead we will deep-convert the entire object being output to make sure
45
+ # it is up to snuff.
46
+ def _sanitize_json_value(value, nesting = 0)
47
+ raise ArgumentError, 'Nested JSON-ish structure too deep' if nesting > MAXIMUM_JSON_NESTING_WHEN_SANITIZING
48
+ case value
49
+ when Float::INFINITY
50
+ nil
51
+ when String
52
+ value.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
53
+ when Hash
54
+ Hash[value.map { |k, v| [_sanitize_json_value(k, nesting + 1), _sanitize_json_value(v, nesting + 1)] }]
55
+ when Array
56
+ value.map { |v| _sanitize_json_value(v, nesting + 1) }
57
+ when Struct
58
+ _sanitize_json_value(value.to_h, nesting + 1)
59
+ else
60
+ value
61
+ end
62
+ end
63
+
35
64
  # Implements to_json with sane defaults, with or without arguments
36
65
  def to_json(*maybe_generator_state)
37
66
  as_json(root: false).to_json(*maybe_generator_state)
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.12.2'
2
+ VERSION = '0.12.4'
3
3
  end
@@ -5,7 +5,7 @@ class FormatParser::JPEGParser
5
5
  class InvalidStructure < StandardError
6
6
  end
7
7
 
8
- SOI_MARKER = 0xD8 # start of image
8
+ JPEG_SOI_MARKER_HEAD = [0xFF, 0xD8].pack('C2')
9
9
  SOF_MARKERS = [0xC0..0xC3, 0xC5..0xC7, 0xC9..0xCB, 0xCD..0xCF]
10
10
  EOI_MARKER = 0xD9 # end of image
11
11
  SOS_MARKER = 0xDA # start of stream
@@ -32,9 +32,13 @@ class FormatParser::JPEGParser
32
32
  end
33
33
 
34
34
  def scan
35
- # Return early if it is not a JPEG at all
36
- signature = read_next_marker
37
- return unless signature == SOI_MARKER
35
+ # Most JPEG images start with the 0xFF 0xD8 SOI marker.
36
+ # We _can_ search for that marker, but we will then
37
+ # ambiguously capture things like JPEGs embedded in ID3
38
+ # tags of MP3s - these _are_ JPEGs but we care much
39
+ # more about the top-level "wrapper" file, not about
40
+ # its bits and bobs
41
+ return unless safe_read(@buf, 2) == JPEG_SOI_MARKER_HEAD
38
42
 
39
43
  markers_start_at = @buf.pos
40
44
 
@@ -97,4 +97,47 @@ describe FormatParser::AttributesJSON do
97
97
 
98
98
  expect(readback).to have_key(:nature)
99
99
  end
100
+
101
+ it 'converts purely-binary String objects deeply nested in the struct to escapes and question marks' do
102
+ nasty_hash = {
103
+ id: 'TIT2',
104
+ size: 37,
105
+ flags: "\x00\x00",
106
+ struct: Struct.new(:key).new('Value'),
107
+ content: "\x01\xFF\xFEb\x00i\x00r\x00d\x00s\x00 \x005\x00 \x00m\x00o\x00r\x00e\x00 \x00c\x00o\x00m\x00p\x00".b
108
+ }
109
+ expect {
110
+ JSON.pretty_generate(nasty_hash) # Should not raise an error
111
+ }.to raise_error(Encoding::UndefinedConversionError)
112
+
113
+ anon_class = Struct.new(:evil)
114
+ anon_class.include FormatParser::AttributesJSON
115
+
116
+ object_with_attributes_module = anon_class.new(nasty_hash)
117
+ output = JSON.pretty_generate(object_with_attributes_module)
118
+
119
+ parsed_output = JSON.parse(output, symbolize_names: true)
120
+
121
+ expect(parsed_output[:evil][:struct]).to eq(key: 'Value')
122
+ expect(parsed_output[:evil][:id]).to eq('TIT2')
123
+ expect(parsed_output[:evil][:flags]).to be_kind_of(String)
124
+ end
125
+
126
+ it 'prevents traversals of data structures which are too deep with an exception' do
127
+ fractal_hash = {}
128
+ current = fractal_hash
129
+ 1024.times do
130
+ current[:leaf] = {}
131
+ current = current[:leaf]
132
+ end
133
+
134
+ anon_class = Struct.new(:evil)
135
+ anon_class.include FormatParser::AttributesJSON
136
+
137
+ object_with_attributes_module = anon_class.new(fractal_hash)
138
+
139
+ expect {
140
+ JSON.pretty_generate(object_with_attributes_module)
141
+ }.to raise_error(/structure too deep/)
142
+ end
100
143
  end
@@ -14,6 +14,13 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
14
14
  expect(result.nature).to eq(:archive)
15
15
  end
16
16
 
17
+ it 'does not pick up JPG album art within an MP3 as a JPEG file' do
18
+ jpeg_path = fixtures_dir + '/MP3/ATC Fixture With Album Art.mp3'
19
+ results = FormatParser.parse(File.open(jpeg_path, 'rb'), results: :all)
20
+ expect(results).to be_one
21
+ expect(results.first.nature).to eq(:audio)
22
+ end
23
+
17
24
  it 'returns a result for JPEG file that causes many reads due to too many APP1 markers' do
18
25
  jpeg_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
19
26
  result = FormatParser.parse(File.open(jpeg_path, 'rb'))
@@ -82,7 +82,6 @@ describe FormatParser::ZIPParser do
82
82
  expect(json_parsed_repr[:format]).to eq('zip')
83
83
  expect(json_parsed_repr[:entries]).to be_kind_of(Array)
84
84
  expect(json_parsed_repr[:entries].length).to eq(3)
85
-
86
85
  json_parsed_repr[:entries].each do |e|
87
86
  expect(e[:filename]).to be_kind_of(String)
88
87
  expect(e[:size]).to be_kind_of(Integer)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.2
4
+ version: 0.12.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-05-06 00:00:00.000000000 Z
12
+ date: 2018-05-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks