format_parser 0.12.2 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7dce2d0bca62e45c867866920dabc1b302f3ec3823e9a2602b1898699213694f
4
- data.tar.gz: 8e4e2826467614c21d687648123cae1326976bcc4f6f344a40d35a3bac16f988
3
+ metadata.gz: 03325dd0dc412571fd66d0c94226b9b88d7ae88b9bb6a985fe7c29f226c87b64
4
+ data.tar.gz: cbb1e33a3bffa36e832a064d9f28c6f8ebb2985fee397560d3ce19d971c36b96
5
5
  SHA512:
6
- metadata.gz: dbe1cf517017ee9d0ec8ada4cde56f0a61dac1ae638d1e0059f5f717b0230b9fb6eb521a89e37fddfc3f441b2b457d534939acf0fa2d1dd0491d16a30a4975ce
7
- data.tar.gz: f4c53aa7f9b8608c7878607081768342880254963de4ae5b0eeebe2d0dd2901a2d8acd6f260a3743eca0cd3fdabb825cba94884856a1824453e35c88e3f1f733
6
+ metadata.gz: 95c9d34a4469670ee69b718dac997fc3d1cc471071979ceb9df0010bff191d8b4c39e02f34669a13d831e40d918ad79889a6ff3b3ff0272df40b6160ee63bf8c
7
+ data.tar.gz: 417dbdcd4ea63939e060df8a8dd6e4927b4d9ec2de01cfab7d7b9ecf9701f56c5a93494cae7fb30361178d2b05b840f6c55928fffed73742829374acc8e24aff
@@ -1,3 +1,11 @@
1
+ ## 0.12.4
2
+ * Ensure JPEG recognition only runs when the JPEG SOI marker is detected **at the start** of the file. Previously
3
+ the JPEG parser would scan for the marker, sometimes finding it (appropriately) in places like... MP3 album
4
+ artwork inside ID3 tags. Or Keynote documents. Or whatnot - lots of things have JPEG thumbnails embedded.
5
+
6
+ ## 0.12.3
7
+ * Make sure all strings going to the JSON representations of parse results are encoded as UTF-8 or escaped
8
+
1
9
  ## 0.12.2
2
10
  * Make sure the `VERSION` constant is available in the loaded gem. Previously the constant would be made
3
11
  available by Bundler when developing the library - since it loads the `.gemspec` which, in turn, requires the
@@ -11,6 +11,8 @@
11
11
  # the_foo.number_of_bars = 42
12
12
  # the_foo.as_json #=> {:number_of_bars => 42}
13
13
  module FormatParser::AttributesJSON
14
+ UNICODE_REPLACEMENT_CHAR = [0xFFFD].pack('U')
15
+ MAXIMUM_JSON_NESTING_WHEN_SANITIZING = 256
14
16
 
15
17
  # Implements a sane default `as_json` for an object
16
18
  # that has accessors defined
@@ -19,11 +21,12 @@ module FormatParser::AttributesJSON
19
21
  h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
20
22
  methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
21
23
  reader_method_name = attr_writer_method_name.to_s.gsub(/\=$/, '')
22
- value = public_send(reader_method_name)
23
- value = nil if value == Float::INFINITY
24
- # When calling as_json on our members there is no need to pass the root: option given to us
25
- # by the caller
26
- h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
24
+ attribute_value = public_send(reader_method_name)
25
+ # When calling as_json on our members there is no need to pass
26
+ # the root: option given to us by the caller
27
+ unwrapped_attribute_value = attribute_value.respond_to?(:as_json) ? attribute_value.as_json : attribute_value
28
+ sanitized_value = _sanitize_json_value(unwrapped_attribute_value)
29
+ h[reader_method_name] = sanitized_value
27
30
  end
28
31
  if root
29
32
  {'format_parser_file_info' => h}
@@ -32,6 +35,32 @@ module FormatParser::AttributesJSON
32
35
  end
33
36
  end
34
37
 
38
+ # Used for sanitizing values that are passed to `JSON::Generator::State#generate`
39
+ # The reason we need to do this is as follows: `JSON.generate / JSON.dump / JSON.pretty_generate`
40
+ # use a totally different code path than `"foo".to_json(generator_state)`. We cannot predict
41
+ # which one of these two ways our users will be using, and at the same time we need to prevent
42
+ # invalid Strings (ones which cannot be encoded into UTF-8) as well as Float::INFINITY values
43
+ # from being passed to the JSON encoder. Since we cannot override the JSON generator with
44
+ # these additions, instead we will deep-convert the entire object being output to make sure
45
+ # it is up to snuff.
46
+ def _sanitize_json_value(value, nesting = 0)
47
+ raise ArgumentError, 'Nested JSON-ish structure too deep' if nesting > MAXIMUM_JSON_NESTING_WHEN_SANITIZING
48
+ case value
49
+ when Float::INFINITY
50
+ nil
51
+ when String
52
+ value.encode(Encoding::UTF_8, undef: :replace, replace: UNICODE_REPLACEMENT_CHAR)
53
+ when Hash
54
+ Hash[value.map { |k, v| [_sanitize_json_value(k, nesting + 1), _sanitize_json_value(v, nesting + 1)] }]
55
+ when Array
56
+ value.map { |v| _sanitize_json_value(v, nesting + 1) }
57
+ when Struct
58
+ _sanitize_json_value(value.to_h, nesting + 1)
59
+ else
60
+ value
61
+ end
62
+ end
63
+
35
64
  # Implements to_json with sane defaults, with or without arguments
36
65
  def to_json(*maybe_generator_state)
37
66
  as_json(root: false).to_json(*maybe_generator_state)
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.12.2'
2
+ VERSION = '0.12.4'
3
3
  end
@@ -5,7 +5,7 @@ class FormatParser::JPEGParser
5
5
  class InvalidStructure < StandardError
6
6
  end
7
7
 
8
- SOI_MARKER = 0xD8 # start of image
8
+ JPEG_SOI_MARKER_HEAD = [0xFF, 0xD8].pack('C2')
9
9
  SOF_MARKERS = [0xC0..0xC3, 0xC5..0xC7, 0xC9..0xCB, 0xCD..0xCF]
10
10
  EOI_MARKER = 0xD9 # end of image
11
11
  SOS_MARKER = 0xDA # start of stream
@@ -32,9 +32,13 @@ class FormatParser::JPEGParser
32
32
  end
33
33
 
34
34
  def scan
35
- # Return early if it is not a JPEG at all
36
- signature = read_next_marker
37
- return unless signature == SOI_MARKER
35
+ # Most JPEG images start with the 0xFF 0xD8 SOI marker.
36
+ # We _can_ search for that marker, but we will then
37
+ # ambiguously capture things like JPEGs embedded in ID3
38
+ # tags of MP3s - these _are_ JPEGs but we care much
39
+ # more about the top-level "wrapper" file, not about
40
+ # its bits and bobs
41
+ return unless safe_read(@buf, 2) == JPEG_SOI_MARKER_HEAD
38
42
 
39
43
  markers_start_at = @buf.pos
40
44
 
@@ -97,4 +97,47 @@ describe FormatParser::AttributesJSON do
97
97
 
98
98
  expect(readback).to have_key(:nature)
99
99
  end
100
+
101
+ it 'converts purely-binary String objects deeply nested in the struct to escapes and question marks' do
102
+ nasty_hash = {
103
+ id: 'TIT2',
104
+ size: 37,
105
+ flags: "\x00\x00",
106
+ struct: Struct.new(:key).new('Value'),
107
+ content: "\x01\xFF\xFEb\x00i\x00r\x00d\x00s\x00 \x005\x00 \x00m\x00o\x00r\x00e\x00 \x00c\x00o\x00m\x00p\x00".b
108
+ }
109
+ expect {
110
+ JSON.pretty_generate(nasty_hash) # Expected to raise here: the raw hash has not been sanitized
111
+ }.to raise_error(Encoding::UndefinedConversionError)
112
+
113
+ anon_class = Struct.new(:evil)
114
+ anon_class.include FormatParser::AttributesJSON
115
+
116
+ object_with_attributes_module = anon_class.new(nasty_hash)
117
+ output = JSON.pretty_generate(object_with_attributes_module)
118
+
119
+ parsed_output = JSON.parse(output, symbolize_names: true)
120
+
121
+ expect(parsed_output[:evil][:struct]).to eq(key: 'Value')
122
+ expect(parsed_output[:evil][:id]).to eq('TIT2')
123
+ expect(parsed_output[:evil][:flags]).to be_kind_of(String)
124
+ end
125
+
126
+ it 'prevents traversals of data structures which are too deep with an exception' do
127
+ fractal_hash = {}
128
+ current = fractal_hash
129
+ 1024.times do
130
+ current[:leaf] = {}
131
+ current = current[:leaf]
132
+ end
133
+
134
+ anon_class = Struct.new(:evil)
135
+ anon_class.include FormatParser::AttributesJSON
136
+
137
+ object_with_attributes_module = anon_class.new(fractal_hash)
138
+
139
+ expect {
140
+ JSON.pretty_generate(object_with_attributes_module)
141
+ }.to raise_error(/structure too deep/)
142
+ end
100
143
  end
@@ -14,6 +14,13 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
14
14
  expect(result.nature).to eq(:archive)
15
15
  end
16
16
 
17
+ it 'does not pick up JPG album art within an MP3 as a JPEG file' do
18
+ jpeg_path = fixtures_dir + '/MP3/ATC Fixture With Album Art.mp3'
19
+ results = FormatParser.parse(File.open(jpeg_path, 'rb'), results: :all)
20
+ expect(results).to be_one
21
+ expect(results.first.nature).to eq(:audio)
22
+ end
23
+
17
24
  it 'returns a result for JPEG file that causes many reads due to too many APP1 markers' do
18
25
  jpeg_path = fixtures_dir + '/JPEG/too_many_APP1_markers_surrogate.jpg'
19
26
  result = FormatParser.parse(File.open(jpeg_path, 'rb'))
@@ -82,7 +82,6 @@ describe FormatParser::ZIPParser do
82
82
  expect(json_parsed_repr[:format]).to eq('zip')
83
83
  expect(json_parsed_repr[:entries]).to be_kind_of(Array)
84
84
  expect(json_parsed_repr[:entries].length).to eq(3)
85
-
86
85
  json_parsed_repr[:entries].each do |e|
87
86
  expect(e[:filename]).to be_kind_of(String)
88
87
  expect(e[:size]).to be_kind_of(Integer)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.2
4
+ version: 0.12.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-05-06 00:00:00.000000000 Z
12
+ date: 2018-05-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks