format_parser 1.3.0 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d90a6eee951feb5017bdedc0fb6dd4be49fc0a7f218972c0fb423b1985bf9a97
4
- data.tar.gz: ee6caab359b0e01450897d00abd1f190b131e40d842d487d1f29107ac3b374f0
3
+ metadata.gz: 5a9a94233949cc72d18b433cf1ddcba0e479e8b93aa1ff2e48bda6f6d86f667b
4
+ data.tar.gz: 4e357bc46207e95cad52d21b2aaa1781e9c231bab02e235be29663db9722f5d9
5
5
  SHA512:
6
- metadata.gz: 72aca621c20dfb24443e32c52a3d27f64a4887c1254f6d2295f77b39ce57e1a5ef0aa52f365630badcb2e3aa90d32c3546c8fcea3b83e025e2c4fc3606dd2dd7
7
- data.tar.gz: 90da48352579c4044035732fd1f837d86db1a2d3c8f325b60ee688630f6cb4ea8efdb2ffd7887ac4e49cf9f9df2a22fe2189187c32d606b8f027a0a3d6a3ec5f
6
+ metadata.gz: dcf8c8aeefc6166f3645dae461aadbcc2b36e96cb7a75162586fc009d562f6f978767ff877b27d0c192b5ca3107011a1bfdda842e730e486ced02e4191b53f59
7
+ data.tar.gz: fbc2caafb269f5e9c249e6ffe62ea8141477589256fab1bea5d058877d571725b1e619619ef9fcd33005d42759697ae7c7575d7a79217bdedc7e96ab02ce3c1b
data/.gitignore CHANGED
@@ -61,3 +61,6 @@ Gemfile.lock
61
61
 
62
62
  # rspec examples
63
63
  spec/examples.txt
64
+
65
+ # IntelliJ config:
66
+ /.idea/
data/CHANGELOG.md CHANGED
@@ -1,6 +1,15 @@
1
+ ## 1.4.2
2
+ * Fix `MP3Parser` taking precedence when parsing `WEBP` files.
3
+
4
+ ## 1.4.1
5
+ * Skip Exif chunks that are malformed during `WEBP` parsing.
6
+
7
+ ## 1.4.0
8
+ * Add support for `WEBP` lossy, lossless and extended file formats.
9
+
1
10
  ## 1.3.0
2
11
  * Add `heif_parser` and support for `HEIF` and `HEIC` formats. Exif parsing is still missing.
3
- *
12
+
4
13
  ## 1.2.1
5
14
  * Resolve bug when `stts` atom is `nil`
6
15
 
data/README.md CHANGED
@@ -33,6 +33,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
33
33
  * OGG
34
34
  * MPEG, MPG
35
35
  * M3U
36
+ * WEBP
36
37
 
37
38
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
38
39
 
@@ -198,6 +199,10 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
198
199
  ### M3U
199
200
  - The M3U fixture files were created by one of the project maintainers
200
201
 
202
+ ### WEBP
203
+ - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
204
+ licensed, all of the WebP fixture files have been created by one of the project maintainers.
205
+
201
206
  ### .key
202
207
  - The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
203
208
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '1.3.0'
2
+ VERSION = '1.4.2'
3
3
  end
data/lib/io_utils.rb CHANGED
@@ -42,6 +42,14 @@ module FormatParser::IOUtils
42
42
  safe_read(@buf, 4).unpack('N').first
43
43
  end
44
44
 
45
# Reads the next 2 bytes from @buf and decodes them as an unsigned 16-bit little-endian integer.
def read_little_endian_int_16
  value, = safe_read(@buf, 2).unpack('v')
  value
end
48
+
49
# Reads the next 4 bytes from @buf and decodes them as an unsigned 32-bit little-endian integer.
def read_little_endian_int_32
  value, = safe_read(@buf, 4).unpack('V')
  value
end
52
+
45
53
  # 'n' is the number of bytes to read
46
54
  def read_string(n)
47
55
  safe_read(@buf, n)
@@ -28,6 +28,7 @@ class FormatParser::MP3Parser
28
28
  # For some edge cases
29
29
  ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
30
30
  PNG_HEADER_BYTES = [137, 80, 78, 71, 13, 10, 26, 10].pack('C*')
31
+ WEBP_HEADER_REGEX = /RIFF.{4}WEBP/i
31
32
 
32
33
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
33
34
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
@@ -68,9 +69,10 @@ class FormatParser::MP3Parser
68
69
  # will terminate here. Same with PNGs. In the future
69
70
  # we should implement "confidence" for MP3 as of all our formats
70
71
  # it is by far the most lax.
71
- header = safe_read(io, 8)
72
+ header = safe_read(io, 12)
72
73
  return if header.start_with?(ZIP_LOCAL_ENTRY_SIGNATURE)
73
74
  return if header.start_with?(PNG_HEADER_BYTES)
75
+ return if header.start_with?(WEBP_HEADER_REGEX)
74
76
 
75
77
  io.seek(0)
76
78
  return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
@@ -0,0 +1,169 @@
1
+ # WebP is an image format that provides superior lossless and lossy compression for images on the web, with support for
2
+ # transparency. It uses predictive coding to encode an image, predicting the values in a block of pixels based on the
3
+ # values of neighbouring blocks. A WebP file consists of VP8 or VP8L data, and a container based on RIFF. There is also
4
+ # an extended file format, VP8X, that optionally encodes various information such as the color profile, animation
5
+ # control data, transparency, and EXIF and/or XMP metadata.
6
+ #
7
+ # For more information, visit https://developers.google.com/speed/webp.
8
+ #
9
+ # TODO: Decide how to determine color mode (depends on variant, transformations, flags, etc.; maybe not worth it).
10
+
11
# Parser for WebP images: a RIFF container holding a lossy (VP8), lossless (VP8L) or extended (VP8X)
# payload. Produces a FormatParser::Image with the pixel dimensions, the transparency/animation flags and,
# for the extended format, any Exif/XMP metadata and the number of animation frames found.
class FormatParser::WebpParser
  include FormatParser::EXIFParser
  include FormatParser::IOUtils

  WEBP_MIME_TYPE = 'image/webp'

  # Returns a truthy value when the filename ends in ".webp" (case-insensitive).
  def likely_match?(filename)
    filename =~ /\.webp$/i
  end

  # Parses the given IO. Returns a FormatParser::Image, or nil when the RIFF/WEBP header does not match
  # or the variant is not recognised.
  def call(io)
    @buf = FormatParser::IOConstraint.new(io)

    # All WebP files start with the following 20 bytes:
    #
    # Offset  | Description
    # -------------------------------------------------------------------------------------
    # 0...3   | "RIFF" (Since WebP is based on the RIFF file container format).
    # 4...7   | The size of the file in bytes - 8 bytes.
    # 8...11  | "WEBP" (To signify that this is a WebP file).
    # 12...15 | The VP8 variant in use ("VP8 ", "VP8L" or "VP8X")
    # 16...19 | The length of the VP8 data in bytes (i.e. The size of the file - 20 bytes).
    #
    # The 'A4' directive strips trailing spaces, so the lossy FourCC "VP8 " arrives here as "VP8".
    riff, webp, variant = safe_read(@buf, 20).unpack('A4x4A4A4')
    return unless riff == 'RIFF' && webp == 'WEBP'
    read_data(variant)
  end

  private

  # Dispatches to the reader for the given variant FourCC; returns nil for unknown variants.
  def read_data(variant)
    case variant
    when 'VP8' # Lossy
      read_lossy_data
    when 'VP8L' # Lossless
      read_lossless_data
    when 'VP8X' # Extended
      read_extended_data
    else
      nil
    end
  end

  # Reads the dimensions of a lossy (VP8) image.
  def read_lossy_data
    # Encoded as a single VP8 key frame - a 10-byte uncompressed chunk followed by 2+ partitions of compressed data.
    # The first 6 bytes of this chunk (frame tag and start code) are mostly relevant when using VP8 as a video
    # compression format, and can be ignored.
    safe_skip(@buf, 6)

    # The subsequent 4 bytes contain two 16-bit little endian fields. In each field only the lower 14 bits hold the
    # dimension (width, then height); the upper 2 bits are a scaling hint and must be masked off.
    width, height = safe_read(@buf, 4).unpack('vv').map { |field| field & 0x3fff }
    create_image(width, height)
  end

  # Reads the dimensions and alpha flag of a lossless (VP8L) image.
  def read_lossless_data
    # There is a single byte signature, 0x2F, that we can disregard.
    safe_skip(@buf, 1)

    # The subsequent 4 bytes contain the image width and height, respectively, as 14-bit unsigned little endian
    # integers (minus one). The 4 remaining bits consist of a 1-bit flag indicating whether alpha is used, and a 3-bit
    # version that is always zero.
    dimensions = read_little_endian_int_32
    width = (dimensions & 0x3fff) + 1
    height = ((dimensions >> 14) & 0x3fff) + 1
    has_transparency = ((dimensions >> 28) & 0x1) == 1

    create_image(width, height, has_transparency: has_transparency)
  end

  # Reads the flags and canvas size of an extended (VP8X) image, then scans the remaining chunks for metadata
  # and animation frames when the flags announce them.
  def read_extended_data
    # After the common RIFF header bytes, the extended file format has a series of 1-bit flags to signify the presence
    # of optional information. These flags are as follows:
    #
    # |0|1|2|3|4|5|6|7|
    # +-+-+-+-+-+-+-+-+
    # |Rsv|I|L|E|X|A|R|
    #
    # Where:
    #   - Rsv & R = Reserved - Should be 0.
    #   - I = Set if file contains an ICC profile.
    #   - L = Set if file contains transparency information.
    #   - E = Set if file contains Exif metadata.
    #   - X = Set if file contains XMP metadata.
    #   - A = Set if file is an animated image.
    flags = read_int_8
    has_transparency = (flags & 0x10) != 0
    has_exif_metadata = (flags & 0x08) != 0
    has_xmp_metadata = (flags & 0x04) != 0
    has_multiple_frames = (flags & 0x02) != 0

    # The flags are followed by three reserved bytes of zeros, and then by the width and height, respectively - each
    # occupying three bytes and each one less than the actual canvas measurements. The six bytes are unpacked as a
    # little endian 32-bit value plus a little endian 16-bit value ('v' rather than the native-endian 'S', so the
    # result does not depend on the host byte order).
    safe_skip(@buf, 3)
    dimensions = safe_read(@buf, 6).unpack('Vv')
    width = (dimensions[0] & 0xffffff) + 1
    height = (((dimensions[0] >> 24) | (dimensions[1] << 8)) & 0xffffff) + 1

    image = create_image(width, height, has_multiple_frames: has_multiple_frames, has_transparency: has_transparency)
    augment_image(image) if has_exif_metadata || has_xmp_metadata || has_multiple_frames
    image
  end

  # Builds the FormatParser::Image result shared by all three variants.
  def create_image(width, height, has_multiple_frames: false, has_transparency: false)
    FormatParser::Image.new(
      content_type: WEBP_MIME_TYPE,
      format: :webp,
      has_multiple_frames: has_multiple_frames,
      has_transparency: has_transparency,
      height_px: height,
      width_px: width
    )
  end

  # Scans the chunks that follow the VP8X header, looking for EXIF, XMP and/or ANMF chunks, and stores what it
  # finds on the image (intrinsics, orientation, swapped dimensions for rotated images, frame count).
  def augment_image(image)
    intrinsics = {}
    num_frames = 0
    loop do
      # Try to read the next chunk header, and break the loop if we've reached EOF.
      begin
        fourcc, chunk_size = safe_read(@buf, 8).unpack('A4V')
      rescue FormatParser::IOUtils::InvalidRead
        break
      end

      case fourcc
      when 'EXIF'
        chunk_pos = @buf.pos
        begin
          exif = exif_from_tiff_io(StringIO.new(safe_read(@buf, chunk_size)))
          # One Exif chunk at most should be present, even though it is possible for there to be more. Only the
          # first one is honoured, so the dimension swap is applied once and the orientation always matches
          # intrinsics[:exif].
          unless intrinsics[:exif]
            intrinsics[:exif] = exif
            image.height_px, image.width_px = image.width_px, image.height_px if exif&.rotated?
            image.orientation = exif&.orientation_sym
          end
        rescue EXIFR::MalformedTIFF
          # Exif data was malformed and could not be parsed. Need to ensure that buffer is pointing at the end of
          # this chunk's payload before moving on.
          @buf.seek(chunk_pos + chunk_size)
        end
      when 'XMP'
        # Always consume the payload, otherwise a second XMP chunk would leave the buffer mid-chunk. One XMP chunk
        # at most should be present, so only the first one is kept.
        xmp = safe_read(@buf, chunk_size)
        intrinsics[:xmp] ||= xmp
      when 'ANMF'
        num_frames += 1 if image.has_multiple_frames
        safe_skip(@buf, chunk_size)
      else
        safe_skip(@buf, chunk_size)
      end

      # A padding byte of 0 follows the payload if the chunk size is odd. It is not counted in chunk_size.
      safe_skip(@buf, 1) if chunk_size.odd?
    end

    image.intrinsics = intrinsics unless intrinsics.empty?
    image.num_animation_or_video_frames = num_frames if num_frames > 0
  end

  # Register an instance of this parser for the :image nature and the :webp format.
  FormatParser.register_parser new, natures: [:image], formats: [:webp]
end
@@ -0,0 +1,136 @@
1
+ require 'spec_helper'
2
+
3
describe FormatParser::WebpParser do
  # Runs the parser over a fixture from the WEBP fixtures directory. The block form of File.open closes the
  # file handle as soon as parsing finishes instead of leaking one handle per example.
  def parse_fixture(name)
    File.open(fixtures_dir + "WEBP/#{name}", 'rb') { |file| subject.call(file) }
  end

  it 'does not parse files with an invalid RIFF header' do
    result = parse_fixture('invalid-header.webp')
    expect(result).to be_nil
  end

  it 'does not parse files with an unrecognised variant' do
    result = parse_fixture('unrecognised-variant.webp')
    expect(result).to be_nil
  end

  it 'successfully parses lossy (VP8) WebP files' do
    result = parse_fixture('lossy.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(false)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(65)
  end

  it 'successfully parses lossless WebP files' do
    result = parse_fixture('lossless.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(false)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(65)
  end

  it 'successfully parses lossless WebP files with an alpha channel' do
    result = parse_fixture('lossless-alpha.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(true)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(65)
  end

  it 'successfully parses extended WebP files' do
    result = parse_fixture('extended.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(false)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(65)
  end

  it 'successfully parses extended WebP files with an alpha channel' do
    result = parse_fixture('extended-alpha.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(true)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(65)
  end

  it 'successfully parses extended WebP files with Exif metadata' do
    result = parse_fixture('extended-exif.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(false)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).not_to be_nil
    expect(result.intrinsics[:exif]).not_to be_nil
    expect(result.intrinsics[:exif].image_length).to eq(result.height_px)
    expect(result.intrinsics[:exif].image_width).to eq(result.width_px)
    expect(result.orientation).to eq(:top_left)
    expect(result.width_px).to eq(65)
  end

  it 'successfully parses extended WebP files with XMP metadata' do
    result = parse_fixture('extended-xmp.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(false)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).not_to be_nil
    expect(result.intrinsics[:xmp]).not_to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(65)
  end

  it 'successfully parses extended WebP files with animation' do
    result = parse_fixture('extended-animation.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(true)
    expect(result.has_transparency).to eq(true)
    expect(result.height_px).to eq(211)
    expect(result.intrinsics).to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(211)
  end

  it 'successfully skips malformed Exif chunks' do
    result = parse_fixture('extended-malformed-exif.webp')
    expect(result).not_to be_nil
    expect(result.content_type).to eq('image/webp')
    expect(result.format).to eq(:webp)
    expect(result.has_multiple_frames).to eq(false)
    expect(result.has_transparency).to eq(false)
    expect(result.height_px).to eq(181)
    expect(result.intrinsics).not_to be_nil
    expect(result.intrinsics[:exif]).to be_nil
    expect(result.intrinsics[:xmp]).not_to be_nil
    expect(result.orientation).to be_nil
    expect(result.width_px).to eq(65)
  end
end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
8
8
  - Julik Tarkhanov
9
- autorequire:
9
+ autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2022-05-31 00:00:00.000000000 Z
12
+ date: 2022-07-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -252,6 +252,7 @@ files:
252
252
  - lib/parsers/psd_parser.rb
253
253
  - lib/parsers/tiff_parser.rb
254
254
  - lib/parsers/wav_parser.rb
255
+ - lib/parsers/webp_parser.rb
255
256
  - lib/parsers/zip_parser.rb
256
257
  - lib/parsers/zip_parser/file_reader.rb
257
258
  - lib/parsers/zip_parser/office_formats.rb
@@ -291,6 +292,7 @@ files:
291
292
  - spec/parsers/psd_parser_spec.rb
292
293
  - spec/parsers/tiff_parser_spec.rb
293
294
  - spec/parsers/wav_parser_spec.rb
295
+ - spec/parsers/webp_parser_spec.rb
294
296
  - spec/parsers/zip_parser_spec.rb
295
297
  - spec/read_limiter_spec.rb
296
298
  - spec/read_limits_config_spec.rb
@@ -302,7 +304,7 @@ licenses:
302
304
  - MIT (Hippocratic)
303
305
  metadata:
304
306
  allowed_push_host: https://rubygems.org
305
- post_install_message:
307
+ post_install_message:
306
308
  rdoc_options: []
307
309
  require_paths:
308
310
  - lib
@@ -317,8 +319,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
317
319
  - !ruby/object:Gem::Version
318
320
  version: '0'
319
321
  requirements: []
320
- rubygems_version: 3.1.6
321
- signing_key:
322
+ rubygems_version: 3.3.7
323
+ signing_key:
322
324
  specification_version: 4
323
325
  summary: A library for efficient parsing of file metadata
324
326
  test_files: []