format_parser 1.3.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d90a6eee951feb5017bdedc0fb6dd4be49fc0a7f218972c0fb423b1985bf9a97
4
- data.tar.gz: ee6caab359b0e01450897d00abd1f190b131e40d842d487d1f29107ac3b374f0
3
+ metadata.gz: 5a9a94233949cc72d18b433cf1ddcba0e479e8b93aa1ff2e48bda6f6d86f667b
4
+ data.tar.gz: 4e357bc46207e95cad52d21b2aaa1781e9c231bab02e235be29663db9722f5d9
5
5
  SHA512:
6
- metadata.gz: 72aca621c20dfb24443e32c52a3d27f64a4887c1254f6d2295f77b39ce57e1a5ef0aa52f365630badcb2e3aa90d32c3546c8fcea3b83e025e2c4fc3606dd2dd7
7
- data.tar.gz: 90da48352579c4044035732fd1f837d86db1a2d3c8f325b60ee688630f6cb4ea8efdb2ffd7887ac4e49cf9f9df2a22fe2189187c32d606b8f027a0a3d6a3ec5f
6
+ metadata.gz: dcf8c8aeefc6166f3645dae461aadbcc2b36e96cb7a75162586fc009d562f6f978767ff877b27d0c192b5ca3107011a1bfdda842e730e486ced02e4191b53f59
7
+ data.tar.gz: fbc2caafb269f5e9c249e6ffe62ea8141477589256fab1bea5d058877d571725b1e619619ef9fcd33005d42759697ae7c7575d7a79217bdedc7e96ab02ce3c1b
data/.gitignore CHANGED
@@ -61,3 +61,6 @@ Gemfile.lock
61
61
 
62
62
  # rspec examples
63
63
  spec/examples.txt
64
+
65
+ # IntelliJ config:
66
+ /.idea/
data/CHANGELOG.md CHANGED
@@ -1,6 +1,15 @@
1
+ ## 1.4.2
2
+ * Fix `MP3Parser` taking precedence when parsing `WEBP` files.
3
+
4
+ ## 1.4.1
5
+ * Skip Exif chunks that are malformed during `WEBP` parsing.
6
+
7
+ ## 1.4.0
8
+ * Add support for `WEBP` lossy, lossless and extended file formats.
9
+
1
10
  ## 1.3.0
2
11
  * Add `heif_parser` and support for `HEIF` and `HEIC` formats. Exif parsing is still missing.
3
- *
12
+
4
13
  ## 1.2.1
5
14
  * Resolve bug when `stts` atom is `nil`
6
15
 
data/README.md CHANGED
@@ -33,6 +33,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
33
33
  * OGG
34
34
  * MPEG, MPG
35
35
  * M3U
36
+ * WEBP
36
37
 
37
38
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
38
39
 
@@ -198,6 +199,10 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
198
199
  ### M3U
199
200
  - The M3U fixture files were created by one of the project maintainers
200
201
 
202
+ ### WEBP
203
+ - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
204
+ licensed, all of the WebP fixture files have been created by one of the project maintainers.
205
+
201
206
  ### .key
202
207
  - The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
203
208
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '1.3.0'
2
+ VERSION = '1.4.2'
3
3
  end
data/lib/io_utils.rb CHANGED
@@ -42,6 +42,14 @@ module FormatParser::IOUtils
42
42
  safe_read(@buf, 4).unpack('N').first
43
43
  end
44
44
 
45
+ def read_little_endian_int_16
46
+ safe_read(@buf, 2).unpack('v').first
47
+ end
48
+
49
+ def read_little_endian_int_32
50
+ safe_read(@buf, 4).unpack('V').first
51
+ end
52
+
45
53
  # 'n' is the number of bytes to read
46
54
  def read_string(n)
47
55
  safe_read(@buf, n)
@@ -28,6 +28,7 @@ class FormatParser::MP3Parser
28
28
  # For some edge cases
29
29
  ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
30
30
  PNG_HEADER_BYTES = [137, 80, 78, 71, 13, 10, 26, 10].pack('C*')
31
+ WEBP_HEADER_REGEX = /RIFF.{4}WEBP/im # /m lets "." match a 0x0A byte inside the 4-byte RIFF size field
31
32
 
32
33
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
33
34
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
@@ -68,9 +69,10 @@ class FormatParser::MP3Parser
68
69
  # will terminate here. Same with PNGs. In the future
69
70
  # we should implement "confidence" for MP3 as of all our formats
70
71
  # it is by far the most lax.
71
- header = safe_read(io, 8)
72
+ header = safe_read(io, 12)
72
73
  return if header.start_with?(ZIP_LOCAL_ENTRY_SIGNATURE)
73
74
  return if header.start_with?(PNG_HEADER_BYTES)
75
+ return if header.start_with?(WEBP_HEADER_REGEX)
74
76
 
75
77
  io.seek(0)
76
78
  return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
@@ -0,0 +1,169 @@
1
+ # WebP is an image format that provides superior lossless and lossy compression for images on the web, with support for
2
+ # transparency. It uses predictive coding to encode an image, predicting the values in a block of pixels based on the
3
+ # values of neighbouring blocks. A WebP file consists of VP8 or VP8L data, and a container based on RIFF. There is also
4
+ # an extended file format, VP8X, that optionally encodes various information such as the color profile, animation
5
+ # control data, transparency, and EXIF and/or XMP metadata.
6
+ #
7
+ # For more information, visit https://developers.google.com/speed/webp.
8
+ #
9
+ # TODO: Decide how to determine color mode (depends on variant, transformations, flags, etc.; maybe not worth it).
10
+
11
+ class FormatParser::WebpParser
12
+ include FormatParser::EXIFParser
13
+ include FormatParser::IOUtils
14
+
15
+ WEBP_MIME_TYPE = 'image/webp'
16
+
17
+ def likely_match?(filename)
18
+ filename =~ /\.webp$/i
19
+ end
20
+
21
+ def call(io)
22
+ @buf = FormatParser::IOConstraint.new(io)
23
+
24
+ # All WebP files start with the following 20 bytes:
25
+ #
26
+ # Offset | Description
27
+ # -------------------------------------------------------------------------------------
28
+ # 0...3 | "RIFF" (Since WebP is based on the RIFF file container format).
29
+ # 4...7 | The size of the file in bytes - 8 bytes.
30
+ # 8...11 | "WEBP" (To signify that this is a WebP file).
31
+ # 12...15 | The VP8 variant in use ("VP8 ", "VP8L" or "VP8X")
32
+ # 16...19 | The length of the VP8 data in bytes (i.e. The size of the file - 20 bytes).
33
+ riff, webp, variant = safe_read(@buf, 20).unpack('A4x4A4A4')
34
+ return unless riff == 'RIFF' && webp == 'WEBP'
35
+ read_data(variant)
36
+ end
37
+
38
+ private
39
+
40
+ def read_data(variant)
41
+ case variant
42
+ when 'VP8' # Lossy
43
+ read_lossy_data
44
+ when 'VP8L' # Lossless
45
+ read_lossless_data
46
+ when 'VP8X' # Extended
47
+ read_extended_data
48
+ else
49
+ nil
50
+ end
51
+ end
52
+
53
+ def read_lossy_data
54
+ # Encoded as a single VP8 key frame - a 10-byte uncompressed chunk followed by 2+ partitions of compressed data.
55
+ # The first 6 bytes of this chunk contains information that is mostly relevant when using VP8 as a video
56
+ # compression format, and can be ignored.
57
+ safe_skip(@buf, 6)
58
+
59
+ # The subsequent 4 bytes contain the image width and height, respectively, as 16-bit unsigned little endian
60
+ # integers.
61
+ width, height = safe_read(@buf, 4).unpack('S<S<')
62
+ create_image(width, height)
63
+ end
64
+
65
+ def read_lossless_data
66
+ # There is a single byte signature, 0x2F, that we can disregard.
67
+ safe_skip(@buf, 1)
68
+
69
+ # The subsequent 4 bytes contain the image width and height, respectively, as 14-bit unsigned little endian
70
+ # integers (minus one). The 4 remaining bits consist of a 1-bit flag indicating whether alpha is used, and a 3-bit
71
+ # version that is always zero.
72
+ dimensions = read_little_endian_int_32
73
+ width = (dimensions & 0x3fff) + 1
74
+ height = (dimensions >> 14 & 0x3fff) + 1
75
+ has_transparency = (dimensions >> 28 & 0x1) == 1
76
+
77
+ create_image(width, height, has_transparency: has_transparency)
78
+ end
79
+
80
+ def read_extended_data
81
+ # After the common RIFF header bytes, the extended file format has a series of 1-bit flags to signify the presence
82
+ # of optional information. These flags are as follows:
83
+ #
84
+ # |0|1|2|3|4|5|6|7|
85
+ # +-+-+-+-+-+-+-+-+
86
+ # |Rsv|I|L|E|X|A|R|
87
+ #
88
+ # Where:
89
+ # - Rsv & R = Reserved - Should be 0.
90
+ # - I = Set if file contains an ICC profile.
91
+ # - L = Set if file contains transparency information.
92
+ # - E = Set if file contains Exif metadata.
93
+ # - X = Set if file contains XMP metadata.
94
+ # - A = Set if file is an animated image.
95
+ flags = read_int_8
96
+ has_transparency = flags & 0x10 != 0
97
+ has_exif_metadata = flags & 0x08 != 0
98
+ has_xmp_metadata = flags & 0x04 != 0
99
+ has_multiple_frames = flags & 0x02 != 0
100
+
101
+ # The flags are followed by three reserved bytes of zeros, and then by the width and height, respectively - each
102
+ # occupying three bytes and each one less than the actual canvas measurements.
103
+ safe_skip(@buf, 3)
104
+ dimensions = safe_read(@buf, 6).unpack('VS')
105
+ width = (dimensions[0] & 0xffffff) + 1
106
+ height = (dimensions[0] >> 24 | dimensions[1] << 8 & 0xffffff) + 1
107
+
108
+ image = create_image(width, height, has_multiple_frames: has_multiple_frames, has_transparency: has_transparency)
109
+ augment_image(image) if has_exif_metadata || has_xmp_metadata || has_multiple_frames
110
+ image
111
+ end
112
+
113
+ def create_image(width, height, has_multiple_frames: false, has_transparency: false)
114
+ FormatParser::Image.new(
115
+ content_type: WEBP_MIME_TYPE,
116
+ format: :webp,
117
+ has_multiple_frames: has_multiple_frames,
118
+ has_transparency: has_transparency,
119
+ height_px: height,
120
+ width_px: width
121
+ )
122
+ end
123
+
124
+ def augment_image(image)
125
+ # We're going to scan the file looking for the EXIF, XMP and/or ANMF chunks.
126
+ intrinsics = {}
127
+ num_frames = 0
128
+ loop do
129
+ # Try to read the next chunk header, and break the loop if we've reached EOF.
130
+ begin
131
+ fourcc, chunk_size = safe_read(@buf, 8).unpack('A4V')
132
+ rescue FormatParser::IOUtils::InvalidRead
133
+ break
134
+ end
135
+
136
+ # Padding byte of 0 added if chunk size is odd.
137
+ safe_skip(@buf, 1) if chunk_size.odd?
138
+
139
+ case fourcc
140
+ when 'EXIF'
141
+ chunk_pos = @buf.pos
142
+ begin
143
+ exif = exif_from_tiff_io(StringIO.new(safe_read(@buf, chunk_size)))
144
+ # We use ||= here as one Exif chunk at most should be present, even though it is possible for there to be more.
145
+ intrinsics[:exif] ||= exif
146
+ image.height_px, image.width_px = image.width_px, image.height_px if exif&.rotated?
147
+ image.orientation = exif&.orientation_sym
148
+ rescue EXIFR::MalformedTIFF
149
+ # Exif data was malformed and could not be parsed. Need to ensure that buffer is pointing at the next chunk.
150
+ @buf.seek(chunk_pos + chunk_size)
151
+ next
152
+ end
153
+ when 'XMP'
154
+ # We use ||= here as one XMP chunk at most should be present, even though it is possible for there to be more.
155
+ intrinsics[:xmp] ||= safe_read(@buf, chunk_size)
156
+ when 'ANMF'
157
+ num_frames += 1 if image.has_multiple_frames
158
+ safe_skip(@buf, chunk_size)
159
+ else
160
+ safe_skip(@buf, chunk_size)
161
+ end
162
+ end
163
+
164
+ image.intrinsics = intrinsics unless intrinsics.empty?
165
+ image.num_animation_or_video_frames = num_frames if num_frames > 0
166
+ end
167
+
168
+ FormatParser.register_parser new, natures: [:image], formats: [:webp]
169
+ end
@@ -0,0 +1,136 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::WebpParser do
4
+ it 'does not parse files with an invalid RIFF header' do
5
+ result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-header.webp', 'rb'))
6
+ expect(result).to be_nil
7
+ end
8
+
9
+ it 'does not parse files with an unrecognised variant' do
10
+ result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
11
+ expect(result).to be_nil
12
+ end
13
+
14
+ it 'successfully parses lossy (VP8) WebP files' do
15
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossy.webp', 'rb'))
16
+ expect(result).not_to be_nil
17
+ expect(result.content_type).to eq('image/webp')
18
+ expect(result.format).to eq(:webp)
19
+ expect(result.has_multiple_frames).to eq(false)
20
+ expect(result.has_transparency).to eq(false)
21
+ expect(result.height_px).to eq(181)
22
+ expect(result.intrinsics).to be_nil
23
+ expect(result.orientation).to be_nil
24
+ expect(result.width_px).to eq(65)
25
+ end
26
+
27
+ it 'successfully parses lossless WebP files' do
28
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossless.webp', 'rb'))
29
+ expect(result).not_to be_nil
30
+ expect(result.content_type).to eq('image/webp')
31
+ expect(result.format).to eq(:webp)
32
+ expect(result.has_multiple_frames).to eq(false)
33
+ expect(result.has_transparency).to eq(false)
34
+ expect(result.height_px).to eq(181)
35
+ expect(result.intrinsics).to be_nil
36
+ expect(result.orientation).to be_nil
37
+ expect(result.width_px).to eq(65)
38
+ end
39
+
40
+ it 'successfully parses lossless WebP files with an alpha channel' do
41
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossless-alpha.webp', 'rb'))
42
+ expect(result).not_to be_nil
43
+ expect(result.content_type).to eq('image/webp')
44
+ expect(result.format).to eq(:webp)
45
+ expect(result.has_multiple_frames).to eq(false)
46
+ expect(result.has_transparency).to eq(true)
47
+ expect(result.height_px).to eq(181)
48
+ expect(result.intrinsics).to be_nil
49
+ expect(result.orientation).to be_nil
50
+ expect(result.width_px).to eq(65)
51
+ end
52
+
53
+ it 'successfully parses extended WebP files' do
54
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended.webp', 'rb'))
55
+ expect(result).not_to be_nil
56
+ expect(result.content_type).to eq('image/webp')
57
+ expect(result.format).to eq(:webp)
58
+ expect(result.has_multiple_frames).to eq(false)
59
+ expect(result.has_transparency).to eq(false)
60
+ expect(result.height_px).to eq(181)
61
+ expect(result.intrinsics).to be_nil
62
+ expect(result.orientation).to be_nil
63
+ expect(result.width_px).to eq(65)
64
+ end
65
+
66
+ it 'successfully parses extended WebP files with an alpha channel' do
67
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-alpha.webp', 'rb'))
68
+ expect(result).not_to be_nil
69
+ expect(result.content_type).to eq('image/webp')
70
+ expect(result.format).to eq(:webp)
71
+ expect(result.has_multiple_frames).to eq(false)
72
+ expect(result.has_transparency).to eq(true)
73
+ expect(result.height_px).to eq(181)
74
+ expect(result.intrinsics).to be_nil
75
+ expect(result.orientation).to be_nil
76
+ expect(result.width_px).to eq(65)
77
+ end
78
+
79
+ it 'successfully parses extended WebP files with Exif metadata' do
80
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-exif.webp', 'rb'))
81
+ expect(result).not_to be_nil
82
+ expect(result.content_type).to eq('image/webp')
83
+ expect(result.format).to eq(:webp)
84
+ expect(result.has_multiple_frames).to eq(false)
85
+ expect(result.has_transparency).to eq(false)
86
+ expect(result.height_px).to eq(181)
87
+ expect(result.intrinsics).not_to be_nil
88
+ expect(result.intrinsics[:exif]).not_to be_nil
89
+ expect(result.intrinsics[:exif].image_length).to eq(result.height_px)
90
+ expect(result.intrinsics[:exif].image_width).to eq(result.width_px)
91
+ expect(result.orientation).to eq(:top_left)
92
+ expect(result.width_px).to eq(65)
93
+ end
94
+
95
+ it 'successfully parses extended WebP files with XMP metadata' do
96
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-xmp.webp', 'rb'))
97
+ expect(result).not_to be_nil
98
+ expect(result.content_type).to eq('image/webp')
99
+ expect(result.format).to eq(:webp)
100
+ expect(result.has_multiple_frames).to eq(false)
101
+ expect(result.has_transparency).to eq(false)
102
+ expect(result.height_px).to eq(181)
103
+ expect(result.intrinsics).not_to be_nil
104
+ expect(result.intrinsics[:xmp]).not_to be_nil
105
+ expect(result.orientation).to be_nil
106
+ expect(result.width_px).to eq(65)
107
+ end
108
+
109
+ it 'successfully parses extended WebP files with animation' do
110
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-animation.webp', 'rb'))
111
+ expect(result).not_to be_nil
112
+ expect(result.content_type).to eq('image/webp')
113
+ expect(result.format).to eq(:webp)
114
+ expect(result.has_multiple_frames).to eq(true)
115
+ expect(result.has_transparency).to eq(true)
116
+ expect(result.height_px).to eq(211)
117
+ expect(result.intrinsics).to be_nil
118
+ expect(result.orientation).to be_nil
119
+ expect(result.width_px).to eq(211)
120
+ end
121
+
122
+ it 'successfully skips malformed Exif chunks' do
123
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-malformed-exif.webp', 'rb'))
124
+ expect(result).not_to be_nil
125
+ expect(result.content_type).to eq('image/webp')
126
+ expect(result.format).to eq(:webp)
127
+ expect(result.has_multiple_frames).to eq(false)
128
+ expect(result.has_transparency).to eq(false)
129
+ expect(result.height_px).to eq(181)
130
+ expect(result.intrinsics).not_to be_nil
131
+ expect(result.intrinsics[:exif]).to be_nil
132
+ expect(result.intrinsics[:xmp]).not_to be_nil
133
+ expect(result.orientation).to be_nil
134
+ expect(result.width_px).to eq(65)
135
+ end
136
+ end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
8
8
  - Julik Tarkhanov
9
- autorequire:
9
+ autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2022-05-31 00:00:00.000000000 Z
12
+ date: 2022-07-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -252,6 +252,7 @@ files:
252
252
  - lib/parsers/psd_parser.rb
253
253
  - lib/parsers/tiff_parser.rb
254
254
  - lib/parsers/wav_parser.rb
255
+ - lib/parsers/webp_parser.rb
255
256
  - lib/parsers/zip_parser.rb
256
257
  - lib/parsers/zip_parser/file_reader.rb
257
258
  - lib/parsers/zip_parser/office_formats.rb
@@ -291,6 +292,7 @@ files:
291
292
  - spec/parsers/psd_parser_spec.rb
292
293
  - spec/parsers/tiff_parser_spec.rb
293
294
  - spec/parsers/wav_parser_spec.rb
295
+ - spec/parsers/webp_parser_spec.rb
294
296
  - spec/parsers/zip_parser_spec.rb
295
297
  - spec/read_limiter_spec.rb
296
298
  - spec/read_limits_config_spec.rb
@@ -302,7 +304,7 @@ licenses:
302
304
  - MIT (Hippocratic)
303
305
  metadata:
304
306
  allowed_push_host: https://rubygems.org
305
- post_install_message:
307
+ post_install_message:
306
308
  rdoc_options: []
307
309
  require_paths:
308
310
  - lib
@@ -317,8 +319,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
317
319
  - !ruby/object:Gem::Version
318
320
  version: '0'
319
321
  requirements: []
320
- rubygems_version: 3.1.6
321
- signing_key:
322
+ rubygems_version: 3.3.7
323
+ signing_key:
322
324
  specification_version: 4
323
325
  summary: A library for efficient parsing of file metadata
324
326
  test_files: []