format_parser 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d8bdd6daa59e43bbe6033562b804d3b1b9685c6e2aa5ef3f8d527c74516e1f6c
4
- data.tar.gz: d6035f0b3c819085ffd23c4988e48145976ba1f98547d16589b2a4b4aa3e7aa7
3
+ metadata.gz: cf7fbbf842a1ae6fcde3986b360877223ac699a87950848b508da15f8a8280ad
4
+ data.tar.gz: 29882db7afe75a1d3b6554f18dbc837cefb1dbe9e8927adafe959ac8d37ade84
5
5
  SHA512:
6
- metadata.gz: 1a2f6243d295589972c63e45b719f19a5761c872103c32bfaacaef9cee9b85551c18ddaaf3c4095567c358958e7b7bf78800fc3e5a8e6c9f774eab383dc3160c
7
- data.tar.gz: 73ba4f1099ccb4c490b50a8f531be843679159a6417980b2a61b905724905cf6b7e884127b66096e2237982dc6743e7f057bea0062472c1779e4d754dd5388ff
6
+ metadata.gz: 0cf33f73ac298f565020e9819c9c7d2e2af340490b869b97a008b4466ac2b0825fed70d5d9e255ef1192520cef92fdeeff0c7ade18d5e38910d6dc2fd0de89f3
7
+ data.tar.gz: c20cdc92df0d29d1e0c4b9f8c05644e17216f239a8d90e9c7af38f5566b4abaaf6f6289d5cc69d6a25b0ed644236403820b5c3402080f0b7ba40ca112b671d3a
data/.gitignore CHANGED
@@ -61,3 +61,6 @@ Gemfile.lock
61
61
 
62
62
  # rspec examples
63
63
  spec/examples.txt
64
+
65
+ # IntelliJ config:
66
+ /.idea/
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 1.4.0
2
+ * Add support for `WEBP` lossy, lossless and extended file formats.
3
+
4
+ ## 1.3.0
5
+ * Add `heif_parser` and support for `HEIF` and `HEIC` formats. Exif parsing is still missing.
6
+
7
+ ## 1.2.1
8
+ * Resolve bug when `stts` atom is `nil`
9
+
1
10
  ## 1.2.0
2
11
  * Add support for `codecs` in moov_parser for video metadata
3
12
 
data/Gemfile CHANGED
@@ -1,9 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
- gem 'ruby-debug-ide'
4
- gem 'debase'
5
- gem 'solargraph', group: :development
6
- gem 'pry', group: :development
7
-
8
3
  # Gem dependencies specified in the gemspec
9
4
  gemspec
data/README.md CHANGED
@@ -33,6 +33,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
33
33
  * OGG
34
34
  * MPEG, MPG
35
35
  * M3U
36
+ * WEBP
36
37
 
37
38
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
38
39
 
@@ -198,6 +199,10 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
198
199
  ### M3U
199
200
  - The M3U fixture files were created by one of the project maintainers
200
201
 
202
+ ### WEBP
203
+ - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
204
+ licensed, all of the WebP fixture files have been created by one of the project maintainers.
205
+
201
206
  ### .key
202
207
  - The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
203
208
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '1.2.0'
2
+ VERSION = '1.4.0'
3
3
  end
data/lib/io_utils.rb CHANGED
@@ -30,5 +30,30 @@ module FormatParser::IOUtils
30
30
  nil
31
31
  end
32
32
 
33
+ def read_int_8
34
+ safe_read(@buf, 1).unpack('C').first
35
+ end
36
+
37
+ def read_int_16
38
+ safe_read(@buf, 2).unpack('n').first
39
+ end
40
+
41
+ def read_int_32
42
+ safe_read(@buf, 4).unpack('N').first
43
+ end
44
+
45
+ def read_little_endian_int_16
46
+ safe_read(@buf, 2).unpack('v').first
47
+ end
48
+
49
+ def read_little_endian_int_32
50
+ safe_read(@buf, 4).unpack('V').first
51
+ end
52
+
53
+ # 'n' is the number of bytes to read
54
+ def read_string(n)
55
+ safe_read(@buf, n)
56
+ end
57
+
33
58
  ### TODO: Some kind of built-in offset for the read
34
59
  end
@@ -0,0 +1,431 @@
1
+ # HEIF stands for High-Efficiency Image File format, which is basically a container that is capable of storing an image, or a sequence of images in a single file.
2
+ # There are a number of variants of HEIF, which can be used to store images, sequences of images, or videos using different codecs.
3
+ # The variant that Apple uses to store images and sequences of images in its iOS and macOS operating systems is High Efficiency Image Coding (HEIC), which uses HEVC / H.265 for content compression.
4
+ class FormatParser::HEIFParser
5
+ include FormatParser::IOUtils
6
+
7
+ HEIF_MARKER = [0x68, 0x65, 0x69, 0x63].pack('C4') # heif marker
8
+ FILE_TYPE_BOX_MARKER = [0x66, 0x74, 0x79, 0x70].pack('C4') # ftyp marker
9
+ META_BOX_MARKER = [0x6D, 0x65, 0x74, 0x61].pack('C4') # meta marker
10
+ MIF1_MARKER = [0x6D, 0x69, 0x66, 0x31].pack('C4') # mif1 marker
11
+ MSF1_MARKER = [0x6D, 0x73, 0x66, 0x31].pack('C4') # msf1 marker
12
+ MEANINGLESS_BYTE = [0x00, 0x00, 0x00, 0x00].pack('C4')
13
+ HANDLER_MARKER = [0x68, 0x64, 0x6C, 0x72].pack('C4') # hdlr marker
14
+ ITEM_PROPERTIES_BOX = [0x69, 0x70, 0x72, 0x70].pack('C4') # iprp marker
15
+ ITEM_PROPERTIES_CONTAINER_BOX = [0x69, 0x70, 0x63, 0x6F].pack('C4') # ipco marker
16
+ IMAGE_SPATIAL_EXTENTS_BOX = [0x69, 0x73, 0x70, 0x65].pack('C4') # ispe marker
17
+ PIXEL_ASPECT_RATIO_BOX = [0x70, 0x61, 0x73, 0x70].pack('C4') # pasp marker
18
+ ITEM_INFO_BOX = [0x69, 0x69, 0x6E, 0x66].pack('C4') # iinf marker
19
+ ITEM_INFO_ENTRY = [0x69, 0x6E, 0x66, 0x65].pack('C4') # infe marker
20
+ MIME_MARKER = [0x6D, 0x69, 0x6D, 0x65].pack('C4') # mime marker
21
+ COLOUR_INFO_BOX = [0x63, 0x6F, 0x6C, 0x72].pack('C4') # colr marker
22
+ PIXEL_INFO_BOX = [0x70, 0x69, 0x78, 0x69].pack('C4') # pixi marker
23
+ RELATIVE_LOCATION_BOX = [0x72, 0x6C, 0x6F, 0x63].pack('C4') # rloc marker
24
+ CLEAN_APERTURE_BOX = [0x63, 0x6C, 0x61, 0x70].pack('C4') # clap marker
25
+ PRIMARY_ITEM_BOX = [0x70, 0x69, 0x74, 0x6D].pack('C4') # pitm marker
26
+ ITEM_PROPERTIES_ASSOCIATION_BOX = [0x69, 0x70, 0x6D, 0x61].pack('C4') # ipma marker
27
+ IMAGE_ROTATION_BOX = [0x69, 0x72, 0x6F, 0x74].pack('C4') # irot marker
28
+ HEADER_LENGTH = 8 # every box header has a length of 8 bytes
29
+ HEIC_MIME_POSSIBLE_TYPES = {
30
+ 'heic' => :heic,
31
+ 'heix' => :heix,
32
+ 'heim' => :heim,
33
+ 'heis' => :heis
34
+ }
35
+ HEIC_MIME_TYPE = 'image/heic'
36
+ HEIF_MIME_TYPE = 'image/heif'
37
+ # TODO: use the following when adding image-sequence parsing
38
+ # HEIC_SEQUENCE_MIME_TYPE = 'image/heic-sequence'
39
+ # HEIF_SEQUENCE_MIME_TYPE = 'image/heif-sequence'
40
+
41
+ def self.call(io)
42
+ new.call(io)
43
+ end
44
+
45
+ def call(io)
46
+ @buf = FormatParser::IOConstraint.new(io)
47
+ @format = nil
48
+ @@major_brand = nil
49
+ @width = nil
50
+ @height = nil
51
+ @exif_data_frames = []
52
+ @compatible_brands = nil
53
+ @metadata_start_pos = 0
54
+ @metadata_end_pos = 0
55
+ @handler_type = nil
56
+ @sub_items = nil
57
+ @pixel_aspect_ratio = nil
58
+ @colour_info = nil
59
+ @pixel_info = nil
60
+ @horizontal_offset = nil
61
+ @vertical_offset = nil
62
+ @clean_aperture = nil
63
+ @primary_item_id = 0
64
+ @item_props = {}
65
+ @rotation = 0
66
+ @item_props_idxs = []
67
+ @content_type = nil
68
+ scan
69
+ end
70
+
71
+ def scan
72
+ # All HEIC files must conform to ISO/IEC 23008-12:2017
73
+ # Moreover, all HEIC files conform to ISO/IEC 14496-12:2015 and should conform to Clause 4 of that spec.
74
+ # Files are formed as a series of objects, called boxes. All data is contained in such boxes.
75
+ # All boxes start with a header which defines both size and type.
76
+ # The size is the entire size of the box, including the size and type header, fields, and all contained boxes.
77
+ # The fields in the objects are stored with the most significant byte first, commonly known as network byte order or big-endian format.
78
+ # A HEIC file must contain a File Type Box (ftyp).
79
+ # A file conforms to all the requirements of the brands listed in the compatible_brands.
80
+ scan_file_type_box
81
+
82
+ # file may be identified by MIME type of Annex C of ISO/IEC 23008-12 if 'mif1' is the major brand or Annex D if 'msf1' is the major brand.
83
+ # the MIME indicates the nature and format of our assortment of bytes
84
+ # note particularly that the brand 'mif1' doesn't mandate a MovieBox ("moov").
85
+ # One or more brands must be included in the list of compatible brands
86
+ return if @compatible_brands.nil?
87
+ if @compatible_brands&.include?(MIF1_MARKER)
88
+ scan_meta_level_box
89
+ if @major_brand == MIF1_MARKER
90
+ @content_type = HEIF_MIME_TYPE
91
+ @format = :heif
92
+ elsif (@compatible_brands & HEIC_MIME_POSSIBLE_TYPES.keys).length > 0
93
+ @format = :heic
94
+ @content_type = HEIC_MIME_TYPE
95
+ end
96
+ end
97
+ if @compatible_brands&.include?(MSF1_MARKER)
98
+ # TODO
99
+ end
100
+
101
+ result = FormatParser::Image.new(
102
+ format: @format,
103
+ width_px: @width,
104
+ height_px: @height,
105
+ intrinsics: {
106
+ compatible_brands: @compatible_brands,
107
+ handler_type: @handler_type,
108
+ # 'sub_items': @sub_items, # enable this if you want to output all the sub-items in the image
109
+ pixel_aspect_ratio: @pixel_aspect_ratio,
110
+ colour_info: @colour_info,
111
+ pixel_info: @pixel_info,
112
+ horizontal_offset: @horizontal_offset,
113
+ vertical_offset: @vertical_offset,
114
+ clean_aperture: @clean_aperture,
115
+ rotation: @rotation
116
+ },
117
+ content_type: @content_type
118
+ )
119
+
120
+ result
121
+ end
122
+
123
+ def scan_file_type_box
124
+ file_type_box_length = read_int_32
125
+ return unless read_string(4) == FILE_TYPE_BOX_MARKER
126
+ @major_brand = read_string(4)
127
+ return unless @major_brand == HEIF_MARKER || @major_brand == MIF1_MARKER
128
+ read_string(4) # minor_brand
129
+
130
+ # Subtracting from the total length of the box specified in the header the size header itself (8 bytes = header length and length of ftyp)
131
+ # and the length of the major and minor brand, we obtain the compatible brands
132
+ data_left_length = file_type_box_length - HEADER_LENGTH - HEIF_MARKER.length - 4
133
+
134
+ @compatible_brands = []
135
+ (data_left_length / 4).times do
136
+ @compatible_brands << read_string(4)
137
+ end
138
+ end
139
+
140
+ def scan_meta_level_box
141
+ metadata_length = read_int_32
142
+ return unless read_string(4) == META_BOX_MARKER
143
+ @metadata_start_pos = @buf.pos
144
+ @metadata_end_pos = @buf.pos + metadata_length - HEADER_LENGTH # the real data is always without the 8 initial bytes of the handler
145
+ read_nil_version_and_flag
146
+
147
+ # we are looking for box/containers right beneath the Meta box
148
+ # we start with the HDLR (Handler) box..
149
+ handler_length = read_int_32
150
+ return unless read_string(4) == HANDLER_MARKER
151
+ handler_length -= HEADER_LENGTH # subtract the header as usual (will not be mentioned anymore from now on)
152
+ handler_start = @buf.pos
153
+ # the handler type declares the type of metadata and thus the process by which the media-data in the track is presented
154
+ # it also indicates the structure or format of the ‘meta’ box contents
155
+ read_nil_version_and_flag
156
+ read_string(4) # pre_defined bytes, always 4 null bytes in the hdlr box
157
+ @handler_type = read_string(4)
158
+ @buf.seek(handler_start + handler_length) # the remaining part is reserved
159
+
160
+ # ..continue looking for the IINF box and especially for the IPRP box, containing info about the image itself
161
+ next_box_length = read_int_32
162
+ next_box = read_string(4)
163
+ next_box_start_pos = @buf.pos
164
+ while @buf.pos < @metadata_end_pos # we iterate over all next incoming boxed but without going outside the meta-box
165
+ case next_box
166
+ when PRIMARY_ITEM_BOX
167
+ read_primary_item_box
168
+ when ITEM_INFO_BOX
169
+ read_item_info_box
170
+ when ITEM_PROPERTIES_BOX
171
+ read_item_properties_box
172
+ fill_primary_values
173
+ when next_box == ''
174
+ break
175
+ end
176
+ next_box_length, next_box, next_box_start_pos = get_next_box(next_box_start_pos, next_box_length, @metadata_end_pos)
177
+ end
178
+ end
179
+
180
+ def read_item_info_box
181
+ version = read_int_8
182
+ safe_skip(@buf, 3) # 0 flags
183
+ entry_count = if version == 0
184
+ read_int_16
185
+ else
186
+ read_int_32
187
+ end
188
+ @sub_items = []
189
+ entry_count.times {
190
+ item_info_entry_length = read_int_32
191
+ return unless read_string(4) == ITEM_INFO_ENTRY
192
+ item_info_end_pos = @buf.pos + item_info_entry_length - HEADER_LENGTH
193
+ version = read_int_8
194
+ safe_skip(@buf, 3) # 0 flags
195
+ case version
196
+ when 2
197
+ item_id = read_int_16
198
+ when 3
199
+ item_id = read_int_32
200
+ else
201
+ return # wrong version according to standards, hence return
202
+ end
203
+ safe_skip(@buf, 2) # not interested in the item_protection_index
204
+ item_type = read_string(4)
205
+ content_encoding = ''
206
+ if item_type == MIME_MARKER
207
+ content_encoding = read_string(item_info_end_pos - @buf.pos).delete!("\0") # remove the null-termination part for output visualization reason
208
+ end
209
+ @sub_items << {item_id: item_id, item_type: item_type, content_encoding: content_encoding}
210
+ @buf.seek(item_info_end_pos) # we are not interested in anything else, go directly to the end of this 'infe' box
211
+ }
212
+ end
213
+
214
+ def read_nil_version_and_flag
215
+ safe_skip(@buf, 1) # version, always 0 in this current box
216
+ safe_skip(@buf, 3) # flags, always 0 in this current box
217
+ end
218
+
219
+ def read_primary_item_box
220
+ version = read_int_8
221
+ safe_read(@buf, 3) # flags, always 0 in this current box
222
+ @primary_item_id = if version == 0
223
+ read_int_16
224
+ else
225
+ read_int_32
226
+ end
227
+ end
228
+
229
+ # the ITEM_PROPERTIES_CONTAINER_BOX contains an implicitly 1-based indexed list of item properties.
230
+ # While parsing such box we are storing the properties with its own index.
231
+ # Reason behind is that the primary_item will be associated to some of these properties through the same index
232
+ # and in order to output relevant data from the format_parser we need all the properties associated to the primary_item.
233
+ # Hence the need of the association between an item and its properties, found in the ITEM_PROPERTIES_ASSOCIATION_BOX
234
+ def read_item_properties_box
235
+ ipco_length = read_int_32
236
+ return unless read_string(4) == ITEM_PROPERTIES_CONTAINER_BOX
237
+ read_item_properties_container_box(ipco_length)
238
+ read_int_32 # ipma_length
239
+ return unless read_string(4) == ITEM_PROPERTIES_ASSOCIATION_BOX
240
+ read_item_properties_association_box
241
+ end
242
+
243
+ def read_item_properties_container_box(box_length)
244
+ end_of_ipco_box = @buf.pos + box_length - HEADER_LENGTH
245
+ item_prop_length = read_int_32
246
+ item_prop_name = read_string(4)
247
+ item_prop_start_pos = @buf.pos
248
+ item_prop_index = 1
249
+ while @buf.pos < end_of_ipco_box
250
+ case item_prop_name
251
+ when IMAGE_SPATIAL_EXTENTS_BOX
252
+ read_nil_version_and_flag
253
+ width = read_int_32
254
+ height = read_int_32
255
+ @item_props[item_prop_index] = {
256
+ type: IMAGE_SPATIAL_EXTENTS_BOX,
257
+ width: width,
258
+ height: height
259
+ }
260
+ when PIXEL_ASPECT_RATIO_BOX
261
+ h_spacing = read_int_32
262
+ v_spacing = read_int_32
263
+ pixel_aspect_ratio = "#{h_spacing}/#{v_spacing}"
264
+ @item_props[item_prop_index] = {
265
+ type: PIXEL_ASPECT_RATIO_BOX,
266
+ pixel_aspect_ratio: pixel_aspect_ratio
267
+ }
268
+ when COLOUR_INFO_BOX
269
+ colour_info = {
270
+ colour_primaries: read_int_16,
271
+ transfer_characteristics: read_int_16,
272
+ matrix_coefficients: read_int_16
273
+ }
274
+ @item_props[item_prop_index] = {
275
+ type: COLOUR_INFO_BOX,
276
+ colour_info: colour_info
277
+ }
278
+ when PIXEL_INFO_BOX
279
+ pixel_info = []
280
+ read_nil_version_and_flag
281
+ num_channels = read_int_8
282
+ channel = 1
283
+ while channel <= num_channels
284
+ channel += 1
285
+ pixel_info << {
286
+ "bits_in_channel_#{channel}": read_int_8
287
+ }
288
+ end
289
+ @item_props[item_prop_index] = {
290
+ type: PIXEL_INFO_BOX,
291
+ pixel_info: pixel_info
292
+ }
293
+ when RELATIVE_LOCATION_BOX
294
+ read_nil_version_and_flag
295
+ horizontal_offset = read_int_32
296
+ vertical_offset = read_int_32
297
+ @item_props[item_prop_index] = {
298
+ type: RELATIVE_LOCATION_BOX,
299
+ horizontal_offset: horizontal_offset,
300
+ vertical_offset: vertical_offset
301
+ }
302
+ when CLEAN_APERTURE_BOX
303
+ clean_aperture = []
304
+ clean_aperture << {
305
+ clean_aperture_width_n: read_int_32,
306
+ clean_aperture_width_d: read_int_32,
307
+ clean_aperture_height_n: read_int_32,
308
+ clean_aperture_height_d: read_int_32,
309
+ horiz_off_n: read_int_32,
310
+ horiz_off_d: read_int_32,
311
+ vert_off_n: read_int_32,
312
+ vert_off_d: read_int_32
313
+ }
314
+ @item_props[item_prop_index] = {
315
+ type: CLEAN_APERTURE_BOX,
316
+ clean_aperture: clean_aperture
317
+ }
318
+ when IMAGE_ROTATION_BOX
319
+ read_nil_version_and_flag
320
+ binary = convert_byte_to_binary(read_int_8)
321
+ # we need only the last 2 bits to retrieve the angle multiplier. angle multiplier * 90 specifies the angle
322
+ rotation = binary.slice(6, 2).join.to_i(2) * 90
323
+ @item_props[item_prop_index] = {
324
+ type: IMAGE_ROTATION_BOX,
325
+ rotation: rotation
326
+ }
327
+ end
328
+ item_prop_length, item_prop_name, item_prop_start_pos = get_next_box(item_prop_start_pos, item_prop_length, end_of_ipco_box)
329
+ item_prop_index += 1
330
+ end
331
+ end
332
+
333
+ def read_item_properties_association_box
334
+ version = read_int_8
335
+ safe_read(@buf, 2) # we skip the first 2 bytes of the flags (total of 3 bytes) cause we care only about the least significant bit
336
+ flags = read_int_8
337
+ entry_count = read_int_32
338
+ item_id = 0
339
+ entry_count.times do
340
+ item_id = if version == 0
341
+ read_int_16
342
+ else
343
+ read_int_32
344
+ end
345
+
346
+ association_count = read_int_8
347
+ association_count.times do
348
+ # we need to retrieve the "essential" bit which is just the first bit in the next byte
349
+ binary = convert_byte_to_binary(read_int_8)
350
+ # essential_bit = binary[0] # uncomment if needed
351
+ binary.concat(convert_byte_to_binary(read_int_8)) if (flags & 1) == 1 # if flag is 1 we need the next 15 bits instead of only the next 7 bits
352
+ # we need to nullify the 1st bit since that one was the essential bit and doesn't count now to calculate the property index
353
+ binary[0] = 0
354
+ item_property_index = binary.join.to_i(2)
355
+ # we are interested only in the primary item properties
356
+ @item_props_idxs << item_property_index if item_id == @primary_item_id
357
+ end
358
+
359
+ # we are interested only in the primary item
360
+ if item_id != @primary_item_id
361
+ next
362
+ else
363
+ return
364
+ end
365
+ end
366
+ end
367
+
368
+ def fill_primary_values
369
+ @item_props_idxs.each { |x|
370
+ next if @item_props[x].nil?
371
+ prop = @item_props[x]
372
+ case prop[:type]
373
+ when IMAGE_SPATIAL_EXTENTS_BOX
374
+ @width = prop[:width]
375
+ @height = prop[:height]
376
+ when PIXEL_ASPECT_RATIO_BOX
377
+ @pixel_aspect_ratio = prop[:pixel_aspect_ratio]
378
+ when COLOUR_INFO_BOX
379
+ @colour_info = prop[:colour_info]
380
+ when PIXEL_INFO_BOX
381
+ @pixel_info = prop[:pixel_info]
382
+ when RELATIVE_LOCATION_BOX
383
+ @horizontal_offset = prop[:horizontal_offset]
384
+ @vertical_offset = prop[:vertical_offset]
385
+ when CLEAN_APERTURE_BOX
386
+ @clean_aperture = prop[:clean_aperture]
387
+ when IMAGE_ROTATION_BOX
388
+ @rotation = prop[:rotation]
389
+ end
390
+ }
391
+ end
392
+
393
+ def next_meaningful_meta_byte
394
+ while @buf.pos < @metadata_end_pos
395
+ next_byte = read_string(4)
396
+ return next_byte if meaningful?(next_byte)
397
+ end
398
+ end
399
+
400
+ def get_next_box(box_start_pos, box_length, end_pos_upper_box)
401
+ skip_pos = box_start_pos + box_length - HEADER_LENGTH
402
+ @buf.seek(skip_pos)
403
+ return if skip_pos >= end_pos_upper_box
404
+ next_box_length = read_int_32
405
+ next_box_name = read_string(4)
406
+ [next_box_length, next_box_name, @buf.pos]
407
+ end
408
+
409
+ def meaningful?(byte)
410
+ byte != MEANINGLESS_BYTE
411
+ end
412
+
413
+ def convert_byte_to_binary(integer)
414
+ binary = []
415
+ while integer > 0
416
+ binary << integer % 2
417
+ integer /= 2
418
+ end
419
+ binary_value = binary.reverse
420
+ (8 - binary_value.length).times do
421
+ binary_value.prepend('0')
422
+ end
423
+ binary_value
424
+ end
425
+
426
+ def likely_match?(filename)
427
+ filename =~ /\.hei[cf]$/i
428
+ end
429
+
430
+ FormatParser.register_parser(new, natures: :image, formats: [:heif, :heic], priority: 2)
431
+ end
@@ -136,7 +136,7 @@ class FormatParser::MOOVParser
136
136
 
137
137
  if stts && mdhd
138
138
  timescale = mdhd.atom_fields[:tscale]
139
- sample_duration = stts.field_value(:entries).first[:sample_duration]
139
+ sample_duration = stts.field_value(:entries).dig(0, :sample_duration)
140
140
  if timescale.nil? || timescale == 0 || sample_duration.nil? || sample_duration == 0
141
141
  nil
142
142
  else
@@ -163,5 +163,5 @@ class FormatParser::MOOVParser
163
163
  end
164
164
  end
165
165
 
166
- FormatParser.register_parser new, natures: :video, formats: FTYP_MAP.values, priority: 1
166
+ FormatParser.register_parser new, natures: :video, formats: FTYP_MAP.values, priority: 3
167
167
  end
@@ -21,5 +21,5 @@ class FormatParser::PDFParser
21
21
  FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
22
22
  end
23
23
 
24
- FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 1
24
+ FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 3
25
25
  end
@@ -0,0 +1,162 @@
1
+ # WebP is an image format that provides superior lossless and lossy compression for images on the web, with support for
2
+ # transparency. It uses predictive coding to encode an image, predicting the values in a block of pixels based on the
3
+ # values of neighbouring blocks. A WebP file consists of VP8 or VP8L data, and a container based on RIFF. There is also
4
+ # an extended file format, VP8X, that optionally encodes various information such as the color profile, animation
5
+ # control data, transparency, and EXIF and/or XMP metadata.
6
+ #
7
+ # For more information, visit https://developers.google.com/speed/webp.
8
+ #
9
+ # TODO: Decide how to determine color mode (depends on variant, transformations, flags, etc.; maybe not worth it).
10
+
11
+ class FormatParser::WebpParser
12
+ include FormatParser::EXIFParser
13
+ include FormatParser::IOUtils
14
+
15
+ WEBP_MIME_TYPE = 'image/webp'
16
+
17
+ def likely_match?(filename)
18
+ filename =~ /\.webp$/i
19
+ end
20
+
21
+ def call(io)
22
+ @buf = FormatParser::IOConstraint.new(io)
23
+
24
+ # All WebP files start with the following 20 bytes:
25
+ #
26
+ # Offset | Description
27
+ # -------------------------------------------------------------------------------------
28
+ # 0...3 | "RIFF" (Since WebP is based on the RIFF file container format).
29
+ # 4...7 | The size of the file in bytes - 8 bytes.
30
+ # 8...11 | "WEBP" (To signify that this is a WebP file).
31
+ # 12...15 | The VP8 variant in use ("VP8 ", "VP8L" or "VP8X")
32
+ # 16...19 | The length of the VP8 data in bytes (i.e. the size of the file - 20 bytes).
33
+ riff, webp, variant = safe_read(@buf, 20).unpack('A4x4A4A4')
34
+ return unless riff == 'RIFF' && webp == 'WEBP'
35
+ read_data(variant)
36
+ end
37
+
38
+ private
39
+
40
+ def read_data(variant)
41
+ case variant
42
+ when 'VP8' # Lossy
43
+ read_lossy_data
44
+ when 'VP8L' # Lossless
45
+ read_lossless_data
46
+ when 'VP8X' # Extended
47
+ read_extended_data
48
+ else
49
+ nil
50
+ end
51
+ end
52
+
53
+ def read_lossy_data
54
+ # Encoded as a single VP8 key frame - a 10-byte uncompressed chunk followed by 2+ partitions of compressed data.
55
+ # The first 6 bytes of this chunk contains information that is mostly relevant when using VP8 as a video
56
+ # compression format, and can be ignored.
57
+ safe_skip(@buf, 6)
58
+
59
+ # The subsequent 4 bytes contain the image width and height, respectively, as 16-bit unsigned little endian
60
+ # integers.
61
+ width, height = safe_read(@buf, 4).unpack('S<S<')
62
+ create_image(width, height)
63
+ end
64
+
65
+ def read_lossless_data
66
+ # There is a single byte signature, 0x2F, that we can disregard.
67
+ safe_skip(@buf, 1)
68
+
69
+ # The subsequent 4 bytes contain the image width and height, respectively, as 14-bit unsigned little endian
70
+ # integers (minus one). The 4 remaining bits consist of a 1-bit flag indicating whether alpha is used, and a 3-bit
71
+ # version that is always zero.
72
+ dimensions = read_little_endian_int_32
73
+ width = (dimensions & 0x3fff) + 1
74
+ height = (dimensions >> 14 & 0x3fff) + 1
75
+ has_transparency = (dimensions >> 28 & 0x1) == 1
76
+
77
+ create_image(width, height, has_transparency: has_transparency)
78
+ end
79
+
80
+ def read_extended_data
81
+ # After the common RIFF header bytes, the extended file format has a series of 1-bit flags to signify the presence
82
+ # of optional information. These flags are as follows:
83
+ #
84
+ # |0|1|2|3|4|5|6|7|
85
+ # +-+-+-+-+-+-+-+-+
86
+ # |Rsv|I|L|E|X|A|R|
87
+ #
88
+ # Where:
89
+ # - Rsv & R = Reserved - Should be 0.
90
+ # - I = Set if file contains an ICC profile.
91
+ # - L = Set if file contains transparency information.
92
+ # - E = Set if file contains Exif metadata.
93
+ # - X = Set if file contains XMP metadata.
94
+ # - A = Set if file is an animated image.
95
+ flags = read_int_8
96
+ has_transparency = flags & 0x10 != 0
97
+ has_exif_metadata = flags & 0x08 != 0
98
+ has_xmp_metadata = flags & 0x04 != 0
99
+ has_multiple_frames = flags & 0x02 != 0
100
+
101
+ # The flags are followed by three reserved bytes of zeros, and then by the width and height, respectively - each
102
+ # occupying three bytes and each one less than the actual canvas measurements.
103
+ safe_skip(@buf, 3)
104
+ dimensions = safe_read(@buf, 6).unpack('VS')
105
+ width = (dimensions[0] & 0xffffff) + 1
106
+ height = (dimensions[0] >> 24 | dimensions[1] << 8 & 0xffffff) + 1
107
+
108
+ image = create_image(width, height, has_multiple_frames: has_multiple_frames, has_transparency: has_transparency)
109
+ augment_image(image) if has_exif_metadata || has_xmp_metadata || has_multiple_frames
110
+ image
111
+ end
112
+
113
+ def create_image(width, height, has_multiple_frames: false, has_transparency: false)
114
+ FormatParser::Image.new(
115
+ content_type: WEBP_MIME_TYPE,
116
+ format: :webp,
117
+ has_multiple_frames: has_multiple_frames,
118
+ has_transparency: has_transparency,
119
+ height_px: height,
120
+ width_px: width
121
+ )
122
+ end
123
+
124
+ def augment_image(image)
125
+ # We're going to scan the file looking for the EXIF, XMP and/or ANMF chunks.
126
+ intrinsics = {}
127
+ num_frames = 0
128
+ loop do
129
+ # Try to read the next chunk header, and break the loop if we've reached EOF.
130
+ begin
131
+ fourcc, chunk_size = safe_read(@buf, 8).unpack('A4V')
132
+ rescue InvalidRead
133
+ break
134
+ end
135
+
136
+ # Padding byte of 0 added if chunk size is odd.
137
+ safe_skip(@buf, 1) if chunk_size.odd?
138
+
139
+ case fourcc
140
+ when 'EXIF'
141
+ exif = exif_from_tiff_io(StringIO.new(safe_read(@buf, chunk_size)))
142
+ # We use ||= here as one Exif chunk at most should be present, even though it is possible for there to be more.
143
+ intrinsics[:exif] ||= exif
144
+ image.height_px, image.width_px = image.width_px, image.height_px if exif&.rotated?
145
+ image.orientation = exif&.orientation_sym
146
+ when 'XMP'
147
+ # We use ||= here as one XMP chunk at most should be present, even though it is possible for there to be more.
148
+ intrinsics[:xmp] ||= safe_read(@buf, chunk_size)
149
+ when 'ANMF'
150
+ num_frames += 1 if image.has_multiple_frames
151
+ safe_skip(@buf, chunk_size)
152
+ else
153
+ safe_skip(@buf, chunk_size)
154
+ end
155
+ end
156
+
157
+ image.intrinsics = intrinsics unless intrinsics.empty?
158
+ image.num_animation_or_video_frames = num_frames if num_frames > 0
159
+ end
160
+
161
+ FormatParser.register_parser new, natures: [:image], formats: [:webp]
162
+ end
@@ -60,5 +60,5 @@ class FormatParser::ZIPParser
60
60
  end
61
61
  end
62
62
 
63
- FormatParser.register_parser new, natures: [:archive, :document], formats: :zip, priority: 2
63
+ FormatParser.register_parser new, natures: [:archive, :document], formats: :zip, priority: 4
64
64
  end
@@ -0,0 +1,75 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::HEIFParser do
4
+ it 'is able to parse single heif image with heic major brand' do
5
+ heif_path = fixtures_dir + 'HEIF/SingleImage.heic'
6
+
7
+ result = subject.call(File.open(heif_path, 'rb'))
8
+ expect(result).not_to be_nil
9
+ expect(result.nature).to eq(:image)
10
+ expect(result.format).to eq(:heic)
11
+ expect(result.width_px).to eq(4000)
12
+ expect(result.height_px).to eq(3000)
13
+ expect(result.content_type).to eq('image/heic')
14
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
15
+ end
16
+
17
+ it 'is able to parse single heif image with mif1 major brand' do
18
+ heif_path = fixtures_dir + 'HEIF/SingleImage_Autumn.heic'
19
+
20
+ result = subject.call(File.open(heif_path, 'rb'))
21
+ expect(result).not_to be_nil
22
+ expect(result.nature).to eq(:image)
23
+ expect(result.format).to eq(:heif)
24
+ expect(result.width_px).to eq(1440)
25
+ expect(result.height_px).to eq(960)
26
+ expect(result.content_type).to eq('image/heif')
27
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
28
+ end
29
+
30
+ it 'is able to parse image collection with mif1 major brand' do
31
+ heif_path = fixtures_dir + 'HEIF/ImageCollection.heic'
32
+
33
+ result = subject.call(File.open(heif_path, 'rb'))
34
+ expect(result).not_to be_nil
35
+ expect(result.nature).to eq(:image)
36
+ expect(result.format).to eq(:heif)
37
+ expect(result.width_px).to eq(1440)
38
+ expect(result.height_px).to eq(960)
39
+ expect(result.content_type).to eq('image/heif')
40
+ end
41
+
42
+ it 'is able to parse image collection with colour info' do
43
+ heif_path = fixtures_dir + 'HEIF/SingleImage_Autumn_WithColourInfo.heic'
44
+
45
+ result = subject.call(File.open(heif_path, 'rb'))
46
+ expect(result).not_to be_nil
47
+ expect(result.nature).to eq(:image)
48
+ expect(result.format).to eq(:heic)
49
+ expect(result.width_px).to eq(1440)
50
+ expect(result.height_px).to eq(960)
51
+ colour_info = result.intrinsics[:colour_info]
52
+ expect(colour_info[:colour_primaries]).to eq(28259)
53
+ expect(colour_info[:transfer_characteristics]).to eq(27768)
54
+ expect(colour_info[:matrix_coefficients]).to eq(2)
55
+ expect(result.content_type).to eq('image/heic')
56
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
57
+ end
58
+
59
+ it 'is able to parse image collection with pixel info' do
60
+ heif_path = fixtures_dir + 'HEIF/SingleImage_Autumn_WithColourInfo.heic'
61
+
62
+ result = subject.call(File.open(heif_path, 'rb'))
63
+ expect(result).not_to be_nil
64
+ expect(result.nature).to eq(:image)
65
+ expect(result.format).to eq(:heic)
66
+ expect(result.width_px).to eq(1440)
67
+ expect(result.height_px).to eq(960)
68
+ pixel_info = result.intrinsics[:pixel_info]
69
+ expect(pixel_info[0][:bits_in_channel_2]).to eq(8)
70
+ expect(pixel_info[1][:bits_in_channel_3]).to eq(8)
71
+ expect(pixel_info[2][:bits_in_channel_4]).to eq(8)
72
+ expect(result.content_type).to eq('image/heic')
73
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
74
+ end
75
+ end
@@ -0,0 +1,121 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::WebpParser do
4
+ it 'does not parse files with an invalid RIFF header' do
5
+ result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-header.webp', 'rb'))
6
+ expect(result).to be_nil
7
+ end
8
+
9
+ it 'does not parse files with an unrecognised variant' do
10
+ result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
11
+ expect(result).to be_nil
12
+ end
13
+
14
+ it 'successfully parses lossy (VP8) WebP files' do
15
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossy.webp', 'rb'))
16
+ expect(result).not_to be_nil
17
+ expect(result.content_type).to eq('image/webp')
18
+ expect(result.format).to eq(:webp)
19
+ expect(result.has_multiple_frames).to eq(false)
20
+ expect(result.has_transparency).to eq(false)
21
+ expect(result.height_px).to eq(181)
22
+ expect(result.intrinsics).to be_nil
23
+ expect(result.orientation).to be_nil
24
+ expect(result.width_px).to eq(65)
25
+ end
26
+
27
+ it 'successfully parses lossless WebP files' do
28
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossless.webp', 'rb'))
29
+ expect(result).not_to be_nil
30
+ expect(result.content_type).to eq('image/webp')
31
+ expect(result.format).to eq(:webp)
32
+ expect(result.has_multiple_frames).to eq(false)
33
+ expect(result.has_transparency).to eq(false)
34
+ expect(result.height_px).to eq(181)
35
+ expect(result.intrinsics).to be_nil
36
+ expect(result.orientation).to be_nil
37
+ expect(result.width_px).to eq(65)
38
+ end
39
+
40
+ it 'successfully parses lossless WebP files with an alpha channel' do
41
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossless-alpha.webp', 'rb'))
42
+ expect(result).not_to be_nil
43
+ expect(result.content_type).to eq('image/webp')
44
+ expect(result.format).to eq(:webp)
45
+ expect(result.has_multiple_frames).to eq(false)
46
+ expect(result.has_transparency).to eq(true)
47
+ expect(result.height_px).to eq(181)
48
+ expect(result.intrinsics).to be_nil
49
+ expect(result.orientation).to be_nil
50
+ expect(result.width_px).to eq(65)
51
+ end
52
+
53
+ it 'successfully parses extended WebP files' do
54
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended.webp', 'rb'))
55
+ expect(result).not_to be_nil
56
+ expect(result.content_type).to eq('image/webp')
57
+ expect(result.format).to eq(:webp)
58
+ expect(result.has_multiple_frames).to eq(false)
59
+ expect(result.has_transparency).to eq(false)
60
+ expect(result.height_px).to eq(181)
61
+ expect(result.intrinsics).to be_nil
62
+ expect(result.orientation).to be_nil
63
+ expect(result.width_px).to eq(65)
64
+ end
65
+
66
+ it 'successfully parses extended WebP files with an alpha channel' do
67
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-alpha.webp', 'rb'))
68
+ expect(result).not_to be_nil
69
+ expect(result.content_type).to eq('image/webp')
70
+ expect(result.format).to eq(:webp)
71
+ expect(result.has_multiple_frames).to eq(false)
72
+ expect(result.has_transparency).to eq(true)
73
+ expect(result.height_px).to eq(181)
74
+ expect(result.intrinsics).to be_nil
75
+ expect(result.orientation).to be_nil
76
+ expect(result.width_px).to eq(65)
77
+ end
78
+
79
+ it 'successfully parses extended WebP files with Exif metadata' do
80
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-exif.webp', 'rb'))
81
+ expect(result).not_to be_nil
82
+ expect(result.content_type).to eq('image/webp')
83
+ expect(result.format).to eq(:webp)
84
+ expect(result.has_multiple_frames).to eq(false)
85
+ expect(result.has_transparency).to eq(false)
86
+ expect(result.height_px).to eq(181)
87
+ expect(result.intrinsics).not_to be_nil
88
+ expect(result.intrinsics[:exif]).not_to be_nil
89
+ expect(result.intrinsics[:exif].image_length).to eq(result.height_px)
90
+ expect(result.intrinsics[:exif].image_width).to eq(result.width_px)
91
+ expect(result.orientation).to eq(:top_left)
92
+ expect(result.width_px).to eq(65)
93
+ end
94
+
95
+ it 'successfully parses extended WebP files with XMP metadata' do
96
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-xmp.webp', 'rb'))
97
+ expect(result).not_to be_nil
98
+ expect(result.content_type).to eq('image/webp')
99
+ expect(result.format).to eq(:webp)
100
+ expect(result.has_multiple_frames).to eq(false)
101
+ expect(result.has_transparency).to eq(false)
102
+ expect(result.height_px).to eq(181)
103
+ expect(result.intrinsics).not_to be_nil
104
+ expect(result.intrinsics[:xmp]).not_to be_nil
105
+ expect(result.orientation).to be_nil
106
+ expect(result.width_px).to eq(65)
107
+ end
108
+
109
+ it 'successfully parses extended WebP files with animation' do
110
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-animation.webp', 'rb'))
111
+ expect(result).not_to be_nil
112
+ expect(result.content_type).to eq('image/webp')
113
+ expect(result.format).to eq(:webp)
114
+ expect(result.has_multiple_frames).to eq(true)
115
+ expect(result.has_transparency).to eq(true)
116
+ expect(result.height_px).to eq(211)
117
+ expect(result.intrinsics).to be_nil
118
+ expect(result.orientation).to be_nil
119
+ expect(result.width_px).to eq(211)
120
+ end
121
+ end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
8
8
  - Julik Tarkhanov
9
- autorequire:
9
+ autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2022-04-07 00:00:00.000000000 Z
12
+ date: 2022-07-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -238,6 +238,7 @@ files:
238
238
  - lib/parsers/fdx_parser.rb
239
239
  - lib/parsers/flac_parser.rb
240
240
  - lib/parsers/gif_parser.rb
241
+ - lib/parsers/heif_parser.rb
241
242
  - lib/parsers/jpeg_parser.rb
242
243
  - lib/parsers/m3u_parser.rb
243
244
  - lib/parsers/moov_parser.rb
@@ -251,6 +252,7 @@ files:
251
252
  - lib/parsers/psd_parser.rb
252
253
  - lib/parsers/tiff_parser.rb
253
254
  - lib/parsers/wav_parser.rb
255
+ - lib/parsers/webp_parser.rb
254
256
  - lib/parsers/zip_parser.rb
255
257
  - lib/parsers/zip_parser/file_reader.rb
256
258
  - lib/parsers/zip_parser/office_formats.rb
@@ -278,6 +280,7 @@ files:
278
280
  - spec/parsers/fdx_parser_spec.rb
279
281
  - spec/parsers/flac_parser_spec.rb
280
282
  - spec/parsers/gif_parser_spec.rb
283
+ - spec/parsers/heif_parser_spec.rb
281
284
  - spec/parsers/jpeg_parser_spec.rb
282
285
  - spec/parsers/m3u_parser_spec.rb
283
286
  - spec/parsers/moov_parser_spec.rb
@@ -289,6 +292,7 @@ files:
289
292
  - spec/parsers/psd_parser_spec.rb
290
293
  - spec/parsers/tiff_parser_spec.rb
291
294
  - spec/parsers/wav_parser_spec.rb
295
+ - spec/parsers/webp_parser_spec.rb
292
296
  - spec/parsers/zip_parser_spec.rb
293
297
  - spec/read_limiter_spec.rb
294
298
  - spec/read_limits_config_spec.rb
@@ -300,7 +304,7 @@ licenses:
300
304
  - MIT (Hippocratic)
301
305
  metadata:
302
306
  allowed_push_host: https://rubygems.org
303
- post_install_message:
307
+ post_install_message:
304
308
  rdoc_options: []
305
309
  require_paths:
306
310
  - lib
@@ -315,8 +319,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
315
319
  - !ruby/object:Gem::Version
316
320
  version: '0'
317
321
  requirements: []
318
- rubygems_version: 3.3.4
319
- signing_key:
322
+ rubygems_version: 3.2.33
323
+ signing_key:
320
324
  specification_version: 4
321
325
  summary: A library for efficient parsing of file metadata
322
326
  test_files: []