format_parser 1.2.0 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d8bdd6daa59e43bbe6033562b804d3b1b9685c6e2aa5ef3f8d527c74516e1f6c
4
- data.tar.gz: d6035f0b3c819085ffd23c4988e48145976ba1f98547d16589b2a4b4aa3e7aa7
3
+ metadata.gz: cf7fbbf842a1ae6fcde3986b360877223ac699a87950848b508da15f8a8280ad
4
+ data.tar.gz: 29882db7afe75a1d3b6554f18dbc837cefb1dbe9e8927adafe959ac8d37ade84
5
5
  SHA512:
6
- metadata.gz: 1a2f6243d295589972c63e45b719f19a5761c872103c32bfaacaef9cee9b85551c18ddaaf3c4095567c358958e7b7bf78800fc3e5a8e6c9f774eab383dc3160c
7
- data.tar.gz: 73ba4f1099ccb4c490b50a8f531be843679159a6417980b2a61b905724905cf6b7e884127b66096e2237982dc6743e7f057bea0062472c1779e4d754dd5388ff
6
+ metadata.gz: 0cf33f73ac298f565020e9819c9c7d2e2af340490b869b97a008b4466ac2b0825fed70d5d9e255ef1192520cef92fdeeff0c7ade18d5e38910d6dc2fd0de89f3
7
+ data.tar.gz: c20cdc92df0d29d1e0c4b9f8c05644e17216f239a8d90e9c7af38f5566b4abaaf6f6289d5cc69d6a25b0ed644236403820b5c3402080f0b7ba40ca112b671d3a
data/.gitignore CHANGED
@@ -61,3 +61,6 @@ Gemfile.lock
61
61
 
62
62
  # rspec examples
63
63
  spec/examples.txt
64
+
65
+ # IntelliJ config:
66
+ /.idea/
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 1.4.0
2
+ * Add support for `WEBP` lossy, lossless and extended file formats.
3
+
4
+ ## 1.3.0
5
+ * Add `heif_parser` and support for `HEIF` and `HEIC` formats. Exif parsing is still missing.
6
+
7
+ ## 1.2.1
8
+ * Resolve bug when `stts` atom is `nil`
9
+
1
10
  ## 1.2.0
2
11
  * Add support for `codecs` in moov_parser for video metadata
3
12
 
data/Gemfile CHANGED
@@ -1,9 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
- gem 'ruby-debug-ide'
4
- gem 'debase'
5
- gem 'solargraph', group: :development
6
- gem 'pry', group: :development
7
-
8
3
  # Gem dependencies specified in the gemspec
9
4
  gemspec
data/README.md CHANGED
@@ -33,6 +33,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
33
33
  * OGG
34
34
  * MPEG, MPG
35
35
  * M3U
36
+ * WEBP
36
37
 
37
38
  ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
38
39
 
@@ -198,6 +199,10 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
198
199
  ### M3U
199
200
  - The M3U fixture files were created by one of the project maintainers
200
201
 
202
+ ### WEBP
203
+ - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
204
+ licensed, all of the WebP fixture files have been created by one of the project maintainers.
205
+
201
206
  ### .key
202
207
  - The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
203
208
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '1.2.0'
2
+ VERSION = '1.4.0'
3
3
  end
data/lib/io_utils.rb CHANGED
@@ -30,5 +30,30 @@ module FormatParser::IOUtils
30
30
  nil
31
31
  end
32
32
 
33
+ def read_int_8
34
+ safe_read(@buf, 1).unpack('C').first
35
+ end
36
+
37
+ def read_int_16
38
+ safe_read(@buf, 2).unpack('n').first
39
+ end
40
+
41
+ def read_int_32
42
+ safe_read(@buf, 4).unpack('N').first
43
+ end
44
+
45
+ def read_little_endian_int_16
46
+ safe_read(@buf, 2).unpack('v').first
47
+ end
48
+
49
+ def read_little_endian_int_32
50
+ safe_read(@buf, 4).unpack('V').first
51
+ end
52
+
53
+ # 'n' is the number of bytes to read
54
+ def read_string(n)
55
+ safe_read(@buf, n)
56
+ end
57
+
33
58
  ### TODO: Some kind of built-in offset for the read
34
59
  end
@@ -0,0 +1,431 @@
1
+ # HEIF stands for High-Efficiency Image File format, which is basically a container that is capable of storing an image, or a sequence of images in a single file.
2
+ # There are a number of variants of HEIF, which can be used to store images, sequences of images, or videos using different codecs.
3
+ # The variant that Apple uses to store images and sequences of images in its iOS and macOS operating systems is High Efficiency Image Coding (HEIC), which uses HEVC / H.265 for content compression.
4
+ class FormatParser::HEIFParser
5
+ include FormatParser::IOUtils
6
+
7
+ HEIF_MARKER = [0x68, 0x65, 0x69, 0x63].pack('C4') # heic marker
8
+ FILE_TYPE_BOX_MARKER = [0x66, 0x74, 0x79, 0x70].pack('C4') # ftyp marker
9
+ META_BOX_MARKER = [0x6D, 0x65, 0x74, 0x61].pack('C4') # meta marker
10
+ MIF1_MARKER = [0x6D, 0x69, 0x66, 0x31].pack('C4') # mif1 marker
11
+ MSF1_MARKER = [0x6D, 0x73, 0x66, 0x31].pack('C4') # msf1 marker
12
+ MEANINGLESS_BYTE = [0x00, 0x00, 0x00, 0x00].pack('C4')
13
+ HANDLER_MARKER = [0x68, 0x64, 0x6C, 0x72].pack('C4') # hdlr marker
14
+ ITEM_PROPERTIES_BOX = [0x69, 0x70, 0x72, 0x70].pack('C4') # iprp marker
15
+ ITEM_PROPERTIES_CONTAINER_BOX = [0x69, 0x70, 0x63, 0x6F].pack('C4') # ipco marker
16
+ IMAGE_SPATIAL_EXTENTS_BOX = [0x69, 0x73, 0x70, 0x65].pack('C4') # ispe marker
17
+ PIXEL_ASPECT_RATIO_BOX = [0x70, 0x61, 0x73, 0x70].pack('C4') # pasp marker
18
+ ITEM_INFO_BOX = [0x69, 0x69, 0x6E, 0x66].pack('C4') # iinf marker
19
+ ITEM_INFO_ENTRY = [0x69, 0x6E, 0x66, 0x65].pack('C4') # infe marker
20
+ MIME_MARKER = [0x6D, 0x69, 0x6D, 0x65].pack('C4') # mime marker
21
+ COLOUR_INFO_BOX = [0x63, 0x6F, 0x6C, 0x72].pack('C4') # colr marker
22
+ PIXEL_INFO_BOX = [0x70, 0x69, 0x78, 0x69].pack('C4') # pixi marker
23
+ RELATIVE_LOCATION_BOX = [0x72, 0x6C, 0x6F, 0x63].pack('C4') # rloc marker
24
+ CLEAN_APERTURE_BOX = [0x63, 0x6C, 0x61, 0x70].pack('C4') # clap marker
25
+ PRIMARY_ITEM_BOX = [0x70, 0x69, 0x74, 0x6D].pack('C4') # pitm marker
26
+ ITEM_PROPERTIES_ASSOCIATION_BOX = [0x69, 0x70, 0x6D, 0x61].pack('C4') # ipma marker
27
+ IMAGE_ROTATION_BOX = [0x69, 0x72, 0x6F, 0x74].pack('C4') # irot marker
28
+ HEADER_LENGTH = 8 # every box header has a length of 8 bytes
29
+ HEIC_MIME_POSSIBLE_TYPES = {
30
+ 'heic' => :heic,
31
+ 'heix' => :heix,
32
+ 'heim' => :heim,
33
+ 'heis' => :heis
34
+ }
35
+ HEIC_MIME_TYPE = 'image/heic'
36
+ HEIF_MIME_TYPE = 'image/heif'
37
+ # TODO: use the following when adding image-sequence parsing
38
+ # HEIC_SEQUENCE_MIME_TYPE = 'image/heic-sequence'
39
+ # HEIF_SEQUENCE_MIME_TYPE = 'image/heif-sequence'
40
+
41
+ def self.call(io)
42
+ new.call(io)
43
+ end
44
+
45
+ def call(io)
46
+ @buf = FormatParser::IOConstraint.new(io)
47
+ @format = nil
48
+ @@major_brand = nil
49
+ @width = nil
50
+ @height = nil
51
+ @exif_data_frames = []
52
+ @compatible_brands = nil
53
+ @metadata_start_pos = 0
54
+ @metadata_end_pos = 0
55
+ @handler_type = nil
56
+ @sub_items = nil
57
+ @pixel_aspect_ratio = nil
58
+ @colour_info = nil
59
+ @pixel_info = nil
60
+ @horizontal_offset = nil
61
+ @vertical_offset = nil
62
+ @clean_aperture = nil
63
+ @primary_item_id = 0
64
+ @item_props = {}
65
+ @rotation = 0
66
+ @item_props_idxs = []
67
+ @content_type = nil
68
+ scan
69
+ end
70
+
71
+ def scan
72
+ # All HEIC files must conform to ISO/IEC 23008-12:2017
73
+ # Moreover, all HEIC files conform to ISO/IEC 14496-12:2015 and should conform to Clause 4 of that spec.
74
+ # Files are formed as a series of objects, called boxes. All data is contained in such boxes.
75
+ # All boxes start with a header which defines both size and type.
76
+ # The size is the entire size of the box, including the size and type header, fields, and all contained boxes.
77
+ # The fields in the objects are stored with the most significant byte first, commonly known as network byte order or big-endian format.
78
+ # A HEIC file must contain a File Type Box (ftyp).
79
+ # A file conforms to all the requirements of the brands listed in the compatible_brands.
80
+ scan_file_type_box
81
+
82
+ # file may be identified by MIME type of Annex C of ISO/IEC 23008-12 if 'mif1' is the major brand or Annex D if 'msf1' is the major brand.
83
+ # the MIME indicates the nature and format of our assortment of bytes
84
+ # note particularly that the brand 'mif1' doesn't mandate a MovieBox ("moov").
85
+ # One or more brands must be included in the list of compatible brands
86
+ return if @compatible_brands.nil?
87
+ if @compatible_brands&.include?(MIF1_MARKER)
88
+ scan_meta_level_box
89
+ if @major_brand == MIF1_MARKER
90
+ @content_type = HEIF_MIME_TYPE
91
+ @format = :heif
92
+ elsif (@compatible_brands & HEIC_MIME_POSSIBLE_TYPES.keys).length > 0
93
+ @format = :heic
94
+ @content_type = HEIC_MIME_TYPE
95
+ end
96
+ end
97
+ if @compatible_brands&.include?(MSF1_MARKER)
98
+ # TODO
99
+ end
100
+
101
+ result = FormatParser::Image.new(
102
+ format: @format,
103
+ width_px: @width,
104
+ height_px: @height,
105
+ intrinsics: {
106
+ compatible_brands: @compatible_brands,
107
+ handler_type: @handler_type,
108
+ # 'sub_items': @sub_items, # enable this if you want to output all the sub-items in the image
109
+ pixel_aspect_ratio: @pixel_aspect_ratio,
110
+ colour_info: @colour_info,
111
+ pixel_info: @pixel_info,
112
+ horizontal_offset: @horizontal_offset,
113
+ vertical_offset: @vertical_offset,
114
+ clean_aperture: @clean_aperture,
115
+ rotation: @rotation
116
+ },
117
+ content_type: @content_type
118
+ )
119
+
120
+ result
121
+ end
122
+
123
+ def scan_file_type_box
124
+ file_type_box_length = read_int_32
125
+ return unless read_string(4) == FILE_TYPE_BOX_MARKER
126
+ @major_brand = read_string(4)
127
+ return unless @major_brand == HEIF_MARKER || @major_brand == MIF1_MARKER
128
+ read_string(4) # minor_brand
129
+
130
+ # Subtracting the box header (8 bytes: the size field plus the 'ftyp' type field) and the major and minor brands
131
+ # (4 bytes each) from the total box length given in the header, we obtain the length of the compatible brands list
132
+ data_left_length = file_type_box_length - HEADER_LENGTH - HEIF_MARKER.length - 4
133
+
134
+ @compatible_brands = []
135
+ (data_left_length / 4).times do
136
+ @compatible_brands << read_string(4)
137
+ end
138
+ end
139
+
140
+ def scan_meta_level_box
141
+ metadata_length = read_int_32
142
+ return unless read_string(4) == META_BOX_MARKER
143
+ @metadata_start_pos = @buf.pos
144
+ @metadata_end_pos = @buf.pos + metadata_length - HEADER_LENGTH # the real data is always without the 8 initial bytes of the header
145
+ read_nil_version_and_flag
146
+
147
+ # we are looking for box/containers right beneath the Meta box
148
+ # we start with the HDLR (Handler) box..
149
+ handler_length = read_int_32
150
+ return unless read_string(4) == HANDLER_MARKER
151
+ handler_length -= HEADER_LENGTH # subtract the header as usual (will not be mentioned anymore from now on)
152
+ handler_start = @buf.pos
153
+ # the handler type declares the type of metadata and thus the process by which the media-data in the track is presented
154
+ # it also indicates the structure or format of the ‘meta’ box contents
155
+ read_nil_version_and_flag
156
+ read_string(4) # pre_defined bytes, always 4 null bytes in the hdlr box
157
+ @handler_type = read_string(4)
158
+ @buf.seek(handler_start + handler_length) # the remaining part is reserved
159
+
160
+ # ..continue looking for the IINF box and especially for the IPRP box, containing info about the image itself
161
+ next_box_length = read_int_32
162
+ next_box = read_string(4)
163
+ next_box_start_pos = @buf.pos
164
+ while @buf.pos < @metadata_end_pos # we iterate over all the following boxes, but without going outside the meta-box
165
+ case next_box
166
+ when PRIMARY_ITEM_BOX
167
+ read_primary_item_box
168
+ when ITEM_INFO_BOX
169
+ read_item_info_box
170
+ when ITEM_PROPERTIES_BOX
171
+ read_item_properties_box
172
+ fill_primary_values
173
+ when next_box == ''
174
+ break
175
+ end
176
+ next_box_length, next_box, next_box_start_pos = get_next_box(next_box_start_pos, next_box_length, @metadata_end_pos)
177
+ end
178
+ end
179
+
180
+ def read_item_info_box
181
+ version = read_int_8
182
+ safe_skip(@buf, 3) # 0 flags
183
+ entry_count = if version == 0
184
+ read_int_16
185
+ else
186
+ read_int_32
187
+ end
188
+ @sub_items = []
189
+ entry_count.times {
190
+ item_info_entry_length = read_int_32
191
+ return unless read_string(4) == ITEM_INFO_ENTRY
192
+ item_info_end_pos = @buf.pos + item_info_entry_length - HEADER_LENGTH
193
+ version = read_int_8
194
+ safe_skip(@buf, 3) # 0 flags
195
+ case version
196
+ when 2
197
+ item_id = read_int_16
198
+ when 3
199
+ item_id = read_int_32
200
+ else
201
+ return # wrong version according to standards, hence return
202
+ end
203
+ safe_skip(@buf, 2) # not interested in the item_protection_index
204
+ item_type = read_string(4)
205
+ content_encoding = ''
206
+ if item_type == MIME_MARKER
207
+ content_encoding = read_string(item_info_end_pos - @buf.pos).delete!("\0") # remove the null-termination part for output visualization reason
208
+ end
209
+ @sub_items << {item_id: item_id, item_type: item_type, content_encoding: content_encoding}
210
+ @buf.seek(item_info_end_pos) # we are not interested in anything else, go directly to the end of this 'infe' box
211
+ }
212
+ end
213
+
214
+ def read_nil_version_and_flag
215
+ safe_skip(@buf, 1) # version, always 0 in this current box
216
+ safe_skip(@buf, 3) # flags, always 0 in this current box
217
+ end
218
+
219
+ def read_primary_item_box
220
+ version = read_int_8
221
+ safe_read(@buf, 3) # flags, always 0 in this current box
222
+ @primary_item_id = if version == 0
223
+ read_int_16
224
+ else
225
+ read_int_32
226
+ end
227
+ end
228
+
229
+ # The ITEM_PROPERTIES_CONTAINER_BOX contains an implicitly 1-based indexed list of item properties.
230
+ # While parsing this box we store each property under its own index.
231
+ # The reason is that the primary_item will be associated with some of these properties through the same index,
232
+ # and in order to output relevant data from the format_parser we need all the properties associated with the primary_item.
233
+ # Hence the need for the association between an item and its properties, found in the ITEM_PROPERTIES_ASSOCIATION_BOX
234
+ def read_item_properties_box
235
+ ipco_length = read_int_32
236
+ return unless read_string(4) == ITEM_PROPERTIES_CONTAINER_BOX
237
+ read_item_properties_container_box(ipco_length)
238
+ read_int_32 # ipma_length
239
+ return unless read_string(4) == ITEM_PROPERTIES_ASSOCIATION_BOX
240
+ read_item_properties_association_box
241
+ end
242
+
243
+ def read_item_properties_container_box(box_length)
244
+ end_of_ipco_box = @buf.pos + box_length - HEADER_LENGTH
245
+ item_prop_length = read_int_32
246
+ item_prop_name = read_string(4)
247
+ item_prop_start_pos = @buf.pos
248
+ item_prop_index = 1
249
+ while @buf.pos < end_of_ipco_box
250
+ case item_prop_name
251
+ when IMAGE_SPATIAL_EXTENTS_BOX
252
+ read_nil_version_and_flag
253
+ width = read_int_32
254
+ height = read_int_32
255
+ @item_props[item_prop_index] = {
256
+ type: IMAGE_SPATIAL_EXTENTS_BOX,
257
+ width: width,
258
+ height: height
259
+ }
260
+ when PIXEL_ASPECT_RATIO_BOX
261
+ h_spacing = read_int_32
262
+ v_spacing = read_int_32
263
+ pixel_aspect_ratio = "#{h_spacing}/#{v_spacing}"
264
+ @item_props[item_prop_index] = {
265
+ type: PIXEL_ASPECT_RATIO_BOX,
266
+ pixel_aspect_ratio: pixel_aspect_ratio
267
+ }
268
+ when COLOUR_INFO_BOX
269
+ colour_info = {
270
+ colour_primaries: read_int_16,
271
+ transfer_characteristics: read_int_16,
272
+ matrix_coefficients: read_int_16
273
+ }
274
+ @item_props[item_prop_index] = {
275
+ type: COLOUR_INFO_BOX,
276
+ colour_info: colour_info
277
+ }
278
+ when PIXEL_INFO_BOX
279
+ pixel_info = []
280
+ read_nil_version_and_flag
281
+ num_channels = read_int_8
282
+ channel = 1
283
+ while channel <= num_channels
284
+ channel += 1
285
+ pixel_info << {
286
+ "bits_in_channel_#{channel}": read_int_8
287
+ }
288
+ end
289
+ @item_props[item_prop_index] = {
290
+ type: PIXEL_INFO_BOX,
291
+ pixel_info: pixel_info
292
+ }
293
+ when RELATIVE_LOCATION_BOX
294
+ read_nil_version_and_flag
295
+ horizontal_offset = read_int_32
296
+ vertical_offset = read_int_32
297
+ @item_props[item_prop_index] = {
298
+ type: RELATIVE_LOCATION_BOX,
299
+ horizontal_offset: horizontal_offset,
300
+ vertical_offset: vertical_offset
301
+ }
302
+ when CLEAN_APERTURE_BOX
303
+ clean_aperture = []
304
+ clean_aperture << {
305
+ clean_aperture_width_n: read_int_32,
306
+ clean_aperture_width_d: read_int_32,
307
+ clean_aperture_height_n: read_int_32,
308
+ clean_aperture_height_d: read_int_32,
309
+ horiz_off_n: read_int_32,
310
+ horiz_off_d: read_int_32,
311
+ vert_off_n: read_int_32,
312
+ vert_off_d: read_int_32
313
+ }
314
+ @item_props[item_prop_index] = {
315
+ type: CLEAN_APERTURE_BOX,
316
+ clean_aperture: clean_aperture
317
+ }
318
+ when IMAGE_ROTATION_BOX
319
+ read_nil_version_and_flag
320
+ binary = convert_byte_to_binary(read_int_8)
321
+ # we need only the last 2 bits to retrieve the angle multiplier. angle multiplier * 90 specifies the angle
322
+ rotation = binary.slice(6, 2).join.to_i(2) * 90
323
+ @item_props[item_prop_index] = {
324
+ type: IMAGE_ROTATION_BOX,
325
+ rotation: rotation
326
+ }
327
+ end
328
+ item_prop_length, item_prop_name, item_prop_start_pos = get_next_box(item_prop_start_pos, item_prop_length, end_of_ipco_box)
329
+ item_prop_index += 1
330
+ end
331
+ end
332
+
333
+ def read_item_properties_association_box
334
+ version = read_int_8
335
+ safe_read(@buf, 2) # we skip the first 2 bytes of the flags (total of 3 bytes) because we care only about the least significant bit
336
+ flags = read_int_8
337
+ entry_count = read_int_32
338
+ item_id = 0
339
+ entry_count.times do
340
+ item_id = if version == 0
341
+ read_int_16
342
+ else
343
+ read_int_32
344
+ end
345
+
346
+ association_count = read_int_8
347
+ association_count.times do
348
+ # we need to retrieve the "essential" bit, which is just the first bit in the next byte
349
+ binary = convert_byte_to_binary(read_int_8)
350
+ # essential_bit = binary[0] # uncomment if needed
351
+ binary.concat(convert_byte_to_binary(read_int_8)) if (flags & 1) == 1 # if flag is 1 we need the next 15 bits instead of only the next 7 bits
352
+ # we need to nullify the 1st bit since that one was the essential bit and doesn't count now to calculate the property index
353
+ binary[0] = 0
354
+ item_property_index = binary.join.to_i(2)
355
+ # we are interested only in the primary item properties
356
+ @item_props_idxs << item_property_index if item_id == @primary_item_id
357
+ end
358
+
359
+ # we are interested only in the primary item
360
+ if item_id != @primary_item_id
361
+ next
362
+ else
363
+ return
364
+ end
365
+ end
366
+ end
367
+
368
+ def fill_primary_values
369
+ @item_props_idxs.each { |x|
370
+ next if @item_props[x].nil?
371
+ prop = @item_props[x]
372
+ case prop[:type]
373
+ when IMAGE_SPATIAL_EXTENTS_BOX
374
+ @width = prop[:width]
375
+ @height = prop[:height]
376
+ when PIXEL_ASPECT_RATIO_BOX
377
+ @pixel_aspect_ratio = prop[:pixel_aspect_ratio]
378
+ when COLOUR_INFO_BOX
379
+ @colour_info = prop[:colour_info]
380
+ when PIXEL_INFO_BOX
381
+ @pixel_info = prop[:pixel_info]
382
+ when RELATIVE_LOCATION_BOX
383
+ @horizontal_offset = prop[:horizontal_offset]
384
+ @vertical_offset = prop[:vertical_offset]
385
+ when CLEAN_APERTURE_BOX
386
+ @clean_aperture = prop[:clean_aperture]
387
+ when IMAGE_ROTATION_BOX
388
+ @rotation = prop[:rotation]
389
+ end
390
+ }
391
+ end
392
+
393
+ def next_meaningful_meta_byte
394
+ while @buf.pos < @metadata_end_pos
395
+ next_byte = read_string(4)
396
+ return next_byte if meaningful?(next_byte)
397
+ end
398
+ end
399
+
400
+ def get_next_box(box_start_pos, box_length, end_pos_upper_box)
401
+ skip_pos = box_start_pos + box_length - HEADER_LENGTH
402
+ @buf.seek(skip_pos)
403
+ return if skip_pos >= end_pos_upper_box
404
+ next_box_length = read_int_32
405
+ next_box_name = read_string(4)
406
+ [next_box_length, next_box_name, @buf.pos]
407
+ end
408
+
409
+ def meaningful?(byte)
410
+ byte != MEANINGLESS_BYTE
411
+ end
412
+
413
+ def convert_byte_to_binary(integer)
414
+ binary = []
415
+ while integer > 0
416
+ binary << integer % 2
417
+ integer /= 2
418
+ end
419
+ binary_value = binary.reverse
420
+ (8 - binary_value.length).times do
421
+ binary_value.prepend('0')
422
+ end
423
+ binary_value
424
+ end
425
+
426
+ def likely_match?(filename)
427
+ filename =~ /\.hei[cf]$/i
428
+ end
429
+
430
+ FormatParser.register_parser(new, natures: :image, formats: [:heif, :heic], priority: 2)
431
+ end
@@ -136,7 +136,7 @@ class FormatParser::MOOVParser
136
136
 
137
137
  if stts && mdhd
138
138
  timescale = mdhd.atom_fields[:tscale]
139
- sample_duration = stts.field_value(:entries).first[:sample_duration]
139
+ sample_duration = stts.field_value(:entries).dig(0, :sample_duration)
140
140
  if timescale.nil? || timescale == 0 || sample_duration.nil? || sample_duration == 0
141
141
  nil
142
142
  else
@@ -163,5 +163,5 @@ class FormatParser::MOOVParser
163
163
  end
164
164
  end
165
165
 
166
- FormatParser.register_parser new, natures: :video, formats: FTYP_MAP.values, priority: 1
166
+ FormatParser.register_parser new, natures: :video, formats: FTYP_MAP.values, priority: 3
167
167
  end
@@ -21,5 +21,5 @@ class FormatParser::PDFParser
21
21
  FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
22
22
  end
23
23
 
24
- FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 1
24
+ FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 3
25
25
  end
@@ -0,0 +1,162 @@
1
+ # WebP is an image format that provides superior lossless and lossy compression for images on the web, with support for
2
+ # transparency. It uses predictive coding to encode an image, predicting the values in a block of pixels based on the
3
+ # values of neighbouring blocks. A WebP file consists of VP8 or VP8L data, and a container based on RIFF. There is also
4
+ # an extended file format, VP8X, that optionally encodes various information such as the color profile, animation
5
+ # control data, transparency, and EXIF and/or XMP metadata.
6
+ #
7
+ # For more information, visit https://developers.google.com/speed/webp.
8
+ #
9
+ # TODO: Decide how to determine color mode (depends on variant, transformations, flags, etc.; maybe not worth it).
10
+
11
+ class FormatParser::WebpParser
12
+ include FormatParser::EXIFParser
13
+ include FormatParser::IOUtils
14
+
15
+ WEBP_MIME_TYPE = 'image/webp'
16
+
17
+ def likely_match?(filename)
18
+ filename =~ /\.webp$/i
19
+ end
20
+
21
+ def call(io)
22
+ @buf = FormatParser::IOConstraint.new(io)
23
+
24
+ # All WebP files start with the following 20 bytes:
25
+ #
26
+ # Offset | Description
27
+ # -------------------------------------------------------------------------------------
28
+ # 0...3 | "RIFF" (Since WebP is based on the RIFF file container format).
29
+ # 4...7 | The size of the file in bytes - 8 bytes.
30
+ # 8...11 | "WEBP" (To signify that this is a WebP file).
31
+ # 12...15 | The VP8 variant in use ("VP8 ", "VP8L" or "VP8X")
32
+ # 16...19 | The length of the VP8 data in bytes (i.e. The size of the file - 20 bytes).
33
+ riff, webp, variant = safe_read(@buf, 20).unpack('A4x4A4A4')
34
+ return unless riff == 'RIFF' && webp == 'WEBP'
35
+ read_data(variant)
36
+ end
37
+
38
+ private
39
+
40
+ def read_data(variant)
41
+ case variant
42
+ when 'VP8' # Lossy
43
+ read_lossy_data
44
+ when 'VP8L' # Lossless
45
+ read_lossless_data
46
+ when 'VP8X' # Extended
47
+ read_extended_data
48
+ else
49
+ nil
50
+ end
51
+ end
52
+
53
+ def read_lossy_data
54
+ # Encoded as a single VP8 key frame - a 10-byte uncompressed chunk followed by 2+ partitions of compressed data.
55
+ # The first 6 bytes of this chunk contains information that is mostly relevant when using VP8 as a video
56
+ # compression format, and can be ignored.
57
+ safe_skip(@buf, 6)
58
+
59
+ # The subsequent 4 bytes contain the image width and height, respectively, as 16-bit unsigned little endian
60
+ # integers.
61
+ width, height = safe_read(@buf, 4).unpack('S<S<')
62
+ create_image(width, height)
63
+ end
64
+
65
+ def read_lossless_data
66
+ # There is a single byte signature, 0x2F, that we can disregard.
67
+ safe_skip(@buf, 1)
68
+
69
+ # The subsequent 4 bytes contain the image width and height, respectively, as 14-bit unsigned little endian
70
+ # integers (minus one). The 4 remaining bits consist of a 1-bit flag indicating whether alpha is used, and a 3-bit
71
+ # version that is always zero.
72
+ dimensions = read_little_endian_int_32
73
+ width = (dimensions & 0x3fff) + 1
74
+ height = (dimensions >> 14 & 0x3fff) + 1
75
+ has_transparency = (dimensions >> 28 & 0x1) == 1
76
+
77
+ create_image(width, height, has_transparency: has_transparency)
78
+ end
79
+
80
+ def read_extended_data
81
+ # After the common RIFF header bytes, the extended file format has a series of 1-bit flags to signify the presence
82
+ # of optional information. These flags are as follows:
83
+ #
84
+ # |0|1|2|3|4|5|6|7|
85
+ # +-+-+-+-+-+-+-+-+
86
+ # |Rsv|I|L|E|X|A|R|
87
+ #
88
+ # Where:
89
+ # - Rsv & R = Reserved - Should be 0.
90
+ # - I = Set if file contains an ICC profile.
91
+ # - L = Set if file contains transparency information.
92
+ # - E = Set if file contains Exif metadata.
93
+ # - X = Set if file contains XMP metadata.
94
+ # - A = Set if file is an animated image.
95
+ flags = read_int_8
96
+ has_transparency = flags & 0x10 != 0
97
+ has_exif_metadata = flags & 0x08 != 0
98
+ has_xmp_metadata = flags & 0x04 != 0
99
+ has_multiple_frames = flags & 0x02 != 0
100
+
101
+ # The flags are followed by three reserved bytes of zeros, and then by the width and height, respectively - each
102
+ # occupying three bytes and each one less than the actual canvas measurements.
103
+ safe_skip(@buf, 3)
104
+ dimensions = safe_read(@buf, 6).unpack('VS')
105
+ width = (dimensions[0] & 0xffffff) + 1
106
+ height = (dimensions[0] >> 24 | dimensions[1] << 8 & 0xffffff) + 1
107
+
108
+ image = create_image(width, height, has_multiple_frames: has_multiple_frames, has_transparency: has_transparency)
109
+ augment_image(image) if has_exif_metadata || has_xmp_metadata || has_multiple_frames
110
+ image
111
+ end
112
+
113
+ def create_image(width, height, has_multiple_frames: false, has_transparency: false)
114
+ FormatParser::Image.new(
115
+ content_type: WEBP_MIME_TYPE,
116
+ format: :webp,
117
+ has_multiple_frames: has_multiple_frames,
118
+ has_transparency: has_transparency,
119
+ height_px: height,
120
+ width_px: width
121
+ )
122
+ end
123
+
124
+ def augment_image(image)
125
+ # We're going to scan the file looking for the EXIF, XMP and/or ANMF chunks.
126
+ intrinsics = {}
127
+ num_frames = 0
128
+ loop do
129
+ # Try to read the next chunk header, and break the loop if we've reached EOF.
130
+ begin
131
+ fourcc, chunk_size = safe_read(@buf, 8).unpack('A4V')
132
+ rescue InvalidRead
133
+ break
134
+ end
135
+
136
+ # Padding byte of 0 added if chunk size is odd.
137
+ safe_skip(@buf, 1) if chunk_size.odd?
138
+
139
+ case fourcc
140
+ when 'EXIF'
141
+ exif = exif_from_tiff_io(StringIO.new(safe_read(@buf, chunk_size)))
142
+ # We use ||= here as one Exif chunk at most should be present, even though it is possible for there to be more.
143
+ intrinsics[:exif] ||= exif
144
+ image.height_px, image.width_px = image.width_px, image.height_px if exif&.rotated?
145
+ image.orientation = exif&.orientation_sym
146
+ when 'XMP'
147
+ # We use ||= here as one XMP chunk at most should be present, even though it is possible for there to be more.
148
+ intrinsics[:xmp] ||= safe_read(@buf, chunk_size)
149
+ when 'ANMF'
150
+ num_frames += 1 if image.has_multiple_frames
151
+ safe_skip(@buf, chunk_size)
152
+ else
153
+ safe_skip(@buf, chunk_size)
154
+ end
155
+ end
156
+
157
+ image.intrinsics = intrinsics unless intrinsics.empty?
158
+ image.num_animation_or_video_frames = num_frames if num_frames > 0
159
+ end
160
+
161
+ FormatParser.register_parser new, natures: [:image], formats: [:webp]
162
+ end
@@ -60,5 +60,5 @@ class FormatParser::ZIPParser
60
60
  end
61
61
  end
62
62
 
63
- FormatParser.register_parser new, natures: [:archive, :document], formats: :zip, priority: 2
63
+ FormatParser.register_parser new, natures: [:archive, :document], formats: :zip, priority: 4
64
64
  end
@@ -0,0 +1,75 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::HEIFParser do
4
+ it 'is able to parse single heif image with heic major brand' do
5
+ heif_path = fixtures_dir + 'HEIF/SingleImage.heic'
6
+
7
+ result = subject.call(File.open(heif_path, 'rb'))
8
+ expect(result).not_to be_nil
9
+ expect(result.nature).to eq(:image)
10
+ expect(result.format).to eq(:heic)
11
+ expect(result.width_px).to eq(4000)
12
+ expect(result.height_px).to eq(3000)
13
+ expect(result.content_type).to eq('image/heic')
14
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
15
+ end
16
+
17
+ it 'is able to parse single heif image with mif1 major brand' do
18
+ heif_path = fixtures_dir + 'HEIF/SingleImage_Autumn.heic'
19
+
20
+ result = subject.call(File.open(heif_path, 'rb'))
21
+ expect(result).not_to be_nil
22
+ expect(result.nature).to eq(:image)
23
+ expect(result.format).to eq(:heif)
24
+ expect(result.width_px).to eq(1440)
25
+ expect(result.height_px).to eq(960)
26
+ expect(result.content_type).to eq('image/heif')
27
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
28
+ end
29
+
30
+ it 'is able to parse image collection with mif1 major brand' do
31
+ heif_path = fixtures_dir + 'HEIF/ImageCollection.heic'
32
+
33
+ result = subject.call(File.open(heif_path, 'rb'))
34
+ expect(result).not_to be_nil
35
+ expect(result.nature).to eq(:image)
36
+ expect(result.format).to eq(:heif)
37
+ expect(result.width_px).to eq(1440)
38
+ expect(result.height_px).to eq(960)
39
+ expect(result.content_type).to eq('image/heif')
40
+ end
41
+
42
+ it 'is able to parse image collection with colour info' do
43
+ heif_path = fixtures_dir + 'HEIF/SingleImage_Autumn_WithColourInfo.heic'
44
+
45
+ result = subject.call(File.open(heif_path, 'rb'))
46
+ expect(result).not_to be_nil
47
+ expect(result.nature).to eq(:image)
48
+ expect(result.format).to eq(:heic)
49
+ expect(result.width_px).to eq(1440)
50
+ expect(result.height_px).to eq(960)
51
+ colour_info = result.intrinsics[:colour_info]
52
+ expect(colour_info[:colour_primaries]).to eq(28259)
53
+ expect(colour_info[:transfer_characteristics]).to eq(27768)
54
+ expect(colour_info[:matrix_coefficients]).to eq(2)
55
+ expect(result.content_type).to eq('image/heic')
56
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
57
+ end
58
+
59
+ it 'is able to parse image collection with pixel info' do
60
+ heif_path = fixtures_dir + 'HEIF/SingleImage_Autumn_WithColourInfo.heic'
61
+
62
+ result = subject.call(File.open(heif_path, 'rb'))
63
+ expect(result).not_to be_nil
64
+ expect(result.nature).to eq(:image)
65
+ expect(result.format).to eq(:heic)
66
+ expect(result.width_px).to eq(1440)
67
+ expect(result.height_px).to eq(960)
68
+ pixel_info = result.intrinsics[:pixel_info]
69
+ expect(pixel_info[0][:bits_in_channel_2]).to eq(8)
70
+ expect(pixel_info[1][:bits_in_channel_3]).to eq(8)
71
+ expect(pixel_info[2][:bits_in_channel_4]).to eq(8)
72
+ expect(result.content_type).to eq('image/heic')
73
+ expect(result.intrinsics[:compatible_brands].should =~ ['mif1', 'heic'])
74
+ end
75
+ end
@@ -0,0 +1,121 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::WebpParser do
4
+ it 'does not parse files with an invalid RIFF header' do
5
+ result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-header.webp', 'rb'))
6
+ expect(result).to be_nil
7
+ end
8
+
9
+ it 'does not parse files with an unrecognised variant' do
10
+ result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
11
+ expect(result).to be_nil
12
+ end
13
+
14
+ it 'successfully parses lossy (VP8) WebP files' do
15
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossy.webp', 'rb'))
16
+ expect(result).not_to be_nil
17
+ expect(result.content_type).to eq('image/webp')
18
+ expect(result.format).to eq(:webp)
19
+ expect(result.has_multiple_frames).to eq(false)
20
+ expect(result.has_transparency).to eq(false)
21
+ expect(result.height_px).to eq(181)
22
+ expect(result.intrinsics).to be_nil
23
+ expect(result.orientation).to be_nil
24
+ expect(result.width_px).to eq(65)
25
+ end
26
+
27
+ it 'successfully parses lossless WebP files' do
28
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossless.webp', 'rb'))
29
+ expect(result).not_to be_nil
30
+ expect(result.content_type).to eq('image/webp')
31
+ expect(result.format).to eq(:webp)
32
+ expect(result.has_multiple_frames).to eq(false)
33
+ expect(result.has_transparency).to eq(false)
34
+ expect(result.height_px).to eq(181)
35
+ expect(result.intrinsics).to be_nil
36
+ expect(result.orientation).to be_nil
37
+ expect(result.width_px).to eq(65)
38
+ end
39
+
40
+ it 'successfully parses lossless WebP files with an alpha channel' do
41
+ result = subject.call(File.open(fixtures_dir + 'WEBP/lossless-alpha.webp', 'rb'))
42
+ expect(result).not_to be_nil
43
+ expect(result.content_type).to eq('image/webp')
44
+ expect(result.format).to eq(:webp)
45
+ expect(result.has_multiple_frames).to eq(false)
46
+ expect(result.has_transparency).to eq(true)
47
+ expect(result.height_px).to eq(181)
48
+ expect(result.intrinsics).to be_nil
49
+ expect(result.orientation).to be_nil
50
+ expect(result.width_px).to eq(65)
51
+ end
52
+
53
+ it 'successfully parses extended WebP files' do
54
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended.webp', 'rb'))
55
+ expect(result).not_to be_nil
56
+ expect(result.content_type).to eq('image/webp')
57
+ expect(result.format).to eq(:webp)
58
+ expect(result.has_multiple_frames).to eq(false)
59
+ expect(result.has_transparency).to eq(false)
60
+ expect(result.height_px).to eq(181)
61
+ expect(result.intrinsics).to be_nil
62
+ expect(result.orientation).to be_nil
63
+ expect(result.width_px).to eq(65)
64
+ end
65
+
66
+ it 'successfully parses extended WebP files with an alpha channel' do
67
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-alpha.webp', 'rb'))
68
+ expect(result).not_to be_nil
69
+ expect(result.content_type).to eq('image/webp')
70
+ expect(result.format).to eq(:webp)
71
+ expect(result.has_multiple_frames).to eq(false)
72
+ expect(result.has_transparency).to eq(true)
73
+ expect(result.height_px).to eq(181)
74
+ expect(result.intrinsics).to be_nil
75
+ expect(result.orientation).to be_nil
76
+ expect(result.width_px).to eq(65)
77
+ end
78
+
79
+ it 'successfully parses extended WebP files with Exif metadata' do
80
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-exif.webp', 'rb'))
81
+ expect(result).not_to be_nil
82
+ expect(result.content_type).to eq('image/webp')
83
+ expect(result.format).to eq(:webp)
84
+ expect(result.has_multiple_frames).to eq(false)
85
+ expect(result.has_transparency).to eq(false)
86
+ expect(result.height_px).to eq(181)
87
+ expect(result.intrinsics).not_to be_nil
88
+ expect(result.intrinsics[:exif]).not_to be_nil
89
+ expect(result.intrinsics[:exif].image_length).to eq(result.height_px)
90
+ expect(result.intrinsics[:exif].image_width).to eq(result.width_px)
91
+ expect(result.orientation).to eq(:top_left)
92
+ expect(result.width_px).to eq(65)
93
+ end
94
+
95
+ it 'successfully parses extended WebP files with XMP metadata' do
96
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-xmp.webp', 'rb'))
97
+ expect(result).not_to be_nil
98
+ expect(result.content_type).to eq('image/webp')
99
+ expect(result.format).to eq(:webp)
100
+ expect(result.has_multiple_frames).to eq(false)
101
+ expect(result.has_transparency).to eq(false)
102
+ expect(result.height_px).to eq(181)
103
+ expect(result.intrinsics).not_to be_nil
104
+ expect(result.intrinsics[:xmp]).not_to be_nil
105
+ expect(result.orientation).to be_nil
106
+ expect(result.width_px).to eq(65)
107
+ end
108
+
109
+ it 'successfully parses extended WebP files with animation' do
110
+ result = subject.call(File.open(fixtures_dir + 'WEBP/extended-animation.webp', 'rb'))
111
+ expect(result).not_to be_nil
112
+ expect(result.content_type).to eq('image/webp')
113
+ expect(result.format).to eq(:webp)
114
+ expect(result.has_multiple_frames).to eq(true)
115
+ expect(result.has_transparency).to eq(true)
116
+ expect(result.height_px).to eq(211)
117
+ expect(result.intrinsics).to be_nil
118
+ expect(result.orientation).to be_nil
119
+ expect(result.width_px).to eq(211)
120
+ end
121
+ end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
8
8
  - Julik Tarkhanov
9
- autorequire:
9
+ autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2022-04-07 00:00:00.000000000 Z
12
+ date: 2022-07-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -238,6 +238,7 @@ files:
238
238
  - lib/parsers/fdx_parser.rb
239
239
  - lib/parsers/flac_parser.rb
240
240
  - lib/parsers/gif_parser.rb
241
+ - lib/parsers/heif_parser.rb
241
242
  - lib/parsers/jpeg_parser.rb
242
243
  - lib/parsers/m3u_parser.rb
243
244
  - lib/parsers/moov_parser.rb
@@ -251,6 +252,7 @@ files:
251
252
  - lib/parsers/psd_parser.rb
252
253
  - lib/parsers/tiff_parser.rb
253
254
  - lib/parsers/wav_parser.rb
255
+ - lib/parsers/webp_parser.rb
254
256
  - lib/parsers/zip_parser.rb
255
257
  - lib/parsers/zip_parser/file_reader.rb
256
258
  - lib/parsers/zip_parser/office_formats.rb
@@ -278,6 +280,7 @@ files:
278
280
  - spec/parsers/fdx_parser_spec.rb
279
281
  - spec/parsers/flac_parser_spec.rb
280
282
  - spec/parsers/gif_parser_spec.rb
283
+ - spec/parsers/heif_parser_spec.rb
281
284
  - spec/parsers/jpeg_parser_spec.rb
282
285
  - spec/parsers/m3u_parser_spec.rb
283
286
  - spec/parsers/moov_parser_spec.rb
@@ -289,6 +292,7 @@ files:
289
292
  - spec/parsers/psd_parser_spec.rb
290
293
  - spec/parsers/tiff_parser_spec.rb
291
294
  - spec/parsers/wav_parser_spec.rb
295
+ - spec/parsers/webp_parser_spec.rb
292
296
  - spec/parsers/zip_parser_spec.rb
293
297
  - spec/read_limiter_spec.rb
294
298
  - spec/read_limits_config_spec.rb
@@ -300,7 +304,7 @@ licenses:
300
304
  - MIT (Hippocratic)
301
305
  metadata:
302
306
  allowed_push_host: https://rubygems.org
303
- post_install_message:
307
+ post_install_message:
304
308
  rdoc_options: []
305
309
  require_paths:
306
310
  - lib
@@ -315,8 +319,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
315
319
  - !ruby/object:Gem::Version
316
320
  version: '0'
317
321
  requirements: []
318
- rubygems_version: 3.3.4
319
- signing_key:
322
+ rubygems_version: 3.2.33
323
+ signing_key:
320
324
  specification_version: 4
321
325
  summary: A library for efficient parsing of file metadata
322
326
  test_files: []