format_parser 2.3.0 → 2.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 513e379194ef145016aa1c34de9065ffb99d35b5cd8bcf65e43b3941d1ca92a2
4
- data.tar.gz: e05007b688350e716f381be9f7aae6721b89a4de7daef858f25973e826c12fac
3
+ metadata.gz: 7ed2f3d1a503aee2a0b68f00dfae5253fa88cb9ab95b61066642749785542c06
4
+ data.tar.gz: 3cac7120ef119969273568714fa3f75c07e3493bd80c18b89e4a7ac93efbc9dd
5
5
  SHA512:
6
- metadata.gz: 8823d7eea23e201b59b31059f36626bc44d761b5b92dae26eb2ce3243041cd8b6fbdc61d964174bd89243138124f597e89c3e2473e697f55294643253a3165c7
7
- data.tar.gz: 777257aafb666788c0dceadd748977ef3dc381e91225d48cc6c882db5d921b6608397dd59651cce1b5b13e1fb263c9f429cd2fbff0f896543777187c15a97cdb
6
+ metadata.gz: 311393234d4bc595a81169dcfa658b872c539179fba29149d7794df64d388ebbad7e1bdbf8f963018b438c3ad0a350a1f23f47fd1a383a0d50d6897f429c727c
7
+ data.tar.gz: e73d90e24cef65d936eebd0765403b3d83e52fdb6bf3640bfabba2afd2b059d4a85a1010fca6ff575e6e0ff84e003cc75fa1bb4a8cb2adbff0c1fb55e97161d1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,21 @@
1
+ ## 2.4.3
2
+ * Improve the resilience of ISOBMFF parsing to missing mandatory boxes and fields.
3
+ * Simplify ISOBMFF frame rate calculations.
4
+ * Refactor.
5
+
6
+ ## 2.4.2 (yanked)
7
+ * Added support for PDF 2.0
8
+ * Expanded test coverage for PDF parsing
9
+
10
+ ## 2.4.1 (yanked)
11
+ * Revert change where variable frame rates in MOV and MP4 files would result in an array value for `frame_rate`.
12
+
13
+ ## 2.4.0 (yanked)
14
+ * Adapt the ISOBMFF-based decoder for parsing MOV and MP4 files.
15
+ * Fix MOV/MP4 issues:
16
+ * MP4 files being misidentified as MOV files.
17
+ * Dimensions being miscalculated when files include multiple tracks or transformations.
18
+
1
19
  ## 2.3.0
2
20
  * Add support for `RW2` files.
3
21
 
data/README.md CHANGED
@@ -28,6 +28,10 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
28
28
  * JPEG
29
29
  * M3U
30
30
  * M4A
31
+ * M4B
32
+ * M4P
33
+ * M4R
34
+ * M4V
31
35
  * MOV
32
36
  * MP3
33
37
  * MP4
@@ -195,16 +199,15 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
195
199
  ### M3U
196
200
  - The M3U fixture files were created by one of the project maintainers
197
201
 
198
- ### M4A
199
- - fixture.m4a was created by one of the project maintainers and is MIT licensed
200
-
201
- ### MOOV
202
- - bmff.mp4 is borrowed from the [bmff](https://github.com/zuku/bmff) project
203
- - Test_Circular MOV files were created by one of the project maintainers and are MIT licensed
202
+ ### MOV
203
+ - Fixtures were downloaded from https://pixabay.com/ (with some modifications) and are subject to the [Pixabay Licence](https://pixabay.com/service/license/).
204
204
 
205
205
  ### MP3
206
206
  - Cassy.mp3 has been produced by WeTransfer and may be used with the library for the purposes of testing
207
207
 
208
+ ### MP4
209
+ - Fixtures were downloaded from https://pixabay.com/ (with some modifications) and are subject to the [Pixabay Licence](https://pixabay.com/service/license/).
210
+
208
211
  ### MPEG
209
212
  - The files (video 1 to 4) were downloaded from https://standaloneinstaller.com/blog/big-list-of-sample-videos-for-testers-124.html.
210
213
  - Video 5 was downloaded from https://archive.org/details/ligouHDR-HC1_sample1.
@@ -215,6 +218,10 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
215
218
  ### OGG
216
219
  - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `with_garbage_at_the_end.ogg` have been generated by the project contributors
217
220
 
221
+ ### PDF
222
+ - PDF 2.0 files were downloaded from the [PDF Association public GitHub repository](https://github.com/pdf-association/pdf20examples). These files are licensed under the Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.
223
+ - Lorem Ipsum PDF files were created at WeTransfer for this project.
224
+
218
225
  ### PNG
219
226
  - `simulator_screenie.png` provided by [Rens Verhoeven](https://github.com/renssies)
220
227
 
@@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
34
34
 
35
35
  spec.add_dependency 'exifr', '>= 1.3.8'
36
36
  spec.add_dependency 'id3tag', '>= 0.14.2'
37
+ spec.add_dependency 'matrix'
37
38
  spec.add_dependency 'measurometer'
38
39
 
39
40
  spec.add_development_dependency 'parallel_tests'
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '2.3.0'
2
+ VERSION = '2.4.3'
3
3
  end
data/lib/io_utils.rb CHANGED
@@ -1,4 +1,11 @@
1
1
  module FormatParser::IOUtils
2
+ INTEGER_DIRECTIVES = {
3
+ 1 => 'C',
4
+ 2 => 'S',
5
+ 4 => 'L',
6
+ 8 => 'Q'
7
+ }
8
+
2
9
  class InvalidRead < ArgumentError
3
10
  end
4
11
 
@@ -26,41 +33,19 @@ module FormatParser::IOUtils
26
33
  nil
27
34
  end
28
35
 
29
- def read_int_8
30
- read_bytes(1).unpack('C').first
31
- end
32
-
33
- def read_int_16
34
- read_bytes(2).unpack('n').first
35
- end
36
-
37
- def read_int_32
38
- read_bytes(4).unpack('N').first
39
- end
40
-
41
- def read_int_64
42
- read_bytes(8).unpack('Q>').first
43
- end
44
-
45
- def read_little_endian_int_16
46
- read_bytes(2).unpack('v').first
47
- end
48
-
49
- def read_little_endian_int_32
50
- read_bytes(4).unpack('V').first
51
- end
52
-
53
- def read_fixed_point_16
54
- read_bytes(2).unpack('C2')
55
- end
56
-
57
- def read_fixed_point_32
58
- read_bytes(4).unpack('n2')
36
+ # Read an integer.
37
+ # @param [Integer] n Number of bytes. Defaults to 4 (32-bit).
38
+ # @param [Boolean] signed Signed if true, unsigned if false. Defaults to false (unsigned).
39
+ # @param [Boolean] big_endian Big-endian if true, little-endian if false. Defaults to true (big-endian).
40
+ def read_int(n: 4, signed: false, big_endian: true)
41
+ directive = INTEGER_DIRECTIVES[n]
42
+ directive.downcase! if signed
43
+ directive += (big_endian ? ">" : "<") if n > 1
44
+ read_bytes(n).unpack(directive).first
59
45
  end
60
46
 
61
- def read_fixed_point_32_2_30
62
- n = read_int_32
63
- [n >> 30, n & 0x3fffffff]
47
+ def read_fixed_point(fractional_digits: 16, **kwargs)
48
+ read_int(**kwargs) / 2.0**fractional_digits
64
49
  end
65
50
 
66
51
  # 'n' is the number of bytes to read
@@ -5,7 +5,7 @@ class FormatParser::CR3Parser::Decoder < FormatParser::ISOBaseMediaFileFormat::D
5
5
 
6
6
  protected
7
7
 
8
- ATOM_PARSERS = ATOM_PARSERS.merge({
8
+ BOX_PARSERS = BOX_PARSERS.merge({
9
9
  'CMT1' => :cmt1
10
10
  })
11
11
  CANON_METADATA_CONTAINER_UUID = '85c0b687820f11e08111f4ce462b6a48'
@@ -26,7 +26,7 @@ class FormatParser::CR3Parser::Decoder < FormatParser::ISOBaseMediaFileFormat::D
26
26
  usertype = read_bytes(16).unpack('H*').first
27
27
  fields = { usertype: usertype }
28
28
  children = if usertype == CANON_METADATA_CONTAINER_UUID
29
- build_atom_tree(size - 16)
29
+ build_box_tree(size - 16)
30
30
  else
31
31
  skip_bytes(size - 16)
32
32
  end
@@ -14,15 +14,17 @@ class FormatParser::CR3Parser
14
14
 
15
15
  return unless matches_cr3_definition?
16
16
 
17
- atom_tree = Decoder.new.build_atom_tree(0xffffffff, @buf)
18
- moov_atom = atom_tree.find { |atom| atom.type == 'moov' }
19
- cmt1_atom = moov_atom&.find_first_descendent(['CMT1'])
20
- return unless cmt1_atom
21
-
22
- width = cmt1_atom.fields[:image_width]
23
- height = cmt1_atom.fields[:image_length]
24
- rotated = cmt1_atom.fields[:rotated]
25
- orientation = cmt1_atom.fields[:orientation_sym]
17
+ box_tree = Measurometer.instrument('format_parser.cr3_parser.decoder.build_box_tree') do
18
+ Decoder.new.build_box_tree(0xffffffff, @buf)
19
+ end
20
+ moov_box = box_tree.find { |box| box.type == 'moov' }
21
+ cmt1_box = moov_box&.first_descendent('CMT1')
22
+ return unless cmt1_box
23
+
24
+ width = cmt1_box.fields[:image_width]
25
+ height = cmt1_box.fields[:image_length]
26
+ rotated = cmt1_box.fields[:rotated]
27
+ orientation = cmt1_box.fields[:orientation_sym]
26
28
  FormatParser::Image.new(
27
29
  format: :cr3,
28
30
  content_type: CR3_MIME_TYPE,
@@ -32,8 +34,8 @@ class FormatParser::CR3Parser
32
34
  display_width_px: rotated ? height : width,
33
35
  display_height_px: rotated ? width : height,
34
36
  intrinsics: {
35
- atom_tree: atom_tree,
36
- exif: cmt1_atom.fields,
37
+ box_tree: box_tree,
38
+ exif: cmt1_box.fields,
37
39
  },
38
40
  )
39
41
  end
@@ -121,7 +121,7 @@ class FormatParser::HEIFParser
121
121
  end
122
122
 
123
123
  def scan_file_type_box
124
- file_type_box_length = read_int_32
124
+ file_type_box_length = read_int
125
125
  return unless read_string(4) == FILE_TYPE_BOX_MARKER
126
126
  @major_brand = read_string(4)
127
127
  return unless @major_brand == HEIF_MARKER || @major_brand == MIF1_MARKER
@@ -138,7 +138,7 @@ class FormatParser::HEIFParser
138
138
  end
139
139
 
140
140
  def scan_meta_level_box
141
- metadata_length = read_int_32
141
+ metadata_length = read_int
142
142
  return unless read_string(4) == META_BOX_MARKER
143
143
  @metadata_start_pos = @buf.pos
144
144
  @metadata_end_pos = @buf.pos + metadata_length - HEADER_LENGTH # the real data is always without the 8 initial bytes of the handler
@@ -146,7 +146,7 @@ class FormatParser::HEIFParser
146
146
 
147
147
  # we are looking for box/containers right beneath the Meta box
148
148
  # we start with the HDLR (Handler) box..
149
- handler_length = read_int_32
149
+ handler_length = read_int
150
150
  return unless read_string(4) == HANDLER_MARKER
151
151
  handler_length -= HEADER_LENGTH # subtract the header as usual (will not be mentioned anymore from now on)
152
152
  handler_start = @buf.pos
@@ -158,7 +158,7 @@ class FormatParser::HEIFParser
158
158
  @buf.seek(handler_start + handler_length) # the remaining part is reserved
159
159
 
160
160
  # ..continue looking for the IINF box and especially for the IPRP box, containing info about the image itself
161
- next_box_length = read_int_32
161
+ next_box_length = read_int
162
162
  next_box = read_string(4)
163
163
  next_box_start_pos = @buf.pos
164
164
  while @buf.pos < @metadata_end_pos # we iterate over all next incoming boxed but without going outside the meta-box
@@ -178,25 +178,25 @@ class FormatParser::HEIFParser
178
178
  end
179
179
 
180
180
  def read_item_info_box
181
- version = read_int_8
181
+ version = read_int(n: 1)
182
182
  skip_bytes(3) # 0 flags
183
183
  entry_count = if version == 0
184
- read_int_16
184
+ read_int(n: 2)
185
185
  else
186
- read_int_32
186
+ read_int
187
187
  end
188
188
  @sub_items = []
189
189
  entry_count.times {
190
- item_info_entry_length = read_int_32
190
+ item_info_entry_length = read_int
191
191
  return unless read_string(4) == ITEM_INFO_ENTRY
192
192
  item_info_end_pos = @buf.pos + item_info_entry_length - HEADER_LENGTH
193
- version = read_int_8
193
+ version = read_int(n: 1)
194
194
  skip_bytes(3) # 0 flags
195
195
  case version
196
196
  when 2
197
- item_id = read_int_16
197
+ item_id = read_int(n: 2)
198
198
  when 3
199
- item_id = read_int_32
199
+ item_id = read_int
200
200
  else
201
201
  return # wrong version according to standards, hence return
202
202
  end
@@ -217,12 +217,12 @@ class FormatParser::HEIFParser
217
217
  end
218
218
 
219
219
  def read_primary_item_box
220
- version = read_int_8
220
+ version = read_int(n: 1)
221
221
  skip_bytes(3) # flags, always 0 in this current box
222
222
  @primary_item_id = if version == 0
223
- read_int_16
223
+ read_int(n: 2)
224
224
  else
225
- read_int_32
225
+ read_int
226
226
  end
227
227
  end
228
228
 
@@ -232,17 +232,17 @@ class FormatParser::HEIFParser
232
232
  # and in order to output relevant data from the format_parser we need all the properties associated to the primary_item.
233
233
  # Hence the need of the association between an item and its properties, found in the ITEM_PROPERTIES_ASSOCIATION_BOX
234
234
  def read_item_properties_box
235
- ipco_length = read_int_32
235
+ ipco_length = read_int
236
236
  return unless read_string(4) == ITEM_PROPERTIES_CONTAINER_BOX
237
237
  read_item_properties_container_box(ipco_length)
238
- read_int_32 # ipma_length
238
+ read_int # ipma_length
239
239
  return unless read_string(4) == ITEM_PROPERTIES_ASSOCIATION_BOX
240
240
  read_item_properties_association_box
241
241
  end
242
242
 
243
243
  def read_item_properties_container_box(box_length)
244
244
  end_of_ipco_box = @buf.pos + box_length - HEADER_LENGTH
245
- item_prop_length = read_int_32
245
+ item_prop_length = read_int
246
246
  item_prop_name = read_string(4)
247
247
  item_prop_start_pos = @buf.pos
248
248
  item_prop_index = 1
@@ -250,16 +250,16 @@ class FormatParser::HEIFParser
250
250
  case item_prop_name
251
251
  when IMAGE_SPATIAL_EXTENTS_BOX
252
252
  read_nil_version_and_flag
253
- width = read_int_32
254
- height = read_int_32
253
+ width = read_int
254
+ height = read_int
255
255
  @item_props[item_prop_index] = {
256
256
  type: IMAGE_SPATIAL_EXTENTS_BOX,
257
257
  width: width,
258
258
  height: height
259
259
  }
260
260
  when PIXEL_ASPECT_RATIO_BOX
261
- h_spacing = read_int_32
262
- v_spacing = read_int_32
261
+ h_spacing = read_int
262
+ v_spacing = read_int
263
263
  pixel_aspect_ratio = "#{h_spacing}/#{v_spacing}"
264
264
  @item_props[item_prop_index] = {
265
265
  type: PIXEL_ASPECT_RATIO_BOX,
@@ -267,9 +267,9 @@ class FormatParser::HEIFParser
267
267
  }
268
268
  when COLOUR_INFO_BOX
269
269
  colour_info = {
270
- colour_primaries: read_int_16,
271
- transfer_characteristics: read_int_16,
272
- matrix_coefficients: read_int_16
270
+ colour_primaries: read_int(n: 2),
271
+ transfer_characteristics: read_int(n: 2),
272
+ matrix_coefficients: read_int(n: 2)
273
273
  }
274
274
  @item_props[item_prop_index] = {
275
275
  type: COLOUR_INFO_BOX,
@@ -278,12 +278,12 @@ class FormatParser::HEIFParser
278
278
  when PIXEL_INFO_BOX
279
279
  pixel_info = []
280
280
  read_nil_version_and_flag
281
- num_channels = read_int_8
281
+ num_channels = read_int(n: 1)
282
282
  channel = 1
283
283
  while channel <= num_channels
284
284
  channel += 1
285
285
  pixel_info << {
286
- "bits_in_channel_#{channel}": read_int_8
286
+ "bits_in_channel_#{channel}": read_int(n: 1)
287
287
  }
288
288
  end
289
289
  @item_props[item_prop_index] = {
@@ -292,8 +292,8 @@ class FormatParser::HEIFParser
292
292
  }
293
293
  when RELATIVE_LOCATION_BOX
294
294
  read_nil_version_and_flag
295
- horizontal_offset = read_int_32
296
- vertical_offset = read_int_32
295
+ horizontal_offset = read_int
296
+ vertical_offset = read_int
297
297
  @item_props[item_prop_index] = {
298
298
  type: RELATIVE_LOCATION_BOX,
299
299
  horizontal_offset: horizontal_offset,
@@ -302,14 +302,14 @@ class FormatParser::HEIFParser
302
302
  when CLEAN_APERTURE_BOX
303
303
  clean_aperture = []
304
304
  clean_aperture << {
305
- clean_aperture_width_n: read_int_32,
306
- clean_aperture_width_d: read_int_32,
307
- clean_aperture_height_n: read_int_32,
308
- clean_aperture_height_d: read_int_32,
309
- horiz_off_n: read_int_32,
310
- horiz_off_d: read_int_32,
311
- vert_off_n: read_int_32,
312
- vert_off_d: read_int_32
305
+ clean_aperture_width_n: read_int,
306
+ clean_aperture_width_d: read_int,
307
+ clean_aperture_height_n: read_int,
308
+ clean_aperture_height_d: read_int,
309
+ horiz_off_n: read_int,
310
+ horiz_off_d: read_int,
311
+ vert_off_n: read_int,
312
+ vert_off_d: read_int
313
313
  }
314
314
  @item_props[item_prop_index] = {
315
315
  type: CLEAN_APERTURE_BOX,
@@ -317,7 +317,7 @@ class FormatParser::HEIFParser
317
317
  }
318
318
  when IMAGE_ROTATION_BOX
319
319
  read_nil_version_and_flag
320
- binary = convert_byte_to_binary(read_int_8)
320
+ binary = convert_byte_to_binary(read_int(n: 1))
321
321
  # we need only the last 2 bits to retrieve the angle multiplier. angle multiplier * 90 specifies the angle
322
322
  rotation = binary.slice(6, 2).join.to_i(2) * 90
323
323
  @item_props[item_prop_index] = {
@@ -331,24 +331,24 @@ class FormatParser::HEIFParser
331
331
  end
332
332
 
333
333
  def read_item_properties_association_box
334
- version = read_int_8
334
+ version = read_int(n: 1)
335
335
  skip_bytes(2) # we skip the first 2 bytes of the flags (total of 3 bytes) cause we care only about the least significant bit
336
- flags = read_int_8
337
- entry_count = read_int_32
336
+ flags = read_int(n: 1)
337
+ entry_count = read_int
338
338
  item_id = 0
339
339
  entry_count.times do
340
340
  item_id = if version == 0
341
- read_int_16
341
+ read_int(n: 2)
342
342
  else
343
- read_int_32
343
+ read_int
344
344
  end
345
345
 
346
- association_count = read_int_8
346
+ association_count = read_int(n: 1)
347
347
  association_count.times do
348
348
  # we need to retrieve the "essential" bit which is just the first bit in the next byte
349
- binary = convert_byte_to_binary(read_int_8)
349
+ binary = convert_byte_to_binary(read_int(n: 1))
350
350
  # essential_bit = binary[0] # uncomment if needed
351
- binary.concat(convert_byte_to_binary(read_int_8)) if (flags & 1) == 1 # if flag is 1 we need the next 15 bits instead of only the next 7 bits
351
+ binary.concat(convert_byte_to_binary(read_int(n: 1))) if (flags & 1) == 1 # if flag is 1 we need the next 15 bits instead of only the next 7 bits
352
352
  # we need to nullify the 1st bit since that one was the essential bit and doesn't count now to calculate the property index
353
353
  binary[0] = 0
354
354
  item_property_index = binary.join.to_i(2)
@@ -401,7 +401,7 @@ class FormatParser::HEIFParser
401
401
  skip_pos = box_start_pos + box_length - HEADER_LENGTH
402
402
  @buf.seek(skip_pos)
403
403
  return if skip_pos >= end_pos_upper_box
404
- next_box_length = read_int_32
404
+ next_box_length = read_int
405
405
  next_box_name = read_string(4)
406
406
  [next_box_length, next_box_name, @buf.pos]
407
407
  end
@@ -0,0 +1,80 @@
1
+ module FormatParser
2
+ module ISOBaseMediaFileFormat
3
+ class Box < Struct.new(:type, :position, :size, :fields, :children)
4
+ def initialize(*args)
5
+ super
6
+ self.fields ||= {}
7
+ self.children ||= []
8
+ end
9
+
10
+ # Return all children with one of the given type(s).
11
+ #
12
+ # @param [Array<String>] types The box type(s) to search for.
13
+ # @return [Array<Box>]
14
+ def all_children(*types)
15
+ children.select { |child| types.include?(child.type) }
16
+ end
17
+
18
+ # Returns true if there are one or more children with the given type.
19
+ #
20
+ # @param [String] type The box type to search for.
21
+ # @return [Boolean]
22
+ def child?(type)
23
+ children.any? { |child| child.type == type }
24
+ end
25
+
26
+ # Return the first child with one of the given types.
27
+ #
28
+ # @param [Array<String>] types The box type(s) to search for.
29
+ # @return [Box, nil]
30
+ def first_child(*types)
31
+ children.find { |child| types.include?(child.type) }
32
+ end
33
+
34
+ # Find and return all descendents of a given type.
35
+ #
36
+ # @param [Array<String>] types The box type(s) to search for.
37
+ # @return [Array<Box>]
38
+ def all_descendents(*types)
39
+ children.flat_map do |child|
40
+ descendents = child.all_descendents(*types)
41
+ types.include?(child.type) ? [child] + descendents : descendents
42
+ end
43
+ end
44
+
45
+ # Find and return all descendents that exist at the given path.
46
+ #
47
+ # @param [Array<String>] path The path to search at.
48
+ # @return [Array<Box>]
49
+ def all_descendents_by_path(path)
50
+ return [] if path.empty?
51
+ next_type, *remaining_path = path
52
+ matching_children = all_children(next_type)
53
+ return matching_children if remaining_path.empty?
54
+ matching_children.flat_map { |child| child.all_descendents_by_path(remaining_path) }
55
+ end
56
+
57
+ # Find and return the first descendent (using depth-first search) of a given type.
58
+ #
59
+ # @param [Array<String>] types The box type(s) to search for.
60
+ # @return [Box, nil]
61
+ def first_descendent(*types)
62
+ children.each do |child|
63
+ return child if types.include?(child.type)
64
+ if (descendent = child.first_descendent(*types))
65
+ return descendent
66
+ end
67
+ end
68
+ nil
69
+ end
70
+
71
+ # Find and return the first descendent that exists at the given path.
72
+ #
73
+ # @param [Array<String>] path The path to search at.
74
+ # @return [Box, nil]
75
+ def first_descendent_by_path(path)
76
+ all_descendents_by_path(path)[0]
77
+ end
78
+ end
79
+ end
80
+ end