format_parser 2.3.0 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/README.md +13 -6
- data/format_parser.gemspec +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/io_utils.rb +18 -33
- data/lib/parsers/cr3_parser/decoder.rb +2 -2
- data/lib/parsers/cr3_parser.rb +13 -11
- data/lib/parsers/heif_parser.rb +46 -46
- data/lib/parsers/iso_base_media_file_format/box.rb +80 -0
- data/lib/parsers/iso_base_media_file_format/decoder.rb +348 -377
- data/lib/parsers/iso_base_media_file_format/utils.rb +89 -0
- data/lib/parsers/mov_parser/decoder.rb +53 -0
- data/lib/parsers/mov_parser.rb +48 -0
- data/lib/parsers/mp4_parser.rb +80 -0
- data/lib/parsers/pdf_parser.rb +5 -2
- data/lib/parsers/webp_parser.rb +2 -2
- data/spec/format_parser_spec.rb +1 -1
- data/spec/parsers/cr3_parser_spec.rb +3 -3
- data/spec/parsers/iso_base_media_file_format/box_spec.rb +399 -0
- data/spec/parsers/iso_base_media_file_format/decoder_spec.rb +117 -151
- data/spec/parsers/iso_base_media_file_format/utils_spec.rb +632 -0
- data/spec/parsers/mov_parser_spec.rb +139 -0
- data/spec/parsers/mp4_parser_spec.rb +188 -0
- data/spec/parsers/pdf_parser_spec.rb +37 -23
- metadata +25 -5
- data/lib/parsers/moov_parser/decoder.rb +0 -353
- data/lib/parsers/moov_parser.rb +0 -165
- data/spec/parsers/moov_parser_spec.rb +0 -144
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'matrix'
|
4
|
+
|
5
|
+
module FormatParser
  module ISOBaseMediaFileFormat
    # Helpers that derive summary media properties (dimensions, duration, frame rate and codecs) from a decoded box
    # tree.
    #
    # Every public method takes `box_tree`, an enumerable of top-level boxes. The boxes are expected to respond to
    # `type`, `fields`, `children`, `dig`, `first_child`, `all_children`, `all_descendents`,
    # `first_descendent_by_path` and `all_descendents_by_path`, since those are the calls made below.
    module Utils
      # Used whenever a `mvhd` or `tkhd` box carries no transformation matrix of its own.
      IDENTITY_MATRIX = Matrix.identity(3)

      # Computes the size of the bounding box that encloses every video track once the track matrix and the movie
      # matrix have been applied to the track's corner points.
      #
      # @param box_tree [Enumerable] the decoded top-level boxes
      # @return [Array(Numeric, Numeric), nil] `[width, height]`, or nil when there is no `moov` box or no video
      #   track with both a width and a height
      def dimensions(box_tree)
        moov_box = box_tree.find { |box| box.type == 'moov' }
        return unless moov_box

        movie_matrix = moov_box.first_child('mvhd')&.dig(:fields, :matrix) || IDENTITY_MATRIX

        # Collect the transformed corner points of every usable video track.
        corners = video_trak_boxes(box_tree).flat_map do |trak_box|
          tkhd_box = trak_box.first_child('tkhd')
          next [] unless tkhd_box

          width = tkhd_box.fields[:width]
          height = tkhd_box.fields[:height]
          next [] unless width && height

          track_matrix = tkhd_box.fields[:matrix] || IDENTITY_MATRIX
          [[0, 0], [0, height], [width, 0], [width, height]].map do |corner|
            # Row vector [x, y, 1] times the track matrix, then the movie matrix; keep only the x and y results.
            (Matrix[[*corner, 1]] * track_matrix * movie_matrix).to_a[0][0..1]
          end
        end
        return if corners.empty?

        xs, ys = corners.transpose
        [xs.max - xs.min, ys.max - ys.min]
      end

      # Computes the movie duration in seconds from the `duration` and `timescale` fields of the `mvhd` box.
      #
      # @param box_tree [Enumerable] the decoded top-level boxes
      # @return [Float, nil] nil when the `mvhd` box or either field is missing, or when the timescale is zero
      #   (dividing by it would produce Infinity or NaN rather than a usable duration)
      def duration(box_tree)
        mvhd_box = box_tree.find { |box| box.type == 'moov' }&.first_child('mvhd')
        return unless mvhd_box

        duration = mvhd_box.fields[:duration]
        timescale = mvhd_box.fields[:timescale]&.to_f
        return unless duration && timescale
        return if timescale.zero?

        duration / timescale
      end

      # Computes the frame rate of the first video track that provides enough information, as the media timescale
      # divided by the sample delta of the first `stts` entry, truncated to two decimal places.
      #
      # @param box_tree [Enumerable] the decoded top-level boxes
      # @return [Float, nil] nil when no video track yields a non-zero timescale and sample delta
      def frame_rate(box_tree)
        video_trak_boxes(box_tree).each do |trak_box|
          mdhd_box = trak_box.first_descendent_by_path(%w[mdia mdhd])
          stts_box = trak_box.first_descendent_by_path(%w[mdia minf stbl stts])
          next unless mdhd_box && stts_box

          timescale = mdhd_box.fields[:timescale]&.to_f
          sample_delta = stts_box.dig(:fields, :entries, 0, :sample_delta)
          next unless timescale && sample_delta
          # A zero on either side would give Infinity, NaN or a meaningless rate of zero; try the next track.
          next if timescale.zero? || sample_delta.zero?

          return (timescale / sample_delta).truncate(2)
        end
        nil
        # TODO: Properly account for and represent variable frame-rates.
      end

      # Lists the distinct types of the boxes found directly under the `stsd` boxes of every video track.
      #
      # @param box_tree [Enumerable] the decoded top-level boxes
      # @return [Array] unique, non-nil child box types; empty when there are no video tracks
      def video_codecs(box_tree)
        video_trak_boxes(box_tree).flat_map do |trak_box|
          trak_box.all_descendents_by_path(%w[mdia minf stbl stsd]).flat_map { |stsd_box| stsd_box.children.map(&:type) }
        end.compact.uniq
      end

      private

      # Find any and all `trak` boxes containing a video media handler.
      #
      # @param box_tree [Enumerable] the decoded top-level boxes
      # @return [Array] the matching `trak` boxes; empty when there is no `moov` box
      def video_trak_boxes(box_tree)
        moov_box = box_tree.find { |box| box.type == 'moov' }
        return [] unless moov_box

        moov_box.all_children('trak').select do |trak_box|
          trak_box.all_descendents('hdlr').any? do |hdlr_box|
            hdlr_fields = hdlr_box.fields
            if hdlr_fields.key?(:component_type) && hdlr_fields.key?(:component_subtype) # MOV
              hdlr_fields[:component_type] == 'mhlr' && hdlr_fields[:component_subtype] == 'vide'
            else
              hdlr_fields[:handler_type] == 'vide'
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'parsers/iso_base_media_file_format/decoder'
|
2
|
+
|
3
|
+
# MOV-specific decoder: overrides the `hdlr`, `mvhd` and `tkhd` box readers of the ISO base media file format decoder.
#
# Each reader returns a two-element array `[fields, nil]`.
# NOTE(review): the second element is presumably the list of child boxes (none for these box types) - confirm
# against the base decoder.
#
# The reads below are strictly order-dependent: every helper call consumes bytes from the underlying stream, so the
# entries of each hash must stay in this exact sequence.
class FormatParser::MOVParser::Decoder < FormatParser::ISOBaseMediaFileFormat::Decoder
  protected

  # Reads a handler reference box.
  #
  # @param size [Integer] size handed in by the caller; whatever follows the 24 bytes read first is taken as the
  #   component name
  # @return [Array] `[fields, nil]`
  def hdlr(size)
    header = read_version_and_flags
    fields = header.merge(
      component_type: read_string(4),
      component_subtype: read_string(4),
      component_manufacturer: read_bytes(4),
      component_flags: read_bytes(4),
      component_flags_mask: read_bytes(4),
      component_name: read_string(size - 24)
    )
    [fields, nil]
  end

  # Reads a movie header box.
  #
  # @return [Array] `[fields, nil]`
  def mvhd(_)
    header = read_version_and_flags
    fields = header.merge(
      creation_time: read_int,
      modification_time: read_int,
      timescale: read_int,
      duration: read_int,
      rate: read_fixed_point(n: 4),
      volume: read_fixed_point(n: 2, signed: true),
      # NOTE(review): presumably skips 10 bytes and then returns the block's value - confirm in IOUtils.
      matrix: skip_bytes(10) { read_matrix },
      preview_time: read_int,
      preview_duration: read_int,
      poster_time: read_int,
      selection_time: read_int,
      selection_duration: read_int,
      current_time: read_int,
      next_trak_id: read_int
    )
    [fields, nil]
  end

  # Reads a track header box.
  #
  # @return [Array] `[fields, nil]`
  def tkhd(_)
    header = read_version_and_flags
    fields = header.merge(
      creation_time: read_int,
      modification_time: read_int,
      track_id: read_int,
      duration: skip_bytes(4) { read_int },
      layer: skip_bytes(8) { read_int(n: 2) },
      alternate_group: read_int(n: 2),
      volume: read_fixed_point(n: 2, signed: true),
      matrix: skip_bytes(2) { read_matrix },
      width: read_fixed_point(n: 4),
      height: read_fixed_point(n: 4)
    )
    [fields, nil]
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'parsers/iso_base_media_file_format/utils'
|
2
|
+
|
3
|
+
# Parser for QuickTime (MOV) files. A file is accepted only when the 8 bytes following the first 4 bytes equal
# MAGIC_BYTES; the box tree is then decoded and summarised into a FormatParser::Video.
class FormatParser::MOVParser
  include FormatParser::IOUtils
  include FormatParser::ISOBaseMediaFileFormat::Utils
  require_relative 'mov_parser/decoder'

  # Box type `ftyp` followed by the QuickTime major brand. The brand is `qt` padded with TWO trailing spaces to
  # four bytes, so this literal must be exactly 8 bytes long: it is compared with an 8-byte read below, and a
  # shorter literal can never match.
  MAGIC_BYTES = 'ftypqt  '
  MOV_MIME_TYPE = 'video/quicktime'

  # @param filename [String]
  # @return [Boolean] whether the filename ends in a .mov, .moov or .qt extension (case-insensitive)
  def likely_match?(filename)
    /\.(mov|moov|qt)$/i.match?(filename)
  end

  # Parses the given IO.
  #
  # @param io the IO to read from; it is wrapped in a FormatParser::IOConstraint
  # @return [FormatParser::Video, nil] nil when the magic bytes do not match
  def call(io)
    @buf = FormatParser::IOConstraint.new(io)

    return unless matches_mov_definition?

    box_tree = Measurometer.instrument('format_parser.mov_parser.decoder.build_box_tree') do
      Decoder.new.build_box_tree(0xffffffff, @buf)
    end

    width, height = dimensions(box_tree)

    FormatParser::Video.new(
      format: :mov,
      width_px: width,
      height_px: height,
      frame_rate: frame_rate(box_tree),
      media_duration_seconds: duration(box_tree),
      content_type: MOV_MIME_TYPE,
      codecs: video_codecs(box_tree),
      intrinsics: box_tree
    )
  end

  private

  # Checks the bytes at offsets 4..11 against MAGIC_BYTES, then rewinds the buffer to the start so that decoding
  # begins at the first box.
  def matches_mov_definition?
    skip_bytes(4)
    matches = read_string(8) == MAGIC_BYTES
    @buf.seek(0)
    matches
  end

  FormatParser.register_parser new, natures: [:video], formats: [:mov], priority: 3
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require_relative 'iso_base_media_file_format/decoder'
|
2
|
+
|
3
|
+
# Parser for MP4-family files. The major brand of the `ftyp` box selects the reported format (see BRAND_FORMATS),
# which in turn decides whether a FormatParser::Video or a FormatParser::Audio is returned.
class FormatParser::MP4Parser
  include FormatParser::IOUtils
  include FormatParser::ISOBaseMediaFileFormat
  include FormatParser::ISOBaseMediaFileFormat::Utils

  # Matched against the 8 bytes that follow the first 4 bytes of the file: `ftyp` plus a supported major brand.
  MAGIC_BYTES = /^ftyp(iso[m2]|mp4[12]|m4[abprv] )$/i

  # Maps a lower-cased major brand to the format reported to callers.
  BRAND_FORMATS = {
    'isom' => :mp4, # Prohibited as a major brand by ISO/IEC 14496-12 sec 6.3 paragraph 2, but occasionally used.
    'iso2' => :mp4, # Prohibited as a major brand by ISO/IEC 14496-12 sec 6.3 paragraph 2, but occasionally used.
    'mp41' => :mp4,
    'mp42' => :mp4,
    'm4a ' => :m4a,
    'm4b ' => :m4b, # iTunes audiobooks
    'm4p ' => :m4p, # iTunes audio
    'm4r ' => :m4r, # iTunes ringtones
    'm4v ' => :m4v, # iTunes video
  }
  AUDIO_FORMATS = Set[:m4a, :m4b, :m4p, :m4r]
  VIDEO_FORMATS = Set[:mp4, :m4v]

  AUDIO_MIMETYPE = 'audio/mp4'
  VIDEO_MIMETYPE = 'video/mp4'

  # @param filename [String]
  # @return [Boolean] whether the filename ends in .mp4 or .m4a/.m4b/.m4p/.m4r/.m4v (case-insensitive)
  def likely_match?(filename)
    /\.(mp4|m4[abprv])$/i.match?(filename)
  end

  # Parses the given IO.
  #
  # @param io the IO to read from; it is wrapped in a FormatParser::IOConstraint
  # @return [FormatParser::Video, FormatParser::Audio, nil] nil when the magic bytes do not match or the major
  #   brand maps to no known format
  def call(io)
    @buf = FormatParser::IOConstraint.new(io)

    return unless matches_mp4_definition?

    box_tree = Measurometer.instrument('format_parser.mp4_parser.decoder.build_box_tree') do
      Decoder.new.build_box_tree(0xffffffff, @buf)
    end

    detected_format = file_format(box_tree)
    # `when` uses Set#===, i.e. membership of the detected format in the set.
    case detected_format
    when VIDEO_FORMATS
      width, height = dimensions(box_tree)
      FormatParser::Video.new(
        codecs: video_codecs(box_tree),
        content_type: VIDEO_MIMETYPE,
        format: detected_format,
        frame_rate: frame_rate(box_tree),
        height_px: height,
        intrinsics: box_tree,
        media_duration_seconds: duration(box_tree),
        width_px: width,
      )
    when AUDIO_FORMATS
      FormatParser::Audio.new(
        content_type: AUDIO_MIMETYPE,
        format: detected_format,
        intrinsics: box_tree,
        media_duration_seconds: duration(box_tree),
      )
    end
  end

  private

  # Looks up the format for the major brand of the `ftyp` box.
  #
  # `dig` is used so that a tree without an `ftyp` box (or an `ftyp` box without fields) yields nil. Chaining
  # `&.fields[:major_brand]` would not do that: `&.` only guards the `fields` call, and the following `[]` would
  # then be sent to nil.
  #
  # @param box_tree [Enumerable] the decoded top-level boxes
  # @return [Symbol, nil]
  def file_format(box_tree)
    major_brand = box_tree.find { |box| box.type == 'ftyp' }&.dig(:fields, :major_brand)
    BRAND_FORMATS[major_brand.downcase] if major_brand
  end

  # Checks the bytes at offsets 4..11 against MAGIC_BYTES, then rewinds the buffer to the start so that decoding
  # begins at the first box.
  def matches_mp4_definition?
    skip_bytes(4)
    matches = MAGIC_BYTES.match?(read_string(8))
    @buf.seek(0)
    matches
  end

  FormatParser.register_parser new, natures: [:audio, :video], formats: BRAND_FORMATS.values.uniq, priority: 3
end
|
data/lib/parsers/pdf_parser.rb
CHANGED
@@ -6,7 +6,7 @@ class FormatParser::PDFParser
|
|
6
6
|
#
|
7
7
|
# There are however exceptions, which are left out for now.
|
8
8
|
#
|
9
|
-
PDF_MARKER = /%PDF-
|
9
|
+
PDF_MARKER = /%PDF-[12]\.[0-8]{1}/
|
10
10
|
PDF_CONTENT_TYPE = 'application/pdf'
|
11
11
|
|
12
12
|
def likely_match?(filename)
|
@@ -16,9 +16,12 @@ class FormatParser::PDFParser
|
|
16
16
|
def call(io)
|
17
17
|
io = FormatParser::IOConstraint.new(io)
|
18
18
|
|
19
|
-
|
19
|
+
header = safe_read(io, 9)
|
20
|
+
return unless header =~ PDF_MARKER
|
20
21
|
|
21
22
|
FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
|
23
|
+
rescue FormatParser::IOUtils::InvalidRead
|
24
|
+
nil
|
22
25
|
end
|
23
26
|
|
24
27
|
FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 3
|
data/lib/parsers/webp_parser.rb
CHANGED
@@ -69,7 +69,7 @@ class FormatParser::WebpParser
|
|
69
69
|
# The subsequent 4 bytes contain the image width and height, respectively, as 14-bit unsigned little endian
|
70
70
|
# integers (minus one). The 4 remaining bits consist of a 1-bit flag indicating whether alpha is used, and a 3-bit
|
71
71
|
# version that is always zero.
|
72
|
-
dimensions =
|
72
|
+
dimensions = read_int(big_endian: false)
|
73
73
|
width = (dimensions & 0x3fff) + 1
|
74
74
|
height = (dimensions >> 14 & 0x3fff) + 1
|
75
75
|
has_transparency = (dimensions >> 28 & 0x1) == 1
|
@@ -92,7 +92,7 @@ class FormatParser::WebpParser
|
|
92
92
|
# - E = Set if file contains Exif metadata.
|
93
93
|
# - X = Set if file contains XMP metadata.
|
94
94
|
# - A = Set if file is an animated image.
|
95
|
-
flags =
|
95
|
+
flags = read_int(n: 1)
|
96
96
|
has_transparency = flags & 0x10 != 0
|
97
97
|
has_exif_metadata = flags & 0x08 != 0
|
98
98
|
has_xmp_metadata = flags & 0x04 != 0
|
data/spec/format_parser_spec.rb
CHANGED
@@ -184,7 +184,7 @@ describe FormatParser do
|
|
184
184
|
'FormatParser::GIFParser',
|
185
185
|
'Class',
|
186
186
|
'FormatParser::PNGParser',
|
187
|
-
'FormatParser::
|
187
|
+
'FormatParser::MP4Parser',
|
188
188
|
'FormatParser::CR2Parser',
|
189
189
|
'FormatParser::CR3Parser',
|
190
190
|
'FormatParser::DPXParser',
|
@@ -18,9 +18,9 @@ describe FormatParser::CR3Parser do
|
|
18
18
|
expect(result).to be_nil
|
19
19
|
end
|
20
20
|
|
21
|
-
it 'should return nil if no CMT1
|
21
|
+
it 'should return nil if no CMT1 box is present' do
|
22
22
|
# This is a MOV file with the ftyp header modified to masquerade as a CR3 file. It is therefore missing the
|
23
|
-
# CR3-specific CMT1
|
23
|
+
# CR3-specific CMT1 box containing the image metadata.
|
24
24
|
result = subject.call(File.open(fixtures_dir + '/CR3/invalid'))
|
25
25
|
expect(result).to be_nil
|
26
26
|
end
|
@@ -50,7 +50,7 @@ describe FormatParser::CR3Parser do
|
|
50
50
|
expect(result.display_height_px).to eq(4000)
|
51
51
|
expect(result.content_type).to eq('image/x-canon-cr3')
|
52
52
|
expect(result.intrinsics).not_to be_nil
|
53
|
-
expect(result.intrinsics[:
|
53
|
+
expect(result.intrinsics[:box_tree]).not_to be_nil
|
54
54
|
expect(result.intrinsics[:exif]).not_to be_nil
|
55
55
|
expect(result.intrinsics[:exif][:image_length]).to eq(result.height_px)
|
56
56
|
expect(result.intrinsics[:exif][:image_width]).to eq(result.width_px)
|