format_parser 2.3.0 → 2.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +13 -6
- data/format_parser.gemspec +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/io_utils.rb +18 -33
- data/lib/parsers/cr3_parser/decoder.rb +2 -2
- data/lib/parsers/cr3_parser.rb +13 -11
- data/lib/parsers/heif_parser.rb +46 -46
- data/lib/parsers/iso_base_media_file_format/box.rb +80 -0
- data/lib/parsers/iso_base_media_file_format/decoder.rb +342 -376
- data/lib/parsers/iso_base_media_file_format/utils.rb +89 -0
- data/lib/parsers/mov_parser/decoder.rb +53 -0
- data/lib/parsers/mov_parser.rb +48 -0
- data/lib/parsers/mp4_parser.rb +80 -0
- data/lib/parsers/pdf_parser.rb +5 -2
- data/lib/parsers/webp_parser.rb +2 -2
- data/spec/format_parser_spec.rb +1 -1
- data/spec/parsers/cr3_parser_spec.rb +3 -3
- data/spec/parsers/iso_base_media_file_format/box_spec.rb +399 -0
- data/spec/parsers/iso_base_media_file_format/decoder_spec.rb +53 -178
- data/spec/parsers/iso_base_media_file_format/utils_spec.rb +632 -0
- data/spec/parsers/mov_parser_spec.rb +90 -0
- data/spec/parsers/mp4_parser_spec.rb +114 -0
- data/spec/parsers/pdf_parser_spec.rb +37 -23
- metadata +25 -5
- data/lib/parsers/moov_parser/decoder.rb +0 -353
- data/lib/parsers/moov_parser.rb +0 -165
- data/spec/parsers/moov_parser_spec.rb +0 -144
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'matrix'
|
4
|
+
|
5
|
+
module FormatParser
  module ISOBaseMediaFileFormat
    # Helpers that derive media attributes (dimensions, duration, frame rate, codecs) from a decoded box tree.
    #
    # Every public method takes +box_tree+, an enumerable of top-level boxes. Boxes are duck-typed: this module only
    # calls +type+, +fields+, +children+, +dig+, +first_child+, +all_children+, +all_descendents+,
    # +first_descendent_by_path+ and +all_descendents_by_path+ on them.
    module Utils
      IDENTITY_MATRIX = Matrix.identity(3)

      # Computes the size of the bounding box that encloses every video track after its track matrix (+tkhd+) and the
      # movie matrix (+mvhd+) have been applied to the track's corner points.
      #
      # @param box_tree [Enumerable] top-level boxes
      # @return [Array(Numeric, Numeric), nil] +[width, height]+, or nil when there is no +moov+ box or no video track
      #   carrying both a width and a height
      def dimensions(box_tree)
        moov_box = box_tree.find { |box| box.type == 'moov' }
        return unless moov_box

        movie_matrix = moov_box.first_child('mvhd')&.dig(:fields, :matrix) || IDENTITY_MATRIX
        bounds = video_trak_boxes(box_tree).each_with_object({}) do |trak_box, acc|
          tkhd_box = trak_box.first_child('tkhd')
          next unless tkhd_box

          width = tkhd_box.fields[:width]
          height = tkhd_box.fields[:height]
          next unless width && height

          track_matrix = tkhd_box.fields[:matrix] || IDENTITY_MATRIX
          # Transform each corner of the track rectangle and widen the running bounds to include it.
          [[0, 0], [0, height], [width, 0], [width, height]].each do |corner|
            x, y = (Matrix[[*corner, 1]] * track_matrix * movie_matrix).to_a[0][0..1]
            acc[:min_x] = x if !acc[:min_x] || x < acc[:min_x]
            acc[:max_x] = x if !acc[:max_x] || x > acc[:max_x]
            acc[:min_y] = y if !acc[:min_y] || y < acc[:min_y]
            acc[:max_y] = y if !acc[:max_y] || y > acc[:max_y]
          end
        end
        return if bounds.empty?

        [bounds[:max_x] - bounds[:min_x], bounds[:max_y] - bounds[:min_y]]
      end

      # Computes the movie duration in seconds from the +mvhd+ box (+duration+ divided by +timescale+).
      #
      # @param box_tree [Enumerable] top-level boxes
      # @return [Float, nil] nil when the +mvhd+ box or either field is missing, or when the timescale is zero
      def duration(box_tree)
        mvhd_box = box_tree.find { |box| box.type == 'moov' }&.first_child('mvhd')
        return unless mvhd_box

        ticks = mvhd_box.fields[:duration]
        timescale = mvhd_box.fields[:timescale]&.to_f
        return unless ticks && timescale
        # A zero timescale would otherwise produce Infinity or NaN rather than a usable duration.
        return if timescale.zero?

        ticks / timescale
      end

      # Computes the frame rate of the first video track that provides the required fields, as the +mdhd+ timescale
      # divided by the first +stts+ entry's sample delta, truncated to two decimal places.
      #
      # @param box_tree [Enumerable] top-level boxes
      # @return [Float, nil] nil when no video track yields a usable timescale and non-zero sample delta
      def frame_rate(box_tree)
        video_trak_boxes(box_tree).each do |trak_box|
          mdhd_box = trak_box.first_descendent_by_path(%w[mdia mdhd])
          stts_box = trak_box.first_descendent_by_path(%w[mdia minf stbl stts])
          next unless mdhd_box && stts_box

          timescale = mdhd_box.fields[:timescale]&.to_f
          sample_delta = stts_box.dig(:fields, :entries, 0, :sample_delta)
          next unless timescale && sample_delta
          # A zero sample delta would otherwise produce Infinity or NaN; try the next track instead.
          next if sample_delta.zero?

          return (timescale / sample_delta).truncate(2)
        end
        nil
        # TODO: Properly account for and represent variable frame-rates.
      end

      # Lists the distinct types of the sample entries found under each video track's +stsd+ boxes.
      #
      # @param box_tree [Enumerable] top-level boxes
      # @return [Array] unique, non-nil child types of every +mdia/minf/stbl/stsd+ box in the video tracks
      def video_codecs(box_tree)
        video_trak_boxes(box_tree).flat_map do |trak_box|
          trak_box.all_descendents_by_path(%w[mdia minf stbl stsd]).flat_map { |stsd_box| stsd_box.children.map(&:type) }
        end.compact.uniq
      end

      private

      # Find any and all `trak` boxes containing a video media handler.
      #
      # A handler box that carries both +component_type+ and +component_subtype+ fields (MOV) must be 'mhlr'/'vide';
      # any other handler box is matched on +handler_type+ being 'vide'.
      def video_trak_boxes(box_tree)
        moov_box = box_tree.find { |box| box.type == 'moov' }
        return [] unless moov_box

        moov_box.all_children('trak').select do |trak_box|
          trak_box.all_descendents('hdlr').any? do |hdlr_box|
            hdlr_fields = hdlr_box.fields
            if hdlr_fields.include?(:component_type) && hdlr_fields.include?(:component_subtype) # MOV
              hdlr_fields[:component_type] == 'mhlr' && hdlr_fields[:component_subtype] == 'vide'
            else
              hdlr_fields[:handler_type] == 'vide'
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'parsers/iso_base_media_file_format/decoder'
|
2
|
+
|
3
|
+
# Decoder that overrides the +hdlr+, +mvhd+ and +tkhd+ box readers of the ISO base media file format decoder with
# MOV-specific field layouts. Each reader returns a two-element array whose first element is the hash of decoded
# fields and whose second element is nil.
#
# The fields of every reader are consumed from the underlying stream in exactly the order they are written below, so
# the order of the hash entries must not be changed.
class FormatParser::MOVParser::Decoder < FormatParser::ISOBaseMediaFileFormat::Decoder
  protected

  # Reads a handler box: version/flags, component type and subtype, manufacturer, flags, flags mask and name.
  #
  # @param size [Integer] used to derive the length of the trailing component name (+size - 24+)
  #   NOTE(review): the 24 presumably accounts for the fixed-size fields read before the name - confirm.
  def hdlr(size)
    header = read_version_and_flags
    component = {
      component_type: read_string(4),
      component_subtype: read_string(4),
      component_manufacturer: read_bytes(4),
      component_flags: read_bytes(4),
      component_flags_mask: read_bytes(4),
      component_name: read_string(size - 24)
    }
    [header.merge(component), nil]
  end

  # Reads a movie header box. The size argument is ignored.
  def mvhd(_)
    header = read_version_and_flags
    movie = {
      creation_time: read_int,
      modification_time: read_int,
      timescale: read_int,
      duration: read_int,
      rate: read_fixed_point(n: 4),
      volume: read_fixed_point(n: 2, signed: true),
      # NOTE(review): skip_bytes presumably skips the given byte count before yielding to the block - confirm.
      matrix: skip_bytes(10) { read_matrix },
      preview_time: read_int,
      preview_duration: read_int,
      poster_time: read_int,
      selection_time: read_int,
      selection_duration: read_int,
      current_time: read_int,
      next_trak_id: read_int
    }
    [header.merge(movie), nil]
  end

  # Reads a track header box. The size argument is ignored.
  def tkhd(_)
    header = read_version_and_flags
    track = {
      creation_time: read_int,
      modification_time: read_int,
      track_id: read_int,
      duration: skip_bytes(4) { read_int },
      layer: skip_bytes(8) { read_int(n: 2) },
      alternate_group: read_int(n: 2),
      volume: read_fixed_point(n: 2, signed: true),
      matrix: skip_bytes(2) { read_matrix },
      width: read_fixed_point(n: 4),
      height: read_fixed_point(n: 4)
    }
    [header.merge(track), nil]
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'parsers/iso_base_media_file_format/utils'
|
2
|
+
|
3
|
+
# Parser for QuickTime (MOV) files. Detects the file by its leading +ftyp+ box and reports the video attributes
# derived from the decoded box tree.
class FormatParser::MOVParser
  include FormatParser::IOUtils
  include FormatParser::ISOBaseMediaFileFormat::Utils
  require_relative 'mov_parser/decoder'

  # The box type 'ftyp' followed by the four-character QuickTime major brand: 'qt' padded with two spaces. It is
  # compared against an 8-byte read, so it must be exactly 8 characters long; the padding is written as escapes so it
  # cannot be silently collapsed.
  MAGIC_BYTES = "ftypqt\x20\x20"
  MOV_MIME_TYPE = 'video/quicktime'

  # @param filename [String] name of the file being considered
  # @return [Boolean] whether the filename ends in a .mov, .moov or .qt extension (case-insensitive)
  def likely_match?(filename)
    # \z anchors at the true end of the string; $ would also match just before a trailing newline.
    /\.(mov|moov|qt)\z/i.match?(filename)
  end

  # Parses +io+ as a MOV file.
  #
  # @param io the input to read from; it is wrapped in a FormatParser::IOConstraint
  # @return [FormatParser::Video, nil] nil when the input does not start with the MOV file type box
  def call(io)
    @buf = FormatParser::IOConstraint.new(io)

    return unless matches_mov_definition?

    box_tree = Measurometer.instrument('format_parser.mov_parser.decoder.build_box_tree') do
      Decoder.new.build_box_tree(0xffffffff, @buf)
    end

    width, height = dimensions(box_tree)

    FormatParser::Video.new(
      format: :mov,
      width_px: width,
      height_px: height,
      frame_rate: frame_rate(box_tree),
      media_duration_seconds: duration(box_tree),
      content_type: MOV_MIME_TYPE,
      codecs: video_codecs(box_tree),
      intrinsics: box_tree
    )
  end

  private

  # Skips the first 4 bytes, compares the following 8 bytes against MAGIC_BYTES, then rewinds the buffer to the start
  # so the decoder sees the whole input.
  def matches_mov_definition?
    skip_bytes(4)
    matches = read_string(8) == MAGIC_BYTES
    @buf.seek(0)
    matches
  end

  FormatParser.register_parser new, natures: [:video], formats: [:mov], priority: 3
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require_relative 'iso_base_media_file_format/decoder'
|
2
|
+
|
3
|
+
# Parser for MP4-family files (mp4, m4a, m4b, m4p, m4r, m4v). Detects the file by its leading +ftyp+ box, maps the
# major brand to a format and reports either video or audio attributes derived from the decoded box tree.
class FormatParser::MP4Parser
  include FormatParser::IOUtils
  include FormatParser::ISOBaseMediaFileFormat
  include FormatParser::ISOBaseMediaFileFormat::Utils

  # The box type 'ftyp' followed by one of the supported four-character major brands (case-insensitive).
  MAGIC_BYTES = /\Aftyp(iso[m2]|mp4[12]|m4[abprv] )\z/i

  BRAND_FORMATS = {
    'isom' => :mp4, # Prohibited as a major brand by ISO/IEC 14496-12 sec 6.3 paragraph 2, but occasionally used.
    'iso2' => :mp4, # Prohibited as a major brand by ISO/IEC 14496-12 sec 6.3 paragraph 2, but occasionally used.
    'mp41' => :mp4,
    'mp42' => :mp4,
    'm4a ' => :m4a,
    'm4b ' => :m4b, # iTunes audiobooks
    'm4p ' => :m4p, # iTunes audio
    'm4r ' => :m4r, # iTunes ringtones
    'm4v ' => :m4v, # iTunes video
  }
  AUDIO_FORMATS = Set[:m4a, :m4b, :m4p, :m4r]
  VIDEO_FORMATS = Set[:mp4, :m4v]

  AUDIO_MIMETYPE = 'audio/mp4'
  VIDEO_MIMETYPE = 'video/mp4'

  # @param filename [String] name of the file being considered
  # @return [Boolean] whether the filename ends in a supported extension (case-insensitive)
  def likely_match?(filename)
    # \z anchors at the true end of the string; $ would also match just before a trailing newline.
    /\.(mp4|m4[abprv])\z/i.match?(filename)
  end

  # Parses +io+ as an MP4-family file.
  #
  # @param io the input to read from; it is wrapped in a FormatParser::IOConstraint
  # @return [FormatParser::Video, FormatParser::Audio, nil] nil when the input does not start with a supported file
  #   type box or its major brand maps to no known format
  def call(io)
    @buf = FormatParser::IOConstraint.new(io)

    return unless matches_mp4_definition?

    box_tree = Measurometer.instrument('format_parser.mp4_parser.decoder.build_box_tree') do
      Decoder.new.build_box_tree(0xffffffff, @buf)
    end

    detected_format = file_format(box_tree)
    case detected_format
    when VIDEO_FORMATS
      width, height = dimensions(box_tree)
      FormatParser::Video.new(
        codecs: video_codecs(box_tree),
        content_type: VIDEO_MIMETYPE,
        format: detected_format,
        frame_rate: frame_rate(box_tree),
        height_px: height,
        intrinsics: box_tree,
        media_duration_seconds: duration(box_tree),
        width_px: width,
      )
    when AUDIO_FORMATS
      FormatParser::Audio.new(
        content_type: AUDIO_MIMETYPE,
        format: detected_format,
        intrinsics: box_tree,
        media_duration_seconds: duration(box_tree),
      )
    end
  end

  private

  # Looks up the format for the major brand of the first +ftyp+ box in the tree.
  #
  # @return [Symbol, nil] nil when there is no +ftyp+ box, it has no major brand, or the brand is unknown
  def file_format(box_tree)
    ftyp_box = box_tree.find { |box| box.type == 'ftyp' }
    # Guard explicitly: `&.fields[:major_brand]` would still call `[]` on nil when no ftyp box was decoded.
    return unless ftyp_box

    major_brand = ftyp_box.fields[:major_brand]
    BRAND_FORMATS[major_brand.downcase] if major_brand
  end

  # Skips the first 4 bytes, tests the following 8 bytes against MAGIC_BYTES, then rewinds the buffer to the start so
  # the decoder sees the whole input.
  def matches_mp4_definition?
    skip_bytes(4)
    matches = MAGIC_BYTES.match?(read_string(8))
    @buf.seek(0)
    matches
  end

  FormatParser.register_parser new, natures: [:audio, :video], formats: BRAND_FORMATS.values.uniq, priority: 3
end
|
data/lib/parsers/pdf_parser.rb
CHANGED
@@ -6,7 +6,7 @@ class FormatParser::PDFParser
|
|
6
6
|
#
|
7
7
|
# There are however exceptions, which are left out for now.
|
8
8
|
#
|
9
|
-
PDF_MARKER = /%PDF-
|
9
|
+
PDF_MARKER = /%PDF-[12]\.[0-8]{1}/
|
10
10
|
PDF_CONTENT_TYPE = 'application/pdf'
|
11
11
|
|
12
12
|
def likely_match?(filename)
|
@@ -16,9 +16,12 @@ class FormatParser::PDFParser
|
|
16
16
|
# Reads the first 9 bytes of the input and reports a PDF document when they match PDF_MARKER.
# Returns nil when the marker does not match or when the read raises FormatParser::IOUtils::InvalidRead.
def call(io)
  constrained_io = FormatParser::IOConstraint.new(io)

  return unless PDF_MARKER.match?(safe_read(constrained_io, 9))

  FormatParser::Document.new(format: :pdf, content_type: PDF_CONTENT_TYPE)
rescue FormatParser::IOUtils::InvalidRead
  nil
end
|
23
26
|
|
24
27
|
FormatParser.register_parser new, natures: :document, formats: :pdf, priority: 3
|
data/lib/parsers/webp_parser.rb
CHANGED
@@ -69,7 +69,7 @@ class FormatParser::WebpParser
|
|
69
69
|
# The subsequent 4 bytes contain the image width and height, respectively, as 14-bit unsigned little endian
|
70
70
|
# integers (minus one). The 4 remaining bits consist of a 1-bit flag indicating whether alpha is used, and a 3-bit
|
71
71
|
# version that is always zero.
|
72
|
-
dimensions =
|
72
|
+
dimensions = read_int(big_endian: false)
|
73
73
|
width = (dimensions & 0x3fff) + 1
|
74
74
|
height = (dimensions >> 14 & 0x3fff) + 1
|
75
75
|
has_transparency = (dimensions >> 28 & 0x1) == 1
|
@@ -92,7 +92,7 @@ class FormatParser::WebpParser
|
|
92
92
|
# - E = Set if file contains Exif metadata.
|
93
93
|
# - X = Set if file contains XMP metadata.
|
94
94
|
# - A = Set if file is an animated image.
|
95
|
-
flags =
|
95
|
+
flags = read_int(n: 1)
|
96
96
|
has_transparency = flags & 0x10 != 0
|
97
97
|
has_exif_metadata = flags & 0x08 != 0
|
98
98
|
has_xmp_metadata = flags & 0x04 != 0
|
data/spec/format_parser_spec.rb
CHANGED
@@ -184,7 +184,7 @@ describe FormatParser do
|
|
184
184
|
'FormatParser::GIFParser',
|
185
185
|
'Class',
|
186
186
|
'FormatParser::PNGParser',
|
187
|
-
'FormatParser::
|
187
|
+
'FormatParser::MP4Parser',
|
188
188
|
'FormatParser::CR2Parser',
|
189
189
|
'FormatParser::CR3Parser',
|
190
190
|
'FormatParser::DPXParser',
|
@@ -18,9 +18,9 @@ describe FormatParser::CR3Parser do
|
|
18
18
|
expect(result).to be_nil
|
19
19
|
end
|
20
20
|
|
21
|
-
it 'should return nil if no CMT1
|
21
|
+
it 'should return nil if no CMT1 box is present' do
|
22
22
|
# This is a MOV file with the ftyp header modified to masquerade as a CR3 file. It is therefore missing the
|
23
|
-
# CR3-specific CMT1
|
23
|
+
# CR3-specific CMT1 box containing the image metadata.
|
24
24
|
result = subject.call(File.open(fixtures_dir + '/CR3/invalid'))
|
25
25
|
expect(result).to be_nil
|
26
26
|
end
|
@@ -50,7 +50,7 @@ describe FormatParser::CR3Parser do
|
|
50
50
|
expect(result.display_height_px).to eq(4000)
|
51
51
|
expect(result.content_type).to eq('image/x-canon-cr3')
|
52
52
|
expect(result.intrinsics).not_to be_nil
|
53
|
-
expect(result.intrinsics[:
|
53
|
+
expect(result.intrinsics[:box_tree]).not_to be_nil
|
54
54
|
expect(result.intrinsics[:exif]).not_to be_nil
|
55
55
|
expect(result.intrinsics[:exif][:image_length]).to eq(result.height_px)
|
56
56
|
expect(result.intrinsics[:exif][:image_width]).to eq(result.width_px)
|