format_parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,70 @@
1
module FormatParser
  # Plain value object that carries whatever a parser managed to learn
  # about a file. Every attribute is optional and reads as nil until
  # it has been assigned.
  class FileInformation
    # What kind of file is it?
    attr_accessor :file_nature

    # What filetype was recognized? Will contain a non-ambiguous symbol
    # referring to the file format. The symbol can be used as a filename
    # extension safely
    attr_accessor :file_type

    # Number of pixels horizontally in the pixel buffer
    attr_accessor :width_px

    # Number of pixels vertically in the pixel buffer
    attr_accessor :height_px

    # Whether the file has multiple frames (relevant for image files and video)
    attr_accessor :has_multiple_frames

    # The angle by which the camera was rotated when taking the picture
    # (affects display width and height)
    attr_accessor :orientation

    # Whether the image has transparency (or an alpha channel)
    attr_accessor :has_transparency

    # Basic information about the color mode
    attr_accessor :color_mode

    # If the file has animation or is video, this might
    # indicate the number of frames. Some formats do not
    # allow retrieving this value without parsing the entire
    # file, so for GIF this might be nil even though it is
    # animated. For a boolean check, `has_multiple_frames`
    # might offer a better clue.
    attr_accessor :num_animation_or_video_frames

    # Orientation from EXIF data. Will come through as an integer.
    # To be perfectly honest EXIF orientation handling is a bit of a mess,
    # here's a reasonable blog post about it:
    # http://magnushoff.com/jpeg-orientation.html
    attr_accessor :image_orientation

    # The number of audio channels for sound files that are muxed
    # and for video files with embedded sound
    attr_accessor :num_audio_channels

    # The audio sample rate in Hz, for sound files and for video
    # files with embedded sound
    attr_accessor :audio_sample_rate_hz

    # Duration of the media object (be it audio or video) in seconds,
    # as a Float
    attr_accessor :media_duration_seconds

    # Duration of the media object in addressable frames or samples,
    # as an Integer
    attr_accessor :media_duration_frames

    # Only permits assignments via defined accessors: each keyword is
    # routed through the public writer of the same name, so an unknown
    # keyword raises NoMethodError instead of being silently stored.
    def initialize(**attributes)
      attributes.each { |name, value| public_send("#{name}=", value) }
    end

    # Builds an instance with `file_nature` preset to :image; all other
    # keyword arguments are passed on to `new`.
    def self.image(**kwargs)
      new(file_nature: :image, **kwargs)
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'thread'
2
+
3
# Entry point of the library: keeps the registry of parser constructors
# and runs them, one after another, against an input until one of them
# recognizes it.
module FormatParser
  require_relative 'file_information'
  require_relative 'io_utils'
  require_relative 'read_limiter'
  require_relative 'remote_io'
  require_relative 'care'

  # Guards every access to the list of registered parser constructors
  PARSER_MUX = Mutex.new

  # Adds a parser constructor to the registry.
  #
  # object_responding_to_new - anything that responds to `new` and whose
  #   instances respond to `information_from_io(io)`.
  def self.register_parser_constructor(object_responding_to_new)
    PARSER_MUX.synchronize do
      @parsers ||= []
      @parsers << object_responding_to_new
    end
  end

  # Wraps the given URL in a RemoteIO and parses it like any other input.
  def self.parse_http(url)
    parse(RemoteIO.new(url))
  end

  # Runs the registered parsers against `io`, in registration order, and
  # returns the first truthy result of `information_from_io`.
  # Returns nil when no parser recognizes the input (or none is registered).
  def self.parse(io)
    io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)

    # Take a snapshot of the registry under the lock, so a concurrent
    # registration cannot mutate the list while we walk it, and so an
    # empty registry yields "nothing matched" instead of a NoMethodError.
    constructors = PARSER_MUX.synchronize { (@parsers || []).dup }

    # Always instantiate parsers fresh for each input, since they might
    # contain instance variables which otherwise would have to be reset
    # between invocations, and would complicate threading situations
    parsers = constructors.map(&:new)

    parsers.each do |parser|
      # We need to rewind for each parser, anew
      io.seek(0)
      # Limit how many operations the parser can perform
      limited_io = ReadLimiter.new(io, max_bytes: 512 * 1024, max_reads: 64 * 1024, max_seeks: 64 * 1024)
      begin
        info = parser.information_from_io(limited_io)
        return info if info
      rescue IOUtils::InvalidRead
        # There was not enough data for this parser to work on,
        # and it triggered an error
      rescue ReadLimiter::BudgetExceeded
        # The parser tried to read too much - most likely the file structure
        # caused the parser to go off-track. Strictly speaking we should log this
        # and examine the file more closely.
      end
    end
    nil # Nothing matched
  end

  # Load every bundled parser; sorted so the registration order (and with
  # it the order in which parsers are tried) does not depend on the
  # order in which the filesystem lists the files.
  Dir.glob(File.join(__dir__, 'parsers', '*.rb')).sort.each do |parser_file|
    require parser_file
  end
end
@@ -0,0 +1,3 @@
1
module FormatParser
  # Version string of this release. Frozen so that it cannot be
  # modified in place at runtime.
  VERSION = '0.1.0'.freeze
end
data/lib/io_utils.rb ADDED
@@ -0,0 +1,41 @@
1
# Mixin with read/skip helpers that insist on getting exactly the
# number of bytes that was asked for.
module FormatParser::IOUtils
  # Raised when the IO cannot satisfy a read or skip as requested
  class InvalidRead < ArgumentError; end

  # Reads exactly `n` bytes from `io` and returns them.
  #
  # Raises ArgumentError when `n` is nil, and InvalidRead when the IO
  # returns nothing or a different number of bytes than requested.
  def safe_read(io, n)
    raise ArgumentError, "Unbounded reads are not supported" if n.nil?

    data = io.read(n)
    raise InvalidRead, "We wanted to read #{n} bytes from the IO, but the IO is at EOF" unless data

    received = data.bytesize
    unless received == n
      raise InvalidRead, "We wanted to read #{n} bytes from the IO, but we got #{received} instead"
    end

    data
  end

  # Moves `n` bytes forward in `io`. Always returns nil.
  #
  # Seeks relative to the current position when the IO exposes `pos`,
  # and otherwise reads the bytes and throws them away.
  # Raises ArgumentError when `n` is nil and InvalidRead when it is negative.
  def safe_skip(io, n)
    raise ArgumentError, "Unbounded skips are not supported" if n.nil?
    return if n == 0
    raise InvalidRead, "Negative skips are not supported" if n < 0

    if io.respond_to?(:pos)
      io.seek(io.pos + n)
    else
      safe_read(io, n)
    end
    nil
  end

  ### TODO: Some kind of built-in offset for the read
end
@@ -0,0 +1,86 @@
1
# Recognizes AIFF audio files and extracts channel count, sample rate
# and duration from the COMM (common) chunk.
class FormatParser::AIFFParser
  include FormatParser::IOUtils

  # Known chunk types we can omit when parsing,
  # grossly lifted from http://www.muratnkonar.com/aiff/
  KNOWN_CHUNKS = [
    'COMT',
    'INST',
    'MARK',
    'SKIP',
    'SSND',
    'MIDI',
    'AESD',
    'APPL',
    'NAME',
    'AUTH',
    '(c) ', # yes it is a thing
    'ANNO',
  ].freeze

  # Walks the chunks of the FORM container until the COMM chunk is found.
  #
  # Returns a FormatParser::FileInformation, or nil when the input does
  # not look like AIFF (wrong container header or an unknown chunk type).
  # Short reads surface as InvalidRead from `safe_read`.
  def information_from_io(io)
    io.seek(0)
    form_chunk_type, form_size = safe_read(io, 8).unpack('a4N')
    return unless form_chunk_type == "FORM" && form_size > 4

    fmt_chunk_type = safe_read(io, 4)
    return unless fmt_chunk_type == "AIFF"

    # There might be COMT chunks, for example in Logic exports
    loop do
      chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
      case chunk_type
      when 'COMM'
        # The ID is always COMM. The chunkSize field is the number of bytes in the
        # chunk. This does not include the 8 bytes used by ID and Size fields. For
        # the Common Chunk, chunkSize should always 18 since there are no fields of
        # variable length (but to maintain compatibility with possible future
        # extensions, if the chunkSize is > 18, you should always treat those extra
        # bytes as pad bytes).
        return unpack_comm_chunk(io)
      when *KNOWN_CHUNKS
        # We continue looping only if we encountered something that looks like
        # a valid AIFF chunk type. Chunk data is padded to an even length and
        # the pad byte is not counted in the size field, so an odd-sized chunk
        # occupies one byte more than its size says.
        safe_skip(io, chunk_size + (chunk_size % 2))
      else # This most likely not an AIFF
        return nil
      end
    end
  end

  # Reads the 18 fixed bytes of the COMM chunk from the current position.
  #
  # Returns a FormatParser::FileInformation, or nil when the chunk carries
  # no sample frames or a sample rate that cannot produce a positive,
  # finite duration.
  def unpack_comm_chunk(io)
    # channels (u16), sample frames (u32), sample size in bits (u16, unused),
    # sample rate (80-bit extended float)
    channels, sample_frames, _sample_size, sample_rate_extended = safe_read(io, 2 + 4 + 2 + 10).unpack('nNna10')
    sample_rate = unpack_extended_float(sample_rate_extended)

    return unless sample_frames > 0
    # A zero rate would give an infinite duration, which would otherwise
    # sail through the positivity check below
    return unless sample_rate.finite? && sample_rate > 0

    # The sample rate is in Hz, so to get duration in seconds, as a float...
    duration_in_seconds = sample_frames / sample_rate
    return unless duration_in_seconds > 0

    FormatParser::FileInformation.new(
      file_nature: :audio,
      file_type: :aiff,
      num_audio_channels: channels,
      audio_sample_rate_hz: sample_rate.to_i,
      media_duration_frames: sample_frames,
      media_duration_seconds: duration_in_seconds,
    )
  end

  # Decodes a 10-byte big-endian extended-precision float into a Float:
  # 1 sign bit, 15 exponent bits biased by 16383, and a 64-bit mantissa.
  def unpack_extended_float(ten_bytes_string)
    extended = ten_bytes_string.unpack('B80')[0]

    sign = extended[0, 1]
    exponent = extended[1, 15].to_i(2) - ((1 << 14) - 1)
    fraction = extended[16, 64].to_i(2)

    ((sign == '1') ? -1.0 : 1.0) * (fraction.to_f / ((1 << 63) - 1)) * (2 ** exponent)
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,143 @@
1
# Recognizes DPX image files (both byte orders) and extracts the pixel
# dimensions from the image information header.
#
# The header layout is described as String#unpack patterns. Fields that
# are not needed are skipped with `x`, so only the values we want end up
# in the unpacked result.
class FormatParser::DPXParser
  include FormatParser::IOUtils

  FILE_INFO = [
    # :x4, # magic bytes SDPX, we read them anyway so not in the pattern
    :x4,   # u32 :image_offset, :desc => 'Offset to image data in bytes', :req => true
    :x8,   # char :version, 8, :desc => 'Version of header format', :req => true
    :x4,   # u32 :file_size, :desc => "Total image size in bytes", :req => true
    :x4,   # u32 :ditto_key, :desc => 'Whether the basic headers stay the same through the sequence (1 means they do)'
    :x4,   # u32 :generic_size, :desc => 'Generic header length'
    :x4,   # u32 :industry_size, :desc => 'Industry header length'
    :x4,   # u32 :user_size, :desc => 'User header length'
    :x100, # char :filename, 100, :desc => 'Original filename'
    :x24,  # char :timestamp, 24, :desc => 'Creation timestamp'
    :x100, # char :creator, 100, :desc => 'Creator application'
    :x200, # char :project, 200, :desc => 'Project name'
    :x200, # char :copyright, 200, :desc => 'Copyright'
    :x4,   # u32 :encrypt_key, :desc => 'Encryption key'
    :x104, # blanking :reserve, 104
  ].join.freeze

  # Not part of DPX_INFO (the parser stops reading before this header),
  # kept for reference. Field widths follow the sizes given per field.
  FILM_INFO = [
    :x2,   # char :id, 2, :desc => 'Film mfg. ID code (2 digits from film edge code)'
    :x2,   # char :type, 2, :desc => 'Film type (2 digits from film edge code)'
    :x2,   # char :offset, 2, :desc => 'Offset in perfs (2 digits from film edge code)'
    :x6,   # char :prefix, 6, :desc => 'Prefix (6 digits from film edge code'
    :x4,   # char :count, 4, :desc => 'Count (4 digits from film edge code)'
    :x32,  # char :format, 32, :desc => 'Format (e.g. Academy)'
    :x4,   # u32 :frame_position, :desc => 'Frame position in sequence'
    :x4,   # u32 :sequence_extent, :desc => 'Sequence length'
    :x4,   # u32 :held_count, :desc => 'For how many frames the frame is held'
    :x4,   # r32 :frame_rate, :desc => 'Frame rate'
    :x4,   # r32 :shutter_angle, :desc => 'Shutter angle'
    :x32,  # char :frame_id, 32, :desc => 'Frame identification (keyframe)'
    :x100, # char :slate, 100, :desc => 'Slate information'
    :x56,  # blanking :reserve, 56
  ].join.freeze

  IMAGE_ELEMENT = [
    :x4,  # u32 :data_sign, :desc => 'Data sign (0=unsigned, 1=signed). Core is unsigned', :req => true
    #
    :x4,  # u32 :low_data, :desc => 'Reference low data code value'
    :x4,  # r32 :low_quantity, :desc => 'Reference low quantity represented'
    :x4,  # u32 :high_data, :desc => 'Reference high data code value (1023 for 10bit per channel)'
    :x4,  # r32 :high_quantity, :desc => 'Reference high quantity represented'
    #
    :x1,  # u8 :descriptor, :desc => 'Descriptor for this image element (ie Video or Film), by enum', :req => true
    # TODO - colorimetry information might be handy to recover,
    # as well as "bit size per element" (how many bits _per component_ we have) -
    # this will be different for, say, 8-bit DPX files versus 10-bit etc.
    :x1,  # u8 :transfer, :desc => 'Transfer function (ie Linear), by enum', :req => true
    :x1,  # u8 :colorimetric, :desc => 'Colorimetric (ie YcbCr), by enum', :req => true
    :x1,  # u8 :bit_size, :desc => 'Bit size for element (ie 10)', :req => true
    #
    :x2,  # u16 :packing, :desc => 'Packing (0=Packed into 32-bit words, 1=Filled to 32-bit words))', :req => true
    :x2,  # u16 :encoding, :desc => "Encoding (0=None, 1=RLE)", :req => true
    :x4,  # u32 :data_offset, :desc => 'Offset to data for this image element', :req => true
    :x4,  # u32 :end_of_line_padding, :desc => "End-of-line padding for this image element"
    :x4,  # u32 :end_of_image_padding, :desc => "End-of-image padding for this image element"
    :x32, # char :description, 32
  ].join.freeze

  # The only header that yields values: number of elements, pixels per
  # line and line count - in that order.
  IMAGE_INFO = [
    :x2,  # u16 :orientation, OrientationInfo, :desc => 'Orientation descriptor', :req => true
    :n1,  # u16 :number_elements, :desc => 'How many elements to scan', :req => true
    :N1,  # u32 :pixels_per_line, :desc => 'Pixels per horizontal line', :req => true
    :N1,  # u32 :lines_per_element, :desc => 'Line count', :req => true
    IMAGE_ELEMENT * 8, # 8 IMAGE_ELEMENT structures
    :x52, # blanking :reserve, 52
  ].join.freeze

  ORIENTATION_INFO = [
    :x4,   # u32 :x_offset
    :x4,   # u32 :y_offset
    #
    :x4,   # r32 :x_center
    :x4,   # r32 :y_center
    #
    :x4,   # u32 :x_size, :desc => 'Original X size'
    :x4,   # u32 :y_size, :desc => 'Original Y size'
    #
    :x100, # char :filename, 100, :desc => "Source image filename"
    :x24,  # char :timestamp, 24, :desc => "Source image/tape timestamp"
    :x32,  # char :device, 32, :desc => "Input device or tape"
    :x32,  # char :serial, 32, :desc => "Input device serial number"
    #
    # Four 16-bit entries, so two bytes each
    :x2,   # array :border, :u16, 4, :desc => 'Border validity: XL, XR, YT, YB'
    :x2,
    :x2,
    :x2,

    # TODO - the aspect ratio might be handy to recover since it
    # will be used in DPX files in, say, anamorphic (non-square pixels)
    :x4,   # array :aspect_ratio , :u32, 2, :desc => "Aspect (H:V)"
    :x4,
    #
    :x28,  # blanking :reserve, 28
  ].join.freeze

  # Everything we read after the magic bytes, as one big-endian pattern
  DPX_INFO = [
    FILE_INFO,
    IMAGE_INFO,
    ORIENTATION_INFO,
  ].join.freeze

  # Same layout with the integer directives swapped to little-endian
  DPX_INFO_LE = DPX_INFO.tr("nN", "vV").freeze

  # Byte width of one element for every unpack directive used above
  BYTES_PER_ELEMENT = {
    "v" => 2, # 16bit uints
    "n" => 2,
    "V" => 4, # 32bit uints
    "N" => 4,
    "C" => 1,
    "x" => 1,
  }.freeze

  # Computes how many bytes an unpack pattern consumes. Every directive
  # must carry an explicit count; an unknown directive raises KeyError.
  SIZEOF = ->(pattern) {
    pattern.scan(/[^\d]\d+/).map do |element|
      unpack_code = element[0]
      num_repetitions = element[1..-1].to_i
      BYTES_PER_ELEMENT.fetch(unpack_code) * num_repetitions
    end.inject(&:+)
  }

  BE_MAGIC = 'SDPX'
  LE_MAGIC = BE_MAGIC.reverse
  HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes

  # Checks the magic bytes, then reads the headers in the byte order the
  # magic indicates.
  #
  # Returns a FormatParser::FileInformation with the pixel dimensions, or
  # nil when the magic bytes do not match. A truncated header surfaces as
  # InvalidRead from `safe_read`.
  def information_from_io(io)
    magic = io.read(4)

    return nil unless [BE_MAGIC, LE_MAGIC].include?(magic)

    unpack_pattern = magic == LE_MAGIC ? DPX_INFO_LE : DPX_INFO
    _num_elements, pixels_per_line, num_lines = safe_read(io, HEADER_SIZE).unpack(unpack_pattern)
    FormatParser::FileInformation.image(
      file_type: :dpx,
      width_px: pixels_per_line,
      height_px: num_lines,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,58 @@
1
+ require 'exifr/jpeg'
2
+ require 'exifr/tiff'
3
+
4
# Thin wrapper around the exifr gem: scans a JPEG or TIFF input and
# exposes its orientation and dimensions.
class FormatParser::EXIFParser
  include FormatParser::IOUtils

  # Squash exifr's invalid date warning since we do not use that data.
  EXIFR.logger = Logger.new(nil)

  attr_accessor :exif_data, :orientation, :width, :height

  # Orientation names, indexed by (orientation value - 1)
  ORIENTATIONS = [
    :top_left,
    :top_right,
    :bottom_right,
    :bottom_left,
    :left_top,
    :right_top,
    :right_bottom,
    :left_bottom
  ].freeze

  # filetype  - :jpeg or :tiff, selects which exifr reader is used
  # file_data - the input handed to exifr; must respond to `rewind`
  def initialize(filetype, file_data)
    @filetype = filetype
    @file_data = file_data
    @exif_data = nil
    @orientation = nil
    @height = nil
    @width = nil
  end

  # Runs exifr over the input and fills in `exif_data`, `orientation`,
  # `width` and `height`. For a filetype other than :jpeg or :tiff there
  # is no reader to use, so nothing is assigned and nil is returned.
  def scan_image_exif
    # Without the magic bytes EXIFR throws an error
    @file_data.rewind
    raw_exif_data =
      case @filetype
      when :jpeg then EXIFR::JPEG.new(@file_data)
      when :tiff then EXIFR::TIFF.new(@file_data)
      end
    return unless raw_exif_data

    # For things that we don't yet have a parser for
    # we make the raw exif result available
    @exif_data = raw_exif_data
    @orientation = orientation_parser(raw_exif_data)
    @width = raw_exif_data.width
    @height = raw_exif_data.height
  end

  # Translates the numeric orientation of `raw_exif_data` into one of
  # ORIENTATIONS, stores it in `orientation` and returns it. Returns nil,
  # leaving `orientation` untouched, when the value is out of range.
  def orientation_parser(raw_exif_data)
    value = raw_exif_data.orientation.to_i
    @orientation = ORIENTATIONS[value - 1] if valid_orientation?(value)
  end

  # True when `value` is between 1 and the number of known orientations
  def valid_orientation?(value)
    (1..ORIENTATIONS.length).include?(value)
  end
end