format_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
module FormatParser
  # Plain value object that carries whatever a parser managed to recognize
  # about a file. Every attribute is optional and is nil unless assigned.
  class FileInformation
    # What kind of file is it?
    attr_accessor :file_nature

    # What filetype was recognized? Will contain a non-ambiguous symbol
    # referring to the file format. The symbol can be used as a filename
    # extension safely
    attr_accessor :file_type

    # Number of pixels horizontally in the pixel buffer
    attr_accessor :width_px

    # Number of pixels vertically in the pixel buffer
    attr_accessor :height_px

    # Whether the file has multiple frames (relevant for image files and video)
    attr_accessor :has_multiple_frames

    # The angle by which the camera was rotated when taking the picture
    # (affects display width and height)
    attr_accessor :orientation

    # Whether the image has transparency (or an alpha channel)
    attr_accessor :has_transparency

    # Basic information about the color mode
    attr_accessor :color_mode

    # If the file has animation or is video, this might
    # indicate the number of frames. Some formats do not
    # allow retrieving this value without parsing the entire
    # file, so for GIF this might be nil even though it is
    # animated. For a boolean check, `has_multiple_frames`
    # might offer a better clue.
    attr_accessor :num_animation_or_video_frames

    # Orientation from EXIF data. Will come through as an integer.
    # To be perfectly honest EXIF orientation handling is a bit of a mess,
    # here's a reasonable blog post about it:
    # http://magnushoff.com/jpeg-orientation.html
    attr_accessor :image_orientation

    # The number of audio channels for sound files that are muxed
    # and for video files with embedded sound
    attr_accessor :num_audio_channels

    # The audio sample rate in Hz, for sound files
    # and for video files with embedded sound
    attr_accessor :audio_sample_rate_hz

    # Duration of the media object (be it audio or video) in seconds,
    # as a Float
    attr_accessor :media_duration_seconds

    # Duration of the media object in addressable frames or samples,
    # as an Integer
    attr_accessor :media_duration_frames

    # Only permits assignments via defined accessors: every keyword is routed
    # through the public writer of the same name, so an unknown keyword raises
    # NoMethodError instead of silently creating state.
    def initialize(**attributes)
      attributes.each { |name, value| public_send("#{name}=", value) }
    end

    # Builds an instance with `file_nature` preset to :image; all remaining
    # keywords are forwarded to `new` unchanged.
    def self.image(**kwargs)
      new(file_nature: :image, **kwargs)
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'thread'
2
+
3
module FormatParser
  require_relative 'file_information'
  require_relative 'io_utils'
  require_relative 'read_limiter'
  require_relative 'remote_io'
  require_relative 'care'

  # Guards every access to the list of registered parser constructors
  PARSER_MUX = Mutex.new

  # Adds a parser constructor to the registry. The argument must respond to
  # `new` (called without arguments), and the created object must respond to
  # `information_from_io(io)`.
  def self.register_parser_constructor(object_responding_to_new)
    PARSER_MUX.synchronize do
      @parsers ||= []
      @parsers << object_responding_to_new
    end
  end

  # Parses the resource at the given URL by wrapping it in a RemoteIO
  # and handing it to `parse`.
  def self.parse_http(url)
    parse(RemoteIO.new(url))
  end

  # Runs every registered parser against the IO, in registration order, and
  # returns the first truthy result of `information_from_io`. Returns nil
  # when no parser matched (or when no parsers are registered at all).
  def self.parse(io)
    io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)

    # Take a snapshot of the registry under the same lock that guards
    # registration, and tolerate a registry that was never initialized.
    constructors = PARSER_MUX.synchronize { (@parsers || []).dup }

    # Always instantiate parsers fresh for each input, since they might
    # contain instance variables which otherwise would have to be reset
    # between invocations, and would complicate threading situations
    parsers = constructors.map(&:new)

    parsers.each do |parser|
      # We need to rewind for each parser, anew
      io.seek(0)
      # Limit how many operations the parser can perform
      limited_io = ReadLimiter.new(io, max_bytes: 512 * 1024, max_reads: 64 * 1024, max_seeks: 64 * 1024)
      begin
        info = parser.information_from_io(limited_io)
        return info if info
      rescue IOUtils::InvalidRead
        # There was not enough data for this parser to work on,
        # and it triggered an error
      rescue ReadLimiter::BudgetExceeded
        # The parser tried to read too much - most likely the file structure
        # caused the parser to go off-track. Strictly speaking we should log this
        # and examine the file more closely.
      end
    end
    nil # Nothing matched
  end

  # Load the bundled parsers in a stable (sorted) order
  Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
    require parser_file
  end
end
@@ -0,0 +1,3 @@
1
module FormatParser
  # Version of the gem. Frozen so that the shared constant cannot be
  # mutated in place by accident.
  VERSION = '0.1.0'.freeze
end
data/lib/io_utils.rb ADDED
@@ -0,0 +1,41 @@
1
module FormatParser
  # Mixin with bounded read and skip helpers. The nested definition lets this
  # file load even when the FormatParser namespace has not been defined yet.
  module IOUtils
    # Raised when the IO cannot deliver exactly the number of bytes that was
    # asked for, or when a negative skip is requested.
    class InvalidRead < ArgumentError
    end

    # Reads exactly `n` bytes from `io` and returns them.
    #
    # Raises ArgumentError when `n` is nil, and InvalidRead when the IO is at
    # EOF or returns a different number of bytes than requested.
    def safe_read(io, n)
      raise ArgumentError, "Unbounded reads are not supported" if n.nil?

      buf = io.read(n)

      unless buf
        raise InvalidRead, "We wanted to read #{n} bytes from the IO, but the IO is at EOF"
      end
      if buf.bytesize != n
        raise InvalidRead, "We wanted to read #{n} bytes from the IO, but we got #{buf.bytesize} instead"
      end

      buf
    end

    # Advances `io` by `n` bytes and returns nil. Skipping 0 bytes is a no-op.
    #
    # Raises ArgumentError when `n` is nil and InvalidRead when `n` is negative.
    def safe_skip(io, n)
      raise ArgumentError, "Unbounded skips are not supported" if n.nil?
      return if n == 0
      raise InvalidRead, "Negative skips are not supported" if n < 0

      # Seeking needs both methods; an IO that exposes only one of them
      # is advanced by reading and discarding instead.
      if io.respond_to?(:pos) && io.respond_to?(:seek)
        io.seek(io.pos + n)
      else
        safe_read(io, n)
      end
      nil
    end

    ### TODO: Some kind of built-in offset for the read
  end
end
@@ -0,0 +1,86 @@
1
# Recognizes AIFF audio files and extracts channel count, sample rate and
# duration from the COMM (common) chunk.
class FormatParser::AIFFParser
  include FormatParser::IOUtils

  # Known chunk types we can omit when parsing,
  # grossly lifted from http://www.muratnkonar.com/aiff/
  KNOWN_CHUNKS = [
    'COMT',
    'INST',
    'MARK',
    'SKIP',
    'SSND',
    'MIDI',
    'AESD',
    'APPL',
    'NAME',
    'AUTH',
    '(c) ', # yes it is a thing
    'ANNO',
  ].freeze

  # Walks the chunks of the FORM container until the COMM chunk is found.
  #
  # Returns a FormatParser::FileInformation, or nil when the data does not
  # look like AIFF. Running out of data surfaces as the InvalidRead raised
  # by `safe_read`.
  def information_from_io(io)
    io.seek(0)
    form_chunk_type, form_size = safe_read(io, 8).unpack('a4N')
    return unless form_chunk_type == "FORM" && form_size > 4

    fmt_chunk_type = safe_read(io, 4)

    return unless fmt_chunk_type == "AIFF"

    # There might be COMT chunks, for example in Logic exports
    loop do
      chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
      case chunk_type
      when 'COMM'
        # The ID is always COMM. The chunkSize field is the number of bytes in the
        # chunk. This does not include the 8 bytes used by ID and Size fields. For
        # the Common Chunk, chunkSize should always 18 since there are no fields of
        # variable length (but to maintain compatibility with possible future
        # extensions, if the chunkSize is > 18, you should always treat those extra
        # bytes as pad bytes).
        return unpack_comm_chunk(io)
      when *KNOWN_CHUNKS
        # We continue looping only if we encountered something that looks like
        # a valid AIFF chunk type - skip its data and continue. Chunk data is
        # padded to an even number of bytes and the pad byte is not counted in
        # the size field, so an odd size means one more byte to skip.
        safe_skip(io, chunk_size + (chunk_size % 2))
      else # This most likely not an AIFF
        return nil
      end
    end
  end

  # Reads the 18 fixed bytes of the COMM chunk from the current position of
  # `io`. Returns a FormatParser::FileInformation, or nil when the chunk
  # declares no sample frames or an unusable sample rate.
  def unpack_comm_chunk(io)
    # Parse the COMM chunk (the sample size field is read but not used)
    channels, sample_frames, _sample_size, sample_rate_extended = safe_read(io, 2 + 4 + 2 + 10).unpack('nNna10')
    sample_rate = unpack_extended_float(sample_rate_extended)

    return unless sample_frames > 0

    # A zero, negative, infinite or NaN rate cannot produce a meaningful
    # duration (dividing by 0.0 would give Infinity, which is "> 0")
    return unless sample_rate.finite? && sample_rate > 0

    # The sample rate is in Hz, so to get duration in seconds, as a float...
    duration_in_seconds = sample_frames / sample_rate
    return unless duration_in_seconds > 0

    FormatParser::FileInformation.new(
      file_nature: :audio,
      file_type: :aiff,
      num_audio_channels: channels,
      audio_sample_rate_hz: sample_rate.to_i,
      media_duration_frames: sample_frames,
      media_duration_seconds: duration_in_seconds,
    )
  end

  # Decodes a big-endian 80-bit extended-precision float (1 sign bit,
  # 15 exponent bits biased by 16383, 64-bit mantissa with an explicit
  # integer bit) into a Float.
  def unpack_extended_float(ten_bytes_string)
    sign_and_exponent, mantissa = ten_bytes_string.unpack('nQ>')

    sign = (sign_and_exponent >> 15).zero? ? 1.0 : -1.0
    exponent = (sign_and_exponent & 0x7FFF) - ((1 << 14) - 1)

    # value = mantissa / 2**63 * 2**exponent, computed as a single scaling
    sign * Math.ldexp(mantissa.to_f, exponent - 63)
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,143 @@
1
# Recognizes DPX image files (both byte orders) and extracts the pixel
# dimensions from the image information header.
#
# The header layouts below are String#unpack patterns. Fields we do not need
# are skipped with "x<bytes>"; only the fields we read use real directives.
class FormatParser::DPXParser
  include FormatParser::IOUtils

  FILE_INFO = [
    # :x4, # magic bytes SDPX, we read them anyway so not in the pattern
    :x4,   # u32 :image_offset, :desc => 'Offset to image data in bytes', :req => true
    :x8,   # char :version, 8, :desc => 'Version of header format', :req => true
    :x4,   # u32 :file_size, :desc => "Total image size in bytes", :req => true
    :x4,   # u32 :ditto_key, :desc => 'Whether the basic headers stay the same through the sequence (1 means they do)'
    :x4,   # u32 :generic_size, :desc => 'Generic header length'
    :x4,   # u32 :industry_size, :desc => 'Industry header length'
    :x4,   # u32 :user_size, :desc => 'User header length'
    :x100, # char :filename, 100, :desc => 'Original filename'
    :x24,  # char :timestamp, 24, :desc => 'Creation timestamp'
    :x100, # char :creator, 100, :desc => 'Creator application'
    :x200, # char :project, 200, :desc => 'Project name'
    :x200, # char :copyright, 200, :desc => 'Copyright'
    :x4,   # u32 :encrypt_key, :desc => 'Encryption key'
    :x104, # blanking :reserve, 104
  ].join.freeze

  # Not part of DPX_INFO (the film header is never read by this parser).
  # The last three widths follow the field sizes given in their comments.
  FILM_INFO = [
    :x2,   # char :id, 2, :desc => 'Film mfg. ID code (2 digits from film edge code)'
    :x2,   # char :type, 2, :desc => 'Film type (2 digits from film edge code)'
    :x2,   # char :offset, 2, :desc => 'Offset in perfs (2 digits from film edge code)'
    :x6,   # char :prefix, 6, :desc => 'Prefix (6 digits from film edge code'
    :x4,   # char :count, 4, :desc => 'Count (4 digits from film edge code)'
    :x32,  # char :format, 32, :desc => 'Format (e.g. Academy)'
    :x4,   # u32 :frame_position, :desc => 'Frame position in sequence'
    :x4,   # u32 :sequence_extent, :desc => 'Sequence length'
    :x4,   # u32 :held_count, :desc => 'For how many frames the frame is held'
    :x4,   # r32 :frame_rate, :desc => 'Frame rate'
    :x4,   # r32 :shutter_angle, :desc => 'Shutter angle'
    :x32,  # char :frame_id, 32, :desc => 'Frame identification (keyframe)'
    :x100, # char :slate, 100, :desc => 'Slate information'
    :x56,  # blanking :reserve, 56
  ].join.freeze

  IMAGE_ELEMENT = [
    :x4,  # u32 :data_sign, :desc => 'Data sign (0=unsigned, 1=signed). Core is unsigned', :req => true
    #
    :x4,  # u32 :low_data, :desc => 'Reference low data code value'
    :x4,  # r32 :low_quantity, :desc => 'Reference low quantity represented'
    :x4,  # u32 :high_data, :desc => 'Reference high data code value (1023 for 10bit per channel)'
    :x4,  # r32 :high_quantity, :desc => 'Reference high quantity represented'
    #
    :x1,  # u8 :descriptor, :desc => 'Descriptor for this image element (ie Video or Film), by enum', :req => true
    # TODO - colorimetry information might be handy to recover,
    # as well as "bit size per element" (how many bits _per component_ we have) -
    # this will be different for, say, 8-bit DPX files versus 10-bit etc.
    :x1,  # u8 :transfer, :desc => 'Transfer function (ie Linear), by enum', :req => true
    :x1,  # u8 :colorimetric, :desc => 'Colorimetric (ie YcbCr), by enum', :req => true
    :x1,  # u8 :bit_size, :desc => 'Bit size for element (ie 10)', :req => true
    #
    :x2,  # u16 :packing, :desc => 'Packing (0=Packed into 32-bit words, 1=Filled to 32-bit words))', :req => true
    :x2,  # u16 :encoding, :desc => "Encoding (0=None, 1=RLE)", :req => true
    :x4,  # u32 :data_offset, :desc => 'Offset to data for this image element', :req => true
    :x4,  # u32 :end_of_line_padding, :desc => "End-of-line padding for this image element"
    :x4,  # u32 :end_of_image_padding, :desc => "End-of-image padding for this image element"
    :x32, # char :description, 32
  ].join.freeze

  IMAGE_INFO = [
    :x2, # u16 :orientation, OrientationInfo, :desc => 'Orientation descriptor', :req => true
    :n1, # u16 :number_elements, :desc => 'How many elements to scan', :req => true
    :N1, # u32 :pixels_per_line, :desc => 'Pixels per horizontal line', :req => true
    :N1, # u32 :lines_per_element, :desc => 'Line count', :req => true
    IMAGE_ELEMENT * 8, # 8 IMAGE_ELEMENT structures
    :x52, # blanking :reserve, 52
  ].join.freeze

  ORIENTATION_INFO = [
    :x4, # u32 :x_offset
    :x4, # u32 :y_offset
    #
    :x4, # r32 :x_center
    :x4, # r32 :y_center
    #
    :x4, # u32 :x_size, :desc => 'Original X size'
    :x4, # u32 :y_size, :desc => 'Original Y size'
    #
    :x100, # char :filename, 100, :desc => "Source image filename"
    :x24,  # char :timestamp, 24, :desc => "Source image/tape timestamp"
    :x32,  # char :device, 32, :desc => "Input device or tape"
    :x32,  # char :serial, 32, :desc => "Input device serial number"
    #
    # Four 16-bit values, so 2 bytes each (8 bytes in total)
    :x2, # array :border, :u16, 4, :desc => 'Border validity: XL, XR, YT, YB'
    :x2,
    :x2,
    :x2,

    # TODO - the aspect ratio might be handy to recover since it
    # will be used in DPX files in, say, anamorphic (non-square pixels)
    :x4, # array :aspect_ratio , :u32, 2, :desc => "Aspect (H:V)"
    :x4,
    #
    :x28, # blanking :reserve, 28
  ].join.freeze

  # Everything that follows the 4 magic bytes, for big-endian files
  DPX_INFO = [
    FILE_INFO,
    IMAGE_INFO,
    ORIENTATION_INFO,
  ].join.freeze

  # Same layout with the integer directives swapped for their little-endian
  # counterparts
  DPX_INFO_LE = DPX_INFO.tr("n", "v").tr("N", "V").freeze

  # Computes how many bytes an unpack pattern consumes. Only understands
  # "<directive><count>" pairs for the directives listed below; any other
  # directive raises KeyError.
  SIZEOF = ->(pattern) {
    bytes_per_element = {
      "v" => 2, # 16bit uints
      "n" => 2,
      "V" => 4, # 32bit uints
      "N" => 4,
      "C" => 1,
      "x" => 1,
    }
    pattern.scan(/[^\d]\d+/).map do |directive|
      unpack_code = directive[0]
      num_repetitions = directive[1..-1].to_i
      bytes_per_element.fetch(unpack_code) * num_repetitions
    end.inject(&:+)
  }

  BE_MAGIC = 'SDPX'.freeze
  LE_MAGIC = BE_MAGIC.reverse.freeze
  HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes

  # Returns a FormatParser::FileInformation with the image dimensions, or nil
  # when the first 4 bytes are not a DPX magic. A header that is shorter than
  # HEADER_SIZE surfaces as the InvalidRead raised by `safe_read`.
  def information_from_io(io)
    magic = io.read(4)

    return nil unless [BE_MAGIC, LE_MAGIC].include?(magic)

    # The magic tells us the byte order of the rest of the header
    unpack_pattern = magic == LE_MAGIC ? DPX_INFO_LE : DPX_INFO
    _num_elements, pixels_per_line, num_lines = safe_read(io, HEADER_SIZE).unpack(unpack_pattern)
    FormatParser::FileInformation.image(
      file_type: :dpx,
      width_px: pixels_per_line,
      height_px: num_lines,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,58 @@
1
+ require 'exifr/jpeg'
2
+ require 'exifr/tiff'
3
+
4
# Thin wrapper around EXIFR that pulls orientation, width and height out of
# a JPEG or TIFF.
class FormatParser::EXIFParser
  include FormatParser::IOUtils

  # Squash exifr's invalid date warning since we do not use that data.
  EXIFR.logger = Logger.new(nil)

  attr_accessor :exif_data, :orientation, :width, :height

  # Names for the EXIF orientation values 1 through 8, in that order
  ORIENTATIONS = [
    :top_left,
    :top_right,
    :bottom_right,
    :bottom_left,
    :left_top,
    :right_top,
    :right_bottom,
    :left_bottom
  ].freeze

  # filetype  - :jpeg or :tiff; selects which EXIFR reader is used
  # file_data - the IO handed to EXIFR; it must respond to `rewind`
  def initialize(filetype, file_data)
    @filetype = filetype
    @file_data = file_data
    @exif_data = nil
    @orientation = nil
    @height = nil
    @width = nil
  end

  # Reads the EXIF data and fills in `exif_data`, `orientation`, `width` and
  # `height`. For a filetype other than :jpeg or :tiff nothing is assigned
  # and nil is returned.
  def scan_image_exif
    # Without the magic bytes EXIFR throws an error
    @file_data.rewind
    raw_exif_data =
      case @filetype
      when :jpeg then EXIFR::JPEG.new(@file_data)
      when :tiff then EXIFR::TIFF.new(@file_data)
      end

    # No reader for this filetype, so there is nothing to extract
    return unless raw_exif_data

    # For things that we don't yet have a parser for
    # we make the raw exif result available
    @exif_data = raw_exif_data
    @orientation = orientation_parser(raw_exif_data)
    @width = @exif_data.width
    @height = @exif_data.height
  end

  # Translates the orientation of the EXIFR result into one of ORIENTATIONS,
  # stores it in @orientation and returns it. Returns nil, leaving
  # @orientation untouched, when the value is outside the valid range.
  def orientation_parser(raw_exif_data)
    value = raw_exif_data.orientation.to_i
    @orientation = ORIENTATIONS[value - 1] if valid_orientation?(value)
  end

  # Whether the value falls in 1..ORIENTATIONS.length
  def valid_orientation?(value)
    (1..ORIENTATIONS.length).include?(value)
  end
end