format_parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,49 @@
1
# Recognizes GIF87a/GIF89a streams and extracts the logical screen size,
# plus a heuristic "is animated" flag based on the presence of the
# NETSCAPE2.0 application extension near the start of the file.
class FormatParser::GIFParser
  HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
  NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'

  # Every color table entry is a 3-byte RGB triplet (GIF89a spec, section 19)
  BYTES_PER_COLOR = 3

  include FormatParser::IOUtils

  # @param io an IO-like object positioned at the start of the file
  # @return the result of FormatParser::FileInformation.image, or nil when
  #   the first 6 bytes are not a GIF signature
  def information_from_io(io)
    header = safe_read(io, 6)
    return unless HEADERS.include?(header)

    # Logical Screen Descriptor: width and height are little-endian uint16s
    w, h = safe_read(io, 4).unpack('vv')

    # The rest of the descriptor is three single bytes: the packed fields,
    # the background color index and the pixel aspect ratio. Reading more
    # than 3 bytes here would eat into the color table that follows.
    gct_byte, _bgcolor_index, _pixel_aspect_ratio = safe_read(io, 3).unpack('CCC')

    # and actually onwards for this:
    # http://www.matthewflickinger.com/lab/whatsinagif/bits_and_bytes.asp

    # Determine how big our color table is.
    # Packed fields layout (MSB first): global color table flag (1 bit),
    # color resolution (3 bits), sort flag (1 bit), table size (3 bits).
    # Integer#[] indexes from the least significant bit, so the flag is bit 7.
    has_gct = gct_byte[7] == 1
    unpacked_radix = gct_byte & 0b00000111
    num_colors = 2**(unpacked_radix + 1)
    gct_table_size = num_colors * BYTES_PER_COLOR

    # If we have the global color table - skip over it
    safe_read(io, gct_table_size) if has_gct

    # Now it gets interesting - we are at the place where an
    # application extension for the NETSCAPE2.0 block will occur.
    # If it does, it most likely means the application that wrote the
    # GIF needed looping, and if it did, it means that the GIF is
    # very, very likely to be animated. Counting the actual frames would
    # require skipping over the image data of every frame, so we settle
    # for this heuristic and only look at the next 64 bytes.
    potentially_netscape_app_header = safe_read(io, 64)
    is_animated = potentially_netscape_app_header.include?(NETSCAPE_AND_AUTHENTICATION_CODE)

    FormatParser::FileInformation.image(
      file_type: :gif,
      width_px: w,
      height_px: h,
      has_multiple_frames: is_animated,
      color_mode: :indexed,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,122 @@
1
# Walks the marker segments of a JPEG stream until the pixel dimensions
# (and, when an EXIF APP1 segment is present, the orientation) are known.
class FormatParser::JPEGParser
  include FormatParser::IOUtils

  # Raised internally when a start-of-frame segment has an inconsistent length
  class InvalidStructure < StandardError
  end

  SOI_MARKER = 0xD8 # start of image
  SOF_MARKERS = [0xC0..0xC3, 0xC5..0xC7, 0xC9..0xCB, 0xCD..0xCF].freeze
  EOI_MARKER = 0xD9 # end of image
  SOS_MARKER = 0xDA # start of stream
  APP1_MARKER = 0xE1 # maybe EXIF

  # Number of bytes inspected at the start of an APP1 segment:
  # the 2-byte segment length followed by the 6-byte "Exif\0\0" identifier
  APP1_PREAMBLE_SIZE = 8

  # @param io an IO-like object positioned at the start of the file
  # @return the result of FormatParser::FileInformation.image, or nil when
  #   the stream is not a JPEG or no dimensions could be recovered
  def information_from_io(io)
    @buf = io
    @width = nil
    @height = nil
    @orientation = nil
    scan
  end

  private

  # Consumes n bytes and discards them
  def advance(n)
    safe_read(@buf, n)
    nil
  end

  # Reads one unsigned byte
  def read_char
    safe_read(@buf, 1).unpack('C').first
  end

  # Reads one big-endian unsigned 16-bit integer
  def read_short
    safe_read(@buf, 2).unpack('n*').first
  end

  # Main loop: dispatches on each marker and returns as soon as both
  # dimensions are known.
  def scan
    # Return early if it is not a JPEG at all
    signature = read_next_marker
    return unless signature == SOI_MARKER

    while (marker = read_next_marker)
      case marker
      when *SOF_MARKERS
        scan_start_of_frame
      when EOI_MARKER, SOS_MARKER
        break
      when APP1_MARKER
        scan_app1_frame
      else
        skip_frame
      end

      # Return at the earliest possible opportunity. The orientation is
      # only passed along when it has actually been found.
      next unless @width && @height

      attributes = {file_type: :jpg, width_px: @width, height_px: @height}
      attributes[:orientation] = @orientation if @orientation
      return FormatParser::FileInformation.image(**attributes)
    end
    nil # We could not parse anything
  rescue InvalidStructure
    nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
  end

  # Read a byte, if it is 0xFF then skip bytes as long as they are also 0xFF (byte stuffing)
  # and return the first byte scanned that is not 0xFF
  def read_next_marker
    c = read_char while c != 0xFF
    c = read_char while c == 0xFF
    c
  end

  # Parses a start-of-frame segment and records the dimensions.
  # Raises InvalidStructure when the declared segment length does not match
  # the component count, which indicates we are not looking at a real SOF.
  def scan_start_of_frame
    length = read_short
    read_char # depth, unused
    height = read_short
    width = read_short
    size = read_char

    raise InvalidStructure unless length == (size * 3) + 8

    @width = width
    @height = height
  end

  # Inspects an APP1 segment. When it carries EXIF data the EXIF parser is
  # handed the IO (positioned right after the 8-byte preamble); any other
  # APP1 payload (XMP, for instance) is skipped using the declared segment
  # length so that its bytes are never mistaken for markers.
  def scan_app1_frame
    # Use safe_read like every other read in this class, so that a stream
    # ending inside the preamble is handled by the shared helper instead of
    # surfacing as a NoMethodError on nil.
    preamble = safe_read(@buf, APP1_PREAMBLE_SIZE)

    if preamble.include?("Exif")
      scanner = FormatParser::EXIFParser.new(:jpeg, @buf)
      if scanner.scan_image_exif
        @exif_output = scanner.exif_data
        @orientation = scanner.orientation unless scanner.orientation.nil?
        @width = @exif_output.pixel_x_dimension || scanner.width
        @height = @exif_output.pixel_y_dimension || scanner.height
      end
    else
      # The segment length counts its own two bytes, and the preamble
      # already consumed APP1_PREAMBLE_SIZE bytes of the segment.
      segment_length = preamble.unpack('n').first
      leftover = segment_length - APP1_PREAMBLE_SIZE
      advance(leftover) if leftover > 0
    end
  end

  # Reads the payload of the current segment (the length field includes itself)
  def read_frame
    length = read_short - 2
    safe_read(@buf, length)
  end

  # Skips the payload of the current segment (the length field includes itself)
  def skip_frame
    length = read_short - 2
    advance(length)
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,80 @@
1
# Reads the PNG signature and the IHDR chunk to recover dimensions and
# color information, then peeks at the following chunk to detect APNG.
class FormatParser::PNGParser
  PNG_HEADER_BYTES = [137, 80, 78, 71, 13, 10, 26, 10].pack('C*')
  COLOR_TYPES = {
    0 => :grayscale,
    2 => :rgb,
    3 => :indexed,
    4 => :grayscale, # with alpha
    6 => :rgba,
  }.freeze
  # NOTE(review): color type 0 is plain grayscale without an alpha channel
  # per the PNG spec; confirm that flagging it as transparent is intended.
  TRANSPARENCY_PER_COLOR_TYPE = {
    0 => true,
    4 => true, # Grayscale with alpha
    6 => true,
  }.freeze

  include FormatParser::IOUtils

  # Reads a chunk header.
  #
  # @return [Array(Integer, String)] the chunk data length and the
  #   4-character chunk type
  def chunk_length_and_type(io)
    safe_read(io, 8).unpack("Na4")
  end

  # @param io an IO-like object positioned at the start of the file
  # @return the result of FormatParser::FileInformation.image, or nil when
  #   the input does not start with a well-formed PNG signature + IHDR
  def information_from_io(io)
    magic_bytes = safe_read(io, PNG_HEADER_BYTES.bytesize)
    return unless magic_bytes == PNG_HEADER_BYTES

    chunk_length, chunk_type = chunk_length_and_type(io)

    # For later: look at gAMA and iCCP chunks too. For now,
    # all we care about is the IHDR chunk, and it must have the
    # correct length as well.
    # IHDR _must_ come first, no exceptions. If it doesn't
    # we should not consider this a valid PNG.
    return unless chunk_type == "IHDR" && chunk_length == 13

    chunk_data = safe_read(io, chunk_length)
    # Width: 4 bytes
    # Height: 4 bytes
    # Bit depth: 1 byte
    # Color type: 1 byte (0, 2, 3, 4, 6)
    # Compression method: 1 byte
    # Filter method: 1 byte
    # Interlace method: 1 byte
    w, h, _bit_depth, color_type,
      _compression_method, _filter_method, _interlace_method = chunk_data.unpack("N2C5")

    # A color type outside of the table means the IHDR is malformed, so
    # treat the input as "not a PNG" instead of raising a KeyError.
    color_mode = COLOR_TYPES[color_type]
    return unless color_mode

    has_transparency = TRANSPARENCY_PER_COLOR_TYPE[color_type]

    # Skip the 4-byte CRC that closes the IHDR chunk
    safe_skip(io, 4)

    # These stay nil unless the animation control chunk is found
    has_animation = nil
    num_frames = nil

    # Read the next chunk. If it turns out to be acTL (animation control)
    # we are dealing with an APNG.
    chunk_length, chunk_type = chunk_length_and_type(io)
    if chunk_length == 8 && chunk_type == 'acTL'
      # https://wiki.mozilla.org/APNG_Specification#.60acTL.60:_The_Animation_Control_Chunk
      # Unlike GIF, we do have the frame count that we can recover
      has_animation = true
      num_frames, _loop_n_times = safe_read(io, 8).unpack('NN')
    end

    FormatParser::FileInformation.image(
      file_type: :png,
      width_px: w,
      height_px: h,
      has_transparency: has_transparency,
      color_mode: color_mode,
      has_multiple_frames: has_animation,
      num_animation_or_video_frames: num_frames,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,21 @@
1
# Recognizes Photoshop documents by their "8BPS" signature and reads the
# canvas dimensions from the fixed-size file header.
class FormatParser::PSDParser
  PSD_HEADER = [0x38, 0x42, 0x50, 0x53] # "8BPS"
  include FormatParser::IOUtils

  # @param io an IO-like object positioned at the start of the file
  # @return the result of FormatParser::FileInformation.image, or nil when
  #   the signature does not match
  def information_from_io(io)
    magic_bytes = safe_read(io, 4).unpack("C4")

    return unless magic_bytes == PSD_HEADER

    # We can be reasonably certain this is a PSD so we grab the height
    # and width bytes. After the signature the header holds:
    # version (2 bytes), reserved (6 bytes), channel count (2 bytes),
    # then HEIGHT (4 bytes) followed by WIDTH (4 bytes) - in that order.
    h, w = safe_read(io, 22).unpack("x10N2")
    FormatParser::FileInformation.image(
      file_type: :psd,
      width_px: w,
      height_px: h,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,71 @@
1
# Recognizes little- and big-endian TIFF files, reads ImageWidth and
# ImageLength from the first IFD and asks the EXIF parser for orientation.
class FormatParser::TIFFParser
  LITTLE_ENDIAN_TIFF_HEADER_BYTES = [0x49, 0x49, 0x2A, 0x0]
  BIG_ENDIAN_TIFF_HEADER_BYTES = [0x4D, 0x4D, 0x0, 0x2A]
  WIDTH_TAG = 0x100
  HEIGHT_TAG = 0x101

  # An IFD entry is 12 bytes: tag (2), field type (2), value count (4)
  # and the value itself or an offset to it (4)
  IFD_ENTRY_SIZE = 12
  IFD_VALUE_OFFSET = 8
  # Field type 3 is SHORT (16-bit unsigned); such a value is stored in the
  # first two bytes of the 4-byte value field
  SHORT_FIELD_TYPE = 3

  include FormatParser::IOUtils

  # @param io an IO-like object positioned at the start of the file;
  #   it must support seek
  # @return the result of FormatParser::FileInformation.image, or nil when
  #   the first 4 bytes are not a TIFF header
  def information_from_io(io)
    magic_bytes = safe_read(io, 4).unpack("C4")
    endianness = scan_tiff_endianness(magic_bytes)
    return unless endianness

    w, h = read_tiff_by_endianness(io, endianness)
    scanner = FormatParser::EXIFParser.new(:tiff, io)
    scanner.scan_image_exif

    # The orientation is only passed along when the EXIF scan produced one
    attributes = {file_type: :tif, width_px: w, height_px: h}
    orientation = scanner.orientation
    attributes[:orientation] = orientation if orientation
    FormatParser::FileInformation.image(**attributes)
  end

  # TIFFs can be either big or little endian, so we check here
  # and set our unpack method argument to suit.
  #
  # @param magic_bytes [Array<Integer>] the first 4 bytes of the file
  # @return [String, nil] "v" (little endian), "n" (big endian) or nil
  #   when the bytes are not a TIFF header. Upcasing the directive gives
  #   the matching 32-bit unpack directive.
  def scan_tiff_endianness(magic_bytes)
    case magic_bytes
    when LITTLE_ENDIAN_TIFF_HEADER_BYTES then "v"
    when BIG_ENDIAN_TIFF_HEADER_BYTES then "n"
    end
  end

  # The TIFF format stores metadata in a flexible set of information fields
  # called tags, which are stored in a header referred to as the IFD or
  # Image File Directory. It is not necessarily in the same place in every image,
  # so we need to do some work to scan through it and find the tags we need.
  # For more information the TIFF wikipedia page is a reasonable place to start:
  # https://en.wikipedia.org/wiki/TIFF
  #
  # Stores the results in @width and @height. Expects `cache` to be
  # positioned at `offset` when called.
  #
  # @param cache the seekable IO-like object to read from
  # @param offset [Integer] absolute offset of the IFD
  # @param endianness [String] "v" or "n"
  def scan_ifd(cache, offset, endianness)
    # Do not let values from a previous scan leak into this one
    @width = nil
    @height = nil

    # The IFD starts with a 2-byte entry count
    entry_count = safe_read(cache, 2).unpack(endianness)[0]
    entry_count.times do |i|
      cache.seek(offset + 2 + (IFD_ENTRY_SIZE * i))
      entry = safe_read(cache, IFD_ENTRY_SIZE)
      tag, field_type = entry.unpack(endianness * 2)

      if tag == WIDTH_TAG
        @width = ifd_entry_value(entry, field_type, endianness)
      elsif tag == HEIGHT_TAG
        @height = ifd_entry_value(entry, field_type, endianness)
      end

      # Nothing else in the directory is of interest, spare the reads
      break if @width && @height
    end
  end

  # Positions the IO at the first IFD (whose offset follows the 4 magic
  # bytes) and scans it.
  #
  # @return [Array] the width and height, either of which may be nil when
  #   the corresponding tag is absent
  def read_tiff_by_endianness(io, endianness)
    offset = safe_read(io, 4).unpack(endianness.upcase)[0]
    io.seek(offset)
    scan_ifd(io, offset, endianness)
    [@width, @height]
  end

  private

  # Extracts the numeric value held in the last 4 bytes of an IFD entry.
  # SHORT values occupy only the first 2 bytes of that field, which matters
  # for big-endian files; everything else is read as a 32-bit integer.
  def ifd_entry_value(entry, field_type, endianness)
    if field_type == SHORT_FIELD_TYPE
      entry.byteslice(IFD_VALUE_OFFSET, 2).unpack(endianness)[0]
    else
      entry.byteslice(IFD_VALUE_OFFSET, 4).unpack(endianness.upcase)[0]
    end
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,39 @@
1
# Decorates an IO-like object and counts the seeks, reads and requested
# bytes that pass through it. Each counter has an optional ceiling; going
# past a ceiling raises BudgetExceeded instead of touching the wrapped IO.
class FormatParser::ReadLimiter
  # Sentinel meaning "this counter has no ceiling"
  NO_LIMIT = nil

  # Raised once one of the configured ceilings has been crossed
  class BudgetExceeded < StandardError
  end

  # @param io the object to delegate seek() and read() to
  # @param max_bytes [Integer, nil] ceiling for the sum of requested bytes
  # @param max_reads [Integer, nil] ceiling for the number of read() calls
  # @param max_seeks [Integer, nil] ceiling for the number of seek() calls
  def initialize(io, max_bytes: NO_LIMIT, max_reads: NO_LIMIT, max_seeks: NO_LIMIT)
    @io = io

    @max_seeks = max_seeks
    @max_reads = max_reads
    @max_bytes = max_bytes

    @bytes = @reads = @seeks = 0
  end

  # Counts the seek, then forwards it to the wrapped IO.
  #
  # @raise [BudgetExceeded] when the seek ceiling has been crossed
  def seek(to_offset)
    @seeks += 1

    seeks_exhausted = @max_seeks && @seeks > @max_seeks
    raise BudgetExceeded, "Seek budget exceeded (%d seeks performed)" % @max_seeks if seeks_exhausted

    @io.seek(to_offset)
  end

  # Counts the read and the requested byte amount, then forwards the read
  # to the wrapped IO. The byte ceiling is checked before the call ceiling.
  #
  # @raise [BudgetExceeded] when either ceiling has been crossed
  def read(n)
    @bytes += n
    @reads += 1

    bytes_exhausted = @max_bytes && @bytes > @max_bytes
    raise BudgetExceeded, "Read bytes budget (%d) exceeded" % @max_bytes if bytes_exhausted

    reads_exhausted = @max_reads && @reads > @max_reads
    raise BudgetExceeded, "Number of read() calls exceeded (%d max)" % @max_reads if reads_exhausted

    @io.read(n)
  end
end
data/lib/remote_io.rb ADDED
@@ -0,0 +1,89 @@
1
# A minimal read-only, IO-like view of a remote HTTP resource. Every read
# is translated into a ranged GET request against the configured URL.
class FormatParser::RemoteIO

  # Represents a failure that might be retried
  # (like a 5xx response or a timeout)
  class IntermittentFailure < StandardError
  end

  # Represents a failure that should not be retried
  # (like a 4xx response or a DNS resolution error)
  class InvalidRequest < StandardError
  end

  # @param uri[URI, String] the remote URL to obtain
  def initialize(uri)
    # Loaded lazily so that Faraday is only needed once a RemoteIO is created
    require 'faraday'
    @uri = uri
    @pos = 0
    @remote_size = false
  end

  # Emulates IO#seek
  #
  # @param offset[Integer] the absolute offset the next read starts at
  def seek(offset)
    @pos = offset
    0 # always return 0
  end

  # Emulates IO#size.
  #
  # @return [Integer] the size of the remote resource
  def size
    raise "Remote size not yet obtained, need to perform at least one read() to get it" unless @remote_size
    @remote_size
  end

  # Emulates IO#read, but requires the number of bytes to read.
  # The read will be limited to the
  # size of the remote resource relative to the current offset in the IO,
  # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
  # will only return you 10 bytes of result, and not raise any exceptions.
  # Like IO#read, a successful read moves the current offset forward by the
  # number of bytes returned.
  #
  # @param n_bytes[Integer] how many bytes to read (required, `nil` is not supported)
  # @return [String, nil] the read bytes in binary encoding, or `nil` when
  #   reading past the end of the resource
  def read(n_bytes)
    # A zero-length read cannot be expressed as an HTTP byte range
    # (it would produce "bytes=N-(N-1)"), so answer it locally.
    return ''.b if n_bytes == 0

    http_range = (@pos..(@pos + n_bytes - 1))
    maybe_size, maybe_body = request_range(http_range)

    # Keep a previously obtained size when this response did not carry one
    # (a 416 response yields no size).
    @remote_size = maybe_size if maybe_size
    return nil unless maybe_body

    @pos += maybe_body.bytesize
    maybe_body.force_encoding(Encoding::BINARY)
  end

  protected

  # Only used internally when reading the remote file
  #
  # @param range[Range] the HTTP range of data to fetch from remote
  # @return [Array] the total size of the resource and the response body
  #   of the ranged request, or `[nil, nil]` when the range starts past the
  #   end of the resource
  # @raise [InvalidRequest] when the server does not support ranges or
  #   refuses the request
  # @raise [IntermittentFailure] when the server replies with a 5xx status
  def request_range(range)
    # We use a GET and not a HEAD request followed by a GET because
    # S3 does not allow HEAD requests if you only presigned your URL for GETs, so we
    # combine the first GET of a segment and retrieving the size of the resource
    response = Faraday.get(@uri, nil, range: "bytes=%d-%d" % [range.begin, range.end])

    case response.status
    when 200, 206
      # Figure out if the server supports content ranges, if it doesn't we have no
      # business working with that server
      range_header = response.headers['Content-Range']
      raise InvalidRequest, "No range support at #{@uri}" unless range_header

      # "Content-Range: bytes 0-0/307404381" is how the response header is structured
      size = range_header[/\/(\d+)$/, 1].to_i

      # S3 returns 200 when you request a Range that is fully satisfied by the entire object,
      # we take that into account here. For other servers, 206 is the expected response code.
      # Also, if we request a _larger_ range than what can be satisfied by the server,
      # the response is going to only contain what _can_ be sent and the status is also going
      # to be 206
      [size, response.body]
    when 416
      # We return `nil` as the body if we tried to read past the end of the IO,
      # which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
      # S3 will also handily _not_ supply us with the Content-Range of the actual resource
      [nil, nil]
    when 500..599
      raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
    else
      raise InvalidRequest, "Server at #{@uri} replied with a #{response.status} and refused our request"
    end
  end
end