format_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
# Extracts basic image information (dimensions and an animation hint)
# from the leading bytes of a GIF87a/GIF89a stream.
class FormatParser::GIFParser
  HEADERS = ['GIF87a', 'GIF89a'].map(&:b).freeze
  NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'

  # Every color table entry is an RGB triplet, one byte per channel
  BYTES_PER_COLOR_TABLE_ENTRY = 3

  include FormatParser::IOUtils

  # Reads the GIF header and logical screen descriptor from `io`.
  #
  # @param io an object that can be passed to `safe_read` (from FormatParser::IOUtils)
  # @return the value of FormatParser::FileInformation.image, or nil when
  #   the first 6 bytes are not a known GIF signature
  def information_from_io(io)
    header = safe_read(io, 6)
    return unless HEADERS.include?(header)

    # Logical screen descriptor: width (2), height (2), packed fields (1),
    # background color index (1), pixel aspect ratio (1)
    w, h = safe_read(io, 4).unpack('vv')
    gct_byte, _bgcolor_index, _pixel_aspect_ratio = safe_read(io, 3).unpack('CCC')

    # and actually onwards for this:
    # http://www.matthewflickinger.com/lab/whatsinagif/bits_and_bytes.asp

    # Determine how big our color table is. The global color table flag is
    # the most significant bit of the packed byte, and the three least
    # significant bits encode the table size as 2**(n + 1) entries.
    has_gct = gct_byte[7] == 1
    unpacked_radix = gct_byte & 0b00000111
    num_colors = 2**(unpacked_radix + 1)
    gct_table_size = num_colors * BYTES_PER_COLOR_TABLE_ENTRY

    # If we have the global color table - skip over it
    safe_read(io, gct_table_size) if has_gct

    # Now it gets interesting - we are at the place where an
    # application extension for the NETSCAPE2.0 block will occur.
    # If it does, it most likely means the application that wrote the
    # GIF needed looping, and if it did, it means that the GIF is
    # very, very likely to be animated. Counting the actual frames would
    # require skipping over the image data, so we only look at a small
    # window right after the color table instead.
    potentially_netscape_app_header = safe_read(io, 64)
    is_animated = potentially_netscape_app_header.include?(NETSCAPE_AND_AUTHENTICATION_CODE)

    FormatParser::FileInformation.image(
      file_type: :gif,
      width_px: w,
      height_px: h,
      has_multiple_frames: is_animated,
      color_mode: :indexed,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,122 @@
1
# Scans the marker segments of a JPEG stream to recover the pixel
# dimensions and, when an Exif APP1 segment is present, the orientation.
class FormatParser::JPEGParser
  include FormatParser::IOUtils

  # Raised internally when a segment does not have the expected layout
  class InvalidStructure < StandardError
  end

  SOI_MARKER = 0xD8 # start of image
  SOF_MARKERS = [0xC0..0xC3, 0xC5..0xC7, 0xC9..0xCB, 0xCD..0xCF].freeze
  EOI_MARKER = 0xD9 # end of image
  SOS_MARKER = 0xDA # start of stream
  APP1_MARKER = 0xE1 # maybe EXIF

  # @param io an object that can be passed to `safe_read` (from FormatParser::IOUtils)
  # @return the value of FormatParser::FileInformation.image, or nil when
  #   the stream could not be recognized as a JPEG with known dimensions
  def information_from_io(io)
    @buf = io
    @width = nil
    @height = nil
    @orientation = nil
    scan
  end

  private

  # Consumes `n` bytes and discards them
  def advance(n)
    safe_read(@buf, n)
    nil
  end

  # Reads one unsigned byte
  def read_char
    safe_read(@buf, 1).unpack('C').first
  end

  # Reads one big-endian unsigned 16-bit integer
  def read_short
    safe_read(@buf, 2).unpack('n*').first
  end

  # Walks the segments until the dimensions are known or the image data starts.
  def scan
    # Return early if it is not a JPEG at all. The stream has to _start_
    # with a marker prefix; hunting for the first 0xFF anywhere in an
    # arbitrary file would mean reading it byte by byte.
    return unless read_char == 0xFF
    signature = read_char
    signature = read_char while signature == 0xFF # tolerate fill bytes
    return unless signature == SOI_MARKER

    while (marker = read_next_marker)
      case marker
      when *SOF_MARKERS
        scan_start_of_frame
      when EOI_MARKER, SOS_MARKER
        break
      when APP1_MARKER
        scan_app1_frame
      else
        skip_frame
      end

      # Return at the earliest possible opportunity
      next unless @width && @height

      attributes = { file_type: :jpg, width_px: @width, height_px: @height }
      # Only pass the orientation along when we actually recovered one
      attributes[:orientation] = @orientation if @orientation
      return FormatParser::FileInformation.image(**attributes)
    end
    nil # We could not parse anything
  rescue InvalidStructure
    nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
  end

  # Skips bytes until a 0xFF is found, then skips any further 0xFF bytes
  # (fill bytes) and returns the first byte scanned that is not 0xFF
  def read_next_marker
    c = read_char while c != 0xFF
    c = read_char while c == 0xFF
    c
  end

  # Reads the start-of-frame header and stores the dimensions.
  # Raises InvalidStructure when the segment length does not agree
  # with the number of components it declares.
  def scan_start_of_frame
    length = read_short
    read_char # depth, unused
    height = read_short
    width = read_short
    size = read_char

    raise InvalidStructure unless length == (size * 3) + 8

    @width = width
    @height = height
  end

  # Inspects an APP1 segment. When it carries Exif data the EXIF parser
  # is asked for orientation and dimensions, otherwise the rest of the
  # segment is skipped.
  def scan_app1_frame
    # 2 bytes of segment length followed by the first 6 payload bytes
    frame = safe_read(@buf, 8)
    if frame.include?("Exif")
      scanner = FormatParser::EXIFParser.new(:jpeg, @buf)
      if scanner.scan_image_exif
        @exif_output = scanner.exif_data
        @orientation = scanner.orientation unless scanner.orientation.nil?
        @width = @exif_output.pixel_x_dimension || scanner.width
        @height = @exif_output.pixel_y_dimension || scanner.height
      end
    else
      # Not Exif (XMP and the like). The length counts its own 2 bytes,
      # and we have consumed 8 bytes of the segment so far.
      remaining = frame.unpack('n').first - 8
      advance(remaining) if remaining > 0
    end
  end

  # Reads the payload of the segment at the current position
  def read_frame
    safe_read(@buf, frame_payload_length)
  end

  # Skips the payload of the segment at the current position
  def skip_frame
    advance(frame_payload_length)
  end

  # Reads the segment length and returns the payload size. The stored
  # length includes its own two bytes, so anything below 2 is malformed.
  def frame_payload_length
    length = read_short - 2
    raise InvalidStructure if length < 0
    length
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,80 @@
1
# Reads the IHDR chunk of a PNG stream (and the chunk right after it, to
# detect APNG) and reports dimensions, color mode and animation info.
class FormatParser::PNGParser
  PNG_HEADER_BYTES = [137, 80, 78, 71, 13, 10, 26, 10].pack('C*').freeze
  COLOR_TYPES = {
    0 => :grayscale,
    2 => :rgb,
    3 => :indexed,
    4 => :grayscale, # with alpha
    6 => :rgba,
  }.freeze
  # NOTE(review): color type 0 is plain grayscale without an alpha channel,
  # yet it is flagged as transparent here - confirm this is intended.
  TRANSPARENCY_PER_COLOR_TYPE = {
    0 => true,
    4 => true, # Grayscale with alpha
    6 => true,
  }.freeze

  include FormatParser::IOUtils

  # Reads a chunk header from `io`.
  #
  # @return [Array] the chunk length (32-bit big-endian) and the 4-character chunk type
  def chunk_length_and_type(io)
    safe_read(io, 8).unpack("Na4")
  end

  # @param io an object that can be passed to `safe_read` (from FormatParser::IOUtils)
  # @return the value of FormatParser::FileInformation.image, or nil when
  #   the stream does not look like a valid PNG
  def information_from_io(io)
    magic_bytes = safe_read(io, PNG_HEADER_BYTES.bytesize)
    return unless magic_bytes == PNG_HEADER_BYTES

    chunk_length, chunk_type = chunk_length_and_type(io)

    # For later: look at gAMA and iCCP chunks too. For now,
    # all we care about is the IHDR chunk, and it must have the
    # correct length as well.
    # IHDR _must_ come first, no exceptions. If it doesn't
    # we should not consider this a valid PNG.
    return unless chunk_type == "IHDR" && chunk_length == 13

    chunk_data = safe_read(io, chunk_length)
    # Width: 4 bytes
    # Height: 4 bytes
    # Bit depth: 1 byte
    # Color type: 1 byte (0, 2, 3, 4, 6)
    # Compression method: 1 byte
    # Filter method: 1 byte
    # Interlace method: 1 byte
    w, h, _bit_depth, color_type = chunk_data.unpack("N2C5")

    # An unknown color type means the IHDR is malformed. Treat it like the
    # other validity checks above instead of letting a KeyError escape.
    color_mode = COLOR_TYPES[color_type]
    return unless color_mode

    has_transparency = TRANSPARENCY_PER_COLOR_TYPE[color_type]

    # Skip the 4-byte CRC that trails the IHDR chunk data
    safe_skip(io, 4)

    # These stay nil unless the animation control chunk is found below.
    # NOTE(review): an earlier comment here said has_animation has to default
    # to false because of dry-validation, but the code has always used nil -
    # confirm which one FileInformation expects.
    has_animation = nil
    num_frames = nil

    # Read the next chunk. If it turns out to be acTL (animation control)
    # we are dealing with an APNG.
    chunk_length, chunk_type = chunk_length_and_type(io)
    if chunk_length == 8 && chunk_type == 'acTL'
      # https://wiki.mozilla.org/APNG_Specification#.60acTL.60:_The_Animation_Control_Chunk
      # Unlike GIF, we do have the frame count that we can recover
      has_animation = true
      num_frames, _loop_n_times = safe_read(io, 8).unpack('NN')
    end

    FormatParser::FileInformation.image(
      file_type: :png,
      width_px: w,
      height_px: h,
      has_transparency: has_transparency,
      color_mode: color_mode,
      has_multiple_frames: has_animation,
      num_animation_or_video_frames: num_frames,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,21 @@
1
# Reads the fixed-size file header of a Photoshop document to recover
# its pixel dimensions.
class FormatParser::PSDParser
  PSD_HEADER = [0x38, 0x42, 0x50, 0x53].freeze # "8BPS"
  include FormatParser::IOUtils

  # @param io an object that can be passed to `safe_read` (from FormatParser::IOUtils)
  # @return the value of FormatParser::FileInformation.image, or nil when
  #   the first 4 bytes are not the PSD signature
  def information_from_io(io)
    magic_bytes = safe_read(io, 4).unpack("C4")

    return unless magic_bytes == PSD_HEADER

    # We can be reasonably certain this is a PSD so we grab the height
    # and width bytes. After the signature come the version (2 bytes),
    # a reserved area (6 bytes) and the channel count (2 bytes), which we
    # skip. The header then stores the HEIGHT first and the width second.
    h, w = safe_read(io, 22).unpack("x10N2")
    FormatParser::FileInformation.image(
      file_type: :psd,
      width_px: w,
      height_px: h,
    )
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,71 @@
1
# Reads the first Image File Directory of a TIFF stream to recover the
# image dimensions, and asks the EXIF parser for the orientation.
class FormatParser::TIFFParser
  LITTLE_ENDIAN_TIFF_HEADER_BYTES = [0x49, 0x49, 0x2A, 0x0].freeze
  BIG_ENDIAN_TIFF_HEADER_BYTES = [0x4D, 0x4D, 0x0, 0x2A].freeze
  WIDTH_TAG = 0x100
  HEIGHT_TAG = 0x101
  # IFD field type for 16-bit unsigned values
  SHORT_FIELD_TYPE = 3

  include FormatParser::IOUtils

  # @param io an object supporting `seek` that can be passed to `safe_read`
  #   (from FormatParser::IOUtils)
  # @return the value of FormatParser::FileInformation.image, or nil when
  #   the first 4 bytes are not a TIFF signature
  def information_from_io(io)
    magic_bytes = safe_read(io, 4).unpack("C4")
    endianness = scan_tiff_endianness(magic_bytes)
    return unless endianness

    w, h = read_tiff_by_endianness(io, endianness)
    scanner = FormatParser::EXIFParser.new(:tiff, io)
    scanner.scan_image_exif

    attributes = { file_type: :tif, width_px: w, height_px: h }
    # Only pass the orientation along when the EXIF scanner found one
    orientation = scanner.orientation
    attributes[:orientation] = orientation if orientation
    FormatParser::FileInformation.image(**attributes)
  end

  # TIFFs can be either big or little endian, so we check here
  # and set our unpack method argument to suit.
  #
  # @param magic_bytes [Array<Integer>] the first four bytes of the stream
  # @return [String, nil] "v" for little endian, "n" for big endian,
  #   nil when the bytes are not a TIFF signature
  def scan_tiff_endianness(magic_bytes)
    case magic_bytes
    when LITTLE_ENDIAN_TIFF_HEADER_BYTES then "v"
    when BIG_ENDIAN_TIFF_HEADER_BYTES then "n"
    end
  end

  # The TIFF format stores metadata in a flexible set of information fields
  # called tags, which are stored in a header referred to as the IFD or
  # Image File Directory. It is not necessarily in the same place in every image,
  # so we need to do some work to scan through it and find the tags we need.
  # For more information the TIFF wikipedia page is a reasonable place to start:
  # https://en.wikipedia.org/wiki/TIFF
  #
  # Expects `cache` to be positioned at `offset` (the start of the IFD) and
  # stores what it finds in @width and @height.
  def scan_ifd(cache, offset, endianness)
    # The IFD starts with a 16-bit entry count, followed by 12-byte entries
    entry_count = safe_read(cache, 2).unpack(endianness)[0]
    entry_count.times do |i|
      cache.seek(offset + 2 + (12 * i))
      # Each entry is: tag (2), field type (2), value count (4), value (4)
      tag, field_type = safe_read(cache, 4).unpack(endianness * 2)
      next unless tag == WIDTH_TAG || tag == HEIGHT_TAG

      _count_bytes, value_bytes = safe_read(cache, 8).unpack('a4a4')
      # Values shorter than 4 bytes are left-justified within the value
      # field, so a SHORT has to be decoded from its first two bytes.
      value_format = field_type == SHORT_FIELD_TYPE ? endianness : endianness.upcase
      value = value_bytes.unpack(value_format)[0]

      if tag == WIDTH_TAG
        @width = value
      else
        @height = value
      end
    end
  end

  # Reads the offset of the first IFD (expects `io` to be positioned right
  # after the 4 signature bytes), scans that IFD and returns the dimensions.
  #
  # @return [Array] width and height, each nil when its tag was not found
  def read_tiff_by_endianness(io, endianness)
    # Do not let values from a previous call leak into this result
    @width = nil
    @height = nil

    offset = safe_read(io, 4).unpack(endianness.upcase)[0]
    io.seek(offset)
    scan_ifd(io, offset, endianness)
    [@width, @height]
  end

  FormatParser.register_parser_constructor self
end
@@ -0,0 +1,39 @@
1
# Wraps an IO-like object and keeps count of the seeks, the reads and the
# number of bytes requested through it. Once a configured budget is
# overstepped the offending call raises BudgetExceeded instead of being
# forwarded to the wrapped object.
class FormatParser::ReadLimiter
  NO_LIMIT = nil

  # Raised when one of the configured budgets has been overstepped
  class BudgetExceeded < StandardError
  end

  # @param io the object that `seek` and `read` calls are forwarded to
  # @param max_bytes [Integer, nil] total number of bytes that may be requested
  # @param max_reads [Integer, nil] number of `read` calls that may be made
  # @param max_seeks [Integer, nil] number of `seek` calls that may be made
  def initialize(io, max_bytes: NO_LIMIT, max_reads: NO_LIMIT, max_seeks: NO_LIMIT)
    @io = io

    @max_bytes, @max_reads, @max_seeks = max_bytes, max_reads, max_seeks
    @seeks = @reads = @bytes = 0
  end

  # Counts the seek, then forwards it to the wrapped object.
  # Raises BudgetExceeded when the seek budget has been overstepped.
  def seek(to_offset)
    @seeks += 1
    if over_budget?(@seeks, @max_seeks)
      raise BudgetExceeded, format("Seek budget exceeded (%d seeks performed)", @max_seeks)
    end

    @io.seek(to_offset)
  end

  # Counts the read and the requested bytes, then forwards the read to the
  # wrapped object. Raises BudgetExceeded when either budget has been
  # overstepped; the byte budget is checked first.
  def read(n)
    @bytes += n
    @reads += 1

    if over_budget?(@bytes, @max_bytes)
      raise BudgetExceeded, format("Read bytes budget (%d) exceeded", @max_bytes)
    end
    if over_budget?(@reads, @max_reads)
      raise BudgetExceeded, format("Number of read() calls exceeded (%d max)", @max_reads)
    end

    @io.read(n)
  end

  private

  # True when a limit is set and the counter has gone past it
  def over_budget?(used, limit)
    limit && used > limit
  end
end
data/lib/remote_io.rb ADDED
@@ -0,0 +1,89 @@
1
# A minimal read-only, IO-like view of a remote resource that is fetched
# piecewise using HTTP Range requests.
class FormatParser::RemoteIO

  # Represents a failure that might be retried
  # (like a 5xx response or a timeout)
  class IntermittentFailure < StandardError
  end

  # Represents a failure that should not be retried
  # (like a 4xx response or a DNS resolution error)
  class InvalidRequest < StandardError
  end

  # @param uri[URI, String] the remote URL to obtain
  def initialize(uri)
    # Loaded lazily so that Faraday is only needed when remote IO is used
    require 'faraday'
    @uri = uri
    @pos = 0
    @remote_size = false
  end

  # Emulates IO#seek
  def seek(offset)
    @pos = offset
    0 # always return 0
  end

  # Emulates IO#size.
  #
  # @return [Fixnum] the size of the remote resource
  def size
    raise "Remote size not yet obtained, need to perform at least one read() to get it" unless @remote_size
    @remote_size
  end

  # Emulates IO#read, but requires the number of bytes to read.
  # The read will be limited to the
  # size of the remote resource relative to the current offset in the IO,
  # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
  # will only return you 10 bytes of result, and not raise any exceptions.
  # A successful read advances the current offset by the number of bytes
  # that were actually received.
  #
  # @param n_bytes[Fixnum] how many bytes to read
  # @return [String, nil] the read bytes, or nil when reading past the end
  def read(n_bytes)
    http_range = (@pos..(@pos + n_bytes - 1))
    maybe_size, maybe_body = request_range(http_range)
    # Reading past the end yields neither a size nor a body. Keep whatever
    # size we learned from an earlier read instead of overwriting it.
    return unless maybe_size && maybe_body

    @remote_size = maybe_size
    @pos += maybe_body.bytesize
    maybe_body.force_encoding(Encoding::BINARY)
  end

  protected

  # Only used internally when reading the remote file
  #
  # @param range[Range] the HTTP range of data to fetch from remote
  # @return [Array] the total size of the remote resource and the response
  #   body of the ranged request, or `[nil, nil]` when the range starts
  #   past the end of the resource
  def request_range(range)
    # We use a GET and not a HEAD request followed by a GET because
    # S3 does not allow HEAD requests if you only presigned your URL for GETs, so we
    # combine the first GET of a segment and retrieving the size of the resource
    response = Faraday.get(@uri, nil, range: "bytes=%d-%d" % [range.begin, range.end])

    case response.status
    when 200, 206
      # Figure out if the server supports content ranges, if it doesn't we have no
      # business working with that server
      range_header = response.headers['Content-Range']
      raise InvalidRequest, "No range support at #{@uri}" unless range_header

      # "Content-Range: bytes 0-0/307404381" is how the response header is structured
      size = range_header[/\/(\d+)$/, 1].to_i

      # S3 returns 200 when you request a Range that is fully satisfied by the entire object,
      # we take that into account here. For other servers, 206 is the expected response code.
      # Also, if we request a _larger_ range than what can be satisfied by the server,
      # the response is going to only contain what _can_ be sent and the status is also going
      # to be 206
      [size, response.body]
    when 416
      # We return `nil` as the body if we tried to read past the end of the IO,
      # which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
      # S3 will also handily _not_ supply us with the Content-Range of the actual resource
      [nil, nil]
    when 500..599
      raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
    else
      raise InvalidRequest, "Server at #{@uri} replied with a #{response.status} and refused our request"
    end
  end
end