format_parser 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/care.rb +1 -1
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/exif_parser.rb +4 -17
- data/lib/parsers/jpeg_parser.rb +63 -22
- data/lib/parsers/tiff_parser.rb +2 -2
- data/lib/parsers/zip_parser.rb +3 -1
- data/lib/parsers/zip_parser/file_reader.rb +19 -0
- data/spec/esoteric_formats_spec.rb +1 -1
- data/spec/parsers/exif_parser_spec.rb +2 -17
- data/spec/parsers/jpeg_parser_spec.rb +4 -5
- data/spec/parsers/zip_parser_spec.rb +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fbc5d4476d94ccbeb0f50c3bd0406cb57d1fc5fa640afbbf8057d874ecc128a3
|
4
|
+
data.tar.gz: fa4421fa509d0f0528beb4c4b19dc1a63cd66fbed29142ed33b9b2aeae9067c0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c3abee783f5b2c35da44231bbec2bfabd1ff25cafe45716694751f865e4cae6c71286dcf2053770f925391f8b3e0cef72daa351c43de03d0ed8b0900e4eabad
|
7
|
+
data.tar.gz: ab9ef11e37f5d2610ded657483a688876402cde506af5f830dd2f60fe2a8d432c182227a5c514050e79cfb059906fca2d384b366a6e64969ad7f5b5de81b8f43
|
data/README.md
CHANGED
@@ -185,3 +185,6 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
185
185
|
|
186
186
|
### .docx
|
187
187
|
- The .docx files were generated by the project maintainers
|
188
|
+
|
189
|
+
### .key
|
190
|
+
- The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
|
data/lib/care.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# is only available via HTTP, for example, we can have less
|
5
5
|
# fetches and have them return more data for one fetch
|
6
6
|
class Care
|
7
|
-
DEFAULT_PAGE_SIZE =
|
7
|
+
DEFAULT_PAGE_SIZE = 128 * 1024
|
8
8
|
|
9
9
|
class IOWrapper
|
10
10
|
def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
|
data/lib/parsers/exif_parser.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'exifr/jpeg'
|
2
1
|
require 'exifr/tiff'
|
3
2
|
require 'delegate'
|
4
3
|
|
@@ -48,22 +47,16 @@ class FormatParser::EXIFParser
|
|
48
47
|
:left_bottom
|
49
48
|
]
|
50
49
|
|
51
|
-
def initialize(
|
52
|
-
@
|
53
|
-
@file_io = IOExt.new(file_io)
|
50
|
+
def initialize(io_blob_with_exif_data)
|
51
|
+
@exif_io = IOExt.new(io_blob_with_exif_data)
|
54
52
|
@exif_data = nil
|
55
53
|
@orientation = nil
|
56
54
|
@height = nil
|
57
55
|
@width = nil
|
58
56
|
end
|
59
57
|
|
60
|
-
def
|
61
|
-
|
62
|
-
@file_io.seek(0)
|
63
|
-
raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
|
64
|
-
# Return if it's a CR2, which we don't parse yet
|
65
|
-
return if cr2_check(@file_io)
|
66
|
-
raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
|
58
|
+
def scan_image_tiff
|
59
|
+
raw_exif_data = EXIFR::TIFF.new(@exif_io)
|
67
60
|
# For things that we don't yet have a parser for
|
68
61
|
# we make the raw exif result available
|
69
62
|
@exif_data = raw_exif_data
|
@@ -80,10 +73,4 @@ class FormatParser::EXIFParser
|
|
80
73
|
def valid_orientation?(value)
|
81
74
|
(1..ORIENTATIONS.length).include?(value)
|
82
75
|
end
|
83
|
-
|
84
|
-
def cr2_check(_file_io)
|
85
|
-
@file_io.seek(8)
|
86
|
-
cr2_check_bytes = @file_io.read(2)
|
87
|
-
cr2_check_bytes == 'CR'
|
88
|
-
end
|
89
76
|
end
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -9,6 +9,7 @@ class FormatParser::JPEGParser
|
|
9
9
|
EOI_MARKER = 0xD9 # end of image
|
10
10
|
SOS_MARKER = 0xDA # start of stream
|
11
11
|
APP1_MARKER = 0xE1 # maybe EXIF
|
12
|
+
EXIF_MAGIC_STRING = "Exif\0\0".b
|
12
13
|
|
13
14
|
def call(io)
|
14
15
|
@buf = FormatParser::IOConstraint.new(io)
|
@@ -34,29 +35,43 @@ class FormatParser::JPEGParser
|
|
34
35
|
signature = read_next_marker
|
35
36
|
return unless signature == SOI_MARKER
|
36
37
|
|
38
|
+
markers_start_at = @buf.pos
|
39
|
+
|
40
|
+
# Keynote files start with a series of _perfectly_ valid
|
41
|
+
# JPEG markers, probably for icon previews or QuickLook.
|
42
|
+
# We have to detect those and reject them earlier. We can
|
43
|
+
# make use of our magic ZIP reader to get there.
|
44
|
+
return if probably_keynote_zip?
|
45
|
+
|
46
|
+
@buf.seek(markers_start_at)
|
47
|
+
|
37
48
|
while marker = read_next_marker
|
38
49
|
case marker
|
39
50
|
when *SOF_MARKERS
|
40
51
|
scan_start_of_frame
|
41
52
|
when EOI_MARKER, SOS_MARKER
|
53
|
+
# When we reach "End of image" or "Start of scan" markers
|
54
|
+
# we are transitioning into the image data that we don't need
|
55
|
+
# or we have reached EOF.
|
42
56
|
break
|
43
57
|
when APP1_MARKER
|
44
58
|
scan_app1_frame
|
45
59
|
else
|
46
60
|
skip_frame
|
47
61
|
end
|
62
|
+
end
|
48
63
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
end
|
64
|
+
# Return at the earliest possible opportunity
|
65
|
+
if @width && @height
|
66
|
+
return FormatParser::Image.new(
|
67
|
+
format: :jpg,
|
68
|
+
width_px: @width,
|
69
|
+
height_px: @height,
|
70
|
+
orientation: @orientation,
|
71
|
+
intrinsics: @intrinsics,
|
72
|
+
)
|
59
73
|
end
|
74
|
+
|
60
75
|
nil # We could not parse anything
|
61
76
|
rescue InvalidStructure
|
62
77
|
nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
|
@@ -86,19 +101,40 @@ class FormatParser::JPEGParser
|
|
86
101
|
end
|
87
102
|
|
88
103
|
def scan_app1_frame
|
89
|
-
frame
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
104
|
+
# Read the entire EXIF frame at once to not overload the number of reads. If we don't,
|
105
|
+
# EXIFR parses our file from the very beginning and does the same parsing we do, just
|
106
|
+
# the second time around. What we care about, rather, is the EXIF data only. So we will
|
107
|
+
# pry it out of the APP1 frame and parse it as the TIFF segment - which is what EXIFR
|
108
|
+
# does under the hood.
|
109
|
+
app1_frame_content_length = read_short - 2
|
110
|
+
app1_frame_bytes = safe_read(@buf, app1_frame_content_length)
|
111
|
+
|
112
|
+
maybe_exif_magic_str = app1_frame_bytes[0..5]
|
113
|
+
maybe_exif_data = app1_frame_bytes[6..-1]
|
114
|
+
if maybe_exif_magic_str == EXIF_MAGIC_STRING
|
115
|
+
scanner = FormatParser::EXIFParser.new(StringIO.new(maybe_exif_data))
|
116
|
+
scanner.scan_image_tiff
|
117
|
+
|
118
|
+
@exif_output = scanner.exif_data
|
119
|
+
@orientation = scanner.orientation unless scanner.orientation.nil?
|
120
|
+
@intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
|
121
|
+
@intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
|
122
|
+
# Save these two for later, when we decide to provide display width /
|
123
|
+
# display height in addition to pixel buffer width / height. These two
|
124
|
+
# are _different concepts_. Imagine you have an image shot with a camera
|
125
|
+
# in portrait orientation, and the camera has an anamorphic lens. That
|
126
|
+
# anamorphic lens is a smart lens, and therefore transmits pixel aspect
|
127
|
+
# ratio to the camera, and the camera encodes that aspect ratio into the
|
128
|
+
# image metadata. If we want to know what size our _pixel buffer_ will be,
|
129
|
+
# and how to _read_ the pixel data (stride/interleaving) - we need the
|
130
|
+
# pixel buffer dimensions. If we want to know what aspect and dimensions
|
131
|
+
# our file is going to have _once displayed_ and _once pixels have been
|
132
|
+
# brought to the right orientation_ we need to work with **display dimensions**
|
133
|
+
# which can be remarkably different from the pixel buffer dimensions.
|
134
|
+
@exif_width = scanner.width
|
135
|
+
@exif_height = scanner.height
|
100
136
|
end
|
101
|
-
rescue EXIFR::
|
137
|
+
rescue EXIFR::MalformedTIFF
|
102
138
|
# Not a JPEG or the Exif headers contain invalid data, or
|
103
139
|
# an APP1 marker was detected in a file that is not a JPEG
|
104
140
|
end
|
@@ -113,5 +149,10 @@ class FormatParser::JPEGParser
|
|
113
149
|
safe_skip(@buf, length)
|
114
150
|
end
|
115
151
|
|
152
|
+
def probably_keynote_zip?
|
153
|
+
reader = FormatParser::ZIPParser::FileReader.new
|
154
|
+
reader.zip?(@buf)
|
155
|
+
end
|
156
|
+
|
116
157
|
FormatParser.register_parser self, natures: :image, formats: :jpg
|
117
158
|
end
|
data/lib/parsers/tiff_parser.rb
CHANGED
@@ -13,8 +13,8 @@ class FormatParser::TIFFParser
|
|
13
13
|
return if !endianness || cr2_check(io)
|
14
14
|
|
15
15
|
w, h = read_tiff_by_endianness(io, endianness)
|
16
|
-
scanner = FormatParser::EXIFParser.new(
|
17
|
-
scanner.
|
16
|
+
scanner = FormatParser::EXIFParser.new(io)
|
17
|
+
scanner.scan_image_tiff
|
18
18
|
FormatParser::Image.new(
|
19
19
|
format: :tif,
|
20
20
|
width_px: w,
|
data/lib/parsers/zip_parser.rb
CHANGED
@@ -6,8 +6,10 @@ class FormatParser::ZIPParser
|
|
6
6
|
include OfficeFormats
|
7
7
|
|
8
8
|
def call(io)
|
9
|
+
io = FormatParser::IOConstraint.new(io)
|
10
|
+
|
9
11
|
reader = FileReader.new
|
10
|
-
entries = reader.read_zip_structure(io:
|
12
|
+
entries = reader.read_zip_structure(io: io)
|
11
13
|
|
12
14
|
filenames_set = Set.new
|
13
15
|
entries_archive = entries.map do |ze|
|
@@ -195,6 +195,25 @@ class FormatParser::ZIPParser::FileReader
|
|
195
195
|
entries
|
196
196
|
end
|
197
197
|
|
198
|
+
# Tells whether the given IO is likely to be a ZIP file without
|
199
|
+
# performing too many detailed reads
|
200
|
+
#
|
201
|
+
# @param io[#tell, #seek, #read, #size] an IO-ish object
|
202
|
+
# @return [Boolean]
|
203
|
+
def zip?(io)
|
204
|
+
zip_file_size = io.size
|
205
|
+
eocd_offset = get_eocd_offset(io, zip_file_size)
|
206
|
+
zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
|
207
|
+
if zip64_end_of_cdir_location
|
208
|
+
num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
|
209
|
+
else
|
210
|
+
num_files_and_central_directory_offset(io, eocd_offset)
|
211
|
+
end
|
212
|
+
true
|
213
|
+
rescue Error
|
214
|
+
false
|
215
|
+
end
|
216
|
+
|
198
217
|
private
|
199
218
|
|
200
219
|
def skip_ahead_2(io)
|
@@ -11,6 +11,6 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
|
|
11
11
|
it 'does not return a result for a Keynote file when it mistakes it for a JPEG, and does not raise any errors' do
|
12
12
|
jpeg_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
|
13
13
|
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
14
|
-
expect(result).to
|
14
|
+
expect(result.nature).to eq(:archive)
|
15
15
|
end
|
16
16
|
end
|
@@ -1,27 +1,12 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe FormatParser::EXIFParser do
|
4
|
-
describe 'is able to correctly parse orientation for all the JPEG EXIF examples from FastImage' do
|
5
|
-
Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
|
6
|
-
filename = File.basename(jpeg_path)
|
7
|
-
it "is able to parse #{filename}" do
|
8
|
-
parser = FormatParser::EXIFParser.new(:jpeg, File.open(jpeg_path, 'rb'))
|
9
|
-
parser.scan_image_exif
|
10
|
-
expect(parser).not_to be_nil
|
11
|
-
|
12
|
-
expect(parser.orientation).to be_kind_of(Symbol)
|
13
|
-
# Filenames in this dir correspond with the orientation of the file
|
14
|
-
expect(filename.include?(parser.orientation.to_s)).to be true
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
4
|
describe 'is able to correctly parse orientation for all the TIFF EXIF examples from FastImage' do
|
20
5
|
Dir.glob(fixtures_dir + '/exif-orientation-testimages/tiff-*/*.tif').each do |tiff_path|
|
21
6
|
filename = File.basename(tiff_path)
|
22
7
|
it "is able to parse #{filename}" do
|
23
|
-
parser = FormatParser::EXIFParser.new(
|
24
|
-
parser.
|
8
|
+
parser = FormatParser::EXIFParser.new(File.open(tiff_path, 'rb'))
|
9
|
+
parser.scan_image_tiff
|
25
10
|
expect(parser).not_to be_nil
|
26
11
|
|
27
12
|
expect(parser.orientation).to be_kind_of(Symbol)
|
@@ -49,10 +49,9 @@ describe FormatParser::JPEGParser do
|
|
49
49
|
expect(result.intrinsics).to eq(exif_pixel_x_dimension: 8214, exif_pixel_y_dimension: 5476)
|
50
50
|
end
|
51
51
|
|
52
|
-
it '
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
52
|
+
it 'does not return a result for a Keynote document' do
|
53
|
+
key_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
|
54
|
+
result = subject.call(File.open(key_path, 'rb'))
|
55
|
+
expect(result).to be_nil
|
57
56
|
end
|
58
57
|
end
|
@@ -99,4 +99,18 @@ describe FormatParser::ZIPParser do
|
|
99
99
|
expect(first_entry.filename).to eq('Li��nia Extreme//')
|
100
100
|
expect(first_entry.type).to eq(:directory)
|
101
101
|
end
|
102
|
+
|
103
|
+
describe 'FileReader#zip?' do
|
104
|
+
it 'correctly detects all the ZIP files as such' do
|
105
|
+
reader = described_class::FileReader.new
|
106
|
+
Dir.glob(fixtures_dir + '/ZIP/*.zip').each do |path|
|
107
|
+
expect(reader).to be_zip(File.open(path, 'rb'))
|
108
|
+
end
|
109
|
+
|
110
|
+
4.times do
|
111
|
+
blob = Random.new.bytes(1024)
|
112
|
+
expect(reader).not_to be_zip(StringIO.new(blob))
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
102
116
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-04-
|
12
|
+
date: 2018-04-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|