format_parser 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/care.rb +1 -1
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/exif_parser.rb +4 -17
- data/lib/parsers/jpeg_parser.rb +63 -22
- data/lib/parsers/tiff_parser.rb +2 -2
- data/lib/parsers/zip_parser.rb +3 -1
- data/lib/parsers/zip_parser/file_reader.rb +19 -0
- data/spec/esoteric_formats_spec.rb +1 -1
- data/spec/parsers/exif_parser_spec.rb +2 -17
- data/spec/parsers/jpeg_parser_spec.rb +4 -5
- data/spec/parsers/zip_parser_spec.rb +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fbc5d4476d94ccbeb0f50c3bd0406cb57d1fc5fa640afbbf8057d874ecc128a3
|
4
|
+
data.tar.gz: fa4421fa509d0f0528beb4c4b19dc1a63cd66fbed29142ed33b9b2aeae9067c0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c3abee783f5b2c35da44231bbec2bfabd1ff25cafe45716694751f865e4cae6c71286dcf2053770f925391f8b3e0cef72daa351c43de03d0ed8b0900e4eabad
|
7
|
+
data.tar.gz: ab9ef11e37f5d2610ded657483a688876402cde506af5f830dd2f60fe2a8d432c182227a5c514050e79cfb059906fca2d384b366a6e64969ad7f5b5de81b8f43
|
data/README.md
CHANGED
@@ -185,3 +185,6 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
185
185
|
|
186
186
|
### .docx
|
187
187
|
- The .docx files were generated by the project maintainers
|
188
|
+
|
189
|
+
### .key
|
190
|
+
- The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
|
data/lib/care.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# is only available via HTTP, for example, we can have less
|
5
5
|
# fetches and have them return more data for one fetch
|
6
6
|
class Care
|
7
|
-
DEFAULT_PAGE_SIZE =
|
7
|
+
DEFAULT_PAGE_SIZE = 128 * 1024
|
8
8
|
|
9
9
|
class IOWrapper
|
10
10
|
def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
|
data/lib/parsers/exif_parser.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'exifr/jpeg'
|
2
1
|
require 'exifr/tiff'
|
3
2
|
require 'delegate'
|
4
3
|
|
@@ -48,22 +47,16 @@ class FormatParser::EXIFParser
|
|
48
47
|
:left_bottom
|
49
48
|
]
|
50
49
|
|
51
|
-
def initialize(
|
52
|
-
@
|
53
|
-
@file_io = IOExt.new(file_io)
|
50
|
+
def initialize(io_blob_with_exif_data)
|
51
|
+
@exif_io = IOExt.new(io_blob_with_exif_data)
|
54
52
|
@exif_data = nil
|
55
53
|
@orientation = nil
|
56
54
|
@height = nil
|
57
55
|
@width = nil
|
58
56
|
end
|
59
57
|
|
60
|
-
def
|
61
|
-
|
62
|
-
@file_io.seek(0)
|
63
|
-
raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
|
64
|
-
# Return if it's a CR2, which we don't parse yet
|
65
|
-
return if cr2_check(@file_io)
|
66
|
-
raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
|
58
|
+
def scan_image_tiff
|
59
|
+
raw_exif_data = EXIFR::TIFF.new(@exif_io)
|
67
60
|
# For things that we don't yet have a parser for
|
68
61
|
# we make the raw exif result available
|
69
62
|
@exif_data = raw_exif_data
|
@@ -80,10 +73,4 @@ class FormatParser::EXIFParser
|
|
80
73
|
def valid_orientation?(value)
|
81
74
|
(1..ORIENTATIONS.length).include?(value)
|
82
75
|
end
|
83
|
-
|
84
|
-
def cr2_check(_file_io)
|
85
|
-
@file_io.seek(8)
|
86
|
-
cr2_check_bytes = @file_io.read(2)
|
87
|
-
cr2_check_bytes == 'CR'
|
88
|
-
end
|
89
76
|
end
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -9,6 +9,7 @@ class FormatParser::JPEGParser
|
|
9
9
|
EOI_MARKER = 0xD9 # end of image
|
10
10
|
SOS_MARKER = 0xDA # start of stream
|
11
11
|
APP1_MARKER = 0xE1 # maybe EXIF
|
12
|
+
EXIF_MAGIC_STRING = "Exif\0\0".b
|
12
13
|
|
13
14
|
def call(io)
|
14
15
|
@buf = FormatParser::IOConstraint.new(io)
|
@@ -34,29 +35,43 @@ class FormatParser::JPEGParser
|
|
34
35
|
signature = read_next_marker
|
35
36
|
return unless signature == SOI_MARKER
|
36
37
|
|
38
|
+
markers_start_at = @buf.pos
|
39
|
+
|
40
|
+
# Keynote files start with a series of _perfectly_ valid
|
41
|
+
# JPEG markers, probably for icon previews or QuickLook.
|
42
|
+
# We have to detect those and reject them earlier. We can
|
43
|
+
# make use of our magic ZIP reader to get there.
|
44
|
+
return if probably_keynote_zip?
|
45
|
+
|
46
|
+
@buf.seek(markers_start_at)
|
47
|
+
|
37
48
|
while marker = read_next_marker
|
38
49
|
case marker
|
39
50
|
when *SOF_MARKERS
|
40
51
|
scan_start_of_frame
|
41
52
|
when EOI_MARKER, SOS_MARKER
|
53
|
+
# When we reach "End of image" or "Start of scan" markers
|
54
|
+
# we are transitioning into the image data that we don't need
|
55
|
+
# or we have reached EOF.
|
42
56
|
break
|
43
57
|
when APP1_MARKER
|
44
58
|
scan_app1_frame
|
45
59
|
else
|
46
60
|
skip_frame
|
47
61
|
end
|
62
|
+
end
|
48
63
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
end
|
64
|
+
# Return at the earliest possible opportunity
|
65
|
+
if @width && @height
|
66
|
+
return FormatParser::Image.new(
|
67
|
+
format: :jpg,
|
68
|
+
width_px: @width,
|
69
|
+
height_px: @height,
|
70
|
+
orientation: @orientation,
|
71
|
+
intrinsics: @intrinsics,
|
72
|
+
)
|
59
73
|
end
|
74
|
+
|
60
75
|
nil # We could not parse anything
|
61
76
|
rescue InvalidStructure
|
62
77
|
nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
|
@@ -86,19 +101,40 @@ class FormatParser::JPEGParser
|
|
86
101
|
end
|
87
102
|
|
88
103
|
def scan_app1_frame
|
89
|
-
frame
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
104
|
+
# Read the entire EXIF frame at once to not overload the number of reads. If we don't,
|
105
|
+
# EXIFR parses our file from the very beginning and does the same parsing we do, just
|
106
|
+
# the second time around. What we care about, rather, is the EXIF data only. So we will
|
107
|
+
# pry it out of the APP1 frame and parse it as the TIFF segment - which is what EXIFR
|
108
|
+
# does under the hood.
|
109
|
+
app1_frame_content_length = read_short - 2
|
110
|
+
app1_frame_bytes = safe_read(@buf, app1_frame_content_length)
|
111
|
+
|
112
|
+
maybe_exif_magic_str = app1_frame_bytes[0..5]
|
113
|
+
maybe_exif_data = app1_frame_bytes[6..-1]
|
114
|
+
if maybe_exif_magic_str == EXIF_MAGIC_STRING
|
115
|
+
scanner = FormatParser::EXIFParser.new(StringIO.new(maybe_exif_data))
|
116
|
+
scanner.scan_image_tiff
|
117
|
+
|
118
|
+
@exif_output = scanner.exif_data
|
119
|
+
@orientation = scanner.orientation unless scanner.orientation.nil?
|
120
|
+
@intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
|
121
|
+
@intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
|
122
|
+
# Save these two for later, when we decide to provide display width /
|
123
|
+
# display height in addition to pixel buffer width / height. These two
|
124
|
+
# are _different concepts_. Imagine you have an image shot with a camera
|
125
|
+
# in portrait orientation, and the camera has an anamorphic lens. That
|
126
|
+
# anamorpohic lens is a smart lens, and therefore transmits pixel aspect
|
127
|
+
# ratio to the camera, and the camera encodes that aspect ratio into the
|
128
|
+
# image metadata. If we want to know what size our _pixel buffer_ will be,
|
129
|
+
# and how to _read_ the pixel data (stride/interleaving) - we need the
|
130
|
+
# pixel buffer dimensions. If we want to know what aspect and dimensions
|
131
|
+
# our file is going to have _once displayed_ and _once pixels have been
|
132
|
+
# brought to the right orientation_ we need to work with **display dimensions**
|
133
|
+
# which can be remarkably different from the pixel buffer dimensions.
|
134
|
+
@exif_width = scanner.width
|
135
|
+
@exif_height = scanner.height
|
100
136
|
end
|
101
|
-
rescue EXIFR::
|
137
|
+
rescue EXIFR::MalformedTIFF
|
102
138
|
# Not a JPEG or the Exif headers contain invalid data, or
|
103
139
|
# an APP1 marker was detected in a file that is not a JPEG
|
104
140
|
end
|
@@ -113,5 +149,10 @@ class FormatParser::JPEGParser
|
|
113
149
|
safe_skip(@buf, length)
|
114
150
|
end
|
115
151
|
|
152
|
+
def probably_keynote_zip?
|
153
|
+
reader = FormatParser::ZIPParser::FileReader.new
|
154
|
+
reader.zip?(@buf)
|
155
|
+
end
|
156
|
+
|
116
157
|
FormatParser.register_parser self, natures: :image, formats: :jpg
|
117
158
|
end
|
data/lib/parsers/tiff_parser.rb
CHANGED
@@ -13,8 +13,8 @@ class FormatParser::TIFFParser
|
|
13
13
|
return if !endianness || cr2_check(io)
|
14
14
|
|
15
15
|
w, h = read_tiff_by_endianness(io, endianness)
|
16
|
-
scanner = FormatParser::EXIFParser.new(
|
17
|
-
scanner.
|
16
|
+
scanner = FormatParser::EXIFParser.new(io)
|
17
|
+
scanner.scan_image_tiff
|
18
18
|
FormatParser::Image.new(
|
19
19
|
format: :tif,
|
20
20
|
width_px: w,
|
data/lib/parsers/zip_parser.rb
CHANGED
@@ -6,8 +6,10 @@ class FormatParser::ZIPParser
|
|
6
6
|
include OfficeFormats
|
7
7
|
|
8
8
|
def call(io)
|
9
|
+
io = FormatParser::IOConstraint.new(io)
|
10
|
+
|
9
11
|
reader = FileReader.new
|
10
|
-
entries = reader.read_zip_structure(io:
|
12
|
+
entries = reader.read_zip_structure(io: io)
|
11
13
|
|
12
14
|
filenames_set = Set.new
|
13
15
|
entries_archive = entries.map do |ze|
|
@@ -195,6 +195,25 @@ class FormatParser::ZIPParser::FileReader
|
|
195
195
|
entries
|
196
196
|
end
|
197
197
|
|
198
|
+
# Tells whether the given IO is likely to be a ZIP file without
|
199
|
+
# performing too many detailed reads
|
200
|
+
#
|
201
|
+
# @param io[#tell, #seek, #read, #size] an IO-ish object
|
202
|
+
# @return [Boolean]
|
203
|
+
def zip?(io)
|
204
|
+
zip_file_size = io.size
|
205
|
+
eocd_offset = get_eocd_offset(io, zip_file_size)
|
206
|
+
zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
|
207
|
+
if zip64_end_of_cdir_location
|
208
|
+
num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
|
209
|
+
else
|
210
|
+
num_files_and_central_directory_offset(io, eocd_offset)
|
211
|
+
end
|
212
|
+
true
|
213
|
+
rescue Error
|
214
|
+
false
|
215
|
+
end
|
216
|
+
|
198
217
|
private
|
199
218
|
|
200
219
|
def skip_ahead_2(io)
|
@@ -11,6 +11,6 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
|
|
11
11
|
it 'does not return a result for a Keynote file when it mistakes it for a JPEG, and does not raise any errors' do
|
12
12
|
jpeg_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
|
13
13
|
result = FormatParser.parse(File.open(jpeg_path, 'rb'))
|
14
|
-
expect(result).to
|
14
|
+
expect(result.nature).to eq(:archive)
|
15
15
|
end
|
16
16
|
end
|
@@ -1,27 +1,12 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe FormatParser::EXIFParser do
|
4
|
-
describe 'is able to correctly parse orientation for all the JPEG EXIF examples from FastImage' do
|
5
|
-
Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
|
6
|
-
filename = File.basename(jpeg_path)
|
7
|
-
it "is able to parse #{filename}" do
|
8
|
-
parser = FormatParser::EXIFParser.new(:jpeg, File.open(jpeg_path, 'rb'))
|
9
|
-
parser.scan_image_exif
|
10
|
-
expect(parser).not_to be_nil
|
11
|
-
|
12
|
-
expect(parser.orientation).to be_kind_of(Symbol)
|
13
|
-
# Filenames in this dir correspond with the orientation of the file
|
14
|
-
expect(filename.include?(parser.orientation.to_s)).to be true
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
4
|
describe 'is able to correctly parse orientation for all the TIFF EXIF examples from FastImage' do
|
20
5
|
Dir.glob(fixtures_dir + '/exif-orientation-testimages/tiff-*/*.tif').each do |tiff_path|
|
21
6
|
filename = File.basename(tiff_path)
|
22
7
|
it "is able to parse #{filename}" do
|
23
|
-
parser = FormatParser::EXIFParser.new(
|
24
|
-
parser.
|
8
|
+
parser = FormatParser::EXIFParser.new(File.open(tiff_path, 'rb'))
|
9
|
+
parser.scan_image_tiff
|
25
10
|
expect(parser).not_to be_nil
|
26
11
|
|
27
12
|
expect(parser.orientation).to be_kind_of(Symbol)
|
@@ -49,10 +49,9 @@ describe FormatParser::JPEGParser do
|
|
49
49
|
expect(result.intrinsics).to eq(exif_pixel_x_dimension: 8214, exif_pixel_y_dimension: 5476)
|
50
50
|
end
|
51
51
|
|
52
|
-
it '
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
52
|
+
it 'does not return a result for a Keynote document' do
|
53
|
+
key_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
|
54
|
+
result = subject.call(File.open(key_path, 'rb'))
|
55
|
+
expect(result).to be_nil
|
57
56
|
end
|
58
57
|
end
|
@@ -99,4 +99,18 @@ describe FormatParser::ZIPParser do
|
|
99
99
|
expect(first_entry.filename).to eq('Li��nia Extreme//')
|
100
100
|
expect(first_entry.type).to eq(:directory)
|
101
101
|
end
|
102
|
+
|
103
|
+
describe 'FileReader#zip?' do
|
104
|
+
it 'correctly detects all the ZIP files as such' do
|
105
|
+
reader = described_class::FileReader.new
|
106
|
+
Dir.glob(fixtures_dir + '/ZIP/*.zip').each do |path|
|
107
|
+
expect(reader).to be_zip(File.open(path, 'rb'))
|
108
|
+
end
|
109
|
+
|
110
|
+
4.times do
|
111
|
+
blob = Random.new.bytes(1024)
|
112
|
+
expect(reader).not_to be_zip(StringIO.new(blob))
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
102
116
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-04-
|
12
|
+
date: 2018-04-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|