format_parser 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 90c91543dad4ef03fec136ae3b8fd3385ee518e5bbbebbba867a75142a1780f0
4
- data.tar.gz: 5e7d286eefc1170391b6ff896d20e24f39682d2e5e4dc8a7c0b4c67df82eac92
3
+ metadata.gz: fbc5d4476d94ccbeb0f50c3bd0406cb57d1fc5fa640afbbf8057d874ecc128a3
4
+ data.tar.gz: fa4421fa509d0f0528beb4c4b19dc1a63cd66fbed29142ed33b9b2aeae9067c0
5
5
  SHA512:
6
- metadata.gz: 864c0a272275c3d877ecf6fac7b2d7a080876074ee8e7b5858270da1418737cca3b8abc7290649a41f662209c2cf953f714334dc1adb6dabd68187fa3e2fcd25
7
- data.tar.gz: cb04914d011588f6b53976fa5123f13b6ef9f5eb10c40bae1ea4e7c13ba51f382f10721d865164a4f8460554cb28881db7c8f4ff2217d4b1d0a115dbe5fbdf5e
6
+ metadata.gz: 6c3abee783f5b2c35da44231bbec2bfabd1ff25cafe45716694751f865e4cae6c71286dcf2053770f925391f8b3e0cef72daa351c43de03d0ed8b0900e4eabad
7
+ data.tar.gz: ab9ef11e37f5d2610ded657483a688876402cde506af5f830dd2f60fe2a8d432c182227a5c514050e79cfb059906fca2d384b366a6e64969ad7f5b5de81b8f43
data/README.md CHANGED
@@ -185,3 +185,6 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
185
185
 
186
186
  ### .docx
187
187
  - The .docx files were generated by the project maintainers
188
+
189
+ ### .key
190
+ - The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
data/lib/care.rb CHANGED
@@ -4,7 +4,7 @@
4
4
  # is only available via HTTP, for example, we can have less
5
5
  # fetches and have them return more data for one fetch
6
6
  class Care
7
- DEFAULT_PAGE_SIZE = 64 * 1024
7
+ DEFAULT_PAGE_SIZE = 128 * 1024
8
8
 
9
9
  class IOWrapper
10
10
  def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.5.2'
2
+ VERSION = '0.6.0'
3
3
  end
@@ -1,4 +1,3 @@
1
- require 'exifr/jpeg'
2
1
  require 'exifr/tiff'
3
2
  require 'delegate'
4
3
 
@@ -48,22 +47,16 @@ class FormatParser::EXIFParser
48
47
  :left_bottom
49
48
  ]
50
49
 
51
- def initialize(filetype, file_io)
52
- @filetype = filetype
53
- @file_io = IOExt.new(file_io)
50
+ def initialize(io_blob_with_exif_data)
51
+ @exif_io = IOExt.new(io_blob_with_exif_data)
54
52
  @exif_data = nil
55
53
  @orientation = nil
56
54
  @height = nil
57
55
  @width = nil
58
56
  end
59
57
 
60
- def scan_image_exif
61
- # Without the magic bytes EXIFR throws an error
62
- @file_io.seek(0)
63
- raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
64
- # Return if it's a CR2, which we don't parse yet
65
- return if cr2_check(@file_io)
66
- raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
58
+ def scan_image_tiff
59
+ raw_exif_data = EXIFR::TIFF.new(@exif_io)
67
60
  # For things that we don't yet have a parser for
68
61
  # we make the raw exif result available
69
62
  @exif_data = raw_exif_data
@@ -80,10 +73,4 @@ class FormatParser::EXIFParser
80
73
  def valid_orientation?(value)
81
74
  (1..ORIENTATIONS.length).include?(value)
82
75
  end
83
-
84
- def cr2_check(_file_io)
85
- @file_io.seek(8)
86
- cr2_check_bytes = @file_io.read(2)
87
- cr2_check_bytes == 'CR'
88
- end
89
76
  end
@@ -9,6 +9,7 @@ class FormatParser::JPEGParser
9
9
  EOI_MARKER = 0xD9 # end of image
10
10
  SOS_MARKER = 0xDA # start of stream
11
11
  APP1_MARKER = 0xE1 # maybe EXIF
12
+ EXIF_MAGIC_STRING = "Exif\0\0".b
12
13
 
13
14
  def call(io)
14
15
  @buf = FormatParser::IOConstraint.new(io)
@@ -34,29 +35,43 @@ class FormatParser::JPEGParser
34
35
  signature = read_next_marker
35
36
  return unless signature == SOI_MARKER
36
37
 
38
+ markers_start_at = @buf.pos
39
+
40
+ # Keynote files start with a series of _perfectly_ valid
41
+ # JPEG markers, probably for icon previews or QuickLook.
42
+ # We have to detect those and reject them earlier. We can
43
+ # make use of our magic ZIP reader to get there.
44
+ return if probably_keynote_zip?
45
+
46
+ @buf.seek(markers_start_at)
47
+
37
48
  while marker = read_next_marker
38
49
  case marker
39
50
  when *SOF_MARKERS
40
51
  scan_start_of_frame
41
52
  when EOI_MARKER, SOS_MARKER
53
+ # When we reach "End of image" or "Start of scan" markers
54
+ # we are transitioning into the image data that we don't need
55
+ # or we have reached EOF.
42
56
  break
43
57
  when APP1_MARKER
44
58
  scan_app1_frame
45
59
  else
46
60
  skip_frame
47
61
  end
62
+ end
48
63
 
49
- # Return at the earliest possible opportunity
50
- if @width && @height
51
- return FormatParser::Image.new(
52
- format: :jpg,
53
- width_px: @width,
54
- height_px: @height,
55
- orientation: @orientation,
56
- intrinsics: @intrinsics,
57
- )
58
- end
64
+ # Return at the earliest possible opportunity
65
+ if @width && @height
66
+ return FormatParser::Image.new(
67
+ format: :jpg,
68
+ width_px: @width,
69
+ height_px: @height,
70
+ orientation: @orientation,
71
+ intrinsics: @intrinsics,
72
+ )
59
73
  end
74
+
60
75
  nil # We could not parse anything
61
76
  rescue InvalidStructure
62
77
  nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
@@ -86,19 +101,40 @@ class FormatParser::JPEGParser
86
101
  end
87
102
 
88
103
  def scan_app1_frame
89
- frame = @buf.read(8)
90
- if frame.include?('Exif')
91
- scanner = FormatParser::EXIFParser.new(:jpeg, @buf)
92
- if scanner.scan_image_exif
93
- @exif_output = scanner.exif_data
94
- @orientation = scanner.orientation unless scanner.orientation.nil?
95
- @intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
96
- @intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
97
- @width = scanner.width
98
- @height = scanner.height
99
- end
104
+ # Read the entire EXIF frame at once to not overload the number of reads. If we don't,
105
+ # EXIFR parses our file from the very beginning and does the same parsing we do, just
106
+ # the second time around. What we care about, rather, is the EXIF data only. So we will
107
+ # pry it out of the APP1 frame and parse it as the TIFF segment - which is what EXIFR
108
+ # does under the hood.
109
+ app1_frame_content_length = read_short - 2
110
+ app1_frame_bytes = safe_read(@buf, app1_frame_content_length)
111
+
112
+ maybe_exif_magic_str = app1_frame_bytes[0..5]
113
+ maybe_exif_data = app1_frame_bytes[6..-1]
114
+ if maybe_exif_magic_str == EXIF_MAGIC_STRING
115
+ scanner = FormatParser::EXIFParser.new(StringIO.new(maybe_exif_data))
116
+ scanner.scan_image_tiff
117
+
118
+ @exif_output = scanner.exif_data
119
+ @orientation = scanner.orientation unless scanner.orientation.nil?
120
+ @intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
121
+ @intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
122
+ # Save these two for later, when we decide to provide display width /
123
+ # display height in addition to pixel buffer width / height. These two
124
+ # are _different concepts_. Imagine you have an image shot with a camera
125
+ # in portrait orientation, and the camera has an anamorphic lens. That
126
+ # anamorphic lens is a smart lens, and therefore transmits pixel aspect
127
+ # ratio to the camera, and the camera encodes that aspect ratio into the
128
+ # image metadata. If we want to know what size our _pixel buffer_ will be,
129
+ # and how to _read_ the pixel data (stride/interleaving) - we need the
130
+ # pixel buffer dimensions. If we want to know what aspect and dimensions
131
+ # our file is going to have _once displayed_ and _once pixels have been
132
+ # brought to the right orientation_ we need to work with **display dimensions**
133
+ # which can be remarkably different from the pixel buffer dimensions.
134
+ @exif_width = scanner.width
135
+ @exif_height = scanner.height
100
136
  end
101
- rescue EXIFR::MalformedJPEG
137
+ rescue EXIFR::MalformedTIFF
102
138
  # Not a JPEG or the Exif headers contain invalid data, or
103
139
  # an APP1 marker was detected in a file that is not a JPEG
104
140
  end
@@ -113,5 +149,10 @@ class FormatParser::JPEGParser
113
149
  safe_skip(@buf, length)
114
150
  end
115
151
 
152
+ def probably_keynote_zip?
153
+ reader = FormatParser::ZIPParser::FileReader.new
154
+ reader.zip?(@buf)
155
+ end
156
+
116
157
  FormatParser.register_parser self, natures: :image, formats: :jpg
117
158
  end
@@ -13,8 +13,8 @@ class FormatParser::TIFFParser
13
13
  return if !endianness || cr2_check(io)
14
14
 
15
15
  w, h = read_tiff_by_endianness(io, endianness)
16
- scanner = FormatParser::EXIFParser.new(:tiff, io)
17
- scanner.scan_image_exif
16
+ scanner = FormatParser::EXIFParser.new(io)
17
+ scanner.scan_image_tiff
18
18
  FormatParser::Image.new(
19
19
  format: :tif,
20
20
  width_px: w,
@@ -6,8 +6,10 @@ class FormatParser::ZIPParser
6
6
  include OfficeFormats
7
7
 
8
8
  def call(io)
9
+ io = FormatParser::IOConstraint.new(io)
10
+
9
11
  reader = FileReader.new
10
- entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
12
+ entries = reader.read_zip_structure(io: io)
11
13
 
12
14
  filenames_set = Set.new
13
15
  entries_archive = entries.map do |ze|
@@ -195,6 +195,25 @@ class FormatParser::ZIPParser::FileReader
195
195
  entries
196
196
  end
197
197
 
198
+ # Tells whether the given IO is likely to be a ZIP file without
199
+ # performing too many detailed reads
200
+ #
201
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
202
+ # @return [Boolean]
203
+ def zip?(io)
204
+ zip_file_size = io.size
205
+ eocd_offset = get_eocd_offset(io, zip_file_size)
206
+ zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
207
+ if zip64_end_of_cdir_location
208
+ num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
209
+ else
210
+ num_files_and_central_directory_offset(io, eocd_offset)
211
+ end
212
+ true
213
+ rescue Error
214
+ false
215
+ end
216
+
198
217
  private
199
218
 
200
219
  def skip_ahead_2(io)
@@ -11,6 +11,6 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
11
11
  it 'does not return a result for a Keynote file when it mistakes it for a JPEG, and does not raise any errors' do
12
12
  jpeg_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
13
13
  result = FormatParser.parse(File.open(jpeg_path, 'rb'))
14
- expect(result).to be_nil
14
+ expect(result.nature).to eq(:archive)
15
15
  end
16
16
  end
@@ -1,27 +1,12 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe FormatParser::EXIFParser do
4
- describe 'is able to correctly parse orientation for all the JPEG EXIF examples from FastImage' do
5
- Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
6
- filename = File.basename(jpeg_path)
7
- it "is able to parse #{filename}" do
8
- parser = FormatParser::EXIFParser.new(:jpeg, File.open(jpeg_path, 'rb'))
9
- parser.scan_image_exif
10
- expect(parser).not_to be_nil
11
-
12
- expect(parser.orientation).to be_kind_of(Symbol)
13
- # Filenames in this dir correspond with the orientation of the file
14
- expect(filename.include?(parser.orientation.to_s)).to be true
15
- end
16
- end
17
- end
18
-
19
4
  describe 'is able to correctly parse orientation for all the TIFF EXIF examples from FastImage' do
20
5
  Dir.glob(fixtures_dir + '/exif-orientation-testimages/tiff-*/*.tif').each do |tiff_path|
21
6
  filename = File.basename(tiff_path)
22
7
  it "is able to parse #{filename}" do
23
- parser = FormatParser::EXIFParser.new(:tiff, File.open(tiff_path, 'rb'))
24
- parser.scan_image_exif
8
+ parser = FormatParser::EXIFParser.new(File.open(tiff_path, 'rb'))
9
+ parser.scan_image_tiff
25
10
  expect(parser).not_to be_nil
26
11
 
27
12
  expect(parser.orientation).to be_kind_of(Symbol)
@@ -49,10 +49,9 @@ describe FormatParser::JPEGParser do
49
49
  expect(result.intrinsics).to eq(exif_pixel_x_dimension: 8214, exif_pixel_y_dimension: 5476)
50
50
  end
51
51
 
52
- it 'fails correctly (with the right exception being raised) on a Keynote file' do
53
- jpeg_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
54
- expect {
55
- subject.call(File.open(jpeg_path, 'rb'))
56
- }.to raise_error(FormatParser::IOUtils::InvalidRead)
52
+ it 'does not return a result for a Keynote document' do
53
+ key_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
54
+ result = subject.call(File.open(key_path, 'rb'))
55
+ expect(result).to be_nil
57
56
  end
58
57
  end
@@ -99,4 +99,18 @@ describe FormatParser::ZIPParser do
99
99
  expect(first_entry.filename).to eq('Li��nia Extreme//')
100
100
  expect(first_entry.type).to eq(:directory)
101
101
  end
102
+
103
+ describe 'FileReader#zip?' do
104
+ it 'correctly detects all the ZIP files as such' do
105
+ reader = described_class::FileReader.new
106
+ Dir.glob(fixtures_dir + '/ZIP/*.zip').each do |path|
107
+ expect(reader).to be_zip(File.open(path, 'rb'))
108
+ end
109
+
110
+ 4.times do
111
+ blob = Random.new.bytes(1024)
112
+ expect(reader).not_to be_zip(StringIO.new(blob))
113
+ end
114
+ end
115
+ end
102
116
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-04-11 00:00:00.000000000 Z
12
+ date: 2018-04-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks