format_parser 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 90c91543dad4ef03fec136ae3b8fd3385ee518e5bbbebbba867a75142a1780f0
4
- data.tar.gz: 5e7d286eefc1170391b6ff896d20e24f39682d2e5e4dc8a7c0b4c67df82eac92
3
+ metadata.gz: fbc5d4476d94ccbeb0f50c3bd0406cb57d1fc5fa640afbbf8057d874ecc128a3
4
+ data.tar.gz: fa4421fa509d0f0528beb4c4b19dc1a63cd66fbed29142ed33b9b2aeae9067c0
5
5
  SHA512:
6
- metadata.gz: 864c0a272275c3d877ecf6fac7b2d7a080876074ee8e7b5858270da1418737cca3b8abc7290649a41f662209c2cf953f714334dc1adb6dabd68187fa3e2fcd25
7
- data.tar.gz: cb04914d011588f6b53976fa5123f13b6ef9f5eb10c40bae1ea4e7c13ba51f382f10721d865164a4f8460554cb28881db7c8f4ff2217d4b1d0a115dbe5fbdf5e
6
+ metadata.gz: 6c3abee783f5b2c35da44231bbec2bfabd1ff25cafe45716694751f865e4cae6c71286dcf2053770f925391f8b3e0cef72daa351c43de03d0ed8b0900e4eabad
7
+ data.tar.gz: ab9ef11e37f5d2610ded657483a688876402cde506af5f830dd2f60fe2a8d432c182227a5c514050e79cfb059906fca2d384b366a6e64969ad7f5b5de81b8f43
data/README.md CHANGED
@@ -185,3 +185,6 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
185
185
 
186
186
  ### .docx
187
187
  - The .docx files were generated by the project maintainers
188
+
189
+ ### .key
190
+ - The `keynote_recognized_as_jpeg.key` file was created by the project maintainers
data/lib/care.rb CHANGED
@@ -4,7 +4,7 @@
4
4
  # is only available via HTTP, for example, we can have less
5
5
  # fetches and have them return more data for one fetch
6
6
  class Care
7
- DEFAULT_PAGE_SIZE = 64 * 1024
7
+ DEFAULT_PAGE_SIZE = 128 * 1024
8
8
 
9
9
  class IOWrapper
10
10
  def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.5.2'
2
+ VERSION = '0.6.0'
3
3
  end
@@ -1,4 +1,3 @@
1
- require 'exifr/jpeg'
2
1
  require 'exifr/tiff'
3
2
  require 'delegate'
4
3
 
@@ -48,22 +47,16 @@ class FormatParser::EXIFParser
48
47
  :left_bottom
49
48
  ]
50
49
 
51
- def initialize(filetype, file_io)
52
- @filetype = filetype
53
- @file_io = IOExt.new(file_io)
50
+ def initialize(io_blob_with_exif_data)
51
+ @exif_io = IOExt.new(io_blob_with_exif_data)
54
52
  @exif_data = nil
55
53
  @orientation = nil
56
54
  @height = nil
57
55
  @width = nil
58
56
  end
59
57
 
60
- def scan_image_exif
61
- # Without the magic bytes EXIFR throws an error
62
- @file_io.seek(0)
63
- raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
64
- # Return if it's a CR2, which we don't parse yet
65
- return if cr2_check(@file_io)
66
- raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
58
+ def scan_image_tiff
59
+ raw_exif_data = EXIFR::TIFF.new(@exif_io)
67
60
  # For things that we don't yet have a parser for
68
61
  # we make the raw exif result available
69
62
  @exif_data = raw_exif_data
@@ -80,10 +73,4 @@ class FormatParser::EXIFParser
80
73
  def valid_orientation?(value)
81
74
  (1..ORIENTATIONS.length).include?(value)
82
75
  end
83
-
84
- def cr2_check(_file_io)
85
- @file_io.seek(8)
86
- cr2_check_bytes = @file_io.read(2)
87
- cr2_check_bytes == 'CR'
88
- end
89
76
  end
@@ -9,6 +9,7 @@ class FormatParser::JPEGParser
9
9
  EOI_MARKER = 0xD9 # end of image
10
10
  SOS_MARKER = 0xDA # start of stream
11
11
  APP1_MARKER = 0xE1 # maybe EXIF
12
+ EXIF_MAGIC_STRING = "Exif\0\0".b
12
13
 
13
14
  def call(io)
14
15
  @buf = FormatParser::IOConstraint.new(io)
@@ -34,29 +35,43 @@ class FormatParser::JPEGParser
34
35
  signature = read_next_marker
35
36
  return unless signature == SOI_MARKER
36
37
 
38
+ markers_start_at = @buf.pos
39
+
40
+ # Keynote files start with a series of _perfectly_ valid
41
+ # JPEG markers, probably for icon previews or QuickLook.
42
+ # We have to detect those and reject them earlier. We can
43
+ # make use of our magic ZIP reader to get there.
44
+ return if probably_keynote_zip?
45
+
46
+ @buf.seek(markers_start_at)
47
+
37
48
  while marker = read_next_marker
38
49
  case marker
39
50
  when *SOF_MARKERS
40
51
  scan_start_of_frame
41
52
  when EOI_MARKER, SOS_MARKER
53
+ # When we reach "End of image" or "Start of scan" markers
54
+ # we are transitioning into the image data that we don't need
55
+ # or we have reached EOF.
42
56
  break
43
57
  when APP1_MARKER
44
58
  scan_app1_frame
45
59
  else
46
60
  skip_frame
47
61
  end
62
+ end
48
63
 
49
- # Return at the earliest possible opportunity
50
- if @width && @height
51
- return FormatParser::Image.new(
52
- format: :jpg,
53
- width_px: @width,
54
- height_px: @height,
55
- orientation: @orientation,
56
- intrinsics: @intrinsics,
57
- )
58
- end
64
+ # Once marker scanning has finished, return a result if both dimensions were found
65
+ if @width && @height
66
+ return FormatParser::Image.new(
67
+ format: :jpg,
68
+ width_px: @width,
69
+ height_px: @height,
70
+ orientation: @orientation,
71
+ intrinsics: @intrinsics,
72
+ )
59
73
  end
74
+
60
75
  nil # We could not parse anything
61
76
  rescue InvalidStructure
62
77
  nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
@@ -86,19 +101,40 @@ class FormatParser::JPEGParser
86
101
  end
87
102
 
88
103
  def scan_app1_frame
89
- frame = @buf.read(8)
90
- if frame.include?('Exif')
91
- scanner = FormatParser::EXIFParser.new(:jpeg, @buf)
92
- if scanner.scan_image_exif
93
- @exif_output = scanner.exif_data
94
- @orientation = scanner.orientation unless scanner.orientation.nil?
95
- @intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
96
- @intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
97
- @width = scanner.width
98
- @height = scanner.height
99
- end
104
+ # Read the entire EXIF frame at once to not overload the number of reads. If we don't,
105
+ # EXIFR parses our file from the very beginning and does the same parsing we do, just
106
+ # the second time around. What we care about, rather, is the EXIF data only. So we will
107
+ # pry it out of the APP1 frame and parse it as the TIFF segment - which is what EXIFR
108
+ # does under the hood.
109
+ app1_frame_content_length = read_short - 2
110
+ app1_frame_bytes = safe_read(@buf, app1_frame_content_length)
111
+
112
+ maybe_exif_magic_str = app1_frame_bytes[0..5]
113
+ maybe_exif_data = app1_frame_bytes[6..-1]
114
+ if maybe_exif_magic_str == EXIF_MAGIC_STRING
115
+ scanner = FormatParser::EXIFParser.new(StringIO.new(maybe_exif_data))
116
+ scanner.scan_image_tiff
117
+
118
+ @exif_output = scanner.exif_data
119
+ @orientation = scanner.orientation unless scanner.orientation.nil?
120
+ @intrinsics[:exif_pixel_x_dimension] = @exif_output.pixel_x_dimension
121
+ @intrinsics[:exif_pixel_y_dimension] = @exif_output.pixel_y_dimension
122
+ # Save these two for later, when we decide to provide display width /
123
+ # display height in addition to pixel buffer width / height. These two
124
+ # are _different concepts_. Imagine you have an image shot with a camera
125
+ # in portrait orientation, and the camera has an anamorphic lens. That
126
+ # anamorpohic lens is a smart lens, and therefore transmits pixel aspect
127
+ # ratio to the camera, and the camera encodes that aspect ratio into the
128
+ # image metadata. If we want to know what size our _pixel buffer_ will be,
129
+ # and how to _read_ the pixel data (stride/interleaving) - we need the
130
+ # pixel buffer dimensions. If we want to know what aspect and dimensions
131
+ # our file is going to have _once displayed_ and _once pixels have been
132
+ # brought to the right orientation_ we need to work with **display dimensions**
133
+ # which can be remarkably different from the pixel buffer dimensions.
134
+ @exif_width = scanner.width
135
+ @exif_height = scanner.height
100
136
  end
101
- rescue EXIFR::MalformedJPEG
137
+ rescue EXIFR::MalformedTIFF
102
138
  # Not a JPEG or the Exif headers contain invalid data, or
103
139
  # an APP1 marker was detected in a file that is not a JPEG
104
140
  end
@@ -113,5 +149,10 @@ class FormatParser::JPEGParser
113
149
  safe_skip(@buf, length)
114
150
  end
115
151
 
152
+ def probably_keynote_zip?
153
+ reader = FormatParser::ZIPParser::FileReader.new
154
+ reader.zip?(@buf)
155
+ end
156
+
116
157
  FormatParser.register_parser self, natures: :image, formats: :jpg
117
158
  end
@@ -13,8 +13,8 @@ class FormatParser::TIFFParser
13
13
  return if !endianness || cr2_check(io)
14
14
 
15
15
  w, h = read_tiff_by_endianness(io, endianness)
16
- scanner = FormatParser::EXIFParser.new(:tiff, io)
17
- scanner.scan_image_exif
16
+ scanner = FormatParser::EXIFParser.new(io)
17
+ scanner.scan_image_tiff
18
18
  FormatParser::Image.new(
19
19
  format: :tif,
20
20
  width_px: w,
@@ -6,8 +6,10 @@ class FormatParser::ZIPParser
6
6
  include OfficeFormats
7
7
 
8
8
  def call(io)
9
+ io = FormatParser::IOConstraint.new(io)
10
+
9
11
  reader = FileReader.new
10
- entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
12
+ entries = reader.read_zip_structure(io: io)
11
13
 
12
14
  filenames_set = Set.new
13
15
  entries_archive = entries.map do |ze|
@@ -195,6 +195,25 @@ class FormatParser::ZIPParser::FileReader
195
195
  entries
196
196
  end
197
197
 
198
+ # Tells whether the given IO is likely to be a ZIP file without
199
+ # performing too many detailed reads
200
+ #
201
+ # @param io[#tell, #seek, #read, #size] an IO-ish object
202
+ # @return [Boolean]
203
+ def zip?(io)
204
+ zip_file_size = io.size
205
+ eocd_offset = get_eocd_offset(io, zip_file_size)
206
+ zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
207
+ if zip64_end_of_cdir_location
208
+ num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
209
+ else
210
+ num_files_and_central_directory_offset(io, eocd_offset)
211
+ end
212
+ true
213
+ rescue Error
214
+ false
215
+ end
216
+
198
217
  private
199
218
 
200
219
  def skip_ahead_2(io)
@@ -11,6 +11,6 @@ describe 'Parsing esoteric files and files causing ambiguous detection' do
11
11
  it 'does not return a result for a Keynote file when it mistakes it for a JPEG, and does not raise any errors' do
12
12
  jpeg_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
13
13
  result = FormatParser.parse(File.open(jpeg_path, 'rb'))
14
- expect(result).to be_nil
14
+ expect(result.nature).to eq(:archive)
15
15
  end
16
16
  end
@@ -1,27 +1,12 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe FormatParser::EXIFParser do
4
- describe 'is able to correctly parse orientation for all the JPEG EXIF examples from FastImage' do
5
- Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
6
- filename = File.basename(jpeg_path)
7
- it "is able to parse #{filename}" do
8
- parser = FormatParser::EXIFParser.new(:jpeg, File.open(jpeg_path, 'rb'))
9
- parser.scan_image_exif
10
- expect(parser).not_to be_nil
11
-
12
- expect(parser.orientation).to be_kind_of(Symbol)
13
- # Filenames in this dir correspond with the orientation of the file
14
- expect(filename.include?(parser.orientation.to_s)).to be true
15
- end
16
- end
17
- end
18
-
19
4
  describe 'is able to correctly parse orientation for all the TIFF EXIF examples from FastImage' do
20
5
  Dir.glob(fixtures_dir + '/exif-orientation-testimages/tiff-*/*.tif').each do |tiff_path|
21
6
  filename = File.basename(tiff_path)
22
7
  it "is able to parse #{filename}" do
23
- parser = FormatParser::EXIFParser.new(:tiff, File.open(tiff_path, 'rb'))
24
- parser.scan_image_exif
8
+ parser = FormatParser::EXIFParser.new(File.open(tiff_path, 'rb'))
9
+ parser.scan_image_tiff
25
10
  expect(parser).not_to be_nil
26
11
 
27
12
  expect(parser.orientation).to be_kind_of(Symbol)
@@ -49,10 +49,9 @@ describe FormatParser::JPEGParser do
49
49
  expect(result.intrinsics).to eq(exif_pixel_x_dimension: 8214, exif_pixel_y_dimension: 5476)
50
50
  end
51
51
 
52
- it 'fails correctly (with the right exception being raised) on a Keynote file' do
53
- jpeg_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
54
- expect {
55
- subject.call(File.open(jpeg_path, 'rb'))
56
- }.to raise_error(FormatParser::IOUtils::InvalidRead)
52
+ it 'does not return a result for a Keynote document' do
53
+ key_path = fixtures_dir + '/JPEG/keynote_recognized_as_jpeg.key'
54
+ result = subject.call(File.open(key_path, 'rb'))
55
+ expect(result).to be_nil
57
56
  end
58
57
  end
@@ -99,4 +99,18 @@ describe FormatParser::ZIPParser do
99
99
  expect(first_entry.filename).to eq('Li��nia Extreme//')
100
100
  expect(first_entry.type).to eq(:directory)
101
101
  end
102
+
103
+ describe 'FileReader#zip?' do
104
+ it 'correctly detects all the ZIP files as such' do
105
+ reader = described_class::FileReader.new
106
+ Dir.glob(fixtures_dir + '/ZIP/*.zip').each do |path|
107
+ expect(reader).to be_zip(File.open(path, 'rb'))
108
+ end
109
+
110
+ 4.times do
111
+ blob = Random.new.bytes(1024)
112
+ expect(reader).not_to be_zip(StringIO.new(blob))
113
+ end
114
+ end
115
+ end
102
116
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-04-11 00:00:00.000000000 Z
12
+ date: 2018-04-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks