format_parser 1.7.0 → 2.0.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +4 -9
- data/CHANGELOG.md +6 -0
- data/format_parser.gemspec +9 -11
- data/lib/care.rb +5 -11
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +8 -11
- data/lib/io_utils.rb +2 -6
- data/lib/parsers/aac_parser/adts_header_info.rb +3 -9
- data/lib/parsers/dpx_parser/dpx_structs.rb +1 -1
- data/lib/parsers/exif_parser.rb +2 -4
- data/lib/parsers/fdx_parser.rb +2 -2
- data/lib/parsers/flac_parser.rb +2 -6
- data/lib/parsers/jpeg_parser.rb +2 -2
- data/lib/parsers/moov_parser.rb +5 -7
- data/lib/parsers/mp3_parser.rb +2 -6
- data/lib/parsers/mpeg_parser.rb +1 -3
- data/lib/parsers/wav_parser.rb +9 -12
- data/lib/parsers/zip_parser/file_reader.rb +45 -70
- data/lib/parsers/zip_parser.rb +1 -1
- data/lib/read_limiter.rb +8 -16
- data/lib/remote_io.rb +64 -34
- data/lib/string.rb +9 -0
- data/spec/attributes_json_spec.rb +0 -3
- data/spec/remote_fetching_spec.rb +3 -8
- data/spec/remote_io_spec.rb +116 -60
- metadata +40 -79
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7c965b7783ecaea4802f7e585861b4400b2210fee4cb90388757530880fa074
|
4
|
+
data.tar.gz: fc8b7cc3f00825fa054c948a7ae817b1eee6457ffaec9e5a6b5bdd9a0b92d126
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73f774ebe540dfd54e87f89cedecfc0fabf4a97f4e2ef72afcd94edc5e0fbc344c7c67b365942e3bb915dfe76f94f038072671c259c2d366a69d64a73cbde960
|
7
|
+
data.tar.gz: bc1405329d521487ec4d0738c258fb12c3acdb37b6b8ecebf7451a866d5f1072cfc23774e2ecc3d7d297095ff280320756fb4cd9000de3eac447a105cf87028b
|
data/.github/workflows/main.yml
CHANGED
@@ -14,8 +14,8 @@ jobs:
|
|
14
14
|
matrix:
|
15
15
|
ruby:
|
16
16
|
- 2.7
|
17
|
-
-
|
18
|
-
-
|
17
|
+
- 3.0
|
18
|
+
- 3.1
|
19
19
|
- jruby
|
20
20
|
steps:
|
21
21
|
- name: Checkout
|
@@ -60,15 +60,10 @@ jobs:
|
|
60
60
|
matrix:
|
61
61
|
ruby:
|
62
62
|
- 2.7
|
63
|
-
-
|
64
|
-
-
|
63
|
+
- 3.0
|
64
|
+
- 3.1
|
65
65
|
- jruby
|
66
66
|
experimental: [false]
|
67
|
-
include:
|
68
|
-
- ruby: 3.1
|
69
|
-
experimental: true
|
70
|
-
- ruby: 3.0
|
71
|
-
experimental: true
|
72
67
|
steps:
|
73
68
|
- name: Checkout
|
74
69
|
uses: actions/checkout@v2
|
data/CHANGELOG.md
CHANGED
data/format_parser.gemspec
CHANGED
@@ -30,17 +30,15 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
31
31
|
spec.require_paths = ['lib']
|
32
32
|
|
33
|
-
spec.add_dependency '
|
34
|
-
spec.add_dependency '
|
35
|
-
spec.add_dependency '
|
36
|
-
spec.add_dependency '
|
37
|
-
spec.add_dependency 'faraday_middleware', '~> 0.14'
|
38
|
-
spec.add_dependency 'measurometer', '~> 1'
|
33
|
+
spec.add_dependency 'exifr', '>= 1.3.8'
|
34
|
+
spec.add_dependency 'id3tag', '>= 0.14.2'
|
35
|
+
spec.add_dependency 'ks'
|
36
|
+
spec.add_dependency 'measurometer'
|
39
37
|
|
40
|
-
spec.add_development_dependency 'rspec', '~> 3.0'
|
41
|
-
spec.add_development_dependency 'rake', '~> 12'
|
42
|
-
spec.add_development_dependency 'simplecov', '~> 0.15'
|
43
|
-
spec.add_development_dependency 'yard', '~> 0.9'
|
44
|
-
spec.add_development_dependency 'wetransfer_style', '0.5.0'
|
45
38
|
spec.add_development_dependency 'parallel_tests'
|
39
|
+
spec.add_development_dependency 'rake'
|
40
|
+
spec.add_development_dependency 'rspec'
|
41
|
+
spec.add_development_dependency 'simplecov'
|
42
|
+
spec.add_development_dependency 'wetransfer_style', '1.0.0'
|
43
|
+
spec.add_development_dependency 'yard'
|
46
44
|
end
|
data/lib/care.rb
CHANGED
@@ -96,12 +96,8 @@ class Care
|
|
96
96
|
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
97
97
|
# @raise ArgumentError
|
98
98
|
def byteslice(io, at, n_bytes)
|
99
|
-
if n_bytes < 1
|
100
|
-
|
101
|
-
end
|
102
|
-
if at < 0
|
103
|
-
raise ArgumentError, "Negative offsets are not supported (got #{at})"
|
104
|
-
end
|
99
|
+
raise ArgumentError, "The number of bytes to fetch must be a positive Integer, but was #{n_bytes}" if n_bytes < 1
|
100
|
+
raise ArgumentError, "Negative offsets are not supported (got #{at})" if at < 0
|
105
101
|
|
106
102
|
first_page = at / @page_size
|
107
103
|
last_page = (at + n_bytes) / @page_size
|
@@ -174,16 +170,14 @@ class Care
|
|
174
170
|
# @param io[IO] the IO to read from
|
175
171
|
# @param page_i[Integer] which page (zero-based) to read
|
176
172
|
def read_page(io, page_i)
|
177
|
-
Measurometer.increment_counter('format_parser.parser.
|
173
|
+
Measurometer.increment_counter('format_parser.parser.care.page_reads_from_upsteam', 1)
|
178
174
|
|
179
175
|
io.seek(page_i * @page_size)
|
180
|
-
read_result = Measurometer.instrument('format_parser.
|
176
|
+
read_result = Measurometer.instrument('format_parser.care.read_page') { io.read(@page_size) }
|
181
177
|
if read_result.nil?
|
182
178
|
# If the read went past the end of the IO the read result will be nil,
|
183
179
|
# so we know our IO is exhausted here
|
184
|
-
if @lowest_known_empty_page.nil? || @lowest_known_empty_page > page_i
|
185
|
-
@lowest_known_empty_page = page_i
|
186
|
-
end
|
180
|
+
@lowest_known_empty_page = page_i if @lowest_known_empty_page.nil? || @lowest_known_empty_page > page_i
|
187
181
|
elsif read_result.bytesize < @page_size
|
188
182
|
# If we read less than we initially wanted we know there are no pages
|
189
183
|
# to read following this one, so we can also optimize
|
data/lib/format_parser.rb
CHANGED
@@ -20,6 +20,7 @@ module FormatParser
|
|
20
20
|
require_relative 'care'
|
21
21
|
require_relative 'active_storage/blob_analyzer'
|
22
22
|
require_relative 'text'
|
23
|
+
require_relative 'string'
|
23
24
|
|
24
25
|
# Define Measurometer in the internal namespace as well
|
25
26
|
# so that we stay compatible for the applications that use it
|
@@ -87,8 +88,8 @@ module FormatParser
|
|
87
88
|
# Parses the resource at the given `url` and returns the results as if it were any IO
|
88
89
|
# given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
|
89
90
|
#
|
90
|
-
# @param url[String, URI] the HTTP(S) URL to request the object from using
|
91
|
-
# @param headers[Hash] (optional) the HTTP headers to request the object from
|
91
|
+
# @param url[String, URI] the HTTP(S) URL to request the object from using `Range:` requests
|
92
|
+
# @param headers[Hash] (optional) the HTTP headers to request the object from
|
92
93
|
# @param kwargs the keyword arguments to be delegated to `.parse`
|
93
94
|
# @see {.parse}
|
94
95
|
def self.parse_http(url, headers: {}, **kwargs)
|
@@ -177,9 +178,7 @@ module FormatParser
|
|
177
178
|
# Convert the results from a lazy enumerator to an Array.
|
178
179
|
results = results.to_a
|
179
180
|
|
180
|
-
if results.empty?
|
181
|
-
Measurometer.increment_counter('format_parser.unknown_files', 1)
|
182
|
-
end
|
181
|
+
Measurometer.increment_counter('format_parser.unknown_files', 1) if results.empty?
|
183
182
|
|
184
183
|
amount == 1 ? results.first : results
|
185
184
|
ensure
|
@@ -202,12 +201,12 @@ module FormatParser
|
|
202
201
|
end
|
203
202
|
|
204
203
|
def self.execute_parser_and_capture_expected_exceptions(parser, limited_io)
|
205
|
-
parser_name_for_instrumentation = parser.class.to_s.split('::').last
|
204
|
+
parser_name_for_instrumentation = parser.class.to_s.split('::').last.underscore
|
206
205
|
Measurometer.instrument('format_parser.parser.%s' % parser_name_for_instrumentation) do
|
207
206
|
parser.call(limited_io).tap do |result|
|
208
207
|
if result
|
209
|
-
Measurometer.increment_counter('format_parser.detected_natures
|
210
|
-
Measurometer.increment_counter('format_parser.detected_formats
|
208
|
+
Measurometer.increment_counter('format_parser.detected_natures', 1, nature: result.nature)
|
209
|
+
Measurometer.increment_counter('format_parser.detected_formats', 1, format: result.format)
|
211
210
|
end
|
212
211
|
end
|
213
212
|
end
|
@@ -252,9 +251,7 @@ module FormatParser
|
|
252
251
|
fitting_by_formats = assemble_parser_set[@parsers_per_format, desired_formats]
|
253
252
|
parsers = fitting_by_natures & fitting_by_formats
|
254
253
|
|
255
|
-
if parsers.empty?
|
256
|
-
raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}"
|
257
|
-
end
|
254
|
+
raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}" if parsers.empty?
|
258
255
|
|
259
256
|
# Order the parsers according to their priority value. The ones having a lower
|
260
257
|
# value will sort higher and will be applied sooner
|
data/lib/io_utils.rb
CHANGED
@@ -9,12 +9,8 @@ module FormatParser::IOUtils
|
|
9
9
|
raise ArgumentError, 'Unbounded reads are not supported' if n.nil?
|
10
10
|
buf = io.read(n)
|
11
11
|
|
12
|
-
unless buf
|
13
|
-
|
14
|
-
end
|
15
|
-
if buf.bytesize != n
|
16
|
-
raise InvalidRead, "We wanted to read #{n} bytes from the IO, but we got #{buf.bytesize} instead"
|
17
|
-
end
|
12
|
+
raise InvalidRead, "We wanted to read #{n} bytes from the IO, but the IO is at EOF" unless buf
|
13
|
+
raise InvalidRead, "We wanted to read #{n} bytes from the IO, but we got #{buf.bytesize} instead" if buf.bytesize != n
|
18
14
|
|
19
15
|
buf
|
20
16
|
end
|
@@ -33,23 +33,17 @@ class FormatParser::AdtsHeaderInfo
|
|
33
33
|
MPEG_VERSION_HASH = { 0 => 'MPEG-4', 1 => 'MPEG-2'}
|
34
34
|
|
35
35
|
def mpeg4_sampling_frequency
|
36
|
-
if !@mpeg4_sampling_frequency_index.nil? && MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH.key?(@mpeg4_sampling_frequency_index)
|
37
|
-
return MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH[@mpeg4_sampling_frequency_index]
|
38
|
-
end
|
36
|
+
return MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH[@mpeg4_sampling_frequency_index] if !@mpeg4_sampling_frequency_index.nil? && MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH.key?(@mpeg4_sampling_frequency_index)
|
39
37
|
nil
|
40
38
|
end
|
41
39
|
|
42
40
|
def profile_description
|
43
|
-
if !@profile.nil? && AAC_PROFILE_DESCRIPTION_HASH.key?(@profile)
|
44
|
-
return AAC_PROFILE_DESCRIPTION_HASH[@profile]
|
45
|
-
end
|
41
|
+
return AAC_PROFILE_DESCRIPTION_HASH[@profile] if !@profile.nil? && AAC_PROFILE_DESCRIPTION_HASH.key?(@profile)
|
46
42
|
nil
|
47
43
|
end
|
48
44
|
|
49
45
|
def mpeg_version_description
|
50
|
-
if !@mpeg_version.nil? && MPEG_VERSION_HASH.key?(@mpeg_version)
|
51
|
-
return MPEG_VERSION_HASH[@mpeg_version]
|
52
|
-
end
|
46
|
+
return MPEG_VERSION_HASH[@mpeg_version] if !@mpeg_version.nil? && MPEG_VERSION_HASH.key?(@mpeg_version)
|
53
47
|
nil
|
54
48
|
end
|
55
49
|
|
data/lib/parsers/exif_parser.rb
CHANGED
@@ -125,9 +125,7 @@ module FormatParser::EXIFParser
|
|
125
125
|
# those and return the _last_ non-0 orientation, or 0 otherwise
|
126
126
|
@multiple_exif_results.reverse_each do |exif_tag_frame|
|
127
127
|
orientation_value = exif_tag_frame.orientation
|
128
|
-
if !orientation_value.nil? && orientation_value != 0
|
129
|
-
return orientation_value
|
130
|
-
end
|
128
|
+
return orientation_value if !orientation_value.nil? && orientation_value != 0
|
131
129
|
end
|
132
130
|
0 # If none were found - the orientation is unknown
|
133
131
|
end
|
@@ -175,7 +173,7 @@ module FormatParser::EXIFParser
|
|
175
173
|
EXIFR.logger = Logger.new(nil)
|
176
174
|
|
177
175
|
def exif_from_tiff_io(constrained_io, should_include_sub_ifds = false)
|
178
|
-
Measurometer.instrument('format_parser.
|
176
|
+
Measurometer.instrument('format_parser.exif_parser.exif_from_tiff_io') do
|
179
177
|
extended_io = IOExt.new(constrained_io)
|
180
178
|
exif_raw_data = EXIFR::TIFF.new(extended_io)
|
181
179
|
|
data/lib/parsers/fdx_parser.rb
CHANGED
@@ -24,9 +24,9 @@ class FormatParser::FDXParser
|
|
24
24
|
def check_for_document_type(file_and_document_type)
|
25
25
|
sanitized_data = file_and_document_type.downcase
|
26
26
|
if sanitized_data.include?('finaldraft') && sanitized_data.include?('script')
|
27
|
-
|
27
|
+
[:fdx, :script]
|
28
28
|
else
|
29
|
-
|
29
|
+
nil
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
data/lib/parsers/flac_parser.rb
CHANGED
@@ -20,15 +20,11 @@ class FormatParser::FLACParser
|
|
20
20
|
|
21
21
|
minimum_block_size = bytestring_to_int(safe_read(io, 2))
|
22
22
|
|
23
|
-
if minimum_block_size < 16
|
24
|
-
raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
|
25
|
-
end
|
23
|
+
raise MalformedFile, 'FLAC file minimum block size must be larger than 16' if minimum_block_size < 16
|
26
24
|
|
27
25
|
maximum_block_size = bytestring_to_int(safe_read(io, 2))
|
28
26
|
|
29
|
-
if maximum_block_size < minimum_block_size
|
30
|
-
raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
|
31
|
-
end
|
27
|
+
raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size' if maximum_block_size < minimum_block_size
|
32
28
|
|
33
29
|
minimum_frame_size = bytestring_to_int(safe_read(io, 3))
|
34
30
|
maximum_frame_size = bytestring_to_int(safe_read(io, 3))
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -69,7 +69,7 @@ class FormatParser::JPEGParser
|
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
72
|
-
Measurometer.add_distribution_value('format_parser.
|
72
|
+
Measurometer.add_distribution_value('format_parser.jpeg_parser.bytes_read_until_capture', @buf.pos)
|
73
73
|
|
74
74
|
# A single file might contain multiple EXIF data frames. In a JPEG this would
|
75
75
|
# manifest as multiple APP1 markers. The way different programs handle these
|
@@ -156,7 +156,7 @@ class FormatParser::JPEGParser
|
|
156
156
|
# Use StringIO.new instead of #write - https://github.com/aws/aws-sdk-ruby/issues/785#issuecomment-95456838
|
157
157
|
exif_buf = StringIO.new(safe_read(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize))
|
158
158
|
|
159
|
-
Measurometer.add_distribution_value('format_parser.
|
159
|
+
Measurometer.add_distribution_value('format_parser.jpeg_parser.bytes_sent_to_exif_parser', exif_buf.size)
|
160
160
|
|
161
161
|
@exif_data_frames << exif_from_tiff_io(exif_buf)
|
162
162
|
rescue EXIFR::MalformedTIFF
|
data/lib/parsers/moov_parser.rb
CHANGED
@@ -37,7 +37,7 @@ class FormatParser::MOOVParser
|
|
37
37
|
# size that gets parsed just before.
|
38
38
|
max_read_offset = 0xFFFFFFFF
|
39
39
|
decoder = Decoder.new
|
40
|
-
atom_tree = Measurometer.instrument('format_parser.
|
40
|
+
atom_tree = Measurometer.instrument('format_parser.decoder.extract_atom_stream') do
|
41
41
|
decoder.extract_atom_stream(io, max_read_offset)
|
42
42
|
end
|
43
43
|
|
@@ -93,12 +93,10 @@ class FormatParser::MOOVParser
|
|
93
93
|
def parse_dimensions(decoder, atom_tree)
|
94
94
|
video_trak_atom = decoder.find_video_trak_atom(atom_tree)
|
95
95
|
|
96
|
-
tkhd =
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
decoder.find_first_atom_by_path(atom_tree, 'moov', 'trak', 'tkhd')
|
101
|
-
end
|
96
|
+
tkhd = if video_trak_atom
|
97
|
+
decoder.find_first_atom_by_path([video_trak_atom], 'trak', 'tkhd')
|
98
|
+
else
|
99
|
+
decoder.find_first_atom_by_path(atom_tree, 'moov', 'trak', 'tkhd')
|
102
100
|
end
|
103
101
|
|
104
102
|
if tkhd
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -179,13 +179,9 @@ class FormatParser::MP3Parser
|
|
179
179
|
frame_data_str = io.read(frame_detail.frame_length)
|
180
180
|
io.seek(io.pos - frame_detail.frame_length)
|
181
181
|
xing_header = attempt_xing_header(frame_data_str)
|
182
|
-
if xing_header_usable_for_duration?(xing_header)
|
183
|
-
return [xing_header, mpeg_frames]
|
184
|
-
end
|
185
|
-
end
|
186
|
-
if frame_detail.frame_length > 1 # jump over current frame body
|
187
|
-
io.seek(io.pos + frame_detail.frame_length - bytes_to_read)
|
182
|
+
return [xing_header, mpeg_frames] if xing_header_usable_for_duration?(xing_header)
|
188
183
|
end
|
184
|
+
io.seek(io.pos + frame_detail.frame_length - bytes_to_read) if frame_detail.frame_length > 1 # jump over current frame body
|
189
185
|
end
|
190
186
|
[nil, mpeg_frames]
|
191
187
|
rescue InvalidDeepFetch # A frame was invalid - bail out since it's unlikely we can recover
|
data/lib/parsers/mpeg_parser.rb
CHANGED
@@ -44,9 +44,7 @@ class FormatParser::MPEGParser
|
|
44
44
|
io.seek(pos + 1)
|
45
45
|
horizontal_size, vertical_size = parse_image_size(io)
|
46
46
|
ratio_code, rate_code = parse_rate_information(io)
|
47
|
-
if valid_aspect_ratio_code?(ratio_code) && valid_frame_rate_code?(rate_code)
|
48
|
-
return file_info(horizontal_size, vertical_size, ratio_code, rate_code)
|
49
|
-
end
|
47
|
+
return file_info(horizontal_size, vertical_size, ratio_code, rate_code) if valid_aspect_ratio_code?(ratio_code) && valid_frame_rate_code?(rate_code)
|
50
48
|
end
|
51
49
|
nil # otherwise the return value of Integer#times will be returned
|
52
50
|
rescue FormatParser::IOUtils::InvalidRead
|
data/lib/parsers/wav_parser.rb
CHANGED
@@ -34,9 +34,7 @@ class FormatParser::WAVParser
|
|
34
34
|
case chunk_type
|
35
35
|
when 'fmt ' # watch out: the chunk ID of the format chunk ends with a space
|
36
36
|
fmt_data = unpack_fmt_chunk(io, chunk_size)
|
37
|
-
if fmt_data[:audio_format] != 1 and fact_processed
|
38
|
-
return process_non_pcm(fmt_data, total_sample_frames)
|
39
|
-
end
|
37
|
+
return process_non_pcm(fmt_data, total_sample_frames) if fmt_data[:audio_format] != 1 and fact_processed
|
40
38
|
fmt_processed = true
|
41
39
|
when 'data'
|
42
40
|
return unless fmt_processed # the 'data' chunk cannot precede the 'fmt ' chunk
|
@@ -45,11 +43,10 @@ class FormatParser::WAVParser
|
|
45
43
|
when 'fact'
|
46
44
|
total_sample_frames = safe_read(io, 4).unpack('l').first
|
47
45
|
safe_skip(io, chunk_size - 4)
|
48
|
-
if fmt_processed and fmt_data[:audio_format] != 1
|
49
|
-
return process_non_pcm(fmt_data, total_sample_frames)
|
50
|
-
end
|
46
|
+
return process_non_pcm(fmt_data, total_sample_frames) if fmt_processed and fmt_data[:audio_format] != 1
|
51
47
|
fact_processed = true
|
52
|
-
else
|
48
|
+
else
|
49
|
+
# Skip this chunk until a known chunk is encountered
|
53
50
|
safe_skip(io, chunk_size)
|
54
51
|
end
|
55
52
|
end
|
@@ -70,11 +67,11 @@ class FormatParser::WAVParser
|
|
70
67
|
safe_skip(io, chunk_size - 16) # skip the extra fields
|
71
68
|
|
72
69
|
{
|
73
|
-
audio_format:
|
74
|
-
channels:
|
75
|
-
sample_rate:
|
76
|
-
byte_rate:
|
77
|
-
block_align:
|
70
|
+
audio_format: fmt_info[0],
|
71
|
+
channels: fmt_info[1],
|
72
|
+
sample_rate: fmt_info[2],
|
73
|
+
byte_rate: fmt_info[3],
|
74
|
+
block_align: fmt_info[4],
|
78
75
|
bits_per_sample: fmt_info[5],
|
79
76
|
}
|
80
77
|
end
|
@@ -27,52 +27,43 @@ class FormatParser::ZIPParser::FileReader
|
|
27
27
|
# To prevent too many tiny reads, read the maximum possible size of end of
|
28
28
|
# central directory record upfront (all the fixed fields + at most 0xFFFF
|
29
29
|
# bytes of the archive comment)
|
30
|
-
MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
2 + # The comment size
|
40
|
-
0xFFFF # Maximum comment size
|
41
|
-
end
|
30
|
+
MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = 4 + # Offset of the start of central directory
|
31
|
+
4 + # Size of the central directory
|
32
|
+
2 + # Number of files in the cdir
|
33
|
+
4 + # End-of-central-directory signature
|
34
|
+
2 + # Number of this disk
|
35
|
+
2 + # Number of disk with the start of cdir
|
36
|
+
2 + # Number of files in the cdir of this disk
|
37
|
+
2 + # The comment size
|
38
|
+
0xFFFF # Maximum comment size
|
42
39
|
|
43
40
|
# To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
|
44
41
|
# The maximum size is all the usual items, plus the maximum size
|
45
42
|
# of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
|
46
|
-
MAX_LOCAL_HEADER_SIZE =
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
2 + # Number of the disk with the EOCD record
|
68
|
-
2 + # Number of entries in the central directory of this disk
|
69
|
-
2 + # Number of entries in the central directory total
|
70
|
-
4 + # Size of the central directory
|
71
|
-
4 # Start of the central directory offset
|
72
|
-
end
|
43
|
+
MAX_LOCAL_HEADER_SIZE = 4 + # signature
|
44
|
+
2 + # Version needed to extract
|
45
|
+
2 + # gp flags
|
46
|
+
2 + # storage mode
|
47
|
+
2 + # dos time
|
48
|
+
2 + # dos date
|
49
|
+
4 + # CRC32
|
50
|
+
4 + # Comp size
|
51
|
+
4 + # Uncomp size
|
52
|
+
2 + # Filename size
|
53
|
+
2 + # Extra fields size
|
54
|
+
0xFFFF + # Maximum filename size
|
55
|
+
0xFFFF # Maximum extra fields size
|
56
|
+
|
57
|
+
SIZE_OF_USABLE_EOCD_RECORD = 4 + # Signature
|
58
|
+
2 + # Number of this disk
|
59
|
+
2 + # Number of the disk with the EOCD record
|
60
|
+
2 + # Number of entries in the central directory of this disk
|
61
|
+
2 + # Number of entries in the central directory total
|
62
|
+
4 + # Size of the central directory
|
63
|
+
4 # Start of the central directory offset
|
73
64
|
|
74
65
|
private_constant :C_UINT32LE, :C_UINT16LE, :C_UINT64LE, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
|
75
|
-
|
66
|
+
:MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
|
76
67
|
|
77
68
|
# Represents a file within the ZIP archive being read
|
78
69
|
class ZipEntry
|
@@ -216,7 +207,7 @@ class FormatParser::ZIPParser::FileReader
|
|
216
207
|
io.seek(absolute_pos)
|
217
208
|
unless absolute_pos == io.pos
|
218
209
|
raise ReadError,
|
219
|
-
|
210
|
+
"Expected to seek to #{absolute_pos} but only got to #{io.pos}"
|
220
211
|
end
|
221
212
|
nil
|
222
213
|
end
|
@@ -235,18 +226,14 @@ class FormatParser::ZIPParser::FileReader
|
|
235
226
|
io.seek(io.pos + n)
|
236
227
|
pos_after = io.pos
|
237
228
|
delta = pos_after - pos_before
|
238
|
-
unless delta == n
|
239
|
-
raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead"
|
240
|
-
end
|
229
|
+
raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead" unless delta == n
|
241
230
|
nil
|
242
231
|
end
|
243
232
|
|
244
233
|
def read_n(io, n_bytes)
|
245
234
|
io.read(n_bytes).tap do |d|
|
246
235
|
raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
|
247
|
-
unless d.bytesize == n_bytes
|
248
|
-
raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
|
249
|
-
end
|
236
|
+
raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}" unless d.bytesize == n_bytes
|
250
237
|
end
|
251
238
|
end
|
252
239
|
|
@@ -310,15 +297,9 @@ class FormatParser::ZIPParser::FileReader
|
|
310
297
|
#
|
311
298
|
# It means that before we read this stuff we need to check if the previously-read
|
312
299
|
# values are at overflow, and only _then_ proceed to read them. Bah.
|
313
|
-
if e.uncompressed_size == 0xFFFFFFFF
|
314
|
-
|
315
|
-
|
316
|
-
if e.compressed_size == 0xFFFFFFFF
|
317
|
-
e.compressed_size = read_8b(zip64_extra)
|
318
|
-
end
|
319
|
-
if e.local_file_header_offset == 0xFFFFFFFF
|
320
|
-
e.local_file_header_offset = read_8b(zip64_extra)
|
321
|
-
end
|
300
|
+
e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
|
301
|
+
e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
|
302
|
+
e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
|
322
303
|
# Disk number comes last and we can skip it anyway, since we do
|
323
304
|
# not support multi-disk archives
|
324
305
|
end
|
@@ -370,9 +351,7 @@ class FormatParser::ZIPParser::FileReader
|
|
370
351
|
signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
|
371
352
|
|
372
353
|
# Check the only condition for the match
|
373
|
-
if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
|
374
|
-
return check_at # Found the EOCD marker location
|
375
|
-
end
|
354
|
+
return check_at if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
|
376
355
|
end
|
377
356
|
# If we haven't caught anything, return nil deliberately instead of returning the last statement
|
378
357
|
nil
|
@@ -422,16 +401,12 @@ class FormatParser::ZIPParser::FileReader
|
|
422
401
|
|
423
402
|
disk_n = read_4b(zip64_eocdr) # number of this disk
|
424
403
|
disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
|
425
|
-
if disk_n != disk_n_with_eocdr
|
426
|
-
raise UnsupportedFeature, 'The archive spans multiple disks'
|
427
|
-
end
|
404
|
+
raise UnsupportedFeature, 'The archive spans multiple disks' if disk_n != disk_n_with_eocdr
|
428
405
|
|
429
406
|
num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
|
430
|
-
num_files_total
|
407
|
+
num_files_total = read_8b(zip64_eocdr) # files total in the central directory
|
431
408
|
|
432
|
-
if num_files_this_disk != num_files_total
|
433
|
-
raise UnsupportedFeature, 'The archive spans multiple disks'
|
434
|
-
end
|
409
|
+
raise UnsupportedFeature, 'The archive spans multiple disks' if num_files_this_disk != num_files_total
|
435
410
|
|
436
411
|
log do
|
437
412
|
format(
|
@@ -439,8 +414,8 @@ class FormatParser::ZIPParser::FileReader
|
|
439
414
|
num_files_total)
|
440
415
|
end
|
441
416
|
|
442
|
-
central_dir_size
|
443
|
-
central_dir_offset
|
417
|
+
central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
|
418
|
+
central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
|
444
419
|
|
445
420
|
[num_files_total, central_dir_offset, central_dir_size]
|
446
421
|
end
|
@@ -456,8 +431,8 @@ class FormatParser::ZIPParser::FileReader
|
|
456
431
|
skip_ahead_2(io) # number_of_this_disk
|
457
432
|
skip_ahead_2(io) # number of the disk with the EOCD record
|
458
433
|
skip_ahead_2(io) # number of entries in the central directory of this disk
|
459
|
-
num_files = read_2b(io)
|
460
|
-
cdir_size = read_4b(io)
|
434
|
+
num_files = read_2b(io) # number of entries in the central directory total
|
435
|
+
cdir_size = read_4b(io) # size of the central directory
|
461
436
|
cdir_offset = read_4b(io) # start of central directory offset
|
462
437
|
[num_files, cdir_offset, cdir_size]
|
463
438
|
end
|
data/lib/parsers/zip_parser.rb
CHANGED
data/lib/read_limiter.rb
CHANGED
@@ -45,9 +45,7 @@ class FormatParser::ReadLimiter
|
|
45
45
|
# @return Integer
|
46
46
|
def seek(to)
|
47
47
|
@seeks += 1
|
48
|
-
if @max_seeks && @seeks > @max_seeks
|
49
|
-
raise BudgetExceeded, 'Seek budget exceeded (%d seeks performed)' % @max_seeks
|
50
|
-
end
|
48
|
+
raise BudgetExceeded, 'Seek budget exceeded (%d seeks performed)' % @max_seeks if @max_seeks && @seeks > @max_seeks
|
51
49
|
@io.seek(to)
|
52
50
|
end
|
53
51
|
|
@@ -60,26 +58,20 @@ class FormatParser::ReadLimiter
|
|
60
58
|
@bytes += n_bytes
|
61
59
|
@reads += 1
|
62
60
|
|
63
|
-
if @max_bytes && @bytes > @max_bytes
|
64
|
-
|
65
|
-
end
|
66
|
-
|
67
|
-
if @max_reads && @reads > @max_reads
|
68
|
-
raise BudgetExceeded, 'Number of read() calls exceeded (%d max)' % @max_reads
|
69
|
-
end
|
61
|
+
raise BudgetExceeded, 'Read bytes budget (%d) exceeded' % @max_bytes if @max_bytes && @bytes > @max_bytes
|
62
|
+
raise BudgetExceeded, 'Number of read() calls exceeded (%d max)' % @max_reads if @max_reads && @reads > @max_reads
|
70
63
|
|
71
64
|
@io.read(n_bytes)
|
72
65
|
end
|
73
66
|
|
74
67
|
# Sends the metrics about the state of this ReadLimiter to a Measurometer
|
75
68
|
#
|
76
|
-
# @param
|
77
|
-
# `format_parser.TIFF.read_limiter.num_seeks` and so forth
|
69
|
+
# @param parser[String] the parser to add as a tag.
|
78
70
|
# @return void
|
79
|
-
def send_metrics(
|
80
|
-
Measurometer.add_distribution_value('format_parser
|
81
|
-
Measurometer.add_distribution_value('format_parser
|
82
|
-
Measurometer.add_distribution_value('format_parser
|
71
|
+
def send_metrics(parser)
|
72
|
+
Measurometer.add_distribution_value('format_parser.read_limiter.num_seeks', @seeks, parser: parser)
|
73
|
+
Measurometer.add_distribution_value('format_parser.read_limiter.num_reads', @reads, parser: parser)
|
74
|
+
Measurometer.add_distribution_value('format_parser.read_limiter.read_bytes', @bytes, parser: parser)
|
83
75
|
end
|
84
76
|
|
85
77
|
# Resets all the recorded call counters so that the object can be reused for the next parser,
|