format_parser 1.7.0 → 2.0.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +4 -9
- data/CHANGELOG.md +6 -0
- data/format_parser.gemspec +9 -11
- data/lib/care.rb +5 -11
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +8 -11
- data/lib/io_utils.rb +2 -6
- data/lib/parsers/aac_parser/adts_header_info.rb +3 -9
- data/lib/parsers/dpx_parser/dpx_structs.rb +1 -1
- data/lib/parsers/exif_parser.rb +2 -4
- data/lib/parsers/fdx_parser.rb +2 -2
- data/lib/parsers/flac_parser.rb +2 -6
- data/lib/parsers/jpeg_parser.rb +2 -2
- data/lib/parsers/moov_parser.rb +5 -7
- data/lib/parsers/mp3_parser.rb +2 -6
- data/lib/parsers/mpeg_parser.rb +1 -3
- data/lib/parsers/wav_parser.rb +9 -12
- data/lib/parsers/zip_parser/file_reader.rb +45 -70
- data/lib/parsers/zip_parser.rb +1 -1
- data/lib/read_limiter.rb +8 -16
- data/lib/remote_io.rb +64 -34
- data/lib/string.rb +9 -0
- data/spec/attributes_json_spec.rb +0 -3
- data/spec/remote_fetching_spec.rb +3 -8
- data/spec/remote_io_spec.rb +116 -60
- metadata +40 -79
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7c965b7783ecaea4802f7e585861b4400b2210fee4cb90388757530880fa074
|
4
|
+
data.tar.gz: fc8b7cc3f00825fa054c948a7ae817b1eee6457ffaec9e5a6b5bdd9a0b92d126
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73f774ebe540dfd54e87f89cedecfc0fabf4a97f4e2ef72afcd94edc5e0fbc344c7c67b365942e3bb915dfe76f94f038072671c259c2d366a69d64a73cbde960
|
7
|
+
data.tar.gz: bc1405329d521487ec4d0738c258fb12c3acdb37b6b8ecebf7451a866d5f1072cfc23774e2ecc3d7d297095ff280320756fb4cd9000de3eac447a105cf87028b
|
data/.github/workflows/main.yml
CHANGED
@@ -14,8 +14,8 @@ jobs:
|
|
14
14
|
matrix:
|
15
15
|
ruby:
|
16
16
|
- 2.7
|
17
|
-
-
|
18
|
-
-
|
17
|
+
- 3.0
|
18
|
+
- 3.1
|
19
19
|
- jruby
|
20
20
|
steps:
|
21
21
|
- name: Checkout
|
@@ -60,15 +60,10 @@ jobs:
|
|
60
60
|
matrix:
|
61
61
|
ruby:
|
62
62
|
- 2.7
|
63
|
-
-
|
64
|
-
-
|
63
|
+
- 3.0
|
64
|
+
- 3.1
|
65
65
|
- jruby
|
66
66
|
experimental: [false]
|
67
|
-
include:
|
68
|
-
- ruby: 3.1
|
69
|
-
experimental: true
|
70
|
-
- ruby: 3.0
|
71
|
-
experimental: true
|
72
67
|
steps:
|
73
68
|
- name: Checkout
|
74
69
|
uses: actions/checkout@v2
|
data/CHANGELOG.md
CHANGED
data/format_parser.gemspec
CHANGED
@@ -30,17 +30,15 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
31
31
|
spec.require_paths = ['lib']
|
32
32
|
|
33
|
-
spec.add_dependency '
|
34
|
-
spec.add_dependency '
|
35
|
-
spec.add_dependency '
|
36
|
-
spec.add_dependency '
|
37
|
-
spec.add_dependency 'faraday_middleware', '~> 0.14'
|
38
|
-
spec.add_dependency 'measurometer', '~> 1'
|
33
|
+
spec.add_dependency 'exifr', '>= 1.3.8'
|
34
|
+
spec.add_dependency 'id3tag', '>= 0.14.2'
|
35
|
+
spec.add_dependency 'ks'
|
36
|
+
spec.add_dependency 'measurometer'
|
39
37
|
|
40
|
-
spec.add_development_dependency 'rspec', '~> 3.0'
|
41
|
-
spec.add_development_dependency 'rake', '~> 12'
|
42
|
-
spec.add_development_dependency 'simplecov', '~> 0.15'
|
43
|
-
spec.add_development_dependency 'yard', '~> 0.9'
|
44
|
-
spec.add_development_dependency 'wetransfer_style', '0.5.0'
|
45
38
|
spec.add_development_dependency 'parallel_tests'
|
39
|
+
spec.add_development_dependency 'rake'
|
40
|
+
spec.add_development_dependency 'rspec'
|
41
|
+
spec.add_development_dependency 'simplecov'
|
42
|
+
spec.add_development_dependency 'wetransfer_style', '1.0.0'
|
43
|
+
spec.add_development_dependency 'yard'
|
46
44
|
end
|
data/lib/care.rb
CHANGED
@@ -96,12 +96,8 @@ class Care
|
|
96
96
|
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
97
97
|
# @raise ArgumentError
|
98
98
|
def byteslice(io, at, n_bytes)
|
99
|
-
if n_bytes < 1
|
100
|
-
|
101
|
-
end
|
102
|
-
if at < 0
|
103
|
-
raise ArgumentError, "Negative offsets are not supported (got #{at})"
|
104
|
-
end
|
99
|
+
raise ArgumentError, "The number of bytes to fetch must be a positive Integer, but was #{n_bytes}" if n_bytes < 1
|
100
|
+
raise ArgumentError, "Negative offsets are not supported (got #{at})" if at < 0
|
105
101
|
|
106
102
|
first_page = at / @page_size
|
107
103
|
last_page = (at + n_bytes) / @page_size
|
@@ -174,16 +170,14 @@ class Care
|
|
174
170
|
# @param io[IO] the IO to read from
|
175
171
|
# @param page_i[Integer] which page (zero-based) to read
|
176
172
|
def read_page(io, page_i)
|
177
|
-
Measurometer.increment_counter('format_parser.parser.
|
173
|
+
Measurometer.increment_counter('format_parser.parser.care.page_reads_from_upsteam', 1)
|
178
174
|
|
179
175
|
io.seek(page_i * @page_size)
|
180
|
-
read_result = Measurometer.instrument('format_parser.
|
176
|
+
read_result = Measurometer.instrument('format_parser.care.read_page') { io.read(@page_size) }
|
181
177
|
if read_result.nil?
|
182
178
|
# If the read went past the end of the IO the read result will be nil,
|
183
179
|
# so we know our IO is exhausted here
|
184
|
-
if @lowest_known_empty_page.nil? || @lowest_known_empty_page > page_i
|
185
|
-
@lowest_known_empty_page = page_i
|
186
|
-
end
|
180
|
+
@lowest_known_empty_page = page_i if @lowest_known_empty_page.nil? || @lowest_known_empty_page > page_i
|
187
181
|
elsif read_result.bytesize < @page_size
|
188
182
|
# If we read less than we initially wanted we know there are no pages
|
189
183
|
# to read following this one, so we can also optimize
|
data/lib/format_parser.rb
CHANGED
@@ -20,6 +20,7 @@ module FormatParser
|
|
20
20
|
require_relative 'care'
|
21
21
|
require_relative 'active_storage/blob_analyzer'
|
22
22
|
require_relative 'text'
|
23
|
+
require_relative 'string'
|
23
24
|
|
24
25
|
# Define Measurometer in the internal namespace as well
|
25
26
|
# so that we stay compatible for the applications that use it
|
@@ -87,8 +88,8 @@ module FormatParser
|
|
87
88
|
# Parses the resource at the given `url` and returns the results as if it were any IO
|
88
89
|
# given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
|
89
90
|
#
|
90
|
-
# @param url[String, URI] the HTTP(S) URL to request the object from using
|
91
|
-
# @param headers[Hash] (optional) the HTTP headers to request the object from
|
91
|
+
# @param url[String, URI] the HTTP(S) URL to request the object from using `Range:` requests
|
92
|
+
# @param headers[Hash] (optional) the HTTP headers to request the object from
|
92
93
|
# @param kwargs the keyword arguments to be delegated to `.parse`
|
93
94
|
# @see {.parse}
|
94
95
|
def self.parse_http(url, headers: {}, **kwargs)
|
@@ -177,9 +178,7 @@ module FormatParser
|
|
177
178
|
# Convert the results from a lazy enumerator to an Array.
|
178
179
|
results = results.to_a
|
179
180
|
|
180
|
-
if results.empty?
|
181
|
-
Measurometer.increment_counter('format_parser.unknown_files', 1)
|
182
|
-
end
|
181
|
+
Measurometer.increment_counter('format_parser.unknown_files', 1) if results.empty?
|
183
182
|
|
184
183
|
amount == 1 ? results.first : results
|
185
184
|
ensure
|
@@ -202,12 +201,12 @@ module FormatParser
|
|
202
201
|
end
|
203
202
|
|
204
203
|
def self.execute_parser_and_capture_expected_exceptions(parser, limited_io)
|
205
|
-
parser_name_for_instrumentation = parser.class.to_s.split('::').last
|
204
|
+
parser_name_for_instrumentation = parser.class.to_s.split('::').last.underscore
|
206
205
|
Measurometer.instrument('format_parser.parser.%s' % parser_name_for_instrumentation) do
|
207
206
|
parser.call(limited_io).tap do |result|
|
208
207
|
if result
|
209
|
-
Measurometer.increment_counter('format_parser.detected_natures
|
210
|
-
Measurometer.increment_counter('format_parser.detected_formats
|
208
|
+
Measurometer.increment_counter('format_parser.detected_natures', 1, nature: result.nature)
|
209
|
+
Measurometer.increment_counter('format_parser.detected_formats', 1, format: result.format)
|
211
210
|
end
|
212
211
|
end
|
213
212
|
end
|
@@ -252,9 +251,7 @@ module FormatParser
|
|
252
251
|
fitting_by_formats = assemble_parser_set[@parsers_per_format, desired_formats]
|
253
252
|
parsers = fitting_by_natures & fitting_by_formats
|
254
253
|
|
255
|
-
if parsers.empty?
|
256
|
-
raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}"
|
257
|
-
end
|
254
|
+
raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}" if parsers.empty?
|
258
255
|
|
259
256
|
# Order the parsers according to their priority value. The ones having a lower
|
260
257
|
# value will sort higher and will be applied sooner
|
data/lib/io_utils.rb
CHANGED
@@ -9,12 +9,8 @@ module FormatParser::IOUtils
|
|
9
9
|
raise ArgumentError, 'Unbounded reads are not supported' if n.nil?
|
10
10
|
buf = io.read(n)
|
11
11
|
|
12
|
-
unless buf
|
13
|
-
|
14
|
-
end
|
15
|
-
if buf.bytesize != n
|
16
|
-
raise InvalidRead, "We wanted to read #{n} bytes from the IO, but we got #{buf.bytesize} instead"
|
17
|
-
end
|
12
|
+
raise InvalidRead, "We wanted to read #{n} bytes from the IO, but the IO is at EOF" unless buf
|
13
|
+
raise InvalidRead, "We wanted to read #{n} bytes from the IO, but we got #{buf.bytesize} instead" if buf.bytesize != n
|
18
14
|
|
19
15
|
buf
|
20
16
|
end
|
@@ -33,23 +33,17 @@ class FormatParser::AdtsHeaderInfo
|
|
33
33
|
MPEG_VERSION_HASH = { 0 => 'MPEG-4', 1 => 'MPEG-2'}
|
34
34
|
|
35
35
|
def mpeg4_sampling_frequency
|
36
|
-
if !@mpeg4_sampling_frequency_index.nil? && MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH.key?(@mpeg4_sampling_frequency_index)
|
37
|
-
return MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH[@mpeg4_sampling_frequency_index]
|
38
|
-
end
|
36
|
+
return MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH[@mpeg4_sampling_frequency_index] if !@mpeg4_sampling_frequency_index.nil? && MPEG4_AUDIO_SAMPLING_FREQUENCY_HASH.key?(@mpeg4_sampling_frequency_index)
|
39
37
|
nil
|
40
38
|
end
|
41
39
|
|
42
40
|
def profile_description
|
43
|
-
if !@profile.nil? && AAC_PROFILE_DESCRIPTION_HASH.key?(@profile)
|
44
|
-
return AAC_PROFILE_DESCRIPTION_HASH[@profile]
|
45
|
-
end
|
41
|
+
return AAC_PROFILE_DESCRIPTION_HASH[@profile] if !@profile.nil? && AAC_PROFILE_DESCRIPTION_HASH.key?(@profile)
|
46
42
|
nil
|
47
43
|
end
|
48
44
|
|
49
45
|
def mpeg_version_description
|
50
|
-
if !@mpeg_version.nil? && MPEG_VERSION_HASH.key?(@mpeg_version)
|
51
|
-
return MPEG_VERSION_HASH[@mpeg_version]
|
52
|
-
end
|
46
|
+
return MPEG_VERSION_HASH[@mpeg_version] if !@mpeg_version.nil? && MPEG_VERSION_HASH.key?(@mpeg_version)
|
53
47
|
nil
|
54
48
|
end
|
55
49
|
|
data/lib/parsers/exif_parser.rb
CHANGED
@@ -125,9 +125,7 @@ module FormatParser::EXIFParser
|
|
125
125
|
# those and return the _last_ non-0 orientation, or 0 otherwise
|
126
126
|
@multiple_exif_results.reverse_each do |exif_tag_frame|
|
127
127
|
orientation_value = exif_tag_frame.orientation
|
128
|
-
if !orientation_value.nil? && orientation_value != 0
|
129
|
-
return orientation_value
|
130
|
-
end
|
128
|
+
return orientation_value if !orientation_value.nil? && orientation_value != 0
|
131
129
|
end
|
132
130
|
0 # If none were found - the orientation is unknown
|
133
131
|
end
|
@@ -175,7 +173,7 @@ module FormatParser::EXIFParser
|
|
175
173
|
EXIFR.logger = Logger.new(nil)
|
176
174
|
|
177
175
|
def exif_from_tiff_io(constrained_io, should_include_sub_ifds = false)
|
178
|
-
Measurometer.instrument('format_parser.
|
176
|
+
Measurometer.instrument('format_parser.exif_parser.exif_from_tiff_io') do
|
179
177
|
extended_io = IOExt.new(constrained_io)
|
180
178
|
exif_raw_data = EXIFR::TIFF.new(extended_io)
|
181
179
|
|
data/lib/parsers/fdx_parser.rb
CHANGED
@@ -24,9 +24,9 @@ class FormatParser::FDXParser
|
|
24
24
|
def check_for_document_type(file_and_document_type)
|
25
25
|
sanitized_data = file_and_document_type.downcase
|
26
26
|
if sanitized_data.include?('finaldraft') && sanitized_data.include?('script')
|
27
|
-
|
27
|
+
[:fdx, :script]
|
28
28
|
else
|
29
|
-
|
29
|
+
nil
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
data/lib/parsers/flac_parser.rb
CHANGED
@@ -20,15 +20,11 @@ class FormatParser::FLACParser
|
|
20
20
|
|
21
21
|
minimum_block_size = bytestring_to_int(safe_read(io, 2))
|
22
22
|
|
23
|
-
if minimum_block_size < 16
|
24
|
-
raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
|
25
|
-
end
|
23
|
+
raise MalformedFile, 'FLAC file minimum block size must be larger than 16' if minimum_block_size < 16
|
26
24
|
|
27
25
|
maximum_block_size = bytestring_to_int(safe_read(io, 2))
|
28
26
|
|
29
|
-
if maximum_block_size < minimum_block_size
|
30
|
-
raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
|
31
|
-
end
|
27
|
+
raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size' if maximum_block_size < minimum_block_size
|
32
28
|
|
33
29
|
minimum_frame_size = bytestring_to_int(safe_read(io, 3))
|
34
30
|
maximum_frame_size = bytestring_to_int(safe_read(io, 3))
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -69,7 +69,7 @@ class FormatParser::JPEGParser
|
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
72
|
-
Measurometer.add_distribution_value('format_parser.
|
72
|
+
Measurometer.add_distribution_value('format_parser.jpeg_parser.bytes_read_until_capture', @buf.pos)
|
73
73
|
|
74
74
|
# A single file might contain multiple EXIF data frames. In a JPEG this would
|
75
75
|
# manifest as multiple APP1 markers. The way different programs handle these
|
@@ -156,7 +156,7 @@ class FormatParser::JPEGParser
|
|
156
156
|
# Use StringIO.new instead of #write - https://github.com/aws/aws-sdk-ruby/issues/785#issuecomment-95456838
|
157
157
|
exif_buf = StringIO.new(safe_read(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize))
|
158
158
|
|
159
|
-
Measurometer.add_distribution_value('format_parser.
|
159
|
+
Measurometer.add_distribution_value('format_parser.jpeg_parser.bytes_sent_to_exif_parser', exif_buf.size)
|
160
160
|
|
161
161
|
@exif_data_frames << exif_from_tiff_io(exif_buf)
|
162
162
|
rescue EXIFR::MalformedTIFF
|
data/lib/parsers/moov_parser.rb
CHANGED
@@ -37,7 +37,7 @@ class FormatParser::MOOVParser
|
|
37
37
|
# size that gets parsed just before.
|
38
38
|
max_read_offset = 0xFFFFFFFF
|
39
39
|
decoder = Decoder.new
|
40
|
-
atom_tree = Measurometer.instrument('format_parser.
|
40
|
+
atom_tree = Measurometer.instrument('format_parser.decoder.extract_atom_stream') do
|
41
41
|
decoder.extract_atom_stream(io, max_read_offset)
|
42
42
|
end
|
43
43
|
|
@@ -93,12 +93,10 @@ class FormatParser::MOOVParser
|
|
93
93
|
def parse_dimensions(decoder, atom_tree)
|
94
94
|
video_trak_atom = decoder.find_video_trak_atom(atom_tree)
|
95
95
|
|
96
|
-
tkhd =
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
decoder.find_first_atom_by_path(atom_tree, 'moov', 'trak', 'tkhd')
|
101
|
-
end
|
96
|
+
tkhd = if video_trak_atom
|
97
|
+
decoder.find_first_atom_by_path([video_trak_atom], 'trak', 'tkhd')
|
98
|
+
else
|
99
|
+
decoder.find_first_atom_by_path(atom_tree, 'moov', 'trak', 'tkhd')
|
102
100
|
end
|
103
101
|
|
104
102
|
if tkhd
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -179,13 +179,9 @@ class FormatParser::MP3Parser
|
|
179
179
|
frame_data_str = io.read(frame_detail.frame_length)
|
180
180
|
io.seek(io.pos - frame_detail.frame_length)
|
181
181
|
xing_header = attempt_xing_header(frame_data_str)
|
182
|
-
if xing_header_usable_for_duration?(xing_header)
|
183
|
-
return [xing_header, mpeg_frames]
|
184
|
-
end
|
185
|
-
end
|
186
|
-
if frame_detail.frame_length > 1 # jump over current frame body
|
187
|
-
io.seek(io.pos + frame_detail.frame_length - bytes_to_read)
|
182
|
+
return [xing_header, mpeg_frames] if xing_header_usable_for_duration?(xing_header)
|
188
183
|
end
|
184
|
+
io.seek(io.pos + frame_detail.frame_length - bytes_to_read) if frame_detail.frame_length > 1 # jump over current frame body
|
189
185
|
end
|
190
186
|
[nil, mpeg_frames]
|
191
187
|
rescue InvalidDeepFetch # A frame was invalid - bail out since it's unlikely we can recover
|
data/lib/parsers/mpeg_parser.rb
CHANGED
@@ -44,9 +44,7 @@ class FormatParser::MPEGParser
|
|
44
44
|
io.seek(pos + 1)
|
45
45
|
horizontal_size, vertical_size = parse_image_size(io)
|
46
46
|
ratio_code, rate_code = parse_rate_information(io)
|
47
|
-
if valid_aspect_ratio_code?(ratio_code) && valid_frame_rate_code?(rate_code)
|
48
|
-
return file_info(horizontal_size, vertical_size, ratio_code, rate_code)
|
49
|
-
end
|
47
|
+
return file_info(horizontal_size, vertical_size, ratio_code, rate_code) if valid_aspect_ratio_code?(ratio_code) && valid_frame_rate_code?(rate_code)
|
50
48
|
end
|
51
49
|
nil # otherwise the return value of Integer#times will be returned
|
52
50
|
rescue FormatParser::IOUtils::InvalidRead
|
data/lib/parsers/wav_parser.rb
CHANGED
@@ -34,9 +34,7 @@ class FormatParser::WAVParser
|
|
34
34
|
case chunk_type
|
35
35
|
when 'fmt ' # watch out: the chunk ID of the format chunk ends with a space
|
36
36
|
fmt_data = unpack_fmt_chunk(io, chunk_size)
|
37
|
-
if fmt_data[:audio_format] != 1 and fact_processed
|
38
|
-
return process_non_pcm(fmt_data, total_sample_frames)
|
39
|
-
end
|
37
|
+
return process_non_pcm(fmt_data, total_sample_frames) if fmt_data[:audio_format] != 1 and fact_processed
|
40
38
|
fmt_processed = true
|
41
39
|
when 'data'
|
42
40
|
return unless fmt_processed # the 'data' chunk cannot precede the 'fmt ' chunk
|
@@ -45,11 +43,10 @@ class FormatParser::WAVParser
|
|
45
43
|
when 'fact'
|
46
44
|
total_sample_frames = safe_read(io, 4).unpack('l').first
|
47
45
|
safe_skip(io, chunk_size - 4)
|
48
|
-
if fmt_processed and fmt_data[:audio_format] != 1
|
49
|
-
return process_non_pcm(fmt_data, total_sample_frames)
|
50
|
-
end
|
46
|
+
return process_non_pcm(fmt_data, total_sample_frames) if fmt_processed and fmt_data[:audio_format] != 1
|
51
47
|
fact_processed = true
|
52
|
-
else
|
48
|
+
else
|
49
|
+
# Skip this chunk until a known chunk is encountered
|
53
50
|
safe_skip(io, chunk_size)
|
54
51
|
end
|
55
52
|
end
|
@@ -70,11 +67,11 @@ class FormatParser::WAVParser
|
|
70
67
|
safe_skip(io, chunk_size - 16) # skip the extra fields
|
71
68
|
|
72
69
|
{
|
73
|
-
audio_format:
|
74
|
-
channels:
|
75
|
-
sample_rate:
|
76
|
-
byte_rate:
|
77
|
-
block_align:
|
70
|
+
audio_format: fmt_info[0],
|
71
|
+
channels: fmt_info[1],
|
72
|
+
sample_rate: fmt_info[2],
|
73
|
+
byte_rate: fmt_info[3],
|
74
|
+
block_align: fmt_info[4],
|
78
75
|
bits_per_sample: fmt_info[5],
|
79
76
|
}
|
80
77
|
end
|
@@ -27,52 +27,43 @@ class FormatParser::ZIPParser::FileReader
|
|
27
27
|
# To prevent too many tiny reads, read the maximum possible size of end of
|
28
28
|
# central directory record upfront (all the fixed fields + at most 0xFFFF
|
29
29
|
# bytes of the archive comment)
|
30
|
-
MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
2 + # The comment size
|
40
|
-
0xFFFF # Maximum comment size
|
41
|
-
end
|
30
|
+
MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE = 4 + # Offset of the start of central directory
|
31
|
+
4 + # Size of the central directory
|
32
|
+
2 + # Number of files in the cdir
|
33
|
+
4 + # End-of-central-directory signature
|
34
|
+
2 + # Number of this disk
|
35
|
+
2 + # Number of disk with the start of cdir
|
36
|
+
2 + # Number of files in the cdir of this disk
|
37
|
+
2 + # The comment size
|
38
|
+
0xFFFF # Maximum comment size
|
42
39
|
|
43
40
|
# To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
|
44
41
|
# The maximum size is all the usual items, plus the maximum size
|
45
42
|
# of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
|
46
|
-
MAX_LOCAL_HEADER_SIZE =
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
2 + # Number of the disk with the EOCD record
|
68
|
-
2 + # Number of entries in the central directory of this disk
|
69
|
-
2 + # Number of entries in the central directory total
|
70
|
-
4 + # Size of the central directory
|
71
|
-
4 # Start of the central directory offset
|
72
|
-
end
|
43
|
+
MAX_LOCAL_HEADER_SIZE = 4 + # signature
|
44
|
+
2 + # Version needed to extract
|
45
|
+
2 + # gp flags
|
46
|
+
2 + # storage mode
|
47
|
+
2 + # dos time
|
48
|
+
2 + # dos date
|
49
|
+
4 + # CRC32
|
50
|
+
4 + # Comp size
|
51
|
+
4 + # Uncomp size
|
52
|
+
2 + # Filename size
|
53
|
+
2 + # Extra fields size
|
54
|
+
0xFFFF + # Maximum filename size
|
55
|
+
0xFFFF # Maximum extra fields size
|
56
|
+
|
57
|
+
SIZE_OF_USABLE_EOCD_RECORD = 4 + # Signature
|
58
|
+
2 + # Number of this disk
|
59
|
+
2 + # Number of the disk with the EOCD record
|
60
|
+
2 + # Number of entries in the central directory of this disk
|
61
|
+
2 + # Number of entries in the central directory total
|
62
|
+
4 + # Size of the central directory
|
63
|
+
4 # Start of the central directory offset
|
73
64
|
|
74
65
|
private_constant :C_UINT32LE, :C_UINT16LE, :C_UINT64LE, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
|
75
|
-
|
66
|
+
:MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
|
76
67
|
|
77
68
|
# Represents a file within the ZIP archive being read
|
78
69
|
class ZipEntry
|
@@ -216,7 +207,7 @@ class FormatParser::ZIPParser::FileReader
|
|
216
207
|
io.seek(absolute_pos)
|
217
208
|
unless absolute_pos == io.pos
|
218
209
|
raise ReadError,
|
219
|
-
|
210
|
+
"Expected to seek to #{absolute_pos} but only got to #{io.pos}"
|
220
211
|
end
|
221
212
|
nil
|
222
213
|
end
|
@@ -235,18 +226,14 @@ class FormatParser::ZIPParser::FileReader
|
|
235
226
|
io.seek(io.pos + n)
|
236
227
|
pos_after = io.pos
|
237
228
|
delta = pos_after - pos_before
|
238
|
-
unless delta == n
|
239
|
-
raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead"
|
240
|
-
end
|
229
|
+
raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead" unless delta == n
|
241
230
|
nil
|
242
231
|
end
|
243
232
|
|
244
233
|
def read_n(io, n_bytes)
|
245
234
|
io.read(n_bytes).tap do |d|
|
246
235
|
raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
|
247
|
-
unless d.bytesize == n_bytes
|
248
|
-
raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
|
249
|
-
end
|
236
|
+
raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}" unless d.bytesize == n_bytes
|
250
237
|
end
|
251
238
|
end
|
252
239
|
|
@@ -310,15 +297,9 @@ class FormatParser::ZIPParser::FileReader
|
|
310
297
|
#
|
311
298
|
# It means that before we read this stuff we need to check if the previously-read
|
312
299
|
# values are at overflow, and only _then_ proceed to read them. Bah.
|
313
|
-
if e.uncompressed_size == 0xFFFFFFFF
|
314
|
-
|
315
|
-
|
316
|
-
if e.compressed_size == 0xFFFFFFFF
|
317
|
-
e.compressed_size = read_8b(zip64_extra)
|
318
|
-
end
|
319
|
-
if e.local_file_header_offset == 0xFFFFFFFF
|
320
|
-
e.local_file_header_offset = read_8b(zip64_extra)
|
321
|
-
end
|
300
|
+
e.uncompressed_size = read_8b(zip64_extra) if e.uncompressed_size == 0xFFFFFFFF
|
301
|
+
e.compressed_size = read_8b(zip64_extra) if e.compressed_size == 0xFFFFFFFF
|
302
|
+
e.local_file_header_offset = read_8b(zip64_extra) if e.local_file_header_offset == 0xFFFFFFFF
|
322
303
|
# Disk number comes last and we can skip it anyway, since we do
|
323
304
|
# not support multi-disk archives
|
324
305
|
end
|
@@ -370,9 +351,7 @@ class FormatParser::ZIPParser::FileReader
|
|
370
351
|
signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
|
371
352
|
|
372
353
|
# Check the only condition for the match
|
373
|
-
if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
|
374
|
-
return check_at # Found the EOCD marker location
|
375
|
-
end
|
354
|
+
return check_at if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
|
376
355
|
end
|
377
356
|
# If we haven't caught anything, return nil deliberately instead of returning the last statement
|
378
357
|
nil
|
@@ -422,16 +401,12 @@ class FormatParser::ZIPParser::FileReader
|
|
422
401
|
|
423
402
|
disk_n = read_4b(zip64_eocdr) # number of this disk
|
424
403
|
disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
|
425
|
-
if disk_n != disk_n_with_eocdr
|
426
|
-
raise UnsupportedFeature, 'The archive spans multiple disks'
|
427
|
-
end
|
404
|
+
raise UnsupportedFeature, 'The archive spans multiple disks' if disk_n != disk_n_with_eocdr
|
428
405
|
|
429
406
|
num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
|
430
|
-
num_files_total
|
407
|
+
num_files_total = read_8b(zip64_eocdr) # files total in the central directory
|
431
408
|
|
432
|
-
if num_files_this_disk != num_files_total
|
433
|
-
raise UnsupportedFeature, 'The archive spans multiple disks'
|
434
|
-
end
|
409
|
+
raise UnsupportedFeature, 'The archive spans multiple disks' if num_files_this_disk != num_files_total
|
435
410
|
|
436
411
|
log do
|
437
412
|
format(
|
@@ -439,8 +414,8 @@ class FormatParser::ZIPParser::FileReader
|
|
439
414
|
num_files_total)
|
440
415
|
end
|
441
416
|
|
442
|
-
central_dir_size
|
443
|
-
central_dir_offset
|
417
|
+
central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
|
418
|
+
central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
|
444
419
|
|
445
420
|
[num_files_total, central_dir_offset, central_dir_size]
|
446
421
|
end
|
@@ -456,8 +431,8 @@ class FormatParser::ZIPParser::FileReader
|
|
456
431
|
skip_ahead_2(io) # number_of_this_disk
|
457
432
|
skip_ahead_2(io) # number of the disk with the EOCD record
|
458
433
|
skip_ahead_2(io) # number of entries in the central directory of this disk
|
459
|
-
num_files = read_2b(io)
|
460
|
-
cdir_size = read_4b(io)
|
434
|
+
num_files = read_2b(io) # number of entries in the central directory total
|
435
|
+
cdir_size = read_4b(io) # size of the central directory
|
461
436
|
cdir_offset = read_4b(io) # start of central directory offset
|
462
437
|
[num_files, cdir_offset, cdir_size]
|
463
438
|
end
|
data/lib/parsers/zip_parser.rb
CHANGED
data/lib/read_limiter.rb
CHANGED
@@ -45,9 +45,7 @@ class FormatParser::ReadLimiter
|
|
45
45
|
# @return Integer
|
46
46
|
def seek(to)
|
47
47
|
@seeks += 1
|
48
|
-
if @max_seeks && @seeks > @max_seeks
|
49
|
-
raise BudgetExceeded, 'Seek budget exceeded (%d seeks performed)' % @max_seeks
|
50
|
-
end
|
48
|
+
raise BudgetExceeded, 'Seek budget exceeded (%d seeks performed)' % @max_seeks if @max_seeks && @seeks > @max_seeks
|
51
49
|
@io.seek(to)
|
52
50
|
end
|
53
51
|
|
@@ -60,26 +58,20 @@ class FormatParser::ReadLimiter
|
|
60
58
|
@bytes += n_bytes
|
61
59
|
@reads += 1
|
62
60
|
|
63
|
-
if @max_bytes && @bytes > @max_bytes
|
64
|
-
|
65
|
-
end
|
66
|
-
|
67
|
-
if @max_reads && @reads > @max_reads
|
68
|
-
raise BudgetExceeded, 'Number of read() calls exceeded (%d max)' % @max_reads
|
69
|
-
end
|
61
|
+
raise BudgetExceeded, 'Read bytes budget (%d) exceeded' % @max_bytes if @max_bytes && @bytes > @max_bytes
|
62
|
+
raise BudgetExceeded, 'Number of read() calls exceeded (%d max)' % @max_reads if @max_reads && @reads > @max_reads
|
70
63
|
|
71
64
|
@io.read(n_bytes)
|
72
65
|
end
|
73
66
|
|
74
67
|
# Sends the metrics about the state of this ReadLimiter to a Measurometer
|
75
68
|
#
|
76
|
-
# @param
|
77
|
-
# `format_parser.TIFF.read_limiter.num_seeks` and so forth
|
69
|
+
# @param parser[String] the parser to add as a tag.
|
78
70
|
# @return void
|
79
|
-
def send_metrics(
|
80
|
-
Measurometer.add_distribution_value('format_parser
|
81
|
-
Measurometer.add_distribution_value('format_parser
|
82
|
-
Measurometer.add_distribution_value('format_parser
|
71
|
+
def send_metrics(parser)
|
72
|
+
Measurometer.add_distribution_value('format_parser.read_limiter.num_seeks', @seeks, parser: parser)
|
73
|
+
Measurometer.add_distribution_value('format_parser.read_limiter.num_reads', @reads, parser: parser)
|
74
|
+
Measurometer.add_distribution_value('format_parser.read_limiter.read_bytes', @bytes, parser: parser)
|
83
75
|
end
|
84
76
|
|
85
77
|
# Resets all the recorded call counters so that the object can be reused for the next parser,
|