format_parser 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -0
- data/CONTRIBUTING.md +6 -2
- data/lib/care.rb +52 -3
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +95 -21
- data/lib/io_constraint.rb +17 -2
- data/lib/io_utils.rb +1 -5
- data/lib/measurometer.rb +100 -0
- data/lib/parsers/jpeg_parser.rb +1 -0
- data/lib/parsers/moov_parser.rb +0 -4
- data/lib/read_limiter.rb +42 -5
- data/lib/read_limits_config.rb +26 -0
- data/lib/remote_io.rb +7 -0
- data/spec/io_utils_spec.rb +1 -7
- data/spec/measurometer_spec.rb +48 -0
- data/spec/remote_fetching_spec.rb +5 -4
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 789592107e9fa74091745b703249248b6a05e9dd73af45803ab799708f8498fc
|
4
|
+
data.tar.gz: 191ff20e5b8d455f681eb19b40a2bd406c1cfa1a9b2aeebaa483ea829071e532
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dde5379a4d0590019ac6eee21361e10688f8582efa169132f28a0dc531f6b05811368e987a3a30902dcc1a509952c11bb9810bee3f3dbd17e99f3c13f1776b71
|
7
|
+
data.tar.gz: ec5a699fb441ca622006fb2885a6e1ce623da0a7ea6a5b78e5df83db2a1f9227dd023681f12a7ec703188133a8276e363dd3e5c090bca3ce694064124925e47e
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
## 0.8.0
|
2
|
+
* Add `Measurometer` for applying instrumentation around FormatParser operations. See documentation for usage.
|
3
|
+
|
4
|
+
## 0.7.0
|
5
|
+
* Configure read limits / pagefault limits centrally so that those limits make sense together
|
6
|
+
|
7
|
+
## 0.6.0
|
8
|
+
* Double the cache page size once more
|
9
|
+
* We no longer need exifr/jpeg
|
10
|
+
* Fix EXIF parsing in JPEG files
|
11
|
+
* Reject Keynote documents in JPEG parser
|
12
|
+
|
13
|
+
## 0.5.2
|
14
|
+
* Do not raise EXIFR errors for keynote files
|
15
|
+
* Correct broken comment for the audio nature
|
16
|
+
|
17
|
+
## 0.5.1
|
18
|
+
* Raise the cache page size during detection
|
19
|
+
* Fix ZIP entry filename parsing
|
20
|
+
|
21
|
+
## 0.5.0
|
22
|
+
* Add FLAC parser
|
23
|
+
* Add parse_atom_children_and_data_fields support
|
24
|
+
* Add basic detection of Office files
|
25
|
+
* Optimize EOCD signature lookup
|
26
|
+
|
27
|
+
## 0.4.0
|
28
|
+
* Adds a basic PDF parser
|
29
|
+
* Make sure root: and to_json without arguments work
|
30
|
+
* ZIP file format support
|
31
|
+
|
32
|
+
## 0.3.5
|
33
|
+
* Fix the bug with EXIF dimensions being used instead of pixel dimensions
|
34
|
+
|
35
|
+
## 0.3.4
|
36
|
+
* Pagefault limit
|
37
|
+
* Add seek modes required by exifr
|
38
|
+
|
39
|
+
## 0.3.3
|
40
|
+
* Implement a sane to_json as well
|
41
|
+
|
42
|
+
## 0.3.2
|
43
|
+
* Add default as_json
|
44
|
+
* Test on 2.5.0
|
45
|
+
|
46
|
+
## 0.3.1
|
47
|
+
* Remove post install warning
|
48
|
+
* Moved aiff_parser_spec.rb to spec/parsers
|
49
|
+
* CR2 file support
|
50
|
+
* Add require 'set' to format_parser.rb
|
51
|
+
* Use register_parser for natures/fmts
|
52
|
+
|
53
|
+
## 0.3.0
|
54
|
+
* Reverse API changes to support :first as default and add opts to parse_http
|
55
|
+
* Implement and comply with rubocop
|
56
|
+
* JPEG parser and Care fixes
|
57
|
+
* Add format and count options to parse_http
|
58
|
+
* Return first result as default
|
59
|
+
* Use hashes for MOOV atom default fields
|
60
|
+
|
61
|
+
## 0.2.0
|
62
|
+
* Implement parser DSL
|
63
|
+
|
64
|
+
## 0.1.7
|
65
|
+
* Fix read(0) on Care::IOWrapper, introduce top-level tests
|
66
|
+
|
67
|
+
## 0.1.6
|
68
|
+
* Fix mp3 parsing bug
|
69
|
+
* Add MOOV parser
|
70
|
+
|
71
|
+
## 0.1.5
|
72
|
+
* Add FDX parser
|
73
|
+
* Remove dry-structs
|
74
|
+
* New interface updates
|
75
|
+
|
76
|
+
## 0.1.4
|
77
|
+
* Add WAV parser
|
78
|
+
|
79
|
+
## 0.1.3
|
80
|
+
* Add MP3 parser
|
81
|
+
* Add FileInformation#intrinsics
|
82
|
+
* Disallow negative Care offsets
|
83
|
+
|
84
|
+
## 0.1.2
|
85
|
+
* Introduce a restrictive IO subset wrapper
|
86
|
+
* Switch rewind for seek in exif parser
|
87
|
+
* Prep for OSS release
|
88
|
+
* Add fuzz spec
|
89
|
+
* Improve orientation parsing
|
90
|
+
* Optimisation for PNG and invalid input protection on JPEG
|
91
|
+
|
92
|
+
## 0.1.1
|
93
|
+
* Add AIFF parser
|
94
|
+
|
95
|
+
## 0.1.0
|
96
|
+
* Add parsers for PNG, JPG, TIFF, PSD
|
97
|
+
* Add GIF parser
|
98
|
+
* Add DPX parser
|
data/CONTRIBUTING.md
CHANGED
@@ -16,7 +16,7 @@ If you are interested in contributing code and would like to learn more about th
|
|
16
16
|
|
17
17
|
- [ruby](https://ruby-doc.org)
|
18
18
|
- [rspec](http://rspec.info/) (for testing)
|
19
|
-
|
19
|
+
|
20
20
|
# How do I make a contribution?
|
21
21
|
|
22
22
|
## Using the issue tracker
|
@@ -101,7 +101,7 @@ project's developers might not want to merge into the project.
|
|
101
101
|
Please adhere to the coding conventions used throughout the project (indentation,
|
102
102
|
accurate comments, etc.) and any other requirements (such as test coverage).
|
103
103
|
|
104
|
-
The test suite can be run with `bundle exec rspec`.
|
104
|
+
The test suite can be run with `bundle exec rspec`.
|
105
105
|
|
106
106
|
Follow this process if you'd like your work considered for inclusion in the
|
107
107
|
project:
|
@@ -155,3 +155,7 @@ project:
|
|
155
155
|
license your work under the same license as that used by the project, which you
|
156
156
|
can see by clicking [here](https://github.com/WeTransfer/format_parser/blob/master/LICENSE.txt).
|
157
157
|
This provision also applies to the test files you include with the changed code as fixtures.
|
158
|
+
|
159
|
+
## Changelog
|
160
|
+
|
161
|
+
When creating a new release you must add an entry in the `CHANGELOG.md`.
|
data/lib/care.rb
CHANGED
@@ -4,27 +4,49 @@
|
|
4
4
|
# is only available via HTTP, for example, we can have less
|
5
5
|
# fetches and have them return more data for one fetch
|
6
6
|
class Care
|
7
|
+
# Defines the size of a page in bytes that the Care will prefetch
|
7
8
|
DEFAULT_PAGE_SIZE = 128 * 1024
|
8
9
|
|
10
|
+
# Wraps any given IO with Care caching superpowers. Supports the subset
|
11
|
+
# of IO declared in IOConstraint.
|
9
12
|
class IOWrapper
|
13
|
+
# Creates a new IOWrapper around the given source IO
|
14
|
+
#
|
15
|
+
# @param io[#seek, #pos, #size] the IO to wrap
|
16
|
+
# @param page_size[Integer] the size of the cache page to use for this wrapper
|
10
17
|
def initialize(io, page_size: DEFAULT_PAGE_SIZE)
|
11
18
|
@cache = Cache.new(page_size)
|
12
19
|
@io = io
|
13
20
|
@pos = 0
|
14
21
|
end
|
15
22
|
|
23
|
+
# Returns the size of the resource contained in the IO
|
24
|
+
#
|
25
|
+
# @return Integer
|
16
26
|
def size
|
17
27
|
@io.size
|
18
28
|
end
|
19
29
|
|
30
|
+
# Seeks the IO to the given absolute offset from the start of the file/resource
|
31
|
+
#
|
32
|
+
# @param to[Integer] offset in the IO
|
33
|
+
# @return Integer
|
20
34
|
def seek(to)
|
21
35
|
@pos = to
|
22
36
|
end
|
23
37
|
|
38
|
+
# Returns the current position/offset within the IO
|
39
|
+
#
|
40
|
+
# @return Integer
|
24
41
|
def pos
|
25
42
|
@pos
|
26
43
|
end
|
27
44
|
|
45
|
+
# Returns at most `n_bytes` of data from the IO or less if less data was available
|
46
|
+
# before the EOF was hit
|
47
|
+
#
|
48
|
+
# @param n_bytes[Integer]
|
49
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
28
50
|
def read(n_bytes)
|
29
51
|
return '' if n_bytes == 0 # As hardcoded for all Ruby IO objects
|
30
52
|
raise ArgumentError, "negative length #{n_bytes} given" if n_bytes < 0 # also as per Ruby IO objects
|
@@ -34,10 +56,17 @@ class Care
|
|
34
56
|
read
|
35
57
|
end
|
36
58
|
|
59
|
+
# Clears all the cached pages explicitly to help GC
|
60
|
+
#
|
61
|
+
# @return void
|
37
62
|
def clear
|
38
63
|
@cache.clear
|
39
64
|
end
|
40
65
|
|
66
|
+
# Clears all the cached pages explicitly to help GC, and
|
67
|
+
# calls `#close` on the source IO if the IO responds to `#close`
|
68
|
+
#
|
69
|
+
# @return void
|
41
70
|
def close
|
42
71
|
clear
|
43
72
|
@io.close if @io.respond_to?(:close)
|
@@ -47,6 +76,7 @@ class Care
|
|
47
76
|
# Stores cached pages of data from the given IO as strings.
|
48
77
|
# Pages are sized to be `page_size` or less (for the last page).
|
49
78
|
class Cache
|
79
|
+
# Initializes a new cache pages container with pages of given size
|
50
80
|
def initialize(page_size = DEFAULT_PAGE_SIZE)
|
51
81
|
@page_size = page_size.to_i
|
52
82
|
raise ArgumentError, 'The page size must be a positive Integer' unless @page_size > 0
|
@@ -59,6 +89,12 @@ class Care
|
|
59
89
|
# If the IO has been exhausted, `nil` will be returned
|
60
90
|
# instead. Will use the cached pages where available,
|
61
91
|
# or fetch pages where necessary
|
92
|
+
#
|
93
|
+
# @param io[#seek, #read] the IO to read data from
|
94
|
+
# @param at[Integer] at which offset we have to read
|
95
|
+
# @param n_bytes[Integer] how many bytes we want to read/cache
|
96
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
97
|
+
# @raise ArgumentError
|
62
98
|
def byteslice(io, at, n_bytes)
|
63
99
|
if n_bytes < 1
|
64
100
|
raise ArgumentError, "The number of bytes to fetch must be a positive Integer, but was #{n_bytes}"
|
@@ -97,10 +133,18 @@ class Care
|
|
97
133
|
slice if slice && !slice.empty?
|
98
134
|
end
|
99
135
|
|
136
|
+
# Clears the page cache of all strings with data
|
137
|
+
#
|
138
|
+
# @return void
|
100
139
|
def clear
|
101
140
|
@pages.clear
|
102
141
|
end
|
103
142
|
|
143
|
+
# Hydrates a page at the certain index or returns the contents of
|
144
|
+
# that page if it is already in the cache
|
145
|
+
#
|
146
|
+
# @param io[IO] the IO to read from
|
147
|
+
# @param page_i[Integer] which page (zero-based) to hydrate and return
|
104
148
|
def hydrate_page(io, page_i)
|
105
149
|
# Avoid trying to read the page if we know there is no content to fill it
|
106
150
|
# in the underlying IO
|
@@ -109,9 +153,9 @@ class Care
|
|
109
153
|
@pages[page_i] ||= read_page(io, page_i)
|
110
154
|
end
|
111
155
|
|
156
|
+
# We provide an overridden implementation of #inspect to avoid
|
157
|
+
# printing the actual contents of the cached pages
|
112
158
|
def inspect
|
113
|
-
# To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
|
114
|
-
|
115
159
|
# Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
|
116
160
|
oid_str = (object_id << 1).to_s(16).rjust(16, '0')
|
117
161
|
|
@@ -124,10 +168,15 @@ class Care
|
|
124
168
|
'#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
|
125
169
|
end
|
126
170
|
|
171
|
+
# Reads the requested page from the given IO
|
172
|
+
#
|
173
|
+
# @param io[IO] the IO to read from
|
174
|
+
# @param page_i[Integer] which page (zero-based) to read
|
127
175
|
def read_page(io, page_i)
|
176
|
+
FormatParser::Measurometer.increment_counter('format_parser.parser.Care.page_reads_from_upsteam', 1)
|
177
|
+
|
128
178
|
io.seek(page_i * @page_size)
|
129
179
|
read_result = io.read(@page_size)
|
130
|
-
|
131
180
|
if read_result.nil?
|
132
181
|
# If the read went past the end of the IO the read result will be nil,
|
133
182
|
# so we know our IO is exhausted here
|
data/lib/format_parser.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'set'
|
2
2
|
|
3
|
+
# A pretty nimble module for parsing file metadata using partial reads. Contains all the
|
4
|
+
# top-level methods of the library.
|
3
5
|
module FormatParser
|
4
6
|
require_relative 'attributes_json'
|
5
7
|
require_relative 'image'
|
@@ -14,9 +16,19 @@ module FormatParser
|
|
14
16
|
require_relative 'io_constraint'
|
15
17
|
require_relative 'care'
|
16
18
|
|
19
|
+
# Is used to manage access to the shared array of parser constructors, which might
|
20
|
+
# potentially be mutated from different threads. The mutex won't be hit too often
|
21
|
+
# since it only locks when adding/removing parsers.
|
17
22
|
PARSER_MUX = Mutex.new
|
18
23
|
MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
|
19
24
|
|
25
|
+
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
26
|
+
# provides out of the box registers itself using this method.
|
27
|
+
#
|
28
|
+
# @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
|
29
|
+
# @param formats[Array<Symbol>] file formats that the parser provides
|
30
|
+
# @param natures[Array<Symbol>] file natures that the parser provides
|
31
|
+
# @return void
|
20
32
|
def self.register_parser(callable_or_responding_to_new, formats:, natures:)
|
21
33
|
parser_provided_formats = Array(formats)
|
22
34
|
parser_provided_natures = Array(natures)
|
@@ -36,6 +48,11 @@ module FormatParser
|
|
36
48
|
end
|
37
49
|
end
|
38
50
|
|
51
|
+
# Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
|
52
|
+
# tests, but can also be used to forcibly disable some formats completely.
|
53
|
+
#
|
54
|
+
# @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
|
55
|
+
# @return void
|
39
56
|
def self.deregister_parser(callable_or_responding_to_new)
|
40
57
|
# Used only in tests
|
41
58
|
PARSER_MUX.synchronize do
|
@@ -45,11 +62,32 @@ module FormatParser
|
|
45
62
|
end
|
46
63
|
end
|
47
64
|
|
65
|
+
# Parses the resource at the given `url` and returns the results as if it were any IO
|
66
|
+
# given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
|
67
|
+
#
|
68
|
+
# @param url[String, URI] the HTTP(S) URL to request the object from using Faraday and `Range:` requests
|
69
|
+
# @param kwargs the keyword arguments to be delegated to `.parse`
|
70
|
+
# @see {.parse}
|
48
71
|
def self.parse_http(url, **kwargs)
|
49
72
|
parse(RemoteIO.new(url), **kwargs)
|
50
73
|
end
|
51
74
|
|
52
|
-
#
|
75
|
+
# Parses the resource contained in the given IO-ish object, and returns either the first matched
|
76
|
+
# result (omitting all the other parsers), the first N results or all results.
|
77
|
+
#
|
78
|
+
# @param io[#seek, #pos, #read] an IO-ish object containing the resource to parse formats for
|
79
|
+
# @param natures[Array] an array of file natures to scope the parsing to.
|
80
|
+
# For example `[:image]` will limit to image files.
|
81
|
+
# The default value is "all natures known to FormatParser"
|
82
|
+
# @param formats[Array] an array of file formats to scope the parsing to.
|
83
|
+
# For example `[:jpg, :tif]` will scope the parsing to TIFF and JPEG files.
|
84
|
+
# The default value is "all formats known to FormatParser"
|
85
|
+
# @param results[:first, :all, Integer] one of the values defining how many results to return if parsing
|
86
|
+
# is ambiguous. The default is `:first` which returns the first matching result. Other
|
87
|
+
# possible values are `:all` to get all possible results and an Integer to return
|
88
|
+
# at most N results.
|
89
|
+
# @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
|
90
|
+
# no useful metadata could be recovered from the file
|
53
91
|
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
|
54
92
|
# We need to apply various limits so that parsers do not over-read, do not cause too many HTTP
|
55
93
|
# requests to be dispatched and so on. These should be _balanced_ with one another- for example,
|
@@ -92,31 +130,59 @@ module FormatParser
|
|
92
130
|
|
93
131
|
# We need to rewind for each parser, anew
|
94
132
|
limited_io.seek(0)
|
95
|
-
|
96
|
-
begin
|
97
|
-
parser.call(limited_io)
|
98
|
-
rescue IOUtils::InvalidRead
|
99
|
-
# There was not enough data for this parser to work on,
|
100
|
-
# and it triggered an error
|
101
|
-
rescue IOUtils::MalformedFile
|
102
|
-
# Unexpected input was encountered during the parsing of
|
103
|
-
# a file. This might indicate either a malicious or a
|
104
|
-
# corruped file.
|
105
|
-
rescue ReadLimiter::BudgetExceeded
|
106
|
-
# The parser tried to read too much - most likely the file structure
|
107
|
-
# caused the parser to go off-track. Strictly speaking we should log this
|
108
|
-
# and examine the file more closely.
|
109
|
-
# Or the parser caused too many cache pages to be fetched, which likely means we should not allow
|
110
|
-
# it to continue
|
111
|
-
end
|
133
|
+
execute_parser_and_capture_expected_exceptions(parser, limited_io)
|
112
134
|
end.reject(&:nil?).take(amount)
|
113
135
|
|
114
|
-
return results.first if amount == 1
|
115
|
-
|
116
136
|
# Convert the results from a lazy enumerator to an Array.
|
117
|
-
results.to_a
|
137
|
+
results = results.to_a
|
138
|
+
|
139
|
+
if results.empty?
|
140
|
+
Measurometer.increment_counter('format_parser.unknown_files', 1)
|
141
|
+
end
|
142
|
+
|
143
|
+
amount == 1 ? results.first : results
|
144
|
+
end
|
145
|
+
|
146
|
+
def self.execute_parser_and_capture_expected_exceptions(parser, limited_io)
|
147
|
+
parser_name_for_instrumentation = parser.class.to_s.split('::').last
|
148
|
+
Measurometer.instrument('format_parser.parser.%s' % parser_name_for_instrumentation) do
|
149
|
+
parser.call(limited_io).tap do |result|
|
150
|
+
if result
|
151
|
+
Measurometer.increment_counter('format_parser.detected_natures.%s' % result.nature, 1)
|
152
|
+
Measurometer.increment_counter('format_parser.detected_formats.%s' % result.format, 1)
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
rescue IOUtils::InvalidRead
|
157
|
+
# There was not enough data for this parser to work on,
|
158
|
+
# and it triggered an error
|
159
|
+
Measurometer.increment_counter('format_parser.invalid_read_errors', 1)
|
160
|
+
rescue IOUtils::MalformedFile
|
161
|
+
# Unexpected input was encountered during the parsing of
|
162
|
+
# a file. This might indicate either a malicious or a
|
163
|
+
# corrupted file.
|
164
|
+
Measurometer.increment_counter('format_parser.malformed_errors', 1)
|
165
|
+
rescue ReadLimiter::BudgetExceeded
|
166
|
+
# The parser tried to read too much - most likely the file structure
|
167
|
+
# caused the parser to go off-track. Strictly speaking we should log this
|
168
|
+
# and examine the file more closely.
|
169
|
+
# Or the parser caused too many cache pages to be fetched, which likely means we should not allow
|
170
|
+
# it to continue
|
171
|
+
Measurometer.increment_counter('format_parser.exceeded_budget_errors', 1)
|
172
|
+
ensure
|
173
|
+
limited_io.send_metrics(parser_name_for_instrumentation)
|
118
174
|
end
|
119
175
|
|
176
|
+
# Returns objects that respond to `call` and can be called to perform parsing
|
177
|
+
# based on the _intersection_ of the two given nature/format constraints. For
|
178
|
+
# example, a constraint of "only image and only ZIP files" can be given -
|
179
|
+
# but would raise an error since no parsers provide both ZIP file parsing and
|
180
|
+
# images as their information.
|
181
|
+
#
|
182
|
+
# @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
|
183
|
+
# @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
|
184
|
+
# @return [Array<#call>] an array of callable parsers
|
185
|
+
# @raise ArgumentError when there are no parsers satisfying the constraint
|
120
186
|
def self.parsers_for(desired_natures, desired_formats)
|
121
187
|
assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
|
122
188
|
hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
|
@@ -133,6 +199,11 @@ module FormatParser
|
|
133
199
|
factories.map { |callable_or_class| instantiate_parser(callable_or_class) }
|
134
200
|
end
|
135
201
|
|
202
|
+
# Instantiates a parser object (an object that responds to `#call`) from a given class
|
203
|
+
# or returns the parameter as is if it is callable by itself - i.e. if it is a Proc
|
204
|
+
#
|
205
|
+
# @param callable_or_responding_to_new[#call, #new] a callable or a Class/Module
|
206
|
+
# @return [#call] a parser that can be called with an IO-ish argument
|
136
207
|
def self.instantiate_parser(callable_or_responding_to_new)
|
137
208
|
if callable_or_responding_to_new.respond_to?(:call)
|
138
209
|
callable_or_responding_to_new
|
@@ -146,4 +217,7 @@ module FormatParser
|
|
146
217
|
Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
|
147
218
|
require parser_file
|
148
219
|
end
|
220
|
+
# The Measurometer latches itself onto existing classes, so load it after
|
221
|
+
# we have loaded all the parsers
|
222
|
+
require_relative 'measurometer'
|
149
223
|
end
|
data/lib/io_constraint.rb
CHANGED
@@ -19,18 +19,33 @@ class FormatParser::IOConstraint
|
|
19
19
|
@io = io
|
20
20
|
end
|
21
21
|
|
22
|
+
# Returns at most `n_bytes` of data from the IO or less if less data was available
|
23
|
+
# before the EOF was hit
|
24
|
+
#
|
25
|
+
# @param n_bytes[Integer]
|
26
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
22
27
|
def read(n_bytes)
|
23
28
|
@io.read(n_bytes)
|
24
29
|
end
|
25
30
|
|
26
|
-
|
27
|
-
|
31
|
+
# Seeks the IO to the given absolute offset from the start of the file/resource
|
32
|
+
#
|
33
|
+
# @param to[Integer] offset in the IO
|
34
|
+
# @return Integer
|
35
|
+
def seek(to)
|
36
|
+
@io.seek(to)
|
28
37
|
end
|
29
38
|
|
39
|
+
# Returns the size of the resource contained in the IO
|
40
|
+
#
|
41
|
+
# @return Integer
|
30
42
|
def size
|
31
43
|
@io.size
|
32
44
|
end
|
33
45
|
|
46
|
+
# Returns the current position/offset within the IO
|
47
|
+
#
|
48
|
+
# @return Integer
|
34
49
|
def pos
|
35
50
|
@io.pos
|
36
51
|
end
|
data/lib/io_utils.rb
CHANGED
data/lib/measurometer.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
class FormatParser::Measurometer
|
2
|
+
class << self
|
3
|
+
# Permits adding instrumentation drivers. Measurometer is 1-1 API
|
4
|
+
# compatible with Appsignal, which we use a lot. So to magically
|
5
|
+
# obtain all Appsignal instrumentation, add the Appsignal module
|
6
|
+
# as a driver.
|
7
|
+
#
|
8
|
+
# Measurometer.drivers << Appsignal
|
9
|
+
#
|
10
|
+
# A driver must be reentrant and thread-safe - it should be possible
|
11
|
+
# to have multiple `instrument` calls open from different threads at the
|
12
|
+
# same time.
|
13
|
+
# The driver must support the same interface as the Measurometer class
|
14
|
+
# itself, minus the `drivers` and `instrument_instance_method` methods.
|
15
|
+
#
|
16
|
+
# @return Array
|
17
|
+
def drivers
|
18
|
+
@drivers ||= []
|
19
|
+
@drivers
|
20
|
+
end
|
21
|
+
|
22
|
+
# Runs a given block within a cascade of `instrument` blocks of all the
|
23
|
+
# added drivers.
|
24
|
+
#
|
25
|
+
# Measurometer.instrument('do_foo') { compute! }
|
26
|
+
#
|
27
|
+
# unfolds to
|
28
|
+
# Appsignal.instrument('do_foo') do
|
29
|
+
# Statsd.timing('do_foo') do
|
30
|
+
# compute!
|
31
|
+
# end
|
32
|
+
# end
|
33
|
+
#
|
34
|
+
# A driver must be reentrant and thread-safe - it should be possible
|
35
|
+
# to have multiple `instrument` calls open from different threads at the
|
36
|
+
# same time.
|
37
|
+
# The driver must support the same interface as the Measurometer class
|
38
|
+
# itself, minus the `drivers` and `instrument_instance_method` methods.
|
39
|
+
#
|
40
|
+
# @param block_name[String] under which path to push the metric
|
41
|
+
# @param blk[#call] the block to instrument
|
42
|
+
# @return [Object] the return value of &blk
|
43
|
+
def instrument(block_name, &blk)
|
44
|
+
return yield unless @drivers && @drivers.any? # The block wrapping business is not free
|
45
|
+
@drivers.inject(blk) { |outer_block, driver|
|
46
|
+
-> {
|
47
|
+
driver.instrument(block_name, &outer_block)
|
48
|
+
}
|
49
|
+
}.call
|
50
|
+
end
|
51
|
+
|
52
|
+
# Adds a distribution value (sample) under a given path
|
53
|
+
#
|
54
|
+
# @param value_path[String] under which path to push the metric
|
55
|
+
# @param value[Numeric] distribution value
|
56
|
+
# @return nil
|
57
|
+
def add_distribution_value(value_path, value)
|
58
|
+
(@drivers || []).each { |d| d.add_distribution_value(value_path, value) }
|
59
|
+
nil
|
60
|
+
end
|
61
|
+
|
62
|
+
# Increment a named counter under a given path
|
63
|
+
#
|
64
|
+
# @param counter_path[String] under which path to push the metric
|
65
|
+
# @param by[Integer] the counter increment to apply
|
66
|
+
# @return nil
|
67
|
+
def increment_counter(counter_path, by)
|
68
|
+
(@drivers || []).each { |d| d.increment_counter(counter_path, by) }
|
69
|
+
nil
|
70
|
+
end
|
71
|
+
|
72
|
+
# Wrap an anonymous module around an instance method in the given class to have
|
73
|
+
# it instrumented automatically. The name of the measurement will be interpolated as:
|
74
|
+
#
|
75
|
+
# "#{prefix}.#{rightmost_class_constant_name}.#{instance_method_name}"
|
76
|
+
#
|
77
|
+
# @param target_class[Class] the class to instrument
|
78
|
+
# @param instance_method_name_to_instrument[Symbol] the method name to instrument
|
79
|
+
# @param path_prefix[String] under which path to push the instrumented metric
|
80
|
+
# @return void
|
81
|
+
def instrument_instance_method(target_class, instance_method_name_to_instrument, path_prefix)
|
82
|
+
short_class_name = target_class.to_s.split('::').last
|
83
|
+
instrumentation_name = [path_prefix, short_class_name, instance_method_name_to_instrument].join('.')
|
84
|
+
instrumenter_module = Module.new do
|
85
|
+
define_method(instance_method_name_to_instrument) do |*any|
|
86
|
+
::FormatParser::Measurometer.instrument(instrumentation_name) { super(*any) }
|
87
|
+
end
|
88
|
+
end
|
89
|
+
target_class.prepend(instrumenter_module)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Instrument things interesting in the global sense
|
94
|
+
instrument_instance_method(FormatParser::RemoteIO, :read, 'format_parser')
|
95
|
+
instrument_instance_method(Care::Cache, :read_page, 'format_parser')
|
96
|
+
|
97
|
+
# Instrument more specific things on a per-parser basis
|
98
|
+
instrument_instance_method(FormatParser::EXIFParser, :scan_image_tiff, 'format_parser')
|
99
|
+
instrument_instance_method(FormatParser::MOOVParser::Decoder, :extract_atom_stream, 'format_parser.parsers.MOOVParser')
|
100
|
+
end
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -112,6 +112,7 @@ class FormatParser::JPEGParser
|
|
112
112
|
maybe_exif_magic_str = app1_frame_bytes[0..5]
|
113
113
|
maybe_exif_data = app1_frame_bytes[6..-1]
|
114
114
|
if maybe_exif_magic_str == EXIF_MAGIC_STRING
|
115
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', maybe_exif_data.bytesize)
|
115
116
|
scanner = FormatParser::EXIFParser.new(StringIO.new(maybe_exif_data))
|
116
117
|
scanner.scan_image_tiff
|
117
118
|
|
data/lib/parsers/moov_parser.rb
CHANGED
@@ -11,10 +11,6 @@ class FormatParser::MOOVParser
|
|
11
11
|
'm4a ' => :m4a,
|
12
12
|
}
|
13
13
|
|
14
|
-
# It is currently not documented and not particularly well-tested,
|
15
|
-
# so not considered a public API for now
|
16
|
-
private_constant :Decoder
|
17
|
-
|
18
14
|
def call(io)
|
19
15
|
return unless matches_moov_definition?(io)
|
20
16
|
|
data/lib/read_limiter.rb
CHANGED
@@ -1,9 +1,17 @@
|
|
1
|
+
# Is used to limit the number of reads/seeks parsers can perform
|
1
2
|
class FormatParser::ReadLimiter
|
2
3
|
NO_LIMIT = nil
|
3
4
|
|
4
5
|
class BudgetExceeded < StandardError
|
5
6
|
end
|
6
7
|
|
8
|
+
# Creates a ReadLimiter wrapper around the given IO object and sets the limits
|
9
|
+
# on the number of bytes read, reads and seeks
|
10
|
+
#
|
11
|
+
# @param io[#seek, #pos, #size, #read] the IO object to wrap
|
12
|
+
# @param max_bytes[Integer, nil] how many bytes can we read from this before an exception is raised
|
13
|
+
# @param max_reads[Integer, nil] how many read() calls can we perform on this before an exception is raised
|
14
|
+
# @param max_seeks[Integer, nil] how many seek() calls can we perform on this before an exception is raised
|
7
15
|
def initialize(io, max_bytes: NO_LIMIT, max_reads: NO_LIMIT, max_seeks: NO_LIMIT)
|
8
16
|
@max_bytes = max_bytes
|
9
17
|
@max_reads = max_reads
|
@@ -15,24 +23,39 @@ class FormatParser::ReadLimiter
|
|
15
23
|
@bytes = 0
|
16
24
|
end
|
17
25
|
|
26
|
+
# Returns the size of the resource contained in the IO
|
27
|
+
#
|
28
|
+
# @return Integer
|
18
29
|
def size
|
19
30
|
@io.size
|
20
31
|
end
|
21
32
|
|
33
|
+
# Returns the current position/offset within the IO
|
34
|
+
#
|
35
|
+
# @return Integer
|
22
36
|
def pos
|
23
37
|
@io.pos
|
24
38
|
end
|
25
39
|
|
26
|
-
|
40
|
+
# Seeks the IO to the given absolute offset from the start of the file/resource
|
41
|
+
#
|
42
|
+
# @param to[Integer] offset in the IO
|
43
|
+
# @return Integer
|
44
|
+
def seek(to)
|
27
45
|
@seeks += 1
|
28
46
|
if @max_seeks && @seeks > @max_seeks
|
29
47
|
raise BudgetExceeded, 'Seek budget exceeded (%d seeks performed)' % @max_seeks
|
30
48
|
end
|
31
|
-
@io.seek(
|
49
|
+
@io.seek(to)
|
32
50
|
end
|
33
51
|
|
34
|
-
|
35
|
-
|
52
|
+
# Returns at most `n_bytes` of data from the IO or less if less data was available
|
53
|
+
# before the EOF was hit
|
54
|
+
#
|
55
|
+
# @param n_bytes[Integer]
|
56
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
57
|
+
def read(n_bytes)
|
58
|
+
@bytes += n_bytes
|
36
59
|
@reads += 1
|
37
60
|
|
38
61
|
if @max_bytes && @bytes > @max_bytes
|
@@ -43,9 +66,23 @@ class FormatParser::ReadLimiter
|
|
43
66
|
raise BudgetExceeded, 'Number of read() calls exceeded (%d max)' % @max_reads
|
44
67
|
end
|
45
68
|
|
46
|
-
@io.read(
|
69
|
+
@io.read(n_bytes)
|
47
70
|
end
|
48
71
|
|
72
|
+
# Sends the metrics about the state of this ReadLimiter to a Measurometer
|
73
|
+
#
|
74
|
+
# @param prefix[String] the prefix to set. For example, with prefix "TIFF" the metrics will be called
|
75
|
+
# `format_parser.TIFF.read_limiter.num_seeks` and so forth
|
76
|
+
# @return void
|
77
|
+
def send_metrics(prefix)
|
78
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_seeks' % prefix, @seeks)
|
79
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_reads' % prefix, @reads)
|
80
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.read_bytes' % prefix, @bytes)
|
81
|
+
end
|
82
|
+
|
83
|
+
# Resets all the recorded call counters so that the object can be reused for the next parser,
|
84
|
+
# which will have its own limits
|
85
|
+
# @return void
|
49
86
|
def reset_limits!
|
50
87
|
@seeks = 0
|
51
88
|
@reads = 0
|
data/lib/read_limits_config.rb
CHANGED
@@ -5,23 +5,49 @@ class FormatParser::ReadLimitsConfig
|
|
5
5
|
@max_read_bytes_per_parser = total_bytes_available_per_parser.to_i
|
6
6
|
end
|
7
7
|
|
8
|
+
# Defines how many bytes each parser may request to read from the IO object given to it.
|
9
|
+
# Is used to artificially limit unbounded reads in parsers that may wander off and
|
10
|
+
# try to gulp in the file given to them indefinitely due to infinite loops or
|
11
|
+
# wrongly implemented skips - or when handling data that has been deliberately
|
12
|
+
# crafted in a way that can make a parser misbehave.
|
13
|
+
# This is less strict than one could think - for example, the MOOV parser used for
|
14
|
+
# Quicktime files will skip over the actual atom contents of the atoms, and will only
|
15
|
+
# read atom headers - which stays under this limit for quite some time.
|
8
16
|
def max_read_bytes_per_parser
|
9
17
|
@max_read_bytes_per_parser
|
10
18
|
end
|
11
19
|
|
20
|
+
# How big should the cache page be. Each cache page read will incur one `#read`
|
21
|
+
# on the underlying IO object, remote or local
|
12
22
|
def cache_page_size
|
13
23
|
@max_read_bytes_per_parser / 4
|
14
24
|
end
|
15
25
|
|
26
|
+
# Each parser can incur HTTP requests when performing `parse_http`. This constant
|
27
|
+
# sets the maximum number of pages each parser is allowed to hit that have not
|
28
|
+
# been fetched previously and are not stored in the cache. For example, with most
|
29
|
+
# formats the first cache page and the last cache page - head and tail of the file,
|
30
|
+
# respectively - will be available right after the first parser retrieves some data.
|
31
|
+
# The second parser accessing the same data will reuse the in-memory cache.
|
16
32
|
def max_pagefaults_per_parser
|
17
33
|
MAX_PAGE_FAULTS
|
18
34
|
end
|
19
35
|
|
36
|
+
# Defines how many `#read` calls each parser may perform on the IO object given to it.
|
37
|
+
# Is used to artificially limit unbounded reads in parsers that may wander off and
|
38
|
+
# try to gulp in the file given to them indefinitely due to infinite loops or
|
39
|
+
# wrongly implemented skips - or when handling data that has been deliberately
|
40
|
+
# crafted in a way that can make a parser misbehave.
|
20
41
|
def max_reads_per_parser
|
21
42
|
# Imagine we read per single byte
|
22
43
|
@max_read_bytes_per_parser / 2
|
23
44
|
end
|
24
45
|
|
46
|
+
# Defines how many `#seek` calls each parser may perform on the IO object given to it.
|
47
|
+
# Is used to artificially limit unbounded reads in parsers that may wander off and
|
48
|
+
# try to gulp in the file given to them indefinitely due to infinite loops or
|
49
|
+
# wrongly implemented skips - or when handling data that has been deliberately
|
50
|
+
# crafted in a way that can make a parser misbehave.
|
25
51
|
def max_seeks_per_parser
|
26
52
|
# Imagine we have to seek once per byte
|
27
53
|
@max_read_bytes_per_parser / 2
|
data/lib/remote_io.rb
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
# Acts as a wrapper for turning a given URL into an IO object
|
2
|
+
# you can read from and seek in. Uses Faraday under the hood
|
3
|
+
# to perform fetches, so if you apply Faraday configuration
|
4
|
+
# tweaks using `Faraday.default_connection = ...` these will
|
5
|
+
# take effect for these RemoteIO objects as well
|
1
6
|
class FormatParser::RemoteIO
|
2
7
|
# Represents a failure that might be retried
|
3
8
|
# (like a 5xx response or a timeout)
|
@@ -89,8 +94,10 @@ class FormatParser::RemoteIO
|
|
89
94
|
# cannot hint size with this response - at least not when working with S3
|
90
95
|
return
|
91
96
|
when 500..599
|
97
|
+
FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.upstream50x_errors', 1)
|
92
98
|
raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
|
93
99
|
else
|
100
|
+
FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.invalid_request_errors', 1)
|
94
101
|
raise InvalidRequest, "Server at #{@uri} replied with a #{response.status} and refused our request"
|
95
102
|
end
|
96
103
|
end
|
data/spec/io_utils_spec.rb
CHANGED
@@ -27,16 +27,10 @@ describe 'IOUtils' do
|
|
27
27
|
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
28
28
|
end
|
29
29
|
|
30
|
-
it 'uses #pos
|
30
|
+
it 'uses #pos available on the object' do
|
31
31
|
fake_io = double(pos: 11)
|
32
32
|
expect(fake_io).to receive(:seek).with(11 + 5)
|
33
33
|
safe_skip(fake_io, 5)
|
34
34
|
end
|
35
|
-
|
36
|
-
it 'uses #read if no #pos is available on the object' do
|
37
|
-
fake_io = double
|
38
|
-
expect(fake_io).to receive(:read).with(5).and_return('x' * 5)
|
39
|
-
safe_skip(fake_io, 5)
|
40
|
-
end
|
41
35
|
end
|
42
36
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::Measurometer do
|
4
|
+
RSpec::Matchers.define :include_counter_or_measurement_named do |named|
|
5
|
+
match do |actual|
|
6
|
+
actual.any? do |e|
|
7
|
+
e[0] == named && e[1] > 0
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'instruments a full cycle FormatParser.parse' do
|
13
|
+
driver_class = Class.new do
|
14
|
+
attr_accessor :timings, :counters, :distributions
|
15
|
+
def instrument(block_name)
|
16
|
+
s = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
17
|
+
yield.tap do
|
18
|
+
delta = Process.clock_gettime(Process::CLOCK_MONOTONIC) - s
|
19
|
+
@timings ||= []
|
20
|
+
@timings << [block_name, delta * 1000]
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def add_distribution_value(value_path, value)
|
25
|
+
@distributions ||= []
|
26
|
+
@distributions << [value_path, value]
|
27
|
+
end
|
28
|
+
|
29
|
+
def increment_counter(value_path, value)
|
30
|
+
@counters ||= []
|
31
|
+
@counters << [value_path, value]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
instrumenter = driver_class.new
|
36
|
+
described_class.drivers << instrumenter
|
37
|
+
|
38
|
+
FormatParser.parse(File.open(fixtures_dir + 'JPEG/keynote_recognized_as_jpeg.key', 'rb'), results: :all)
|
39
|
+
|
40
|
+
described_class.drivers.delete(instrumenter)
|
41
|
+
expect(described_class.drivers).not_to include(instrumenter)
|
42
|
+
|
43
|
+
expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.detected_formats.zip')
|
44
|
+
expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.parser.Care.page_reads_from_upsteam')
|
45
|
+
expect(instrumenter.distributions).to include_counter_or_measurement_named('format_parser.ZIPParser.read_limiter.read_bytes')
|
46
|
+
expect(instrumenter.timings).to include_counter_or_measurement_named('format_parser.Cache.read_page')
|
47
|
+
end
|
48
|
+
end
|
@@ -27,11 +27,12 @@ describe 'Fetching data from HTTP remotes' do
|
|
27
27
|
end
|
28
28
|
|
29
29
|
it '#parse_http is called with hash options' do
|
30
|
-
|
31
|
-
|
30
|
+
fake_result = double(nature: :audio, format: :aiff)
|
31
|
+
expect_any_instance_of(FormatParser::AIFFParser).to receive(:call).and_return(fake_result)
|
32
|
+
results = FormatParser.parse_http('http://localhost:9399/PNG/anim.png', results: :all)
|
32
33
|
|
33
|
-
expect(
|
34
|
-
expect(
|
34
|
+
expect(results.count).to eq(2)
|
35
|
+
expect(results).to include(fake_result)
|
35
36
|
end
|
36
37
|
|
37
38
|
it 'parses the animated PNG over HTTP' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-04-
|
12
|
+
date: 2018-04-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|
@@ -152,6 +152,7 @@ files:
|
|
152
152
|
- ".rspec"
|
153
153
|
- ".rubocop.yml"
|
154
154
|
- ".travis.yml"
|
155
|
+
- CHANGELOG.md
|
155
156
|
- CODE_OF_CONDUCT.md
|
156
157
|
- CONTRIBUTING.md
|
157
158
|
- Gemfile
|
@@ -169,6 +170,7 @@ files:
|
|
169
170
|
- lib/image.rb
|
170
171
|
- lib/io_constraint.rb
|
171
172
|
- lib/io_utils.rb
|
173
|
+
- lib/measurometer.rb
|
172
174
|
- lib/parsers/aiff_parser.rb
|
173
175
|
- lib/parsers/cr2_parser.rb
|
174
176
|
- lib/parsers/dpx_parser.rb
|
@@ -200,6 +202,7 @@ files:
|
|
200
202
|
- spec/file_information_spec.rb
|
201
203
|
- spec/format_parser_spec.rb
|
202
204
|
- spec/io_utils_spec.rb
|
205
|
+
- spec/measurometer_spec.rb
|
203
206
|
- spec/parsers/aiff_parser_spec.rb
|
204
207
|
- spec/parsers/cr2_parser_spec.rb
|
205
208
|
- spec/parsers/dpx_parser_spec.rb
|