format_parser 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -0
- data/CONTRIBUTING.md +6 -2
- data/lib/care.rb +52 -3
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +95 -21
- data/lib/io_constraint.rb +17 -2
- data/lib/io_utils.rb +1 -5
- data/lib/measurometer.rb +100 -0
- data/lib/parsers/jpeg_parser.rb +1 -0
- data/lib/parsers/moov_parser.rb +0 -4
- data/lib/read_limiter.rb +42 -5
- data/lib/read_limits_config.rb +26 -0
- data/lib/remote_io.rb +7 -0
- data/spec/io_utils_spec.rb +1 -7
- data/spec/measurometer_spec.rb +48 -0
- data/spec/remote_fetching_spec.rb +5 -4
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 789592107e9fa74091745b703249248b6a05e9dd73af45803ab799708f8498fc
|
4
|
+
data.tar.gz: 191ff20e5b8d455f681eb19b40a2bd406c1cfa1a9b2aeebaa483ea829071e532
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dde5379a4d0590019ac6eee21361e10688f8582efa169132f28a0dc531f6b05811368e987a3a30902dcc1a509952c11bb9810bee3f3dbd17e99f3c13f1776b71
|
7
|
+
data.tar.gz: ec5a699fb441ca622006fb2885a6e1ce623da0a7ea6a5b78e5df83db2a1f9227dd023681f12a7ec703188133a8276e363dd3e5c090bca3ce694064124925e47e
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
## 0.8.0
|
2
|
+
* Add `Measurometer` for applying instrumentation around FormatParser operations. See documentation for usage.
|
3
|
+
|
4
|
+
## 0.7.0
|
5
|
+
* Configure read limits / pagefault limits centrally so that those limits make sense together
|
6
|
+
|
7
|
+
## 0.6.0
|
8
|
+
* Double the cache page size once more
|
9
|
+
* We no longer need exifr/jpeg
|
10
|
+
* Fix EXIF parsing in JPEG files
|
11
|
+
* Reject Keynote documents in JPEG parser
|
12
|
+
|
13
|
+
## 0.5.2
|
14
|
+
* Do not raise EXIFR errors for keynote files
|
15
|
+
* Correct broken comment for the audio nature
|
16
|
+
|
17
|
+
## 0.5.1
|
18
|
+
* Raise the cache page size during detection
|
19
|
+
* Fix ZIP entry filename parsing
|
20
|
+
|
21
|
+
## 0.5.0
|
22
|
+
* Add FLAC parser
|
23
|
+
* Add parse_atom_children_and_data_fields support
|
24
|
+
* Add basic detection of Office files
|
25
|
+
* Optimize EOCD signature lookup
|
26
|
+
|
27
|
+
## 0.4.0
|
28
|
+
* Adds a basic PDF parser
|
29
|
+
* Make sure root: and to_json without arguments work
|
30
|
+
* ZIP file format support
|
31
|
+
|
32
|
+
## 0.3.5
|
33
|
+
* Fix the bug with EXIF dimensions being used instead of pixel dimensions
|
34
|
+
|
35
|
+
## 0.3.4
|
36
|
+
* Pagefault limit
|
37
|
+
* Add seek modes required by exifr
|
38
|
+
|
39
|
+
## 0.3.3
|
40
|
+
* Implement a sane to_json as well
|
41
|
+
|
42
|
+
## 0.3.2
|
43
|
+
* Add default as_json
|
44
|
+
* Test on 2.5.0
|
45
|
+
|
46
|
+
## 0.3.1
|
47
|
+
* Remove post install warning
|
48
|
+
* Moved aiff_parser_spec.rb to spec/parsers
|
49
|
+
* CR2 file support
|
50
|
+
* Add require 'set' to format_parser.rb
|
51
|
+
* Use register_parser for natures/fmts
|
52
|
+
|
53
|
+
## 0.3.0
|
54
|
+
* Reverse API changes to support :first as default and add opts to parse_http
|
55
|
+
* Implement and comply with rubocop
|
56
|
+
* JPEG parser and Care fixes
|
57
|
+
* Add format and count options to parse_http
|
58
|
+
* Return first result as default
|
59
|
+
* Use hashes for MOOV atom default fields
|
60
|
+
|
61
|
+
## 0.2.0
|
62
|
+
* Implement parser DSL
|
63
|
+
|
64
|
+
## 0.1.7
|
65
|
+
* Fix read(0) on Care::IOWrapper, introduce top-level tests
|
66
|
+
|
67
|
+
## 0.1.6
|
68
|
+
* Fix mp3 parsing bug
|
69
|
+
* Add MOOV parser
|
70
|
+
|
71
|
+
## 0.1.5
|
72
|
+
* Add FDX parser
|
73
|
+
* Remove dry-structs
|
74
|
+
* New interface updates
|
75
|
+
|
76
|
+
## 0.1.4
|
77
|
+
* Add WAV parser
|
78
|
+
|
79
|
+
## 0.1.3
|
80
|
+
* Add MP3 parser
|
81
|
+
* Add FileInformation#intrinsics
|
82
|
+
* Disallow negative Care offsets
|
83
|
+
|
84
|
+
## 0.1.2
|
85
|
+
* Introduce a restrictive IO subset wrapper
|
86
|
+
* Switch rewind for seek in exif parser
|
87
|
+
* Prep for OSS release
|
88
|
+
* Add fuzz spec
|
89
|
+
* Improve orientation parsing
|
90
|
+
* Optimisation for PNG and invalid input protection on JPEG
|
91
|
+
|
92
|
+
## 0.1.1
|
93
|
+
* Add AIFF parser
|
94
|
+
|
95
|
+
## 0.1.0
|
96
|
+
* Add parsers for PNG, JPG, TIFF, PSD
|
97
|
+
* Add GIF parser
|
98
|
+
* Add DPX parser
|
data/CONTRIBUTING.md
CHANGED
@@ -16,7 +16,7 @@ If you are interested in contributing code and would like to learn more about th
|
|
16
16
|
|
17
17
|
- [ruby](https://ruby-doc.org)
|
18
18
|
- [rspec](http://rspec.info/) (for testing)
|
19
|
-
|
19
|
+
|
20
20
|
# How do I make a contribution?
|
21
21
|
|
22
22
|
## Using the issue tracker
|
@@ -101,7 +101,7 @@ project's developers might not want to merge into the project.
|
|
101
101
|
Please adhere to the coding conventions used throughout the project (indentation,
|
102
102
|
accurate comments, etc.) and any other requirements (such as test coverage).
|
103
103
|
|
104
|
-
The test suite can be run with `bundle exec rspec`.
|
104
|
+
The test suite can be run with `bundle exec rspec`.
|
105
105
|
|
106
106
|
Follow this process if you'd like your work considered for inclusion in the
|
107
107
|
project:
|
@@ -155,3 +155,7 @@ project:
|
|
155
155
|
license your work under the same license as that used by the project, which you
|
156
156
|
can see by clicking [here](https://github.com/WeTransfer/format_parser/blob/master/LICENSE.txt).
|
157
157
|
This provision also applies to the test files you include with the changed code as fixtures.
|
158
|
+
|
159
|
+
## Changelog
|
160
|
+
|
161
|
+
When creating a new release you must add an entry in the `CHANGELOG.md`.
|
data/lib/care.rb
CHANGED
@@ -4,27 +4,49 @@
|
|
4
4
|
# is only available via HTTP, for example, we can have less
|
5
5
|
# fetches and have them return more data for one fetch
|
6
6
|
class Care
|
7
|
+
# Defines the size of a page in bytes that the Care will prefetch
|
7
8
|
DEFAULT_PAGE_SIZE = 128 * 1024
|
8
9
|
|
10
|
+
# Wraps any given IO with Care caching superpowers. Supports the subset
|
11
|
+
# of IO declared in IOConstraint.
|
9
12
|
class IOWrapper
|
13
|
+
# Creates a new IOWrapper around the given source IO
|
14
|
+
#
|
15
|
+
# @param io[#seek, #pos, #size] the IO to wrap
|
16
|
+
# @param page_size[Integer] the size of the cache page to use for this wrapper
|
10
17
|
def initialize(io, page_size: DEFAULT_PAGE_SIZE)
|
11
18
|
@cache = Cache.new(page_size)
|
12
19
|
@io = io
|
13
20
|
@pos = 0
|
14
21
|
end
|
15
22
|
|
23
|
+
# Returns the size of the resource contained in the IO
|
24
|
+
#
|
25
|
+
# @return Integer
|
16
26
|
def size
|
17
27
|
@io.size
|
18
28
|
end
|
19
29
|
|
30
|
+
# Seeks the IO to the given absolute offset from the start of the file/resource
|
31
|
+
#
|
32
|
+
# @param to[Integer] offset in the IO
|
33
|
+
# @return Integer
|
20
34
|
def seek(to)
|
21
35
|
@pos = to
|
22
36
|
end
|
23
37
|
|
38
|
+
# Returns the current position/offset within the IO
|
39
|
+
#
|
40
|
+
# @return Integer
|
24
41
|
def pos
|
25
42
|
@pos
|
26
43
|
end
|
27
44
|
|
45
|
+
# Returns at most `n_bytes` of data from the IO or less if less data was available
|
46
|
+
# before the EOF was hit
|
47
|
+
#
|
48
|
+
# @param n_bytes[Integer]
|
49
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
28
50
|
def read(n_bytes)
|
29
51
|
return '' if n_bytes == 0 # As hardcoded for all Ruby IO objects
|
30
52
|
raise ArgumentError, "negative length #{n_bytes} given" if n_bytes < 0 # also as per Ruby IO objects
|
@@ -34,10 +56,17 @@ class Care
|
|
34
56
|
read
|
35
57
|
end
|
36
58
|
|
59
|
+
# Clears all the cached pages explicitly to help GC
|
60
|
+
#
|
61
|
+
# @return void
|
37
62
|
def clear
|
38
63
|
@cache.clear
|
39
64
|
end
|
40
65
|
|
66
|
+
# Clears all the cached pages explicitly to help GC, and
|
67
|
+
# calls `#close` on the source IO if the IO responds to `#close`
|
68
|
+
#
|
69
|
+
# @return void
|
41
70
|
def close
|
42
71
|
clear
|
43
72
|
@io.close if @io.respond_to?(:close)
|
@@ -47,6 +76,7 @@ class Care
|
|
47
76
|
# Stores cached pages of data from the given IO as strings.
|
48
77
|
# Pages are sized to be `page_size` or less (for the last page).
|
49
78
|
class Cache
|
79
|
+
# Initializes a new cache pages container with pages of given size
|
50
80
|
def initialize(page_size = DEFAULT_PAGE_SIZE)
|
51
81
|
@page_size = page_size.to_i
|
52
82
|
raise ArgumentError, 'The page size must be a positive Integer' unless @page_size > 0
|
@@ -59,6 +89,12 @@ class Care
|
|
59
89
|
# If the IO has been exhausted, `nil` will be returned
|
60
90
|
# instead. Will use the cached pages where available,
|
61
91
|
# or fetch pages where necessary
|
92
|
+
#
|
93
|
+
# @param io[#seek, #read] the IO to read data from
|
94
|
+
# @param at[Integer] at which offset we have to read
|
95
|
+
# @param n_bytes[Integer] how many bytes we want to read/cache
|
96
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
97
|
+
# @raise ArgumentError
|
62
98
|
def byteslice(io, at, n_bytes)
|
63
99
|
if n_bytes < 1
|
64
100
|
raise ArgumentError, "The number of bytes to fetch must be a positive Integer, but was #{n_bytes}"
|
@@ -97,10 +133,18 @@ class Care
|
|
97
133
|
slice if slice && !slice.empty?
|
98
134
|
end
|
99
135
|
|
136
|
+
# Clears the page cache of all strings with data
|
137
|
+
#
|
138
|
+
# @return void
|
100
139
|
def clear
|
101
140
|
@pages.clear
|
102
141
|
end
|
103
142
|
|
143
|
+
# Hydrates a page at the certain index or returns the contents of
|
144
|
+
# that page if it is already in the cache
|
145
|
+
#
|
146
|
+
# @param io[IO] the IO to read from
|
147
|
+
# @param page_i[Integer] which page (zero-based) to hydrate and return
|
104
148
|
def hydrate_page(io, page_i)
|
105
149
|
# Avoid trying to read the page if we know there is no content to fill it
|
106
150
|
# in the underlying IO
|
@@ -109,9 +153,9 @@ class Care
|
|
109
153
|
@pages[page_i] ||= read_page(io, page_i)
|
110
154
|
end
|
111
155
|
|
156
|
+
# We provide an overridden implementation of #inspect to avoid
|
157
|
+
# printing the actual contents of the cached pages
|
112
158
|
def inspect
|
113
|
-
# To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
|
114
|
-
|
115
159
|
# Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
|
116
160
|
oid_str = (object_id << 1).to_s(16).rjust(16, '0')
|
117
161
|
|
@@ -124,10 +168,15 @@ class Care
|
|
124
168
|
'#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
|
125
169
|
end
|
126
170
|
|
171
|
+
# Reads the requested page from the given IO
|
172
|
+
#
|
173
|
+
# @param io[IO] the IO to read from
|
174
|
+
# @param page_i[Integer] which page (zero-based) to read
|
127
175
|
def read_page(io, page_i)
|
176
|
+
FormatParser::Measurometer.increment_counter('format_parser.parser.Care.page_reads_from_upsteam', 1)
|
177
|
+
|
128
178
|
io.seek(page_i * @page_size)
|
129
179
|
read_result = io.read(@page_size)
|
130
|
-
|
131
180
|
if read_result.nil?
|
132
181
|
# If the read went past the end of the IO the read result will be nil,
|
133
182
|
# so we know our IO is exhausted here
|
data/lib/format_parser.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'set'
|
2
2
|
|
3
|
+
# A pretty nimble module for parsing file metadata using partial reads. Contains all the
|
4
|
+
# top-level methods of the library.
|
3
5
|
module FormatParser
|
4
6
|
require_relative 'attributes_json'
|
5
7
|
require_relative 'image'
|
@@ -14,9 +16,19 @@ module FormatParser
|
|
14
16
|
require_relative 'io_constraint'
|
15
17
|
require_relative 'care'
|
16
18
|
|
19
|
+
# Is used to manage access to the shared array of parser constructors, which might
|
20
|
+
# potentially be mutated from different threads. The mutex won't be hit too often
|
21
|
+
# since it only locks when adding/removing parsers.
|
17
22
|
PARSER_MUX = Mutex.new
|
18
23
|
MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
|
19
24
|
|
25
|
+
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
26
|
+
# provides out of the box registers itself using this method.
|
27
|
+
#
|
28
|
+
# @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
|
29
|
+
# @param formats[Array<Symbol>] file formats that the parser provides
|
30
|
+
# @param natures[Array<Symbol>] file natures that the parser provides
|
31
|
+
# @return void
|
20
32
|
def self.register_parser(callable_or_responding_to_new, formats:, natures:)
|
21
33
|
parser_provided_formats = Array(formats)
|
22
34
|
parser_provided_natures = Array(natures)
|
@@ -36,6 +48,11 @@ module FormatParser
|
|
36
48
|
end
|
37
49
|
end
|
38
50
|
|
51
|
+
# Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
|
52
|
+
# tests, but can also be used to forcibly disable some formats completely.
|
53
|
+
#
|
54
|
+
# @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
|
55
|
+
# @return void
|
39
56
|
def self.deregister_parser(callable_or_responding_to_new)
|
40
57
|
# Used only in tests
|
41
58
|
PARSER_MUX.synchronize do
|
@@ -45,11 +62,32 @@ module FormatParser
|
|
45
62
|
end
|
46
63
|
end
|
47
64
|
|
65
|
+
# Parses the resource at the given `url` and returns the results as if it were any IO
|
66
|
+
# given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
|
67
|
+
#
|
68
|
+
# @param url[String, URI] the HTTP(S) URL to request the object from using Faraday and `Range:` requests
|
69
|
+
# @param kwargs the keyword arguments to be delegated to `.parse`
|
70
|
+
# @see {.parse}
|
48
71
|
def self.parse_http(url, **kwargs)
|
49
72
|
parse(RemoteIO.new(url), **kwargs)
|
50
73
|
end
|
51
74
|
|
52
|
-
#
|
75
|
+
# Parses the resource contained in the given IO-ish object, and returns either the first matched
|
76
|
+
# result (omitting all the other parsers), the first N results or all results.
|
77
|
+
#
|
78
|
+
# @param io[#seek, #pos, #read] an IO-ish object containing the resource to parse formats for
|
79
|
+
# @param natures[Array] an array of file natures to scope the parsing to.
|
80
|
+
# For example `[:image]` will limit to image files.
|
81
|
+
# The default value is "all natures known to FormatParser"
|
82
|
+
# @param formats[Array] an array of file formats to scope the parsing to.
|
83
|
+
# For example `[:jpg, :tif]` will scope the parsing to TIFF and JPEG files.
|
84
|
+
# The default value is "all formats known to FormatParser"
|
85
|
+
# @param results[:first, :all, Integer] one of the values defining how many results to return if parsing
|
86
|
+
# is ambiguous. The default is `:first` which returns the first matching result. Other
|
87
|
+
# possible values are `:all` to get all possible results and an Integer to return
|
88
|
+
# at most N results.
|
89
|
+
# @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
|
90
|
+
# no useful metadata could be recovered from the file
|
53
91
|
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
|
54
92
|
# We need to apply various limits so that parsers do not over-read, do not cause too many HTTP
|
55
93
|
# requests to be dispatched and so on. These should be _balanced_ with one another- for example,
|
@@ -92,31 +130,59 @@ module FormatParser
|
|
92
130
|
|
93
131
|
# We need to rewind for each parser, anew
|
94
132
|
limited_io.seek(0)
|
95
|
-
|
96
|
-
begin
|
97
|
-
parser.call(limited_io)
|
98
|
-
rescue IOUtils::InvalidRead
|
99
|
-
# There was not enough data for this parser to work on,
|
100
|
-
# and it triggered an error
|
101
|
-
rescue IOUtils::MalformedFile
|
102
|
-
# Unexpected input was encountered during the parsing of
|
103
|
-
# a file. This might indicate either a malicious or a
|
104
|
-
# corruped file.
|
105
|
-
rescue ReadLimiter::BudgetExceeded
|
106
|
-
# The parser tried to read too much - most likely the file structure
|
107
|
-
# caused the parser to go off-track. Strictly speaking we should log this
|
108
|
-
# and examine the file more closely.
|
109
|
-
# Or the parser caused too many cache pages to be fetched, which likely means we should not allow
|
110
|
-
# it to continue
|
111
|
-
end
|
133
|
+
execute_parser_and_capture_expected_exceptions(parser, limited_io)
|
112
134
|
end.reject(&:nil?).take(amount)
|
113
135
|
|
114
|
-
return results.first if amount == 1
|
115
|
-
|
116
136
|
# Convert the results from a lazy enumerator to an Array.
|
117
|
-
results.to_a
|
137
|
+
results = results.to_a
|
138
|
+
|
139
|
+
if results.empty?
|
140
|
+
Measurometer.increment_counter('format_parser.unknown_files', 1)
|
141
|
+
end
|
142
|
+
|
143
|
+
amount == 1 ? results.first : results
|
144
|
+
end
|
145
|
+
|
146
|
+
def self.execute_parser_and_capture_expected_exceptions(parser, limited_io)
|
147
|
+
parser_name_for_instrumentation = parser.class.to_s.split('::').last
|
148
|
+
Measurometer.instrument('format_parser.parser.%s' % parser_name_for_instrumentation) do
|
149
|
+
parser.call(limited_io).tap do |result|
|
150
|
+
if result
|
151
|
+
Measurometer.increment_counter('format_parser.detected_natures.%s' % result.nature, 1)
|
152
|
+
Measurometer.increment_counter('format_parser.detected_formats.%s' % result.format, 1)
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
rescue IOUtils::InvalidRead
|
157
|
+
# There was not enough data for this parser to work on,
|
158
|
+
# and it triggered an error
|
159
|
+
Measurometer.increment_counter('format_parser.invalid_read_errors', 1)
|
160
|
+
rescue IOUtils::MalformedFile
|
161
|
+
# Unexpected input was encountered during the parsing of
|
162
|
+
# a file. This might indicate either a malicious or a
|
163
|
+
# corrupted file.
|
164
|
+
Measurometer.increment_counter('format_parser.malformed_errors', 1)
|
165
|
+
rescue ReadLimiter::BudgetExceeded
|
166
|
+
# The parser tried to read too much - most likely the file structure
|
167
|
+
# caused the parser to go off-track. Strictly speaking we should log this
|
168
|
+
# and examine the file more closely.
|
169
|
+
# Or the parser caused too many cache pages to be fetched, which likely means we should not allow
|
170
|
+
# it to continue
|
171
|
+
Measurometer.increment_counter('format_parser.exceeded_budget_errors', 1)
|
172
|
+
ensure
|
173
|
+
limited_io.send_metrics(parser_name_for_instrumentation)
|
118
174
|
end
|
119
175
|
|
176
|
+
# Returns objects that respond to `call` and can be called to perform parsing
|
177
|
+
# based on the _intersection_ of the two given nature/format constraints. For
|
178
|
+
# example, a constraint of "only image and only ZIP files" can be given -
|
179
|
+
# but would raise an error since no parsers provide both ZIP file parsing and
|
180
|
+
# images as their information.
|
181
|
+
#
|
182
|
+
# @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
|
183
|
+
# @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
|
184
|
+
# @return [Array<#call>] an array of callable parsers
|
185
|
+
# @raise ArgumentError when there are no parsers satisfying the constraint
|
120
186
|
def self.parsers_for(desired_natures, desired_formats)
|
121
187
|
assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
|
122
188
|
hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
|
@@ -133,6 +199,11 @@ module FormatParser
|
|
133
199
|
factories.map { |callable_or_class| instantiate_parser(callable_or_class) }
|
134
200
|
end
|
135
201
|
|
202
|
+
# Instantiates a parser object (an object that responds to `#call`) from a given class
|
203
|
+
# or returns the parameter as is if it is callable by itself - i.e. if it is a Proc
|
204
|
+
#
|
205
|
+
# @param callable_or_responding_to_new[#call, #new] a callable or a Class/Module
|
206
|
+
# @return [#call] a parser that can be called with an IO-ish argument
|
136
207
|
def self.instantiate_parser(callable_or_responding_to_new)
|
137
208
|
if callable_or_responding_to_new.respond_to?(:call)
|
138
209
|
callable_or_responding_to_new
|
@@ -146,4 +217,7 @@ module FormatParser
|
|
146
217
|
Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
|
147
218
|
require parser_file
|
148
219
|
end
|
220
|
+
# The Measurometer latches itself onto existing classes, so load it after
|
221
|
+
# we have loaded all the parsers
|
222
|
+
require_relative 'measurometer'
|
149
223
|
end
|
data/lib/io_constraint.rb
CHANGED
@@ -19,18 +19,33 @@ class FormatParser::IOConstraint
|
|
19
19
|
@io = io
|
20
20
|
end
|
21
21
|
|
22
|
+
# Returns at most `n_bytes` of data from the IO or less if less data was available
|
23
|
+
# before the EOF was hit
|
24
|
+
#
|
25
|
+
# @param n_bytes[Integer]
|
26
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
22
27
|
def read(n_bytes)
|
23
28
|
@io.read(n_bytes)
|
24
29
|
end
|
25
30
|
|
26
|
-
|
27
|
-
|
31
|
+
# Seeks the IO to the given absolute offset from the start of the file/resource
|
32
|
+
#
|
33
|
+
# @param to[Integer] offset in the IO
|
34
|
+
# @return Integer
|
35
|
+
def seek(to)
|
36
|
+
@io.seek(to)
|
28
37
|
end
|
29
38
|
|
39
|
+
# Returns the size of the resource contained in the IO
|
40
|
+
#
|
41
|
+
# @return Integer
|
30
42
|
def size
|
31
43
|
@io.size
|
32
44
|
end
|
33
45
|
|
46
|
+
# Returns the current position/offset within the IO
|
47
|
+
#
|
48
|
+
# @return Integer
|
34
49
|
def pos
|
35
50
|
@io.pos
|
36
51
|
end
|
data/lib/io_utils.rb
CHANGED
data/lib/measurometer.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
class FormatParser::Measurometer
|
2
|
+
class << self
|
3
|
+
# Permits adding instrumentation drivers. Measurometer is 1-1 API
|
4
|
+
# compatible with Appsignal, which we use a lot. So to magically
|
5
|
+
# obtain all Appsignal instrumentation, add the Appsignal module
|
6
|
+
# as a driver.
|
7
|
+
#
|
8
|
+
# Measurometer.drivers << Appsignal
|
9
|
+
#
|
10
|
+
# A driver must be reentrant and thread-safe - it should be possible
|
11
|
+
# to have multiple `instrument` calls open from different threads at the
|
12
|
+
# same time.
|
13
|
+
# The driver must support the same interface as the Measurometer class
|
14
|
+
# itself, minus the `drivers` and `instrument_instance_method` methods.
|
15
|
+
#
|
16
|
+
# @return Array
|
17
|
+
def drivers
|
18
|
+
@drivers ||= []
|
19
|
+
@drivers
|
20
|
+
end
|
21
|
+
|
22
|
+
# Runs a given block within a cascade of `instrument` blocks of all the
|
23
|
+
# added drivers.
|
24
|
+
#
|
25
|
+
# Measurometer.instrument('do_foo') { compute! }
|
26
|
+
#
|
27
|
+
# unfolds to
|
28
|
+
# Appsignal.instrument('do_foo') do
|
29
|
+
# Statsd.timing('do_foo') do
|
30
|
+
# compute!
|
31
|
+
# end
|
32
|
+
# end
|
33
|
+
#
|
34
|
+
# A driver must be reentrant and thread-safe - it should be possible
|
35
|
+
# to have multiple `instrument` calls open from different threads at the
|
36
|
+
# same time.
|
37
|
+
# The driver must support the same interface as the Measurometer class
|
38
|
+
# itself, minus the `drivers` and `instrument_instance_method` methods.
|
39
|
+
#
|
40
|
+
# @param block_name[String] under which path to push the metric
|
41
|
+
# @param blk[#call] the block to instrument
|
42
|
+
# @return [Object] the return value of &blk
|
43
|
+
def instrument(block_name, &blk)
|
44
|
+
return yield unless @drivers && @drivers.any? # The block wrapping business is not free
|
45
|
+
@drivers.inject(blk) { |outer_block, driver|
|
46
|
+
-> {
|
47
|
+
driver.instrument(block_name, &outer_block)
|
48
|
+
}
|
49
|
+
}.call
|
50
|
+
end
|
51
|
+
|
52
|
+
# Adds a distribution value (sample) under a given path
|
53
|
+
#
|
54
|
+
# @param value_path[String] under which path to push the metric
|
55
|
+
# @param value[Numeric] distribution value
|
56
|
+
# @return nil
|
57
|
+
def add_distribution_value(value_path, value)
|
58
|
+
(@drivers || []).each { |d| d.add_distribution_value(value_path, value) }
|
59
|
+
nil
|
60
|
+
end
|
61
|
+
|
62
|
+
# Increment a named counter under a given path
|
63
|
+
#
|
64
|
+
# @param counter_path[String] under which path to push the metric
|
65
|
+
# @param by[Integer] the counter increment to apply
|
66
|
+
# @return nil
|
67
|
+
def increment_counter(counter_path, by)
|
68
|
+
(@drivers || []).each { |d| d.increment_counter(counter_path, by) }
|
69
|
+
nil
|
70
|
+
end
|
71
|
+
|
72
|
+
# Wrap an anonymous module around an instance method in the given class to have
|
73
|
+
# it instrumented automatically. The name of the measurement will be interpolated as:
|
74
|
+
#
|
75
|
+
# "#{prefix}.#{rightmost_class_constant_name}.#{instance_method_name}"
|
76
|
+
#
|
77
|
+
# @param target_class[Class] the class to instrument
|
78
|
+
# @param instance_method_name_to_instrument[Symbol] the method name to instrument
|
79
|
+
# @param path_prefix[String] under which path to push the instrumented metric
|
80
|
+
# @return void
|
81
|
+
def instrument_instance_method(target_class, instance_method_name_to_instrument, path_prefix)
|
82
|
+
short_class_name = target_class.to_s.split('::').last
|
83
|
+
instrumentation_name = [path_prefix, short_class_name, instance_method_name_to_instrument].join('.')
|
84
|
+
instrumenter_module = Module.new do
|
85
|
+
define_method(instance_method_name_to_instrument) do |*any|
|
86
|
+
::FormatParser::Measurometer.instrument(instrumentation_name) { super(*any) }
|
87
|
+
end
|
88
|
+
end
|
89
|
+
target_class.prepend(instrumenter_module)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Instrument things interesting in the global sense
|
94
|
+
instrument_instance_method(FormatParser::RemoteIO, :read, 'format_parser')
|
95
|
+
instrument_instance_method(Care::Cache, :read_page, 'format_parser')
|
96
|
+
|
97
|
+
# Instrument more specific things on a per-parser basis
|
98
|
+
instrument_instance_method(FormatParser::EXIFParser, :scan_image_tiff, 'format_parser')
|
99
|
+
instrument_instance_method(FormatParser::MOOVParser::Decoder, :extract_atom_stream, 'format_parser.parsers.MOOVParser')
|
100
|
+
end
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -112,6 +112,7 @@ class FormatParser::JPEGParser
|
|
112
112
|
maybe_exif_magic_str = app1_frame_bytes[0..5]
|
113
113
|
maybe_exif_data = app1_frame_bytes[6..-1]
|
114
114
|
if maybe_exif_magic_str == EXIF_MAGIC_STRING
|
115
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', maybe_exif_data.bytesize)
|
115
116
|
scanner = FormatParser::EXIFParser.new(StringIO.new(maybe_exif_data))
|
116
117
|
scanner.scan_image_tiff
|
117
118
|
|
data/lib/parsers/moov_parser.rb
CHANGED
@@ -11,10 +11,6 @@ class FormatParser::MOOVParser
|
|
11
11
|
'm4a ' => :m4a,
|
12
12
|
}
|
13
13
|
|
14
|
-
# It is currently not documented and not particularly well-tested,
|
15
|
-
# so not considered a public API for now
|
16
|
-
private_constant :Decoder
|
17
|
-
|
18
14
|
def call(io)
|
19
15
|
return unless matches_moov_definition?(io)
|
20
16
|
|
data/lib/read_limiter.rb
CHANGED
@@ -1,9 +1,17 @@
|
|
1
|
+
# Is used to limit the number of reads/seeks parsers can perform
|
1
2
|
class FormatParser::ReadLimiter
|
2
3
|
NO_LIMIT = nil
|
3
4
|
|
4
5
|
class BudgetExceeded < StandardError
|
5
6
|
end
|
6
7
|
|
8
|
+
# Creates a ReadLimiter wrapper around the given IO object and sets the limits
|
9
|
+
# on the number of bytes read, reads and seeks
|
10
|
+
#
|
11
|
+
# @param io[#seek, #pos, #size, #read] the IO object to wrap
|
12
|
+
# @param max_bytes[Integer, nil] how many bytes can we read from this before an exception is raised
|
13
|
+
# @param max_reads[Integer, nil] how many read() calls can we perform on this before an exception is raised
|
14
|
+
# @param max_seeks[Integer, nil] how many seek() calls can we perform on this before an exception is raised
|
7
15
|
def initialize(io, max_bytes: NO_LIMIT, max_reads: NO_LIMIT, max_seeks: NO_LIMIT)
|
8
16
|
@max_bytes = max_bytes
|
9
17
|
@max_reads = max_reads
|
@@ -15,24 +23,39 @@ class FormatParser::ReadLimiter
|
|
15
23
|
@bytes = 0
|
16
24
|
end
|
17
25
|
|
26
|
+
# Returns the size of the resource contained in the IO
|
27
|
+
#
|
28
|
+
# @return Integer
|
18
29
|
def size
|
19
30
|
@io.size
|
20
31
|
end
|
21
32
|
|
33
|
+
# Returns the current position/offset within the IO
|
34
|
+
#
|
35
|
+
# @return Integer
|
22
36
|
def pos
|
23
37
|
@io.pos
|
24
38
|
end
|
25
39
|
|
26
|
-
|
40
|
+
# Seeks the IO to the given absolute offset from the start of the file/resource
|
41
|
+
#
|
42
|
+
# @param to[Integer] offset in the IO
|
43
|
+
# @return Integer
|
44
|
+
def seek(to)
|
27
45
|
@seeks += 1
|
28
46
|
if @max_seeks && @seeks > @max_seeks
|
29
47
|
raise BudgetExceeded, 'Seek budget exceeded (%d seeks performed)' % @max_seeks
|
30
48
|
end
|
31
|
-
@io.seek(
|
49
|
+
@io.seek(to)
|
32
50
|
end
|
33
51
|
|
34
|
-
|
35
|
-
|
52
|
+
# Returns at most `n_bytes` of data from the IO or less if less data was available
|
53
|
+
# before the EOF was hit
|
54
|
+
#
|
55
|
+
# @param n_bytes[Integer]
|
56
|
+
# @return [String, nil] the content read from the IO or `nil` if no data was available
|
57
|
+
def read(n_bytes)
|
58
|
+
@bytes += n_bytes
|
36
59
|
@reads += 1
|
37
60
|
|
38
61
|
if @max_bytes && @bytes > @max_bytes
|
@@ -43,9 +66,23 @@ class FormatParser::ReadLimiter
|
|
43
66
|
raise BudgetExceeded, 'Number of read() calls exceeded (%d max)' % @max_reads
|
44
67
|
end
|
45
68
|
|
46
|
-
@io.read(
|
69
|
+
@io.read(n_bytes)
|
47
70
|
end
|
48
71
|
|
72
|
+
# Sends the metrics about the state of this ReadLimiter to a Measurometer
|
73
|
+
#
|
74
|
+
# @param prefix[String] the prefix to set. For example, with prefix "TIFF" the metrics will be called
|
75
|
+
# `format_parser.TIFF.read_limiter.num_seeks` and so forth
|
76
|
+
# @return void
|
77
|
+
def send_metrics(prefix)
|
78
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_seeks' % prefix, @seeks)
|
79
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_reads' % prefix, @reads)
|
80
|
+
FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.read_bytes' % prefix, @bytes)
|
81
|
+
end
|
82
|
+
|
83
|
+
# Resets all the recorded call counters so that the object can be reused for the next parser,
|
84
|
+
# which will have its own limits
|
85
|
+
# @return void
|
49
86
|
def reset_limits!
|
50
87
|
@seeks = 0
|
51
88
|
@reads = 0
|
data/lib/read_limits_config.rb
CHANGED
@@ -5,23 +5,49 @@ class FormatParser::ReadLimitsConfig
|
|
5
5
|
@max_read_bytes_per_parser = total_bytes_available_per_parser.to_i
|
6
6
|
end
|
7
7
|
|
8
|
+
# Defines how many bytes each parser may request to read from the IO object given to it.
|
9
|
+
# Is used to artificially limit unbounded reads in parsers that may wander off and
|
10
|
+
# try to gulp in the file given to them indefinitely due to infinite loops or
|
11
|
+
# wrongly implemented skips - or when handling data that has been deliberately
|
12
|
+
# crafted in a way that can make a parser misbehave.
|
13
|
+
# This is less strict than one could think - for example, the MOOV parser used for
|
14
|
+
# Quicktime files will skip over the actual atom contents of the atoms, and will only
|
15
|
+
# read atom headers - which stays under this limit for quite some time.
|
8
16
|
def max_read_bytes_per_parser
|
9
17
|
@max_read_bytes_per_parser
|
10
18
|
end
|
11
19
|
|
20
|
+
# How big should the cache page be. Each cache page read will incur one `#read`
|
21
|
+
# on the underlying IO object, remote or local
|
12
22
|
def cache_page_size
|
13
23
|
@max_read_bytes_per_parser / 4
|
14
24
|
end
|
15
25
|
|
26
|
+
# Each parser can incur HTTP requests when performing `parse_http`. This constant
|
27
|
+
# sets the maximum number of pages each parser is allowed to hit that have not
|
28
|
+
# been fetched previously and are not stored in the cache. For example, with most
|
29
|
+
# formats the first cache page and the last cache page - head and tail of the file,
|
30
|
+
# respectively - will be available right after the first parser retrieves some data.
|
31
|
+
# The second parser accessing the same data will reuse the in-memory cache.
|
16
32
|
def max_pagefaults_per_parser
|
17
33
|
MAX_PAGE_FAULTS
|
18
34
|
end
|
19
35
|
|
36
|
+
# Defines how many `#read` calls each parser may perform on the IO object given to it.
|
37
|
+
# Is used to artificially limit unbounded reads in parsers that may wander off and
|
38
|
+
# try to gulp in the file given to them indefinitely due to infinite loops or
|
39
|
+
# wrongly implemented skips - or when handling data that has been deliberately
|
40
|
+
# crafted in a way that can make a parser misbehave.
|
20
41
|
def max_reads_per_parser
|
21
42
|
# Imagine we read per single byte
|
22
43
|
@max_read_bytes_per_parser / 2
|
23
44
|
end
|
24
45
|
|
46
|
+
# Defines how many `#seek` calls each parser may perform on the IO object given to it.
|
47
|
+
# Is used to artificially limit unbounded reads in parsers that may wander off and
|
48
|
+
# try to gulp in the file given to them indefinitely due to infinite loops or
|
49
|
+
# wrongly implemented skips - or when handling data that has been deliberately
|
50
|
+
# crafted in a way that can make a parser misbehave.
|
25
51
|
def max_seeks_per_parser
|
26
52
|
# Imagine we have to seek once per byte
|
27
53
|
@max_read_bytes_per_parser / 2
|
data/lib/remote_io.rb
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
# Acts as a wrapper for turning a given URL into an IO object
|
2
|
+
# you can read from and seek in. Uses Faraday under the hood
|
3
|
+
# to perform fetches, so if you apply Faraday configuration
|
4
|
+
# tweaks using `Faraday.default_connection = ...` these will
|
5
|
+
# take effect for these RemoteIO objects as well
|
1
6
|
class FormatParser::RemoteIO
|
2
7
|
# Represents a failure that might be retried
|
3
8
|
# (like a 5xx response or a timeout)
|
@@ -89,8 +94,10 @@ class FormatParser::RemoteIO
|
|
89
94
|
# cannot hint size with this response - at least not when working with S3
|
90
95
|
return
|
91
96
|
when 500..599
|
97
|
+
FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.upstream50x_errors', 1)
|
92
98
|
raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
|
93
99
|
else
|
100
|
+
FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.invalid_request_errors', 1)
|
94
101
|
raise InvalidRequest, "Server at #{@uri} replied with a #{response.status} and refused our request"
|
95
102
|
end
|
96
103
|
end
|
data/spec/io_utils_spec.rb
CHANGED
@@ -27,16 +27,10 @@ describe 'IOUtils' do
|
|
27
27
|
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
28
28
|
end
|
29
29
|
|
30
|
-
it 'uses #pos
|
30
|
+
it 'uses #pos available on the object' do
|
31
31
|
fake_io = double(pos: 11)
|
32
32
|
expect(fake_io).to receive(:seek).with(11 + 5)
|
33
33
|
safe_skip(fake_io, 5)
|
34
34
|
end
|
35
|
-
|
36
|
-
it 'uses #read if no #pos is available on the object' do
|
37
|
-
fake_io = double
|
38
|
-
expect(fake_io).to receive(:read).with(5).and_return('x' * 5)
|
39
|
-
safe_skip(fake_io, 5)
|
40
|
-
end
|
41
35
|
end
|
42
36
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::Measurometer do
|
4
|
+
RSpec::Matchers.define :include_counter_or_measurement_named do |named|
|
5
|
+
match do |actual|
|
6
|
+
actual.any? do |e|
|
7
|
+
e[0] == named && e[1] > 0
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'instruments a full cycle FormatParser.parse' do
|
13
|
+
driver_class = Class.new do
|
14
|
+
attr_accessor :timings, :counters, :distributions
|
15
|
+
def instrument(block_name)
|
16
|
+
s = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
17
|
+
yield.tap do
|
18
|
+
delta = Process.clock_gettime(Process::CLOCK_MONOTONIC) - s
|
19
|
+
@timings ||= []
|
20
|
+
@timings << [block_name, delta * 1000]
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def add_distribution_value(value_path, value)
|
25
|
+
@distributions ||= []
|
26
|
+
@distributions << [value_path, value]
|
27
|
+
end
|
28
|
+
|
29
|
+
def increment_counter(value_path, value)
|
30
|
+
@counters ||= []
|
31
|
+
@counters << [value_path, value]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
instrumenter = driver_class.new
|
36
|
+
described_class.drivers << instrumenter
|
37
|
+
|
38
|
+
FormatParser.parse(File.open(fixtures_dir + 'JPEG/keynote_recognized_as_jpeg.key', 'rb'), results: :all)
|
39
|
+
|
40
|
+
described_class.drivers.delete(instrumenter)
|
41
|
+
expect(described_class.drivers).not_to include(instrumenter)
|
42
|
+
|
43
|
+
expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.detected_formats.zip')
|
44
|
+
expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.parser.Care.page_reads_from_upsteam')
|
45
|
+
expect(instrumenter.distributions).to include_counter_or_measurement_named('format_parser.ZIPParser.read_limiter.read_bytes')
|
46
|
+
expect(instrumenter.timings).to include_counter_or_measurement_named('format_parser.Cache.read_page')
|
47
|
+
end
|
48
|
+
end
|
@@ -27,11 +27,12 @@ describe 'Fetching data from HTTP remotes' do
|
|
27
27
|
end
|
28
28
|
|
29
29
|
it '#parse_http is called with hash options' do
|
30
|
-
|
31
|
-
|
30
|
+
fake_result = double(nature: :audio, format: :aiff)
|
31
|
+
expect_any_instance_of(FormatParser::AIFFParser).to receive(:call).and_return(fake_result)
|
32
|
+
results = FormatParser.parse_http('http://localhost:9399/PNG/anim.png', results: :all)
|
32
33
|
|
33
|
-
expect(
|
34
|
-
expect(
|
34
|
+
expect(results.count).to eq(2)
|
35
|
+
expect(results).to include(fake_result)
|
35
36
|
end
|
36
37
|
|
37
38
|
it 'parses the animated PNG over HTTP' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-04-
|
12
|
+
date: 2018-04-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|
@@ -152,6 +152,7 @@ files:
|
|
152
152
|
- ".rspec"
|
153
153
|
- ".rubocop.yml"
|
154
154
|
- ".travis.yml"
|
155
|
+
- CHANGELOG.md
|
155
156
|
- CODE_OF_CONDUCT.md
|
156
157
|
- CONTRIBUTING.md
|
157
158
|
- Gemfile
|
@@ -169,6 +170,7 @@ files:
|
|
169
170
|
- lib/image.rb
|
170
171
|
- lib/io_constraint.rb
|
171
172
|
- lib/io_utils.rb
|
173
|
+
- lib/measurometer.rb
|
172
174
|
- lib/parsers/aiff_parser.rb
|
173
175
|
- lib/parsers/cr2_parser.rb
|
174
176
|
- lib/parsers/dpx_parser.rb
|
@@ -200,6 +202,7 @@ files:
|
|
200
202
|
- spec/file_information_spec.rb
|
201
203
|
- spec/format_parser_spec.rb
|
202
204
|
- spec/io_utils_spec.rb
|
205
|
+
- spec/measurometer_spec.rb
|
203
206
|
- spec/parsers/aiff_parser_spec.rb
|
204
207
|
- spec/parsers/cr2_parser_spec.rb
|
205
208
|
- spec/parsers/dpx_parser_spec.rb
|