format_parser 0.7.0 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ebde95ad53411a3d4ae4db175bab7b90d568476302df66842c83710f88ad2e15
4
- data.tar.gz: cfb0e6a00c9ca1f8447e71e0800643327b05114be1b592d888052ba94ed91a98
3
+ metadata.gz: 789592107e9fa74091745b703249248b6a05e9dd73af45803ab799708f8498fc
4
+ data.tar.gz: 191ff20e5b8d455f681eb19b40a2bd406c1cfa1a9b2aeebaa483ea829071e532
5
5
  SHA512:
6
- metadata.gz: 1c6509362046f64b49472b3323f154ffc509f4f6d308d32ebc889299e470cc46d07cbe7864a5b8021a4e59d922d530cedf1e83852402305dab50f8c297ac9d8a
7
- data.tar.gz: f8143722bd3dbc9b0b431732fc717d15dcd70c73e08bb2b76025f6e919976f6a62bb244f7525445387169e1ef7cea8e048084818255ea413458de5981e8fa0af
6
+ metadata.gz: dde5379a4d0590019ac6eee21361e10688f8582efa169132f28a0dc531f6b05811368e987a3a30902dcc1a509952c11bb9810bee3f3dbd17e99f3c13f1776b71
7
+ data.tar.gz: ec5a699fb441ca622006fb2885a6e1ce623da0a7ea6a5b78e5df83db2a1f9227dd023681f12a7ec703188133a8276e363dd3e5c090bca3ce694064124925e47e
data/CHANGELOG.md ADDED
@@ -0,0 +1,98 @@
1
+ ## 0.8.0
2
+ * Add `Measurometer` for applying instrumentation around FormatParser operations. See documentation for usage.
3
+
4
+ ## 0.7.0
5
+ * Configure read limits / pagefault limits centrally so that those limits make sense together
6
+
7
+ ## 0.6.0
8
+ * Double the cache page size once more
9
+ * We no longer need exifr/jpeg
10
+ * Fix EXIF parsing in JPEG files
11
+ * Reject Keynote documents in JPEG parser
12
+
13
+ ## 0.5.2
14
+ * Do not raise EXIFR errors for keynote files
15
+ * Correct broken comment for the audio nature
16
+
17
+ ## 0.5.1
18
+ * Raise the cache page size during detection
19
+ * Fix ZIP entry filename parsing
20
+
21
+ ## 0.5.0
22
+ * Add FLAC parser
23
+ * Add parse_atom_children_and_data_fields support
24
+ * Add basic detection of Office files
25
+ * Optimize EOCD signature lookup
26
+
27
+ ## 0.4.0
28
+ * Adds a basic PDF parser
29
+ * Make sure root: and to_json without arguments work
30
+ * ZIP file format support
31
+
32
+ ## 0.3.5
33
+ * Fix the bug with EXIF dimensions being used instead of pixel dimensions
34
+
35
+ ## 0.3.4
36
+ * Pagefault limit
37
+ * Add seek modes required by exifr
38
+
39
+ ## 0.3.3
40
+ * Implement a sane to_json as well
41
+
42
+ ## 0.3.2
43
+ * Add default as_json
44
+ * Test on 2.5.0
45
+
46
+ ## 0.3.1
47
+ * Remove post install warning
48
+ * Moved aiff_parser_spec.rb to spec/parsers
49
+ * CR2 file support
50
+ * Add require 'set' to format_parser.rb
51
+ * Use register_parser for natures/fmts
52
+
53
+ ## 0.3.0
54
+ * Reverse API changes to support :first as default and add opts to parse_http
55
+ * Implement and comply with rubocop
56
+ * JPEG parser and Care fixes
57
+ * Add format and count options to parse_http
58
+ * Return first result as default
59
+ * Use hashes for MOOV atom default fields
60
+
61
+ ## 0.2.0
62
+ * Implement parser DSL
63
+
64
+ ## 0.1.7
65
+ * Fix read(0) on Care::IOWrapper, introduce top-level tests
66
+
67
+ ## 0.1.6
68
+ * Fix mp3 parsing bug
69
+ * Add MOOV parser
70
+
71
+ ## 0.1.5
72
+ * Add FDX parser
73
+ * Remove dry-structs
74
+ * New interface updates
75
+
76
+ ## 0.1.4
77
+ * Add WAV parser
78
+
79
+ ## 0.1.3
80
+ * Add MP3 parser
81
+ * Add FileInformation#intrinsics
82
+ * Disallow negative Care offsets
83
+
84
+ ## 0.1.2
85
+ * Introduce a restrictive IO subset wrapper
86
+ * Switch rewind for seek in exif parser
87
+ * Prep for OSS release
88
+ * Add fuzz spec
89
+ * Improve orientation parsing
90
+ * Optimisation for PNG and invalid input protection on JPEG
91
+
92
+ ## 0.1.1
93
+ * Add AIFF parser
94
+
95
+ ## 0.1.0
96
+ * Add parsers for PNG, JPG, TIFF, PSD
97
+ * Add GIF parser
98
+ * Add DPX parser
data/CONTRIBUTING.md CHANGED
@@ -16,7 +16,7 @@ If you are interested in contributing code and would like to learn more about th
16
16
 
17
17
  - [ruby](https://ruby-doc.org)
18
18
  - [rspec](http://rspec.info/) (for testing)
19
-
19
+
20
20
  # How do I make a contribution?
21
21
 
22
22
  ## Using the issue tracker
@@ -101,7 +101,7 @@ project's developers might not want to merge into the project.
101
101
  Please adhere to the coding conventions used throughout the project (indentation,
102
102
  accurate comments, etc.) and any other requirements (such as test coverage).
103
103
 
104
- The test suite can be run with `bundle exec rspec`.
104
+ The test suite can be run with `bundle exec rspec`.
105
105
 
106
106
  Follow this process if you'd like your work considered for inclusion in the
107
107
  project:
@@ -155,3 +155,7 @@ project:
155
155
  license your work under the same license as that used by the project, which you
156
156
  can see by clicking [here](https://github.com/WeTransfer/format_parser/blob/master/LICENSE.txt).
157
157
  This provision also applies to the test files you include with the changed code as fixtures.
158
+
159
+ ## Changelog
160
+
161
+ When creating a new release you must add an entry in the `CHANGELOG.md`.
data/lib/care.rb CHANGED
@@ -4,27 +4,49 @@
4
4
  # is only available via HTTP, for example, we can have less
5
5
  # fetches and have them return more data for one fetch
6
6
  class Care
7
+ # Defines the size of a page in bytes that the Care will prefetch
7
8
  DEFAULT_PAGE_SIZE = 128 * 1024
8
9
 
10
+ # Wraps any given IO with Care caching superpowers. Supports the subset
11
+ # of IO declared in IOConstraint.
9
12
  class IOWrapper
13
+ # Creates a new IOWrapper around the given source IO
14
+ #
15
+ # @param io[#seek, #pos, #size] the IO to wrap
16
+ # @param page_size[Integer] the size of the cache page to use for this wrapper
10
17
  def initialize(io, page_size: DEFAULT_PAGE_SIZE)
11
18
  @cache = Cache.new(page_size)
12
19
  @io = io
13
20
  @pos = 0
14
21
  end
15
22
 
23
+ # Returns the size of the resource contained in the IO
24
+ #
25
+ # @return Integer
16
26
  def size
17
27
  @io.size
18
28
  end
19
29
 
30
+ # Seeks the IO to the given absolute offset from the start of the file/resource
31
+ #
32
+ # @param to[Integer] offset in the IO
33
+ # @return Integer
20
34
  def seek(to)
21
35
  @pos = to
22
36
  end
23
37
 
38
+ # Returns the current position/offset within the IO
39
+ #
40
+ # @return Integer
24
41
  def pos
25
42
  @pos
26
43
  end
27
44
 
45
+ # Returns at most `n_bytes` of data from the IO or less if less data was available
46
+ # before the EOF was hit
47
+ #
48
+ # @param n_bytes[Integer]
49
+ # @return [String, nil] the content read from the IO or `nil` if no data was available
28
50
  def read(n_bytes)
29
51
  return '' if n_bytes == 0 # As hardcoded for all Ruby IO objects
30
52
  raise ArgumentError, "negative length #{n_bytes} given" if n_bytes < 0 # also as per Ruby IO objects
@@ -34,10 +56,17 @@ class Care
34
56
  read
35
57
  end
36
58
 
59
+ # Clears all the cached pages explicitly to help GC
60
+ #
61
+ # @return void
37
62
  def clear
38
63
  @cache.clear
39
64
  end
40
65
 
66
+ # Clears all the cached pages explicitly to help GC, and
67
+ # calls `#close` on the source IO if the IO responds to `#close`
68
+ #
69
+ # @return void
41
70
  def close
42
71
  clear
43
72
  @io.close if @io.respond_to?(:close)
@@ -47,6 +76,7 @@ class Care
47
76
  # Stores cached pages of data from the given IO as strings.
48
77
  # Pages are sized to be `page_size` or less (for the last page).
49
78
  class Cache
79
+ # Initializes a new cache pages container with pages of given size
50
80
  def initialize(page_size = DEFAULT_PAGE_SIZE)
51
81
  @page_size = page_size.to_i
52
82
  raise ArgumentError, 'The page size must be a positive Integer' unless @page_size > 0
@@ -59,6 +89,12 @@ class Care
59
89
  # If the IO has been exhausted, `nil` will be returned
60
90
  # instead. Will use the cached pages where available,
61
91
  # or fetch pages where necessary
92
+ #
93
+ # @param io[#seek, #read] the IO to read data from
94
+ # @param at[Integer] at which offset we have to read
95
+ # @param n_bytes[Integer] how many bytes we want to read/cache
96
+ # @return [String, nil] the content read from the IO or `nil` if no data was available
97
+ # @raise ArgumentError
62
98
  def byteslice(io, at, n_bytes)
63
99
  if n_bytes < 1
64
100
  raise ArgumentError, "The number of bytes to fetch must be a positive Integer, but was #{n_bytes}"
@@ -97,10 +133,18 @@ class Care
97
133
  slice if slice && !slice.empty?
98
134
  end
99
135
 
136
+ # Clears the page cache of all strings with data
137
+ #
138
+ # @return void
100
139
  def clear
101
140
  @pages.clear
102
141
  end
103
142
 
143
+ # Hydrates a page at the certain index or returns the contents of
144
+ # that page if it is already in the cache
145
+ #
146
+ # @param io[IO] the IO to read from
147
+ # @param page_i[Integer] which page (zero-based) to hydrate and return
104
148
  def hydrate_page(io, page_i)
105
149
  # Avoid trying to read the page if we know there is no content to fill it
106
150
  # in the underlying IO
@@ -109,9 +153,9 @@ class Care
109
153
  @pages[page_i] ||= read_page(io, page_i)
110
154
  end
111
155
 
156
+ # We provide an overridden implementation of #inspect to avoid
157
+ # printing the actual contents of the cached pages
112
158
  def inspect
113
- # To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
114
-
115
159
  # Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
116
160
  oid_str = (object_id << 1).to_s(16).rjust(16, '0')
117
161
 
@@ -124,10 +168,15 @@ class Care
124
168
  '#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
125
169
  end
126
170
 
171
+ # Reads the requested page from the given IO
172
+ #
173
+ # @param io[IO] the IO to read from
174
+ # @param page_i[Integer] which page (zero-based) to read
127
175
  def read_page(io, page_i)
176
+ FormatParser::Measurometer.increment_counter('format_parser.parser.Care.page_reads_from_upsteam', 1)
177
+
128
178
  io.seek(page_i * @page_size)
129
179
  read_result = io.read(@page_size)
130
-
131
180
  if read_result.nil?
132
181
  # If the read went past the end of the IO the read result will be nil,
133
182
  # so we know our IO is exhausted here
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.7.0'
2
+ VERSION = '0.8.0'
3
3
  end
data/lib/format_parser.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  require 'set'
2
2
 
3
+ # A pretty nimble module for parsing file metadata using partial reads. Contains all the
4
+ # top-level methods of the library.
3
5
  module FormatParser
4
6
  require_relative 'attributes_json'
5
7
  require_relative 'image'
@@ -14,9 +16,19 @@ module FormatParser
14
16
  require_relative 'io_constraint'
15
17
  require_relative 'care'
16
18
 
19
+ # Is used to manage access to the shared array of parser constructors, which might
20
+ # potentially be mutated from different threads. The mutex won't be hit too often
21
+ # since it only locks when adding/removing parsers.
17
22
  PARSER_MUX = Mutex.new
18
23
  MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
19
24
 
25
+ # Register a parser object to be used to perform file format detection. Each parser FormatParser
26
+ # provides out of the box registers itself using this method.
27
+ #
28
+ # @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
29
+ # @param formats[Array<Symbol>] file formats that the parser provides
30
+ # @param natures[Array<Symbol>] file natures that the parser provides
31
+ # @return void
20
32
  def self.register_parser(callable_or_responding_to_new, formats:, natures:)
21
33
  parser_provided_formats = Array(formats)
22
34
  parser_provided_natures = Array(natures)
@@ -36,6 +48,11 @@ module FormatParser
36
48
  end
37
49
  end
38
50
 
51
+ # Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
52
+ # tests, but can also be used to forcibly disable some formats completely.
53
+ #
54
+ # @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
55
+ # @return void
39
56
  def self.deregister_parser(callable_or_responding_to_new)
40
57
  # Used only in tests
41
58
  PARSER_MUX.synchronize do
@@ -45,11 +62,32 @@ module FormatParser
45
62
  end
46
63
  end
47
64
 
65
+ # Parses the resource at the given `url` and returns the results as if it were any IO
66
+ # given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
67
+ #
68
+ # @param url[String, URI] the HTTP(S) URL to request the object from using Faraday and `Range:` requests
69
+ # @param kwargs the keyword arguments to be delegated to `.parse`
70
+ # @see {.parse}
48
71
  def self.parse_http(url, **kwargs)
49
72
  parse(RemoteIO.new(url), **kwargs)
50
73
  end
51
74
 
52
- # Return all by default
75
+ # Parses the resource contained in the given IO-ish object, and returns either the first matched
76
+ # result (omitting all the other parsers), the first N results or all results.
77
+ #
78
+ # @param io[#seek, #pos, #read] an IO-ish object containing the resource to parse formats for
79
+ # @param natures[Array] an array of file natures to scope the parsing to.
80
+ # For example `[:image]` will limit to image files.
81
+ # The default value is "all natures known to FormatParser"
82
+ # @param formats[Array] an array of file formats to scope the parsing to.
83
+ # For example `[:jpg, :tif]` will scope the parsing to TIFF and JPEG files.
84
+ # The default value is "all formats known to FormatParser"
85
+ # @param results[:first, :all, Integer] one of the values defining how many results to return if parsing
86
+ # is ambiguous. The default is `:first` which returns the first matching result. Other
87
+ # possible values are `:all` to get all possible results and an Integer to return
88
+ # at most N results.
89
+ # @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
90
+ # no useful metadata could be recovered from the file
53
91
  def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
54
92
  # We need to apply various limits so that parsers do not over-read, do not cause too many HTTP
55
93
  # requests to be dispatched and so on. These should be _balanced_ with one another- for example,
@@ -92,31 +130,59 @@ module FormatParser
92
130
 
93
131
  # We need to rewind for each parser, anew
94
132
  limited_io.seek(0)
95
-
96
- begin
97
- parser.call(limited_io)
98
- rescue IOUtils::InvalidRead
99
- # There was not enough data for this parser to work on,
100
- # and it triggered an error
101
- rescue IOUtils::MalformedFile
102
- # Unexpected input was encountered during the parsing of
103
- # a file. This might indicate either a malicious or a
104
- # corruped file.
105
- rescue ReadLimiter::BudgetExceeded
106
- # The parser tried to read too much - most likely the file structure
107
- # caused the parser to go off-track. Strictly speaking we should log this
108
- # and examine the file more closely.
109
- # Or the parser caused too many cache pages to be fetched, which likely means we should not allow
110
- # it to continue
111
- end
133
+ execute_parser_and_capture_expected_exceptions(parser, limited_io)
112
134
  end.reject(&:nil?).take(amount)
113
135
 
114
- return results.first if amount == 1
115
-
116
136
  # Convert the results from a lazy enumerator to an Array.
117
- results.to_a
137
+ results = results.to_a
138
+
139
+ if results.empty?
140
+ Measurometer.increment_counter('format_parser.unknown_files', 1)
141
+ end
142
+
143
+ amount == 1 ? results.first : results
144
+ end
145
+
146
+ def self.execute_parser_and_capture_expected_exceptions(parser, limited_io)
147
+ parser_name_for_instrumentation = parser.class.to_s.split('::').last
148
+ Measurometer.instrument('format_parser.parser.%s' % parser_name_for_instrumentation) do
149
+ parser.call(limited_io).tap do |result|
150
+ if result
151
+ Measurometer.increment_counter('format_parser.detected_natures.%s' % result.nature, 1)
152
+ Measurometer.increment_counter('format_parser.detected_formats.%s' % result.format, 1)
153
+ end
154
+ end
155
+ end
156
+ rescue IOUtils::InvalidRead
157
+ # There was not enough data for this parser to work on,
158
+ # and it triggered an error
159
+ Measurometer.increment_counter('format_parser.invalid_read_errors', 1)
160
+ rescue IOUtils::MalformedFile
161
+ # Unexpected input was encountered during the parsing of
162
+ # a file. This might indicate either a malicious or a
163
+ # corrupted file.
164
+ Measurometer.increment_counter('format_parser.malformed_errors', 1)
165
+ rescue ReadLimiter::BudgetExceeded
166
+ # The parser tried to read too much - most likely the file structure
167
+ # caused the parser to go off-track. Strictly speaking we should log this
168
+ # and examine the file more closely.
169
+ # Or the parser caused too many cache pages to be fetched, which likely means we should not allow
170
+ # it to continue
171
+ Measurometer.increment_counter('format_parser.exceeded_budget_errors', 1)
172
+ ensure
173
+ limited_io.send_metrics(parser_name_for_instrumentation)
118
174
  end
119
175
 
176
+ # Returns objects that respond to `call` and can be called to perform parsing
177
+ # based on the _intersection_ of the two given nature/format constraints. For
178
+ # example, a constraint of "only image and only ZIP files" can be given -
179
+ # but would raise an error since no parsers provide both ZIP file parsing and
180
+ # images as their information.
181
+ #
182
+ # @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
183
+ # @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
184
+ # @return [Array<#call>] an array of callable parsers
185
+ # @raise ArgumentError when there are no parsers satisfying the constraint
120
186
  def self.parsers_for(desired_natures, desired_formats)
121
187
  assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
122
188
  hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
@@ -133,6 +199,11 @@ module FormatParser
133
199
  factories.map { |callable_or_class| instantiate_parser(callable_or_class) }
134
200
  end
135
201
 
202
+ # Instantiates a parser object (an object that responds to `#call`) from a given class
203
+ # or returns the parameter as is if it is callable by itself - i.e. if it is a Proc
204
+ #
205
+ # @param callable_or_responding_to_new[#call, #new] a callable or a Class/Module
206
+ # @return [#call] a parser that can be called with an IO-ish argument
136
207
  def self.instantiate_parser(callable_or_responding_to_new)
137
208
  if callable_or_responding_to_new.respond_to?(:call)
138
209
  callable_or_responding_to_new
@@ -146,4 +217,7 @@ module FormatParser
146
217
  Dir.glob(__dir__ + '/parsers/*.rb').sort.each do |parser_file|
147
218
  require parser_file
148
219
  end
220
+ # The Measurometer latches itself onto existing classes, so load it after
221
+ # we have loaded all the parsers
222
+ require_relative 'measurometer'
149
223
  end
data/lib/io_constraint.rb CHANGED
@@ -19,18 +19,33 @@ class FormatParser::IOConstraint
19
19
  @io = io
20
20
  end
21
21
 
22
+ # Returns at most `n_bytes` of data from the IO or less if less data was available
23
+ # before the EOF was hit
24
+ #
25
+ # @param n_bytes[Integer]
26
+ # @return [String, nil] the content read from the IO or `nil` if no data was available
22
27
  def read(n_bytes)
23
28
  @io.read(n_bytes)
24
29
  end
25
30
 
26
- def seek(absolute_offset)
27
- @io.seek(absolute_offset)
31
+ # Seeks the IO to the given absolute offset from the start of the file/resource
32
+ #
33
+ # @param to[Integer] offset in the IO
34
+ # @return Integer
35
+ def seek(to)
36
+ @io.seek(to)
28
37
  end
29
38
 
39
+ # Returns the size of the resource contained in the IO
40
+ #
41
+ # @return Integer
30
42
  def size
31
43
  @io.size
32
44
  end
33
45
 
46
+ # Returns the current position/offset within the IO
47
+ #
48
+ # @return Integer
34
49
  def pos
35
50
  @io.pos
36
51
  end
data/lib/io_utils.rb CHANGED
@@ -26,11 +26,7 @@ module FormatParser::IOUtils
26
26
 
27
27
  raise InvalidRead, 'Negative skips are not supported' if n < 0
28
28
 
29
- if io.respond_to?(:pos)
30
- io.seek(io.pos + n)
31
- else
32
- safe_read(io, n)
33
- end
29
+ io.seek(io.pos + n)
34
30
  nil
35
31
  end
36
32
 
@@ -0,0 +1,100 @@
1
+ class FormatParser::Measurometer
2
+ class << self
3
+ # Permits adding instrumentation drivers. Measurometer is 1-1 API
4
+ # compatible with Appsignal, which we use a lot. So to magically
5
+ # obtain all Appsignal instrumentation, add the Appsignal module
6
+ # as a driver.
7
+ #
8
+ # Measurometer.drivers << Appsignal
9
+ #
10
+ # A driver must be reentrant and thread-safe - it should be possible
11
+ # to have multiple `instrument` calls open from different threads at the
12
+ # same time.
13
+ # The driver must support the same interface as the Measurometer class
14
+ # itself, minus the `drivers` and `instrument_instance_method` methods.
15
+ #
16
+ # @return Array
17
+ def drivers
18
+ @drivers ||= []
19
+ @drivers
20
+ end
21
+
22
+ # Runs a given block within a cascade of `instrument` blocks of all the
23
+ # added drivers.
24
+ #
25
+ # Measurometer.instrument('do_foo') { compute! }
26
+ #
27
+ # unfolds to
28
+ # Appsignal.instrument('do_foo') do
29
+ # Statsd.timing('do_foo') do
30
+ # compute!
31
+ # end
32
+ # end
33
+ #
34
+ # A driver must be reentrant and thread-safe - it should be possible
35
+ # to have multiple `instrument` calls open from different threads at the
36
+ # same time.
37
+ # The driver must support the same interface as the Measurometer class
38
+ # itself, minus the `drivers` and `instrument_instance_method` methods.
39
+ #
40
+ # @param block_name[String] under which path to push the metric
41
+ # @param blk[#call] the block to instrument
42
+ # @return [Object] the return value of &blk
43
+ def instrument(block_name, &blk)
44
+ return yield unless @drivers && @drivers.any? # The block wrapping business is not free
45
+ @drivers.inject(blk) { |outer_block, driver|
46
+ -> {
47
+ driver.instrument(block_name, &outer_block)
48
+ }
49
+ }.call
50
+ end
51
+
52
+ # Adds a distribution value (sample) under a given path
53
+ #
54
+ # @param value_path[String] under which path to push the metric
55
+ # @param value[Numeric] distribution value
56
+ # @return nil
57
+ def add_distribution_value(value_path, value)
58
+ (@drivers || []).each { |d| d.add_distribution_value(value_path, value) }
59
+ nil
60
+ end
61
+
62
+ # Increment a named counter under a given path
63
+ #
64
+ # @param counter_path[String] under which path to push the metric
65
+ # @param by[Integer] the counter increment to apply
66
+ # @return nil
67
+ def increment_counter(counter_path, by)
68
+ (@drivers || []).each { |d| d.increment_counter(counter_path, by) }
69
+ nil
70
+ end
71
+
72
+ # Wrap an anonymous module around an instance method in the given class to have
73
+ # it instrumented automatically. The name of the measurement will be interpolated as:
74
+ #
75
+ # "#{prefix}.#{rightmost_class_constant_name}.#{instance_method_name}"
76
+ #
77
+ # @param target_class[Class] the class to instrument
78
+ # @param instance_method_name_to_instrument[Symbol] the method name to instrument
79
+ # @param path_prefix[String] under which path to push the instrumented metric
80
+ # @return void
81
+ def instrument_instance_method(target_class, instance_method_name_to_instrument, path_prefix)
82
+ short_class_name = target_class.to_s.split('::').last
83
+ instrumentation_name = [path_prefix, short_class_name, instance_method_name_to_instrument].join('.')
84
+ instrumenter_module = Module.new do
85
+ define_method(instance_method_name_to_instrument) do |*any|
86
+ ::FormatParser::Measurometer.instrument(instrumentation_name) { super(*any) }
87
+ end
88
+ end
89
+ target_class.prepend(instrumenter_module)
90
+ end
91
+ end
92
+
93
+ # Instrument things interesting in the global sense
94
+ instrument_instance_method(FormatParser::RemoteIO, :read, 'format_parser')
95
+ instrument_instance_method(Care::Cache, :read_page, 'format_parser')
96
+
97
+ # Instrument more specific things on a per-parser basis
98
+ instrument_instance_method(FormatParser::EXIFParser, :scan_image_tiff, 'format_parser')
99
+ instrument_instance_method(FormatParser::MOOVParser::Decoder, :extract_atom_stream, 'format_parser.parsers.MOOVParser')
100
+ end
@@ -112,6 +112,7 @@ class FormatParser::JPEGParser
112
112
  maybe_exif_magic_str = app1_frame_bytes[0..5]
113
113
  maybe_exif_data = app1_frame_bytes[6..-1]
114
114
  if maybe_exif_magic_str == EXIF_MAGIC_STRING
115
+ FormatParser::Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', maybe_exif_data.bytesize)
115
116
  scanner = FormatParser::EXIFParser.new(StringIO.new(maybe_exif_data))
116
117
  scanner.scan_image_tiff
117
118
 
@@ -11,10 +11,6 @@ class FormatParser::MOOVParser
11
11
  'm4a ' => :m4a,
12
12
  }
13
13
 
14
- # It is currently not documented and not particularly well-tested,
15
- # so not considered a public API for now
16
- private_constant :Decoder
17
-
18
14
  def call(io)
19
15
  return unless matches_moov_definition?(io)
20
16
 
data/lib/read_limiter.rb CHANGED
@@ -1,9 +1,17 @@
1
+ # Is used to limit the number of reads/seeks parsers can perform
1
2
  class FormatParser::ReadLimiter
2
3
  NO_LIMIT = nil
3
4
 
4
5
  class BudgetExceeded < StandardError
5
6
  end
6
7
 
8
+ # Creates a ReadLimiter wrapper around the given IO object and sets the limits
9
+ # on the number of reads/writes
10
+ #
11
+ # @param io[#seek, #pos, #size, #read] the IO object to wrap
12
+ # @param max_bytes[Integer, nil] how many bytes can we read from this before an exception is raised
13
+ # @param max_reads[Integer, nil] how many read() calls can we perform on this before an exception is raised
14
+ # @param max_seeks[Integer, nil] how many seek() calls can we perform on this before an exception is raised
7
15
  def initialize(io, max_bytes: NO_LIMIT, max_reads: NO_LIMIT, max_seeks: NO_LIMIT)
8
16
  @max_bytes = max_bytes
9
17
  @max_reads = max_reads
@@ -15,24 +23,39 @@ class FormatParser::ReadLimiter
15
23
  @bytes = 0
16
24
  end
17
25
 
26
+ # Returns the size of the resource contained in the IO
27
+ #
28
+ # @return Integer
18
29
  def size
19
30
  @io.size
20
31
  end
21
32
 
33
+ # Returns the current position/offset within the IO
34
+ #
35
+ # @return Integer
22
36
  def pos
23
37
  @io.pos
24
38
  end
25
39
 
26
- def seek(to_offset)
40
+ # Seeks the IO to the given absolute offset from the start of the file/resource
41
+ #
42
+ # @param to[Integer] offset in the IO
43
+ # @return Integer
44
+ def seek(to)
27
45
  @seeks += 1
28
46
  if @max_seeks && @seeks > @max_seeks
29
47
  raise BudgetExceeded, 'Seek budget exceeded (%d seeks performed)' % @max_seeks
30
48
  end
31
- @io.seek(to_offset)
49
+ @io.seek(to)
32
50
  end
33
51
 
34
- def read(n)
35
- @bytes += n
52
+ # Returns at most `n_bytes` of data from the IO or less if less data was available
53
+ # before the EOF was hit
54
+ #
55
+ # @param n_bytes[Integer]
56
+ # @return [String, nil] the content read from the IO or `nil` if no data was available
57
+ def read(n_bytes)
58
+ @bytes += n_bytes
36
59
  @reads += 1
37
60
 
38
61
  if @max_bytes && @bytes > @max_bytes
@@ -43,9 +66,23 @@ class FormatParser::ReadLimiter
43
66
  raise BudgetExceeded, 'Number of read() calls exceeded (%d max)' % @max_reads
44
67
  end
45
68
 
46
- @io.read(n)
69
+ @io.read(n_bytes)
47
70
  end
48
71
 
72
+ # Sends the metrics about the state of this ReadLimiter to a Measurometer
73
+ #
74
+ # @param prefix[String] the prefix to set. For example, with prefix "TIFF" the metrics will be called
75
+ # `format_parser.TIFF.read_limiter.num_seeks` and so forth
76
+ # @return void
77
+ def send_metrics(prefix)
78
+ FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_seeks' % prefix, @seeks)
79
+ FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.num_reads' % prefix, @reads)
80
+ FormatParser::Measurometer.add_distribution_value('format_parser.%s.read_limiter.read_bytes' % prefix, @bytes)
81
+ end
82
+
83
+ # Resets all the recorded call counters so that the object can be reused for the next parser,
84
+ # which will have it's own limits
85
+ # @return void
49
86
  def reset_limits!
50
87
  @seeks = 0
51
88
  @reads = 0
@@ -5,23 +5,49 @@ class FormatParser::ReadLimitsConfig
5
5
  @max_read_bytes_per_parser = total_bytes_available_per_parser.to_i
6
6
  end
7
7
 
8
+ # Defines how many bytes each parser may request to read from the IO object given to it.
9
+ # Is used to artificially limit unbounded reads in parsers that may wander off and
10
+ # try to gulp in the file given to them indefinitely due to infinite loops or
11
+ # wrongly implemented skips - or when handling data that has been deliberately
12
+ # crafted in a way that can make a parser misbehave.
13
+ # This is less strict than one could think - for example, the MOOV parser used for
14
+ # Quicktime files will skip over the actual atom contents of the atoms, and will only
15
+ # read atom headers - which stays under this limit for quite some time.
8
16
  def max_read_bytes_per_parser
9
17
  @max_read_bytes_per_parser
10
18
  end
11
19
 
20
+ # How big should the cache page be. Each cache page read will incur one `#read`
21
+ # on the underlying IO object, remote or local
12
22
  def cache_page_size
13
23
  @max_read_bytes_per_parser / 4
14
24
  end
15
25
 
26
+ # Each parser can incur HTTP requests when performing `parse_http`. This constant
27
+ # sets the maximum number of pages each parser is allowed to hit that have not
28
+ # been fetched previously and are not stored in the cache. For example, with most
29
+ # formats the first cache page and the last cache page - tail and head of the file,
30
+ # respectively - will be available right after the first parser retreives some data.
31
+ # The second parser accessing the same data will reuse the in-memory cache.
16
32
  def max_pagefaults_per_parser
17
33
  MAX_PAGE_FAULTS
18
34
  end
19
35
 
36
+ # Defines how many `#read` calls each parser may perform on the IO object given to it.
37
+ # Is used to artificially limit unbounded reads in parsers that may wander off and
38
+ # try to gulp in the file given to them indefinitely due to infinite loops or
39
+ # wrongly implemented skips - or when handling data that has been deliberately
40
+ # crafted in a way that can make a parser misbehave.
20
41
  def max_reads_per_parser
21
42
  # Imagine we read per single byte
22
43
  @max_read_bytes_per_parser / 2
23
44
  end
24
45
 
46
+ # Defines how many `#seek` calls each parser may perform on the IO object given to it.
47
+ # Is used to artificially limit unbounded reads in parsers that may wander off and
48
+ # try to gulp in the file given to them indefinitely due to infinite loops or
49
+ # wrongly implemented skips - or when handling data that has been deliberately
50
+ # crafted in a way that can make a parser misbehave.
25
51
  def max_seeks_per_parser
26
52
  # Imagine we have to seek once per byte
27
53
  @max_read_bytes_per_parser / 2
data/lib/remote_io.rb CHANGED
@@ -1,3 +1,8 @@
1
+ # Acts as a wrapper for turning a given URL into an IO object
2
+ # you can read from and seek in. Uses Faraday under the hood
3
+ # to perform fetches, so if you apply Faraday configuration
4
+ # tweaks using `Faraday.default_connection = ...` these will
5
+ # take effect for these RemoteIO objects as well
1
6
  class FormatParser::RemoteIO
2
7
  # Represents a failure that might be retried
3
8
  # (like a 5xx response or a timeout)
@@ -89,8 +94,10 @@ class FormatParser::RemoteIO
89
94
  # cannot hint size with this response - at least not when working with S3
90
95
  return
91
96
  when 500..599
97
+ FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.upstream50x_errors', 1)
92
98
  raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
93
99
  else
100
+ FormatParser::Measurometer.increment_counter('format_parser.RemoteIO.invalid_request_errors', 1)
94
101
  raise InvalidRequest, "Server at #{@uri} replied with a #{response.status} and refused our request"
95
102
  end
96
103
  end
@@ -27,16 +27,10 @@ describe 'IOUtils' do
27
27
  }.to raise_error(FormatParser::IOUtils::InvalidRead)
28
28
  end
29
29
 
30
- it 'uses #pos if available on the object' do
30
+ it 'uses #pos available on the object' do
31
31
  fake_io = double(pos: 11)
32
32
  expect(fake_io).to receive(:seek).with(11 + 5)
33
33
  safe_skip(fake_io, 5)
34
34
  end
35
-
36
- it 'uses #read if no #pos is available on the object' do
37
- fake_io = double
38
- expect(fake_io).to receive(:read).with(5).and_return('x' * 5)
39
- safe_skip(fake_io, 5)
40
- end
41
35
  end
42
36
  end
@@ -0,0 +1,48 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::Measurometer do
4
+ RSpec::Matchers.define :include_counter_or_measurement_named do |named|
5
+ match do |actual|
6
+ actual.any? do |e|
7
+ e[0] == named && e[1] > 0
8
+ end
9
+ end
10
+ end
11
+
12
+ it 'instruments a full cycle FormatParser.parse' do
13
+ driver_class = Class.new do
14
+ attr_accessor :timings, :counters, :distributions
15
+ def instrument(block_name)
16
+ s = Process.clock_gettime(Process::CLOCK_MONOTONIC)
17
+ yield.tap do
18
+ delta = Process.clock_gettime(Process::CLOCK_MONOTONIC) - s
19
+ @timings ||= []
20
+ @timings << [block_name, delta * 1000]
21
+ end
22
+ end
23
+
24
+ def add_distribution_value(value_path, value)
25
+ @distributions ||= []
26
+ @distributions << [value_path, value]
27
+ end
28
+
29
+ def increment_counter(value_path, value)
30
+ @counters ||= []
31
+ @counters << [value_path, value]
32
+ end
33
+ end
34
+
35
+ instrumenter = driver_class.new
36
+ described_class.drivers << instrumenter
37
+
38
+ FormatParser.parse(File.open(fixtures_dir + 'JPEG/keynote_recognized_as_jpeg.key', 'rb'), results: :all)
39
+
40
+ described_class.drivers.delete(instrumenter)
41
+ expect(described_class.drivers).not_to include(instrumenter)
42
+
43
+ expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.detected_formats.zip')
44
+ expect(instrumenter.counters).to include_counter_or_measurement_named('format_parser.parser.Care.page_reads_from_upsteam')
45
+ expect(instrumenter.distributions).to include_counter_or_measurement_named('format_parser.ZIPParser.read_limiter.read_bytes')
46
+ expect(instrumenter.timings).to include_counter_or_measurement_named('format_parser.Cache.read_page')
47
+ end
48
+ end
@@ -27,11 +27,12 @@ describe 'Fetching data from HTTP remotes' do
27
27
  end
28
28
 
29
29
  it '#parse_http is called with hash options' do
30
- expect_any_instance_of(FormatParser::AIFFParser).to receive(:call).and_return(:audio)
31
- result = FormatParser.parse_http('http://localhost:9399/PNG/anim.png', results: :all)
30
+ fake_result = double(nature: :audio, format: :aiff)
31
+ expect_any_instance_of(FormatParser::AIFFParser).to receive(:call).and_return(fake_result)
32
+ results = FormatParser.parse_http('http://localhost:9399/PNG/anim.png', results: :all)
32
33
 
33
- expect(result.include?(:audio)).to be true
34
- expect(result.count).to eq(2)
34
+ expect(results.count).to eq(2)
35
+ expect(results).to include(fake_result)
35
36
  end
36
37
 
37
38
  it 'parses the animated PNG over HTTP' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-04-16 00:00:00.000000000 Z
12
+ date: 2018-04-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -152,6 +152,7 @@ files:
152
152
  - ".rspec"
153
153
  - ".rubocop.yml"
154
154
  - ".travis.yml"
155
+ - CHANGELOG.md
155
156
  - CODE_OF_CONDUCT.md
156
157
  - CONTRIBUTING.md
157
158
  - Gemfile
@@ -169,6 +170,7 @@ files:
169
170
  - lib/image.rb
170
171
  - lib/io_constraint.rb
171
172
  - lib/io_utils.rb
173
+ - lib/measurometer.rb
172
174
  - lib/parsers/aiff_parser.rb
173
175
  - lib/parsers/cr2_parser.rb
174
176
  - lib/parsers/dpx_parser.rb
@@ -200,6 +202,7 @@ files:
200
202
  - spec/file_information_spec.rb
201
203
  - spec/format_parser_spec.rb
202
204
  - spec/io_utils_spec.rb
205
+ - spec/measurometer_spec.rb
203
206
  - spec/parsers/aiff_parser_spec.rb
204
207
  - spec/parsers/cr2_parser_spec.rb
205
208
  - spec/parsers/dpx_parser_spec.rb