format_parser 1.7.0 → 2.0.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/remote_io.rb CHANGED
@@ -1,14 +1,14 @@
1
+ require 'net/http'
2
+
1
3
  # Acts as a wrapper for turning a given URL into an IO object
2
- # you can read from and seek in. Uses Faraday under the hood
3
- # to perform fetches, so if you apply Faraday configuration
4
- # tweaks using `Faraday.default_connection = ...` these will
5
- # take effect for these RemoteIO objects as well
4
+ # you can read from and seek in.
6
5
  class FormatParser::RemoteIO
7
6
  class UpstreamError < StandardError
8
7
  # @return Integer
9
8
  attr_reader :status_code
9
+
10
10
  def initialize(status_code, message)
11
- @status_code = status_code
11
+ @status_code = Integer(status_code)
12
12
  super(message)
13
13
  end
14
14
  end
@@ -23,13 +23,19 @@ class FormatParser::RemoteIO
23
23
  class InvalidRequest < UpstreamError
24
24
  end
25
25
 
26
- # @param uri[URI, String] the remote URL to obtain
26
+ # Represents a failure where the maximum number of
27
+ # redirect requests is exceeded.
28
+ class RedirectLimitReached < UpstreamError
29
+ def initialize(uri)
30
+ super(504, "Too many redirects; last one to: #{uri}")
31
+ end
32
+ end
33
+
34
+ # @param uri[String, URI::Generic] the remote URL to obtain
27
35
  # @param headers[Hash] (optional) the HTTP headers to be used in the HTTP request
28
36
  def initialize(uri, headers: {})
29
- require 'faraday'
30
- require 'faraday_middleware/response/follow_redirects'
31
37
  @headers = headers
32
- @uri = uri
38
+ @uri = URI(uri)
33
39
  @pos = 0
34
40
  @remote_size = false
35
41
  end
@@ -63,7 +69,7 @@ class FormatParser::RemoteIO
63
69
  # @return [String] the read bytes
64
70
  def read(n_bytes)
65
71
  http_range = (@pos..(@pos + n_bytes - 1))
66
- maybe_size, maybe_body = Measurometer.instrument('format_parser.RemoteIO.read') { request_range(http_range) }
72
+ maybe_size, maybe_body = Measurometer.instrument('format_parser.remote_io.read') { request_range(http_range) }
67
73
  if maybe_size && maybe_body
68
74
  @remote_size = maybe_size
69
75
  @pos += maybe_body.bytesize
@@ -73,23 +79,39 @@ class FormatParser::RemoteIO
73
79
 
74
80
  protected
75
81
 
82
+ REDIRECT_LIMIT = 3
83
+ UNSAFE_URI_CHARS = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%]}
84
+
85
+ # Generate the URI to fetch from following a redirect response.
86
+ #
87
+ # @param location[String] The new URI reference, as provided by the Location header of the previous response.
88
+ # @param previous_uri[URI] The URI used in the previous request.
89
+ def redirect_uri(location, previous_uri)
90
+ # Escape unsafe characters in location. Use location as new URI if absolute, otherwise use it to replace the path of
91
+ # the previous URI.
92
+ new_uri = previous_uri.merge(location.to_s.gsub(UNSAFE_URI_CHARS) do |unsafe_char|
93
+ "%#{unsafe_char.unpack('H2' * unsafe_char.bytesize).join('%').upcase}"
94
+ end)
95
+ # Keep previous URI's fragment if not present in location (https://www.rfc-editor.org/rfc/rfc9110.html#section-10.2.2-5)
96
+ new_uri.fragment = previous_uri.fragment unless new_uri.fragment
97
+ new_uri
98
+ end
99
+
76
100
  # Only used internally when reading the remote file
77
101
  #
78
- # @param range[Range] the HTTP range of data to fetch from remote
79
- # @return [String] the response body of the ranged request
80
- def request_range(range)
102
+ # @param range[Range] The HTTP range of data to fetch from remote
103
+ # @param uri[URI] The URI to fetch from
104
+ # @param redirects[Integer] The number of remaining permitted redirects
105
+ # @return [Array(Integer, String), nil] The size of the remote resource and the response body of the ranged request, or nil on a 416 response
106
+ def request_range(range, uri = @uri, redirects = REDIRECT_LIMIT)
81
107
  # We use a GET and not a HEAD request followed by a GET because
82
108
  # S3 does not allow HEAD requests if you only presigned your URL for GETs, so we
83
109
  # combine the first GET of a segment and retrieving the size of the resource
84
- conn = Faraday.new(headers: @headers) do |faraday|
85
- faraday.use FaradayMiddleware::FollowRedirects
86
- # we still need the default adapter, more details: https://blog.thecodewhisperer.com/permalink/losing-time-to-faraday
87
- faraday.adapter Faraday.default_adapter
110
+ response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
111
+ http.request_get(uri, @headers.merge({ 'range' => 'bytes=%d-%d' % [range.begin, range.end] }))
88
112
  end
89
- response = conn.get(@uri, nil, range: 'bytes=%d-%d' % [range.begin, range.end])
90
-
91
- case response.status
92
- when 200
113
+ case response
114
+ when Net::HTTPOK
93
115
  # S3 returns 200 when you request a Range that is fully satisfied by the entire object,
94
116
  # we take that into account here. Also, for very tiny responses (and also for empty responses)
95
117
  # the responses are going to be 200 which does not mean we cannot proceed
@@ -100,16 +122,16 @@ class FormatParser::RemoteIO
100
122
  error_message = [
101
123
  "We requested #{requested_range_size} bytes, but the server sent us more",
102
124
  "(#{response_size} bytes) - it likely has no `Range:` support.",
103
- "The error occurred when talking to #{@uri})"
125
+ "The error occurred when talking to #{uri})"
104
126
  ]
105
- raise InvalidRequest.new(response.status, error_message.join("\n"))
127
+ raise InvalidRequest.new(response.code, error_message.join("\n"))
106
128
  end
107
129
  [response_size, response.body]
108
- when 206
130
+ when Net::HTTPPartialContent
109
131
  # Figure out if the server supports content ranges, if it doesn't we have no
110
132
  # business working with that server
111
- range_header = response.headers['Content-Range']
112
- raise InvalidRequest.new(response.status, "The server replied with 206 status but no Content-Range at #{@uri}") unless range_header
133
+ range_header = response['Content-Range']
134
+ raise InvalidRequest.new(response.code, "The server replied with 206 status but no Content-Range at #{uri}") unless range_header
113
135
 
114
136
  # "Content-Range: bytes 0-0/307404381" is how the response header is structured
115
137
  size = range_header[/\/(\d+)$/, 1].to_i
@@ -117,19 +139,27 @@ class FormatParser::RemoteIO
117
139
  # If we request a _larger_ range than what can be satisfied by the server,
118
140
  # the response is going to only contain what _can_ be sent and the status is also going
119
141
  # to be 206
120
- return [size, response.body]
121
- when 416
142
+ [size, response.body]
143
+ when Net::HTTPMovedPermanently, Net::HTTPFound, Net::HTTPSeeOther, Net::HTTPTemporaryRedirect, Net::HTTPPermanentRedirect
144
+ raise RedirectLimitReached.new(uri) if redirects == 0
145
+ location = response['location']
146
+ if location
147
+ request_range(range, redirect_uri(location, uri), redirects - 1)
148
+ else
149
+ raise InvalidRequest.new(response.code, "Server at #{uri} replied with a #{response.code}, indicating redirection; however, the location header was empty.")
150
+ end
151
+ when Net::HTTPRangeNotSatisfiable
122
152
  # We return `nil` if we tried to read past the end of the IO,
123
153
  # which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
124
154
  # S3 will also handily _not_ supply us with the Content-Range of the actual resource, so we
125
155
  # cannot hint size with this response - at lease not when working with S3
126
- return
127
- when 500..599
128
- Measurometer.increment_counter('format_parser.RemoteIO.upstream50x_errors', 1)
129
- raise IntermittentFailure.new(response.status, "Server at #{@uri} replied with a #{response.status} and we might want to retry")
156
+ nil
157
+ when Net::HTTPServerError
158
+ Measurometer.increment_counter('format_parser.remote_io.upstream50x_errors', 1)
159
+ raise IntermittentFailure.new(response.code, "Server at #{uri} replied with a #{response.code} and we might want to retry")
130
160
  else
131
- Measurometer.increment_counter('format_parser.RemoteIO.invalid_request_errors', 1)
132
- raise InvalidRequest.new(response.status, "Server at #{@uri} replied with a #{response.status} and refused our request")
161
+ Measurometer.increment_counter('format_parser.remote_io.invalid_request_errors', 1)
162
+ raise InvalidRequest.new(response.code, "Server at #{uri} replied with a #{response.code} and refused our request")
133
163
  end
134
164
  end
135
165
  end
data/lib/string.rb ADDED
@@ -0,0 +1,9 @@
1
+ class String
2
+ def underscore
3
+ gsub(/::/, '/').
4
+ gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
5
+ gsub(/([a-z\d])([A-Z])/, '\1_\2').
6
+ tr('-', '_').
7
+ downcase
8
+ end
9
+ end
@@ -106,9 +106,6 @@ describe FormatParser::AttributesJSON do
106
106
  struct: Struct.new(:key).new('Value'),
107
107
  content: "\x01\xFF\xFEb\x00i\x00r\x00d\x00s\x00 \x005\x00 \x00m\x00o\x00r\x00e\x00 \x00c\x00o\x00m\x00p\x00".b
108
108
  }
109
- expect {
110
- JSON.pretty_generate(nasty_hash) # Should not raise an error
111
- }.to raise_error(Encoding::UndefinedConversionError)
112
109
 
113
110
  anon_class = Struct.new(:evil)
114
111
  anon_class.include FormatParser::AttributesJSON
@@ -124,14 +124,9 @@ describe 'Fetching data from HTTP remotes' do
124
124
  end
125
125
 
126
126
  it 'sends provided HTTP headers in the request' do
127
- # Faraday is required only after calling .parse_http
128
- # This line is just to trigger this require, then it's possible to
129
- # add an expectation of how Faraday is initialized after.
130
- FormatParser.parse_http('invalid_url') rescue nil
131
-
132
- expect(Faraday)
133
- .to receive(:new)
134
- .with(headers: {'test-header' => 'test-value'})
127
+ expect_any_instance_of(Net::HTTP)
128
+ .to receive(:request_get)
129
+ .with(anything, a_hash_including('test-header' => 'test-value'))
135
130
  .and_call_original
136
131
 
137
132
  file_information = FormatParser.parse_http(
@@ -4,130 +4,186 @@ describe FormatParser::RemoteIO do
4
4
  it_behaves_like 'an IO object compatible with IOConstraint'
5
5
 
6
6
  it 'returns the partial content when the server supplies a 206 status' do
7
- rio = described_class.new('https://images.invalid/img.jpg')
7
+ url = 'https://images.invalid/img.jpg'
8
+ response = Net::HTTPPartialContent.new('2', '206', 'Partial Content')
9
+ response['Content-Range'] = '10-109/2577'
10
+ allow(response).to receive(:body).and_return('Response body')
8
11
 
9
- fake_resp = double(headers: {'Content-Range' => '10-109/2577'}, status: 206, body: 'This is the response')
10
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
11
- allow(Faraday).to receive(:new).and_return(faraday_conn)
12
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=10-109')
12
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
13
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
13
14
 
15
+ expect(Net::HTTP).to receive(:request_get).with(
16
+ an_object_satisfying { |uri| URI::HTTPS === uri && uri.to_s == url },
17
+ a_hash_including('range' => 'bytes=10-109')
18
+ )
19
+
20
+ rio = described_class.new(url)
14
21
  rio.seek(10)
15
22
  read_result = rio.read(100)
16
- expect(read_result).to eq('This is the response')
23
+
24
+ expect(read_result).to eq(response.body)
17
25
  end
18
26
 
19
27
  it 'returns the entire content when the server supplies the Content-Range response but sends a 200 status' do
20
- rio = described_class.new('https://images.invalid/img.jpg')
28
+ url = 'https://images.invalid/img.jpg'
29
+ response = Net::HTTPOK.new('2', '200', 'OK')
30
+ allow(response).to receive(:body).and_return('Response body')
31
+
32
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
33
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
21
34
 
22
- fake_resp = double(headers: {'Content-Range' => '10-109/2577'}, status: 200, body: 'This is the response')
23
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
24
- allow(Faraday).to receive(:new).and_return(faraday_conn)
25
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=10-109')
35
+ expect(Net::HTTP).to receive(:request_get).with(
36
+ an_object_satisfying { |uri| URI::HTTPS === uri && uri.to_s == url },
37
+ a_hash_including('range' => 'bytes=10-109')
38
+ )
26
39
 
40
+ rio = described_class.new(url)
27
41
  rio.seek(10)
28
42
  read_result = rio.read(100)
29
- expect(read_result).to eq('This is the response')
43
+
44
+ expect(read_result).to eq(response.body)
30
45
  end
31
46
 
32
47
  it 'raises a specific error for all 4xx responses except 416' do
33
- rio = described_class.new('https://images.invalid/img.jpg')
48
+ url = 'https://images.invalid/img.jpg'
49
+ response = Net::HTTPForbidden.new('2', '403', 'Forbidden')
50
+
51
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
52
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
34
53
 
35
- fake_resp = double(headers: {}, status: 403, body: 'Please log in')
36
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
37
- allow(Faraday).to receive(:new).and_return(faraday_conn)
38
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
54
+ expect(Net::HTTP).to receive(:request_get).with(
55
+ an_object_satisfying { |uri| uri.to_s == url },
56
+ a_hash_including('range' => 'bytes=100-199')
57
+ )
39
58
 
59
+ rio = described_class.new(url)
40
60
  rio.seek(100)
61
+
41
62
  expect { rio.read(100) }.to raise_error(/replied with a 403 and refused/)
42
63
  end
43
64
 
44
65
  it 'returns nil on a 416 response' do
45
- rio = described_class.new('https://images.invalid/img.jpg')
66
+ url = 'https://images.invalid/img.jpg'
67
+ response = Net::HTTPRangeNotSatisfiable.new('2', '416', 'Range Not Satisfiable')
46
68
 
47
- fake_resp = double(headers: {}, status: 416, body: 'You stepped off the ledge of the range')
48
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
49
- allow(Faraday).to receive(:new).and_return(faraday_conn)
50
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
69
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
70
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
51
71
 
72
+ expect(Net::HTTP).to receive(:request_get).with(
73
+ an_object_satisfying { |uri| uri.to_s == url },
74
+ a_hash_including('range' => 'bytes=100-199')
75
+ )
76
+
77
+ rio = described_class.new(url)
52
78
  rio.seek(100)
79
+
53
80
  expect(rio.read(100)).to be_nil
54
81
  end
55
82
 
56
83
  it 'sets the status_code of the exception on a 4xx response from upstream' do
57
- rio = described_class.new('https://images.invalid/img.jpg')
84
+ url = 'https://images.invalid/img.jpg'
85
+ response = Net::HTTPForbidden.new('2', '403', 'Forbidden')
86
+
87
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
88
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
58
89
 
59
- fake_resp = double(headers: {}, status: 403, body: 'Please log in')
60
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
61
- allow(Faraday).to receive(:new).and_return(faraday_conn)
62
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
90
+ expect(Net::HTTP).to receive(:request_get).with(
91
+ an_object_satisfying { |uri| uri.to_s == url },
92
+ a_hash_including('range' => 'bytes=100-199')
93
+ )
63
94
 
95
+ rio = described_class.new(url)
64
96
  rio.seek(100)
65
- # rubocop: disable Lint/AmbiguousBlockAssociation
66
- expect { rio.read(100) }.to raise_error { |e| expect(e.status_code).to eq(403) }
97
+ expect { rio.read(100) }.to(raise_error { |e| expect(e.status_code).to eq(403) })
67
98
  end
68
99
 
69
100
  it 'returns a nil when the range cannot be satisfied and the response is 416' do
70
- rio = described_class.new('https://images.invalid/img.jpg')
101
+ url = 'https://images.invalid/img.jpg'
102
+ response = Net::HTTPRangeNotSatisfiable.new('2', '416', 'Range Not Satisfiable')
71
103
 
72
- fake_resp = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
73
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
74
- allow(Faraday).to receive(:new).and_return(faraday_conn)
75
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
104
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
105
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
76
106
 
107
+ expect(Net::HTTP).to receive(:request_get).with(
108
+ an_object_satisfying { |uri| uri.to_s == url },
109
+ a_hash_including('range' => 'bytes=100-199')
110
+ )
111
+
112
+ rio = described_class.new(url)
77
113
  rio.seek(100)
114
+
78
115
  expect(rio.read(100)).to be_nil
79
116
  end
80
117
 
81
118
  it 'does not overwrite size when the range cannot be satisfied and the response is 416' do
82
- rio = described_class.new('https://images.invalid/img.jpg')
83
-
84
- fake_resp1 = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
85
- fake_resp2 = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
86
-
87
- faraday_conn = instance_double(Faraday::Connection)
88
- allow(Faraday).to receive(:new).and_return(faraday_conn)
89
- expect(faraday_conn).to receive(:get)
90
- .with('https://images.invalid/img.jpg', nil, range: 'bytes=0-0')
119
+ url = 'https://images.invalid/img.jpg'
120
+ response_1 = Net::HTTPPartialContent.new('2', '206', 'Partial Content')
121
+ response_1['Content-Range'] = 'bytes 0-0/13'
122
+ allow(response_1).to receive(:body).and_return('Response body')
123
+ response_2 = Net::HTTPRangeNotSatisfiable.new('2', '416', 'Range Not Satisfiable')
124
+
125
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response_1, response_2)
126
+ allow(Net::HTTP).to receive(:request_get).and_return(response_1, response_2)
127
+
128
+ expect(Net::HTTP).to receive(:request_get)
129
+ .with(
130
+ an_object_satisfying { |uri| uri.to_s == url },
131
+ a_hash_including('range' => 'bytes=0-0')
132
+ )
91
133
  .ordered
92
- .and_return(fake_resp1)
93
- expect(faraday_conn).to receive(:get)
94
- .with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
134
+ expect(Net::HTTP).to receive(:request_get)
135
+ .with(
136
+ an_object_satisfying { |uri| uri.to_s == url },
137
+ a_hash_including('range' => 'bytes=100-199')
138
+ )
95
139
  .ordered
96
- .and_return(fake_resp2)
97
140
 
141
+ rio = described_class.new(url)
98
142
  rio.read(1)
99
143
 
100
144
  expect(rio.size).to eq(13)
101
145
 
102
146
  rio.seek(100)
103
- expect(rio.read(100)).to be_nil
104
147
 
148
+ expect(rio.read(100)).to be_nil
105
149
  expect(rio.size).to eq(13)
106
150
  end
107
151
 
108
152
  it 'raises a specific error for all 5xx responses' do
109
- rio = described_class.new('https://images.invalid/img.jpg')
153
+ url = 'https://images.invalid/img.jpg'
154
+ response = Net::HTTPBadGateway.new('2', '502', 'Bad Gateway')
155
+
156
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
157
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
110
158
 
111
- fake_resp = double(headers: {}, status: 502, body: 'Guru meditation')
112
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
113
- allow(Faraday).to receive(:new).and_return(faraday_conn)
114
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
159
+ expect(Net::HTTP).to receive(:request_get).with(
160
+ an_object_satisfying { |uri| uri.to_s == url },
161
+ a_hash_including('range' => 'bytes=100-199')
162
+ )
115
163
 
164
+ rio = described_class.new(url)
116
165
  rio.seek(100)
166
+
117
167
  expect { rio.read(100) }.to raise_error(/replied with a 502 and we might want to retry/)
118
168
  end
119
169
 
120
170
  it 'maintains and exposes #pos' do
121
- rio = described_class.new('https://images.invalid/img.jpg')
171
+ url = 'https://images.invalid/img.jpg'
172
+ response = Net::HTTPPartialContent.new('2', '206', 'Partial Content')
173
+ response['Content-Range'] = 'bytes 0-0/13'
174
+ allow(response).to receive(:body).and_return('a')
122
175
 
123
- expect(rio.pos).to eq(0)
176
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
177
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
124
178
 
125
- fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
126
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
127
- allow(Faraday).to receive(:new).and_return(faraday_conn)
128
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=0-0')
129
- rio.read(1)
179
+ expect(Net::HTTP).to receive(:request_get).with(
180
+ an_object_satisfying { |uri| uri.to_s == url },
181
+ a_hash_including('range' => 'bytes=0-0')
182
+ )
130
183
 
184
+ rio = described_class.new(url)
185
+ expect(rio.pos).to eq(0)
186
+ rio.read(1)
131
187
  expect(rio.pos).to eq(1)
132
188
  end
133
189
  end