format_parser 1.7.0 → 2.0.0.pre

Sign up to get free protection for your applications and to get access to all the features.
data/lib/remote_io.rb CHANGED
@@ -1,14 +1,14 @@
1
+ require 'net/http'
2
+
1
3
  # Acts as a wrapper for turning a given URL into an IO object
2
- # you can read from and seek in. Uses Faraday under the hood
3
- # to perform fetches, so if you apply Faraday configuration
4
- # tweaks using `Faraday.default_connection = ...` these will
5
- # take effect for these RemoteIO objects as well
4
+ # you can read from and seek in.
6
5
  class FormatParser::RemoteIO
7
6
  class UpstreamError < StandardError
8
7
  # @return Integer
9
8
  attr_reader :status_code
9
+
10
10
  def initialize(status_code, message)
11
- @status_code = status_code
11
+ @status_code = Integer(status_code)
12
12
  super(message)
13
13
  end
14
14
  end
@@ -23,13 +23,19 @@ class FormatParser::RemoteIO
23
23
  class InvalidRequest < UpstreamError
24
24
  end
25
25
 
26
- # @param uri[URI, String] the remote URL to obtain
26
+ # Represents a failure where the maximum number of
27
+ # permitted redirects has been exceeded.
28
+ class RedirectLimitReached < UpstreamError
29
+ def initialize(uri)
30
+ super(504, "Too many redirects; last one to: #{uri}")
31
+ end
32
+ end
33
+
34
+ # @param uri[String, URI::Generic] the remote URL to obtain
27
35
  # @param headers[Hash] (optional) the HTTP headers to be used in the HTTP request
28
36
  def initialize(uri, headers: {})
29
- require 'faraday'
30
- require 'faraday_middleware/response/follow_redirects'
31
37
  @headers = headers
32
- @uri = uri
38
+ @uri = URI(uri)
33
39
  @pos = 0
34
40
  @remote_size = false
35
41
  end
@@ -63,7 +69,7 @@ class FormatParser::RemoteIO
63
69
  # @return [String] the read bytes
64
70
  def read(n_bytes)
65
71
  http_range = (@pos..(@pos + n_bytes - 1))
66
- maybe_size, maybe_body = Measurometer.instrument('format_parser.RemoteIO.read') { request_range(http_range) }
72
+ maybe_size, maybe_body = Measurometer.instrument('format_parser.remote_io.read') { request_range(http_range) }
67
73
  if maybe_size && maybe_body
68
74
  @remote_size = maybe_size
69
75
  @pos += maybe_body.bytesize
@@ -73,23 +79,39 @@ class FormatParser::RemoteIO
73
79
 
74
80
  protected
75
81
 
82
+ REDIRECT_LIMIT = 3
83
+ UNSAFE_URI_CHARS = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%]}
84
+
85
+ # Generate the URI to fetch from following a redirect response.
86
+ #
87
+ # @param location[String] The new URI reference, as provided by the Location header of the previous response.
88
+ # @param previous_uri[URI] The URI used in the previous request.
89
+ def redirect_uri(location, previous_uri)
90
+ # Escape unsafe characters in location. Use location as new URI if absolute, otherwise use it to replace the path of
91
+ # the previous URI.
92
+ new_uri = previous_uri.merge(location.to_s.gsub(UNSAFE_URI_CHARS) do |unsafe_char|
93
+ "%#{unsafe_char.unpack('H2' * unsafe_char.bytesize).join('%').upcase}"
94
+ end)
95
+ # Keep previous URI's fragment if not present in location (https://www.rfc-editor.org/rfc/rfc9110.html#section-10.2.2-5)
96
+ new_uri.fragment = previous_uri.fragment unless new_uri.fragment
97
+ new_uri
98
+ end
99
+
76
100
  # Only used internally when reading the remote file
77
101
  #
78
- # @param range[Range] the HTTP range of data to fetch from remote
79
- # @return [String] the response body of the ranged request
80
- def request_range(range)
102
+ # @param range[Range] The HTTP range of data to fetch from remote
103
+ # @param uri[URI] The URI to fetch from
104
+ # @param redirects[Integer] The number of remaining permitted redirects
105
+ # @return [Array(Integer, String), nil] The size of the remote resource and the response body of the ranged request, or nil on a 416 response
106
+ def request_range(range, uri = @uri, redirects = REDIRECT_LIMIT)
81
107
  # We use a GET and not a HEAD request followed by a GET because
82
108
  # S3 does not allow HEAD requests if you only presigned your URL for GETs, so we
83
109
  # combine the first GET of a segment and retrieving the size of the resource
84
- conn = Faraday.new(headers: @headers) do |faraday|
85
- faraday.use FaradayMiddleware::FollowRedirects
86
- # we still need the default adapter, more details: https://blog.thecodewhisperer.com/permalink/losing-time-to-faraday
87
- faraday.adapter Faraday.default_adapter
110
+ response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
111
+ http.request_get(uri, @headers.merge({ 'range' => 'bytes=%d-%d' % [range.begin, range.end] }))
88
112
  end
89
- response = conn.get(@uri, nil, range: 'bytes=%d-%d' % [range.begin, range.end])
90
-
91
- case response.status
92
- when 200
113
+ case response
114
+ when Net::HTTPOK
93
115
  # S3 returns 200 when you request a Range that is fully satisfied by the entire object,
94
116
  # we take that into account here. Also, for very tiny responses (and also for empty responses)
95
117
  # the responses are going to be 200 which does not mean we cannot proceed
@@ -100,16 +122,16 @@ class FormatParser::RemoteIO
100
122
  error_message = [
101
123
  "We requested #{requested_range_size} bytes, but the server sent us more",
102
124
  "(#{response_size} bytes) - it likely has no `Range:` support.",
103
- "The error occurred when talking to #{@uri})"
125
+ "The error occurred when talking to #{uri})"
104
126
  ]
105
- raise InvalidRequest.new(response.status, error_message.join("\n"))
127
+ raise InvalidRequest.new(response.code, error_message.join("\n"))
106
128
  end
107
129
  [response_size, response.body]
108
- when 206
130
+ when Net::HTTPPartialContent
109
131
  # Figure out if the server supports content ranges; if it doesn't, we have no
110
132
  # business working with that server
111
- range_header = response.headers['Content-Range']
112
- raise InvalidRequest.new(response.status, "The server replied with 206 status but no Content-Range at #{@uri}") unless range_header
133
+ range_header = response['Content-Range']
134
+ raise InvalidRequest.new(response.code, "The server replied with 206 status but no Content-Range at #{uri}") unless range_header
113
135
 
114
136
  # "Content-Range: bytes 0-0/307404381" is how the response header is structured
115
137
  size = range_header[/\/(\d+)$/, 1].to_i
@@ -117,19 +139,27 @@ class FormatParser::RemoteIO
117
139
  # If we request a _larger_ range than what can be satisfied by the server,
118
140
  # the response is going to only contain what _can_ be sent and the status is also going
119
141
  # to be 206
120
- return [size, response.body]
121
- when 416
142
+ [size, response.body]
143
+ when Net::HTTPMovedPermanently, Net::HTTPFound, Net::HTTPSeeOther, Net::HTTPTemporaryRedirect, Net::HTTPPermanentRedirect
144
+ raise RedirectLimitReached.new(uri) if redirects == 0 # instantiate the error class; a bare `RedirectLimitReached(uri)` is a method call and raises NoMethodError
145
+ location = response['location']
146
+ if location
147
+ request_range(range, redirect_uri(location, uri), redirects - 1)
148
+ else
149
+ raise InvalidRequest.new(response.code, "Server at #{uri} replied with a #{response.code}, indicating redirection; however, the location header was empty.")
150
+ end
151
+ when Net::HTTPRangeNotSatisfiable
122
152
  # We return `nil` if we tried to read past the end of the IO,
123
153
  # which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
124
154
  # S3 will also handily _not_ supply us with the Content-Range of the actual resource, so we
125
155
  # cannot hint size with this response - at least not when working with S3
126
- return
127
- when 500..599
128
- Measurometer.increment_counter('format_parser.RemoteIO.upstream50x_errors', 1)
129
- raise IntermittentFailure.new(response.status, "Server at #{@uri} replied with a #{response.status} and we might want to retry")
156
+ nil
157
+ when Net::HTTPServerError
158
+ Measurometer.increment_counter('format_parser.remote_io.upstream50x_errors', 1)
159
+ raise IntermittentFailure.new(response.code, "Server at #{uri} replied with a #{response.code} and we might want to retry")
130
160
  else
131
- Measurometer.increment_counter('format_parser.RemoteIO.invalid_request_errors', 1)
132
- raise InvalidRequest.new(response.status, "Server at #{@uri} replied with a #{response.status} and refused our request")
161
+ Measurometer.increment_counter('format_parser.remote_io.invalid_request_errors', 1)
162
+ raise InvalidRequest.new(response.code, "Server at #{uri} replied with a #{response.code} and refused our request")
133
163
  end
134
164
  end
135
165
  end
data/lib/string.rb ADDED
@@ -0,0 +1,9 @@
1
+ class String
2
+ def underscore
3
+ gsub(/::/, '/').
4
+ gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
5
+ gsub(/([a-z\d])([A-Z])/, '\1_\2').
6
+ tr('-', '_').
7
+ downcase
8
+ end
9
+ end
@@ -106,9 +106,6 @@ describe FormatParser::AttributesJSON do
106
106
  struct: Struct.new(:key).new('Value'),
107
107
  content: "\x01\xFF\xFEb\x00i\x00r\x00d\x00s\x00 \x005\x00 \x00m\x00o\x00r\x00e\x00 \x00c\x00o\x00m\x00p\x00".b
108
108
  }
109
- expect {
110
- JSON.pretty_generate(nasty_hash) # Should not raise an error
111
- }.to raise_error(Encoding::UndefinedConversionError)
112
109
 
113
110
  anon_class = Struct.new(:evil)
114
111
  anon_class.include FormatParser::AttributesJSON
@@ -124,14 +124,9 @@ describe 'Fetching data from HTTP remotes' do
124
124
  end
125
125
 
126
126
  it 'sends provided HTTP headers in the request' do
127
- # Faraday is required only after calling .parse_http
128
- # This line is just to trigger this require, then it's possible to
129
- # add an expectation of how Faraday is initialized after.
130
- FormatParser.parse_http('invalid_url') rescue nil
131
-
132
- expect(Faraday)
133
- .to receive(:new)
134
- .with(headers: {'test-header' => 'test-value'})
127
+ expect_any_instance_of(Net::HTTP)
128
+ .to receive(:request_get)
129
+ .with(anything, a_hash_including('test-header' => 'test-value'))
135
130
  .and_call_original
136
131
 
137
132
  file_information = FormatParser.parse_http(
@@ -4,130 +4,186 @@ describe FormatParser::RemoteIO do
4
4
  it_behaves_like 'an IO object compatible with IOConstraint'
5
5
 
6
6
  it 'returns the partial content when the server supplies a 206 status' do
7
- rio = described_class.new('https://images.invalid/img.jpg')
7
+ url = 'https://images.invalid/img.jpg'
8
+ response = Net::HTTPPartialContent.new('2', '206', 'Partial Content')
9
+ response['Content-Range'] = '10-109/2577'
10
+ allow(response).to receive(:body).and_return('Response body')
8
11
 
9
- fake_resp = double(headers: {'Content-Range' => '10-109/2577'}, status: 206, body: 'This is the response')
10
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
11
- allow(Faraday).to receive(:new).and_return(faraday_conn)
12
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=10-109')
12
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
13
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
13
14
 
15
+ expect(Net::HTTP).to receive(:request_get).with(
16
+ an_object_satisfying { |uri| URI::HTTPS === uri && uri.to_s == url },
17
+ a_hash_including('range' => 'bytes=10-109')
18
+ )
19
+
20
+ rio = described_class.new(url)
14
21
  rio.seek(10)
15
22
  read_result = rio.read(100)
16
- expect(read_result).to eq('This is the response')
23
+
24
+ expect(read_result).to eq(response.body)
17
25
  end
18
26
 
19
27
  it 'returns the entire content when the server supplies the Content-Range response but sends a 200 status' do
20
- rio = described_class.new('https://images.invalid/img.jpg')
28
+ url = 'https://images.invalid/img.jpg'
29
+ response = Net::HTTPOK.new('2', '200', 'OK')
30
+ allow(response).to receive(:body).and_return('Response body')
31
+
32
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
33
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
21
34
 
22
- fake_resp = double(headers: {'Content-Range' => '10-109/2577'}, status: 200, body: 'This is the response')
23
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
24
- allow(Faraday).to receive(:new).and_return(faraday_conn)
25
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=10-109')
35
+ expect(Net::HTTP).to receive(:request_get).with(
36
+ an_object_satisfying { |uri| URI::HTTPS === uri && uri.to_s == url },
37
+ a_hash_including('range' => 'bytes=10-109')
38
+ )
26
39
 
40
+ rio = described_class.new(url)
27
41
  rio.seek(10)
28
42
  read_result = rio.read(100)
29
- expect(read_result).to eq('This is the response')
43
+
44
+ expect(read_result).to eq(response.body)
30
45
  end
31
46
 
32
47
  it 'raises a specific error for all 4xx responses except 416' do
33
- rio = described_class.new('https://images.invalid/img.jpg')
48
+ url = 'https://images.invalid/img.jpg'
49
+ response = Net::HTTPForbidden.new('2', '403', 'Forbidden')
50
+
51
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
52
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
34
53
 
35
- fake_resp = double(headers: {}, status: 403, body: 'Please log in')
36
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
37
- allow(Faraday).to receive(:new).and_return(faraday_conn)
38
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
54
+ expect(Net::HTTP).to receive(:request_get).with(
55
+ an_object_satisfying { |uri| uri.to_s == url },
56
+ a_hash_including('range' => 'bytes=100-199')
57
+ )
39
58
 
59
+ rio = described_class.new(url)
40
60
  rio.seek(100)
61
+
41
62
  expect { rio.read(100) }.to raise_error(/replied with a 403 and refused/)
42
63
  end
43
64
 
44
65
  it 'returns nil on a 416 response' do
45
- rio = described_class.new('https://images.invalid/img.jpg')
66
+ url = 'https://images.invalid/img.jpg'
67
+ response = Net::HTTPRangeNotSatisfiable.new('2', '416', 'Range Not Satisfiable')
46
68
 
47
- fake_resp = double(headers: {}, status: 416, body: 'You stepped off the ledge of the range')
48
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
49
- allow(Faraday).to receive(:new).and_return(faraday_conn)
50
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
69
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
70
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
51
71
 
72
+ expect(Net::HTTP).to receive(:request_get).with(
73
+ an_object_satisfying { |uri| uri.to_s == url },
74
+ a_hash_including('range' => 'bytes=100-199')
75
+ )
76
+
77
+ rio = described_class.new(url)
52
78
  rio.seek(100)
79
+
53
80
  expect(rio.read(100)).to be_nil
54
81
  end
55
82
 
56
83
  it 'sets the status_code of the exception on a 4xx response from upstream' do
57
- rio = described_class.new('https://images.invalid/img.jpg')
84
+ url = 'https://images.invalid/img.jpg'
85
+ response = Net::HTTPForbidden.new('2', '403', 'Forbidden')
86
+
87
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
88
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
58
89
 
59
- fake_resp = double(headers: {}, status: 403, body: 'Please log in')
60
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
61
- allow(Faraday).to receive(:new).and_return(faraday_conn)
62
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
90
+ expect(Net::HTTP).to receive(:request_get).with(
91
+ an_object_satisfying { |uri| uri.to_s == url },
92
+ a_hash_including('range' => 'bytes=100-199')
93
+ )
63
94
 
95
+ rio = described_class.new(url)
64
96
  rio.seek(100)
65
- # rubocop: disable Lint/AmbiguousBlockAssociation
66
- expect { rio.read(100) }.to raise_error { |e| expect(e.status_code).to eq(403) }
97
+ expect { rio.read(100) }.to(raise_error { |e| expect(e.status_code).to eq(403) })
67
98
  end
68
99
 
69
100
  it 'returns a nil when the range cannot be satisfied and the response is 416' do
70
- rio = described_class.new('https://images.invalid/img.jpg')
101
+ url = 'https://images.invalid/img.jpg'
102
+ response = Net::HTTPRangeNotSatisfiable.new('2', '416', 'Range Not Satisfiable')
71
103
 
72
- fake_resp = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
73
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
74
- allow(Faraday).to receive(:new).and_return(faraday_conn)
75
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
104
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
105
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
76
106
 
107
+ expect(Net::HTTP).to receive(:request_get).with(
108
+ an_object_satisfying { |uri| uri.to_s == url },
109
+ a_hash_including('range' => 'bytes=100-199')
110
+ )
111
+
112
+ rio = described_class.new(url)
77
113
  rio.seek(100)
114
+
78
115
  expect(rio.read(100)).to be_nil
79
116
  end
80
117
 
81
118
  it 'does not overwrite size when the range cannot be satisfied and the response is 416' do
82
- rio = described_class.new('https://images.invalid/img.jpg')
83
-
84
- fake_resp1 = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
85
- fake_resp2 = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
86
-
87
- faraday_conn = instance_double(Faraday::Connection)
88
- allow(Faraday).to receive(:new).and_return(faraday_conn)
89
- expect(faraday_conn).to receive(:get)
90
- .with('https://images.invalid/img.jpg', nil, range: 'bytes=0-0')
119
+ url = 'https://images.invalid/img.jpg'
120
+ response_1 = Net::HTTPPartialContent.new('2', '206', 'Partial Content')
121
+ response_1['Content-Range'] = 'bytes 0-0/13'
122
+ allow(response_1).to receive(:body).and_return('Response body')
123
+ response_2 = Net::HTTPRangeNotSatisfiable.new('2', '416', 'Range Not Satisfiable')
124
+
125
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response_1, response_2)
126
+ allow(Net::HTTP).to receive(:request_get).and_return(response_1, response_2)
127
+
128
+ expect(Net::HTTP).to receive(:request_get)
129
+ .with(
130
+ an_object_satisfying { |uri| uri.to_s == url },
131
+ a_hash_including('range' => 'bytes=0-0')
132
+ )
91
133
  .ordered
92
- .and_return(fake_resp1)
93
- expect(faraday_conn).to receive(:get)
94
- .with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
134
+ expect(Net::HTTP).to receive(:request_get)
135
+ .with(
136
+ an_object_satisfying { |uri| uri.to_s == url },
137
+ a_hash_including('range' => 'bytes=100-199')
138
+ )
95
139
  .ordered
96
- .and_return(fake_resp2)
97
140
 
141
+ rio = described_class.new(url)
98
142
  rio.read(1)
99
143
 
100
144
  expect(rio.size).to eq(13)
101
145
 
102
146
  rio.seek(100)
103
- expect(rio.read(100)).to be_nil
104
147
 
148
+ expect(rio.read(100)).to be_nil
105
149
  expect(rio.size).to eq(13)
106
150
  end
107
151
 
108
152
  it 'raises a specific error for all 5xx responses' do
109
- rio = described_class.new('https://images.invalid/img.jpg')
153
+ url = 'https://images.invalid/img.jpg'
154
+ response = Net::HTTPBadGateway.new('2', '502', 'Bad Gateway')
155
+
156
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
157
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
110
158
 
111
- fake_resp = double(headers: {}, status: 502, body: 'Guru meditation')
112
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
113
- allow(Faraday).to receive(:new).and_return(faraday_conn)
114
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=100-199')
159
+ expect(Net::HTTP).to receive(:request_get).with(
160
+ an_object_satisfying { |uri| uri.to_s == url },
161
+ a_hash_including('range' => 'bytes=100-199')
162
+ )
115
163
 
164
+ rio = described_class.new(url)
116
165
  rio.seek(100)
166
+
117
167
  expect { rio.read(100) }.to raise_error(/replied with a 502 and we might want to retry/)
118
168
  end
119
169
 
120
170
  it 'maintains and exposes #pos' do
121
- rio = described_class.new('https://images.invalid/img.jpg')
171
+ url = 'https://images.invalid/img.jpg'
172
+ response = Net::HTTPPartialContent.new('2', '206', 'Partial Content')
173
+ response['Content-Range'] = 'bytes 0-0/13'
174
+ allow(response).to receive(:body).and_return('a')
122
175
 
123
- expect(rio.pos).to eq(0)
176
+ allow(Net::HTTP).to receive(:start).and_yield(Net::HTTP).and_return(response)
177
+ allow(Net::HTTP).to receive(:request_get).and_return(response)
124
178
 
125
- fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
126
- faraday_conn = instance_double(Faraday::Connection, get: fake_resp)
127
- allow(Faraday).to receive(:new).and_return(faraday_conn)
128
- expect(faraday_conn).to receive(:get).with('https://images.invalid/img.jpg', nil, range: 'bytes=0-0')
129
- rio.read(1)
179
+ expect(Net::HTTP).to receive(:request_get).with(
180
+ an_object_satisfying { |uri| uri.to_s == url },
181
+ a_hash_including('range' => 'bytes=0-0')
182
+ )
130
183
 
184
+ rio = described_class.new(url)
185
+ expect(rio.pos).to eq(0)
186
+ rio.read(1)
131
187
  expect(rio.pos).to eq(1)
132
188
  end
133
189
  end