sinew 2.0.2 → 3.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +26 -0
  3. data/.rubocop.yml +9 -6
  4. data/.vscode/settings.json +0 -10
  5. data/Gemfile +9 -0
  6. data/README.md +62 -54
  7. data/Rakefile +33 -18
  8. data/bin/sinew +2 -0
  9. data/lib/sinew.rb +0 -1
  10. data/lib/sinew/connection.rb +52 -0
  11. data/lib/sinew/connection/log_formatter.rb +22 -0
  12. data/lib/sinew/connection/rate_limit.rb +29 -0
  13. data/lib/sinew/core_ext.rb +1 -1
  14. data/lib/sinew/dsl.rb +10 -6
  15. data/lib/sinew/main.rb +29 -56
  16. data/lib/sinew/output.rb +7 -16
  17. data/lib/sinew/request.rb +22 -87
  18. data/lib/sinew/response.rb +8 -57
  19. data/lib/sinew/runtime_options.rb +4 -4
  20. data/lib/sinew/version.rb +1 -1
  21. data/sample.sinew +2 -2
  22. data/sinew.gemspec +16 -18
  23. metadata +38 -110
  24. data/.travis.yml +0 -4
  25. data/lib/sinew/cache.rb +0 -79
  26. data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
  27. data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
  28. data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
  29. data/test/legacy/eu.httpbin.org/status,500 +0 -1
  30. data/test/legacy/legacy.sinew +0 -2
  31. data/test/recipes/array_header.sinew +0 -6
  32. data/test/recipes/basic.sinew +0 -8
  33. data/test/recipes/dups.sinew +0 -7
  34. data/test/recipes/implicit_header.sinew +0 -5
  35. data/test/recipes/limit.sinew +0 -11
  36. data/test/recipes/noko.sinew +0 -9
  37. data/test/recipes/uri.sinew +0 -11
  38. data/test/recipes/xml.sinew +0 -8
  39. data/test/test.html +0 -45
  40. data/test/test_cache.rb +0 -69
  41. data/test/test_helper.rb +0 -123
  42. data/test/test_legacy.rb +0 -23
  43. data/test/test_main.rb +0 -34
  44. data/test/test_nokogiri_ext.rb +0 -18
  45. data/test/test_output.rb +0 -56
  46. data/test/test_recipes.rb +0 -60
  47. data/test/test_requests.rb +0 -135
  48. data/test/test_utf8.rb +0 -39
data/lib/sinew/connection/rate_limit.rb ADDED
@@ -0,0 +1,29 @@
1
+ module Sinew
2
+ module Connection
3
+ class RateLimit < Faraday::Middleware
4
+ attr_reader :rate_limit
5
+
6
+ def initialize(app, options = {})
7
+ super(app)
8
+
9
+ @last_request_tm = @current_request_tm = nil
10
+ @rate_limit = options.fetch(:rate_limit, 1)
11
+ end
12
+
13
+ def on_request(_env)
14
+ if @last_request_tm
15
+ sleep = (@last_request_tm + rate_limit) - Time.now
16
+ sleep(sleep) if sleep > 0
17
+ end
18
+
19
+ @current_request_tm = Time.now
20
+ end
21
+
22
+ def on_complete(env)
23
+ # Only rate limit on uncached requests
24
+ @last_request_tm = @current_request_tm unless env[:httpdisk]
25
+ @current_request_tm = nil
26
+ end
27
+ end
28
+ end
29
+ end
data/lib/sinew/core_ext.rb CHANGED
@@ -30,7 +30,7 @@ class String
30
30
  elsif limit >= size
31
31
  dup
32
32
  else
33
- self[-limit..-1]
33
+ self[-limit..]
34
34
  end
35
35
  end
36
36
 
data/lib/sinew/dsl.rb CHANGED
@@ -1,5 +1,6 @@
1
- require 'awesome_print'
1
+ require 'amazing_print'
2
2
  require 'cgi'
3
+ require 'json'
3
4
 
4
5
  #
5
6
  # The DSL available to .sinew files.
@@ -10,7 +11,7 @@ module Sinew
10
11
  # this is used to break out of --limit
11
12
  class LimitError < StandardError; end
12
13
 
13
- attr_reader :sinew, :raw, :uri, :elapsed
14
+ attr_reader :sinew, :uri, :raw, :code, :elapsed
14
15
 
15
16
  def initialize(sinew)
16
17
  @sinew = sinew
@@ -52,14 +53,17 @@ module Sinew
52
53
  end
53
54
 
54
55
  def http(method, url, options = {})
55
- # reset
56
- instance_variables.each do |i|
57
- instance_variable_set(i, nil) if i != :@sinew
56
+ # these need to be cleared before each request
57
+ %i[@html @noko @xml @json].each do |i|
58
+ instance_variable_set(i, nil)
58
59
  end
59
60
 
60
61
  # fetch and make response available to callers
61
62
  response = sinew.http(method, url, options)
62
- @uri, @raw = response.uri, response.body
63
+ @uri, @raw, @code = response.uri, response.body, response.code
64
+
65
+ # don't confuse the user
66
+ nil
63
67
  end
64
68
 
65
69
  #
data/lib/sinew/main.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'scripto'
2
+ require 'sinew/connection'
2
3
 
3
4
  #
4
5
  # Main sinew entry point.
@@ -6,21 +7,13 @@ require 'scripto'
6
7
 
7
8
  module Sinew
8
9
  class Main < Scripto::Main
9
- attr_reader :runtime_options, :request_tm, :request_count
10
+ attr_reader :runtime_options
10
11
 
11
12
  def initialize(options)
12
13
  super(options)
13
14
 
14
15
  # init
15
16
  @runtime_options = RuntimeOptions.new
16
- @request_tm = Time.at(0)
17
- @request_count = 0
18
-
19
- if options[:proxy]
20
- addr, port = options[:proxy].split(':')
21
- runtime_options.httparty_options[:http_proxyaddr] = addr
22
- runtime_options.httparty_options[:http_proxyport] = port || 80
23
- end
24
17
  end
25
18
 
26
19
  def run
@@ -37,24 +30,12 @@ module Sinew
37
30
  end
38
31
 
39
32
  #
40
- # http requests and caching
33
+ # http requests
41
34
  #
42
35
 
43
- def cache
44
- @cache ||= Cache.new(self)
45
- end
46
-
47
36
  def http(method, url, options = {})
48
- request = Request.new(self, method, url, options)
49
-
50
- # try to get from cache
51
- response = cache.get(request)
52
-
53
- # perform if necessary
54
- if !response
55
- response = perform(request)
56
- cache.set(response)
57
- end
37
+ request = Request.new(method, url, request_options(options))
38
+ response = request.perform(connection)
58
39
 
59
40
  # always log error messages
60
41
  if response.error?
@@ -64,26 +45,10 @@ module Sinew
64
45
  response
65
46
  end
66
47
 
67
- def perform(request)
68
- before_perform_request(request)
69
-
70
- response = nil
71
-
72
- tries = runtime_options.retries + 1
73
- while tries > 0
74
- tries -= 1
75
- begin
76
- @request_count += 1
77
- response = request.perform
78
- rescue Timeout::Error
79
- response = Response.from_timeout(request)
80
- end
81
- break if !response.error_500?
82
- end
83
-
84
- response
48
+ def connection
49
+ @connection ||= Connection.create(options: options, runtime_options: runtime_options)
85
50
  end
86
- protected :perform
51
+ protected :connection
87
52
 
88
53
  #
89
54
  # output
@@ -97,23 +62,31 @@ module Sinew
97
62
  # helpers
98
63
  #
99
64
 
100
- def before_perform_request(request)
101
- # log
102
- if !quiet?
103
- msg = if request.method != 'get'
104
- "req #{request.uri} (#{request.method})"
105
- else
106
- "req #{request.uri}"
65
+ def request_options(options)
66
+ options.dup.tap do |req|
67
+ req[:headers] = {}.tap do |h|
68
+ [ runtime_options.headers, options[:headers]].each do
69
+ h.merge!(_1) if _1
70
+ end
107
71
  end
108
- $stderr.puts msg
72
+ req[:proxy] = random_proxy
73
+ end
74
+ end
75
+ protected :request_options
76
+
77
+ PROXY_RE = /\A#{URI::PATTERN::HOST}(:\d+)?\Z/.freeze
78
+
79
+ def random_proxy
80
+ return if !options[:proxy]
81
+
82
+ proxy = options[:proxy].split(',').sample
83
+ if proxy !~ PROXY_RE
84
+ raise ArgumentError, "invalid proxy #{proxy.inspect}, should be host[:port]"
109
85
  end
110
86
 
111
- # rate limit
112
- sleep = (request_tm + runtime_options.rate_limit) - Time.now
113
- sleep(sleep) if sleep > 0
114
- @request_tm = Time.now
87
+ "http://#{proxy}"
115
88
  end
116
- protected :before_perform_request
89
+ protected :random_proxy
117
90
 
118
91
  def footer
119
92
  output.report
data/lib/sinew/output.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  require 'csv'
2
2
  require 'set'
3
- require 'stringex'
3
+ require 'sterile'
4
4
 
5
5
  #
6
6
  # CSV output.
@@ -45,6 +45,7 @@ module Sinew
45
45
 
46
46
  # don't allow duplicate urls
47
47
  return if dup_url?(row)
48
+
48
49
  rows << row.dup
49
50
 
50
51
  # map columns to row, and normalize along the way
@@ -101,24 +102,14 @@ module Sinew
101
102
  # strip html tags. Note that we replace tags with spaces
102
103
  s = s.gsub(/<[^>]+>/, ' ')
103
104
 
104
- #
105
- # Below uses stringex
106
- #
107
- # github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
108
- # github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
109
- #
110
-
111
105
  # Converts MS Word 'smart punctuation' to ASCII
112
- s = s.convert_smart_punctuation
113
-
114
- # "&aacute;".convert_accented_html_entities # => "a"
115
- s = s.convert_accented_html_entities
106
+ s = Sterile.plain_format(s)
116
107
 
117
- # &amp, &frac, etc.
118
- s = s.convert_miscellaneous_html_entities
108
+ # &aacute; &amp; etc.
109
+ s = Sterile.decode_entities(s)
119
110
 
120
- # convert unicode => regular characters
121
- s = s.to_ascii
111
+ # "šţɽĩɳģ" => "string"
112
+ s = Sterile.transliterate(s)
122
113
 
123
114
  # squish
124
115
  s = s.squish
data/lib/sinew/request.rb CHANGED
@@ -1,61 +1,57 @@
1
- require 'digest/md5'
2
- require 'httparty'
3
- require 'htmlentities'
1
+ require 'sterile'
4
2
 
5
3
  #
6
- # Process a single HTTP request. Mostly a wrapper around HTTParty.
4
+ # Process a single HTTP request.
7
5
  #
8
6
 
9
7
  module Sinew
10
8
  class Error < StandardError; end
11
9
 
12
10
  class Request
13
- HTML_ENTITIES = HTMLEntities.new
14
11
  VALID_METHODS = %w[get post patch put delete head options].freeze
12
+ METHODS_WITH_BODY = %w[patch post put].freeze
15
13
 
16
- attr_reader :sinew, :method, :uri, :options, :cache_key
14
+ attr_reader :method, :options, :uri
17
15
 
18
- # Options are largely compatible with HTTParty, except for :method.
19
- def initialize(sinew, method, url, options = {})
20
- @sinew = sinew
16
+ # Supported options:
17
+ # body: Body of http post
18
+ # headers: Hash of HTTP headers (combined with runtime_options.headers)
19
+ # query: Hash of query parameters to add to url
20
+ def initialize(method, url, options = {})
21
21
  @method = method
22
22
  @options = options.dup
23
23
  @uri = parse_url(url)
24
- @cache_key = calculate_cache_key
25
24
  end
26
25
 
27
26
  # run the request, return the result
28
- def perform
27
+ def perform(connection)
29
28
  validate!
30
29
 
31
- # merge optons
32
- options = self.options.merge(sinew.runtime_options.httparty_options)
33
-
34
- # merge headers
35
- headers = sinew.runtime_options.headers
36
- headers = headers.merge(options[:headers]) if options[:headers]
37
- options[:headers] = headers
30
+ body = options.delete(:body)
31
+ fday_response = connection.send(method, uri, body) do
32
+ _1.headers.update(options[:headers]) if options[:headers]
33
+ _1.options[:proxy] = options[:proxy]
34
+ end
38
35
 
39
- party_response = HTTParty.send(method, uri, options)
40
- Response.from_network(self, party_response)
36
+ Response.from_network(self, fday_response)
41
37
  end
42
38
 
43
39
  # We accept sloppy urls and attempt to clean them up
44
40
  def parse_url(url)
45
- s = url
41
+ s = url.to_s
46
42
 
47
43
  # remove entities
48
- s = HTML_ENTITIES.decode(s)
44
+ s = Sterile.decode_entities(s)
49
45
 
50
46
  # fix a couple of common encoding bugs
51
47
  s = s.gsub(' ', '%20')
52
48
  s = s.gsub("'", '%27')
53
49
 
54
- # append query manually (instead of letting HTTParty handle it) so we can
55
- # include it in cache_key
50
+ # append query manually (instead of letting Faraday handle it) for consistent
51
+ # Request#uri and Response#uri
56
52
  query = options.delete(:query)
57
53
  if query.present?
58
- q = HTTParty::HashConversions.to_params(query)
54
+ q = Faraday::Utils.default_params_encoder.encode(query)
59
55
  separator = s.include?('?') ? '&' : '?'
60
56
  s = "#{s}#{separator}#{q}"
61
57
  end
@@ -64,44 +60,10 @@ module Sinew
64
60
  end
65
61
  protected :parse_url
66
62
 
67
- def calculate_cache_key
68
- dir = pathify(uri.host)
69
-
70
- body_key = if body.is_a?(Hash)
71
- HTTParty::HashConversions.to_params(body)
72
- else
73
- body&.dup
74
- end
75
-
76
- # build key, as a hash for before_generate_cache_key
77
- key = {
78
- method: method.dup,
79
- path: uri.path,
80
- query: uri.query,
81
- body: body_key,
82
- }
83
- key = sinew.runtime_options.before_generate_cache_key.call(key)
84
-
85
- # strip method for gets
86
- key.delete(:method) if key[:method] == 'get'
87
-
88
- # pull out the values, join and pathify
89
- path = key.values.select(&:present?).join(',')
90
- path = pathify(path)
91
-
92
- # shorten long paths
93
- if path.length > 250
94
- path = Digest::MD5.hexdigest(path)
95
- end
96
-
97
- "#{dir}/#{path}"
98
- end
99
- protected :calculate_cache_key
100
-
101
63
  def validate!
102
64
  raise "invalid method #{method}" if !VALID_METHODS.include?(method)
103
65
  raise "invalid url #{uri}" if uri.scheme !~ /^http/
104
- raise "can't get with a body" if method == 'get' && body
66
+ raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
105
67
  raise "Content-Type doesn't make sense without a body" if content_type && !body
106
68
  end
107
69
  protected :validate!
@@ -120,32 +82,5 @@ module Sinew
120
82
  headers && headers['Content-Type']
121
83
  end
122
84
  protected :content_type
123
-
124
- def form?
125
- content_type == 'application/x-www-form-urlencoded'
126
- end
127
- protected :form?
128
-
129
- def pathify(s)
130
- # remove leading slash
131
- s = s.gsub(/^\//, '')
132
- # .. => comma
133
- s = s.gsub('..', ',')
134
- # query separators => comma
135
- s = s.gsub(/[?\/&]/, ',')
136
- # ,, => comma
137
- s = s.gsub(',,', ',')
138
- # encode invalid path chars
139
- s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |i|
140
- hex = i.unpack('H2').first
141
- "%#{hex}"
142
- end
143
- # handle empty case
144
- s = '_root_' if s.blank?
145
- # always downcase
146
- s = s.downcase
147
- s
148
- end
149
- protected :pathify
150
85
  end
151
86
  end
data/lib/sinew/response.rb CHANGED
@@ -2,7 +2,7 @@ require 'stringio'
2
2
  require 'zlib'
3
3
 
4
4
  #
5
- # An HTTP response. Mostly a wrapper around HTTParty.
5
+ # An HTTP response.
6
6
  #
7
7
 
8
8
  module Sinew
@@ -13,62 +13,13 @@ module Sinew
13
13
  # factory methods
14
14
  #
15
15
 
16
- def self.from_network(request, party_response)
17
- Response.new.tap do |response|
18
- response.request = request
19
- response.uri = party_response.request.last_uri
20
- response.code = party_response.code
21
- response.headers = party_response.headers.to_h
22
- response.body = process_body(party_response)
23
- end
24
- end
25
-
26
- def self.from_cache(request, body, head)
27
- Response.new.tap do |response|
28
- response.request = request
29
- response.body = body
30
-
31
- # defaults
32
- response.uri = request.uri
33
- response.code = 200
34
- response.headers = {}
35
-
36
- # overwrite with cached response headers
37
- if head
38
- if head !~ /^{/
39
- return from_legacy_head(response, head)
40
- end
41
- head = JSON.parse(head, symbolize_names: true)
42
- response.uri = URI.parse(head[:uri])
43
- response.code = head[:code]
44
- response.headers = head[:headers]
45
- end
46
- end
47
- end
48
-
49
- def self.from_timeout(request)
50
- Response.new.tap do |response|
51
- response.request = request
52
- response.uri = request.uri
53
- response.body = 'timeout'
54
- response.code = 999
55
- response.headers = {}
56
- end
57
- end
58
-
59
- def self.from_legacy_head(response, head)
60
- response.tap do |r|
61
- case head
62
- when /\ACURLER_ERROR/
63
- # error
64
- r.code = 999
65
- when /\AHTTP/
66
- # redirect
67
- location = head.scan(/Location: ([^\r\n]+)/).flatten.last
68
- r.uri += location
69
- else
70
- $stderr.puts "unknown cached /head for #{r.uri}"
71
- end
16
+ def self.from_network(request, fday_response)
17
+ Response.new.tap do
18
+ _1.request = request
19
+ _1.uri = fday_response.env.url
20
+ _1.code = fday_response.status
21
+ _1.headers = fday_response.headers.to_h
22
+ _1.body = process_body(fday_response)
72
23
  end
73
24
  end
74
25