sinew 2.0.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,22 @@
1
module Sinew
  module Connection
    # Faraday log formatter that emits one 'req' line per outgoing request
    # and stays silent for responses.
    class LogFormatter < Faraday::Logging::Formatter
      # Log the url of an outgoing request at info level. The HTTP method is
      # appended for anything other than :get, and the proxy uri is appended
      # when the request has a proxy.
      def request(env)
        info('req') do
          # Redirect hops are not logged, only the initial request. Note that
          # `return` inside this block leaves #request itself, so the block
          # never produces a message for a redirect.
          return if env[:redirect]

          line = apply_filters(env.url.to_s)
          line = "#{line} (#{env.method})" unless env.method == :get

          proxy = env.request.proxy
          line = "#{line} => #{proxy.uri}" if proxy

          line
        end
      end

      # Responses are deliberately not logged.
      def response(env)
        # silent
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module Sinew
  module Connection
    # Faraday middleware that keeps consecutive requests at least
    # `rate_limit` seconds apart. The gap is measured from the start of the
    # previous request that completed without env[:httpdisk] set; responses
    # carrying env[:httpdisk] (cached responses) do not reset the timer.
    class RateLimit < Faraday::Middleware
      # Minimum number of seconds between two rate-limited requests.
      attr_reader :rate_limit

      # app     - the next middleware in the stack (passed to super)
      # options - Hash; :rate_limit is the minimum gap in seconds (default 1)
      def initialize(app, options = {})
        super(app)

        @last_request_tm = @current_request_tm = nil
        @rate_limit = options.fetch(:rate_limit, 1)
      end

      # Before a request: sleep until `rate_limit` seconds have passed since
      # the last rate-limited request, then record this request's start time.
      def on_request(_env)
        if @last_request_tm
          # Named `delay` (not `sleep`) so the local doesn't shadow Kernel#sleep.
          delay = (@last_request_tm + rate_limit) - monotonic_now
          sleep(delay) if delay.positive?
        end

        @current_request_tm = monotonic_now
      end

      # After a response: remember when the request started, unless the
      # response is flagged with env[:httpdisk].
      def on_complete(env)
        # Only rate limit on uncached requests
        @last_request_tm = @current_request_tm unless env[:httpdisk]
        @current_request_tm = nil
      end

      private

      # Seconds on the monotonic clock. Used instead of Time.now so that
      # wall-clock adjustments cannot shorten or lengthen the computed delay.
      def monotonic_now
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end
    end
  end
end
@@ -30,7 +30,7 @@ class String
30
30
  elsif limit >= size
31
31
  dup
32
32
  else
33
- self[-limit..-1]
33
+ self[-limit..]
34
34
  end
35
35
  end
36
36
 
data/lib/sinew/dsl.rb CHANGED
@@ -1,5 +1,6 @@
1
- require 'awesome_print'
1
+ require 'amazing_print'
2
2
  require 'cgi'
3
+ require 'json'
3
4
 
4
5
  #
5
6
  # The DSL available to .sinew files.
@@ -7,7 +8,10 @@ require 'cgi'
7
8
 
8
9
  module Sinew
9
10
  class DSL
10
- attr_reader :sinew, :raw, :uri, :elapsed
11
+ # this is used to break out of --limit
12
+ class LimitError < StandardError; end
13
+
14
+ attr_reader :sinew, :uri, :raw, :code, :elapsed
11
15
 
12
16
  def initialize(sinew)
13
17
  @sinew = sinew
@@ -15,8 +19,12 @@ module Sinew
15
19
 
16
20
  def run
17
21
  tm = Time.now
18
- recipe = sinew.options[:recipe]
19
- instance_eval(File.read(recipe, mode: 'rb'), recipe)
22
+ begin
23
+ recipe = sinew.options[:recipe]
24
+ instance_eval(File.read(recipe, mode: 'rb'), recipe)
25
+ rescue LimitError
26
+ # ignore - this is flow control for --limit
27
+ end
20
28
  @elapsed = Time.now - tm
21
29
  end
22
30
 
@@ -45,15 +53,17 @@ module Sinew
45
53
  end
46
54
 
47
55
  def http(method, url, options = {})
48
- # reset
49
- @html = @noko = @json = @url = nil
56
+ # these need to be cleared before each request
57
+ %i[@html @noko @xml @json].each do |i|
58
+ instance_variable_set(i, nil)
59
+ end
50
60
 
51
- # fetch
61
+ # fetch and make response available to callers
52
62
  response = sinew.http(method, url, options)
63
+ @uri, @raw, @code = response.uri, response.body, response.code
53
64
 
54
- # respond
55
- @uri = response.uri
56
- @raw = response.body
65
+ # don't confuse the user
66
+ nil
57
67
  end
58
68
 
59
69
  #
@@ -75,6 +85,10 @@ module Sinew
75
85
  @noko ||= Nokogiri::HTML(html)
76
86
  end
77
87
 
88
+ def xml
89
+ @xml ||= Nokogiri::XML(html)
90
+ end
91
+
78
92
  def json
79
93
  @json ||= JSON.parse(raw, symbolize_names: true)
80
94
  end
@@ -93,6 +107,9 @@ module Sinew
93
107
 
94
108
# Emit a row to the output. Once the number of emitted rows equals
# sinew.options[:limit], raise LimitError to stop processing.
def csv_emit(row)
  sinew.output.emit(row)

  limit_reached = sinew.output.count == sinew.options[:limit]
  raise LimitError if limit_reached
end
97
114
  end
98
115
  end
data/lib/sinew/main.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'scripto'
2
+ require 'sinew/connection'
2
3
 
3
4
  #
4
5
  # Main sinew entry point.
@@ -6,15 +7,13 @@ require 'scripto'
6
7
 
7
8
  module Sinew
8
9
  class Main < Scripto::Main
9
- attr_reader :runtime_options, :request_tm, :request_count
10
+ attr_reader :runtime_options
10
11
 
11
12
  def initialize(options)
12
13
  super(options)
13
14
 
14
15
  # init
15
16
  @runtime_options = RuntimeOptions.new
16
- @request_tm = Time.at(0)
17
- @request_count = 0
18
17
  end
19
18
 
20
19
  def run
@@ -31,24 +30,12 @@ module Sinew
31
30
  end
32
31
 
33
32
  #
34
- # http requests and caching
33
+ # http requests
35
34
  #
36
35
 
37
- def cache
38
- @cache ||= Cache.new(self)
39
- end
40
-
41
36
  def http(method, url, options = {})
42
37
  request = Request.new(self, method, url, options)
43
-
44
- # try to get from cache
45
- response = cache.get(request)
46
-
47
- # perform if necessary
48
- if !response
49
- response = perform(request)
50
- cache.set(response)
51
- end
38
+ response = request.perform(connection)
52
39
 
53
40
  # always log error messages
54
41
  if response.error?
@@ -58,26 +45,10 @@ module Sinew
58
45
  response
59
46
  end
60
47
 
61
- def perform(request)
62
- before_perform_request(request)
63
-
64
- response = nil
65
-
66
- tries = runtime_options.retries + 1
67
- while tries > 0
68
- tries -= 1
69
- begin
70
- @request_count += 1
71
- response = request.perform
72
- rescue Timeout::Error
73
- response = Response.from_timeout(request)
74
- end
75
- break if !response.error_500?
76
- end
77
-
78
- response
48
+ def connection
49
+ @connection ||= Connection.create(options: options, runtime_options: runtime_options)
79
50
  end
80
- protected :perform
51
+ protected :connection
81
52
 
82
53
  #
83
54
  # output
@@ -91,24 +62,6 @@ module Sinew
91
62
  # helpers
92
63
  #
93
64
 
94
- def before_perform_request(request)
95
- # log
96
- if !quiet?
97
- msg = if request.method != 'get'
98
- "req #{request.uri} (#{request.method})"
99
- else
100
- "req #{request.uri}"
101
- end
102
- $stderr.puts msg
103
- end
104
-
105
- # rate limit
106
- sleep = (request_tm + runtime_options.rate_limit) - Time.now
107
- sleep(sleep) if sleep > 0
108
- @request_tm = Time.now
109
- end
110
- protected :before_perform_request
111
-
112
65
  def footer
113
66
  output.report
114
67
  finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
data/lib/sinew/output.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'csv'
2
- require 'stringex'
2
+ require 'set'
3
+ require 'sterile'
3
4
 
4
5
  #
5
6
  # CSV output.
@@ -7,11 +8,12 @@ require 'stringex'
7
8
 
8
9
  module Sinew
9
10
  class Output
10
- attr_reader :sinew, :columns, :rows, :csv
11
+ attr_reader :sinew, :columns, :rows, :urls, :csv
11
12
 
12
13
  def initialize(sinew)
13
14
  @sinew = sinew
14
15
  @rows = []
16
+ @urls = Set.new
15
17
  end
16
18
 
17
19
  def filename
@@ -41,6 +43,9 @@ module Sinew
41
43
  # implicit header if necessary
42
44
  header(row.keys) if !csv
43
45
 
46
+ # don't allow duplicate urls
47
+ return if dup_url?(row)
48
+
44
49
  rows << row.dup
45
50
 
46
51
  # map columns to row, and normalize along the way
@@ -94,27 +99,17 @@ module Sinew
94
99
  s.to_s
95
100
  end
96
101
 
97
- #
98
- # Below uses stringex
99
- #
100
- # github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
101
- # github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
102
- #
103
-
104
- # <a>b</a> => b
105
- s = s.strip_html_tags
102
+ # strip html tags. Note that we replace tags with spaces
103
+ s = s.gsub(/<[^>]+>/, ' ')
106
104
 
107
105
  # Converts MS Word 'smart punctuation' to ASCII
108
- s = s.convert_smart_punctuation
106
+ s = Sterile.plain_format(s)
109
107
 
110
- # "&aacute;".convert_accented_html_entities # => "a"
111
- s = s.convert_accented_html_entities
108
+ # &aacute; &amp; etc.
109
+ s = Sterile.decode_entities(s)
112
110
 
113
- # &amp, &frac, etc.
114
- s = s.convert_miscellaneous_html_entities
115
-
116
- # convert unicode => regular characters
117
- s = s.to_ascii
111
+ # "šţɽĩɳģ" => "string"
112
+ s = Sterile.transliterate(s)
118
113
 
119
114
  # squish
120
115
  s = s.squish
@@ -122,5 +117,17 @@ module Sinew
122
117
  s
123
118
  end
124
119
  protected :normalize
120
+
121
# Returns true when row[:url] has been seen before, false otherwise.
# First-seen urls are recorded in `urls`; rows without a :url are never
# treated as duplicates. A warning is issued for each duplicate unless
# sinew is quiet.
def dup_url?(row)
  url = row[:url]
  return false unless url

  # Set#add? returns nil when the url is already present, and otherwise
  # adds it and returns the set.
  return false if urls.add?(url)

  sinew.warning("duplicate url: #{url}") if !sinew.quiet?
  true
end
131
+ protected :dup_url?
125
132
  end
126
133
  end
data/lib/sinew/request.rb CHANGED
@@ -1,9 +1,8 @@
1
1
  require 'digest/md5'
2
- require 'httparty'
3
2
  require 'htmlentities'
4
3
 
5
4
  #
6
- # Process a single HTTP request. Mostly a wrapper around HTTParty.
5
+ # Process a single HTTP request.
7
6
  #
8
7
 
9
8
  module Sinew
@@ -12,29 +11,43 @@ module Sinew
12
11
  class Request
13
12
  HTML_ENTITIES = HTMLEntities.new
14
13
  VALID_METHODS = %w[get post patch put delete head options].freeze
14
+ METHODS_WITH_BODY = %w[patch post put].freeze
15
15
 
16
- attr_reader :sinew, :method, :uri, :options, :cache_key
16
+ attr_reader :sinew, :method, :uri, :options
17
17
 
18
- # Options are largely compatible with HTTParty, except for :method.
18
+ # Supported options:
19
+ # body: Body of http post
20
+ # headers: Hash of HTTP headers (combined with runtime_options.headers)
21
+ # query: Hash of query parameters to add to url
19
22
  def initialize(sinew, method, url, options = {})
20
23
  @sinew = sinew
21
24
  @method = method
22
25
  @options = options.dup
23
26
  @uri = parse_url(url)
24
- @cache_key = calculate_cache_key
27
+ end
28
+
29
# The proxy to use for this request, or nil when sinew.options[:proxy] is
# not set. The option is a comma-separated list; one entry is picked at
# random and remembered for subsequent calls on this request.
def proxy
  @proxy ||= begin
    candidates = sinew.options[:proxy]
    candidates.split(',').sample if candidates
  end
end
26
36
 
27
37
  # run the request, return the result
28
- def perform
38
+ def perform(connection)
29
39
  validate!
30
40
 
31
- # merge global/options headers
32
41
  headers = sinew.runtime_options.headers
33
42
  headers = headers.merge(options[:headers]) if options[:headers]
34
- options[:headers] = headers
35
43
 
36
- party_response = HTTParty.send(method, uri, options)
37
- Response.from_network(self, party_response)
44
+ body = options.delete(:body)
45
+
46
+ fday_response = connection.send(method, uri, body, headers) do
47
+ _1.options[:proxy] = proxy
48
+ end
49
+
50
+ Response.from_network(self, fday_response)
38
51
  end
39
52
 
40
53
  # We accept sloppy urls and attempt to clean them up
@@ -48,11 +61,11 @@ module Sinew
48
61
  s = s.gsub(' ', '%20')
49
62
  s = s.gsub("'", '%27')
50
63
 
51
- # append query manually (instead of letting HTTParty handle it) so we can
52
- # include it in cache_key
64
+ # append query manually (instead of letting Faraday handle it) for consistent
65
+ # Request#uri and Response#uri
53
66
  query = options.delete(:query)
54
67
  if query.present?
55
- q = HTTParty::HashConversions.to_params(query)
68
+ q = Faraday::Utils.default_params_encoder.encode(query)
56
69
  separator = s.include?('?') ? '&' : '?'
57
70
  s = "#{s}#{separator}#{q}"
58
71
  end
@@ -61,44 +74,10 @@ module Sinew
61
74
  end
62
75
  protected :parse_url
63
76
 
64
- def calculate_cache_key
65
- dir = pathify(uri.host)
66
-
67
- body_key = if body.is_a?(Hash)
68
- HTTParty::HashConversions.to_params(body)
69
- else
70
- body&.dup
71
- end
72
-
73
- # build key, as a hash for before_generate_cache_key
74
- key = {
75
- method: method.dup,
76
- path: uri.path,
77
- query: uri.query,
78
- body: body_key,
79
- }
80
- key = sinew.runtime_options.before_generate_cache_key.call(key)
81
-
82
- # strip method for gets
83
- key.delete(:method) if key[:method] == 'get'
84
-
85
- # pull out the values, join and pathify
86
- path = key.values.select(&:present?).join(',')
87
- path = pathify(path)
88
-
89
- # shorten long paths
90
- if path.length > 250
91
- path = Digest::MD5.hexdigest(path)
92
- end
93
-
94
- "#{dir}/#{path}"
95
- end
96
- protected :calculate_cache_key
97
-
98
77
  def validate!
99
78
  raise "invalid method #{method}" if !VALID_METHODS.include?(method)
100
79
  raise "invalid url #{uri}" if uri.scheme !~ /^http/
101
- raise "can't get with a body" if method == 'get' && body
80
+ raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
102
81
  raise "Content-Type doesn't make sense without a body" if content_type && !body
103
82
  end
104
83
  protected :validate!
@@ -134,7 +113,7 @@ module Sinew
134
113
  s = s.gsub(',,', ',')
135
114
  # encode invalid path chars
136
115
  s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |i|
137
- hex = i.unpack('H2').first
116
+ hex = i.unpack1('H2')
138
117
  "%#{hex}"
139
118
  end
140
119
  # handle empty case