sinew 2.0.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ module Sinew
2
+ module Connection
3
+ class LogFormatter < Faraday::Logging::Formatter
4
+ def request(env)
5
+ info('req') do
6
+ # Only log the initial request, not the redirects
7
+ return if env[:redirect]
8
+
9
+ msg = apply_filters(env.url.to_s)
10
+ msg = "#{msg} (#{env.method})" if env.method != :get
11
+ msg = "#{msg} => #{env.request.proxy.uri}" if env.request.proxy
12
+
13
+ msg
14
+ end
15
+ end
16
+
17
+ def response(env)
18
+ # silent
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,29 @@
1
+ module Sinew
2
+ module Connection
3
+ class RateLimit < Faraday::Middleware
4
+ attr_reader :rate_limit
5
+
6
+ def initialize(app, options = {})
7
+ super(app)
8
+
9
+ @last_request_tm = @current_request_tm = nil
10
+ @rate_limit = options.fetch(:rate_limit, 1)
11
+ end
12
+
13
+ def on_request(_env)
14
+ if @last_request_tm
15
+ sleep = (@last_request_tm + rate_limit) - Time.now
16
+ sleep(sleep) if sleep > 0
17
+ end
18
+
19
+ @current_request_tm = Time.now
20
+ end
21
+
22
+ def on_complete(env)
23
+ # Only rate limit on uncached requests
24
+ @last_request_tm = @current_request_tm unless env[:httpdisk]
25
+ @current_request_tm = nil
26
+ end
27
+ end
28
+ end
29
+ end
@@ -30,7 +30,7 @@ class String
30
30
  elsif limit >= size
31
31
  dup
32
32
  else
33
- self[-limit..-1]
33
+ self[-limit..]
34
34
  end
35
35
  end
36
36
 
data/lib/sinew/dsl.rb CHANGED
@@ -1,5 +1,6 @@
1
- require 'awesome_print'
1
+ require 'amazing_print'
2
2
  require 'cgi'
3
+ require 'json'
3
4
 
4
5
  #
5
6
  # The DSL available to .sinew files.
@@ -7,7 +8,10 @@ require 'cgi'
7
8
 
8
9
  module Sinew
9
10
  class DSL
10
- attr_reader :sinew, :raw, :uri, :elapsed
11
+ # this is used to break out of --limit
12
+ class LimitError < StandardError; end
13
+
14
+ attr_reader :sinew, :uri, :raw, :code, :elapsed
11
15
 
12
16
  def initialize(sinew)
13
17
  @sinew = sinew
@@ -15,8 +19,12 @@ module Sinew
15
19
 
16
20
  def run
17
21
  tm = Time.now
18
- recipe = sinew.options[:recipe]
19
- instance_eval(File.read(recipe, mode: 'rb'), recipe)
22
+ begin
23
+ recipe = sinew.options[:recipe]
24
+ instance_eval(File.read(recipe, mode: 'rb'), recipe)
25
+ rescue LimitError
26
+ # ignore - this is flow control for --limit
27
+ end
20
28
  @elapsed = Time.now - tm
21
29
  end
22
30
 
@@ -45,15 +53,17 @@ module Sinew
45
53
  end
46
54
 
47
55
  def http(method, url, options = {})
48
- # reset
49
- @html = @noko = @json = @url = nil
56
+ # these need to be cleared before each request
57
+ %i[@html @noko @xml @json].each do |i|
58
+ instance_variable_set(i, nil)
59
+ end
50
60
 
51
- # fetch
61
+ # fetch and make response available to callers
52
62
  response = sinew.http(method, url, options)
63
+ @uri, @raw, @code = response.uri, response.body, response.code
53
64
 
54
- # respond
55
- @uri = response.uri
56
- @raw = response.body
65
+ # don't confuse the user
66
+ nil
57
67
  end
58
68
 
59
69
  #
@@ -75,6 +85,10 @@ module Sinew
75
85
  @noko ||= Nokogiri::HTML(html)
76
86
  end
77
87
 
88
+ def xml
89
+ @xml ||= Nokogiri::XML(html)
90
+ end
91
+
78
92
  def json
79
93
  @json ||= JSON.parse(raw, symbolize_names: true)
80
94
  end
@@ -93,6 +107,9 @@ module Sinew
93
107
 
94
108
  def csv_emit(row)
95
109
  sinew.output.emit(row)
110
+ if sinew.output.count == sinew.options[:limit]
111
+ raise LimitError.new
112
+ end
96
113
  end
97
114
  end
98
115
  end
data/lib/sinew/main.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'scripto'
2
+ require 'sinew/connection'
2
3
 
3
4
  #
4
5
  # Main sinew entry point.
@@ -6,15 +7,13 @@ require 'scripto'
6
7
 
7
8
  module Sinew
8
9
  class Main < Scripto::Main
9
- attr_reader :runtime_options, :request_tm, :request_count
10
+ attr_reader :runtime_options
10
11
 
11
12
  def initialize(options)
12
13
  super(options)
13
14
 
14
15
  # init
15
16
  @runtime_options = RuntimeOptions.new
16
- @request_tm = Time.at(0)
17
- @request_count = 0
18
17
  end
19
18
 
20
19
  def run
@@ -31,24 +30,12 @@ module Sinew
31
30
  end
32
31
 
33
32
  #
34
- # http requests and caching
33
+ # http requests
35
34
  #
36
35
 
37
- def cache
38
- @cache ||= Cache.new(self)
39
- end
40
-
41
36
  def http(method, url, options = {})
42
37
  request = Request.new(self, method, url, options)
43
-
44
- # try to get from cache
45
- response = cache.get(request)
46
-
47
- # perform if necessary
48
- if !response
49
- response = perform(request)
50
- cache.set(response)
51
- end
38
+ response = request.perform(connection)
52
39
 
53
40
  # always log error messages
54
41
  if response.error?
@@ -58,26 +45,10 @@ module Sinew
58
45
  response
59
46
  end
60
47
 
61
- def perform(request)
62
- before_perform_request(request)
63
-
64
- response = nil
65
-
66
- tries = runtime_options.retries + 1
67
- while tries > 0
68
- tries -= 1
69
- begin
70
- @request_count += 1
71
- response = request.perform
72
- rescue Timeout::Error
73
- response = Response.from_timeout(request)
74
- end
75
- break if !response.error_500?
76
- end
77
-
78
- response
48
+ def connection
49
+ @connection ||= Connection.create(options: options, runtime_options: runtime_options)
79
50
  end
80
- protected :perform
51
+ protected :connection
81
52
 
82
53
  #
83
54
  # output
@@ -91,24 +62,6 @@ module Sinew
91
62
  # helpers
92
63
  #
93
64
 
94
- def before_perform_request(request)
95
- # log
96
- if !quiet?
97
- msg = if request.method != 'get'
98
- "req #{request.uri} (#{request.method})"
99
- else
100
- "req #{request.uri}"
101
- end
102
- $stderr.puts msg
103
- end
104
-
105
- # rate limit
106
- sleep = (request_tm + runtime_options.rate_limit) - Time.now
107
- sleep(sleep) if sleep > 0
108
- @request_tm = Time.now
109
- end
110
- protected :before_perform_request
111
-
112
65
  def footer
113
66
  output.report
114
67
  finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
data/lib/sinew/output.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'csv'
2
- require 'stringex'
2
+ require 'set'
3
+ require 'sterile'
3
4
 
4
5
  #
5
6
  # CSV output.
@@ -7,11 +8,12 @@ require 'stringex'
7
8
 
8
9
  module Sinew
9
10
  class Output
10
- attr_reader :sinew, :columns, :rows, :csv
11
+ attr_reader :sinew, :columns, :rows, :urls, :csv
11
12
 
12
13
  def initialize(sinew)
13
14
  @sinew = sinew
14
15
  @rows = []
16
+ @urls = Set.new
15
17
  end
16
18
 
17
19
  def filename
@@ -41,6 +43,9 @@ module Sinew
41
43
  # implicit header if necessary
42
44
  header(row.keys) if !csv
43
45
 
46
+ # don't allow duplicate urls
47
+ return if dup_url?(row)
48
+
44
49
  rows << row.dup
45
50
 
46
51
  # map columns to row, and normalize along the way
@@ -94,27 +99,17 @@ module Sinew
94
99
  s.to_s
95
100
  end
96
101
 
97
- #
98
- # Below uses stringex
99
- #
100
- # github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
101
- # github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
102
- #
103
-
104
- # <a>b</a> => b
105
- s = s.strip_html_tags
102
+ # strip html tags. Note that we replace tags with spaces
103
+ s = s.gsub(/<[^>]+>/, ' ')
106
104
 
107
105
  # Converts MS Word 'smart punctuation' to ASCII
108
- s = s.convert_smart_punctuation
106
+ s = Sterile.plain_format(s)
109
107
 
110
- # "&aacute;".convert_accented_html_entities # => "a"
111
- s = s.convert_accented_html_entities
108
+ # &aacute; &amp; etc.
109
+ s = Sterile.decode_entities(s)
112
110
 
113
- # &amp, &frac, etc.
114
- s = s.convert_miscellaneous_html_entities
115
-
116
- # convert unicode => regular characters
117
- s = s.to_ascii
111
+ # "šţɽĩɳģ" => "string"
112
+ s = Sterile.transliterate(s)
118
113
 
119
114
  # squish
120
115
  s = s.squish
@@ -122,5 +117,17 @@ module Sinew
122
117
  s
123
118
  end
124
119
  protected :normalize
120
+
121
+ def dup_url?(row)
122
+ if url = row[:url]
123
+ if urls.include?(url)
124
+ sinew.warning("duplicate url: #{url}") if !sinew.quiet?
125
+ return true
126
+ end
127
+ urls << url
128
+ end
129
+ false
130
+ end
131
+ protected :dup_url?
125
132
  end
126
133
  end
data/lib/sinew/request.rb CHANGED
@@ -1,9 +1,8 @@
1
1
  require 'digest/md5'
2
- require 'httparty'
3
2
  require 'htmlentities'
4
3
 
5
4
  #
6
- # Process a single HTTP request. Mostly a wrapper around HTTParty.
5
+ # Process a single HTTP request.
7
6
  #
8
7
 
9
8
  module Sinew
@@ -12,29 +11,43 @@ module Sinew
12
11
  class Request
13
12
  HTML_ENTITIES = HTMLEntities.new
14
13
  VALID_METHODS = %w[get post patch put delete head options].freeze
14
+ METHODS_WITH_BODY = %w[patch post put].freeze
15
15
 
16
- attr_reader :sinew, :method, :uri, :options, :cache_key
16
+ attr_reader :sinew, :method, :uri, :options
17
17
 
18
- # Options are largely compatible with HTTParty, except for :method.
18
+ # Supported options:
19
+ # body: Body of http post
20
+ # headers: Hash of HTTP headers (combined with runtime_options.headers)
21
+ # query: Hash of query parameters to add to url
19
22
  def initialize(sinew, method, url, options = {})
20
23
  @sinew = sinew
21
24
  @method = method
22
25
  @options = options.dup
23
26
  @uri = parse_url(url)
24
- @cache_key = calculate_cache_key
27
+ end
28
+
29
+ def proxy
30
+ @proxy ||= begin
31
+ if proxies = sinew.options[:proxy]
32
+ proxies.split(',').sample
33
+ end
34
+ end
25
35
  end
26
36
 
27
37
  # run the request, return the result
28
- def perform
38
+ def perform(connection)
29
39
  validate!
30
40
 
31
- # merge global/options headers
32
41
  headers = sinew.runtime_options.headers
33
42
  headers = headers.merge(options[:headers]) if options[:headers]
34
- options[:headers] = headers
35
43
 
36
- party_response = HTTParty.send(method, uri, options)
37
- Response.from_network(self, party_response)
44
+ body = options.delete(:body)
45
+
46
+ fday_response = connection.send(method, uri, body, headers) do
47
+ _1.options[:proxy] = proxy
48
+ end
49
+
50
+ Response.from_network(self, fday_response)
38
51
  end
39
52
 
40
53
  # We accept sloppy urls and attempt to clean them up
@@ -48,11 +61,11 @@ module Sinew
48
61
  s = s.gsub(' ', '%20')
49
62
  s = s.gsub("'", '%27')
50
63
 
51
- # append query manually (instead of letting HTTParty handle it) so we can
52
- # include it in cache_key
64
+ # append query manually (instead of letting Faraday handle it) for consistent
65
+ # Request#uri and Response#uri
53
66
  query = options.delete(:query)
54
67
  if query.present?
55
- q = HTTParty::HashConversions.to_params(query)
68
+ q = Faraday::Utils.default_params_encoder.encode(query)
56
69
  separator = s.include?('?') ? '&' : '?'
57
70
  s = "#{s}#{separator}#{q}"
58
71
  end
@@ -61,44 +74,10 @@ module Sinew
61
74
  end
62
75
  protected :parse_url
63
76
 
64
- def calculate_cache_key
65
- dir = pathify(uri.host)
66
-
67
- body_key = if body.is_a?(Hash)
68
- HTTParty::HashConversions.to_params(body)
69
- else
70
- body&.dup
71
- end
72
-
73
- # build key, as a hash for before_generate_cache_key
74
- key = {
75
- method: method.dup,
76
- path: uri.path,
77
- query: uri.query,
78
- body: body_key,
79
- }
80
- key = sinew.runtime_options.before_generate_cache_key.call(key)
81
-
82
- # strip method for gets
83
- key.delete(:method) if key[:method] == 'get'
84
-
85
- # pull out the values, join and pathify
86
- path = key.values.select(&:present?).join(',')
87
- path = pathify(path)
88
-
89
- # shorten long paths
90
- if path.length > 250
91
- path = Digest::MD5.hexdigest(path)
92
- end
93
-
94
- "#{dir}/#{path}"
95
- end
96
- protected :calculate_cache_key
97
-
98
77
  def validate!
99
78
  raise "invalid method #{method}" if !VALID_METHODS.include?(method)
100
79
  raise "invalid url #{uri}" if uri.scheme !~ /^http/
101
- raise "can't get with a body" if method == 'get' && body
80
+ raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
102
81
  raise "Content-Type doesn't make sense without a body" if content_type && !body
103
82
  end
104
83
  protected :validate!
@@ -134,7 +113,7 @@ module Sinew
134
113
  s = s.gsub(',,', ',')
135
114
  # encode invalid path chars
136
115
  s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |i|
137
- hex = i.unpack('H2').first
116
+ hex = i.unpack1('H2')
138
117
  "%#{hex}"
139
118
  end
140
119
  # handle empty case