sinew 3.0.1 → 4.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +4 -8
- data/.gitignore +3 -5
- data/.rubocop.yml +13 -48
- data/Gemfile +9 -9
- data/Gemfile.lock +132 -0
- data/LICENSE +1 -1
- data/README.md +113 -48
- data/Rakefile +3 -51
- data/bin/sinew +13 -41
- data/justfile +59 -0
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +252 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +3 -2
- data/lib/sinew/nokogiri_ext.rb +13 -22
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/lib/sinew.rb +23 -9
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +24 -20
- metadata +56 -31
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
data/lib/sinew/dsl.rb
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
require 'amazing_print'
|
2
|
-
require 'cgi'
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
#
|
6
|
-
# The DSL available to .sinew files.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class DSL
|
11
|
-
# this is used to break out of --limit
|
12
|
-
class LimitError < StandardError; end
|
13
|
-
|
14
|
-
attr_reader :sinew, :uri, :raw, :code, :elapsed
|
15
|
-
|
16
|
-
def initialize(sinew)
|
17
|
-
@sinew = sinew
|
18
|
-
end
|
19
|
-
|
20
|
-
def run
|
21
|
-
tm = Time.now
|
22
|
-
begin
|
23
|
-
recipe = sinew.options[:recipe]
|
24
|
-
instance_eval(File.read(recipe, mode: 'rb'), recipe)
|
25
|
-
rescue LimitError
|
26
|
-
# ignore - this is flow control for --limit
|
27
|
-
end
|
28
|
-
@elapsed = Time.now - tm
|
29
|
-
end
|
30
|
-
|
31
|
-
#
|
32
|
-
# request
|
33
|
-
#
|
34
|
-
|
35
|
-
def get(url, query = {})
|
36
|
-
http('get', url, query: query)
|
37
|
-
end
|
38
|
-
|
39
|
-
def post(url, form = {})
|
40
|
-
body = form
|
41
|
-
headers = {
|
42
|
-
'Content-Type' => 'application/x-www-form-urlencoded',
|
43
|
-
}
|
44
|
-
http('post', url, body: body, headers: headers)
|
45
|
-
end
|
46
|
-
|
47
|
-
def post_json(url, json = {})
|
48
|
-
body = json.to_json
|
49
|
-
headers = {
|
50
|
-
'Content-Type' => 'application/json',
|
51
|
-
}
|
52
|
-
http('post', url, body: body, headers: headers)
|
53
|
-
end
|
54
|
-
|
55
|
-
def http(method, url, options = {})
|
56
|
-
# these need to be cleared before each request
|
57
|
-
%i[@html @noko @xml @json].each do |i|
|
58
|
-
instance_variable_set(i, nil)
|
59
|
-
end
|
60
|
-
|
61
|
-
# fetch and make response available to callers
|
62
|
-
response = sinew.http(method, url, options)
|
63
|
-
@uri, @raw, @code = response.uri, response.body, response.code
|
64
|
-
|
65
|
-
# don't confuse the user
|
66
|
-
nil
|
67
|
-
end
|
68
|
-
|
69
|
-
#
|
70
|
-
# response
|
71
|
-
#
|
72
|
-
|
73
|
-
def html
|
74
|
-
@html ||= begin
|
75
|
-
s = raw.dup
|
76
|
-
# squish!
|
77
|
-
s.squish!
|
78
|
-
# kill whitespace around tags
|
79
|
-
s.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
|
80
|
-
s
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def noko
|
85
|
-
@noko ||= Nokogiri::HTML(html)
|
86
|
-
end
|
87
|
-
|
88
|
-
def xml
|
89
|
-
@xml ||= Nokogiri::XML(html)
|
90
|
-
end
|
91
|
-
|
92
|
-
def json
|
93
|
-
@json ||= JSON.parse(raw, symbolize_names: true)
|
94
|
-
end
|
95
|
-
|
96
|
-
def url
|
97
|
-
uri.to_s
|
98
|
-
end
|
99
|
-
|
100
|
-
#
|
101
|
-
# csv
|
102
|
-
#
|
103
|
-
|
104
|
-
def csv_header(*args)
|
105
|
-
sinew.output.header(args)
|
106
|
-
end
|
107
|
-
|
108
|
-
def csv_emit(row)
|
109
|
-
sinew.output.emit(row)
|
110
|
-
if sinew.output.count == sinew.options[:limit]
|
111
|
-
raise LimitError.new
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
end
|
data/lib/sinew/output.rb
DELETED
@@ -1,133 +0,0 @@
|
|
1
|
-
require 'csv'
|
2
|
-
require 'set'
|
3
|
-
require 'sterile'
|
4
|
-
|
5
|
-
#
|
6
|
-
# CSV output.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class Output
|
11
|
-
attr_reader :sinew, :columns, :rows, :urls, :csv
|
12
|
-
|
13
|
-
def initialize(sinew)
|
14
|
-
@sinew = sinew
|
15
|
-
@rows = []
|
16
|
-
@urls = Set.new
|
17
|
-
end
|
18
|
-
|
19
|
-
def filename
|
20
|
-
@filename ||= begin
|
21
|
-
recipe = sinew.options[:recipe]
|
22
|
-
ext = File.extname(recipe)
|
23
|
-
if ext.empty?
|
24
|
-
"#{recipe}.csv"
|
25
|
-
else
|
26
|
-
recipe.gsub(ext, '.csv')
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def header(columns)
|
32
|
-
sinew.banner("Writing to #{filename}...") if !sinew.quiet?
|
33
|
-
|
34
|
-
columns = columns.flatten
|
35
|
-
@columns = columns
|
36
|
-
|
37
|
-
# open csv, write header row
|
38
|
-
@csv = CSV.open(filename, 'wb')
|
39
|
-
csv << columns
|
40
|
-
end
|
41
|
-
|
42
|
-
def emit(row)
|
43
|
-
# implicit header if necessary
|
44
|
-
header(row.keys) if !csv
|
45
|
-
|
46
|
-
# don't allow duplicate urls
|
47
|
-
return if dup_url?(row)
|
48
|
-
|
49
|
-
rows << row.dup
|
50
|
-
|
51
|
-
# map columns to row, and normalize along the way
|
52
|
-
print = {}
|
53
|
-
row = columns.map do |i|
|
54
|
-
value = normalize(row[i])
|
55
|
-
print[i] = value if value.present?
|
56
|
-
value
|
57
|
-
end
|
58
|
-
|
59
|
-
# print
|
60
|
-
sinew.vputs print.ai
|
61
|
-
|
62
|
-
csv << row
|
63
|
-
csv.flush
|
64
|
-
end
|
65
|
-
|
66
|
-
def count
|
67
|
-
rows.length
|
68
|
-
end
|
69
|
-
|
70
|
-
def report
|
71
|
-
return if count == 0
|
72
|
-
|
73
|
-
sinew.banner("Got #{count} rows.")
|
74
|
-
|
75
|
-
# calculate counts
|
76
|
-
counts = Hash.new(0)
|
77
|
-
rows.each do |row|
|
78
|
-
row.each_pair { |k, v| counts[k] += 1 if v.present? }
|
79
|
-
end
|
80
|
-
# sort by counts
|
81
|
-
cols = columns.sort_by { |i| [ -counts[i], i ] }
|
82
|
-
|
83
|
-
# report
|
84
|
-
len = cols.map { |i| i.to_s.length }.max
|
85
|
-
fmt = " %-#{len + 1}s %7d / %-7d %6.1f%%\n"
|
86
|
-
cols.each do |col|
|
87
|
-
$stderr.printf(fmt, col, counts[col], count, counts[col] * 100.0 / count)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def normalize(s)
|
92
|
-
# noko/array/misc => string
|
93
|
-
s = case s
|
94
|
-
when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
|
95
|
-
s.inner_html
|
96
|
-
when Array
|
97
|
-
s.map(&:to_s).join('|')
|
98
|
-
else
|
99
|
-
s.to_s
|
100
|
-
end
|
101
|
-
|
102
|
-
# strip html tags. Note that we replace tags with spaces
|
103
|
-
s = s.gsub(/<[^>]+>/, ' ')
|
104
|
-
|
105
|
-
# Converts MS Word 'smart punctuation' to ASCII
|
106
|
-
s = Sterile.plain_format(s)
|
107
|
-
|
108
|
-
# á & etc.
|
109
|
-
s = Sterile.decode_entities(s)
|
110
|
-
|
111
|
-
# "šţɽĩɳģ" => "string"
|
112
|
-
s = Sterile.transliterate(s)
|
113
|
-
|
114
|
-
# squish
|
115
|
-
s = s.squish
|
116
|
-
|
117
|
-
s
|
118
|
-
end
|
119
|
-
protected :normalize
|
120
|
-
|
121
|
-
def dup_url?(row)
|
122
|
-
if url = row[:url]
|
123
|
-
if urls.include?(url)
|
124
|
-
sinew.warning("duplicate url: #{url}") if !sinew.quiet?
|
125
|
-
return true
|
126
|
-
end
|
127
|
-
urls << url
|
128
|
-
end
|
129
|
-
false
|
130
|
-
end
|
131
|
-
protected :dup_url?
|
132
|
-
end
|
133
|
-
end
|
data/lib/sinew/request.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
require 'sterile'
|
2
|
-
|
3
|
-
#
|
4
|
-
# Process a single HTTP request.
|
5
|
-
#
|
6
|
-
|
7
|
-
module Sinew
|
8
|
-
class Error < StandardError; end
|
9
|
-
|
10
|
-
class Request
|
11
|
-
VALID_METHODS = %w[get post patch put delete head options].freeze
|
12
|
-
METHODS_WITH_BODY = %w[patch post put].freeze
|
13
|
-
|
14
|
-
attr_reader :method, :options, :uri
|
15
|
-
|
16
|
-
# Supported options:
|
17
|
-
# body: Body of http post
|
18
|
-
# headers: Hash of HTTP headers (combined with runtime_options.headers)
|
19
|
-
# query: Hash of query parameters to add to url
|
20
|
-
def initialize(method, url, options = {})
|
21
|
-
@method = method
|
22
|
-
@options = options.dup
|
23
|
-
@uri = parse_url(url)
|
24
|
-
end
|
25
|
-
|
26
|
-
# run the request, return the result
|
27
|
-
def perform(connection)
|
28
|
-
validate!
|
29
|
-
|
30
|
-
body = options.delete(:body)
|
31
|
-
fday_response = connection.send(method, uri, body) do
|
32
|
-
_1.headers.update(options[:headers]) if options[:headers]
|
33
|
-
_1.options[:proxy] = options[:proxy]
|
34
|
-
end
|
35
|
-
|
36
|
-
Response.from_network(self, fday_response)
|
37
|
-
end
|
38
|
-
|
39
|
-
# We accept sloppy urls and attempt to clean them up
|
40
|
-
def parse_url(url)
|
41
|
-
s = url.to_s
|
42
|
-
|
43
|
-
# remove entities
|
44
|
-
s = Sterile.decode_entities(s)
|
45
|
-
|
46
|
-
# fix a couple of common encoding bugs
|
47
|
-
s = s.gsub(' ', '%20')
|
48
|
-
s = s.gsub("'", '%27')
|
49
|
-
|
50
|
-
# append query manually (instead of letting Faraday handle it) for consistent
|
51
|
-
# Request#uri and Response#uri
|
52
|
-
query = options.delete(:query)
|
53
|
-
if query.present?
|
54
|
-
q = Faraday::Utils.default_params_encoder.encode(query)
|
55
|
-
separator = s.include?('?') ? '&' : '?'
|
56
|
-
s = "#{s}#{separator}#{q}"
|
57
|
-
end
|
58
|
-
|
59
|
-
URI.parse(s)
|
60
|
-
end
|
61
|
-
protected :parse_url
|
62
|
-
|
63
|
-
def validate!
|
64
|
-
raise "invalid method #{method}" if !VALID_METHODS.include?(method)
|
65
|
-
raise "invalid url #{uri}" if uri.scheme !~ /^http/
|
66
|
-
raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
|
67
|
-
raise "Content-Type doesn't make sense without a body" if content_type && !body
|
68
|
-
end
|
69
|
-
protected :validate!
|
70
|
-
|
71
|
-
def body
|
72
|
-
options[:body]
|
73
|
-
end
|
74
|
-
protected :body
|
75
|
-
|
76
|
-
def headers
|
77
|
-
options[:headers]
|
78
|
-
end
|
79
|
-
protected :headers
|
80
|
-
|
81
|
-
def content_type
|
82
|
-
headers && headers['Content-Type']
|
83
|
-
end
|
84
|
-
protected :content_type
|
85
|
-
end
|
86
|
-
end
|
data/lib/sinew/runtime_options.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Runtime options that sinew files can modify.
|
3
|
-
#
|
4
|
-
|
5
|
-
module Sinew
|
6
|
-
class RuntimeOptions
|
7
|
-
attr_accessor :retries
|
8
|
-
attr_accessor :rate_limit
|
9
|
-
attr_accessor :headers
|
10
|
-
attr_accessor :httpdisk_options
|
11
|
-
attr_accessor :insecure
|
12
|
-
|
13
|
-
def initialize
|
14
|
-
self.retries = 3
|
15
|
-
self.rate_limit = 1
|
16
|
-
self.headers = {
|
17
|
-
'User-Agent' => "sinew/#{VERSION}",
|
18
|
-
}
|
19
|
-
self.httpdisk_options = {}
|
20
|
-
self.insecure = false
|
21
|
-
|
22
|
-
# for testing
|
23
|
-
if ENV['SINEW_TEST']
|
24
|
-
self.rate_limit = 0
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|