sinew 3.0.1 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +4 -8
- data/.gitignore +3 -5
- data/.rubocop.yml +13 -48
- data/Gemfile +9 -9
- data/Gemfile.lock +132 -0
- data/LICENSE +1 -1
- data/README.md +113 -48
- data/Rakefile +3 -51
- data/bin/sinew +13 -41
- data/justfile +59 -0
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +252 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +3 -2
- data/lib/sinew/nokogiri_ext.rb +13 -22
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/lib/sinew.rb +23 -9
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +24 -20
- metadata +56 -31
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
data/lib/sinew/dsl.rb
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
require 'amazing_print'
|
2
|
-
require 'cgi'
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
#
|
6
|
-
# The DSL available to .sinew files.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class DSL
|
11
|
-
# this is used to break out of --limit
|
12
|
-
class LimitError < StandardError; end
|
13
|
-
|
14
|
-
attr_reader :sinew, :uri, :raw, :code, :elapsed
|
15
|
-
|
16
|
-
def initialize(sinew)
|
17
|
-
@sinew = sinew
|
18
|
-
end
|
19
|
-
|
20
|
-
def run
|
21
|
-
tm = Time.now
|
22
|
-
begin
|
23
|
-
recipe = sinew.options[:recipe]
|
24
|
-
instance_eval(File.read(recipe, mode: 'rb'), recipe)
|
25
|
-
rescue LimitError
|
26
|
-
# ignore - this is flow control for --limit
|
27
|
-
end
|
28
|
-
@elapsed = Time.now - tm
|
29
|
-
end
|
30
|
-
|
31
|
-
#
|
32
|
-
# request
|
33
|
-
#
|
34
|
-
|
35
|
-
def get(url, query = {})
|
36
|
-
http('get', url, query: query)
|
37
|
-
end
|
38
|
-
|
39
|
-
def post(url, form = {})
|
40
|
-
body = form
|
41
|
-
headers = {
|
42
|
-
'Content-Type' => 'application/x-www-form-urlencoded',
|
43
|
-
}
|
44
|
-
http('post', url, body: body, headers: headers)
|
45
|
-
end
|
46
|
-
|
47
|
-
def post_json(url, json = {})
|
48
|
-
body = json.to_json
|
49
|
-
headers = {
|
50
|
-
'Content-Type' => 'application/json',
|
51
|
-
}
|
52
|
-
http('post', url, body: body, headers: headers)
|
53
|
-
end
|
54
|
-
|
55
|
-
def http(method, url, options = {})
|
56
|
-
# these need to be cleared before each request
|
57
|
-
%i[@html @noko @xml @json].each do |i|
|
58
|
-
instance_variable_set(i, nil)
|
59
|
-
end
|
60
|
-
|
61
|
-
# fetch and make response available to callers
|
62
|
-
response = sinew.http(method, url, options)
|
63
|
-
@uri, @raw, @code = response.uri, response.body, response.code
|
64
|
-
|
65
|
-
# don't confuse the user
|
66
|
-
nil
|
67
|
-
end
|
68
|
-
|
69
|
-
#
|
70
|
-
# response
|
71
|
-
#
|
72
|
-
|
73
|
-
def html
|
74
|
-
@html ||= begin
|
75
|
-
s = raw.dup
|
76
|
-
# squish!
|
77
|
-
s.squish!
|
78
|
-
# kill whitespace around tags
|
79
|
-
s.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
|
80
|
-
s
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def noko
|
85
|
-
@noko ||= Nokogiri::HTML(html)
|
86
|
-
end
|
87
|
-
|
88
|
-
def xml
|
89
|
-
@xml ||= Nokogiri::XML(html)
|
90
|
-
end
|
91
|
-
|
92
|
-
def json
|
93
|
-
@json ||= JSON.parse(raw, symbolize_names: true)
|
94
|
-
end
|
95
|
-
|
96
|
-
def url
|
97
|
-
uri.to_s
|
98
|
-
end
|
99
|
-
|
100
|
-
#
|
101
|
-
# csv
|
102
|
-
#
|
103
|
-
|
104
|
-
def csv_header(*args)
|
105
|
-
sinew.output.header(args)
|
106
|
-
end
|
107
|
-
|
108
|
-
def csv_emit(row)
|
109
|
-
sinew.output.emit(row)
|
110
|
-
if sinew.output.count == sinew.options[:limit]
|
111
|
-
raise LimitError.new
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
end
|
data/lib/sinew/output.rb
DELETED
@@ -1,133 +0,0 @@
|
|
1
|
-
require 'csv'
|
2
|
-
require 'set'
|
3
|
-
require 'sterile'
|
4
|
-
|
5
|
-
#
|
6
|
-
# CSV output.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class Output
|
11
|
-
attr_reader :sinew, :columns, :rows, :urls, :csv
|
12
|
-
|
13
|
-
def initialize(sinew)
|
14
|
-
@sinew = sinew
|
15
|
-
@rows = []
|
16
|
-
@urls = Set.new
|
17
|
-
end
|
18
|
-
|
19
|
-
def filename
|
20
|
-
@filename ||= begin
|
21
|
-
recipe = sinew.options[:recipe]
|
22
|
-
ext = File.extname(recipe)
|
23
|
-
if ext.empty?
|
24
|
-
"#{recipe}.csv"
|
25
|
-
else
|
26
|
-
recipe.gsub(ext, '.csv')
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def header(columns)
|
32
|
-
sinew.banner("Writing to #{filename}...") if !sinew.quiet?
|
33
|
-
|
34
|
-
columns = columns.flatten
|
35
|
-
@columns = columns
|
36
|
-
|
37
|
-
# open csv, write header row
|
38
|
-
@csv = CSV.open(filename, 'wb')
|
39
|
-
csv << columns
|
40
|
-
end
|
41
|
-
|
42
|
-
def emit(row)
|
43
|
-
# implicit header if necessary
|
44
|
-
header(row.keys) if !csv
|
45
|
-
|
46
|
-
# don't allow duplicate urls
|
47
|
-
return if dup_url?(row)
|
48
|
-
|
49
|
-
rows << row.dup
|
50
|
-
|
51
|
-
# map columns to row, and normalize along the way
|
52
|
-
print = {}
|
53
|
-
row = columns.map do |i|
|
54
|
-
value = normalize(row[i])
|
55
|
-
print[i] = value if value.present?
|
56
|
-
value
|
57
|
-
end
|
58
|
-
|
59
|
-
# print
|
60
|
-
sinew.vputs print.ai
|
61
|
-
|
62
|
-
csv << row
|
63
|
-
csv.flush
|
64
|
-
end
|
65
|
-
|
66
|
-
def count
|
67
|
-
rows.length
|
68
|
-
end
|
69
|
-
|
70
|
-
def report
|
71
|
-
return if count == 0
|
72
|
-
|
73
|
-
sinew.banner("Got #{count} rows.")
|
74
|
-
|
75
|
-
# calculate counts
|
76
|
-
counts = Hash.new(0)
|
77
|
-
rows.each do |row|
|
78
|
-
row.each_pair { |k, v| counts[k] += 1 if v.present? }
|
79
|
-
end
|
80
|
-
# sort by counts
|
81
|
-
cols = columns.sort_by { |i| [ -counts[i], i ] }
|
82
|
-
|
83
|
-
# report
|
84
|
-
len = cols.map { |i| i.to_s.length }.max
|
85
|
-
fmt = " %-#{len + 1}s %7d / %-7d %6.1f%%\n"
|
86
|
-
cols.each do |col|
|
87
|
-
$stderr.printf(fmt, col, counts[col], count, counts[col] * 100.0 / count)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def normalize(s)
|
92
|
-
# noko/array/misc => string
|
93
|
-
s = case s
|
94
|
-
when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
|
95
|
-
s.inner_html
|
96
|
-
when Array
|
97
|
-
s.map(&:to_s).join('|')
|
98
|
-
else
|
99
|
-
s.to_s
|
100
|
-
end
|
101
|
-
|
102
|
-
# strip html tags. Note that we replace tags with spaces
|
103
|
-
s = s.gsub(/<[^>]+>/, ' ')
|
104
|
-
|
105
|
-
# Converts MS Word 'smart punctuation' to ASCII
|
106
|
-
s = Sterile.plain_format(s)
|
107
|
-
|
108
|
-
# á & etc.
|
109
|
-
s = Sterile.decode_entities(s)
|
110
|
-
|
111
|
-
# "šţɽĩɳģ" => "string"
|
112
|
-
s = Sterile.transliterate(s)
|
113
|
-
|
114
|
-
# squish
|
115
|
-
s = s.squish
|
116
|
-
|
117
|
-
s
|
118
|
-
end
|
119
|
-
protected :normalize
|
120
|
-
|
121
|
-
def dup_url?(row)
|
122
|
-
if url = row[:url]
|
123
|
-
if urls.include?(url)
|
124
|
-
sinew.warning("duplicate url: #{url}") if !sinew.quiet?
|
125
|
-
return true
|
126
|
-
end
|
127
|
-
urls << url
|
128
|
-
end
|
129
|
-
false
|
130
|
-
end
|
131
|
-
protected :dup_url?
|
132
|
-
end
|
133
|
-
end
|
data/lib/sinew/request.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
require 'sterile'
|
2
|
-
|
3
|
-
#
|
4
|
-
# Process a single HTTP request.
|
5
|
-
#
|
6
|
-
|
7
|
-
module Sinew
|
8
|
-
class Error < StandardError; end
|
9
|
-
|
10
|
-
class Request
|
11
|
-
VALID_METHODS = %w[get post patch put delete head options].freeze
|
12
|
-
METHODS_WITH_BODY = %w[patch post put].freeze
|
13
|
-
|
14
|
-
attr_reader :method, :options, :uri
|
15
|
-
|
16
|
-
# Supported options:
|
17
|
-
# body: Body of http post
|
18
|
-
# headers: Hash of HTTP headers (combined with runtime_options.headers)
|
19
|
-
# query: Hash of query parameters to add to url
|
20
|
-
def initialize(method, url, options = {})
|
21
|
-
@method = method
|
22
|
-
@options = options.dup
|
23
|
-
@uri = parse_url(url)
|
24
|
-
end
|
25
|
-
|
26
|
-
# run the request, return the result
|
27
|
-
def perform(connection)
|
28
|
-
validate!
|
29
|
-
|
30
|
-
body = options.delete(:body)
|
31
|
-
fday_response = connection.send(method, uri, body) do
|
32
|
-
_1.headers.update(options[:headers]) if options[:headers]
|
33
|
-
_1.options[:proxy] = options[:proxy]
|
34
|
-
end
|
35
|
-
|
36
|
-
Response.from_network(self, fday_response)
|
37
|
-
end
|
38
|
-
|
39
|
-
# We accept sloppy urls and attempt to clean them up
|
40
|
-
def parse_url(url)
|
41
|
-
s = url.to_s
|
42
|
-
|
43
|
-
# remove entities
|
44
|
-
s = Sterile.decode_entities(s)
|
45
|
-
|
46
|
-
# fix a couple of common encoding bugs
|
47
|
-
s = s.gsub(' ', '%20')
|
48
|
-
s = s.gsub("'", '%27')
|
49
|
-
|
50
|
-
# append query manually (instead of letting Faraday handle it) for consistent
|
51
|
-
# Request#uri and Response#uri
|
52
|
-
query = options.delete(:query)
|
53
|
-
if query.present?
|
54
|
-
q = Faraday::Utils.default_params_encoder.encode(query)
|
55
|
-
separator = s.include?('?') ? '&' : '?'
|
56
|
-
s = "#{s}#{separator}#{q}"
|
57
|
-
end
|
58
|
-
|
59
|
-
URI.parse(s)
|
60
|
-
end
|
61
|
-
protected :parse_url
|
62
|
-
|
63
|
-
def validate!
|
64
|
-
raise "invalid method #{method}" if !VALID_METHODS.include?(method)
|
65
|
-
raise "invalid url #{uri}" if uri.scheme !~ /^http/
|
66
|
-
raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
|
67
|
-
raise "Content-Type doesn't make sense without a body" if content_type && !body
|
68
|
-
end
|
69
|
-
protected :validate!
|
70
|
-
|
71
|
-
def body
|
72
|
-
options[:body]
|
73
|
-
end
|
74
|
-
protected :body
|
75
|
-
|
76
|
-
def headers
|
77
|
-
options[:headers]
|
78
|
-
end
|
79
|
-
protected :headers
|
80
|
-
|
81
|
-
def content_type
|
82
|
-
headers && headers['Content-Type']
|
83
|
-
end
|
84
|
-
protected :content_type
|
85
|
-
end
|
86
|
-
end
|
data/lib/sinew/runtime_options.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Runtime options that sinew files can modify.
|
3
|
-
#
|
4
|
-
|
5
|
-
module Sinew
|
6
|
-
class RuntimeOptions
|
7
|
-
attr_accessor :retries
|
8
|
-
attr_accessor :rate_limit
|
9
|
-
attr_accessor :headers
|
10
|
-
attr_accessor :httpdisk_options
|
11
|
-
attr_accessor :insecure
|
12
|
-
|
13
|
-
def initialize
|
14
|
-
self.retries = 3
|
15
|
-
self.rate_limit = 1
|
16
|
-
self.headers = {
|
17
|
-
'User-Agent' => "sinew/#{VERSION}",
|
18
|
-
}
|
19
|
-
self.httpdisk_options = {}
|
20
|
-
self.insecure = false
|
21
|
-
|
22
|
-
# for testing
|
23
|
-
if ENV['SINEW_TEST']
|
24
|
-
self.rate_limit = 0
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|