sinew 3.0.1 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/sinew/dsl.rb DELETED
@@ -1,115 +0,0 @@
1
- require 'amazing_print'
2
- require 'cgi'
3
- require 'json'
4
-
5
- #
6
- # The DSL available to .sinew files.
7
- #
8
-
9
module Sinew
  # The DSL available to .sinew recipe files. The recipe is instance_eval'd
  # against a DSL instance, so every public method here can be called directly
  # from a recipe.
  class DSL
    # Raised by #csv_emit to break out of the recipe once --limit is reached.
    class LimitError < StandardError; end

    # sinew:   the object passed to #initialize; it supplies options, http and output
    # uri/raw/code: uri, body and status code of the most recent response
    # elapsed: seconds spent in the most recent #run
    attr_reader :sinew, :uri, :raw, :code, :elapsed

    def initialize(sinew)
      @sinew = sinew
    end

    # Evaluate the recipe named by sinew.options[:recipe] in the context of
    # this object. Returns the elapsed time in seconds.
    #
    # The duration is measured with the monotonic clock so wall-clock
    # adjustments can't skew it, and it is recorded in an ensure so that
    # #elapsed is populated even when the recipe raises.
    def run
      started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      begin
        recipe = sinew.options[:recipe]
        instance_eval(File.read(recipe, mode: 'rb'), recipe)
      rescue LimitError
        # ignore - this is flow control for --limit
      ensure
        @elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
      end
      @elapsed
    end

    #
    # request
    #

    # GET url with an optional hash of query parameters.
    def get(url, query = {})
      http('get', url, query: query)
    end

    # POST a url-encoded form.
    def post(url, form = {})
      headers = { 'Content-Type' => 'application/x-www-form-urlencoded' }
      http('post', url, body: form, headers: headers)
    end

    # POST a payload serialized with #to_json.
    def post_json(url, json = {})
      headers = { 'Content-Type' => 'application/json' }
      http('post', url, body: json.to_json, headers: headers)
    end

    # Perform a request through sinew.http and expose the response via
    # #uri, #raw and #code. Always returns nil.
    def http(method, url, options = {})
      # memoized views of the previous response must be cleared before each request
      @html = @noko = @xml = @json = nil

      # fetch and make response available to callers
      response = sinew.http(method, url, options)
      @uri = response.uri
      @raw = response.body
      @code = response.code

      # don't confuse the user
      nil
    end

    #
    # response
    #

    # The raw body with whitespace squished and the spaces around tags removed.
    # NOTE(review): String#squish! is not Ruby core; presumably provided by a
    # core extension loaded elsewhere - confirm.
    def html
      @html ||= begin
        s = raw.dup
        # squish!
        s.squish!
        # kill whitespace around tags
        s.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
        s
      end
    end

    # #html parsed with Nokogiri::HTML (Nokogiri is not required in this file).
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    # #html parsed with Nokogiri::XML.
    def xml
      @xml ||= Nokogiri::XML(html)
    end

    # The raw body parsed as JSON, with symbolized keys.
    def json
      @json ||= JSON.parse(raw, symbolize_names: true)
    end

    # The uri of the most recent response, as a string.
    def url
      uri.to_s
    end

    #
    # csv
    #

    # Declare the csv columns explicitly.
    def csv_header(*args)
      sinew.output.header(args)
    end

    # Emit a row, then raise LimitError (rescued by #run) once the output's
    # row count equals sinew.options[:limit].
    def csv_emit(row)
      sinew.output.emit(row)
      raise LimitError if sinew.output.count == sinew.options[:limit]
    end
  end
end
data/lib/sinew/output.rb DELETED
@@ -1,133 +0,0 @@
1
- require 'csv'
2
- require 'set'
3
- require 'sterile'
4
-
5
- #
6
- # CSV output.
7
- #
8
-
9
module Sinew
  # CSV output. Rows are written to a csv file named after the recipe, and
  # kept in memory so #report can summarize how often each column was filled.
  class Output
    # sinew:   the object passed to #initialize
    # columns: column names, set by #header
    # rows:    copies of every emitted row (before normalization)
    # urls:    Set of row[:url] values seen so far, for duplicate detection
    # csv:     the open CSV writer, nil until #header is called
    attr_reader :sinew, :columns, :rows, :urls, :csv

    def initialize(sinew)
      @sinew = sinew
      @rows = []
      @urls = Set.new
    end

    # Path of the csv file: the recipe path with its extension swapped for
    # .csv, or with .csv appended if the recipe has no extension.
    #
    # Only the trailing extension is replaced. Substituting every occurrence
    # of the extension would mangle paths such as "recipes.sinew/site.sinew".
    def filename
      @filename ||= begin
        recipe = sinew.options[:recipe]
        ext = File.extname(recipe)
        "#{recipe.delete_suffix(ext)}.csv"
      end
    end

    # Open the csv file and write the header row.
    def header(columns)
      sinew.banner("Writing to #{filename}...") unless sinew.quiet?

      columns = columns.flatten
      @columns = columns

      # open csv, write header row
      @csv = CSV.open(filename, 'wb')
      csv << columns
    end

    # Write one row (a hash keyed by column). Rows whose :url has already
    # been emitted are skipped.
    def emit(row)
      # implicit header if necessary
      header(row.keys) unless csv

      # don't allow duplicate urls
      return if dup_url?(row)

      rows << row.dup

      # map columns to row, and normalize along the way
      # NOTE(review): #present? and #ai are not Ruby core; presumably they come
      # from core extensions / amazing_print loaded elsewhere - confirm.
      printable = {}
      values = columns.map do |column|
        value = normalize(row[column])
        printable[column] = value if value.present?
        value
      end

      # print
      sinew.vputs printable.ai

      csv << values
      csv.flush
    end

    # Number of rows emitted so far (duplicates excluded).
    def count
      rows.length
    end

    # Print a per-column fill report to $stderr. Does nothing if no rows
    # were emitted.
    def report
      return if count == 0

      sinew.banner("Got #{count} rows.")

      # calculate counts
      counts = Hash.new(0)
      rows.each do |row|
        row.each_pair { |k, v| counts[k] += 1 if v.present? }
      end
      # sort by counts
      cols = columns.sort_by { |i| [ -counts[i], i ] }

      # report
      len = cols.map { |i| i.to_s.length }.max
      fmt = "  %-#{len + 1}s %7d / %-7d %6.1f%%\n"
      cols.each do |col|
        $stderr.printf(fmt, col, counts[col], count, counts[col] * 100.0 / count)
      end
    end

    # Turn a cell value into a clean, single-line string.
    def normalize(s)
      # noko/array/misc => string
      s = case s
      when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
        s.inner_html
      when Array
        s.map(&:to_s).join('|')
      else
        s.to_s
      end

      # strip html tags. Note that we replace tags with spaces
      s = s.gsub(/<[^>]+>/, ' ')

      # Converts MS Word 'smart punctuation' to ASCII
      s = Sterile.plain_format(s)

      # &aacute; &amp; etc.
      s = Sterile.decode_entities(s)

      # "šţɽĩɳģ" => "string"
      s = Sterile.transliterate(s)

      # squish
      s.squish
    end
    protected :normalize

    # True if row[:url] has been seen before; otherwise records it. Rows
    # without a :url are never treated as duplicates.
    def dup_url?(row)
      url = row[:url]
      return false unless url

      if urls.include?(url)
        sinew.warning("duplicate url: #{url}") unless sinew.quiet?
        return true
      end
      urls << url
      false
    end
    protected :dup_url?
  end
end
data/lib/sinew/request.rb DELETED
@@ -1,86 +0,0 @@
1
- require 'sterile'
2
-
3
- #
4
- # Process a single HTTP request.
5
- #
6
-
7
- module Sinew
8
- class Error < StandardError; end
9
-
10
- class Request
11
- VALID_METHODS = %w[get post patch put delete head options].freeze
12
- METHODS_WITH_BODY = %w[patch post put].freeze
13
-
14
- attr_reader :method, :options, :uri
15
-
16
- # Supported options:
17
- # body: Body of http post
18
- # headers: Hash of HTTP headers (combined with runtime_options.headers)
19
- # query: Hash of query parameters to add to url
20
- def initialize(method, url, options = {})
21
- @method = method
22
- @options = options.dup
23
- @uri = parse_url(url)
24
- end
25
-
26
- # run the request, return the result
27
- def perform(connection)
28
- validate!
29
-
30
- body = options.delete(:body)
31
- fday_response = connection.send(method, uri, body) do
32
- _1.headers.update(options[:headers]) if options[:headers]
33
- _1.options[:proxy] = options[:proxy]
34
- end
35
-
36
- Response.from_network(self, fday_response)
37
- end
38
-
39
- # We accept sloppy urls and attempt to clean them up
40
- def parse_url(url)
41
- s = url.to_s
42
-
43
- # remove entities
44
- s = Sterile.decode_entities(s)
45
-
46
- # fix a couple of common encoding bugs
47
- s = s.gsub(' ', '%20')
48
- s = s.gsub("'", '%27')
49
-
50
- # append query manually (instead of letting Faraday handle it) for consistent
51
- # Request#uri and Response#uri
52
- query = options.delete(:query)
53
- if query.present?
54
- q = Faraday::Utils.default_params_encoder.encode(query)
55
- separator = s.include?('?') ? '&' : '?'
56
- s = "#{s}#{separator}#{q}"
57
- end
58
-
59
- URI.parse(s)
60
- end
61
- protected :parse_url
62
-
63
- def validate!
64
- raise "invalid method #{method}" if !VALID_METHODS.include?(method)
65
- raise "invalid url #{uri}" if uri.scheme !~ /^http/
66
- raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
67
- raise "Content-Type doesn't make sense without a body" if content_type && !body
68
- end
69
- protected :validate!
70
-
71
- def body
72
- options[:body]
73
- end
74
- protected :body
75
-
76
- def headers
77
- options[:headers]
78
- end
79
- protected :headers
80
-
81
- def content_type
82
- headers && headers['Content-Type']
83
- end
84
- protected :content_type
85
- end
86
- end
@@ -1,28 +0,0 @@
1
- #
2
- # Runtime options that sinew files can modify.
3
- #
4
-
5
module Sinew
  # Runtime options that sinew files can modify. Each option is a plain
  # read/write attribute that starts out with the default assigned in
  # #initialize.
  class RuntimeOptions
    attr_accessor :retries, :rate_limit, :headers, :httpdisk_options, :insecure

    def initialize
      self.retries = 3
      # for testing: rate_limit is 0 rather than 1 when SINEW_TEST is set
      self.rate_limit = ENV['SINEW_TEST'] ? 0 : 1
      self.headers = { 'User-Agent' => "sinew/#{VERSION}" }
      self.httpdisk_options = {}
      self.insecure = false
    end
  end
end