sinew 1.0.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +49 -0
- data/.travis.yml +4 -0
- data/.vscode/extensions.json +3 -0
- data/.vscode/settings.json +15 -0
- data/Gemfile +1 -1
- data/README.md +153 -12
- data/Rakefile +13 -14
- data/bin/sinew +40 -20
- data/lib/sinew.rb +10 -6
- data/lib/sinew/cache.rb +79 -0
- data/lib/sinew/core_ext.rb +59 -0
- data/lib/sinew/dsl.rb +98 -0
- data/lib/sinew/main.rb +80 -149
- data/lib/sinew/nokogiri_ext.rb +10 -9
- data/lib/sinew/output.rb +126 -0
- data/lib/sinew/request.rb +148 -0
- data/lib/sinew/response.rb +75 -0
- data/lib/sinew/runtime_options.rb +26 -0
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +5 -3
- data/sinew.gemspec +24 -19
- data/test/test.html +40 -34
- data/test/test_cache.rb +69 -0
- data/test/test_helper.rb +113 -0
- data/test/test_main.rb +36 -91
- data/test/test_nokogiri_ext.rb +14 -15
- data/test/test_output.rb +73 -0
- data/test/test_requests.rb +135 -0
- data/test/test_utf8.rb +39 -0
- metadata +103 -48
- data/lib/sinew/curler.rb +0 -173
- data/lib/sinew/text_util.rb +0 -101
- data/lib/sinew/util.rb +0 -236
- data/test/helper.rb +0 -64
- data/test/test_curler.rb +0 -70
- data/test/test_text_util.rb +0 -23
data/lib/sinew/output.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'stringex'
|
3
|
+
|
4
|
+
#
|
5
|
+
# CSV output.
|
6
|
+
#
|
7
|
+
|
8
|
+
module Sinew
  #
  # CSV output. Keeps a copy of every emitted row in memory (for the final
  # report) and appends a normalized version of each row to a CSV file named
  # after the recipe.
  #
  class Output
    attr_reader :sinew, :columns, :rows, :csv

    # sinew - the owning object. This class calls #options, #banner, #quiet?
    #         and #vputs on it.
    def initialize(sinew)
      @sinew = sinew
      @rows = []
    end

    # Path of the CSV file: the recipe path with its extension swapped for
    # ".csv", or with ".csv" appended when the recipe has no extension.
    # Memoized.
    def filename
      @filename ||= begin
        recipe = sinew.options[:recipe]
        ext = File.extname(recipe)
        if ext.empty?
          "#{recipe}.csv"
        else
          # Swap only the LAST occurrence of the extension. A global gsub
          # would also rewrite matching text earlier in the path, e.g.
          # "recipes.sinew/site.sinew" => "recipes.csv/site.csv".
          head, _ext, tail = recipe.rpartition(ext)
          "#{head}.csv#{tail}"
        end
      end
    end

    # Record the column list, open the CSV file and write the header row.
    #
    # columns - column names; nested arrays are flattened.
    def header(columns)
      sinew.banner("Writing to #{filename}...") unless sinew.quiet?

      columns = columns.flatten
      @columns = columns

      # open csv, write header row
      @csv = CSV.open(filename, 'wb')
      csv << columns
    end

    # Append one row to the CSV file.
    #
    # row - Hash of column => value. If no header has been written yet, the
    #       row's keys become the columns.
    def emit(row)
      # implicit header if necessary
      header(row.keys) unless csv

      # keep an untouched copy for #count and #report
      rows << row.dup

      # map columns to row, and normalize along the way
      printable = {}
      values = columns.map do |column|
        value = normalize(row[column])
        printable[column] = value if value.present?
        value
      end

      # print only the non-blank values
      sinew.vputs printable.ai

      # flush after every row so the file on disk is always current
      csv << values
      csv.flush
    end

    # Number of rows emitted so far.
    def count
      rows.length
    end

    # Print a per-column fill report to $stderr: for each column, how many
    # rows had a present value. Does nothing when no rows were emitted.
    def report
      return if count == 0

      sinew.banner("Got #{count} rows.")

      # calculate counts
      counts = Hash.new(0)
      rows.each do |row|
        row.each_pair { |key, value| counts[key] += 1 if value.present? }
      end

      # sort by counts (most populated first), then by column
      cols = columns.sort_by { |col| [ -counts[col], col ] }

      # report
      len = cols.map { |col| col.to_s.length }.max
      fmt = "  %-#{len + 1}s %7d / %-7d %6.1f%%\n"
      cols.each do |col|
        $stderr.printf(fmt, col, counts[col], count, counts[col] * 100.0 / count)
      end
    end

    # Turn a cell value into a plain, squished ASCII string.
    #
    # s - a Nokogiri element/node set (its inner_html is used), an Array
    #     (elements joined with '|'), or anything else responding to to_s.
    def normalize(s)
      # noko/array/misc => string
      s = case s
      when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
        s.inner_html
      when Array
        s.map(&:to_s).join('|')
      else
        s.to_s
      end

      #
      # Below uses stringex
      #
      # github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
      # github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
      #

      # <a>b</a> => b
      s = s.strip_html_tags

      # Converts MS Word 'smart punctuation' to ASCII
      s = s.convert_smart_punctuation

      # "&aacute;".convert_accented_html_entities # => "a"
      s = s.convert_accented_html_entities

      # &amp;, &frac, etc.
      s = s.convert_miscellaneous_html_entities

      # convert unicode => regular characters
      s = s.to_ascii

      # squish
      s = s.squish

      s
    end
    protected :normalize
  end
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'httparty'
|
3
|
+
require 'htmlentities'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Process a single HTTP request. Mostly a wrapper around HTTParty.
|
7
|
+
#
|
8
|
+
|
9
|
+
module Sinew
  class Error < StandardError; end

  #
  # A single HTTP request: cleans up the url, computes a cache key from the
  # method/path/query/body, and performs the request through HTTParty.
  #
  class Request
    HTML_ENTITIES = HTMLEntities.new
    VALID_METHODS = %w[get post patch put delete head options].freeze

    attr_reader :sinew, :method, :uri, :options, :cache_key

    # Options are largely compatible with HTTParty, except for :method.
    #
    # sinew   - the owning object; its runtime_options are consulted.
    # method  - HTTP method name, expected to be one of VALID_METHODS.
    # url     - url string; sloppy urls are cleaned up by parse_url.
    # options - request options. Copied, so the caller's hash is untouched.
    #           :query is folded into the uri and removed from the copy.
    def initialize(sinew, method, url, options = {})
      @sinew = sinew
      @method = method
      # must be set before parse_url, which removes :query from it
      @options = options.dup
      @uri = parse_url(url)
      @cache_key = calculate_cache_key
    end

    # run the request, return the result
    #
    # Raises RuntimeError (see validate!) if the request is malformed.
    def perform
      validate!

      # merge global/options headers; per-request headers win
      headers = sinew.runtime_options.headers
      headers = headers.merge(options[:headers]) if options[:headers]
      options[:headers] = headers

      party_response = HTTParty.send(method, uri, options)
      Response.from_network(self, party_response)
    end

    # We accept sloppy urls and attempt to clean them up
    def parse_url(url)
      s = url

      # remove entities
      s = HTML_ENTITIES.decode(s)

      # fix a couple of common encoding bugs
      s = s.gsub(' ', '%20')
      s = s.gsub("'", '%27')

      # append query manually (instead of letting HTTParty handle it) so we can
      # include it in cache_key
      query = options.delete(:query)
      if query.present?
        q = HTTParty::HashConversions.to_params(query)
        separator = s.include?('?') ? '&' : '?'
        s = "#{s}#{separator}#{q}"
      end

      URI.parse(s)
    end
    protected :parse_url

    # Build the cache key, "<host dir>/<path>", from the request's host,
    # method, path, query and body.
    def calculate_cache_key
      # to_s so that a host-less (invalid) url does not blow up here with a
      # NoMethodError; validate! reports it properly when the request runs
      dir = pathify(uri.host.to_s)

      body_key = if body.is_a?(Hash)
        HTTParty::HashConversions.to_params(body)
      else
        body&.dup
      end

      # build key, as a hash for before_generate_cache_key
      key = {
        method: method.dup,
        path: uri.path,
        query: uri.query,
        body: body_key,
      }
      key = sinew.runtime_options.before_generate_cache_key.call(key)

      # strip method for gets
      key.delete(:method) if key[:method] == 'get'

      # pull out the values, join and pathify
      path = key.values.select(&:present?).join(',')
      path = pathify(path)

      # shorten long paths
      if path.length > 250
        path = Digest::MD5.hexdigest(path)
      end

      "#{dir}/#{path}"
    end
    protected :calculate_cache_key

    # Raise RuntimeError unless the method, url scheme, body and
    # Content-Type are mutually consistent.
    def validate!
      raise "invalid method #{method}" if !VALID_METHODS.include?(method)
      raise "invalid url #{uri}" if uri.scheme !~ /^http/
      raise "can't get with a body" if method == 'get' && body
      raise "Content-Type doesn't make sense without a body" if content_type && !body
    end
    protected :validate!

    def body
      options[:body]
    end
    protected :body

    def headers
      options[:headers]
    end
    protected :headers

    def content_type
      headers && headers['Content-Type']
    end
    protected :content_type

    def form?
      content_type == 'application/x-www-form-urlencoded'
    end
    protected :form?

    # Turn an arbitrary string into a lowercase string restricted to
    # [a-z0-9_.,=%-], used as one component of the cache key.
    def pathify(s)
      # remove leading slash
      s = s.gsub(/^\//, '')
      # .. => comma
      s = s.gsub('..', ',')
      # query separators => comma
      s = s.gsub(/[?\/&]/, ',')
      # ,, => comma
      s = s.gsub(',,', ',')
      # encode invalid path chars. Every byte of the character is encoded;
      # encoding only the first byte would map different multibyte characters
      # (e.g. "é" and "è") to the same key.
      s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |char|
        char.bytes.map { |byte| format('%%%02x', byte) }.join
      end
      # handle empty case
      s = '_root_' if s.blank?
      # always downcase
      s = s.downcase
      s
    end
    protected :pathify
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#
|
2
|
+
# An HTTP response. Mostly a wrapper around HTTParty.
|
3
|
+
#
|
4
|
+
|
5
|
+
module Sinew
  #
  # An HTTP response: the originating request plus uri, body, status code and
  # headers. Built from a live HTTParty response, from cached data, or as a
  # synthetic timeout.
  #
  class Response
    attr_accessor :request, :uri, :body, :code, :headers

    # Build a Response from an HTTParty response.
    #
    # request        - the request that was performed.
    # party_response - must respond to request.last_uri, code, headers and
    #                  body.
    def self.from_network(request, party_response)
      Response.new.tap do |response|
        response.request = request
        response.uri = party_response.request.last_uri
        response.code = party_response.code
        response.headers = party_response.headers.to_h

        # A response without a body yields nil here; treat it as empty
        # instead of failing on nil.encoding below.
        body = party_response.body.to_s

        # force to utf-8 as best we can
        if body.encoding != Encoding::UTF_8
          body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
        end
        response.body = body
      end
    end

    # Build a Response from cached data.
    #
    # request - the request being served from cache.
    # body    - cached body, used as-is.
    # head    - JSON string with uri/code/headers keys, or nil. When nil the
    #           response defaults to the request uri, code 200 and no headers.
    def self.from_cache(request, body, head)
      Response.new.tap do |response|
        response.request = request
        response.body = body

        # defaults
        response.uri = request.uri
        response.code = 200
        response.headers = {}

        # overwrite with cached response headers
        if head
          head = JSON.parse(head, symbolize_names: true)
          response.uri = URI.parse(head[:uri])
          response.code = head[:code]
          response.headers = head[:headers]
        end
      end
    end

    # Build a synthetic Response for a timed-out request: body 'timeout',
    # code 999, no headers.
    def self.from_timeout(request)
      Response.new.tap do |response|
        response.request = request
        response.uri = request.uri
        response.body = 'timeout'
        response.code = 999
        response.headers = {}
      end
    end

    # True for any status code of 400 or above.
    def error?
      code >= 400
    end

    # True for any status code of 500 or above (this includes the synthetic
    # 999 used by from_timeout).
    def error_500?
      code / 100 >= 5
    end

    # True when the final uri differs from the uri that was requested.
    def redirected?
      request.uri != uri
    end

    # The uri/code/headers triple as a Hash, mirroring the keys that
    # from_cache reads back.
    def head_as_json
      {
        uri: uri,
        code: code,
        headers: headers,
      }
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#
|
2
|
+
# Runtime options that sinew files can modify.
|
3
|
+
#
|
4
|
+
|
5
|
+
module Sinew
  #
  # Mutable per-run settings: retry count, rate limit, default request
  # headers and a hook applied to cache keys before they are generated.
  #
  class RuntimeOptions
    attr_accessor :retries, :rate_limit, :headers, :before_generate_cache_key

    # Populate the defaults. When the SINEW_TEST environment variable is set
    # (for testing), rate_limit is 0 instead of 1.
    def initialize
      @retries = 3
      @rate_limit = ENV['SINEW_TEST'] ? 0 : 1
      @headers = { 'User-Agent' => "sinew/#{VERSION}" }
      # identity hook: hands the key back unchanged
      @before_generate_cache_key = lambda { |key| key }
    end
  end
end
|
data/lib/sinew/version.rb
CHANGED
data/sample.sinew
CHANGED
data/sinew.gemspec
CHANGED
@@ -1,29 +1,34 @@
|
|
1
|
-
$LOAD_PATH
|
1
|
+
$LOAD_PATH.unshift("#{__dir__}/lib")
|
2
2
|
|
3
|
-
require
|
3
|
+
require 'sinew/version'
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
|
-
s.name =
|
6
|
+
s.name = 'sinew'
|
7
7
|
s.version = Sinew::VERSION
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
|
-
s.
|
10
|
-
s.
|
11
|
-
s.
|
12
|
-
s.
|
13
|
-
s.
|
9
|
+
s.license = 'MIT'
|
10
|
+
s.authors = [ 'Adam Doppelt' ]
|
11
|
+
s.email = [ 'amd@gurge.com' ]
|
12
|
+
s.homepage = 'http://github.com/gurgeous/sinew'
|
13
|
+
s.summary = 'Sinew - structured web crawling using recipes.'
|
14
|
+
s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
|
15
|
+
s.required_ruby_version = '~> 2.3'
|
14
16
|
|
15
|
-
s.rubyforge_project =
|
17
|
+
s.rubyforge_project = 'sinew'
|
16
18
|
|
17
|
-
s.add_runtime_dependency
|
18
|
-
s.add_runtime_dependency
|
19
|
-
s.add_runtime_dependency
|
20
|
-
s.add_runtime_dependency
|
21
|
-
s.add_runtime_dependency
|
22
|
-
s.add_runtime_dependency
|
23
|
-
s.
|
19
|
+
s.add_runtime_dependency 'awesome_print', '~> 1.8'
|
20
|
+
s.add_runtime_dependency 'htmlentities', '~> 4.3'
|
21
|
+
s.add_runtime_dependency 'httparty', '~> 0.16'
|
22
|
+
s.add_runtime_dependency 'nokogiri', '~> 1.8'
|
23
|
+
s.add_runtime_dependency 'scripto', '~> 0'
|
24
|
+
s.add_runtime_dependency 'slop', '~> 4.6'
|
25
|
+
s.add_runtime_dependency 'stringex', '~> 2.8'
|
26
|
+
s.add_development_dependency 'minitest', '~> 5.11'
|
27
|
+
s.add_development_dependency 'rake', '~> 12.3'
|
28
|
+
s.add_development_dependency 'webmock', '~> 3.4'
|
24
29
|
|
25
30
|
s.files = `git ls-files`.split("\n")
|
26
|
-
s.test_files = `git ls-files --
|
27
|
-
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
28
|
-
s.require_paths = [
|
31
|
+
s.test_files = `git ls-files -- test/*`.split("\n")
|
32
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
33
|
+
s.require_paths = [ 'lib' ]
|
29
34
|
end
|