sinew 1.0.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +49 -0
- data/.travis.yml +4 -0
- data/.vscode/extensions.json +3 -0
- data/.vscode/settings.json +15 -0
- data/Gemfile +1 -1
- data/README.md +153 -12
- data/Rakefile +13 -14
- data/bin/sinew +40 -20
- data/lib/sinew.rb +10 -6
- data/lib/sinew/cache.rb +79 -0
- data/lib/sinew/core_ext.rb +59 -0
- data/lib/sinew/dsl.rb +98 -0
- data/lib/sinew/main.rb +80 -149
- data/lib/sinew/nokogiri_ext.rb +10 -9
- data/lib/sinew/output.rb +126 -0
- data/lib/sinew/request.rb +148 -0
- data/lib/sinew/response.rb +75 -0
- data/lib/sinew/runtime_options.rb +26 -0
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +5 -3
- data/sinew.gemspec +24 -19
- data/test/test.html +40 -34
- data/test/test_cache.rb +69 -0
- data/test/test_helper.rb +113 -0
- data/test/test_main.rb +36 -91
- data/test/test_nokogiri_ext.rb +14 -15
- data/test/test_output.rb +73 -0
- data/test/test_requests.rb +135 -0
- data/test/test_utf8.rb +39 -0
- metadata +103 -48
- data/lib/sinew/curler.rb +0 -173
- data/lib/sinew/text_util.rb +0 -101
- data/lib/sinew/util.rb +0 -236
- data/test/helper.rb +0 -64
- data/test/test_curler.rb +0 -70
- data/test/test_text_util.rb +0 -23
data/lib/sinew/output.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'stringex'
|
3
|
+
|
4
|
+
#
|
5
|
+
# CSV output.
|
6
|
+
#
|
7
|
+
|
8
|
+
module Sinew
|
9
|
+
class Output
|
10
|
+
attr_reader :sinew, :columns, :rows, :csv
|
11
|
+
|
12
|
+
# Create an output sink bound to +sinew+.
#
# Only in-memory state is set up here: the owner reference and an empty
# list of emitted rows. No file is opened by the constructor.
def initialize(sinew)
  @rows = []
  @sinew = sinew
end
|
16
|
+
|
17
|
+
# Path of the CSV file that rows are written to (memoized).
#
# Derived from the recipe path in sinew.options[:recipe]: the recipe's
# extension, if it has one, is swapped for ".csv"; otherwise ".csv" is
# appended.
#
#   "recipes/site.sinew" => "recipes/site.csv"
#   "recipes/site"       => "recipes/site.csv"
def filename
  @filename ||= begin
    recipe = sinew.options[:recipe]
    ext = File.extname(recipe)
    # Only strip the trailing extension. A global substitution would also
    # rewrite earlier occurrences ("x.sinew/y.sinew" => "x.csv/y.csv").
    stem = ext.empty? ? recipe : recipe.chomp(ext)
    "#{stem}.csv"
  end
end
|
28
|
+
|
29
|
+
# Start the CSV file: announce the destination (unless sinew is quiet),
# remember the flattened column list, open the file and write the header row.
#
# columns - column names; nested arrays are flattened.
def header(columns)
  sinew.banner("Writing to #{filename}...") unless sinew.quiet?

  @columns = columns.flatten

  # the handle is kept in @csv so later rows can be appended to it
  @csv = CSV.open(filename, 'wb')
  @csv << @columns
end
|
39
|
+
|
40
|
+
# Append one row (a hash keyed by column) to the CSV.
#
# If no header has been written yet, the row's keys become the header.
# A copy of the raw row is kept in +rows+; the values written to the file
# are normalized, in column order. The non-blank values are also sent to
# sinew.vputs, and the CSV is flushed after every row.
def emit(row)
  # implicit header if necessary
  header(row.keys) unless csv

  rows << row.dup

  # normalize each column's value, collecting the non-blank ones for display
  shown = {}
  values = columns.map do |column|
    normalize(row[column]).tap do |value|
      shown[column] = value if value.present?
    end
  end

  sinew.vputs shown.ai

  csv << values
  csv.flush
end
|
60
|
+
|
61
|
+
# Number of rows emitted so far.
def count
  rows.size
end
|
64
|
+
|
65
|
+
# Print a per-column fill report to $stderr. Does nothing when no rows
# were emitted.
#
# For every column it shows how many rows had a non-blank value, the total
# row count and the percentage. Columns are listed from most to least
# filled, ties broken by column name.
def report
  total = count
  return if total == 0

  sinew.banner("Got #{total} rows.")

  # tally non-blank values per key
  filled = Hash.new(0)
  rows.each do |row|
    row.each_pair do |key, value|
      filled[key] += 1 if value.present?
    end
  end

  # most-filled columns first
  ordered = columns.sort_by { |name| [ -filled[name], name ] }

  # pad the name column to the longest name
  width = ordered.map { |name| name.to_s.length }.max
  line = " %-#{width + 1}s %7d / %-7d %6.1f%%\n"
  ordered.each do |name|
    $stderr.printf(line, name, filled[name], total, filled[name] * 100.0 / total)
  end
end
|
85
|
+
|
86
|
+
# Turn any emitted value into a clean single-line ASCII string.
#
# Nokogiri elements/node sets contribute their inner HTML, arrays are
# joined with "|", and everything else goes through to_s. The text is then
# cleaned up using stringex's String extensions, see:
#
#   github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
#   github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
def normalize(s)
  # noko/array/misc => string
  text =
    case s
    when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
      s.inner_html
    when Array
      s.map(&:to_s).join('|')
    else
      s.to_s
    end

  # Applied in order:
  #   strip_html_tags                     <a>b</a> => b
  #   convert_smart_punctuation           MS Word 'smart punctuation' => ASCII
  #   convert_accented_html_entities      accented entities => plain letters
  #   convert_miscellaneous_html_entities &amp; and similar entities
  #   to_ascii                            unicode => regular characters
  #   squish                              squish whitespace
  steps = %i[
    strip_html_tags
    convert_smart_punctuation
    convert_accented_html_entities
    convert_miscellaneous_html_entities
    to_ascii
    squish
  ]
  steps.inject(text) { |acc, step| acc.public_send(step) }
end
|
124
|
+
protected :normalize
|
125
|
+
end
|
126
|
+
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'httparty'
|
3
|
+
require 'htmlentities'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Process a single HTTP request. Mostly a wrapper around HTTParty.
|
7
|
+
#
|
8
|
+
|
9
|
+
module Sinew
|
10
|
+
class Error < StandardError; end
|
11
|
+
|
12
|
+
class Request
|
13
|
+
HTML_ENTITIES = HTMLEntities.new
|
14
|
+
VALID_METHODS = %w[get post patch put delete head options].freeze
|
15
|
+
|
16
|
+
attr_reader :sinew, :method, :uri, :options, :cache_key
|
17
|
+
|
18
|
+
# Options are largely compatible with HTTParty, except for :method.
|
19
|
+
# Build a request. Options are largely compatible with HTTParty, except
# for :method.
#
# sinew   - owning object (its runtime_options are consulted later)
# method  - HTTP verb as a string, e.g. 'get'
# url     - target url; sloppy urls are cleaned up by parse_url
# options - request options; copied, so the caller's hash is not modified
#           at the top level
def initialize(sinew, method, url, options = {})
  @sinew, @method = sinew, method
  @options = options.dup
  # order matters: parse_url reads (and removes :query from) @options,
  # and calculate_cache_key reads @uri
  @uri = parse_url(url)
  @cache_key = calculate_cache_key
end
|
26
|
+
|
27
|
+
# run the request, return the result
|
28
|
+
# Validate and run the request through HTTParty, returning a Response
# built from the network result.
def perform
  validate!

  # global headers first, per-request headers override them
  defaults = sinew.runtime_options.headers
  overrides = options[:headers]
  options[:headers] = overrides ? defaults.merge(overrides) : defaults

  Response.from_network(self, HTTParty.send(method, uri, options))
end
|
39
|
+
|
40
|
+
# We accept sloppy urls and attempt to clean them up
|
41
|
+
# Clean up a possibly sloppy url and parse it into a URI.
#
# HTML entities are decoded, spaces and single quotes are percent-encoded,
# and options[:query] (if present) is removed from the options and appended
# to the url here, rather than left for HTTParty, so that it becomes part
# of the cache key.
def parse_url(url)
  # remove entities, then fix a couple of common encoding bugs
  cleaned = HTML_ENTITIES.decode(url).gsub(' ', '%20').gsub("'", '%27')

  query = options.delete(:query)
  if query.present?
    params = HTTParty::HashConversions.to_params(query)
    joiner = cleaned.include?('?') ? '&' : '?'
    cleaned = "#{cleaned}#{joiner}#{params}"
  end

  URI.parse(cleaned)
end
|
62
|
+
protected :parse_url
|
63
|
+
|
64
|
+
# Compute the cache key for this request, of the form "<host>/<path>".
#
# The path part is built from method, path, query and body. The
# before_generate_cache_key hook from the runtime options may rewrite those
# parts first. The method is left out for gets, and paths longer than 250
# characters are replaced by their MD5 hex digest.
def calculate_cache_key
  dir = pathify(uri.host)

  body_key = body.is_a?(Hash) ? HTTParty::HashConversions.to_params(body) : body&.dup

  # a hash, so before_generate_cache_key can edit individual parts
  parts = { method: method.dup, path: uri.path, query: uri.query, body: body_key }
  parts = sinew.runtime_options.before_generate_cache_key.call(parts)

  # strip method for gets
  parts.delete(:method) if parts[:method] == 'get'

  # keep the non-blank values, join and pathify
  path = pathify(parts.values.select(&:present?).join(','))

  # shorten long paths
  path = Digest::MD5.hexdigest(path) if path.length > 250

  "#{dir}/#{path}"
end
|
96
|
+
protected :calculate_cache_key
|
97
|
+
|
98
|
+
# Sanity-check the request before it is sent.
#
# Raises RuntimeError when:
# * the method is not one of VALID_METHODS
# * the url scheme is not http or https (this includes scheme-less urls)
# * a get carries a body
# * a Content-Type header is set but there is no body
def validate!
  raise "invalid method #{method}" unless VALID_METHODS.include?(method)
  # Match the whole scheme, case-insensitively. A bare /^http/ prefix test
  # would also let through schemes such as "httpx".
  raise "invalid url #{uri}" if uri.scheme.to_s !~ /\Ahttps?\z/i
  raise "can't get with a body" if method == 'get' && body
  raise "Content-Type doesn't make sense without a body" if content_type && !body
end
|
104
|
+
protected :validate!
|
105
|
+
|
106
|
+
# The request payload, i.e. options[:body] (nil when none was given).
def body
  options[:body]
end
|
109
|
+
protected :body
|
110
|
+
|
111
|
+
# The request headers, i.e. options[:headers] (nil when none were given).
def headers
  options[:headers]
end
|
114
|
+
protected :headers
|
115
|
+
|
116
|
+
# Value of the 'Content-Type' request header, or a falsy value when no
# headers are set or the header is absent. The lookup uses this exact
# key spelling.
def content_type
  current = headers
  current && current['Content-Type']
end
|
119
|
+
protected :content_type
|
120
|
+
|
121
|
+
# True when the request's Content-Type is exactly
# 'application/x-www-form-urlencoded'.
def form?
  content_type == 'application/x-www-form-urlencoded'
end
|
124
|
+
protected :form?
|
125
|
+
|
126
|
+
# Turn an arbitrary string (host, or joined path/query/body) into a
# lowercase, filesystem-friendly path fragment.
#
#   "/a/b?c=d&e" => "a,b,c=d,e"
#   ""           => "_root_"
#
# s - the string to convert.
#
# Returns a new string containing only [a-z0-9_.,=-] and %xx escapes.
def pathify(s)
  # remove leading slash
  s = s.gsub(/^\//, '')
  # .. => comma
  s = s.gsub('..', ',')
  # query separators => comma
  s = s.gsub(/[?\/&]/, ',')
  # ,, => comma
  s = s.gsub(',,', ',')
  # Percent-encode invalid path chars, one %xx per byte. Encoding only the
  # first byte of a multibyte character would map different characters
  # (e.g. "é" and "è") to the same fragment.
  s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |char|
    char.bytes.map { |byte| format('%%%02x', byte) }.join
  end
  # handle empty case (whitespace was already encoded above, so an empty
  # check is all that is needed)
  s = '_root_' if s.empty?
  # always downcase
  s.downcase
end
|
146
|
+
protected :pathify
|
147
|
+
end
|
148
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#
|
2
|
+
# An HTTP response. Mostly a wrapper around HTTParty.
|
3
|
+
#
|
4
|
+
|
5
|
+
module Sinew
|
6
|
+
class Response
|
7
|
+
attr_accessor :request, :uri, :body, :code, :headers
|
8
|
+
|
9
|
+
# Build a Response from a live HTTParty response.
#
# request        - the request that was performed
# party_response - the HTTParty response; its request.last_uri, code,
#                  headers and body are copied over
#
# The body is forced to UTF-8 as best we can: text in another encoding is
# transcoded, with invalid or unmappable bytes replaced by '?'.
def self.from_network(request, party_response)
  Response.new.tap do |response|
    response.request = request
    response.uri = party_response.request.last_uri
    response.code = party_response.code
    response.headers = party_response.headers.to_h

    # A response may come back without any body; treat that as an empty
    # string instead of calling #encoding on nil.
    body = party_response.body.to_s
    if body.encoding != Encoding::UTF_8
      body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
    end
    response.body = body
  end
end
|
24
|
+
|
25
|
+
# Build a Response from cached data.
#
# request - the request being answered
# body    - the cached body, used as-is
# head    - JSON text with :uri, :code and :headers, or nil. Without it the
#           response falls back to the request's uri, code 200 and empty
#           headers.
#
# NOTE(review): the JSON is parsed with symbolize_names, so cached header
# keys come back as symbols; confirm callers expect that.
def self.from_cache(request, body, head)
  Response.new.tap do |response|
    response.request = request
    response.body = body

    if head
      # cached response metadata wins
      meta = JSON.parse(head, symbolize_names: true)
      response.uri = URI.parse(meta[:uri])
      response.code = meta[:code]
      response.headers = meta[:headers]
    else
      # defaults
      response.uri = request.uri
      response.code = 200
      response.headers = {}
    end
  end
end
|
44
|
+
|
45
|
+
# Build the placeholder Response used when a request timed out: body
# 'timeout', code 999, no headers, and the request's own uri.
def self.from_timeout(request)
  response = Response.new
  response.request = request
  response.uri = request.uri
  response.code = 999
  response.body = 'timeout'
  response.headers = {}
  response
end
|
54
|
+
|
55
|
+
# True for any status code of 400 or above (this includes the 999 used
# for timeouts).
def error?
  code >= 400
end
|
58
|
+
|
59
|
+
# True for status codes of 500 or above.
def error_500?
  code >= 500
end
|
62
|
+
|
63
|
+
# True when the response's uri differs from the uri that was requested.
def redirected?
  !(request.uri == uri)
end
|
66
|
+
|
67
|
+
# The response metadata (everything but the body) as a hash with the keys
# :uri, :code and :headers.
def head_as_json
  { uri: uri, code: code, headers: headers }
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#
|
2
|
+
# Runtime options that sinew files can modify.
|
3
|
+
#
|
4
|
+
|
5
|
+
module Sinew
|
6
|
+
class RuntimeOptions
|
7
|
+
attr_accessor :retries
|
8
|
+
attr_accessor :rate_limit
|
9
|
+
attr_accessor :headers
|
10
|
+
attr_accessor :before_generate_cache_key
|
11
|
+
|
12
|
+
# Set the defaults: 3 retries, a rate limit of 1, a sinew User-Agent
# header and a pass-through before_generate_cache_key hook.
def initialize
  self.retries = 3
  # for testing: SINEW_TEST drops the rate limit to 0
  self.rate_limit = ENV['SINEW_TEST'] ? 0 : 1
  self.headers = { 'User-Agent' => "sinew/#{VERSION}" }
  self.before_generate_cache_key = ->(key) { key }
end
|
25
|
+
end
|
26
|
+
end
|
data/lib/sinew/version.rb
CHANGED
data/sample.sinew
CHANGED
data/sinew.gemspec
CHANGED
@@ -1,29 +1,34 @@
|
|
1
|
-
$LOAD_PATH
|
1
|
+
$LOAD_PATH.unshift("#{__dir__}/lib")
|
2
2
|
|
3
|
-
require
|
3
|
+
require 'sinew/version'
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
|
-
s.name =
|
6
|
+
s.name = 'sinew'
|
7
7
|
s.version = Sinew::VERSION
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
|
-
s.
|
10
|
-
s.
|
11
|
-
s.
|
12
|
-
s.
|
13
|
-
s.
|
9
|
+
s.license = 'MIT'
|
10
|
+
s.authors = [ 'Adam Doppelt' ]
|
11
|
+
s.email = [ 'amd@gurge.com' ]
|
12
|
+
s.homepage = 'http://github.com/gurgeous/sinew'
|
13
|
+
s.summary = 'Sinew - structured web crawling using recipes.'
|
14
|
+
s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
|
15
|
+
s.required_ruby_version = '~> 2.3'
|
14
16
|
|
15
|
-
s.rubyforge_project =
|
17
|
+
s.rubyforge_project = 'sinew'
|
16
18
|
|
17
|
-
s.add_runtime_dependency
|
18
|
-
s.add_runtime_dependency
|
19
|
-
s.add_runtime_dependency
|
20
|
-
s.add_runtime_dependency
|
21
|
-
s.add_runtime_dependency
|
22
|
-
s.add_runtime_dependency
|
23
|
-
s.
|
19
|
+
s.add_runtime_dependency 'awesome_print', '~> 1.8'
|
20
|
+
s.add_runtime_dependency 'htmlentities', '~> 4.3'
|
21
|
+
s.add_runtime_dependency 'httparty', '~> 0.16'
|
22
|
+
s.add_runtime_dependency 'nokogiri', '~> 1.8'
|
23
|
+
s.add_runtime_dependency 'scripto', '~> 0'
|
24
|
+
s.add_runtime_dependency 'slop', '~> 4.6'
|
25
|
+
s.add_runtime_dependency 'stringex', '~> 2.8'
|
26
|
+
s.add_development_dependency 'minitest', '~> 5.11'
|
27
|
+
s.add_development_dependency 'rake', '~> 12.3'
|
28
|
+
s.add_development_dependency 'webmock', '~> 3.4'
|
24
29
|
|
25
30
|
s.files = `git ls-files`.split("\n")
|
26
|
-
s.test_files = `git ls-files --
|
27
|
-
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
28
|
-
s.require_paths = [
|
31
|
+
s.test_files = `git ls-files -- test/*`.split("\n")
|
32
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
33
|
+
s.require_paths = [ 'lib' ]
|
29
34
|
end
|