sinew 3.0.1 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -5
- data/.rubocop.yml +30 -48
- data/Gemfile +4 -4
- data/Gemfile.lock +124 -0
- data/README.md +108 -47
- data/Rakefile +16 -15
- data/bin/sinew +13 -41
- data/lib/sinew.rb +23 -9
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +251 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +2 -1
- data/lib/sinew/nokogiri_ext.rb +12 -21
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +19 -16
- metadata +31 -21
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
data/lib/sinew/csv.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'sterile'
|
3
|
+
|
4
|
+
module Sinew
  # Incrementally writes rows to a CSV file and keeps simple statistics.
  #
  # Usage: create with a path, call #start once with the column list (this
  # opens the file and writes the header row), then call #emit once per row.
  #
  # Readers:
  #   columns - the column list passed to #start
  #   count   - number of rows emitted so far
  #   csv     - the underlying ::CSV writer (nil until #start is called)
  #   path    - destination path passed to the constructor
  #   tally   - hash of column => number of rows with a non-blank value
  class CSV
    attr_reader :columns, :count, :csv, :path, :tally

    # path - where the csv file will be written once #start is called
    def initialize(path)
      @count = 0
      @csv = nil
      @path = path
    end

    # Start writing the csv: open the file at path and write the header row.
    #
    # columns - list of column names, also used as the keys looked up in each
    #           row passed to #emit
    #
    # Returns the underlying ::CSV writer. Raises RuntimeError if called twice.
    def start(columns)
      raise 'started twice' if started?

      @columns = columns
      @tally = columns.to_h { [_1, 0] }
      @csv = ::CSV.open(path, 'wb').tap do
        _1 << columns
        # flush the header just like #emit flushes each row, so the file on
        # disk is never behind what has been written
        _1.flush
      end
    end

    # Has this csv been started?
    def started?
      !@csv.nil?
    end

    # Append a row.
    #
    # row - anything that responds to [] with a column name (typically a Hash)
    #
    # Returns a hash containing only the columns that had a non-blank value
    # after normalization, in case someone wants to pretty print it. Raises
    # RuntimeError if #start has not been called yet.
    def emit(row)
      raise 'not started' unless started?

      # convert row to an array of values in column order, building the
      # printable hash (our return value) and updating the tally as we go
      printable = {}
      values = columns.map do |column|
        value = normalize(row[column])
        if value
          printable[column] = value
          tally[column] += 1
        end
        value
      end
      @count += 1

      # write through to disk immediately
      csv << values
      csv.flush

      printable
    end

    # Matches strings made up solely of printable ASCII (except '&', which may
    # begin an HTML entity) and whitespace. Such strings skip the Sterile
    # cleanup in #normalize.
    ASCII_ONLY = begin
      chars = (33..126).map(&:chr) - ['&']
      /\A[#{Regexp.escape(chars.join)}\s]+\Z/
    end.freeze

    # Turn an arbitrary value into a clean single-line string for the csv.
    #
    # s - an object responding to inner_html (e.g. nokogiri nodes), an Array
    #     (joined with '|'), or anything else (converted with to_s)
    #
    # Returns the cleaned string, or nil if nothing is left after cleanup.
    def normalize(s)
      # nokogiri/array/misc => string
      s = if s.respond_to?(:inner_html)
        s.inner_html
      elsif s.is_a?(Array)
        s.join('|')
      else
        s.to_s
      end
      return if s.empty?

      # simple attempt to strip tags. Note that we replace tags with spaces
      s = s.gsub(/<[^>]+>/, ' ')

      unless s.match?(ASCII_ONLY)
        # converts MS Word 'smart punctuation' to ASCII
        s = Sterile.plain_format(s)

        # decode HTML entities (&aacute;, &amp;, etc.)
        s = Sterile.decode_entities(s)

        # "šţɽĩɳģ" => "string"
        s = Sterile.transliterate(s)
      end

      # squish
      s = s.strip.gsub(/\s+/, ' ')
      return if s.empty?

      s
    end
  end
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,98 +1,72 @@
|
|
1
|
-
require 'scripto'
|
2
|
-
require 'sinew/connection'
|
3
|
-
|
4
|
-
#
|
5
|
-
# Main sinew entry point.
|
6
|
-
#
|
7
|
-
|
8
1
|
module Sinew
  # Helper class used by sinew bin. This exists as an independent class solely
  # for testing, otherwise it would be built into the bin script.
  class Main
    attr_reader :sinew

    # options - hash of options handed on to Sinew::Base. Reads :recipe (path
    #           to the recipe file) and fills in :output when it is missing.
    #           Note that the hash passed in is modified in place.
    def initialize(options)
      # default output path: same directory and basename as the recipe, with
      # a .csv extension
      options[:output] ||= begin
        src = options[:recipe]
        dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
        # nice to clean this up - drop a literal leading "./" only. The dot
        # must be escaped, otherwise "a/recipe.csv" loses its "a/" directory.
        dst.sub(%r{\A\./}, '')
      end

      @sinew = Sinew::Base.new(options)
    end

    # Evaluate the recipe file in the context of a DSL instance, printing a
    # header before and a footer after unless the :silent option is set.
    def run
      started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      silent = sinew.options[:silent]
      header unless silent

      recipe = sinew.options[:recipe]
      dsl = DSL.new(sinew)
      begin
        # pass the recipe path so errors and backtraces point at the recipe
        dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
      rescue LimitError
        # ignore - this is flow control for --limit
      end

      footer(Process.clock_gettime(Process::CLOCK_MONOTONIC) - started) unless silent
    end

    protected

    #
    # header/footer
    #

    # Announce where the csv is being written.
    def header
      sinew.banner("Writing to #{sinew.csv.path}...")
    end

    # Print a summary once the recipe has finished.
    #
    # elapsed - run time in seconds
    def footer(elapsed)
      csv = sinew.csv
      count = csv.count

      if count.zero?
        sinew.banner(format('Done in %ds. Nothing written.', elapsed))
        return
      end

      # summary
      msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
      sinew.banner(msg)

      # tally - one line per column, most frequently filled columns first,
      # ties broken by column name
      tally = csv.tally.sort_by { |column, filled| [-filled, column.to_s] }.to_h
      len = tally.keys.map { _1.to_s.length }.max
      fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
      tally.each do |column, filled|
        printf(fmt, column, filled, count, filled * 100.0 / count)
      end
    end

    # simple DSL for .sinew files - recipes are evaluated against an instance
    # of this class, so `sinew` is available inside the recipe
    class DSL
      attr_reader :sinew

      def initialize(sinew)
        @sinew = sinew
      end
    end
  end
end
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,28 +1,19 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Nokogiri
  module XML
    # Reopened so that a NodeSet joins the text/html of its nodes with a
    # single space instead of an empty string. The stock implementations stay
    # reachable as old_inner_html / old_inner_text.
    class NodeSet
      alias_method :old_inner_html, :inner_html
      alias_method :old_inner_text, :inner_text

      # inner_text of every node in the set, separated by spaces
      def inner_text
        pieces = map { |node| node.inner_text }
        pieces.join(' ')
      end

      # inner_html of every node in the set, separated by spaces. Any
      # arguments are handed to each node's inner_html.
      def inner_html(*args)
        pieces = map { |node| node.inner_html(*args) }
        pieces.join(' ')
      end
    end
  end
end
|
data/lib/sinew/response.rb
CHANGED
@@ -1,72 +1,61 @@
|
|
1
|
-
require '
|
2
|
-
require '
|
3
|
-
|
4
|
-
|
5
|
-
# An HTTP response.
|
6
|
-
#
|
1
|
+
require 'delegate'
|
2
|
+
require 'hashie/mash'
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
7
5
|
|
8
6
|
module Sinew
  # A wrapper around Faraday::Response, with some parsing helpers. Anything
  # not defined here is delegated to the wrapped response. Parsed results are
  # memoized per instance.
  class Response < SimpleDelegator
    # Like body, but tries to cleanup whitespace around HTML for easier
    # parsing. Works on a copy, body itself is left untouched.
    def html
      @html ||= body.dup.tap do
        # fix invalid utf8
        if _1.encoding == Encoding::UTF_8
          _1.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
        end

        # squish
        _1.strip!
        _1.gsub!(/\s+/, ' ')

        # kill whitespace around tags
        _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
      end
    end

    # Return body as JSON (parsed with symbolized keys)
    def json
      @json ||= JSON.parse(body, symbolize_names: true)
    end

    # Return JSON body as Hashie::Mash
    def mash
      @mash ||= Hashie::Mash.new(json)
    end

    # Return body HTML as Nokogiri document
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    # Return body XML as Nokogiri document. Note that this parses the
    # whitespace-squished #html, not the raw body.
    def xml
      @xml ||= Nokogiri::XML(html)
    end

    # Return the final URI for the request, after redirects
    def url
      env.url
    end

    # Return the cache diskpath for this response, as stored in env under
    # :httpdisk_diskpath (nil when env has no such entry)
    def diskpath
      env[:httpdisk_diskpath]
    end

    # Remove cached response from disk, if any. Does nothing when there is no
    # diskpath or the file is already gone.
    def uncache
      path = diskpath
      # File.exist?(nil) raises TypeError, so check for a missing path first
      File.unlink(path) if path && File.exist?(path)
    end
  end
end
|
data/lib/sinew/version.rb
CHANGED
data/sample.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative 'lib/sinew'

# Sample of using sinew as a library: fetch the httpbingo index page and emit
# one csv row (url + title) for every link found in a list.
sinew = Sinew.new(output: 'sample.csv', verbose: true)

index = sinew.get('http://httpbingo.org')
index.noko.css('ul li a').each do |link|
  sinew.csv_emit({ url: link[:href], title: link.text })
end

# follow a couple of redirects as well
sinew.get('http://httpbingo.org/redirect/2')
|
data/sample.sinew
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
# Sample recipe: fetch the httpbingo index page and emit one csv row
# (url + title) for every link found in a list.
index = sinew.get('http://httpbingo.org')
index.noko.css('ul li a').each do |link|
  sinew.csv_emit({ url: link[:href], title: link.text })
end

# follow a couple of redirects as well
sinew.get('http://httpbingo.org/redirect/2')
|
data/sinew.gemspec
CHANGED
@@ -3,14 +3,15 @@ $LOAD_PATH.unshift("#{__dir__}/lib")
|
|
3
3
|
require 'sinew/version'
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
|
-
s.name
|
7
|
-
s.version
|
8
|
-
s.
|
9
|
-
s.
|
10
|
-
|
11
|
-
s.
|
12
|
-
s.summary = 'Sinew - structured web crawling using recipes.'
|
6
|
+
s.name = 'sinew'
|
7
|
+
s.version = Sinew::VERSION
|
8
|
+
s.authors = ['Adam Doppelt', 'Nathan Kriege']
|
9
|
+
s.email = ['amd@gurge.com']
|
10
|
+
|
11
|
+
s.summary = 'Sinew - structured web crawling using recipes.'
|
13
12
|
s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
|
13
|
+
s.homepage = 'http://github.com/gurgeous/sinew'
|
14
|
+
s.license = 'MIT'
|
14
15
|
s.required_ruby_version = '>= 2.7'
|
15
16
|
|
16
17
|
# what's in the gem?
|
@@ -19,14 +20,16 @@ Gem::Specification.new do |s|
|
|
19
20
|
end
|
20
21
|
s.bindir = 'bin'
|
21
22
|
s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
|
22
|
-
s.require_paths = [
|
23
|
+
s.require_paths = ['lib']
|
23
24
|
|
24
|
-
|
25
|
-
s.
|
26
|
-
s.
|
27
|
-
s.
|
28
|
-
s.
|
29
|
-
s.
|
30
|
-
s.
|
31
|
-
s.
|
25
|
+
# gem dependencies
|
26
|
+
s.add_dependency 'amazing_print', '~> 1.3'
|
27
|
+
s.add_dependency 'faraday', '~> 1.4'
|
28
|
+
s.add_dependency 'faraday-encoding', '~> 0'
|
29
|
+
s.add_dependency 'faraday-rate_limiter', '~> 0.0'
|
30
|
+
s.add_dependency 'hashie', '~> 4.1'
|
31
|
+
s.add_dependency 'httpdisk', '~> 0.5'
|
32
|
+
s.add_dependency 'nokogiri', '~> 1.11'
|
33
|
+
s.add_dependency 'slop', '~> 4.8'
|
34
|
+
s.add_dependency 'sterile', '~> 1.0'
|
32
35
|
end
|