sinew 3.0.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -5
- data/.rubocop.yml +30 -48
- data/Gemfile +4 -4
- data/Gemfile.lock +124 -0
- data/README.md +108 -47
- data/Rakefile +16 -15
- data/bin/sinew +13 -41
- data/lib/sinew.rb +23 -9
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +251 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +2 -1
- data/lib/sinew/nokogiri_ext.rb +12 -21
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +19 -16
- metadata +31 -21
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
data/lib/sinew/csv.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'sterile'
|
3
|
+
|
4
|
+
module Sinew
  # Writes rows to a csv file at `path`. Each value is normalized to a squished
  # plain-text string before it is written, and a per-column tally of non-empty
  # values is kept along the way.
  class CSV
    # columns - column names passed to #start (nil until started)
    # count   - number of rows emitted so far
    # csv     - the underlying ::CSV writer (nil until started)
    # path    - path of the csv file
    # tally   - hash of column => number of rows with a non-empty value
    attr_reader :columns, :count, :csv, :path, :tally

    def initialize(path)
      @columns = nil
      @count = 0
      @csv = nil
      @path = path
      @tally = nil
    end

    # start writing the csv - opens `path` and writes `columns` as the header.
    # Raises if called more than once.
    def start(columns)
      raise 'started twice' if started?

      @columns = columns
      @tally = columns.to_h { [_1, 0] }
      @csv = ::CSV.open(path, 'wb').tap do
        _1 << columns
      end
    end

    # has this csv been started?
    def started?
      !@csv.nil?
    end

    # append a row. `row` is indexed by column name; keys that are not in
    # `columns` are ignored. Returns a hash containing only the non-empty
    # normalized values. Raises if the csv has not been started.
    def emit(row)
      # fail with a clear message instead of a NoMethodError on nil columns
      raise 'not started' unless started?

      # convert row to cols, and construct printable (our return value)
      printable = {}
      values = columns.map do |column|
        value = normalize(row[column])
        if value
          printable[column] = value
          tally[column] += 1
        end
        value
      end
      @count += 1

      # emit, flushing right away so the file on disk is always current
      csv << values
      csv.flush

      # return in case someone wants to pretty print this
      printable
    end

    # Matches strings made up solely of printable ascii (except '&', which
    # might start an entity) and whitespace. Such strings skip the Sterile
    # cleanup in #normalize.
    ASCII_ONLY = begin
      chars = (33..126).map(&:chr) - ['&']
      /\A[#{Regexp.escape(chars.join)}\s]+\Z/
    end.freeze

    # Turn a value into a squished plain-text string. Returns nil if there is
    # nothing left after cleanup.
    def normalize(s)
      # nokogiri/array/misc => string
      s = if s.respond_to?(:inner_html)
        s.inner_html
      elsif s.is_a?(Array)
        s.join('|')
      else
        s.to_s
      end
      return if s.empty?

      # simple attempt to strip tags. Note that we replace tags with spaces
      s = s.gsub(/<[^>]+>/, ' ')

      unless s.match?(ASCII_ONLY)
        # Converts MS Word 'smart punctuation' to ASCII
        s = Sterile.plain_format(s)

        # &aacute; &amp; etc.
        s = Sterile.decode_entities(s)

        # "šţɽĩɳģ" => "string"
        s = Sterile.transliterate(s)
      end

      # squish
      s = s.strip.gsub(/\s+/, ' ')
      return if s.empty?

      s
    end
  end
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,98 +1,72 @@
|
|
1
|
-
require 'scripto'
|
2
|
-
require 'sinew/connection'
|
3
|
-
|
4
|
-
#
|
5
|
-
# Main sinew entry point.
|
6
|
-
#
|
7
|
-
|
8
1
|
module Sinew
  # Helper class used by sinew bin. This exists as an independent class solely
  # for testing, otherwise it would be built into the bin script.
  class Main
    attr_reader :sinew

    # options - hash of options. :recipe is the path to the recipe file. If
    # :output is missing it defaults to the recipe path with its extension
    # replaced by .csv. The (possibly updated) options are handed to
    # Sinew::Base.
    def initialize(options)
      options[:output] ||= begin
        src = options[:recipe]
        dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
        # nice to clean this up - strip a leading "./" only. The dot must be
        # escaped, otherwise any one-character directory (a/foo.csv) is
        # stripped as well.
        dst.sub(%r{\A\./}, '')
      end

      @sinew = Sinew::Base.new(options)
    end

    # Evaluate the recipe file in the context of a DSL instance, with a
    # header/footer around it unless options[:silent] is set.
    def run
      tm = Time.now
      header if !sinew.options[:silent]
      recipe = sinew.options[:recipe]
      dsl = DSL.new(sinew)
      begin
        # pass the recipe path so errors and backtraces point at the recipe
        dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
      rescue LimitError
        # ignore - this is flow control for --limit
      end
      footer(Time.now - tm) if !sinew.options[:silent]
    end

    protected

    #
    # header/footer
    #

    def header
      sinew.banner("Writing to #{sinew.csv.path}...")
    end

    # elapsed - seconds spent running the recipe
    def footer(elapsed)
      csv = sinew.csv
      count = csv.count

      if count == 0
        sinew.banner(format('Done in %ds. Nothing written.', elapsed))
        return
      end

      # summary
      msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
      sinew.banner(msg)

      # tally - highest counts first, ties broken by column name
      tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
      # to_i guards against an empty tally, where max is nil
      len = tally.keys.map { _1.to_s.length }.max.to_i
      fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
      tally.each do
        printf(fmt, _1, _2, count, _2 * 100.0 / count)
      end
    end

    # simple DSL for .sinew files
    class DSL
      attr_reader :sinew

      def initialize(sinew)
        @sinew = sinew
      end
    end
  end
end
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,28 +1,19 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Nokogiri
  module XML
    class NodeSet
      # keep the stock implementations reachable under old_* names
      alias_method :old_inner_html, :inner_html
      alias_method :old_inner_text, :inner_text

      # inner_text of every node in the set, joined with a single space
      def inner_text
        map { |node| node.inner_text }.join(' ')
      end

      # inner_html of every node in the set (args are passed through to each
      # node), joined with a single space
      def inner_html(*args)
        map { |node| node.inner_html(*args) }.join(' ')
      end
    end
  end
end
|
data/lib/sinew/response.rb
CHANGED
@@ -1,72 +1,61 @@
|
|
1
|
-
require '
|
2
|
-
require '
|
3
|
-
|
4
|
-
|
5
|
-
# An HTTP response.
|
6
|
-
#
|
1
|
+
require 'delegate'
|
2
|
+
require 'hashie/mash'
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
7
5
|
|
8
6
|
module Sinew
  # A wrapper around Faraday::Response, with some parsing helpers. Anything not
  # defined here (body, env, ...) is delegated to the wrapped response. Parsed
  # results are memoized.
  class Response < SimpleDelegator
    # Like body, but tries to cleanup whitespace around HTML for easier parsing.
    def html
      # work on a copy so the wrapped response's body is left untouched
      @html ||= body.dup.tap do
        # fix invalid utf8
        if _1.encoding == Encoding::UTF_8
          _1.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
        end

        # squish
        _1.strip!
        _1.gsub!(/\s+/, ' ')

        # kill whitespace around tags
        _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
      end
    end

    # Return body as JSON (keys are symbolized)
    def json
      @json ||= JSON.parse(body, symbolize_names: true)
    end

    # Return JSON body as Hashie::Mash
    def mash
      @mash ||= Hashie::Mash.new(json)
    end

    # Return body HTML as Nokogiri document
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    # Return body XML as Nokogiri document. Note that this parses the
    # whitespace-cleaned #html, not the raw body.
    def xml
      @xml ||= Nokogiri::XML(html)
    end

    # Return the final URI for the request, after redirects
    def url
      env.url
    end

    # Return the cache diskpath for this response
    def diskpath
      env[:httpdisk_diskpath]
    end

    # Remove cached response from disk, if any. Does nothing when the response
    # has no diskpath or the file is already gone.
    def uncache
      path = diskpath
      # File.exist?(nil) raises TypeError, so check for a path first
      File.unlink(path) if path && File.exist?(path)
    end
  end
end
|
data/lib/sinew/version.rb
CHANGED
data/sample.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative 'lib/sinew'
|
2
|
+
|
3
|
+
# Sample: using Sinew directly from ruby.
sinew = Sinew.new(output: 'sample.csv', verbose: true)

# get the index page and pass one url/title row per link to csv_emit
home = sinew.get('http://httpbingo.org')
home.noko.css('ul li a').each do |link|
  sinew.csv_emit({ url: link[:href], title: link.text })
end

# one more request, this time to the /redirect/2 path
sinew.get('http://httpbingo.org/redirect/2')
|
data/sample.sinew
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
get 'http://httpbingo.org'
|
2
|
-
noko.css('ul li a').each do |a|
|
1
|
+
# get the index page and pass one url/title row per link to csv_emit
home = sinew.get('http://httpbingo.org')
home.noko.css('ul li a').each do |link|
  sinew.csv_emit({ url: link[:href], title: link.text })
end

# one more request, this time to the /redirect/2 path
sinew.get('http://httpbingo.org/redirect/2')
|
data/sinew.gemspec
CHANGED
@@ -3,14 +3,15 @@ $LOAD_PATH.unshift("#{__dir__}/lib")
|
|
3
3
|
require 'sinew/version'
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
|
-
s.name
|
7
|
-
s.version
|
8
|
-
s.
|
9
|
-
s.
|
10
|
-
|
11
|
-
s.
|
12
|
-
s.summary = 'Sinew - structured web crawling using recipes.'
|
6
|
+
s.name = 'sinew'
|
7
|
+
s.version = Sinew::VERSION
|
8
|
+
s.authors = ['Adam Doppelt', 'Nathan Kriege']
|
9
|
+
s.email = ['amd@gurge.com']
|
10
|
+
|
11
|
+
s.summary = 'Sinew - structured web crawling using recipes.'
|
13
12
|
s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
|
13
|
+
s.homepage = 'http://github.com/gurgeous/sinew'
|
14
|
+
s.license = 'MIT'
|
14
15
|
s.required_ruby_version = '>= 2.7'
|
15
16
|
|
16
17
|
# what's in the gem?
|
@@ -19,14 +20,16 @@ Gem::Specification.new do |s|
|
|
19
20
|
end
|
20
21
|
s.bindir = 'bin'
|
21
22
|
s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
|
22
|
-
s.require_paths = [
|
23
|
+
s.require_paths = ['lib']
|
23
24
|
|
24
|
-
|
25
|
-
s.
|
26
|
-
s.
|
27
|
-
s.
|
28
|
-
s.
|
29
|
-
s.
|
30
|
-
s.
|
31
|
-
s.
|
25
|
+
# gem dependencies
|
26
|
+
s.add_dependency 'amazing_print', '~> 1.3'
|
27
|
+
s.add_dependency 'faraday', '~> 1.4'
|
28
|
+
s.add_dependency 'faraday-encoding', '~> 0'
|
29
|
+
s.add_dependency 'faraday-rate_limiter', '~> 0.0'
|
30
|
+
s.add_dependency 'hashie', '~> 4.1'
|
31
|
+
s.add_dependency 'httpdisk', '~> 0.5'
|
32
|
+
s.add_dependency 'nokogiri', '~> 1.11'
|
33
|
+
s.add_dependency 'slop', '~> 4.8'
|
34
|
+
s.add_dependency 'sterile', '~> 1.0'
|
32
35
|
end
|