sinew 1.0.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +49 -0
- data/.travis.yml +4 -0
- data/.vscode/extensions.json +3 -0
- data/.vscode/settings.json +15 -0
- data/Gemfile +1 -1
- data/README.md +153 -12
- data/Rakefile +13 -14
- data/bin/sinew +40 -20
- data/lib/sinew.rb +10 -6
- data/lib/sinew/cache.rb +79 -0
- data/lib/sinew/core_ext.rb +59 -0
- data/lib/sinew/dsl.rb +98 -0
- data/lib/sinew/main.rb +80 -149
- data/lib/sinew/nokogiri_ext.rb +10 -9
- data/lib/sinew/output.rb +126 -0
- data/lib/sinew/request.rb +148 -0
- data/lib/sinew/response.rb +75 -0
- data/lib/sinew/runtime_options.rb +26 -0
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +5 -3
- data/sinew.gemspec +24 -19
- data/test/test.html +40 -34
- data/test/test_cache.rb +69 -0
- data/test/test_helper.rb +113 -0
- data/test/test_main.rb +36 -91
- data/test/test_nokogiri_ext.rb +14 -15
- data/test/test_output.rb +73 -0
- data/test/test_requests.rb +135 -0
- data/test/test_utf8.rb +39 -0
- metadata +103 -48
- data/lib/sinew/curler.rb +0 -173
- data/lib/sinew/text_util.rb +0 -101
- data/lib/sinew/util.rb +0 -236
- data/test/helper.rb +0 -64
- data/test/test_curler.rb +0 -70
- data/test/test_text_util.rb +0 -23
data/test/test.html
CHANGED
@@ -1,39 +1,45 @@
|
|
1
1
|
<html>
|
2
|
-
<head>
|
3
|
-
<title>Title</title>
|
4
|
-
<script>
|
5
|
-
alert("alert 1");
|
6
|
-
alert("alert 2");
|
7
|
-
</script>
|
8
|
-
</head>
|
9
|
-
|
10
|
-
<body>
|
11
|
-
<div id="main">
|
12
|
-
<span class="class1"> text1 </span>
|
13
|
-
<span class="class2"> text2 </span>
|
14
2
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
a
|
28
|
-
<p>b<span>c</span></p>
|
29
|
-
<p>b<span>c</span></p>
|
30
|
-
</div>
|
31
|
-
</div>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
<script>
|
6
|
+
alert("alert 1");
|
7
|
+
alert("alert 2");
|
8
|
+
</script>
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
<div id="main">
|
13
|
+
<span class="class1"> text1 </span>
|
14
|
+
<span class="class2"> text2 </span>
|
32
15
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
16
|
+
<!-- for test_normalize -->
|
17
|
+
<div id="element"> text </div>
|
18
|
+
<div class="e"> text1 </div>
|
19
|
+
<div class="e"> text2 </div>
|
20
|
+
</div>
|
21
|
+
|
22
|
+
<div id="nokogiri_ext">
|
23
|
+
<ul>
|
24
|
+
<li>hello</li>
|
25
|
+
<li>world</li>
|
26
|
+
</ul>
|
27
|
+
<div>
|
28
|
+
a
|
29
|
+
<p>b
|
30
|
+
<span>c</span>
|
31
|
+
</p>
|
32
|
+
<p>b
|
33
|
+
<span>c</span>
|
34
|
+
</p>
|
37
35
|
</div>
|
38
|
-
</
|
36
|
+
</div>
|
37
|
+
|
38
|
+
<div id="text_util">
|
39
|
+
<!-- a comment that should be removed -->
|
40
|
+
<div class="will_be_removed" />
|
41
|
+
<a class="will_be_preserved" />
|
42
|
+
</div>
|
43
|
+
</body>
|
44
|
+
|
39
45
|
</html>
|
data/test/test_cache.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class TestCache < MiniTest::Test
|
4
|
+
def test_get
|
5
|
+
2.times do
|
6
|
+
sinew.dsl.get('http://httpbin.org/get', c: 3, d: 4)
|
7
|
+
end
|
8
|
+
if !test_network?
|
9
|
+
assert_requested :get, 'http://httpbin.org/get?c=3&d=4', times: 1
|
10
|
+
end
|
11
|
+
assert_equal 1, sinew.request_count
|
12
|
+
assert_equal({ c: '3', d: '4' }, sinew.dsl.json[:args])
|
13
|
+
assert File.exist?("#{TMP}/httpbin.org/get,c=3,d=4")
|
14
|
+
assert !File.exist?("#{TMP}/httpbin.org/head/get,c=3,d=4")
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_post
|
18
|
+
2.times do
|
19
|
+
sinew.dsl.post('http://httpbin.org/post', c: 5, d: 6)
|
20
|
+
end
|
21
|
+
if !test_network?
|
22
|
+
assert_requested :post, 'http://httpbin.org/post', times: 1
|
23
|
+
end
|
24
|
+
assert_equal 1, sinew.request_count
|
25
|
+
assert_equal({ c: '5', d: '6' }, sinew.dsl.json[:form])
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_redirect
|
29
|
+
2.times do
|
30
|
+
sinew.dsl.get('http://httpbin.org/redirect/2')
|
31
|
+
end
|
32
|
+
if !test_network?
|
33
|
+
assert_requested :get, 'http://httpbin.org/redirect/2', times: 1
|
34
|
+
assert_requested :get, 'http://httpbin.org/redirect/1', times: 1
|
35
|
+
assert_requested :get, 'http://httpbin.org/get', times: 1
|
36
|
+
end
|
37
|
+
assert_equal 1, sinew.request_count
|
38
|
+
assert_equal 'http://httpbin.org/get', sinew.dsl.url
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_error
|
42
|
+
# gotta set this or the retries mess up our request counts
|
43
|
+
sinew.runtime_options.retries = 0
|
44
|
+
assert_output(/failed with 500/) do
|
45
|
+
2.times do
|
46
|
+
sinew.dsl.get('http://httpbin.org/status/500')
|
47
|
+
end
|
48
|
+
end
|
49
|
+
if !test_network?
|
50
|
+
assert_requested :get, 'http://httpbin.org/status/500', times: 1
|
51
|
+
assert_equal '500', sinew.dsl.raw
|
52
|
+
end
|
53
|
+
assert_equal 1, sinew.request_count
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_timeout
|
57
|
+
return if test_network?
|
58
|
+
|
59
|
+
# gotta set this or the retries mess up our request counts
|
60
|
+
sinew.runtime_options.retries = 0
|
61
|
+
assert_output(/failed with 999/) do
|
62
|
+
2.times do
|
63
|
+
sinew.dsl.get('http://httpbin.org/delay/1')
|
64
|
+
end
|
65
|
+
end
|
66
|
+
assert_requested :get, 'http://httpbin.org/delay/1', times: 1
|
67
|
+
assert_equal 'timeout', sinew.dsl.raw
|
68
|
+
end
|
69
|
+
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'minitest/pride'
|
3
|
+
require 'webmock/minitest' unless ENV['SINEW_TEST_NETWORK']
|
4
|
+
|
5
|
+
# a hint to sinew, so that it'll do things like set rate limit to zero
|
6
|
+
ENV['SINEW_TEST'] = '1'
|
7
|
+
|
8
|
+
# Normally the Rakefile takes care of this, but it's handy to have it here when
|
9
|
+
# running tests individually.
|
10
|
+
$LOAD_PATH.unshift("#{__dir__}/../lib")
|
11
|
+
require 'sinew'
|
12
|
+
|
13
|
+
class MiniTest::Test
|
14
|
+
TMP = '/tmp/_test_sinew'.freeze
|
15
|
+
RECIPE = "#{TMP}/test.sinew".freeze
|
16
|
+
CSV = "#{TMP}/test.csv".freeze
|
17
|
+
HTML = File.read("#{__dir__}/test.html")
|
18
|
+
|
19
|
+
def setup
|
20
|
+
super
|
21
|
+
|
22
|
+
# prepare TMP
|
23
|
+
FileUtils.rm_rf(TMP)
|
24
|
+
FileUtils.mkdir_p(TMP)
|
25
|
+
|
26
|
+
stub_network unless test_network?
|
27
|
+
end
|
28
|
+
|
29
|
+
def sinew
|
30
|
+
@sinew ||= Sinew::Main.new(cache: TMP, quiet: true, recipe: RECIPE)
|
31
|
+
end
|
32
|
+
protected :sinew
|
33
|
+
|
34
|
+
def run_recipe(recipe)
|
35
|
+
File.write(RECIPE, recipe)
|
36
|
+
sinew.run
|
37
|
+
end
|
38
|
+
protected :run_recipe
|
39
|
+
|
40
|
+
def test_network?
|
41
|
+
!!ENV['SINEW_TEST_NETWORK']
|
42
|
+
end
|
43
|
+
protected :test_network?
|
44
|
+
|
45
|
+
# mock requests, patterned on httpbin
|
46
|
+
def stub_network
|
47
|
+
stub_request(:get, %r{http://[^/]+/html}).to_return(method(:respond_html))
|
48
|
+
stub_request(:get, %r{http://[^/]+/get\b}).to_return(method(:respond_echo))
|
49
|
+
stub_request(:post, %r{http://[^/]+/post\b}).to_return(method(:respond_echo))
|
50
|
+
stub_request(:get, %r{http://[^/]+/status/\d+}).to_return(method(:respond_status))
|
51
|
+
stub_request(:get, %r{http://[^/]+/(relative-)?redirect/\d+}).to_return(method(:respond_redirect))
|
52
|
+
stub_request(:get, %r{http://[^/]+/delay/\d+}).to_timeout
|
53
|
+
end
|
54
|
+
protected :stub_network
|
55
|
+
|
56
|
+
#
|
57
|
+
# respond_xxx helpers
|
58
|
+
#
|
59
|
+
|
60
|
+
def respond_html(_request)
|
61
|
+
# this html was carefully chosen to match httpbin.org/html
|
62
|
+
html = <<~EOF
|
63
|
+
<body>
|
64
|
+
<h1>Herman Melville - Moby-Dick</h1>
|
65
|
+
</body>
|
66
|
+
EOF
|
67
|
+
{ body: html }
|
68
|
+
end
|
69
|
+
protected :respond_html
|
70
|
+
|
71
|
+
def respond_echo(request)
|
72
|
+
response = {}
|
73
|
+
response[:headers] = request.headers
|
74
|
+
|
75
|
+
# args
|
76
|
+
response[:args] = if request.uri.query
|
77
|
+
CGI.parse(request.uri.query).map { |k, v| [k, v.first] }.to_h
|
78
|
+
else
|
79
|
+
{}
|
80
|
+
end
|
81
|
+
|
82
|
+
# form
|
83
|
+
if request.headers['Content-Type'] == 'application/x-www-form-urlencoded'
|
84
|
+
response[:form] = CGI.parse(request.body).map { |k, v| [k, v.first] }.to_h
|
85
|
+
end
|
86
|
+
|
87
|
+
# json
|
88
|
+
if request.headers['Content-Type'] == 'application/json'
|
89
|
+
response[:json] = JSON.parse(request.body)
|
90
|
+
end
|
91
|
+
|
92
|
+
{
|
93
|
+
headers: { 'Content-Type' => 'application/json' },
|
94
|
+
body: response.to_json,
|
95
|
+
}
|
96
|
+
end
|
97
|
+
protected :respond_echo
|
98
|
+
|
99
|
+
def respond_status(request)
|
100
|
+
status = request.uri.to_s.split('/').last.to_i
|
101
|
+
{ body: status.to_s, status: status }
|
102
|
+
end
|
103
|
+
protected :respond_status
|
104
|
+
|
105
|
+
def respond_redirect(request)
|
106
|
+
parts = request.uri.to_s.split('/')
|
107
|
+
path, count = parts[-2], parts[-1].to_i
|
108
|
+
url = count == 1 ? '/get' : "/#{path}/#{count - 1}"
|
109
|
+
url = "http://example#{url}" if path =~ /absolute/
|
110
|
+
{ status: 302, headers: { 'Location' => url } }
|
111
|
+
end
|
112
|
+
protected :respond_redirect
|
113
|
+
end
|
data/test/test_main.rb
CHANGED
@@ -1,101 +1,46 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
def setup
|
11
|
-
# create TMP dir
|
12
|
-
FileUtils.rm_rf(TMP) if File.exists?(TMP)
|
13
|
-
FileUtils.mkdir_p(TMP)
|
14
|
-
end
|
15
|
-
|
16
|
-
def run_recipe(recipe)
|
17
|
-
File.write(RECIPE, recipe)
|
18
|
-
Util.stub(:run, mock_curl_200) do
|
19
|
-
Sinew::Main.new(cache: TMP, file: RECIPE, quiet: true)
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class TestMain < MiniTest::Test
|
4
|
+
def test_noko
|
5
|
+
run_recipe <<~'EOF'
|
6
|
+
get 'http://httpbin.org/html'
|
7
|
+
noko.css("h1").each do |h1|
|
8
|
+
csv_emit(h1: h1.text)
|
20
9
|
end
|
21
|
-
|
10
|
+
EOF
|
11
|
+
assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
|
12
|
+
end
|
22
13
|
|
23
|
-
|
24
|
-
|
25
|
-
get "http://
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
14
|
+
def test_raw
|
15
|
+
run_recipe <<~'EOF'
|
16
|
+
get "http://httpbin.org/html"
|
17
|
+
raw.scan(/<h1>([^<]+)/) do
|
18
|
+
csv_emit(h1: $1)
|
19
|
+
end
|
20
|
+
EOF
|
21
|
+
assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
|
22
|
+
end
|
33
23
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
raw.scan(/alert\("([^"]+)/) do
|
39
|
-
csv_emit(alert: $1)
|
40
|
-
end
|
41
|
-
EOF
|
42
|
-
assert_equal("alert\nalert 1\nalert 2\n", File.read(CSV))
|
43
|
-
end
|
24
|
+
def test_rate_limit
|
25
|
+
# true network requests call sleep for timeouts, which interferes with our
|
26
|
+
# instrumentation of Kernel#sleep
|
27
|
+
skip if test_network?
|
44
28
|
|
45
|
-
|
46
|
-
# note the cleaned up whitespace
|
47
|
-
run_recipe <<'EOF'
|
48
|
-
get "http://www.example.com"
|
49
|
-
csv_header(:class, :text)
|
50
|
-
html.scan(/<span class="(\w+)">(\w+)/) do
|
51
|
-
csv_emit(class: $1, text: $2)
|
52
|
-
end
|
53
|
-
EOF
|
54
|
-
assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
|
55
|
-
end
|
29
|
+
slept = false
|
56
30
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
clean.scan(/<span>(text\d)/) do
|
62
|
-
csv_emit(text: $1)
|
63
|
-
end
|
64
|
-
EOF
|
65
|
-
assert_equal("text\ntext1\ntext2\n", File.read(CSV))
|
31
|
+
# change Kernel#sleep to not really sleep!
|
32
|
+
Kernel.send(:alias_method, :old_sleep, :sleep)
|
33
|
+
Kernel.send(:define_method, :sleep) do |_duration|
|
34
|
+
slept = true
|
66
35
|
end
|
67
36
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
# non-strings
|
73
|
-
#
|
74
|
-
|
75
|
-
noko = Nokogiri::HTML(HTML).css("#main")
|
76
|
-
# node => text
|
77
|
-
assert_equal("text", s.send(:_normalize, noko.css("#element")))
|
78
|
-
# nodes => text joined with space
|
79
|
-
assert_equal("text1 text2", s.send(:_normalize, noko.css(".e")))
|
80
|
-
# array => text joined with pipe
|
81
|
-
assert_equal("1|2", s.send(:_normalize, [1,2]))
|
37
|
+
sinew.runtime_options.rate_limit = 1
|
38
|
+
sinew.dsl.get('http://httpbin.org/html')
|
39
|
+
sinew.dsl.get('http://httpbin.org/get')
|
40
|
+
assert(slept)
|
82
41
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
# untag
|
88
|
-
assert_equal("gub", s.send(:_normalize, "<tag>gub</tag>"))
|
89
|
-
# convert_accented_entities
|
90
|
-
assert_equal("a", s.send(:_normalize, "á"))
|
91
|
-
# unent
|
92
|
-
assert_equal("<>", s.send(:_normalize, "<>"))
|
93
|
-
# to_ascii
|
94
|
-
assert_equal("cafe", s.send(:_normalize, "caf\xc3\xa9"))
|
95
|
-
# squish
|
96
|
-
assert_equal("hello world", s.send(:_normalize, "\nhello \t \rworld"))
|
97
|
-
end
|
42
|
+
# restore old Kernel#sleep
|
43
|
+
Kernel.send(:alias_method, :sleep, :old_sleep)
|
44
|
+
Kernel.send(:undef_method, :old_sleep)
|
98
45
|
end
|
99
46
|
end
|
100
|
-
|
101
|
-
|
data/test/test_nokogiri_ext.rb
CHANGED
@@ -1,19 +1,18 @@
|
|
1
|
-
|
1
|
+
require_relative 'test_helper'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
3
|
+
class TestNokogiriExt < MiniTest::Test
|
4
|
+
def test_inner_text
|
5
|
+
assert_equal('hello world', noko.css('li').inner_text)
|
6
|
+
assert_equal('<li>hello</li> <li>world</li>', noko.css('ul').inner_html.squish)
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_just_me
|
10
|
+
assert_equal('a', noko.css('div').text_just_me.squish)
|
11
|
+
assert_equal('b b', noko.css('p').text_just_me.squish)
|
12
|
+
end
|
13
13
|
|
14
|
-
|
15
|
-
|
16
|
-
assert_equal("b b", @noko.css("p").text_just_me.squish)
|
17
|
-
end
|
14
|
+
def noko
|
15
|
+
@noko ||= Nokogiri::HTML(HTML).css('#nokogiri_ext')
|
18
16
|
end
|
17
|
+
protected :noko
|
19
18
|
end
|
data/test/test_output.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class TestOutput < MiniTest::Test
|
4
|
+
def test_output
|
5
|
+
sinew.dsl.csv_header(:n, :a, :p)
|
6
|
+
sinew.dsl.csv_emit(n: 'n1', a: 'a1')
|
7
|
+
sinew.dsl.csv_emit(n: 'n2', a: 'a2')
|
8
|
+
assert_equal 2, sinew.output.count
|
9
|
+
assert_equal "n,a,p\nn1,a1,\"\"\nn2,a2,\"\"\n", File.read(CSV)
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_implicit_header
|
13
|
+
sinew.dsl.csv_emit(name: 'bob', address: 'main')
|
14
|
+
assert_equal "name,address\nbob,main\n", File.read(CSV)
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_array_header
|
18
|
+
sinew.dsl.csv_header(%i[n a p])
|
19
|
+
sinew.dsl.csv_emit(n: 'n1', a: 'a1')
|
20
|
+
assert_equal "n,a,p\nn1,a1,\"\"\n", File.read(CSV)
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_filenames
|
24
|
+
sinew = Sinew::Main.new(recipe: 'gub.sinew')
|
25
|
+
assert_equal 'gub.csv', sinew.output.filename
|
26
|
+
sinew = Sinew::Main.new(recipe: 'gub')
|
27
|
+
assert_equal 'gub.csv', sinew.output.filename
|
28
|
+
sinew = Sinew::Main.new(recipe: '/somewhere/gub.sinew')
|
29
|
+
assert_equal '/somewhere/gub.csv', sinew.output.filename
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_normalization
|
33
|
+
output = Sinew::Output.new(nil)
|
34
|
+
|
35
|
+
#
|
36
|
+
# simple types
|
37
|
+
#
|
38
|
+
|
39
|
+
assert_equal '', output.send(:normalize, nil)
|
40
|
+
assert_equal '', output.send(:normalize, '')
|
41
|
+
assert_equal 'text', output.send(:normalize, 'text')
|
42
|
+
assert_equal '123', output.send(:normalize, 123)
|
43
|
+
assert_equal('1|2', output.send(:normalize, [ 1, 2 ]))
|
44
|
+
|
45
|
+
#
|
46
|
+
# nokogiri
|
47
|
+
#
|
48
|
+
|
49
|
+
noko = Nokogiri::HTML(HTML)
|
50
|
+
|
51
|
+
# node => text
|
52
|
+
assert_equal('text', output.send(:normalize, noko.css('#element')))
|
53
|
+
# nodes => text joined with space
|
54
|
+
assert_equal('text1 text2', output.send(:normalize, noko.css('.e')))
|
55
|
+
|
56
|
+
#
|
57
|
+
# string cleanups
|
58
|
+
#
|
59
|
+
|
60
|
+
# strip_html_tags
|
61
|
+
assert_equal('gub', output.send(:normalize, '<tag>gub</tag>'))
|
62
|
+
# convert_smart_punctuation
|
63
|
+
assert_equal('"gub"', output.send(:normalize, "\302\223gub\302\224"))
|
64
|
+
# convert_accented_html_entities
|
65
|
+
assert_equal('a', output.send(:normalize, 'á'))
|
66
|
+
# convert_miscellaneous_html_entities
|
67
|
+
assert_equal('<>', output.send(:normalize, '<>'))
|
68
|
+
# to_ascii
|
69
|
+
assert_equal('cafe', output.send(:normalize, "caf\xc3\xa9"))
|
70
|
+
# squish
|
71
|
+
assert_equal('hello world', output.send(:normalize, "\nhello \t \rworld"))
|
72
|
+
end
|
73
|
+
end
|