sinew 1.0.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +49 -0
- data/.travis.yml +4 -0
- data/.vscode/extensions.json +3 -0
- data/.vscode/settings.json +15 -0
- data/Gemfile +1 -1
- data/README.md +153 -12
- data/Rakefile +13 -14
- data/bin/sinew +40 -20
- data/lib/sinew.rb +10 -6
- data/lib/sinew/cache.rb +79 -0
- data/lib/sinew/core_ext.rb +59 -0
- data/lib/sinew/dsl.rb +98 -0
- data/lib/sinew/main.rb +80 -149
- data/lib/sinew/nokogiri_ext.rb +10 -9
- data/lib/sinew/output.rb +126 -0
- data/lib/sinew/request.rb +148 -0
- data/lib/sinew/response.rb +75 -0
- data/lib/sinew/runtime_options.rb +26 -0
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +5 -3
- data/sinew.gemspec +24 -19
- data/test/test.html +40 -34
- data/test/test_cache.rb +69 -0
- data/test/test_helper.rb +113 -0
- data/test/test_main.rb +36 -91
- data/test/test_nokogiri_ext.rb +14 -15
- data/test/test_output.rb +73 -0
- data/test/test_requests.rb +135 -0
- data/test/test_utf8.rb +39 -0
- metadata +103 -48
- data/lib/sinew/curler.rb +0 -173
- data/lib/sinew/text_util.rb +0 -101
- data/lib/sinew/util.rb +0 -236
- data/test/helper.rb +0 -64
- data/test/test_curler.rb +0 -70
- data/test/test_text_util.rb +0 -23
data/test/test.html
CHANGED
@@ -1,39 +1,45 @@
|
|
1
1
|
<html>
|
2
|
-
<head>
|
3
|
-
<title>Title</title>
|
4
|
-
<script>
|
5
|
-
alert("alert 1");
|
6
|
-
alert("alert 2");
|
7
|
-
</script>
|
8
|
-
</head>
|
9
|
-
|
10
|
-
<body>
|
11
|
-
<div id="main">
|
12
|
-
<span class="class1"> text1 </span>
|
13
|
-
<span class="class2"> text2 </span>
|
14
2
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
a
|
28
|
-
<p>b<span>c</span></p>
|
29
|
-
<p>b<span>c</span></p>
|
30
|
-
</div>
|
31
|
-
</div>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
<script>
|
6
|
+
alert("alert 1");
|
7
|
+
alert("alert 2");
|
8
|
+
</script>
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
<div id="main">
|
13
|
+
<span class="class1"> text1 </span>
|
14
|
+
<span class="class2"> text2 </span>
|
32
15
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
16
|
+
<!-- for test_normalize -->
|
17
|
+
<div id="element"> text </div>
|
18
|
+
<div class="e"> text1 </div>
|
19
|
+
<div class="e"> text2 </div>
|
20
|
+
</div>
|
21
|
+
|
22
|
+
<div id="nokogiri_ext">
|
23
|
+
<ul>
|
24
|
+
<li>hello</li>
|
25
|
+
<li>world</li>
|
26
|
+
</ul>
|
27
|
+
<div>
|
28
|
+
a
|
29
|
+
<p>b
|
30
|
+
<span>c</span>
|
31
|
+
</p>
|
32
|
+
<p>b
|
33
|
+
<span>c</span>
|
34
|
+
</p>
|
37
35
|
</div>
|
38
|
-
</
|
36
|
+
</div>
|
37
|
+
|
38
|
+
<div id="text_util">
|
39
|
+
<!-- a comment that should be removed -->
|
40
|
+
<div class="will_be_removed" />
|
41
|
+
<a class="will_be_preserved" />
|
42
|
+
</div>
|
43
|
+
</body>
|
44
|
+
|
39
45
|
</html>
|
data/test/test_cache.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class TestCache < MiniTest::Test
|
4
|
+
def test_get
|
5
|
+
2.times do
|
6
|
+
sinew.dsl.get('http://httpbin.org/get', c: 3, d: 4)
|
7
|
+
end
|
8
|
+
if !test_network?
|
9
|
+
assert_requested :get, 'http://httpbin.org/get?c=3&d=4', times: 1
|
10
|
+
end
|
11
|
+
assert_equal 1, sinew.request_count
|
12
|
+
assert_equal({ c: '3', d: '4' }, sinew.dsl.json[:args])
|
13
|
+
assert File.exist?("#{TMP}/httpbin.org/get,c=3,d=4")
|
14
|
+
assert !File.exist?("#{TMP}/httpbin.org/head/get,c=3,d=4")
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_post
|
18
|
+
2.times do
|
19
|
+
sinew.dsl.post('http://httpbin.org/post', c: 5, d: 6)
|
20
|
+
end
|
21
|
+
if !test_network?
|
22
|
+
assert_requested :post, 'http://httpbin.org/post', times: 1
|
23
|
+
end
|
24
|
+
assert_equal 1, sinew.request_count
|
25
|
+
assert_equal({ c: '5', d: '6' }, sinew.dsl.json[:form])
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_redirect
|
29
|
+
2.times do
|
30
|
+
sinew.dsl.get('http://httpbin.org/redirect/2')
|
31
|
+
end
|
32
|
+
if !test_network?
|
33
|
+
assert_requested :get, 'http://httpbin.org/redirect/2', times: 1
|
34
|
+
assert_requested :get, 'http://httpbin.org/redirect/1', times: 1
|
35
|
+
assert_requested :get, 'http://httpbin.org/get', times: 1
|
36
|
+
end
|
37
|
+
assert_equal 1, sinew.request_count
|
38
|
+
assert_equal 'http://httpbin.org/get', sinew.dsl.url
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_error
|
42
|
+
# gotta set this or the retries mess up our request counts
|
43
|
+
sinew.runtime_options.retries = 0
|
44
|
+
assert_output(/failed with 500/) do
|
45
|
+
2.times do
|
46
|
+
sinew.dsl.get('http://httpbin.org/status/500')
|
47
|
+
end
|
48
|
+
end
|
49
|
+
if !test_network?
|
50
|
+
assert_requested :get, 'http://httpbin.org/status/500', times: 1
|
51
|
+
assert_equal '500', sinew.dsl.raw
|
52
|
+
end
|
53
|
+
assert_equal 1, sinew.request_count
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_timeout
|
57
|
+
return if test_network?
|
58
|
+
|
59
|
+
# gotta set this or the retries mess up our request counts
|
60
|
+
sinew.runtime_options.retries = 0
|
61
|
+
assert_output(/failed with 999/) do
|
62
|
+
2.times do
|
63
|
+
sinew.dsl.get('http://httpbin.org/delay/1')
|
64
|
+
end
|
65
|
+
end
|
66
|
+
assert_requested :get, 'http://httpbin.org/delay/1', times: 1
|
67
|
+
assert_equal 'timeout', sinew.dsl.raw
|
68
|
+
end
|
69
|
+
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'minitest/pride'
|
3
|
+
require 'webmock/minitest' unless ENV['SINEW_TEST_NETWORK']
|
4
|
+
|
5
|
+
# a hint to sinew, so that it'll do things like set rate limit to zero
|
6
|
+
ENV['SINEW_TEST'] = '1'
|
7
|
+
|
8
|
+
# Normally the Rakefile takes care of this, but it's handy to have it here when
|
9
|
+
# running tests individually.
|
10
|
+
$LOAD_PATH.unshift("#{__dir__}/../lib")
|
11
|
+
require 'sinew'
|
12
|
+
|
13
|
+
class MiniTest::Test
|
14
|
+
TMP = '/tmp/_test_sinew'.freeze
|
15
|
+
RECIPE = "#{TMP}/test.sinew".freeze
|
16
|
+
CSV = "#{TMP}/test.csv".freeze
|
17
|
+
HTML = File.read("#{__dir__}/test.html")
|
18
|
+
|
19
|
+
def setup
|
20
|
+
super
|
21
|
+
|
22
|
+
# prepare TMP
|
23
|
+
FileUtils.rm_rf(TMP)
|
24
|
+
FileUtils.mkdir_p(TMP)
|
25
|
+
|
26
|
+
stub_network unless test_network?
|
27
|
+
end
|
28
|
+
|
29
|
+
def sinew
|
30
|
+
@sinew ||= Sinew::Main.new(cache: TMP, quiet: true, recipe: RECIPE)
|
31
|
+
end
|
32
|
+
protected :sinew
|
33
|
+
|
34
|
+
def run_recipe(recipe)
|
35
|
+
File.write(RECIPE, recipe)
|
36
|
+
sinew.run
|
37
|
+
end
|
38
|
+
protected :run_recipe
|
39
|
+
|
40
|
+
def test_network?
|
41
|
+
!!ENV['SINEW_TEST_NETWORK']
|
42
|
+
end
|
43
|
+
protected :test_network?
|
44
|
+
|
45
|
+
# mock requests, patterned on httpbin
|
46
|
+
def stub_network
|
47
|
+
stub_request(:get, %r{http://[^/]+/html}).to_return(method(:respond_html))
|
48
|
+
stub_request(:get, %r{http://[^/]+/get\b}).to_return(method(:respond_echo))
|
49
|
+
stub_request(:post, %r{http://[^/]+/post\b}).to_return(method(:respond_echo))
|
50
|
+
stub_request(:get, %r{http://[^/]+/status/\d+}).to_return(method(:respond_status))
|
51
|
+
stub_request(:get, %r{http://[^/]+/(relative-)?redirect/\d+}).to_return(method(:respond_redirect))
|
52
|
+
stub_request(:get, %r{http://[^/]+/delay/\d+}).to_timeout
|
53
|
+
end
|
54
|
+
protected :stub_network
|
55
|
+
|
56
|
+
#
|
57
|
+
# respond_xxx helpers
|
58
|
+
#
|
59
|
+
|
60
|
+
def respond_html(_request)
|
61
|
+
# this html was carefully chosen to match httpbin.org/html
|
62
|
+
html = <<~EOF
|
63
|
+
<body>
|
64
|
+
<h1>Herman Melville - Moby-Dick</h1>
|
65
|
+
</body>
|
66
|
+
EOF
|
67
|
+
{ body: html }
|
68
|
+
end
|
69
|
+
protected :respond_html
|
70
|
+
|
71
|
+
def respond_echo(request)
|
72
|
+
response = {}
|
73
|
+
response[:headers] = request.headers
|
74
|
+
|
75
|
+
# args
|
76
|
+
response[:args] = if request.uri.query
|
77
|
+
CGI.parse(request.uri.query).map { |k, v| [k, v.first] }.to_h
|
78
|
+
else
|
79
|
+
{}
|
80
|
+
end
|
81
|
+
|
82
|
+
# form
|
83
|
+
if request.headers['Content-Type'] == 'application/x-www-form-urlencoded'
|
84
|
+
response[:form] = CGI.parse(request.body).map { |k, v| [k, v.first] }.to_h
|
85
|
+
end
|
86
|
+
|
87
|
+
# json
|
88
|
+
if request.headers['Content-Type'] == 'application/json'
|
89
|
+
response[:json] = JSON.parse(request.body)
|
90
|
+
end
|
91
|
+
|
92
|
+
{
|
93
|
+
headers: { 'Content-Type' => 'application/json' },
|
94
|
+
body: response.to_json,
|
95
|
+
}
|
96
|
+
end
|
97
|
+
protected :respond_echo
|
98
|
+
|
99
|
+
def respond_status(request)
|
100
|
+
status = request.uri.to_s.split('/').last.to_i
|
101
|
+
{ body: status.to_s, status: status }
|
102
|
+
end
|
103
|
+
protected :respond_status
|
104
|
+
|
105
|
+
def respond_redirect(request)
|
106
|
+
parts = request.uri.to_s.split('/')
|
107
|
+
path, count = parts[-2], parts[-1].to_i
|
108
|
+
url = count == 1 ? '/get' : "/#{path}/#{count - 1}"
|
109
|
+
url = "http://example#{url}" if path =~ /absolute/
|
110
|
+
{ status: 302, headers: { 'Location' => url } }
|
111
|
+
end
|
112
|
+
protected :respond_redirect
|
113
|
+
end
|
data/test/test_main.rb
CHANGED
@@ -1,101 +1,46 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
def setup
|
11
|
-
# create TMP dir
|
12
|
-
FileUtils.rm_rf(TMP) if File.exists?(TMP)
|
13
|
-
FileUtils.mkdir_p(TMP)
|
14
|
-
end
|
15
|
-
|
16
|
-
def run_recipe(recipe)
|
17
|
-
File.write(RECIPE, recipe)
|
18
|
-
Util.stub(:run, mock_curl_200) do
|
19
|
-
Sinew::Main.new(cache: TMP, file: RECIPE, quiet: true)
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class TestMain < MiniTest::Test
|
4
|
+
def test_noko
|
5
|
+
run_recipe <<~'EOF'
|
6
|
+
get 'http://httpbin.org/html'
|
7
|
+
noko.css("h1").each do |h1|
|
8
|
+
csv_emit(h1: h1.text)
|
20
9
|
end
|
21
|
-
|
10
|
+
EOF
|
11
|
+
assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
|
12
|
+
end
|
22
13
|
|
23
|
-
|
24
|
-
|
25
|
-
get "http://
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
14
|
+
def test_raw
|
15
|
+
run_recipe <<~'EOF'
|
16
|
+
get "http://httpbin.org/html"
|
17
|
+
raw.scan(/<h1>([^<]+)/) do
|
18
|
+
csv_emit(h1: $1)
|
19
|
+
end
|
20
|
+
EOF
|
21
|
+
assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
|
22
|
+
end
|
33
23
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
raw.scan(/alert\("([^"]+)/) do
|
39
|
-
csv_emit(alert: $1)
|
40
|
-
end
|
41
|
-
EOF
|
42
|
-
assert_equal("alert\nalert 1\nalert 2\n", File.read(CSV))
|
43
|
-
end
|
24
|
+
def test_rate_limit
|
25
|
+
# true network requests call sleep for timeouts, which interferes with our
|
26
|
+
# instrumentation of Kernel#sleep
|
27
|
+
skip if test_network?
|
44
28
|
|
45
|
-
|
46
|
-
# note the cleaned up whitespace
|
47
|
-
run_recipe <<'EOF'
|
48
|
-
get "http://www.example.com"
|
49
|
-
csv_header(:class, :text)
|
50
|
-
html.scan(/<span class="(\w+)">(\w+)/) do
|
51
|
-
csv_emit(class: $1, text: $2)
|
52
|
-
end
|
53
|
-
EOF
|
54
|
-
assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
|
55
|
-
end
|
29
|
+
slept = false
|
56
30
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
clean.scan(/<span>(text\d)/) do
|
62
|
-
csv_emit(text: $1)
|
63
|
-
end
|
64
|
-
EOF
|
65
|
-
assert_equal("text\ntext1\ntext2\n", File.read(CSV))
|
31
|
+
# change Kernel#sleep to not really sleep!
|
32
|
+
Kernel.send(:alias_method, :old_sleep, :sleep)
|
33
|
+
Kernel.send(:define_method, :sleep) do |_duration|
|
34
|
+
slept = true
|
66
35
|
end
|
67
36
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
# non-strings
|
73
|
-
#
|
74
|
-
|
75
|
-
noko = Nokogiri::HTML(HTML).css("#main")
|
76
|
-
# node => text
|
77
|
-
assert_equal("text", s.send(:_normalize, noko.css("#element")))
|
78
|
-
# nodes => text joined with space
|
79
|
-
assert_equal("text1 text2", s.send(:_normalize, noko.css(".e")))
|
80
|
-
# array => text joined with pipe
|
81
|
-
assert_equal("1|2", s.send(:_normalize, [1,2]))
|
37
|
+
sinew.runtime_options.rate_limit = 1
|
38
|
+
sinew.dsl.get('http://httpbin.org/html')
|
39
|
+
sinew.dsl.get('http://httpbin.org/get')
|
40
|
+
assert(slept)
|
82
41
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
# untag
|
88
|
-
assert_equal("gub", s.send(:_normalize, "<tag>gub</tag>"))
|
89
|
-
# convert_accented_entities
|
90
|
-
assert_equal("a", s.send(:_normalize, "á"))
|
91
|
-
# unent
|
92
|
-
assert_equal("<>", s.send(:_normalize, "<>"))
|
93
|
-
# to_ascii
|
94
|
-
assert_equal("cafe", s.send(:_normalize, "caf\xc3\xa9"))
|
95
|
-
# squish
|
96
|
-
assert_equal("hello world", s.send(:_normalize, "\nhello \t \rworld"))
|
97
|
-
end
|
42
|
+
# restore old Kernel#sleep
|
43
|
+
Kernel.send(:alias_method, :sleep, :old_sleep)
|
44
|
+
Kernel.send(:undef_method, :old_sleep)
|
98
45
|
end
|
99
46
|
end
|
100
|
-
|
101
|
-
|
data/test/test_nokogiri_ext.rb
CHANGED
@@ -1,19 +1,18 @@
|
|
1
|
-
|
1
|
+
require_relative 'test_helper'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
3
|
+
class TestNokogiriExt < MiniTest::Test
|
4
|
+
def test_inner_text
|
5
|
+
assert_equal('hello world', noko.css('li').inner_text)
|
6
|
+
assert_equal('<li>hello</li> <li>world</li>', noko.css('ul').inner_html.squish)
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_just_me
|
10
|
+
assert_equal('a', noko.css('div').text_just_me.squish)
|
11
|
+
assert_equal('b b', noko.css('p').text_just_me.squish)
|
12
|
+
end
|
13
13
|
|
14
|
-
|
15
|
-
|
16
|
-
assert_equal("b b", @noko.css("p").text_just_me.squish)
|
17
|
-
end
|
14
|
+
def noko
|
15
|
+
@noko ||= Nokogiri::HTML(HTML).css('#nokogiri_ext')
|
18
16
|
end
|
17
|
+
protected :noko
|
19
18
|
end
|
data/test/test_output.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class TestOutput < MiniTest::Test
|
4
|
+
def test_output
|
5
|
+
sinew.dsl.csv_header(:n, :a, :p)
|
6
|
+
sinew.dsl.csv_emit(n: 'n1', a: 'a1')
|
7
|
+
sinew.dsl.csv_emit(n: 'n2', a: 'a2')
|
8
|
+
assert_equal 2, sinew.output.count
|
9
|
+
assert_equal "n,a,p\nn1,a1,\"\"\nn2,a2,\"\"\n", File.read(CSV)
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_implicit_header
|
13
|
+
sinew.dsl.csv_emit(name: 'bob', address: 'main')
|
14
|
+
assert_equal "name,address\nbob,main\n", File.read(CSV)
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_array_header
|
18
|
+
sinew.dsl.csv_header(%i[n a p])
|
19
|
+
sinew.dsl.csv_emit(n: 'n1', a: 'a1')
|
20
|
+
assert_equal "n,a,p\nn1,a1,\"\"\n", File.read(CSV)
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_filenames
|
24
|
+
sinew = Sinew::Main.new(recipe: 'gub.sinew')
|
25
|
+
assert_equal 'gub.csv', sinew.output.filename
|
26
|
+
sinew = Sinew::Main.new(recipe: 'gub')
|
27
|
+
assert_equal 'gub.csv', sinew.output.filename
|
28
|
+
sinew = Sinew::Main.new(recipe: '/somewhere/gub.sinew')
|
29
|
+
assert_equal '/somewhere/gub.csv', sinew.output.filename
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_normalization
|
33
|
+
output = Sinew::Output.new(nil)
|
34
|
+
|
35
|
+
#
|
36
|
+
# simple types
|
37
|
+
#
|
38
|
+
|
39
|
+
assert_equal '', output.send(:normalize, nil)
|
40
|
+
assert_equal '', output.send(:normalize, '')
|
41
|
+
assert_equal 'text', output.send(:normalize, 'text')
|
42
|
+
assert_equal '123', output.send(:normalize, 123)
|
43
|
+
assert_equal('1|2', output.send(:normalize, [ 1, 2 ]))
|
44
|
+
|
45
|
+
#
|
46
|
+
# nokogiri
|
47
|
+
#
|
48
|
+
|
49
|
+
noko = Nokogiri::HTML(HTML)
|
50
|
+
|
51
|
+
# node => text
|
52
|
+
assert_equal('text', output.send(:normalize, noko.css('#element')))
|
53
|
+
# nodes => text joined with space
|
54
|
+
assert_equal('text1 text2', output.send(:normalize, noko.css('.e')))
|
55
|
+
|
56
|
+
#
|
57
|
+
# string cleanups
|
58
|
+
#
|
59
|
+
|
60
|
+
# strip_html_tags
|
61
|
+
assert_equal('gub', output.send(:normalize, '<tag>gub</tag>'))
|
62
|
+
# convert_smart_punctuation
|
63
|
+
assert_equal('"gub"', output.send(:normalize, "\302\223gub\302\224"))
|
64
|
+
# convert_accented_html_entities
|
65
|
+
assert_equal('a', output.send(:normalize, 'á'))
|
66
|
+
# convert_miscellaneous_html_entities
|
67
|
+
assert_equal('<>', output.send(:normalize, '<>'))
|
68
|
+
# to_ascii
|
69
|
+
assert_equal('cafe', output.send(:normalize, "caf\xc3\xa9"))
|
70
|
+
# squish
|
71
|
+
assert_equal('hello world', output.send(:normalize, "\nhello \t \rworld"))
|
72
|
+
end
|
73
|
+
end
|