sinew 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,39 +1,45 @@
1
1
  <html>
2
- <head>
3
- <title>Title</title>
4
- <script>
5
- alert("alert 1");
6
- alert("alert 2");
7
- </script>
8
- </head>
9
-
10
- <body>
11
- <div id="main">
12
- <span class="class1"> text1 </span>
13
- <span class="class2"> text2 </span>
14
2
 
15
- <!-- for test_normalize -->
16
- <div id="element">text</div>
17
- <div class="e">text1</div>
18
- <div class="e">text2</div>
19
- </div>
20
-
21
- <div id="nokogiri_ext">
22
- <ul>
23
- <li>hello</li>
24
- <li>world</li>
25
- </ul>
26
- <div>
27
- a
28
- <p>b<span>c</span></p>
29
- <p>b<span>c</span></p>
30
- </div>
31
- </div>
3
+ <head>
4
+ <title>Title</title>
5
+ <script>
6
+ alert("alert 1");
7
+ alert("alert 2");
8
+ </script>
9
+ </head>
10
+
11
+ <body>
12
+ <div id="main">
13
+ <span class="class1"> text1 </span>
14
+ <span class="class2"> text2 </span>
32
15
 
33
- <div id="text_util">
34
- <!-- a comment that should be removed -->
35
- <div class="will_be_removed"/>
36
- <a class="will_be_preserved"/>
16
+ <!-- for test_normalize -->
17
+ <div id="element"> text </div>
18
+ <div class="e"> text1 </div>
19
+ <div class="e"> text2 </div>
20
+ </div>
21
+
22
+ <div id="nokogiri_ext">
23
+ <ul>
24
+ <li>hello</li>
25
+ <li>world</li>
26
+ </ul>
27
+ <div>
28
+ a
29
+ <p>b
30
+ <span>c</span>
31
+ </p>
32
+ <p>b
33
+ <span>c</span>
34
+ </p>
37
35
  </div>
38
- </body>
36
+ </div>
37
+
38
+ <div id="text_util">
39
+ <!-- a comment that should be removed -->
40
+ <div class="will_be_removed" />
41
+ <a class="will_be_preserved" />
42
+ </div>
43
+ </body>
44
+
39
45
  </html>
@@ -0,0 +1,69 @@
1
+ require_relative 'test_helper'
2
+
3
+ class TestCache < MiniTest::Test
4
+ def test_get
5
+ 2.times do
6
+ sinew.dsl.get('http://httpbin.org/get', c: 3, d: 4)
7
+ end
8
+ if !test_network?
9
+ assert_requested :get, 'http://httpbin.org/get?c=3&d=4', times: 1
10
+ end
11
+ assert_equal 1, sinew.request_count
12
+ assert_equal({ c: '3', d: '4' }, sinew.dsl.json[:args])
13
+ assert File.exist?("#{TMP}/httpbin.org/get,c=3,d=4")
14
+ assert !File.exist?("#{TMP}/httpbin.org/head/get,c=3,d=4")
15
+ end
16
+
17
+ def test_post
18
+ 2.times do
19
+ sinew.dsl.post('http://httpbin.org/post', c: 5, d: 6)
20
+ end
21
+ if !test_network?
22
+ assert_requested :post, 'http://httpbin.org/post', times: 1
23
+ end
24
+ assert_equal 1, sinew.request_count
25
+ assert_equal({ c: '5', d: '6' }, sinew.dsl.json[:form])
26
+ end
27
+
28
+ def test_redirect
29
+ 2.times do
30
+ sinew.dsl.get('http://httpbin.org/redirect/2')
31
+ end
32
+ if !test_network?
33
+ assert_requested :get, 'http://httpbin.org/redirect/2', times: 1
34
+ assert_requested :get, 'http://httpbin.org/redirect/1', times: 1
35
+ assert_requested :get, 'http://httpbin.org/get', times: 1
36
+ end
37
+ assert_equal 1, sinew.request_count
38
+ assert_equal 'http://httpbin.org/get', sinew.dsl.url
39
+ end
40
+
41
+ def test_error
42
+ # gotta set this or the retries mess up our request counts
43
+ sinew.runtime_options.retries = 0
44
+ assert_output(/failed with 500/) do
45
+ 2.times do
46
+ sinew.dsl.get('http://httpbin.org/status/500')
47
+ end
48
+ end
49
+ if !test_network?
50
+ assert_requested :get, 'http://httpbin.org/status/500', times: 1
51
+ assert_equal '500', sinew.dsl.raw
52
+ end
53
+ assert_equal 1, sinew.request_count
54
+ end
55
+
56
+ def test_timeout
57
+ return if test_network?
58
+
59
+ # gotta set this or the retries mess up our request counts
60
+ sinew.runtime_options.retries = 0
61
+ assert_output(/failed with 999/) do
62
+ 2.times do
63
+ sinew.dsl.get('http://httpbin.org/delay/1')
64
+ end
65
+ end
66
+ assert_requested :get, 'http://httpbin.org/delay/1', times: 1
67
+ assert_equal 'timeout', sinew.dsl.raw
68
+ end
69
+ end
@@ -0,0 +1,113 @@
1
+ require 'minitest/autorun'
2
+ require 'minitest/pride'
3
+ require 'webmock/minitest' unless ENV['SINEW_TEST_NETWORK']
4
+
5
+ # a hint to sinew, so that it'll do things like set rate limit to zero
6
+ ENV['SINEW_TEST'] = '1'
7
+
8
+ # Normally the Rakefile takes care of this, but it's handy to have it here when
9
+ # running tests individually.
10
+ $LOAD_PATH.unshift("#{__dir__}/../lib")
11
+ require 'sinew'
12
+
13
+ class MiniTest::Test
14
+ TMP = '/tmp/_test_sinew'.freeze
15
+ RECIPE = "#{TMP}/test.sinew".freeze
16
+ CSV = "#{TMP}/test.csv".freeze
17
+ HTML = File.read("#{__dir__}/test.html")
18
+
19
+ def setup
20
+ super
21
+
22
+ # prepare TMP
23
+ FileUtils.rm_rf(TMP)
24
+ FileUtils.mkdir_p(TMP)
25
+
26
+ stub_network unless test_network?
27
+ end
28
+
29
+ def sinew
30
+ @sinew ||= Sinew::Main.new(cache: TMP, quiet: true, recipe: RECIPE)
31
+ end
32
+ protected :sinew
33
+
34
+ def run_recipe(recipe)
35
+ File.write(RECIPE, recipe)
36
+ sinew.run
37
+ end
38
+ protected :run_recipe
39
+
40
+ def test_network?
41
+ !!ENV['SINEW_TEST_NETWORK']
42
+ end
43
+ protected :test_network?
44
+
45
+ # mock requests, patterned on httpbin
46
+ def stub_network
47
+ stub_request(:get, %r{http://[^/]+/html}).to_return(method(:respond_html))
48
+ stub_request(:get, %r{http://[^/]+/get\b}).to_return(method(:respond_echo))
49
+ stub_request(:post, %r{http://[^/]+/post\b}).to_return(method(:respond_echo))
50
+ stub_request(:get, %r{http://[^/]+/status/\d+}).to_return(method(:respond_status))
51
+ stub_request(:get, %r{http://[^/]+/(relative-)?redirect/\d+}).to_return(method(:respond_redirect))
52
+ stub_request(:get, %r{http://[^/]+/delay/\d+}).to_timeout
53
+ end
54
+ protected :stub_network
55
+
56
+ #
57
+ # respond_xxx helpers
58
+ #
59
+
60
+ def respond_html(_request)
61
+ # this html was carefully chosen to match httpbin.org/html
62
+ html = <<~EOF
63
+ <body>
64
+ <h1>Herman Melville - Moby-Dick</h1>
65
+ </body>
66
+ EOF
67
+ { body: html }
68
+ end
69
+ protected :respond_html
70
+
71
+ def respond_echo(request)
72
+ response = {}
73
+ response[:headers] = request.headers
74
+
75
+ # args
76
+ response[:args] = if request.uri.query
77
+ CGI.parse(request.uri.query).map { |k, v| [k, v.first] }.to_h
78
+ else
79
+ {}
80
+ end
81
+
82
+ # form
83
+ if request.headers['Content-Type'] == 'application/x-www-form-urlencoded'
84
+ response[:form] = CGI.parse(request.body).map { |k, v| [k, v.first] }.to_h
85
+ end
86
+
87
+ # json
88
+ if request.headers['Content-Type'] == 'application/json'
89
+ response[:json] = JSON.parse(request.body)
90
+ end
91
+
92
+ {
93
+ headers: { 'Content-Type' => 'application/json' },
94
+ body: response.to_json,
95
+ }
96
+ end
97
+ protected :respond_echo
98
+
99
+ def respond_status(request)
100
+ status = request.uri.to_s.split('/').last.to_i
101
+ { body: status.to_s, status: status }
102
+ end
103
+ protected :respond_status
104
+
105
+ def respond_redirect(request)
106
+ parts = request.uri.to_s.split('/')
107
+ path, count = parts[-2], parts[-1].to_i
108
+ url = count == 1 ? '/get' : "/#{path}/#{count - 1}"
109
+ url = "http://example#{url}" if path =~ /absolute/
110
+ { status: 302, headers: { 'Location' => url } }
111
+ end
112
+ protected :respond_redirect
113
+ end
@@ -1,101 +1,46 @@
1
- # encoding: UTF-8
2
-
3
- require "helper"
4
-
5
- module Sinew
6
- class TestMain < TestCase
7
- RECIPE = "#{TMP}/test.sinew"
8
- CSV = "#{TMP}/test.csv"
9
-
10
- def setup
11
- # create TMP dir
12
- FileUtils.rm_rf(TMP) if File.exists?(TMP)
13
- FileUtils.mkdir_p(TMP)
14
- end
15
-
16
- def run_recipe(recipe)
17
- File.write(RECIPE, recipe)
18
- Util.stub(:run, mock_curl_200) do
19
- Sinew::Main.new(cache: TMP, file: RECIPE, quiet: true)
1
+ require_relative 'test_helper'
2
+
3
+ class TestMain < MiniTest::Test
4
+ def test_noko
5
+ run_recipe <<~'EOF'
6
+ get 'http://httpbin.org/html'
7
+ noko.css("h1").each do |h1|
8
+ csv_emit(h1: h1.text)
20
9
  end
21
- end
10
+ EOF
11
+ assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
12
+ end
22
13
 
23
- def test_noko
24
- run_recipe <<'EOF'
25
- get "http://www.example.com"
26
- csv_header(:class, :text)
27
- noko.css("#main span").each do |span|
28
- csv_emit(class: span[:class], text: span.text)
29
- end
30
- EOF
31
- assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
32
- end
14
+ def test_raw
15
+ run_recipe <<~'EOF'
16
+ get "http://httpbin.org/html"
17
+ raw.scan(/<h1>([^<]+)/) do
18
+ csv_emit(h1: $1)
19
+ end
20
+ EOF
21
+ assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
22
+ end
33
23
 
34
- def test_raw
35
- # test javascript, which is only crawlable with raw
36
- run_recipe <<'EOF'
37
- get "http://www.example.com"
38
- raw.scan(/alert\("([^"]+)/) do
39
- csv_emit(alert: $1)
40
- end
41
- EOF
42
- assert_equal("alert\nalert 1\nalert 2\n", File.read(CSV))
43
- end
24
+ def test_rate_limit
25
+ # true network requests call sleep for timeouts, which interferes with our
26
+ # instrumentation of Kernel#sleep
27
+ skip if test_network?
44
28
 
45
- def test_html
46
- # note the cleaned up whitespace
47
- run_recipe <<'EOF'
48
- get "http://www.example.com"
49
- csv_header(:class, :text)
50
- html.scan(/<span class="(\w+)">(\w+)/) do
51
- csv_emit(class: $1, text: $2)
52
- end
53
- EOF
54
- assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
55
- end
29
+ slept = false
56
30
 
57
- def test_clean
58
- # note the removed attributes from span
59
- run_recipe <<'EOF'
60
- get "http://www.example.com"
61
- clean.scan(/<span>(text\d)/) do
62
- csv_emit(text: $1)
63
- end
64
- EOF
65
- assert_equal("text\ntext1\ntext2\n", File.read(CSV))
31
+ # change Kernel#sleep to not really sleep!
32
+ Kernel.send(:alias_method, :old_sleep, :sleep)
33
+ Kernel.send(:define_method, :sleep) do |_duration|
34
+ slept = true
66
35
  end
67
36
 
68
- def test_normalize
69
- s = Sinew::Main.new(test: true)
70
-
71
- #
72
- # non-strings
73
- #
74
-
75
- noko = Nokogiri::HTML(HTML).css("#main")
76
- # node => text
77
- assert_equal("text", s.send(:_normalize, noko.css("#element")))
78
- # nodes => text joined with space
79
- assert_equal("text1 text2", s.send(:_normalize, noko.css(".e")))
80
- # array => text joined with pipe
81
- assert_equal("1|2", s.send(:_normalize, [1,2]))
37
+ sinew.runtime_options.rate_limit = 1
38
+ sinew.dsl.get('http://httpbin.org/html')
39
+ sinew.dsl.get('http://httpbin.org/get')
40
+ assert(slept)
82
41
 
83
- #
84
- # string cleanups
85
- #
86
-
87
- # untag
88
- assert_equal("gub", s.send(:_normalize, "<tag>gub</tag>"))
89
- # convert_accented_entities
90
- assert_equal("a", s.send(:_normalize, "&aacute;"))
91
- # unent
92
- assert_equal("<>", s.send(:_normalize, "&lt;&gt;"))
93
- # to_ascii
94
- assert_equal("cafe", s.send(:_normalize, "caf\xc3\xa9"))
95
- # squish
96
- assert_equal("hello world", s.send(:_normalize, "\nhello \t \rworld"))
97
- end
42
+ # restore old Kernel#sleep
43
+ Kernel.send(:alias_method, :sleep, :old_sleep)
44
+ Kernel.send(:undef_method, :old_sleep)
98
45
  end
99
46
  end
100
-
101
-
@@ -1,19 +1,18 @@
1
- require "helper"
1
+ require_relative 'test_helper'
2
2
 
3
- module Sinew
4
- class TestNokogiriExt < TestCase
5
- def setup
6
- @noko = Nokogiri::HTML(HTML).css("#nokogiri_ext")
7
- end
8
-
9
- def test_inner_text
10
- assert_equal("hello world", @noko.css("li").inner_text)
11
- assert_equal("<li>hello</li> <li>world</li>", @noko.css("ul").inner_html.squish)
12
- end
3
+ class TestNokogiriExt < MiniTest::Test
4
+ def test_inner_text
5
+ assert_equal('hello world', noko.css('li').inner_text)
6
+ assert_equal('<li>hello</li> <li>world</li>', noko.css('ul').inner_html.squish)
7
+ end
8
+
9
+ def test_just_me
10
+ assert_equal('a', noko.css('div').text_just_me.squish)
11
+ assert_equal('b b', noko.css('p').text_just_me.squish)
12
+ end
13
13
 
14
- def test_just_me
15
- assert_equal("a", @noko.css("div").text_just_me.squish)
16
- assert_equal("b b", @noko.css("p").text_just_me.squish)
17
- end
14
+ def noko
15
+ @noko ||= Nokogiri::HTML(HTML).css('#nokogiri_ext')
18
16
  end
17
+ protected :noko
19
18
  end
@@ -0,0 +1,73 @@
1
+ require_relative 'test_helper'
2
+
3
+ class TestOutput < MiniTest::Test
4
+ def test_output
5
+ sinew.dsl.csv_header(:n, :a, :p)
6
+ sinew.dsl.csv_emit(n: 'n1', a: 'a1')
7
+ sinew.dsl.csv_emit(n: 'n2', a: 'a2')
8
+ assert_equal 2, sinew.output.count
9
+ assert_equal "n,a,p\nn1,a1,\"\"\nn2,a2,\"\"\n", File.read(CSV)
10
+ end
11
+
12
+ def test_implicit_header
13
+ sinew.dsl.csv_emit(name: 'bob', address: 'main')
14
+ assert_equal "name,address\nbob,main\n", File.read(CSV)
15
+ end
16
+
17
+ def test_array_header
18
+ sinew.dsl.csv_header(%i[n a p])
19
+ sinew.dsl.csv_emit(n: 'n1', a: 'a1')
20
+ assert_equal "n,a,p\nn1,a1,\"\"\n", File.read(CSV)
21
+ end
22
+
23
+ def test_filenames
24
+ sinew = Sinew::Main.new(recipe: 'gub.sinew')
25
+ assert_equal 'gub.csv', sinew.output.filename
26
+ sinew = Sinew::Main.new(recipe: 'gub')
27
+ assert_equal 'gub.csv', sinew.output.filename
28
+ sinew = Sinew::Main.new(recipe: '/somewhere/gub.sinew')
29
+ assert_equal '/somewhere/gub.csv', sinew.output.filename
30
+ end
31
+
32
+ def test_normalization
33
+ output = Sinew::Output.new(nil)
34
+
35
+ #
36
+ # simple types
37
+ #
38
+
39
+ assert_equal '', output.send(:normalize, nil)
40
+ assert_equal '', output.send(:normalize, '')
41
+ assert_equal 'text', output.send(:normalize, 'text')
42
+ assert_equal '123', output.send(:normalize, 123)
43
+ assert_equal('1|2', output.send(:normalize, [ 1, 2 ]))
44
+
45
+ #
46
+ # nokogiri
47
+ #
48
+
49
+ noko = Nokogiri::HTML(HTML)
50
+
51
+ # node => text
52
+ assert_equal('text', output.send(:normalize, noko.css('#element')))
53
+ # nodes => text joined with space
54
+ assert_equal('text1 text2', output.send(:normalize, noko.css('.e')))
55
+
56
+ #
57
+ # string cleanups
58
+ #
59
+
60
+ # strip_html_tags
61
+ assert_equal('gub', output.send(:normalize, '<tag>gub</tag>'))
62
+ # convert_smart_punctuation
63
+ assert_equal('"gub"', output.send(:normalize, "\302\223gub\302\224"))
64
+ # convert_accented_html_entities
65
+ assert_equal('a', output.send(:normalize, '&aacute;'))
66
+ # convert_miscellaneous_html_entities
67
+ assert_equal('<>', output.send(:normalize, '&lt;&gt;'))
68
+ # to_ascii
69
+ assert_equal('cafe', output.send(:normalize, "caf\xc3\xa9"))
70
+ # squish
71
+ assert_equal('hello world', output.send(:normalize, "\nhello \t \rworld"))
72
+ end
73
+ end