sinew 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,39 +1,45 @@
1
1
  <html>
2
- <head>
3
- <title>Title</title>
4
- <script>
5
- alert("alert 1");
6
- alert("alert 2");
7
- </script>
8
- </head>
9
-
10
- <body>
11
- <div id="main">
12
- <span class="class1"> text1 </span>
13
- <span class="class2"> text2 </span>
14
2
 
15
- <!-- for test_normalize -->
16
- <div id="element">text</div>
17
- <div class="e">text1</div>
18
- <div class="e">text2</div>
19
- </div>
20
-
21
- <div id="nokogiri_ext">
22
- <ul>
23
- <li>hello</li>
24
- <li>world</li>
25
- </ul>
26
- <div>
27
- a
28
- <p>b<span>c</span></p>
29
- <p>b<span>c</span></p>
30
- </div>
31
- </div>
3
+ <head>
4
+ <title>Title</title>
5
+ <script>
6
+ alert("alert 1");
7
+ alert("alert 2");
8
+ </script>
9
+ </head>
10
+
11
+ <body>
12
+ <div id="main">
13
+ <span class="class1"> text1 </span>
14
+ <span class="class2"> text2 </span>
32
15
 
33
- <div id="text_util">
34
- <!-- a comment that should be removed -->
35
- <div class="will_be_removed"/>
36
- <a class="will_be_preserved"/>
16
+ <!-- for test_normalize -->
17
+ <div id="element"> text </div>
18
+ <div class="e"> text1 </div>
19
+ <div class="e"> text2 </div>
20
+ </div>
21
+
22
+ <div id="nokogiri_ext">
23
+ <ul>
24
+ <li>hello</li>
25
+ <li>world</li>
26
+ </ul>
27
+ <div>
28
+ a
29
+ <p>b
30
+ <span>c</span>
31
+ </p>
32
+ <p>b
33
+ <span>c</span>
34
+ </p>
37
35
  </div>
38
- </body>
36
+ </div>
37
+
38
+ <div id="text_util">
39
+ <!-- a comment that should be removed -->
40
+ <div class="will_be_removed" />
41
+ <a class="will_be_preserved" />
42
+ </div>
43
+ </body>
44
+
39
45
  </html>
@@ -0,0 +1,69 @@
1
+ require_relative 'test_helper'
2
+
3
# Cache behaviour tests: each test issues the same request twice and then
# asserts that Sinew counted (and, when stubbed, actually sent) only one.
#
# Inherits from `Minitest::Test` rather than the legacy `MiniTest` alias, which
# newer minitest releases only define when MT_COMPAT is set.
class TestCache < Minitest::Test
  # A repeated GET with query params is sent once; the response is expected in
  # a file under TMP named after host, path and params, and not under a
  # "head" subdirectory.
  def test_get
    2.times do
      sinew.dsl.get('http://httpbin.org/get', c: 3, d: 4)
    end
    unless test_network?
      assert_requested :get, 'http://httpbin.org/get?c=3&d=4', times: 1
    end
    assert_equal 1, sinew.request_count
    assert_equal({ c: '3', d: '4' }, sinew.dsl.json[:args])
    assert File.exist?("#{TMP}/httpbin.org/get,c=3,d=4")
    refute File.exist?("#{TMP}/httpbin.org/head/get,c=3,d=4")
  end

  # A repeated POST is sent once and the echoed form fields are readable.
  def test_post
    2.times do
      sinew.dsl.post('http://httpbin.org/post', c: 5, d: 6)
    end
    unless test_network?
      assert_requested :post, 'http://httpbin.org/post', times: 1
    end
    assert_equal 1, sinew.request_count
    assert_equal({ c: '5', d: '6' }, sinew.dsl.json[:form])
  end

  # A redirect chain is followed once (each hop requested a single time),
  # counted as one request, and the final url is reported.
  def test_redirect
    2.times do
      sinew.dsl.get('http://httpbin.org/redirect/2')
    end
    unless test_network?
      assert_requested :get, 'http://httpbin.org/redirect/2', times: 1
      assert_requested :get, 'http://httpbin.org/redirect/1', times: 1
      assert_requested :get, 'http://httpbin.org/get', times: 1
    end
    assert_equal 1, sinew.request_count
    assert_equal 'http://httpbin.org/get', sinew.dsl.url
  end

  # A 500 response is reported on output and is not requested a second time.
  def test_error
    # retries must be off, otherwise they inflate the request counts below
    sinew.runtime_options.retries = 0
    assert_output(/failed with 500/) do
      2.times do
        sinew.dsl.get('http://httpbin.org/status/500')
      end
    end
    unless test_network?
      assert_requested :get, 'http://httpbin.org/status/500', times: 1
      assert_equal '500', sinew.dsl.raw
    end
    assert_equal 1, sinew.request_count
  end

  # A timed-out request is reported as "failed with 999", leaves 'timeout' as
  # the raw body, and is not requested a second time.
  def test_timeout
    # Only meaningful against the stubbed network. `skip` (as test_rate_limit
    # does) reports the test as skipped instead of silently passing.
    skip if test_network?

    # retries must be off, otherwise they inflate the request counts below
    sinew.runtime_options.retries = 0
    assert_output(/failed with 999/) do
      2.times do
        sinew.dsl.get('http://httpbin.org/delay/1')
      end
    end
    assert_requested :get, 'http://httpbin.org/delay/1', times: 1
    assert_equal 'timeout', sinew.dsl.raw
  end
end
@@ -0,0 +1,113 @@
1
+ require 'minitest/autorun'
2
+ require 'minitest/pride'
3
+ require 'webmock/minitest' unless ENV['SINEW_TEST_NETWORK']
4
+
5
+ # a hint to sinew, so that it'll do things like set rate limit to zero
6
+ ENV['SINEW_TEST'] = '1'
7
+
8
+ # Normally the Rakefile takes care of this, but it's handy to have it here when
9
+ # running tests individually.
10
+ $LOAD_PATH.unshift("#{__dir__}/../lib")
11
+ require 'sinew'
12
+
13
# Shared fixtures and helpers, made available to every test case by reopening
# Minitest::Test.
#
# Uses the `Minitest` constant rather than the legacy `MiniTest` alias, which
# newer minitest releases only define when MT_COMPAT is set.
class Minitest::Test
  # Scratch directory, wiped and recreated before every test (see #setup).
  TMP = '/tmp/_test_sinew'.freeze
  # Recipe file written by #run_recipe.
  RECIPE = "#{TMP}/test.sinew".freeze
  # Path of the csv the tests read back. NOTE: inside test classes this
  # constant takes precedence over the stdlib ::CSV class.
  CSV = "#{TMP}/test.csv".freeze
  # Contents of the HTML fixture that sits next to this file.
  HTML = File.read("#{__dir__}/test.html")

  # Starts each test with an empty TMP and, unless running against the real
  # network, with the httpbin-style stubs installed.
  def setup
    super

    # prepare TMP
    FileUtils.rm_rf(TMP)
    FileUtils.mkdir_p(TMP)

    stub_network unless test_network?
  end

  protected

  # Lazily built Sinew::Main that caches into TMP and reads RECIPE.
  def sinew
    @sinew ||= Sinew::Main.new(cache: TMP, quiet: true, recipe: RECIPE)
  end

  # Writes +recipe+ (recipe source text) to RECIPE and runs it.
  def run_recipe(recipe)
    File.write(RECIPE, recipe)
    sinew.run
  end

  # True when SINEW_TEST_NETWORK is set, i.e. requests are not stubbed.
  def test_network?
    !!ENV['SINEW_TEST_NETWORK']
  end

  # Mock requests, patterned on httpbin.
  #
  # NOTE(review): the redirect pattern only matches "redirect" and
  # "relative-redirect", so the "absolute" branch in #respond_redirect is not
  # reachable through these stubs - confirm whether that is intended.
  def stub_network
    stub_request(:get, %r{http://[^/]+/html}).to_return(method(:respond_html))
    stub_request(:get, %r{http://[^/]+/get\b}).to_return(method(:respond_echo))
    stub_request(:post, %r{http://[^/]+/post\b}).to_return(method(:respond_echo))
    stub_request(:get, %r{http://[^/]+/status/\d+}).to_return(method(:respond_status))
    stub_request(:get, %r{http://[^/]+/(relative-)?redirect/\d+}).to_return(method(:respond_redirect))
    stub_request(:get, %r{http://[^/]+/delay/\d+}).to_timeout
  end

  #
  # respond_xxx helpers
  #

  # Fixed HTML body for the /html stub.
  def respond_html(_request)
    # this html was carefully chosen to match httpbin.org/html
    html = <<~EOF
      <body>
        <h1>Herman Melville - Moby-Dick</h1>
      </body>
    EOF
    { body: html }
  end

  # Echoes the request back as JSON: always :headers and :args, plus :form
  # for url-encoded bodies and :json for JSON bodies.
  def respond_echo(request)
    # Prefix match so a parameterised header such as
    # "application/json; charset=utf-8" is still recognised.
    content_type = request.headers['Content-Type'].to_s

    echo = {}
    echo[:headers] = request.headers
    echo[:args] = request.uri.query ? first_values(request.uri.query) : {}
    if content_type.start_with?('application/x-www-form-urlencoded')
      echo[:form] = first_values(request.body)
    end
    if content_type.start_with?('application/json')
      echo[:json] = JSON.parse(request.body)
    end

    {
      headers: { 'Content-Type' => 'application/json' },
      body: echo.to_json,
    }
  end

  # Parses a url-encoded string into a hash, keeping only the first value of
  # each key (CGI.parse returns an array of values per key).
  def first_values(encoded)
    CGI.parse(encoded).map { |key, values| [key, values.first] }.to_h
  end

  # Responds with the status code taken from the last url segment, and uses
  # that same number as the body.
  def respond_status(request)
    status = request.uri.to_s.split('/').last.to_i
    { body: status.to_s, status: status }
  end

  # Responds with a 302 to the next hop: "/<path>/<n-1>", or "/get" once the
  # counter in the last url segment reaches 1.
  def respond_redirect(request)
    path, remaining = request.uri.to_s.split('/').last(2)
    remaining = remaining.to_i
    location = remaining == 1 ? '/get' : "/#{path}/#{remaining - 1}"
    location = "http://example#{location}" if path.include?('absolute')
    { status: 302, headers: { 'Location' => location } }
  end
end
@@ -1,101 +1,46 @@
1
- # encoding: UTF-8
2
-
3
- require "helper"
4
-
5
- module Sinew
6
- class TestMain < TestCase
7
- RECIPE = "#{TMP}/test.sinew"
8
- CSV = "#{TMP}/test.csv"
9
-
10
- def setup
11
- # create TMP dir
12
- FileUtils.rm_rf(TMP) if File.exists?(TMP)
13
- FileUtils.mkdir_p(TMP)
14
- end
15
-
16
- def run_recipe(recipe)
17
- File.write(RECIPE, recipe)
18
- Util.stub(:run, mock_curl_200) do
19
- Sinew::Main.new(cache: TMP, file: RECIPE, quiet: true)
1
+ require_relative 'test_helper'
2
+
3
+ class TestMain < MiniTest::Test
4
+ def test_noko
5
+ run_recipe <<~'EOF'
6
+ get 'http://httpbin.org/html'
7
+ noko.css("h1").each do |h1|
8
+ csv_emit(h1: h1.text)
20
9
  end
21
- end
10
+ EOF
11
+ assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
12
+ end
22
13
 
23
- def test_noko
24
- run_recipe <<'EOF'
25
- get "http://www.example.com"
26
- csv_header(:class, :text)
27
- noko.css("#main span").each do |span|
28
- csv_emit(class: span[:class], text: span.text)
29
- end
30
- EOF
31
- assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
32
- end
14
+ def test_raw
15
+ run_recipe <<~'EOF'
16
+ get "http://httpbin.org/html"
17
+ raw.scan(/<h1>([^<]+)/) do
18
+ csv_emit(h1: $1)
19
+ end
20
+ EOF
21
+ assert_equal("h1\nHerman Melville - Moby-Dick\n", File.read(CSV))
22
+ end
33
23
 
34
- def test_raw
35
- # test javascript, which is only crawlable with raw
36
- run_recipe <<'EOF'
37
- get "http://www.example.com"
38
- raw.scan(/alert\("([^"]+)/) do
39
- csv_emit(alert: $1)
40
- end
41
- EOF
42
- assert_equal("alert\nalert 1\nalert 2\n", File.read(CSV))
43
- end
24
+ def test_rate_limit
25
+ # true network requests call sleep for timeouts, which interferes with our
26
+ # instrumentation of Kernel#sleep
27
+ skip if test_network?
44
28
 
45
- def test_html
46
- # note the cleaned up whitespace
47
- run_recipe <<'EOF'
48
- get "http://www.example.com"
49
- csv_header(:class, :text)
50
- html.scan(/<span class="(\w+)">(\w+)/) do
51
- csv_emit(class: $1, text: $2)
52
- end
53
- EOF
54
- assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
55
- end
29
+ slept = false
56
30
 
57
- def test_clean
58
- # note the removed attributes from span
59
- run_recipe <<'EOF'
60
- get "http://www.example.com"
61
- clean.scan(/<span>(text\d)/) do
62
- csv_emit(text: $1)
63
- end
64
- EOF
65
- assert_equal("text\ntext1\ntext2\n", File.read(CSV))
31
+ # change Kernel#sleep to not really sleep!
32
+ Kernel.send(:alias_method, :old_sleep, :sleep)
33
+ Kernel.send(:define_method, :sleep) do |_duration|
34
+ slept = true
66
35
  end
67
36
 
68
- def test_normalize
69
- s = Sinew::Main.new(test: true)
70
-
71
- #
72
- # non-strings
73
- #
74
-
75
- noko = Nokogiri::HTML(HTML).css("#main")
76
- # node => text
77
- assert_equal("text", s.send(:_normalize, noko.css("#element")))
78
- # nodes => text joined with space
79
- assert_equal("text1 text2", s.send(:_normalize, noko.css(".e")))
80
- # array => text joined with pipe
81
- assert_equal("1|2", s.send(:_normalize, [1,2]))
37
+ sinew.runtime_options.rate_limit = 1
38
+ sinew.dsl.get('http://httpbin.org/html')
39
+ sinew.dsl.get('http://httpbin.org/get')
40
+ assert(slept)
82
41
 
83
- #
84
- # string cleanups
85
- #
86
-
87
- # untag
88
- assert_equal("gub", s.send(:_normalize, "<tag>gub</tag>"))
89
- # convert_accented_entities
90
- assert_equal("a", s.send(:_normalize, "&aacute;"))
91
- # unent
92
- assert_equal("<>", s.send(:_normalize, "&lt;&gt;"))
93
- # to_ascii
94
- assert_equal("cafe", s.send(:_normalize, "caf\xc3\xa9"))
95
- # squish
96
- assert_equal("hello world", s.send(:_normalize, "\nhello \t \rworld"))
97
- end
42
+ # restore old Kernel#sleep
43
+ Kernel.send(:alias_method, :sleep, :old_sleep)
44
+ Kernel.send(:undef_method, :old_sleep)
98
45
  end
99
46
  end
100
-
101
-
@@ -1,19 +1,18 @@
1
- require "helper"
1
+ require_relative 'test_helper'
2
2
 
3
- module Sinew
4
- class TestNokogiriExt < TestCase
5
- def setup
6
- @noko = Nokogiri::HTML(HTML).css("#nokogiri_ext")
7
- end
8
-
9
- def test_inner_text
10
- assert_equal("hello world", @noko.css("li").inner_text)
11
- assert_equal("<li>hello</li> <li>world</li>", @noko.css("ul").inner_html.squish)
12
- end
3
+ class TestNokogiriExt < MiniTest::Test
4
+ def test_inner_text
5
+ assert_equal('hello world', noko.css('li').inner_text)
6
+ assert_equal('<li>hello</li> <li>world</li>', noko.css('ul').inner_html.squish)
7
+ end
8
+
9
+ def test_just_me
10
+ assert_equal('a', noko.css('div').text_just_me.squish)
11
+ assert_equal('b b', noko.css('p').text_just_me.squish)
12
+ end
13
13
 
14
- def test_just_me
15
- assert_equal("a", @noko.css("div").text_just_me.squish)
16
- assert_equal("b b", @noko.css("p").text_just_me.squish)
17
- end
14
+ def noko
15
+ @noko ||= Nokogiri::HTML(HTML).css('#nokogiri_ext')
18
16
  end
17
+ protected :noko
19
18
  end
@@ -0,0 +1,73 @@
1
+ require_relative 'test_helper'
2
+
3
# Tests for csv output: header handling, output filenames and value
# normalization.
#
# Inherits from `Minitest::Test` rather than the legacy `MiniTest` alias, which
# newer minitest releases only define when MT_COMPAT is set.
class TestOutput < Minitest::Test
  # With an explicit header, columns missing from a row are written as "".
  def test_output
    sinew.dsl.csv_header(:n, :a, :p)
    sinew.dsl.csv_emit(n: 'n1', a: 'a1')
    sinew.dsl.csv_emit(n: 'n2', a: 'a2')
    assert_equal 2, sinew.output.count
    assert_equal "n,a,p\nn1,a1,\"\"\nn2,a2,\"\"\n", File.read(CSV)
  end

  # Without csv_header, the header comes from the keys of the emitted row.
  def test_implicit_header
    sinew.dsl.csv_emit(name: 'bob', address: 'main')
    assert_equal "name,address\nbob,main\n", File.read(CSV)
  end

  # csv_header also accepts a single array of column names.
  def test_array_header
    sinew.dsl.csv_header(%i[n a p])
    sinew.dsl.csv_emit(n: 'n1', a: 'a1')
    assert_equal "n,a,p\nn1,a1,\"\"\n", File.read(CSV)
  end

  # The csv filename is the recipe path with its extension (if any) replaced
  # by ".csv".
  def test_filenames
    # Locals are deliberately not called `sinew`, so they do not shadow the
    # shared `sinew` helper method.
    expectations = {
      'gub.sinew' => 'gub.csv',
      'gub' => 'gub.csv',
      '/somewhere/gub.sinew' => '/somewhere/gub.csv',
    }
    expectations.each do |recipe, csv|
      main = Sinew::Main.new(recipe: recipe)
      assert_equal csv, main.output.filename
    end
  end

  # Covers Output#normalize for plain values, nokogiri results and the string
  # cleanups it applies.
  def test_normalization
    output = Sinew::Output.new(nil)
    # normalize is called through `send`, presumably because it is not public
    normalize = ->(value) { output.send(:normalize, value) }

    #
    # simple types
    #

    assert_equal '', normalize.call(nil)
    assert_equal '', normalize.call('')
    assert_equal 'text', normalize.call('text')
    assert_equal '123', normalize.call(123)
    assert_equal '1|2', normalize.call([1, 2])

    #
    # nokogiri
    #

    noko = Nokogiri::HTML(HTML)

    # node => text
    assert_equal 'text', normalize.call(noko.css('#element'))
    # nodes => text joined with space
    assert_equal 'text1 text2', normalize.call(noko.css('.e'))

    #
    # string cleanups
    #

    # strip_html_tags
    assert_equal 'gub', normalize.call('<tag>gub</tag>')
    # convert_smart_punctuation
    assert_equal '"gub"', normalize.call("\302\223gub\302\224")
    # convert_accented_html_entities
    assert_equal 'a', normalize.call('&aacute;')
    # convert_miscellaneous_html_entities
    assert_equal '<>', normalize.call('&lt;&gt;')
    # to_ascii
    assert_equal 'cafe', normalize.call("caf\xc3\xa9")
    # squish
    assert_equal 'hello world', normalize.call("\nhello \t \rworld")
  end
end