scruber 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,46 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
4
+
5
+ let(:cookie_jar_string) { "---\n- !ruby/object:HTTP::Cookie\n name: feed_flow\n value: top\n domain: example.com\n for_domain: false\n path: \"/\"\n secure: false\n httponly: true\n expires: \n max_age: 26784000\n created_at: #{Time.now.strftime('%Y-%m-%d')} 16:46:15.443984000 +03:00\n accessed_at: #{Time.now.strftime('%Y-%m-%d')} 16:47:07.047296000 +03:00\n" }
6
+
7
+ describe "initialize" do
8
+ let(:agent) do
9
+ described_class.new id: 1,
10
+ user_agent: 'Scruber',
11
+ proxy_id: 1,
12
+ headers: {'a' => 1},
13
+ cookie_jar: cookie_jar_string,
14
+ disable_proxy: true
15
+ end
16
+
17
+ it "set values" do
18
+ expect(agent.id).to eq(1)
19
+ expect(agent.user_agent).to eq('Scruber')
20
+ expect(agent.proxy_id).to eq(1)
21
+ expect(agent.headers).to eq({'a' => 1})
22
+ expect(agent.cookie_jar).to eq(cookie_jar_string)
23
+ expect(agent.disable_proxy).to eq(true)
24
+ end
25
+
26
+ it "load cookies" do
27
+ expect(agent.cookie_for('http://example.com')).to eq('feed_flow=top')
28
+ expect(agent.cookie_for(URI('http://example.com'))).to eq('feed_flow=top')
29
+ end
30
+
31
+ it "serialize cookie" do
32
+ expect(agent.serialize_cookies).to eq(cookie_jar_string)
33
+ end
34
+
35
+ it "parse cookies from page" do
36
+ page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-18 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
37
+ agent.parse_cookies_from_page!(page)
38
+ expect(agent.cookie_for('http://example.com')).to eq('__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; feed_flow=top')
39
+ end
40
+
41
+ end
42
+
43
+ it "should be accessible from scraper" do
44
+ expect { Scruber.run(:sample) { FetcherAgent } }.not_to raise_error
45
+ end
46
+ end
@@ -0,0 +1,45 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::Memory do
4
+
5
+ let(:cookie_jar_string) { "---\n- !ruby/object:HTTP::Cookie\n name: feed_flow\n value: top\n domain: example.com\n for_domain: false\n path: \"/\"\n secure: false\n httponly: true\n expires: \n max_age: 26784000\n created_at: #{Time.now.strftime('%Y-%m-%d')} 16:46:15.443984000 +03:00\n accessed_at: #{Time.now.strftime('%Y-%m-%d')} 16:47:07.047296000 +03:00\n" }
6
+
7
+ let(:agent) do
8
+ described_class.new user_agent: 'Scruber',
9
+ proxy_id: 1,
10
+ headers: {'a' => 1},
11
+ cookie_jar: cookie_jar_string,
12
+ disable_proxy: true
13
+ end
14
+
15
+ describe "initialize" do
16
+ it "should generate id" do
17
+ expect(agent.id).not_to be_nil
18
+ end
19
+ end
20
+
21
+ describe "save" do
22
+ it "should be stored to memory collection" do
23
+ agent.save
24
+ expect(Scruber::Helpers::FetcherAgentAdapters::Memory._collection[agent.id]).to eq(agent)
25
+ end
26
+ end
27
+
28
+ describe "delete" do
29
+ it "should be deleted from memory collection" do
30
+ agent.save
31
+ expect(Scruber::Helpers::FetcherAgentAdapters::Memory._collection[agent.id]).to eq(agent)
32
+ agent.delete
33
+ expect(Scruber::Helpers::FetcherAgentAdapters::Memory._collection[agent.id]).to be_nil
34
+ end
35
+ end
36
+
37
+ describe "class methods" do
38
+ describe "find" do
39
+ it "should find agent by id" do
40
+ agent.save
41
+ expect(Scruber::Helpers::FetcherAgentAdapters::Memory.find(agent.id)).to eq(agent)
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,21 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::ProxyRotator::Proxy do
4
+
5
+ describe "proxy" do
6
+ let(:proxy) { described_class.new('127.0.0.1:3000', :user=>'user', :password=>'password', :probability=>1, :type=>'socks') }
7
+
8
+ it 'should have valid id' do
9
+ proxy = described_class.new('127.0.0.1:3000')
10
+ expect(proxy.id).to eq('127.0.0.1:3000')
11
+ proxy = described_class.new('127.0.0.1', port: 3001)
12
+ expect(proxy.id).to eq('127.0.0.1:3001')
13
+ end
14
+
15
+ it 'should raise error if port or address not given' do
16
+ expect{ described_class.new('127.0.0.1') }.to raise_error(Scruber::ArgumentError)
17
+ expect{ described_class.new('', port: 3000) }.to raise_error(Scruber::ArgumentError)
18
+ end
19
+ end
20
+
21
+ end
@@ -0,0 +1,118 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::ProxyRotator do
4
+
5
+ describe "configurable" do
6
+ before do
7
+ described_class.configure do
8
+ set_mode :random
9
+
10
+ add "127.0.0.1:3000"
11
+ add "127.0.0.1", port: 3001
12
+ end
13
+ end
14
+
15
+ it "adds 2 proxies to list" do
16
+ expect(described_class.configuration.proxies.count).to eq(2)
17
+ end
18
+
19
+ it "clean proxies list" do
20
+ described_class.configure do
21
+ clean
22
+ end
23
+ expect(described_class.configuration.proxies.count).to eq(0)
24
+ end
25
+
26
+ it "should have random mode by default" do
27
+ expect(described_class.configuration.mode).to eq(:random)
28
+ end
29
+
30
+ it "should raise error when set incorrect mode" do
31
+ expect{ described_class.configure { set_mode :bad } }.to raise_error(Scruber::ArgumentError)
32
+ end
33
+
34
+ it "should build proxy_keys" do
35
+ described_class.configure do
36
+ clean
37
+ set_mode :round_robin
38
+
39
+ add "127.0.0.1:3000"
40
+ add "127.0.0.1", port: 3001
41
+ end
42
+ expect(described_class.configuration.proxy_keys.sort).to eq(["127.0.0.1:3000", "127.0.0.1:3001"].sort)
43
+ end
44
+
45
+ it "should rebuild proxy_keys" do
46
+ described_class.configure do
47
+ clean
48
+ set_mode :round_robin
49
+
50
+ add "127.0.0.1:3000"
51
+ add "127.0.0.1", port: 3001
52
+ end
53
+ expect(described_class.configuration.proxy_keys.sort).to eq(["127.0.0.1:3000", "127.0.0.1:3001"].sort)
54
+ described_class.configure do
55
+ add "127.0.0.5:3000"
56
+ end
57
+ expect(described_class.configuration.proxy_keys.sort).to eq(["127.0.0.1:3000", "127.0.0.1:3001", "127.0.0.5:3000"].sort)
58
+ end
59
+ end
60
+
61
+ describe "round_robin mode" do
62
+ before do
63
+ described_class.configure do
64
+ clean
65
+ set_mode :round_robin
66
+
67
+ add "127.0.0.1:3000"
68
+ add "127.0.0.2:3000"
69
+ add "127.0.0.3:3000"
70
+ end
71
+ end
72
+
73
+ it "should return all 3 proxies" do
74
+ expect(3.times.map{ described_class.next.host }.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
75
+ end
76
+
77
+ it "should return all 3 proxies for random method" do
78
+ expect(3.times.map{ described_class.random.host }.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
79
+ end
80
+
81
+ it "should return all 3 proxies twice" do
82
+ expect(6.times.map{ described_class.next.host }.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
83
+ end
84
+ end
85
+
86
+ describe "random mode" do
87
+ before do
88
+ described_class.configure do
89
+ clean
90
+ set_mode :random
91
+
92
+ add "127.0.0.1:3000"
93
+ add "127.0.0.2:3000"
94
+ add "127.0.0.3:3000"
95
+ end
96
+ end
97
+
98
+ it "should return all 3 proxies (may raise phantom error)" do
99
+ expect(100.times.map{ described_class.next.host }.uniq.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
100
+ end
101
+
102
+ it "should return 127.0.0.1 more often (may raise phantom error)" do
103
+ described_class.configure do
104
+ clean
105
+ set_mode :random
106
+
107
+ add "127.0.0.1:3000", probability: 0.9
108
+ add "127.0.0.2:3000", probability: 0.05
109
+ add "127.0.0.3:3000", probability: 0.05
110
+ end
111
+ expect(100.times.map{ described_class.next.host }.select{|h| h == '127.0.0.1'}.count).to be > 75
112
+ end
113
+ end
114
+
115
+ it "should be accessible from scraper" do
116
+ expect { Scruber.run(:sample) { ProxyRotator } }.not_to raise_error
117
+ end
118
+ end
@@ -0,0 +1,145 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::UserAgentRotator do
4
+
5
+ describe "configurable" do
6
+ context "with block" do
7
+ before do
8
+ described_class.configure do
9
+ clean
10
+ set_filter :all
11
+
12
+ add "Scruber 1.0", tags: [:robot, :scruber]
13
+ add "GoogleBot 1.0", tags: [:robot, :google]
14
+ add "Chrome 1.0", tags: [:desktop, :chrome]
15
+ add "Android 1.0", tags: [:mobile, :android]
16
+ end
17
+ end
18
+
19
+ it "adds 4 user agents to list" do
20
+ expect(described_class.configuration.user_agents.count).to eq(4)
21
+ end
22
+
23
+ it "clean user agents list" do
24
+ described_class.configure do
25
+ clean
26
+ end
27
+ expect(described_class.configuration.user_agents.count).to eq(0)
28
+ end
29
+
30
+ it "should have all filter by default" do
31
+ expect(described_class.configuration.tags).to eq(:all)
32
+ end
33
+
34
+ it "should set different filter" do
35
+ described_class.configure do
36
+ set_filter :desktop
37
+ end
38
+ expect(described_class.configuration.tags).to eq(:desktop)
39
+ end
40
+ end
41
+
42
+ context "with dictionary" do
43
+ before do
44
+ Scruber::Core::Extensions::Loop.add_dictionary(:default_user_agents, File.expand_path(File.dirname(__FILE__))+'/user_agents.xml', :xml)
45
+
46
+ described_class.configure do
47
+ clean
48
+ set_filter :all
49
+
50
+ loop :default_user_agents do |ua|
51
+ add ua['name'], tags: ua['tags'].split(',').map(&:strip)
52
+ end
53
+ end
54
+ end
55
+
56
+ it "adds 4 user agents to list" do
57
+ expect(described_class.configuration.user_agents.count).to eq(4)
58
+ end
59
+
60
+ it "clean proxies list" do
61
+ described_class.configure do
62
+ clean
63
+ end
64
+ expect(described_class.configuration.user_agents.count).to eq(0)
65
+ end
66
+
67
+ it "should have all filter by default" do
68
+ expect(described_class.configuration.tags).to eq(:all)
69
+ end
70
+
71
+ it "should set different filter" do
72
+ described_class.configure do
73
+ set_filter :desktop
74
+ end
75
+ expect(described_class.configuration.tags).to eq(:desktop)
76
+ end
77
+ end
78
+ end
79
+
80
+ describe "with default config" do
81
+ before do
82
+ described_class.configure do
83
+ clean
84
+ set_filter :all
85
+
86
+ add "Scruber 1.0", tags: [:robot, :scruber]
87
+ add "GoogleBot 1.0", tags: [:robot, :google]
88
+ add "Chrome 1.0", tags: [:desktop, :chrome]
89
+ add "Android 1.0", tags: [:mobile, :android]
90
+ end
91
+ end
92
+
93
+ it "should return all 4 user agents" do
94
+ expect(4.times.map{ described_class.next }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
95
+ end
96
+
97
+ it "should return all 4 user agents twice" do
98
+ expect(8.times.map{ described_class.next }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0","Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
99
+ end
100
+
101
+ it "should return only robot user agents" do
102
+ described_class.configure do
103
+ set_filter :robot
104
+ end
105
+ expect(4.times.map{ described_class.next }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Scruber 1.0","GoogleBot 1.0"].sort)
106
+ end
107
+
108
+ it "should return only desktop chrome" do
109
+ described_class.configure do
110
+ set_filter [:desktop, :chrome]
111
+ end
112
+ expect(2.times.map{ described_class.next }.sort).to eq(["Chrome 1.0", "Chrome 1.0"].sort)
113
+ end
114
+ end
115
+
116
+ describe "with passed config" do
117
+ before do
118
+ described_class.configure do
119
+ clean
120
+ set_filter :bad
121
+
122
+ add "Scruber 1.0", tags: [:robot, :scruber]
123
+ add "GoogleBot 1.0", tags: [:robot, :google]
124
+ add "Chrome 1.0", tags: [:desktop, :chrome]
125
+ add "Android 1.0", tags: [:mobile, :android]
126
+ end
127
+ end
128
+
129
+ it "should return all 4 user agents" do
130
+ expect(4.times.map{ described_class.next(:all) }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
131
+ end
132
+
133
+ it "should return all 4 user agents twice" do
134
+ expect(8.times.map{ described_class.next(:all) }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0","Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
135
+ end
136
+
137
+ it "should return only robot user agents" do
138
+ expect(4.times.map{ described_class.next(:robot) }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Scruber 1.0","GoogleBot 1.0"].sort)
139
+ end
140
+
141
+ it "should return only desktop chrome" do
142
+ expect(2.times.map{ described_class.next([:desktop, :chrome]) }.sort).to eq(["Chrome 1.0", "Chrome 1.0"].sort)
143
+ end
144
+ end
145
+ end
@@ -0,0 +1,40 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::UserAgentRotator::UserAgent do
4
+
5
+ describe "user_agent" do
6
+ let(:user_agent) { described_class.new('Scruber 1.0', tags: [:robot, :scruber]) }
7
+
8
+ it 'should have valid id' do
9
+ expect(user_agent.id).to eq('Scruber 1.0')
10
+ end
11
+
12
+ it 'should have valid tags' do
13
+ expect(user_agent.tags).to eq([:robot, :scruber])
14
+ end
15
+
16
+ it 'should always have array of tags' do
17
+ ua = described_class.new('Scruber 1.0')
18
+ expect(ua.tags).to eq([])
19
+
20
+ ua = described_class.new('Scruber 1.0', tags: nil)
21
+ expect(ua.tags).to eq([])
22
+
23
+ ua = described_class.new('Scruber 1.0', tags: :robot)
24
+ expect(ua.tags).to eq([:robot])
25
+ end
26
+
27
+ it 'should always have array of symbolic tags' do
28
+ ua = described_class.new('Scruber 1.0', tags: 'robot')
29
+ expect(ua.tags).to eq([:robot])
30
+
31
+ ua = described_class.new('Scruber 1.0', tags: ['robot', 'scruber'])
32
+ expect(ua.tags).to eq([:robot, :scruber])
33
+ end
34
+
35
+ it 'should raise error if name not given' do
36
+ expect{ described_class.new('') }.to raise_error(Scruber::ArgumentError)
37
+ end
38
+ end
39
+
40
+ end
@@ -0,0 +1,6 @@
1
+ <items>
2
+ <item name="Scruber 1.0" tags="robot,scruber" />
3
+ <item name="GoogleBot 1.0" tags="robot,google" />
4
+ <item name="Chrome 1.0" tags="chrome,desktop,macos" />
5
+ <item name="Android 1.0" tags="mobile,android" />
6
+ </items>
@@ -0,0 +1,15 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::QueueAdapters::Memory do
4
+ let(:queue){ described_class.new }
5
+
6
+ it_behaves_like "queue_adapter"
7
+
8
+ it "shift first enqueued page" do
9
+ queue.add "http://example.com"
10
+ queue.add "http://example2.com"
11
+ page = queue.fetch_pending
12
+ expect(page.url).to eq("http://example.com")
13
+ end
14
+
15
+ end
@@ -0,0 +1,27 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Queue do
4
+
5
+ describe "add_adapter" do
6
+ it "should raise error" do
7
+ expect{ described_class.add_adapter(:obj, Object) }.to raise_error(NoMethodError)
8
+ end
9
+
10
+ it "should add new adapter and return added class" do
11
+ expect(described_class.add_adapter(:simple2, Scruber::QueueAdapters::Memory)).to eq(Scruber::QueueAdapters::Memory)
12
+ expect(described_class._adapters.keys).to include(:simple2)
13
+ end
14
+ end
15
+
16
+ describe "adapter" do
17
+ it "should return default adapter" do
18
+ expect(described_class.adapter).to eq(Scruber::QueueAdapters::Memory)
19
+ end
20
+ end
21
+
22
+ describe "new" do
23
+ it "should return instance of default adapter" do
24
+ expect(described_class.new).to be_a(Scruber::QueueAdapters::Memory)
25
+ end
26
+ end
27
+ end