scruber 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
+
+   let(:cookie_jar_string) { "---\n- !ruby/object:HTTP::Cookie\n name: feed_flow\n value: top\n domain: example.com\n for_domain: false\n path: \"/\"\n secure: false\n httponly: true\n expires: \n max_age: 26784000\n created_at: #{Time.now.strftime('%Y-%m-%d')} 16:46:15.443984000 +03:00\n accessed_at: #{Time.now.strftime('%Y-%m-%d')} 16:47:07.047296000 +03:00\n" }
+
+   describe "initialize" do
+     let(:agent) do
+       described_class.new id: 1,
+         user_agent: 'Scruber',
+         proxy_id: 1,
+         headers: {'a' => 1},
+         cookie_jar: cookie_jar_string,
+         disable_proxy: true
+     end
+
+     it "sets values" do
+       expect(agent.id).to eq(1)
+       expect(agent.user_agent).to eq('Scruber')
+       expect(agent.proxy_id).to eq(1)
+       expect(agent.headers).to eq({'a' => 1})
+       expect(agent.cookie_jar).to eq(cookie_jar_string)
+       expect(agent.disable_proxy).to eq(true)
+     end
+
+     it "loads cookies" do
+       expect(agent.cookie_for('http://example.com')).to eq('feed_flow=top')
+       expect(agent.cookie_for(URI('http://example.com'))).to eq('feed_flow=top')
+     end
+
+     it "serializes cookies" do
+       expect(agent.serialize_cookies).to eq(cookie_jar_string)
+     end
+
+     it "parses cookies from page" do
+       page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-18 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
+       agent.parse_cookies_from_page!(page)
+       expect(agent.cookie_for('http://example.com')).to eq('__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; feed_flow=top')
+     end
+
+   end
+
+   it "should be accessible from scraper" do
+     expect { Scruber.run(:sample) { FetcherAgent } }.not_to raise_error
+   end
+ end
@@ -0,0 +1,45 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::Memory do
+
+   let(:cookie_jar_string) { "---\n- !ruby/object:HTTP::Cookie\n name: feed_flow\n value: top\n domain: example.com\n for_domain: false\n path: \"/\"\n secure: false\n httponly: true\n expires: \n max_age: 26784000\n created_at: #{Time.now.strftime('%Y-%m-%d')} 16:46:15.443984000 +03:00\n accessed_at: #{Time.now.strftime('%Y-%m-%d')} 16:47:07.047296000 +03:00\n" }
+
+   let(:agent) do
+     described_class.new user_agent: 'Scruber',
+       proxy_id: 1,
+       headers: {'a' => 1},
+       cookie_jar: cookie_jar_string,
+       disable_proxy: true
+   end
+
+   describe "initialize" do
+     it "should generate id" do
+       expect(agent.id).not_to be_nil
+     end
+   end
+
+   describe "save" do
+     it "should be stored to memory collection" do
+       agent.save
+       expect(Scruber::Helpers::FetcherAgentAdapters::Memory._collection[agent.id]).to eq(agent)
+     end
+   end
+
+   describe "delete" do
+     it "should be deleted from memory collection" do
+       agent.save
+       expect(Scruber::Helpers::FetcherAgentAdapters::Memory._collection[agent.id]).to eq(agent)
+       agent.delete
+       expect(Scruber::Helpers::FetcherAgentAdapters::Memory._collection[agent.id]).to be_nil
+     end
+   end
+
+   describe "class methods" do
+     describe "find" do
+       it "should find agent by id" do
+         agent.save
+         expect(Scruber::Helpers::FetcherAgentAdapters::Memory.find(agent.id)).to eq(agent)
+       end
+     end
+   end
+ end
@@ -0,0 +1,21 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::Helpers::ProxyRotator::Proxy do
+
+   describe "proxy" do
+     let(:proxy) { described_class.new('127.0.0.1:3000', :user=>'user', :password=>'password', :probability=>1, :type=>'socks') }
+
+     it 'should have valid id' do
+       proxy = described_class.new('127.0.0.1:3000')
+       expect(proxy.id).to eq('127.0.0.1:3000')
+       proxy = described_class.new('127.0.0.1', port: 3001)
+       expect(proxy.id).to eq('127.0.0.1:3001')
+     end
+
+     it 'should raise error if port or address not given' do
+       expect{ described_class.new('127.0.0.1') }.to raise_error(Scruber::ArgumentError)
+       expect{ described_class.new('', port: 3000) }.to raise_error(Scruber::ArgumentError)
+     end
+   end
+
+ end
@@ -0,0 +1,118 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::Helpers::ProxyRotator do
+
+   describe "configurable" do
+     before do
+       described_class.configure do
+         set_mode :random
+
+         add "127.0.0.1:3000"
+         add "127.0.0.1", port: 3001
+       end
+     end
+
+     it "adds 2 proxies to list" do
+       expect(described_class.configuration.proxies.count).to eq(2)
+     end
+
+     it "cleans proxies list" do
+       described_class.configure do
+         clean
+       end
+       expect(described_class.configuration.proxies.count).to eq(0)
+     end
+
+     it "should have random mode by default" do
+       expect(described_class.configuration.mode).to eq(:random)
+     end
+
+     it "should raise error when an incorrect mode is set" do
+       expect{ described_class.configure { set_mode :bad } }.to raise_error(Scruber::ArgumentError)
+     end
+
+     it "should build proxy_keys" do
+       described_class.configure do
+         clean
+         set_mode :round_robin
+
+         add "127.0.0.1:3000"
+         add "127.0.0.1", port: 3001
+       end
+       expect(described_class.configuration.proxy_keys.sort).to eq(["127.0.0.1:3000", "127.0.0.1:3001"].sort)
+     end
+
+     it "should rebuild proxy_keys" do
+       described_class.configure do
+         clean
+         set_mode :round_robin
+
+         add "127.0.0.1:3000"
+         add "127.0.0.1", port: 3001
+       end
+       expect(described_class.configuration.proxy_keys.sort).to eq(["127.0.0.1:3000", "127.0.0.1:3001"].sort)
+       described_class.configure do
+         add "127.0.0.5:3000"
+       end
+       expect(described_class.configuration.proxy_keys.sort).to eq(["127.0.0.1:3000", "127.0.0.1:3001", "127.0.0.5:3000"].sort)
+     end
+   end
+
+   describe "round_robin mode" do
+     before do
+       described_class.configure do
+         clean
+         set_mode :round_robin
+
+         add "127.0.0.1:3000"
+         add "127.0.0.2:3000"
+         add "127.0.0.3:3000"
+       end
+     end
+
+     it "should return all 3 proxies" do
+       expect(3.times.map{ described_class.next.host }.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
+     end
+
+     it "should return all 3 proxies for random method" do
+       expect(3.times.map{ described_class.random.host }.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
+     end
+
+     it "should return all 3 proxies twice" do
+       expect(6.times.map{ described_class.next.host }.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
+     end
+   end
+
+   describe "random mode" do
+     before do
+       described_class.configure do
+         clean
+         set_mode :random
+
+         add "127.0.0.1:3000"
+         add "127.0.0.2:3000"
+         add "127.0.0.3:3000"
+       end
+     end
+
+     it "should return all 3 proxies (non-deterministic, may rarely fail)" do
+       expect(100.times.map{ described_class.next.host }.uniq.sort).to eq(["127.0.0.1", "127.0.0.2", "127.0.0.3"].sort)
+     end
+
+     it "should return 127.0.0.1 more often (non-deterministic, may rarely fail)" do
+       described_class.configure do
+         clean
+         set_mode :random
+
+         add "127.0.0.1:3000", probability: 0.9
+         add "127.0.0.2:3000", probability: 0.05
+         add "127.0.0.3:3000", probability: 0.05
+       end
+       expect(100.times.map{ described_class.next.host }.select{|h| h == '127.0.0.1'}.count).to be > 75
+     end
+   end
+
+   it "should be accessible from scraper" do
+     expect { Scruber.run(:sample) { ProxyRotator } }.not_to raise_error
+   end
+ end
@@ -0,0 +1,145 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::Helpers::UserAgentRotator do
+
+   describe "configurable" do
+     context "with block" do
+       before do
+         described_class.configure do
+           clean
+           set_filter :all
+
+           add "Scruber 1.0", tags: [:robot, :scruber]
+           add "GoogleBot 1.0", tags: [:robot, :google]
+           add "Chrome 1.0", tags: [:desktop, :chrome]
+           add "Android 1.0", tags: [:mobile, :android]
+         end
+       end
+
+       it "adds 4 user agents to list" do
+         expect(described_class.configuration.user_agents.count).to eq(4)
+       end
+
+       it "cleans user agents list" do
+         described_class.configure do
+           clean
+         end
+         expect(described_class.configuration.user_agents.count).to eq(0)
+       end
+
+       it "should have all filter by default" do
+         expect(described_class.configuration.tags).to eq(:all)
+       end
+
+       it "should set a different filter" do
+         described_class.configure do
+           set_filter :desktop
+         end
+         expect(described_class.configuration.tags).to eq(:desktop)
+       end
+     end
+
+     context "with dictionary" do
+       before do
+         Scruber::Core::Extensions::Loop.add_dictionary(:default_user_agents, File.expand_path(File.dirname(__FILE__))+'/user_agents.xml', :xml)
+
+         described_class.configure do
+           clean
+           set_filter :all
+
+           loop :default_user_agents do |ua|
+             add ua['name'], tags: ua['tags'].split(',').map(&:strip)
+           end
+         end
+       end
+
+       it "adds 4 user agents to list" do
+         expect(described_class.configuration.user_agents.count).to eq(4)
+       end
+
+       it "cleans user agents list" do
+         described_class.configure do
+           clean
+         end
+         expect(described_class.configuration.user_agents.count).to eq(0)
+       end
+
+       it "should have all filter by default" do
+         expect(described_class.configuration.tags).to eq(:all)
+       end
+
+       it "should set a different filter" do
+         described_class.configure do
+           set_filter :desktop
+         end
+         expect(described_class.configuration.tags).to eq(:desktop)
+       end
+     end
+   end
+
+   describe "with default config" do
+     before do
+       described_class.configure do
+         clean
+         set_filter :all
+
+         add "Scruber 1.0", tags: [:robot, :scruber]
+         add "GoogleBot 1.0", tags: [:robot, :google]
+         add "Chrome 1.0", tags: [:desktop, :chrome]
+         add "Android 1.0", tags: [:mobile, :android]
+       end
+     end
+
+     it "should return all 4 user agents" do
+       expect(4.times.map{ described_class.next }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
+     end
+
+     it "should return all 4 user agents twice" do
+       expect(8.times.map{ described_class.next }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0","Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
+     end
+
+     it "should return only robot user agents" do
+       described_class.configure do
+         set_filter :robot
+       end
+       expect(4.times.map{ described_class.next }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Scruber 1.0","GoogleBot 1.0"].sort)
+     end
+
+     it "should return only desktop chrome" do
+       described_class.configure do
+         set_filter [:desktop, :chrome]
+       end
+       expect(2.times.map{ described_class.next }.sort).to eq(["Chrome 1.0", "Chrome 1.0"].sort)
+     end
+   end
+
+   describe "with passed config" do
+     before do
+       described_class.configure do
+         clean
+         set_filter :bad
+
+         add "Scruber 1.0", tags: [:robot, :scruber]
+         add "GoogleBot 1.0", tags: [:robot, :google]
+         add "Chrome 1.0", tags: [:desktop, :chrome]
+         add "Android 1.0", tags: [:mobile, :android]
+       end
+     end
+
+     it "should return all 4 user agents" do
+       expect(4.times.map{ described_class.next(:all) }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
+     end
+
+     it "should return all 4 user agents twice" do
+       expect(8.times.map{ described_class.next(:all) }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0","Scruber 1.0","GoogleBot 1.0","Chrome 1.0","Android 1.0"].sort)
+     end
+
+     it "should return only robot user agents" do
+       expect(4.times.map{ described_class.next(:robot) }.sort).to eq(["Scruber 1.0","GoogleBot 1.0","Scruber 1.0","GoogleBot 1.0"].sort)
+     end
+
+     it "should return only desktop chrome" do
+       expect(2.times.map{ described_class.next([:desktop, :chrome]) }.sort).to eq(["Chrome 1.0", "Chrome 1.0"].sort)
+     end
+   end
+ end
@@ -0,0 +1,40 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::Helpers::UserAgentRotator::UserAgent do
+
+   describe "user_agent" do
+     let(:user_agent) { described_class.new('Scruber 1.0', tags: [:robot, :scruber]) }
+
+     it 'should have valid id' do
+       expect(user_agent.id).to eq('Scruber 1.0')
+     end
+
+     it 'should have valid tags' do
+       expect(user_agent.tags).to eq([:robot, :scruber])
+     end
+
+     it 'should always have array of tags' do
+       ua = described_class.new('Scruber 1.0')
+       expect(ua.tags).to eq([])
+
+       ua = described_class.new('Scruber 1.0', tags: nil)
+       expect(ua.tags).to eq([])
+
+       ua = described_class.new('Scruber 1.0', tags: :robot)
+       expect(ua.tags).to eq([:robot])
+     end
+
+     it 'should always have array of symbolic tags' do
+       ua = described_class.new('Scruber 1.0', tags: 'robot')
+       expect(ua.tags).to eq([:robot])
+
+       ua = described_class.new('Scruber 1.0', tags: ['robot', 'scruber'])
+       expect(ua.tags).to eq([:robot, :scruber])
+     end
+
+     it 'should raise error if name not given' do
+       expect{ described_class.new('') }.to raise_error(Scruber::ArgumentError)
+     end
+   end
+
+ end
@@ -0,0 +1,6 @@
+ <items>
+   <item name="Scruber 1.0" tags="robot,scruber" />
+   <item name="GoogleBot 1.0" tags="robot,google" />
+   <item name="Chrome 1.0" tags="chrome,desktop,macos" />
+   <item name="Android 1.0" tags="mobile,android" />
+ </items>
@@ -0,0 +1,15 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::QueueAdapters::Memory do
+   let(:queue){ described_class.new }
+
+   it_behaves_like "queue_adapter"
+
+   it "shifts the first enqueued page" do
+     queue.add "http://example.com"
+     queue.add "http://example2.com"
+     page = queue.fetch_pending
+     expect(page.url).to eq("http://example.com")
+   end
+
+ end
@@ -0,0 +1,27 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber::Queue do
+
+   describe "add_adapter" do
+     it "should raise error for invalid adapter class" do
+       expect{ described_class.add_adapter(:obj, Object) }.to raise_error(NoMethodError)
+     end
+
+     it "should add new adapter and return added class" do
+       expect(described_class.add_adapter(:simple2, Scruber::QueueAdapters::Memory)).to eq(Scruber::QueueAdapters::Memory)
+       expect(described_class._adapters.keys).to include(:simple2)
+     end
+   end
+
+   describe "adapter" do
+     it "should return default adapter" do
+       expect(described_class.adapter).to eq(Scruber::QueueAdapters::Memory)
+     end
+   end
+
+   describe "new" do
+     it "should return instance of default adapter" do
+       expect(described_class.new).to be_a(Scruber::QueueAdapters::Memory)
+     end
+   end
+ end