serp_scraper 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/EXAMPLES.md +7 -4
- data/README.md +5 -4
- data/lib/engines/google.rb +51 -20
- data/lib/serp_scraper.rb +10 -17
- data/serp_scraper.gemspec +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
|
4
|
+
data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
|
7
|
+
data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
|
data/EXAMPLES.md
CHANGED
@@ -15,11 +15,14 @@ end
|
|
15
15
|
|
16
16
|
## Country/TLD specific search
|
17
17
|
```ruby
|
18
|
-
#
|
18
|
+
# Set '.se' as TLD for Swedish results
|
19
19
|
s = SerpScraper.new(engine: 'google', tld: 'se')
|
20
20
|
|
21
|
-
# Set language to
|
21
|
+
# Set language parameter to Swedish
|
22
22
|
s.engine.parameter('hl', 'sv')
|
23
23
|
|
24
|
-
|
25
|
-
|
24
|
+
s.search('köp bilar online').results.each do |result|
|
25
|
+
puts result
|
26
|
+
# => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
|
27
|
+
end
|
28
|
+
```
|
data/README.md
CHANGED
@@ -15,10 +15,11 @@ gem 'serp_scraper'
|
|
15
15
|
```
|
16
16
|
|
17
17
|
## Examples
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
google = SerpScraper.new(engine: 'google', tld: 'com')
|
21
|
+
first_result = google.search('buy cars onlines').results[0]
|
22
|
+
puts first_result
|
22
23
|
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
23
24
|
```
|
24
25
|
|
data/lib/engines/google.rb
CHANGED
@@ -1,42 +1,75 @@
|
|
1
1
|
class SerpScraper::Google
|
2
2
|
attr_accessor :tld
|
3
3
|
attr_accessor :user_agent
|
4
|
+
attr_accessor :browser
|
4
5
|
|
5
6
|
def initialize(tld)
|
6
|
-
|
7
|
+
# Store tld in an instance variable
|
8
|
+
@tld = tld
|
7
9
|
|
10
|
+
# Create new Mechanize object
|
8
11
|
@browser = Mechanize.new { |agent|
|
9
12
|
agent.user_agent_alias = 'Mac Safari'
|
10
13
|
}
|
11
|
-
|
12
|
-
|
13
|
-
@parameters
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
14
|
+
|
15
|
+
# Set standard query parameters
|
16
|
+
@parameters = {
|
17
|
+
gbv: 1,
|
18
|
+
complete: 0,
|
19
|
+
num: 100,
|
20
|
+
pws: 0,
|
21
|
+
nfrpr: 1,
|
22
|
+
ie: 'utf-8',
|
23
|
+
oe: 'utf-8',
|
24
|
+
site: 'webhp',
|
25
|
+
source: 'hp'
|
26
|
+
}
|
22
27
|
end
|
23
28
|
|
24
29
|
def search(keyword)
|
30
|
+
# Add keyword to parameters
|
31
|
+
@parameters['q'] = keyword
|
32
|
+
|
33
|
+
# Build the Google search URL
|
34
|
+
search_url = build_query_url_from_keyword(keyword)
|
35
|
+
|
25
36
|
# Do the Googleing
|
26
|
-
|
37
|
+
response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
|
38
|
+
|
39
|
+
# 503 error = Google Captcha
|
40
|
+
tries = 1
|
41
|
+
while response.code[/503/] and tries <= 3
|
42
|
+
# Try to solve the captcha
|
43
|
+
solve_captcha(response.uri.to_s)
|
44
|
+
|
45
|
+
# Do another search
|
46
|
+
response = @browser.get(search_url)
|
47
|
+
|
48
|
+
tries += 1
|
49
|
+
end
|
27
50
|
|
28
|
-
return build_serp_response(
|
51
|
+
return build_serp_response(response) if response.code == "200"
|
29
52
|
|
30
53
|
# @todo: Look for and solve captchas.
|
31
54
|
puts "Did not get a 200 response. Maybe a captcha error?"
|
32
55
|
end
|
33
56
|
|
34
|
-
def
|
57
|
+
def solve_captcha(captcha_url)
|
58
|
+
puts "trying to solve captcha on url #{captcha_url}"
|
59
|
+
|
60
|
+
page = @browser.get(captcha_url)
|
61
|
+
doc = Nokogiri::HTML(page.content)
|
62
|
+
|
63
|
+
image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
|
64
|
+
puts "Captcha url: " + image_url
|
65
|
+
end
|
66
|
+
|
67
|
+
def build_serp_response(response)
|
35
68
|
sr = SerpScraper::SerpResponse.new
|
36
69
|
sr.keyword = @parameters['q']
|
37
70
|
sr.user_agent = @browser.user_agent
|
38
|
-
sr.url =
|
39
|
-
sr.html =
|
71
|
+
sr.url = response.uri.to_s
|
72
|
+
sr.html = response.content
|
40
73
|
sr.results = extract_results(sr.html)
|
41
74
|
|
42
75
|
sr # Return sr
|
@@ -77,10 +110,8 @@ class SerpScraper::Google
|
|
77
110
|
end
|
78
111
|
|
79
112
|
def build_query_url_from_keyword(keyword)
|
80
|
-
@parameters['q'] = keyword
|
81
|
-
|
82
113
|
uri = Addressable::URI.new
|
83
|
-
uri.host = "www.google.#{tld}"
|
114
|
+
uri.host = "www.google.#{@tld}"
|
84
115
|
uri.scheme = "https"
|
85
116
|
uri.path = "/search"
|
86
117
|
uri.query_values = @parameters
|
data/lib/serp_scraper.rb
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'addressable/uri'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'deathbycaptcha'
|
6
|
+
|
1
7
|
class SerpScraper
|
2
8
|
attr_accessor :engine
|
3
9
|
|
@@ -11,27 +17,14 @@ class SerpScraper
|
|
11
17
|
end
|
12
18
|
end
|
13
19
|
|
20
|
+
def set_proxy(address, port, user = nil, password = nil)
|
21
|
+
@engine.browser.set_proxy(address, port, user, password)
|
22
|
+
end
|
23
|
+
|
14
24
|
def search(keyword)
|
15
25
|
@engine.search(keyword)
|
16
26
|
end
|
17
27
|
end
|
18
28
|
|
19
|
-
def test
|
20
|
-
google = SerpScraper.new(engine: 'google', tld: 'se')
|
21
|
-
|
22
|
-
# Set language to Swedish
|
23
|
-
google.engine.parameter('hl', 'sv')
|
24
|
-
|
25
|
-
# GO, FETCH!
|
26
|
-
response = google.search("casino faktura")
|
27
|
-
|
28
|
-
# Return search results
|
29
|
-
response.results
|
30
|
-
end
|
31
|
-
|
32
|
-
require 'uri'
|
33
|
-
require 'mechanize'
|
34
|
-
require 'addressable/uri'
|
35
|
-
require 'nokogiri'
|
36
29
|
require 'engines/google'
|
37
30
|
require 'serp_response'
|
data/serp_scraper.gemspec
CHANGED