serp_scraper 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/EXAMPLES.md +7 -4
- data/README.md +5 -4
- data/lib/engines/google.rb +51 -20
- data/lib/serp_scraper.rb +10 -17
- data/serp_scraper.gemspec +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
|
4
|
+
data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
|
7
|
+
data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
|
data/EXAMPLES.md
CHANGED
@@ -15,11 +15,14 @@ end
|
|
15
15
|
|
16
16
|
## Country/TLD specific search
|
17
17
|
```ruby
|
18
|
-
#
|
18
|
+
# Set '.se' as TLD for Swedish results
|
19
19
|
s = SerpScraper.new(engine: 'google', tld: 'se')
|
20
20
|
|
21
|
-
# Set language to
|
21
|
+
# Set language parameter to Swedish
|
22
22
|
s.engine.parameter('hl', 'sv')
|
23
23
|
|
24
|
-
|
25
|
-
|
24
|
+
s.search('köp bilar online').results.each do |result|
|
25
|
+
puts result
|
26
|
+
# => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
|
27
|
+
end
|
28
|
+
```
|
data/README.md
CHANGED
@@ -15,10 +15,11 @@ gem 'serp_scraper'
|
|
15
15
|
```
|
16
16
|
|
17
17
|
## Examples
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
google = SerpScraper.new(engine: 'google', tld: 'com')
|
21
|
+
first_result = google.search('buy cars online').results[0]
|
22
|
+
puts first_result
|
22
23
|
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
23
24
|
```
|
24
25
|
|
data/lib/engines/google.rb
CHANGED
@@ -1,42 +1,75 @@
|
|
1
1
|
class SerpScraper::Google
|
2
2
|
attr_accessor :tld
|
3
3
|
attr_accessor :user_agent
|
4
|
+
attr_accessor :browser
|
4
5
|
|
5
6
|
def initialize(tld)
|
6
|
-
|
7
|
+
# Store tld in an instance variable
|
8
|
+
@tld = tld
|
7
9
|
|
10
|
+
# Create new Mechanize object
|
8
11
|
@browser = Mechanize.new { |agent|
|
9
12
|
agent.user_agent_alias = 'Mac Safari'
|
10
13
|
}
|
11
|
-
|
12
|
-
|
13
|
-
@parameters
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
14
|
+
|
15
|
+
# Set standard query parameters
|
16
|
+
@parameters = {
|
17
|
+
gbv: 1,
|
18
|
+
complete: 0,
|
19
|
+
num: 100,
|
20
|
+
pws: 0,
|
21
|
+
nfrpr: 1,
|
22
|
+
ie: 'utf-8',
|
23
|
+
oe: 'utf-8',
|
24
|
+
site: 'webhp',
|
25
|
+
source: 'hp'
|
26
|
+
}
|
22
27
|
end
|
23
28
|
|
24
29
|
def search(keyword)
|
30
|
+
# Add keyword to parameters
|
31
|
+
@parameters['q'] = keyword
|
32
|
+
|
33
|
+
# Build the Google search URL
|
34
|
+
search_url = build_query_url_from_keyword(keyword)
|
35
|
+
|
25
36
|
# Do the Googleing
|
26
|
-
|
37
|
+
response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
|
38
|
+
|
39
|
+
# 503 error = Google Captcha
|
40
|
+
tries = 1
|
41
|
+
while response.code[/503/] and tries <= 3
|
42
|
+
# Try to solve the captcha
|
43
|
+
solve_captcha(response.uri.to_s)
|
44
|
+
|
45
|
+
# Do another search
|
46
|
+
response = @browser.get(search_url)
|
47
|
+
|
48
|
+
tries += 1
|
49
|
+
end
|
27
50
|
|
28
|
-
return build_serp_response(
|
51
|
+
return build_serp_response(response) if response.code == "200"
|
29
52
|
|
30
53
|
# @todo: Look for and solve captchas.
|
31
54
|
puts "Did not get a 200 response. Maybe a captcha error?"
|
32
55
|
end
|
33
56
|
|
34
|
-
def
|
57
|
+
def solve_captcha(captcha_url)
|
58
|
+
puts "trying to solve captcha on url #{captcha_url}"
|
59
|
+
|
60
|
+
page = @browser.get(captcha_url)
|
61
|
+
doc = Nokogiri::HTML(page.content)
|
62
|
+
|
63
|
+
image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
|
64
|
+
puts "Captcha url: " + image_url
|
65
|
+
end
|
66
|
+
|
67
|
+
def build_serp_response(response)
|
35
68
|
sr = SerpScraper::SerpResponse.new
|
36
69
|
sr.keyword = @parameters['q']
|
37
70
|
sr.user_agent = @browser.user_agent
|
38
|
-
sr.url =
|
39
|
-
sr.html =
|
71
|
+
sr.url = response.uri.to_s
|
72
|
+
sr.html = response.content
|
40
73
|
sr.results = extract_results(sr.html)
|
41
74
|
|
42
75
|
sr # Return sr
|
@@ -77,10 +110,8 @@ class SerpScraper::Google
|
|
77
110
|
end
|
78
111
|
|
79
112
|
def build_query_url_from_keyword(keyword)
|
80
|
-
@parameters['q'] = keyword
|
81
|
-
|
82
113
|
uri = Addressable::URI.new
|
83
|
-
uri.host = "www.google.#{tld}"
|
114
|
+
uri.host = "www.google.#{@tld}"
|
84
115
|
uri.scheme = "https"
|
85
116
|
uri.path = "/search"
|
86
117
|
uri.query_values = @parameters
|
data/lib/serp_scraper.rb
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'addressable/uri'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'deathbycaptcha'
|
6
|
+
|
1
7
|
class SerpScraper
|
2
8
|
attr_accessor :engine
|
3
9
|
|
@@ -11,27 +17,14 @@ class SerpScraper
|
|
11
17
|
end
|
12
18
|
end
|
13
19
|
|
20
|
+
def set_proxy(address, port, user = nil, password = nil)
|
21
|
+
@engine.browser.set_proxy(address, port, user, password)
|
22
|
+
end
|
23
|
+
|
14
24
|
def search(keyword)
|
15
25
|
@engine.search(keyword)
|
16
26
|
end
|
17
27
|
end
|
18
28
|
|
19
|
-
def test
|
20
|
-
google = SerpScraper.new(engine: 'google', tld: 'se')
|
21
|
-
|
22
|
-
# Set language to Swedish
|
23
|
-
google.engine.parameter('hl', 'sv')
|
24
|
-
|
25
|
-
# GO, FETCH!
|
26
|
-
response = google.search("casino faktura")
|
27
|
-
|
28
|
-
# Return search results
|
29
|
-
response.results
|
30
|
-
end
|
31
|
-
|
32
|
-
require 'uri'
|
33
|
-
require 'mechanize'
|
34
|
-
require 'addressable/uri'
|
35
|
-
require 'nokogiri'
|
36
29
|
require 'engines/google'
|
37
30
|
require 'serp_response'
|
data/serp_scraper.gemspec
CHANGED