serp_scraper 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 50907d75fe90b6a6eba27dcccf7da802e3d2b999
4
- data.tar.gz: 69210b67fa80e1df6774600ffc798e93306a3a70
3
+ metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
4
+ data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
5
5
  SHA512:
6
- metadata.gz: 030bba279587f40d63c258824c32ef6c0bed017e07a9330a008e6df292c556500b293bc01549a181aa4dd22e8a0ab38c4ec258030464848de117208735a788d9
7
- data.tar.gz: cdffcc183ba38b153fce55257a508500dc0378582dc9547dd1c88e80f8dfed1a0a741712ad3e1a4b70271392e4c8cdabb5d0fe5f3f4984400db6c994b0eb49ac
6
+ metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
7
+ data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
data/EXAMPLES.md CHANGED
@@ -15,11 +15,14 @@ end
15
15
 
16
16
  ## Country/TLD specific search
17
17
  ```ruby
18
- # Usees google.se for swedish results
18
+ # Set '.se' as the TLD for Swedish results
19
19
  s = SerpScraper.new(engine: 'google', tld: 'se')
20
20
 
21
- # Set language to Swedish
21
+ # Set the language parameter to Swedish
22
22
  s.engine.parameter('hl', 'sv')
23
23
 
24
- response = s.search('köp bilar online')
25
- ```
24
+ s.search('köp bilar online').results.each do |result|
25
+ puts result
26
+ # => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
27
+ end
28
+ ```
data/README.md CHANGED
@@ -15,10 +15,11 @@ gem 'serp_scraper'
15
15
  ```
16
16
 
17
17
  ## Examples
18
- ```ruby
19
- s = SerpScraper.new(engine: 'google')
20
- res = s.search('buy cars onlines')
21
- puts res.results[0]
18
+
19
+ ```ruby
20
+ google = SerpScraper.new(engine: 'google', tld: 'com')
21
+ first_result = google.search('buy cars onlines').results[0]
22
+ puts first_result
22
23
  # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
23
24
  ```
24
25
 
@@ -1,42 +1,75 @@
1
1
  class SerpScraper::Google
2
2
  attr_accessor :tld
3
3
  attr_accessor :user_agent
4
+ attr_accessor :browser
4
5
 
5
6
  def initialize(tld)
6
- self.tld = tld
7
+ # Store the TLD in an instance variable
8
+ @tld = tld
7
9
 
10
+ # Create new Mechanize object
8
11
  @browser = Mechanize.new { |agent|
9
12
  agent.user_agent_alias = 'Mac Safari'
10
13
  }
11
-
12
- @parameters = Hash.new
13
- @parameters['gbv'] = 1
14
- @parameters['complete'] = 0
15
- @parameters['num'] = 100
16
- @parameters['pws'] = 0
17
- @parameters['nfrpr'] = 1
18
- @parameters['ie'] = 'utf-8'
19
- @parameters['oe'] = 'utf-8'
20
- @parameters['site'] = 'webhp'
21
- @parameters['source'] = 'hp'
14
+
15
+ # Set standard query parameters
16
+ @parameters = {
17
+ gbv: 1,
18
+ complete: 0,
19
+ num: 100,
20
+ pws: 0,
21
+ nfrpr: 1,
22
+ ie: 'utf-8',
23
+ oe: 'utf-8',
24
+ site: 'webhp',
25
+ source: 'hp'
26
+ }
22
27
  end
23
28
 
24
29
  def search(keyword)
30
+ # Add keyword to parameters
31
+ @parameters['q'] = keyword
32
+
33
+ # Build the Google search URL
34
+ search_url = build_query_url_from_keyword(keyword)
35
+
25
36
  # Do the Googleing
26
- http_response = @browser.get(build_query_url_from_keyword(keyword))
37
+ response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
38
+
39
+ # 503 error = Google Captcha
40
+ tries = 1
41
+ while response.code[/503/] and tries <= 3
42
+ # Try to solve the captcha
43
+ solve_captcha(response.uri.to_s)
44
+
45
+ # Do another search
46
+ response = @browser.get(search_url)
47
+
48
+ tries += 1
49
+ end
27
50
 
28
- return build_serp_response(http_response) if http_response.code == "200"
51
+ return build_serp_response(response) if response.code == "200"
29
52
 
30
53
  # @todo: Look for and solve captchas.
31
54
  puts "Did not get a 200 response. Maybe a captcha error?"
32
55
  end
33
56
 
34
- def build_serp_response(http_response)
57
+ def solve_captcha(captcha_url)
58
+ puts "trying to solve captcha on url #{captcha_url}"
59
+
60
+ page = @browser.get(captcha_url)
61
+ doc = Nokogiri::HTML(page.content)
62
+
63
+ image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
64
+ puts "Captcha url: " + image_url
65
+ end
66
+
67
+ def build_serp_response(response)
35
68
  sr = SerpScraper::SerpResponse.new
36
69
  sr.keyword = @parameters['q']
37
70
  sr.user_agent = @browser.user_agent
38
- sr.url = http_response.uri.to_s
39
- sr.html = http_response.content
71
+ sr.url = response.uri.to_s
72
+ sr.html = response.content
40
73
  sr.results = extract_results(sr.html)
41
74
 
42
75
  sr # Return sr
@@ -77,10 +110,8 @@ class SerpScraper::Google
77
110
  end
78
111
 
79
112
  def build_query_url_from_keyword(keyword)
80
- @parameters['q'] = keyword
81
-
82
113
  uri = Addressable::URI.new
83
- uri.host = "www.google.#{tld}"
114
+ uri.host = "www.google.#{@tld}"
84
115
  uri.scheme = "https"
85
116
  uri.path = "/search"
86
117
  uri.query_values = @parameters
data/lib/serp_scraper.rb CHANGED
@@ -1,3 +1,9 @@
1
+ require 'uri'
2
+ require 'mechanize'
3
+ require 'addressable/uri'
4
+ require 'nokogiri'
5
+ require 'deathbycaptcha'
6
+
1
7
  class SerpScraper
2
8
  attr_accessor :engine
3
9
 
@@ -11,27 +17,14 @@ class SerpScraper
11
17
  end
12
18
  end
13
19
 
20
+ def set_proxy(address, port, user = nil, password = nil)
21
+ @engine.browser.set_proxy(address, port, user, password)
22
+ end
23
+
14
24
  def search(keyword)
15
25
  @engine.search(keyword)
16
26
  end
17
27
  end
18
28
 
19
- def test
20
- google = SerpScraper.new(engine: 'google', tld: 'se')
21
-
22
- # Set language to Swedish
23
- google.engine.parameter('hl', 'sv')
24
-
25
- # GO, FETCH!
26
- response = google.search("casino faktura")
27
-
28
- # Return search results
29
- response.results
30
- end
31
-
32
- require 'uri'
33
- require 'mechanize'
34
- require 'addressable/uri'
35
- require 'nokogiri'
36
29
  require 'engines/google'
37
30
  require 'serp_response'
data/serp_scraper.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'serp_scraper'
3
- s.version = '0.0.2'
3
+ s.version = '0.0.3'
4
4
  s.date = '2017-05-26'
5
5
 
6
6
  s.homepage = 'https://github.com/kjellberg'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: serp_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rasmus Kjellberg