serp_scraper 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 50907d75fe90b6a6eba27dcccf7da802e3d2b999
4
- data.tar.gz: 69210b67fa80e1df6774600ffc798e93306a3a70
3
+ metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
4
+ data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
5
5
  SHA512:
6
- metadata.gz: 030bba279587f40d63c258824c32ef6c0bed017e07a9330a008e6df292c556500b293bc01549a181aa4dd22e8a0ab38c4ec258030464848de117208735a788d9
7
- data.tar.gz: cdffcc183ba38b153fce55257a508500dc0378582dc9547dd1c88e80f8dfed1a0a741712ad3e1a4b70271392e4c8cdabb5d0fe5f3f4984400db6c994b0eb49ac
6
+ metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
7
+ data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
data/EXAMPLES.md CHANGED
@@ -15,11 +15,14 @@ end
15
15
 
16
16
  ## Country/TLD specific search
17
17
  ```ruby
18
- # Usees google.se for swedish results
18
+ # Set '.se' as TLD for swedish results
19
19
  s = SerpScraper.new(engine: 'google', tld: 'se')
20
20
 
21
- # Set language to Swedish
21
+ # Set language parameter to swedish
22
22
  s.engine.parameter('hl', 'sv')
23
23
 
24
- response = s.search('köp bilar online')
25
- ```
24
+ s.search('köp bilar online').results.each do |result|
25
+ puts result
26
+ # => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
27
+ end
28
+ ```
data/README.md CHANGED
@@ -15,10 +15,11 @@ gem 'serp_scraper'
15
15
  ```
16
16
 
17
17
  ## Examples
18
- ```ruby
19
- s = SerpScraper.new(engine: 'google')
20
- res = s.search('buy cars onlines')
21
- puts res.results[0]
18
+
19
+ ```ruby
20
+ google = SerpScraper.new(engine: 'google', tld: 'com')
21
+ first_result = google.search('buy cars onlines').results[0]
22
+ puts first_result
22
23
  # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
23
24
  ```
24
25
 
data/lib/engines/google.rb CHANGED
@@ -1,42 +1,75 @@
1
1
  class SerpScraper::Google
2
2
  attr_accessor :tld
3
3
  attr_accessor :user_agent
4
+ attr_accessor :browser
4
5
 
5
6
  def initialize(tld)
6
- self.tld = tld
7
+ # Make tld global
8
+ @tld = tld
7
9
 
10
+ # Create new Mechanize object
8
11
  @browser = Mechanize.new { |agent|
9
12
  agent.user_agent_alias = 'Mac Safari'
10
13
  }
11
-
12
- @parameters = Hash.new
13
- @parameters['gbv'] = 1
14
- @parameters['complete'] = 0
15
- @parameters['num'] = 100
16
- @parameters['pws'] = 0
17
- @parameters['nfrpr'] = 1
18
- @parameters['ie'] = 'utf-8'
19
- @parameters['oe'] = 'utf-8'
20
- @parameters['site'] = 'webhp'
21
- @parameters['source'] = 'hp'
14
+
15
+ # Set standard query parameters
16
+ @parameters = {
17
+ gbv: 1,
18
+ complete: 0,
19
+ num: 100,
20
+ pws: 0,
21
+ nfrpr: 1,
22
+ ie: 'utf-8',
23
+ oe: 'utf-8',
24
+ site: 'webhp',
25
+ source: 'hp'
26
+ }
22
27
  end
23
28
 
24
29
  def search(keyword)
30
+ # Add keyword to parameters
31
+ @parameters['q'] = keyword
32
+
33
+ # Create build google search url
34
+ search_url = build_query_url_from_keyword(keyword)
35
+
25
36
  # Do the Googleing
26
- http_response = @browser.get(build_query_url_from_keyword(keyword))
37
+ response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
38
+
39
+ # 503 error = Google Captcha
40
+ tries = 1
41
+ while response.code[/503/] and tries <= 3
42
+ # Try to solve with captcha
43
+ solve_captcha(response.uri.to_s)
44
+
45
+ # Do another search
46
+ response = @browser.get(search_url)
47
+
48
+ tries += 1
49
+ end
27
50
 
28
- return build_serp_response(http_response) if http_response.code == "200"
51
+ return build_serp_response(response) if response.code == "200"
29
52
 
30
53
  # @todo: Look for and solve captchas.
31
54
  puts "Did not get a 200 response. Maybe a captcha error?"
32
55
  end
33
56
 
34
- def build_serp_response(http_response)
57
+ def solve_captcha(captcha_url)
58
+ puts "trying to solve captcha on url #{captcha_url}"
59
+
60
+ page = @browser.get(captcha_url)
61
+ doc = Nokogiri::HTML(page.content)
62
+
63
+ image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
64
+ puts "Captcha url: " + image_url
65
+ end
66
+
67
+ def build_serp_response(response)
35
68
  sr = SerpScraper::SerpResponse.new
36
69
  sr.keyword = @parameters['q']
37
70
  sr.user_agent = @browser.user_agent
38
- sr.url = http_response.uri.to_s
39
- sr.html = http_response.content
71
+ sr.url = response.uri.to_s
72
+ sr.html = response.content
40
73
  sr.results = extract_results(sr.html)
41
74
 
42
75
  sr # Return sr
@@ -77,10 +110,8 @@ class SerpScraper::Google
77
110
  end
78
111
 
79
112
  def build_query_url_from_keyword(keyword)
80
- @parameters['q'] = keyword
81
-
82
113
  uri = Addressable::URI.new
83
- uri.host = "www.google.#{tld}"
114
+ uri.host = "www.google.#{@tld}"
84
115
  uri.scheme = "https"
85
116
  uri.path = "/search"
86
117
  uri.query_values = @parameters
data/lib/serp_scraper.rb CHANGED
@@ -1,3 +1,9 @@
1
+ require 'uri'
2
+ require 'mechanize'
3
+ require 'addressable/uri'
4
+ require 'nokogiri'
5
+ require 'deathbycaptcha'
6
+
1
7
  class SerpScraper
2
8
  attr_accessor :engine
3
9
 
@@ -11,27 +17,14 @@ class SerpScraper
11
17
  end
12
18
  end
13
19
 
20
+ def set_proxy(address, port, user = nil, password = nil)
21
+ @engine.browser.set_proxy(address, port, user, password)
22
+ end
23
+
14
24
  def search(keyword)
15
25
  @engine.search(keyword)
16
26
  end
17
27
  end
18
28
 
19
- def test
20
- google = SerpScraper.new(engine: 'google', tld: 'se')
21
-
22
- # Set language to Swedish
23
- google.engine.parameter('hl', 'sv')
24
-
25
- # GO, FETCH!
26
- response = google.search("casino faktura")
27
-
28
- # Return search results
29
- response.results
30
- end
31
-
32
- require 'uri'
33
- require 'mechanize'
34
- require 'addressable/uri'
35
- require 'nokogiri'
36
29
  require 'engines/google'
37
30
  require 'serp_response'
data/serp_scraper.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'serp_scraper'
3
- s.version = '0.0.2'
3
+ s.version = '0.0.3'
4
4
  s.date = '2017-05-26'
5
5
 
6
6
  s.homepage = 'https://github.com/kjellberg'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: serp_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rasmus Kjellberg