generalscraper 0.0.6 → 0.0.7

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: dbbd54bb1986056a1e3207eda365f04d4684e0f2
-  data.tar.gz: 274b1a30166371f8f587484020c20a86690e7161
+  metadata.gz: ee21102a93458476611426362719e7b3bc7edc9d
+  data.tar.gz: 0ae6818f49029a0b3cab8c8e9d9ff3ba7f23d26e
 SHA512:
-  metadata.gz: 0f99388b754103326c19a436d41dac5740169beb36554c40f6731a6c0bb3c2d0232742d17fab442c41ee34f55e424dfe6c046d0a0b2c7156d53d5a370d1800ac
-  data.tar.gz: d7f7ed11b8d11c4e0a8b85e83a35d86b3729633d4f2335d8dc6e3d206bd9b04eaaf02070458b28f879b8fd5df9620da37c34770606b3d26a58faa385c78f45e9
+  metadata.gz: 2203574a839f5c122621153bbd8666c5686bbfd3ee9020a74c26e6acdfa80aeec5263a45ea2d80bc092568b1633bc8f964429ba3fee6d5853b854a5a767a5ddd
+  data.tar.gz: 419c278558b0b17ddeadc3cd61c4891711fb20d0678706f8f50c16441165a0c9f8c0172f5c2160a07d8b690976162d8e0c2f93758c01891845c571aa49d1c1b2
data/lib/generalscraper.rb CHANGED
@@ -9,7 +9,7 @@ class GeneralScraper
   include ParsePage
   include ProxyManager
 
-  def initialize(operators, searchterm, proxylist)
+  def initialize(operators, searchterm, proxylist, use_proxy)
     @operators = operators
     @searchterm = searchterm
     @op_val = @operators.split(" ")[0].split(":")[1]
@@ -19,11 +19,12 @@ class GeneralScraper
     @output = Array.new
     @urllist = Array.new
     @startindex = 10
+    @use_proxy = use_proxy
   end
 
   # Searches for links on Google
   def search
-    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
+    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
   end
 
   # Categorizes the links on results page into results and other search pages
@@ -60,7 +61,7 @@ class GeneralScraper
 
       if page_index_num.to_i == @startindex
         @startindex += 10
-        categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
+        categorizeLinks(getPage("http://google.com" + link.href + "&filter=0", @use_proxy))
       end
     end
 
@@ -80,4 +81,3 @@ class GeneralScraper
     return JSON.pretty_generate(@urllist)
   end
 end
-
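
The change above threads a new use_proxy flag from the GeneralScraper constructor into ProxyManager#getPage. A minimal caller sketch, assuming the gem is loaded with require 'generalscraper' and using placeholder values for the operator string, search term, and proxy-list path:

  require 'generalscraper'

  # true  => route requests through a random proxy from the supplied list
  # false => skip the proxy and rely on the 20-second delay between requests
  scraper = GeneralScraper.new("site:example.com", "transparency report", "proxies.txt", true)
  scraper.search
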
data/lib/proxy_manager.rb CHANGED
@@ -4,10 +4,16 @@ require 'uri'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil, fail_count = 0)
+  def getPage(url, form_input = nil, fail_count = 0, use_proxy)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
-      a.set_proxy(*getRandomProxy(url))
+
+      # Set proxy if specified, otherwise delay to avoid blocks
+      if use_proxy
+        a.set_proxy(*getRandomProxy(url))
+      else
+        sleep(20)
+      end
     end
 
     # Slightly different based on filling in form or not
@@ -19,8 +25,8 @@ module ProxyManager
       else
         return agent.get(url)
       end
-    rescue # Only retry request 5 times
-      getPage(url, form_input, fail_count+=1) if fail_count < 5
+    rescue # Only retry request 10 times
+      getPage(url, form_input, fail_count+=1) if fail_count < 10
     end
   end
 
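
For reference, the Mechanize setup that the new use_proxy branch produces amounts to the configuration sketched below. getRandomProxy is the gem's own helper elsewhere in ProxyManager, build_agent is only an illustrative wrapper (not part of the gem), and the host and port are placeholders standing in for whatever that helper returns:

  require 'mechanize'

  def build_agent(use_proxy)
    Mechanize.new do |a|
      a.user_agent_alias = "Linux Firefox"
      if use_proxy
        # proxy host and port, e.g. a pair such as getRandomProxy would pick
        a.set_proxy("203.0.113.10", 8080)
      else
        # no proxy: pause 20 seconds so repeated requests are spaced out
        sleep(20)
      end
    end
  end
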
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-10 00:00:00.000000000 Z
+date: 2014-08-22 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com
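
To pull in this release, a Gemfile pin on the new version might look like:

  # Gemfile
  gem 'generalscraper', '0.0.7'
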