generalscraper 0.0.6 → 0.0.7

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dbbd54bb1986056a1e3207eda365f04d4684e0f2
4
- data.tar.gz: 274b1a30166371f8f587484020c20a86690e7161
3
+ metadata.gz: ee21102a93458476611426362719e7b3bc7edc9d
4
+ data.tar.gz: 0ae6818f49029a0b3cab8c8e9d9ff3ba7f23d26e
5
5
  SHA512:
6
- metadata.gz: 0f99388b754103326c19a436d41dac5740169beb36554c40f6731a6c0bb3c2d0232742d17fab442c41ee34f55e424dfe6c046d0a0b2c7156d53d5a370d1800ac
7
- data.tar.gz: d7f7ed11b8d11c4e0a8b85e83a35d86b3729633d4f2335d8dc6e3d206bd9b04eaaf02070458b28f879b8fd5df9620da37c34770606b3d26a58faa385c78f45e9
6
+ metadata.gz: 2203574a839f5c122621153bbd8666c5686bbfd3ee9020a74c26e6acdfa80aeec5263a45ea2d80bc092568b1633bc8f964429ba3fee6d5853b854a5a767a5ddd
7
+ data.tar.gz: 419c278558b0b17ddeadc3cd61c4891711fb20d0678706f8f50c16441165a0c9f8c0172f5c2160a07d8b690976162d8e0c2f93758c01891845c571aa49d1c1b2
@@ -9,7 +9,7 @@ class GeneralScraper
9
9
  include ParsePage
10
10
  include ProxyManager
11
11
 
12
- def initialize(operators, searchterm, proxylist)
12
+ def initialize(operators, searchterm, proxylist, use_proxy)
13
13
  @operators = operators
14
14
  @searchterm = searchterm
15
15
  @op_val = @operators.split(" ")[0].split(":")[1]
@@ -19,11 +19,12 @@ class GeneralScraper
19
19
  @output = Array.new
20
20
  @urllist = Array.new
21
21
  @startindex = 10
22
+ @use_proxy = use_proxy
22
23
  end
23
24
 
24
25
  # Searches for links on Google
25
26
  def search
26
- categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
27
+ categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
27
28
  end
28
29
 
29
30
  # Categorizes the links on results page into results and other search pages
@@ -60,7 +61,7 @@ class GeneralScraper
60
61
 
61
62
  if page_index_num.to_i == @startindex
62
63
  @startindex += 10
63
- categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
64
+ categorizeLinks(getPage("http://google.com" + link.href + "&filter=0", @use_proxy))
64
65
  end
65
66
  end
66
67
 
@@ -80,4 +81,3 @@ class GeneralScraper
80
81
  return JSON.pretty_generate(@urllist)
81
82
  end
82
83
  end
83
-
data/lib/proxy_manager.rb CHANGED
@@ -4,10 +4,16 @@ require 'uri'
4
4
 
5
5
  module ProxyManager
6
6
  # Get the page with a proxy
7
- def getPage(url, form_input = nil, fail_count = 0)
7
+ def getPage(url, form_input = nil, fail_count = 0, use_proxy)
8
8
  agent = Mechanize.new do |a|
9
9
  a.user_agent_alias = "Linux Firefox"
10
- a.set_proxy(*getRandomProxy(url))
10
+
11
+ # Set proxy if specified, otherwise delay to avoid blocks
12
+ if use_proxy
13
+ a.set_proxy(*getRandomProxy(url))
14
+ else
15
+ sleep(20)
16
+ end
11
17
  end
12
18
 
13
19
  # Slightly different based on filling in form or not
@@ -19,8 +25,8 @@ module ProxyManager
19
25
  else
20
26
  return agent.get(url)
21
27
  end
22
- rescue # Only retry request 5 times
23
- getPage(url, form_input, fail_count+=1) if fail_count < 5
28
+ rescue # Only retry request 10 times
29
+ getPage(url, form_input, fail_count+=1) if fail_count < 10
24
30
  end
25
31
  end
26
32
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-04-10 00:00:00.000000000 Z
11
+ date: 2014-08-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes Google
14
14
  email: shidash@shidash.com