generalscraper 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +4 -4
- data/lib/proxy_manager.rb +10 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ee21102a93458476611426362719e7b3bc7edc9d
|
4
|
+
data.tar.gz: 0ae6818f49029a0b3cab8c8e9d9ff3ba7f23d26e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2203574a839f5c122621153bbd8666c5686bbfd3ee9020a74c26e6acdfa80aeec5263a45ea2d80bc092568b1633bc8f964429ba3fee6d5853b854a5a767a5ddd
|
7
|
+
data.tar.gz: 419c278558b0b17ddeadc3cd61c4891711fb20d0678706f8f50c16441165a0c9f8c0172f5c2160a07d8b690976162d8e0c2f93758c01891845c571aa49d1c1b2
|
data/lib/generalscraper.rb
CHANGED
@@ -9,7 +9,7 @@ class GeneralScraper
|
|
9
9
|
include ParsePage
|
10
10
|
include ProxyManager
|
11
11
|
|
12
|
-
def initialize(operators, searchterm, proxylist)
|
12
|
+
def initialize(operators, searchterm, proxylist, use_proxy)
|
13
13
|
@operators = operators
|
14
14
|
@searchterm = searchterm
|
15
15
|
@op_val = @operators.split(" ")[0].split(":")[1]
|
@@ -19,11 +19,12 @@ class GeneralScraper
|
|
19
19
|
@output = Array.new
|
20
20
|
@urllist = Array.new
|
21
21
|
@startindex = 10
|
22
|
+
@use_proxy = use_proxy
|
22
23
|
end
|
23
24
|
|
24
25
|
# Searches for links on Google
|
25
26
|
def search
|
26
|
-
categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
|
27
|
+
categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
|
27
28
|
end
|
28
29
|
|
29
30
|
# Categorizes the links on results page into results and other search pages
|
@@ -60,7 +61,7 @@ class GeneralScraper
|
|
60
61
|
|
61
62
|
if page_index_num.to_i == @startindex
|
62
63
|
@startindex += 10
|
63
|
-
categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
|
64
|
+
categorizeLinks(getPage("http://google.com" + link.href + "&filter=0", @use_proxy))
|
64
65
|
end
|
65
66
|
end
|
66
67
|
|
@@ -80,4 +81,3 @@ class GeneralScraper
|
|
80
81
|
return JSON.pretty_generate(@urllist)
|
81
82
|
end
|
82
83
|
end
|
83
|
-
|
data/lib/proxy_manager.rb
CHANGED
@@ -4,10 +4,16 @@ require 'uri'
|
|
4
4
|
|
5
5
|
module ProxyManager
|
6
6
|
# Get the page with a proxy
|
7
|
-
def getPage(url, form_input = nil, fail_count = 0)
|
7
|
+
def getPage(url, form_input = nil, fail_count = 0, use_proxy)
|
8
8
|
agent = Mechanize.new do |a|
|
9
9
|
a.user_agent_alias = "Linux Firefox"
|
10
|
-
|
10
|
+
|
11
|
+
# Set proxy if specified, otherwise delay to avoid blocks
|
12
|
+
if use_proxy
|
13
|
+
a.set_proxy(*getRandomProxy(url))
|
14
|
+
else
|
15
|
+
sleep(20)
|
16
|
+
end
|
11
17
|
end
|
12
18
|
|
13
19
|
# Slightly different based on filling in form or not
|
@@ -19,8 +25,8 @@ module ProxyManager
|
|
19
25
|
else
|
20
26
|
return agent.get(url)
|
21
27
|
end
|
22
|
-
rescue # Only retry request
|
23
|
-
getPage(url, form_input, fail_count+=1) if fail_count <
|
28
|
+
rescue # Only retry request 10 times
|
29
|
+
getPage(url, form_input, fail_count+=1) if fail_count < 10
|
24
30
|
end
|
25
31
|
end
|
26
32
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-22 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|