generalscraper 0.0.11 → 0.0.12
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +31 -26
- data/lib/parse_page.rb +3 -3
- metadata +4 -4
- data/lib/proxy_manager.rb +0 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a23d6483229cb3d18a14d8ba906658779f1a80d2
+  data.tar.gz: 0a719a46f8b3091880419495ad8e746d3e53e722
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9554472410fb5879f28de9fdd3e55625c2d5016435bf3df4ad314d6f2cb9b77101fde746f97d651575e006f49d3825256095ea6b73d1154d5072d5cee8ee12c1
+  data.tar.gz: 76f39348dde21f700ce560346243b5b18e857340ce59a973cb23c6735d89ea626d93a3e1cd0d3c6027e201e74d2fef1762a640f0c3cbb35792f89c5300def338

(The 0.0.11 digest values were elided in the rendered page and are left blank above.)
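Both digest families cover the two archives packed inside the .gem file. As a quick illustration, a sketch using Ruby's standard Digest library, assuming metadata.gz and data.tar.gz have already been extracted from the .gem into the current directory:

```ruby
require 'digest'

# A packaged .gem is a tar archive containing metadata.gz and data.tar.gz;
# checksums.yaml records SHA1 and SHA512 digests for each of them.
%w[metadata.gz data.tar.gz].each do |f|
  puts "#{f} SHA1:   #{Digest::SHA1.file(f).hexdigest}"
  puts "#{f} SHA512: #{Digest::SHA512.file(f).hexdigest}"
end
```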
data/lib/generalscraper.rb
CHANGED
@@ -1,46 +1,50 @@
 require 'json'
 require 'nokogiri'
 require 'mechanize'
+require 'requestmanager'
+require 'pry'
 
 load 'parse_page.rb'
-load 'proxy_manager.rb'
 
 class GeneralScraper
   include ParsePage
-  include ProxyManager
 
-  def initialize(operators, searchterm, proxylist, use_proxy)
+  def initialize(operators, searchterm, proxylist)
     @operators = operators
     @searchterm = searchterm
     @op_val = @operators.split(" ")[0].split(":")[1]
-    @proxylist =
-    @
+    @proxylist = proxylist
+    @requests = RequestManager.new(@proxylist, [4, 15], 1)
 
     @output = Array.new
     @urllist = Array.new
     @startindex = 10
-    @use_proxy = use_proxy
-
-    # Generate driver
-    profile = Selenium::WebDriver::Firefox::Profile.new
-    profile['intl.accept_languages'] = 'en'
-    @driver = Selenium::WebDriver.for :firefox, profile: profile
   end
 
   # Searches for links on Google
   def search
-
+    check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
+                  "http://google.com", (@operators + " " + @searchterm))
+  end
+
+  # Check that page with links loaded
+  def check_results(page, *requested_page)
+    if page.include?("To continue, please type the characters below:")
+      @requests.restart_browser
+      check_results(@requests.get_page(requested_page), requested_page)
+    else
+      categorizeLinks(page)
+    end
   end
 
   # Gets the links from the page
-  def getLinks(page)
-
-
-
-
-    return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
+  def getLinks(page)
+    html = Nokogiri::HTML(page)
+
+    # Get array of links
+    return html.css("a").inject(Array.new) do |link_arr, al|
       begin
-        link_arr.push(al
+        link_arr.push(al["href"])
       rescue
 
       end
@@ -52,12 +56,14 @@ class GeneralScraper
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
     links = getLinks(page)
+
+    # Categorize as results or search pages
     links.each do |link|
       if link
         if isResultLink?(link)
           siteURLSave(link)
         elsif isSearchPageLink?(link)
-          nextSearchPage(link)
+          nextSearchPage("google.com"+link)
         end
       end
     end
@@ -88,26 +94,25 @@ class GeneralScraper
 
     if page_index_num.to_i == @startindex
       @startindex += 10
-
+      check_results(@requests.get_page(link), link)
     end
   end
 
-
   # Gets all data and returns in JSON
   def getData
     search
     @urllist.each do |url|
-      getPageData(url
+      getPageData(url)
     end
-
+
+    @requests.close_all_browsers
     return JSON.pretty_generate(@output)
   end
 
   # Returns a list of search result URLs
   def getURLs
     search
-    @
+    @requests.close_all_browsers
     return JSON.pretty_generate(@urllist)
   end
 end
-

(Several removed 0.0.11 lines were elided in the rendered page; they are left blank or truncated above as shown.)
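Taken together, these changes swap the built-in Selenium/proxy plumbing for the requestmanager gem: callers now pass a proxy list instead of a `use_proxy` flag, and browser lifecycle lives in `RequestManager`. A minimal usage sketch; the proxy entry format and the example operators/search term are assumptions, not from this diff:

```ruby
require 'generalscraper'

# Hypothetical proxy list; the exact entry format is whatever the
# requestmanager gem expects (e.g. "ip:port" strings) and is assumed here.
proxies = ["203.0.113.5:8080", "203.0.113.9:3128"]

# operators narrows the Google search (e.g. a site: filter);
# searchterm is the query itself.
scraper = GeneralScraper.new("site:example.com", "transparency report", proxies)

puts scraper.getData   # JSON array of parsed result pages
# or: puts scraper.getURLs  # JSON array of result URLs only
```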
data/lib/parse_page.rb
CHANGED
@@ -2,10 +2,10 @@ require 'uploadconvert'
 
 module ParsePage
   # Get both page metadata and text
-  def getPageData(url
+  def getPageData(url)
     begin
-      page =
-      html = Nokogiri::HTML(page
+      page = @requests.get_page(url)
+      html = Nokogiri::HTML(page)
       pagehash = getMetadata(url, html)
       pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
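`getPageData` now fetches raw HTML through the shared `@requests` instance and hands it to Nokogiri. A self-contained sketch of that fetch-and-parse flow, with a plain Net::HTTP stand-in for `@requests.get_page` (hypothetical, for illustration only):

```ruby
require 'nokogiri'
require 'net/http'
require 'uri'

# Stand-in for @requests.get_page(url): fetch the page body as a string.
# (Hypothetical; the real method goes through RequestManager's browsers.)
def get_page(url)
  Net::HTTP.get(URI.parse(url))
end

url = "https://example.com"
html = Nokogiri::HTML(get_page(url))

# ParsePage then builds a hash of metadata plus page text; a trivial
# equivalent of the pagehash it pushes onto @output:
pagehash = { "url" => url, "title" => html.css("title").text }
puts pagehash
```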
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.11
+  version: 0.0.12
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2015-11-03 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com
@@ -18,7 +18,6 @@ extra_rdoc_files: []
 files:
 - lib/generalscraper.rb
 - lib/parse_page.rb
-- lib/proxy_manager.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper
 licenses:
 - GPL
@@ -39,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
 summary: Scrapes Google
 test_files: []
+has_rdoc:

(The removed date and rubygems_version values were elided in the rendered page and are left truncated above; the old version number 0.0.11 is taken from this page's title.)
data/lib/proxy_manager.rb
DELETED
@@ -1,70 +0,0 @@
-require 'active_support/time'
-require 'mechanize'
-require 'uri'
-require 'selenium-webdriver'
-
-module ProxyManager
-  # Get the page with a proxy
-  def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
-    agent = Mechanize.new do |a|
-      a.user_agent_alias = "Linux Firefox"
-
-      # Set proxy if specified, otherwise delay to avoid blocks
-      if use_proxy
-        a.set_proxy(*getRandomProxy(url))
-      else
-        sleep(rand(30..90))
-      end
-    end
-
-    # Slightly different based on filling in form or not
-    begin
-      if form_input
-        driver.navigate.to url
-        element = driver.find_element(name: "q")
-        element.send_keys form_input
-        element.submit
-        puts "Searched for: " + form_input
-
-        return driver
-      else
-        puts "Getting page " + url
-        driver.navigate.to url
-        return driver
-      end
-    rescue # Only retry request 10 times
-      begin
-        puts "FAILED"
-        getPage(url, form_input, fail_count+=1) if fail_count < 10
-      rescue
-      end
-    end
-  end
-
-  # Choose a random proxy
-  def getRandomProxy(url)
-    max = @proxylist.length
-    chosen = @proxylist[Random.rand(max)]
-
-    # Only use proxy if it hasn't been used in last 20 seconds on same host
-    if isNotUsed?(chosen, url)
-      @usedproxies[chosen] = [Time.now, URI.parse(url).host]
-      return parseProxy(chosen)
-    else
-      sleep(0.005)
-      getRandomProxy(url)
-    end
-  end
-
-  # Splits up proxy into IP, port, user, password
-  def parseProxy(chosen)
-    proxy_info = chosen.split(":")
-    proxy_info[proxy_info.length-1] = proxy_info.last.strip
-    return proxy_info
-  end
-
-  # Checks if a proxy has been used on domain in the last 20 seconds
-  def isNotUsed?(chosen, url)
-    return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
-  end
-end
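Everything this module did (proxy rotation, per-host cooldowns, Selenium driving) is delegated to the external requestmanager gem as of this release. For reference, the RequestManager calls the new code relies on, as they appear in this diff; a sketch, not the gem's full API, and the proxy entry plus the wait-range interpretation are assumptions:

```ruby
require 'requestmanager'

# Constructor arguments as used in generalscraper.rb above: the proxy list,
# a two-element array (presumably a min/max wait range in seconds; an
# assumption), and the number of browser instances.
requests = RequestManager.new(["203.0.113.5:8080"], [4, 15], 1)

# Fetch a page, optionally typing a query into the page's search form.
page = requests.get_page("http://google.com", "site:example.com foo")

requests.restart_browser      # used above to recover from a CAPTCHA page
requests.close_all_browsers   # clean shutdown once scraping is done
```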