generalscraper 0.0.11 → 0.0.12
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +31 -26
- data/lib/parse_page.rb +3 -3
- metadata +4 -4
- data/lib/proxy_manager.rb +0 -70
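In short: 0.0.12 removes the bundled ProxyManager module (and with it the direct Selenium and Mechanize plumbing) and delegates page fetching, proxy rotation, and browser lifecycle to the external requestmanager gem. It also adds a CAPTCHA check (check_results) around Google results pages and switches link extraction from Selenium's find_elements to Nokogiri.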
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a23d6483229cb3d18a14d8ba906658779f1a80d2
+  data.tar.gz: 0a719a46f8b3091880419495ad8e746d3e53e722
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9554472410fb5879f28de9fdd3e55625c2d5016435bf3df4ad314d6f2cb9b77101fde746f97d651575e006f49d3825256095ea6b73d1154d5072d5cee8ee12c1
+  data.tar.gz: 76f39348dde21f700ce560346243b5b18e857340ce59a973cb23c6735d89ea626d93a3e1cd0d3c6027e201e74d2fef1762a640f0c3cbb35792f89c5300def338
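For reference, the new SHA512 values can be checked locally: a .gem file is a tar archive containing metadata.gz and data.tar.gz, so after extracting it (e.g. tar -xf generalscraper-0.0.12.gem), a minimal Ruby sketch (the filename is an assumption) would be:

require 'digest'

# Assumes data.tar.gz was extracted from the .gem into the current directory.
# Expected digest is the data.tar.gz SHA512 from the diff above.
expected = "76f39348dde21f700ce560346243b5b18e857340ce59a973cb23c6735d89ea62" \
           "6d93a3e1cd0d3c6027e201e74d2fef1762a640f0c3cbb35792f89c5300def338"
actual = Digest::SHA512.file("data.tar.gz").hexdigest
puts(actual == expected ? "data.tar.gz checksum OK" : "checksum MISMATCH")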
data/lib/generalscraper.rb
CHANGED
@@ -1,46 +1,50 @@
 require 'json'
 require 'nokogiri'
 require 'mechanize'
+require 'requestmanager'
+require 'pry'
 
 load 'parse_page.rb'
-load 'proxy_manager.rb'
 
 class GeneralScraper
   include ParsePage
-  include ProxyManager
 
-  def initialize(operators, searchterm, proxylist
+  def initialize(operators, searchterm, proxylist)
     @operators = operators
     @searchterm = searchterm
     @op_val = @operators.split(" ")[0].split(":")[1]
-    @proxylist =
-    @
+    @proxylist = proxylist
+    @requests = RequestManager.new(@proxylist, [4, 15], 1)
 
     @output = Array.new
     @urllist = Array.new
     @startindex = 10
-    @use_proxy = use_proxy
-
-    # Generate driver
-    profile = Selenium::WebDriver::Firefox::Profile.new
-    profile['intl.accept_languages'] = 'en'
-    @driver = Selenium::WebDriver.for :firefox, profile: profile
   end
 
   # Searches for links on Google
   def search
-
+    check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
+                  "http://google.com", (@operators + " " + @searchterm))
+  end
+
+  # Check that page with links loaded
+  def check_results(page, *requested_page)
+    if page.include?("To continue, please type the characters below:")
+      @requests.restart_browser
+      check_results(@requests.get_page(requested_page), requested_page)
+    else
+      categorizeLinks(page)
+    end
   end
 
   # Gets the links from the page
-  def getLinks(page)
-
-
-
-
-    return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
+  def getLinks(page)
+    html = Nokogiri::HTML(page)
+
+    # Get array of links
+    return html.css("a").inject(Array.new) do |link_arr, al|
       begin
-        link_arr.push(al
+        link_arr.push(al["href"])
       rescue
 
       end
@@ -52,12 +56,14 @@ class GeneralScraper
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
     links = getLinks(page)
+
+    # Categorize as results or search pages
     links.each do |link|
       if link
         if isResultLink?(link)
          siteURLSave(link)
         elsif isSearchPageLink?(link)
-          nextSearchPage(link)
+          nextSearchPage("google.com"+link)
         end
       end
     end
@@ -88,26 +94,25 @@ class GeneralScraper
 
     if page_index_num.to_i == @startindex
       @startindex += 10
-
+      check_results(@requests.get_page(link), link)
     end
   end
 
-
   # Gets all data and returns in JSON
   def getData
     search
     @urllist.each do |url|
-      getPageData(url
+      getPageData(url)
     end
-
+
+    @requests.close_all_browsers
     return JSON.pretty_generate(@output)
   end
 
   # Returns a list of search result URLs
   def getURLs
     search
-    @
+    @requests.close_all_browsers
     return JSON.pretty_generate(@urllist)
   end
 end
-
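Taken together, the public API after this change looks roughly like the sketch below. The constructor signature and the getURLs/getData methods come straight from the diff; the proxy address, search operators, and search term are placeholders, and the exact proxy string format RequestManager expects is an assumption.

require 'generalscraper'

# Placeholder inputs -- the "site:" operator matches the @op_val parsing above,
# but the "host:port" proxy format is an assumption, not confirmed by this diff.
proxies = ["203.0.113.5:8080"]
scraper = GeneralScraper.new("site:example.com", "transparency", proxies)

puts scraper.getURLs  # JSON array of search result URLs
puts scraper.getData  # JSON array of parsed page hashes

Of the RequestManager.new(@proxylist, [4, 15], 1) arguments, [4, 15] reads like a randomized wait range and 1 like a browser count, but neither is documented in this diff.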
data/lib/parse_page.rb
CHANGED
@@ -2,10 +2,10 @@ require 'uploadconvert'
 
 module ParsePage
   # Get both page metadata and text
-  def getPageData(url
+  def getPageData(url)
     begin
-      page =
-      html = Nokogiri::HTML(page
+      page = @requests.get_page(url)
+      html = Nokogiri::HTML(page)
       pagehash = getMetadata(url, html)
       pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.11
+  version: 0.0.12
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2015-11-03 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com
@@ -18,7 +18,6 @@ extra_rdoc_files: []
 files:
 - lib/generalscraper.rb
 - lib/parse_page.rb
-- lib/proxy_manager.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper
 licenses:
 - GPL
@@ -39,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
 summary: Scrapes Google
 test_files: []
+has_rdoc:
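After installing the release, the metadata above can be confirmed from Ruby itself; a small sketch:

require 'rubygems'

# Query the installed gemspec for this release.
spec = Gem::Specification.find_by_name("generalscraper")
puts spec.version        # expect 0.0.12
puts spec.files.inspect  # lib/proxy_manager.rb should no longer be listed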
data/lib/proxy_manager.rb
DELETED
@@ -1,70 +0,0 @@
-require 'active_support/time'
-require 'mechanize'
-require 'uri'
-require 'selenium-webdriver'
-
-module ProxyManager
-  # Get the page with a proxy
-  def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
-    agent = Mechanize.new do |a|
-      a.user_agent_alias = "Linux Firefox"
-
-      # Set proxy if specified, otherwise delay to avoid blocks
-      if use_proxy
-        a.set_proxy(*getRandomProxy(url))
-      else
-        sleep(rand(30..90))
-      end
-    end
-
-    # Slightly different based on filling in form or not
-    begin
-      if form_input
-        driver.navigate.to url
-        element = driver.find_element(name: "q")
-        element.send_keys form_input
-        element.submit
-        puts "Searched for: " + form_input
-
-        return driver
-      else
-        puts "Getting page " + url
-        driver.navigate.to url
-        return driver
-      end
-    rescue # Only retry request 10 times
-      begin
-        puts "FAILED"
-        getPage(url, form_input, fail_count+=1) if fail_count < 10
-      rescue
-      end
-    end
-  end
-
-  # Choose a random proxy
-  def getRandomProxy(url)
-    max = @proxylist.length
-    chosen = @proxylist[Random.rand(max)]
-
-    # Only use proxy if it hasn't been used in last 20 seconds on same host
-    if isNotUsed?(chosen, url)
-      @usedproxies[chosen] = [Time.now, URI.parse(url).host]
-      return parseProxy(chosen)
-    else
-      sleep(0.005)
-      getRandomProxy(url)
-    end
-  end
-
-  # Splits up proxy into IP, port, user, password
-  def parseProxy(chosen)
-    proxy_info = chosen.split(":")
-    proxy_info[proxy_info.length-1] = proxy_info.last.strip
-    return proxy_info
-  end
-
-  # Checks if a proxy has been used on domain in the last 20 seconds
-  def isNotUsed?(chosen, url)
-    return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
-  end
-end
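The one piece of logic worth noting in the deleted module is the reuse rule in isNotUsed?: a proxy is eligible for a URL unless it hit the same host within the last 20 seconds. Restated as a standalone sketch (the names here are hypothetical; whatever replaces this inside requestmanager is outside this diff):

require 'uri'

# Hypothetical standalone version of the deleted isNotUsed? check.
# used maps proxy string => [last_use_time, last_host].
def proxy_eligible?(used, proxy, url, now = Time.now)
  last = used[proxy]
  !last || last[0] <= now - 20 || last[1] != URI.parse(url).host
end

used = { "203.0.113.5:8080" => [Time.now, "google.com"] }
p proxy_eligible?(used, "203.0.113.5:8080", "http://google.com")  # => false (just used on this host)
p proxy_eligible?(used, "203.0.113.5:8080", "http://example.com") # => true (different host)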