generalscraper 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +44 -14
- data/lib/parse_page.rb +3 -2
- data/lib/proxy_manager.rb +11 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29d8ecb9bacae3fb5f21d57bd48aa6f7aa7bf9e3
|
4
|
+
data.tar.gz: 7bb42fc8560c5be6e65dd93827b4eafbfa647718
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02e934121567d5fb9f18392581ac145e94f01ee1473261090d15fd0490abeff8a96888fed556f792e46728725d6d03aa542fb34fc309e0dc66ecc2204d6988c7
|
7
|
+
data.tar.gz: c020dcf77f83b20e52325988ad6401a8e315410bbda70d4b5d58842065579cb897bfe6869c9f88eb8e60a89f243835bd4fbbdd5fa4713910b77033684d4b3840
|
data/lib/generalscraper.rb
CHANGED
@@ -20,48 +20,75 @@ class GeneralScraper
|
|
20
20
|
@urllist = Array.new
|
21
21
|
@startindex = 10
|
22
22
|
@use_proxy = use_proxy
|
23
|
+
|
24
|
+
# Generate driver
|
25
|
+
profile = Selenium::WebDriver::Firefox::Profile.new
|
26
|
+
profile['intl.accept_languages'] = 'en'
|
27
|
+
@driver = Selenium::WebDriver.for :firefox, profile: profile
|
23
28
|
end
|
24
29
|
|
25
30
|
# Searches for links on Google
|
26
31
|
def search
|
27
|
-
categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
|
32
|
+
categorizeLinks(getPage("http://google.com", @driver, @operators + " " + @searchterm, @use_proxy))
|
33
|
+
end
|
34
|
+
|
35
|
+
# Gets the links from the page
|
36
|
+
def getLinks(page)
|
37
|
+
# Sleep while things load
|
38
|
+
sleep(10)
|
39
|
+
|
40
|
+
# Extract arr
|
41
|
+
return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
|
42
|
+
begin
|
43
|
+
link_arr.push(al.attribute("href"))
|
44
|
+
rescue
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
link_arr
|
49
|
+
end
|
28
50
|
end
|
29
51
|
|
30
52
|
# Categorizes the links on results page into results and other search pages
|
31
53
|
def categorizeLinks(page)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
54
|
+
links = getLinks(page)
|
55
|
+
links.each do |link|
|
56
|
+
if link
|
57
|
+
if isResultLink?(link)
|
58
|
+
siteURLSave(link)
|
59
|
+
elsif isSearchPageLink?(link)
|
60
|
+
nextSearchPage(link)
|
61
|
+
end
|
37
62
|
end
|
38
63
|
end
|
39
64
|
end
|
40
65
|
|
41
66
|
# Determines if url is link to search result
|
42
67
|
def isResultLink?(link)
|
43
|
-
return (link.
|
68
|
+
return (link.include? @op_val) &&
|
69
|
+
(!link.include? "webcache") &&
|
70
|
+
(!link.include? @operators.gsub(" ", "+")) &&
|
71
|
+
(!link.include?("translate.google"))
|
44
72
|
end
|
45
73
|
|
46
74
|
# Determines if URL is link to next search page
|
47
75
|
def isSearchPageLink?(link)
|
48
|
-
return (link.
|
76
|
+
return (link.include? "&sa=N") && (link.include? "&start=")
|
49
77
|
end
|
50
78
|
|
51
79
|
|
52
80
|
# Parse and save the URLs for search results
|
53
81
|
def siteURLSave(link)
|
54
|
-
|
55
|
-
@urllist.push(site_url.split("&")[0]) if site_url
|
82
|
+
@urllist.push(link)
|
56
83
|
end
|
57
84
|
|
58
85
|
# Process search links and go to next page
|
59
86
|
def nextSearchPage(link)
|
60
|
-
page_index_num = link.
|
61
|
-
|
87
|
+
page_index_num = link.split("&start=")[1].split("&sa=N")[0]
|
88
|
+
|
62
89
|
if page_index_num.to_i == @startindex
|
63
90
|
@startindex += 10
|
64
|
-
categorizeLinks(getPage(
|
91
|
+
categorizeLinks(getPage(link, @driver, @use_proxy))
|
65
92
|
end
|
66
93
|
end
|
67
94
|
|
@@ -70,14 +97,17 @@ class GeneralScraper
|
|
70
97
|
def getData
|
71
98
|
search
|
72
99
|
@urllist.each do |url|
|
73
|
-
getPageData(url)
|
100
|
+
getPageData(url, @driver)
|
74
101
|
end
|
102
|
+
@driver.close
|
75
103
|
return JSON.pretty_generate(@output)
|
76
104
|
end
|
77
105
|
|
78
106
|
# Returns a list of search result URLs
|
79
107
|
def getURLs
|
80
108
|
search
|
109
|
+
@driver.close
|
81
110
|
return JSON.pretty_generate(@urllist)
|
82
111
|
end
|
83
112
|
end
|
113
|
+
|
data/lib/parse_page.rb
CHANGED
@@ -2,9 +2,10 @@ require 'uploadconvert'
|
|
2
2
|
|
3
3
|
module ParsePage
|
4
4
|
# Get both page metadata and text
|
5
|
-
def getPageData(url)
|
5
|
+
def getPageData(url, driver)
|
6
6
|
begin
|
7
|
-
|
7
|
+
page = getPage(url, driver, nil, 5, false)
|
8
|
+
html = Nokogiri::HTML(page.page_source)
|
8
9
|
pagehash = getMetadata(url, html)
|
9
10
|
pagehash = getContent(url, pagehash, html)
|
10
11
|
@output.push(pagehash)
|
data/lib/proxy_manager.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'active_support/time'
|
2
2
|
require 'mechanize'
|
3
3
|
require 'uri'
|
4
|
+
require 'selenium-webdriver'
|
4
5
|
|
5
6
|
module ProxyManager
|
6
7
|
# Get the page with a proxy
|
7
|
-
def getPage(url, form_input = nil, fail_count = 0, use_proxy)
|
8
|
+
def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
|
8
9
|
agent = Mechanize.new do |a|
|
9
10
|
a.user_agent_alias = "Linux Firefox"
|
10
11
|
|
@@ -19,12 +20,17 @@ module ProxyManager
|
|
19
20
|
# Slightly different based on filling in form or not
|
20
21
|
begin
|
21
22
|
if form_input
|
22
|
-
|
23
|
-
|
24
|
-
|
23
|
+
driver.navigate.to url
|
24
|
+
element = driver.find_element(name: "q")
|
25
|
+
element.send_keys form_input
|
26
|
+
element.submit
|
27
|
+
puts "Searched for: " + form_input
|
28
|
+
|
29
|
+
return driver
|
25
30
|
else
|
26
31
|
puts "Getting page " + url
|
27
|
-
|
32
|
+
driver.navigate.to url
|
33
|
+
return driver
|
28
34
|
end
|
29
35
|
rescue # Only retry request 10 times
|
30
36
|
puts "FAILED"
|