generalscraper 0.0.9 → 0.0.10
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +44 -14
- data/lib/parse_page.rb +3 -2
- data/lib/proxy_manager.rb +11 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 29d8ecb9bacae3fb5f21d57bd48aa6f7aa7bf9e3
+  data.tar.gz: 7bb42fc8560c5be6e65dd93827b4eafbfa647718
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 02e934121567d5fb9f18392581ac145e94f01ee1473261090d15fd0490abeff8a96888fed556f792e46728725d6d03aa542fb34fc309e0dc66ecc2204d6988c7
+  data.tar.gz: c020dcf77f83b20e52325988ad6401a8e315410bbda70d4b5d58842065579cb897bfe6869c9f88eb8e60a89f243835bd4fbbdd5fa4713910b77033684d4b3840
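To check a downloaded copy of this release against the published digests, here is a minimal verification sketch; the local file names are assumptions about where you extracted the gem, and Digest is Ruby's standard library:

    require 'digest'

    # Expected SHA1 digests from the new checksums.yaml above.
    # File paths are hypothetical; point them at your extracted gem.
    {
      'metadata.gz' => '29d8ecb9bacae3fb5f21d57bd48aa6f7aa7bf9e3',
      'data.tar.gz' => '7bb42fc8560c5be6e65dd93827b4eafbfa647718'
    }.each do |file, expected|
      actual = Digest::SHA1.file(file).hexdigest
      puts "#{file}: #{actual == expected ? 'OK' : 'MISMATCH'}"
    end

The same check works for the SHA512 values with Digest::SHA512.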
data/lib/generalscraper.rb
CHANGED
@@ -20,48 +20,75 @@ class GeneralScraper
     @urllist = Array.new
     @startindex = 10
     @use_proxy = use_proxy
+
+    # Generate driver
+    profile = Selenium::WebDriver::Firefox::Profile.new
+    profile['intl.accept_languages'] = 'en'
+    @driver = Selenium::WebDriver.for :firefox, profile: profile
   end
 
   # Searches for links on Google
   def search
-    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
+    categorizeLinks(getPage("http://google.com", @driver, @operators + " " + @searchterm, @use_proxy))
+  end
+
+  # Gets the links from the page
+  def getLinks(page)
+    # Sleep while things load
+    sleep(10)
+
+    # Extract arr
+    return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
+      begin
+        link_arr.push(al.attribute("href"))
+      rescue
+
+      end
+
+      link_arr
+    end
   end
 
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
-
-
-
-
-
+    links = getLinks(page)
+    links.each do |link|
+      if link
+        if isResultLink?(link)
+          siteURLSave(link)
+        elsif isSearchPageLink?(link)
+          nextSearchPage(link)
+        end
       end
     end
   end
 
   # Determines if url is link to search result
   def isResultLink?(link)
-    return (link.
+    return (link.include? @op_val) &&
+      (!link.include? "webcache") &&
+      (!link.include? @operators.gsub(" ", "+")) &&
+      (!link.include?("translate.google"))
   end
 
   # Determines if URL is link to next search page
   def isSearchPageLink?(link)
-    return (link.
+    return (link.include? "&sa=N") && (link.include? "&start=")
   end
 
 
   # Parse and save the URLs for search results
   def siteURLSave(link)
-
-    @urllist.push(site_url.split("&")[0]) if site_url
+    @urllist.push(link)
   end
 
   # Process search links and go to next page
   def nextSearchPage(link)
-    page_index_num = link.
-
+    page_index_num = link.split("&start=")[1].split("&sa=N")[0]
+
     if page_index_num.to_i == @startindex
       @startindex += 10
-      categorizeLinks(getPage(
+      categorizeLinks(getPage(link, @driver, @use_proxy))
     end
   end
 
@@ -70,14 +97,17 @@ class GeneralScraper
   def getData
     search
     @urllist.each do |url|
-      getPageData(url)
+      getPageData(url, @driver)
     end
+    @driver.close
     return JSON.pretty_generate(@output)
   end
 
   # Returns a list of search result URLs
   def getURLs
     search
+    @driver.close
     return JSON.pretty_generate(@urllist)
   end
 end
+
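The net effect of these changes is that GeneralScraper now owns a single Selenium Firefox driver for its whole lifetime: it is created in initialize, threaded through getPage and getPageData, and closed at the end of getData or getURLs. A usage sketch follows, with the caveat that the diff shows only the tail of initialize, so the constructor arguments here are assumptions:

    require 'generalscraper'

    # Assumed argument order: Google search operators, search term, proxy flag.
    scraper = GeneralScraper.new("site:example.com", "open data", false)

    # Runs the search, closes the shared driver, returns pretty-printed JSON.
    puts scraper.getURLs

Note that getData and getURLs both close the shared driver, so a single instance supports one retrieval call; a second call would need a fresh GeneralScraper.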
data/lib/parse_page.rb
CHANGED
@@ -2,9 +2,10 @@ require 'uploadconvert'
 
 module ParsePage
   # Get both page metadata and text
-  def getPageData(url)
+  def getPageData(url, driver)
     begin
-
+      page = getPage(url, driver, nil, 5, false)
+      html = Nokogiri::HTML(page.page_source)
       pagehash = getMetadata(url, html)
       pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
data/lib/proxy_manager.rb
CHANGED
@@ -1,10 +1,11 @@
 require 'active_support/time'
 require 'mechanize'
 require 'uri'
+require 'selenium-webdriver'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil, fail_count = 0, use_proxy)
+  def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
 
@@ -19,12 +20,17 @@ module ProxyManager
     # Slightly different based on filling in form or not
     begin
       if form_input
-
-
-
+        driver.navigate.to url
+        element = driver.find_element(name: "q")
+        element.send_keys form_input
+        element.submit
+        puts "Searched for: " + form_input
+
+        return driver
       else
         puts "Getting page " + url
-
+        driver.navigate.to url
+        return driver
       end
     rescue # Only retry request 10 times
       puts "FAILED"