generalscraper 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +14 -2
- data/lib/parse_page.rb +19 -12
- data/lib/proxy_manager.rb +33 -15
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dbbd54bb1986056a1e3207eda365f04d4684e0f2
+  data.tar.gz: 274b1a30166371f8f587484020c20a86690e7161
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0f99388b754103326c19a436d41dac5740169beb36554c40f6731a6c0bb3c2d0232742d17fab442c41ee34f55e424dfe6c046d0a0b2c7156d53d5a370d1800ac
+  data.tar.gz: d7f7ed11b8d11c4e0a8b85e83a35d86b3729633d4f2335d8dc6e3d206bd9b04eaaf02070458b28f879b8fd5df9620da37c34770606b3d26a58faa385c78f45e9
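As an aside, the published digests can be re-checked locally with Ruby's standard library. A minimal sketch, assuming the extracted metadata.gz and data.tar.gz sit next to checksums.yaml in the current directory (the script name is made up):

# verify_checksums.rb -- hypothetical helper, not part of the gem
require 'digest'
require 'yaml'

checksums = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |file|
  actual   = Digest::SHA512.file(file).hexdigest
  expected = checksums['SHA512'][file]
  puts "#{file}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end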
data/lib/generalscraper.rb
CHANGED
@@ -29,14 +29,25 @@ class GeneralScraper
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
     page.links.each do |link|
-      if
+      if isResultLink?(link)
         siteURLSave(link)
-      elsif
+      elsif isSearchPageLink?(link)
         nextSearchPage(link)
       end
     end
   end
 
+  # Determines if url is link to search result
+  def isResultLink?(link)
+    return (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+  end
+
+  # Determines if URL is link to next search page
+  def isSearchPageLink?(link)
+    return (link.href.include? "&sa=N") && (link.href.include? "&start=")
+  end
+
+
   # Parse and save the URLs for search results
   def siteURLSave(link)
     site_url = link.href.split("?q=")[1]

@@ -53,6 +64,7 @@ class GeneralScraper
     end
   end
 
+
   # Gets all data and returns in JSON
   def getData
     search
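To see what the two new predicates separate, here is a standalone sketch that reproduces the same include? checks outside the class, with made-up values for @op_val and @operators (in the gem these come from the search configuration):

# classify_links.rb -- illustration only, URLs and values are invented
op_val    = "example.com"
operators = "site:example.com"

hrefs = [
  "/url?q=http://example.com/report.pdf&sa=U",                                # search result
  "/search?q=site:example.com&start=10&sa=N",                                 # next results page
  "http://webcache.googleusercontent.com/search?q=cache:example.com/report",  # cached copy
]

hrefs.each do |href|
  if href.include?(op_val) && !href.include?("webcache") && !href.include?(operators.gsub(" ", "+"))
    puts "#{href} -> result (siteURLSave)"
  elsif href.include?("&sa=N") && href.include?("&start=")
    puts "#{href} -> next search page (nextSearchPage)"
  else
    puts "#{href} -> ignored"
  end
end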
data/lib/parse_page.rb
CHANGED
@@ -4,26 +4,26 @@ module ParsePage
   # Get both page metadata and text
   def getPageData(url)
     begin
-
-      pagehash =
+      html = Nokogiri::HTML(getPage(url).body)
+      pagehash = getMetadata(url, html)
+      pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
     rescue
     end
   end
 
   # Get the page content by type of page
-  def getContent(url, pagehash)
+  def getContent(url, pagehash, html)
     if url.include? ".pdf"
       return getPDF(url, pagehash)
     else
-      return getHTMLText(url, pagehash)
+      return getHTMLText(url, pagehash, html)
     end
   end
 
   # Download the page text
-  def getHTMLText(url, pagehash)
-
-    pagehash[:text] = html.css("body").text.encode("UTF-8")
+  def getHTMLText(url, pagehash, html)
+    pagehash[:text] = fixEncode(html.css("body").text)
     return pagehash
   end
 

@@ -35,12 +35,12 @@ module ParsePage
     # OCR PDF and save fields
     u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
     pdfparse = JSON.parse(u.handleDoc)
-    pdfparse.each{|k, v| pagehash[k] = v
+    pdfparse.each{|k, v| pagehash[k] = fixEncode(v)}
     return pagehash
   end
 
   # Get the page metadata
-  def getMetadata(url)
+  def getMetadata(url, html)
     pagehash = Hash.new
 
     # Save URL and date retreived

@@ -50,14 +50,21 @@ module ParsePage
     pagehash[:date_retrieved] = Time.now
 
     # Get title and meta tag info
-
-    pagehash[:title] = html.css("title").text.encode("UTF-8")
+    pagehash[:title] = fixEncode(html.css("title").text)
     html.css("meta").each do |m|
       if m
-        pagehash[m['name']] = m['content']
+        pagehash[m['name']] = fixEncode(m['content'])
       end
     end
 
     return pagehash
   end
+
+  def fixEncode(str)
+    if str.is_a?(String)
+      return str.unpack('C*').pack('U*')
+    else
+      return str
+    end
+  end
 end
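The new fixEncode helper is worth a note: unpack('C*') reads the string as raw byte values and pack('U*') re-encodes each byte value as a UTF-8 codepoint, which effectively treats the input as Latin-1 and transcodes it to UTF-8. A small standalone illustration (the sample string is invented):

# "café" with é as the single Latin-1 byte 0xE9
latin1 = "caf\xE9".force_encoding("ASCII-8BIT")

utf8 = latin1.unpack('C*').pack('U*')
puts utf8           # => café
puts utf8.encoding  # => UTF-8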
data/lib/proxy_manager.rb
CHANGED
@@ -1,35 +1,53 @@
 require 'active_support/time'
 require 'mechanize'
+require 'uri'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil)
+  def getPage(url, form_input = nil, fail_count = 0)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
-      a.set_proxy(getRandomProxy
+      a.set_proxy(*getRandomProxy(url))
     end
 
-
-
-
-
-
-
+    # Slightly different based on filling in form or not
+    begin
+      if form_input
+        gform = agent.get(url).form("f")
+        gform.q = form_input
+        return agent.submit(gform, gform.buttons.first)
+      else
+        return agent.get(url)
+      end
+    rescue # Only retry request 5 times
+      getPage(url, form_input, fail_count+=1) if fail_count < 5
     end
   end
 
   # Choose a random proxy
-  def getRandomProxy
+  def getRandomProxy(url)
     max = @proxylist.length
     chosen = @proxylist[Random.rand(max)]
 
-    # Only use proxy if it hasn't been used in last 20 seconds
-    if
-      @usedproxies[chosen] = Time.now
-      return chosen
+    # Only use proxy if it hasn't been used in last 20 seconds on same host
+    if isNotUsed?(chosen, url)
+      @usedproxies[chosen] = [Time.now, URI.parse(url).host]
+      return parseProxy(chosen)
     else
-      sleep(0.
-      getRandomProxy
+      sleep(0.005)
+      getRandomProxy(url)
     end
   end
+
+  # Splits up proxy into IP, port, user, password
+  def parseProxy(chosen)
+    proxy_info = chosen.split(":")
+    proxy_info[proxy_info.length-1] = proxy_info.last.strip
+    return proxy_info
+  end
+
+  # Checks if a proxy has been used on domain in the last 20 seconds
+  def isNotUsed?(chosen, url)
+    return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
+  end
 end
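For context, a quick sketch of the proxy-list entry format that parseProxy and the splatted set_proxy call appear to assume; the sample entry below is invented:

# Each @proxylist entry is assumed to look like "host:port" or
# "host:port:user:password", possibly with a trailing newline.
line = "203.0.113.10:8080:scraper:secret\n"

proxy_info = line.split(":")
proxy_info[proxy_info.length - 1] = proxy_info.last.strip
p proxy_info  # => ["203.0.113.10", "8080", "scraper", "secret"]

# These four values line up with Mechanize's set_proxy(addr, port, user, pass)
# arguments, which is presumably why getPage can splat the result into
# agent.set_proxy(*getRandomProxy(url)).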
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-
+date: 2014-04-10 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com