generalscraper 0.0.3 → 0.0.4
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +47 -66
- data/lib/parse_page.rb +63 -0
- data/lib/proxy_manager.rb +35 -0
- metadata +9 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48ee021e7ac6bb45a00308d69003bd6ba379b20b
+  data.tar.gz: d3b631127266dbfaacaee4eb74c2868e48a1f0c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b15d9ce46f5223be79fca5ba74423c0eab88c03dc3ed1e40baef500d30ab9f15c1f364bfb23244ea1dc741edcd91281b779b4ff1170341f0c534859aa174ff94
+  data.tar.gz: 149dadfabb77b586164c4213fd58bca33a5de5d0c64af48c04db6f4e47eaf3c5c1563ceaeedd7e9a97c813e7e5b95cc45a671734b8a5d2b78212db0d30d700ed
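The SHA1 and SHA512 values above refer to the metadata.gz and data.tar.gz entries packed inside the .gem archive, not to the .gem file itself. A minimal verification sketch, assuming the archive has already been fetched locally; the generalscraper-0.0.4.gem path is a placeholder:

require 'rubygems/package'
require 'digest'

# Placeholder path, e.g. after running `gem fetch generalscraper -v 0.0.4`
File.open("generalscraper-0.0.4.gem", "rb") do |gem_file|
  Gem::Package::TarReader.new(gem_file).each do |entry|
    next unless ["metadata.gz", "data.tar.gz"].include?(entry.full_name)
    body = entry.read
    # Compare these digests against the values listed in checksums.yaml
    puts "#{entry.full_name} SHA1:   #{Digest::SHA1.hexdigest(body)}"
    puts "#{entry.full_name} SHA512: #{Digest::SHA512.hexdigest(body)}"
  end
end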
data/lib/generalscraper.rb
CHANGED
@@ -1,90 +1,71 @@
-require 'mechanize'
 require 'json'
 require 'nokogiri'
-require '
-
+require 'mechanize'
+
+load 'parse_page.rb'
+load 'proxy_manager.rb'
 
 class GeneralScraper
-
-
-
-
-
-
+  include ParsePage
+  include ProxyManager
+
+  def initialize(operators, searchterm, proxylist)
+    @operators = operators
+    @searchterm = searchterm
+    @op_val = @operators.split(" ")[0].split(":")[1]
+    @proxylist = IO.readlines(proxylist)
+    @usedproxies = Hash.new
+
+    @output = Array.new
+    @urllist = Array.new
+    @startindex = 10
   end
 
   # Searches for links on Google
   def search
-
-    agent.user_agent_alias = 'Linux Firefox'
-    gform = agent.get("http://google.com").form("f")
-    gform.q = "site:" + @scrapesite + " " + @input
-    page = agent.submit(gform, gform.buttons.first)
-    examine(page)
+    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
   end
-
-  # Examines a search page
-  def examine(page)
-    page.links.each do |link|
-      if (link.href.include? @scrapesite) && (!link.href.include? "webcache") && (!link.href.include? "site:"+@scrapesite)
-        saveurl = link.href.split("?q=")
-
-        if saveurl[1]
-          url = saveurl[1].split("&")
-          getPage(url[0])
-        end
-      end
-
-      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-        url1 = link.href.split("&start=")
-        url2 = url1[1].split("&sa=N")
 
-
-
-
-
-
-
+  # Categorizes the links on results page into results and other search pages
+  def categorizeLinks(page)
+    page.links.each do |link|
+      if (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+        siteURLSave(link)
+      elsif (link.href.include? "&sa=N") && (link.href.include? "&start=")
+        nextSearchPage(link)
       end
     end
   end
 
-  #
-  def
-
-
-
-
-
-
-
-
-
-
-
-      end
-    end
-      if @table == false
-        if url.include? ".pdf"
-          `wget -P public/uploads #{url}`
-          path = url.split("/")
-          u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
-          pdfparse = JSON.parse(u.handleDoc)
-          pdfparse.each{|k, v| pagehash[k] = v}
-        else
-          pagehash[:text] = html.css("body").text
-        end
-      end
-      @output.push(pagehash)
-    rescue
-
+  # Parse and save the URLs for search results
+  def siteURLSave(link)
+    site_url = link.href.split("?q=")[1]
+    @urllist.push(site_url.split("&")[0]) if site_url
+  end
+
+  # Process search links and go to next page
+  def nextSearchPage(link)
+    page_index_num = link.href.split("&start=")[1].split("&sa=N")[0]
+
+    if page_index_num.to_i == @startindex
+      @startindex += 10
+      categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
    end
  end
 
   # Gets all data and returns in JSON
   def getData
     search
+    @urllist.each do |url|
+      getPageData(url)
+    end
     return JSON.pretty_generate(@output)
   end
+
+  # Returns a list of search result URLs
+  def getURLs
+    search
+    return JSON.pretty_generate(@urllist)
+  end
 end
 
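For context, the rewritten class now takes a Google operator string, a search term, and a path to a proxy list, and exposes getURLs and getData as its two entry points. A rough usage sketch under those assumptions; the operator string, search term, and proxies.txt path below are placeholders, not part of the gem:

require 'generalscraper'

# Placeholder arguments: a Google operator (its value becomes @op_val),
# a search term, and a file listing one proxy host per line
scraper = GeneralScraper.new("site:example.com", "transparency report", "proxies.txt")

# Collect just the result URLs found by paging through Google...
puts scraper.getURLs

# ...or, on a fresh instance, also fetch and parse every result page into JSON
puts GeneralScraper.new("site:example.com", "transparency report", "proxies.txt").getData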
data/lib/parse_page.rb
ADDED
@@ -0,0 +1,63 @@
+require 'uploadconvert'
+
+module ParsePage
+  # Get both page metadata and text
+  def getPageData(url)
+    begin
+      pagehash = getMetadata(url)
+      pagehash = getContent(url, pagehash)
+      @output.push(pagehash)
+    rescue
+    end
+  end
+
+  # Get the page content by type of page
+  def getContent(url, pagehash)
+    if url.include? ".pdf"
+      return getPDF(url, pagehash)
+    else
+      return getHTMLText(url, pagehash)
+    end
+  end
+
+  # Download the page text
+  def getHTMLText(url, pagehash)
+    html = Nokogiri::HTML(getPage(url).body)
+    pagehash[:text] = html.css("body").text
+    return pagehash
+  end
+
+  # Download and extract text from PDF
+  def getPDF(url, pagehash)
+    `wget -P public/uploads #{url}`
+    path = url.split("/")
+
+    # OCR PDF and save fields
+    u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
+    pdfparse = JSON.parse(u.handleDoc)
+    pdfparse.each{|k, v| pagehash[k] = v}
+    return pagehash
+  end
+
+  # Get the page metadata
+  def getMetadata(url)
+    pagehash = Hash.new
+
+    # Save URL and date retrieved
+    url.gsub!("%3F", "?")
+    url.gsub!("%3D", "=")
+    pagehash[:url] = url
+    pagehash[:date_retrieved] = Time.now
+
+    # Get title and meta tag info
+    html = Nokogiri::HTML(getPage(url).body) # Eventually modify this
+    pagehash[:title] = html.css("title").text
+    html.css("meta").each do |m|
+      if m
+        pagehash[m['name']] = m['content']
+      end
+    end
+
+    return pagehash
+  end
+end
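ParsePage is a mixin: it expects @output and getPage from the including class (GeneralScraper provides both). Roughly, each hash that getPageData pushes onto @output for an HTML result looks like the sketch below; the URL, title, and text values are illustrative only, and the string keys come from whatever meta tags the page declares:

{
  :url            => "http://example.com/report",        # placeholder result URL
  :date_retrieved => Time.now,
  :title          => "Example report",                   # <title> text
  "description"   => "value of the description meta tag",
  :text           => "visible <body> text extracted with Nokogiri"
}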
data/lib/proxy_manager.rb
ADDED
@@ -0,0 +1,35 @@
+require 'active_support/time'
+require 'mechanize'
+
+module ProxyManager
+  # Get the page with a proxy
+  def getPage(url, form_input = nil)
+    agent = Mechanize.new do |a|
+      a.user_agent_alias = "Linux Firefox"
+      a.set_proxy(getRandomProxy, 80)
+    end
+
+    if form_input
+      gform = agent.get(url).form("f")
+      gform.q = form_input
+      return agent.submit(gform, gform.buttons.first)
+    else
+      return agent.get(url)
+    end
+  end
+
+  # Choose a random proxy
+  def getRandomProxy
+    max = @proxylist.length
+    chosen = @proxylist[Random.rand(max)]
+
+    # Only use proxy if it hasn't been used in last 20 seconds
+    if !@usedproxies[chosen] || @usedproxies[chosen] < Time.now-20
+      @usedproxies[chosen] = Time.now
+      return chosen
+    else
+      sleep(0.5)
+      getRandomProxy
+    end
+  end
+end
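ProxyManager likewise assumes state from its host object: @proxylist (read with IO.readlines, one host per line) and @usedproxies, and getRandomProxy refuses to reuse a proxy touched within the last 20 seconds. A small standalone harness, purely illustrative; the ProxyFetcher class, the proxies.txt file, and the example host are assumptions, and every proxy is contacted on port 80 as hard-coded in getPage:

require 'generalscraper'  # loads parse_page.rb and proxy_manager.rb

# Hypothetical host object supplying the instance variables ProxyManager expects
class ProxyFetcher
  include ProxyManager

  def initialize(proxylist_path)
    @proxylist = IO.readlines(proxylist_path)  # e.g. "203.0.113.5\n203.0.113.6\n"
    @usedproxies = Hash.new
  end
end

page = ProxyFetcher.new("proxies.txt").getPage("http://example.com")
puts page.title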
metadata
CHANGED
@@ -1,22 +1,24 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-04-07 00:00:00.000000000 Z
 dependencies: []
-description: Scrapes
+description: Scrapes Google
 email: shidash@shidash.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
 - lib/generalscraper.rb
+- lib/parse_page.rb
+- lib/proxy_manager.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper
 licenses:
 - GPL
@@ -27,19 +29,18 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.4.6
 signing_key:
 specification_version: 4
-summary:
+summary: Scrapes Google
 test_files: []
-has_rdoc:
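The metadata above corresponds to a gemspec roughly like the following; the actual .gemspec source is not part of this diff, so treat this as an approximation reconstructed only from the fields shown:

# Hypothetical generalscraper.gemspec matching the 0.0.4 metadata
Gem::Specification.new do |s|
  s.name        = 'generalscraper'
  s.version     = '0.0.4'
  s.date        = '2014-04-07'
  s.summary     = 'Scrapes Google'
  s.description = 'Scrapes Google'
  s.authors     = ['M. C. McGrath']
  s.email       = 'shidash@shidash.com'
  s.files       = ['lib/generalscraper.rb', 'lib/parse_page.rb', 'lib/proxy_manager.rb']
  s.homepage    = 'https://github.com/TransparencyToolkit/generalscraper'
  s.license     = 'GPL'
end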