generalscraper 0.0.3 → 0.0.4
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +47 -66
- data/lib/parse_page.rb +63 -0
- data/lib/proxy_manager.rb +35 -0
- metadata +9 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48ee021e7ac6bb45a00308d69003bd6ba379b20b
+  data.tar.gz: d3b631127266dbfaacaee4eb74c2868e48a1f0c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b15d9ce46f5223be79fca5ba74423c0eab88c03dc3ed1e40baef500d30ab9f15c1f364bfb23244ea1dc741edcd91281b779b4ff1170341f0c534859aa174ff94
+  data.tar.gz: 149dadfabb77b586164c4213fd58bca33a5de5d0c64af48c04db6f4e47eaf3c5c1563ceaeedd7e9a97c813e7e5b95cc45a671734b8a5d2b78212db0d30d700ed
data/lib/generalscraper.rb
CHANGED
@@ -1,90 +1,71 @@
-require 'mechanize'
 require 'json'
 require 'nokogiri'
-require '
-
+require 'mechanize'
+
+load 'parse_page.rb'
+load 'proxy_manager.rb'
 
 class GeneralScraper
-
-
-
-
-
-
+  include ParsePage
+  include ProxyManager
+
+  def initialize(operators, searchterm, proxylist)
+    @operators = operators
+    @searchterm = searchterm
+    @op_val = @operators.split(" ")[0].split(":")[1]
+    @proxylist = IO.readlines(proxylist)
+    @usedproxies = Hash.new
+
+    @output = Array.new
+    @urllist = Array.new
+    @startindex = 10
   end
 
   # Searches for links on Google
   def search
-
-    agent.user_agent_alias = 'Linux Firefox'
-    gform = agent.get("http://google.com").form("f")
-    gform.q = "site:" + @scrapesite + " " + @input
-    page = agent.submit(gform, gform.buttons.first)
-    examine(page)
+    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
   end
-
-  # Examines a search page
-  def examine(page)
-    page.links.each do |link|
-      if (link.href.include? @scrapesite) && (!link.href.include? "webcache") && (!link.href.include? "site:"+@scrapesite)
-        saveurl = link.href.split("?q=")
-
-        if saveurl[1]
-          url = saveurl[1].split("&")
-          getPage(url[0])
-        end
-      end
-
-      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-        url1 = link.href.split("&start=")
-        url2 = url1[1].split("&sa=N")
 
-
-
-
-
-
-
+  # Categorizes the links on results page into results and other search pages
+  def categorizeLinks(page)
+    page.links.each do |link|
+      if (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+        siteURLSave(link)
+      elsif (link.href.include? "&sa=N") && (link.href.include? "&start=")
+        nextSearchPage(link)
       end
     end
   end
 
-  #
-  def
-
-
-
-
-
-
-
-
-
-
-        end
-      end
-      if @table == false
-        if url.include? ".pdf"
-          `wget -P public/uploads #{url}`
-          path = url.split("/")
-          u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
-          pdfparse = JSON.parse(u.handleDoc)
-          pdfparse.each{|k, v| pagehash[k] = v}
-        else
-          pagehash[:text] = html.css("body").text
-        end
-      end
-      @output.push(pagehash)
-    rescue
-
+  # Parse and save the URLs for search results
+  def siteURLSave(link)
+    site_url = link.href.split("?q=")[1]
+    @urllist.push(site_url.split("&")[0]) if site_url
+  end
+
+  # Process search links and go to next page
+  def nextSearchPage(link)
+    page_index_num = link.href.split("&start=")[1].split("&sa=N")[0]
+
+    if page_index_num.to_i == @startindex
+      @startindex += 10
+      categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
     end
   end
 
   # Gets all data and returns in JSON
   def getData
     search
+    @urllist.each do |url|
+      getPageData(url)
+    end
     return JSON.pretty_generate(@output)
   end
+
+  # Returns a list of search result URLs
+  def getURLs
+    search
+    return JSON.pretty_generate(@urllist)
+  end
 end
 
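As context for the API change above: a GeneralScraper is now constructed from a search-operator string, a query term, and the path to a proxy list file, and exposes getData and getURLs. The sketch below is illustrative only; the operator string, query, and proxies.txt path are assumed values, not part of the release.

# Hypothetical usage of the 0.0.4 interface; all argument values are made up.
require 'generalscraper'

# "site:example.com" makes @op_val == "example.com"; proxies.txt lists one proxy host per line.
scraper = GeneralScraper.new("site:example.com", "transparency", "proxies.txt")

puts scraper.getURLs  # pretty-printed JSON array of collected result URLs
# scraper.getData would additionally fetch each result and return the page hashes as JSON.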
data/lib/parse_page.rb
ADDED
@@ -0,0 +1,63 @@
+require 'uploadconvert'
+
+module ParsePage
+  # Get both page metadata and text
+  def getPageData(url)
+    begin
+      pagehash = getMetadata(url)
+      pagehash = getContent(url, pagehash)
+      @output.push(pagehash)
+    rescue
+    end
+  end
+
+  # Get the page content by type of page
+  def getContent(url, pagehash)
+    if url.include? ".pdf"
+      return getPDF(url, pagehash)
+    else
+      return getHTMLText(url, pagehash)
+    end
+  end
+
+  # Download the page text
+  def getHTMLText(url, pagehash)
+    html = Nokogiri::HTML(getPage(url).body)
+    pagehash[:text] = html.css("body").text
+    return pagehash
+  end
+
+  # Download and extract text from PDF
+  def getPDF(url, pagehash)
+    `wget -P public/uploads #{url}`
+    path = url.split("/")
+
+    # OCR PDF and save fields
+    u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
+    pdfparse = JSON.parse(u.handleDoc)
+    pdfparse.each{|k, v| pagehash[k] = v}
+    return pagehash
+  end
+
+  # Get the page metadata
+  def getMetadata(url)
+    pagehash = Hash.new
+
+    # Save URL and date retreived
+    url.gsub!("%3F", "?")
+    url.gsub!("%3D", "=")
+    pagehash[:url] = url
+    pagehash[:date_retrieved] = Time.now
+
+    # Get title and meta tag info
+    html = Nokogiri::HTML(getPage(url).body) # Eventually modify this
+    pagehash[:title] = html.css("title").text
+    html.css("meta").each do |m|
+      if m
+        pagehash[m['name']] = m['content']
+      end
+    end
+
+    return pagehash
+  end
+end
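For reference, each hash that ParsePage#getPageData pushes onto @output combines the getMetadata fields (URL, retrieval time, title, and any meta name/content pairs) with either the Nokogiri-extracted body text or the fields parsed from a PDF by UploadConvert. A representative entry, with assumed values, might look like:

# Shape sketch only; concrete keys beyond :url, :date_retrieved, :title and :text
# depend on the page's meta tags (HTML) or on UploadConvert#handleDoc (PDF).
{
  url: "http://example.com/report",        # assumed URL
  date_retrieved: Time.now,
  title: "Example Report",
  "description" => "content of a <meta name=\"description\"> tag",
  text: "body text extracted via html.css(\"body\").text"
}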
data/lib/proxy_manager.rb
ADDED
@@ -0,0 +1,35 @@
+require 'active_support/time'
+require 'mechanize'
+
+module ProxyManager
+  # Get the page with a proxy
+  def getPage(url, form_input = nil)
+    agent = Mechanize.new do |a|
+      a.user_agent_alias = "Linux Firefox"
+      a.set_proxy(getRandomProxy, 80)
+    end
+
+    if form_input
+      gform = agent.get(url).form("f")
+      gform.q = form_input
+      return agent.submit(gform, gform.buttons.first)
+    else
+      return agent.get(url)
+    end
+  end
+
+  # Choose a random proxy
+  def getRandomProxy
+    max = @proxylist.length
+    chosen = @proxylist[Random.rand(max)]
+
+    # Only use proxy if it hasn't been used in last 20 seconds
+    if !@usedproxies[chosen] || @usedproxies[chosen] < Time.now-20
+      @usedproxies[chosen] = Time.now
+      return chosen
+    else
+      sleep(0.5)
+      getRandomProxy
+    end
+  end
+end
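ProxyManager is written as a mixin: getPage and getRandomProxy expect the including object to provide @proxylist and @usedproxies, which GeneralScraper#initialize sets up. A minimal sketch of reusing the module on its own, assuming a proxies.txt file with one proxy host per line (the ProxiedFetcher class and file name are hypothetical):

# Minimal sketch; ProxiedFetcher and proxies.txt are assumptions, not part of the gem.
require 'mechanize'
require 'active_support/time'
load 'proxy_manager.rb'

class ProxiedFetcher
  include ProxyManager

  def initialize(proxylist_path)
    # Same instance state GeneralScraper#initialize provides for the mixin.
    @proxylist = IO.readlines(proxylist_path)
    @usedproxies = Hash.new
  end
end

page = ProxiedFetcher.new("proxies.txt").getPage("http://example.com")
puts page.title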
metadata
CHANGED
@@ -1,22 +1,24 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.4
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-04-07 00:00:00.000000000 Z
 dependencies: []
-description: Scrapes
+description: Scrapes Google
 email: shidash@shidash.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
 - lib/generalscraper.rb
+- lib/parse_page.rb
+- lib/proxy_manager.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper
 licenses:
 - GPL
@@ -27,19 +29,18 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.4.6
 signing_key:
 specification_version: 4
-summary:
+summary: Scrapes Google
 test_files: []
-has_rdoc:
|