generalscraper 0.0.3 → 0.0.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 84330f1a3a5c18dd9b2d8b200b141b5ba7c85827
-  data.tar.gz: 5fe4f81df62c28962565340858984e3d6b6f8a86
+  metadata.gz: 48ee021e7ac6bb45a00308d69003bd6ba379b20b
+  data.tar.gz: d3b631127266dbfaacaee4eb74c2868e48a1f0c8
 SHA512:
-  metadata.gz: 58b032cb6a3f33f4e5f1d972147952906ca95be1fbbbf181afad95336cc4a24b7585733305e8eefe9bce38da77cb989510a41020b9d418d93b68c7ef2fc8c1e1
-  data.tar.gz: 401093d88ca984e4a3fdc5f39f3ec35be9441fdff15e23fd434d623203ec22a9687f98d5fcfdd021c90662eb791f8ec34e57d45d59d08b5364b626a1961d6865
+  metadata.gz: b15d9ce46f5223be79fca5ba74423c0eab88c03dc3ed1e40baef500d30ab9f15c1f364bfb23244ea1dc741edcd91281b779b4ff1170341f0c534859aa174ff94
+  data.tar.gz: 149dadfabb77b586164c4213fd58bca33a5de5d0c64af48c04db6f4e47eaf3c5c1563ceaeedd7e9a97c813e7e5b95cc45a671734b8a5d2b78212db0d30d700ed
data/lib/generalscraper.rb CHANGED
@@ -1,90 +1,71 @@
-require 'mechanize'
 require 'json'
 require 'nokogiri'
-require 'open-uri'
-require 'uploadconvert'
+require 'mechanize'
+
+load 'parse_page.rb'
+load 'proxy_manager.rb'
 
 class GeneralScraper
-  def initialize(scrapesite, input, table)
-    @input = input
-    @scrapesite = scrapesite
-    @output = Array.new
-    @startindex = 10
-    @table = table
+  include ParsePage
+  include ProxyManager
+
+  def initialize(operators, searchterm, proxylist)
+    @operators = operators
+    @searchterm = searchterm
+    @op_val = @operators.split(" ")[0].split(":")[1]
+    @proxylist = IO.readlines(proxylist)
+    @usedproxies = Hash.new
+
+    @output = Array.new
+    @urllist = Array.new
+    @startindex = 10
   end
 
   # Searches for links on Google
   def search
-    agent = Mechanize.new
-    agent.user_agent_alias = 'Linux Firefox'
-    gform = agent.get("http://google.com").form("f")
-    gform.q = "site:" + @scrapesite + " " + @input
-    page = agent.submit(gform, gform.buttons.first)
-    examine(page)
+    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
   end
-
-  # Examines a search page
-  def examine(page)
-    page.links.each do |link|
-      if (link.href.include? @scrapesite) && (!link.href.include? "webcache") && (!link.href.include? "site:"+@scrapesite)
-        saveurl = link.href.split("?q=")
-
-        if saveurl[1]
-          url = saveurl[1].split("&")
-          getPage(url[0])
-        end
-      end
-
-      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-        url1 = link.href.split("&start=")
-        url2 = url1[1].split("&sa=N")
 
-        if url2[0].to_i == @startindex
-          sleep(rand(30..90))
-          @startindex += 10
-          agent = Mechanize.new
-          examine(agent.get("http://google.com" + link.href))
-        end
+  # Categorizes the links on results page into results and other search pages
+  def categorizeLinks(page)
+    page.links.each do |link|
+      if (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+        siteURLSave(link)
+      elsif (link.href.include? "&sa=N") && (link.href.include? "&start=")
+        nextSearchPage(link)
       end
     end
   end
 
-  # Scrape the page content
-  def getPage(url)
-    pagehash = Hash.new
-    begin
-      url.gsub!("%3F", "?")
-      url.gsub!("%3D", "=")
-      pagehash[:url] = url
-      pagehash[:date_retrieved] = Time.now
-      html = Nokogiri::HTML(open(url))
-      pagehash[:title] = html.css("title").text
-      html.css("meta").each do |m|
-        if m
-          pagehash[m['name']] = m['content']
-        end
-      end
-      if @table == false
-        if url.include? ".pdf"
-          `wget -P public/uploads #{url}`
-          path = url.split("/")
-          u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
-          pdfparse = JSON.parse(u.handleDoc)
-          pdfparse.each{|k, v| pagehash[k] = v}
-        else
-          pagehash[:text] = html.css("body").text
-        end
-      end
-      @output.push(pagehash)
-    rescue
-
+  # Parse and save the URLs for search results
+  def siteURLSave(link)
+    site_url = link.href.split("?q=")[1]
+    @urllist.push(site_url.split("&")[0]) if site_url
+  end
+
+  # Process search links and go to next page
+  def nextSearchPage(link)
+    page_index_num = link.href.split("&start=")[1].split("&sa=N")[0]
+
+    if page_index_num.to_i == @startindex
+      @startindex += 10
+      categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
    end
  end
 
  # Gets all data and returns in JSON
  def getData
    search
+    @urllist.each do |url|
+      getPageData(url)
+    end
    return JSON.pretty_generate(@output)
  end
+
+  # Returns a list of search result URLs
+  def getURLs
+    search
+    return JSON.pretty_generate(@urllist)
+  end
 end
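As a usage note for the new API: the constructor now takes a Google search operator string, the search term, and the path to a file of proxies, replacing the old (scrapesite, input, table) arguments. A minimal sketch, in which the operator string, search term, and proxy file path are placeholder values, not anything shipped with the gem:

require 'generalscraper'

# "site:example.com", "transparency report", and "proxies.txt" are
# illustrative placeholders.
scraper = GeneralScraper.new("site:example.com", "transparency report", "proxies.txt")

# Full page data (metadata plus text) as pretty-printed JSON:
puts scraper.getData

# Or, to collect only the result URLs instead:
# puts scraper.getURLs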
 
data/lib/parse_page.rb ADDED
@@ -0,0 +1,63 @@
+require 'uploadconvert'
+
+module ParsePage
+  # Get both page metadata and text
+  def getPageData(url)
+    begin
+      pagehash = getMetadata(url)
+      pagehash = getContent(url, pagehash)
+      @output.push(pagehash)
+    rescue
+    end
+  end
+
+  # Get the page content by type of page
+  def getContent(url, pagehash)
+    if url.include? ".pdf"
+      return getPDF(url, pagehash)
+    else
+      return getHTMLText(url, pagehash)
+    end
+  end
+
+  # Download the page text
+  def getHTMLText(url, pagehash)
+    html = Nokogiri::HTML(getPage(url).body)
+    pagehash[:text] = html.css("body").text
+    return pagehash
+  end
+
+  # Download and extract text from PDF
+  def getPDF(url, pagehash)
+    `wget -P public/uploads #{url}`
+    path = url.split("/")
+
+    # OCR PDF and save fields
+    u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
+    pdfparse = JSON.parse(u.handleDoc)
+    pdfparse.each{|k, v| pagehash[k] = v}
+    return pagehash
+  end
+
+  # Get the page metadata
+  def getMetadata(url)
+    pagehash = Hash.new
+
+    # Save URL and date retrieved
+    url.gsub!("%3F", "?")
+    url.gsub!("%3D", "=")
+    pagehash[:url] = url
+    pagehash[:date_retrieved] = Time.now
+
+    # Get title and meta tag info
+    html = Nokogiri::HTML(getPage(url).body) # Eventually modify this
+    pagehash[:title] = html.css("title").text
+    html.css("meta").each do |m|
+      if m
+        pagehash[m['name']] = m['content']
+      end
+    end
+
+    return pagehash
+  end
+end
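ParsePage is written as a mixin: it expects the including class to supply a getPage(url) fetcher (provided by ProxyManager in this gem) and an @output array that getPageData appends page hashes to, and it needs the uploadconvert gem for the PDF path. A minimal sketch of a hypothetical host class, not part of the gem, that fetches directly with Mechanize instead of through proxies:

require 'json'
require 'mechanize'
require 'nokogiri'
load 'parse_page.rb' # assumes lib/ is on the load path

# Illustrative host class for ParsePage; names here are placeholders.
class PlainPageFetcher
  include ParsePage

  def initialize
    @output = Array.new   # getPageData pushes page hashes here
  end

  # Satisfies the getPage(url) contract that ParsePage relies on
  def getPage(url)
    Mechanize.new.get(url)
  end

  def results
    JSON.pretty_generate(@output)
  end
end

fetcher = PlainPageFetcher.new
fetcher.getPageData("http://example.com")   # placeholder URL
puts fetcher.results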
data/lib/proxy_manager.rb ADDED
@@ -0,0 +1,35 @@
+require 'active_support/time'
+require 'mechanize'
+
+module ProxyManager
+  # Get the page with a proxy
+  def getPage(url, form_input = nil)
+    agent = Mechanize.new do |a|
+      a.user_agent_alias = "Linux Firefox"
+      a.set_proxy(getRandomProxy, 80)
+    end
+
+    if form_input
+      gform = agent.get(url).form("f")
+      gform.q = form_input
+      return agent.submit(gform, gform.buttons.first)
+    else
+      return agent.get(url)
+    end
+  end
+
+  # Choose a random proxy
+  def getRandomProxy
+    max = @proxylist.length
+    chosen = @proxylist[Random.rand(max)]
+
+    # Only use proxy if it hasn't been used in last 20 seconds
+    if !@usedproxies[chosen] || @usedproxies[chosen] < Time.now-20
+      @usedproxies[chosen] = Time.now
+      return chosen
+    else
+      sleep(0.5)
+      getRandomProxy
+    end
+  end
+end
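ProxyManager draws a random entry from @proxylist (set_proxy is called with port 80, so the list is expected to contain bare hosts or IPs) and uses the @usedproxies timestamps to avoid reusing any proxy within 20 seconds. A minimal sketch of wiring it up outside GeneralScraper, with placeholder proxy hosts and file names:

require 'active_support/time'
require 'mechanize'
load 'proxy_manager.rb' # assumes lib/ is on the load path

# Illustrative wrapper; not part of the gem.
class ProxiedFetcher
  include ProxyManager

  def initialize(proxyfile)
    # Strip trailing newlines so Mechanize#set_proxy gets clean hostnames
    @proxylist = IO.readlines(proxyfile).map(&:strip)
    @usedproxies = Hash.new
  end
end

# proxies.txt (placeholder file): one proxy host or IP per line, e.g.
#   203.0.113.10
#   203.0.113.11
fetcher = ProxiedFetcher.new("proxies.txt")
page = fetcher.getPage("http://example.com")   # plain GET through a random proxy
puts page.title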
metadata CHANGED
@@ -1,22 +1,24 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-05-18 00:00:00.000000000 Z
+date: 2014-04-07 00:00:00.000000000 Z
 dependencies: []
-description: Scrapes all pages on a site you specify including terms you specify.
+description: Scrapes Google
 email: shidash@shidash.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
 - lib/generalscraper.rb
+- lib/parse_page.rb
+- lib/proxy_manager.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper
 licenses:
 - GPL
@@ -27,19 +29,18 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.14
+rubygems_version: 2.4.6
 signing_key:
 specification_version: 4
-summary: Get all pages on a site for terms specified
+summary: Scrapes Google
 test_files: []
-has_rdoc: