generalscraper 0.0.3 → 0.0.4

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 84330f1a3a5c18dd9b2d8b200b141b5ba7c85827
-  data.tar.gz: 5fe4f81df62c28962565340858984e3d6b6f8a86
+  metadata.gz: 48ee021e7ac6bb45a00308d69003bd6ba379b20b
+  data.tar.gz: d3b631127266dbfaacaee4eb74c2868e48a1f0c8
 SHA512:
-  metadata.gz: 58b032cb6a3f33f4e5f1d972147952906ca95be1fbbbf181afad95336cc4a24b7585733305e8eefe9bce38da77cb989510a41020b9d418d93b68c7ef2fc8c1e1
-  data.tar.gz: 401093d88ca984e4a3fdc5f39f3ec35be9441fdff15e23fd434d623203ec22a9687f98d5fcfdd021c90662eb791f8ec34e57d45d59d08b5364b626a1961d6865
+  metadata.gz: b15d9ce46f5223be79fca5ba74423c0eab88c03dc3ed1e40baef500d30ab9f15c1f364bfb23244ea1dc741edcd91281b779b4ff1170341f0c534859aa174ff94
+  data.tar.gz: 149dadfabb77b586164c4213fd58bca33a5de5d0c64af48c04db6f4e47eaf3c5c1563ceaeedd7e9a97c813e7e5b95cc45a671734b8a5d2b78212db0d30d700ed
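
The checksums above cover the metadata.gz and data.tar.gz entries packed inside the .gem archive, which is itself a tar file. A minimal sketch of recomputing the SHA512 values locally, assuming the 0.0.4 gem has been downloaded as generalscraper-0.0.4.gem in the current directory (the filename and location are illustrative):

  # Sketch: recompute the digests recorded in checksums.yaml.
  require 'digest'
  require 'rubygems/package'

  File.open('generalscraper-0.0.4.gem', 'rb') do |gem_file|
    Gem::Package::TarReader.new(gem_file) do |tar|
      tar.each do |entry|
        next unless ['metadata.gz', 'data.tar.gz'].include?(entry.full_name)
        puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
      end
    end
  end
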
data/lib/generalscraper.rb CHANGED
@@ -1,90 +1,71 @@
-require 'mechanize'
 require 'json'
 require 'nokogiri'
-require 'open-uri'
-require 'uploadconvert'
+require 'mechanize'
+
+load 'parse_page.rb'
+load 'proxy_manager.rb'
 
 class GeneralScraper
-  def initialize(scrapesite, input, table)
-    @input = input
-    @scrapesite = scrapesite
-    @output = Array.new
-    @startindex = 10
-    @table = table
+  include ParsePage
+  include ProxyManager
+
+  def initialize(operators, searchterm, proxylist)
+    @operators = operators
+    @searchterm = searchterm
+    @op_val = @operators.split(" ")[0].split(":")[1]
+    @proxylist = IO.readlines(proxylist)
+    @usedproxies = Hash.new
+
+    @output = Array.new
+    @urllist = Array.new
+    @startindex = 10
   end
 
   # Searches for links on Google
   def search
-    agent = Mechanize.new
-    agent.user_agent_alias = 'Linux Firefox'
-    gform = agent.get("http://google.com").form("f")
-    gform.q = "site:" + @scrapesite + " " + @input
-    page = agent.submit(gform, gform.buttons.first)
-    examine(page)
+    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
   end
-
-  # Examines a search page
-  def examine(page)
-    page.links.each do |link|
-      if (link.href.include? @scrapesite) && (!link.href.include? "webcache") && (!link.href.include? "site:"+@scrapesite)
-        saveurl = link.href.split("?q=")
-
-        if saveurl[1]
-          url = saveurl[1].split("&")
-          getPage(url[0])
-        end
-      end
-
-      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-        url1 = link.href.split("&start=")
-        url2 = url1[1].split("&sa=N")
 
-        if url2[0].to_i == @startindex
-          sleep(rand(30..90))
-          @startindex += 10
-          agent = Mechanize.new
-          examine(agent.get("http://google.com" + link.href))
-        end
+  # Categorizes the links on results page into results and other search pages
+  def categorizeLinks(page)
+    page.links.each do |link|
+      if (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+        siteURLSave(link)
+      elsif (link.href.include? "&sa=N") && (link.href.include? "&start=")
+        nextSearchPage(link)
       end
     end
   end
 
-  # Scrape the page content
-  def getPage(url)
-    pagehash = Hash.new
-    begin
-      url.gsub!("%3F", "?")
-      url.gsub!("%3D", "=")
-      pagehash[:url] = url
-      pagehash[:date_retrieved] = Time.now
-      html = Nokogiri::HTML(open(url))
-      pagehash[:title] = html.css("title").text
-      html.css("meta").each do |m|
-        if m
-          pagehash[m['name']] = m['content']
-        end
-      end
-      if @table == false
-        if url.include? ".pdf"
-          `wget -P public/uploads #{url}`
-          path = url.split("/")
-          u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
-          pdfparse = JSON.parse(u.handleDoc)
-          pdfparse.each{|k, v| pagehash[k] = v}
-        else
-          pagehash[:text] = html.css("body").text
-        end
-      end
-      @output.push(pagehash)
-    rescue
-
+  # Parse and save the URLs for search results
+  def siteURLSave(link)
+    site_url = link.href.split("?q=")[1]
+    @urllist.push(site_url.split("&")[0]) if site_url
+  end
+
+  # Process search links and go to next page
+  def nextSearchPage(link)
+    page_index_num = link.href.split("&start=")[1].split("&sa=N")[0]
+
+    if page_index_num.to_i == @startindex
+      @startindex += 10
+      categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
     end
   end
 
   # Gets all data and returns in JSON
   def getData
     search
+    @urllist.each do |url|
+      getPageData(url)
+    end
     return JSON.pretty_generate(@output)
   end
+
+  # Returns a list of search result URLs
+  def getURLs
+    search
+    return JSON.pretty_generate(@urllist)
+  end
 end
 
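The public API changes in this release: 0.0.3's GeneralScraper.new(scrapesite, input, table) becomes GeneralScraper.new(operators, searchterm, proxylist), where operators is a Google operator string (the first operator must have the form name:value, since @op_val is derived from it), searchterm is the query text, and proxylist is the path to a file of proxies read with IO.readlines. A minimal usage sketch against the new API; the operator string, search term, and proxy file name below are illustrative, not taken from the package:

  require 'generalscraper'

  # Hypothetical inputs: a site: operator, a query, and a local proxy list file.
  scraper = GeneralScraper.new("site:example.com", "transparency report", "proxies.txt")

  # Either entry point pages through Google results via the proxy pool;
  # getURLs returns the result URLs as JSON, while getData also fetches each
  # result page (via ParsePage#getPageData) and returns the page hashes as JSON.
  puts scraper.getURLs
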
data/lib/parse_page.rb ADDED
@@ -0,0 +1,63 @@
+require 'uploadconvert'
+
+module ParsePage
+  # Get both page metadata and text
+  def getPageData(url)
+    begin
+      pagehash = getMetadata(url)
+      pagehash = getContent(url, pagehash)
+      @output.push(pagehash)
+    rescue
+    end
+  end
+
+  # Get the page content by type of page
+  def getContent(url, pagehash)
+    if url.include? ".pdf"
+      return getPDF(url, pagehash)
+    else
+      return getHTMLText(url, pagehash)
+    end
+  end
+
+  # Download the page text
+  def getHTMLText(url, pagehash)
+    html = Nokogiri::HTML(getPage(url).body)
+    pagehash[:text] = html.css("body").text
+    return pagehash
+  end
+
+  # Download and extract text from PDF
+  def getPDF(url, pagehash)
+    `wget -P public/uploads #{url}`
+    path = url.split("/")
+
+    # OCR PDF and save fields
+    u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
+    pdfparse = JSON.parse(u.handleDoc)
+    pdfparse.each{|k, v| pagehash[k] = v}
+    return pagehash
+  end
+
+  # Get the page metadata
+  def getMetadata(url)
+    pagehash = Hash.new
+
+    # Save URL and date retreived
+    url.gsub!("%3F", "?")
+    url.gsub!("%3D", "=")
+    pagehash[:url] = url
+    pagehash[:date_retrieved] = Time.now
+
+    # Get title and meta tag info
+    html = Nokogiri::HTML(getPage(url).body) # Eventually modify this
+    pagehash[:title] = html.css("title").text
+    html.css("meta").each do |m|
+      if m
+        pagehash[m['name']] = m['content']
+      end
+    end
+
+    return pagehash
+  end
+end
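
ParsePage is mixed into GeneralScraper, so getPage, @output, and @urllist come from the including class, and the bare rescue in getPageData silently drops any page that fails to fetch or parse. Each page that does succeed is pushed onto @output as a hash; an illustrative sketch of its shape (the values and the meta-tag key are examples, not output captured from the package):

  # Roughly what getMetadata + getContent produce for one HTML page:
  {
    :url            => "http://example.com/report",
    :date_retrieved => Time.now,
    :title          => "Example page title",
    "description"   => "value of the page's <meta name=\"description\"> tag",
    :text           => "body text extracted with Nokogiri"
  }
  # For PDF URLs there is no :text key; instead the key/value pairs returned
  # by UploadConvert#handleDoc after OCR are merged into the hash.
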
data/lib/proxy_manager.rb ADDED
@@ -0,0 +1,35 @@
+require 'active_support/time'
+require 'mechanize'
+
+module ProxyManager
+  # Get the page with a proxy
+  def getPage(url, form_input = nil)
+    agent = Mechanize.new do |a|
+      a.user_agent_alias = "Linux Firefox"
+      a.set_proxy(getRandomProxy, 80)
+    end
+
+    if form_input
+      gform = agent.get(url).form("f")
+      gform.q = form_input
+      return agent.submit(gform, gform.buttons.first)
+    else
+      return agent.get(url)
+    end
+  end
+
+  # Choose a random proxy
+  def getRandomProxy
+    max = @proxylist.length
+    chosen = @proxylist[Random.rand(max)]
+
+    # Only use proxy if it hasn't been used in last 20 seconds
+    if !@usedproxies[chosen] || @usedproxies[chosen] < Time.now-20
+      @usedproxies[chosen] = Time.now
+      return chosen
+    else
+      sleep(0.5)
+      getRandomProxy
+    end
+  end
+end
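
ProxyManager likewise relies on state from the including class: @proxylist and @usedproxies are set in GeneralScraper#initialize, getRandomProxy refuses to reuse a proxy within 20 seconds of its last use, and getPage hands the chosen entry to Mechanize#set_proxy with the port hard-coded to 80. The proxy file is therefore presumably a plain list of proxy hosts, one per line, with no other content (IO.readlines keeps the trailing newline on each entry, so every line is taken as-is). A hypothetical proxies.txt, using documentation-range example addresses:

  198.51.100.10
  203.0.113.25
  192.0.2.77
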
metadata CHANGED
@@ -1,22 +1,24 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-05-18 00:00:00.000000000 Z
+date: 2014-04-07 00:00:00.000000000 Z
 dependencies: []
-description: Scrapes all pages on a site you specify including terms you specify.
+description: Scrapes Google
 email: shidash@shidash.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
 - lib/generalscraper.rb
+- lib/parse_page.rb
+- lib/proxy_manager.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper
 licenses:
 - GPL
@@ -27,19 +29,18 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.14
+rubygems_version: 2.4.6
 signing_key:
 specification_version: 4
-summary: Get all pages on a site for terms specified
+summary: Scrapes Google
 test_files: []
-has_rdoc: