generalscraper 0.0.1

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/generalscraper.rb +78 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 1e568d1e9d1c0fa9c98f814128439bd009194c0d
+   data.tar.gz: f26d58116e50b61ad308eed4df0ce03e76cbfba8
+ SHA512:
+   metadata.gz: 92120914fbe80f8a0b5b3b5b996753b07a218745a88d7cd201fb533fa3de4e116499c1cc2d218d030771bf3d25dbf42743efc83ed7f1d2e4cb44939229ec31a6
+   data.tar.gz: b7d4070620f7a43ff7008ae414ea3aed53da287c697f32cb80f6afbd66186d94151a7393ac9547428f9ddeabf37e3500e7e55165ad11f7d2e1f994975e539f96
data/lib/generalscraper.rb ADDED
@@ -0,0 +1,78 @@
+ require 'mechanize'
+ require 'json'
+ require 'nokogiri'
+ require 'open-uri'
+ 
+ class GeneralScraper
+   def initialize(scrapesite, input)
+     @input = input
+     @scrapesite = scrapesite
+     @output = Array.new
+     @startindex = 10
+   end
+ 
+   # Runs a site-restricted Google search for the input terms
+   def search
+     agent = Mechanize.new
+     agent.user_agent_alias = 'Linux Firefox'
+     gform = agent.get("http://google.com").form("f")
+     gform.q = "site:" + @scrapesite + " " + @input
+     page = agent.submit(gform, gform.buttons.first)
+     examine(page)
+   end
+ 
+   # Examines a search results page: scrapes each result link on the
+   # target site, then follows the pagination link to the next page
+   def examine(page)
+     page.links.each do |link|
+       # Result links point at the target site; skip Google's cached
+       # copies and links that merely repeat the site: query
+       if (link.href.include? @scrapesite) && (!link.href.include? "webcache") && (!link.href.include? "site:" + @scrapesite)
+         saveurl = link.href.split("?q=")
+ 
+         if saveurl[1]
+           url = saveurl[1].split("&")
+           getPage(url[0])
+         end
+       end
+ 
+       # Pagination links carry &start= and &sa=N parameters; follow
+       # the one whose offset matches the next expected results page
+       if (link.href.include? "&sa=N") && (link.href.include? "&start=")
+         url1 = link.href.split("&start=")
+         url2 = url1[1].split("&sa=N")
+ 
+         if url2[0].to_i == @startindex
+           sleep(rand(30..90)) # wait between pages to avoid rate limiting
+           @startindex += 10
+           agent = Mechanize.new
+           examine(agent.get("http://google.com" + link.href))
+         end
+       end
+     end
+   end
+ 
+   # Scrapes the content and metadata of a single page
+   def getPage(url)
+     pagehash = Hash.new
+     begin
+       # Google percent-encodes these characters in result URLs
+       url.gsub!("%3F", "?")
+       url.gsub!("%3D", "=")
+       pagehash[:url] = url
+       pagehash[:date_retrieved] = Time.now
+       html = Nokogiri::HTML(open(url))
+       pagehash[:title] = html.css("title").text
+       html.css("meta").each do |m|
+         # Skip meta tags with no name attribute (e.g. charset)
+         pagehash[m['name']] = m['content'] if m['name']
+       end
+       pagehash[:page] = html.css("body").text
+       @output.push(pagehash)
+     rescue StandardError
+       puts "Failed to scrape URL: " + url
+     end
+   end
+ 
+   # Runs the search and returns all scraped pages as JSON
+   def getData
+     search
+     return JSON.pretty_generate(@output)
+   end
+ end
+ 
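
For context, a minimal usage sketch (the site and search terms here are hypothetical, not part of the gem):

require 'generalscraper'

# Search Google for pages on example.com matching the given terms,
# scrape each result, and print the collected pages as pretty JSON
scraper = GeneralScraper.new("example.com", "transparency report")
puts scraper.getData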
metadata ADDED
@@ -0,0 +1,45 @@
+ --- !ruby/object:Gem::Specification
+ name: generalscraper
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - M. C. McGrath
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-05-15 00:00:00.000000000 Z
+ dependencies: []
+ description: Scrapes all pages on a site you specify including terms you specify.
+ email: shidash@shidash.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/generalscraper.rb
+ homepage: https://github.com/TransparencyToolkit/generalscraper
+ licenses:
+ - GPL
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.0.14
+ signing_key:
+ specification_version: 4
+ summary: Get all pages on a site for terms specified
+ test_files: []
+ has_rdoc:
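
The metadata above is RubyGems' serialized Gem::Specification; a roughly equivalent .gemspec, reconstructed from those fields (not the gem's actual source file):

Gem::Specification.new do |s|
  s.name        = 'generalscraper'
  s.version     = '0.0.1'
  s.date        = '2014-05-15'
  s.summary     = 'Get all pages on a site for terms specified'
  s.description = 'Scrapes all pages on a site you specify including terms you specify.'
  s.authors     = ['M. C. McGrath']
  s.email       = 'shidash@shidash.com'
  s.files       = ['lib/generalscraper.rb']
  s.homepage    = 'https://github.com/TransparencyToolkit/generalscraper'
  s.license     = 'GPL'
end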