generalscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/generalscraper.rb +78 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1e568d1e9d1c0fa9c98f814128439bd009194c0d
4
+ data.tar.gz: f26d58116e50b61ad308eed4df0ce03e76cbfba8
5
+ SHA512:
6
+ metadata.gz: 92120914fbe80f8a0b5b3b5b996753b07a218745a88d7cd201fb533fa3de4e116499c1cc2d218d030771bf3d25dbf42743efc83ed7f1d2e4cb44939229ec31a6
7
+ data.tar.gz: b7d4070620f7a43ff7008ae414ea3aed53da287c697f32cb80f6afbd66186d94151a7393ac9547428f9ddeabf37e3500e7e55165ad11f7d2e1f994975e539f96
@@ -0,0 +1,78 @@
1
+ require 'mechanize'
2
+ require 'json'
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+
6
# Scrapes pages from a single site for given search terms by issuing a
# "site:"-restricted Google query, walking the paginated results, and
# collecting each result page's title, meta tags, and body text.
class GeneralScraper
  # scrapesite - String domain to restrict the search to (e.g. "example.com")
  # input      - String search terms
  def initialize(scrapesite, input)
    @input = input
    @scrapesite = scrapesite
    @output = []      # accumulated page hashes, one per scraped URL
    @startindex = 10  # next Google result offset to follow (pagination step is 10)
  end

  # Searches for links on Google restricted to @scrapesite, then hands
  # the first results page to #examine.
  def search
    agent = Mechanize.new
    agent.user_agent_alias = 'Linux Firefox'
    gform = agent.get("http://google.com").form("f")
    gform.q = "site:" + @scrapesite + " " + @input
    page = agent.submit(gform, gform.buttons.first)
    examine(page)
  end

  # Examines one Google results page: scrapes every result link that
  # points into @scrapesite, and recursively follows the next-page
  # pagination link (rate-limited with a random sleep).
  def examine(page)
    page.links.each do |link|
      href = link.href
      next unless href # some Mechanize links (e.g. anchors) have no href

      # Result links are Google redirects; the real destination sits
      # between "?q=" and the following "&".
      if href.include?(@scrapesite) && !href.include?("webcache") &&
         !href.include?("site:" + @scrapesite)
        target = href.split("?q=")[1]
        getPage(target.split("&")[0]) if target
      end

      # Pagination links carry "&start=<offset>&sa=N"; follow only the
      # one whose offset equals the next expected page so each results
      # page is visited exactly once.
      if href.include?("&sa=N") && href.include?("&start=")
        offset = href.split("&start=")[1].split("&sa=N")[0]
        if offset.to_i == @startindex
          sleep(rand(30..90)) # throttle to reduce the chance of a Google block
          @startindex += 10
          examine(Mechanize.new.get("http://google.com" + href))
        end
      end
    end
  end

  # Scrapes one result page: records its URL, retrieval time, <title>,
  # every <meta> name/content pair, and the <body> text into @output.
  # Unfetchable pages are reported to stdout and skipped.
  def getPage(url)
    # gsub (not gsub!): work on a copy so the caller's string is not mutated.
    url = url.gsub("%3F", "?").gsub("%3D", "=")
    pagehash = {}
    pagehash[:url] = url
    pagehash[:date_retrieved] = Time.now
    # URI#open (via open-uri) instead of Kernel#open: Kernel#open on an
    # attacker-influenced string starting with "|" spawns a subprocess.
    html = Nokogiri::HTML(URI.parse(url).open)
    pagehash[:title] = html.css("title").text
    html.css("meta").each do |m|
      # Only keep meta tags that actually have a name attribute (the
      # original "if m" guard was always truthy and admitted a nil key).
      pagehash[m['name']] = m['content'] if m['name']
    end
    pagehash[:page] = html.css("body").text
    @output.push(pagehash)
  rescue StandardError => e
    # Report and skip bad pages instead of silently swallowing the error
    # or aborting the whole crawl.
    puts "URL: " + url
    puts "Error: " + e.message
  end

  # Runs the full search/scrape cycle and returns the collected pages as
  # pretty-printed JSON.
  def getData
    search
    JSON.pretty_generate(@output)
  end
end
78
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: generalscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-15 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Scrapes all pages on a site you specify including terms you specify.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/generalscraper.rb
20
+ homepage: https://github.com/TransparencyToolkit/generalscraper
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Get all pages on a site for terms specified
44
+ test_files: []
45
+ has_rdoc: