google_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ .DS_Store
2
+ results.html
3
+ pkg
4
+ html
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in google_scraper.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 John Tajima
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,30 @@
1
+ # GoogleScraper
2
+
3
+ Scrapes first 10 pages of Google.ca search results and saves url,text into output.csv file.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'google_scraper'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install google_scraper
18
+
19
+ ## Usage
20
+
21
+ $ google_scraper "search term site:somesite.com"
22
+
23
+
24
+ ## Contributing
25
+
26
+ 1. Fork it
27
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
28
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
29
+ 4. Push to the branch (`git push origin my-new-feature`)
30
+ 5. Create new Pull Request
@@ -0,0 +1,19 @@
1
+ = google_scraper - Scrapes first 10 pages of google with given query
2
+
3
+ Author:: John Tajima (johntajima@gmail.com)
4
+ Copyright:: Copyright (c) 2012 John Tajima
5
+
6
+
7
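+ Scrapes the first 10 pages of Google.ca search results for a given query and saves each result's url and text to output.csv.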
8
+
9
+ == Links
10
+
11
+ * {Source on Github}[LINK TO GITHUB]
12
+ * RDoc[LINK TO RDOC.INFO]
13
+
14
+ == Install
15
+
16
+ == Examples
17
+
18
+ == Contributing
19
+
@@ -0,0 +1,61 @@
1
# Diagnostic helper: prints every entry of $LOAD_PATH, then reports which
# entries contain rspec, rspec/core and rspec/core/rake_task.
#
# Prints a summary line naming the last load-path entry in which
# rspec/core/rake_task was found, or a "didn't find" message otherwise.
# Returns nil (the result of the final puts).
def dump_load_path
  puts $LOAD_PATH.join("\n")
  found = nil
  $LOAD_PATH.each do |path|
    next unless File.exist?(File.join(path, "rspec"))

    puts "Found rspec in #{path}"
    unless File.exist?(File.join(path, "rspec", "core"))
      puts "!!! no core"
      next
    end

    puts "Found core"
    rake_task = File.join(path, "rspec", "core", "rake_task")
    # The requirable file on disk is rake_task.rb, so accept either spelling.
    if File.exist?(rake_task) || File.exist?("#{rake_task}.rb")
      puts "Found rake_task"
      found = path
    else
      puts "!! no rake_task"
    end
  end

  if found.nil?
    puts "Didn't find rspec/core/rake_task anywhere"
  else
    # `path` is only defined inside the block above; report the saved match.
    puts "Found in #{found}"
  end
end
26
+ require 'bundler'
27
+ require 'rake/clean'
28
+
29
+ require 'rake/testtask'
30
+
31
+ require 'cucumber'
32
+ require 'cucumber/rake/task'
33
+ gem 'rdoc' # we need the installed RDoc gem, not the system one
34
+ require 'rdoc/task'
35
+
36
# Make Rake's DSL methods available at the top level of this Rakefile.
include Rake::DSL

# Register the gem tasks provided by Bundler's GemHelper.
Bundler::GemHelper.install_tasks

# Unit tests: every file matching test/tc_*.rb.
Rake::TestTask.new { |test| test.pattern = 'test/tc_*.rb' }

# Cucumber features; the HTML report is added to the CLEAN file list.
CUKE_RESULTS = 'results.html'
CLEAN << CUKE_RESULTS
Cucumber::Rake::Task.new(:features) do |cuke|
  cuke.cucumber_opts = "features --format html -o #{CUKE_RESULTS} --format pretty --no-source -x"
  cuke.fork = false
end

# RDoc generation over the README, the library and the executables.
Rake::RDocTask.new do |rdoc|
  rdoc.main = "README.rdoc"
  rdoc.rdoc_files.include("README.rdoc", "lib/**/*.rb", "bin/**/*")
end

# Running plain `rake` runs the unit tests and then the features.
task :default => [:test, :features]
61
+
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'methadone'
5
+ require 'google_scraper'
6
+ require 'nokogiri'
7
+ require 'open-uri'
8
+
9
+ class App
10
+ include Methadone::Main
11
+ include Methadone::CLILogging
12
+
13
+ main do |query| # Add args you want: |like,so|
14
+
15
+ GoogleScraper.run(query)
16
+ end
17
+
18
+ # supplemental methods here
19
+
20
+ # Declare command-line interface here
21
+
22
+ # description "one line description of your app"
23
+ #
24
+ # Accept flags via:
25
+ # on("--flag VAL","Some flag")
26
+ # options[flag] will contain VAL
27
+ #
28
+ # Specify switches via:
29
+ # on("--[no-]switch","Some switch")
30
+ #
31
+ # Or, just call OptionParser methods on opts
32
+ #
33
+ # Require an argument
34
+ arg :query, :required
35
+ #
36
+ # # Make an argument optional
37
+ # arg :optional_arg, :optional
38
+
39
+ version GoogleScraper::VERSION
40
+
41
+ use_log_level_option
42
+
43
+ go!
44
+ end
@@ -0,0 +1,13 @@
1
Feature: My bootstrapped app kinda works
  In order to get going on coding my awesome app
  I want to have aruba and cucumber setup
  So I don't have to do it myself

  Scenario: App just runs
    When I get help for "google_scraper"
    Then the exit status should be 0
    And the banner should be present
    And the banner should document that this app takes options
    And the following options should be documented:
      |--version|
    # The executable declares a required "query" argument, so the banner lists it.
    And the banner should document that this app's arguments are:
      |query|which is required|
@@ -0,0 +1 @@
1
+ # Put your step definitions here
@@ -0,0 +1,16 @@
1
+ require 'aruba/cucumber'
2
+ require 'methadone/cucumber'
3
+
4
# Put this project's bin/ directory at the front of PATH for the scenarios.
bin_dir = File.expand_path(File.dirname(__FILE__) + '/../../bin')
ENV['PATH'] = "#{bin_dir}#{File::PATH_SEPARATOR}#{ENV['PATH']}"

# The project's lib/ directory, expressed relative to this support file.
LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)), '..', '..', 'lib')

Before do
  # Using "announce" causes massive warnings on 1.9.2
  @puts = true
  # Remember the caller's RUBYLIB, then prepend the project's lib/ directory.
  @original_rubylib = ENV['RUBYLIB']
  ENV['RUBYLIB'] = "#{LIB_DIR}#{File::PATH_SEPARATOR}#{ENV['RUBYLIB']}"
end

After do
  # Restore whatever RUBYLIB was before the scenario ran.
  ENV['RUBYLIB'] = @original_rubylib
end
@@ -0,0 +1,25 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for google_scraper.

# Make lib/ loadable so the version constant can be read while building.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'google_scraper/version'

Gem::Specification.new do |gem|
  gem.name          = "google_scraper"
  gem.version       = GoogleScraper::VERSION
  gem.authors       = ["John Tajima"]
  gem.email         = ["johntajima@gmail.com"]
  gem.description   = %q{Scrapes first 10 pages of google with given query}
  gem.summary       = %q{Scrapes first 10 pages of google with given query}
  gem.homepage      = ""
  # The project ships an MIT LICENSE.txt; declare it so the packaged
  # metadata no longer reports an empty license list.
  gem.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency('rdoc')
  gem.add_development_dependency('aruba')
  gem.add_development_dependency('rake', '~> 0.9.2')
  gem.add_dependency('methadone', '~> 1.2.2')
  gem.add_dependency('nokogiri')
end
@@ -0,0 +1,43 @@
1
require "google_scraper/version"
require 'nokogiri'
require 'csv'
require 'open-uri'
require 'uri'
4
+
5
# Scrapes Google.ca search result pages for a query and writes the result
# links (url, text) to a CSV file.
module GoogleScraper
  # Search endpoint that is scraped.
  SEARCH_URL = 'https://www.google.ca/search'.freeze
  # Step applied to the "start" parameter for each successive page.
  RESULTS_PER_PAGE = 10
  # Number of result pages fetched when options[:pages] is not given.
  DEFAULT_PAGES = 10
  # CSV path written when options[:output] is not given.
  DEFAULT_OUTPUT = 'output.csv'.freeze

  # Runs the scrape and writes the CSV.
  #
  # query   - the search string, e.g. "search term site:somesite.com".
  # options - optional Hash:
  #           :pages  - number of result pages to fetch (default 10)
  #           :output - path of the CSV file to write (default "output.csv")
  #
  # Returns the Array of { :url => ..., :text => ... } Hashes that were found.
  # Errors raised while fetching a page propagate to the caller.
  def self.run(query, options = {})
    puts "Query is #{query}"
    pages  = options.fetch(:pages, DEFAULT_PAGES)
    output = options.fetch(:output, DEFAULT_OUTPUT)
    data   = []

    pages.times do |page|
      puts "scraping page #{page + 1} with query #{query}"
      data.concat(extract_results(fetch_page(search_url(query, page))))
    end

    write_csv(output, data)
    data
  end

  # Builds the search URL for a zero-based result page. The query is
  # form-encoded so that characters such as "&", "=" and ":" cannot
  # corrupt the query string.
  def self.search_url(query, page = 0)
    "#{SEARCH_URL}?q=#{URI.encode_www_form_component(query)}&start=#{page * RESULTS_PER_PAGE}"
  end

  # Extracts the target URL from a result link of the form
  # "/url?q=<escaped target>&sa=...".
  #
  # Returns nil when href is nil or does not start with "/url". The target
  # is percent-decoded; if it contains a malformed escape it is returned
  # undecoded instead of raising.
  def self.result_url(href)
    return nil unless href && href.start_with?('/url')

    raw = href.sub(%r{\A/url\?q=}, '').sub(/&sa=.*$/, '')
    begin
      URI.decode_www_form_component(raw)
    rescue ArgumentError
      raw
    end
  end

  # Downloads url and parses it with Nokogiri. The block form closes the
  # handle once parsing is done.
  def self.fetch_page(url)
    parse = lambda { |io| Nokogiri::HTML(io) }
    # URI.open is the supported entry point on newer Rubies; fall back to
    # open-uri's Kernel#open where URI.open does not exist.
    URI.respond_to?(:open) ? URI.open(url, &parse) : open(url, &parse)
  end

  # Collects one { :url, :text } Hash per <h3> heading whose first anchor
  # is a result link. Headings without an anchor, and anchors that are not
  # result links, are skipped.
  def self.extract_results(doc)
    headings = doc.search('h3')
    puts "Found #{headings.size} possible results"

    rows = headings.map do |heading|
      anchor = heading.search('a').first
      url = anchor && result_url(anchor['href'])
      url ? { :url => url, :text => anchor.content } : nil
    end
    rows.compact
  end

  # Writes a header row followed by one row per result. A row that cannot
  # be written is reported and skipped so the remaining rows still land.
  def self.write_csv(path, rows)
    CSV.open(path, 'w') do |csv|
      csv << ["url", "text"]
      rows.each do |row|
        begin
          csv << [row[:url], row[:text]]
        rescue StandardError => e
          puts "ugh. #{e} - skipping"
        end
      end
    end
  end

  private_class_method :fetch_page, :extract_results, :write_csv
end
@@ -0,0 +1,3 @@
1
module GoogleScraper
  # Release version of the google_scraper gem. Frozen so the shared
  # constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,56 @@
1
+ url,text
2
+ http://www.shopify.com/blog/6700752-9-elements-you-need-to-conversion-test,What is Conversion Testing? 9 Website Elements to A/B Test.
3
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
4
+ http://wiki.shopify.com/Troubleshooting_PayPal_Website_Payments_Pro,PayPal - Shopify Wiki
5
+ http://ecommerce.shopify.com/c/shopify-discussion/t/testing-migration-from-magento-54355,testing migration from magento - Ecommerce Forums - Shopify
6
+ http://ecommerce.shopify.com/c/ecommerce-marketing/t/a-b-testing-55230,A/B Testing - Ecommerce Forums - Shopify
7
+ http://ecommerce.shopify.com/c/ecommerce-marketing/t/a-b-testing-44736,A/B Testing - Ecommerce Forums - Shopify
8
+ http://ecommerce.shopify.com/c/shopify-discussion/t/testing-105919,Testing - Shopify Discussion - Shopify Ecommerce Discussion Forums
9
+ http://ecommerce.shopify.com/ecommerce/bogus,bogus gateway - Ecommerce Forums - Shopify
10
+ http://static.shopify.com/s/files/1/0079/8612/t/1/assets/PS_Stainless_Steel_Cleaner_8oz.pdf,Stainless Steel - Shopify
11
+ http://static.shopify.com/s/files/1/0079/8612/t/1/assets/PS_Stainless_Steel_Cleaner.pdf,Stainless Steel - Shopify
12
+ http://static.shopify.com/s/files/1/0079/8612/t/1/assets/PS_Glass_Cleaner.pdf,nuveraproducts.com - Shopify
13
+ http://cdn.shopify.com/s/files/1/0088/4082/t/3/assets/Ludaire_Engineered_Flooring_Installation.pdf%3F1644,Engineered Flooring Installation Instructions
14
+ http://ecommerce.shopify.com/c/shopify-discussion/t/smart-collections-acting-dumb-30558,Smart collections acting dumb? - Ecommerce Forums - Shopify
15
+ http://cdn.shopify.com/s/files/1/0017/7152/files/MCN-GB-review-sm.pdf%3F1257882558,Next .. -
16
+ http://cdn.shopify.com/s/files/1/0027/5622/files/Larry_McConkey_comments.pdf%3F1287650375,"June 15, 2005 Excerpt from Steadicam Forum posting I did not want ..."
17
+ http://www.shopify.com/technology/4906352-salmagundi-for-thursday-december-15-2011,"Salmagundi for Thursday, December 15, 2011 — Ecommerce Blog ..."
18
+ http://cdn.shopify.com/s/files/1/0153/9461/files/WHO_IPA_hand_sanitiser_formulations.pdf%3F2206,WHO-recommended Handrub Formulations
19
+ http://static.shopify.com/s/files/1/0077/9972/t/1/assets/Ausblu_Breez_A4.pdf,How do Breez Air Purif iers work? - Shopify
20
+ http://ecommerce.shopify.com/c/ecommerce-design/t/back-button-woes-35677,Back Button Woes - Ecommerce Forums - Shopify
21
+ http://ecommerce.shopify.com/c/ecommerce-design/t/turn-contents-of-an-article-into-a-liquid-array-35152,turn contents of an article into a liquid array..? - Shopify Design ...
22
+ http://static0.shopify.com/s/files/1/0011/2372/files/Toxicity_Testing_FDA-registered_lab_9_06_2007_R1.pdf,Neuon - Shopify
23
+ http://cdn.shopify.com/s/files/1/0061/6332/files/IMDG_Code_35_Changes.pdf%3F1291827878,E
24
+ http://ecommerce.shopify.com/c/ecommerce-design/t/theme-change-9199,Theme Change - Ecommerce Forums - Shopify
25
+ http://cdn.shopify.com/s/files/1/0038/9582/files/ATSAM2195UserGuide.pdf%3F1,ATSAM2195 User Guide.pdf
26
+ http://ecommerce.shopify.com/c/shopify-discussion/t/webhooks-17070,Webhooks - Ecommerce Forums - Shopify
27
+ http://static3.shopify.com/s/files/1/0028/5492/files/Lead_Test.pdf,Report No. : GR:TX:7410062889 DATE : 27/07/2007 - Shopify
28
+ http://ecommerce.shopify.com/c/ecommerce-gallery/t/what-do-you-hate-about-our-store-34897,What do you HATE about our store?! - Ecommerce Forums - Shopify
29
+ http://cdn.shopify.com/s/files/1/0062/7112/files/SocialDesignVolumeOneDigitalEdition.pdf%3F100303,Social Design in Museums
30
+ http://www.shopify.com/technology%3Fpage%3D8,8 - Shopify
31
+ http://wiki.shopify.com/Asset_url,Asset url - Shopify Wiki
32
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
33
+ http://cdn.shopify.com/s/files/1/0062/7112/files/SocialDesignVolumeOneDigitalEdition.pdf%3F100303,Social Design in Museums
34
+ http://www.shopify.com/technology%3Fpage%3D8,8 - Shopify
35
+ http://wiki.shopify.com/Asset_url,Asset url - Shopify Wiki
36
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
37
+ http://cdn.shopify.com/s/files/1/0062/7112/files/SocialDesignVolumeOneDigitalEdition.pdf%3F100303,Social Design in Museums
38
+ http://www.shopify.com/technology%3Fpage%3D8,8 - Shopify
39
+ http://wiki.shopify.com/Asset_url,Asset url - Shopify Wiki
40
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
41
+ http://cdn.shopify.com/s/files/1/0062/7112/files/SocialDesignVolumeOneDigitalEdition.pdf%3F100303,Social Design in Museums
42
+ http://www.shopify.com/technology%3Fpage%3D8,8 - Shopify
43
+ http://wiki.shopify.com/Asset_url,Asset url - Shopify Wiki
44
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
45
+ http://cdn.shopify.com/s/files/1/0062/7112/files/SocialDesignVolumeOneDigitalEdition.pdf%3F100303,Social Design in Museums
46
+ http://www.shopify.com/technology%3Fpage%3D8,8 - Shopify
47
+ http://wiki.shopify.com/Asset_url,Asset url - Shopify Wiki
48
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
49
+ http://cdn.shopify.com/s/files/1/0062/7112/files/SocialDesignVolumeOneDigitalEdition.pdf%3F100303,Social Design in Museums
50
+ http://www.shopify.com/technology%3Fpage%3D8,8 - Shopify
51
+ http://wiki.shopify.com/Asset_url,Asset url - Shopify Wiki
52
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
53
+ http://cdn.shopify.com/s/files/1/0062/7112/files/SocialDesignVolumeOneDigitalEdition.pdf%3F100303,Social Design in Museums
54
+ http://www.shopify.com/technology%3Fpage%3D8,8 - Shopify
55
+ http://wiki.shopify.com/Asset_url,Asset url - Shopify Wiki
56
+ http://support.shopify.com/customer/portal/articles/75397-how-can-i-test-orders-without-being-charged-by-shopify-,Shopify Support: How can I test orders without being char...
@@ -0,0 +1,7 @@
1
+ require 'test/unit'
2
+
3
# Placeholder test case; its single test always passes.
class TestSomething < Test::Unit::TestCase
  # Asserts a literal true.
  def test_truth
    assert(true)
  end
end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - John Tajima
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-22 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rdoc
16
+ requirement: &70194036751580 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70194036751580
25
+ - !ruby/object:Gem::Dependency
26
+ name: aruba
27
+ requirement: &70194036751140 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70194036751140
36
+ - !ruby/object:Gem::Dependency
37
+ name: rake
38
+ requirement: &70194036750600 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 0.9.2
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70194036750600
47
+ - !ruby/object:Gem::Dependency
48
+ name: methadone
49
+ requirement: &70194036750060 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.2
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70194036750060
58
+ - !ruby/object:Gem::Dependency
59
+ name: nokogiri
60
+ requirement: &70194036749660 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *70194036749660
69
+ description: Scrapes first 10 pages of google with given query
70
+ email:
71
+ - johntajima@gmail.com
72
+ executables:
73
+ - google_scraper
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - .gitignore
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - README.rdoc
82
+ - Rakefile
83
+ - bin/google_scraper
84
+ - features/google_scraper.feature
85
+ - features/step_definitions/google_scraper_steps.rb
86
+ - features/support/env.rb
87
+ - google_scraper.gemspec
88
+ - lib/google_scraper.rb
89
+ - lib/google_scraper/version.rb
90
+ - output.csv
91
+ - test/tc_something.rb
92
+ homepage: ''
93
+ licenses: []
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ requirements: []
111
+ rubyforge_project:
112
+ rubygems_version: 1.8.11
113
+ signing_key:
114
+ specification_version: 3
115
+ summary: Scrapes first 10 pages of google with given query
116
+ test_files:
117
+ - features/google_scraper.feature
118
+ - features/step_definitions/google_scraper_steps.rb
119
+ - features/support/env.rb
120
+ - test/tc_something.rb