adwords_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in adwords_scraper.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 asahi
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,35 @@
1
+ # AdwordsScraper
2
+
3
+ This gem scrapes the Adwords ads from a Google search engine results page, maps
4
+ them by position and then parses the title, description, display url, redirect,
5
+ sitelinks (if any), boxed warning (for pharma ads and if applicable) as well as
6
+ reviews text (if any).
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ gem 'adwords_scraper'
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install adwords_scraper
21
+
22
+ ## Usage
23
+
24
+ Once installed, simply run the following (replace "green ipod" with your own
25
+ keyword text):
26
+
27
+ AdwordsScraper.start("green ipod")
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create new Pull Request
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,19 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for adwords_scraper.
require File.expand_path('../lib/adwords_scraper/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["asahi"]
  gem.email         = ["sodani@gmail.com"]
  gem.description   = %q{Scrapes Google Adwords ads using Mechanize}
  gem.summary       = %q{run AdwordsScraper.start and pass a keyword string.}
  gem.homepage      = ""

  # lib/adwords_scraper.rb does `require "mechanize"` at load time, so the
  # library is needed by every consumer of the gem, not only during
  # development. Declaring it as a runtime dependency makes
  # `gem install adwords_scraper` pull it in.
  gem.add_dependency "mechanize"

  # Package exactly the files tracked by git. `$\` (the output record
  # separator) is used as the split pattern, as in the Bundler gem template.
  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "adwords_scraper"
  gem.require_paths = ["lib"]
  gem.version       = AdwordsScraper::VERSION
end
@@ -0,0 +1,92 @@
1
require "adwords_scraper/version"
require "cgi"
require "mechanize"
3
+
4
# Scrapes the Adwords ads shown on a Google search results page and returns
# them keyed by their position on the page.
module AdwordsScraper
  # It's best to mimic a common browser or else Google may not display all ad
  # formats.
  USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'.freeze

  # Search URL prefix; the escaped keyword is appended as the `q` parameter.
  SEARCH_URL = 'http://www.google.com/search?gcx=w&sourceid=chrome&ie=UTF-8&q='.freeze

  # CSS selector of the ad nodes for each page location.
  AD_SELECTORS = {
    'top'    => '#tads .vsta',
    'right'  => '#mbEnd li', # .vsra (old)
    'bottom' => '#tadsb li'
  }.freeze

  # A selector matching this many nodes (or more) is treated as a false
  # positive and the whole location is ignored.
  MAX_ADS_PER_LOCATION = 10

  def self.test
    "inside test"
  end

  # Fetches the results page for +keyword+ and scrapes its ads.
  #
  # @param keyword [String] the search phrase
  # @return [Array<Array>] list of [position, ad_hash] pairs (see scrape_serp)
  def self.start(keyword)
    scrape_serp(fetch_serp(keyword))
  end

  # Downloads the results page for +keyword+ with Mechanize.
  #
  # @param keyword [String] the search phrase
  # @return the page object returned by Mechanize#get
  def self.fetch_serp(keyword)
    agent = Mechanize.new
    agent.user_agent = USER_AGENT
    agent.get(query_url(keyword))
  end

  # Builds the search URL for +keyword+.
  #
  # The keyword is form-encoded: spaces still become "+", and characters such
  # as "&", "#" or "%" are percent-escaped instead of leaking into the query
  # string and changing its meaning.
  #
  # @param keyword [String] the search phrase
  # @return [String] the URL to request
  def self.query_url(keyword)
    SEARCH_URL + CGI.escape(keyword)
  end

  # Collects the text ads found in +doc+.
  #
  # A location is only considered when its selector matches at least one node
  # containing an <h3> and fewer than MAX_ADS_PER_LOCATION nodes. Ads that
  # contain an image are skipped, and so are ads whose first link has no digit
  # in its id, because no position can be derived for them.
  #
  # @param doc [#search] parsed results page
  # @return [Array<Array>] pairs of ["<location>:<digit>", ad_hash]
  def self.scrape_serp(doc)
    ads = []

    AD_SELECTORS.each do |location, selector|
      candidates = doc.search(selector)
      next if candidates.search('h3').empty?
      next if candidates.size >= MAX_ADS_PER_LOCATION

      candidates.each do |ad_doc|
        next unless ad_doc.search('img').empty? # skipping ads that have an image

        slot = ad_position(ad_doc)
        next unless slot

        ads << ["#{location}:#{slot}", parse_ad(ad_doc)]
      end
    end

    ads
  end

  # Extracts the fields of a single ad node.
  #
  # @param doc [#search, #at_css] node of one ad
  # @return [Hash] with the string keys 'description', 'title', 'displayurl',
  #   'boxed_warning', 'review', 'redirect' (nil when the link carries no
  #   embedded URL) and, only when the ad has sitelinks, 'sitelinks'
  def self.parse_ad(doc)
    ad = {}
    ad['description']   = ad_description(doc)
    ad['title']         = doc.search('h3').text     # ad title text
    ad['displayurl']    = doc.search('cite').text   # display URL
    ad['boxed_warning'] = doc.search('.pwl').text   # boxed warning
    ad['review']        = doc.search('.f div').text # supplemental text in gray
    ad['redirect']      = ad_redirect(doc)

    sitelinks = doc.search('table a')
    ad['sitelinks'] = sitelinks.map { |link| link.text } unless sitelinks.empty?

    ad
  end

  # Returns the first digit of the id of the ad's first link, or nil when the
  # ad has no link, the link has no id, or the id holds no digit.
  def self.ad_position(ad_doc)
    anchor = ad_doc.search('a').first
    id = anchor && anchor['id']
    id && id[/\d/]
  end

  # Joins the children of the first `.ac` node into one line, turning <br>
  # elements into spaces. Returns '' when the ad has no `.ac` node.
  def self.ad_description(doc)
    body = doc.search('.ac').first
    return '' unless body

    text = body.children.map { |node| node.name == 'br' ? ' ' : node.text }.join
    # NOTE(review): the original replaced ' ' with ' ', which is a no-op as
    # written; collapsing whitespace runs (including non-breaking spaces) is
    # presumably what was intended - confirm.
    text.gsub(/[[:space:]]+/, ' ')
  end

  # Returns the unescaped last http(s) URL embedded in the href of the ad's
  # first link, or nil when there is no link, no href or no embedded URL.
  def self.ad_redirect(doc)
    anchor = doc.at_css('a')
    href = anchor && anchor['href']
    target = href && href[%r{.*(https?://\S+)}, 1]
    target && CGI.unescape(target) # unescape URL encoding
  end

  private_class_method :ad_position, :ad_description, :ad_redirect
end
@@ -0,0 +1,3 @@
1
module AdwordsScraper
  # Release version of the gem; read by adwords_scraper.gemspec.
  VERSION = "0.0.1".freeze
end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: adwords_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - asahi
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Scrapes Google Adwords ads using Mechanize
31
+ email:
32
+ - sodani@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - .gitignore
38
+ - Gemfile
39
+ - LICENSE
40
+ - README.md
41
+ - Rakefile
42
+ - adwords_scraper.gemspec
43
+ - lib/adwords_scraper.rb
44
+ - lib/adwords_scraper/version.rb
45
+ homepage: ''
46
+ licenses: []
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ requirements: []
64
+ rubyforge_project:
65
+ rubygems_version: 1.8.24
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: run AdwordsScraper.start and pass a keyword string.
69
+ test_files: []