gscraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENSE.txt +23 -0
- data/Manifest.txt +17 -0
- data/README.txt +46 -0
- data/Rakefile +17 -0
- data/lib/gscraper.rb +2 -0
- data/lib/gscraper/extensions.rb +1 -0
- data/lib/gscraper/extensions/uri.rb +1 -0
- data/lib/gscraper/extensions/uri/http.rb +71 -0
- data/lib/gscraper/gscraper.rb +62 -0
- data/lib/gscraper/licenses.rb +56 -0
- data/lib/gscraper/search.rb +1 -0
- data/lib/gscraper/search/query.rb +394 -0
- data/lib/gscraper/search/result.rb +37 -0
- data/lib/gscraper/search/search.rb +33 -0
- data/test/search/query_from_url.rb +50 -0
- data/test/test_gscraper.rb +4 -0
- metadata +74 -0
data/History.txt
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
The MIT License
|
4
|
+
|
5
|
+
Copyright (c) 2007 Hal Brodigan
|
6
|
+
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
8
|
+
of this software and associated documentation files (the "Software"), to deal
|
9
|
+
in the Software without restriction, including without limitation the rights
|
10
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11
|
+
copies of the Software, and to permit persons to whom the Software is
|
12
|
+
furnished to do so, subject to the following conditions:
|
13
|
+
|
14
|
+
The above copyright notice and this permission notice shall be included in
|
15
|
+
all copies or substantial portions of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
20
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
22
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
23
|
+
THE SOFTWARE.
|
data/Manifest.txt
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
History.txt
|
2
|
+
LICENSE.txt
|
3
|
+
Manifest.txt
|
4
|
+
README.txt
|
5
|
+
Rakefile
|
6
|
+
lib/gscraper.rb
|
7
|
+
lib/gscraper/gscraper.rb
|
8
|
+
lib/gscraper/extensions/uri/http.rb
|
9
|
+
lib/gscraper/extensions/uri.rb
|
10
|
+
lib/gscraper/extensions.rb
|
11
|
+
lib/gscraper/licenses.rb
|
12
|
+
lib/gscraper/search/result.rb
|
13
|
+
lib/gscraper/search/query.rb
|
14
|
+
lib/gscraper/search/search.rb
|
15
|
+
lib/gscraper/search.rb
|
16
|
+
test/test_gscraper.rb
|
17
|
+
test/search/query_from_url.rb
|
data/README.txt
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
GScraper
|
2
|
+
by Postmodern Modulus III
|
3
|
+
http://rubyforge.net/projects/gscraper/
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
GScraper is a web-scraping interface to various Google Services.
|
8
|
+
|
9
|
+
== FEATURES/PROBLEMS:
|
10
|
+
|
11
|
+
* Supports the Google Search service.
|
12
|
+
* Provides HTTP access with custom User-Agent strings.
|
13
|
+
|
14
|
+
== REQUIREMENTS:
|
15
|
+
|
16
|
+
* Hpricot
|
17
|
+
* Mechanize
|
18
|
+
|
19
|
+
== INSTALL:
|
20
|
+
|
21
|
+
sudo gem install gscraper
|
22
|
+
|
23
|
+
== LICENSE:
|
24
|
+
|
25
|
+
The MIT License
|
26
|
+
|
27
|
+
Copyright (c) 2007 Hal Brodigan
|
28
|
+
|
29
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
30
|
+
a copy of this software and associated documentation files (the
|
31
|
+
'Software'), to deal in the Software without restriction, including
|
32
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
33
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
34
|
+
permit persons to whom the Software is furnished to do so, subject to
|
35
|
+
the following conditions:
|
36
|
+
|
37
|
+
The above copyright notice and this permission notice shall be
|
38
|
+
included in all copies or substantial portions of the Software.
|
39
|
+
|
40
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
41
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
42
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
43
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
44
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
45
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
46
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- ruby -*-

require 'rubygems'
require 'hoe'
require './lib/gscraper/version.rb'

# Hoe-based build/release configuration for the gscraper gem.
# The long description, homepage and changelog are pulled out of the
# README.txt / History.txt documents rather than duplicated here.
Hoe.new('gscraper', GScraper::VERSION) do |project|
  project.rubyforge_name = 'gscraper'
  project.author        = 'Postmodern Modulus III'
  project.email         = 'postmodern.mod3@gmail.com'
  project.summary       = 'A ruby web-scraping interface to various Google Services'
  project.description   = project.paragraphs_of('README.txt', 2..5).join("\n\n")
  project.url           = project.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
  project.changes       = project.paragraphs_of('History.txt', 0..1).join("\n\n")
end

# vim: syntax=Ruby
|
data/lib/gscraper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'gscraper/extensions/uri'
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'gscraper/extensions/uri/http'
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module URI
|
2
|
+
class HTTP
|
3
|
+
|
4
|
+
# Query parameters
|
5
|
+
attr_reader :query_params
|
6
|
+
|
7
|
+
#
|
8
|
+
# Creates a new URI::HTTP object and initializes query_params as a
|
9
|
+
# new Hash.
|
10
|
+
#
|
11
|
+
def initialize(*args)
|
12
|
+
super(*args)
|
13
|
+
|
14
|
+
@query_params = {}
|
15
|
+
parse_query_params
|
16
|
+
end
|
17
|
+
|
18
|
+
#
|
19
|
+
# Sets the query data and updates query_params.
|
20
|
+
#
|
21
|
+
def query=(query_str)
|
22
|
+
new_query = super(query_str)
|
23
|
+
parse_query_params
|
24
|
+
return new_query
|
25
|
+
end
|
26
|
+
|
27
|
+
protected
|
28
|
+
|
29
|
+
#
|
30
|
+
# Parses the query parameters from the query data, populating
|
31
|
+
# query_params with the parsed parameters.
|
32
|
+
#
|
33
|
+
def parse_query_params
|
34
|
+
@query_params.clear
|
35
|
+
|
36
|
+
if @query
|
37
|
+
@query.split('&').each do |param|
|
38
|
+
name, value = param.split('=')
|
39
|
+
|
40
|
+
if value
|
41
|
+
@query_params[name] = URI.decode(value)
|
42
|
+
else
|
43
|
+
@query_params[name] = nil
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# :nodoc
|
52
|
+
def path_query
|
53
|
+
str = @path
|
54
|
+
|
55
|
+
unless @query_params.empty?
|
56
|
+
str += '?' + @query_params.to_a.map { |name,value|
|
57
|
+
if value==true
|
58
|
+
"#{name}=active"
|
59
|
+
elsif value
|
60
|
+
"#{name}=#{URI.encode(value.to_s)}"
|
61
|
+
else
|
62
|
+
"#{name}="
|
63
|
+
end
|
64
|
+
}.join('&')
|
65
|
+
end
|
66
|
+
|
67
|
+
return str
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module GScraper
  #
  # Returns the global GScraper User-Agent string, or +nil+ when no
  # agent has been configured.
  #
  def self.user_agent
    @user_agent
  end

  #
  # Sets the global GScraper User-Agent string to _agent_.
  #
  def self.user_agent=(agent)
    @user_agent = agent
  end

  #
  # Opens the _uri_ and returns its contents. The User-Agent header is
  # chosen, in order of preference, from _opts_[:user_agent_alias]
  # (looked up in Mechanize's alias table), _opts_[:user_agent], or the
  # globally configured GScraper.user_agent.
  #
  #   GScraper.open('http://www.hackety.org/')
  #   GScraper.open('http://tenderlovemaking.com/',
  #                 :user_agent_alias => 'Linux Mozilla')
  #   GScraper.open('http://www.wired.com/', :user_agent => 'the future')
  #
  def self.open(uri, opts = {})
    headers = {}

    if opts[:user_agent_alias]
      headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[opts[:user_agent_alias]]
    elsif opts[:user_agent]
      headers['User-Agent'] = opts[:user_agent]
    elsif GScraper.user_agent
      headers['User-Agent'] = GScraper.user_agent
    end

    return Kernel.open(uri, headers)
  end

  #
  # Builds and returns a new Mechanize agent. The agent's User-Agent is
  # selected with the same precedence as GScraper.open.
  #
  #   GScraper.http_agent
  #   GScraper.http_agent(:user_agent_alias => 'Linux Mozilla')
  #   GScraper.http_agent(:user_agent => 'wooden pants')
  #
  def self.http_agent(opts = {})
    agent = WWW::Mechanize.new

    if opts[:user_agent_alias]
      agent.user_agent_alias = opts[:user_agent_alias]
    elsif opts[:user_agent]
      agent.user_agent = opts[:user_agent]
    elsif GScraper.user_agent
      agent.user_agent = GScraper.user_agent
    end

    return agent
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module GScraper
  #
  # License identifiers used by Google's advanced-search
  # "usage rights" (as_rights) filter.
  #
  module Licenses
    # Any license
    ANY = nil

    ALADDIN = :aladdin

    ARTISTIC = :artistic

    APACHE = :apache

    APPLE = :apple

    BSD = :bsd

    COMMON_PUBLIC = :cpl

    CC_BY = :cc_by

    CC_BY_SA = :cc_by_sa

    CC_BY_ND = :cc_by_nd

    # Fixed: was :cc_by_nc_sa, which collided with CC_BY_NC_SA and made
    # the two licenses indistinguishable in case/when dispatch.
    CC_BY_NC = :cc_by_nc

    # Fixed: was :cc_by_nd, which collided with CC_BY_ND.
    CC_BY_ND_SA = :cc_by_nd_sa

    CC_BY_NC_SA = :cc_by_nc_sa

    CC_BY_NC_ND = :cc_by_nc_nd

    GPL = :gpl

    LGPL = :lgpl

    HISTORICAL = :disclaimer

    IBM_PUBLIC = :ibm

    LUCENT_PUBLIC = :lucent

    MIT = :mit

    # Correctly spelled name for the Mozilla Public License.
    MOZILLA_PUBLIC = :mozilla

    # Misspelled legacy name, kept for backward compatibility.
    MOZILLA_PUBLI = MOZILLA_PUBLIC

    NASA_OSA = :nasa

    PYTHON = :python

    Q_PUBLIC = :qpl

    SLEEPYCAT = :sleepycat

    ZOPE_PUBLIC = :zope

  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'gscraper/search/search'
|
@@ -0,0 +1,394 @@
|
|
1
|
+
require 'gscraper/search/result'
|
2
|
+
require 'gscraper/extensions/uri'
|
3
|
+
require 'gscraper/licenses'
|
4
|
+
require 'gscraper/gscraper'
|
5
|
+
|
6
|
+
require 'hpricot'
|
7
|
+
|
8
|
+
module GScraper
  module Search
    class Query

      # Base URL of Google's search service.
      SEARCH_URL = 'http://www.google.com/search'

      # Default number of results per page.
      RESULTS_PER_PAGE = 10

      # Results per-page
      attr_accessor :results_per_page

      # Search query
      attr_accessor :query

      # Search for results containing the exact phrase
      attr_accessor :exact_phrase

      # Search for results with the words
      attr_accessor :with_words

      # Search for results with-out the words
      attr_accessor :without_words

      # Search for results written in the language
      attr_accessor :language

      # Search for results from the region
      attr_accessor :region

      # Search for results in the format
      attr_accessor :in_format

      # Search for results not in the format
      attr_accessor :not_in_format

      # Search for results within the past day
      attr_accessor :within_past_day

      # Search for results within the past week
      attr_accessor :within_past_week

      # Search for results within the past months
      attr_accessor :within_past_months

      # Search for results within the past year
      attr_accessor :within_past_year

      # Search for results containing numbers between the range
      attr_accessor :numeric_range

      # Search for results where the query occurs within the area.
      # (NOTE: the misspelled name is public API and kept as-is.)
      attr_accessor :occurrs_within

      # Search for results inside the domain
      attr_accessor :inside_domain

      # Search for results outside the domain
      attr_accessor :outside_domain

      # Search for results which have the rights (see Licenses)
      attr_accessor :rights

      # Filter the search results (SafeSearch)
      attr_accessor :filtered

      # Search for results similar to the page
      attr_accessor :similar_to

      # Search for results linking to the page
      attr_accessor :links_to

      #
      # Creates a new Query object from the given search options. If a
      # block is given, it will be passed the newly created query object.
      #
      #   Query.new(:query => 'ruby', :with_words => 'rspec rails')
      #
      #   Query.new(:exact_phrase => 'fluent interfaces') do |q|
      #     q.within_past_week = true
      #   end
      #
      def initialize(opts = {}, &block)
        super()

        @results_per_page = opts[:results_per_page] || RESULTS_PER_PAGE

        @query = opts[:query]
        @exact_phrase = opts[:exact_phrase]
        @with_words = opts[:with_words]
        @without_words = opts[:without_words]

        @language = opts[:language]
        @region = opts[:region]
        @in_format = opts[:in_format]
        @not_in_format = opts[:not_in_format]

        # the "within past ..." options are mutually exclusive;
        # the first one present wins
        if opts[:within_past_day]
          @within_past_day = opts[:within_past_day]
        elsif opts[:within_past_week]
          @within_past_week = opts[:within_past_week]
        elsif opts[:within_past_months]
          @within_past_months = opts[:within_past_months]
        elsif opts[:within_past_year]
          @within_past_year = opts[:within_past_year]
        end

        @numeric_range = opts[:numeric_range]
        @occurrs_within = opts[:occurrs_within]
        @inside_domain = opts[:inside_domain]
        @outside_domain = opts[:outside_domain]
        @rights = opts[:rights]
        @filtered = opts[:filtered]

        @similar_to = opts[:similar_to]
        @links_to = opts[:links_to]

        block.call(self) if block
      end

      #
      # Creates a new Query object from the specified URL. If a block is
      # given, it will be passed the newly created Query object.
      #
      #   Query.from_url('http://www.google.com/search?q=ruby+zen')
      #
      #   Query.from_url('http://www.google.com/search?q=ruby') do |q|
      #     q.within_past_week = true
      #     q.occurrs_within = :title
      #   end
      #
      def self.from_url(url, &block)
        url = URI.parse(url)
        opts = {}

        opts[:results_per_page] = url.query_params['num']

        opts[:query] = url.query_params['as_q']
        opts[:exact_phrase] = url.query_params['as_epq']
        opts[:with_words] = url.query_params['as_oq']
        opts[:without_words] = url.query_params['as_eq']

        opts[:language] = url.query_params['lr']
        opts[:region] = url.query_params['cr']

        case url.query_params['as_ft']
        when 'i'
          opts[:in_format] = url.query_params['as_filetype']
        when 'e'
          opts[:not_in_format] = url.query_params['as_filetype']
        end

        case url.query_params['as_qdr']
        when 'd'
          opts[:within_past_day] = true
        when 'w'
          opts[:within_past_week] = true
        when 'm'
          opts[:within_past_months] = 1
        when 'm2'
          opts[:within_past_months] = 2
        when 'm3'
          opts[:within_past_months] = 3
        when 'm6'
          opts[:within_past_months] = 6
        when 'y'
          opts[:within_past_year] = true
        end

        if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
          opts[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i, url.query_params['as_nhi'].to_i)
        end

        case url.query_params['as_occt']
        when 'title'
          opts[:occurrs_within] = :title
        when 'body'
          opts[:occurrs_within] = :body
        when 'url'
          opts[:occurrs_within] = :url
        when 'links'
          opts[:occurrs_within] = :links
        end

        case url.query_params['as_dt']
        when 'i'
          opts[:inside_domain] = url.query_params['as_sitesearch']
        when 'e'
          opts[:outside_domain] = url.query_params['as_sitesearch']
        end

        case url.query_params['as_rights']
        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
          opts[:rights] = Licenses::CC_BY_NC_ND
        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
          opts[:rights] = Licenses::CC_BY_SA
        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
          opts[:rights] = Licenses::CC_BY_ND
        when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
          opts[:rights] = Licenses::CC_BY
        end

        # query_params keys are Strings; the original symbol key (:safe)
        # could never match, so filtered URLs were silently ignored
        if url.query_params['safe'] == 'active'
          opts[:filtered] = true
        end

        if url.query_params['as_rq']
          opts[:similar_to] = url.query_params['as_rq']
        elsif url.query_params['as_lq']
          opts[:links_to] = url.query_params['as_lq']
        end

        return self.new(opts, &block)
      end

      #
      # Returns the URL (a URI::HTTP) that represents the query.
      #
      def search_url
        url = URI.parse(SEARCH_URL)

        if @results_per_page
          url.query_params['num'] = @results_per_page
        end

        url.query_params['as_q'] = @query if @query
        url.query_params['as_epq'] = @exact_phrase if @exact_phrase
        url.query_params['as_oq'] = @with_words if @with_words
        url.query_params['as_eq'] = @without_words if @without_words

        url.query_params['lr'] = @language if @language
        url.query_params['cr'] = @region if @region

        if @in_format
          url.query_params['as_ft'] = 'i'
          # fixed typo: was 'as_filtetype', which from_url could not
          # read back and Google does not recognize
          url.query_params['as_filetype'] = @in_format
        elsif @not_in_format
          url.query_params['as_ft'] = 'e'
          url.query_params['as_filetype'] = @not_in_format
        end

        if @within_past_day
          url.query_params['as_qdr'] = 'd'
        elsif @within_past_week
          url.query_params['as_qdr'] = 'w'
        elsif @within_past_months
          case @within_past_months
          when 1
            url.query_params['as_qdr'] = 'm'
          when 2
            url.query_params['as_qdr'] = 'm2'
          when 3
            url.query_params['as_qdr'] = 'm3'
          when 6
            url.query_params['as_qdr'] = 'm6'
          end
        elsif @within_past_year
          url.query_params['as_qdr'] = 'y'
        end

        if @numeric_range
          url.query_params['as_nlo'] = @numeric_range.begin
          url.query_params['as_nhi'] = @numeric_range.end
        end

        case @occurrs_within
        when :title, 'title'
          url.query_params['as_occt'] = 'title'
        when :body, 'body'
          url.query_params['as_occt'] = 'body'
        when :url, 'url'
          url.query_params['as_occt'] = 'url'
        when :links, 'links'
          url.query_params['as_occt'] = 'links'
        end

        if @inside_domain
          url.query_params['as_dt'] = 'i'
          url.query_params['as_sitesearch'] = @inside_domain
        elsif @outside_domain
          url.query_params['as_dt'] = 'e'
          url.query_params['as_sitesearch'] = @outside_domain
        end

        case @rights
        when Licenses::CC_BY_NC_ND
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
        when Licenses::CC_BY_SA
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
        when Licenses::CC_BY_ND
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
        when Licenses::CC_BY
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
        end

        url.query_params['safe'] = true if @filtered

        if @similar_to
          url.query_params['as_rq'] = @similar_to
        elsif @links_to
          url.query_params['as_lq'] = @links_to
        end

        return url
      end

      #
      # Returns the URL that represents the query at the specific
      # _page_index_.
      #
      def page_url(page_index)
        url = search_url

        url.query_params['start'] = page_index_offset(page_index)
        url.query_params['sa'] = 'N'

        return url
      end

      #
      # Returns an array of Result objects at the specified _page_index_.
      # If _opts_ are given, they will be used in accessing the SEARCH_URL.
      #
      def page(page_index, opts = {})
        results = []
        doc = Hpricot(GScraper.open(page_url(page_index), opts))

        doc.search('//div.g').each_with_index do |result, index|
          rank = page_index_offset(page_index) + (index + 1)
          title = result.search('//h2.r').first.inner_text
          url = result.search('//h2.r/a').first.get_attribute('href')
          # TODO: exclude URL and Links from summary text
          summary = result.search('//td.j').first.inner_text

          # TODO: scrape Cached and Similar links

          results << Result.new(rank, title, url, summary)
        end

        return results
      end

      #
      # Returns the results on the first page. If _opts_ are given, they
      # will be used in accessing the SEARCH_URL.
      #
      def first_page(opts = {})
        page(1, opts)
      end

      #
      # Iterates over the results at the specified _page_index_, passing
      # each to the given _block_. If _opts_ are given they will be used
      # in accessing the SEARCH_URL.
      #
      #   query.each_on_page(2) do |result|
      #     puts result.title
      #   end
      #
      def each_on_page(page_index, opts = {}, &block)
        page(page_index, opts).each(&block)
      end

      #
      # Iterates over the results on the first page, passing
      # each to the given _block_. If _opts_ are given, they will be used
      # in accessing the SEARCH_URL.
      #
      #   query.each_on_first_page do |result|
      #     puts result.url
      #   end
      #
      def each_on_first_page(opts = {}, &block)
        each_on_page(1, opts, &block)
      end

      protected

      #
      # Returns the rank offset for the specified _page_index_.
      #
      def page_index_offset(page_index)
        # fixed typo: was @result_per_page (always nil), so every page
        # was requested with start=0 and ranks never advanced
        (page_index.to_i - 1) * @results_per_page.to_i
      end

    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module GScraper
  module Search
    #
    # A single search result scraped from a results page.
    #
    class Result

      # Rank of the result page
      attr_reader :rank

      # Title of the result page
      attr_reader :title

      # URL of the result page
      attr_reader :url

      # Summary from the result page
      attr_reader :summary

      #
      # Creates a new Result object with the given _rank_, _title_,
      # _url_ and _summary_.
      #
      def initialize(rank, title, url, summary)
        @rank, @title, @url, @summary = rank, title, url, summary
      end

      #
      # Returns a string containing the result's title.
      #
      def to_s
        @title.to_s
      end

    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'gscraper/search/query'
|
2
|
+
|
3
|
+
module GScraper
  module Search
    #
    # Convenience constructor: returns a new Query built from _opts_.
    # See Query.new.
    #
    #   Search.query(:query => 'ruby', :with_words => 'rspec rails')
    #
    #   Search.query(:exact_phrase => 'fluent interfaces') do |q|
    #     q.within_past_week = true
    #   end
    #
    def self.query(opts = {}, &block)
      Query.new(opts, &block)
    end

    #
    # Convenience constructor: returns the Query represented by _url_.
    # See Query.from_url.
    #
    #   Search.query_from_url('http://www.google.com/search?q=ruby+zen')
    #
    #   Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
    #     q.within_past_week = true
    #     q.occurrs_within = :title
    #   end
    #
    def self.query_from_url(url, &block)
      Query.from_url(url, &block)
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gscraper/search/query'
|
3
|
+
|
4
|
+
# Exercises Search::Query.from_url against a representative
# advanced-search URL.
class QueryFromURL < Test::Unit::TestCase

  include GScraper

  QUERY_URL = 'http://www.google.com/search?as_q=test&hl=en&num=20&btnG=Google+Search&as_epq=what+if&as_oq=dog&as_eq=haha&lr=&cr=&as_ft=i&as_filetype=&as_qdr=w&as_nlo=&as_nhi=&as_occt=body&as_dt=i&as_sitesearch=&as_rights=&safe=images'

  def setup
    @query = Search::Query.from_url(QUERY_URL)
  end

  def teardown
    @query = nil
  end

  # NOTE: assert_equal takes (expected, actual); the original had the
  # arguments reversed, which produced misleading failure messages.

  def test_query
    assert_equal 'test', @query.query
  end

  def test_exact_phrase
    assert_equal 'what+if', @query.exact_phrase
  end

  def test_with_words
    assert_equal 'dog', @query.with_words
  end

  def test_without_words
    assert_equal 'haha', @query.without_words
  end

  def test_within_past_week
    assert_equal true, @query.within_past_week
  end

  def test_occurrs_within
    assert_equal :body, @query.occurrs_within
  end

  def test_similar_to
    assert_nil @query.similar_to
  end

  def test_links_to
    assert_nil @query.links_to
  end

end
|
metadata
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.4
|
3
|
+
specification_version: 1
|
4
|
+
name: gscraper
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2007-12-21 00:00:00 -08:00
|
8
|
+
summary: A ruby web-scraping interface to various Google Services
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: postmodern.mod3@gmail.com
|
12
|
+
homepage: http://rubyforge.net/projects/gscraper/
|
13
|
+
rubyforge_project: gscraper
|
14
|
+
description: "== FEATURES/PROBLEMS: * Supports the Google Search service. * Provides HTTP access with custom User-Agent strings. == REQUIREMENTS: * Hpricot * Mechanize == INSTALL:"
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Postmodern Modulus III
|
31
|
+
files:
|
32
|
+
- History.txt
|
33
|
+
- LICENSE.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
- Rakefile
|
37
|
+
- lib/gscraper.rb
|
38
|
+
- lib/gscraper/gscraper.rb
|
39
|
+
- lib/gscraper/extensions/uri/http.rb
|
40
|
+
- lib/gscraper/extensions/uri.rb
|
41
|
+
- lib/gscraper/extensions.rb
|
42
|
+
- lib/gscraper/licenses.rb
|
43
|
+
- lib/gscraper/search/result.rb
|
44
|
+
- lib/gscraper/search/query.rb
|
45
|
+
- lib/gscraper/search/search.rb
|
46
|
+
- lib/gscraper/search.rb
|
47
|
+
- test/test_gscraper.rb
|
48
|
+
- test/search/query_from_url.rb
|
49
|
+
test_files:
|
50
|
+
- test/test_gscraper.rb
|
51
|
+
rdoc_options:
|
52
|
+
- --main
|
53
|
+
- README.txt
|
54
|
+
extra_rdoc_files:
|
55
|
+
- History.txt
|
56
|
+
- LICENSE.txt
|
57
|
+
- Manifest.txt
|
58
|
+
- README.txt
|
59
|
+
executables: []
|
60
|
+
|
61
|
+
extensions: []
|
62
|
+
|
63
|
+
requirements: []
|
64
|
+
|
65
|
+
dependencies:
|
66
|
+
- !ruby/object:Gem::Dependency
|
67
|
+
name: hoe
|
68
|
+
version_requirement:
|
69
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: 1.3.0
|
74
|
+
version:
|