gscraper 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING.txt +339 -0
- data/History.txt +21 -0
- data/Manifest.txt +23 -10
- data/README.txt +17 -21
- data/Rakefile +3 -6
- data/lib/gscraper.rb +22 -0
- data/lib/gscraper/extensions.rb +22 -0
- data/lib/gscraper/extensions/uri.rb +22 -0
- data/lib/gscraper/extensions/uri/http.rb +25 -71
- data/lib/gscraper/extensions/uri/query_params.rb +96 -0
- data/lib/gscraper/gscraper.rb +30 -0
- data/lib/gscraper/has_pages.rb +114 -0
- data/lib/gscraper/licenses.rb +22 -0
- data/lib/gscraper/page.rb +64 -0
- data/lib/gscraper/search.rb +24 -0
- data/lib/gscraper/search/ajax_query.rb +176 -0
- data/lib/gscraper/search/page.rb +27 -72
- data/lib/gscraper/search/query.rb +46 -457
- data/lib/gscraper/search/result.rb +32 -29
- data/lib/gscraper/search/search.rb +44 -3
- data/lib/gscraper/search/web_query.rb +472 -0
- data/lib/gscraper/sponsored_ad.rb +26 -2
- data/lib/gscraper/sponsored_links.rb +77 -8
- data/lib/gscraper/version.rb +23 -1
- data/spec/extensions/uri/http_spec.rb +9 -0
- data/spec/extensions/uri/query_params_spec.rb +38 -0
- data/spec/gscraper_spec.rb +29 -0
- data/spec/has_pages_examples.rb +19 -0
- data/spec/has_sponsored_links_examples.rb +57 -0
- data/spec/helpers/query.rb +1 -0
- data/spec/helpers/uri.rb +8 -0
- data/spec/page_has_results_examples.rb +13 -0
- data/spec/search/ajax_query_spec.rb +124 -0
- data/spec/search/page_has_results_examples.rb +51 -0
- data/spec/search/query_spec.rb +103 -0
- data/spec/search/web_query_spec.rb +74 -0
- data/spec/spec_helper.rb +6 -0
- data/tasks/spec.rb +7 -0
- metadata +34 -20
- data/LICENSE.txt +0 -23
- data/lib/gscraper/web_agent.rb +0 -38
- data/test/search/page_results.rb +0 -103
- data/test/search/query_from_url.rb +0 -50
- data/test/search/query_pages.rb +0 -32
- data/test/search/query_result.rb +0 -30
- data/test/test_gscraper.rb +0 -4
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
module GScraper
|
2
24
|
module Licenses
|
3
25
|
# Any desired license
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
module GScraper
  #
  # An Array of scraped elements that keeps its own class when filtered.
  #
  class Page < Array

    #
    # Creates a new Page object with the given _elements_. If a _block_
    # is given, it will be passed the newly created Page object.
    #
    #   Page.new([a, b]) { |page| page << c }
    #
    def initialize(elements=[],&block)
      super(elements)

      block.call(self) if block
    end

    #
    # Returns a mapped Array of the elements within the Page using the
    # given _block_. If the _block_ is not given, the page itself will be
    # returned.
    #
    #   page.map # => Page
    #
    #   page.map { |element| element.field } # => [...]
    #
    def map(&block)
      return self unless block

      # Collect into a plain Array: the mapped values are arbitrary
      # objects, not page elements.
      mapped = []

      each { |element| mapped << block.call(element) }
      return mapped
    end

    #
    # Selects the elements within the Page which match the given _block_,
    # returning them in a new object of the receiver's class. If the
    # _block_ is not given, an enumerator over the selection is returned
    # instead of attempting to build a Page from a missing result.
    #
    #   page.select { |element| element.field =~ /ruby/i } # => Page
    #
    #   page.select # => enumerator
    #
    def select(&block)
      # Without a block, Array#select has no filtered Array to hand back,
      # so wrapping its result in a new Page would fail.
      return enum_for(:select) unless block

      self.class.new(super(&block))
    end

  end
end
|
data/lib/gscraper/search.rb
CHANGED
@@ -1 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'gscraper/search/web_query'
|
24
|
+
require 'gscraper/search/ajax_query'
|
1
25
|
require 'gscraper/search/search'
|
@@ -0,0 +1,176 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'gscraper/search/result'
|
24
|
+
require 'gscraper/search/page'
|
25
|
+
require 'gscraper/search/query'
|
26
|
+
require 'gscraper/extensions/uri'
|
27
|
+
require 'gscraper/has_pages'
|
28
|
+
require 'gscraper/gscraper'
|
29
|
+
|
30
|
+
require 'json'
|
31
|
+
|
32
|
+
module GScraper
  module Search
    #
    # A Query which is performed against the Google AJAX search API and
    # whose results are read from the JSON payload of the response.
    #
    class AJAXQuery < Query

      include HasPages

      # Maximum results per-page
      RESULTS_PER_PAGE = 8

      # AJAX API host
      API_HOST = 'www.google.com'

      # AJAX API URL
      API_URL = "http://#{API_HOST}/uds/GwebSearch?callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large"

      # Default language
      DEFAULT_LANGUAGE = 'en'

      # Default signature
      DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'

      # Default key
      DEFAULT_KEY = 'notsupplied'

      # Default version
      DEFAULT_VERSION = '1.0'

      # The search language
      attr_accessor :language

      # The search signature
      attr_accessor :sig

      # The search key
      attr_accessor :key

      # The API version
      attr_accessor :version

      #
      # Creates a new AJAXQuery with the given _options_. If a _block_ is
      # given it will be passed the newly created AJAXQuery object.
      #
      # _options_ may contain the following keys:
      # <tt>:language</tt>:: The search language. Defaults to
      #                      +DEFAULT_LANGUAGE+ (<tt>'en'</tt>).
      # <tt>:sig</tt>:: The search signature. Defaults to +DEFAULT_SIG+
      #                 (<tt>'582c1116317355adf613a6a843f19ece'</tt>).
      # <tt>:key</tt>:: The search key. Defaults to +DEFAULT_KEY+
      #                 (<tt>'notsupplied'</tt>).
      # <tt>:version</tt>:: The desired API version. Defaults to
      #                     +DEFAULT_VERSION+ (<tt>'1.0'</tt>).
      #
      # The same _options_ are also passed to GScraper.web_agent and to
      # the Query initializer.
      #
      def initialize(options={},&block)
        @agent = GScraper.web_agent(options)

        @language = (options[:language] || DEFAULT_LANGUAGE)

        @sig = (options[:sig] || DEFAULT_SIG)
        @key = (options[:key] || DEFAULT_KEY)
        @version = (options[:version] || DEFAULT_VERSION)

        super(options,&block)
      end

      #
      # Creates a new AJAXQuery object from the specified URL. If a block is
      # given, it will be passed the newly created AJAXQuery object.
      #
      # The +hl+, +q+, +sig+, +key+ and +v+ query parameters of the _url_
      # take precedence over the matching entries of _options_. Parameters
      # that are absent from the _url_ leave the given _options_ untouched,
      # and the caller's _options_ Hash is never modified.
      #
      def self.from_url(url,options={},&block)
        # NOTE(review): assumes query_params (from gscraper/extensions/uri)
        # yields nil for parameters missing from the URL -- confirm.
        params = URI(url.to_s).query_params

        url_options = {
          :language => params['hl'],
          :query => params['q'],
          :sig => params['sig'],
          :key => params['key'],
          :version => params['v']
        }.reject { |name,value| value.nil? }

        # merge builds a new Hash, so the caller's options are not mutated
        # and are only overridden by values actually found in the URL.
        return self.new(options.merge(url_options),&block)
      end

      #
      # Returns +RESULTS_PER_PAGE+.
      #
      def results_per_page
        RESULTS_PER_PAGE
      end

      #
      # Returns the URL that represents the query.
      #
      def search_url
        url = URI(API_URL)

        url.query_params['hl'] = @language
        url.query_params['gss'] = '.com'
        url.query_params['q'] = expression
        url.query_params['sig'] = @sig
        url.query_params['key'] = @key
        url.query_params['v'] = @version

        return url
      end

      #
      # Returns the URL that represents the query of a specific
      # _page_index_. The +start+ parameter is only added for pages after
      # the first.
      #
      def page_url(page_index)
        url = search_url

        if page_index > 1
          url.query_params['start'] = result_offset_of(page_index)
        end

        return url
      end

      #
      # Returns a Page object containing Result objects at the specified
      # _page_index_. The Page will be empty if the response contains no
      # JSON object or the JSON object has no +results+.
      #
      def page(page_index)
        Page.new do |new_page|
          body = @agent.get(page_url(page_index)).body

          # The response wraps the JSON object in a callback invocation,
          # so only the first {...} span is parsed. A response without
          # one is treated like a response without results.
          payload = body[/\{.*\}/]
          hash = (JSON.parse(payload) if payload)

          if (hash.kind_of?(Hash) && hash['results'])
            rank_offset = result_offset_of(page_index)

            hash['results'].each_with_index do |result,index|
              rank = rank_offset + (index + 1)
              # title and content are run through Hpricot to extract
              # their text
              title = Hpricot(result['title']).inner_text
              url = result['unescapedUrl']
              summary = Hpricot(result['content']).inner_text
              cached_url = result['cacheUrl']

              new_page << Result.new(rank,title,url,summary,cached_url)
            end
          end
        end
      end

    end
  end
end
|
data/lib/gscraper/search/page.rb
CHANGED
@@ -1,42 +1,31 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/search/result'
|
24
|
+
require 'gscraper/page'
|
2
25
|
|
3
26
|
module GScraper
|
4
27
|
module Search
|
5
|
-
class Page <
|
6
|
-
|
7
|
-
#
|
8
|
-
# Creates a new Page object with the given _results_.
|
9
|
-
#
|
10
|
-
def initialize(results=[])
|
11
|
-
super(results)
|
12
|
-
end
|
13
|
-
|
14
|
-
#
|
15
|
-
# Returns a mapped Array of the results within the Page using the
|
16
|
-
# given _block_. If the _block_ is not given, the page will be
|
17
|
-
# returned.
|
18
|
-
#
|
19
|
-
# page.map # => Page
|
20
|
-
#
|
21
|
-
# page.map { |result| result.url } # => [...]
|
22
|
-
#
|
23
|
-
def map(&block)
|
24
|
-
return self unless block
|
25
|
-
|
26
|
-
mapped = []
|
27
|
-
|
28
|
-
each { |result| mapped << block.call(result) }
|
29
|
-
return mapped
|
30
|
-
end
|
31
|
-
|
32
|
-
#
|
33
|
-
# Selects the results within the Page which match the given _block_.
|
34
|
-
#
|
35
|
-
# page.select { |result| result.title =~ /ruby/i }
|
36
|
-
#
|
37
|
-
def select(&block)
|
38
|
-
Page.new(super(&block))
|
39
|
-
end
|
28
|
+
class Page < GScraper::Page
|
40
29
|
|
41
30
|
#
|
42
31
|
# Selects the results using the specified _block_.
|
@@ -160,7 +149,7 @@ module GScraper
|
|
160
149
|
# page.cached_urls # => [...]
|
161
150
|
#
|
162
151
|
def cached_urls
|
163
|
-
map { |result| result.cached_url }
|
152
|
+
map { |result| result.cached_url }.compact
|
164
153
|
end
|
165
154
|
|
166
155
|
#
|
@@ -170,7 +159,7 @@ module GScraper
|
|
170
159
|
# page.cached_pages # => [...]
|
171
160
|
#
|
172
161
|
def cached_pages
|
173
|
-
map { |result| result.cached_page }
|
162
|
+
map { |result| result.cached_page }.compact
|
174
163
|
end
|
175
164
|
|
176
165
|
#
|
@@ -180,17 +169,7 @@ module GScraper
|
|
180
169
|
# page.similar_urls # => [...]
|
181
170
|
#
|
182
171
|
def similar_urls
|
183
|
-
map { |result| result.similar_url }
|
184
|
-
end
|
185
|
-
|
186
|
-
#
|
187
|
-
# Returns an Array containing the similar Queries of the results
|
188
|
-
# within the Page.
|
189
|
-
#
|
190
|
-
# page.similar_queries # => [...]
|
191
|
-
#
|
192
|
-
def similar_queries
|
193
|
-
map { |result| result.similar_query }
|
172
|
+
map { |result| result.similar_url }.compact
|
194
173
|
end
|
195
174
|
|
196
175
|
#
|
@@ -263,20 +242,6 @@ module GScraper
|
|
263
242
|
similar_urls.each(&block)
|
264
243
|
end
|
265
244
|
|
266
|
-
#
|
267
|
-
# Iterates over each result's similar Query within the Page, passing
|
268
|
-
# each to the given _block_.
|
269
|
-
#
|
270
|
-
# each_similar_query do |q|
|
271
|
-
# q.first_page do |page|
|
272
|
-
# puts page.urls.join("\n")
|
273
|
-
# end
|
274
|
-
# end
|
275
|
-
#
|
276
|
-
def each_similar_query(&block)
|
277
|
-
similar_queries.each(&block)
|
278
|
-
end
|
279
|
-
|
280
245
|
#
|
281
246
|
# Returns the ranks of the results that match the specified _block_.
|
282
247
|
#
|
@@ -345,16 +310,6 @@ module GScraper
|
|
345
310
|
results_with(&block).similar_urls
|
346
311
|
end
|
347
312
|
|
348
|
-
#
|
349
|
-
# Returns the similar Queries of the results that match the
|
350
|
-
# specified _block_.
|
351
|
-
#
|
352
|
-
# page.similar_queries_of { |result| result.title =~ /hackety/ }
|
353
|
-
#
|
354
|
-
def similar_queries_of(&block)
|
355
|
-
results_with(&block).similar_queries
|
356
|
-
end
|
357
|
-
|
358
313
|
end
|
359
314
|
end
|
360
315
|
end
|
@@ -1,10 +1,33 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/search/result'
|
2
24
|
require 'gscraper/search/page'
|
3
25
|
require 'gscraper/sponsored_ad'
|
4
26
|
require 'gscraper/sponsored_links'
|
5
27
|
require 'gscraper/extensions/uri'
|
28
|
+
require 'gscraper/has_pages'
|
6
29
|
require 'gscraper/licenses'
|
7
|
-
require 'gscraper/
|
30
|
+
require 'gscraper/gscraper'
|
8
31
|
|
9
32
|
require 'hpricot'
|
10
33
|
|
@@ -12,20 +35,6 @@ module GScraper
|
|
12
35
|
module Search
|
13
36
|
class Query
|
14
37
|
|
15
|
-
include WebAgent
|
16
|
-
|
17
|
-
# Search host
|
18
|
-
SEARCH_HOST = 'www.google.com'
|
19
|
-
|
20
|
-
# Search URL
|
21
|
-
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
22
|
-
|
23
|
-
# Default results per-page
|
24
|
-
RESULTS_PER_PAGE = 10
|
25
|
-
|
26
|
-
# Results per-page
|
27
|
-
attr_accessor :results_per_page
|
28
|
-
|
29
38
|
# Search query
|
30
39
|
attr_accessor :query
|
31
40
|
|
@@ -71,67 +80,14 @@ module GScraper
|
|
71
80
|
# Search for results with-out the words
|
72
81
|
attr_accessor :without_words
|
73
82
|
|
74
|
-
# Search for results written in the language
|
75
|
-
attr_accessor :language
|
76
|
-
|
77
|
-
# Search for results from the region
|
78
|
-
attr_accessor :region
|
79
|
-
|
80
|
-
# Search for results in the format
|
81
|
-
attr_accessor :in_format
|
82
|
-
|
83
|
-
# Search for results not in the format
|
84
|
-
attr_accessor :not_in_format
|
85
|
-
|
86
|
-
# Search for results within the past day
|
87
|
-
attr_accessor :within_past_day
|
88
|
-
|
89
|
-
# Search for results within the past week
|
90
|
-
attr_accessor :within_past_week
|
91
|
-
|
92
|
-
# Search for results within the past months
|
93
|
-
attr_accessor :within_past_months
|
94
|
-
|
95
|
-
# Search for results within the past year
|
96
|
-
attr_accessor :within_past_year
|
97
|
-
|
98
83
|
# Search for results containing numbers between the range
|
99
84
|
attr_accessor :numeric_range
|
100
85
|
|
101
|
-
# Search for results where the query ocurrs within the area
|
102
|
-
attr_accessor :occurrs_within
|
103
|
-
|
104
|
-
# Search for results inside the domain
|
105
|
-
attr_accessor :inside_domain
|
106
|
-
|
107
|
-
# Search for results outside the domain
|
108
|
-
attr_accessor :outside_domain
|
109
|
-
|
110
|
-
# Search for results which have the rights
|
111
|
-
attr_accessor :rights
|
112
|
-
|
113
|
-
# Filter the search results
|
114
|
-
attr_accessor :filtered
|
115
|
-
|
116
|
-
# Search for results similar to the page
|
117
|
-
attr_accessor :similar_to
|
118
|
-
|
119
|
-
# Search for results linking to the page
|
120
|
-
attr_accessor :links_to
|
121
|
-
|
122
86
|
#
|
123
87
|
# Creates a new Query object from the given search options. If a
|
124
|
-
# block is given, it will be passed the newly created
|
125
|
-
#
|
126
|
-
# Query.new(:query => 'ruby', :with_words => 'sow rspec')
|
127
|
-
#
|
128
|
-
# Query.new(:exact_phrase => 'fluent interfaces') do |q|
|
129
|
-
# q.within_past_week = true
|
130
|
-
# end
|
88
|
+
# block is given, it will be passed the newly created Query object.
|
131
89
|
#
|
132
90
|
def initialize(options={},&block)
|
133
|
-
@results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
|
134
|
-
|
135
91
|
@query = options[:query]
|
136
92
|
|
137
93
|
@link = options[:link]
|
@@ -151,175 +107,34 @@ module GScraper
|
|
151
107
|
@with_words = options[:with_words]
|
152
108
|
@without_words = options[:without_words]
|
153
109
|
|
154
|
-
@language = options[:language]
|
155
|
-
@region = options[:region]
|
156
|
-
@in_format = options[:in_format]
|
157
|
-
@not_in_format = options[:not_in_format]
|
158
|
-
|
159
|
-
if options[:within_past_day]
|
160
|
-
@within_past_day = options[:within_past_day]
|
161
|
-
@within_past_week = false
|
162
|
-
@within_past_months = false
|
163
|
-
@within_past_year = false
|
164
|
-
elsif options[:within_past_week]
|
165
|
-
@within_past_day = false
|
166
|
-
@within_past_week = options[:within_past_week]
|
167
|
-
@within_past_months = false
|
168
|
-
@within_past_year = false
|
169
|
-
elsif options[:within_past_months]
|
170
|
-
@within_past_day = false
|
171
|
-
@within_past_week = false
|
172
|
-
@within_past_months = options[:within_past_months]
|
173
|
-
@within_past_year = false
|
174
|
-
elsif options[:within_past_year]
|
175
|
-
@within_past_day = false
|
176
|
-
@within_past_week = false
|
177
|
-
@within_past_months = false
|
178
|
-
@within_past_year = options[:within_past_year]
|
179
|
-
else
|
180
|
-
@within_past_day = false
|
181
|
-
@within_past_week = false
|
182
|
-
@within_past_months = false
|
183
|
-
@within_past_year = false
|
184
|
-
end
|
185
|
-
|
186
110
|
@numeric_range = options[:numeric_range]
|
187
|
-
@occurrs_within = options[:occurrs_within]
|
188
|
-
@inside_domain = options[:inside_domain]
|
189
|
-
@outside_domain = options[:outside_domain]
|
190
|
-
@rights = options[:rights]
|
191
|
-
@filtered = options[:filtered]
|
192
|
-
|
193
|
-
@similar_to = options[:similar_to]
|
194
|
-
@links_to = options[:links_to]
|
195
111
|
|
196
112
|
block.call(self) if block
|
197
113
|
end
|
198
114
|
|
199
115
|
#
|
200
|
-
#
|
201
|
-
# given, it will be passed the newly created Query object.
|
202
|
-
#
|
203
|
-
# Query.from_url('http://www.google.com/search?q=ruby+zen)
|
116
|
+
# Returns the query expression.
|
204
117
|
#
|
205
|
-
|
206
|
-
|
207
|
-
# q.occurrs_within = :title
|
208
|
-
# end
|
209
|
-
#
|
210
|
-
def self.from_url(url,options={},&block)
|
211
|
-
url = URI.parse(url)
|
212
|
-
|
213
|
-
options[:results_per_page] = url.query_params['num']
|
214
|
-
|
215
|
-
options[:query] = url.query_params['as_q']
|
216
|
-
options[:exact_phrase] = url.query_params['as_epq']
|
217
|
-
options[:with_words] = url.query_params['as_oq']
|
218
|
-
options[:without_words] = url.query_params['as_eq']
|
219
|
-
|
220
|
-
options[:language] = url.query_params['lr']
|
221
|
-
options[:region] = url.query_params['cr']
|
222
|
-
|
223
|
-
case url.query_params['as_ft']
|
224
|
-
when 'i'
|
225
|
-
options[:in_format] = url.query_params['as_filetype']
|
226
|
-
when 'e'
|
227
|
-
options[:not_in_format] = url.query_params['as_filetype']
|
228
|
-
end
|
229
|
-
|
230
|
-
case url.query_params['as_qdr']
|
231
|
-
when 'd'
|
232
|
-
options[:within_past_day] = true
|
233
|
-
when 'w'
|
234
|
-
options[:within_past_week] = true
|
235
|
-
when 'm'
|
236
|
-
options[:within_past_months] = 1
|
237
|
-
when 'm2'
|
238
|
-
options[:within_past_months] = 2
|
239
|
-
when 'm3'
|
240
|
-
options[:within_past_months] = 3
|
241
|
-
when 'm6'
|
242
|
-
options[:within_past_months] = 6
|
243
|
-
when 'y'
|
244
|
-
options[:within_past_year] = true
|
245
|
-
end
|
246
|
-
|
247
|
-
if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
|
248
|
-
options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
|
249
|
-
end
|
250
|
-
|
251
|
-
case url.query_params['as_occt']
|
252
|
-
when 'title'
|
253
|
-
options[:occurrs_within] = :title
|
254
|
-
when 'body'
|
255
|
-
options[:occurrs_within] = :body
|
256
|
-
when 'url'
|
257
|
-
options[:occurrs_within] = :url
|
258
|
-
when 'links'
|
259
|
-
options[:occurrs_within] = :links
|
260
|
-
end
|
261
|
-
|
262
|
-
case url.query_params['as_dt']
|
263
|
-
when 'i'
|
264
|
-
options[:inside_domain] = url.query_params['as_sitesearch']
|
265
|
-
when 'e'
|
266
|
-
options[:outside_domain] = url.query_params['as_sitesearch']
|
267
|
-
end
|
268
|
-
|
269
|
-
case url.query_params['as_rights']
|
270
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
271
|
-
options[:rights] = Licenses::CC_BY_NC_ND
|
272
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
273
|
-
options[:rights] = Licenses::CC_BY_SA
|
274
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
275
|
-
options[:rights] = Licenses::CC_BY_NC
|
276
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
277
|
-
options[:rights] = Licenses::CC_BY
|
278
|
-
end
|
279
|
-
|
280
|
-
if url.query_params[:safe]=='active'
|
281
|
-
options[:filtered] = true
|
282
|
-
end
|
283
|
-
|
284
|
-
if url.query_params['as_rq']
|
285
|
-
options[:similar_to] = url.query_params['as_rq']
|
286
|
-
elsif url.query_params['as_lq']
|
287
|
-
options[:links_to] = url.query_params['as_lq']
|
288
|
-
end
|
289
|
-
|
290
|
-
return self.new(options,&block)
|
291
|
-
end
|
292
|
-
|
293
|
-
#
|
294
|
-
# Returns the URL that represents the query.
|
295
|
-
#
|
296
|
-
def search_url
|
297
|
-
url = URI(SEARCH_URL)
|
298
|
-
query_expr = []
|
299
|
-
|
300
|
-
set_param = lambda { |param,value|
|
301
|
-
url.query_params[param.to_s] = value if value
|
302
|
-
}
|
118
|
+
def expression
|
119
|
+
expr = []
|
303
120
|
|
304
121
|
append_modifier = lambda { |name|
|
305
122
|
modifier = instance_variable_get("@#{name}")
|
306
123
|
|
307
|
-
|
124
|
+
expr << "#{name}:#{modifier}" if modifier
|
308
125
|
}
|
309
126
|
|
310
|
-
|
127
|
+
append_options = lambda { |name|
|
311
128
|
ops = instance_variable_get("@#{name}")
|
312
129
|
|
313
130
|
if ops.kind_of?(Array)
|
314
|
-
|
131
|
+
expr << "#{name}:#{ops.join(' ')}"
|
315
132
|
elsif ops
|
316
|
-
|
133
|
+
expr << "#{name}:#{ops}"
|
317
134
|
end
|
318
135
|
}
|
319
136
|
|
320
|
-
|
321
|
-
|
322
|
-
query_expr << @query if @query
|
137
|
+
expr << @query if @query
|
323
138
|
|
324
139
|
append_modifier.call(:link)
|
325
140
|
append_modifier.call(:related)
|
@@ -327,256 +142,30 @@ module GScraper
|
|
327
142
|
append_modifier.call(:site)
|
328
143
|
append_modifier.call(:filetype)
|
329
144
|
|
330
|
-
|
145
|
+
append_options.call(:allintitle)
|
331
146
|
append_modifier.call(:intitle)
|
332
|
-
|
147
|
+
append_options.call(:allinurl)
|
333
148
|
append_modifier.call(:inurl)
|
334
|
-
|
149
|
+
append_options.call(:allintext)
|
335
150
|
append_modifier.call(:intext)
|
336
151
|
|
337
|
-
|
338
|
-
|
339
|
-
end
|
340
|
-
|
341
|
-
set_param.call('as_epq',@exact_phrase)
|
342
|
-
set_param.call('as_oq',@with_words)
|
343
|
-
set_param.call('as_eq',@without_words)
|
344
|
-
|
345
|
-
set_param.call('lr',@language)
|
346
|
-
set_param.call('cr',@region)
|
347
|
-
|
348
|
-
if @in_format
|
349
|
-
url.query_params['as_ft'] = 'i'
|
350
|
-
url.query_params['as_filtetype'] = @in_format
|
351
|
-
elsif @not_in_format
|
352
|
-
url.query_params['as_ft'] = 'e'
|
353
|
-
url.query_params['as_filtetype'] = @not_in_format
|
354
|
-
end
|
355
|
-
|
356
|
-
if @within_past_day
|
357
|
-
url.query_params['as_qdr'] = 'd'
|
358
|
-
elsif @within_past_week
|
359
|
-
url.query_params['as_qdr'] = 'w'
|
360
|
-
elsif @within_past_months
|
361
|
-
case @within_past_months
|
362
|
-
when 1
|
363
|
-
url.query_params['as_qdr'] = 'm'
|
364
|
-
when 2
|
365
|
-
url.query_params['as_qdr'] = 'm2'
|
366
|
-
when 3
|
367
|
-
url.query_params['as_qdr'] = 'm3'
|
368
|
-
when 6
|
369
|
-
url.query_params['as_qdr'] = 'm6'
|
370
|
-
end
|
371
|
-
elsif @within_past_year
|
372
|
-
url.query_params['as_qdr'] = 'y'
|
373
|
-
end
|
374
|
-
|
375
|
-
if @numeric_range
|
376
|
-
url.query_params['as_nlo'] = @numeric_range.begin
|
377
|
-
url.query_params['as_nhi'] = @numeric_range.end
|
378
|
-
end
|
379
|
-
|
380
|
-
case @occurrs_within
|
381
|
-
when :title, 'title'
|
382
|
-
url.query_params['as_occt'] = 'title'
|
383
|
-
when :body, 'body'
|
384
|
-
url.query_params['as_occt'] = 'body'
|
385
|
-
when :url, 'url'
|
386
|
-
url.query_params['as_occt'] = 'url'
|
387
|
-
when :links, 'links'
|
388
|
-
url.query_params['as_occt'] = 'links'
|
152
|
+
if @exact_phrase
|
153
|
+
expr << "\"#{@exact_phrase}\""
|
389
154
|
end
|
390
155
|
|
391
|
-
if @
|
392
|
-
|
393
|
-
url.query_params['as_sitesearch'] = @inside_domain
|
394
|
-
elsif @outside_domain
|
395
|
-
url.query_params['as_dt'] = 'e'
|
396
|
-
url.query_params['as_sitesearch'] = @outside_domain
|
156
|
+
if @with_words.kind_of?(Array)
|
157
|
+
expr << @with_words.join(' OR ')
|
397
158
|
end
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
402
|
-
when Licenses::CC_BY_SA
|
403
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
404
|
-
when Licenses::CC_BY_ND
|
405
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
406
|
-
when Licenses::CC_BY
|
407
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
159
|
+
|
160
|
+
if @without_words.kind_of?(Array)
|
161
|
+
expr << @without_words.map { |word| "-#{word}" }.join(' ')
|
408
162
|
end
|
409
163
|
|
410
|
-
|
411
|
-
|
412
|
-
if @similar_to
|
413
|
-
url.query_params['as_rq'] = @similar_to
|
414
|
-
elsif @links_to
|
415
|
-
url.query_params['as_lq'] = @links_to
|
164
|
+
if @numeric_range.kind_of?(Range)
|
165
|
+
expr << "#{@numeric_range.begin}..#{@numeric_range.end}"
|
416
166
|
end
|
417
167
|
|
418
|
-
return
|
419
|
-
end
|
420
|
-
|
421
|
-
#
|
422
|
-
# Returns the URL that represents the query at the specific
|
423
|
-
# _page_index_.
|
424
|
-
#
|
425
|
-
def page_url(page_index)
|
426
|
-
url = search_url
|
427
|
-
|
428
|
-
url.query_params['start'] = page_result_offset(page_index)
|
429
|
-
url.query_params['sa'] = 'N'
|
430
|
-
|
431
|
-
return url
|
432
|
-
end
|
433
|
-
|
434
|
-
#
|
435
|
-
# Returns a Page object containing Result objects at the specified
|
436
|
-
# _page_index_. If a _block_ is given, it will be passed the newly
|
437
|
-
# created Page.
|
438
|
-
#
|
439
|
-
def page(page_index,&block)
|
440
|
-
doc = get_page(page_url(page_index))
|
441
|
-
|
442
|
-
new_page = Page.new
|
443
|
-
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
444
|
-
|
445
|
-
results.each_with_index do |result,index|
|
446
|
-
rank = page_result_offset(page_index) + (index + 1)
|
447
|
-
link = result.at('//a.l')
|
448
|
-
title = link.inner_text
|
449
|
-
url = link.get_attribute('href')
|
450
|
-
summary_text = ''
|
451
|
-
cached_url = nil
|
452
|
-
similar_url = nil
|
453
|
-
|
454
|
-
if (content = (result.at('//td.j//font|//td.j/div.sml')))
|
455
|
-
content.children.each do |elem|
|
456
|
-
break if (!(elem.text?) && elem.name=='br')
|
457
|
-
|
458
|
-
summary_text << elem.inner_text
|
459
|
-
end
|
460
|
-
|
461
|
-
if (cached_link = result.at('nobr/a:first'))
|
462
|
-
cached_url = cached_link.get_attribute('href')
|
463
|
-
end
|
464
|
-
|
465
|
-
if (similar_link = result.at('nobr/a:last'))
|
466
|
-
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
467
|
-
end
|
468
|
-
end
|
469
|
-
|
470
|
-
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
471
|
-
end
|
472
|
-
|
473
|
-
block.call(new_page) if block
|
474
|
-
return new_page
|
475
|
-
end
|
476
|
-
|
477
|
-
#
|
478
|
-
# Returns the Results on the first page. If a _block_ is given it
|
479
|
-
# will be passed the newly created Page.
|
480
|
-
#
|
481
|
-
def first_page(&block)
|
482
|
-
page(1,&block)
|
483
|
-
end
|
484
|
-
|
485
|
-
#
|
486
|
-
# Returns the Result at the specified _index_.
|
487
|
-
#
|
488
|
-
def result_at(index)
|
489
|
-
page(result_page_index(index))[page_result_index(index)]
|
490
|
-
end
|
491
|
-
|
492
|
-
#
|
493
|
-
# Returns the first Result on the first_page.
|
494
|
-
#
|
495
|
-
def top_result
|
496
|
-
result_at(1)
|
497
|
-
end
|
498
|
-
|
499
|
-
#
|
500
|
-
# Iterates over the results at the specified _page_index_, passing
|
501
|
-
# each to the given _block_.
|
502
|
-
#
|
503
|
-
# query.each_on_page(2) do |result|
|
504
|
-
# puts result.title
|
505
|
-
# end
|
506
|
-
#
|
507
|
-
def each_on_page(page_index,&block)
|
508
|
-
page(page_index).each(&block)
|
509
|
-
end
|
510
|
-
|
511
|
-
#
|
512
|
-
# Iterates over the results on the first page, passing each to the
|
513
|
-
# given _block_.
|
514
|
-
#
|
515
|
-
# query.each_on_first_page do |result|
|
516
|
-
# puts result.url
|
517
|
-
# end
|
518
|
-
#
|
519
|
-
def each_on_first_page(&block)
|
520
|
-
each_on_page(1,&block)
|
521
|
-
end
|
522
|
-
|
523
|
-
#
|
524
|
-
# Returns a SponsoredLinks object containing SponsoredAd objects of
|
525
|
-
# the query. If a _block_ is given, it will be passed the newly
|
526
|
-
# created Page.
|
527
|
-
#
|
528
|
-
def sponsored_links(&block)
|
529
|
-
doc = get_page(search_url)
|
530
|
-
new_links = SponsoredLinks.new
|
531
|
-
|
532
|
-
# top and side ads
|
533
|
-
doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
|
534
|
-
title = link.inner_text
|
535
|
-
url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
|
536
|
-
|
537
|
-
new_links << SponsoredAd.new(title,url)
|
538
|
-
end
|
539
|
-
|
540
|
-
block.call(new_links) if block
|
541
|
-
return new_links
|
542
|
-
end
|
543
|
-
|
544
|
-
#
|
545
|
-
# Returns the first sponsored link on the first page of results.
|
546
|
-
#
|
547
|
-
def top_sponsored_link
|
548
|
-
top_sponsored_links.first
|
549
|
-
end
|
550
|
-
|
551
|
-
#
|
552
|
-
# Iterates over the sponsored links on the first page of
|
553
|
-
# results passing each to the specified _block_.
|
554
|
-
#
|
555
|
-
def each_sponsored_link(&block)
|
556
|
-
sponsored_links.each(&block)
|
557
|
-
end
|
558
|
-
|
559
|
-
protected
|
560
|
-
|
561
|
-
#
|
562
|
-
# Returns the rank offset for the specified _page_index_.
|
563
|
-
#
|
564
|
-
def page_result_offset(page_index)
|
565
|
-
(page_index.to_i - 1) * @results_per_page.to_i
|
566
|
-
end
|
567
|
-
|
568
|
-
#
|
569
|
-
# Returns the in-Page index of the _result_index_.
|
570
|
-
#
|
571
|
-
def page_result_index(result_index)
|
572
|
-
(result_index.to_i - 1) % @results_per_page.to_i
|
573
|
-
end
|
574
|
-
|
575
|
-
#
|
576
|
-
# Returns the page index for the specified _result_index_
|
577
|
-
#
|
578
|
-
def result_page_index(result_index)
|
579
|
-
((result_index.to_i - 1) / @results_per_page.to_i) + 1
|
168
|
+
return expr.join(' ')
|
580
169
|
end
|
581
170
|
|
582
171
|
end
|