gscraper 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING.txt +339 -0
- data/History.txt +21 -0
- data/Manifest.txt +23 -10
- data/README.txt +17 -21
- data/Rakefile +3 -6
- data/lib/gscraper.rb +22 -0
- data/lib/gscraper/extensions.rb +22 -0
- data/lib/gscraper/extensions/uri.rb +22 -0
- data/lib/gscraper/extensions/uri/http.rb +25 -71
- data/lib/gscraper/extensions/uri/query_params.rb +96 -0
- data/lib/gscraper/gscraper.rb +30 -0
- data/lib/gscraper/has_pages.rb +114 -0
- data/lib/gscraper/licenses.rb +22 -0
- data/lib/gscraper/page.rb +64 -0
- data/lib/gscraper/search.rb +24 -0
- data/lib/gscraper/search/ajax_query.rb +176 -0
- data/lib/gscraper/search/page.rb +27 -72
- data/lib/gscraper/search/query.rb +46 -457
- data/lib/gscraper/search/result.rb +32 -29
- data/lib/gscraper/search/search.rb +44 -3
- data/lib/gscraper/search/web_query.rb +472 -0
- data/lib/gscraper/sponsored_ad.rb +26 -2
- data/lib/gscraper/sponsored_links.rb +77 -8
- data/lib/gscraper/version.rb +23 -1
- data/spec/extensions/uri/http_spec.rb +9 -0
- data/spec/extensions/uri/query_params_spec.rb +38 -0
- data/spec/gscraper_spec.rb +29 -0
- data/spec/has_pages_examples.rb +19 -0
- data/spec/has_sponsored_links_examples.rb +57 -0
- data/spec/helpers/query.rb +1 -0
- data/spec/helpers/uri.rb +8 -0
- data/spec/page_has_results_examples.rb +13 -0
- data/spec/search/ajax_query_spec.rb +124 -0
- data/spec/search/page_has_results_examples.rb +51 -0
- data/spec/search/query_spec.rb +103 -0
- data/spec/search/web_query_spec.rb +74 -0
- data/spec/spec_helper.rb +6 -0
- data/tasks/spec.rb +7 -0
- metadata +34 -20
- data/LICENSE.txt +0 -23
- data/lib/gscraper/web_agent.rb +0 -38
- data/test/search/page_results.rb +0 -103
- data/test/search/query_from_url.rb +0 -50
- data/test/search/query_pages.rb +0 -32
- data/test/search/query_result.rb +0 -30
- data/test/test_gscraper.rb +0 -4
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,3 +1,25 @@
+#
+#--
+# GScraper - A web-scraping interface to various Google Services.
+#
+# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#++
+#
+
 module GScraper
   module Licenses
     # Any desired license
data/lib/gscraper/page.rb
ADDED
@@ -0,0 +1,64 @@
+#
+#--
+# GScraper - A web-scraping interface to various Google Services.
+#
+# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#++
+#
+
+module GScraper
+  class Page < Array
+
+    #
+    # Creates a new Page object with the given _elements_. If a _block_
+    # is given, it will be passed the newly created Page object.
+    #
+    def initialize(elements=[],&block)
+      super(elements)
+
+      block.call(self) if block
+    end
+
+    #
+    # Returns a mapped Array of the elements within the Page using the
+    # given _block_. If the _block_ is not given, the page will be
+    # returned.
+    #
+    #   page.map # => Page
+    #
+    #   page.map { |element| element.field } # => [...]
+    #
+    def map(&block)
+      return self unless block
+
+      mapped = []
+
+      each { |element| mapped << block.call(element) }
+      return mapped
+    end
+
+    #
+    # Selects the elements within the Page which match the given _block_.
+    #
+    #   page.select { |element| element.field =~ /ruby/i }
+    #
+    def select(&block)
+      self.class.new(super(&block))
+    end
+
+  end
+end
data/lib/gscraper/search.rb
CHANGED
@@ -1 +1,25 @@
+#
+#--
+# GScraper - A web-scraping interface to various Google Services.
+#
+# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#++
+#
+
+require 'gscraper/search/web_query'
+require 'gscraper/search/ajax_query'
 require 'gscraper/search/search'
data/lib/gscraper/search/ajax_query.rb
ADDED
@@ -0,0 +1,176 @@
+#
+#--
+# GScraper - A web-scraping interface to various Google Services.
+#
+# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#++
+#
+
+require 'gscraper/search/result'
+require 'gscraper/search/page'
+require 'gscraper/search/query'
+require 'gscraper/extensions/uri'
+require 'gscraper/has_pages'
+require 'gscraper/gscraper'
+
+require 'json'
+
+module GScraper
+  module Search
+    class AJAXQuery < Query
+
+      include HasPages
+
+      # Maximum results per-page
+      RESULTS_PER_PAGE = 8
+
+      # AJAX API host
+      API_HOST = 'www.google.com'
+
+      # AJAX API URL
+      API_URL = "http://#{API_HOST}/uds/GwebSearch?callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large"
+
+      # Default language
+      DEFAULT_LANGUAGE = 'en'
+
+      # Default signature
+      DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'
+
+      # Default key
+      DEFAULT_KEY = 'notsupplied'
+
+      # Default version
+      DEFAULT_VERSION = '1.0'
+
+      # The search language
+      attr_accessor :language
+
+      # The search signature
+      attr_accessor :sig
+
+      # The search key
+      attr_accessor :key
+
+      # The API version
+      attr_accessor :version
+
+      #
+      # Creates a new AJAXQuery with the given _options_. If a _block_ is
+      # given it will be passed the newly created AJAXQuery object.
+      #
+      # _options_ may contain the following keys:
+      # <tt>:language</tt>:: The search language. Defaults to <tt>:en</tt>.
+      # <tt>:sig</tt>:: The search signature. Defaults to
+      #                 +582c1116317355adf613a6a843f19ece+.
+      # <tt>:key</tt>:: The search key. Defaults to <tt>:notsupplied</tt>.
+      # <tt>:version</tt>:: The desired API version. Defaults to
+      #                     <tt>1.0</tt>.
+      #
+      def initialize(options={},&block)
+        @agent = GScraper.web_agent(options)
+
+        @language = (options[:language] || DEFAULT_LANGUAGE)
+
+        @sig = (options[:sig] || DEFAULT_SIG)
+        @key = (options[:key] || DEFAULT_KEY)
+        @version = (options[:version] || DEFAULT_VERSION)
+
+        super(options,&block)
+      end
+
+      #
+      # Creates a new AJAXQuery object from the specified URL. If a block is
+      # given, it will be passed the newly created AJAXQuery object.
+      #
+      def self.from_url(url,options={},&block)
+        url = URI(url.to_s)
+
+        options[:language] = url.query_params['hl']
+        options[:query] = url.query_params['q']
+
+        options[:sig] = url.query_params['sig']
+        options[:key] = url.query_params['key']
+        options[:version] = url.query_params['v']
+
+        return self.new(options,&block)
+      end
+
+      #
+      # Returns +RESULTS_PER_PAGE+.
+      #
+      def results_per_page
+        RESULTS_PER_PAGE
+      end
+
+      #
+      # Returns the URL that represents the query.
+      #
+      def search_url
+        search_url = URI(API_URL)
+
+        search_url.query_params['hl'] = @language
+        search_url.query_params['gss'] = '.com'
+        search_url.query_params['q'] = expression
+        search_url.query_params['sig'] = @sig
+        search_url.query_params['key'] = @key
+        search_url.query_params['v'] = @version
+
+        return search_url
+      end
+
+      #
+      # Returns the URL that represents the query of a specific
+      # _page_index_.
+      #
+      def page_url(page_index)
+        url = search_url
+
+        if page_index > 1
+          url.query_params['start'] = result_offset_of(page_index)
+        end
+
+        return url
+      end
+
+      #
+      # Returns a Page object containing Result objects at the specified
+      # _page_index_.
+      #
+      def page(page_index)
+        Page.new do |new_page|
+          body = @agent.get(page_url(page_index)).body
+          hash = JSON.parse(body.scan(/\{.*\}/).first)
+
+          rank_offset = result_offset_of(page_index)
+
+          if (hash.kind_of?(Hash) && hash['results'])
+            hash['results'].each_with_index do |result,index|
+              rank = rank_offset + (index + 1)
+              title = Hpricot(result['title']).inner_text
+              url = result['unescapedUrl']
+              summary = Hpricot(result['content']).inner_text
+              cached_url = result['cacheUrl']
+
+              new_page << Result.new(rank,title,url,summary,cached_url)
+            end
+          end
+        end
+      end
+
+    end
+  end
+end
data/lib/gscraper/search/page.rb
CHANGED
@@ -1,42 +1,31 @@
+#
+#--
+# GScraper - A web-scraping interface to various Google Services.
+#
+# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#++
+#
+
 require 'gscraper/search/result'
+require 'gscraper/page'
 
 module GScraper
   module Search
-    class Page < Array
-
-      #
-      # Creates a new Page object with the given _results_.
-      #
-      def initialize(results=[])
-        super(results)
-      end
-
-      #
-      # Returns a mapped Array of the results within the Page using the
-      # given _block_. If the _block_ is not given, the page will be
-      # returned.
-      #
-      #   page.map # => Page
-      #
-      #   page.map { |result| result.url } # => [...]
-      #
-      def map(&block)
-        return self unless block
-
-        mapped = []
-
-        each { |result| mapped << block.call(result) }
-        return mapped
-      end
-
-      #
-      # Selects the results within the Page which match the given _block_.
-      #
-      #   page.select { |result| result.title =~ /ruby/i }
-      #
-      def select(&block)
-        Page.new(super(&block))
-      end
+    class Page < GScraper::Page
 
       #
       # Selects the results using the specified _block_.
@@ -160,7 +149,7 @@ module GScraper
       #   page.cached_urls # => [...]
       #
       def cached_urls
-        map { |result| result.cached_url }
+        map { |result| result.cached_url }.compact
       end
 
       #
@@ -170,7 +159,7 @@ module GScraper
       #   page.cached_pages # => [...]
       #
       def cached_pages
-        map { |result| result.cached_page }
+        map { |result| result.cached_page }.compact
       end
 
       #
@@ -180,17 +169,7 @@ module GScraper
       #   page.similar_urls # => [...]
       #
       def similar_urls
-        map { |result| result.similar_url }
-      end
-
-      #
-      # Returns an Array containing the similar Queries of the results
-      # within the Page.
-      #
-      #   page.similar_queries # => [...]
-      #
-      def similar_queries
-        map { |result| result.similar_query }
+        map { |result| result.similar_url }.compact
       end
 
       #
@@ -263,20 +242,6 @@ module GScraper
         similar_urls.each(&block)
       end
 
-      #
-      # Iterates over each result's similar Query within the Page, passing
-      # each to the given _block_.
-      #
-      #   each_similar_query do |q|
-      #     q.first_page do |page|
-      #       puts page.urls.join("\n")
-      #     end
-      #   end
-      #
-      def each_similar_query(&block)
-        similar_queries.each(&block)
-      end
-
       #
       # Returns the ranks of the results that match the specified _block_.
       #
@@ -345,16 +310,6 @@ module GScraper
         results_with(&block).similar_urls
       end
 
-      #
-      # Returns the similar Queries of the results that match the
-      # specified _block_.
-      #
-      #   page.similar_queries_of { |result| result.title =~ /hackety/ }
-      #
-      def similar_queries_of(&block)
-        results_with(&block).similar_queries
-      end
-
     end
   end
 end
data/lib/gscraper/search/query.rb
CHANGED
@@ -1,10 +1,33 @@
+#
+#--
+# GScraper - A web-scraping interface to various Google Services.
+#
+# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#++
+#
+
 require 'gscraper/search/result'
 require 'gscraper/search/page'
 require 'gscraper/sponsored_ad'
 require 'gscraper/sponsored_links'
 require 'gscraper/extensions/uri'
+require 'gscraper/has_pages'
 require 'gscraper/licenses'
-require 'gscraper/web_agent'
+require 'gscraper/gscraper'
 
 require 'hpricot'
 
@@ -12,20 +35,6 @@ module GScraper
   module Search
     class Query
 
-      include WebAgent
-
-      # Search host
-      SEARCH_HOST = 'www.google.com'
-
-      # Search URL
-      SEARCH_URL = "http://#{SEARCH_HOST}/search"
-
-      # Default results per-page
-      RESULTS_PER_PAGE = 10
-
-      # Results per-page
-      attr_accessor :results_per_page
-
       # Search query
       attr_accessor :query
 
@@ -71,67 +80,14 @@ module GScraper
       # Search for results with-out the words
      attr_accessor :without_words
 
-      # Search for results written in the language
-      attr_accessor :language
-
-      # Search for results from the region
-      attr_accessor :region
-
-      # Search for results in the format
-      attr_accessor :in_format
-
-      # Search for results not in the format
-      attr_accessor :not_in_format
-
-      # Search for results within the past day
-      attr_accessor :within_past_day
-
-      # Search for results within the past week
-      attr_accessor :within_past_week
-
-      # Search for results within the past months
-      attr_accessor :within_past_months
-
-      # Search for results within the past year
-      attr_accessor :within_past_year
-
       # Search for results containing numbers between the range
       attr_accessor :numeric_range
 
-      # Search for results where the query ocurrs within the area
-      attr_accessor :occurrs_within
-
-      # Search for results inside the domain
-      attr_accessor :inside_domain
-
-      # Search for results outside the domain
-      attr_accessor :outside_domain
-
-      # Search for results which have the rights
-      attr_accessor :rights
-
-      # Filter the search results
-      attr_accessor :filtered
-
-      # Search for results similar to the page
-      attr_accessor :similar_to
-
-      # Search for results linking to the page
-      attr_accessor :links_to
-
       #
       # Creates a new Query object from the given search options. If a
-      # block is given, it will be passed the newly created
-      #
-      #   Query.new(:query => 'ruby', :with_words => 'sow rspec')
-      #
-      #   Query.new(:exact_phrase => 'fluent interfaces') do |q|
-      #     q.within_past_week = true
-      #   end
+      # block is given, it will be passed the newly created Query object.
      #
      def initialize(options={},&block)
-        @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
-
        @query = options[:query]
 
        @link = options[:link]
@@ -151,175 +107,34 @@ module GScraper
         @with_words = options[:with_words]
         @without_words = options[:without_words]
 
-        @language = options[:language]
-        @region = options[:region]
-        @in_format = options[:in_format]
-        @not_in_format = options[:not_in_format]
-
-        if options[:within_past_day]
-          @within_past_day = options[:within_past_day]
-          @within_past_week = false
-          @within_past_months = false
-          @within_past_year = false
-        elsif options[:within_past_week]
-          @within_past_day = false
-          @within_past_week = options[:within_past_week]
-          @within_past_months = false
-          @within_past_year = false
-        elsif options[:within_past_months]
-          @within_past_day = false
-          @within_past_week = false
-          @within_past_months = options[:within_past_months]
-          @within_past_year = false
-        elsif options[:within_past_year]
-          @within_past_day = false
-          @within_past_week = false
-          @within_past_months = false
-          @within_past_year = options[:within_past_year]
-        else
-          @within_past_day = false
-          @within_past_week = false
-          @within_past_months = false
-          @within_past_year = false
-        end
-
         @numeric_range = options[:numeric_range]
-        @occurrs_within = options[:occurrs_within]
-        @inside_domain = options[:inside_domain]
-        @outside_domain = options[:outside_domain]
-        @rights = options[:rights]
-        @filtered = options[:filtered]
-
-        @similar_to = options[:similar_to]
-        @links_to = options[:links_to]
 
         block.call(self) if block
       end
 
       #
-      #
-      # given, it will be passed the newly created Query object.
-      #
-      #   Query.from_url('http://www.google.com/search?q=ruby+zen)
+      # Returns the query expression.
       #
-
-
-      #     q.occurrs_within = :title
-      #   end
-      #
-      def self.from_url(url,options={},&block)
-        url = URI.parse(url)
-
-        options[:results_per_page] = url.query_params['num']
-
-        options[:query] = url.query_params['as_q']
-        options[:exact_phrase] = url.query_params['as_epq']
-        options[:with_words] = url.query_params['as_oq']
-        options[:without_words] = url.query_params['as_eq']
-
-        options[:language] = url.query_params['lr']
-        options[:region] = url.query_params['cr']
-
-        case url.query_params['as_ft']
-        when 'i'
-          options[:in_format] = url.query_params['as_filetype']
-        when 'e'
-          options[:not_in_format] = url.query_params['as_filetype']
-        end
-
-        case url.query_params['as_qdr']
-        when 'd'
-          options[:within_past_day] = true
-        when 'w'
-          options[:within_past_week] = true
-        when 'm'
-          options[:within_past_months] = 1
-        when 'm2'
-          options[:within_past_months] = 2
-        when 'm3'
-          options[:within_past_months] = 3
-        when 'm6'
-          options[:within_past_months] = 6
-        when 'y'
-          options[:within_past_year] = true
-        end
-
-        if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
-          options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
-        end
-
-        case url.query_params['as_occt']
-        when 'title'
-          options[:occurrs_within] = :title
-        when 'body'
-          options[:occurrs_within] = :body
-        when 'url'
-          options[:occurrs_within] = :url
-        when 'links'
-          options[:occurrs_within] = :links
-        end
-
-        case url.query_params['as_dt']
-        when 'i'
-          options[:inside_domain] = url.query_params['as_sitesearch']
-        when 'e'
-          options[:outside_domain] = url.query_params['as_sitesearch']
-        end
-
-        case url.query_params['as_rights']
-        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
-          options[:rights] = Licenses::CC_BY_NC_ND
-        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
-          options[:rights] = Licenses::CC_BY_SA
-        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
-          options[:rights] = Licenses::CC_BY_NC
-        when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
-          options[:rights] = Licenses::CC_BY
-        end
-
-        if url.query_params[:safe]=='active'
-          options[:filtered] = true
-        end
-
-        if url.query_params['as_rq']
-          options[:similar_to] = url.query_params['as_rq']
-        elsif url.query_params['as_lq']
-          options[:links_to] = url.query_params['as_lq']
-        end
-
-        return self.new(options,&block)
-      end
-
-      #
-      # Returns the URL that represents the query.
-      #
-      def search_url
-        url = URI(SEARCH_URL)
-        query_expr = []
-
-        set_param = lambda { |param,value|
-          url.query_params[param.to_s] = value if value
-        }
+      def expression
+        expr = []
 
         append_modifier = lambda { |name|
           modifier = instance_variable_get("@#{name}")
 
-          query_expr << "#{name}:#{modifier}" if modifier
+          expr << "#{name}:#{modifier}" if modifier
         }
 
-
+        append_options = lambda { |name|
          ops = instance_variable_get("@#{name}")
 
          if ops.kind_of?(Array)
-            query_expr << "#{name}:#{ops.join(' ')}"
+            expr << "#{name}:#{ops.join(' ')}"
          elsif ops
-            query_expr << "#{name}:#{ops}"
+            expr << "#{name}:#{ops}"
          end
        }
 
-
-
-        query_expr << @query if @query
+        expr << @query if @query
 
         append_modifier.call(:link)
         append_modifier.call(:related)
@@ -327,256 +142,30 @@ module GScraper
         append_modifier.call(:site)
         append_modifier.call(:filetype)
 
-
+        append_options.call(:allintitle)
         append_modifier.call(:intitle)
-
+        append_options.call(:allinurl)
         append_modifier.call(:inurl)
-
+        append_options.call(:allintext)
         append_modifier.call(:intext)
 
-
-
-        end
-
-        set_param.call('as_epq',@exact_phrase)
-        set_param.call('as_oq',@with_words)
-        set_param.call('as_eq',@without_words)
-
-        set_param.call('lr',@language)
-        set_param.call('cr',@region)
-
-        if @in_format
-          url.query_params['as_ft'] = 'i'
-          url.query_params['as_filtetype'] = @in_format
-        elsif @not_in_format
-          url.query_params['as_ft'] = 'e'
-          url.query_params['as_filtetype'] = @not_in_format
-        end
-
-        if @within_past_day
-          url.query_params['as_qdr'] = 'd'
-        elsif @within_past_week
-          url.query_params['as_qdr'] = 'w'
-        elsif @within_past_months
-          case @within_past_months
-          when 1
-            url.query_params['as_qdr'] = 'm'
-          when 2
-            url.query_params['as_qdr'] = 'm2'
-          when 3
-            url.query_params['as_qdr'] = 'm3'
-          when 6
-            url.query_params['as_qdr'] = 'm6'
-          end
-        elsif @within_past_year
-          url.query_params['as_qdr'] = 'y'
-        end
-
-        if @numeric_range
-          url.query_params['as_nlo'] = @numeric_range.begin
-          url.query_params['as_nhi'] = @numeric_range.end
-        end
-
-        case @occurrs_within
-        when :title, 'title'
-          url.query_params['as_occt'] = 'title'
-        when :body, 'body'
-          url.query_params['as_occt'] = 'body'
-        when :url, 'url'
-          url.query_params['as_occt'] = 'url'
-        when :links, 'links'
-          url.query_params['as_occt'] = 'links'
+        if @exact_phrase
+          expr << "\"#{@exact_phrase}\""
         end
 
-        if @inside_domain
-          url.query_params['as_dt'] = 'i'
-          url.query_params['as_sitesearch'] = @inside_domain
-        elsif @outside_domain
-          url.query_params['as_dt'] = 'e'
-          url.query_params['as_sitesearch'] = @outside_domain
+        if @with_words.kind_of?(Array)
+          expr << @with_words.join(' OR ')
        end
-
-
-
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
-        when Licenses::CC_BY_SA
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
-        when Licenses::CC_BY_ND
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
-        when Licenses::CC_BY
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
+
+        if @without_words.kind_of?(Array)
+          expr << @without_words.map { |word| "-#{word}" }.join(' ')
        end
 
-
-
-        if @similar_to
-          url.query_params['as_rq'] = @similar_to
-        elsif @links_to
-          url.query_params['as_lq'] = @links_to
+        if @numeric_range.kind_of?(Range)
+          expr << "#{@numeric_range.begin}..#{@numeric_range.end}"
        end
 
-        return url
-      end
-
-      #
-      # Returns the URL that represents the query at the specific
-      # _page_index_.
-      #
-      def page_url(page_index)
-        url = search_url
-
-        url.query_params['start'] = page_result_offset(page_index)
-        url.query_params['sa'] = 'N'
-
-        return url
-      end
-
-      #
-      # Returns a Page object containing Result objects at the specified
-      # _page_index_. If a _block_ is given, it will be passed the newly
-      # created Page.
-      #
-      def page(page_index,&block)
-        doc = get_page(page_url(page_index))
-
-        new_page = Page.new
-        results = doc.search('//div.g')[0...@results_per_page.to_i]
-
-        results.each_with_index do |result,index|
-          rank = page_result_offset(page_index) + (index + 1)
-          link = result.at('//a.l')
-          title = link.inner_text
-          url = link.get_attribute('href')
-          summary_text = ''
-          cached_url = nil
-          similar_url = nil
-
-          if (content = (result.at('//td.j//font|//td.j/div.sml')))
-            content.children.each do |elem|
-              break if (!(elem.text?) && elem.name=='br')
-
-              summary_text << elem.inner_text
-            end
-
-            if (cached_link = result.at('nobr/a:first'))
-              cached_url = cached_link.get_attribute('href')
-            end
-
-            if (similar_link = result.at('nobr/a:last'))
-              similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
-            end
-          end
-
-          new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
-        end
-
-        block.call(new_page) if block
-        return new_page
-      end
-
-      #
-      # Returns the Results on the first page. If a _block_ is given it
-      # will be passed the newly created Page.
-      #
-      def first_page(&block)
-        page(1,&block)
-      end
-
-      #
-      # Returns the Result at the specified _index_.
-      #
-      def result_at(index)
-        page(result_page_index(index))[page_result_index(index)]
-      end
-
-      #
-      # Returns the first Result on the first_page.
-      #
-      def top_result
-        result_at(1)
-      end
-
-      #
-      # Iterates over the results at the specified _page_index_, passing
-      # each to the given _block_.
-      #
-      #   query.each_on_page(2) do |result|
-      #     puts result.title
-      #   end
-      #
-      def each_on_page(page_index,&block)
-        page(page_index).each(&block)
-      end
-
-      #
-      # Iterates over the results on the first page, passing each to the
-      # given _block_.
-      #
-      #   query.each_on_first_page do |result|
-      #     puts result.url
-      #   end
-      #
-      def each_on_first_page(&block)
-        each_on_page(1,&block)
-      end
-
-      #
-      # Returns a SponsoredLinks object containing SponsoredAd objects of
-      # the query. If a _block_ is given, it will be passed the newly
-      # created Page.
-      #
-      def sponsored_links(&block)
-        doc = get_page(search_url)
-        new_links = SponsoredLinks.new
-
-        # top and side ads
-        doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
-          title = link.inner_text
-          url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
-
-          new_links << SponsoredAd.new(title,url)
-        end
-
-        block.call(new_links) if block
-        return new_links
-      end
-
-      #
-      # Returns the first sponsored link on the first page of results.
-      #
-      def top_sponsored_link
-        top_sponsored_links.first
-      end
-
-      #
-      # Iterates over the sponsored links on the first page of
-      # results passing each to the specified _block_.
-      #
-      def each_sponsored_link(&block)
-        sponsored_links.each(&block)
-      end
-
-      protected
-
-      #
-      # Returns the rank offset for the specified _page_index_.
-      #
-      def page_result_offset(page_index)
-        (page_index.to_i - 1) * @results_per_page.to_i
-      end
-
-      #
-      # Returns the in-Page index of the _result_index_.
-      #
-      def page_result_index(result_index)
-        (result_index.to_i - 1) % @results_per_page.to_i
-      end
-
-      #
-      # Returns the page index for the specified _result_index_
-      #
-      def result_page_index(result_index)
-        ((result_index.to_i - 1) / @results_per_page.to_i) + 1
+        return expr.join(' ')
      end
 
    end
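
With the scraping logic moved out to WebQuery and AJAXQuery, Query is now reduced to holding the search options and building the shared query expression. A small sketch of the new #expression method in use (the expected output is derived from the code shown above, not from gem documentation):

  require 'gscraper/search/query'

  query = GScraper::Search::Query.new(
    :query         => 'interfaces',
    :exact_phrase  => 'fluent interfaces',
    :without_words => ['java', 'c++']
  )

  query.expression   # => 'interfaces "fluent interfaces" -java -c++'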