gscraper 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
data/lib/gscraper/has_pages.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,8 +18,6 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'enumerator'
|
22
|
-
|
23
21
|
module GScraper
|
24
22
|
module HasPages
|
25
23
|
include Enumerable
|
@@ -0,0 +1,158 @@
|
|
1
|
+
#
|
2
|
+
# GScraper - A web-scraping interface to various Google Services.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# This program is free software; you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation; either version 2 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
+
#
|
20
|
+
|
21
|
+
module GScraper
  #
  # Catalogue of the Google search domain-names known to GScraper.
  #
  # @api semipublic
  #
  # @since 0.4.0
  #
  module Hosts
    # List of all google domain-names.
    #
    # Every domain appears exactly once (earlier revisions listed
    # `google.pl` and `google.at` twice). The list is frozen so that it
    # cannot be modified by accident at runtime.
    DOMAINS = %w[
      google.com
      google.de
      google.at
      google.pl
      google.fr
      google.nl
      google.it
      google.com.tr
      google.es
      google.ch
      google.be
      google.gr
      google.com.br
      google.lu
      google.fi
      google.pt
      google.hu
      google.hr
      google.bg
      google.com.mx
      google.si
      google.sk
      google.ro
      google.ca
      google.co.uk
      google.cl
      google.com.ar
      google.se
      google.cz
      google.dk
      google.co.th
      google.com.co
      google.lt
      google.co.id
      google.co.in
      google.co.il
      google.com.eg
      google.cn
      google.co.ve
      google.ru
      google.co.jp
      google.com.pe
      google.com.au
      google.co.ma
      google.co.za
      google.com.ph
      google.com.sa
      google.ie
      google.co.kr
      google.no
      google.com.ec
      google.com.vn
      google.lv
      google.com.mt
      google.com.uy
      google.ae
      google.ba
      google.co.nz
      google.com.ua
      google.co.cr
      google.ee
      google.com.do
      google.com.tw
      google.com.hk
      google.com.my
      google.com.sv
      google.com.pr
      google.lk
      google.com.gt
      google.com.bd
      google.com.pk
      google.is
      google.li
      google.com.bh
      google.com.ni
      google.com.py
      google.com.ng
      google.com.bo
      google.co.ke
      google.hn
      google.com.sg
      google.mu
      google.ci
      google.jo
      google.nu
      google.com.jm
      google.com.ly
      google.co.yu
      google.tt
      google.com.kh
      google.ge
      google.com.na
      google.com.et
      google.sm
      google.cd
      google.gm
      google.com.qa
      google.dj
      google.com.cu
      google.com.pa
      google.gp
      google.az
      google.as
      google.mn
      google.ht
      google.md
      google.am
      google.sn
      google.je
      google.com.bn
      google.com.ai
      google.co.zm
      google.ma
      google.rw
      google.co.ug
      google.com.vc
      google.com.gi
      google.to
      google.com.om
      google.kz
      google.co.uz
    ].freeze

    # The primary domain (the first entry of {DOMAINS}).
    PRIMARY_DOMAIN = DOMAINS.first
  end
end
|
@@ -0,0 +1,110 @@
|
|
1
|
+
#
|
2
|
+
# GScraper - A web-scraping interface to various Google Services.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# This program is free software; you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation; either version 2 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
+
#
|
20
|
+
|
21
|
+
module GScraper
  #
  # Maps system locale names onto the language codes in {NAMES}.
  #
  # @api semipublic
  #
  # @since 0.3.0
  #
  module Languages
    # The list of language names
    NAMES = %w[
      af
      ar
      be
      bg
      ca
      cs
      da
      de
      el
      en
      eo
      es
      et
      fa
      fi
      fr
      hi
      hr
      hu
      hy
      id
      is
      it
      iw
      ja
      ko
      lt
      lv
      nl
      no
      pl
      pt
      ro
      ru
      sk
      sl
      sr
      sv
      sw
      th
      tl
      tr
      uk
      vi
      zh-CN
      zh-TW
    ].freeze

    #
    # Looks up the language for the given locale.
    #
    # The locale is expected in the POSIX form
    # `language[_territory][.codeset][@modifier]`, e.g. `en_US.UTF-8`.
    #
    # @param [String, Symbol, nil] locale
    #   A locale.
    #
    # @return [String, nil]
    #   The language used by the locale, or `nil` if the locale does not
    #   map to one of the {NAMES}.
    #
    def self.find(locale)
      # Normalize first, so nil and Symbol arguments do not raise.
      locale = locale.to_s

      case locale
      # \A anchors to the start of the whole string; ^ would also match
      # at the start of any later line.
      when /\Azh_CN/ then 'zh-CN'
      when /\Azh_TW/ then 'zh-TW'
      else
        # Everything before the first territory (_), codeset (.) or
        # modifier (@) separator is the language part.
        language = locale[/\A[^_.@]+/]

        language if NAMES.include?(language)
      end
    end

    #
    # Determines the native language.
    #
    # Reads the `LANG` environment variable. Falls back to `'en'` when it
    # is unset, empty, or names a locale without a supported language
    # (such as `C` or `POSIX`).
    #
    # @return [String]
    #   The native language.
    #
    def self.native
      find(ENV['LANG']) || 'en'
    end
  end
end
|
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -19,6 +19,9 @@
|
|
19
19
|
#
|
20
20
|
|
21
21
|
module GScraper
|
22
|
+
#
|
23
|
+
# @api semipublic
|
24
|
+
#
|
22
25
|
module Licenses
|
23
26
|
# Any desired license
|
24
27
|
ANY = nil
|
data/lib/gscraper/page.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,8 +18,6 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'enumerator'
|
22
|
-
|
23
21
|
module GScraper
|
24
22
|
class Page < Array
|
25
23
|
|
data/lib/gscraper/search.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -21,11 +21,11 @@
|
|
21
21
|
require 'gscraper/search/result'
|
22
22
|
require 'gscraper/search/page'
|
23
23
|
require 'gscraper/search/query'
|
24
|
-
require 'gscraper/extensions/uri'
|
25
24
|
require 'gscraper/has_pages'
|
26
25
|
require 'gscraper/gscraper'
|
27
26
|
|
28
27
|
require 'json'
|
28
|
+
require 'uri/query_params'
|
29
29
|
require 'nokogiri'
|
30
30
|
|
31
31
|
module GScraper
|
@@ -40,14 +40,11 @@ module GScraper
|
|
40
40
|
# Maximum results per-page
|
41
41
|
RESULTS_PER_PAGE = 8
|
42
42
|
|
43
|
-
# AJAX API
|
44
|
-
|
43
|
+
# AJAX API Path
|
44
|
+
PATH = '/uds/GwebSearch'
|
45
45
|
|
46
|
-
# AJAX API
|
47
|
-
|
48
|
-
|
49
|
-
# Default language
|
50
|
-
DEFAULT_LANGUAGE = 'en'
|
46
|
+
# AJAX API Query string
|
47
|
+
QUERY = 'callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large'
|
51
48
|
|
52
49
|
# Default signature
|
53
50
|
DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'
|
@@ -58,9 +55,6 @@ module GScraper
|
|
58
55
|
# Default version
|
59
56
|
DEFAULT_VERSION = '1.0'
|
60
57
|
|
61
|
-
# The search language
|
62
|
-
attr_accessor :language
|
63
|
-
|
64
58
|
# The search signature
|
65
59
|
attr_accessor :sig
|
66
60
|
|
@@ -76,13 +70,16 @@ module GScraper
|
|
76
70
|
# @param [Hash] options
|
77
71
|
# Query options.
|
78
72
|
#
|
79
|
-
# @option options [
|
73
|
+
# @option options [String] :search_host (www.google.com)
|
74
|
+
# The host to submit queries to.
|
75
|
+
#
|
76
|
+
# @option options [String, Symbol] :language (Languages.native)
|
80
77
|
# The search language.
|
81
78
|
#
|
82
79
|
# @option options [String] :sig ('582c1116317355adf613a6a843f19ece')
|
83
80
|
# The search signature.
|
84
81
|
#
|
85
|
-
# @option options [Symbol] :key (
|
82
|
+
# @option options [String, Symbol] :key ('notsupplied')
|
86
83
|
# The search key.
|
87
84
|
#
|
88
85
|
# @option options [Float] :version (1.0)
|
@@ -97,11 +94,9 @@ module GScraper
|
|
97
94
|
def initialize(options={},&block)
|
98
95
|
@agent = GScraper.web_agent(options)
|
99
96
|
|
100
|
-
@
|
101
|
-
|
102
|
-
@
|
103
|
-
@key = (options[:key] || DEFAULT_KEY)
|
104
|
-
@version = (options[:version] || DEFAULT_VERSION)
|
97
|
+
@sig = options.fetch(:sig,DEFAULT_SIG)
|
98
|
+
@key = options.fetch(:key,DEFAULT_KEY)
|
99
|
+
@version = options.fetch(:version,DEFAULT_VERSION)
|
105
100
|
|
106
101
|
super(options,&block)
|
107
102
|
end
|
@@ -130,13 +125,13 @@ module GScraper
|
|
130
125
|
url = URI(url.to_s)
|
131
126
|
|
132
127
|
options[:language] = url.query_params['hl']
|
133
|
-
options[:query]
|
128
|
+
options[:query] = url.query_params['q']
|
134
129
|
|
135
|
-
options[:sig]
|
136
|
-
options[:key]
|
130
|
+
options[:sig] = url.query_params['sig']
|
131
|
+
options[:key] = url.query_params['key']
|
137
132
|
options[:version] = url.query_params['v']
|
138
133
|
|
139
|
-
return
|
134
|
+
return AJAXQuery.new(options,&block)
|
140
135
|
end
|
141
136
|
|
142
137
|
#
|
@@ -158,14 +153,18 @@ module GScraper
|
|
158
153
|
# The URL for the query.
|
159
154
|
#
|
160
155
|
def search_url
|
161
|
-
search_url = URI(
|
156
|
+
search_url = URI::HTTP.build(
|
157
|
+
:host => search_host,
|
158
|
+
:path => PATH,
|
159
|
+
:query => QUERY
|
160
|
+
)
|
162
161
|
|
163
|
-
search_url.query_params['hl']
|
162
|
+
search_url.query_params['hl'] = @language
|
164
163
|
search_url.query_params['gss'] = '.com'
|
165
|
-
search_url.query_params['q']
|
164
|
+
search_url.query_params['q'] = expression
|
166
165
|
search_url.query_params['sig'] = @sig
|
167
166
|
search_url.query_params['key'] = @key
|
168
|
-
search_url.query_params['v']
|
167
|
+
search_url.query_params['v'] = @version
|
169
168
|
|
170
169
|
return search_url
|
171
170
|
end
|
@@ -207,15 +206,15 @@ module GScraper
|
|
207
206
|
|
208
207
|
if (hash.kind_of?(Hash) && hash['results'])
|
209
208
|
hash['results'].each_with_index do |result,index|
|
210
|
-
rank
|
209
|
+
rank = rank_offset + (index + 1)
|
211
210
|
title = Nokogiri::HTML(result['title']).inner_text
|
212
|
-
url
|
211
|
+
url = URI(URI.escape(result['unescapedUrl']))
|
213
212
|
|
214
|
-
unless result['content'].empty?
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
213
|
+
summary = unless result['content'].empty?
|
214
|
+
Nokogiri::HTML(result['content']).inner_text
|
215
|
+
else
|
216
|
+
''
|
217
|
+
end
|
219
218
|
|
220
219
|
cached_url = URI(result['cacheUrl'])
|
221
220
|
|