gscraper 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
data/lib/gscraper/has_pages.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,8 +18,6 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'enumerator'
|
22
|
-
|
23
21
|
module GScraper
|
24
22
|
module HasPages
|
25
23
|
include Enumerable
|
data/lib/gscraper/hosts.rb
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
#
|
2
|
+
# GScraper - A web-scraping interface to various Google Services.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# This program is free software; you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation; either version 2 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
+
#
|
20
|
+
|
21
|
+
module GScraper
|
22
|
+
#
|
23
|
+
# @api semipublic
|
24
|
+
#
|
25
|
+
# @since 0.4.0
|
26
|
+
#
|
27
|
+
module Hosts
|
28
|
+
# List of all google domain-names.
|
29
|
+
DOMAINS = %w[
|
30
|
+
google.com
|
31
|
+
google.de
|
32
|
+
google.at
|
33
|
+
google.pl
|
34
|
+
google.fr
|
35
|
+
google.nl
|
36
|
+
google.it
|
37
|
+
google.com.tr
|
38
|
+
google.es
|
39
|
+
google.ch
|
40
|
+
google.be
|
41
|
+
google.gr
|
42
|
+
google.com.br
|
43
|
+
google.lu
|
44
|
+
google.fi
|
45
|
+
google.pt
|
46
|
+
google.hu
|
47
|
+
google.hr
|
48
|
+
google.bg
|
49
|
+
google.com.mx
|
50
|
+
google.si
|
51
|
+
google.sk
|
52
|
+
google.ro
|
53
|
+
google.ca
|
54
|
+
google.co.uk
|
55
|
+
google.cl
|
56
|
+
google.com.ar
|
57
|
+
google.se
|
58
|
+
google.cz
|
59
|
+
google.dk
|
60
|
+
google.co.th
|
61
|
+
google.com.co
|
62
|
+
google.lt
|
63
|
+
google.co.id
|
64
|
+
google.co.in
|
65
|
+
google.co.il
|
66
|
+
google.com.eg
|
67
|
+
google.cn
|
68
|
+
google.co.ve
|
69
|
+
google.ru
|
70
|
+
google.co.jp
|
71
|
+
google.com.pe
|
72
|
+
google.com.au
|
73
|
+
google.co.ma
|
74
|
+
google.co.za
|
75
|
+
google.com.ph
|
76
|
+
google.com.sa
|
77
|
+
google.ie
|
78
|
+
google.co.kr
|
79
|
+
google.no
|
80
|
+
google.com.ec
|
81
|
+
google.com.vn
|
82
|
+
google.lv
|
83
|
+
google.com.mt
|
84
|
+
google.com.uy
|
85
|
+
google.ae
|
86
|
+
google.ba
|
87
|
+
google.co.nz
|
88
|
+
google.com.ua
|
89
|
+
google.co.cr
|
90
|
+
google.ee
|
91
|
+
google.com.do
|
92
|
+
google.com.tw
|
93
|
+
google.com.hk
|
94
|
+
google.com.my
|
95
|
+
google.com.sv
|
96
|
+
google.com.pr
|
97
|
+
google.lk
|
98
|
+
google.com.gt
|
99
|
+
google.com.bd
|
100
|
+
google.com.pk
|
101
|
+
google.is
|
102
|
+
google.li
|
103
|
+
google.com.bh
|
104
|
+
google.com.ni
|
105
|
+
google.com.py
|
106
|
+
google.com.ng
|
107
|
+
google.com.bo
|
108
|
+
google.co.ke
|
109
|
+
google.hn
|
110
|
+
google.com.sg
|
111
|
+
google.mu
|
112
|
+
google.ci
|
113
|
+
google.jo
|
114
|
+
google.nu
|
115
|
+
google.com.jm
|
116
|
+
google.com.ly
|
117
|
+
google.co.yu
|
118
|
+
google.tt
|
119
|
+
google.com.kh
|
120
|
+
google.ge
|
121
|
+
google.com.na
|
122
|
+
google.com.et
|
123
|
+
google.sm
|
124
|
+
google.cd
|
125
|
+
google.gm
|
126
|
+
google.com.qa
|
127
|
+
google.dj
|
128
|
+
google.com.cu
|
129
|
+
google.com.pa
|
130
|
+
google.gp
|
131
|
+
google.az
|
132
|
+
google.as
|
133
|
+
google.pl
|
134
|
+
google.mn
|
135
|
+
google.ht
|
136
|
+
google.md
|
137
|
+
google.am
|
138
|
+
google.sn
|
139
|
+
google.je
|
140
|
+
google.com.bn
|
141
|
+
google.com.ai
|
142
|
+
google.co.zm
|
143
|
+
google.ma
|
144
|
+
google.rw
|
145
|
+
google.co.ug
|
146
|
+
google.com.vc
|
147
|
+
google.at
|
148
|
+
google.com.gi
|
149
|
+
google.to
|
150
|
+
google.com.om
|
151
|
+
google.kz
|
152
|
+
google.co.uz
|
153
|
+
]
|
154
|
+
|
155
|
+
# The primary domain
|
156
|
+
PRIMARY_DOMAIN = DOMAINS.first
|
157
|
+
end
|
158
|
+
end
|
data/lib/gscraper/languages.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
#
|
2
|
+
# GScraper - A web-scraping interface to various Google Services.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# This program is free software; you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation; either version 2 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
+
#
|
20
|
+
|
21
|
+
module GScraper
|
22
|
+
#
|
23
|
+
# @api semipublic
|
24
|
+
#
|
25
|
+
# @since 0.3.0
|
26
|
+
#
|
27
|
+
module Languages
|
28
|
+
# The list of language names
|
29
|
+
NAMES = %w[
|
30
|
+
af
|
31
|
+
ar
|
32
|
+
be
|
33
|
+
bg
|
34
|
+
ca
|
35
|
+
cs
|
36
|
+
da
|
37
|
+
de
|
38
|
+
el
|
39
|
+
en
|
40
|
+
eo
|
41
|
+
es
|
42
|
+
et
|
43
|
+
fa
|
44
|
+
fi
|
45
|
+
fr
|
46
|
+
hi
|
47
|
+
hr
|
48
|
+
hu
|
49
|
+
hy
|
50
|
+
id
|
51
|
+
is
|
52
|
+
it
|
53
|
+
iw
|
54
|
+
ja
|
55
|
+
ko
|
56
|
+
lt
|
57
|
+
lv
|
58
|
+
nl
|
59
|
+
no
|
60
|
+
pl
|
61
|
+
pt
|
62
|
+
ro
|
63
|
+
ru
|
64
|
+
sk
|
65
|
+
sl
|
66
|
+
sr
|
67
|
+
sv
|
68
|
+
sw
|
69
|
+
th
|
70
|
+
tl
|
71
|
+
tr
|
72
|
+
uk
|
73
|
+
vi
|
74
|
+
zh-CN
|
75
|
+
zh-TW
|
76
|
+
]
|
77
|
+
|
78
|
+
#
|
79
|
+
# Looks up the language for the given locale.
|
80
|
+
#
|
81
|
+
# @param [String] locale
|
82
|
+
# A locale.
|
83
|
+
#
|
84
|
+
# @return [String]
|
85
|
+
# The language used by the locale.
|
86
|
+
#
|
87
|
+
def Languages.find(locale)
|
88
|
+
if locale =~ /^zh_CN/
|
89
|
+
'zh-CN'
|
90
|
+
elsif locale =~ /^zh_TW/
|
91
|
+
'zh-TW'
|
92
|
+
else
|
93
|
+
if (match = locale.match(/^([^_@]+)([_@].+)?$/))
|
94
|
+
match[1] if (match[1] && NAMES.include?(match[1]))
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
#
|
100
|
+
# Determines the native language.
|
101
|
+
#
|
102
|
+
# @return [String]
|
103
|
+
# The native language.
|
104
|
+
#
|
105
|
+
def Languages.native
|
106
|
+
language = ENV['LANG'] || 'en'
|
107
|
+
Languages.find(language)
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -19,6 +19,9 @@
|
|
19
19
|
#
|
20
20
|
|
21
21
|
module GScraper
|
22
|
+
#
|
23
|
+
# @api semipublic
|
24
|
+
#
|
22
25
|
module Licenses
|
23
26
|
# Any desired license
|
24
27
|
ANY = nil
|
data/lib/gscraper/page.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,8 +18,6 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'enumerator'
|
22
|
-
|
23
21
|
module GScraper
|
24
22
|
class Page < Array
|
25
23
|
|
data/lib/gscraper/search.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
data/lib/gscraper/search/ajax_query.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -21,11 +21,11 @@
|
|
21
21
|
require 'gscraper/search/result'
|
22
22
|
require 'gscraper/search/page'
|
23
23
|
require 'gscraper/search/query'
|
24
|
-
require 'gscraper/extensions/uri'
|
25
24
|
require 'gscraper/has_pages'
|
26
25
|
require 'gscraper/gscraper'
|
27
26
|
|
28
27
|
require 'json'
|
28
|
+
require 'uri/query_params'
|
29
29
|
require 'nokogiri'
|
30
30
|
|
31
31
|
module GScraper
|
@@ -40,14 +40,11 @@ module GScraper
|
|
40
40
|
# Maximum results per-page
|
41
41
|
RESULTS_PER_PAGE = 8
|
42
42
|
|
43
|
-
# AJAX API
|
44
|
-
|
43
|
+
# AJAX API Path
|
44
|
+
PATH = '/uds/GwebSearch'
|
45
45
|
|
46
|
-
# AJAX API
|
47
|
-
|
48
|
-
|
49
|
-
# Default language
|
50
|
-
DEFAULT_LANGUAGE = 'en'
|
46
|
+
# AJAX API Query string
|
47
|
+
QUERY = 'callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large'
|
51
48
|
|
52
49
|
# Default signature
|
53
50
|
DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'
|
@@ -58,9 +55,6 @@ module GScraper
|
|
58
55
|
# Default version
|
59
56
|
DEFAULT_VERSION = '1.0'
|
60
57
|
|
61
|
-
# The search language
|
62
|
-
attr_accessor :language
|
63
|
-
|
64
58
|
# The search signature
|
65
59
|
attr_accessor :sig
|
66
60
|
|
@@ -76,13 +70,16 @@ module GScraper
|
|
76
70
|
# @param [Hash] options
|
77
71
|
# Query options.
|
78
72
|
#
|
79
|
-
# @option options [
|
73
|
+
# @option options [String] :search_host (www.google.com)
|
74
|
+
# The host to submit queries to.
|
75
|
+
#
|
76
|
+
# @option options [String, Symbol] :language (Languages.native)
|
80
77
|
# The search language.
|
81
78
|
#
|
82
79
|
# @option options [String] :sig ('582c1116317355adf613a6a843f19ece')
|
83
80
|
# The search signature.
|
84
81
|
#
|
85
|
-
# @option options [Symbol] :key (
|
82
|
+
# @option options [String, Symbol] :key ('notsupplied')
|
86
83
|
# The search key.
|
87
84
|
#
|
88
85
|
# @option options [Float] :version (1.0)
|
@@ -97,11 +94,9 @@ module GScraper
|
|
97
94
|
def initialize(options={},&block)
|
98
95
|
@agent = GScraper.web_agent(options)
|
99
96
|
|
100
|
-
@
|
101
|
-
|
102
|
-
@
|
103
|
-
@key = (options[:key] || DEFAULT_KEY)
|
104
|
-
@version = (options[:version] || DEFAULT_VERSION)
|
97
|
+
@sig = options.fetch(:sig,DEFAULT_SIG)
|
98
|
+
@key = options.fetch(:key,DEFAULT_KEY)
|
99
|
+
@version = options.fetch(:version,DEFAULT_VERSION)
|
105
100
|
|
106
101
|
super(options,&block)
|
107
102
|
end
|
@@ -130,13 +125,13 @@ module GScraper
|
|
130
125
|
url = URI(url.to_s)
|
131
126
|
|
132
127
|
options[:language] = url.query_params['hl']
|
133
|
-
options[:query]
|
128
|
+
options[:query] = url.query_params['q']
|
134
129
|
|
135
|
-
options[:sig]
|
136
|
-
options[:key]
|
130
|
+
options[:sig] = url.query_params['sig']
|
131
|
+
options[:key] = url.query_params['key']
|
137
132
|
options[:version] = url.query_params['v']
|
138
133
|
|
139
|
-
return
|
134
|
+
return AJAXQuery.new(options,&block)
|
140
135
|
end
|
141
136
|
|
142
137
|
#
|
@@ -158,14 +153,18 @@ module GScraper
|
|
158
153
|
# The URL for the query.
|
159
154
|
#
|
160
155
|
def search_url
|
161
|
-
search_url = URI(
|
156
|
+
search_url = URI::HTTP.build(
|
157
|
+
:host => search_host,
|
158
|
+
:path => PATH,
|
159
|
+
:query => QUERY
|
160
|
+
)
|
162
161
|
|
163
|
-
search_url.query_params['hl']
|
162
|
+
search_url.query_params['hl'] = @language
|
164
163
|
search_url.query_params['gss'] = '.com'
|
165
|
-
search_url.query_params['q']
|
164
|
+
search_url.query_params['q'] = expression
|
166
165
|
search_url.query_params['sig'] = @sig
|
167
166
|
search_url.query_params['key'] = @key
|
168
|
-
search_url.query_params['v']
|
167
|
+
search_url.query_params['v'] = @version
|
169
168
|
|
170
169
|
return search_url
|
171
170
|
end
|
@@ -207,15 +206,15 @@ module GScraper
|
|
207
206
|
|
208
207
|
if (hash.kind_of?(Hash) && hash['results'])
|
209
208
|
hash['results'].each_with_index do |result,index|
|
210
|
-
rank
|
209
|
+
rank = rank_offset + (index + 1)
|
211
210
|
title = Nokogiri::HTML(result['title']).inner_text
|
212
|
-
url
|
211
|
+
url = URI(URI.escape(result['unescapedUrl']))
|
213
212
|
|
214
|
-
unless result['content'].empty?
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
213
|
+
summary = unless result['content'].empty?
|
214
|
+
Nokogiri::HTML(result['content']).inner_text
|
215
|
+
else
|
216
|
+
''
|
217
|
+
end
|
219
218
|
|
220
219
|
cached_url = URI(result['cacheUrl'])
|
221
220
|
|