gscraper 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,8 +18,6 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'enumerator'
22
-
23
21
  module GScraper
24
22
  module HasPages
25
23
  include Enumerable
@@ -0,0 +1,158 @@
1
+ #
2
+ # GScraper - A web-scraping interface to various Google Services.
3
+ #
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation; either version 2 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, write to the Free Software
18
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+ #
20
+
21
+ module GScraper
22
+ #
23
+ # @api semipublic
24
+ #
25
+ # @since 0.4.0
26
+ #
27
+ module Hosts
28
+ # List of all google domain-names.
29
+ DOMAINS = %w[
30
+ google.com
31
+ google.de
32
+ google.at
33
+ google.pl
34
+ google.fr
35
+ google.nl
36
+ google.it
37
+ google.com.tr
38
+ google.es
39
+ google.ch
40
+ google.be
41
+ google.gr
42
+ google.com.br
43
+ google.lu
44
+ google.fi
45
+ google.pt
46
+ google.hu
47
+ google.hr
48
+ google.bg
49
+ google.com.mx
50
+ google.si
51
+ google.sk
52
+ google.ro
53
+ google.ca
54
+ google.co.uk
55
+ google.cl
56
+ google.com.ar
57
+ google.se
58
+ google.cz
59
+ google.dk
60
+ google.co.th
61
+ google.com.co
62
+ google.lt
63
+ google.co.id
64
+ google.co.in
65
+ google.co.il
66
+ google.com.eg
67
+ google.cn
68
+ google.co.ve
69
+ google.ru
70
+ google.co.jp
71
+ google.com.pe
72
+ google.com.au
73
+ google.co.ma
74
+ google.co.za
75
+ google.com.ph
76
+ google.com.sa
77
+ google.ie
78
+ google.co.kr
79
+ google.no
80
+ google.com.ec
81
+ google.com.vn
82
+ google.lv
83
+ google.com.mt
84
+ google.com.uy
85
+ google.ae
86
+ google.ba
87
+ google.co.nz
88
+ google.com.ua
89
+ google.co.cr
90
+ google.ee
91
+ google.com.do
92
+ google.com.tw
93
+ google.com.hk
94
+ google.com.my
95
+ google.com.sv
96
+ google.com.pr
97
+ google.lk
98
+ google.com.gt
99
+ google.com.bd
100
+ google.com.pk
101
+ google.is
102
+ google.li
103
+ google.com.bh
104
+ google.com.ni
105
+ google.com.py
106
+ google.com.ng
107
+ google.com.bo
108
+ google.co.ke
109
+ google.hn
110
+ google.com.sg
111
+ google.mu
112
+ google.ci
113
+ google.jo
114
+ google.nu
115
+ google.com.jm
116
+ google.com.ly
117
+ google.co.yu
118
+ google.tt
119
+ google.com.kh
120
+ google.ge
121
+ google.com.na
122
+ google.com.et
123
+ google.sm
124
+ google.cd
125
+ google.gm
126
+ google.com.qa
127
+ google.dj
128
+ google.com.cu
129
+ google.com.pa
130
+ google.gp
131
+ google.az
132
+ google.as
133
+ google.pl
134
+ google.mn
135
+ google.ht
136
+ google.md
137
+ google.am
138
+ google.sn
139
+ google.je
140
+ google.com.bn
141
+ google.com.ai
142
+ google.co.zm
143
+ google.ma
144
+ google.rw
145
+ google.co.ug
146
+ google.com.vc
147
+ google.at
148
+ google.com.gi
149
+ google.to
150
+ google.com.om
151
+ google.kz
152
+ google.co.uz
153
+ ]
154
+
155
+ # The primary domain
156
+ PRIMARY_DOMAIN = DOMAINS.first
157
+ end
158
+ end
@@ -0,0 +1,110 @@
1
+ #
2
+ # GScraper - A web-scraping interface to various Google Services.
3
+ #
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation; either version 2 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, write to the Free Software
18
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+ #
20
+
21
+ module GScraper
22
+ #
23
+ # @api semipublic
24
+ #
25
+ # @since 0.3.0
26
+ #
27
+ module Languages
28
+ # The list of language names
29
+ NAMES = %w[
30
+ af
31
+ ar
32
+ be
33
+ bg
34
+ ca
35
+ cs
36
+ da
37
+ de
38
+ el
39
+ en
40
+ eo
41
+ es
42
+ et
43
+ fa
44
+ fi
45
+ fr
46
+ hi
47
+ hr
48
+ hu
49
+ hy
50
+ id
51
+ is
52
+ it
53
+ iw
54
+ ja
55
+ ko
56
+ lt
57
+ lv
58
+ nl
59
+ no
60
+ pl
61
+ pt
62
+ ro
63
+ ru
64
+ sk
65
+ sl
66
+ sr
67
+ sv
68
+ sw
69
+ th
70
+ tl
71
+ tr
72
+ uk
73
+ vi
74
+ zh-CN
75
+ zh-TW
76
+ ]
77
+
78
+ #
79
+ # Looks up the language for the given locale.
80
+ #
81
+ # @param [String] locale
82
+ # A locale.
83
+ #
84
+ # @return [String]
85
+ # The language used by the locale.
86
+ #
87
+ def Languages.find(locale)
88
+ if locale =~ /^zh_CN/
89
+ 'zh-CN'
90
+ elsif locale =~ /^zh_TW/
91
+ 'zh-TW'
92
+ else
93
+ if (match = locale.match(/^([^_@]+)([_@].+)?$/))
94
+ match[1] if (match[1] && NAMES.include?(match[1]))
95
+ end
96
+ end
97
+ end
98
+
99
+ #
100
+ # Determines the native language.
101
+ #
102
+ # @return [String]
103
+ # The native language.
104
+ #
105
+ def Languages.native
106
+ language = ENV['LANG'] || 'en'
107
+ Languages.find(language)
108
+ end
109
+ end
110
+ end
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -19,6 +19,9 @@
19
19
  #
20
20
 
21
21
  module GScraper
22
+ #
23
+ # @api semipublic
24
+ #
22
25
  module Licenses
23
26
  # Any desired license
24
27
  ANY = nil
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,8 +18,6 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'enumerator'
22
-
23
21
  module GScraper
24
22
  class Page < Array
25
23
 
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -21,11 +21,11 @@
21
21
  require 'gscraper/search/result'
22
22
  require 'gscraper/search/page'
23
23
  require 'gscraper/search/query'
24
- require 'gscraper/extensions/uri'
25
24
  require 'gscraper/has_pages'
26
25
  require 'gscraper/gscraper'
27
26
 
28
27
  require 'json'
28
+ require 'uri/query_params'
29
29
  require 'nokogiri'
30
30
 
31
31
  module GScraper
@@ -40,14 +40,11 @@ module GScraper
40
40
  # Maximum results per-page
41
41
  RESULTS_PER_PAGE = 8
42
42
 
43
- # AJAX API host
44
- API_HOST = 'www.google.com'
43
+ # AJAX API Path
44
+ PATH = '/uds/GwebSearch'
45
45
 
46
- # AJAX API URL
47
- API_URL = "http://#{API_HOST}/uds/GwebSearch?callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large"
48
-
49
- # Default language
50
- DEFAULT_LANGUAGE = 'en'
46
+ # AJAX API Query string
47
+ QUERY = 'callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large'
51
48
 
52
49
  # Default signature
53
50
  DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'
@@ -58,9 +55,6 @@ module GScraper
58
55
  # Default version
59
56
  DEFAULT_VERSION = '1.0'
60
57
 
61
- # The search language
62
- attr_accessor :language
63
-
64
58
  # The search signature
65
59
  attr_accessor :sig
66
60
 
@@ -76,13 +70,16 @@ module GScraper
76
70
  # @param [Hash] options
77
71
  # Query options.
78
72
  #
79
- # @option options [Symbol] :language (:en)
73
+ # @option options [String] :search_host (www.google.com)
74
+ # The host to submit queries to.
75
+ #
76
+ # @option options [String, Symbol] :language (Languages.native)
80
77
  # The search language.
81
78
  #
82
79
  # @option options [String] :sig ('582c1116317355adf613a6a843f19ece')
83
80
  # The search signature.
84
81
  #
85
- # @option options [Symbol] :key (:notsupplied)
82
+ # @option options [String, Symbol] :key ('notsupplied')
86
83
  # The search key.
87
84
  #
88
85
  # @option options [Float] :version (1.0)
@@ -97,11 +94,9 @@ module GScraper
97
94
  def initialize(options={},&block)
98
95
  @agent = GScraper.web_agent(options)
99
96
 
100
- @language = (options[:language] || DEFAULT_LANGUAGE)
101
-
102
- @sig = (options[:sig] || DEFAULT_SIG)
103
- @key = (options[:key] || DEFAULT_KEY)
104
- @version = (options[:version] || DEFAULT_VERSION)
97
+ @sig = options.fetch(:sig,DEFAULT_SIG)
98
+ @key = options.fetch(:key,DEFAULT_KEY)
99
+ @version = options.fetch(:version,DEFAULT_VERSION)
105
100
 
106
101
  super(options,&block)
107
102
  end
@@ -130,13 +125,13 @@ module GScraper
130
125
  url = URI(url.to_s)
131
126
 
132
127
  options[:language] = url.query_params['hl']
133
- options[:query] = url.query_params['q']
128
+ options[:query] = url.query_params['q']
134
129
 
135
- options[:sig] = url.query_params['sig']
136
- options[:key] = url.query_params['key']
130
+ options[:sig] = url.query_params['sig']
131
+ options[:key] = url.query_params['key']
137
132
  options[:version] = url.query_params['v']
138
133
 
139
- return self.new(options,&block)
134
+ return AJAXQuery.new(options,&block)
140
135
  end
141
136
 
142
137
  #
@@ -158,14 +153,18 @@ module GScraper
158
153
  # The URL for the query.
159
154
  #
160
155
  def search_url
161
- search_url = URI(API_URL)
156
+ search_url = URI::HTTP.build(
157
+ :host => search_host,
158
+ :path => PATH,
159
+ :query => QUERY
160
+ )
162
161
 
163
- search_url.query_params['hl'] = @language
162
+ search_url.query_params['hl'] = @language
164
163
  search_url.query_params['gss'] = '.com'
165
- search_url.query_params['q'] = expression
164
+ search_url.query_params['q'] = expression
166
165
  search_url.query_params['sig'] = @sig
167
166
  search_url.query_params['key'] = @key
168
- search_url.query_params['v'] = @version
167
+ search_url.query_params['v'] = @version
169
168
 
170
169
  return search_url
171
170
  end
@@ -207,15 +206,15 @@ module GScraper
207
206
 
208
207
  if (hash.kind_of?(Hash) && hash['results'])
209
208
  hash['results'].each_with_index do |result,index|
210
- rank = rank_offset + (index + 1)
209
+ rank = rank_offset + (index + 1)
211
210
  title = Nokogiri::HTML(result['title']).inner_text
212
- url = URI(URI.escape(result['unescapedUrl']))
211
+ url = URI(URI.escape(result['unescapedUrl']))
213
212
 
214
- unless result['content'].empty?
215
- summary = Nokogiri::HTML(result['content']).inner_text
216
- else
217
- summary = ''
218
- end
213
+ summary = unless result['content'].empty?
214
+ Nokogiri::HTML(result['content']).inner_text
215
+ else
216
+ ''
217
+ end
219
218
 
220
219
  cached_url = URI(result['cacheUrl'])
221
220