gscraper 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,8 +18,6 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'enumerator'
22
-
23
21
  module GScraper
24
22
  module HasPages
25
23
  include Enumerable
@@ -0,0 +1,158 @@
1
+ #
2
+ # GScraper - A web-scraping interface to various Google Services.
3
+ #
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation; either version 2 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, write to the Free Software
18
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+ #
20
+
21
module GScraper
  #
  # Catalogue of the Google domain names known to GScraper.
  #
  # @api semipublic
  #
  # @since 0.4.0
  #
  module Hosts
    # List of Google domain names.
    #
    # Every domain appears exactly once (the list previously repeated
    # `google.pl` and `google.at`), and the Array is frozen so that the
    # shared constant cannot be modified by accident at runtime.
    DOMAINS = %w[
      google.com
      google.de
      google.at
      google.pl
      google.fr
      google.nl
      google.it
      google.com.tr
      google.es
      google.ch
      google.be
      google.gr
      google.com.br
      google.lu
      google.fi
      google.pt
      google.hu
      google.hr
      google.bg
      google.com.mx
      google.si
      google.sk
      google.ro
      google.ca
      google.co.uk
      google.cl
      google.com.ar
      google.se
      google.cz
      google.dk
      google.co.th
      google.com.co
      google.lt
      google.co.id
      google.co.in
      google.co.il
      google.com.eg
      google.cn
      google.co.ve
      google.ru
      google.co.jp
      google.com.pe
      google.com.au
      google.co.ma
      google.co.za
      google.com.ph
      google.com.sa
      google.ie
      google.co.kr
      google.no
      google.com.ec
      google.com.vn
      google.lv
      google.com.mt
      google.com.uy
      google.ae
      google.ba
      google.co.nz
      google.com.ua
      google.co.cr
      google.ee
      google.com.do
      google.com.tw
      google.com.hk
      google.com.my
      google.com.sv
      google.com.pr
      google.lk
      google.com.gt
      google.com.bd
      google.com.pk
      google.is
      google.li
      google.com.bh
      google.com.ni
      google.com.py
      google.com.ng
      google.com.bo
      google.co.ke
      google.hn
      google.com.sg
      google.mu
      google.ci
      google.jo
      google.nu
      google.com.jm
      google.com.ly
      google.co.yu
      google.tt
      google.com.kh
      google.ge
      google.com.na
      google.com.et
      google.sm
      google.cd
      google.gm
      google.com.qa
      google.dj
      google.com.cu
      google.com.pa
      google.gp
      google.az
      google.as
      google.mn
      google.ht
      google.md
      google.am
      google.sn
      google.je
      google.com.bn
      google.com.ai
      google.co.zm
      google.ma
      google.rw
      google.co.ug
      google.com.vc
      google.com.gi
      google.to
      google.com.om
      google.kz
      google.co.uz
    ].freeze

    # The primary domain: the first entry of DOMAINS.
    PRIMARY_DOMAIN = DOMAINS.first
  end
end
@@ -0,0 +1,110 @@
1
+ #
2
+ # GScraper - A web-scraping interface to various Google Services.
3
+ #
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation; either version 2 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, write to the Free Software
18
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+ #
20
+
21
module GScraper
  #
  # Maps locale strings onto the language codes listed in NAMES.
  #
  # @api semipublic
  #
  # @since 0.3.0
  #
  module Languages
    # The list of recognized language codes. Frozen so the shared constant
    # cannot be modified by accident at runtime.
    NAMES = %w[
      af
      ar
      be
      bg
      ca
      cs
      da
      de
      el
      en
      eo
      es
      et
      fa
      fi
      fr
      hi
      hr
      hu
      hy
      id
      is
      it
      iw
      ja
      ko
      lt
      lv
      nl
      no
      pl
      pt
      ro
      ru
      sk
      sl
      sr
      sv
      sw
      th
      tl
      tr
      uk
      vi
      zh-CN
      zh-TW
    ].freeze

    #
    # Looks up the language for the given locale.
    #
    # @param [String, Symbol, nil] locale
    #   A locale, such as `en_US.UTF-8`. The value is converted with
    #   `to_s` first, so `nil` is treated as an empty locale.
    #
    # @return [String, nil]
    #   The language code used by the locale, or `nil` when the locale
    #   does not map onto an entry of NAMES.
    #
    def self.find(locale)
      locale = locale.to_s

      # The two Chinese variants use a hyphenated code that cannot be
      # derived by simply cutting the locale at the first `_`.
      return 'zh-CN' if locale =~ /^zh_CN/
      return 'zh-TW' if locale =~ /^zh_TW/

      # Keep the part before the territory (`_XX`) or modifier (`@xx`).
      match = locale.match(/^([^_@]+)([_@].+)?$/)

      match[1] if match && NAMES.include?(match[1])
    end

    #
    # Determines the native language from the `LANG` environment variable.
    #
    # @return [String]
    #   The native language. Falls back to `'en'` when `LANG` is unset,
    #   empty, or holds a value that {find} cannot map (for example `C`
    #   or `POSIX`), so that callers always receive a usable code.
    #
    def self.native
      find(ENV['LANG']) || 'en'
    end
  end
end
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -19,6 +19,9 @@
19
19
  #
20
20
 
21
21
  module GScraper
22
+ #
23
+ # @api semipublic
24
+ #
22
25
  module Licenses
23
26
  # Any desired license
24
27
  ANY = nil
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,8 +18,6 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'enumerator'
22
-
23
21
  module GScraper
24
22
  class Page < Array
25
23
 
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -21,11 +21,11 @@
21
21
  require 'gscraper/search/result'
22
22
  require 'gscraper/search/page'
23
23
  require 'gscraper/search/query'
24
- require 'gscraper/extensions/uri'
25
24
  require 'gscraper/has_pages'
26
25
  require 'gscraper/gscraper'
27
26
 
28
27
  require 'json'
28
+ require 'uri/query_params'
29
29
  require 'nokogiri'
30
30
 
31
31
  module GScraper
@@ -40,14 +40,11 @@ module GScraper
40
40
  # Maximum results per-page
41
41
  RESULTS_PER_PAGE = 8
42
42
 
43
- # AJAX API host
44
- API_HOST = 'www.google.com'
43
+ # AJAX API Path
44
+ PATH = '/uds/GwebSearch'
45
45
 
46
- # AJAX API URL
47
- API_URL = "http://#{API_HOST}/uds/GwebSearch?callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large"
48
-
49
- # Default language
50
- DEFAULT_LANGUAGE = 'en'
46
+ # AJAX API Query string
47
+ QUERY = 'callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large'
51
48
 
52
49
  # Default signature
53
50
  DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'
@@ -58,9 +55,6 @@ module GScraper
58
55
  # Default version
59
56
  DEFAULT_VERSION = '1.0'
60
57
 
61
- # The search language
62
- attr_accessor :language
63
-
64
58
  # The search signature
65
59
  attr_accessor :sig
66
60
 
@@ -76,13 +70,16 @@ module GScraper
76
70
  # @param [Hash] options
77
71
  # Query options.
78
72
  #
79
- # @option options [Symbol] :language (:en)
73
+ # @option options [String] :search_host (www.google.com)
74
+ # The host to submit queries to.
75
+ #
76
+ # @option options [String, Symbol] :language (Languages.native)
80
77
  # The search language.
81
78
  #
82
79
  # @option options [String] :sig ('582c1116317355adf613a6a843f19ece')
83
80
  # The search signature.
84
81
  #
85
- # @option options [Symbol] :key (:notsupplied)
82
+ # @option options [String, Symbol] :key ('notsupplied')
86
83
  # The search key.
87
84
  #
88
85
  # @option options [Float] :version (1.0)
@@ -97,11 +94,9 @@ module GScraper
97
94
  def initialize(options={},&block)
98
95
  @agent = GScraper.web_agent(options)
99
96
 
100
- @language = (options[:language] || DEFAULT_LANGUAGE)
101
-
102
- @sig = (options[:sig] || DEFAULT_SIG)
103
- @key = (options[:key] || DEFAULT_KEY)
104
- @version = (options[:version] || DEFAULT_VERSION)
97
+ @sig = options.fetch(:sig,DEFAULT_SIG)
98
+ @key = options.fetch(:key,DEFAULT_KEY)
99
+ @version = options.fetch(:version,DEFAULT_VERSION)
105
100
 
106
101
  super(options,&block)
107
102
  end
@@ -130,13 +125,13 @@ module GScraper
130
125
  url = URI(url.to_s)
131
126
 
132
127
  options[:language] = url.query_params['hl']
133
- options[:query] = url.query_params['q']
128
+ options[:query] = url.query_params['q']
134
129
 
135
- options[:sig] = url.query_params['sig']
136
- options[:key] = url.query_params['key']
130
+ options[:sig] = url.query_params['sig']
131
+ options[:key] = url.query_params['key']
137
132
  options[:version] = url.query_params['v']
138
133
 
139
- return self.new(options,&block)
134
+ return AJAXQuery.new(options,&block)
140
135
  end
141
136
 
142
137
  #
@@ -158,14 +153,18 @@ module GScraper
158
153
  # The URL for the query.
159
154
  #
160
155
  def search_url
161
- search_url = URI(API_URL)
156
+ search_url = URI::HTTP.build(
157
+ :host => search_host,
158
+ :path => PATH,
159
+ :query => QUERY
160
+ )
162
161
 
163
- search_url.query_params['hl'] = @language
162
+ search_url.query_params['hl'] = @language
164
163
  search_url.query_params['gss'] = '.com'
165
- search_url.query_params['q'] = expression
164
+ search_url.query_params['q'] = expression
166
165
  search_url.query_params['sig'] = @sig
167
166
  search_url.query_params['key'] = @key
168
- search_url.query_params['v'] = @version
167
+ search_url.query_params['v'] = @version
169
168
 
170
169
  return search_url
171
170
  end
@@ -207,15 +206,15 @@ module GScraper
207
206
 
208
207
  if (hash.kind_of?(Hash) && hash['results'])
209
208
  hash['results'].each_with_index do |result,index|
210
- rank = rank_offset + (index + 1)
209
+ rank = rank_offset + (index + 1)
211
210
  title = Nokogiri::HTML(result['title']).inner_text
212
- url = URI(URI.escape(result['unescapedUrl']))
211
+ url = URI(URI.escape(result['unescapedUrl']))
213
212
 
214
- unless result['content'].empty?
215
- summary = Nokogiri::HTML(result['content']).inner_text
216
- else
217
- summary = ''
218
- end
213
+ summary = unless result['content'].empty?
214
+ Nokogiri::HTML(result['content']).inner_text
215
+ else
216
+ ''
217
+ end
219
218
 
220
219
  cached_url = URI(result['cacheUrl'])
221
220