gscraper 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +122 -0
- data/Gemfile +25 -0
- data/{README.txt → README.md} +25 -24
- data/Rakefile +32 -10
- data/gscraper.gemspec +112 -0
- data/lib/gscraper.rb +0 -2
- data/lib/gscraper/extensions.rb +0 -2
- data/lib/gscraper/extensions/uri.rb +0 -2
- data/lib/gscraper/extensions/uri/http.rb +0 -2
- data/lib/gscraper/extensions/uri/query_params.rb +18 -5
- data/lib/gscraper/gscraper.rb +61 -70
- data/lib/gscraper/has_pages.rb +76 -20
- data/lib/gscraper/licenses.rb +0 -2
- data/lib/gscraper/page.rb +45 -16
- data/lib/gscraper/search.rb +0 -2
- data/lib/gscraper/search/ajax_query.rb +75 -22
- data/lib/gscraper/search/page.rb +328 -122
- data/lib/gscraper/search/query.rb +100 -7
- data/lib/gscraper/search/result.rb +27 -6
- data/lib/gscraper/search/search.rb +59 -9
- data/lib/gscraper/search/web_query.rb +120 -37
- data/lib/gscraper/sponsored_ad.rb +19 -6
- data/lib/gscraper/sponsored_links.rb +260 -92
- data/lib/gscraper/version.rb +2 -3
- data/spec/extensions/uri/query_params_spec.rb +8 -0
- data/spec/gscraper_spec.rb +9 -4
- data/spec/has_pages_examples.rb +0 -2
- data/spec/has_sponsored_links_examples.rb +2 -1
- data/spec/helpers/query.rb +3 -1
- data/spec/helpers/uri.rb +6 -4
- data/spec/page_has_results_examples.rb +0 -2
- data/spec/search/ajax_query_spec.rb +6 -11
- data/spec/search/page_has_results_examples.rb +0 -2
- data/spec/search/web_query_spec.rb +6 -11
- data/spec/spec_helper.rb +10 -4
- metadata +147 -54
- data/History.txt +0 -101
- data/Manifest.txt +0 -38
- data/tasks/spec.rb +0 -9
data/lib/gscraper.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/search'
|
data/lib/gscraper/extensions.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/extensions/uri'
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/extensions/uri/http'
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/extensions/uri/query_params'
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,10 +16,15 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
21
|
+
require 'cgi'
|
22
|
+
|
23
23
|
module URI
|
24
|
+
#
|
25
|
+
# Adds the ability to parse individual parameters from the query field
|
26
|
+
# of a URI.
|
27
|
+
#
|
24
28
|
module QueryParams
|
25
29
|
# Query parameters
|
26
30
|
attr_reader :query_params
|
@@ -40,6 +44,16 @@ module URI
|
|
40
44
|
#
|
41
45
|
# Sets the query data and updates query_params.
|
42
46
|
#
|
47
|
+
# @param [String] query_str
|
48
|
+
# The new URI query string to use.
|
49
|
+
#
|
50
|
+
# @return [String]
|
51
|
+
# The new URI query string.
|
52
|
+
#
|
53
|
+
# @example
|
54
|
+
# url.query = 'a=1&b=2'
|
55
|
+
# # => "a=1&b=2"
|
56
|
+
#
|
43
57
|
def query=(query_str)
|
44
58
|
new_query = super(query_str)
|
45
59
|
parse_query_params
|
@@ -70,7 +84,6 @@ module URI
|
|
70
84
|
|
71
85
|
private
|
72
86
|
|
73
|
-
# :nodoc
|
74
87
|
def path_query
|
75
88
|
str = @path
|
76
89
|
|
@@ -80,9 +93,9 @@ module URI
|
|
80
93
|
"#{name}=active"
|
81
94
|
elsif value
|
82
95
|
if value.kind_of?(Array)
|
83
|
-
"#{name}=#{
|
96
|
+
"#{name}=#{CGI.escape(value.join(' '))}"
|
84
97
|
else
|
85
|
-
"#{name}=#{
|
98
|
+
"#{name}=#{CGI.escape(value.to_s)}"
|
86
99
|
end
|
87
100
|
else
|
88
101
|
"#{name}="
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'uri/http'
|
@@ -30,22 +28,31 @@ module GScraper
|
|
30
28
|
COMMON_PROXY_PORT = 8080
|
31
29
|
|
32
30
|
#
|
33
|
-
#
|
31
|
+
# The proxy information.
|
32
|
+
#
|
33
|
+
# @return [Hash]
|
34
34
|
#
|
35
35
|
def GScraper.proxy
|
36
36
|
@@gscraper_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
|
37
37
|
end
|
38
38
|
|
39
39
|
#
|
40
|
-
# Creates a HTTP URI
|
41
|
-
#
|
40
|
+
# Creates a HTTP URI for the current proxy.
|
41
|
+
#
|
42
|
+
# @param [Hash] proxy_info
|
43
|
+
# The proxy information.
|
44
|
+
#
|
45
|
+
# @option proxy_info [String] :host
|
46
|
+
# The proxy host.
|
47
|
+
#
|
48
|
+
# @option proxy_info [Integer] :port (COMMON_PROXY_PORT)
|
49
|
+
# The proxy port.
|
42
50
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
# <tt>:password</tt>:: The password to login with.
|
51
|
+
# @option proxy_info [String] :user
|
52
|
+
# The user-name to login as.
|
53
|
+
#
|
54
|
+
# @option proxy_info [String] :password
|
55
|
+
# The password to login with.
|
49
56
|
#
|
50
57
|
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
|
51
58
|
if GScraper.proxy[:host]
|
@@ -59,101 +66,85 @@ module GScraper
|
|
59
66
|
end
|
60
67
|
|
61
68
|
#
|
62
|
-
#
|
69
|
+
# The supported GScraper User-Agent Aliases.
|
70
|
+
#
|
71
|
+
# @return [Hash{String => String}]
|
63
72
|
#
|
64
73
|
def GScraper.user_agent_aliases
|
65
|
-
|
74
|
+
Mechanize::AGENT_ALIASES
|
66
75
|
end
|
67
76
|
|
68
77
|
#
|
69
|
-
#
|
78
|
+
# The GScraper User-Agent.
|
79
|
+
#
|
80
|
+
# @return [String]
|
70
81
|
#
|
71
82
|
def GScraper.user_agent
|
72
83
|
@@gscraper_user_agent ||= GScraper.user_agent_aliases['Windows IE 6']
|
73
84
|
end
|
74
85
|
|
75
86
|
#
|
76
|
-
# Sets the GScraper User-Agent
|
87
|
+
# Sets the GScraper User-Agent.
|
88
|
+
#
|
89
|
+
# @param [String] agent
|
90
|
+
# The new User-Agent string.
|
91
|
+
#
|
92
|
+
# @return [String]
|
93
|
+
# The new User-Agent string.
|
77
94
|
#
|
78
95
|
def GScraper.user_agent=(agent)
|
79
96
|
@@gscraper_user_agent = agent
|
80
97
|
end
|
81
98
|
|
82
99
|
#
|
83
|
-
# Sets the GScraper User-Agent
|
84
|
-
#
|
100
|
+
# Sets the GScraper User-Agent.
|
101
|
+
#
|
102
|
+
# @param [String] name
|
103
|
+
# The User-Agent alias.
|
104
|
+
#
|
105
|
+
# @return [String]
|
106
|
+
# The new User-Agent string.
|
85
107
|
#
|
86
108
|
def GScraper.user_agent_alias=(name)
|
87
109
|
@@gscraper_user_agent = GScraper.user_agent_aliases[name.to_s]
|
88
110
|
end
|
89
111
|
|
90
112
|
#
|
91
|
-
#
|
92
|
-
# will be returned.
|
113
|
+
# Creates a new Mechanize agent.
|
93
114
|
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
# <tt>:user_agent</tt>:: The User-Agent String to use.
|
97
|
-
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
98
|
-
# contain the following keys:
|
99
|
-
# <tt>:host</tt>:: The proxy host.
|
100
|
-
# <tt>:port</tt>:: The proxy port.
|
101
|
-
# <tt>:user</tt>:: The user-name to login as.
|
102
|
-
# <tt>:password</tt>:: The password to login with.
|
115
|
+
# @param [Hash] options
|
116
|
+
# Additional options.
|
103
117
|
#
|
104
|
-
#
|
118
|
+
# @option options [String] :user_agent_alias
|
119
|
+
# The User-Agent Alias to use.
|
105
120
|
#
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# GScraper.open_uri('http://www.wired.com/',
|
109
|
-
# :user_agent => 'the future')
|
121
|
+
# @option options [String] :user_agent
|
122
|
+
# The User-Agent string to use.
|
110
123
|
#
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
if options[:user_agent_alias]
|
115
|
-
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[options[:user_agent_alias]]
|
116
|
-
elsif options[:user_agent]
|
117
|
-
headers['User-Agent'] = options[:user_agent]
|
118
|
-
elsif GScraper.user_agent
|
119
|
-
headers['User-Agent'] = GScraper.user_agent
|
120
|
-
end
|
121
|
-
|
122
|
-
proxy = (options[:proxy] || GScraper.proxy)
|
123
|
-
if proxy[:host]
|
124
|
-
headers[:proxy] = GScraper.proxy_uri(proxy)
|
125
|
-
end
|
126
|
-
|
127
|
-
return Kernel.open(uri,headers)
|
128
|
-
end
|
129
|
-
|
124
|
+
# @option options [Hash] :proxy
|
125
|
+
# The proxy information to use.
|
130
126
|
#
|
131
|
-
#
|
127
|
+
# @option :proxy [String] :host
|
128
|
+
# The proxy host.
|
132
129
|
#
|
133
|
-
|
134
|
-
|
135
|
-
end
|
136
|
-
|
130
|
+
# @option :proxy [Integer] :port
|
131
|
+
# The proxy port.
|
137
132
|
#
|
138
|
-
#
|
133
|
+
# @option :proxy [String] :user
|
134
|
+
# The user-name to login as.
|
139
135
|
#
|
140
|
-
#
|
141
|
-
#
|
142
|
-
# <tt>:user_agent</tt>:: The User-Agent string to use.
|
143
|
-
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
144
|
-
# contain the following keys:
|
145
|
-
# <tt>:host</tt>:: The proxy host.
|
146
|
-
# <tt>:port</tt>:: The proxy port.
|
147
|
-
# <tt>:user</tt>:: The user-name to login as.
|
148
|
-
# <tt>:password</tt>:: The password to login with.
|
136
|
+
# @option :proxy [String] :password
|
137
|
+
# The password to login with.
|
149
138
|
#
|
139
|
+
# @example
|
150
140
|
# GScraper.web_agent
|
151
141
|
#
|
142
|
+
# @example
|
152
143
|
# GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
|
153
144
|
# GScraper.web_agent(:user_agent => 'Google Bot')
|
154
145
|
#
|
155
|
-
def GScraper.web_agent(options={}
|
156
|
-
agent =
|
146
|
+
def GScraper.web_agent(options={})
|
147
|
+
agent = Mechanize.new
|
157
148
|
|
158
149
|
if options[:user_agent_alias]
|
159
150
|
agent.user_agent_alias = options[:user_agent_alias]
|
@@ -168,7 +159,7 @@ module GScraper
|
|
168
159
|
agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
|
169
160
|
end
|
170
161
|
|
171
|
-
|
162
|
+
yield agent if block_given?
|
172
163
|
return agent
|
173
164
|
end
|
174
165
|
end
|
data/lib/gscraper/has_pages.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,51 +16,87 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
21
|
+
require 'enumerator'
|
22
|
+
|
23
23
|
module GScraper
|
24
24
|
module HasPages
|
25
25
|
include Enumerable
|
26
26
|
|
27
27
|
#
|
28
|
-
#
|
28
|
+
# The first page.
|
29
|
+
#
|
30
|
+
# @return [Page]
|
31
|
+
# The first page.
|
29
32
|
#
|
30
33
|
def first_page
|
31
34
|
page_cache[1]
|
32
35
|
end
|
33
36
|
|
34
37
|
#
|
35
|
-
#
|
38
|
+
# The page at the specified index.
|
39
|
+
#
|
40
|
+
# @param [Integer] index
|
41
|
+
# The index.
|
42
|
+
#
|
43
|
+
# @return [Page]
|
44
|
+
# The page at the given index.
|
36
45
|
#
|
37
46
|
def [](index)
|
38
47
|
page_cache[index]
|
39
48
|
end
|
40
49
|
|
41
50
|
#
|
42
|
-
#
|
51
|
+
# The pages with the specified indices.
|
52
|
+
#
|
53
|
+
# @param [Array, Range] indices
|
54
|
+
# The indices.
|
55
|
+
#
|
56
|
+
# @return [Array<Page>]
|
57
|
+
# The pages at the given indices.
|
43
58
|
#
|
44
59
|
def pages(indices)
|
45
60
|
indices.map { |index| page_cache[index] }
|
46
61
|
end
|
47
62
|
|
48
63
|
#
|
49
|
-
# Iterates over the pages
|
50
|
-
# to the specified _block_.
|
64
|
+
# Iterates over the pages at the specified indices.
|
51
65
|
#
|
52
|
-
|
53
|
-
|
66
|
+
# @param [Array, Range] indices
|
67
|
+
# The indices.
|
68
|
+
#
|
69
|
+
# @yield [page]
|
70
|
+
# The given block will be passed each page.
|
71
|
+
#
|
72
|
+
# @yieldparam [Page] page
|
73
|
+
# A page at one of the given indices.
|
74
|
+
#
|
75
|
+
def each_page(indices)
|
76
|
+
unless block_given?
|
77
|
+
enum_for(:each_page,indices)
|
78
|
+
else
|
79
|
+
indices.map { |index| yield page_cache[index] }
|
80
|
+
end
|
54
81
|
end
|
55
82
|
|
56
83
|
#
|
57
|
-
# Iterates over all the pages of the query,
|
58
|
-
#
|
84
|
+
# Iterates over all the pages of the query, until an empty page is
|
85
|
+
# encountered.
|
86
|
+
#
|
87
|
+
# @yield [page]
|
88
|
+
# A page with results from the query.
|
59
89
|
#
|
60
|
-
|
90
|
+
# @yieldparam [Page] page
|
91
|
+
# A non-empty page from the query.
|
92
|
+
#
|
93
|
+
def each
|
94
|
+
return enum_for(:each) unless block_given?
|
95
|
+
|
61
96
|
index = 1
|
62
97
|
|
63
98
|
until ((next_page = page_cache[index]).empty?) do
|
64
|
-
|
99
|
+
yield next_page
|
65
100
|
index = index + 1
|
66
101
|
end
|
67
102
|
|
@@ -69,16 +104,20 @@ module GScraper
|
|
69
104
|
end
|
70
105
|
|
71
106
|
#
|
72
|
-
# Iterates over the elements on the page with the specified
|
73
|
-
#
|
107
|
+
# Iterates over the elements on the page with the specified index.
|
108
|
+
#
|
109
|
+
# @param [Integer] index
|
110
|
+
# The index to access.
|
74
111
|
#
|
75
112
|
def each_on_page(index,&block)
|
76
113
|
page_cache[index].each(&block)
|
77
114
|
end
|
78
115
|
|
79
116
|
#
|
80
|
-
# Iterates over each element on the pages with the specified
|
81
|
-
#
|
117
|
+
# Iterates over each element on the pages with the specified indices.
|
118
|
+
#
|
119
|
+
# @param [Array, Range] indices
|
120
|
+
# The indices to access.
|
82
121
|
#
|
83
122
|
def each_on_pages(indices,&block)
|
84
123
|
each_page(indices) { |page| page.each(&block) }
|
@@ -87,21 +126,36 @@ module GScraper
|
|
87
126
|
protected
|
88
127
|
|
89
128
|
#
|
90
|
-
#
|
129
|
+
# The page index for the specified result rank.
|
130
|
+
#
|
131
|
+
# @param [Integer] rank
|
132
|
+
# A result ranking.
|
133
|
+
#
|
134
|
+
# @return [Integer]
|
135
|
+
# The page index.
|
91
136
|
#
|
92
137
|
def page_index_of(rank)
|
93
138
|
(((rank.to_i - 1) / results_per_page.to_i) + 1)
|
94
139
|
end
|
95
140
|
|
96
141
|
#
|
97
|
-
#
|
142
|
+
# The rank offset for the specified page-index.
|
143
|
+
#
|
144
|
+
# @param [Integer] page_index
|
145
|
+
# The index of the page whose starting result offset is computed.
|
98
146
|
#
|
99
147
|
def result_offset_of(page_index)
|
100
148
|
((page_index.to_i - 1) * results_per_page.to_i)
|
101
149
|
end
|
102
150
|
|
103
151
|
#
|
104
|
-
#
|
152
|
+
# The in-page index of the specified result rank.
|
153
|
+
#
|
154
|
+
# @param [Integer] rank
|
155
|
+
# The result ranking.
|
156
|
+
#
|
157
|
+
# @return [Integer]
|
158
|
+
# The in-page index.
|
105
159
|
#
|
106
160
|
def result_index_of(rank)
|
107
161
|
((rank.to_i - 1) % results_per_page.to_i)
|
@@ -110,6 +164,8 @@ module GScraper
|
|
110
164
|
#
|
111
165
|
# The cache of previously requested pages.
|
112
166
|
#
|
167
|
+
# @return [Hash]
|
168
|
+
#
|
113
169
|
def page_cache
|
114
170
|
@page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
|
115
171
|
end
|