gscraper 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +122 -0
- data/Gemfile +25 -0
- data/{README.txt → README.md} +25 -24
- data/Rakefile +32 -10
- data/gscraper.gemspec +112 -0
- data/lib/gscraper.rb +0 -2
- data/lib/gscraper/extensions.rb +0 -2
- data/lib/gscraper/extensions/uri.rb +0 -2
- data/lib/gscraper/extensions/uri/http.rb +0 -2
- data/lib/gscraper/extensions/uri/query_params.rb +18 -5
- data/lib/gscraper/gscraper.rb +61 -70
- data/lib/gscraper/has_pages.rb +76 -20
- data/lib/gscraper/licenses.rb +0 -2
- data/lib/gscraper/page.rb +45 -16
- data/lib/gscraper/search.rb +0 -2
- data/lib/gscraper/search/ajax_query.rb +75 -22
- data/lib/gscraper/search/page.rb +328 -122
- data/lib/gscraper/search/query.rb +100 -7
- data/lib/gscraper/search/result.rb +27 -6
- data/lib/gscraper/search/search.rb +59 -9
- data/lib/gscraper/search/web_query.rb +120 -37
- data/lib/gscraper/sponsored_ad.rb +19 -6
- data/lib/gscraper/sponsored_links.rb +260 -92
- data/lib/gscraper/version.rb +2 -3
- data/spec/extensions/uri/query_params_spec.rb +8 -0
- data/spec/gscraper_spec.rb +9 -4
- data/spec/has_pages_examples.rb +0 -2
- data/spec/has_sponsored_links_examples.rb +2 -1
- data/spec/helpers/query.rb +3 -1
- data/spec/helpers/uri.rb +6 -4
- data/spec/page_has_results_examples.rb +0 -2
- data/spec/search/ajax_query_spec.rb +6 -11
- data/spec/search/page_has_results_examples.rb +0 -2
- data/spec/search/web_query_spec.rb +6 -11
- data/spec/spec_helper.rb +10 -4
- metadata +147 -54
- data/History.txt +0 -101
- data/Manifest.txt +0 -38
- data/tasks/spec.rb +0 -9
data/lib/gscraper.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/search'
|
data/lib/gscraper/extensions.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/extensions/uri'
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/extensions/uri/http'
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/extensions/uri/query_params'
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,10 +16,15 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
21
|
+
require 'cgi'
|
22
|
+
|
23
23
|
module URI
|
24
|
+
#
|
25
|
+
# Adds the ability to parse individual parameters from the query field
|
26
|
+
# of a URI.
|
27
|
+
#
|
24
28
|
module QueryParams
|
25
29
|
# Query parameters
|
26
30
|
attr_reader :query_params
|
@@ -40,6 +44,16 @@ module URI
|
|
40
44
|
#
|
41
45
|
# Sets the query data and updates query_params.
|
42
46
|
#
|
47
|
+
# @param [String] query_str
|
48
|
+
# The new URI query string to use.
|
49
|
+
#
|
50
|
+
# @return [String]
|
51
|
+
# The new URI query string.
|
52
|
+
#
|
53
|
+
# @example
|
54
|
+
# url.query = 'a=1&b=2'
|
55
|
+
# # => "a=1&b=2"
|
56
|
+
#
|
43
57
|
def query=(query_str)
|
44
58
|
new_query = super(query_str)
|
45
59
|
parse_query_params
|
@@ -70,7 +84,6 @@ module URI
|
|
70
84
|
|
71
85
|
private
|
72
86
|
|
73
|
-
# :nodoc
|
74
87
|
def path_query
|
75
88
|
str = @path
|
76
89
|
|
@@ -80,9 +93,9 @@ module URI
|
|
80
93
|
"#{name}=active"
|
81
94
|
elsif value
|
82
95
|
if value.kind_of?(Array)
|
83
|
-
"#{name}=#{
|
96
|
+
"#{name}=#{CGI.escape(value.join(' '))}"
|
84
97
|
else
|
85
|
-
"#{name}=#{
|
98
|
+
"#{name}=#{CGI.escape(value.to_s)}"
|
86
99
|
end
|
87
100
|
else
|
88
101
|
"#{name}="
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'uri/http'
|
@@ -30,22 +28,31 @@ module GScraper
|
|
30
28
|
COMMON_PROXY_PORT = 8080
|
31
29
|
|
32
30
|
#
|
33
|
-
#
|
31
|
+
# The proxy information.
|
32
|
+
#
|
33
|
+
# @return [Hash]
|
34
34
|
#
|
35
35
|
def GScraper.proxy
|
36
36
|
@@gscraper_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
|
37
37
|
end
|
38
38
|
|
39
39
|
#
|
40
|
-
# Creates a HTTP URI
|
41
|
-
#
|
40
|
+
# Creates a HTTP URI for the current proxy.
|
41
|
+
#
|
42
|
+
# @param [Hash] proxy_info
|
43
|
+
# The proxy information.
|
44
|
+
#
|
45
|
+
# @option proxy_info [String] :host
|
46
|
+
# The proxy host.
|
47
|
+
#
|
48
|
+
# @option proxy_info [Integer] :port (COMMON_PROXY_PORT)
|
49
|
+
# The proxy port.
|
42
50
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
# <tt>:password</tt>:: The password to login with.
|
51
|
+
# @option proxy_info [String] :user
|
52
|
+
# The user-name to login as.
|
53
|
+
#
|
54
|
+
# @option proxy_info [String] :password
|
55
|
+
# The password to login with.
|
49
56
|
#
|
50
57
|
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
|
51
58
|
if GScraper.proxy[:host]
|
@@ -59,101 +66,85 @@ module GScraper
|
|
59
66
|
end
|
60
67
|
|
61
68
|
#
|
62
|
-
#
|
69
|
+
# The supported GScraper User-Agent Aliases.
|
70
|
+
#
|
71
|
+
# @return [Array<String>]
|
63
72
|
#
|
64
73
|
def GScraper.user_agent_aliases
|
65
|
-
|
74
|
+
Mechanize::AGENT_ALIASES
|
66
75
|
end
|
67
76
|
|
68
77
|
#
|
69
|
-
#
|
78
|
+
# The GScraper User-Agent.
|
79
|
+
#
|
80
|
+
# @return [String]
|
70
81
|
#
|
71
82
|
def GScraper.user_agent
|
72
83
|
@@gscraper_user_agent ||= GScraper.user_agent_aliases['Windows IE 6']
|
73
84
|
end
|
74
85
|
|
75
86
|
#
|
76
|
-
# Sets the GScraper User-Agent
|
87
|
+
# Sets the GScraper User-Agent.
|
88
|
+
#
|
89
|
+
# @param [String] agent
|
90
|
+
# The new User-Agent string.
|
91
|
+
#
|
92
|
+
# @return [String]
|
93
|
+
# The new User-Agent string.
|
77
94
|
#
|
78
95
|
def GScraper.user_agent=(agent)
|
79
96
|
@@gscraper_user_agent = agent
|
80
97
|
end
|
81
98
|
|
82
99
|
#
|
83
|
-
# Sets the GScraper User-Agent
|
84
|
-
#
|
100
|
+
# Sets the GScraper User-Agent.
|
101
|
+
#
|
102
|
+
# @param [String] name
|
103
|
+
# The User-Agent alias.
|
104
|
+
#
|
105
|
+
# @return [String]
|
106
|
+
# The new User-Agent string.
|
85
107
|
#
|
86
108
|
def GScraper.user_agent_alias=(name)
|
87
109
|
@@gscraper_user_agent = GScraper.user_agent_aliases[name.to_s]
|
88
110
|
end
|
89
111
|
|
90
112
|
#
|
91
|
-
#
|
92
|
-
# will be returned.
|
113
|
+
# Creates a new Mechanize agent.
|
93
114
|
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
# <tt>:user_agent</tt>:: The User-Agent String to use.
|
97
|
-
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
98
|
-
# contain the following keys:
|
99
|
-
# <tt>:host</tt>:: The proxy host.
|
100
|
-
# <tt>:port</tt>:: The proxy port.
|
101
|
-
# <tt>:user</tt>:: The user-name to login as.
|
102
|
-
# <tt>:password</tt>:: The password to login with.
|
115
|
+
# @param [Hash] options
|
116
|
+
# Additional options.
|
103
117
|
#
|
104
|
-
#
|
118
|
+
# @option options [String] :user_agent_alias
|
119
|
+
# The User-Agent Alias to use.
|
105
120
|
#
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# GScraper.open_uri('http://www.wired.com/',
|
109
|
-
# :user_agent => 'the future')
|
121
|
+
# @option options [String] :user_agent
|
122
|
+
# The User-Agent string to use.
|
110
123
|
#
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
if options[:user_agent_alias]
|
115
|
-
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[options[:user_agent_alias]]
|
116
|
-
elsif options[:user_agent]
|
117
|
-
headers['User-Agent'] = options[:user_agent]
|
118
|
-
elsif GScraper.user_agent
|
119
|
-
headers['User-Agent'] = GScraper.user_agent
|
120
|
-
end
|
121
|
-
|
122
|
-
proxy = (options[:proxy] || GScraper.proxy)
|
123
|
-
if proxy[:host]
|
124
|
-
headers[:proxy] = GScraper.proxy_uri(proxy)
|
125
|
-
end
|
126
|
-
|
127
|
-
return Kernel.open(uri,headers)
|
128
|
-
end
|
129
|
-
|
124
|
+
# @option options [Hash] :proxy
|
125
|
+
# The proxy information to use.
|
130
126
|
#
|
131
|
-
#
|
127
|
+
# @option :proxy [String] :host
|
128
|
+
# The proxy host.
|
132
129
|
#
|
133
|
-
|
134
|
-
|
135
|
-
end
|
136
|
-
|
130
|
+
# @option :proxy [Integer] :port
|
131
|
+
# The proxy port.
|
137
132
|
#
|
138
|
-
#
|
133
|
+
# @option :proxy [String] :user
|
134
|
+
# The user-name to login as.
|
139
135
|
#
|
140
|
-
#
|
141
|
-
#
|
142
|
-
# <tt>:user_agent</tt>:: The User-Agent string to use.
|
143
|
-
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
144
|
-
# contain the following keys:
|
145
|
-
# <tt>:host</tt>:: The proxy host.
|
146
|
-
# <tt>:port</tt>:: The proxy port.
|
147
|
-
# <tt>:user</tt>:: The user-name to login as.
|
148
|
-
# <tt>:password</tt>:: The password to login with.
|
136
|
+
# @option :proxy [String] :password
|
137
|
+
# The password to login with.
|
149
138
|
#
|
139
|
+
# @example
|
150
140
|
# GScraper.web_agent
|
151
141
|
#
|
142
|
+
# @example
|
152
143
|
# GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
|
153
144
|
# GScraper.web_agent(:user_agent => 'Google Bot')
|
154
145
|
#
|
155
|
-
def GScraper.web_agent(options={}
|
156
|
-
agent =
|
146
|
+
def GScraper.web_agent(options={})
|
147
|
+
agent = Mechanize.new
|
157
148
|
|
158
149
|
if options[:user_agent_alias]
|
159
150
|
agent.user_agent_alias = options[:user_agent_alias]
|
@@ -168,7 +159,7 @@ module GScraper
|
|
168
159
|
agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
|
169
160
|
end
|
170
161
|
|
171
|
-
|
162
|
+
yield agent if block_given?
|
172
163
|
return agent
|
173
164
|
end
|
174
165
|
end
|
data/lib/gscraper/has_pages.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,51 +16,87 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
21
|
+
require 'enumerator'
|
22
|
+
|
23
23
|
module GScraper
|
24
24
|
module HasPages
|
25
25
|
include Enumerable
|
26
26
|
|
27
27
|
#
|
28
|
-
#
|
28
|
+
# The first page.
|
29
|
+
#
|
30
|
+
# @return [Page]
|
31
|
+
# The first page.
|
29
32
|
#
|
30
33
|
def first_page
|
31
34
|
page_cache[1]
|
32
35
|
end
|
33
36
|
|
34
37
|
#
|
35
|
-
#
|
38
|
+
# The page at the specified index.
|
39
|
+
#
|
40
|
+
# @param [Integer] index
|
41
|
+
# The index.
|
42
|
+
#
|
43
|
+
# @return [Page]
|
44
|
+
# The page at the given index.
|
36
45
|
#
|
37
46
|
def [](index)
|
38
47
|
page_cache[index]
|
39
48
|
end
|
40
49
|
|
41
50
|
#
|
42
|
-
#
|
51
|
+
# The pages with the specified indices.
|
52
|
+
#
|
53
|
+
# @param [Array, Range] indices
|
54
|
+
# The indices.
|
55
|
+
#
|
56
|
+
# @return [Array<Page>]
|
57
|
+
# The pages at the given indices.
|
43
58
|
#
|
44
59
|
def pages(indices)
|
45
60
|
indices.map { |index| page_cache[index] }
|
46
61
|
end
|
47
62
|
|
48
63
|
#
|
49
|
-
# Iterates over the pages
|
50
|
-
# to the specified _block_.
|
64
|
+
# Iterates over the pages at the specified indices.
|
51
65
|
#
|
52
|
-
|
53
|
-
|
66
|
+
# @param [Array, Range] indices
|
67
|
+
# The indices.
|
68
|
+
#
|
69
|
+
# @yield [page]
|
70
|
+
# The given block will be passed each page.
|
71
|
+
#
|
72
|
+
# @yieldparam [Page] page
|
73
|
+
# A page at one of the given indices.
|
74
|
+
#
|
75
|
+
def each_page(indices)
|
76
|
+
unless block_given?
|
77
|
+
enum_for(:each_page,indices)
|
78
|
+
else
|
79
|
+
indices.map { |index| yield page_cache[index] }
|
80
|
+
end
|
54
81
|
end
|
55
82
|
|
56
83
|
#
|
57
|
-
# Iterates over all the pages of the query,
|
58
|
-
#
|
84
|
+
# Iterates over all the pages of the query, until an empty page is
|
85
|
+
# encountered.
|
86
|
+
#
|
87
|
+
# @yield [page]
|
88
|
+
# A page with results from the query.
|
59
89
|
#
|
60
|
-
|
90
|
+
# @yieldparam [Page] page
|
91
|
+
# A non-empty page from the query.
|
92
|
+
#
|
93
|
+
def each
|
94
|
+
return enum_for(:each) unless block_given?
|
95
|
+
|
61
96
|
index = 1
|
62
97
|
|
63
98
|
until ((next_page = page_cache[index]).empty?) do
|
64
|
-
|
99
|
+
yield next_page
|
65
100
|
index = index + 1
|
66
101
|
end
|
67
102
|
|
@@ -69,16 +104,20 @@ module GScraper
|
|
69
104
|
end
|
70
105
|
|
71
106
|
#
|
72
|
-
# Iterates over the elements on the page with the specified
|
73
|
-
#
|
107
|
+
# Iterates over the elements on the page with the specified index.
|
108
|
+
#
|
109
|
+
# @param [Integer] index
|
110
|
+
# The index to access.
|
74
111
|
#
|
75
112
|
def each_on_page(index,&block)
|
76
113
|
page_cache[index].each(&block)
|
77
114
|
end
|
78
115
|
|
79
116
|
#
|
80
|
-
# Iterates over each element on the pages with the specified
|
81
|
-
#
|
117
|
+
# Iterates over each element on the pages with the specified indices.
|
118
|
+
#
|
119
|
+
# @param [Array, Range] indices
|
120
|
+
# The indices to access.
|
82
121
|
#
|
83
122
|
def each_on_pages(indices,&block)
|
84
123
|
each_page(indices) { |page| page.each(&block) }
|
@@ -87,21 +126,36 @@ module GScraper
|
|
87
126
|
protected
|
88
127
|
|
89
128
|
#
|
90
|
-
#
|
129
|
+
# The page index for the specified result rank.
|
130
|
+
#
|
131
|
+
# @param [Integer] rank
|
132
|
+
# A result ranking.
|
133
|
+
#
|
134
|
+
# @return [Integer]
|
135
|
+
# The page index.
|
91
136
|
#
|
92
137
|
def page_index_of(rank)
|
93
138
|
(((rank.to_i - 1) / results_per_page.to_i) + 1)
|
94
139
|
end
|
95
140
|
|
96
141
|
#
|
97
|
-
#
|
142
|
+
# The rank offset for the specified page-index.
|
143
|
+
#
|
144
|
+
# @param [Integer] page_index
|
145
|
+
# The page index to compute the result offset for.
|
98
146
|
#
|
99
147
|
def result_offset_of(page_index)
|
100
148
|
((page_index.to_i - 1) * results_per_page.to_i)
|
101
149
|
end
|
102
150
|
|
103
151
|
#
|
104
|
-
#
|
152
|
+
# The in-page index of the specified result rank.
|
153
|
+
#
|
154
|
+
# @param [Integer] rank
|
155
|
+
# The result ranking.
|
156
|
+
#
|
157
|
+
# @return [Integer]
|
158
|
+
# The in-page index.
|
105
159
|
#
|
106
160
|
def result_index_of(rank)
|
107
161
|
((rank.to_i - 1) % results_per_page.to_i)
|
@@ -110,6 +164,8 @@ module GScraper
|
|
110
164
|
#
|
111
165
|
# The cache of previously requested pages.
|
112
166
|
#
|
167
|
+
# @return [Hash]
|
168
|
+
#
|
113
169
|
def page_cache
|
114
170
|
@page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
|
115
171
|
end
|