gscraper 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. data/.gitignore +8 -0
  2. data/.specopts +1 -0
  3. data/.yardopts +1 -0
  4. data/ChangeLog.md +122 -0
  5. data/Gemfile +25 -0
  6. data/{README.txt → README.md} +25 -24
  7. data/Rakefile +32 -10
  8. data/gscraper.gemspec +112 -0
  9. data/lib/gscraper.rb +0 -2
  10. data/lib/gscraper/extensions.rb +0 -2
  11. data/lib/gscraper/extensions/uri.rb +0 -2
  12. data/lib/gscraper/extensions/uri/http.rb +0 -2
  13. data/lib/gscraper/extensions/uri/query_params.rb +18 -5
  14. data/lib/gscraper/gscraper.rb +61 -70
  15. data/lib/gscraper/has_pages.rb +76 -20
  16. data/lib/gscraper/licenses.rb +0 -2
  17. data/lib/gscraper/page.rb +45 -16
  18. data/lib/gscraper/search.rb +0 -2
  19. data/lib/gscraper/search/ajax_query.rb +75 -22
  20. data/lib/gscraper/search/page.rb +328 -122
  21. data/lib/gscraper/search/query.rb +100 -7
  22. data/lib/gscraper/search/result.rb +27 -6
  23. data/lib/gscraper/search/search.rb +59 -9
  24. data/lib/gscraper/search/web_query.rb +120 -37
  25. data/lib/gscraper/sponsored_ad.rb +19 -6
  26. data/lib/gscraper/sponsored_links.rb +260 -92
  27. data/lib/gscraper/version.rb +2 -3
  28. data/spec/extensions/uri/query_params_spec.rb +8 -0
  29. data/spec/gscraper_spec.rb +9 -4
  30. data/spec/has_pages_examples.rb +0 -2
  31. data/spec/has_sponsored_links_examples.rb +2 -1
  32. data/spec/helpers/query.rb +3 -1
  33. data/spec/helpers/uri.rb +6 -4
  34. data/spec/page_has_results_examples.rb +0 -2
  35. data/spec/search/ajax_query_spec.rb +6 -11
  36. data/spec/search/page_has_results_examples.rb +0 -2
  37. data/spec/search/web_query_spec.rb +6 -11
  38. data/spec/spec_helper.rb +10 -4
  39. metadata +147 -54
  40. data/History.txt +0 -101
  41. data/Manifest.txt +0 -38
  42. data/tasks/spec.rb +0 -9
--- data/lib/gscraper.rb
+++ data/lib/gscraper.rb
@@ -1,5 +1,4 @@
 #
-#--
 # GScraper - A web-scraping interface to various Google Services.
 #
 # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#++
 #
 
 require 'gscraper/search'
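Note: the `#--`/`#++` pairs deleted throughout these license headers are RDoc markers that hide the comment text between them from generated documentation. Dropping them fits this release's move to YARD (see the new data/.yardopts file in the list above, and the `@param`/`@return` tags introduced below). A minimal sketch of the old marker semantics:

    # This line appears in generated RDoc output.
    #--
    # These lines are hidden from the generated documentation.
    #++
    # Documentation resumes here.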
--- data/lib/gscraper/extensions.rb
+++ data/lib/gscraper/extensions.rb
@@ -1,5 +1,4 @@
 #
-#--
 # GScraper - A web-scraping interface to various Google Services.
 #
 # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#++
 #
 
 require 'gscraper/extensions/uri'
--- data/lib/gscraper/extensions/uri.rb
+++ data/lib/gscraper/extensions/uri.rb
@@ -1,5 +1,4 @@
 #
-#--
 # GScraper - A web-scraping interface to various Google Services.
 #
 # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#++
 #
 
 require 'gscraper/extensions/uri/http'
--- data/lib/gscraper/extensions/uri/http.rb
+++ data/lib/gscraper/extensions/uri/http.rb
@@ -1,5 +1,4 @@
 #
-#--
 # GScraper - A web-scraping interface to various Google Services.
 #
 # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#++
 #
 
 require 'gscraper/extensions/uri/query_params'
--- data/lib/gscraper/extensions/uri/query_params.rb
+++ data/lib/gscraper/extensions/uri/query_params.rb
@@ -1,5 +1,4 @@
 #
-#--
 # GScraper - A web-scraping interface to various Google Services.
 #
 # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,10 +16,15 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#++
 #
 
+require 'cgi'
+
 module URI
+  #
+  # Adds the ability to parse individual parameters from the query field
+  # of a URI.
+  #
   module QueryParams
     # Query parameters
     attr_reader :query_params
@@ -40,6 +44,16 @@ module URI
     #
     # Sets the query data and updates query_params.
     #
+    # @param [String] query_str
+    #   The new URI query string to use.
+    #
+    # @return [String]
+    #   The new URI query string.
+    #
+    # @example
+    #   url.query = 'a=1&b=2'
+    #   # => "a=1&b=2"
+    #
     def query=(query_str)
       new_query = super(query_str)
       parse_query_params
@@ -70,7 +84,6 @@ module URI
 
     private
 
-    # :nodoc
     def path_query
       str = @path
 
@@ -80,9 +93,9 @@ module URI
           "#{name}=active"
         elsif value
           if value.kind_of?(Array)
-            "#{name}=#{URI.encode(value.join(' '))}"
+            "#{name}=#{CGI.escape(value.join(' '))}"
           else
-            "#{name}=#{URI.encode(value.to_s)}"
+            "#{name}=#{CGI.escape(value.to_s)}"
          end
         else
           "#{name}="
--- data/lib/gscraper/gscraper.rb
+++ data/lib/gscraper/gscraper.rb
@@ -1,5 +1,4 @@
 #
-#--
 # GScraper - A web-scraping interface to various Google Services.
 #
 # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#++
 #
 
 require 'uri/http'
@@ -30,22 +28,31 @@ module GScraper
   COMMON_PROXY_PORT = 8080
 
   #
-  # Returns the +Hash+ of proxy information.
+  # The proxy information.
+  #
+  # @return [Hash]
   #
   def GScraper.proxy
     @@gscraper_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
   end
 
   #
-  # Creates a HTTP URI based from the given _proxy_info_ hash. The
-  # _proxy_info_ hash defaults to Web.proxy, if not given.
+  # Creates a HTTP URI for the current proxy.
+  #
+  # @param [Hash] proxy_info
+  #   The proxy information.
+  #
+  # @option proxy_info [String] :host
+  #   The proxy host.
+  #
+  # @option proxy_info [Integer] :port (COMMON_PROXY_PORT)
+  #   The proxy port.
   #
-  # _proxy_info_ may contain the following keys:
-  # <tt>:host</tt>:: The proxy host.
-  # <tt>:port</tt>:: The proxy port. Defaults to COMMON_PROXY_PORT,
-  #                  if not specified.
-  # <tt>:user</tt>:: The user-name to login as.
-  # <tt>:password</tt>:: The password to login with.
+  # @option proxy_info [String] :user
+  #   The user-name to login as.
+  #
+  # @option proxy_info [String] :password
+  #   The password to login with.
   #
   def GScraper.proxy_uri(proxy_info=GScraper.proxy)
     if GScraper.proxy[:host]
@@ -59,101 +66,85 @@ module GScraper
   end
 
   #
-  # Returns the supported GScraper User-Agent Aliases.
+  # The supported GScraper User-Agent Aliases.
+  #
+  # @return [Array<String>]
   #
   def GScraper.user_agent_aliases
-    WWW::Mechanize::AGENT_ALIASES
+    Mechanize::AGENT_ALIASES
   end
 
   #
-  # Returns the GScraper User-Agent
+  # The GScraper User-Agent.
+  #
+  # @return [String]
   #
   def GScraper.user_agent
     @@gscraper_user_agent ||= GScraper.user_agent_aliases['Windows IE 6']
   end
 
   #
-  # Sets the GScraper User-Agent to the specified _agent_.
+  # Sets the GScraper User-Agent.
+  #
+  # @param [String] agent
+  #   The new User-Agent string.
+  #
+  # @return [String]
+  #   The new User-Agent string.
   #
   def GScraper.user_agent=(agent)
     @@gscraper_user_agent = agent
   end
 
   #
-  # Sets the GScraper User-Agent using the specified user-agent alias
-  # _name_.
+  # Sets the GScraper User-Agent.
+  #
+  # @param [String] name
+  #   The User-Agent alias.
+  #
+  # @return [String]
+  #   The new User-Agent string.
   #
   def GScraper.user_agent_alias=(name)
     @@gscraper_user_agent = GScraper.user_agent_aliases[name.to_s]
   end
 
   #
-  # Opens the _uri_ with the given _options_. The contents of the _uri_
-  # will be returned.
+  # Creates a new Mechanize agent.
   #
-  # _options_ may contain the following keys:
-  # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
-  # <tt>:user_agent</tt>:: The User-Agent String to use.
-  # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
-  #                   contain the following keys:
-  #                   <tt>:host</tt>:: The proxy host.
-  #                   <tt>:port</tt>:: The proxy port.
-  #                   <tt>:user</tt>:: The user-name to login as.
-  #                   <tt>:password</tt>:: The password to login with.
+  # @param [Hash] options
+  #   Additional options.
   #
-  #   GScraper.open_uri('http://www.hackety.org/')
+  # @option options [String] :user_agent_alias
+  #   The User-Agent Alias to use.
   #
-  #   GScraper.open_uri('http://tenderlovemaking.com/',
-  #                     :user_agent_alias => 'Linux Mozilla')
-  #   GScraper.open_uri('http://www.wired.com/',
-  #                     :user_agent => 'the future')
+  # @option options [String] :user_agent
+  #   The User-Agent string to use.
   #
-  def GScraper.open_uri(uri,options={})
-    headers = {}
-
-    if options[:user_agent_alias]
-      headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[options[:user_agent_alias]]
-    elsif options[:user_agent]
-      headers['User-Agent'] = options[:user_agent]
-    elsif GScraper.user_agent
-      headers['User-Agent'] = GScraper.user_agent
-    end
-
-    proxy = (options[:proxy] || GScraper.proxy)
-    if proxy[:host]
-      headers[:proxy] = GScraper.proxy_uri(proxy)
-    end
-
-    return Kernel.open(uri,headers)
-  end
-
+  # @option options [Hash] :proxy
+  #   The proxy information to use.
   #
-  # Similar to GScraper.open_uri but returns a Nokogiri::HTML document.
+  # @option :proxy [String] :host
+  #   The proxy host.
   #
-  def GScraper.open_page(uri,options={})
-    Nokogiri::HTML(GScraper.open_uri(uri,options))
-  end
-
+  # @option :proxy [Integer] :port
+  #   The proxy port.
   #
-  # Creates a new WWW::Mechanize agent with the given _options_.
+  # @option :proxy [String] :user
+  #   The user-name to login as.
   #
-  # _options_ may contain the following keys:
-  # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
-  # <tt>:user_agent</tt>:: The User-Agent string to use.
-  # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
-  #                   contain the following keys:
-  #                   <tt>:host</tt>:: The proxy host.
-  #                   <tt>:port</tt>:: The proxy port.
-  #                   <tt>:user</tt>:: The user-name to login as.
-  #                   <tt>:password</tt>:: The password to login with.
+  # @option :proxy [String] :password
+  #   The password to login with.
   #
+  # @example
  #   GScraper.web_agent
   #
+  # @example
   #   GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
   #   GScraper.web_agent(:user_agent => 'Google Bot')
   #
-  def GScraper.web_agent(options={},&block)
-    agent = WWW::Mechanize.new
+  def GScraper.web_agent(options={})
+    agent = Mechanize.new
 
     if options[:user_agent_alias]
       agent.user_agent_alias = options[:user_agent_alias]
@@ -168,7 +159,7 @@ module GScraper
       agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
     end
 
-    block.call(agent) if block
+    yield agent if block_given?
     return agent
   end
 end
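`GScraper.open_uri` and `GScraper.open_page` are removed in 0.3.0; `GScraper.web_agent` (now using the top-level `Mechanize` namespace rather than `WWW::Mechanize`) is the remaining entry point, and the new `yield agent if block_given?` line lets callers configure the agent inline. A hypothetical migration sketch, assuming standard Mechanize APIs (`Mechanize#get`, `Mechanize#read_timeout=`):

    require 'gscraper'

    # Before (removed in this release):
    #   html = GScraper.open_uri('http://www.example.com/')
    #   doc  = GScraper.open_page('http://www.example.com/')

    # After: go through the Mechanize agent instead.
    agent = GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
    page  = agent.get('http://www.example.com/') # returns a parsed Mechanize::Page

    # The new block form configures the agent before use:
    agent = GScraper.web_agent { |a| a.read_timeout = 10 }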
--- data/lib/gscraper/has_pages.rb
+++ data/lib/gscraper/has_pages.rb
@@ -1,5 +1,4 @@
 #
-#--
 # GScraper - A web-scraping interface to various Google Services.
 #
 # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,51 +16,87 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#++
 #
 
+require 'enumerator'
+
 module GScraper
   module HasPages
     include Enumerable
 
     #
-    # Returns the first page.
+    # The first page.
+    #
+    # @return [Page]
+    #   The first page.
     #
     def first_page
       page_cache[1]
     end
 
     #
-    # Returns the page at the specified _index_.
+    # The page at the specified index.
+    #
+    # @param [Integer] index
+    #   The index.
+    #
+    # @return [Page]
+    #   The page at the given index.
     #
     def [](index)
       page_cache[index]
     end
 
     #
-    # Returns the pages with the specified _indices_.
+    # The pages with the specified indices.
+    #
+    # @param [Array, Range] indices
+    #   The indices.
+    #
+    # @return [Page]
+    #   The pages at the given indices.
     #
     def pages(indices)
       indices.map { |index| page_cache[index] }
     end
 
     #
-    # Iterates over the pages with the specified _indices_, passing each
-    # to the specified _block_.
+    # Iterates over the pages at the specified indices.
     #
-    def each_page(indices,&block)
-      indices.map { |index| block.call(page_cache[index]) }
+    # @param [Array, Range] indices
+    #   The indices.
+    #
+    # @yield [page]
+    #   The given block will be passed each page.
+    #
+    # @yieldparam [Page] page
+    #   A page at one of the given indices.
+    #
+    def each_page(indices)
+      unless block_given?
+        enum_for(:each_page,indices)
+      else
+        indices.map { |index| yield page_cache[index] }
+      end
     end
 
     #
-    # Iterates over all the pages of the query, passing each to the
-    # specified _block_.
+    # Iterates over all the pages of the query, until an empty page is
+    # encountered.
+    #
+    # @yield [page]
+    #   A page with results from the query.
     #
-    def each(&block)
+    # @yieldparam [Page] page
+    #   A non-empty page from the query.
+    #
+    def each
+      return enum_for(:each) unless block_given?
+
       index = 1
 
       until ((next_page = page_cache[index]).empty?) do
-        block.call(next_page)
+        yield next_page
         index = index + 1
       end
 
@@ -69,16 +104,20 @@ module GScraper
     end
 
     #
-    # Iterates over the elements on the page with the specified _index_,
-    # passing each element to the specified _block_.
+    # Iterates over the elements on the page with the specified index.
+    #
+    # @param [Integer] index
+    #   The index to access.
     #
     def each_on_page(index,&block)
       page_cache[index].each(&block)
     end
 
     #
-    # Iterates over each element on the pages with the specified _indices_,
-    # passing each element to the specified _block_.
+    # Iterates over each element on the pages with the specified indices.
+    #
+    # @param [Array, Range] indices
+    #   The indices to access.
     #
     def each_on_pages(indices,&block)
       each_page(indices) { |page| page.each(&block) }
@@ -87,21 +126,36 @@ module GScraper
     protected
 
     #
-    # Returns the page index for the specified result _rank_.
+    # The page index for the specified result rank.
+    #
+    # @param [Integer] rank
+    #   A result ranking.
+    #
+    # @return [Integer]
+    #   The page index.
     #
     def page_index_of(rank)
       (((rank.to_i - 1) / results_per_page.to_i) + 1)
     end
 
     #
-    # Returns the rank offset for the specified _page_index_.
+    # The rank offset for the specified page-index.
+    #
+    # @param [Integer] page_index
+    #   The result offset within a page.
     #
     def result_offset_of(page_index)
      ((page_index.to_i - 1) * results_per_page.to_i)
     end
 
     #
-    # Returns the in-page index of the specified result _rank_.
+    # The in-page index of the specified result rank.
+    #
+    # @param [Integer] rank
+    #   The result ranking.
+    #
+    # @return [Integer]
+    #   The in-page index.
     #
     def result_index_of(rank)
       ((rank.to_i - 1) % results_per_page.to_i)
@@ -110,6 +164,8 @@ module GScraper
     #
     # The cache of previously requested pages.
     #
+    # @return [Hash]
+    #
     def page_cache
       @page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
     end
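Two things worth illustrating from the HasPages changes: `each` and `each_page` now return an `Enumerator` when called without a block (hence the new `require 'enumerator'` for older Rubies), and the protected helpers encode simple rank arithmetic. A worked sketch, assuming `results_per_page` is 10 and using the library's `GScraper::Search.query` helper (the query itself is hypothetical, and realizing the enumerator performs live requests):

    # Rank 25 with 10 results per page:
    #   page_index_of(25)    # (((25 - 1) / 10) + 1) => 3   (rank 25 is on page 3)
    #   result_offset_of(3)  # ((3 - 1) * 10)        => 20  (page 3 starts at offset 20)
    #   result_index_of(25)  # ((25 - 1) % 10)       => 4   (5th result on that page)

    query = GScraper::Search.query(:query => 'ruby')
    query.each_page(1..2).to_a # => Enumerator realized into two Page objects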