gscraper 0.2.4 → 0.3.0

Files changed (42)
  1. data/.gitignore +8 -0
  2. data/.specopts +1 -0
  3. data/.yardopts +1 -0
  4. data/ChangeLog.md +122 -0
  5. data/Gemfile +25 -0
  6. data/{README.txt → README.md} +25 -24
  7. data/Rakefile +32 -10
  8. data/gscraper.gemspec +112 -0
  9. data/lib/gscraper.rb +0 -2
  10. data/lib/gscraper/extensions.rb +0 -2
  11. data/lib/gscraper/extensions/uri.rb +0 -2
  12. data/lib/gscraper/extensions/uri/http.rb +0 -2
  13. data/lib/gscraper/extensions/uri/query_params.rb +18 -5
  14. data/lib/gscraper/gscraper.rb +61 -70
  15. data/lib/gscraper/has_pages.rb +76 -20
  16. data/lib/gscraper/licenses.rb +0 -2
  17. data/lib/gscraper/page.rb +45 -16
  18. data/lib/gscraper/search.rb +0 -2
  19. data/lib/gscraper/search/ajax_query.rb +75 -22
  20. data/lib/gscraper/search/page.rb +328 -122
  21. data/lib/gscraper/search/query.rb +100 -7
  22. data/lib/gscraper/search/result.rb +27 -6
  23. data/lib/gscraper/search/search.rb +59 -9
  24. data/lib/gscraper/search/web_query.rb +120 -37
  25. data/lib/gscraper/sponsored_ad.rb +19 -6
  26. data/lib/gscraper/sponsored_links.rb +260 -92
  27. data/lib/gscraper/version.rb +2 -3
  28. data/spec/extensions/uri/query_params_spec.rb +8 -0
  29. data/spec/gscraper_spec.rb +9 -4
  30. data/spec/has_pages_examples.rb +0 -2
  31. data/spec/has_sponsored_links_examples.rb +2 -1
  32. data/spec/helpers/query.rb +3 -1
  33. data/spec/helpers/uri.rb +6 -4
  34. data/spec/page_has_results_examples.rb +0 -2
  35. data/spec/search/ajax_query_spec.rb +6 -11
  36. data/spec/search/page_has_results_examples.rb +0 -2
  37. data/spec/search/web_query_spec.rb +6 -11
  38. data/spec/spec_helper.rb +10 -4
  39. metadata +147 -54
  40. data/History.txt +0 -101
  41. data/Manifest.txt +0 -38
  42. data/tasks/spec.rb +0 -9
data/lib/gscraper.rb
@@ -1,5 +1,4 @@
  #
- #--
  # GScraper - A web-scraping interface to various Google Services.
  #
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #++
  #
 
  require 'gscraper/search'
data/lib/gscraper/extensions.rb
@@ -1,5 +1,4 @@
  #
- #--
  # GScraper - A web-scraping interface to various Google Services.
  #
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #++
  #
 
  require 'gscraper/extensions/uri'
data/lib/gscraper/extensions/uri.rb
@@ -1,5 +1,4 @@
  #
- #--
  # GScraper - A web-scraping interface to various Google Services.
  #
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #++
  #
 
  require 'gscraper/extensions/uri/http'
data/lib/gscraper/extensions/uri/http.rb
@@ -1,5 +1,4 @@
  #
- #--
  # GScraper - A web-scraping interface to various Google Services.
  #
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #++
  #
 
  require 'gscraper/extensions/uri/query_params'
data/lib/gscraper/extensions/uri/query_params.rb
@@ -1,5 +1,4 @@
  #
- #--
  # GScraper - A web-scraping interface to various Google Services.
  #
  # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,10 +16,15 @@
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #++
  #
 
+ require 'cgi'
+
  module URI
+ #
+ # Adds the ability to parse individual parameters from the query field
+ # of a URI.
+ #
  module QueryParams
  # Query parameters
  attr_reader :query_params
@@ -40,6 +44,16 @@ module URI
  #
  # Sets the query data and updates query_params.
  #
+ # @param [String] query_str
+ #   The new URI query string to use.
+ #
+ # @return [String]
+ #   The new URI query string.
+ #
+ # @example
+ #   url.query = 'a=1&b=2'
+ #   # => "a=1&b=2"
+ #
  def query=(query_str)
  new_query = super(query_str)
  parse_query_params
@@ -70,7 +84,6 @@ module URI
 
  private
 
- # :nodoc
  def path_query
  str = @path
 
@@ -80,9 +93,9 @@ module URI
  "#{name}=active"
  elsif value
  if value.kind_of?(Array)
- "#{name}=#{URI.encode(value.join(' '))}"
+ "#{name}=#{CGI.escape(value.join(' '))}"
  else
- "#{name}=#{URI.encode(value.to_s)}"
+ "#{name}=#{CGI.escape(value.to_s)}"
  end
  else
  "#{name}="
data/lib/gscraper/gscraper.rb
@@ -1,5 +1,4 @@
  #
- #--
  # GScraper - A web-scraping interface to various Google Services.
  #
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #++
  #
 
  require 'uri/http'
@@ -30,22 +28,31 @@ module GScraper
  COMMON_PROXY_PORT = 8080
 
  #
- # Returns the +Hash+ of proxy information.
+ # The proxy information.
+ #
+ # @return [Hash]
  #
  def GScraper.proxy
  @@gscraper_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
  end
 
  #
- # Creates a HTTP URI based from the given _proxy_info_ hash. The
- # _proxy_info_ hash defaults to Web.proxy, if not given.
+ # Creates a HTTP URI for the current proxy.
+ #
+ # @param [Hash] proxy_info
+ #   The proxy information.
+ #
+ # @option proxy_info [String] :host
+ #   The proxy host.
+ #
+ # @option proxy_info [Integer] :port (COMMON_PROXY_PORT)
+ #   The proxy port.
  #
- # _proxy_info_ may contain the following keys:
- # <tt>:host</tt>:: The proxy host.
- # <tt>:port</tt>:: The proxy port. Defaults to COMMON_PROXY_PORT,
- #                  if not specified.
- # <tt>:user</tt>:: The user-name to login as.
- # <tt>:password</tt>:: The password to login with.
+ # @option proxy_info [String] :user
+ #   The user-name to login as.
+ #
+ # @option proxy_info [String] :password
+ #   The password to login with.
  #
  def GScraper.proxy_uri(proxy_info=GScraper.proxy)
  if GScraper.proxy[:host]
@@ -59,101 +66,85 @@ module GScraper
  end
 
  #
- # Returns the supported GScraper User-Agent Aliases.
+ # The supported GScraper User-Agent Aliases.
+ #
+ # @return [Array<String>]
  #
  def GScraper.user_agent_aliases
- WWW::Mechanize::AGENT_ALIASES
+ Mechanize::AGENT_ALIASES
  end
 
  #
- # Returns the GScraper User-Agent
+ # The GScraper User-Agent.
+ #
+ # @return [String]
  #
  def GScraper.user_agent
  @@gscraper_user_agent ||= GScraper.user_agent_aliases['Windows IE 6']
  end
 
  #
- # Sets the GScraper User-Agent to the specified _agent_.
+ # Sets the GScraper User-Agent.
+ #
+ # @param [String] agent
+ #   The new User-Agent string.
+ #
+ # @return [String]
+ #   The new User-Agent string.
  #
  def GScraper.user_agent=(agent)
  @@gscraper_user_agent = agent
  end
 
  #
- # Sets the GScraper User-Agent using the specified user-agent alias
- # _name_.
+ # Sets the GScraper User-Agent.
+ #
+ # @param [String] name
+ #   The User-Agent alias.
+ #
+ # @return [String]
+ #   The new User-Agent string.
  #
  def GScraper.user_agent_alias=(name)
  @@gscraper_user_agent = GScraper.user_agent_aliases[name.to_s]
  end
 
  #
- # Opens the _uri_ with the given _options_. The contents of the _uri_
- # will be returned.
+ # Creates a new Mechanize agent.
  #
- # _options_ may contain the following keys:
- # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
- # <tt>:user_agent</tt>:: The User-Agent String to use.
- # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
- #                   contain the following keys:
- #                   <tt>:host</tt>:: The proxy host.
- #                   <tt>:port</tt>:: The proxy port.
- #                   <tt>:user</tt>:: The user-name to login as.
- #                   <tt>:password</tt>:: The password to login with.
+ # @param [Hash] options
+ #   Additional options.
  #
- # GScraper.open_uri('http://www.hackety.org/')
+ # @option options [String] :user_agent_alias
+ #   The User-Agent Alias to use.
  #
- # GScraper.open_uri('http://tenderlovemaking.com/',
- #                   :user_agent_alias => 'Linux Mozilla')
- # GScraper.open_uri('http://www.wired.com/',
- #                   :user_agent => 'the future')
+ # @option options [String] :user_agent
+ #   The User-Agent string to use.
  #
- def GScraper.open_uri(uri,options={})
- headers = {}
-
- if options[:user_agent_alias]
- headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[options[:user_agent_alias]]
- elsif options[:user_agent]
- headers['User-Agent'] = options[:user_agent]
- elsif GScraper.user_agent
- headers['User-Agent'] = GScraper.user_agent
- end
-
- proxy = (options[:proxy] || GScraper.proxy)
- if proxy[:host]
- headers[:proxy] = GScraper.proxy_uri(proxy)
- end
-
- return Kernel.open(uri,headers)
- end
-
+ # @option options [Hash] :proxy
+ #   The proxy information to use.
  #
- # Similar to GScraper.open_uri but returns a Nokogiri::HTML document.
+ # @option :proxy [String] :host
+ #   The proxy host.
  #
- def GScraper.open_page(uri,options={})
- Nokogiri::HTML(GScraper.open_uri(uri,options))
- end
-
+ # @option :proxy [Integer] :port
+ #   The proxy port.
  #
- # Creates a new WWW::Mechanize agent with the given _options_.
+ # @option :proxy [String] :user
+ #   The user-name to login as.
  #
- # _options_ may contain the following keys:
- # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
- # <tt>:user_agent</tt>:: The User-Agent string to use.
- # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
- #                   contain the following keys:
- #                   <tt>:host</tt>:: The proxy host.
- #                   <tt>:port</tt>:: The proxy port.
- #                   <tt>:user</tt>:: The user-name to login as.
- #                   <tt>:password</tt>:: The password to login with.
+ # @option :proxy [String] :password
+ #   The password to login with.
  #
+ # @example
  #   GScraper.web_agent
  #
+ # @example
  #   GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
  #   GScraper.web_agent(:user_agent => 'Google Bot')
  #
- def GScraper.web_agent(options={},&block)
- agent = WWW::Mechanize.new
+ def GScraper.web_agent(options={})
+ agent = Mechanize.new
 
  if options[:user_agent_alias]
  agent.user_agent_alias = options[:user_agent_alias]
@@ -168,7 +159,7 @@ module GScraper
  agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
  end
 
- block.call(agent) if block
+ yield agent if block_given?
  return agent
  end
  end
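Taken together, the rewritten docs describe a small configuration API: module-wide proxy and User-Agent defaults that web_agent applies when building each agent, now using the top-level Mechanize constant instead of the old WWW::Mechanize (and dropping the open_uri/open_page helpers). A hedged usage sketch based on the signatures above; the proxy host and the block body are illustrative, not from the gem:

```ruby
require 'gscraper'

# Module-wide defaults; web_agent reads these when no overrides are given.
GScraper.proxy[:host] = 'proxy.example.com'  # illustrative host
GScraper.proxy[:port] = 8080                 # COMMON_PROXY_PORT
GScraper.user_agent_alias = 'Linux Mozilla'  # a key in Mechanize::AGENT_ALIASES

# Per-call options win over the defaults; when a block is given,
# web_agent yields the configured agent before returning it.
agent = GScraper.web_agent(:user_agent => 'Google Bot') do |agent|
  agent.max_history = 1  # any Mechanize setting; illustrative tweak
end
```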
data/lib/gscraper/has_pages.rb
@@ -1,5 +1,4 @@
  #
- #--
  # GScraper - A web-scraping interface to various Google Services.
  #
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,51 +16,87 @@
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- #++
  #
 
+ require 'enumerator'
+
  module GScraper
  module HasPages
  include Enumerable
 
  #
- # Returns the first page.
+ # The first page.
+ #
+ # @return [Page]
+ #   The first page.
  #
  def first_page
  page_cache[1]
  end
 
  #
- # Returns the page at the specified _index_.
+ # The page at the specified index.
+ #
+ # @param [Integer] index
+ #   The index.
+ #
+ # @return [Page]
+ #   The page at the given index.
  #
  def [](index)
  page_cache[index]
  end
 
  #
- # Returns the pages with the specified _indices_.
+ # The pages with the specified indices.
+ #
+ # @param [Array, Range] indices
+ #   The indices.
+ #
+ # @return [Page]
+ #   The pages at the given indices.
  #
  def pages(indices)
  indices.map { |index| page_cache[index] }
  end
 
  #
- # Iterates over the pages with the specified _indices_, passing each
- # to the specified _block_.
+ # Iterates over the pages at the specified indices.
  #
- def each_page(indices,&block)
- indices.map { |index| block.call(page_cache[index]) }
+ # @param [Array, Range] indices
+ #   The indices.
+ #
+ # @yield [page]
+ #   The given block will be passed each page.
+ #
+ # @yieldparam [Page] page
+ #   A page at one of the given indices.
+ #
+ def each_page(indices)
+ unless block_given?
+ enum_for(:each_page,indices)
+ else
+ indices.map { |index| yield page_cache[index] }
+ end
  end
 
  #
- # Iterates over all the pages of the query, passing each to the
- # specified _block_.
+ # Iterates over all the pages of the query, until an empty page is
+ # encountered.
+ #
+ # @yield [page]
+ #   A page with results from the query.
  #
- def each(&block)
+ # @yieldparam [Page] page
+ #   A non-empty page from the query.
+ #
+ def each
+ return enum_for(:each) unless block_given?
+
  index = 1
 
  until ((next_page = page_cache[index]).empty?) do
- block.call(next_page)
+ yield next_page
  index = index + 1
  end
 
@@ -69,16 +104,20 @@ module GScraper
  end
 
  #
- # Iterates over the elements on the page with the specified _index_,
- # passing each element to the specified _block_.
+ # Iterates over the elements on the page with the specified index.
+ #
+ # @param [Integer] index
+ #   The index to access.
  #
  def each_on_page(index,&block)
  page_cache[index].each(&block)
  end
 
  #
- # Iterates over each element on the pages with the specified _indices_,
- # passing each element to the specified _block_.
+ # Iterates over each element on the pages with the specified indices.
+ #
+ # @param [Array, Range] indices
+ #   The indices to access.
  #
  def each_on_pages(indices,&block)
  each_page(indices) { |page| page.each(&block) }
@@ -87,21 +126,36 @@ module GScraper
  protected
 
  #
- # Returns the page index for the specified result _rank_.
+ # The page index for the specified result rank.
+ #
+ # @param [Integer] rank
+ #   A result ranking.
+ #
+ # @return [Integer]
+ #   The page index.
  #
  def page_index_of(rank)
  (((rank.to_i - 1) / results_per_page.to_i) + 1)
  end
 
  #
- # Returns the rank offset for the specified _page_index_.
+ # The rank offset for the specified page-index.
+ #
+ # @param [Integer] page_index
+ #   The result offset within a page.
  #
  def result_offset_of(page_index)
  ((page_index.to_i - 1) * results_per_page.to_i)
  end
 
  #
- # Returns the in-page index of the specified result _rank_.
+ # The in-page index of the specified result rank.
+ #
+ # @param [Integer] rank
+ #   The result ranking.
+ #
+ # @return [Integer]
+ #   The in-page index.
  #
  def result_index_of(rank)
  ((rank.to_i - 1) % results_per_page.to_i)
@@ -110,6 +164,8 @@ module GScraper
  #
  # The cache of previously requested pages.
  #
+ # @return [Hash]
+ #
  def page_cache
  @page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
  end
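Two things change here: each and each_page now return an Enumerator when called without a block (the enum_for branches), and the protected helpers map between a result's global rank and its page. A worked sketch of that arithmetic, assuming results_per_page is 10 (both rank and page index are 1-based, per the formulas above):

```ruby
# The rank/page arithmetic from page_index_of, result_offset_of and
# result_index_of, inlined for a concrete rank.
results_per_page = 10
rank = 25

page_index    = ((rank - 1) / results_per_page) + 1  # => 3
result_offset = (page_index - 1) * results_per_page  # => 20
result_index  = (rank - 1) % results_per_page        # => 4

# So result 25 is the 5th result (in-page index 4) on page 3,
# which begins at global result offset 20.
```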