gscraper 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
data/Rakefile CHANGED
@@ -2,16 +2,13 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'hoe'
5
+
6
+ require './tasks/spec.rb'
5
7
  require './lib/gscraper/version.rb'
6
8
 
7
9
  Hoe.new('gscraper', GScraper::VERSION) do |p|
8
10
  p.rubyforge_name = 'gscraper'
9
- p.author = 'Postmodern Modulus III'
10
- p.email = 'postmodern.mod3@gmail.com'
11
- p.summary = 'A ruby web-scraping interface to various Google Services'
12
- p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
13
- p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
14
- p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
11
+ p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
15
12
  p.extra_deps = ['hpricot', 'mechanize']
16
13
  end
17
14
 
data/lib/gscraper.rb CHANGED
@@ -1,2 +1,24 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/search'
2
24
  require 'gscraper/version'
@@ -1 +1,23 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/extensions/uri'
@@ -1 +1,23 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/extensions/uri/http'
@@ -1,79 +1,33 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ require 'gscraper/extensions/uri/query_params'
24
+
1
25
  require 'uri/http'
2
26
 
3
27
  module URI
4
28
  class HTTP < Generic
5
29
 
6
- # Query parameters
7
- attr_reader :query_params
8
-
9
- alias_method :old_initialize, :initialize
10
-
11
- #
12
- # Creates a new URI::HTTP object and initializes query_params as a
13
- # new Hash.
14
- #
15
- def initialize(*args)
16
- old_initialize(*args)
17
-
18
- @query_params = {}
19
- parse_query_params
20
- end
21
-
22
- #
23
- # Sets the query data and updates query_params.
24
- #
25
- def query=(query_str)
26
- new_query = super(query_str)
27
- parse_query_params
28
- return new_query
29
- end
30
-
31
- protected
32
-
33
- #
34
- # Parses the query parameters from the query data, populating
35
- # query_params with the parsed parameters.
36
- #
37
- def parse_query_params
38
- @query_params.clear
39
-
40
- if @query
41
- @query.split('&').each do |param|
42
- name, value = param.split('=')
43
-
44
- if value
45
- @query_params[name] = URI.decode(value)
46
- else
47
- @query_params[name] = nil
48
- end
49
- end
50
- end
51
- end
52
-
53
- private
54
-
55
- # :nodoc
56
- def path_query
57
- str = @path
58
-
59
- unless @query_params.empty?
60
- str += '?' + @query_params.to_a.map { |name,value|
61
- if value==true
62
- "#{name}=active"
63
- elsif value
64
- if value.kind_of?(Array)
65
- "#{name}=#{URI.encode(value.join(' '))}"
66
- else
67
- "#{name}=#{URI.encode(value.to_s)}"
68
- end
69
- else
70
- "#{name}="
71
- end
72
- }.join('&')
73
- end
74
-
75
- return str
76
- end
30
+ include QueryParams
77
31
 
78
32
  end
79
33
  end
@@ -0,0 +1,96 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
module URI
  #
  # Mixin that keeps a Hash of query parameters (+query_params+) in sync
  # with the raw query String of the including URI class, and renders that
  # Hash back into a "path?query" String.
  #
  # The including class must provide the <tt>@path</tt> and <tt>@query</tt>
  # instance variables and a <tt>query=</tt> writer reachable via +super+.
  #
  module QueryParams
    # Hash mapping each query parameter name to its decoded value. A
    # parameter without a value (<tt>flag</tt> or <tt>flag=</tt>) maps
    # to +nil+.
    attr_reader :query_params

    #
    # Initializes query_params as an empty Hash, runs the inherited
    # constructor with the given _args_, then parses the query String.
    #
    def initialize(*args)
      # Created before +super+ so that parse_query_params already has a
      # Hash to clear if the inherited constructor assigns through query=.
      @query_params = {}

      super(*args)

      parse_query_params
    end

    #
    # Sets the query data to _query_str_ and re-parses query_params.
    # Returns whatever the inherited writer returned.
    #
    def query=(query_str)
      new_query = super(query_str)
      parse_query_params
      new_query
    end

    protected

    #
    # Rebuilds query_params from <tt>@query</tt>. The Hash is emptied
    # first; nothing else happens when there is no query String.
    #
    def parse_query_params
      @query_params.clear
      return unless @query

      @query.split('&').each do |param|
        # "a=1&&b=2" yields an empty segment; it names no parameter.
        next if param.empty?

        # A limit of 2 keeps any further '=' characters inside the value
        # instead of silently dropping everything after the second '='.
        name, value = param.split('=', 2)

        @query_params[name] =
          if value.nil? || value.empty?
            nil
          else
            decode_query_param(value)
          end
      end
    end

    private

    #
    # Returns <tt>@path</tt>, followed by '?' and the encoded
    # query_params when there are any.
    #
    # NOTE(review): presumably this overrides a private hook of the same
    # name in the including URI class -- confirm against the targeted
    # Ruby version.
    #
    def path_query
      return @path if @query_params.empty?

      pairs = @query_params.map do |name, value|
        "#{name}=#{encode_query_param(value)}"
      end

      "#{@path}?#{pairs.join('&')}"
    end

    #
    # Decodes a form-encoded query _value_: percent-escapes are resolved
    # and '+' becomes a space. A value with a malformed percent-escape is
    # returned unchanged rather than aborting the parse.
    #
    def decode_query_param(value)
      URI.decode_www_form_component(value)
    rescue ArgumentError
      value
    end

    #
    # Encodes one parameter _value_ for path_query: +true+ becomes
    # "active", +nil+ and +false+ become the empty String, an Array is
    # joined with spaces, anything else is converted with to_s.
    #
    def encode_query_param(value)
      text = case value
             when true then return 'active'
             when nil, false then return ''
             when Array then value.join(' ')
             else value.to_s
             end

      # Form encoding writes a space as '+' (a literal '+' is already
      # escaped as %2B); emit %20 instead so spaces keep their previous
      # on-the-wire form while '&', '=' and '#' are now escaped.
      URI.encode_www_form_component(text).gsub('+', '%20')
    end
  end
end
@@ -1,3 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'uri/http'
2
24
  require 'mechanize'
3
25
  require 'open-uri'
@@ -54,6 +76,14 @@ module GScraper
54
76
  @@gscraper_user_agent = agent
55
77
  end
56
78
 
79
+ #
80
+ # Sets the GScraper User-Agent using the specified user-agent alias
81
+ # _name_.
82
+ #
83
def GScraper.user_agent_alias=(name)
  # Aliases are looked up by the String form of _name_.
  alias_key = name.to_s

  # NOTE(review): if the alias is not registered this presumably stores
  # nil as the User-Agent -- confirm that is the intended behaviour.
  @@gscraper_user_agent = GScraper.user_agent_aliases[alias_key]
end
86
+
57
87
  #
58
88
  # Opens the _uri_ with the given _options_. The contents of the _uri_
59
89
  # will be returned.
@@ -0,0 +1,114 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
module GScraper
  #
  # Mixin giving a query object cached, enumerable access to its numbered
  # pages. Page indices start at 1.
  #
  # The including class must define <tt>page(index)</tt>, returning the
  # page at the given Integer index (an object responding to +empty?+ and
  # +each+), and +results_per_page+.
  #
  module HasPages
    include Enumerable

    #
    # Returns the first page.
    #
    def first_page
      page_cache[1]
    end

    #
    # Returns the page at the specified _index_.
    #
    def [](index)
      page_cache[index]
    end

    #
    # Returns the pages with the specified _indices_.
    #
    def pages(indices)
      indices.map { |index| page_cache[index] }
    end

    #
    # Passes the page at each of the specified _indices_ to the given
    # _block_ and returns an Array of the block's results. Returns an
    # Enumerator when no block is given.
    #
    def each_page(indices, &block)
      return enum_for(:each_page, indices) unless block

      indices.map { |index| block.call(page_cache[index]) }
    end

    #
    # Passes every page of the query to the given block, starting at
    # page 1 and stopping at the first empty page. Returns +self+, or an
    # Enumerator when no block is given.
    #
    def each
      return enum_for(:each) unless block_given?

      index = 1

      # The first empty page marks the end of the results.
      until (next_page = page_cache[index]).empty?
        yield next_page
        index += 1
      end

      self
    end

    #
    # Passes each element on the page with the specified _index_ to the
    # given _block_.
    #
    def each_on_page(index, &block)
      page_cache[index].each(&block)
    end

    #
    # Passes each element on the pages with the specified _indices_ to
    # the given _block_.
    #
    def each_on_pages(indices, &block)
      each_page(indices) { |page| page.each(&block) }
    end

    protected

    #
    # Returns the 1-based page index holding the result with the
    # specified 1-based _rank_.
    #
    def page_index_of(rank)
      ((rank.to_i - 1) / results_per_page.to_i) + 1
    end

    #
    # Returns the number of results that precede the page with the
    # specified _page_index_.
    #
    def result_offset_of(page_index)
      (page_index.to_i - 1) * results_per_page.to_i
    end

    #
    # Returns the 0-based position within its page of the result with
    # the specified 1-based _rank_.
    #
    def result_index_of(rank)
      (rank.to_i - 1) % results_per_page.to_i
    end

    #
    # Returns the Hash that memoizes pages by Integer index, fetching a
    # missing page through +page+ on first access.
    #
    def page_cache
      @page_cache ||= Hash.new do |cache, key|
        # Store under the normalized Integer index so that equivalent
        # keys such as "2" and 2 share one entry and one fetch.
        index = key.to_i

        if cache.key?(index)
          cache[index]
        else
          cache[index] = page(index)
        end
      end
    end
  end
end