gscraper 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
data/Rakefile CHANGED
@@ -2,16 +2,13 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'hoe'
5
+
6
+ require './tasks/spec.rb'
5
7
  require './lib/gscraper/version.rb'
6
8
 
7
9
  Hoe.new('gscraper', GScraper::VERSION) do |p|
8
10
  p.rubyforge_name = 'gscraper'
9
- p.author = 'Postmodern Modulus III'
10
- p.email = 'postmodern.mod3@gmail.com'
11
- p.summary = 'A ruby web-scraping interface to various Google Services'
12
- p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
13
- p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
14
- p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
11
+ p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
15
12
  p.extra_deps = ['hpricot', 'mechanize']
16
13
  end
17
14
 
data/lib/gscraper.rb CHANGED
@@ -1,2 +1,24 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/search'
2
24
  require 'gscraper/version'
@@ -1 +1,23 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/extensions/uri'
@@ -1 +1,23 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/extensions/uri/http'
@@ -1,79 +1,33 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ require 'gscraper/extensions/uri/query_params'
24
+
1
25
  require 'uri/http'
2
26
 
3
27
  module URI
4
28
  class HTTP < Generic
5
29
 
6
- # Query parameters
7
- attr_reader :query_params
8
-
9
- alias_method :old_initialize, :initialize
10
-
11
- #
12
- # Creates a new URI::HTTP object and initializes query_params as a
13
- # new Hash.
14
- #
15
- def initialize(*args)
16
- old_initialize(*args)
17
-
18
- @query_params = {}
19
- parse_query_params
20
- end
21
-
22
- #
23
- # Sets the query data and updates query_params.
24
- #
25
- def query=(query_str)
26
- new_query = super(query_str)
27
- parse_query_params
28
- return new_query
29
- end
30
-
31
- protected
32
-
33
- #
34
- # Parses the query parameters from the query data, populating
35
- # query_params with the parsed parameters.
36
- #
37
- def parse_query_params
38
- @query_params.clear
39
-
40
- if @query
41
- @query.split('&').each do |param|
42
- name, value = param.split('=')
43
-
44
- if value
45
- @query_params[name] = URI.decode(value)
46
- else
47
- @query_params[name] = nil
48
- end
49
- end
50
- end
51
- end
52
-
53
- private
54
-
55
- # :nodoc
56
- def path_query
57
- str = @path
58
-
59
- unless @query_params.empty?
60
- str += '?' + @query_params.to_a.map { |name,value|
61
- if value==true
62
- "#{name}=active"
63
- elsif value
64
- if value.kind_of?(Array)
65
- "#{name}=#{URI.encode(value.join(' '))}"
66
- else
67
- "#{name}=#{URI.encode(value.to_s)}"
68
- end
69
- else
70
- "#{name}="
71
- end
72
- }.join('&')
73
- end
74
-
75
- return str
76
- end
30
+ include QueryParams
77
31
 
78
32
  end
79
33
  end
@@ -0,0 +1,96 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ module URI
24
+ module QueryParams
25
+ # Query parameters
26
+ attr_reader :query_params
27
+
28
+ #
29
+ # Creates a new URI::HTTP object and initializes query_params as a
30
+ # new Hash.
31
+ #
32
+ def initialize(*args)
33
+ @query_params = {}
34
+
35
+ super(*args)
36
+
37
+ parse_query_params
38
+ end
39
+
40
+ #
41
+ # Sets the query data and updates query_params.
42
+ #
43
+ def query=(query_str)
44
+ new_query = super(query_str)
45
+ parse_query_params
46
+ return new_query
47
+ end
48
+
49
+ protected
50
+
51
+ #
52
+ # Parses the query parameters from the query data, populating
53
+ # query_params with the parsed parameters.
54
+ #
55
+ def parse_query_params
56
+ @query_params.clear
57
+
58
+ if @query
59
+ @query.split('&').each do |param|
60
+ name, value = param.split('=')
61
+
62
+ if value
63
+ @query_params[name] = URI.decode(value)
64
+ else
65
+ @query_params[name] = nil
66
+ end
67
+ end
68
+ end
69
+ end
70
+
71
+ private
72
+
73
+ # :nodoc
74
+ def path_query
75
+ str = @path
76
+
77
+ unless @query_params.empty?
78
+ str += '?' + @query_params.to_a.map { |name,value|
79
+ if value==true
80
+ "#{name}=active"
81
+ elsif value
82
+ if value.kind_of?(Array)
83
+ "#{name}=#{URI.encode(value.join(' '))}"
84
+ else
85
+ "#{name}=#{URI.encode(value.to_s)}"
86
+ end
87
+ else
88
+ "#{name}="
89
+ end
90
+ }.join('&')
91
+ end
92
+
93
+ return str
94
+ end
95
+ end
96
+ end
@@ -1,3 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'uri/http'
2
24
  require 'mechanize'
3
25
  require 'open-uri'
@@ -54,6 +76,14 @@ module GScraper
54
76
  @@gscraper_user_agent = agent
55
77
  end
56
78
 
79
+ #
80
+ # Sets the GScraper User-Agent using the specified user-agent alias
81
+ # _name_.
82
+ #
83
+ def GScraper.user_agent_alias=(name)
84
+ @@gscraper_user_agent = GScraper.user_agent_aliases[name.to_s]
85
+ end
86
+
57
87
  #
58
88
  # Opens the _uri_ with the given _options_. The contents of the _uri_
59
89
  # will be returned.
@@ -0,0 +1,114 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ module GScraper
24
+ module HasPages
25
+ include Enumerable
26
+
27
+ #
28
+ # Returns the first page.
29
+ #
30
+ def first_page
31
+ page_cache[1]
32
+ end
33
+
34
+ #
35
+ # Returns the page at the specified _index_.
36
+ #
37
+ def [](index)
38
+ page_cache[index]
39
+ end
40
+
41
+ #
42
+ # Returns the pages with the specified _indices_.
43
+ #
44
+ def pages(indices)
45
+ indices.map { |index| page_cache[index] }
46
+ end
47
+
48
+ #
49
+ # Iterates over the pages with the specified _indices_, passing each
50
+ # to the specified _block_.
51
+ #
52
+ def each_page(indices,&block)
53
+ indices.map { |index| block.call(page_cache[index]) }
54
+ end
55
+
56
+ #
57
+ # Iterates over all the pages of the query, passing each to the
58
+ # specified _block_.
59
+ #
60
+ def each(&block)
61
+ index = 1
62
+
63
+ until ((next_page = page_cache[index]).empty?) do
64
+ block.call(next_page)
65
+ index = index + 1
66
+ end
67
+
68
+ return self
69
+ end
70
+
71
+ #
72
+ # Iterates over the elements on the page with the specified _index_,
73
+ # passing each element to the specified _block_.
74
+ #
75
+ def each_on_page(index,&block)
76
+ page_cache[index].each(&block)
77
+ end
78
+
79
+ #
80
+ # Iterates over each element on the pages with the specified _indices_,
81
+ # passing each element to the specified _block_.
82
+ #
83
+ def each_on_pages(indices,&block)
84
+ each_page(indices) { |page| page.each(&block) }
85
+ end
86
+
87
+ protected
88
+
89
+ #
90
+ # Returns the page index for the specified result _rank_.
91
+ #
92
+ def page_index_of(rank)
93
+ (((rank.to_i - 1) / results_per_page.to_i) + 1)
94
+ end
95
+
96
+ #
97
+ # Returns the rank offset for the specified _page_index_.
98
+ #
99
+ def result_offset_of(page_index)
100
+ ((page_index.to_i - 1) * results_per_page.to_i)
101
+ end
102
+
103
+ #
104
+ # Returns the in-page index of the specified result _rank_.
105
+ #
106
+ def result_index_of(rank)
107
+ ((rank.to_i - 1) % results_per_page.to_i)
108
+ end
109
+
110
+ def page_cache
111
+ @page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
112
+ end
113
+ end
114
+ end