gscraper 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING.txt +339 -0
- data/History.txt +21 -0
- data/Manifest.txt +23 -10
- data/README.txt +17 -21
- data/Rakefile +3 -6
- data/lib/gscraper.rb +22 -0
- data/lib/gscraper/extensions.rb +22 -0
- data/lib/gscraper/extensions/uri.rb +22 -0
- data/lib/gscraper/extensions/uri/http.rb +25 -71
- data/lib/gscraper/extensions/uri/query_params.rb +96 -0
- data/lib/gscraper/gscraper.rb +30 -0
- data/lib/gscraper/has_pages.rb +114 -0
- data/lib/gscraper/licenses.rb +22 -0
- data/lib/gscraper/page.rb +64 -0
- data/lib/gscraper/search.rb +24 -0
- data/lib/gscraper/search/ajax_query.rb +176 -0
- data/lib/gscraper/search/page.rb +27 -72
- data/lib/gscraper/search/query.rb +46 -457
- data/lib/gscraper/search/result.rb +32 -29
- data/lib/gscraper/search/search.rb +44 -3
- data/lib/gscraper/search/web_query.rb +472 -0
- data/lib/gscraper/sponsored_ad.rb +26 -2
- data/lib/gscraper/sponsored_links.rb +77 -8
- data/lib/gscraper/version.rb +23 -1
- data/spec/extensions/uri/http_spec.rb +9 -0
- data/spec/extensions/uri/query_params_spec.rb +38 -0
- data/spec/gscraper_spec.rb +29 -0
- data/spec/has_pages_examples.rb +19 -0
- data/spec/has_sponsored_links_examples.rb +57 -0
- data/spec/helpers/query.rb +1 -0
- data/spec/helpers/uri.rb +8 -0
- data/spec/page_has_results_examples.rb +13 -0
- data/spec/search/ajax_query_spec.rb +124 -0
- data/spec/search/page_has_results_examples.rb +51 -0
- data/spec/search/query_spec.rb +103 -0
- data/spec/search/web_query_spec.rb +74 -0
- data/spec/spec_helper.rb +6 -0
- data/tasks/spec.rb +7 -0
- metadata +34 -20
- data/LICENSE.txt +0 -23
- data/lib/gscraper/web_agent.rb +0 -38
- data/test/search/page_results.rb +0 -103
- data/test/search/query_from_url.rb +0 -50
- data/test/search/query_pages.rb +0 -32
- data/test/search/query_result.rb +0 -30
- data/test/test_gscraper.rb +0 -4
data/Rakefile
CHANGED
@@ -2,16 +2,13 @@
|
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'hoe'
|
5
|
+
|
6
|
+
require './tasks/spec.rb'
|
5
7
|
require './lib/gscraper/version.rb'
|
6
8
|
|
7
9
|
Hoe.new('gscraper', GScraper::VERSION) do |p|
|
8
10
|
p.rubyforge_name = 'gscraper'
|
9
|
-
p.author = 'Postmodern Modulus III'
|
10
|
-
p.email = 'postmodern.mod3@gmail.com'
|
11
|
-
p.summary = 'A ruby web-scraping interface to various Google Services'
|
12
|
-
p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
|
13
|
-
p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
|
14
|
-
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
11
|
+
p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
|
15
12
|
p.extra_deps = ['hpricot', 'mechanize']
|
16
13
|
end
|
17
14
|
|
data/lib/gscraper.rb
CHANGED
@@ -1,2 +1,24 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/search'
|
2
24
|
require 'gscraper/version'
|
data/lib/gscraper/extensions.rb
CHANGED
@@ -1 +1,23 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/extensions/uri'
|
data/lib/gscraper/extensions/uri.rb
CHANGED
@@ -1 +1,23 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/extensions/uri/http'
|
data/lib/gscraper/extensions/uri/http.rb
CHANGED
@@ -1,79 +1,33 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'gscraper/extensions/uri/query_params'
|
24
|
+
|
1
25
|
require 'uri/http'
|
2
26
|
|
3
27
|
module URI
|
4
28
|
class HTTP < Generic
|
5
29
|
|
6
|
-
|
7
|
-
attr_reader :query_params
|
8
|
-
|
9
|
-
alias_method :old_initialize, :initialize
|
10
|
-
|
11
|
-
#
|
12
|
-
# Creates a new URI::HTTP object and initializes query_params as a
|
13
|
-
# new Hash.
|
14
|
-
#
|
15
|
-
def initialize(*args)
|
16
|
-
old_initialize(*args)
|
17
|
-
|
18
|
-
@query_params = {}
|
19
|
-
parse_query_params
|
20
|
-
end
|
21
|
-
|
22
|
-
#
|
23
|
-
# Sets the query data and updates query_params.
|
24
|
-
#
|
25
|
-
def query=(query_str)
|
26
|
-
new_query = super(query_str)
|
27
|
-
parse_query_params
|
28
|
-
return new_query
|
29
|
-
end
|
30
|
-
|
31
|
-
protected
|
32
|
-
|
33
|
-
#
|
34
|
-
# Parses the query parameters from the query data, populating
|
35
|
-
# query_params with the parsed parameters.
|
36
|
-
#
|
37
|
-
def parse_query_params
|
38
|
-
@query_params.clear
|
39
|
-
|
40
|
-
if @query
|
41
|
-
@query.split('&').each do |param|
|
42
|
-
name, value = param.split('=')
|
43
|
-
|
44
|
-
if value
|
45
|
-
@query_params[name] = URI.decode(value)
|
46
|
-
else
|
47
|
-
@query_params[name] = nil
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
private
|
54
|
-
|
55
|
-
# :nodoc
|
56
|
-
def path_query
|
57
|
-
str = @path
|
58
|
-
|
59
|
-
unless @query_params.empty?
|
60
|
-
str += '?' + @query_params.to_a.map { |name,value|
|
61
|
-
if value==true
|
62
|
-
"#{name}=active"
|
63
|
-
elsif value
|
64
|
-
if value.kind_of?(Array)
|
65
|
-
"#{name}=#{URI.encode(value.join(' '))}"
|
66
|
-
else
|
67
|
-
"#{name}=#{URI.encode(value.to_s)}"
|
68
|
-
end
|
69
|
-
else
|
70
|
-
"#{name}="
|
71
|
-
end
|
72
|
-
}.join('&')
|
73
|
-
end
|
74
|
-
|
75
|
-
return str
|
76
|
-
end
|
30
|
+
include QueryParams
|
77
31
|
|
78
32
|
end
|
79
33
|
end
|
data/lib/gscraper/extensions/uri/query_params.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
module URI
|
24
|
+
module QueryParams
|
25
|
+
# Query parameters
|
26
|
+
attr_reader :query_params
|
27
|
+
|
28
|
+
#
|
29
|
+
# Creates a new URI::HTTP object and initializes query_params as a
|
30
|
+
# new Hash.
|
31
|
+
#
|
32
|
+
def initialize(*args)
|
33
|
+
@query_params = {}
|
34
|
+
|
35
|
+
super(*args)
|
36
|
+
|
37
|
+
parse_query_params
|
38
|
+
end
|
39
|
+
|
40
|
+
#
|
41
|
+
# Sets the query data and updates query_params.
|
42
|
+
#
|
43
|
+
def query=(query_str)
|
44
|
+
new_query = super(query_str)
|
45
|
+
parse_query_params
|
46
|
+
return new_query
|
47
|
+
end
|
48
|
+
|
49
|
+
protected
|
50
|
+
|
51
|
+
#
|
52
|
+
# Parses the query parameters from the query data, populating
|
53
|
+
# query_params with the parsed parameters.
|
54
|
+
#
|
55
|
+
def parse_query_params
|
56
|
+
@query_params.clear
|
57
|
+
|
58
|
+
if @query
|
59
|
+
@query.split('&').each do |param|
|
60
|
+
name, value = param.split('=')
|
61
|
+
|
62
|
+
if value
|
63
|
+
@query_params[name] = URI.decode(value)
|
64
|
+
else
|
65
|
+
@query_params[name] = nil
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
private
|
72
|
+
|
73
|
+
# :nodoc
|
74
|
+
def path_query
|
75
|
+
str = @path
|
76
|
+
|
77
|
+
unless @query_params.empty?
|
78
|
+
str += '?' + @query_params.to_a.map { |name,value|
|
79
|
+
if value==true
|
80
|
+
"#{name}=active"
|
81
|
+
elsif value
|
82
|
+
if value.kind_of?(Array)
|
83
|
+
"#{name}=#{URI.encode(value.join(' '))}"
|
84
|
+
else
|
85
|
+
"#{name}=#{URI.encode(value.to_s)}"
|
86
|
+
end
|
87
|
+
else
|
88
|
+
"#{name}="
|
89
|
+
end
|
90
|
+
}.join('&')
|
91
|
+
end
|
92
|
+
|
93
|
+
return str
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'uri/http'
|
2
24
|
require 'mechanize'
|
3
25
|
require 'open-uri'
|
@@ -54,6 +76,14 @@ module GScraper
|
|
54
76
|
@@gscraper_user_agent = agent
|
55
77
|
end
|
56
78
|
|
79
|
+
#
|
80
|
+
# Sets the GScraper User-Agent using the specified user-agent alias
|
81
|
+
# _name_.
|
82
|
+
#
|
83
|
+
def GScraper.user_agent_alias=(name)
|
84
|
+
@@gscraper_user_agent = GScraper.user_agent_aliases[name.to_s]
|
85
|
+
end
|
86
|
+
|
57
87
|
#
|
58
88
|
# Opens the _uri_ with the given _options_. The contents of the _uri_
|
59
89
|
# will be returned.
|
data/lib/gscraper/has_pages.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
module GScraper
|
24
|
+
module HasPages
|
25
|
+
include Enumerable
|
26
|
+
|
27
|
+
#
|
28
|
+
# Returns the first page.
|
29
|
+
#
|
30
|
+
def first_page
|
31
|
+
page_cache[1]
|
32
|
+
end
|
33
|
+
|
34
|
+
#
|
35
|
+
# Returns the page at the specified _index_.
|
36
|
+
#
|
37
|
+
def [](index)
|
38
|
+
page_cache[index]
|
39
|
+
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# Returns the pages with the specified _indices_.
|
43
|
+
#
|
44
|
+
def pages(indices)
|
45
|
+
indices.map { |index| page_cache[index] }
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Iterates over the pages with the specified _indices_, passing each
|
50
|
+
# to the specified _block_.
|
51
|
+
#
|
52
|
+
def each_page(indices,&block)
|
53
|
+
indices.map { |index| block.call(page_cache[index]) }
|
54
|
+
end
|
55
|
+
|
56
|
+
#
|
57
|
+
# Iterates over all the pages of the query, passing each to the
|
58
|
+
# specified _block_.
|
59
|
+
#
|
60
|
+
def each(&block)
|
61
|
+
index = 1
|
62
|
+
|
63
|
+
until ((next_page = page_cache[index]).empty?) do
|
64
|
+
block.call(next_page)
|
65
|
+
index = index + 1
|
66
|
+
end
|
67
|
+
|
68
|
+
return self
|
69
|
+
end
|
70
|
+
|
71
|
+
#
|
72
|
+
# Iterates over the elements on the page with the specified _index_,
|
73
|
+
# passing each element to the specified _block_.
|
74
|
+
#
|
75
|
+
def each_on_page(index,&block)
|
76
|
+
page_cache[index].each(&block)
|
77
|
+
end
|
78
|
+
|
79
|
+
#
|
80
|
+
# Iterates over each element on the pages with the specified _indices_,
|
81
|
+
# passing each element to the specified _block_.
|
82
|
+
#
|
83
|
+
def each_on_pages(indices,&block)
|
84
|
+
each_page(indices) { |page| page.each(&block) }
|
85
|
+
end
|
86
|
+
|
87
|
+
protected
|
88
|
+
|
89
|
+
#
|
90
|
+
# Returns the page index for the specified result _rank_.
|
91
|
+
#
|
92
|
+
def page_index_of(rank)
|
93
|
+
(((rank.to_i - 1) / results_per_page.to_i) + 1)
|
94
|
+
end
|
95
|
+
|
96
|
+
#
|
97
|
+
# Returns the rank offset for the specified _page_index_.
|
98
|
+
#
|
99
|
+
def result_offset_of(page_index)
|
100
|
+
((page_index.to_i - 1) * results_per_page.to_i)
|
101
|
+
end
|
102
|
+
|
103
|
+
#
|
104
|
+
# Returns the in-page index of the specified result _rank_.
|
105
|
+
#
|
106
|
+
def result_index_of(rank)
|
107
|
+
((rank.to_i - 1) % results_per_page.to_i)
|
108
|
+
end
|
109
|
+
|
110
|
+
def page_cache
|
111
|
+
@page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|