gscraper 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING.txt +339 -0
- data/History.txt +21 -0
- data/Manifest.txt +23 -10
- data/README.txt +17 -21
- data/Rakefile +3 -6
- data/lib/gscraper.rb +22 -0
- data/lib/gscraper/extensions.rb +22 -0
- data/lib/gscraper/extensions/uri.rb +22 -0
- data/lib/gscraper/extensions/uri/http.rb +25 -71
- data/lib/gscraper/extensions/uri/query_params.rb +96 -0
- data/lib/gscraper/gscraper.rb +30 -0
- data/lib/gscraper/has_pages.rb +114 -0
- data/lib/gscraper/licenses.rb +22 -0
- data/lib/gscraper/page.rb +64 -0
- data/lib/gscraper/search.rb +24 -0
- data/lib/gscraper/search/ajax_query.rb +176 -0
- data/lib/gscraper/search/page.rb +27 -72
- data/lib/gscraper/search/query.rb +46 -457
- data/lib/gscraper/search/result.rb +32 -29
- data/lib/gscraper/search/search.rb +44 -3
- data/lib/gscraper/search/web_query.rb +472 -0
- data/lib/gscraper/sponsored_ad.rb +26 -2
- data/lib/gscraper/sponsored_links.rb +77 -8
- data/lib/gscraper/version.rb +23 -1
- data/spec/extensions/uri/http_spec.rb +9 -0
- data/spec/extensions/uri/query_params_spec.rb +38 -0
- data/spec/gscraper_spec.rb +29 -0
- data/spec/has_pages_examples.rb +19 -0
- data/spec/has_sponsored_links_examples.rb +57 -0
- data/spec/helpers/query.rb +1 -0
- data/spec/helpers/uri.rb +8 -0
- data/spec/page_has_results_examples.rb +13 -0
- data/spec/search/ajax_query_spec.rb +124 -0
- data/spec/search/page_has_results_examples.rb +51 -0
- data/spec/search/query_spec.rb +103 -0
- data/spec/search/web_query_spec.rb +74 -0
- data/spec/spec_helper.rb +6 -0
- data/tasks/spec.rb +7 -0
- metadata +34 -20
- data/LICENSE.txt +0 -23
- data/lib/gscraper/web_agent.rb +0 -38
- data/test/search/page_results.rb +0 -103
- data/test/search/query_from_url.rb +0 -50
- data/test/search/query_pages.rb +0 -32
- data/test/search/query_result.rb +0 -30
- data/test/test_gscraper.rb +0 -4
data/Rakefile
CHANGED
@@ -2,16 +2,13 @@
|
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'hoe'
|
5
|
+
|
6
|
+
require './tasks/spec.rb'
|
5
7
|
require './lib/gscraper/version.rb'
|
6
8
|
|
7
9
|
Hoe.new('gscraper', GScraper::VERSION) do |p|
|
8
10
|
p.rubyforge_name = 'gscraper'
|
9
|
-
p.
|
10
|
-
p.email = 'postmodern.mod3@gmail.com'
|
11
|
-
p.summary = 'A ruby web-scraping interface to various Google Services'
|
12
|
-
p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
|
13
|
-
p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
|
14
|
-
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
11
|
+
p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
|
15
12
|
p.extra_deps = ['hpricot', 'mechanize']
|
16
13
|
end
|
17
14
|
|
data/lib/gscraper.rb
CHANGED
@@ -1,2 +1,24 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/search'
|
2
24
|
require 'gscraper/version'
|
data/lib/gscraper/extensions.rb
CHANGED
@@ -1 +1,23 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/extensions/uri'
|
@@ -1 +1,23 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/extensions/uri/http'
|
@@ -1,79 +1,33 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'gscraper/extensions/uri/query_params'
|
24
|
+
|
1
25
|
require 'uri/http'
|
2
26
|
|
3
27
|
module URI
|
4
28
|
class HTTP < Generic
|
5
29
|
|
6
|
-
|
7
|
-
attr_reader :query_params
|
8
|
-
|
9
|
-
alias_method :old_initialize, :initialize
|
10
|
-
|
11
|
-
#
|
12
|
-
# Creates a new URI::HTTP object and initializes query_params as a
|
13
|
-
# new Hash.
|
14
|
-
#
|
15
|
-
def initialize(*args)
|
16
|
-
old_initialize(*args)
|
17
|
-
|
18
|
-
@query_params = {}
|
19
|
-
parse_query_params
|
20
|
-
end
|
21
|
-
|
22
|
-
#
|
23
|
-
# Sets the query data and updates query_params.
|
24
|
-
#
|
25
|
-
def query=(query_str)
|
26
|
-
new_query = super(query_str)
|
27
|
-
parse_query_params
|
28
|
-
return new_query
|
29
|
-
end
|
30
|
-
|
31
|
-
protected
|
32
|
-
|
33
|
-
#
|
34
|
-
# Parses the query parameters from the query data, populating
|
35
|
-
# query_params with the parsed parameters.
|
36
|
-
#
|
37
|
-
def parse_query_params
|
38
|
-
@query_params.clear
|
39
|
-
|
40
|
-
if @query
|
41
|
-
@query.split('&').each do |param|
|
42
|
-
name, value = param.split('=')
|
43
|
-
|
44
|
-
if value
|
45
|
-
@query_params[name] = URI.decode(value)
|
46
|
-
else
|
47
|
-
@query_params[name] = nil
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
private
|
54
|
-
|
55
|
-
# :nodoc
|
56
|
-
def path_query
|
57
|
-
str = @path
|
58
|
-
|
59
|
-
unless @query_params.empty?
|
60
|
-
str += '?' + @query_params.to_a.map { |name,value|
|
61
|
-
if value==true
|
62
|
-
"#{name}=active"
|
63
|
-
elsif value
|
64
|
-
if value.kind_of?(Array)
|
65
|
-
"#{name}=#{URI.encode(value.join(' '))}"
|
66
|
-
else
|
67
|
-
"#{name}=#{URI.encode(value.to_s)}"
|
68
|
-
end
|
69
|
-
else
|
70
|
-
"#{name}="
|
71
|
-
end
|
72
|
-
}.join('&')
|
73
|
-
end
|
74
|
-
|
75
|
-
return str
|
76
|
-
end
|
30
|
+
include QueryParams
|
77
31
|
|
78
32
|
end
|
79
33
|
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
module URI
  #
  # Mixin that keeps a Hash of query parameters (+query_params+) in step
  # with the raw query string of the object it is included into.
  #
  # The module relies on the including class to provide +initialize+ and
  # +query=+ further up its ancestor chain (both are reached via +super+),
  # and it reads the <tt>@query</tt> and <tt>@path</tt> instance variables.
  #
  module QueryParams
    # Query parameters
    attr_reader :query_params

    #
    # Creates a new URI object and initializes query_params as a new Hash
    # populated from the query string.
    #
    def initialize(*args)
      # Created before calling super so the Hash already exists if the
      # parent initializer ends up invoking query= (presumably the reason
      # for this ordering; the parent class is not visible here).
      @query_params = {}

      super(*args)

      parse_query_params
    end

    #
    # Sets the query data and updates query_params. Returns whatever the
    # parent implementation of query= returned.
    #
    def query=(query_str)
      new_query = super(query_str)
      parse_query_params
      return new_query
    end

    protected

    #
    # Parses the query parameters from the query data, populating
    # query_params with the parsed parameters.
    #
    # Parameters without a value (<tt>name</tt> or <tt>name=</tt>) are
    # stored as +nil+. Empty segments (as in <tt>a=1&&b=2</tt>) are
    # ignored.
    #
    def parse_query_params
      @query_params.clear

      return unless @query

      @query.split('&').each do |param|
        # An empty segment would otherwise be stored under a nil key.
        next if param.empty?

        # Split on the first '=' only, so a value such as "x=y" is kept
        # whole instead of being cut off at its own '='.
        name, value = param.split('=', 2)

        if value.nil? || value.empty?
          @query_params[name] = nil
        else
          @query_params[name] = decode_query_value(value)
        end
      end
    end

    private

    # :nodoc
    #
    # Builds the path followed by the query string generated from
    # query_params:
    #
    # * +true+ is written as <tt>name=active</tt>
    # * an Array is joined with spaces and then escaped
    # * any other truthy value is converted with +to_s+ and escaped
    # * +nil+ / +false+ are written as <tt>name=</tt>
    #
    def path_query
      str = @path

      unless @query_params.empty?
        pairs = @query_params.map do |name, value|
          if value == true
            "#{name}=active"
          elsif value
            text = value.kind_of?(Array) ? value.join(' ') : value.to_s

            "#{name}=#{encode_query_value(text)}"
          else
            "#{name}="
          end
        end

        str += '?' + pairs.join('&')
      end

      return str
    end

    #
    # Percent-decodes a raw query value.
    #
    # A literal '+' is protected so that it stays a '+', as it did with
    # the URI.decode call this replaces (URI.decode no longer exists in
    # Ruby 3.0 and later). Values containing a malformed escape sequence
    # are returned unchanged instead of raising.
    #
    def decode_query_value(text)
      URI.decode_www_form_component(text.gsub('+', '%2B'))
    rescue ArgumentError
      text
    end

    #
    # Percent-encodes a query value so that characters such as '&', '='
    # and '+' cannot break the generated query string. Spaces are written
    # as %20; every '+' produced by the encoder stands for a space, since
    # a literal '+' is emitted as %2B.
    #
    def encode_query_value(text)
      URI.encode_www_form_component(text).gsub('+', '%20')
    end
  end
end
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'uri/http'
|
2
24
|
require 'mechanize'
|
3
25
|
require 'open-uri'
|
@@ -54,6 +76,14 @@ module GScraper
|
|
54
76
|
@@gscraper_user_agent = agent
|
55
77
|
end
|
56
78
|
|
79
|
+
#
# Assigns the GScraper User-Agent by looking up the user-agent alias
# _name_ (converted to a String) in GScraper.user_agent_aliases.
#
def GScraper.user_agent_alias=(name)
  alias_key = name.to_s

  @@gscraper_user_agent = GScraper.user_agent_aliases[alias_key]
end
|
86
|
+
|
57
87
|
#
|
58
88
|
# Opens the _uri_ with the given _options_. The contents of the _uri_
|
59
89
|
# will be returned.
|
@@ -0,0 +1,114 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
module GScraper
  #
  # Mixin providing cached, Enumerable access to numbered pages.
  #
  # This module calls +page(index)+ and +results_per_page+ but does not
  # define them; the including class has to supply both. Pages are
  # requested through +page+ the first time an index is used and are kept
  # in an internal cache afterwards.
  #
  module HasPages
    include Enumerable

    #
    # Returns the first page.
    #
    def first_page
      page_cache[1]
    end

    #
    # Returns the page at the specified _index_.
    #
    def [](index)
      page_cache[index]
    end

    #
    # Returns the pages with the specified _indices_.
    #
    def pages(indices)
      indices.map { |index| page_cache[index] }
    end

    #
    # Iterates over the pages with the specified _indices_, passing each
    # to the specified _block_. Returns an Array holding the result of
    # each block call. If no _block_ is given, an Enumerator is returned
    # instead.
    #
    def each_page(indices,&block)
      return enum_for(:each_page, indices) unless block

      indices.map { |index| block.call(page_cache[index]) }
    end

    #
    # Iterates over all the pages of the query, passing each to the
    # specified _block_. Iteration starts at page 1 and stops at the first
    # page that reports itself as empty. Returns +self+, or an Enumerator
    # if no _block_ is given.
    #
    def each(&block)
      return enum_for(:each) unless block

      index = 1

      until ((next_page = page_cache[index]).empty?) do
        block.call(next_page)
        index = index + 1
      end

      return self
    end

    #
    # Iterates over the elements on the page with the specified _index_,
    # passing each element to the specified _block_.
    #
    def each_on_page(index,&block)
      page_cache[index].each(&block)
    end

    #
    # Iterates over each element on the pages with the specified _indices_,
    # passing each element to the specified _block_.
    #
    def each_on_pages(indices,&block)
      each_page(indices) { |current| current.each(&block) }
    end

    protected

    #
    # Returns the page index for the specified result _rank_. Ranks and
    # page indices both start at 1.
    #
    def page_index_of(rank)
      (((rank.to_i - 1) / results_per_page.to_i) + 1)
    end

    #
    # Returns the rank offset for the specified _page_index_.
    #
    def result_offset_of(page_index)
      ((page_index.to_i - 1) * results_per_page.to_i)
    end

    #
    # Returns the in-page index of the specified result _rank_.
    #
    def result_index_of(rank)
      ((rank.to_i - 1) % results_per_page.to_i)
    end

    #
    # Returns the Hash used to cache pages. A missing key is filled in by
    # calling +page+ with the key converted to an Integer, and the result
    # is stored under the original key.
    #
    def page_cache
      @page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
    end
  end
end
|