gscraper 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
data/.specopts
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
--colour --format specdoc
|
data/Gemfile
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
source 'https://rubygems.org'
|
2
|
-
|
3
|
-
group(:runtime) do
|
4
|
-
gem 'json_pure', '~> 1.4.0'
|
5
|
-
gem 'mechanize', '~> 1.0.0'
|
6
|
-
end
|
7
|
-
|
8
|
-
group(:development) do
|
9
|
-
gem 'bundler', '~> 0.9.19'
|
10
|
-
gem 'rake', '~> 0.8.7'
|
11
|
-
gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
|
12
|
-
end
|
13
|
-
|
14
|
-
group(:doc) do
|
15
|
-
case RUBY_PLATFORM
|
16
|
-
when 'java'
|
17
|
-
gem 'maruku', '~> 0.6.0'
|
18
|
-
else
|
19
|
-
gem 'rdiscount', '~> 1.6.3'
|
20
|
-
end
|
21
|
-
|
22
|
-
gem 'yard', '~> 0.5.3'
|
23
|
-
end
|
24
|
-
|
25
|
-
gem 'rspec', '~> 1.3.0', :group => [:development, :test]
|
data/lib/gscraper/extensions/uri/http.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# GScraper - A web-scraping interface to various Google Services.
|
3
|
-
#
|
4
|
-
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
-
#
|
6
|
-
# This program is free software; you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation; either version 2 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# This program is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with this program; if not, write to the Free Software
|
18
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
-
#
|
20
|
-
|
21
|
-
require 'gscraper/extensions/uri/query_params'
|
22
|
-
|
23
|
-
require 'uri/http'
|
24
|
-
|
25
|
-
module URI
|
26
|
-
class HTTP < Generic
|
27
|
-
|
28
|
-
include QueryParams
|
29
|
-
|
30
|
-
end
|
31
|
-
end
|
data/lib/gscraper/extensions/uri/query_params.rb
DELETED
@@ -1,109 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# GScraper - A web-scraping interface to various Google Services.
|
3
|
-
#
|
4
|
-
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
-
#
|
6
|
-
# This program is free software; you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation; either version 2 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# This program is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with this program; if not, write to the Free Software
|
18
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
-
#
|
20
|
-
|
21
|
-
require 'cgi'
|
22
|
-
|
23
|
-
module URI
|
24
|
-
#
|
25
|
-
# Adds the ability to parse individual parameters from a the query field
|
26
|
-
# of a URI.
|
27
|
-
#
|
28
|
-
module QueryParams
|
29
|
-
# Query parameters
|
30
|
-
attr_reader :query_params
|
31
|
-
|
32
|
-
#
|
33
|
-
# Creates a new URI::HTTP object and initializes query_params as a
|
34
|
-
# new Hash.
|
35
|
-
#
|
36
|
-
def initialize(*args)
|
37
|
-
@query_params = {}
|
38
|
-
|
39
|
-
super(*args)
|
40
|
-
|
41
|
-
parse_query_params
|
42
|
-
end
|
43
|
-
|
44
|
-
#
|
45
|
-
# Sets the query data and updates query_params.
|
46
|
-
#
|
47
|
-
# @param [String] query_str
|
48
|
-
# The new URI query string to use.
|
49
|
-
#
|
50
|
-
# @return [String]
|
51
|
-
# The new URI query string.
|
52
|
-
#
|
53
|
-
# @example
|
54
|
-
# url.query = 'a=1&b=2'
|
55
|
-
# # => "a=1&b=2"
|
56
|
-
#
|
57
|
-
def query=(query_str)
|
58
|
-
new_query = super(query_str)
|
59
|
-
parse_query_params
|
60
|
-
return new_query
|
61
|
-
end
|
62
|
-
|
63
|
-
protected
|
64
|
-
|
65
|
-
#
|
66
|
-
# Parses the query parameters from the query data, populating
|
67
|
-
# query_params with the parsed parameters.
|
68
|
-
#
|
69
|
-
def parse_query_params
|
70
|
-
@query_params.clear
|
71
|
-
|
72
|
-
if @query
|
73
|
-
@query.split('&').each do |param|
|
74
|
-
name, value = param.split('=')
|
75
|
-
|
76
|
-
if value
|
77
|
-
@query_params[name] = URI.decode(value)
|
78
|
-
else
|
79
|
-
@query_params[name] = nil
|
80
|
-
end
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
private
|
86
|
-
|
87
|
-
def path_query
|
88
|
-
str = @path
|
89
|
-
|
90
|
-
unless @query_params.empty?
|
91
|
-
str += '?' + @query_params.to_a.map { |name,value|
|
92
|
-
if value==true
|
93
|
-
"#{name}=active"
|
94
|
-
elsif value
|
95
|
-
if value.kind_of?(Array)
|
96
|
-
"#{name}=#{CGI.escape(value.join(' '))}"
|
97
|
-
else
|
98
|
-
"#{name}=#{CGI.escape(value.to_s)}"
|
99
|
-
end
|
100
|
-
else
|
101
|
-
"#{name}="
|
102
|
-
end
|
103
|
-
}.join('&')
|
104
|
-
end
|
105
|
-
|
106
|
-
return str
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
data/spec/extensions/uri/query_params_spec.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
require 'gscraper/extensions/uri'
|
4
|
-
|
5
|
-
describe "URI::QueryParams" do
|
6
|
-
before(:each) do
|
7
|
-
@uri = URI('http://www.test.com/page.php?x=1&y=one%20two&z')
|
8
|
-
end
|
9
|
-
|
10
|
-
it "should provide #query_params" do
|
11
|
-
@uri.should respond_to(:query_params)
|
12
|
-
end
|
13
|
-
|
14
|
-
it "#query_params should be a Hash" do
|
15
|
-
@uri.query_params.class.should == Hash
|
16
|
-
end
|
17
|
-
|
18
|
-
it "#query_params should contain params" do
|
19
|
-
@uri.query_params.empty?.should == false
|
20
|
-
end
|
21
|
-
|
22
|
-
it "#query_params can contain single-word params" do
|
23
|
-
@uri.query_params['x'].should == '1'
|
24
|
-
end
|
25
|
-
|
26
|
-
it "#query_params can contain multi-word params" do
|
27
|
-
@uri.query_params['y'].should == 'one two'
|
28
|
-
end
|
29
|
-
|
30
|
-
it "#query_params can contain empty params" do
|
31
|
-
@uri.query_params['z'].should be_nil
|
32
|
-
end
|
33
|
-
|
34
|
-
it "should update #query_params along with #query=" do
|
35
|
-
@uri.query = 'u=3'
|
36
|
-
@uri.query_params['u'].should == '3'
|
37
|
-
end
|
38
|
-
|
39
|
-
it "should properly escape query param values" do
|
40
|
-
@uri.query_params['x'] = '1&2'
|
41
|
-
@uri.query_params['y'] = 'one=two'
|
42
|
-
@uri.query_params['z'] = '?'
|
43
|
-
|
44
|
-
@uri.to_s.should == "http://www.test.com/page.php?x=1%262&y=one%3Dtwo&z=%3F"
|
45
|
-
end
|
46
|
-
end
|