gscraper 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
data/.specopts
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
--colour --format specdoc
|
data/Gemfile
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
source 'https://rubygems.org'
|
2
|
-
|
3
|
-
group(:runtime) do
|
4
|
-
gem 'json_pure', '~> 1.4.0'
|
5
|
-
gem 'mechanize', '~> 1.0.0'
|
6
|
-
end
|
7
|
-
|
8
|
-
group(:development) do
|
9
|
-
gem 'bundler', '~> 0.9.19'
|
10
|
-
gem 'rake', '~> 0.8.7'
|
11
|
-
gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
|
12
|
-
end
|
13
|
-
|
14
|
-
group(:doc) do
|
15
|
-
case RUBY_PLATFORM
|
16
|
-
when 'java'
|
17
|
-
gem 'maruku', '~> 0.6.0'
|
18
|
-
else
|
19
|
-
gem 'rdiscount', '~> 1.6.3'
|
20
|
-
end
|
21
|
-
|
22
|
-
gem 'yard', '~> 0.5.3'
|
23
|
-
end
|
24
|
-
|
25
|
-
gem 'rspec', '~> 1.3.0', :group => [:development, :test]
|
data/lib/gscraper/extensions/uri/http.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# GScraper - A web-scraping interface to various Google Services.
|
3
|
-
#
|
4
|
-
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
-
#
|
6
|
-
# This program is free software; you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation; either version 2 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# This program is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with this program; if not, write to the Free Software
|
18
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
-
#
|
20
|
-
|
21
|
-
require 'gscraper/extensions/uri/query_params'
|
22
|
-
|
23
|
-
require 'uri/http'
|
24
|
-
|
25
|
-
module URI
|
26
|
-
class HTTP < Generic
|
27
|
-
|
28
|
-
include QueryParams
|
29
|
-
|
30
|
-
end
|
31
|
-
end
|
data/lib/gscraper/extensions/uri/query_params.rb
DELETED
@@ -1,109 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# GScraper - A web-scraping interface to various Google Services.
|
3
|
-
#
|
4
|
-
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
-
#
|
6
|
-
# This program is free software; you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation; either version 2 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# This program is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with this program; if not, write to the Free Software
|
18
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
|
-
#
|
20
|
-
|
21
|
-
require 'cgi'
|
22
|
-
|
23
|
-
module URI
|
24
|
-
#
|
25
|
-
# Adds the ability to parse individual parameters from a the query field
|
26
|
-
# of a URI.
|
27
|
-
#
|
28
|
-
module QueryParams
|
29
|
-
# Query parameters
|
30
|
-
attr_reader :query_params
|
31
|
-
|
32
|
-
#
|
33
|
-
# Creates a new URI::HTTP object and initializes query_params as a
|
34
|
-
# new Hash.
|
35
|
-
#
|
36
|
-
def initialize(*args)
|
37
|
-
@query_params = {}
|
38
|
-
|
39
|
-
super(*args)
|
40
|
-
|
41
|
-
parse_query_params
|
42
|
-
end
|
43
|
-
|
44
|
-
#
|
45
|
-
# Sets the query data and updates query_params.
|
46
|
-
#
|
47
|
-
# @param [String] query_str
|
48
|
-
# The new URI query string to use.
|
49
|
-
#
|
50
|
-
# @return [String]
|
51
|
-
# The new URI query string.
|
52
|
-
#
|
53
|
-
# @example
|
54
|
-
# url.query = 'a=1&b=2'
|
55
|
-
# # => "a=1&b=2"
|
56
|
-
#
|
57
|
-
def query=(query_str)
|
58
|
-
new_query = super(query_str)
|
59
|
-
parse_query_params
|
60
|
-
return new_query
|
61
|
-
end
|
62
|
-
|
63
|
-
protected
|
64
|
-
|
65
|
-
#
|
66
|
-
# Parses the query parameters from the query data, populating
|
67
|
-
# query_params with the parsed parameters.
|
68
|
-
#
|
69
|
-
def parse_query_params
|
70
|
-
@query_params.clear
|
71
|
-
|
72
|
-
if @query
|
73
|
-
@query.split('&').each do |param|
|
74
|
-
name, value = param.split('=')
|
75
|
-
|
76
|
-
if value
|
77
|
-
@query_params[name] = URI.decode(value)
|
78
|
-
else
|
79
|
-
@query_params[name] = nil
|
80
|
-
end
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
private
|
86
|
-
|
87
|
-
def path_query
|
88
|
-
str = @path
|
89
|
-
|
90
|
-
unless @query_params.empty?
|
91
|
-
str += '?' + @query_params.to_a.map { |name,value|
|
92
|
-
if value==true
|
93
|
-
"#{name}=active"
|
94
|
-
elsif value
|
95
|
-
if value.kind_of?(Array)
|
96
|
-
"#{name}=#{CGI.escape(value.join(' '))}"
|
97
|
-
else
|
98
|
-
"#{name}=#{CGI.escape(value.to_s)}"
|
99
|
-
end
|
100
|
-
else
|
101
|
-
"#{name}="
|
102
|
-
end
|
103
|
-
}.join('&')
|
104
|
-
end
|
105
|
-
|
106
|
-
return str
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
data/spec/extensions/uri/query_params_spec.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
require 'gscraper/extensions/uri'
|
4
|
-
|
5
|
-
describe "URI::QueryParams" do
|
6
|
-
before(:each) do
|
7
|
-
@uri = URI('http://www.test.com/page.php?x=1&y=one%20two&z')
|
8
|
-
end
|
9
|
-
|
10
|
-
it "should provide #query_params" do
|
11
|
-
@uri.should respond_to(:query_params)
|
12
|
-
end
|
13
|
-
|
14
|
-
it "#query_params should be a Hash" do
|
15
|
-
@uri.query_params.class.should == Hash
|
16
|
-
end
|
17
|
-
|
18
|
-
it "#query_params should contain params" do
|
19
|
-
@uri.query_params.empty?.should == false
|
20
|
-
end
|
21
|
-
|
22
|
-
it "#query_params can contain single-word params" do
|
23
|
-
@uri.query_params['x'].should == '1'
|
24
|
-
end
|
25
|
-
|
26
|
-
it "#query_params can contain multi-word params" do
|
27
|
-
@uri.query_params['y'].should == 'one two'
|
28
|
-
end
|
29
|
-
|
30
|
-
it "#query_params can contain empty params" do
|
31
|
-
@uri.query_params['z'].should be_nil
|
32
|
-
end
|
33
|
-
|
34
|
-
it "should update #query_params along with #query=" do
|
35
|
-
@uri.query = 'u=3'
|
36
|
-
@uri.query_params['u'].should == '3'
|
37
|
-
end
|
38
|
-
|
39
|
-
it "should properly escape query param values" do
|
40
|
-
@uri.query_params['x'] = '1&2'
|
41
|
-
@uri.query_params['y'] = 'one=two'
|
42
|
-
@uri.query_params['z'] = '?'
|
43
|
-
|
44
|
-
@uri.to_s.should == "http://www.test.com/page.php?x=1%262&y=one%3Dtwo&z=%3F"
|
45
|
-
end
|
46
|
-
end
|