gscraper 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +33 -21
- data/Manifest.txt +3 -0
- data/README.txt +107 -4
- data/lib/gscraper/gscraper.rb +92 -21
- data/lib/gscraper/licenses.rb +27 -4
- data/lib/gscraper/search/page.rb +9 -11
- data/lib/gscraper/search/query.rb +142 -104
- data/lib/gscraper/search/result.rb +13 -12
- data/lib/gscraper/search/search.rb +3 -3
- data/lib/gscraper/sponsored_ad.rb +35 -0
- data/lib/gscraper/sponsored_links.rb +151 -0
- data/lib/gscraper/version.rb +1 -1
- data/lib/gscraper/web_agent.rb +38 -0
- data/test/search/query_result.rb +1 -1
- data/test/test_gscraper.rb +1 -4
- metadata +73 -63
data/History.txt
CHANGED
@@ -1,39 +1,51 @@
|
|
1
|
+
== 0.1.6 / 2008-03-15
|
2
|
+
|
3
|
+
* Renamed GScraper.http_agent to GScraper.web_agent.
|
4
|
+
* Added GScraper.proxy for global proxy configuration.
|
5
|
+
* Added the WebAgent module.
|
6
|
+
* Renamed Search::Query#first_result to Search::Query#top_result.
|
7
|
+
* Updated Search::Query#page logic for the new DOM layout being used.
|
8
|
+
* Added support for Sponsored Ad scraping.
|
9
|
+
* Added the methods Query#sponsored_links and Query#top_sponsored_link.
|
10
|
+
* Added examples to README.txt.
|
11
|
+
|
1
12
|
== 0.1.5 / 2007-12-29
|
2
13
|
|
3
|
-
|
14
|
+
* Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
|
15
|
+
sanitybit.
|
4
16
|
|
5
17
|
== 0.1.4 / 2007-12-23
|
6
18
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
+
* Added Search::Query#result_at for easier access of a single result at
|
20
|
+
a given index.
|
21
|
+
* Adding scraping of the "Cached" and "Similar Pages" URLs of Search
|
22
|
+
Results.
|
23
|
+
* Added methods to Search::Page for accessing cached URLs, cached pages,
|
24
|
+
similar query URLs and similar Queries in mass.
|
25
|
+
* Search::Query#page and Search::Query#first_page now can receive blocks.
|
26
|
+
* Improved the formatting of URL query parameters.
|
27
|
+
* Added more unit-tests.
|
28
|
+
* Fixed scraping of Search Result summaries.
|
29
|
+
* Fixed various bugs in Search::Query uncovered during unit-testing.
|
30
|
+
* Fixed typos in Search::Page's documentation.
|
19
31
|
|
20
32
|
== 0.1.3 / 2007-12-22
|
21
33
|
|
22
|
-
|
23
|
-
|
34
|
+
* Added the Search::Page class, which contains many convenience methods
|
35
|
+
for searching through the results within a Page.
|
24
36
|
|
25
37
|
== 0.1.2 / 2007-12-22
|
26
38
|
|
27
|
-
|
28
|
-
|
29
|
-
|
39
|
+
* Fixed a bug related to extracting the correct content-rights from search
|
40
|
+
query URLs.
|
41
|
+
* Added GScraper.user_agent_aliases.
|
30
42
|
|
31
43
|
== 0.1.1 / 2007-12-21
|
32
44
|
|
33
|
-
|
45
|
+
* Forgot to include lib/gscraper/version.rb.
|
34
46
|
|
35
47
|
== 0.1.0 / 2007-12-20
|
36
48
|
|
37
|
-
|
38
|
-
|
49
|
+
* Initial release.
|
50
|
+
* Supports the Google Search service.
|
39
51
|
|
data/Manifest.txt
CHANGED
@@ -6,10 +6,13 @@ Rakefile
|
|
6
6
|
lib/gscraper.rb
|
7
7
|
lib/gscraper/version.rb
|
8
8
|
lib/gscraper/gscraper.rb
|
9
|
+
lib/gscraper/web_agent.rb
|
9
10
|
lib/gscraper/extensions/uri/http.rb
|
10
11
|
lib/gscraper/extensions/uri.rb
|
11
12
|
lib/gscraper/extensions.rb
|
12
13
|
lib/gscraper/licenses.rb
|
14
|
+
lib/gscraper/sponsored_ad.rb
|
15
|
+
lib/gscraper/sponsored_links.rb
|
13
16
|
lib/gscraper/search/result.rb
|
14
17
|
lib/gscraper/search/page.rb
|
15
18
|
lib/gscraper/search/query.rb
|
data/README.txt
CHANGED
@@ -8,17 +8,120 @@ GScraper is a web-scraping interface to various Google Services.
|
|
8
8
|
|
9
9
|
== FEATURES/PROBLEMS:
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
* Supports the Google Search service.
|
12
|
+
* Provides access to search results and ranks.
|
13
|
+
* Provides access to the Sponsored Links.
|
14
|
+
* Provides HTTP access with custom User-Agent strings.
|
15
|
+
* Provides proxy settings for HTTP access.
|
13
16
|
|
14
17
|
== REQUIREMENTS:
|
15
18
|
|
16
19
|
* Hpricot
|
17
|
-
* Mechanize
|
20
|
+
* WWW::Mechanize
|
18
21
|
|
19
22
|
== INSTALL:
|
20
23
|
|
21
|
-
sudo gem install gscraper
|
24
|
+
$ sudo gem install gscraper
|
25
|
+
|
26
|
+
== EXAMPLES:
|
27
|
+
|
28
|
+
* Basic query:
|
29
|
+
|
30
|
+
q = GScraper::Search.query(:query => 'ruby')
|
31
|
+
|
32
|
+
* Advanced query:
|
33
|
+
|
34
|
+
q = GScraper::Search.query(:query => 'ruby') do |q|
|
35
|
+
q.without_words = 'is'
|
36
|
+
q.within_past_day = true
|
37
|
+
q.numeric_range = 2..10
|
38
|
+
end
|
39
|
+
|
40
|
+
* Queries from URLs:
|
41
|
+
|
42
|
+
q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
|
43
|
+
|
44
|
+
q.query # => "ruby"
|
45
|
+
q.with_words # => "rails"
|
46
|
+
q.occurrs_within # => :title
|
47
|
+
q.rights # => :cc_by_nc
|
48
|
+
|
49
|
+
* Getting the search results:
|
50
|
+
|
51
|
+
q.first_page.select do |result|
|
52
|
+
result.title =~ /Blog/
|
53
|
+
end
|
54
|
+
|
55
|
+
q.page(2).map do |result|
|
56
|
+
result.title.reverse
|
57
|
+
end
|
58
|
+
|
59
|
+
q.result_at(25) # => Result
|
60
|
+
|
61
|
+
q.top_result # => Result
|
62
|
+
|
63
|
+
* A Result object contains the rank, title, summary, cached URL, similar
|
64
|
+
query URL and link URL of the search result.
|
65
|
+
|
66
|
+
page = q.page(2)
|
67
|
+
|
68
|
+
page.urls # => [...]
|
69
|
+
page.summaries # => [...]
|
70
|
+
page.ranks_of { |result| result.url =~ /^https/ } # => [...]
|
71
|
+
page.titles_of { |result| result.summary =~ /password/ } # => [...]
|
72
|
+
page.cached_pages # => [...]
|
73
|
+
page.similar_queries # => [...]
|
74
|
+
|
75
|
+
* Iterating over the search results:
|
76
|
+
|
77
|
+
q.each_on_page(2) do |result|
|
78
|
+
puts result.title
|
79
|
+
end
|
80
|
+
|
81
|
+
page.each do |result|
|
82
|
+
puts result.url
|
83
|
+
end
|
84
|
+
|
85
|
+
* Iterating over the data within the search results:
|
86
|
+
|
87
|
+
page.each_title do |title|
|
88
|
+
puts title
|
89
|
+
end
|
90
|
+
|
91
|
+
page.each_summary do |text|
|
92
|
+
puts text
|
93
|
+
end
|
94
|
+
|
95
|
+
* Selecting search results:
|
96
|
+
|
97
|
+
page.results_with do |result|
|
98
|
+
((result.rank > 2) && (result.rank < 10))
|
99
|
+
end
|
100
|
+
|
101
|
+
page.results_with_title(/Ruby/i) # => [...]
|
102
|
+
|
103
|
+
* Selecting data within the search results:
|
104
|
+
|
105
|
+
page.titles # => [...]
|
106
|
+
|
107
|
+
page.summaries # => [...]
|
108
|
+
|
109
|
+
* Selecting the data of search results based on the search result:
|
110
|
+
|
111
|
+
page.urls_of do |result|
|
112
|
+
result.description.length > 10
|
113
|
+
end
|
114
|
+
|
115
|
+
* Selecting the Sponsored Links of a Query:
|
116
|
+
|
117
|
+
q.sponsored_links # => [...]
|
118
|
+
|
119
|
+
q.top_sponsored_link # => SponsoredAd
|
120
|
+
|
121
|
+
* Setting the User-Agent globally:
|
122
|
+
|
123
|
+
GScraper.user_agent # => nil
|
124
|
+
GScraper.user_agent = 'Awesome Browser v1.2'
|
22
125
|
|
23
126
|
== LICENSE:
|
24
127
|
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -1,7 +1,38 @@
|
|
1
|
+
require 'uri/http'
|
1
2
|
require 'mechanize'
|
2
3
|
require 'open-uri'
|
3
4
|
|
4
5
|
module GScraper
|
6
|
+
# Common proxy port.
|
7
|
+
COMMON_PROXY_PORT = 8080
|
8
|
+
|
9
|
+
#
|
10
|
+
# Returns the +Hash+ of proxy information.
|
11
|
+
#
|
12
|
+
def GScraper.proxy
|
13
|
+
@@gscraper_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
|
14
|
+
end
|
15
|
+
|
16
|
+
#
|
17
|
+
# Creates a HTTP URI based from the given _proxy_info_ hash. The
|
18
|
+
# _proxy_info_ hash defaults to Web.proxy, if not given.
|
19
|
+
#
|
20
|
+
# _proxy_info_ may contain the following keys:
|
21
|
+
# <tt>:host</tt>:: The proxy host.
|
22
|
+
# <tt>:port</tt>:: The proxy port. Defaults to COMMON_PROXY_PORT,
|
23
|
+
# if not specified.
|
24
|
+
# <tt>:user</tt>:: The user-name to login as.
|
25
|
+
# <tt>:password</tt>:: The password to login with.
|
26
|
+
#
|
27
|
+
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
|
28
|
+
if GScraper.proxy[:host]
|
29
|
+
return URI::HTTP.build(:host => GScraper.proxy[:host],
|
30
|
+
:port => GScraper.proxy[:port],
|
31
|
+
:userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
|
32
|
+
:path => '/')
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
5
36
|
#
|
6
37
|
# Returns the supported GScraper User-Agent Aliases.
|
7
38
|
#
|
@@ -13,58 +44,98 @@ module GScraper
|
|
13
44
|
# Returns the GScraper User-Agent
|
14
45
|
#
|
15
46
|
def GScraper.user_agent
|
16
|
-
|
47
|
+
@@gscraper_user_agent ||= GScraper.user_agent_aliases['Windows IE 6']
|
17
48
|
end
|
18
49
|
|
19
50
|
#
|
20
51
|
# Sets the GScraper User-Agent to the specified _agent_.
|
21
52
|
#
|
22
53
|
def GScraper.user_agent=(agent)
|
23
|
-
|
54
|
+
@@gscraper_user_agent = agent
|
24
55
|
end
|
25
56
|
|
26
57
|
#
|
27
|
-
# Opens the _uri_ with the given
|
28
|
-
# returned.
|
58
|
+
# Opens the _uri_ with the given _options_. The contents of the _uri_
|
59
|
+
# will be returned.
|
60
|
+
#
|
61
|
+
# _options_ may contain the following keys:
|
62
|
+
# <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
|
63
|
+
# <tt>:user_agent</tt>:: The User-Agent String to use.
|
64
|
+
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
65
|
+
# contain the following keys:
|
66
|
+
# <tt>:host</tt>:: The proxy host.
|
67
|
+
# <tt>:port</tt>:: The proxy port.
|
68
|
+
# <tt>:user</tt>:: The user-name to login as.
|
69
|
+
# <tt>:password</tt>:: The password to login with.
|
29
70
|
#
|
30
|
-
# GScraper.
|
71
|
+
# GScraper.open_uri('http://www.hackety.org/')
|
31
72
|
#
|
32
|
-
# GScraper.
|
73
|
+
# GScraper.open_uri('http://tenderlovemaking.com/',
|
33
74
|
# :user_agent_alias => 'Linux Mozilla')
|
34
|
-
# GScraper.
|
75
|
+
# GScraper.open_uri('http://www.wired.com/',
|
76
|
+
# :user_agent => 'the future')
|
35
77
|
#
|
36
|
-
def GScraper.
|
78
|
+
def GScraper.open_uri(uri,options={})
|
37
79
|
headers = {}
|
38
80
|
|
39
|
-
if
|
40
|
-
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[
|
41
|
-
elsif
|
42
|
-
headers['User-Agent'] =
|
81
|
+
if options[:user_agent_alias]
|
82
|
+
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[options[:user_agent_alias]]
|
83
|
+
elsif options[:user_agent]
|
84
|
+
headers['User-Agent'] = options[:user_agent]
|
43
85
|
elsif GScraper.user_agent
|
44
86
|
headers['User-Agent'] = GScraper.user_agent
|
45
87
|
end
|
46
88
|
|
89
|
+
proxy = (options[:proxy] || GScraper.proxy)
|
90
|
+
if proxy[:host]
|
91
|
+
headers[:proxy] = GScraper.proxy_uri(proxy)
|
92
|
+
end
|
93
|
+
|
47
94
|
return Kernel.open(uri,headers)
|
48
95
|
end
|
49
96
|
|
50
97
|
#
|
51
|
-
#
|
98
|
+
# Similar to GScraper.open_uri but returns an Hpricot document.
|
99
|
+
#
|
100
|
+
def GScraper.open_page(uri,options={})
|
101
|
+
Hpricot(GScraper.open_uri(uri,options))
|
102
|
+
end
|
103
|
+
|
104
|
+
#
|
105
|
+
# Creates a new WWW::Mechanize agent with the given _options_.
|
106
|
+
#
|
107
|
+
# _options_ may contain the following keys:
|
108
|
+
# <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
|
109
|
+
# <tt>:user_agent</tt>:: The User-Agent string to use.
|
110
|
+
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
111
|
+
# contain the following keys:
|
112
|
+
# <tt>:host</tt>:: The proxy host.
|
113
|
+
# <tt>:port</tt>:: The proxy port.
|
114
|
+
# <tt>:user</tt>:: The user-name to login as.
|
115
|
+
# <tt>:password</tt>:: The password to login with.
|
116
|
+
#
|
117
|
+
# GScraper.web_agent
|
52
118
|
#
|
53
|
-
# GScraper.
|
54
|
-
# GScraper.
|
55
|
-
# GScraper.http_agent(:user_agent => 'wooden pants')
|
119
|
+
# GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
|
120
|
+
# GScraper.web_agent(:user_agent => 'Google Bot')
|
56
121
|
#
|
57
|
-
def GScraper.
|
122
|
+
def GScraper.web_agent(options={},&block)
|
58
123
|
agent = WWW::Mechanize.new
|
59
124
|
|
60
|
-
if
|
61
|
-
agent.user_agent_alias =
|
62
|
-
elsif
|
63
|
-
agent.user_agent =
|
125
|
+
if options[:user_agent_alias]
|
126
|
+
agent.user_agent_alias = options[:user_agent_alias]
|
127
|
+
elsif options[:user_agent]
|
128
|
+
agent.user_agent = options[:user_agent]
|
64
129
|
elsif GScraper.user_agent
|
65
130
|
agent.user_agent = GScraper.user_agent
|
66
131
|
end
|
67
132
|
|
133
|
+
proxy = (options[:proxy] || GScraper.proxy)
|
134
|
+
if proxy[:host]
|
135
|
+
agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
|
136
|
+
end
|
137
|
+
|
138
|
+
block.call(agent) if block
|
68
139
|
return agent
|
69
140
|
end
|
70
141
|
end
|
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,55 +1,78 @@
|
|
1
1
|
module GScraper
|
2
2
|
module Licenses
|
3
|
+
# Any desired license
|
3
4
|
ANY = nil
|
4
5
|
|
6
|
+
# Aladdin license
|
5
7
|
ALADDIN = :aladdin
|
6
8
|
|
9
|
+
# Artistic license
|
7
10
|
ARTISTIC = :artistic
|
8
11
|
|
12
|
+
# Apache license
|
9
13
|
APACHE = :apache
|
10
14
|
|
15
|
+
# Apple license
|
11
16
|
APPLE = :apple
|
12
17
|
|
18
|
+
# BSD license
|
13
19
|
BSD = :bsd
|
14
20
|
|
21
|
+
# Common public license
|
15
22
|
COMMON_PUBLIC = :cpl
|
16
23
|
|
24
|
+
# Creative Commons By-Attribution license
|
17
25
|
CC_BY = :cc_by
|
18
26
|
|
27
|
+
# Creative Commons By-Attribution-Share-Alike license
|
19
28
|
CC_BY_SA = :cc_by_sa
|
20
29
|
|
30
|
+
# Creative Commons By-Attribution-No-Derivative license
|
21
31
|
CC_BY_ND = :cc_by_nd
|
22
32
|
|
33
|
+
# Creative Commons By-Attribution-Noncommercial-Share-Alike license
|
23
34
|
CC_BY_NC = :cc_by_nc_sa
|
24
35
|
|
25
|
-
|
26
|
-
|
27
|
-
CC_BY_NC_SA = :cc_by_nc_sa
|
36
|
+
# Creative Commons By-Attribution-No-Derivative-Share-Alike license
|
37
|
+
CC_BY_ND_SA = :cc_by_nd_sa
|
28
38
|
|
39
|
+
# Creative Commons By-Attribution-Noncommercial-No-Derivative license
|
29
40
|
CC_BY_NC_ND = :cc_by_nc_nd
|
30
41
|
|
42
|
+
# GNU General Public license
|
31
43
|
GPL = :gpl
|
32
44
|
|
45
|
+
# GNU Lesser General Public license
|
33
46
|
LGPL = :lgpl
|
34
47
|
|
48
|
+
# Historical Permission Notice and Disclaimer license
|
35
49
|
HISTORICAL = :disclaimer
|
36
50
|
|
51
|
+
# IBM Public license
|
37
52
|
IBM_PUBLIC = :ibm
|
38
53
|
|
54
|
+
# Lucent Public license
|
39
55
|
LUCENT_PUBLIC = :lucent
|
40
56
|
|
57
|
+
# MIT license
|
41
58
|
MIT = :mit
|
42
59
|
|
43
|
-
|
60
|
+
# Mozilla Public license
|
61
|
+
MOZILLA_PUBLIC = :mozilla
|
44
62
|
|
63
|
+
# NASA OSA license
|
45
64
|
NASA_OSA = :nasa
|
46
65
|
|
66
|
+
# Python license
|
47
67
|
PYTHON = :python
|
48
68
|
|
69
|
+
# Q Public license
|
49
70
|
Q_PUBLIC = :qpl
|
50
71
|
|
72
|
+
# Sleepycat license
|
51
73
|
SLEEPYCAT = :sleepycat
|
52
74
|
|
75
|
+
# Zope Public license
|
53
76
|
ZOPE_PUBLIC = :zope
|
54
77
|
|
55
78
|
end
|