gscraper 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +33 -21
- data/Manifest.txt +3 -0
- data/README.txt +107 -4
- data/lib/gscraper/gscraper.rb +92 -21
- data/lib/gscraper/licenses.rb +27 -4
- data/lib/gscraper/search/page.rb +9 -11
- data/lib/gscraper/search/query.rb +142 -104
- data/lib/gscraper/search/result.rb +13 -12
- data/lib/gscraper/search/search.rb +3 -3
- data/lib/gscraper/sponsored_ad.rb +35 -0
- data/lib/gscraper/sponsored_links.rb +151 -0
- data/lib/gscraper/version.rb +1 -1
- data/lib/gscraper/web_agent.rb +38 -0
- data/test/search/query_result.rb +1 -1
- data/test/test_gscraper.rb +1 -4
- metadata +73 -63
data/History.txt
CHANGED
@@ -1,39 +1,51 @@
|
|
1
|
+
== 0.1.6 / 2008-03-15
|
2
|
+
|
3
|
+
* Renamed GScraper.http_agent to GScraper.web_agent.
|
4
|
+
* Added GScraper.proxy for global proxy configuration.
|
5
|
+
* Added the WebAgent module.
|
6
|
+
* Renamed Search::Query#first_result to Search::Query#top_result.
|
7
|
+
* Updated Search::Query#page logic for the new DOM layout being used.
|
8
|
+
* Added support for Sponsored Ad scraping.
|
9
|
+
* Added the methods Query#sponsored_links and Query#top_sponsored_link.
|
10
|
+
* Added examples to README.txt.
|
11
|
+
|
1
12
|
== 0.1.5 / 2007-12-29
|
2
13
|
|
3
|
-
|
14
|
+
* Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
|
15
|
+
sanitybit.
|
4
16
|
|
5
17
|
== 0.1.4 / 2007-12-23
|
6
18
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
+
* Added Search::Query#result_at for easier access of a single result at
|
20
|
+
a given index.
|
21
|
+
* Adding scraping of the "Cached" and "Similar Pages" URLs of Search
|
22
|
+
Results.
|
23
|
+
* Added methods to Search::Page for accessing cached URLs, cached pages,
|
24
|
+
similar query URLs and similar Queries in mass.
|
25
|
+
* Search::Query#page and Search::Query#first_page now can receive blocks.
|
26
|
+
* Improved the formatting of URL query parameters.
|
27
|
+
* Added more unit-tests.
|
28
|
+
* Fixed scraping of Search Result summaries.
|
29
|
+
* Fixed various bugs in Search::Query uncovered during unit-testing.
|
30
|
+
* Fixed typos in Search::Page's documentation.
|
19
31
|
|
20
32
|
== 0.1.3 / 2007-12-22
|
21
33
|
|
22
|
-
|
23
|
-
|
34
|
+
* Added the Search::Page class, which contains many convenience methods
|
35
|
+
for searching through the results within a Page.
|
24
36
|
|
25
37
|
== 0.1.2 / 2007-12-22
|
26
38
|
|
27
|
-
|
28
|
-
|
29
|
-
|
39
|
+
* Fixed a bug related to extracting the correct content-rights from search
|
40
|
+
query URLs.
|
41
|
+
* Added GScraper.user_agent_aliases.
|
30
42
|
|
31
43
|
== 0.1.1 / 2007-12-21
|
32
44
|
|
33
|
-
|
45
|
+
* Forgot to include lib/gscraper/version.rb.
|
34
46
|
|
35
47
|
== 0.1.0 / 2007-12-20
|
36
48
|
|
37
|
-
|
38
|
-
|
49
|
+
* Initial release.
|
50
|
+
* Supports the Google Search service.
|
39
51
|
|
data/Manifest.txt
CHANGED
@@ -6,10 +6,13 @@ Rakefile
|
|
6
6
|
lib/gscraper.rb
|
7
7
|
lib/gscraper/version.rb
|
8
8
|
lib/gscraper/gscraper.rb
|
9
|
+
lib/gscraper/web_agent.rb
|
9
10
|
lib/gscraper/extensions/uri/http.rb
|
10
11
|
lib/gscraper/extensions/uri.rb
|
11
12
|
lib/gscraper/extensions.rb
|
12
13
|
lib/gscraper/licenses.rb
|
14
|
+
lib/gscraper/sponsored_ad.rb
|
15
|
+
lib/gscraper/sponsored_links.rb
|
13
16
|
lib/gscraper/search/result.rb
|
14
17
|
lib/gscraper/search/page.rb
|
15
18
|
lib/gscraper/search/query.rb
|
data/README.txt
CHANGED
@@ -8,17 +8,120 @@ GScraper is a web-scraping interface to various Google Services.
|
|
8
8
|
|
9
9
|
== FEATURES/PROBLEMS:
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
* Supports the Google Search service.
|
12
|
+
* Provides access to search results and ranks.
|
13
|
+
* Provides access to the Sponsored Links.
|
14
|
+
* Provides HTTP access with custom User-Agent strings.
|
15
|
+
* Provides proxy settings for HTTP access.
|
13
16
|
|
14
17
|
== REQUIREMENTS:
|
15
18
|
|
16
19
|
* Hpricot
|
17
|
-
* Mechanize
|
20
|
+
* WWW::Mechanize
|
18
21
|
|
19
22
|
== INSTALL:
|
20
23
|
|
21
|
-
sudo gem install gscraper
|
24
|
+
$ sudo gem install gscraper
|
25
|
+
|
26
|
+
== EXAMPLES:
|
27
|
+
|
28
|
+
* Basic query:
|
29
|
+
|
30
|
+
q = GScraper::Search.query(:query => 'ruby')
|
31
|
+
|
32
|
+
* Advanced query:
|
33
|
+
|
34
|
+
q = GScraper::Search.query(:query => 'ruby') do |q|
|
35
|
+
q.without_words = 'is'
|
36
|
+
q.within_past_day = true
|
37
|
+
q.numeric_range = 2..10
|
38
|
+
end
|
39
|
+
|
40
|
+
* Queries from URLs:
|
41
|
+
|
42
|
+
q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
|
43
|
+
|
44
|
+
q.query  # => "ruby"
|
45
|
+
q.with_words # => "rails"
|
46
|
+
q.occurrs_within # => :title
|
47
|
+
q.rights # => :cc_by_nc
|
48
|
+
|
49
|
+
* Getting the search results:
|
50
|
+
|
51
|
+
q.first_page.select do |result|
|
52
|
+
result.title =~ /Blog/
|
53
|
+
end
|
54
|
+
|
55
|
+
q.page(2).map do |result|
|
56
|
+
result.title.reverse
|
57
|
+
end
|
58
|
+
|
59
|
+
q.result_at(25) # => Result
|
60
|
+
|
61
|
+
q.top_result # => Result
|
62
|
+
|
63
|
+
* A Result object contains the rank, title, summary, cached URL, similar
|
64
|
+
query URL and link URL of the search result.
|
65
|
+
|
66
|
+
page = q.page(2)
|
67
|
+
|
68
|
+
page.urls # => [...]
|
69
|
+
page.summaries  # => [...]
|
70
|
+
page.ranks_of { |result| result.url =~ /^https/ } # => [...]
|
71
|
+
page.titles_of { |result| result.summary =~ /password/ } # => [...]
|
72
|
+
page.cached_pages # => [...]
|
73
|
+
page.similar_queries # => [...]
|
74
|
+
|
75
|
+
* Iterating over the search results:
|
76
|
+
|
77
|
+
q.each_on_page(2) do |result|
|
78
|
+
puts result.title
|
79
|
+
end
|
80
|
+
|
81
|
+
page.each do |result|
|
82
|
+
puts result.url
|
83
|
+
end
|
84
|
+
|
85
|
+
* Iterating over the data within the search results:
|
86
|
+
|
87
|
+
page.each_title do |title|
|
88
|
+
puts title
|
89
|
+
end
|
90
|
+
|
91
|
+
page.each_summary do |text|
|
92
|
+
puts text
|
93
|
+
end
|
94
|
+
|
95
|
+
* Selecting search results:
|
96
|
+
|
97
|
+
page.results_with do |result|
|
98
|
+
((result.rank > 2) && (result.rank < 10))
|
99
|
+
end
|
100
|
+
|
101
|
+
page.results_with_title(/Ruby/i) # => [...]
|
102
|
+
|
103
|
+
* Selecting data within the search results:
|
104
|
+
|
105
|
+
page.titles # => [...]
|
106
|
+
|
107
|
+
page.summaries # => [...]
|
108
|
+
|
109
|
+
* Selecting the data of search results based on the search result:
|
110
|
+
|
111
|
+
page.urls_of do |result|
|
112
|
+
result.description.length > 10
|
113
|
+
end
|
114
|
+
|
115
|
+
* Selecting the Sponsored Links of a Query:
|
116
|
+
|
117
|
+
q.sponsored_links # => [...]
|
118
|
+
|
119
|
+
q.top_sponsored_link # => SponsoredAd
|
120
|
+
|
121
|
+
* Setting the User-Agent globally:
|
122
|
+
|
123
|
+
GScraper.user_agent # => nil
|
124
|
+
GScraper.user_agent = 'Awesome Browser v1.2'
|
22
125
|
|
23
126
|
== LICENSE:
|
24
127
|
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -1,7 +1,38 @@
|
|
1
|
+
require 'uri/http'
|
1
2
|
require 'mechanize'
|
2
3
|
require 'open-uri'
|
3
4
|
|
4
5
|
module GScraper
|
6
|
+
# Common proxy port.
|
7
|
+
COMMON_PROXY_PORT = 8080
|
8
|
+
|
9
|
+
#
|
10
|
+
# Returns the +Hash+ of proxy information.
|
11
|
+
#
|
12
|
+
def GScraper.proxy
|
13
|
+
@@gscraper_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
|
14
|
+
end
|
15
|
+
|
16
|
+
#
|
17
|
+
# Creates a HTTP URI based from the given _proxy_info_ hash. The
|
18
|
+
# _proxy_info_ hash defaults to Web.proxy, if not given.
|
19
|
+
#
|
20
|
+
# _proxy_info_ may contain the following keys:
|
21
|
+
# <tt>:host</tt>:: The proxy host.
|
22
|
+
# <tt>:port</tt>:: The proxy port. Defaults to COMMON_PROXY_PORT,
|
23
|
+
# if not specified.
|
24
|
+
# <tt>:user</tt>:: The user-name to login as.
|
25
|
+
# <tt>:password</tt>:: The password to login with.
|
26
|
+
#
|
27
|
+
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
|
28
|
+
if GScraper.proxy[:host]
|
29
|
+
return URI::HTTP.build(:host => GScraper.proxy[:host],
|
30
|
+
:port => GScraper.proxy[:port],
|
31
|
+
:userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
|
32
|
+
:path => '/')
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
5
36
|
#
|
6
37
|
# Returns the supported GScraper User-Agent Aliases.
|
7
38
|
#
|
@@ -13,58 +44,98 @@ module GScraper
|
|
13
44
|
# Returns the GScraper User-Agent
|
14
45
|
#
|
15
46
|
def GScraper.user_agent
|
16
|
-
|
47
|
+
@@gscraper_user_agent ||= GScraper.user_agent_aliases['Windows IE 6']
|
17
48
|
end
|
18
49
|
|
19
50
|
#
|
20
51
|
# Sets the GScraper User-Agent to the specified _agent_.
|
21
52
|
#
|
22
53
|
def GScraper.user_agent=(agent)
|
23
|
-
|
54
|
+
@@gscraper_user_agent = agent
|
24
55
|
end
|
25
56
|
|
26
57
|
#
|
27
|
-
# Opens the _uri_ with the given
|
28
|
-
# returned.
|
58
|
+
# Opens the _uri_ with the given _options_. The contents of the _uri_
|
59
|
+
# will be returned.
|
60
|
+
#
|
61
|
+
# _options_ may contain the following keys:
|
62
|
+
# <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
|
63
|
+
# <tt>:user_agent</tt>:: The User-Agent String to use.
|
64
|
+
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
65
|
+
# contain the following keys:
|
66
|
+
# <tt>:host</tt>:: The proxy host.
|
67
|
+
# <tt>:port</tt>:: The proxy port.
|
68
|
+
# <tt>:user</tt>:: The user-name to login as.
|
69
|
+
# <tt>:password</tt>:: The password to login with.
|
29
70
|
#
|
30
|
-
# GScraper.
|
71
|
+
# GScraper.open_uri('http://www.hackety.org/')
|
31
72
|
#
|
32
|
-
# GScraper.
|
73
|
+
# GScraper.open_uri('http://tenderlovemaking.com/',
|
33
74
|
# :user_agent_alias => 'Linux Mozilla')
|
34
|
-
# GScraper.
|
75
|
+
# GScraper.open_uri('http://www.wired.com/',
|
76
|
+
# :user_agent => 'the future')
|
35
77
|
#
|
36
|
-
def GScraper.
|
78
|
+
def GScraper.open_uri(uri,options={})
|
37
79
|
headers = {}
|
38
80
|
|
39
|
-
if
|
40
|
-
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[
|
41
|
-
elsif
|
42
|
-
headers['User-Agent'] =
|
81
|
+
if options[:user_agent_alias]
|
82
|
+
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[options[:user_agent_alias]]
|
83
|
+
elsif options[:user_agent]
|
84
|
+
headers['User-Agent'] = options[:user_agent]
|
43
85
|
elsif GScraper.user_agent
|
44
86
|
headers['User-Agent'] = GScraper.user_agent
|
45
87
|
end
|
46
88
|
|
89
|
+
proxy = (options[:proxy] || GScraper.proxy)
|
90
|
+
if proxy[:host]
|
91
|
+
headers[:proxy] = GScraper.proxy_uri(proxy)
|
92
|
+
end
|
93
|
+
|
47
94
|
return Kernel.open(uri,headers)
|
48
95
|
end
|
49
96
|
|
50
97
|
#
|
51
|
-
#
|
98
|
+
# Similar to GScraper.open_uri but returns an Hpricot document.
|
99
|
+
#
|
100
|
+
def GScraper.open_page(uri,options={})
|
101
|
+
Hpricot(GScraper.open_uri(uri,options))
|
102
|
+
end
|
103
|
+
|
104
|
+
#
|
105
|
+
# Creates a new WWW::Mechanize agent with the given _options_.
|
106
|
+
#
|
107
|
+
# _options_ may contain the following keys:
|
108
|
+
# <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
|
109
|
+
# <tt>:user_agent</tt>:: The User-Agent string to use.
|
110
|
+
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
111
|
+
# contain the following keys:
|
112
|
+
# <tt>:host</tt>:: The proxy host.
|
113
|
+
# <tt>:port</tt>:: The proxy port.
|
114
|
+
# <tt>:user</tt>:: The user-name to login as.
|
115
|
+
# <tt>:password</tt>:: The password to login with.
|
116
|
+
#
|
117
|
+
# GScraper.web_agent
|
52
118
|
#
|
53
|
-
# GScraper.
|
54
|
-
# GScraper.
|
55
|
-
# GScraper.http_agent(:user_agent => 'wooden pants')
|
119
|
+
# GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
|
120
|
+
# GScraper.web_agent(:user_agent => 'Google Bot')
|
56
121
|
#
|
57
|
-
def GScraper.
|
122
|
+
def GScraper.web_agent(options={},&block)
|
58
123
|
agent = WWW::Mechanize.new
|
59
124
|
|
60
|
-
if
|
61
|
-
agent.user_agent_alias =
|
62
|
-
elsif
|
63
|
-
agent.user_agent =
|
125
|
+
if options[:user_agent_alias]
|
126
|
+
agent.user_agent_alias = options[:user_agent_alias]
|
127
|
+
elsif options[:user_agent]
|
128
|
+
agent.user_agent = options[:user_agent]
|
64
129
|
elsif GScraper.user_agent
|
65
130
|
agent.user_agent = GScraper.user_agent
|
66
131
|
end
|
67
132
|
|
133
|
+
proxy = (options[:proxy] || GScraper.proxy)
|
134
|
+
if proxy[:host]
|
135
|
+
agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
|
136
|
+
end
|
137
|
+
|
138
|
+
block.call(agent) if block
|
68
139
|
return agent
|
69
140
|
end
|
70
141
|
end
|
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,55 +1,78 @@
|
|
1
1
|
module GScraper
|
2
2
|
module Licenses
|
3
|
+
# Any desired license
|
3
4
|
ANY = nil
|
4
5
|
|
6
|
+
# Aladdin license
|
5
7
|
ALADDIN = :aladdin
|
6
8
|
|
9
|
+
# Artistic license
|
7
10
|
ARTISTIC = :artistic
|
8
11
|
|
12
|
+
# Apache license
|
9
13
|
APACHE = :apache
|
10
14
|
|
15
|
+
# Apple license
|
11
16
|
APPLE = :apple
|
12
17
|
|
18
|
+
# BSD license
|
13
19
|
BSD = :bsd
|
14
20
|
|
21
|
+
# Common public license
|
15
22
|
COMMON_PUBLIC = :cpl
|
16
23
|
|
24
|
+
# Creative Commons By-Attribution license
|
17
25
|
CC_BY = :cc_by
|
18
26
|
|
27
|
+
# Creative Commons By-Attribution-Share-Alike license
|
19
28
|
CC_BY_SA = :cc_by_sa
|
20
29
|
|
30
|
+
# Creative Commons By-Attribution-No-Derivative license
|
21
31
|
CC_BY_ND = :cc_by_nd
|
22
32
|
|
33
|
+
# Creative Commons By-Attribution-Noncommercial-Share-Alike license
|
23
34
|
CC_BY_NC = :cc_by_nc_sa
|
24
35
|
|
25
|
-
|
26
|
-
|
27
|
-
CC_BY_NC_SA = :cc_by_nc_sa
|
36
|
+
# Creative Commons By-Attribution-No-Derivative-Share-Alike license
|
37
|
+
CC_BY_ND_SA = :cc_by_nd_sa
|
28
38
|
|
39
|
+
# Creative Commons By-Attribution-Noncommercial-No-Derivative license
|
29
40
|
CC_BY_NC_ND = :cc_by_nc_nd
|
30
41
|
|
42
|
+
# GNU General Public license
|
31
43
|
GPL = :gpl
|
32
44
|
|
45
|
+
# GNU Lesser General Public license
|
33
46
|
LGPL = :lgpl
|
34
47
|
|
48
|
+
# Historical Permission Notice and Disclaimer license
|
35
49
|
HISTORICAL = :disclaimer
|
36
50
|
|
51
|
+
# IBM Public license
|
37
52
|
IBM_PUBLIC = :ibm
|
38
53
|
|
54
|
+
# Lucent Public license
|
39
55
|
LUCENT_PUBLIC = :lucent
|
40
56
|
|
57
|
+
# MIT license
|
41
58
|
MIT = :mit
|
42
59
|
|
43
|
-
|
60
|
+
# Mozilla Public license
|
61
|
+
MOZILLA_PUBLIC = :mozilla
|
44
62
|
|
63
|
+
# NASA OSA license
|
45
64
|
NASA_OSA = :nasa
|
46
65
|
|
66
|
+
# Python license
|
47
67
|
PYTHON = :python
|
48
68
|
|
69
|
+
# Q Public license
|
49
70
|
Q_PUBLIC = :qpl
|
50
71
|
|
72
|
+
# Sleepycat license
|
51
73
|
SLEEPYCAT = :sleepycat
|
52
74
|
|
75
|
+
# Zope Public license
|
53
76
|
ZOPE_PUBLIC = :zope
|
54
77
|
|
55
78
|
end
|