spidr 0.1.0 → 0.1.1
This diff compares the contents of two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- data/History.txt +5 -0
- data/README.txt +29 -7
- data/Rakefile +1 -0
- data/lib/spidr/agent.rb +26 -22
- data/lib/spidr/page.rb +103 -17
- data/lib/spidr/version.rb +1 -1
- metadata +6 -4
data/History.txt
CHANGED
data/README.txt
CHANGED
@@ -12,14 +12,16 @@ and easy to use.
 == FEATURES/PROBLEMS:
 
 * Black-list or white-list URLs based upon:
-
-
-
-
+  * Host name
+  * Port number
+  * Full link
+  * URL extension
 * Provides call-backs for:
-
-
-
+  * Every visited Page.
+  * Every visited URL.
+  * Every visited URL that matches a specified pattern.
+* Custom User-Agent strings.
+* Custom proxy settings.
 
 == REQUIREMENTS:
 
@@ -29,6 +31,26 @@ and easy to use.
 
   $ sudo gem install spidr
 
+== EXAMPLES:
+
+* Start spidering from a URL:
+
+    Spidr.start_at('http://tenderlovemaking.com/')
+
+* Spider a host:
+
+    Spidr.host('www.0x000000.com')
+
+* Spider a site:
+
+    Spidr.site('http://hackety.org/')
+
+* Print out visited URLs:
+
+    Spidr.site('http://rubyinside.org/') do |spider|
+      spider.every_url { |url| puts url }
+    end
+
 == LICENSE:
 
 The MIT License
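The black-list/white-list support described in the FEATURES list can be combined with the block form shown above. A minimal sketch, assuming the spider accepts filtering options named :hosts, :ignore_links and :ignore_exts (these option names are an assumption and are not confirmed by this diff):

    require 'spidr'

    # Hypothetical option names: only spider example.com, skip logout links
    # and .zip downloads.
    Spidr.site('http://www.example.com/',
               :hosts        => [/example\.com$/],
               :ignore_links => [/logout/],
               :ignore_exts  => ['zip']) do |spider|
      spider.every_url { |url| puts url }
    end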
data/Rakefile
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -366,6 +366,32 @@ module Spidr
       end
     end
 
+    #
+    # Creates a new Page object from the specified _url_. If a _block_ is
+    # given, it will be passed the newly created Page object.
+    #
+    def get_page(url,&block)
+      host = url.host
+      port = url.port
+
+      proxy_host = @proxy[:host]
+      proxy_port = @proxy[:port]
+      proxy_user = @proxy[:user]
+      proxy_password = @proxy[:password]
+
+      Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+        headers = {}
+
+        headers['User-Agent'] = @user_agent if @user_agent
+        headers['Referer'] = @referer if @referer
+
+        new_page = Page.new(url,sess.get(url.path,headers))
+
+        block.call(new_page) if block
+        return new_page
+      end
+    end
+
     protected
 
     #
@@ -464,27 +490,5 @@ module Spidr
       @ext_rules.accept?(File.extname(url.path)[1..-1])
     end
 
-    def get_page(url,&block)
-      host = url.host
-      port = url.port
-
-      proxy_host = @proxy[:host]
-      proxy_port = @proxy[:port]
-      proxy_user = @proxy[:user]
-      proxy_password = @proxy[:password]
-
-      Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
-        headers = {}
-
-        headers['User-Agent'] = @user_agent if @user_agent
-        headers['Referer'] = @referer if @referer
-
-        new_page = Page.new(url,sess.get(url.path,headers))
-
-        block.call(new_page) if block
-        return new_page
-      end
-    end
-
   end
 end
data/lib/spidr/page.rb
CHANGED
@@ -7,6 +7,9 @@ module Spidr
     # URL of the page
     attr_reader :url
 
+    # HTTP Response
+    attr_reader :response
+
     # Body returned for the page
     attr_reader :body
 
@@ -23,6 +26,70 @@ module Spidr
       @doc = nil
     end
 
+    #
+    # Returns the response code from the page.
+    #
+    def code
+      @response.code
+    end
+
+    #
+    # Returns +true+ if the response code is 200, returns +false+ otherwise.
+    #
+    def is_ok?
+      code == 200
+    end
+
+    #
+    # Returns +true+ if the response code is 301 or 307, returns +false+
+    # otherwise.
+    #
+    def is_redirect?
+      (code == 301 || code == 307)
+    end
+
+    #
+    # Returns +true+ if the response code is 308, returns +false+ otherwise.
+    #
+    def timedout?
+      code == 308
+    end
+
+    #
+    # Returns +true+ if the response code is 400, returns +false+ otherwise.
+    #
+    def bad_request?
+      code == 400
+    end
+
+    #
+    # Returns +true+ if the response code is 401, returns +false+ otherwise.
+    #
+    def is_unauthorized?
+      code == 401
+    end
+
+    #
+    # Returns +true+ if the response code is 403, returns +false+ otherwise.
+    #
+    def is_forbidden?
+      code == 403
+    end
+
+    #
+    # Returns +true+ if the response code is 404, returns +false+ otherwise.
+    #
+    def is_missing?
+      code == 404
+    end
+
+    #
+    # Returns +true+ if the response code is 500, returns +false+ otherwise.
+    #
+    def had_internal_server_error?
+      code == 500
+    end
+
     #
     # Returns the content-type of the page.
     #
@@ -30,6 +97,14 @@ module Spidr
       @response['Content-Type']
     end
 
+    #
+    # Returns +true+ if the page is a plain text document, returns +false+
+    # otherwise.
+    #
+    def plain_text?
+      (content_type =~ /text\/plain/) == 0
+    end
+
     #
     # Returns +true+ if the page is a HTML document, returns +false+
     # otherwise.
@@ -78,6 +153,30 @@ module Spidr
       (content_type =~ /application\/atom\+xml/) == 0
     end
 
+    #
+    # Returns +true+ if the page is a MS Word document, returns +false+
+    # otherwise.
+    #
+    def ms_word?
+      (content_type =~ /application\/msword/) == 0
+    end
+
+    #
+    # Returns +true+ if the page is a PDF document, returns +false+
+    # otherwise.
+    #
+    def pdf?
+      (content_type =~ /application\/pdf/) == 0
+    end
+
+    #
+    # Returns +true+ if the page is a ZIP archive, returns +false+
+    # otherwise.
+    #
+    def zip?
+      (content_type =~ /application\/zip/) == 0
+    end
+
     #
     # Returns the body of the page in +String+ form.
     #
@@ -122,24 +221,11 @@ module Spidr
     # based on the url of the page.
     #
     def to_absolute(link)
+      # clean the link
       link = URI.encode(link.to_s.gsub(/#.*$/,''))
-      relative = URI(link)
-
-      if relative.scheme.nil?
-        new_url = @url.clone
-
-        if relative.path[0..0] == '/'
-          new_url.path = relative.path
-        elsif relative.path[-1..-1] == '/'
-          new_url.path = File.expand_path(File.join(new_url.path,relative.path))
-        elsif !(relative.path.empty?)
-          new_url.path = File.expand_path(File.join(File.dirname(new_url.path),relative.path))
-        end
-
-        return new_url
-      end
 
-
+      relative = URI(link)
+      return @url.merge(relative)
     end
 
     #
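The hand-rolled path handling is replaced by URI#merge, which performs standard relative-reference resolution against the page's own URL. A standalone illustration of what @url.merge(relative) does (the URLs are illustrative):

    require 'uri'

    base = URI('http://www.example.com/blog/2008/10/index.html')

    # URI#merge resolves a relative reference against the base URL, covering
    # the cases the removed branches handled by hand.
    puts base.merge('archive.html')  # => http://www.example.com/blog/2008/10/archive.html
    puts base.merge('../../about/')  # => http://www.example.com/blog/about/
    puts base.merge('/contact')      # => http://www.example.com/contact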
@@ -149,7 +235,7 @@ module Spidr
       if (args.empty? && block.nil?)
         name = sym.id2name.sub('_','-')
 
-        return @response[name] if @response.
+        return @response[name] if @response.key?(name)
       end
 
       return super(sym,*args,&block)
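With this change, method_missing only exposes headers that are actually present in the response (an underscore in the method name maps to a dash in the header name). A minimal sketch of reading headers this way; the agent construction and URL are illustrative:

    require 'spidr'
    require 'uri'

    page = Spidr::Agent.new.get_page(URI('http://www.example.com/'))

    # Header names pass through method_missing: last_modified -> "last-modified".
    puts page.server        if page.response.key?('server')
    puts page.last_modified if page.response.key?('last-modified')
    # page.etag would raise NoMethodError if no ETag header were present.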
data/lib/spidr/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.1
 platform: ruby
 authors:
 - Postmodern Modulus III
@@ -9,11 +9,12 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2008-
+date: 2008-10-04 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: hpricot
+  type: :runtime
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
@@ -23,12 +24,13 @@ dependencies:
     version:
 - !ruby/object:Gem::Dependency
   name: hoe
+  type: :development
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.7.0
     version:
 description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
 email:
@@ -76,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project: spidr
-rubygems_version: 1.
+rubygems_version: 1.2.0
 signing_key:
 specification_version: 2
 summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely