spidr 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.1 / 2008-10-04
2
+
3
+ * Added a reader method for the response instance variable in Page.
4
+ * Fixed a bug in Page#method_missing.
5
+
1
6
  === 0.1.0 / 2008-05-23
2
7
 
3
8
  * Initial release.
data/README.txt CHANGED
@@ -12,14 +12,16 @@ and easy to use.
12
12
  == FEATURES/PROBLEMS:
13
13
 
14
14
  * Black-list or white-list URLs based upon:
15
- * Host name
16
- * Port number
17
- * Full link
18
- * URL extension
15
+ * Host name
16
+ * Port number
17
+ * Full link
18
+ * URL extension
19
19
  * Provides call-backs for:
20
- * Every visited Page.
21
- * Every visited URL.
22
- * Every visited URL that matches a specified pattern.
20
+ * Every visited Page.
21
+ * Every visited URL.
22
+ * Every visited URL that matches a specified pattern.
23
+ * Custom User-Agent strings.
24
+ * Custom proxy settings.
23
25
 
24
26
  == REQUIREMENTS:
25
27
 
@@ -29,6 +31,26 @@ and easy to use.
29
31
 
30
32
  $ sudo gem install spidr
31
33
 
34
+ == EXAMPLES:
35
+
36
+ * Start spidering from a URL:
37
+
38
+ Spidr.start_at('http://tenderlovemaking.com/')
39
+
40
+ * Spider a host:
41
+
42
+ Spidr.host('www.0x000000.com')
43
+
44
+ * Spider a site:
45
+
46
+ Spidr.site('http://hackety.org/')
47
+
48
+ * Print out visited URLs:
49
+
50
+ Spidr.site('http://rubyinside.org/') do |spider|
51
+ spider.every_url { |url| puts url }
52
+ end
53
+
32
54
  == LICENSE:
33
55
 
34
56
  The MIT License
data/Rakefile CHANGED
@@ -7,6 +7,7 @@ require './lib/spidr/version.rb'
7
7
  Hoe.new('spidr', Spidr::VERSION) do |p|
8
8
  p.rubyforge_name = 'spidr'
9
9
  p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
10
+ p.remote_rdoc_dir = 'docs'
10
11
  p.extra_deps = ['hpricot']
11
12
  end
12
13
 
data/lib/spidr/agent.rb CHANGED
@@ -366,6 +366,32 @@ module Spidr
366
366
  end
367
367
  end
368
368
 
369
+ #
370
+ # Creates a new Page object from the specified _url_. If a _block_ is
371
+ # given, it will be passed the newly created Page object.
372
+ #
373
+ def get_page(url,&block)
374
+ host = url.host
375
+ port = url.port
376
+
377
+ proxy_host = @proxy[:host]
378
+ proxy_port = @proxy[:port]
379
+ proxy_user = @proxy[:user]
380
+ proxy_password = @proxy[:password]
381
+
382
+ Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
383
+ headers = {}
384
+
385
+ headers['User-Agent'] = @user_agent if @user_agent
386
+ headers['Referer'] = @referer if @referer
387
+
388
+ new_page = Page.new(url,sess.get(url.path,headers))
389
+
390
+ block.call(new_page) if block
391
+ return new_page
392
+ end
393
+ end
394
+
369
395
  protected
370
396
 
371
397
  #
@@ -464,27 +490,5 @@ module Spidr
464
490
  @ext_rules.accept?(File.extname(url.path)[1..-1])
465
491
  end
466
492
 
467
- def get_page(url,&block)
468
- host = url.host
469
- port = url.port
470
-
471
- proxy_host = @proxy[:host]
472
- proxy_port = @proxy[:port]
473
- proxy_user = @proxy[:user]
474
- proxy_password = @proxy[:password]
475
-
476
- Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
477
- headers = {}
478
-
479
- headers['User-Agent'] = @user_agent if @user_agent
480
- headers['Referer'] = @referer if @referer
481
-
482
- new_page = Page.new(url,sess.get(url.path,headers))
483
-
484
- block.call(new_page) if block
485
- return new_page
486
- end
487
- end
488
-
489
493
  end
490
494
  end
data/lib/spidr/page.rb CHANGED
@@ -7,6 +7,9 @@ module Spidr
7
7
  # URL of the page
8
8
  attr_reader :url
9
9
 
10
+ # HTTP Response
11
+ attr_reader :response
12
+
10
13
  # Body returned for the page
11
14
  attr_reader :body
12
15
 
@@ -23,6 +26,70 @@ module Spidr
23
26
  @doc = nil
24
27
  end
25
28
 
29
+ #
30
+ # Returns the response code from the page.
31
+ #
32
+ def code
33
+ @response.code
34
+ end
35
+
36
+ #
37
+ # Returns +true+ if the response code is 200, returns +false+ otherwise.
38
+ #
39
+ def is_ok?
40
+ code == 200
41
+ end
42
+
43
+ #
44
+ # Returns +true+ if the response code is 301 or 307, returns +false+
45
+ # otherwise.
46
+ #
47
+ def is_redirect?
48
+ (code == 301 || code == 307)
49
+ end
50
+
51
+ #
52
+ # Returns +true+ if the response code is 308, returns +false+ otherwise.
53
+ #
54
+ def timedout?
55
+ code == 308
56
+ end
57
+
58
+ #
59
+ # Returns +true+ if the response code is 400, returns +false+ otherwise.
60
+ #
61
+ def bad_request?
62
+ code == 400
63
+ end
64
+
65
+ #
66
+ # Returns +true+ if the response code is 401, returns +false+ otherwise.
67
+ #
68
+ def is_unauthorized?
69
+ code == 401
70
+ end
71
+
72
+ #
73
+ # Returns +true+ if the response code is 403, returns +false+ otherwise.
74
+ #
75
+ def is_forbidden?
76
+ code == 403
77
+ end
78
+
79
+ #
80
+ # Returns +true+ if the response code is 404, returns +false+ otherwise.
81
+ #
82
+ def is_missing?
83
+ code == 404
84
+ end
85
+
86
+ #
87
+ # Returns +true+ if the response code is 500, returns +false+ otherwise.
88
+ #
89
+ def had_internal_server_error?
90
+ code == 500
91
+ end
92
+
26
93
  #
27
94
  # Returns the content-type of the page.
28
95
  #
@@ -30,6 +97,14 @@ module Spidr
30
97
  @response['Content-Type']
31
98
  end
32
99
 
100
+ #
101
+ # Returns +true+ if the page is a plain text document, returns +false+
102
+ # otherwise.
103
+ #
104
+ def plain_text?
105
+ (content_type =~ /text\/plain/) == 0
106
+ end
107
+
33
108
  #
34
109
  # Returns +true+ if the page is an HTML document, returns +false+
35
110
  # otherwise.
@@ -78,6 +153,30 @@ module Spidr
78
153
  (content_type =~ /application\/atom\+xml/) == 0
79
154
  end
80
155
 
156
+ #
157
+ # Returns +true+ if the page is an MS Word document, returns +false+
158
+ # otherwise.
159
+ #
160
+ def ms_word?
161
+ (content_type =~ /application\/msword/) == 0
162
+ end
163
+
164
+ #
165
+ # Returns +true+ if the page is a PDF document, returns +false+
166
+ # otherwise.
167
+ #
168
+ def pdf?
169
+ (content_type =~ /application\/pdf/) == 0
170
+ end
171
+
172
+ #
173
+ # Returns +true+ if the page is a ZIP archive, returns +false+
174
+ # otherwise.
175
+ #
176
+ def zip?
177
+ (content_type =~ /application\/zip/) == 0
178
+ end
179
+
81
180
  #
82
181
  # Returns the body of the page in +String+ form.
83
182
  #
@@ -122,24 +221,11 @@ module Spidr
122
221
  # based on the url of the page.
123
222
  #
124
223
  def to_absolute(link)
224
+ # clean the link
125
225
  link = URI.encode(link.to_s.gsub(/#.*$/,''))
126
- relative = URI(link)
127
-
128
- if relative.scheme.nil?
129
- new_url = @url.clone
130
-
131
- if relative.path[0..0] == '/'
132
- new_url.path = relative.path
133
- elsif relative.path[-1..-1] == '/'
134
- new_url.path = File.expand_path(File.join(new_url.path,relative.path))
135
- elsif !(relative.path.empty?)
136
- new_url.path = File.expand_path(File.join(File.dirname(new_url.path),relative.path))
137
- end
138
-
139
- return new_url
140
- end
141
226
 
142
- return relative
227
+ relative = URI(link)
228
+ return @url.merge(relative)
143
229
  end
144
230
 
145
231
  #
@@ -149,7 +235,7 @@ module Spidr
149
235
  if (args.empty? && block.nil?)
150
236
  name = sym.id2name.sub('_','-')
151
237
 
152
- return @response[name] if @response.has_key?(name)
238
+ return @response[name] if @response.key?(name)
153
239
  end
154
240
 
155
241
  return super(sym,*args,&block)
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.0'
2
+ VERSION = '0.1.1'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern Modulus III
@@ -9,11 +9,12 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-05-23 00:00:00 -07:00
12
+ date: 2008-10-04 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: hpricot
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -23,12 +24,13 @@ dependencies:
23
24
  version:
24
25
  - !ruby/object:Gem::Dependency
25
26
  name: hoe
27
+ type: :development
26
28
  version_requirement:
27
29
  version_requirements: !ruby/object:Gem::Requirement
28
30
  requirements:
29
31
  - - ">="
30
32
  - !ruby/object:Gem::Version
31
- version: 1.5.3
33
+ version: 1.7.0
32
34
  version:
33
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
34
36
  email:
@@ -76,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
76
78
  requirements: []
77
79
 
78
80
  rubyforge_project: spidr
79
- rubygems_version: 1.1.1
81
+ rubygems_version: 1.2.0
80
82
  signing_key:
81
83
  specification_version: 2
82
84
  summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely