spidr 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.1 / 2008-10-04
2
+
3
+ * Added a reader method for the response instance variable in Page.
4
+ * Fixed a bug in Page#method_missing.
5
+
1
6
  === 0.1.0 / 2008-05-23
2
7
 
3
8
  * Initial release.
data/README.txt CHANGED
@@ -12,14 +12,16 @@ and easy to use.
12
12
  == FEATURES/PROBLEMS:
13
13
 
14
14
  * Black-list or white-list URLs based upon:
15
- * Host name
16
- * Port number
17
- * Full link
18
- * URL extension
15
+ * Host name
16
+ * Port number
17
+ * Full link
18
+ * URL extension
19
19
  * Provides call-backs for:
20
- * Every visited Page.
21
- * Every visited URL.
22
- * Every visited URL that matches a specified pattern.
20
+ * Every visited Page.
21
+ * Every visited URL.
22
+ * Every visited URL that matches a specified pattern.
23
+ * Custom User-Agent strings.
24
+ * Custom proxy settings.
23
25
 
24
26
  == REQUIREMENTS:
25
27
 
@@ -29,6 +31,26 @@ and easy to use.
29
31
 
30
32
  $ sudo gem install spidr
31
33
 
34
+ == EXAMPLES:
35
+
36
+ * Start spidering from a URL:
37
+
38
+ Spidr.start_at('http://tenderlovemaking.com/')
39
+
40
+ * Spider a host:
41
+
42
+ Spidr.host('www.0x000000.com')
43
+
44
+ * Spider a site:
45
+
46
+ Spidr.site('http://hackety.org/')
47
+
48
+ * Print out visited URLs:
49
+
50
+ Spidr.site('http://rubyinside.org/') do |spider|
51
+ spider.every_url { |url| puts url }
52
+ end
53
+
32
54
  == LICENSE:
33
55
 
34
56
  The MIT License
data/Rakefile CHANGED
@@ -7,6 +7,7 @@ require './lib/spidr/version.rb'
7
7
  Hoe.new('spidr', Spidr::VERSION) do |p|
8
8
  p.rubyforge_name = 'spidr'
9
9
  p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
10
+ p.remote_rdoc_dir = 'docs'
10
11
  p.extra_deps = ['hpricot']
11
12
  end
12
13
 
data/lib/spidr/agent.rb CHANGED
@@ -366,6 +366,32 @@ module Spidr
366
366
  end
367
367
  end
368
368
 
369
+ #
370
+ # Creates a new Page object from the specified _url_. If a _block_ is
371
+ # given, it will be passed the newly created Page object.
372
+ #
373
+ def get_page(url,&block)
374
+ host = url.host
375
+ port = url.port
376
+
377
+ proxy_host = @proxy[:host]
378
+ proxy_port = @proxy[:port]
379
+ proxy_user = @proxy[:user]
380
+ proxy_password = @proxy[:password]
381
+
382
+ Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
383
+ headers = {}
384
+
385
+ headers['User-Agent'] = @user_agent if @user_agent
386
+ headers['Referer'] = @referer if @referer
387
+
388
+ new_page = Page.new(url,sess.get(url.path,headers))
389
+
390
+ block.call(new_page) if block
391
+ return new_page
392
+ end
393
+ end
394
+
369
395
  protected
370
396
 
371
397
  #
@@ -464,27 +490,5 @@ module Spidr
464
490
  @ext_rules.accept?(File.extname(url.path)[1..-1])
465
491
  end
466
492
 
467
- def get_page(url,&block)
468
- host = url.host
469
- port = url.port
470
-
471
- proxy_host = @proxy[:host]
472
- proxy_port = @proxy[:port]
473
- proxy_user = @proxy[:user]
474
- proxy_password = @proxy[:password]
475
-
476
- Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
477
- headers = {}
478
-
479
- headers['User-Agent'] = @user_agent if @user_agent
480
- headers['Referer'] = @referer if @referer
481
-
482
- new_page = Page.new(url,sess.get(url.path,headers))
483
-
484
- block.call(new_page) if block
485
- return new_page
486
- end
487
- end
488
-
489
493
  end
490
494
  end
data/lib/spidr/page.rb CHANGED
@@ -7,6 +7,9 @@ module Spidr
7
7
  # URL of the page
8
8
  attr_reader :url
9
9
 
10
+ # HTTP Response
11
+ attr_reader :response
12
+
10
13
  # Body returned for the page
11
14
  attr_reader :body
12
15
 
@@ -23,6 +26,70 @@ module Spidr
23
26
  @doc = nil
24
27
  end
25
28
 
29
+ #
30
+ # Returns the response code from the page.
31
+ #
32
+ def code
33
+ @response.code
34
+ end
35
+
36
+ #
37
+ # Returns +true+ if the response code is 200, returns +false+ otherwise.
38
+ #
39
+ def is_ok?
40
+ code == 200
41
+ end
42
+
43
+ #
44
+ # Returns +true+ if the response code is 301 or 307, returns +false+
45
+ # otherwise.
46
+ #
47
+ def is_redirect?
48
+ (code == 301 || code == 307)
49
+ end
50
+
51
+ #
52
+ # Returns +true+ if the response code is 308, returns +false+ otherwise.
53
+ #
54
+ def timedout?
55
+ code == 308
56
+ end
57
+
58
+ #
59
+ # Returns +true+ if the response code is 400, returns +false+ otherwise.
60
+ #
61
+ def bad_request?
62
+ code == 400
63
+ end
64
+
65
+ #
66
+ # Returns +true+ if the response code is 401, returns +false+ otherwise.
67
+ #
68
+ def is_unauthorized?
69
+ code == 401
70
+ end
71
+
72
+ #
73
+ # Returns +true+ if the response code is 403, returns +false+ otherwise.
74
+ #
75
+ def is_forbidden?
76
+ code == 403
77
+ end
78
+
79
+ #
80
+ # Returns +true+ if the response code is 404, returns +false+ otherwise.
81
+ #
82
+ def is_missing?
83
+ code == 404
84
+ end
85
+
86
+ #
87
+ # Returns +true+ if the response code is 500, returns +false+ otherwise.
88
+ #
89
+ def had_internal_server_error?
90
+ code == 500
91
+ end
92
+
26
93
  #
27
94
  # Returns the content-type of the page.
28
95
  #
@@ -30,6 +97,14 @@ module Spidr
30
97
  @response['Content-Type']
31
98
  end
32
99
 
100
+ #
101
+ # Returns +true+ if the page is a plain text document, returns +false+
102
+ # otherwise.
103
+ #
104
+ def plain_text?
105
+ (content_type =~ /text\/plain/) == 0
106
+ end
107
+
33
108
  #
34
109
  # Returns +true+ if the page is a HTML document, returns +false+
35
110
  # otherwise.
@@ -78,6 +153,30 @@ module Spidr
78
153
  (content_type =~ /application\/atom\+xml/) == 0
79
154
  end
80
155
 
156
+ #
157
+ # Returns +true+ if the page is a MS Word document, returns +false+
158
+ # otherwise.
159
+ #
160
+ def ms_word?
161
+ (content_type =~ /application\/msword/) == 0
162
+ end
163
+
164
+ #
165
+ # Returns +true+ if the page is a PDF document, returns +false+
166
+ # otherwise.
167
+ #
168
+ def pdf?
169
+ (content_type =~ /application\/pdf/) == 0
170
+ end
171
+
172
+ #
173
+ # Returns +true+ if the page is a ZIP archive, returns +false+
174
+ # otherwise.
175
+ #
176
+ def zip?
177
+ (content_type =~ /application\/zip/) == 0
178
+ end
179
+
81
180
  #
82
181
  # Returns the body of the page in +String+ form.
83
182
  #
@@ -122,24 +221,11 @@ module Spidr
122
221
  # based on the url of the page.
123
222
  #
124
223
  def to_absolute(link)
224
+ # clean the link
125
225
  link = URI.encode(link.to_s.gsub(/#.*$/,''))
126
- relative = URI(link)
127
-
128
- if relative.scheme.nil?
129
- new_url = @url.clone
130
-
131
- if relative.path[0..0] == '/'
132
- new_url.path = relative.path
133
- elsif relative.path[-1..-1] == '/'
134
- new_url.path = File.expand_path(File.join(new_url.path,relative.path))
135
- elsif !(relative.path.empty?)
136
- new_url.path = File.expand_path(File.join(File.dirname(new_url.path),relative.path))
137
- end
138
-
139
- return new_url
140
- end
141
226
 
142
- return relative
227
+ relative = URI(link)
228
+ return @url.merge(relative)
143
229
  end
144
230
 
145
231
  #
@@ -149,7 +235,7 @@ module Spidr
149
235
  if (args.empty? && block.nil?)
150
236
  name = sym.id2name.sub('_','-')
151
237
 
152
- return @response[name] if @response.has_key?(name)
238
+ return @response[name] if @response.key?(name)
153
239
  end
154
240
 
155
241
  return super(sym,*args,&block)
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.0'
2
+ VERSION = '0.1.1'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern Modulus III
@@ -9,11 +9,12 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-05-23 00:00:00 -07:00
12
+ date: 2008-10-04 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: hpricot
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -23,12 +24,13 @@ dependencies:
23
24
  version:
24
25
  - !ruby/object:Gem::Dependency
25
26
  name: hoe
27
+ type: :development
26
28
  version_requirement:
27
29
  version_requirements: !ruby/object:Gem::Requirement
28
30
  requirements:
29
31
  - - ">="
30
32
  - !ruby/object:Gem::Version
31
- version: 1.5.3
33
+ version: 1.7.0
32
34
  version:
33
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
34
36
  email:
@@ -76,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
76
78
  requirements: []
77
79
 
78
80
  rubyforge_project: spidr
79
- rubygems_version: 1.1.1
81
+ rubygems_version: 1.2.0
80
82
  signing_key:
81
83
  specification_version: 2
82
84
  summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely