spidr 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. data.tar.gz.sig +0 -0
  2. data/History.rdoc +191 -0
  3. data/Manifest.txt +10 -34
  4. data/{README.txt → README.rdoc} +3 -1
  5. data/Rakefile +6 -4
  6. data/lib/spidr/agent.rb +137 -97
  7. data/lib/spidr/auth_credential.rb +25 -0
  8. data/lib/spidr/auth_store.rb +157 -0
  9. data/lib/spidr/cookie_jar.rb +166 -0
  10. data/lib/spidr/filters.rb +2 -0
  11. data/lib/spidr/page.rb +75 -11
  12. data/lib/spidr/sanitizers.rb +59 -0
  13. data/lib/spidr/session_cache.rb +119 -0
  14. data/lib/spidr/version.rb +1 -1
  15. data/spec/agent_spec.rb +2 -2
  16. data/spec/helpers/history.rb +34 -0
  17. data/spec/helpers/wsoc.rb +83 -0
  18. data/spec/page_examples.rb +5 -1
  19. data/spec/page_spec.rb +30 -0
  20. data/spec/sanitizers_spec.rb +67 -0
  21. data/tasks/yard.rb +1 -1
  22. metadata +24 -40
  23. metadata.gz.sig +0 -0
  24. data/History.txt +0 -167
  25. data/spec/helpers/course.rb +0 -95
  26. data/static/course/absolute/index.html +0 -10
  27. data/static/course/absolute/next.html +0 -9
  28. data/static/course/absolute/start.html +0 -19
  29. data/static/course/empty/index.html +0 -10
  30. data/static/course/empty/start.html +0 -23
  31. data/static/course/fail.html +0 -14
  32. data/static/course/frames/frame.html +0 -15
  33. data/static/course/frames/frame_next.html +0 -9
  34. data/static/course/frames/iframe.html +0 -15
  35. data/static/course/frames/iframe_next.html +0 -9
  36. data/static/course/frames/index.html +0 -10
  37. data/static/course/frames/start.html +0 -15
  38. data/static/course/index.html +0 -10
  39. data/static/course/javascript/index.html +0 -10
  40. data/static/course/javascript/start.html +0 -19
  41. data/static/course/loop/index.html +0 -10
  42. data/static/course/loop/next.html +0 -13
  43. data/static/course/loop/start.html +0 -19
  44. data/static/course/relative/current_directory.html +0 -9
  45. data/static/course/relative/index.html +0 -10
  46. data/static/course/relative/normal.html +0 -9
  47. data/static/course/relative/same_directory.html +0 -9
  48. data/static/course/relative/start.html +0 -27
  49. data/static/course/remote/index.html +0 -10
  50. data/static/course/remote/next.html +0 -9
  51. data/static/course/remote/start.html +0 -27
  52. data/static/course/scripts/course.js +0 -29
  53. data/static/course/scripts/jquery-1.2.6.min.js +0 -32
  54. data/static/course/specs.json +0 -1
  55. data/static/course/start.html +0 -27
  56. data/tasks/course.rb +0 -63
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
30
30
  pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2009-11-25 00:00:00 -08:00
33
+ date: 2010-01-06 00:00:00 -08:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -63,6 +63,16 @@ dependencies:
63
63
  - !ruby/object:Gem::Version
64
64
  version: 0.4.0
65
65
  version:
66
+ - !ruby/object:Gem::Dependency
67
+ name: wsoc
68
+ type: :development
69
+ version_requirement:
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 0.1.1
75
+ version:
66
76
  - !ruby/object:Gem::Dependency
67
77
  name: hoe
68
78
  type: :development
@@ -71,7 +81,7 @@ dependencies:
71
81
  requirements:
72
82
  - - ">="
73
83
  - !ruby/object:Gem::Version
74
- version: 2.3.3
84
+ version: 2.4.0
75
85
  version:
76
86
  description: |-
77
87
  Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -84,18 +94,17 @@ executables: []
84
94
  extensions: []
85
95
 
86
96
  extra_rdoc_files:
87
- - History.txt
88
97
  - Manifest.txt
89
- - README.txt
90
98
  files:
91
- - History.txt
99
+ - History.rdoc
92
100
  - Manifest.txt
93
- - README.txt
101
+ - README.rdoc
94
102
  - Rakefile
95
103
  - lib/spidr.rb
96
104
  - lib/spidr/extensions.rb
97
105
  - lib/spidr/extensions/uri.rb
98
106
  - lib/spidr/page.rb
107
+ - lib/spidr/sanitizers.rb
99
108
  - lib/spidr/rules.rb
100
109
  - lib/spidr/filters.rb
101
110
  - lib/spidr/events.rb
@@ -106,53 +115,28 @@ files:
106
115
  - lib/spidr/actions/exceptions/skip_link.rb
107
116
  - lib/spidr/actions/exceptions/skip_page.rb
108
117
  - lib/spidr/actions/actions.rb
118
+ - lib/spidr/session_cache.rb
119
+ - lib/spidr/cookie_jar.rb
120
+ - lib/spidr/auth_credential.rb
121
+ - lib/spidr/auth_store.rb
109
122
  - lib/spidr/agent.rb
110
123
  - lib/spidr/spidr.rb
111
124
  - lib/spidr/version.rb
112
125
  - tasks/spec.rb
113
126
  - tasks/yard.rb
114
- - tasks/course.rb
115
127
  - spec/spec_helper.rb
116
- - spec/helpers/course.rb
128
+ - spec/helpers/history.rb
129
+ - spec/helpers/wsoc.rb
117
130
  - spec/helpers/page.rb
118
131
  - spec/extensions/uri_spec.rb
119
132
  - spec/page_examples.rb
120
133
  - spec/page_spec.rb
121
134
  - spec/rules_spec.rb
135
+ - spec/sanitizers_spec.rb
122
136
  - spec/filters_spec.rb
123
137
  - spec/actions_spec.rb
124
138
  - spec/agent_spec.rb
125
139
  - spec/spidr_spec.rb
126
- - static/course/index.html
127
- - static/course/start.html
128
- - static/course/fail.html
129
- - static/course/scripts/jquery-1.2.6.min.js
130
- - static/course/scripts/course.js
131
- - static/course/empty/index.html
132
- - static/course/empty/start.html
133
- - static/course/javascript/index.html
134
- - static/course/javascript/start.html
135
- - static/course/loop/index.html
136
- - static/course/loop/start.html
137
- - static/course/loop/next.html
138
- - static/course/relative/index.html
139
- - static/course/relative/start.html
140
- - static/course/relative/normal.html
141
- - static/course/relative/current_directory.html
142
- - static/course/relative/same_directory.html
143
- - static/course/absolute/index.html
144
- - static/course/absolute/start.html
145
- - static/course/absolute/next.html
146
- - static/course/remote/index.html
147
- - static/course/remote/start.html
148
- - static/course/remote/next.html
149
- - static/course/frames/index.html
150
- - static/course/frames/start.html
151
- - static/course/frames/iframe.html
152
- - static/course/frames/iframe_next.html
153
- - static/course/frames/frame.html
154
- - static/course/frames/frame_next.html
155
- - static/course/specs.json
156
140
  has_rdoc: yard
157
141
  homepage: http://spidr.rubyforge.org
158
142
  licenses: []
@@ -160,7 +144,7 @@ licenses: []
160
144
  post_install_message:
161
145
  rdoc_options:
162
146
  - --main
163
- - README.txt
147
+ - README.rdoc
164
148
  require_paths:
165
149
  - lib
166
150
  required_ruby_version: !ruby/object:Gem::Requirement
metadata.gz.sig CHANGED
Binary file
data/History.txt DELETED
@@ -1,167 +0,0 @@
1
- === 0.2.1 / 2009-11-25
2
-
3
- * Added Spidr::Events#every_ok_page.
4
- * Added Spidr::Events#every_redirect_page.
5
- * Added Spidr::Events#every_timedout_page.
6
- * Added Spidr::Events#every_bad_request_page.
7
- * Added Spidr::Events#every_unauthorized_page.
8
- * Added Spidr::Events#every_forbidden_page.
9
- * Added Spidr::Events#every_missing_page.
10
- * Added Spidr::Events#every_internal_server_error_page.
11
- * Added Spidr::Events#every_txt_page.
12
- * Added Spidr::Events#every_html_page.
13
- * Added Spidr::Events#every_xml_page.
14
- * Added Spidr::Events#every_xsl_page.
15
- * Added Spidr::Events#every_doc.
16
- * Added Spidr::Events#every_html_doc.
17
- * Added Spidr::Events#every_xml_doc.
18
- * Added Spidr::Events#every_xsl_doc.
19
- * Added Spidr::Events#every_rss_doc.
20
- * Added Spidr::Events#every_atom_doc.
21
- * Added Spidr::Events#every_javascript_page.
22
- * Added Spidr::Events#every_css_page.
23
- * Added Spidr::Events#every_rss_page.
24
- * Added Spidr::Events#every_atom_page.
25
- * Added Spidr::Events#every_ms_word_page.
26
- * Added Spidr::Events#every_pdf_page.
27
- * Added Spidr::Events#every_zip_page.
28
- * Fixed a bug where Spidr::Agent#delay was not being used to delay
29
- requesting pages.
30
- * Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
31
-
32
- === 0.2.0 / 2009-10-10
33
-
34
- * Added URI.expand_path.
35
- * Added Spidr::Page#search.
36
- * Added Spidr::Page#at.
37
- * Added Spidr::Page#title.
38
- * Added Spidr::Agent#failures=.
39
- * Added a HTTP session cache to Spidr::Agent, per suggestion of falter.
40
- * Added Spidr::Agent#get_session.
41
- * Added Spidr::Agent#kill_session.
42
- * Added Spidr.proxy=.
43
- * Added Spidr.disable_proxy!.
44
- * Aliased Spidr::Page#txt? to Spidr::Page#plain_text?.
45
- * Aliased Spidr::Page#ok? to Spidr::Page#is_ok?.
46
- * Aliased Spidr::Page#redirect? to Spidr::Page#is_redirect?.
47
- * Aliased Spidr::Page#unauthorized? to Spidr::Page#is_unauthorized?.
48
- * Aliased Spidr::Page#forbidden? to Spidr::Page#is_forbidden?.
49
- * Aliased Spidr::Page#missing? to Spidr::Page#is_missing?.
50
- * Split URL filtering code out of Spidr::Agent and into Spidr::Filtering.
51
- * Split URL / Page event code out of Spidr::Agent and into Spidr::Events.
52
- * Split pause! / continue! / skip_link! / skip_page! methods out of
53
- Spidr::Agent and into Spidr::Actions.
54
- * Fixed a bug in Spidr::Page#code, where it was not returning an Integer.
55
- * Make sure Spidr::Page#doc returns Nokogiri::XML::Document objects for
56
- RSS/RDF/Atom pages as well.
57
- * Fixed the handling of the Location header in Spidr::Page#links
58
- (thanks falter).
59
- * Fixed a bug in Spidr::Page#to_absolute where trailing '/' characters on
60
- URI paths were not being preserved (thanks falter).
61
- * Fixed a bug where the URI query was not being sent with the request
62
- in Spidr::Agent#get_page (thanks Damian Steer).
63
- * Fixed a bug where SSL sessions were not being properly setup
64
- (thanks falter).
65
- * Switched Spidr::Agent#history to be a Set, to improve search-time
66
- of the history (thanks falter).
67
- * Switched Spidr::Agent#failures to a Set.
68
- * Allow a block to be passed to Spidr::Agent#run, which will receive all
69
- pages visited.
70
- * Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks to
71
- Spidr::Agent#run.
72
- * Made Spidr::Agent#visit_page public.
73
- * Moved to YARD based documentation.
74
-
75
- === 0.1.9 / 2009-06-13
76
-
77
- * Upgraded to Hoe 2.0.0.
78
- * Use Hoe.spec instead of Hoe.new.
79
- * Use the Hoe signing task for signed gems.
80
- * Added the Agent#schemes and Agent#schemes= methods.
81
- * Added a warning message if 'net/https' cannot be loaded.
82
- * Allow the list of acceptable URL schemes to be passed into Agent.new.
83
- * Allow history and queue information to be passed into Agent.new.
84
- * Agent#start_at no longer clears the history or the queue.
85
- * Fixed a bug in the sanitization of semi-escaped URLs.
86
- * Fixed a bug where https URLs would be followed even if 'net/https'
87
- could not be loaded.
88
- * Removed Agent::SCHEMES.
89
-
90
- === 0.1.8 / 2009-05-27
91
-
92
- * Added the Agent#pause! and Agent#continue! methods.
93
- * Added the Agent#running? and Agent#paused? methods.
94
- * Added an alias for pending_urls to the queue methods.
95
- * Added Agent#queue to provide read access to the queue.
96
- * Added Agent#queue= and Agent#history= for setting the queue and history.
97
- * Added Agent#to_hash which returns a Hash of the agents queue and history.
98
- * Made Agent#enqueue and Agent#queued? public.
99
- * Added more specs.
100
-
101
- === 0.1.7 / 2009-04-24
102
-
103
- * Added Agent#all_headers.
104
- * Fixed a bug where Page#headers was always +nil+.
105
- * Spidr::Agent will now follow the Location header in HTTP 300, 301, 302,
106
- 303 and 307 Redirects.
107
- * Spidr::Agent will now follow iframe and frame tags.
108
-
109
- === 0.1.6 / 2009-04-14
110
-
111
- * Added Agent#failures, a list of URLs which could not be visited.
112
- * Added Agent#failed?.
113
- * Added Agent#every_failed_url.
114
- * Added Agent#clear, which clears the history and failures URL lists.
115
- * Improved fault tolerance in Agent#get_page.
116
- * If a Network or HTTP error is encountered, the URL will be added to
117
- the failures list and the next URL will be visited.
118
- * Fixed a typo in Agent#ignore_exts_like.
119
- * Updated the Web Spider Obstacle Course with links that always fail to be
120
- visited.
121
-
122
- === 0.1.5 / 2009-03-22
123
-
124
- * Catch malformed URIs in Page#to_absolute and return +nil+.
125
- * Filter out +nil+ URIs in Page#urls.
126
-
127
- === 0.1.4 / 2009-01-15
128
-
129
- * Use Nokogiri for HTML and XML parsing.
130
-
131
- === 0.1.3 / 2009-01-10
132
-
133
- * Added the :host options to Spidr::Agent#initialize.
134
- * Added the Web Spider Obstacle Course files to the Manifest.
135
- * Aliased Spidr::Agent#visited_urls to Spidr::Agent#history.
136
-
137
- === 0.1.2 / 2008-11-06
138
-
139
- * Fixed a bug in Page#to_absolute where URLs with no path were not
140
- receiving a default path of <tt>/</tt>.
141
- * Fixed a bug in Page#to_absolute where URL paths were not being
142
- expanded, in order to remove <tt>..</tt> and <tt>.</tt> directories.
143
- * Fixed a bug where absolute URLs could have a blank path, thus causing
144
- Agent#get_page to crash when it performed the HTTP request.
145
- * Added RSpec spec tests.
146
- * Created a Web-Spider Obstacle Course
147
- (http://spidr.rubyforge.org/course/start.html) which is used in the spec
148
- tests.
149
-
150
- === 0.1.1 / 2008-10-04
151
-
152
- * Added a reader method for the response instance variable in Page.
153
- * Fixed a bug in Page#method_missing.
154
-
155
- === 0.1.0 / 2008-05-23
156
-
157
- * Initial release.
158
- * Black-list or white-list URLs based upon:
159
- * Host name
160
- * Port number
161
- * Full link
162
- * URL extension
163
- * Provides call-backs for:
164
- * Every visited Page.
165
- * Every visited URL.
166
- * Every visited URL that matches a specified pattern.
167
-
@@ -1,95 +0,0 @@
1
- require 'open-uri'
2
- require 'json'
3
-
4
- module Helpers
5
- module Course
6
- COURSE_URL = URI('http://spidr.rubyforge.org/course/start.html')
7
-
8
- SPECS_URL = 'http://spidr.rubyforge.org/course/specs.json'
9
-
10
- def self.included(base)
11
- specs = JSON.parse(open(SPECS_URL).read)
12
-
13
- if specs.kind_of?(Array)
14
- specs.each do |spec|
15
- message = spec['message'].to_s.dump
16
- url = spec['url'].to_s.dump
17
-
18
- case spec['behavior']
19
- when 'follow'
20
- base.module_eval %{
21
- it #{message} do
22
- should_visit_link(#{url})
23
- end
24
- }
25
- when 'nofollow'
26
- base.module_eval %{
27
- it #{message} do
28
- should_visit_once(#{url})
29
- end
30
- }
31
- when 'fail'
32
- base.module_eval %{
33
- it #{message} do
34
- should_fail_link(#{url})
35
- end
36
- }
37
- else
38
- link = spec['link'].to_s.dump
39
-
40
- base.module_eval %{
41
- it #{message} do
42
- should_ignore_link(#{link})
43
- should_ignore_link(#{url})
44
- end
45
- }
46
- end
47
- end
48
- end
49
- end
50
-
51
- def run_course
52
- Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
53
- agent.every_failed_url { |url| puts "[FAILED] #{url}" }
54
- agent.every_url { |url| puts url }
55
- end
56
- end
57
-
58
- def visited_once?(link)
59
- url = COURSE_URL.merge(URI.encode(link))
60
-
61
- return @agent.visited_urls.select { |visited_url|
62
- visited_url == url
63
- }.length == 1
64
- end
65
-
66
- #
67
- # Returns +true+ if the agent has visited the specified _link_, returns
68
- # +false+ otherwise.
69
- #
70
- def visited_link?(link)
71
- @agent.visited?(COURSE_URL.merge(URI.encode(link)))
72
- end
73
-
74
- def visit_failed?(link)
75
- @agent.failed?(COURSE_URL.merge(URI.encode(link)))
76
- end
77
-
78
- def should_visit_link(link)
79
- visited_link?(link).should == true
80
- end
81
-
82
- def should_ignore_link(link)
83
- visited_link?(link).should == false
84
- end
85
-
86
- def should_visit_once(link)
87
- visited_once?(link).should == true
88
- end
89
-
90
- def should_fail_link(link)
91
- visited_link?(link).should == false
92
- visit_failed?(link).should == true
93
- end
94
- end
95
- end
@@ -1,10 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
4
- <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
5
- <script type="text/javascript" src="../scripts/course.js"></script>
6
- <script type="text/javascript">
7
- fail();
8
- </script>
9
- </head>
10
- </html>
@@ -1,9 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Absolute Links</title>
4
- </head>
5
-
6
- <body>
7
- <p>Absolute links to an unvisited page</p>
8
- </body>
9
- </html>
@@ -1,19 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Absolute Links</title>
4
- </head>
5
-
6
- <body>
7
- <p>Absolute links</p>
8
-
9
- <ul>
10
- <li class="nofollow">
11
- <a href="/course/absolute/start.html">should not follow absolute links to the current page</a>
12
- </li>
13
-
14
- <li class="follow">
15
- <a href="/course/absolute/next.html">should follow absolute links to unvisited pages</a>
16
- </li>
17
- </ul>
18
- </body>
19
- </html>
@@ -1,10 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
4
- <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
5
- <script type="text/javascript" src="../scripts/course.js"></script>
6
- <script type="text/javascript">
7
- fail();
8
- </script>
9
- </head>
10
- </html>