spidr 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. data.tar.gz.sig +0 -0
  2. data/History.rdoc +191 -0
  3. data/Manifest.txt +10 -34
  4. data/{README.txt → README.rdoc} +3 -1
  5. data/Rakefile +6 -4
  6. data/lib/spidr/agent.rb +137 -97
  7. data/lib/spidr/auth_credential.rb +25 -0
  8. data/lib/spidr/auth_store.rb +157 -0
  9. data/lib/spidr/cookie_jar.rb +166 -0
  10. data/lib/spidr/filters.rb +2 -0
  11. data/lib/spidr/page.rb +75 -11
  12. data/lib/spidr/sanitizers.rb +59 -0
  13. data/lib/spidr/session_cache.rb +119 -0
  14. data/lib/spidr/version.rb +1 -1
  15. data/spec/agent_spec.rb +2 -2
  16. data/spec/helpers/history.rb +34 -0
  17. data/spec/helpers/wsoc.rb +83 -0
  18. data/spec/page_examples.rb +5 -1
  19. data/spec/page_spec.rb +30 -0
  20. data/spec/sanitizers_spec.rb +67 -0
  21. data/tasks/yard.rb +1 -1
  22. metadata +24 -40
  23. metadata.gz.sig +0 -0
  24. data/History.txt +0 -167
  25. data/spec/helpers/course.rb +0 -95
  26. data/static/course/absolute/index.html +0 -10
  27. data/static/course/absolute/next.html +0 -9
  28. data/static/course/absolute/start.html +0 -19
  29. data/static/course/empty/index.html +0 -10
  30. data/static/course/empty/start.html +0 -23
  31. data/static/course/fail.html +0 -14
  32. data/static/course/frames/frame.html +0 -15
  33. data/static/course/frames/frame_next.html +0 -9
  34. data/static/course/frames/iframe.html +0 -15
  35. data/static/course/frames/iframe_next.html +0 -9
  36. data/static/course/frames/index.html +0 -10
  37. data/static/course/frames/start.html +0 -15
  38. data/static/course/index.html +0 -10
  39. data/static/course/javascript/index.html +0 -10
  40. data/static/course/javascript/start.html +0 -19
  41. data/static/course/loop/index.html +0 -10
  42. data/static/course/loop/next.html +0 -13
  43. data/static/course/loop/start.html +0 -19
  44. data/static/course/relative/current_directory.html +0 -9
  45. data/static/course/relative/index.html +0 -10
  46. data/static/course/relative/normal.html +0 -9
  47. data/static/course/relative/same_directory.html +0 -9
  48. data/static/course/relative/start.html +0 -27
  49. data/static/course/remote/index.html +0 -10
  50. data/static/course/remote/next.html +0 -9
  51. data/static/course/remote/start.html +0 -27
  52. data/static/course/scripts/course.js +0 -29
  53. data/static/course/scripts/jquery-1.2.6.min.js +0 -32
  54. data/static/course/specs.json +0 -1
  55. data/static/course/start.html +0 -27
  56. data/tasks/course.rb +0 -63
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
30
30
  pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2009-11-25 00:00:00 -08:00
33
+ date: 2010-01-06 00:00:00 -08:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -63,6 +63,16 @@ dependencies:
63
63
  - !ruby/object:Gem::Version
64
64
  version: 0.4.0
65
65
  version:
66
+ - !ruby/object:Gem::Dependency
67
+ name: wsoc
68
+ type: :development
69
+ version_requirement:
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 0.1.1
75
+ version:
66
76
  - !ruby/object:Gem::Dependency
67
77
  name: hoe
68
78
  type: :development
@@ -71,7 +81,7 @@ dependencies:
71
81
  requirements:
72
82
  - - ">="
73
83
  - !ruby/object:Gem::Version
74
- version: 2.3.3
84
+ version: 2.4.0
75
85
  version:
76
86
  description: |-
77
87
  Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -84,18 +94,17 @@ executables: []
84
94
  extensions: []
85
95
 
86
96
  extra_rdoc_files:
87
- - History.txt
88
97
  - Manifest.txt
89
- - README.txt
90
98
  files:
91
- - History.txt
99
+ - History.rdoc
92
100
  - Manifest.txt
93
- - README.txt
101
+ - README.rdoc
94
102
  - Rakefile
95
103
  - lib/spidr.rb
96
104
  - lib/spidr/extensions.rb
97
105
  - lib/spidr/extensions/uri.rb
98
106
  - lib/spidr/page.rb
107
+ - lib/spidr/sanitizers.rb
99
108
  - lib/spidr/rules.rb
100
109
  - lib/spidr/filters.rb
101
110
  - lib/spidr/events.rb
@@ -106,53 +115,28 @@ files:
106
115
  - lib/spidr/actions/exceptions/skip_link.rb
107
116
  - lib/spidr/actions/exceptions/skip_page.rb
108
117
  - lib/spidr/actions/actions.rb
118
+ - lib/spidr/session_cache.rb
119
+ - lib/spidr/cookie_jar.rb
120
+ - lib/spidr/auth_credential.rb
121
+ - lib/spidr/auth_store.rb
109
122
  - lib/spidr/agent.rb
110
123
  - lib/spidr/spidr.rb
111
124
  - lib/spidr/version.rb
112
125
  - tasks/spec.rb
113
126
  - tasks/yard.rb
114
- - tasks/course.rb
115
127
  - spec/spec_helper.rb
116
- - spec/helpers/course.rb
128
+ - spec/helpers/history.rb
129
+ - spec/helpers/wsoc.rb
117
130
  - spec/helpers/page.rb
118
131
  - spec/extensions/uri_spec.rb
119
132
  - spec/page_examples.rb
120
133
  - spec/page_spec.rb
121
134
  - spec/rules_spec.rb
135
+ - spec/sanitizers_spec.rb
122
136
  - spec/filters_spec.rb
123
137
  - spec/actions_spec.rb
124
138
  - spec/agent_spec.rb
125
139
  - spec/spidr_spec.rb
126
- - static/course/index.html
127
- - static/course/start.html
128
- - static/course/fail.html
129
- - static/course/scripts/jquery-1.2.6.min.js
130
- - static/course/scripts/course.js
131
- - static/course/empty/index.html
132
- - static/course/empty/start.html
133
- - static/course/javascript/index.html
134
- - static/course/javascript/start.html
135
- - static/course/loop/index.html
136
- - static/course/loop/start.html
137
- - static/course/loop/next.html
138
- - static/course/relative/index.html
139
- - static/course/relative/start.html
140
- - static/course/relative/normal.html
141
- - static/course/relative/current_directory.html
142
- - static/course/relative/same_directory.html
143
- - static/course/absolute/index.html
144
- - static/course/absolute/start.html
145
- - static/course/absolute/next.html
146
- - static/course/remote/index.html
147
- - static/course/remote/start.html
148
- - static/course/remote/next.html
149
- - static/course/frames/index.html
150
- - static/course/frames/start.html
151
- - static/course/frames/iframe.html
152
- - static/course/frames/iframe_next.html
153
- - static/course/frames/frame.html
154
- - static/course/frames/frame_next.html
155
- - static/course/specs.json
156
140
  has_rdoc: yard
157
141
  homepage: http://spidr.rubyforge.org
158
142
  licenses: []
@@ -160,7 +144,7 @@ licenses: []
160
144
  post_install_message:
161
145
  rdoc_options:
162
146
  - --main
163
- - README.txt
147
+ - README.rdoc
164
148
  require_paths:
165
149
  - lib
166
150
  required_ruby_version: !ruby/object:Gem::Requirement
metadata.gz.sig CHANGED
Binary file
data/History.txt DELETED
@@ -1,167 +0,0 @@
1
- === 0.2.1 / 2009-11-25
2
-
3
- * Added Spidr::Events#every_ok_page.
4
- * Added Spidr::Events#every_redirect_page.
5
- * Added Spidr::Events#every_timedout_page.
6
- * Added Spidr::Events#every_bad_request_page.
7
- * Added Spidr::Events#every_unauthorized_page.
8
- * Added Spidr::Events#every_forbidden_page.
9
- * Added Spidr::Events#every_missing_page.
10
- * Added Spidr::Events#every_internal_server_error_page.
11
- * Added Spidr::Events#every_txt_page.
12
- * Added Spidr::Events#every_html_page.
13
- * Added Spidr::Events#every_xml_page.
14
- * Added Spidr::Events#every_xsl_page.
15
- * Added Spidr::Events#every_doc.
16
- * Added Spidr::Events#every_html_doc.
17
- * Added Spidr::Events#every_xml_doc.
18
- * Added Spidr::Events#every_xsl_doc.
19
- * Added Spidr::Events#every_rss_doc.
20
- * Added Spidr::Events#every_atom_doc.
21
- * Added Spidr::Events#every_javascript_page.
22
- * Added Spidr::Events#every_css_page.
23
- * Added Spidr::Events#every_rss_page.
24
- * Added Spidr::Events#every_atom_page.
25
- * Added Spidr::Events#every_ms_word_page.
26
- * Added Spidr::Events#every_pdf_page.
27
- * Added Spidr::Events#every_zip_page.
28
- * Fixed a bug where Spidr::Agent#delay was not being used to delay
29
- requesting pages.
30
- * Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
31
-
32
- === 0.2.0 / 2009-10-10
33
-
34
- * Added URI.expand_path.
35
- * Added Spidr::Page#search.
36
- * Added Spidr::Page#at.
37
- * Added Spidr::Page#title.
38
- * Added Spidr::Agent#failures=.
39
- * Added a HTTP session cache to Spidr::Agent, per suggestion of falter.
40
- * Added Spidr::Agent#get_session.
41
- * Added Spidr::Agent#kill_session.
42
- * Added Spidr.proxy=.
43
- * Added Spidr.disable_proxy!.
44
- * Aliased Spidr::Page#txt? to Spidr::Page#plain_text?.
45
- * Aliased Spidr::Page#ok? to Spidr::Page#is_ok?.
46
- * Aliased Spidr::Page#redirect? to Spidr::Page#is_redirect?.
47
- * Aliased Spidr::Page#unauthorized? to Spidr::Page#is_unauthorized?.
48
- * Aliased Spidr::Page#forbidden? to Spidr::Page#is_forbidden?.
49
- * Aliased Spidr::Page#missing? to Spidr::Page#is_missing?.
50
- * Split URL filtering code out of Spidr::Agent and into Spidr::Filtering.
51
- * Split URL / Page event code out of Spidr::Agent and into Spidr::Events.
52
- * Split pause! / continue! / skip_link! / skip_page! methods out of
53
- Spidr::Agent and into Spidr::Actions.
54
- * Fixed a bug in Spidr::Page#code, where it was not returning an Integer.
55
- * Make sure Spidr::Page#doc returns Nokogiri::XML::Document objects for
56
- RSS/RDF/Atom pages as well.
57
- * Fixed the handling of the Location header in Spidr::Page#links
58
- (thanks falter).
59
- * Fixed a bug in Spidr::Page#to_absolute where trailing '/' characters on
60
- URI paths were not being preserved (thanks falter).
61
- * Fixed a bug where the URI query was not being sent with the request
62
- in Spidr::Agent#get_page (thanks Damian Steer).
63
- * Fixed a bug where SSL sessions were not being properly setup
64
- (thanks falter).
65
- * Switched Spidr::Agent#history to be a Set, to improve search-time
66
- of the history (thanks falter).
67
- * Switched Spidr::Agent#failures to a Set.
68
- * Allow a block to be passed to Spidr::Agent#run, which will receive all
69
- pages visited.
70
- * Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks to
71
- Spidr::Agent#run.
72
- * Made Spidr::Agent#visit_page public.
73
- * Moved to YARD based documentation.
74
-
75
- === 0.1.9 / 2009-06-13
76
-
77
- * Upgraded to Hoe 2.0.0.
78
- * Use Hoe.spec instead of Hoe.new.
79
- * Use the Hoe signing task for signed gems.
80
- * Added the Agent#schemes and Agent#schemes= methods.
81
- * Added a warning message if 'net/https' cannot be loaded.
82
- * Allow the list of acceptable URL schemes to be passed into Agent.new.
83
- * Allow history and queue information to be passed into Agent.new.
84
- * Agent#start_at no longer clears the history or the queue.
85
- * Fixed a bug in the sanitization of semi-escaped URLs.
86
- * Fixed a bug where https URLs would be followed even if 'net/https'
87
- could not be loaded.
88
- * Removed Agent::SCHEMES.
89
-
90
- === 0.1.8 / 2009-05-27
91
-
92
- * Added the Agent#pause! and Agent#continue! methods.
93
- * Added the Agent#running? and Agent#paused? methods.
94
- * Added an alias for pending_urls to the queue methods.
95
- * Added Agent#queue to provide read access to the queue.
96
- * Added Agent#queue= and Agent#history= for setting the queue and history.
97
- * Added Agent#to_hash which returns a Hash of the agents queue and history.
98
- * Made Agent#enqueue and Agent#queued? public.
99
- * Added more specs.
100
-
101
- === 0.1.7 / 2009-04-24
102
-
103
- * Added Agent#all_headers.
104
- * Fixed a bug where Page#headers was always +nil+.
105
- * Spidr::Agent will now follow the Location header in HTTP 300, 301, 302,
106
- 303 and 307 Redirects.
107
- * Spidr::Agent will now follow iframe and frame tags.
108
-
109
- === 0.1.6 / 2009-04-14
110
-
111
- * Added Agent#failures, a list of URLs which could not be visited.
112
- * Added Agent#failed?.
113
- * Added Agent#every_failed_url.
114
- * Added Agent#clear, which clears the history and failures URL lists.
115
- * Improved fault tolerance in Agent#get_page.
116
- * If a Network or HTTP error is encountered, the URL will be added to
117
- the failures list and the next URL will be visited.
118
- * Fixed a typo in Agent#ignore_exts_like.
119
- * Updated the Web Spider Obstacle Course with links that always fail to be
120
- visited.
121
-
122
- === 0.1.5 / 2009-03-22
123
-
124
- * Catch malformed URIs in Page#to_absolute and return +nil+.
125
- * Filter out +nil+ URIs in Page#urls.
126
-
127
- === 0.1.4 / 2009-01-15
128
-
129
- * Use Nokogiri for HTML and XML parsing.
130
-
131
- === 0.1.3 / 2009-01-10
132
-
133
- * Added the :host options to Spidr::Agent#initialize.
134
- * Added the Web Spider Obstacle Course files to the Manifest.
135
- * Aliased Spidr::Agent#visited_urls to Spidr::Agent#history.
136
-
137
- === 0.1.2 / 2008-11-06
138
-
139
- * Fixed a bug in Page#to_absolute where URLs with no path were not
140
- receiving a default path of <tt>/</tt>.
141
- * Fixed a bug in Page#to_absolute where URL paths were not being
142
- expanded, in order to remove <tt>..</tt> and <tt>.</tt> directories.
143
- * Fixed a bug where absolute URLs could have a blank path, thus causing
144
- Agent#get_page to crash when it performed the HTTP request.
145
- * Added RSpec spec tests.
146
- * Created a Web-Spider Obstacle Course
147
- (http://spidr.rubyforge.org/course/start.html) which is used in the spec
148
- tests.
149
-
150
- === 0.1.1 / 2008-10-04
151
-
152
- * Added a reader method for the response instance variable in Page.
153
- * Fixed a bug in Page#method_missing.
154
-
155
- === 0.1.0 / 2008-05-23
156
-
157
- * Initial release.
158
- * Black-list or white-list URLs based upon:
159
- * Host name
160
- * Port number
161
- * Full link
162
- * URL extension
163
- * Provides call-backs for:
164
- * Every visited Page.
165
- * Every visited URL.
166
- * Every visited URL that matches a specified pattern.
167
-
data/spec/helpers/course.rb DELETED
@@ -1,95 +0,0 @@
1
- require 'open-uri'
2
- require 'json'
3
-
4
- module Helpers
5
- module Course
6
- COURSE_URL = URI('http://spidr.rubyforge.org/course/start.html')
7
-
8
- SPECS_URL = 'http://spidr.rubyforge.org/course/specs.json'
9
-
10
- def self.included(base)
11
- specs = JSON.parse(open(SPECS_URL).read)
12
-
13
- if specs.kind_of?(Array)
14
- specs.each do |spec|
15
- message = spec['message'].to_s.dump
16
- url = spec['url'].to_s.dump
17
-
18
- case spec['behavior']
19
- when 'follow'
20
- base.module_eval %{
21
- it #{message} do
22
- should_visit_link(#{url})
23
- end
24
- }
25
- when 'nofollow'
26
- base.module_eval %{
27
- it #{message} do
28
- should_visit_once(#{url})
29
- end
30
- }
31
- when 'fail'
32
- base.module_eval %{
33
- it #{message} do
34
- should_fail_link(#{url})
35
- end
36
- }
37
- else
38
- link = spec['link'].to_s.dump
39
-
40
- base.module_eval %{
41
- it #{message} do
42
- should_ignore_link(#{link})
43
- should_ignore_link(#{url})
44
- end
45
- }
46
- end
47
- end
48
- end
49
- end
50
-
51
- def run_course
52
- Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
53
- agent.every_failed_url { |url| puts "[FAILED] #{url}" }
54
- agent.every_url { |url| puts url }
55
- end
56
- end
57
-
58
- def visited_once?(link)
59
- url = COURSE_URL.merge(URI.encode(link))
60
-
61
- return @agent.visited_urls.select { |visited_url|
62
- visited_url == url
63
- }.length == 1
64
- end
65
-
66
- #
67
- # Returns +true+ if the agent has visited the specified _link_, returns
68
- # +false+ otherwise.
69
- #
70
- def visited_link?(link)
71
- @agent.visited?(COURSE_URL.merge(URI.encode(link)))
72
- end
73
-
74
- def visit_failed?(link)
75
- @agent.failed?(COURSE_URL.merge(URI.encode(link)))
76
- end
77
-
78
- def should_visit_link(link)
79
- visited_link?(link).should == true
80
- end
81
-
82
- def should_ignore_link(link)
83
- visited_link?(link).should == false
84
- end
85
-
86
- def should_visit_once(link)
87
- visited_once?(link).should == true
88
- end
89
-
90
- def should_fail_link(link)
91
- visited_link?(link).should == false
92
- visit_failed?(link).should == true
93
- end
94
- end
95
- end
data/static/course/absolute/index.html DELETED
@@ -1,10 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
4
- <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
5
- <script type="text/javascript" src="../scripts/course.js"></script>
6
- <script type="text/javascript">
7
- fail();
8
- </script>
9
- </head>
10
- </html>
data/static/course/absolute/next.html DELETED
@@ -1,9 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Absolute Links</title>
4
- </head>
5
-
6
- <body>
7
- <p>Absolute links to an unvisited page</p>
8
- </body>
9
- </html>
data/static/course/absolute/start.html DELETED
@@ -1,19 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Absolute Links</title>
4
- </head>
5
-
6
- <body>
7
- <p>Absolute links</p>
8
-
9
- <ul>
10
- <li class="nofollow">
11
- <a href="/course/absolute/start.html">should not follow absolute links to the current page</a>
12
- </li>
13
-
14
- <li class="follow">
15
- <a href="/course/absolute/next.html">should follow absolute links to unvisited pages</a>
16
- </li>
17
- </ul>
18
- </body>
19
- </html>
data/static/course/empty/index.html DELETED
@@ -1,10 +0,0 @@
1
- <html>
2
- <head>
3
- <title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
4
- <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
5
- <script type="text/javascript" src="../scripts/course.js"></script>
6
- <script type="text/javascript">
7
- fail();
8
- </script>
9
- </head>
10
- </html>