spidr_epg 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ NTcxZjE1YTc5OTY1MTQyM2NmYTRjM2VlODE5ZWIwZjBiMjc1OGVmMg==
5
+ data.tar.gz: !binary |-
6
+ ZWIyNzI5NGMwMGMwNzJmYTA4Nzg2YzI4OTk3YTIyNjYyZGU4ZDQzYw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ODdiYzJjYWM3NjM3NzBjZjExMzI1YzJmN2I2YzA2YTQ5Y2I5N2IwNTRhMzE3
10
+ YWE0NDI5MGMwZTM1M2U2YTFkNWNiOWNjY2Q1Y2ExYjYzY2M4MDI1MTliODhi
11
+ NTM3Y2UyNDUxZDkxZjIwNTRmNzI1NzZjZjliNzE0YzcwNTUyMjk=
12
+ data.tar.gz: !binary |-
13
+ MDZkYjMwYTBkZTUyMGMwMGRkZTgxZWM4ZWY3YTgwNWJlNTQ4NmVlNjVkNDli
14
+ ZWQ3M2FiYjkwYjJkNjI1NGQyOTJhMmM5ZTk2NTZjMzk4ZTgxNmYyNmIxNTA2
15
+ YTZlNmVmYTg0ZGVmYWFjODMzOTBjMmE3MTAzY2NiYTAxYTQ5ZTU=
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ pkg
2
+ doc
3
+ web
4
+ tmp
5
+ Gemfile.lock
6
+ .DS_Store
7
+ .bundle
8
+ .yardoc
9
+ *.swp
10
+ *~
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --colour --format documentation
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ --markup markdown --title 'Spidr Documentation' --protected --files ChangeLog.md,LICENSE.txt
data/ChangeLog.md ADDED
@@ -0,0 +1,291 @@
1
+ ### 1.4.2 / 2012-04-12
2
+
3
+ * 对此gem进行了修改.
4
+
5
+
6
+ ### 0.4.1 / 2011-12-08
7
+
8
+ * Catch `OpenSSL::SSL::SSLError` exceptions when initiating HTTPS sessions.
9
+
10
+ ### 0.4.0 / 2011-08-07
11
+
12
+ * Added {Spidr::Headers#content_charset}.
13
+ * Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
14
+ This ensures that Nokogiri will preserve the body encoding.
15
+ * Made {Spidr::Headers#is_content_type?} public.
16
+ * Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
17
+ or the sub-type.
18
+
19
+ ### 0.3.2 / 2011-06-20
20
+
21
+ * Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
22
+ {Spidr::Filters} and {Spidr::Sanitizers}.
23
+ * Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
24
+ * Reduce usage of `self.included` and `module_eval`.
25
+ * Reduce usage of nested-blocks.
26
+ * Reduce usage of `return`.
27
+
28
+ ### 0.3.1 / 2011-04-22
29
+
30
+ * Require `set` in `spidr/headers.rb`.
31
+
32
+ ### 0.3.0 / 2011-04-14
33
+
34
+ * Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
35
+ * Split all header related methods out of {Spidr::Page} and into
36
+ {Spidr::Headers}.
37
+ * Split all body related methods out of {Spidr::Page} and into
38
+ {Spidr::Body}.
39
+ * Split all link related methods out of {Spidr::Page} and into
40
+ {Spidr::Links}.
41
+ * Added {Spidr::Headers#directory?}.
42
+ * Added {Spidr::Headers#json?}.
43
+ * Added {Spidr::Links#each_url}.
44
+ * Added {Spidr::Links#each_link}.
45
+ * Added {Spidr::Links#each_redirect}.
46
+ * Added {Spidr::Links#each_meta_redirect}.
47
+ * Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
48
+ * Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
49
+ * Also check for `application/xml` in {Spidr::Headers#xml?}.
50
+ * Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
51
+ * Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
52
+ bug, where it expects an absolute path for all FTP URIs.
53
+ * Refactored {URI.expand_path}.
54
+ * Start the session in {Spidr::SessionCache#[]} to prevent multiple
55
+ `CONNECT` commands being sent to HTTP Proxies (thanks falaise).
56
+
57
+ ### 0.2.7 / 2010-08-17
58
+
59
+ * Added {Spidr::CookieJar#cookies_for_host} (thanks zapnap).
60
+ * Renamed `Spidr::Page#cookie` to `Spidr::Page#raw_cookie`.
61
+ * Rescue `URI::InvalidComponentError` exceptions in
62
+ `Spidr::Page#to_absolute` (thanks zapnap).
63
+
64
+ ### 0.2.6 / 2010-07-05
65
+
66
+ * Fixed a bug in `Spidr::Page#meta_redirect`, by calling
67
+ `Nokogiri::XML::Element#get_attribute` instead of `attr`.
68
+
69
+ ### 0.2.5 / 2010-07-02
70
+
71
+ * Added `Spidr::Page#meta_redirect`.
72
+ * Added `Spidr::Page#meta_redirect?`.
73
+ * Manage development dependencies with Bundler.
74
+ * Support following "old-school" meta-refresh redirects (thanks zapnap).
75
+ * Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
76
+ * Fixed a constant lookup issue in {Spidr::Agent}.
77
+ * Use `yield` instead of `block.call` when necessary.
78
+
79
+ ### 0.2.4 / 2010-05-05
80
+
81
+ * Added {Spidr::Filters#visit_urls}.
82
+ * Added {Spidr::Filters#visit_urls_like}.
83
+ * Added {Spidr::Filters#ignore_urls}.
84
+ * Added {Spidr::Filters#ignore_urls_like}.
85
+ * Added `Spidr::Page#is_content_type?`.
86
+ * Default `Spidr::Page#body` to an empty String.
87
+ * Default `Spidr::Page#content_type` to an empty String.
88
+ * Default `Spidr::Page#content_types` to an empty Array.
89
+ * Improved reliability of {Spidr::Page#is_redirect?}.
90
+ * Improved content type detection in {Spidr::Page} to handle `Content-Type`
91
+ headers containing charsets (thanks Josh Lindsey).
92
+
93
+ ### 0.2.3 / 2010-02-27
94
+
95
+ * Migrated to Jeweler for packaging and releasing RubyGems.
96
+ * Switched to MarkDown formatted YARD documentation.
97
+ * Added {Spidr::Events#every_link}.
98
+ * Added {Spidr::SessionCache#active?}.
99
+ * Added specs for {Spidr::SessionCache}.
100
+
101
+ ### 0.2.2 / 2010-01-06
102
+
103
+ * Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
104
+ * Integrated the new WSOC into the specs.
105
+ * Removed the built-in Web Spider Obstacle Course.
106
+ * Added `Spidr::Page#content_types`.
107
+ * Added `Spidr::Page#cookie`.
108
+ * Added `Spidr::Page#cookies`.
109
+ * Added `Spidr::Page#cookie_params`.
110
+ * Added {Spidr::Sanitizers}.
111
+ * Added {Spidr::SessionCache}.
112
+ * Added {Spidr::CookieJar} (thanks Nick Plante).
113
+ * Added {Spidr::AuthStore} (thanks Nick Plante).
114
+ * Added {Spidr::Agent#post_page} (thanks Nick Plante).
115
+ * Renamed `Spidr::Agent#get_session` to {Spidr::SessionCache#[]}.
116
+ * Renamed `Spidr::Agent#kill_session` to {Spidr::SessionCache#kill!}.
117
+
118
+ ### 0.2.1 / 2009-11-25
119
+
120
+ * Added {Spidr::Events#every_ok_page}.
121
+ * Added {Spidr::Events#every_redirect_page}.
122
+ * Added {Spidr::Events#every_timedout_page}.
123
+ * Added {Spidr::Events#every_bad_request_page}.
124
+ * Added {Spidr::Events#every_unauthorized_page}.
125
+ * Added {Spidr::Events#every_forbidden_page}.
126
+ * Added {Spidr::Events#every_missing_page}.
127
+ * Added {Spidr::Events#every_internal_server_error_page}.
128
+ * Added {Spidr::Events#every_txt_page}.
129
+ * Added {Spidr::Events#every_html_page}.
130
+ * Added {Spidr::Events#every_xml_page}.
131
+ * Added {Spidr::Events#every_xsl_page}.
132
+ * Added {Spidr::Events#every_doc}.
133
+ * Added {Spidr::Events#every_html_doc}.
134
+ * Added {Spidr::Events#every_xml_doc}.
135
+ * Added {Spidr::Events#every_xsl_doc}.
136
+ * Added {Spidr::Events#every_rss_doc}.
137
+ * Added {Spidr::Events#every_atom_doc}.
138
+ * Added {Spidr::Events#every_javascript_page}.
139
+ * Added {Spidr::Events#every_css_page}.
140
+ * Added {Spidr::Events#every_rss_page}.
141
+ * Added {Spidr::Events#every_atom_page}.
142
+ * Added {Spidr::Events#every_ms_word_page}.
143
+ * Added {Spidr::Events#every_pdf_page}.
144
+ * Added {Spidr::Events#every_zip_page}.
145
+ * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
146
+ requesting pages.
147
+ * Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
148
+
149
+ ### 0.2.0 / 2009-10-10
150
+
151
+ * Added {URI.expand_path}.
152
+ * Added `Spidr::Page#search`.
153
+ * Added `Spidr::Page#at`.
154
+ * Added `Spidr::Page#title`.
155
+ * Added {Spidr::Agent#failures=}.
156
+ * Added an HTTP session cache to {Spidr::Agent}, per suggestion of falter.
157
+ * Added `Spidr::Agent#get_session`.
158
+ * Added `Spidr::Agent#kill_session`.
159
+ * Added {Spidr.proxy=}.
160
+ * Added {Spidr.disable_proxy!}.
161
+ * Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
162
+ * Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
163
+ * Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
164
+ * Aliased `Spidr::Page#unauthorized?` to `Spidr::Page#is_unauthorized?`.
165
+ * Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
166
+ * Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
167
+ * Split URL filtering code out of {Spidr::Agent} and into
168
+ {Spidr::Filters}.
169
+ * Split URL / Page event code out of {Spidr::Agent} and into
170
+ {Spidr::Events}.
171
+ * Split pause! / continue! / skip_link! / skip_page! methods out of
172
+ {Spidr::Agent} and into {Spidr::Actions}.
173
+ * Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
174
+ * Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
175
+ RSS/RDF/Atom pages as well.
176
+ * Fixed the handling of the Location header in `Spidr::Page#links`
177
+ (thanks falter).
178
+ * Fixed a bug in `Spidr::Page#to_absolute` where trailing `/` characters on
179
+ URI paths were not being preserved (thanks falter).
180
+ * Fixed a bug where the URI query was not being sent with the request
181
+ in {Spidr::Agent#get_page} (thanks Damian Steer).
182
+ * Fixed a bug where SSL sessions were not being properly setup
183
+ (thanks falter).
184
+ * Switched {Spidr::Agent#history} to be a Set, to improve search-time
185
+ of the history (thanks falter).
186
+ * Switched {Spidr::Agent#failures} to a Set.
187
+ * Allow a block to be passed to {Spidr::Agent#run}, which will receive all
188
+ pages visited.
189
+ * Allow `Spidr::Agent#start_at` and `Spidr::Agent#continue!` to pass blocks
190
+ to {Spidr::Agent#run}.
191
+ * Made {Spidr::Agent#visit_page} public.
192
+ * Moved to YARD based documentation.
193
+
194
+ ### 0.1.9 / 2009-06-13
195
+
196
+ * Upgraded to Hoe 2.0.0.
197
+ * Use Hoe.spec instead of Hoe.new.
198
+ * Use the Hoe signing task for signed gems.
199
+ * Added the `Spidr::Agent#schemes` and `Spidr::Agent#schemes=` methods.
200
+ * Added a warning message if 'net/https' cannot be loaded.
201
+ * Allow the list of acceptable URL schemes to be passed into
202
+ {Spidr::Agent#initialize}.
203
+ * Allow history and queue information to be passed into
204
+ {Spidr::Agent#initialize}.
205
+ * {Spidr::Agent#start_at} no longer clears the history or the queue.
206
+ * Fixed a bug in the sanitization of semi-escaped URLs.
207
+ * Fixed a bug where https URLs would be followed even if 'net/https'
208
+ could not be loaded.
209
+ * Removed Spidr::Agent::SCHEMES.
210
+
211
+ ### 0.1.8 / 2009-05-27
212
+
213
+ * Added the `Spidr::Agent#pause!` and `Spidr::Agent#continue!` methods.
214
+ * Added the `Spidr::Agent#running?` and `Spidr::Agent#paused?` methods.
215
+ * Added an alias for pending_urls to the queue methods.
216
+ * Added {Spidr::Agent#queue} to provide read access to the queue.
217
+ * Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
218
+ queue and history.
219
+ * Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
220
+ history.
221
+ * Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
222
+ * Added more specs.
223
+
224
+ ### 0.1.7 / 2009-04-24
225
+
226
+ * Added `Spidr::Agent#all_headers`.
227
+ * Fixed a bug where {Spidr::Page#headers} was always `nil`.
228
+ * {Spidr::Agent} will now follow the Location header in HTTP 300,
229
+ 301, 302, 303 and 307 Redirects.
230
+ * {Spidr::Agent} will now follow iframe and frame tags.
231
+
232
+ ### 0.1.6 / 2009-04-14
233
+
234
+ * Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
235
+ * Added {Spidr::Agent#failed?}.
236
+ * Added `Spidr::Agent#every_failed_url`.
237
+ * Added {Spidr::Agent#clear}, which clears the history and failures URL
238
+ lists.
239
+ * Improved fault tolerance in {Spidr::Agent#get_page}.
240
+ * If a Network or HTTP error is encountered, the URL will be added to
241
+ the failures list and the next URL will be visited.
242
+ * Fixed a typo in `Spidr::Agent#ignore_exts_like`.
243
+ * Updated the Web Spider Obstacle Course with links that always fail to be
244
+ visited.
245
+
246
+ ### 0.1.5 / 2009-03-22
247
+
248
+ * Catch malformed URIs in `Spidr::Page#to_absolute` and return `nil`.
249
+ * Filter out `nil` URIs in `Spidr::Page#urls`.
250
+
251
+ ### 0.1.4 / 2009-01-15
252
+
253
+ * Use Nokogiri for HTML and XML parsing.
254
+
255
+ ### 0.1.3 / 2009-01-10
256
+
257
+ * Added the `:host` option to {Spidr::Agent#initialize}.
258
+ * Added the Web Spider Obstacle Course files to the Manifest.
259
+ * Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
260
+
261
+ ### 0.1.2 / 2008-11-06
262
+
263
+ * Fixed a bug in `Spidr::Page#to_absolute` where URLs with no path were not
264
+ receiving a default path of `/`.
265
+ * Fixed a bug in `Spidr::Page#to_absolute` where URL paths were not being
266
+ expanded, in order to remove `..` and `.` directories.
267
+ * Fixed a bug where absolute URLs could have a blank path, thus causing
268
+ {Spidr::Agent#get_page} to crash when it performed the HTTP request.
269
+ * Added RSpec spec tests.
270
+ * Created a Web-Spider Obstacle Course
271
+ (http://spidr.rubyforge.org/course/start.html) which is used in the spec
272
+ tests.
273
+
274
+ ### 0.1.1 / 2008-10-04
275
+
276
+ * Added a reader method for the response instance variable in Page.
277
+ * Fixed a bug in {Spidr::Page#method_missing}.
278
+
279
+ ### 0.1.0 / 2008-05-23
280
+
281
+ * Initial release.
282
+ * Black-list or white-list URLs based upon:
283
+ * Host name
284
+ * Port number
285
+ * Full link
286
+ * URL extension
287
+ * Provides call-backs for:
288
+ * Every visited Page.
289
+ * Every visited URL.
290
+ * Every visited URL that matches a specified pattern.
291
+
data/ChangeLog.md~ ADDED
@@ -0,0 +1,291 @@
1
+ ### 0.4.2 / 2012-04-12
2
+
3
+ * 对此gem进行了修改.
4
+
5
+
6
+ ### 0.4.1 / 2011-12-08
7
+
8
+ * Catch `OpenSSL::SSL::SSLError` exceptions when initiating HTTPS sessions.
9
+
10
+ ### 0.4.0 / 2011-08-07
11
+
12
+ * Added {Spidr::Headers#content_charset}.
13
+ * Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
14
+ This ensures that Nokogiri will preserve the body encoding.
15
+ * Made {Spidr::Headers#is_content_type?} public.
16
+ * Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
17
+ or the sub-type.
18
+
19
+ ### 0.3.2 / 2011-06-20
20
+
21
+ * Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
22
+ {Spidr::Filters} and {Spidr::Sanitizers}.
23
+ * Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
24
+ * Reduce usage of `self.included` and `module_eval`.
25
+ * Reduce usage of nested-blocks.
26
+ * Reduce usage of `return`.
27
+
28
+ ### 0.3.1 / 2011-04-22
29
+
30
+ * Require `set` in `spidr/headers.rb`.
31
+
32
+ ### 0.3.0 / 2011-04-14
33
+
34
+ * Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
35
+ * Split all header related methods out of {Spidr::Page} and into
36
+ {Spidr::Headers}.
37
+ * Split all body related methods out of {Spidr::Page} and into
38
+ {Spidr::Body}.
39
+ * Split all link related methods out of {Spidr::Page} and into
40
+ {Spidr::Links}.
41
+ * Added {Spidr::Headers#directory?}.
42
+ * Added {Spidr::Headers#json?}.
43
+ * Added {Spidr::Links#each_url}.
44
+ * Added {Spidr::Links#each_link}.
45
+ * Added {Spidr::Links#each_redirect}.
46
+ * Added {Spidr::Links#each_meta_redirect}.
47
+ * Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
48
+ * Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
49
+ * Also check for `application/xml` in {Spidr::Headers#xml?}.
50
+ * Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
51
+ * Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
52
+ bug, where it expects an absolute path for all FTP URIs.
53
+ * Refactored {URI.expand_path}.
54
+ * Start the session in {Spidr::SessionCache#[]} to prevent multiple
55
+ `CONNECT` commands being sent to HTTP Proxies (thanks falaise).
56
+
57
+ ### 0.2.7 / 2010-08-17
58
+
59
+ * Added {Spidr::CookieJar#cookies_for_host} (thanks zapnap).
60
+ * Renamed `Spidr::Page#cookie` to `Spidr::Page#raw_cookie`.
61
+ * Rescue `URI::InvalidComponentError` exceptions in
62
+ `Spidr::Page#to_absolute` (thanks zapnap).
63
+
64
+ ### 0.2.6 / 2010-07-05
65
+
66
+ * Fixed a bug in `Spidr::Page#meta_redirect`, by calling
67
+ `Nokogiri::XML::Element#get_attribute` instead of `attr`.
68
+
69
+ ### 0.2.5 / 2010-07-02
70
+
71
+ * Added `Spidr::Page#meta_redirect`.
72
+ * Added `Spidr::Page#meta_redirect?`.
73
+ * Manage development dependencies with Bundler.
74
+ * Support following "old-school" meta-refresh redirects (thanks zapnap).
75
+ * Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
76
+ * Fixed a constant lookup issue in {Spidr::Agent}.
77
+ * Use `yield` instead of `block.call` when necessary.
78
+
79
+ ### 0.2.4 / 2010-05-05
80
+
81
+ * Added {Spidr::Filters#visit_urls}.
82
+ * Added {Spidr::Filters#visit_urls_like}.
83
+ * Added {Spidr::Filters#ignore_urls}.
84
+ * Added {Spidr::Filters#ignore_urls_like}.
85
+ * Added `Spidr::Page#is_content_type?`.
86
+ * Default `Spidr::Page#body` to an empty String.
87
+ * Default `Spidr::Page#content_type` to an empty String.
88
+ * Default `Spidr::Page#content_types` to an empty Array.
89
+ * Improved reliability of {Spidr::Page#is_redirect?}.
90
+ * Improved content type detection in {Spidr::Page} to handle `Content-Type`
91
+ headers containing charsets (thanks Josh Lindsey).
92
+
93
+ ### 0.2.3 / 2010-02-27
94
+
95
+ * Migrated to Jeweler for packaging and releasing RubyGems.
96
+ * Switched to MarkDown formatted YARD documentation.
97
+ * Added {Spidr::Events#every_link}.
98
+ * Added {Spidr::SessionCache#active?}.
99
+ * Added specs for {Spidr::SessionCache}.
100
+
101
+ ### 0.2.2 / 2010-01-06
102
+
103
+ * Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
104
+ * Integrated the new WSOC into the specs.
105
+ * Removed the built-in Web Spider Obstacle Course.
106
+ * Added `Spidr::Page#content_types`.
107
+ * Added `Spidr::Page#cookie`.
108
+ * Added `Spidr::Page#cookies`.
109
+ * Added `Spidr::Page#cookie_params`.
110
+ * Added {Spidr::Sanitizers}.
111
+ * Added {Spidr::SessionCache}.
112
+ * Added {Spidr::CookieJar} (thanks Nick Plante).
113
+ * Added {Spidr::AuthStore} (thanks Nick Plante).
114
+ * Added {Spidr::Agent#post_page} (thanks Nick Plante).
115
+ * Renamed `Spidr::Agent#get_session` to {Spidr::SessionCache#[]}.
116
+ * Renamed `Spidr::Agent#kill_session` to {Spidr::SessionCache#kill!}.
117
+
118
+ ### 0.2.1 / 2009-11-25
119
+
120
+ * Added {Spidr::Events#every_ok_page}.
121
+ * Added {Spidr::Events#every_redirect_page}.
122
+ * Added {Spidr::Events#every_timedout_page}.
123
+ * Added {Spidr::Events#every_bad_request_page}.
124
+ * Added {Spidr::Events#every_unauthorized_page}.
125
+ * Added {Spidr::Events#every_forbidden_page}.
126
+ * Added {Spidr::Events#every_missing_page}.
127
+ * Added {Spidr::Events#every_internal_server_error_page}.
128
+ * Added {Spidr::Events#every_txt_page}.
129
+ * Added {Spidr::Events#every_html_page}.
130
+ * Added {Spidr::Events#every_xml_page}.
131
+ * Added {Spidr::Events#every_xsl_page}.
132
+ * Added {Spidr::Events#every_doc}.
133
+ * Added {Spidr::Events#every_html_doc}.
134
+ * Added {Spidr::Events#every_xml_doc}.
135
+ * Added {Spidr::Events#every_xsl_doc}.
136
+ * Added {Spidr::Events#every_rss_doc}.
137
+ * Added {Spidr::Events#every_atom_doc}.
138
+ * Added {Spidr::Events#every_javascript_page}.
139
+ * Added {Spidr::Events#every_css_page}.
140
+ * Added {Spidr::Events#every_rss_page}.
141
+ * Added {Spidr::Events#every_atom_page}.
142
+ * Added {Spidr::Events#every_ms_word_page}.
143
+ * Added {Spidr::Events#every_pdf_page}.
144
+ * Added {Spidr::Events#every_zip_page}.
145
+ * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
146
+ requesting pages.
147
+ * Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
148
+
149
+ ### 0.2.0 / 2009-10-10
150
+
151
+ * Added {URI.expand_path}.
152
+ * Added `Spidr::Page#search`.
153
+ * Added `Spidr::Page#at`.
154
+ * Added `Spidr::Page#title`.
155
+ * Added {Spidr::Agent#failures=}.
156
+ * Added an HTTP session cache to {Spidr::Agent}, per suggestion of falter.
157
+ * Added `Spidr::Agent#get_session`.
158
+ * Added `Spidr::Agent#kill_session`.
159
+ * Added {Spidr.proxy=}.
160
+ * Added {Spidr.disable_proxy!}.
161
+ * Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
162
+ * Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
163
+ * Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
164
+ * Aliased `Spidr::Page#unauthorized?` to `Spidr::Page#is_unauthorized?`.
165
+ * Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
166
+ * Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
167
+ * Split URL filtering code out of {Spidr::Agent} and into
168
+ {Spidr::Filters}.
169
+ * Split URL / Page event code out of {Spidr::Agent} and into
170
+ {Spidr::Events}.
171
+ * Split pause! / continue! / skip_link! / skip_page! methods out of
172
+ {Spidr::Agent} and into {Spidr::Actions}.
173
+ * Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
174
+ * Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
175
+ RSS/RDF/Atom pages as well.
176
+ * Fixed the handling of the Location header in `Spidr::Page#links`
177
+ (thanks falter).
178
+ * Fixed a bug in `Spidr::Page#to_absolute` where trailing `/` characters on
179
+ URI paths were not being preserved (thanks falter).
180
+ * Fixed a bug where the URI query was not being sent with the request
181
+ in {Spidr::Agent#get_page} (thanks Damian Steer).
182
+ * Fixed a bug where SSL sessions were not being properly setup
183
+ (thanks falter).
184
+ * Switched {Spidr::Agent#history} to be a Set, to improve search-time
185
+ of the history (thanks falter).
186
+ * Switched {Spidr::Agent#failures} to a Set.
187
+ * Allow a block to be passed to {Spidr::Agent#run}, which will receive all
188
+ pages visited.
189
+ * Allow `Spidr::Agent#start_at` and `Spidr::Agent#continue!` to pass blocks
190
+ to {Spidr::Agent#run}.
191
+ * Made {Spidr::Agent#visit_page} public.
192
+ * Moved to YARD based documentation.
193
+
194
+ ### 0.1.9 / 2009-06-13
195
+
196
+ * Upgraded to Hoe 2.0.0.
197
+ * Use Hoe.spec instead of Hoe.new.
198
+ * Use the Hoe signing task for signed gems.
199
+ * Added the `Spidr::Agent#schemes` and `Spidr::Agent#schemes=` methods.
200
+ * Added a warning message if 'net/https' cannot be loaded.
201
+ * Allow the list of acceptable URL schemes to be passed into
202
+ {Spidr::Agent#initialize}.
203
+ * Allow history and queue information to be passed into
204
+ {Spidr::Agent#initialize}.
205
+ * {Spidr::Agent#start_at} no longer clears the history or the queue.
206
+ * Fixed a bug in the sanitization of semi-escaped URLs.
207
+ * Fixed a bug where https URLs would be followed even if 'net/https'
208
+ could not be loaded.
209
+ * Removed Spidr::Agent::SCHEMES.
210
+
211
+ ### 0.1.8 / 2009-05-27
212
+
213
+ * Added the `Spidr::Agent#pause!` and `Spidr::Agent#continue!` methods.
214
+ * Added the `Spidr::Agent#running?` and `Spidr::Agent#paused?` methods.
215
+ * Added an alias for pending_urls to the queue methods.
216
+ * Added {Spidr::Agent#queue} to provide read access to the queue.
217
+ * Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
218
+ queue and history.
219
+ * Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
220
+ history.
221
+ * Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
222
+ * Added more specs.
223
+
224
+ ### 0.1.7 / 2009-04-24
225
+
226
+ * Added `Spidr::Agent#all_headers`.
227
+ * Fixed a bug where {Spidr::Page#headers} was always `nil`.
228
+ * {Spidr::Agent} will now follow the Location header in HTTP 300,
229
+ 301, 302, 303 and 307 Redirects.
230
+ * {Spidr::Agent} will now follow iframe and frame tags.
231
+
232
+ ### 0.1.6 / 2009-04-14
233
+
234
+ * Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
235
+ * Added {Spidr::Agent#failed?}.
236
+ * Added `Spidr::Agent#every_failed_url`.
237
+ * Added {Spidr::Agent#clear}, which clears the history and failures URL
238
+ lists.
239
+ * Improved fault tolerance in {Spidr::Agent#get_page}.
240
+ * If a Network or HTTP error is encountered, the URL will be added to
241
+ the failures list and the next URL will be visited.
242
+ * Fixed a typo in `Spidr::Agent#ignore_exts_like`.
243
+ * Updated the Web Spider Obstacle Course with links that always fail to be
244
+ visited.
245
+
246
+ ### 0.1.5 / 2009-03-22
247
+
248
+ * Catch malformed URIs in `Spidr::Page#to_absolute` and return `nil`.
249
+ * Filter out `nil` URIs in `Spidr::Page#urls`.
250
+
251
+ ### 0.1.4 / 2009-01-15
252
+
253
+ * Use Nokogiri for HTML and XML parsing.
254
+
255
+ ### 0.1.3 / 2009-01-10
256
+
257
+ * Added the `:host` option to {Spidr::Agent#initialize}.
258
+ * Added the Web Spider Obstacle Course files to the Manifest.
259
+ * Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
260
+
261
+ ### 0.1.2 / 2008-11-06
262
+
263
+ * Fixed a bug in `Spidr::Page#to_absolute` where URLs with no path were not
264
+ receiving a default path of `/`.
265
+ * Fixed a bug in `Spidr::Page#to_absolute` where URL paths were not being
266
+ expanded, in order to remove `..` and `.` directories.
267
+ * Fixed a bug where absolute URLs could have a blank path, thus causing
268
+ {Spidr::Agent#get_page} to crash when it performed the HTTP request.
269
+ * Added RSpec spec tests.
270
+ * Created a Web-Spider Obstacle Course
271
+ (http://spidr.rubyforge.org/course/start.html) which is used in the spec
272
+ tests.
273
+
274
+ ### 0.1.1 / 2008-10-04
275
+
276
+ * Added a reader method for the response instance variable in Page.
277
+ * Fixed a bug in {Spidr::Page#method_missing}.
278
+
279
+ ### 0.1.0 / 2008-05-23
280
+
281
+ * Initial release.
282
+ * Black-list or white-list URLs based upon:
283
+ * Host name
284
+ * Port number
285
+ * Full link
286
+ * URL extension
287
+ * Provides call-backs for:
288
+ * Every visited Page.
289
+ * Every visited URL.
290
+ * Every visited URL that matches a specified pattern.
291
+
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ source 'http://ruby.taobao.org'
2
+
3
+ platform :jruby do
4
+ gem 'jruby-openssl'
5
+ end
6
+
7
+ gemspec
8
+
9
+ group :development do
10
+ gem 'rake', '~> 10.0'
11
+ gem 'rubygems-tasks', '~> 0.1'
12
+ gem 'rspec', '~> 2.4'
13
+
14
+ gem 'wsoc', '~> 0.1.3'
15
+ gem 'kramdown', '~> 0.12'
16
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,49 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ spidr_epg (1.0.0)
5
+ nokogiri (~> 1.3)
6
+
7
+ GEM
8
+ remote: http://ruby.taobao.org/
9
+ specs:
10
+ diff-lcs (1.2.3)
11
+ json (1.7.7)
12
+ kramdown (0.14.2)
13
+ nokogiri (1.5.9)
14
+ rack (1.5.2)
15
+ rack-protection (1.5.0)
16
+ rack
17
+ rake (10.0.4)
18
+ rspec (2.13.0)
19
+ rspec-core (~> 2.13.0)
20
+ rspec-expectations (~> 2.13.0)
21
+ rspec-mocks (~> 2.13.0)
22
+ rspec-core (2.13.1)
23
+ rspec-expectations (2.13.0)
24
+ diff-lcs (>= 1.1.3, < 2.0)
25
+ rspec-mocks (2.13.1)
26
+ rubygems-tasks (0.2.4)
27
+ sinatra (1.4.2)
28
+ rack (~> 1.5, >= 1.5.2)
29
+ rack-protection (~> 1.4)
30
+ tilt (~> 1.3, >= 1.3.4)
31
+ tilt (1.3.7)
32
+ wsoc (0.1.4)
33
+ json (~> 1.4)
34
+ sinatra (~> 1.0)
35
+ yard (0.8.5.2)
36
+
37
+ PLATFORMS
38
+ ruby
39
+
40
+ DEPENDENCIES
41
+ bundler (~> 1.0)
42
+ jruby-openssl
43
+ kramdown (~> 0.12)
44
+ rake (~> 10.0)
45
+ rspec (~> 2.4)
46
+ rubygems-tasks (~> 0.1)
47
+ spidr_epg!
48
+ wsoc (~> 0.1.3)
49
+ yard (~> 0.7)
data/Gemfile~ ADDED
@@ -0,0 +1,16 @@
1
+ source 'http://ruby.taobao.org'
2
+
3
+ platform :jruby do
4
+ gem 'jruby-openssl'
5
+ end
6
+
7
+ gemspec
8
+
9
+ group :development do
10
+ gem 'rake', '~> 10.0'
11
+ gem 'rubygems-tasks', '~> 0.1'
12
+ gem 'rspec', '~> 2.4'
13
+
14
+ gem 'wsoc', '~> 0.1.3'
15
+ gem 'kramdown', '~> 0.12'
16
+ end