spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ NTcxZjE1YTc5OTY1MTQyM2NmYTRjM2VlODE5ZWIwZjBiMjc1OGVmMg==
5
+ data.tar.gz: !binary |-
6
+ ZWIyNzI5NGMwMGMwNzJmYTA4Nzg2YzI4OTk3YTIyNjYyZGU4ZDQzYw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ODdiYzJjYWM3NjM3NzBjZjExMzI1YzJmN2I2YzA2YTQ5Y2I5N2IwNTRhMzE3
10
+ YWE0NDI5MGMwZTM1M2U2YTFkNWNiOWNjY2Q1Y2ExYjYzY2M4MDI1MTliODhi
11
+ NTM3Y2UyNDUxZDkxZjIwNTRmNzI1NzZjZjliNzE0YzcwNTUyMjk=
12
+ data.tar.gz: !binary |-
13
+ MDZkYjMwYTBkZTUyMGMwMGRkZTgxZWM4ZWY3YTgwNWJlNTQ4NmVlNjVkNDli
14
+ ZWQ3M2FiYjkwYjJkNjI1NGQyOTJhMmM5ZTk2NTZjMzk4ZTgxNmYyNmIxNTA2
15
+ YTZlNmVmYTg0ZGVmYWFjODMzOTBjMmE3MTAzY2NiYTAxYTQ5ZTU=
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ pkg
2
+ doc
3
+ web
4
+ tmp
5
+ Gemfile.lock
6
+ .DS_Store
7
+ .bundle
8
+ .yardoc
9
+ *.swp
10
+ *~
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --colour --format documentation
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ --markup markdown --title 'Spidr Documentation' --protected --files ChangeLog.md,LICENSE.txt
data/ChangeLog.md ADDED
@@ -0,0 +1,291 @@
1
+ ### 1.4.2 / 2012-04-12
2
+
3
+ * 对此gem进行了修改.
4
+
5
+
6
+ ### 0.4.1 / 2011-12-08
7
+
8
+ * Catch `OpenSSL::SSL::SSLError` exceptions when initiating HTTPS Sessions.
9
+
10
+ ### 0.4.0 / 2011-08-07
11
+
12
+ * Added {Spidr::Headers#content_charset}.
13
+ * Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
14
+ This ensures that Nokogiri will preserve the body encoding.
15
+ * Made {Spidr::Headers#is_content_type?} public.
16
+ * Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
17
+ or the sub-type.
18
+
19
+ ### 0.3.2 / 2011-06-20
20
+
21
+ * Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
22
+ {Spidr::Filters} and {Spidr::Sanitizers}.
23
+ * Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
24
+ * Reduce usage of `self.included` and `module_eval`.
25
+ * Reduce usage of nested-blocks.
26
+ * Reduce usage of `return`.
27
+
28
+ ### 0.3.1 / 2011-04-22
29
+
30
+ * Require `set` in `spidr/headers.rb`.
31
+
32
+ ### 0.3.0 / 2011-04-14
33
+
34
+ * Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
35
+ * Split all header related methods out of {Spidr::Page} and into
36
+ {Spidr::Headers}.
37
+ * Split all body related methods out of {Spidr::Page} and into
38
+ {Spidr::Body}.
39
+ * Split all link related methods out of {Spidr::Page} and into
40
+ {Spidr::Links}.
41
+ * Added {Spidr::Headers#directory?}.
42
+ * Added {Spidr::Headers#json?}.
43
+ * Added {Spidr::Links#each_url}.
44
+ * Added {Spidr::Links#each_link}.
45
+ * Added {Spidr::Links#each_redirect}.
46
+ * Added {Spidr::Links#each_meta_redirect}.
47
+ * Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
48
+ * Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
49
+ * Also check for `application/xml` in {Spidr::Headers#xml?}.
50
+ * Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
51
+ * Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
52
+ bug, where it expects an absolute path for all FTP URIs.
53
+ * Refactored {URI.expand_path}.
54
+ * Start the session in {Spidr::SessionCache#[]} to prevent multiple
55
+ `CONNECT` commands being sent to HTTP Proxies (thanks falaise).
56
+
57
+ ### 0.2.7 / 2010-08-17
58
+
59
+ * Added {Spidr::CookieJar#cookies_for_host} (thanks zapnap).
60
+ * Renamed `Spidr::Page#cookie` to `Spidr::Page#raw_cookie`.
61
+ * Rescue `URI::InvalidComponentError` exceptions in
62
+ `Spidr::Page#to_absolute` (thanks zapnap).
63
+
64
+ ### 0.2.6 / 2010-07-05
65
+
66
+ * Fixed a bug in `Spidr::Page#meta_redirect`, by calling
67
+ `Nokogiri::XML::Element#get_attribute` instead of `attr`.
68
+
69
+ ### 0.2.5 / 2010-07-02
70
+
71
+ * Added `Spidr::Page#meta_redirect`.
72
+ * Added `Spidr::Page#meta_redirect?`.
73
+ * Manage development dependencies with Bundler.
74
+ * Support following "old-school" meta-refresh redirects (thanks zapnap).
75
+ * Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
76
+ * Fixed a constant lookup issue in {Spidr::Agent}.
77
+ * Use `yield` instead of `block.call` when necessary.
78
+
79
+ ### 0.2.4 / 2010-05-05
80
+
81
+ * Added {Spidr::Filters#visit_urls}.
82
+ * Added {Spidr::Filters#visit_urls_like}.
83
+ * Added {Spidr::Filters#ignore_urls}.
84
+ * Added {Spidr::Filters#ignore_urls_like}.
85
+ * Added `Spidr::Page#is_content_type?`.
86
+ * Default `Spidr::Page#body` to an empty String.
87
+ * Default `Spidr::Page#content_type` to an empty String.
88
+ * Default `Spidr::Page#content_types` to an empty Array.
89
+ * Improved reliability of {Spidr::Page#is_redirect?}.
90
+ * Improved content type detection in {Spidr::Page} to handle `Content-Type`
91
+ headers containing charsets (thanks Josh Lindsey).
92
+
93
+ ### 0.2.3 / 2010-02-27
94
+
95
+ * Migrated to Jeweler, for the packaging and releasing RubyGems.
96
+ * Switched to MarkDown formatted YARD documentation.
97
+ * Added {Spidr::Events#every_link}.
98
+ * Added {Spidr::SessionCache#active?}.
99
+ * Added specs for {Spidr::SessionCache}.
100
+
101
+ ### 0.2.2 / 2010-01-06
102
+
103
+ * Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
104
+ * Integrated the new WSOC into the specs.
105
+ * Removed the built-in Web Spider Obstacle Course.
106
+ * Added `Spidr::Page#content_types`.
107
+ * Added `Spidr::Page#cookie`.
108
+ * Added `Spidr::Page#cookies`.
109
+ * Added `Spidr::Page#cookie_params`.
110
+ * Added {Spidr::Sanitizers}.
111
+ * Added {Spidr::SessionCache}.
112
+ * Added {Spidr::CookieJar} (thanks Nick Plante).
113
+ * Added {Spidr::AuthStore} (thanks Nick Plante).
114
+ * Added {Spidr::Agent#post_page} (thanks Nick Plante).
115
+ * Renamed `Spidr::Agent#get_session` to {Spidr::SessionCache#[]}.
116
+ * Renamed `Spidr::Agent#kill_session` to {Spidr::SessionCache#kill!}.
117
+
118
+ ### 0.2.1 / 2009-11-25
119
+
120
+ * Added {Spidr::Events#every_ok_page}.
121
+ * Added {Spidr::Events#every_redirect_page}.
122
+ * Added {Spidr::Events#every_timedout_page}.
123
+ * Added {Spidr::Events#every_bad_request_page}.
124
+ * Added {Spidr::Events#every_unauthorized_page}.
125
+ * Added {Spidr::Events#every_forbidden_page}.
126
+ * Added {Spidr::Events#every_missing_page}.
127
+ * Added {Spidr::Events#every_internal_server_error_page}.
128
+ * Added {Spidr::Events#every_txt_page}.
129
+ * Added {Spidr::Events#every_html_page}.
130
+ * Added {Spidr::Events#every_xml_page}.
131
+ * Added {Spidr::Events#every_xsl_page}.
132
+ * Added {Spidr::Events#every_doc}.
133
+ * Added {Spidr::Events#every_html_doc}.
134
+ * Added {Spidr::Events#every_xml_doc}.
135
+ * Added {Spidr::Events#every_xsl_doc}.
136
+ * Added {Spidr::Events#every_rss_doc}.
137
+ * Added {Spidr::Events#every_atom_doc}.
138
+ * Added {Spidr::Events#every_javascript_page}.
139
+ * Added {Spidr::Events#every_css_page}.
140
+ * Added {Spidr::Events#every_rss_page}.
141
+ * Added {Spidr::Events#every_atom_page}.
142
+ * Added {Spidr::Events#every_ms_word_page}.
143
+ * Added {Spidr::Events#every_pdf_page}.
144
+ * Added {Spidr::Events#every_zip_page}.
145
+ * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
146
+ requesting pages.
147
+ * Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
148
+
149
+ ### 0.2.0 / 2009-10-10
150
+
151
+ * Added {URI.expand_path}.
152
+ * Added `Spidr::Page#search`.
153
+ * Added `Spidr::Page#at`.
154
+ * Added `Spidr::Page#title`.
155
+ * Added {Spidr::Agent#failures=}.
156
+ * Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
157
+ * Added `Spidr::Agent#get_session`.
158
+ * Added `Spidr::Agent#kill_session`.
159
+ * Added {Spidr.proxy=}.
160
+ * Added {Spidr.disable_proxy!}.
161
+ * Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
162
+ * Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
163
+ * Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
164
+ * Aliased `Spidr::Page#unauthorized?` to `Spidr::Page#is_unauthorized?`.
165
+ * Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
166
+ * Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
167
+ * Split URL filtering code out of {Spidr::Agent} and into
168
+ {Spidr::Filters}.
169
+ * Split URL / Page event code out of {Spidr::Agent} and into
170
+ {Spidr::Events}.
171
+ * Split pause! / continue! / skip_link! / skip_page! methods out of
172
+ {Spidr::Agent} and into {Spidr::Actions}.
173
+ * Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
174
+ * Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
175
+ RSS/RDF/Atom pages as well.
176
+ * Fixed the handling of the Location header in `Spidr::Page#links`
177
+ (thanks falter).
178
+ * Fixed a bug in `Spidr::Page#to_absolute` where trailing `/` characters on
179
+ URI paths were not being preserved (thanks falter).
180
+ * Fixed a bug where the URI query was not being sent with the request
181
+ in {Spidr::Agent#get_page} (thanks Damian Steer).
182
+ * Fixed a bug where SSL sessions were not being properly setup
183
+ (thanks falter).
184
+ * Switched {Spidr::Agent#history} to be a Set, to improve search-time
185
+ of the history (thanks falter).
186
+ * Switched {Spidr::Agent#failures} to a Set.
187
+ * Allow a block to be passed to {Spidr::Agent#run}, which will receive all
188
+ pages visited.
189
+ * Allow `Spidr::Agent#start_at` and `Spidr::Agent#continue!` to pass blocks
190
+ to {Spidr::Agent#run}.
191
+ * Made {Spidr::Agent#visit_page} public.
192
+ * Moved to YARD based documentation.
193
+
194
+ ### 0.1.9 / 2009-06-13
195
+
196
+ * Upgraded to Hoe 2.0.0.
197
+ * Use Hoe.spec instead of Hoe.new.
198
+ * Use the Hoe signing task for signed gems.
199
+ * Added the `Spidr::Agent#schemes` and `Spidr::Agent#schemes=` methods.
200
+ * Added a warning message if 'net/https' cannot be loaded.
201
+ * Allow the list of acceptable URL schemes to be passed into
202
+ {Spidr::Agent#initialize}.
203
+ * Allow history and queue information to be passed into
204
+ {Spidr::Agent#initialize}.
205
+ * {Spidr::Agent#start_at} no longer clears the history or the queue.
206
+ * Fixed a bug in the sanitization of semi-escaped URLs.
207
+ * Fixed a bug where https URLs would be followed even if 'net/https'
208
+ could not be loaded.
209
+ * Removed Spidr::Agent::SCHEMES.
210
+
211
+ ### 0.1.8 / 2009-05-27
212
+
213
+ * Added the `Spidr::Agent#pause!` and `Spidr::Agent#continue!` methods.
214
+ * Added the `Spidr::Agent#running?` and `Spidr::Agent#paused?` methods.
215
+ * Added an alias for pending_urls to the queue methods.
216
+ * Added {Spidr::Agent#queue} to provide read access to the queue.
217
+ * Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
218
+ queue and history.
219
+ * Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
220
+ history.
221
+ * Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
222
+ * Added more specs.
223
+
224
+ ### 0.1.7 / 2009-04-24
225
+
226
+ * Added `Spidr::Agent#all_headers`.
227
+ * Fixed a bug where {Spidr::Page#headers} was always `nil`.
228
+ * {Spidr::Agent} will now follow the Location header in HTTP 300,
229
+ 301, 302, 303 and 307 Redirects.
230
+ * {Spidr::Agent} will now follow iframe and frame tags.
231
+
232
+ ### 0.1.6 / 2009-04-14
233
+
234
+ * Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
235
+ * Added {Spidr::Agent#failed?}.
236
+ * Added `Spidr::Agent#every_failed_url`.
237
+ * Added {Spidr::Agent#clear}, which clears the history and failures URL
238
+ lists.
239
+ * Improved fault tolerance in {Spidr::Agent#get_page}.
240
+ * If a Network or HTTP error is encountered, the URL will be added to
241
+ the failures list and the next URL will be visited.
242
+ * Fixed a typo in `Spidr::Agent#ignore_exts_like`.
243
+ * Updated the Web Spider Obstacle Course with links that always fail to be
244
+ visited.
245
+
246
+ ### 0.1.5 / 2009-03-22
247
+
248
+ * Catch malformed URIs in `Spidr::Page#to_absolute` and return `nil`.
249
+ * Filter out `nil` URIs in `Spidr::Page#urls`.
250
+
251
+ ### 0.1.4 / 2009-01-15
252
+
253
+ * Use Nokogiri for HTML and XML parsing.
254
+
255
+ ### 0.1.3 / 2009-01-10
256
+
257
+ * Added the `:host` options to {Spidr::Agent#initialize}.
258
+ * Added the Web Spider Obstacle Course files to the Manifest.
259
+ * Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
260
+
261
+ ### 0.1.2 / 2008-11-06
262
+
263
+ * Fixed a bug in `Spidr::Page#to_absolute` where URLs with no path were not
264
+ receiving a default path of `/`.
265
+ * Fixed a bug in `Spidr::Page#to_absolute` where URL paths were not being
266
+ expanded, in order to remove `..` and `.` directories.
267
+ * Fixed a bug where absolute URLs could have a blank path, thus causing
268
+ {Spidr::Agent#get_page} to crash when it performed the HTTP request.
269
+ * Added RSpec spec tests.
270
+ * Created a Web-Spider Obstacle Course
271
+ (http://spidr.rubyforge.org/course/start.html) which is used in the spec
272
+ tests.
273
+
274
+ ### 0.1.1 / 2008-10-04
275
+
276
+ * Added a reader method for the response instance variable in Page.
277
+ * Fixed a bug in {Spidr::Page#method_missing}.
278
+
279
+ ### 0.1.0 / 2008-05-23
280
+
281
+ * Initial release.
282
+ * Black-list or white-list URLs based upon:
283
+ * Host name
284
+ * Port number
285
+ * Full link
286
+ * URL extension
287
+ * Provides call-backs for:
288
+ * Every visited Page.
289
+ * Every visited URL.
290
+ * Every visited URL that matches a specified pattern.
291
+
data/ChangeLog.md~ ADDED
@@ -0,0 +1,291 @@
1
+ ### 0.4.2 /2012-04-12
2
+
3
+ * 对此gem进行了修改.
4
+
5
+
6
+ ### 0.4.1 / 2011-12-08
7
+
8
+ * Catch `OpenSSL::SSL::SSLError` exceptions when initiated HTTPS Sessions.
9
+
10
+ ### 0.4.0 / 2011-08-07
11
+
12
+ * Added {Spidr::Headers#content_charset}.
13
+ * Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
14
+ This ensures that Nokogiri will preserve the body encoding.
15
+ * Made {Spidr::Headers#is_content_type?} public.
16
+ * Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
17
+ or the sub-type.
18
+
19
+ ### 0.3.2 / 2011-06-20
20
+
21
+ * Added separate intitialize methods for {Spidr::Actions}, {Spidr::Events},
22
+ {Spidr::Filters} and {Spidr::Sanitizers}.
23
+ * Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
24
+ * Reduce usage of `self.included` and `module_eval`.
25
+ * Reduce usage of nested-blocks.
26
+ * Reduce usage of `return`.
27
+
28
+ ### 0.3.1 / 2011-04-22
29
+
30
+ * Require `set` in `spidr/headers.rb`.
31
+
32
+ ### 0.3.0 / 2011-04-14
33
+
34
+ * Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
35
+ * Split all header related methods out of {Spidr::Page} and into
36
+ {Spidr::Headers}.
37
+ * Split all body related methods out of {Spidr::Page} and into
38
+ {Spidr::Body}.
39
+ * Split all link related methods out of {Spidr::Page} and into
40
+ {Spidr::Links}.
41
+ * Added {Spidr::Headers#directory?}.
42
+ * Added {Spidr::Headers#json?}.
43
+ * Added {Spidr::Links#each_url}.
44
+ * Added {Spidr::Links#each_link}.
45
+ * Added {Spidr::Links#each_redirect}.
46
+ * Added {Spidr::Links#each_meta_redirect}.
47
+ * Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
48
+ * Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
49
+ * Also check for `application/xml` in {Spidr::Headers#xml?}.
50
+ * Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
51
+ * Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
52
+ bug, where it expects an absolute path for all FTP URIs.
53
+ * Refactored {URI.expand_path}.
54
+ * Start the session in {Spidr::SessionCache#[]} to prevent multiple
55
+ `CONNECT` commands being sent to HTTP Proxies (thanks falaise).
56
+
57
+ ### 0.2.7 / 2010-08-17
58
+
59
+ * Added {Spidr::CookieJar#cookies_for_host} (thanks zapnap).
60
+ * Renamed `Spidr::Page#cookie` to `Spidr::Page#raw_cookie`.
61
+ * Rescue `URI::InvalidComponentError` exceptions in
62
+ `Spidr::Page#to_absolute` (thanks zapnap).
63
+
64
+ ### 0.2.6 / 2010-07-05
65
+
66
+ * Fixed a bug in `Spidr::Page#meta_redirect`, by calling
67
+ `Nokogiri::XML::Element#get_attribute` instead of `attr`.
68
+
69
+ ### 0.2.5 / 2010-07-02
70
+
71
+ * Added `Spidr::Page#meta_redirect`.
72
+ * Added `Spidr::Page#meta_redirect?`.
73
+ * Manage development dependencies with Bundler.
74
+ * Support following "old-school" meta-refresh redirects (thanks zapnap).
75
+ * Allow {Spidr::CookieJar} inherit cookies set by a parent domain.
76
+ * Fixed a constant lookup issue in {Spidr::Agent}.
77
+ * Use `yield` instead of `block.call` when necessary.
78
+
79
+ ### 0.2.4 / 2010-05-05
80
+
81
+ * Added {Spidr::Filters#visit_urls}.
82
+ * Added {Spidr::Filters#visit_urls_like}.
83
+ * Added {Spidr::Filters#ignore_urls}.
84
+ * Added {Spidr::Filters#ignore_urls_like}.
85
+ * Added `Spidr::Page#is_content_type?`.
86
+ * Default `Spidr::Page#body` to an empty String.
87
+ * Default `Spidr::Page#content_type` to an empty String.
88
+ * Default `Spidr::Page#content_types` to an empty Array.
89
+ * Improved reliability of {Spidr::Page#is_redirect?}.
90
+ * Improved content type detection in {Spidr::Page} to handle `Content-Type`
91
+ headers containing charsets (thanks Josh Lindsey).
92
+
93
+ ### 0.2.3 / 2010-02-27
94
+
95
+ * Migrated to Jeweler, for the packaging and releasing RubyGems.
96
+ * Switched to MarkDown formatted YARD documentation.
97
+ * Added {Spidr::Events#every_link}.
98
+ * Added {Spidr::SessionCache#active?}.
99
+ * Added specs for {Spidr::SessionCache}.
100
+
101
+ ### 0.2.2 / 2010-01-06
102
+
103
+ * Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
104
+ * Integrated the new WSOC into the specs.
105
+ * Removed the built-in Web Spider Obstacle Course.
106
+ * Added `Spidr::Page#content_types`.
107
+ * Added `Spidr::Page#cookie`.
108
+ * Added `Spidr::Page#cookies`.
109
+ * Added `Spidr::Page#cookie_params`.
110
+ * Added {Spidr::Sanitizers}.
111
+ * Added {Spidr::SessionCache}.
112
+ * Added {Spidr::CookieJar} (thanks Nick Plante).
113
+ * Added {Spidr::AuthStore} (thanks Nick Plante).
114
+ * Added {Spidr::Agent#post_page} (thanks Nick Plante).
115
+ * Renamed `Spidr::Agent#get_session` to {Spidr::SessionCache#[]}.
116
+ * Renamed `Spidr::Agent#kill_session` to {Spidr::SessionCache#kill!}.
117
+
118
+ ### 0.2.1 / 2009-11-25
119
+
120
+ * Added {Spidr::Events#every_ok_page}.
121
+ * Added {Spidr::Events#every_redirect_page}.
122
+ * Added {Spidr::Events#every_timedout_page}.
123
+ * Added {Spidr::Events#every_bad_request_page}.
124
+ * Added {Spidr::Events#every_unauthorized_page}.
125
+ * Added {Spidr::Events#every_forbidden_page}.
126
+ * Added {Spidr::Events#every_missing_page}.
127
+ * Added {Spidr::Events#every_internal_server_error_page}.
128
+ * Added {Spidr::Events#every_txt_page}.
129
+ * Added {Spidr::Events#every_html_page}.
130
+ * Added {Spidr::Events#every_xml_page}.
131
+ * Added {Spidr::Events#every_xsl_page}.
132
+ * Added {Spidr::Events#every_doc}.
133
+ * Added {Spidr::Events#every_html_doc}.
134
+ * Added {Spidr::Events#every_xml_doc}.
135
+ * Added {Spidr::Events#every_xsl_doc}.
136
+ * Added {Spidr::Events#every_rss_doc}.
137
+ * Added {Spidr::Events#every_atom_doc}.
138
+ * Added {Spidr::Events#every_javascript_page}.
139
+ * Added {Spidr::Events#every_css_page}.
140
+ * Added {Spidr::Events#every_rss_page}.
141
+ * Added {Spidr::Events#every_atom_page}.
142
+ * Added {Spidr::Events#every_ms_word_page}.
143
+ * Added {Spidr::Events#every_pdf_page}.
144
+ * Added {Spidr::Events#every_zip_page}.
145
+ * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
146
+ requesting pages.
147
+ * Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
148
+
149
+ ### 0.2.0 / 2009-10-10
150
+
151
+ * Added {URI.expand_path}.
152
+ * Added `Spidr::Page#search`.
153
+ * Added `Spidr::Page#at`.
154
+ * Added `Spidr::Page#title`.
155
+ * Added {Spidr::Agent#failures=}.
156
+ * Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
157
+ * Added `Spidr::Agent#get_session`.
158
+ * Added `Spidr::Agent#kill_session`.
159
+ * Added {Spidr.proxy=}.
160
+ * Added {Spidr.disable_proxy!}.
161
+ * Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
162
+ * Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
163
+ * Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
164
+ * Aliased `Spidr::Page#unauthorized?` to `Spidr::Page#is_unauthorized?`.
165
+ * Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
166
+ * Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
167
+ * Split URL filtering code out of {Spidr::Agent} and into
168
+ {Spidr::Filters}.
169
+ * Split URL / Page event code out of {Spidr::Agent} and into
170
+ {Spidr::Events}.
171
+ * Split pause! / continue! / skip_link! / skip_page! methods out of
172
+ {Spidr::Agent} and into {Spidr::Actions}.
173
+ * Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
174
+ * Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
175
+ RSS/RDF/Atom pages as well.
176
+ * Fixed the handling of the Location header in `Spidr::Page#links`
177
+ (thanks falter).
178
+ * Fixed a bug in `Spidr::Page#to_absolute` where trailing `/` characters on
179
+ URI paths were not being preserved (thanks falter).
180
+ * Fixed a bug where the URI query was not being sent with the request
181
+ in {Spidr::Agent#get_page} (thanks Damian Steer).
182
+ * Fixed a bug where SSL sessions were not being properly setup
183
+ (thanks falter).
184
+ * Switched {Spidr::Agent#history} to be a Set, to improve search-time
185
+ of the history (thanks falter).
186
+ * Switched {Spidr::Agent#failures} to a Set.
187
+ * Allow a block to be passed to {Spidr::Agent#run}, which will receive all
188
+ pages visited.
189
+ * Allow `Spidr::Agent#start_at` and `Spidr::Agent#continue!` to pass blocks
190
+ to {Spidr::Agent#run}.
191
+ * Made {Spidr::Agent#visit_page} public.
192
+ * Moved to YARD based documentation.
193
+
194
+ ### 0.1.9 / 2009-06-13
195
+
196
+ * Upgraded to Hoe 2.0.0.
197
+ * Use Hoe.spec instead of Hoe.new.
198
+ * Use the Hoe signing task for signed gems.
199
+ * Added the `Spidr::Agent#schemes` and `Spidr::Agent#schemes=` methods.
200
+ * Added a warning message if 'net/https' cannot be loaded.
201
+ * Allow the list of acceptable URL schemes to be passed into
202
+ {Spidr::Agent#initialize}.
203
+ * Allow history and queue information to be passed into
204
+ {Spidr::Agent#initialize}.
205
+ * {Spidr::Agent#start_at} no longer clears the history or the queue.
206
+ * Fixed a bug in the sanitization of semi-escaped URLs.
207
+ * Fixed a bug where https URLs would be followed even if 'net/https'
208
+ could not be loaded.
209
+ * Removed Spidr::Agent::SCHEMES.
210
+
211
+ ### 0.1.8 / 2009-05-27
212
+
213
+ * Added the `Spidr::Agent#pause!` and `Spidr::Agent#continue!` methods.
214
+ * Added the `Spidr::Agent#running?` and `Spidr::Agent#paused?` methods.
215
+ * Added an alias for pending_urls to the queue methods.
216
+ * Added {Spidr::Agent#queue} to provide read access to the queue.
217
+ * Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
218
+ queue and history.
219
+ * Added {Spidr::Agent#to_hash} which returns a Hash of the agents queue and
220
+ history.
221
+ * Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
222
+ * Added more specs.
223
+
224
+ ### 0.1.7 / 2009-04-24
225
+
226
+ * Added `Spidr::Agent#all_headers`.
227
+ * Fixed a bug where {Spidr::Page#headers} was always `nil`.
228
+ * {Spidr::Agent} will now follow the Location header in HTTP 300,
229
+ 301, 302, 303 and 307 Redirects.
230
+ * {Spidr::Agent} will now follow iframe and frame tags.
231
+
232
+ ### 0.1.6 / 2009-04-14
233
+
234
+ * Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
235
+ * Added {Spidr::Agent#failed?}.
236
+ * Added `Spidr::Agent#every_failed_url`.
237
+ * Added {Spidr::Agent#clear}, which clears the history and failures URL
238
+ lists.
239
+ * Improved fault tolerance in {Spidr::Agent#get_page}.
240
+ * If a Network or HTTP error is encountered, the URL will be added to
241
+ the failures list and the next URL will be visited.
242
+ * Fixed a typo in `Spidr::Agent#ignore_exts_like`.
243
+ * Updated the Web Spider Obstacle Course with links that always fail to be
244
+ visited.
245
+
246
+ ### 0.1.5 / 2009-03-22
247
+
248
+ * Catch malformed URIs in `Spidr::Page#to_absolute` and return `nil`.
249
+ * Filter out `nil` URIs in `Spidr::Page#urls`.
250
+
251
+ ### 0.1.4 / 2009-01-15
252
+
253
+ * Use Nokogiri for HTML and XML parsing.
254
+
255
+ ### 0.1.3 / 2009-01-10
256
+
257
+ * Added the `:host` options to {Spidr::Agent#initialize}.
258
+ * Added the Web Spider Obstacle Course files to the Manifest.
259
+ * Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
260
+
261
+ ### 0.1.2 / 2008-11-06
262
+
263
+ * Fixed a bug in `Spidr::Page#to_absolute` where URLs with no path were not
264
+ receiving a default path of `/`.
265
+ * Fixed a bug in `Spidr::Page#to_absolute` where URL paths were not being
266
+ expanded, in order to remove `..` and `.` directories.
267
+ * Fixed a bug where absolute URLs could have a blank path, thus causing
268
+ {Spidr::Agent#get_page} to crash when it performed the HTTP request.
269
+ * Added RSpec spec tests.
270
+ * Created a Web-Spider Obstacle Course
271
+ (http://spidr.rubyforge.org/course/start.html) which is used in the spec
272
+ tests.
273
+
274
+ ### 0.1.1 / 2008-10-04
275
+
276
+ * Added a reader method for the response instance variable in Page.
277
+ * Fixed a bug in {Spidr::Page#method_missing}.
278
+
279
+ ### 0.1.0 / 2008-05-23
280
+
281
+ * Initial release.
282
+ * Black-list or white-list URLs based upon:
283
+ * Host name
284
+ * Port number
285
+ * Full link
286
+ * URL extension
287
+ * Provides call-backs for:
288
+ * Every visited Page.
289
+ * Every visited URL.
290
+ * Every visited URL that matches a specified pattern.
291
+
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ source 'http://ruby.taobao.org'
2
+
3
+ platform :jruby do
4
+ gem 'jruby-openssl'
5
+ end
6
+
7
+ gemspec
8
+
9
+ group :development do
10
+ gem 'rake', '~> 10.0'
11
+ gem 'rubygems-tasks', '~> 0.1'
12
+ gem 'rspec', '~> 2.4'
13
+
14
+ gem 'wsoc', '~> 0.1.3'
15
+ gem 'kramdown', '~> 0.12'
16
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,49 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ spidr_epg (1.0.0)
5
+ nokogiri (~> 1.3)
6
+
7
+ GEM
8
+ remote: http://ruby.taobao.org/
9
+ specs:
10
+ diff-lcs (1.2.3)
11
+ json (1.7.7)
12
+ kramdown (0.14.2)
13
+ nokogiri (1.5.9)
14
+ rack (1.5.2)
15
+ rack-protection (1.5.0)
16
+ rack
17
+ rake (10.0.4)
18
+ rspec (2.13.0)
19
+ rspec-core (~> 2.13.0)
20
+ rspec-expectations (~> 2.13.0)
21
+ rspec-mocks (~> 2.13.0)
22
+ rspec-core (2.13.1)
23
+ rspec-expectations (2.13.0)
24
+ diff-lcs (>= 1.1.3, < 2.0)
25
+ rspec-mocks (2.13.1)
26
+ rubygems-tasks (0.2.4)
27
+ sinatra (1.4.2)
28
+ rack (~> 1.5, >= 1.5.2)
29
+ rack-protection (~> 1.4)
30
+ tilt (~> 1.3, >= 1.3.4)
31
+ tilt (1.3.7)
32
+ wsoc (0.1.4)
33
+ json (~> 1.4)
34
+ sinatra (~> 1.0)
35
+ yard (0.8.5.2)
36
+
37
+ PLATFORMS
38
+ ruby
39
+
40
+ DEPENDENCIES
41
+ bundler (~> 1.0)
42
+ jruby-openssl
43
+ kramdown (~> 0.12)
44
+ rake (~> 10.0)
45
+ rspec (~> 2.4)
46
+ rubygems-tasks (~> 0.1)
47
+ spidr_epg!
48
+ wsoc (~> 0.1.3)
49
+ yard (~> 0.7)
data/Gemfile~ ADDED
@@ -0,0 +1,16 @@
1
+ source 'http://ruby.taobao.org'
2
+
3
+ platform :jruby do
4
+ gem 'jruby-openssl'
5
+ end
6
+
7
+ gemspec
8
+
9
+ group :development do
10
+ gem 'rake', '~> 10.0'
11
+ gem 'rubygems-tasks', '~> 0.1'
12
+ gem 'rspec', '~> 2.4'
13
+
14
+ gem 'wsoc', '~> 0.1.3'
15
+ gem 'kramdown', '~> 0.12'
16
+ end