spider 0.4.4 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS +12 -0
  3. data/CHANGES +6 -0
  4. data/LICENSE +21 -0
  5. data/{README → README.md} +50 -43
  6. data/lib/spider.rb +12 -29
  7. data/lib/spider/included_in_memcached.rb +1 -24
  8. data/lib/spider/next_urls_in_sqs.rb +6 -29
  9. data/lib/spider/robot_rules.rb +61 -57
  10. data/lib/spider/spider_instance.rb +8 -31
  11. data/spider.gemspec +4 -2
  12. metadata +33 -124
  13. data/doc/classes/BeStaticServerPages.html +0 -197
  14. data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
  15. data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
  16. data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
  17. data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
  18. data/doc/classes/IncludedInMemcached.html +0 -199
  19. data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
  20. data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
  21. data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
  22. data/doc/classes/LoopingServlet.html +0 -137
  23. data/doc/classes/LoopingServlet.src/M000037.html +0 -23
  24. data/doc/classes/NextUrlsInSQS.html +0 -204
  25. data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
  26. data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
  27. data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
  28. data/doc/classes/QueryServlet.html +0 -137
  29. data/doc/classes/QueryServlet.src/M000038.html +0 -19
  30. data/doc/classes/RobotRules.html +0 -175
  31. data/doc/classes/RobotRules.src/M000034.html +0 -19
  32. data/doc/classes/RobotRules.src/M000035.html +0 -67
  33. data/doc/classes/RobotRules.src/M000036.html +0 -24
  34. data/doc/classes/Spider.html +0 -170
  35. data/doc/classes/Spider.src/M000029.html +0 -21
  36. data/doc/classes/SpiderInstance.html +0 -345
  37. data/doc/classes/SpiderInstance.src/M000021.html +0 -18
  38. data/doc/classes/SpiderInstance.src/M000022.html +0 -22
  39. data/doc/classes/SpiderInstance.src/M000023.html +0 -22
  40. data/doc/classes/SpiderInstance.src/M000024.html +0 -24
  41. data/doc/classes/SpiderInstance.src/M000025.html +0 -18
  42. data/doc/classes/SpiderInstance.src/M000026.html +0 -18
  43. data/doc/classes/SpiderInstance.src/M000027.html +0 -18
  44. data/doc/classes/SpiderInstance.src/M000028.html +0 -18
  45. data/doc/created.rid +0 -1
  46. data/doc/files/README.html +0 -223
  47. data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
  48. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
  49. data/doc/files/lib/spider/robot_rules_rb.html +0 -114
  50. data/doc/files/lib/spider/spider_instance_rb.html +0 -117
  51. data/doc/files/lib/spider_rb.html +0 -254
  52. data/doc/files/spec/spec_helper_rb.html +0 -196
  53. data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
  54. data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
  55. data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
  56. data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
  57. data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
  58. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
  59. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
  60. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
  61. data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
  62. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
  63. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
  64. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
  65. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
  66. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
  67. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
  68. data/doc/files/spec/spider_spec_rb.html +0 -127
  69. data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
  70. data/doc/fr_class_index.html +0 -34
  71. data/doc/fr_file_index.html +0 -35
  72. data/doc/fr_method_index.html +0 -64
  73. data/doc/index.html +0 -24
  74. data/doc/rdoc-style.css +0 -208
@@ -1,23 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>find_pages_with_static_server (spec/spider_spec.rb)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File spec/spider_spec.rb, line 25</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">find_pages_with_static_server</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
15
- <span class="ruby-identifier">pages</span> = []
16
- <span class="ruby-constant">Spider</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-value str">'http://localhost:8888/'</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">s</span><span class="ruby-operator">|</span>
17
- <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">s</span>) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">block</span>.<span class="ruby-identifier">nil?</span>
18
- <span class="ruby-identifier">s</span>.<span class="ruby-identifier">on</span>(<span class="ruby-identifier">:every</span>){ <span class="ruby-operator">|</span><span class="ruby-identifier">u</span>,<span class="ruby-identifier">r</span>,<span class="ruby-identifier">p</span><span class="ruby-operator">|</span> <span class="ruby-identifier">pages</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">u</span> }
19
- <span class="ruby-keyword kw">end</span>
20
- <span class="ruby-identifier">pages</span>
21
- <span class="ruby-keyword kw">end</span></pre>
22
- </body>
23
- </html>
@@ -1,34 +0,0 @@
1
-
2
- <?xml version="1.0" encoding="iso-8859-1"?>
3
- <!DOCTYPE html
4
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
-
7
- <!--
8
-
9
- Classes
10
-
11
- -->
12
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
- <head>
14
- <title>Classes</title>
15
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
- <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
- <base target="docwin" />
18
- </head>
19
- <body>
20
- <div id="index">
21
- <h1 class="section-bar">Classes</h1>
22
- <div id="index-entries">
23
- <a href="classes/BeStaticServerPages.html">BeStaticServerPages</a><br />
24
- <a href="classes/IncludedInMemcached.html">IncludedInMemcached</a><br />
25
- <a href="classes/LoopingServlet.html">LoopingServlet</a><br />
26
- <a href="classes/NextUrlsInSQS.html">NextUrlsInSQS</a><br />
27
- <a href="classes/QueryServlet.html">QueryServlet</a><br />
28
- <a href="classes/RobotRules.html">RobotRules</a><br />
29
- <a href="classes/Spider.html">Spider</a><br />
30
- <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
31
- </div>
32
- </div>
33
- </body>
34
- </html>
@@ -1,35 +0,0 @@
1
-
2
- <?xml version="1.0" encoding="iso-8859-1"?>
3
- <!DOCTYPE html
4
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
-
7
- <!--
8
-
9
- Files
10
-
11
- -->
12
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
- <head>
14
- <title>Files</title>
15
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
- <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
- <base target="docwin" />
18
- </head>
19
- <body>
20
- <div id="index">
21
- <h1 class="section-bar">Files</h1>
22
- <div id="index-entries">
23
- <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
24
- <a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
25
- <a href="files/lib/spider/next_urls_in_sqs_rb.html">lib/spider/next_urls_in_sqs.rb</a><br />
26
- <a href="files/lib/spider/robot_rules_rb.html">lib/spider/robot_rules.rb</a><br />
27
- <a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
28
- <a href="files/spec/spec_helper_rb.html">spec/spec_helper.rb</a><br />
29
- <a href="files/spec/spider/included_in_memcached_spec_rb.html">spec/spider/included_in_memcached_spec.rb</a><br />
30
- <a href="files/spec/spider/spider_instance_spec_rb.html">spec/spider/spider_instance_spec.rb</a><br />
31
- <a href="files/spec/spider_spec_rb.html">spec/spider_spec.rb</a><br />
32
- </div>
33
- </div>
34
- </body>
35
- </html>
@@ -1,64 +0,0 @@
1
-
2
- <?xml version="1.0" encoding="iso-8859-1"?>
3
- <!DOCTYPE html
4
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
6
-
7
- <!--
8
-
9
- Methods
10
-
11
- -->
12
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
13
- <head>
14
- <title>Methods</title>
15
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
16
- <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
17
- <base target="docwin" />
18
- </head>
19
- <body>
20
- <div id="index">
21
- <h1 class="section-bar">Methods</h1>
22
- <div id="index-entries">
23
- <a href="classes/IncludedInMemcached.html#M000016"><< (IncludedInMemcached)</a><br />
24
- <a href="classes/SpiderInstance.html#M000021">add_url_check (SpiderInstance)</a><br />
25
- <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000007">after_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
26
- <a href="classes/RobotRules.html#M000036">allowed? (RobotRules)</a><br />
27
- <a href="files/spec/spec_helper_rb.html#M000004">be_static_server_pages (spec/spec_helper.rb)</a><br />
28
- <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000006">before_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
29
- <a href="files/spec/spider/spider_instance_spec_rb.html#M000012">callback_arguments_on (spec/spider/spider_instance_spec.rb)</a><br />
30
- <a href="classes/SpiderInstance.html#M000022">check_already_seen_with (SpiderInstance)</a><br />
31
- <a href="classes/SpiderInstance.html#M000028">clear_headers (SpiderInstance)</a><br />
32
- <a href="classes/BeStaticServerPages.html#M000033">description (BeStaticServerPages)</a><br />
33
- <a href="classes/QueryServlet.html#M000038">do_GET (QueryServlet)</a><br />
34
- <a href="classes/LoopingServlet.html#M000037">do_GET (LoopingServlet)</a><br />
35
- <a href="classes/BeStaticServerPages.html#M000032">failure_message (BeStaticServerPages)</a><br />
36
- <a href="files/spec/spider_spec_rb.html#M000014">find_pages_with_static_server (spec/spider_spec.rb)</a><br />
37
- <a href="classes/SpiderInstance.html#M000027">headers (SpiderInstance)</a><br />
38
- <a href="classes/IncludedInMemcached.html#M000017">include? (IncludedInMemcached)</a><br />
39
- <a href="files/spec/spider/spider_instance_spec_rb.html#M000013">it_should_prevent_cycles_with (spec/spider/spider_instance_spec.rb)</a><br />
40
- <a href="files/spec/spec_helper_rb.html#M000001">local_require (spec/spec_helper.rb)</a><br />
41
- <a href="classes/BeStaticServerPages.html#M000031">matches? (BeStaticServerPages)</a><br />
42
- <a href="files/spec/spider/spider_instance_spec_rb.html#M000010">mock_failed_http (spec/spider/spider_instance_spec.rb)</a><br />
43
- <a href="files/spec/spider/spider_instance_spec_rb.html#M000008">mock_http (spec/spider/spider_instance_spec.rb)</a><br />
44
- <a href="files/spec/spider/spider_instance_spec_rb.html#M000011">mock_redirect_http (spec/spider/spider_instance_spec.rb)</a><br />
45
- <a href="files/spec/spider/spider_instance_spec_rb.html#M000009">mock_successful_http (spec/spider/spider_instance_spec.rb)</a><br />
46
- <a href="classes/IncludedInMemcached.html#M000015">new (IncludedInMemcached)</a><br />
47
- <a href="classes/NextUrlsInSQS.html#M000018">new (NextUrlsInSQS)</a><br />
48
- <a href="classes/BeStaticServerPages.html#M000030">new (BeStaticServerPages)</a><br />
49
- <a href="classes/RobotRules.html#M000034">new (RobotRules)</a><br />
50
- <a href="files/spec/spec_helper_rb.html#M000005">null_logger (spec/spec_helper.rb)</a><br />
51
- <a href="classes/SpiderInstance.html#M000024">on (SpiderInstance)</a><br />
52
- <a href="classes/RobotRules.html#M000035">parse (RobotRules)</a><br />
53
- <a href="classes/NextUrlsInSQS.html#M000019">pop (NextUrlsInSQS)</a><br />
54
- <a href="classes/NextUrlsInSQS.html#M000020">push (NextUrlsInSQS)</a><br />
55
- <a href="classes/SpiderInstance.html#M000025">setup (SpiderInstance)</a><br />
56
- <a href="classes/Spider.html#M000029">start_at (Spider)</a><br />
57
- <a href="classes/SpiderInstance.html#M000023">store_next_urls_with (SpiderInstance)</a><br />
58
- <a href="classes/SpiderInstance.html#M000026">teardown (SpiderInstance)</a><br />
59
- <a href="files/spec/spec_helper_rb.html#M000003">with_memcached (spec/spec_helper.rb)</a><br />
60
- <a href="files/spec/spec_helper_rb.html#M000002">with_web_server (spec/spec_helper.rb)</a><br />
61
- </div>
62
- </div>
63
- </body>
64
- </html>
@@ -1,24 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
5
-
6
- <!--
7
-
8
- RDoc Documentation
9
-
10
- -->
11
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
12
- <head>
13
- <title>RDoc Documentation</title>
14
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
15
- </head>
16
- <frameset rows="20%, 80%">
17
- <frameset cols="25%,35%,45%">
18
- <frame src="fr_file_index.html" title="Files" name="Files" />
19
- <frame src="fr_class_index.html" name="Classes" />
20
- <frame src="fr_method_index.html" name="Methods" />
21
- </frameset>
22
- <frame src="files/spec/spec_helper_rb.html" name="docwin" />
23
- </frameset>
24
- </html>
@@ -1,208 +0,0 @@
1
-
2
- body {
3
- font-family: Verdana,Arial,Helvetica,sans-serif;
4
- font-size: 90%;
5
- margin: 0;
6
- margin-left: 40px;
7
- padding: 0;
8
- background: white;
9
- }
10
-
11
- h1,h2,h3,h4 { margin: 0; color: #efefef; background: transparent; }
12
- h1 { font-size: 150%; }
13
- h2,h3,h4 { margin-top: 1em; }
14
-
15
- a { background: #eef; color: #039; text-decoration: none; }
16
- a:hover { background: #039; color: #eef; }
17
-
18
- /* Override the base stylesheet's Anchor inside a table cell */
19
- td > a {
20
- background: transparent;
21
- color: #039;
22
- text-decoration: none;
23
- }
24
-
25
- /* and inside a section title */
26
- .section-title > a {
27
- background: transparent;
28
- color: #eee;
29
- text-decoration: none;
30
- }
31
-
32
- /* === Structural elements =================================== */
33
-
34
- div#index {
35
- margin: 0;
36
- margin-left: -40px;
37
- padding: 0;
38
- font-size: 90%;
39
- }
40
-
41
-
42
- div#index a {
43
- margin-left: 0.7em;
44
- }
45
-
46
- div#index .section-bar {
47
- margin-left: 0px;
48
- padding-left: 0.7em;
49
- background: #ccc;
50
- font-size: small;
51
- }
52
-
53
-
54
- div#classHeader, div#fileHeader {
55
- width: auto;
56
- color: white;
57
- padding: 0.5em 1.5em 0.5em 1.5em;
58
- margin: 0;
59
- margin-left: -40px;
60
- border-bottom: 3px solid #006;
61
- }
62
-
63
- div#classHeader a, div#fileHeader a {
64
- background: inherit;
65
- color: white;
66
- }
67
-
68
- div#classHeader td, div#fileHeader td {
69
- background: inherit;
70
- color: white;
71
- }
72
-
73
-
74
- div#fileHeader {
75
- background: #057;
76
- }
77
-
78
- div#classHeader {
79
- background: #048;
80
- }
81
-
82
-
83
- .class-name-in-header {
84
- font-size: 180%;
85
- font-weight: bold;
86
- }
87
-
88
-
89
- div#bodyContent {
90
- padding: 0 1.5em 0 1.5em;
91
- }
92
-
93
- div#description {
94
- padding: 0.5em 1.5em;
95
- background: #efefef;
96
- border: 1px dotted #999;
97
- }
98
-
99
- div#description h1,h2,h3,h4,h5,h6 {
100
- color: #125;;
101
- background: transparent;
102
- }
103
-
104
- div#validator-badges {
105
- text-align: center;
106
- }
107
- div#validator-badges img { border: 0; }
108
-
109
- div#copyright {
110
- color: #333;
111
- background: #efefef;
112
- font: 0.75em sans-serif;
113
- margin-top: 5em;
114
- margin-bottom: 0;
115
- padding: 0.5em 2em;
116
- }
117
-
118
-
119
- /* === Classes =================================== */
120
-
121
- table.header-table {
122
- color: white;
123
- font-size: small;
124
- }
125
-
126
- .type-note {
127
- font-size: small;
128
- color: #DEDEDE;
129
- }
130
-
131
- .xxsection-bar {
132
- background: #eee;
133
- color: #333;
134
- padding: 3px;
135
- }
136
-
137
- .section-bar {
138
- color: #333;
139
- border-bottom: 1px solid #999;
140
- margin-left: -20px;
141
- }
142
-
143
-
144
- .section-title {
145
- background: #79a;
146
- color: #eee;
147
- padding: 3px;
148
- margin-top: 2em;
149
- margin-left: -30px;
150
- border: 1px solid #999;
151
- }
152
-
153
- .top-aligned-row { vertical-align: top }
154
- .bottom-aligned-row { vertical-align: bottom }
155
-
156
- /* --- Context section classes ----------------------- */
157
-
158
- .context-row { }
159
- .context-item-name { font-family: monospace; font-weight: bold; color: black; }
160
- .context-item-value { font-size: small; color: #448; }
161
- .context-item-desc { color: #333; padding-left: 2em; }
162
-
163
- /* --- Method classes -------------------------- */
164
- .method-detail {
165
- background: #efefef;
166
- padding: 0;
167
- margin-top: 0.5em;
168
- margin-bottom: 1em;
169
- border: 1px dotted #ccc;
170
- }
171
- .method-heading {
172
- color: black;
173
- background: #ccc;
174
- border-bottom: 1px solid #666;
175
- padding: 0.2em 0.5em 0 0.5em;
176
- }
177
- .method-signature { color: black; background: inherit; }
178
- .method-name { font-weight: bold; }
179
- .method-args { font-style: italic; }
180
- .method-description { padding: 0 0.5em 0 0.5em; }
181
-
182
- /* --- Source code sections -------------------- */
183
-
184
- a.source-toggle { font-size: 90%; }
185
- div.method-source-code {
186
- background: #262626;
187
- color: #ffdead;
188
- margin: 1em;
189
- padding: 0.5em;
190
- border: 1px dashed #999;
191
- overflow: hidden;
192
- }
193
-
194
- div.method-source-code pre { color: #ffdead; overflow: hidden; }
195
-
196
- /* --- Ruby keyword styles --------------------- */
197
-
198
- .standalone-code { background: #221111; color: #ffdead; overflow: hidden; }
199
-
200
- .ruby-constant { color: #7fffd4; background: transparent; }
201
- .ruby-keyword { color: #00ffff; background: transparent; }
202
- .ruby-ivar { color: #eedd82; background: transparent; }
203
- .ruby-operator { color: #00ffee; background: transparent; }
204
- .ruby-identifier { color: #ffdead; background: transparent; }
205
- .ruby-node { color: #ffa07a; background: transparent; }
206
- .ruby-comment { color: #b22222; font-weight: bold; background: transparent; }
207
- .ruby-regexp { color: #ffa07a; background: transparent; }
208
- .ruby-value { color: #7fffd4; background: transparent; }