spider 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/CHANGES +6 -0
  2. data/README +3 -3
  3. data/doc/classes/BeStaticServerPages.html +197 -0
  4. data/doc/classes/BeStaticServerPages.src/M000030.html +19 -0
  5. data/doc/classes/BeStaticServerPages.src/M000031.html +19 -0
  6. data/doc/classes/BeStaticServerPages.src/M000032.html +18 -0
  7. data/doc/classes/BeStaticServerPages.src/M000033.html +18 -0
  8. data/doc/classes/IncludedInMemcached.html +18 -45
  9. data/doc/classes/IncludedInMemcached.src/M000015.html +18 -0
  10. data/doc/classes/IncludedInMemcached.src/M000016.html +18 -0
  11. data/doc/classes/IncludedInMemcached.src/M000017.html +18 -0
  12. data/doc/classes/LoopingServlet.html +137 -0
  13. data/doc/classes/LoopingServlet.src/M000037.html +23 -0
  14. data/doc/classes/NextUrlsInSQS.html +204 -0
  15. data/doc/classes/NextUrlsInSQS.src/M000018.html +19 -0
  16. data/doc/classes/NextUrlsInSQS.src/M000019.html +22 -0
  17. data/doc/classes/NextUrlsInSQS.src/M000020.html +19 -0
  18. data/doc/classes/QueryServlet.html +137 -0
  19. data/doc/classes/QueryServlet.src/M000038.html +19 -0
  20. data/doc/classes/RobotRules.html +175 -0
  21. data/doc/classes/RobotRules.src/M000034.html +19 -0
  22. data/doc/classes/RobotRules.src/M000035.html +67 -0
  23. data/doc/classes/RobotRules.src/M000036.html +24 -0
  24. data/doc/classes/Spider.html +5 -17
  25. data/doc/classes/Spider.src/M000029.html +21 -0
  26. data/doc/classes/SpiderInstance.html +72 -108
  27. data/doc/classes/SpiderInstance.src/M000021.html +18 -0
  28. data/doc/classes/SpiderInstance.src/M000022.html +22 -0
  29. data/doc/classes/SpiderInstance.src/M000023.html +22 -0
  30. data/doc/classes/SpiderInstance.src/M000024.html +24 -0
  31. data/doc/classes/SpiderInstance.src/M000025.html +18 -0
  32. data/doc/classes/SpiderInstance.src/M000026.html +18 -0
  33. data/doc/classes/SpiderInstance.src/M000027.html +18 -0
  34. data/doc/classes/SpiderInstance.src/M000028.html +18 -0
  35. data/doc/created.rid +1 -1
  36. data/doc/files/lib/spider/included_in_memcached_rb.html +29 -1
  37. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +144 -0
  38. data/doc/files/lib/spider/robot_rules_rb.html +114 -0
  39. data/doc/files/lib/spider/spider_instance_rb.html +1 -2
  40. data/doc/files/lib/spider_rb.html +40 -9
  41. data/doc/files/spec/spec_helper_rb.html +196 -0
  42. data/doc/files/spec/spec_helper_rb.src/M000001.html +20 -0
  43. data/doc/files/spec/spec_helper_rb.src/M000002.html +26 -0
  44. data/doc/files/spec/spec_helper_rb.src/M000003.html +24 -0
  45. data/doc/files/spec/spec_helper_rb.src/M000004.html +18 -0
  46. data/doc/files/spec/spec_helper_rb.src/M000005.html +23 -0
  47. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +142 -0
  48. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +19 -0
  49. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +18 -0
  50. data/doc/files/spec/spider/spider_instance_spec_rb.html +210 -0
  51. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +21 -0
  52. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +19 -0
  53. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +19 -0
  54. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +27 -0
  55. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +26 -0
  56. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +27 -0
  57. data/doc/files/spec/spider_spec_rb.html +127 -0
  58. data/doc/files/spec/spider_spec_rb.src/M000014.html +23 -0
  59. data/doc/fr_class_index.html +5 -0
  60. data/doc/fr_file_index.html +6 -1
  61. data/doc/fr_method_index.html +38 -11
  62. data/doc/index.html +1 -1
  63. data/lib/spider/spider_instance.rb +15 -7
  64. data/spider.gemspec +1 -1
  65. metadata +84 -22
  66. data/lib/test.rb +0 -27
@@ -0,0 +1,18 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>&lt;&lt; (IncludedInMemcached)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 45</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-operator">&lt;&lt;</span>(<span class="ruby-identifier">v</span>)
15
+ <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
16
+ <span class="ruby-keyword kw">end</span></pre>
17
+ </body>
18
+ </html>
@@ -0,0 +1,18 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>include? (IncludedInMemcached)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 50</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
15
+ <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
16
+ <span class="ruby-keyword kw">end</span></pre>
17
+ </body>
18
+ </html>
@@ -0,0 +1,137 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: LoopingServlet</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">LoopingServlet</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/spec/spec_helper_rb.html">
59
+ spec/spec_helper.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ WEBrick::HTTPServlet::AbstractServlet
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000037">do_GET</a>&nbsp;&nbsp;
90
+ </div>
91
+ </div>
92
+
93
+ </div>
94
+
95
+
96
+ <!-- if includes -->
97
+
98
+ <div id="section">
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+ <!-- if method_list -->
108
+ <div id="methods">
109
+ <h3 class="section-bar">Public Instance methods</h3>
110
+
111
+ <div id="method-M000037" class="method-detail">
112
+ <a name="M000037"></a>
113
+
114
+ <div class="method-heading">
115
+ <a href="LoopingServlet.src/M000037.html" target="Code" class="method-signature"
116
+ onclick="popupCode('LoopingServlet.src/M000037.html');return false;">
117
+ <span class="method-name">do_GET</span><span class="method-args">(req, res)</span>
118
+ </a>
119
+ </div>
120
+
121
+ <div class="method-description">
122
+ </div>
123
+ </div>
124
+
125
+
126
+ </div>
127
+
128
+
129
+ </div>
130
+
131
+
132
+ <div id="validator-badges">
133
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
134
+ </div>
135
+
136
+ </body>
137
+ </html>
@@ -0,0 +1,23 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>do_GET (LoopingServlet)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File spec/spec_helper.rb, line 69</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">do_GET</span>(<span class="ruby-identifier">req</span>, <span class="ruby-identifier">res</span>)
15
+ <span class="ruby-identifier">res</span>[<span class="ruby-value str">'Content-type'</span>] = <span class="ruby-value str">'text/html'</span>
16
+ <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">req</span>.<span class="ruby-identifier">path</span> <span class="ruby-operator">==</span> <span class="ruby-value str">'/foo'</span>
17
+ <span class="ruby-identifier">res</span>.<span class="ruby-identifier">body</span> = <span class="ruby-value str">&quot;&lt;a href=\&quot;/\&quot;&gt;a&lt;/a&gt;\n&quot;</span>
18
+ <span class="ruby-keyword kw">else</span>
19
+ <span class="ruby-identifier">res</span>.<span class="ruby-identifier">body</span> = <span class="ruby-value str">&quot;&lt;a href=\&quot;/foo\&quot;&gt;b&lt;/a&gt;\n&quot;</span>
20
+ <span class="ruby-keyword kw">end</span>
21
+ <span class="ruby-keyword kw">end</span></pre>
22
+ </body>
23
+ </html>
@@ -0,0 +1,204 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: NextUrlsInSQS</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">NextUrlsInSQS</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/spider/next_urls_in_sqs_rb.html">
59
+ lib/spider/next_urls_in_sqs.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+ <div id="description">
82
+ <p>
83
+ A specialized class using AmazonSQS to track nodes to walk. It supports two
84
+ operations: <a href="NextUrlsInSQS.html#M000020">push</a> and <a
85
+ href="NextUrlsInSQS.html#M000019">pop</a>. Together these can be used to
86
+ add items to the queue, then pull items off the queue.
87
+ </p>
88
+ <p>
89
+ This is useful if you want multiple <a href="Spider.html">Spider</a>
90
+ processes crawling the same data set.
91
+ </p>
92
+ <p>
93
+ To use it with <a href="Spider.html">Spider</a> use the
94
+ store_next_urls_with method:
95
+ </p>
96
+ <pre>
97
+ Spider.start_at('http://example.com/') do |s|
98
+ s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
99
+ end
100
+ </pre>
101
+
102
+ </div>
103
+
104
+
105
+ </div>
106
+
107
+ <div id="method-list">
108
+ <h3 class="section-bar">Methods</h3>
109
+
110
+ <div class="name-list">
111
+ <a href="#M000018">new</a>&nbsp;&nbsp;
112
+ <a href="#M000019">pop</a>&nbsp;&nbsp;
113
+ <a href="#M000020">push</a>&nbsp;&nbsp;
114
+ </div>
115
+ </div>
116
+
117
+ </div>
118
+
119
+
120
+ <!-- if includes -->
121
+
122
+ <div id="section">
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+ <!-- if method_list -->
132
+ <div id="methods">
133
+ <h3 class="section-bar">Public Class methods</h3>
134
+
135
+ <div id="method-M000018" class="method-detail">
136
+ <a name="M000018"></a>
137
+
138
+ <div class="method-heading">
139
+ <a href="NextUrlsInSQS.src/M000018.html" target="Code" class="method-signature"
140
+ onclick="popupCode('NextUrlsInSQS.src/M000018.html');return false;">
141
+ <span class="method-name">new</span><span class="method-args">(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')</span>
142
+ </a>
143
+ </div>
144
+
145
+ <div class="method-description">
146
+ <p>
147
+ Construct a <a href="NextUrlsInSQS.html#M000018">new</a> <a
148
+ href="NextUrlsInSQS.html">NextUrlsInSQS</a> instance. All arguments here
149
+ are passed to RightAws::SqsGen2 (part of the right_aws gem) or used to set
150
+ the AmazonSQS queue name (optional).
151
+ </p>
152
+ </div>
153
+ </div>
154
+
155
+ <h3 class="section-bar">Public Instance methods</h3>
156
+
157
+ <div id="method-M000019" class="method-detail">
158
+ <a name="M000019"></a>
159
+
160
+ <div class="method-heading">
161
+ <a href="NextUrlsInSQS.src/M000019.html" target="Code" class="method-signature"
162
+ onclick="popupCode('NextUrlsInSQS.src/M000019.html');return false;">
163
+ <span class="method-name">pop</span><span class="method-args">()</span>
164
+ </a>
165
+ </div>
166
+
167
+ <div class="method-description">
168
+ <p>
169
+ Pull an item off the queue, looping until data is found. Data is encoded with
170
+ YAML.
171
+ </p>
172
+ </div>
173
+ </div>
174
+
175
+ <div id="method-M000020" class="method-detail">
176
+ <a name="M000020"></a>
177
+
178
+ <div class="method-heading">
179
+ <a href="NextUrlsInSQS.src/M000020.html" target="Code" class="method-signature"
180
+ onclick="popupCode('NextUrlsInSQS.src/M000020.html');return false;">
181
+ <span class="method-name">push</span><span class="method-args">(a_msg)</span>
182
+ </a>
183
+ </div>
184
+
185
+ <div class="method-description">
186
+ <p>
187
+ Put data on the queue. Data is encoded with YAML.
188
+ </p>
189
+ </div>
190
+ </div>
191
+
192
+
193
+ </div>
194
+
195
+
196
+ </div>
197
+
198
+
199
+ <div id="validator-badges">
200
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
201
+ </div>
202
+
203
+ </body>
204
+ </html>
@@ -0,0 +1,19 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>new (NextUrlsInSQS)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/spider/next_urls_in_sqs.rb, line 46</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">aws_access_key</span>, <span class="ruby-identifier">aws_secret_access_key</span>, <span class="ruby-identifier">queue_name</span> = <span class="ruby-value str">'ruby-spider'</span>)
15
+ <span class="ruby-ivar">@sqs</span> = <span class="ruby-constant">RightAws</span><span class="ruby-operator">::</span><span class="ruby-constant">SqsGen2</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">aws_access_key</span>, <span class="ruby-identifier">aws_secret_access_key</span>)
16
+ <span class="ruby-ivar">@queue</span> = <span class="ruby-ivar">@sqs</span>.<span class="ruby-identifier">queue</span>(<span class="ruby-identifier">queue_name</span>)
17
+ <span class="ruby-keyword kw">end</span></pre>
18
+ </body>
19
+ </html>
@@ -0,0 +1,22 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>pop (NextUrlsInSQS)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/spider/next_urls_in_sqs.rb, line 53</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">pop</span>
15
+ <span class="ruby-keyword kw">while</span> <span class="ruby-keyword kw">true</span>
16
+ <span class="ruby-identifier">message</span> = <span class="ruby-ivar">@queue</span>.<span class="ruby-identifier">pop</span>
17
+ <span class="ruby-keyword kw">return</span> <span class="ruby-constant">YAML</span><span class="ruby-operator">::</span><span class="ruby-identifier">load</span>(<span class="ruby-identifier">message</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">message</span>.<span class="ruby-identifier">nil?</span>
18
+ <span class="ruby-identifier">sleep</span> <span class="ruby-value">5</span>
19
+ <span class="ruby-keyword kw">end</span>
20
+ <span class="ruby-keyword kw">end</span></pre>
21
+ </body>
22
+ </html>