cobweb 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +16 -9
- data/lib/cobweb.rb +6 -7
- data/spec/cobweb/cobweb_spec.rb +14 -20
- metadata +3 -4
- data/spec/cobweb/crawl_job_spec.rb +0 -24
data/README.textile
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.1
|
2
|
+
h1. Cobweb v0.0.2
|
3
3
|
|
4
4
|
h2. Intro
|
5
5
|
|
6
|
-
Crawler that utilises resque jobs to perform the crawl allowing clustering of crawls.
|
6
|
+
Crawler that utilises resque jobs to perform the crawl, allowing clustering of crawls, and redis to perform caching of responses. It currently requires redis to be running on the same machine; options to specify a redis server will be added shortly.
|
7
7
|
|
8
8
|
h2. Installation
|
9
9
|
|
@@ -19,24 +19,31 @@ Creates a new crawler object based on a base_url
|
|
19
19
|
|
20
20
|
* options - Options are passed in as a hash,
|
21
21
|
|
22
|
-
** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash(true)
|
23
|
-
** :redirect_limit - sets the limit to be used for concurrent redirects(10)
|
24
|
-
** :processing_queue - specifies the processing queue for content to be sent to (ContentProcessJob)
|
25
|
-
** :debug - enables debug output (false)
|
26
|
-
** :quiet - hides default output (false)
|
27
|
-
** :cache -
|
22
|
+
** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
|
23
|
+
** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
|
24
|
+
** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
|
25
|
+
** :debug - enables debug output (Default: false)
|
26
|
+
** :quiet - hides default output (Default: false)
|
27
|
+
** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
|
28
28
|
|
29
29
|
bq. crawler = CobWeb.new(:follow_redirects => false)
|
30
30
|
|
31
|
-
|
32
31
|
h4. start(base_url)
|
33
32
|
|
33
|
+
Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
|
34
|
+
|
34
35
|
* base_url - the url to start the crawl from
|
35
36
|
|
37
|
+
bq. crawler.start("http://www.google.com/")
|
38
|
+
|
36
39
|
h4. get(url)
|
37
40
|
|
41
|
+
Simple get that obeys the options supplied in new.
|
42
|
+
|
38
43
|
* url - url requested
|
39
44
|
|
45
|
+
bq. crawler.get("http://www.google.com/")
|
46
|
+
|
40
47
|
|
41
48
|
h2. License
|
42
49
|
|
data/lib/cobweb.rb
CHANGED
@@ -11,10 +11,11 @@ class CobWeb
|
|
11
11
|
|
12
12
|
def initialize(options = {})
|
13
13
|
@options = options
|
14
|
-
@options[:follow_redirects] = true
|
15
|
-
@options[:redirect_limit] = 10
|
16
|
-
@options[:processing_queue] = ContentProcessJob
|
17
|
-
@options[:debug] = false unless @options
|
14
|
+
@options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
|
15
|
+
@options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
|
16
|
+
@options[:processing_queue] = ContentProcessJob unless @options.has_key?(:processing_queue)
|
17
|
+
@options[:debug] = false unless @options.has_key?(:debug)
|
18
|
+
@options[:cache] = 300 unless @options.has_key?(:cache)
|
18
19
|
|
19
20
|
end
|
20
21
|
|
@@ -30,7 +31,6 @@ class CobWeb
|
|
30
31
|
Resque.enqueue(CrawlJob, request)
|
31
32
|
end
|
32
33
|
|
33
|
-
|
34
34
|
def get(url, redirect_limit = @options[:redirect_limit])
|
35
35
|
|
36
36
|
raise "url cannot be nil" if url.nil?
|
@@ -90,7 +90,6 @@ class CobWeb
|
|
90
90
|
content[:content_body] = response.body
|
91
91
|
content[:location] = response["location"]
|
92
92
|
content[:headers] = response.to_hash.symbolize_keys
|
93
|
-
|
94
93
|
# parse data for links
|
95
94
|
link_parser = ContentLinkParser.new(content[:url], content[:content_body])
|
96
95
|
content[:links] = link_parser.link_data
|
@@ -98,7 +97,7 @@ class CobWeb
|
|
98
97
|
# add content to cache if required
|
99
98
|
if @options[:cache]
|
100
99
|
redis.set(unique_id, content.to_json)
|
101
|
-
redis.expire unique_id,
|
100
|
+
redis.expire unique_id, @options[:cache].to_i
|
102
101
|
end
|
103
102
|
end
|
104
103
|
end
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -15,7 +15,7 @@ describe CobWeb do
|
|
15
15
|
"Server" => "gws",
|
16
16
|
"X-XSS-Protection" => "1; mode=block"}
|
17
17
|
|
18
|
-
@cobweb = CobWeb.new :quiet => true
|
18
|
+
@cobweb = CobWeb.new :quiet => true, :cache => nil
|
19
19
|
end
|
20
20
|
|
21
21
|
describe "with mock" do
|
@@ -24,11 +24,11 @@ describe CobWeb do
|
|
24
24
|
@mock_http_request = mock(Net::HTTPRequest)
|
25
25
|
@mock_http_redirect_request = mock(Net::HTTPRequest)
|
26
26
|
@mock_http_redirect_request2 = mock(Net::HTTPRequest)
|
27
|
-
|
27
|
+
|
28
28
|
@mock_http_response = mock(Net::HTTPResponse)
|
29
29
|
@mock_http_redirect_response = mock(Net::HTTPRedirection)
|
30
30
|
@mock_http_get = mock(Net::HTTP::Get)
|
31
|
-
|
31
|
+
|
32
32
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
33
33
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
34
34
|
Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
|
@@ -37,7 +37,7 @@ describe CobWeb do
|
|
37
37
|
@mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
|
38
38
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
39
39
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
|
40
|
-
|
40
|
+
|
41
41
|
@mock_http_response.stub!(:code).and_return(200)
|
42
42
|
@mock_http_response.stub!(:content_type).and_return("text/html")
|
43
43
|
@mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
@@ -72,36 +72,31 @@ describe CobWeb do
|
|
72
72
|
it "should return a hash with default values" do
|
73
73
|
@cobweb.get(@base_url).should be_an_instance_of Hash
|
74
74
|
end
|
75
|
-
|
75
|
+
|
76
76
|
it "should return a hash with default values without quiet option" do
|
77
77
|
@cobweb.get(@base_url).should be_an_instance_of Hash
|
78
78
|
end
|
79
|
-
|
79
|
+
|
80
80
|
it "should raise exception if there is no url" do
|
81
81
|
lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
|
82
82
|
end
|
83
|
-
|
84
|
-
describe "content object" do
|
85
83
|
|
84
|
+
describe "content object" do
|
86
85
|
it "should return the url" do
|
87
86
|
@cobweb.get(@base_url)[:url].should == @base_url
|
88
87
|
end
|
89
|
-
|
90
|
-
it "should return correct content-types" do
|
88
|
+
it "should return correct content-type" do
|
91
89
|
@mock_http_response.stub!(:content_type).and_return("image/jpeg")
|
92
90
|
@cobweb.get(@base_url)[:content_type].should == "image/jpeg"
|
93
91
|
end
|
94
|
-
|
95
92
|
it "should return correct status-code" do
|
96
93
|
@mock_http_response.stub!(:code).and_return(404)
|
97
94
|
@cobweb.get(@base_url)[:status_code].should == 404
|
98
95
|
end
|
99
|
-
|
100
96
|
it "should return correct status-code" do
|
101
97
|
@mock_http_response.stub!(:code).and_return(404)
|
102
98
|
@cobweb.get(@base_url)[:status_code].should == 404
|
103
99
|
end
|
104
|
-
|
105
100
|
it "should return correct character_set" do
|
106
101
|
@cobweb.get(@base_url)[:character_set].should == "UTF-8"
|
107
102
|
end
|
@@ -126,19 +121,18 @@ describe CobWeb do
|
|
126
121
|
it "should return the response time for the url" do
|
127
122
|
@cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
|
128
123
|
end
|
129
|
-
|
130
124
|
end
|
131
125
|
describe "with redirect" do
|
132
126
|
|
133
127
|
before(:each) do
|
134
128
|
@base_url = "http://redirect-me.com/redirect.html"
|
135
|
-
@cobweb = CobWeb.new(:follow_redirects => true, :quiet => true)
|
129
|
+
@cobweb = CobWeb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
136
130
|
end
|
137
131
|
|
138
132
|
it "should flow through redirect" do
|
139
|
-
|
133
|
+
|
140
134
|
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
141
|
-
|
135
|
+
|
142
136
|
content = @cobweb.get(@base_url)
|
143
137
|
content.should be_an_instance_of Hash
|
144
138
|
|
@@ -150,15 +144,15 @@ describe CobWeb do
|
|
150
144
|
end
|
151
145
|
it "should return the path followed" do
|
152
146
|
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
153
|
-
|
147
|
+
|
154
148
|
content = @cobweb.get(@base_url)
|
155
149
|
content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
156
150
|
|
157
151
|
end
|
158
152
|
it "should not follow with redirect disabled" do
|
159
|
-
@cobweb = CobWeb.new(:follow_redirects => false)
|
153
|
+
@cobweb = CobWeb.new(:follow_redirects => false, :cache => nil)
|
160
154
|
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
161
|
-
|
155
|
+
|
162
156
|
content = @cobweb.get(@base_url)
|
163
157
|
content[:url].should == "http://redirect-me.com/redirect.html"
|
164
158
|
content[:redirect_through].should be_nil
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 1
|
10
|
-
version: 0.0.1
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Stewart McKee
|
@@ -88,7 +88,6 @@ files:
|
|
88
88
|
- spec/spec_helper.rb
|
89
89
|
- spec/cobweb/content_link_parser_spec.rb
|
90
90
|
- spec/cobweb/cobweb_spec.rb
|
91
|
-
- spec/cobweb/crawl_job_spec.rb
|
92
91
|
- lib/namespaced_redis.rb
|
93
92
|
- lib/cobweb.rb
|
94
93
|
- lib/content_process_job.rb
|
@@ -1,24 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
|
3
|
-
|
4
|
-
describe CrawlJob do
|
5
|
-
|
6
|
-
before(:each) do
|
7
|
-
@base_url = "http://www.baseurl.com/"
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
client = Net::HTTPClient.new
|
12
|
-
puts client.get('http://www.google.com.au')
|
13
|
-
puts "asdf"
|
14
|
-
|
15
|
-
@cobweb = CobWeb.new("http://www.google.com")
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
it "should be a cobweb type" do
|
20
|
-
@cobweb.should be_an_instance_of CobWeb
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
end
|