cobweb 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,9 +1,9 @@
1
1
 
2
- h1. Cobweb v0.0.1
2
+ h1. Cobweb v0.0.2
3
3
 
4
4
  h2. Intro
5
5
 
6
- Crawler that utilises resque jobs to perform the crawl allowing clustering of crawls.
6
+ Crawler that utilises resque jobs to perform the crawl, allowing clustering of crawls, and redis to cache responses. It currently requires redis to be running on the same machine; options to specify a redis server will be added shortly.
7
7
 
8
8
  h2. Installation
9
9
 
@@ -19,24 +19,31 @@ Creates a new crawler object based on a base_url
19
19
 
20
20
  * options - Options are passed in as a hash,
21
21
 
22
- ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash(true)
23
- ** :redirect_limit - sets the limit to be used for concurrent redirects(10)
24
- ** :processing_queue - specifies the processing queue for content to be sent to (ContentProcessJob)
25
- ** :debug - enables debug output (false)
26
- ** :quiet - hides default output (false)
27
- ** :cache - if set, enables the cache and sets the ttl (300)
22
+ ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
23
+ ** :redirect_limit - sets the limit on the number of consecutive redirects to follow (Default: 10)
24
+ ** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
25
+ ** :debug - enables debug output (Default: false)
26
+ ** :quiet - hides default output (Default: false)
27
+ ** :cache - sets the ttl (in seconds) for caching pages; set to nil to disable caching (Default: 300)
28
28
 
29
29
  bq. crawler = CobWeb.new(:follow_redirects => false)
30
30
 
31
-
32
31
  h4. start(base_url)
33
32
 
33
+ Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
34
+
34
35
  * base_url - the url to start the crawl from
35
36
 
37
+ bq. crawler.start("http://www.google.com/")
38
+
36
39
  h4. get(url)
37
40
 
41
+ Simple get that obeys the options supplied in new.
42
+
38
43
  * url - url requested
39
44
 
45
+ bq. crawler.get("http://www.google.com/")
46
+
40
47
 
41
48
  h2. License
42
49
 
data/lib/cobweb.rb CHANGED
@@ -11,10 +11,11 @@ class CobWeb
11
11
 
12
12
  def initialize(options = {})
13
13
  @options = options
14
- @options[:follow_redirects] = true if @options[:follow_redirects].nil?
15
- @options[:redirect_limit] = 10 if @options[:redirect_limit].nil?
16
- @options[:processing_queue] = ContentProcessJob if @options[:processing_queue].nil?
17
- @options[:debug] = false unless @options[:debug]
14
+ @options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
15
+ @options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
16
+ @options[:processing_queue] = ContentProcessJob unless @options.has_key?(:processing_queue)
17
+ @options[:debug] = false unless @options.has_key?(:debug)
18
+ @options[:cache] = 300 unless @options.has_key?(:cache)
18
19
 
19
20
  end
20
21
 
@@ -30,7 +31,6 @@ class CobWeb
30
31
  Resque.enqueue(CrawlJob, request)
31
32
  end
32
33
 
33
-
34
34
  def get(url, redirect_limit = @options[:redirect_limit])
35
35
 
36
36
  raise "url cannot be nil" if url.nil?
@@ -90,7 +90,6 @@ class CobWeb
90
90
  content[:content_body] = response.body
91
91
  content[:location] = response["location"]
92
92
  content[:headers] = response.to_hash.symbolize_keys
93
-
94
93
  # parse data for links
95
94
  link_parser = ContentLinkParser.new(content[:url], content[:content_body])
96
95
  content[:links] = link_parser.link_data
@@ -98,7 +97,7 @@ class CobWeb
98
97
  # add content to cache if required
99
98
  if @options[:cache]
100
99
  redis.set(unique_id, content.to_json)
101
- redis.expire unique_id, content_request[:cache].to_i
100
+ redis.expire unique_id, @options[:cache].to_i
102
101
  end
103
102
  end
104
103
  end
@@ -15,7 +15,7 @@ describe CobWeb do
15
15
  "Server" => "gws",
16
16
  "X-XSS-Protection" => "1; mode=block"}
17
17
 
18
- @cobweb = CobWeb.new :quiet => true
18
+ @cobweb = CobWeb.new :quiet => true, :cache => nil
19
19
  end
20
20
 
21
21
  describe "with mock" do
@@ -24,11 +24,11 @@ describe CobWeb do
24
24
  @mock_http_request = mock(Net::HTTPRequest)
25
25
  @mock_http_redirect_request = mock(Net::HTTPRequest)
26
26
  @mock_http_redirect_request2 = mock(Net::HTTPRequest)
27
-
27
+
28
28
  @mock_http_response = mock(Net::HTTPResponse)
29
29
  @mock_http_redirect_response = mock(Net::HTTPRedirection)
30
30
  @mock_http_get = mock(Net::HTTP::Get)
31
-
31
+
32
32
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
33
33
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
34
34
  Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
@@ -37,7 +37,7 @@ describe CobWeb do
37
37
  @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
38
38
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
39
39
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
40
-
40
+
41
41
  @mock_http_response.stub!(:code).and_return(200)
42
42
  @mock_http_response.stub!(:content_type).and_return("text/html")
43
43
  @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
@@ -72,36 +72,31 @@ describe CobWeb do
72
72
  it "should return a hash with default values" do
73
73
  @cobweb.get(@base_url).should be_an_instance_of Hash
74
74
  end
75
-
75
+
76
76
  it "should return a hash with default values without quiet option" do
77
77
  @cobweb.get(@base_url).should be_an_instance_of Hash
78
78
  end
79
-
79
+
80
80
  it "should raise exception if there is no url" do
81
81
  lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
82
82
  end
83
-
84
- describe "content object" do
85
83
 
84
+ describe "content object" do
86
85
  it "should return the url" do
87
86
  @cobweb.get(@base_url)[:url].should == @base_url
88
87
  end
89
-
90
- it "should return correct content-types" do
88
+ it "should return correct content-type" do
91
89
  @mock_http_response.stub!(:content_type).and_return("image/jpeg")
92
90
  @cobweb.get(@base_url)[:content_type].should == "image/jpeg"
93
91
  end
94
-
95
92
  it "should return correct status-code" do
96
93
  @mock_http_response.stub!(:code).and_return(404)
97
94
  @cobweb.get(@base_url)[:status_code].should == 404
98
95
  end
99
-
100
96
  it "should return correct status-code" do
101
97
  @mock_http_response.stub!(:code).and_return(404)
102
98
  @cobweb.get(@base_url)[:status_code].should == 404
103
99
  end
104
-
105
100
  it "should return correct character_set" do
106
101
  @cobweb.get(@base_url)[:character_set].should == "UTF-8"
107
102
  end
@@ -126,19 +121,18 @@ describe CobWeb do
126
121
  it "should return the response time for the url" do
127
122
  @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
128
123
  end
129
-
130
124
  end
131
125
  describe "with redirect" do
132
126
 
133
127
  before(:each) do
134
128
  @base_url = "http://redirect-me.com/redirect.html"
135
- @cobweb = CobWeb.new(:follow_redirects => true, :quiet => true)
129
+ @cobweb = CobWeb.new(:follow_redirects => true, :quiet => true, :cache => nil)
136
130
  end
137
131
 
138
132
  it "should flow through redirect" do
139
-
133
+
140
134
  @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
141
-
135
+
142
136
  content = @cobweb.get(@base_url)
143
137
  content.should be_an_instance_of Hash
144
138
 
@@ -150,15 +144,15 @@ describe CobWeb do
150
144
  end
151
145
  it "should return the path followed" do
152
146
  @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
153
-
147
+
154
148
  content = @cobweb.get(@base_url)
155
149
  content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
156
150
 
157
151
  end
158
152
  it "should not follow with redirect disabled" do
159
- @cobweb = CobWeb.new(:follow_redirects => false)
153
+ @cobweb = CobWeb.new(:follow_redirects => false, :cache => nil)
160
154
  @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
161
-
155
+
162
156
  content = @cobweb.get(@base_url)
163
157
  content[:url].should == "http://redirect-me.com/redirect.html"
164
158
  content[:redirect_through].should be_nil
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Stewart McKee
@@ -88,7 +88,6 @@ files:
88
88
  - spec/spec_helper.rb
89
89
  - spec/cobweb/content_link_parser_spec.rb
90
90
  - spec/cobweb/cobweb_spec.rb
91
- - spec/cobweb/crawl_job_spec.rb
92
91
  - lib/namespaced_redis.rb
93
92
  - lib/cobweb.rb
94
93
  - lib/content_process_job.rb
@@ -1,24 +0,0 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
-
3
-
4
- describe CrawlJob do
5
-
6
- before(:each) do
7
- @base_url = "http://www.baseurl.com/"
8
-
9
-
10
-
11
- client = Net::HTTPClient.new
12
- puts client.get('http://www.google.com.au')
13
- puts "asdf"
14
-
15
- @cobweb = CobWeb.new("http://www.google.com")
16
-
17
- end
18
-
19
- it "should be a cobweb type" do
20
- @cobweb.should be_an_instance_of CobWeb
21
- end
22
-
23
-
24
- end