cobweb 1.0.8 → 1.0.9

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.8
2
+ h1. Cobweb v1.0.9
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
data/lib/cobweb.rb CHANGED
@@ -42,6 +42,7 @@ class Cobweb
42
42
  default_timeout_to 10
43
43
  default_redis_options_to Hash.new
44
44
  default_internal_urls_to []
45
+ default_external_urls_to []
45
46
  default_first_page_redirect_internal_to true
46
47
  default_text_mime_types_to ["text/*", "application/xhtml+xml"]
47
48
  default_obey_robots_to false
@@ -49,7 +50,7 @@ class Cobweb
49
50
  default_valid_mime_types_to ["*/*"]
50
51
  default_raise_exceptions_to false
51
52
  default_store_refered_url_to false
52
-
53
+
53
54
  end
54
55
 
55
56
  # This method starts the resque based crawl and enqueues the base_url
@@ -101,7 +101,7 @@ class CobwebCrawler
101
101
  # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
102
102
  cobweb_links = CobwebLinks.new(@options)
103
103
 
104
- internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
104
+ internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
105
105
 
106
106
  all_internal_links = internal_links
107
107
 
data/lib/cobweb_links.rb CHANGED
@@ -35,6 +35,10 @@ class CobwebLinks
35
35
  def external?(link)
36
36
  @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
37
37
  end
38
+
39
+ def matches_external?(link)
40
+ !@external_patterns.select{|pattern| link.match(pattern)}.empty?
41
+ end
38
42
 
39
43
  end
40
44
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.8"
6
+ "1.0.9"
7
7
  end
8
8
 
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.8
4
+ version: 1.0.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-12 00:00:00.000000000 Z
12
+ date: 2013-02-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70128767187740 !ruby/object:Gem::Requirement
16
+ requirement: &70315669563580 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70128767187740
24
+ version_requirements: *70315669563580
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70128767183580 !ruby/object:Gem::Requirement
27
+ requirement: &70315669557780 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70128767183580
35
+ version_requirements: *70315669557780
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70128767182220 !ruby/object:Gem::Requirement
38
+ requirement: &70315669553860 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70128767182220
46
+ version_requirements: *70315669553860
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70128767175380 !ruby/object:Gem::Requirement
49
+ requirement: &70315669550620 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70128767175380
57
+ version_requirements: *70315669550620
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70128767172200 !ruby/object:Gem::Requirement
60
+ requirement: &70315669548760 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70128767172200
68
+ version_requirements: *70315669548760
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70128767170580 !ruby/object:Gem::Requirement
71
+ requirement: &70315669546580 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70128767170580
79
+ version_requirements: *70315669546580
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70128767167800 !ruby/object:Gem::Requirement
82
+ requirement: &70315669542520 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70128767167800
90
+ version_requirements: *70315669542520
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70128767161460 !ruby/object:Gem::Requirement
93
+ requirement: &70315669538180 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70128767161460
101
+ version_requirements: *70315669538180
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70128767157640 !ruby/object:Gem::Requirement
104
+ requirement: &70315669533960 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70128767157640
112
+ version_requirements: *70315669533960
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70128767152300 !ruby/object:Gem::Requirement
115
+ requirement: &70315669528800 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70128767152300
123
+ version_requirements: *70315669528800
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70128767148180 !ruby/object:Gem::Requirement
126
+ requirement: &70315669525400 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70128767148180
134
+ version_requirements: *70315669525400
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface