cobweb 1.0.8 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.8
2
+ h1. Cobweb v1.0.9
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
data/lib/cobweb.rb CHANGED
@@ -42,6 +42,7 @@ class Cobweb
42
42
  default_timeout_to 10
43
43
  default_redis_options_to Hash.new
44
44
  default_internal_urls_to []
45
+ default_external_urls_to []
45
46
  default_first_page_redirect_internal_to true
46
47
  default_text_mime_types_to ["text/*", "application/xhtml+xml"]
47
48
  default_obey_robots_to false
@@ -49,7 +50,7 @@ class Cobweb
49
50
  default_valid_mime_types_to ["*/*"]
50
51
  default_raise_exceptions_to false
51
52
  default_store_refered_url_to false
52
-
53
+
53
54
  end
54
55
 
55
56
  # This method starts the Resque-based crawl and enqueues the base_url
@@ -101,7 +101,7 @@ class CobwebCrawler
101
101
  # select the link if it's internal (eliminate external links before the expensive lookups in queued and crawled)
102
102
  cobweb_links = CobwebLinks.new(@options)
103
103
 
104
- internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
104
+ internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
105
105
 
106
106
  all_internal_links = internal_links
107
107
 
data/lib/cobweb_links.rb CHANGED
@@ -35,6 +35,10 @@ class CobwebLinks
35
35
  def external?(link)
36
36
  @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
37
37
  end
38
+
39
+ def matches_external?(link)
40
+ !@external_patterns.select{|pattern| link.match(pattern)}.empty?
41
+ end
38
42
 
39
43
  end
40
44
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.8"
6
+ "1.0.9"
7
7
  end
8
8
 
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.8
4
+ version: 1.0.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-12 00:00:00.000000000 Z
12
+ date: 2013-02-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70128767187740 !ruby/object:Gem::Requirement
16
+ requirement: &70315669563580 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70128767187740
24
+ version_requirements: *70315669563580
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70128767183580 !ruby/object:Gem::Requirement
27
+ requirement: &70315669557780 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70128767183580
35
+ version_requirements: *70315669557780
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70128767182220 !ruby/object:Gem::Requirement
38
+ requirement: &70315669553860 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70128767182220
46
+ version_requirements: *70315669553860
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70128767175380 !ruby/object:Gem::Requirement
49
+ requirement: &70315669550620 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70128767175380
57
+ version_requirements: *70315669550620
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70128767172200 !ruby/object:Gem::Requirement
60
+ requirement: &70315669548760 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70128767172200
68
+ version_requirements: *70315669548760
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70128767170580 !ruby/object:Gem::Requirement
71
+ requirement: &70315669546580 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70128767170580
79
+ version_requirements: *70315669546580
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70128767167800 !ruby/object:Gem::Requirement
82
+ requirement: &70315669542520 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70128767167800
90
+ version_requirements: *70315669542520
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70128767161460 !ruby/object:Gem::Requirement
93
+ requirement: &70315669538180 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70128767161460
101
+ version_requirements: *70315669538180
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70128767157640 !ruby/object:Gem::Requirement
104
+ requirement: &70315669533960 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70128767157640
112
+ version_requirements: *70315669533960
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70128767152300 !ruby/object:Gem::Requirement
115
+ requirement: &70315669528800 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70128767152300
123
+ version_requirements: *70315669528800
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70128767148180 !ruby/object:Gem::Requirement
126
+ requirement: &70315669525400 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70128767148180
134
+ version_requirements: *70315669525400
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface