arachnid 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/arachnid.rb +13 -4
- metadata +70 -58
data/lib/arachnid.rb
CHANGED
@@ -47,7 +47,7 @@ class Arachnid
|
|
47
47
|
links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
|
48
48
|
|
49
49
|
links.each do |link|
|
50
|
-
if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
|
50
|
+
if(internal_link?(link, response.effective_url) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
|
51
51
|
|
52
52
|
sanitized_link = sanitize_link(split_url_at_hash(link))
|
53
53
|
if(sanitized_link)
|
@@ -83,15 +83,23 @@ class Arachnid
|
|
83
83
|
|
84
84
|
begin
|
85
85
|
parsed_domain = Domainatrix.parse(url)
|
86
|
-
|
86
|
+
|
87
|
+
if(parsed_domain.subdomain != "")
|
88
|
+
parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
|
89
|
+
else
|
90
|
+
parsed_domain.domain + '.' + parsed_domain.public_suffix
|
91
|
+
end
|
87
92
|
rescue NoMethodError, Addressable::URI::InvalidURIError => e
|
88
93
|
puts "URL Parsing Exception (#{url}): #{e}" if @debug == true
|
89
94
|
return nil
|
90
95
|
end
|
91
96
|
end
|
92
97
|
|
93
|
-
def internal_link?(url)
|
94
|
-
|
98
|
+
def internal_link?(url, effective_url)
|
99
|
+
|
100
|
+
absolute_url = make_absolute(url, effective_url)
|
101
|
+
|
102
|
+
parsed_url = parse_domain(absolute_url)
|
95
103
|
if(@domain == parsed_url)
|
96
104
|
return true
|
97
105
|
else
|
@@ -117,6 +125,7 @@ class Arachnid
|
|
117
125
|
end
|
118
126
|
|
119
127
|
def no_image_in_url?(url)
|
128
|
+
return true if url.to_s.length == 0
|
120
129
|
return true unless @exclude_urls_with_images
|
121
130
|
|
122
131
|
extensions = ['.jpg', '.gif', '.png', '.jpeg']
|
metadata
CHANGED
@@ -1,99 +1,111 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
4
5
|
prerelease:
|
5
|
-
version: 0.2.2
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- dchuk
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
dependencies:
|
16
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2014-01-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
17
15
|
name: typhoeus
|
18
|
-
|
19
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
20
17
|
none: false
|
21
|
-
requirements:
|
22
|
-
- -
|
23
|
-
- !ruby/object:Gem::Version
|
18
|
+
requirements:
|
19
|
+
- - '='
|
20
|
+
- !ruby/object:Gem::Version
|
24
21
|
version: 0.3.2
|
25
22
|
type: :runtime
|
26
|
-
version_requirements: *id001
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: bloomfilter-rb
|
29
23
|
prerelease: false
|
30
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - '='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.3.2
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: bloomfilter-rb
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
31
33
|
none: false
|
32
|
-
requirements:
|
33
|
-
- -
|
34
|
-
- !ruby/object:Gem::Version
|
34
|
+
requirements:
|
35
|
+
- - '='
|
36
|
+
- !ruby/object:Gem::Version
|
35
37
|
version: 2.1.1
|
36
38
|
type: :runtime
|
37
|
-
version_requirements: *id002
|
38
|
-
- !ruby/object:Gem::Dependency
|
39
|
-
name: nokogiri
|
40
39
|
prerelease: false
|
41
|
-
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - '='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 2.1.1
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: nokogiri
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
42
49
|
none: false
|
43
|
-
requirements:
|
44
|
-
- -
|
45
|
-
- !ruby/object:Gem::Version
|
50
|
+
requirements:
|
51
|
+
- - '='
|
52
|
+
- !ruby/object:Gem::Version
|
46
53
|
version: 1.5.0
|
47
54
|
type: :runtime
|
48
|
-
version_requirements: *id003
|
49
|
-
- !ruby/object:Gem::Dependency
|
50
|
-
name: domainatrix
|
51
55
|
prerelease: false
|
52
|
-
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - '='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.5.0
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: domainatrix
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
53
65
|
none: false
|
54
|
-
requirements:
|
55
|
-
- -
|
56
|
-
- !ruby/object:Gem::Version
|
66
|
+
requirements:
|
67
|
+
- - '='
|
68
|
+
- !ruby/object:Gem::Version
|
57
69
|
version: 0.0.10
|
58
70
|
type: :runtime
|
59
|
-
|
60
|
-
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - '='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 0.0.10
|
78
|
+
description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
|
79
|
+
store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
|
80
|
+
every page on a domain.
|
61
81
|
email: me@dchuk.com
|
62
82
|
executables: []
|
63
|
-
|
64
83
|
extensions: []
|
65
|
-
|
66
84
|
extra_rdoc_files: []
|
67
|
-
|
68
|
-
files:
|
85
|
+
files:
|
69
86
|
- lib/arachnid.rb
|
70
|
-
has_rdoc: true
|
71
87
|
homepage: https://github.com/dchuk/Arachnid
|
72
88
|
licenses: []
|
73
|
-
|
74
89
|
post_install_message:
|
75
90
|
rdoc_options: []
|
76
|
-
|
77
|
-
require_paths:
|
91
|
+
require_paths:
|
78
92
|
- lib
|
79
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
80
94
|
none: false
|
81
|
-
requirements:
|
82
|
-
- -
|
83
|
-
- !ruby/object:Gem::Version
|
84
|
-
version:
|
85
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
100
|
none: false
|
87
|
-
requirements:
|
88
|
-
- -
|
89
|
-
- !ruby/object:Gem::Version
|
90
|
-
version:
|
101
|
+
requirements:
|
102
|
+
- - ! '>='
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
91
105
|
requirements: []
|
92
|
-
|
93
106
|
rubyforge_project:
|
94
|
-
rubygems_version: 1.
|
107
|
+
rubygems_version: 1.8.23
|
95
108
|
signing_key:
|
96
109
|
specification_version: 3
|
97
110
|
summary: Extremely fast and efficient domain spider
|
98
111
|
test_files: []
|
99
|
-
|