arachnid 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/lib/arachnid.rb +13 -4
  2. metadata +70 -58
data/lib/arachnid.rb CHANGED
@@ -47,7 +47,7 @@ class Arachnid
47
47
  links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
48
48
 
49
49
  links.each do |link|
50
- if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
50
+ if(internal_link?(link, response.effective_url) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
51
51
 
52
52
  sanitized_link = sanitize_link(split_url_at_hash(link))
53
53
  if(sanitized_link)
@@ -83,15 +83,23 @@ class Arachnid
83
83
 
84
84
  begin
85
85
  parsed_domain = Domainatrix.parse(url)
86
- parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
86
+
87
+ if(parsed_domain.subdomain != "")
88
+ parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
89
+ else
90
+ parsed_domain.domain + '.' + parsed_domain.public_suffix
91
+ end
87
92
  rescue NoMethodError, Addressable::URI::InvalidURIError => e
88
93
  puts "URL Parsing Exception (#{url}): #{e}" if @debug == true
89
94
  return nil
90
95
  end
91
96
  end
92
97
 
93
- def internal_link?(url)
94
- parsed_url = parse_domain(url)
98
+ def internal_link?(url, effective_url)
99
+
100
+ absolute_url = make_absolute(url, effective_url)
101
+
102
+ parsed_url = parse_domain(absolute_url)
95
103
  if(@domain == parsed_url)
96
104
  return true
97
105
  else
@@ -117,6 +125,7 @@ class Arachnid
117
125
  end
118
126
 
119
127
  def no_image_in_url?(url)
128
+ return true if url.to_s.length == 0
120
129
  return true unless @exclude_urls_with_images
121
130
 
122
131
  extensions = ['.jpg', '.gif', '.png', '.jpeg']
metadata CHANGED
@@ -1,99 +1,111 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: arachnid
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
4
5
  prerelease:
5
- version: 0.2.2
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - dchuk
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-11-11 00:00:00 -08:00
14
- default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
12
+ date: 2014-01-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
17
15
  name: typhoeus
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
20
17
  none: false
21
- requirements:
22
- - - "="
23
- - !ruby/object:Gem::Version
18
+ requirements:
19
+ - - '='
20
+ - !ruby/object:Gem::Version
24
21
  version: 0.3.2
25
22
  type: :runtime
26
- version_requirements: *id001
27
- - !ruby/object:Gem::Dependency
28
- name: bloomfilter-rb
29
23
  prerelease: false
30
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - '='
28
+ - !ruby/object:Gem::Version
29
+ version: 0.3.2
30
+ - !ruby/object:Gem::Dependency
31
+ name: bloomfilter-rb
32
+ requirement: !ruby/object:Gem::Requirement
31
33
  none: false
32
- requirements:
33
- - - "="
34
- - !ruby/object:Gem::Version
34
+ requirements:
35
+ - - '='
36
+ - !ruby/object:Gem::Version
35
37
  version: 2.1.1
36
38
  type: :runtime
37
- version_requirements: *id002
38
- - !ruby/object:Gem::Dependency
39
- name: nokogiri
40
39
  prerelease: false
41
- requirement: &id003 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - '='
44
+ - !ruby/object:Gem::Version
45
+ version: 2.1.1
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
42
49
  none: false
43
- requirements:
44
- - - "="
45
- - !ruby/object:Gem::Version
50
+ requirements:
51
+ - - '='
52
+ - !ruby/object:Gem::Version
46
53
  version: 1.5.0
47
54
  type: :runtime
48
- version_requirements: *id003
49
- - !ruby/object:Gem::Dependency
50
- name: domainatrix
51
55
  prerelease: false
52
- requirement: &id004 !ruby/object:Gem::Requirement
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.5.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: domainatrix
64
+ requirement: !ruby/object:Gem::Requirement
53
65
  none: false
54
- requirements:
55
- - - "="
56
- - !ruby/object:Gem::Version
66
+ requirements:
67
+ - - '='
68
+ - !ruby/object:Gem::Version
57
69
  version: 0.0.10
58
70
  type: :runtime
59
- version_requirements: *id004
60
- description: Arachnid is a web crawler that relies on Bloom Filters to efficiently store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling every page on a domain.
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - '='
76
+ - !ruby/object:Gem::Version
77
+ version: 0.0.10
78
+ description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
79
+ store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
80
+ every page on a domain.
61
81
  email: me@dchuk.com
62
82
  executables: []
63
-
64
83
  extensions: []
65
-
66
84
  extra_rdoc_files: []
67
-
68
- files:
85
+ files:
69
86
  - lib/arachnid.rb
70
- has_rdoc: true
71
87
  homepage: https://github.com/dchuk/Arachnid
72
88
  licenses: []
73
-
74
89
  post_install_message:
75
90
  rdoc_options: []
76
-
77
- require_paths:
91
+ require_paths:
78
92
  - lib
79
- required_ruby_version: !ruby/object:Gem::Requirement
93
+ required_ruby_version: !ruby/object:Gem::Requirement
80
94
  none: false
81
- requirements:
82
- - - ">="
83
- - !ruby/object:Gem::Version
84
- version: "0"
85
- required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
100
  none: false
87
- requirements:
88
- - - ">="
89
- - !ruby/object:Gem::Version
90
- version: "0"
101
+ requirements:
102
+ - - ! '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
91
105
  requirements: []
92
-
93
106
  rubyforge_project:
94
- rubygems_version: 1.6.2
107
+ rubygems_version: 1.8.23
95
108
  signing_key:
96
109
  specification_version: 3
97
110
  summary: Extremely fast and efficient domain spider
98
111
  test_files: []
99
-