arachnid 0.2.2 → 0.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/arachnid.rb +13 -4
  2. metadata +70 -58
data/lib/arachnid.rb CHANGED
@@ -47,7 +47,7 @@ class Arachnid
47
47
  links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
48
48
 
49
49
  links.each do |link|
50
- if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
50
+ if(internal_link?(link, response.effective_url) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
51
51
 
52
52
  sanitized_link = sanitize_link(split_url_at_hash(link))
53
53
  if(sanitized_link)
@@ -83,15 +83,23 @@ class Arachnid
83
83
 
84
84
  begin
85
85
  parsed_domain = Domainatrix.parse(url)
86
- parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
86
+
87
+ if(parsed_domain.subdomain != "")
88
+ parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
89
+ else
90
+ parsed_domain.domain + '.' + parsed_domain.public_suffix
91
+ end
87
92
  rescue NoMethodError, Addressable::URI::InvalidURIError => e
88
93
  puts "URL Parsing Exception (#{url}): #{e}" if @debug == true
89
94
  return nil
90
95
  end
91
96
  end
92
97
 
93
- def internal_link?(url)
94
- parsed_url = parse_domain(url)
98
+ def internal_link?(url, effective_url)
99
+
100
+ absolute_url = make_absolute(url, effective_url)
101
+
102
+ parsed_url = parse_domain(absolute_url)
95
103
  if(@domain == parsed_url)
96
104
  return true
97
105
  else
@@ -117,6 +125,7 @@ class Arachnid
117
125
  end
118
126
 
119
127
  def no_image_in_url?(url)
128
+ return true if url.to_s.length == 0
120
129
  return true unless @exclude_urls_with_images
121
130
 
122
131
  extensions = ['.jpg', '.gif', '.png', '.jpeg']
metadata CHANGED
@@ -1,99 +1,111 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: arachnid
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
4
5
  prerelease:
5
- version: 0.2.2
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - dchuk
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-11-11 00:00:00 -08:00
14
- default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
12
+ date: 2014-01-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
17
15
  name: typhoeus
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
20
17
  none: false
21
- requirements:
22
- - - "="
23
- - !ruby/object:Gem::Version
18
+ requirements:
19
+ - - '='
20
+ - !ruby/object:Gem::Version
24
21
  version: 0.3.2
25
22
  type: :runtime
26
- version_requirements: *id001
27
- - !ruby/object:Gem::Dependency
28
- name: bloomfilter-rb
29
23
  prerelease: false
30
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - '='
28
+ - !ruby/object:Gem::Version
29
+ version: 0.3.2
30
+ - !ruby/object:Gem::Dependency
31
+ name: bloomfilter-rb
32
+ requirement: !ruby/object:Gem::Requirement
31
33
  none: false
32
- requirements:
33
- - - "="
34
- - !ruby/object:Gem::Version
34
+ requirements:
35
+ - - '='
36
+ - !ruby/object:Gem::Version
35
37
  version: 2.1.1
36
38
  type: :runtime
37
- version_requirements: *id002
38
- - !ruby/object:Gem::Dependency
39
- name: nokogiri
40
39
  prerelease: false
41
- requirement: &id003 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - '='
44
+ - !ruby/object:Gem::Version
45
+ version: 2.1.1
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
42
49
  none: false
43
- requirements:
44
- - - "="
45
- - !ruby/object:Gem::Version
50
+ requirements:
51
+ - - '='
52
+ - !ruby/object:Gem::Version
46
53
  version: 1.5.0
47
54
  type: :runtime
48
- version_requirements: *id003
49
- - !ruby/object:Gem::Dependency
50
- name: domainatrix
51
55
  prerelease: false
52
- requirement: &id004 !ruby/object:Gem::Requirement
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.5.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: domainatrix
64
+ requirement: !ruby/object:Gem::Requirement
53
65
  none: false
54
- requirements:
55
- - - "="
56
- - !ruby/object:Gem::Version
66
+ requirements:
67
+ - - '='
68
+ - !ruby/object:Gem::Version
57
69
  version: 0.0.10
58
70
  type: :runtime
59
- version_requirements: *id004
60
- description: Arachnid is a web crawler that relies on Bloom Filters to efficiently store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling every page on a domain.
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - '='
76
+ - !ruby/object:Gem::Version
77
+ version: 0.0.10
78
+ description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
79
+ store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
80
+ every page on a domain.
61
81
  email: me@dchuk.com
62
82
  executables: []
63
-
64
83
  extensions: []
65
-
66
84
  extra_rdoc_files: []
67
-
68
- files:
85
+ files:
69
86
  - lib/arachnid.rb
70
- has_rdoc: true
71
87
  homepage: https://github.com/dchuk/Arachnid
72
88
  licenses: []
73
-
74
89
  post_install_message:
75
90
  rdoc_options: []
76
-
77
- require_paths:
91
+ require_paths:
78
92
  - lib
79
- required_ruby_version: !ruby/object:Gem::Requirement
93
+ required_ruby_version: !ruby/object:Gem::Requirement
80
94
  none: false
81
- requirements:
82
- - - ">="
83
- - !ruby/object:Gem::Version
84
- version: "0"
85
- required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
100
  none: false
87
- requirements:
88
- - - ">="
89
- - !ruby/object:Gem::Version
90
- version: "0"
101
+ requirements:
102
+ - - ! '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
91
105
  requirements: []
92
-
93
106
  rubyforge_project:
94
- rubygems_version: 1.6.2
107
+ rubygems_version: 1.8.23
95
108
  signing_key:
96
109
  specification_version: 3
97
110
  summary: Extremely fast and efficient domain spider
98
111
  test_files: []
99
-