find_dead_link 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +41 -0
- data/README.md +9 -0
- data/bin/find_dead_link.rb +15 -0
- data/find_dead_link.gemspec +26 -0
- data/find_dead_link.log +815 -0
- data/lib/find_dead_link.rb +16 -0
- data/lib/find_dead_link/bad_url_exception.rb +3 -0
- data/lib/find_dead_link/crawler.rb +41 -0
- data/lib/find_dead_link/html_doc.rb +21 -0
- data/lib/find_dead_link/url_opener.rb +28 -0
- data/lib/find_dead_link/version.rb +3 -0
- metadata +98 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Entry point of the find_dead_link library: loads the third-party
# dependencies, configures logging, then loads the library's own classes.

# Required unconditionally: the statements below use SemanticLogger no matter
# what else the host program has defined, and `require` is idempotent.
require 'semantic_logger'
require 'nokogiri'
require 'open-uri'

# pry is declared only as a development dependency of the gem, so it may be
# absent in a normal installation; the library must still load without it.
begin
  require 'pry'
rescue LoadError
  nil
end

SemanticLogger.default_level = :info
SemanticLogger.add_appender('find_dead_link.log')

module FindDeadLink
  # Path of the gem's root directory, relative to this file's directory.
  GEM_ROOT = File.join(File.dirname(__FILE__), '..')
end

# Crawler raises BadUrlException, so its definition has to be loaded as well.
require_relative './find_dead_link/bad_url_exception'
require_relative './find_dead_link/crawler'
require_relative './find_dead_link/html_doc'
require_relative './find_dead_link/url_opener'
require_relative './find_dead_link/version'
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
require 'set'

module FindDeadLink
  # Walks every page reachable from a starting URL that lives on the same
  # host, and collects the URLs that UrlOpener reports as dead.
  class Crawler
    include SemanticLogger::Loggable

    # Matches hrefs that are already absolute http(s) URLs.
    ABSOLUTE_HTTP_URL = %r{\Ahttps?://}i

    # @param url [String] page the crawl starts from; its host bounds the crawl
    # @raise [BadUrlException] when +url+ cannot be parsed as a URI
    def initialize(url)
      base_uri = begin
        URI(url)
      rescue StandardError
        nil
      end
      raise BadUrlException unless base_uri

      @base_url = url
      @host = base_uri.host
      @deadlinks = []
      @visited_links = Set.new
    end

    # Crawls the site starting at the base URL and logs the dead links found.
    def crawl
      visit(@base_url)
      logger.info("you have Deadlinks url #{@deadlinks}")
    end

    private

    # Depth-first traversal starting at +url+. An explicit stack is used
    # instead of recursion so long link chains cannot exhaust the call stack.
    def visit(url)
      pending = [url]
      until pending.empty?
        current = pending.pop
        logger.info("Visiting url #{current}")
        next if visited_url?(current) || external_url?(current)

        @visited_links << current
        opener = UrlOpener.new(current)
        if opener.dead_link?
          @deadlinks << current
          next
        end

        hrefs = HtmlDoc.new(opener.get_content).get_links
        # Relative hrefs are resolved against the page they were found on.
        targets = hrefs.map { |href| get_url(href, current) }.compact
        # Reversed so the first link on the page is popped (visited) first.
        pending.concat(targets.reverse)
      end
    end

    # @return [Boolean] whether +url+ has already been processed
    def visited_url?(url)
      @visited_links.include?(url)
    end

    # Turns an href into an absolute URL string.
    #
    # @param url [String] href as found in the document
    # @param base [String] URL the href is relative to
    # @return [String, nil] absolute URL, or nil when the href cannot be parsed
    def get_url(url, base = @base_url)
      href = url.strip
      return href if href =~ ABSOLUTE_HTTP_URL

      URI.join(base, href).to_s
    rescue URI::Error => e
      # One malformed link must not abort the whole crawl.
      logger.warn "Skipping unparsable link #{url.inspect} found on #{base}", error: e.inspect
      nil
    end

    # @return [Boolean] true when +url+ is on another host or cannot be parsed
    def external_url?(url)
      @host != URI(url).host
    rescue StandardError
      true
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module FindDeadLink
  # Wraps a parsed HTML document and exposes the hrefs worth following.
  class HtmlDoc
    # An href beginning with any of these prefixes is never followed.
    IGNORE_URLS_START_WITH = %w[# javascript mailto]

    # @param html_content [#css] parsed document that answers CSS queries
    def initialize(html_content)
      @html_content = html_content
    end

    # @return [Array<String>] href of every anchor, minus the ignored ones
    def get_links
      hrefs = @html_content.css('a').map { |anchor| anchor["href"] }
      hrefs.reject { |href| ignore?(href) }
    end

    # @param url [String, nil] href value of an anchor
    # @return [Boolean] true for a missing href or one with an ignored prefix
    def ignore?(url)
      if url.nil?
        true
      else
        IGNORE_URLS_START_WITH.any? { |prefix| url.start_with?(prefix) }
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'net/http'
require 'open-uri'

module FindDeadLink
  # Fetches a single URL, either to check whether it is dead or to parse the
  # HTML it serves.
  class UrlOpener
    include SemanticLogger::Loggable

    # @param url [String] URL to probe or fetch
    def initialize(url)
      @url = url
    end

    # @return [Boolean] true when the server answers 404 or the request
    #   cannot be made at all (the failure is logged)
    def dead_link?
      Net::HTTP.get_response(URI(@url)).code == "404"
    rescue StandardError => e
      logger.error "DEADLINK :: Error in opening URL:: #{@url}", error: e.inspect
      true
    end

    # Downloads and parses the page.
    #
    # The URL is opened through its parsed URI object rather than Kernel#open:
    # Kernel#open would treat a string starting with "|" as a command to run
    # and any other non-URL string as a local file path. The block form closes
    # the downloaded stream once parsing is done.
    #
    # @return [Nokogiri::HTML::Document] the parsed page, or an empty document
    #   when the page cannot be fetched or parsed (the failure is logged)
    def get_content
      URI(@url).open { |io| Nokogiri::HTML(io) }
    rescue StandardError => e
      logger.error "Error in parsing content for URL:: #{@url}", error: e.inspect
      Nokogiri::HTML("")
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: find_dead_link
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Ankur Maheshwari
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2014-04-01 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: nokogiri
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ~>
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: 1.6.1
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ~>
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: 1.6.1
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: semantic_logger
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ~>
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 2.7.0
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ~>
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: 2.7.0
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: pry
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ! '>='
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ! '>='
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
description: The library takes a URL and searches for dead links within the given website
|
|
56
|
+
email:
|
|
57
|
+
- amaheshwari@systango.com
|
|
58
|
+
executables:
|
|
59
|
+
- find_dead_link.rb
|
|
60
|
+
extensions: []
|
|
61
|
+
extra_rdoc_files: []
|
|
62
|
+
files:
|
|
63
|
+
- Gemfile
|
|
64
|
+
- Gemfile.lock
|
|
65
|
+
- README.md
|
|
66
|
+
- bin/find_dead_link.rb
|
|
67
|
+
- find_dead_link.gemspec
|
|
68
|
+
- find_dead_link.log
|
|
69
|
+
- lib/find_dead_link.rb
|
|
70
|
+
- lib/find_dead_link/bad_url_exception.rb
|
|
71
|
+
- lib/find_dead_link/crawler.rb
|
|
72
|
+
- lib/find_dead_link/html_doc.rb
|
|
73
|
+
- lib/find_dead_link/url_opener.rb
|
|
74
|
+
- lib/find_dead_link/version.rb
|
|
75
|
+
homepage: http://systango.com/
|
|
76
|
+
licenses: []
|
|
77
|
+
metadata: {}
|
|
78
|
+
post_install_message:
|
|
79
|
+
rdoc_options: []
|
|
80
|
+
require_paths:
|
|
81
|
+
- lib
|
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
83
|
+
requirements:
|
|
84
|
+
- - ! '>='
|
|
85
|
+
- !ruby/object:Gem::Version
|
|
86
|
+
version: '0'
|
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
|
+
requirements:
|
|
89
|
+
- - ! '>='
|
|
90
|
+
- !ruby/object:Gem::Version
|
|
91
|
+
version: '0'
|
|
92
|
+
requirements: []
|
|
93
|
+
rubyforge_project:
|
|
94
|
+
rubygems_version: 2.2.2
|
|
95
|
+
signing_key:
|
|
96
|
+
specification_version: 4
|
|
97
|
+
summary: Looks for dead links that exist at the provided URL
|
|
98
|
+
test_files: []
|