find_dead_link 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ require 'semantic_logger' unless defined?(logger)
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'pry'
5
+
6
# Process-wide logging setup, executed once when the library is required:
# log at :info and above, and register 'find_dead_link.log' as an appender
# (a relative name, so presumably created in the current working directory).
SemanticLogger.tap do |log_config|
  log_config.default_level = :info
  log_config.add_appender('find_dead_link.log')
end
8
+
9
# Top-level namespace of the find_dead_link gem.
module FindDeadLink
  # Absolute, normalised path of the directory one level above this file.
  # Expanding the path removes the trailing ".." segment and makes the value
  # independent of the current working directory.
  GEM_ROOT = File.expand_path('..', __dir__).freeze
end
12
+
13
require_relative './find_dead_link/bad_url_exception'
require_relative './find_dead_link/crawler'
require_relative './find_dead_link/html_doc'
require_relative './find_dead_link/url_opener'
require_relative './find_dead_link/version'
@@ -0,0 +1,3 @@
1
module FindDeadLink
  # Error raised when a URL handed to the library is rejected as unusable
  # (Crawler#initialize raises it for a bad starting URL).
  BadUrlException = Class.new(StandardError)
end
@@ -0,0 +1,41 @@
1
require 'set'

module FindDeadLink
  # Crawls every page on the starting URL's host that is reachable from the
  # starting URL and collects the URLs that UrlOpener reports as dead.
  class Crawler
    include SemanticLogger::Loggable

    # @param url [String] absolute URL the crawl starts from; its host decides
    #   which links are considered internal.
    # @raise [BadUrlException] if +url+ cannot be parsed or has no host.
    def initialize(url)
      @host = host_of(url)
      raise BadUrlException, "Invalid URL: #{url.inspect}" if @host.nil?

      @base_url = url
      @deadlinks = []
      @visited_links = Set.new # Set gives constant-time "already visited?" checks
    end

    # Runs the crawl from the starting URL and logs the dead links found.
    #
    # @return [Array<String>] the dead URLs, in the order they were found.
    def crawl
      visit(@base_url)
      logger.info("you have Deadlinks url #{@deadlinks}")
      @deadlinks
    end

    private

    # Depth-first traversal starting at +url+. An explicit stack replaces
    # recursion so a large site cannot exhaust the call stack.
    def visit(url)
      pending = [url]
      until pending.empty?
        current = pending.pop
        next if current.nil? || visited_url?(current) || external_url?(current)

        logger.info("Visiting url #{current}")
        @visited_links << current
        opener = UrlOpener.new(current)
        if opener.dead_link?
          @deadlinks << current
          next
        end

        hrefs = HtmlDoc.new(opener.get_content).get_links
        # Pushed in reverse so links are still popped in document order.
        hrefs.reverse_each { |href| pending.push(get_url(href, current)) }
      end
    end

    def visited_url?(url)
      @visited_links.include?(url)
    end

    # Turns an href into an absolute URL string without its fragment.
    # Relative hrefs are resolved against +base+ (the page they were found on);
    # absolute hrefs pass through URI.join unchanged.
    #
    # @return [String, nil] nil when the href cannot be parsed, so one
    #   malformed link is skipped instead of aborting the crawl.
    def get_url(url, base = @base_url)
      absolute = URI.join(base, url)
      absolute.fragment = nil # "page#a" and "page#b" are the same document
      absolute.to_s
    rescue URI::Error, ArgumentError
      logger.warn("Skipping unparsable link #{url.inspect} found on #{base}")
      nil
    end

    # A URL is external when its host differs from the starting host;
    # unparsable or host-less URLs count as external and are not visited.
    def external_url?(url)
      host_of(url) != @host
    end

    # @return [String, nil] lower-cased host of +url+, or nil when +url+
    #   cannot be parsed or carries no host.
    def host_of(url)
      host = URI(url).host
      host && host.downcase
    rescue URI::Error, ArgumentError
      nil
    end
  end
end
@@ -0,0 +1,21 @@
1
module FindDeadLink
  # Wraps a parsed HTML document and extracts the anchor hrefs worth crawling.
  class HtmlDoc
    # Lower-case href prefixes that are skipped: in-page anchors and the
    # javascript:/mailto: schemes. The trailing colon keeps ordinary paths
    # such as "javascript-guide.html" from being ignored by accident.
    IGNORE_URLS_START_WITH = ['#', 'javascript:', 'mailto:'].freeze

    # @param html_content [#css] parsed document; only +css('a')+ is called
    #   on it, and each returned node is read with +node['href']+.
    def initialize(html_content)
      @html_content = html_content
    end

    # @return [Array<String>] href values of all anchors that are not
    #   ignored, with surrounding whitespace removed.
    def get_links
      hrefs = @html_content.css('a').map { |link| link['href'] }
      hrefs.reject { |href| ignore?(href) }.map(&:strip)
    end

    # @param url [String, nil] raw href attribute value.
    # @return [Boolean] true when the href is missing or starts with one of
    #   IGNORE_URLS_START_WITH, compared case-insensitively after trimming
    #   surrounding whitespace.
    def ignore?(url)
      return true if url.nil?

      url.strip.downcase.start_with?(*IGNORE_URLS_START_WITH)
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'net/http'
2
module FindDeadLink
  # Performs the HTTP work for a single URL: checks whether it is dead and
  # downloads and parses its HTML.
  class UrlOpener
    include SemanticLogger::Loggable

    # @param url [String] URL to check and fetch.
    def initialize(url)
      @url = url
    end

    # Issues a GET request for the URL.
    #
    # @return [Boolean] true when the response status is 404 or the request
    #   raises (unparsable URL, connection failure, ...); any other status
    #   is treated as alive.
    def dead_link?
      Net::HTTP.get_response(URI(@url)).code == '404'
    rescue StandardError => e
      logger.error "DEADLINK :: Error in opening URL:: #{@url}", error: e.inspect
      true
    end

    # Downloads the URL and parses the body as HTML.
    #
    # @return [Nokogiri::HTML::Document] the parsed page, or an empty
    #   document when fetching or parsing fails.
    def get_content
      # URI#open (provided by open-uri) is used instead of Kernel#open so a
      # scraped string can never be interpreted as a local path or a "|cmd"
      # pipe; the block form closes the downloaded stream after parsing.
      URI.parse(@url).open { |io| Nokogiri::HTML(io) }
    rescue StandardError => e
      logger.error "Error in parsing content for URL:: #{@url}", error: e.inspect
      Nokogiri::HTML('')
    end
  end
end
@@ -0,0 +1,3 @@
1
module FindDeadLink
  # Release version of the gem; frozen so the shared constant cannot be
  # mutated in place.
  VERSION = '0.0.1'.freeze
end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: find_dead_link
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ankur Maheshwari
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.6.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.6.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: semantic_logger
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 2.7.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 2.7.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: The library takes a URL and searches for dead links within the given website
56
+ email:
57
+ - amaheshwari@systango.com
58
+ executables:
59
+ - find_dead_link.rb
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - Gemfile
64
+ - Gemfile.lock
65
+ - README.md
66
+ - bin/find_dead_link.rb
67
+ - find_dead_link.gemspec
68
+ - find_dead_link.log
69
+ - lib/find_dead_link.rb
70
+ - lib/find_dead_link/bad_url_exception.rb
71
+ - lib/find_dead_link/crawler.rb
72
+ - lib/find_dead_link/html_doc.rb
73
+ - lib/find_dead_link/url_opener.rb
74
+ - lib/find_dead_link/version.rb
75
+ homepage: http://systango.com/
76
+ licenses: []
77
+ metadata: {}
78
+ post_install_message:
79
+ rdoc_options: []
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ! '>='
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ! '>='
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ requirements: []
93
+ rubyforge_project:
94
+ rubygems_version: 2.2.2
95
+ signing_key:
96
+ specification_version: 4
97
+ summary: Looks for dead links in the provided URL
98
+ test_files: []