broken_link_finder 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +4 -2
- data/README.md +3 -4
- data/broken_link_finder.gemspec +2 -1
- data/lib/broken_link_finder/finder.rb +28 -15
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f89fd142f329ee9b5df8de5a01abd2976cab0b16a690e50d98a0775a94eb937
|
4
|
+
data.tar.gz: 81e1db2b0a7f2e76e0113b4086fea5a41a6bca6b8f51d79d849007fe73fb3c34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8d9d745fdd025ce30a6c082730b878deb409bd7342d82f63dfbffbbecb5fa75f4ddadea2acace87904255199cb32b53277662d573534079d7d6575d2e80e03b
|
7
|
+
data.tar.gz: 813926476b8fed62c28a46decbff72018134f547f1618dd28418d6ea31f55bbc55bd43d0cd47bfc36fd8a476e0d65f4dd21d31d6bd79aeb3317d5282bbe9c46b
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
5
|
-
|
4
|
+
broken_link_finder (0.3.0)
|
5
|
+
thread (= 0.2)
|
6
|
+
wgit (= 0.0.9)
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -28,6 +29,7 @@ GEM
|
|
28
29
|
public_suffix (3.1.0)
|
29
30
|
rake (10.5.0)
|
30
31
|
safe_yaml (1.0.5)
|
32
|
+
thread (0.2.0)
|
31
33
|
webmock (3.5.1)
|
32
34
|
addressable (>= 2.3.6)
|
33
35
|
crack (>= 0.3.2)
|
data/README.md
CHANGED
@@ -35,14 +35,14 @@ require 'broken_link_finder'
|
|
35
35
|
|
36
36
|
finder = BrokenLinkFinder::Finder.new
|
37
37
|
finder.crawl_site "http://txti.es" # Also, see Finder#crawl_page for a single webpage.
|
38
|
-
finder.pretty_print_broken_links # Also, see Finder#broken_links for a Hash.
|
38
|
+
finder.pretty_print_broken_links # Also, see Finder#broken_links for a Hash of links.
|
39
39
|
```
|
40
40
|
|
41
41
|
Then execute the script with:
|
42
42
|
|
43
43
|
$ ruby main.rb
|
44
44
|
|
45
|
-
|
45
|
+
If broken links are found then the output should look something like:
|
46
46
|
|
47
47
|
```text
|
48
48
|
Below is a breakdown of the different pages and their broken links...
|
@@ -60,7 +60,6 @@ http://imgur.com
|
|
60
60
|
|
61
61
|
## TODO
|
62
62
|
|
63
|
-
- Speed boost.
|
64
63
|
- Create a `broken_link_finder` executable.
|
65
64
|
- Add logger functionality (especially useful in the console during development).
|
66
65
|
|
@@ -68,7 +67,7 @@ http://imgur.com
|
|
68
67
|
|
69
68
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
70
69
|
|
71
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release[origin]`, which will create a git tag for the version, push git commits and tags, and push the
|
70
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release[origin]`, which will create a git tag for the version, push git commits and tags, and push the `*.gem` file to [rubygems.org](https://rubygems.org).
|
72
71
|
|
73
72
|
## Contributing
|
74
73
|
|
data/broken_link_finder.gemspec
CHANGED
@@ -42,5 +42,6 @@ Gem::Specification.new do |spec|
|
|
42
42
|
spec.add_development_dependency "byebug", "~> 11.0"
|
43
43
|
spec.add_development_dependency "webmock", "~> 3.5"
|
44
44
|
|
45
|
-
spec.add_runtime_dependency "wgit", "
|
45
|
+
spec.add_runtime_dependency "wgit", "0.0.9"
|
46
|
+
spec.add_runtime_dependency "thread", "0.2"
|
46
47
|
end
|
@@ -1,13 +1,18 @@
|
|
1
1
|
require 'wgit'
|
2
|
+
require 'thread/pool'
|
2
3
|
|
3
4
|
module BrokenLinkFinder
|
4
5
|
class Finder
|
6
|
+
DEFAULT_MAX_THREADS = 30.freeze
|
7
|
+
|
5
8
|
attr_reader :broken_links
|
6
9
|
|
7
10
|
# Create a new Finder instance.
|
8
|
-
def initialize
|
9
|
-
@
|
11
|
+
def initialize(max_threads: DEFAULT_MAX_THREADS)
|
12
|
+
@max_threads = max_threads
|
13
|
+
@lock = Mutex.new
|
10
14
|
@crawler = Wgit::Crawler.new
|
15
|
+
@broken_links = {}
|
11
16
|
end
|
12
17
|
|
13
18
|
# Clear/empty the @broken_links Hash.
|
@@ -16,30 +21,36 @@ module BrokenLinkFinder
|
|
16
21
|
end
|
17
22
|
|
18
23
|
# Finds broken links within an entire site and appends them to the
|
19
|
-
# @broken_links array.
|
24
|
+
# @broken_links Hash. Returns a tuple containing a Boolean of true if
|
25
|
+
# at least one broken link was found and an Array of all page URLs crawled.
|
26
|
+
# Access the broken links with Finder#broken_links.
|
20
27
|
def crawl_site(url)
|
21
28
|
clear_broken_links
|
22
29
|
url = Wgit::Url.new(url)
|
23
|
-
|
30
|
+
pool = Thread.pool(@max_threads)
|
24
31
|
crawled_pages = []
|
32
|
+
|
25
33
|
@crawler.crawl_site(url) do |doc|
|
26
34
|
# Ensure the given website url is valid.
|
27
35
|
raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
|
28
36
|
|
29
|
-
# Ensure we only process each page once.
|
37
|
+
# Ensure we only process each page once. For example, /about.html might
|
38
|
+
# be linked to several times throughout the entire site.
|
30
39
|
next if crawled_pages.include?(doc.url)
|
31
40
|
crawled_pages << doc.url
|
32
41
|
|
33
42
|
# Get all page links and determine which are broken.
|
34
43
|
next unless doc
|
35
|
-
find_broken_links(doc)
|
44
|
+
pool.process { find_broken_links(doc) }
|
36
45
|
end
|
37
46
|
|
38
|
-
|
47
|
+
pool.shutdown
|
48
|
+
[!@broken_links.empty?, crawled_pages]
|
39
49
|
end
|
40
50
|
|
41
51
|
# Finds broken links within a single page and appends them to the
|
42
|
-
# @broken_links array.
|
52
|
+
# @broken_links Hash. Returns true if at least one broken link was found.
|
53
|
+
# Access the broken links with Finder#broken_links.
|
43
54
|
def crawl_url(url)
|
44
55
|
clear_broken_links
|
45
56
|
url = Wgit::Url.new(url)
|
@@ -55,12 +66,12 @@ module BrokenLinkFinder
|
|
55
66
|
end
|
56
67
|
|
57
68
|
# Pretty prints the contents of broken_links into a stream e.g. Kernel
|
58
|
-
# (STDOUT) or a file.
|
69
|
+
# (STDOUT) or a file - anything that responds to :puts.
|
59
70
|
# Returns true if there were broken links, false otherwise.
|
60
71
|
def pretty_print_broken_links(stream = Kernel)
|
61
72
|
raise "stream must respond_to? :puts" unless stream.respond_to? :puts
|
62
|
-
|
63
|
-
if
|
73
|
+
|
74
|
+
if @broken_links.empty?
|
64
75
|
stream.puts("Good news, there are no broken links!")
|
65
76
|
false
|
66
77
|
else
|
@@ -81,7 +92,7 @@ broken links...")
|
|
81
92
|
|
82
93
|
private
|
83
94
|
|
84
|
-
# Finds which links are broken and
|
95
|
+
# Finds which links are broken and appends the details to @broken_links.
|
85
96
|
def find_broken_links(doc)
|
86
97
|
links = doc.internal_full_links + doc.external_links
|
87
98
|
links.each do |link|
|
@@ -94,10 +105,12 @@ broken links...")
|
|
94
105
|
|
95
106
|
# Append url => [link] to @broken_links.
|
96
107
|
def append_broken_link(url, link)
|
97
|
-
|
98
|
-
@broken_links[url]
|
108
|
+
@lock.synchronize do
|
109
|
+
unless @broken_links[url]
|
110
|
+
@broken_links[url] = []
|
111
|
+
end
|
112
|
+
@broken_links[url] << link
|
99
113
|
end
|
100
|
-
@broken_links[url] << link
|
101
114
|
end
|
102
115
|
|
103
116
|
alias_method :crawl_page, :crawl_url
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-06-
|
11
|
+
date: 2019-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -98,16 +98,30 @@ dependencies:
|
|
98
98
|
name: wgit
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - '='
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version:
|
103
|
+
version: 0.0.9
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- -
|
108
|
+
- - '='
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version:
|
110
|
+
version: 0.0.9
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: thread
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.2'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - '='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.2'
|
111
125
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
112
126
|
to you with a summary.
|
113
127
|
email: michael.telford@live.com
|