broken_link_finder 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +4 -2
- data/README.md +3 -4
- data/broken_link_finder.gemspec +2 -1
- data/lib/broken_link_finder/finder.rb +28 -15
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f89fd142f329ee9b5df8de5a01abd2976cab0b16a690e50d98a0775a94eb937
|
4
|
+
data.tar.gz: 81e1db2b0a7f2e76e0113b4086fea5a41a6bca6b8f51d79d849007fe73fb3c34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8d9d745fdd025ce30a6c082730b878deb409bd7342d82f63dfbffbbecb5fa75f4ddadea2acace87904255199cb32b53277662d573534079d7d6575d2e80e03b
|
7
|
+
data.tar.gz: 813926476b8fed62c28a46decbff72018134f547f1618dd28418d6ea31f55bbc55bd43d0cd47bfc36fd8a476e0d65f4dd21d31d6bd79aeb3317d5282bbe9c46b
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.2.1)
|
5
|
-
|
4
|
+
broken_link_finder (0.3.0)
|
5
|
+
thread (= 0.2)
|
6
|
+
wgit (= 0.0.9)
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -28,6 +29,7 @@ GEM
|
|
28
29
|
public_suffix (3.1.0)
|
29
30
|
rake (10.5.0)
|
30
31
|
safe_yaml (1.0.5)
|
32
|
+
thread (0.2.0)
|
31
33
|
webmock (3.5.1)
|
32
34
|
addressable (>= 2.3.6)
|
33
35
|
crack (>= 0.3.2)
|
data/README.md
CHANGED
@@ -35,14 +35,14 @@ require 'broken_link_finder'
|
|
35
35
|
|
36
36
|
finder = BrokenLinkFinder::Finder.new
|
37
37
|
finder.crawl_site "http://txti.es" # Also, see Finder#crawl_page for a single webpage.
|
38
|
-
finder.pretty_print_broken_links # Also, see Finder#broken_links for a Hash.
|
38
|
+
finder.pretty_print_broken_links # Also, see Finder#broken_links for a Hash of links.
|
39
39
|
```
|
40
40
|
|
41
41
|
Then execute the script with:
|
42
42
|
|
43
43
|
$ ruby main.rb
|
44
44
|
|
45
|
-
|
45
|
+
If broken links are found then the output should look something like:
|
46
46
|
|
47
47
|
```text
|
48
48
|
Below is a breakdown of the different pages and their broken links...
|
@@ -60,7 +60,6 @@ http://imgur.com
|
|
60
60
|
|
61
61
|
## TODO
|
62
62
|
|
63
|
-
- Speed boost.
|
64
63
|
- Create a `broken_link_finder` executable.
|
65
64
|
- Add logger functionality (especially useful in the console during development).
|
66
65
|
|
@@ -68,7 +67,7 @@ http://imgur.com
|
|
68
67
|
|
69
68
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
70
69
|
|
71
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release[origin]`, which will create a git tag for the version, push git commits and tags, and push the
|
70
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release[origin]`, which will create a git tag for the version, push git commits and tags, and push the `*.gem` file to [rubygems.org](https://rubygems.org).
|
72
71
|
|
73
72
|
## Contributing
|
74
73
|
|
data/broken_link_finder.gemspec
CHANGED
@@ -42,5 +42,6 @@ Gem::Specification.new do |spec|
|
|
42
42
|
spec.add_development_dependency "byebug", "~> 11.0"
|
43
43
|
spec.add_development_dependency "webmock", "~> 3.5"
|
44
44
|
|
45
|
-
spec.add_runtime_dependency "wgit", "
|
45
|
+
spec.add_runtime_dependency "wgit", "0.0.9"
|
46
|
+
spec.add_runtime_dependency "thread", "0.2"
|
46
47
|
end
|
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -1,13 +1,18 @@
|
|
1
1
|
require 'wgit'
|
2
|
+
require 'thread/pool'
|
2
3
|
|
3
4
|
module BrokenLinkFinder
|
4
5
|
class Finder
|
6
|
+
DEFAULT_MAX_THREADS = 30.freeze
|
7
|
+
|
5
8
|
attr_reader :broken_links
|
6
9
|
|
7
10
|
# Create a new Finder instance.
|
8
|
-
def initialize
|
9
|
-
@
|
11
|
+
def initialize(max_threads: DEFAULT_MAX_THREADS)
|
12
|
+
@max_threads = max_threads
|
13
|
+
@lock = Mutex.new
|
10
14
|
@crawler = Wgit::Crawler.new
|
15
|
+
@broken_links = {}
|
11
16
|
end
|
12
17
|
|
13
18
|
# Clear/empty the @broken_links Hash.
|
@@ -16,30 +21,36 @@ module BrokenLinkFinder
|
|
16
21
|
end
|
17
22
|
|
18
23
|
# Finds broken links within an entire site and appends them to the
|
19
|
-
# @broken_links array.
|
24
|
+
# @broken_links array. Returns a tuple containing a Boolean of true if
|
25
|
+
# at least one broken link was found and an Array of all pages crawled.
|
26
|
+
# Access the broken links with Finder#broken_links.
|
20
27
|
def crawl_site(url)
|
21
28
|
clear_broken_links
|
22
29
|
url = Wgit::Url.new(url)
|
23
|
-
|
30
|
+
pool = Thread.pool(@max_threads)
|
24
31
|
crawled_pages = []
|
32
|
+
|
25
33
|
@crawler.crawl_site(url) do |doc|
|
26
34
|
# Ensure the given website url is valid.
|
27
35
|
raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
|
28
36
|
|
29
|
-
# Ensure we only process each page once.
|
37
|
+
# Ensure we only process each page once. For example, /about.html might
|
38
|
+
# be linked to several times throughout the entire site.
|
30
39
|
next if crawled_pages.include?(doc.url)
|
31
40
|
crawled_pages << doc.url
|
32
41
|
|
33
42
|
# Get all page links and determine which are broken.
|
34
43
|
next unless doc
|
35
|
-
find_broken_links(doc)
|
44
|
+
pool.process { find_broken_links(doc) }
|
36
45
|
end
|
37
46
|
|
38
|
-
|
47
|
+
pool.shutdown
|
48
|
+
[!@broken_links.empty?, crawled_pages]
|
39
49
|
end
|
40
50
|
|
41
51
|
# Finds broken links within a single page and appends them to the
|
42
|
-
# @broken_links array.
|
52
|
+
# @broken_links array. Returns true if at least one broken link was found.
|
53
|
+
# Access the broken links with Finder#broken_links.
|
43
54
|
def crawl_url(url)
|
44
55
|
clear_broken_links
|
45
56
|
url = Wgit::Url.new(url)
|
@@ -55,12 +66,12 @@ module BrokenLinkFinder
|
|
55
66
|
end
|
56
67
|
|
57
68
|
# Pretty prints the contents of broken_links into a stream e.g. Kernel
|
58
|
-
# (STDOUT) or a file.
|
69
|
+
# (STDOUT) or a file - anything that respond_to? :puts.
|
59
70
|
# Returns true if there were broken links and vice versa.
|
60
71
|
def pretty_print_broken_links(stream = Kernel)
|
61
72
|
raise "stream must respond_to? :puts" unless stream.respond_to? :puts
|
62
|
-
|
63
|
-
if
|
73
|
+
|
74
|
+
if @broken_links.empty?
|
64
75
|
stream.puts("Good news, there are no broken links!")
|
65
76
|
false
|
66
77
|
else
|
@@ -81,7 +92,7 @@ broken links...")
|
|
81
92
|
|
82
93
|
private
|
83
94
|
|
84
|
-
# Finds which links are broken and
|
95
|
+
# Finds which links are broken and appends the details to @broken_links.
|
85
96
|
def find_broken_links(doc)
|
86
97
|
links = doc.internal_full_links + doc.external_links
|
87
98
|
links.each do |link|
|
@@ -94,10 +105,12 @@ broken links...")
|
|
94
105
|
|
95
106
|
# Append url => [link] to @broken_links.
|
96
107
|
def append_broken_link(url, link)
|
97
|
-
|
98
|
-
@broken_links[url]
|
108
|
+
@lock.synchronize do
|
109
|
+
unless @broken_links[url]
|
110
|
+
@broken_links[url] = []
|
111
|
+
end
|
112
|
+
@broken_links[url] << link
|
99
113
|
end
|
100
|
-
@broken_links[url] << link
|
101
114
|
end
|
102
115
|
|
103
116
|
alias_method :crawl_page, :crawl_url
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-06-
|
11
|
+
date: 2019-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -98,16 +98,30 @@ dependencies:
|
|
98
98
|
name: wgit
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - '='
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version:
|
103
|
+
version: 0.0.9
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- -
|
108
|
+
- - '='
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version:
|
110
|
+
version: 0.0.9
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: thread
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.2'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - '='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.2'
|
111
125
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
112
126
|
to you with a summary.
|
113
127
|
email: michael.telford@live.com
|