httpspell 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/exe/httpspell +28 -27
- data/lib/httpspell/spellchecker.rb +13 -2
- data/lib/httpspell/spider.rb +36 -12
- data/lib/httpspell/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 01e176d60bc87e62dae15531feabcfa31ac27a1d4d74b4f1faf3ee161579fc35
+  data.tar.gz: 36b6d13d79bc37531054f59859708db1ee9dd594f668ed8f4cf6bcd18cba66e2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 91d76a7d20f95562b8012ce1a76aee35008def4bac42d537104450db8cfc4974bdd480c24d7ab8388164f89bb3cf486465f39958b67fd5c792196620eb06261b
+  data.tar.gz: d9c6faca03e9d992fbf7d07792b88106807977092ae4591a5f4c316e05d86ad44a5bf6a5b637be37e9d8c5b0e3ea78d9c118810ce2b2e257c996459f92012490
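For reference, the new values can be reproduced from the published gem. A minimal sketch using Ruby's standard `digest` library; the `gem fetch`/`tar` commands and file names in the comment are illustrative, not part of the gem:

```ruby
# Recompute the checksums recorded above from a downloaded gem:
#   gem fetch httpspell --version 1.2.0
#   tar -xf httpspell-1.2.0.gem        # unpacks metadata.gz and data.tar.gz
require 'digest'

%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}:"
  puts "  SHA256: #{Digest::SHA256.file(name).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(name).hexdigest}"
end
```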
data/Gemfile.lock
CHANGED
data/exe/httpspell
CHANGED
@@ -10,7 +10,8 @@ personal_dictionary_path = nil
 force_language = nil
 tracing = nil
 verbose = nil
-
+whitelist = nil
+blacklist = []
 
 begin
   OptionParser.new do |parser|
@@ -28,8 +29,9 @@ begin
       force_language = l
     end
 
-    parser.on('-
-
+    parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
+      whitelist ||= []
+      whitelist << Regexp.new(w)
     end
 
     parser.on('-t', '--trace', 'enable error tracing') do
@@ -40,6 +42,10 @@ begin
      verbose = true
     end
 
+    parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
+      blacklist << Regexp.new(b)
+    end
+
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
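Both new options are repeatable and compile their argument into a `Regexp` immediately; `-w` also initializes the whitelist lazily, so passing it at least once overrides the spider's default whitelist (see spider.rb below). A standalone sketch of the same OptionParser pattern, with illustrative sample arguments:

```ruby
require 'optparse'

whitelist = nil
blacklist = []

parser = OptionParser.new do |p|
  p.on('-w', '--whitelist=EXPRESSION', 'allow only matching URLs') do |w|
    whitelist ||= []              # first -w replaces the spider's default whitelist
    whitelist << Regexp.new(w)
  end

  p.on('-b', '--blacklist=EXPRESSION', 'ignore matching URLs') do |b|
    blacklist << Regexp.new(b)
  end
end

parser.parse!(['-w', '^https://example\.org/', '-b', '\.pdf$', '-b', '/archive/'])

p whitelist   # one compiled Regexp
p blacklist   # two compiled Regexps
```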
@@ -53,33 +59,28 @@ if ARGV.size != 1
   exit 1
 end
 
-spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
+spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path, tracing: tracing)
 has_unknown_words = false
 
-
-
-
-
-
-
-
-
-
-
-
-
-  if
-
-
-
-
-    has_unknown_words = true
-  end
+spider_success = HttpSpell::Spider.new(ARGV.first, whitelist: whitelist, blacklist: blacklist, tracing: tracing).start do |url, doc|
+  lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
+
+  # Remove sections that are not to be spellchecked
+  doc.css('pre').each(&:unlink)
+  doc.css('code').each(&:unlink)
+  doc.css('[spellcheck=false]').each(&:unlink)
+
+  # TODO: Find sections with a lang attribute and handle them separately
+  unknown_words = spell_checker.check(doc.to_s, lang)
+
+  if unknown_words.empty?
+    warn "No unknown words (language is #{lang}) at #{url}." if verbose
+  else
+    warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
+    puts unknown_words
+    has_unknown_words = true
   end
-rescue StandardError
-  warn $ERROR_INFO.message
-  warn $ERROR_INFO.backtrace if tracing
-  exit 2
 end
 
+exit 2 unless spider_success
 exit 1 if has_unknown_words
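Condensed, the executable now delegates crawling to `Spider#start` and only aggregates results, exiting with 2 on crawl failures and 1 when unknown words were found. The sketch below drives the same flow as a library; the require paths, URL, and patterns are assumptions for illustration:

```ruby
require 'httpspell/spellchecker'
require 'httpspell/spider'

spell_checker = HttpSpell::SpellChecker.new(nil, tracing: true)
has_unknown_words = false

# Spider#start yields each fetched page (URL plus parsed Nokogiri document)
# and returns false if any page could not be processed.
spider = HttpSpell::Spider.new('https://example.org/',
                               whitelist: [%r{^https://example\.org/}],
                               blacklist: [/\.pdf$/],
                               tracing: true)

spider_success = spider.start do |url, doc|
  doc.css('pre, code, [spellcheck=false]').each(&:unlink)
  unknown_words = spell_checker.check(doc.to_s, doc.root['lang'] || ENV['LANGUAGE'])
  unless unknown_words.empty?
    warn "#{unknown_words.size} unknown words at #{url}"
    has_unknown_words = true
  end
end

# Same exit-code convention as the executable: 2 = crawl failure, 1 = findings.
exit 2 unless spider_success
exit 1 if has_unknown_words
```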
data/lib/httpspell/spellchecker.rb
CHANGED
@@ -1,11 +1,22 @@
 module HttpSpell
   class SpellChecker
-    def initialize(personal_dictionary_path = nil)
+    def initialize(personal_dictionary_path = nil, tracing: false)
       @personal_dictionary_arg = "-p #{personal_dictionary_path}" if personal_dictionary_path
+      @tracing = tracing
     end
 
     def check(doc, lang)
-
+      commands = [
+        'pandoc --from html --to plain',
+        "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
+      ]
+
+      if @tracing
+        warn "Piping the HTML document into the following chain of commands:"
+        warn commands
+      end
+
+      Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
         stdin.puts(doc)
         stdin.close
         stdout.read.split.uniq
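`SpellChecker#check` now builds the command chain up front so that tracing can print it before the document is piped through pandoc and hunspell. The same `Open3.pipeline_rw` pattern in isolation; the dictionary name is hard-coded to `en_US` here instead of the gem's `translate(lang)` lookup, and pandoc plus hunspell (with that dictionary) must be installed:

```ruby
require 'open3'

# HTML -> plain text (pandoc), then list unknown words (hunspell -l).
commands = [
  'pandoc --from html --to plain',
  'hunspell -d en_US -i UTF-8 -l',
]

html = '<p>Ths sentense contains missspellings.</p>'

unknown_words = Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
  stdin.puts(html)
  stdin.close
  stdout.read.split.uniq   # the block's value is returned by pipeline_rw
end

puts unknown_words
```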
data/lib/httpspell/spider.rb
CHANGED
@@ -8,27 +8,39 @@ module HttpSpell
   class Spider
     attr_reader :todo, :done
 
-    def initialize(starting_point,
+    def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
       @todo = []
       @done = []
       todo << Addressable::URI.parse(starting_point)
-      @
+      @whitelist = whitelist || [/^#{starting_point}/]
+      @blacklist = blacklist
       @tracing = tracing
     end
 
     def start
+      success = true
+
       while todo.any?
         url = todo.pop
-
-
-
-
+
+        begin
+          extracted = links(url) do |u, d|
+            yield u, d if block_given?
+          rescue
+            warn "Callback error for #{url}: #{$ERROR_INFO}"
+            warn $ERROR_INFO.backtrace if @tracing
+          end
+
+          done.append(url)
+          todo.concat(extracted - done - todo)
+        rescue StandardError
+          warn "Skipping #{url} because of #{$ERROR_INFO.message}"
           warn $ERROR_INFO.backtrace if @tracing
+          success = false
        end
-
-        done.append(url)
-        todo.concat(extracted - done - todo)
       end
+
+      return success
     end
 
     private
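The rewritten `start` keeps crawling even when individual pages fail: per-URL errors are reported and only flip the return value to `false`, and callback errors are caught by the block-level `rescue` inside the `links(url)` block (valid since Ruby 2.6). The `todo`/`done` bookkeeping itself is a plain worklist; a minimal sketch of that traversal with the same dedup arithmetic, using a toy in-memory graph instead of HTTP:

```ruby
# Worklist traversal with the same dedup step as Spider#start:
# enqueue a discovered item only if it is neither done nor already queued.
graph = {
  'a' => %w[b c],
  'b' => %w[a c],
  'c' => %w[a],
}

todo = ['a']
done = []

while todo.any?
  url = todo.pop
  extracted = graph.fetch(url, [])
  done.append(url)
  todo.concat(extracted - done - todo)
end

p done   # each node appears exactly once
```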
@@ -37,8 +49,9 @@ module HttpSpell
       # We are using open-uri, which follows redirects and also provides the content-type.
       response = open(uri).read
 
-      if response.respond_to?(:content_type)
-
+      if response.respond_to?(:content_type) && response.content_type != 'text/html'
+        warn "Skipping #{uri} because it is not HTML" if @tracing
+        return []
       end
 
       doc = Nokogiri::HTML(response)
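The guard now returns early for any response whose `content_type` is not `text/html`, so only HTML is handed to Nokogiri. The same check in isolation; the URL is illustrative and `URI.open` is used here, whereas the gem calls `open(uri)`:

```ruby
require 'open-uri'
require 'nokogiri'

# Follow redirects via open-uri and parse only text/html responses.
response = URI.open('https://example.org/')

if response.respond_to?(:content_type) && response.content_type != 'text/html'
  warn "Skipping: not HTML (#{response.content_type})"
else
  doc = Nokogiri::HTML(response.read)
  puts doc.css('a[href]').size
end
```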
@@ -46,7 +59,18 @@ module HttpSpell
       links = doc.css('a[href]').map do |e|
         link = Addressable::URI.parse(e['href'])
         link = uri.join(link) if link.relative?
-
+
+        if @whitelist.none? { |re| re.match?(link.to_s) }
+          warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @tracing
+          next
+        end
+
+        if @blacklist.any? { |re| re.match?(link.to_s) }
+          # TODO Print _which_ entry of the blacklist matches
+          warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @tracing
+          next
+        end
+
         # TODO Ignore same page links (some anchor)
         link
       rescue StandardError
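Each extracted link must match at least one whitelist pattern and no blacklist pattern; when no `-w` is given, the whitelist defaults to `[/^#{starting_point}/]`, confining the crawl to URLs under the starting point. The same filtering logic on plain strings, with illustrative patterns and URLs:

```ruby
whitelist = [%r{^https://example\.org/}]
blacklist = [/\.pdf$/, %r{/archive/}]

links = %w[
  https://example.org/docs/intro.html
  https://example.org/files/manual.pdf
  https://example.org/archive/2015.html
  https://elsewhere.test/page.html
]

kept = links.select do |link|
  next false if whitelist.none? { |re| re.match?(link) }   # not whitelisted
  next false if blacklist.any?  { |re| re.match?(link) }   # explicitly ignored
  true
end

p kept   # => ["https://example.org/docs/intro.html"]
```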
data/lib/httpspell/version.rb
CHANGED