httpspell 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
4
- data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
3
+ metadata.gz: 01e176d60bc87e62dae15531feabcfa31ac27a1d4d74b4f1faf3ee161579fc35
4
+ data.tar.gz: 36b6d13d79bc37531054f59859708db1ee9dd594f668ed8f4cf6bcd18cba66e2
5
5
  SHA512:
6
- metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
7
- data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45
6
+ metadata.gz: 91d76a7d20f95562b8012ce1a76aee35008def4bac42d537104450db8cfc4974bdd480c24d7ab8388164f89bb3cf486465f39958b67fd5c792196620eb06261b
7
+ data.tar.gz: d9c6faca03e9d992fbf7d07792b88106807977092ae4591a5f4c316e05d86ad44a5bf6a5b637be37e9d8c5b0e3ea78d9c118810ce2b2e257c996459f92012490
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.1.0)
4
+ httpspell (1.2.0)
5
5
  addressable
6
6
  nokogiri
7
7
 
@@ -10,7 +10,8 @@ personal_dictionary_path = nil
10
10
  force_language = nil
11
11
  tracing = nil
12
12
  verbose = nil
13
- limit = nil
13
+ whitelist = nil
14
+ blacklist = []
14
15
 
15
16
  begin
16
17
  OptionParser.new do |parser|
@@ -28,8 +29,9 @@ begin
28
29
  force_language = l
29
30
  end
30
31
 
31
- parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
32
- limit = Regexp.new(l)
32
+ parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
+ whitelist ||= []
34
+ whitelist << Regexp.new(w)
33
35
  end
34
36
 
35
37
  parser.on('-t', '--trace', 'enable error tracing') do
@@ -40,6 +42,10 @@ begin
40
42
  verbose = true
41
43
  end
42
44
 
45
+ parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
46
+ blacklist << Regexp.new(b)
47
+ end
48
+
43
49
  # TODO: --recursive, defaults to false
44
50
  # TODO wget has some additional options for recursive behavior that should be reviewed
45
51
  end.parse!
@@ -53,33 +59,28 @@ if ARGV.size != 1
53
59
  exit 1
54
60
  end
55
61
 
56
- spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
62
+ spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path, tracing: tracing)
57
63
  has_unknown_words = false
58
64
 
59
- begin
60
- HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
61
- lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
62
-
63
- # Remove sections that are not to be spellchecked
64
- doc.css('pre').each(&:unlink)
65
- doc.css('code').each(&:unlink)
66
- doc.css('[spellcheck=false]').each(&:unlink)
67
-
68
- # TODO: Find sections with a lang attribute and handle them separately
69
- unknown_words = spell_checker.check(doc.to_s, lang)
70
-
71
- if unknown_words.empty?
72
- warn "No unknown words (language is #{lang}) at #{url}." if verbose
73
- else
74
- warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
75
- puts unknown_words
76
- has_unknown_words = true
77
- end
65
+ spider_success = HttpSpell::Spider.new(ARGV.first, whitelist: whitelist, blacklist: blacklist, tracing: tracing).start do |url, doc|
66
+ lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
67
+
68
+ # Remove sections that are not to be spellchecked
69
+ doc.css('pre').each(&:unlink)
70
+ doc.css('code').each(&:unlink)
71
+ doc.css('[spellcheck=false]').each(&:unlink)
72
+
73
+ # TODO: Find sections with a lang attribute and handle them separately
74
+ unknown_words = spell_checker.check(doc.to_s, lang)
75
+
76
+ if unknown_words.empty?
77
+ warn "No unknown words (language is #{lang}) at #{url}." if verbose
78
+ else
79
+ warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
80
+ puts unknown_words
81
+ has_unknown_words = true
78
82
  end
79
- rescue StandardError
80
- warn $ERROR_INFO.message
81
- warn $ERROR_INFO.backtrace if tracing
82
- exit 2
83
83
  end
84
84
 
85
+ exit 2 unless spider_success
85
86
  exit 1 if has_unknown_words
@@ -1,11 +1,22 @@
1
1
  module HttpSpell
2
2
  class SpellChecker
3
- def initialize(personal_dictionary_path = nil)
3
+ def initialize(personal_dictionary_path = nil, tracing: false)
4
4
  @personal_dictionary_arg = "-p #{personal_dictionary_path}" if personal_dictionary_path
5
+ @tracing = tracing
5
6
  end
6
7
 
7
8
  def check(doc, lang)
8
- Open3.pipeline_rw('pandoc --from html --to plain', "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l") do |stdin, stdout, _wait_thrs|
9
+ commands = [
10
+ 'pandoc --from html --to plain',
11
+ "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
12
+ ]
13
+
14
+ if @tracing
15
+ warn "Piping the HTML document into the following chain of commands:"
16
+ warn commands
17
+ end
18
+
19
+ Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
9
20
  stdin.puts(doc)
10
21
  stdin.close
11
22
  stdout.read.split.uniq
@@ -8,27 +8,39 @@ module HttpSpell
8
8
  class Spider
9
9
  attr_reader :todo, :done
10
10
 
11
- def initialize(starting_point, limit: nil, tracing: false)
11
+ def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
12
12
  @todo = []
13
13
  @done = []
14
14
  todo << Addressable::URI.parse(starting_point)
15
- @limit = limit || /^#{starting_point}/
15
+ @whitelist = whitelist || [/^#{starting_point}/]
16
+ @blacklist = blacklist
16
17
  @tracing = tracing
17
18
  end
18
19
 
19
20
  def start
21
+ success = true
22
+
20
23
  while todo.any?
21
24
  url = todo.pop
22
- extracted = links(url) do |u, d|
23
- yield u, d if block_given?
24
- rescue
25
- warn "Callback error for #{url}: #{$ERROR_INFO}"
25
+
26
+ begin
27
+ extracted = links(url) do |u, d|
28
+ yield u, d if block_given?
29
+ rescue
30
+ warn "Callback error for #{url}: #{$ERROR_INFO}"
31
+ warn $ERROR_INFO.backtrace if @tracing
32
+ end
33
+
34
+ done.append(url)
35
+ todo.concat(extracted - done - todo)
36
+ rescue StandardError
37
+ warn "Skipping #{url} because of #{$ERROR_INFO.message}"
26
38
  warn $ERROR_INFO.backtrace if @tracing
39
+ success = false
27
40
  end
28
-
29
- done.append(url)
30
- todo.concat(extracted - done - todo)
31
41
  end
42
+
43
+ return success
32
44
  end
33
45
 
34
46
  private
@@ -37,8 +49,9 @@ module HttpSpell
37
49
  # We are using open-uri, which follows redirects and also provides the content-type.
38
50
  response = open(uri).read
39
51
 
40
- if response.respond_to?(:content_type)
41
- return [] unless response.content_type == 'text/html'
52
+ if response.respond_to?(:content_type) && response.content_type != 'text/html'
53
+ warn "Skipping #{uri} because it is not HTML" if @tracing
54
+ return []
42
55
  end
43
56
 
44
57
  doc = Nokogiri::HTML(response)
@@ -46,7 +59,18 @@ module HttpSpell
46
59
  links = doc.css('a[href]').map do |e|
47
60
  link = Addressable::URI.parse(e['href'])
48
61
  link = uri.join(link) if link.relative?
49
- next unless @limit.match?(link.to_s)
62
+
63
+ if @whitelist.none? { |re| re.match?(link.to_s) }
64
+ warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @tracing
65
+ next
66
+ end
67
+
68
+ if @blacklist.any? { |re| re.match?(link.to_s) }
69
+ # TODO Print _which_ entry of the blacklist matches
70
+ warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @tracing
71
+ next
72
+ end
73
+
50
74
  # TODO Ignore same page links (some anchor)
51
75
  link
52
76
  rescue StandardError
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.1.0'
4
+ VERSION = '1.2.0'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig