httpspell 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
4
- data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
3
+ metadata.gz: 01e176d60bc87e62dae15531feabcfa31ac27a1d4d74b4f1faf3ee161579fc35
4
+ data.tar.gz: 36b6d13d79bc37531054f59859708db1ee9dd594f668ed8f4cf6bcd18cba66e2
5
5
  SHA512:
6
- metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
7
- data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45
6
+ metadata.gz: 91d76a7d20f95562b8012ce1a76aee35008def4bac42d537104450db8cfc4974bdd480c24d7ab8388164f89bb3cf486465f39958b67fd5c792196620eb06261b
7
+ data.tar.gz: d9c6faca03e9d992fbf7d07792b88106807977092ae4591a5f4c316e05d86ad44a5bf6a5b637be37e9d8c5b0e3ea78d9c118810ce2b2e257c996459f92012490
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.1.0)
4
+ httpspell (1.2.0)
5
5
  addressable
6
6
  nokogiri
7
7
 
@@ -10,7 +10,8 @@ personal_dictionary_path = nil
10
10
  force_language = nil
11
11
  tracing = nil
12
12
  verbose = nil
13
- limit = nil
13
+ whitelist = nil
14
+ blacklist = []
14
15
 
15
16
  begin
16
17
  OptionParser.new do |parser|
@@ -28,8 +29,9 @@ begin
28
29
  force_language = l
29
30
  end
30
31
 
31
- parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
32
- limit = Regexp.new(l)
32
+ parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
+ whitelist ||= []
34
+ whitelist << Regexp.new(w)
33
35
  end
34
36
 
35
37
  parser.on('-t', '--trace', 'enable error tracing') do
@@ -40,6 +42,10 @@ begin
40
42
  verbose = true
41
43
  end
42
44
 
45
+ parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
46
+ blacklist << Regexp.new(b)
47
+ end
48
+
43
49
  # TODO: --recursive, defaults to false
44
50
  # TODO wget has some additional options for recursive behavior that should be reviewed
45
51
  end.parse!
@@ -53,33 +59,28 @@ if ARGV.size != 1
53
59
  exit 1
54
60
  end
55
61
 
56
- spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
62
+ spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path, tracing: tracing)
57
63
  has_unknown_words = false
58
64
 
59
- begin
60
- HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
61
- lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
62
-
63
- # Remove sections that are not to be spellchecked
64
- doc.css('pre').each(&:unlink)
65
- doc.css('code').each(&:unlink)
66
- doc.css('[spellcheck=false]').each(&:unlink)
67
-
68
- # TODO: Find sections with a lang attribute and handle them separately
69
- unknown_words = spell_checker.check(doc.to_s, lang)
70
-
71
- if unknown_words.empty?
72
- warn "No unknown words (language is #{lang}) at #{url}." if verbose
73
- else
74
- warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
75
- puts unknown_words
76
- has_unknown_words = true
77
- end
65
+ spider_success = HttpSpell::Spider.new(ARGV.first, whitelist: whitelist, blacklist: blacklist, tracing: tracing).start do |url, doc|
66
+ lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
67
+
68
+ # Remove sections that are not to be spellchecked
69
+ doc.css('pre').each(&:unlink)
70
+ doc.css('code').each(&:unlink)
71
+ doc.css('[spellcheck=false]').each(&:unlink)
72
+
73
+ # TODO: Find sections with a lang attribute and handle them separately
74
+ unknown_words = spell_checker.check(doc.to_s, lang)
75
+
76
+ if unknown_words.empty?
77
+ warn "No unknown words (language is #{lang}) at #{url}." if verbose
78
+ else
79
+ warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
80
+ puts unknown_words
81
+ has_unknown_words = true
78
82
  end
79
- rescue StandardError
80
- warn $ERROR_INFO.message
81
- warn $ERROR_INFO.backtrace if tracing
82
- exit 2
83
83
  end
84
84
 
85
+ exit 2 unless spider_success
85
86
  exit 1 if has_unknown_words
@@ -1,11 +1,22 @@
1
1
  module HttpSpell
2
2
  class SpellChecker
3
- def initialize(personal_dictionary_path = nil)
3
+ def initialize(personal_dictionary_path = nil, tracing: false)
4
4
  @personal_dictionary_arg = "-p #{personal_dictionary_path}" if personal_dictionary_path
5
+ @tracing = tracing
5
6
  end
6
7
 
7
8
  def check(doc, lang)
8
- Open3.pipeline_rw('pandoc --from html --to plain', "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l") do |stdin, stdout, _wait_thrs|
9
+ commands = [
10
+ 'pandoc --from html --to plain',
11
+ "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
12
+ ]
13
+
14
+ if @tracing
15
+ warn "Piping the HTML document into the following chain of commands:"
16
+ warn commands
17
+ end
18
+
19
+ Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
9
20
  stdin.puts(doc)
10
21
  stdin.close
11
22
  stdout.read.split.uniq
@@ -8,27 +8,39 @@ module HttpSpell
8
8
  class Spider
9
9
  attr_reader :todo, :done
10
10
 
11
- def initialize(starting_point, limit: nil, tracing: false)
11
+ def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
12
12
  @todo = []
13
13
  @done = []
14
14
  todo << Addressable::URI.parse(starting_point)
15
- @limit = limit || /^#{starting_point}/
15
+ @whitelist = whitelist || [/^#{starting_point}/]
16
+ @blacklist = blacklist
16
17
  @tracing = tracing
17
18
  end
18
19
 
19
20
  def start
21
+ success = true
22
+
20
23
  while todo.any?
21
24
  url = todo.pop
22
- extracted = links(url) do |u, d|
23
- yield u, d if block_given?
24
- rescue
25
- warn "Callback error for #{url}: #{$ERROR_INFO}"
25
+
26
+ begin
27
+ extracted = links(url) do |u, d|
28
+ yield u, d if block_given?
29
+ rescue
30
+ warn "Callback error for #{url}: #{$ERROR_INFO}"
31
+ warn $ERROR_INFO.backtrace if @tracing
32
+ end
33
+
34
+ done.append(url)
35
+ todo.concat(extracted - done - todo)
36
+ rescue StandardError
37
+ warn "Skipping #{url} because of #{$ERROR_INFO.message}"
26
38
  warn $ERROR_INFO.backtrace if @tracing
39
+ success = false
27
40
  end
28
-
29
- done.append(url)
30
- todo.concat(extracted - done - todo)
31
41
  end
42
+
43
+ return success
32
44
  end
33
45
 
34
46
  private
@@ -37,8 +49,9 @@ module HttpSpell
37
49
  # We are using open-uri, which follows redirects and also provides the content-type.
38
50
  response = open(uri).read
39
51
 
40
- if response.respond_to?(:content_type)
41
- return [] unless response.content_type == 'text/html'
52
+ if response.respond_to?(:content_type) && response.content_type != 'text/html'
53
+ warn "Skipping #{uri} because it is not HTML" if @tracing
54
+ return []
42
55
  end
43
56
 
44
57
  doc = Nokogiri::HTML(response)
@@ -46,7 +59,18 @@ module HttpSpell
46
59
  links = doc.css('a[href]').map do |e|
47
60
  link = Addressable::URI.parse(e['href'])
48
61
  link = uri.join(link) if link.relative?
49
- next unless @limit.match?(link.to_s)
62
+
63
+ if @whitelist.none? { |re| re.match?(link.to_s) }
64
+ warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @tracing
65
+ next
66
+ end
67
+
68
+ if @blacklist.any? { |re| re.match?(link.to_s) }
69
+ # TODO Print _which_ entry of the blacklist matches
70
+ warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @tracing
71
+ next
72
+ end
73
+
50
74
  # TODO Ignore same page links (some anchor)
51
75
  link
52
76
  rescue StandardError
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.1.0'
4
+ VERSION = '1.2.0'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig