httpspell 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0fe73f8f1ff3740d6e3ae3af685d3554879a2819fc9d7803c994a21dd3694d91
4
+ data.tar.gz: 8354f5c3bdc325a073310aa534a6171164d3dfbe7a1c4154f77737f20108eb91
5
+ SHA512:
6
+ metadata.gz: '073693d2520238d10012e4c02057c4966ab8af80f1c9db868e5ae2a4b95e4ae59a7d0989c162f62649aa0d2194290da0bca1ac5e1186f8ff3569cca581d571ae'
7
+ data.tar.gz: 8eb778ffa3bcc1f56e8362d160117f695ec5f3ca146592219f4ef43a160ea28b96c67c5a6edeba52fbdc6dc3413b9f4967243fc60431a81861506d2c46435b7b
@@ -0,0 +1 @@
1
+ pkg
@@ -0,0 +1,30 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.5.1
3
+ Include:
4
+ - '**/Gemfile'
5
+ - '**/Rakefile'
6
+ - '**/config.ru'
7
+ - '**/*.rake'
8
+ Exclude:
9
+ - vendor/**/*
10
+ - db/migrations/**/*
11
+
12
+ DisplayCopNames:
13
+ Enabled: true
14
+
15
+ DisplayStyleGuide:
16
+ Enabled: true
17
+
18
+ Naming/FileName:
19
+ Exclude:
20
+ - Guardfile
21
+
22
+ Metrics/BlockLength:
23
+ Exclude:
24
+ - spec/**/*
25
+
26
+ Metrics/LineLength:
27
+ Max: 160
28
+
29
+ Style/Documentation:
30
+ Enabled: false
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+ gemspec
@@ -0,0 +1,108 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ httpspell (1.0.0)
5
+ addressable
6
+ nokogiri
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ addressable (2.5.2)
12
+ public_suffix (>= 2.0.2, < 4.0)
13
+ ast (2.4.0)
14
+ byebug (10.0.2)
15
+ coderay (1.1.2)
16
+ diff-lcs (1.3)
17
+ ffi (1.9.23)
18
+ formatador (0.2.5)
19
+ guard (2.14.2)
20
+ formatador (>= 0.2.4)
21
+ listen (>= 2.7, < 4.0)
22
+ lumberjack (>= 1.0.12, < 2.0)
23
+ nenv (~> 0.1)
24
+ notiffany (~> 0.0)
25
+ pry (>= 0.9.12)
26
+ shellany (~> 0.0)
27
+ thor (>= 0.18.1)
28
+ guard-bundler (2.1.0)
29
+ bundler (~> 1.0)
30
+ guard (~> 2.2)
31
+ guard-compat (~> 1.1)
32
+ guard-compat (1.2.1)
33
+ guard-rspec (4.7.3)
34
+ guard (~> 2.1)
35
+ guard-compat (~> 1.1)
36
+ rspec (>= 2.99.0, < 4.0)
37
+ listen (3.1.5)
38
+ rb-fsevent (~> 0.9, >= 0.9.4)
39
+ rb-inotify (~> 0.9, >= 0.9.7)
40
+ ruby_dep (~> 1.2)
41
+ lumberjack (1.0.13)
42
+ method_source (0.9.0)
43
+ mini_portile2 (2.3.0)
44
+ nenv (0.3.0)
45
+ nokogiri (1.8.2)
46
+ mini_portile2 (~> 2.3.0)
47
+ notiffany (0.1.1)
48
+ nenv (~> 0.1)
49
+ shellany (~> 0.0)
50
+ parallel (1.12.1)
51
+ parser (2.5.1.0)
52
+ ast (~> 2.4.0)
53
+ powerpack (0.1.1)
54
+ pry (0.11.3)
55
+ coderay (~> 1.1.0)
56
+ method_source (~> 0.9.0)
57
+ pry-byebug (3.6.0)
58
+ byebug (~> 10.0)
59
+ pry (~> 0.10)
60
+ public_suffix (3.0.2)
61
+ rainbow (3.0.0)
62
+ rake (12.3.1)
63
+ rb-fsevent (0.10.3)
64
+ rb-inotify (0.9.10)
65
+ ffi (>= 0.5.0, < 2)
66
+ rspec (3.7.0)
67
+ rspec-core (~> 3.7.0)
68
+ rspec-expectations (~> 3.7.0)
69
+ rspec-mocks (~> 3.7.0)
70
+ rspec-core (3.7.1)
71
+ rspec-support (~> 3.7.0)
72
+ rspec-expectations (3.7.0)
73
+ diff-lcs (>= 1.2.0, < 2.0)
74
+ rspec-support (~> 3.7.0)
75
+ rspec-mocks (3.7.0)
76
+ diff-lcs (>= 1.2.0, < 2.0)
77
+ rspec-support (~> 3.7.0)
78
+ rspec-support (3.7.1)
79
+ rubocop (0.56.0)
80
+ parallel (~> 1.10)
81
+ parser (>= 2.5)
82
+ powerpack (~> 0.1)
83
+ rainbow (>= 2.2.2, < 4.0)
84
+ ruby-progressbar (~> 1.7)
85
+ unicode-display_width (~> 1.0, >= 1.0.1)
86
+ ruby-progressbar (1.9.0)
87
+ ruby_dep (1.5.0)
88
+ shellany (0.0.1)
89
+ thor (0.20.0)
90
+ unicode-display_width (1.3.3)
91
+
92
+ PLATFORMS
93
+ ruby
94
+
95
+ DEPENDENCIES
96
+ bundler
97
+ guard
98
+ guard-bundler
99
+ guard-rspec
100
+ httpspell!
101
+ pry
102
+ pry-byebug
103
+ rake
104
+ rspec
105
+ rubocop
106
+
107
+ BUNDLED WITH
108
+ 1.16.1
@@ -0,0 +1,21 @@
1
+ # `httpspell`
2
+
3
+ This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
4
+
5
+ Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
6
+
7
+ # What is *not* checked
8
+
9
+ * When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html`.
10
+ * Before converting, `httpspell` removes the following nodes from the HTML DOM as they are not a good target for spellchecking:
11
+ - `code`
12
+ - `pre`
13
+ - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as a target for spellchecking)
14
+
15
+ # Misc
16
+
17
+ If you produce content with kramdown (e.g. using Jekyll), setting `spellcheck='false'` for an element is as simple as adding this line *after* the element (e.g. heading):
18
+
19
+ ```
20
+ {: spellcheck="false"}
21
+ ```
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'rubocop/rake_task'

# Registers RuboCop's rake tasks; `rubocop:auto_correct` below refers to one of them.
RuboCop::RakeTask.new

namespace :spec do
  # spec:unit runs every *_spec.rb file below spec/unit.
  RSpec::Core::RakeTask.new(:unit) { |t| t.pattern = 'spec/unit/**/*_spec.rb' }

  desc 'Run all specs'
  task all: ['rubocop:auto_correct', :unit]
end

# A bare `rake` runs spec:all.
task default: ['spec:all']
@@ -0,0 +1,51 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line entry point of httpspell.
#
# Usage: httpspell [-p FILE] URL
#
# Spiders the site starting at URL and spellchecks every HTML page found.
# For each page with unknown words, a summary line goes to stderr and the
# words themselves go to stdout. Exits with status 1 on invalid options or
# when the number of positional arguments is not exactly one.

require 'optparse'
require 'httpspell/spider'
require 'httpspell/spellchecker'

# Language used when a page's root element has no `lang` attribute.
default_lang = 'de-DE'

# CSS selectors of nodes that are removed before spellchecking.
ignored_selectors = ['pre', 'code', '[spellcheck=false]']

personal_dictionary_path = nil

begin
  OptionParser.new do |parser|
    parser.banner.prepend <<~BANNER
      Spellchecks a website via HTTP.

    BANNER

    parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |path|
      personal_dictionary_path = path
    end

    # TODO: --recursive, defaults to false
    # TODO: wget has some additional options for recursive behavior that should be reviewed
  end.parse!
rescue StandardError => e
  # Bind the exception explicitly instead of reading $ERROR_INFO, which is only
  # defined when the `English` library happens to have been loaded elsewhere.
  warn "Error - #{e.message}"
  exit 1
end

unless ARGV.size == 1
  warn "Expected exactly one argument, but received #{ARGV.size}."
  exit 1
end

spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)

HttpSpell::Spider.new(ARGV.first).start do |url, doc|
  root = doc.root
  # A document without a root element has nothing to spellcheck.
  next if root.nil?

  lang = root['lang'] || default_lang

  # Remove sections that are not to be spellchecked
  ignored_selectors.each { |selector| doc.css(selector).each(&:unlink) }

  # TODO: Find sections with a lang attribute and handle them separately
  unknown_words = spell_checker.check(doc.to_s, lang)
  next if unknown_words.empty?

  warn "#{unknown_words.size} unknown words at #{url}:"
  puts unknown_words
end
@@ -0,0 +1,40 @@
1
# frozen_string_literal: true

# Gem specification for httpspell.

lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'httpspell/version'

Gem::Specification.new do |spec|
  spec.name = 'httpspell'
  spec.version = HttpSpell::VERSION
  spec.authors = ['Steffen Uhlig']
  spec.email = ['steffen@familie-uhlig.net']

  spec.summary = 'HTTP spellchecker'
  # Single-line description; a %() literal spanning several lines would carry
  # its line breaks and source indentation into the published gem metadata.
  spec.description = 'httpspell is a spellchecker that recursively fetches HTML pages, ' \
                     'converts them to plain text using pandoc, and spellchecks them with hunspell.'
  spec.license = 'MIT'

  # lib/httpspell/spider.rb uses `rescue` inside do/end blocks and
  # Array#append, both of which need Ruby 2.5 or newer.
  spec.required_ruby_version = '>= 2.5.0'

  # Package everything tracked by git except tests, specs and features.
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match?(%r{\A(?:test|spec|features)/})
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_dependency 'addressable'
  spec.add_dependency 'nokogiri'

  %w[
    bundler
    guard
    guard-bundler
    guard-rspec
    pry
    pry-byebug
    rake
    rspec
    rubocop
  ].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
@@ -0,0 +1,25 @@
1
# frozen_string_literal: true

require 'open3'

module HttpSpell
  # Spellchecks an HTML document by piping it through `pandoc` (HTML to plain
  # text) and `hunspell` (list mode).
  #
  # Both programs are started from argv arrays rather than shell command
  # strings. The language comes from the checked page's `lang` attribute and
  # the dictionary path from the command line, so neither may be interpreted
  # by a shell.
  class SpellChecker
    # @param personal_dictionary_path [String, nil] path handed to hunspell's
    #   `-p` option; the option is omitted when nil
    def initialize(personal_dictionary_path = nil)
      @personal_dictionary_args = personal_dictionary_path ? ['-p', personal_dictionary_path.to_s] : []
    end

    # @param doc [String] HTML markup to check
    # @param lang [String] language identifier in RFC 5646 style, e.g. "en-US"
    # @return [Array<String>] the distinct words hunspell printed, in order of
    #   first appearance
    def check(doc, lang)
      Open3.pipeline_rw(pandoc_command, hunspell_command(lang)) do |stdin, stdout, _wait_thrs|
        stdin.puts(doc)
        # Closing stdin signals end of input so the pipeline can finish.
        stdin.close
        stdout.read.split.uniq
      end
    end

    private

    # Argv for converting HTML on stdin to plain text on stdout.
    def pandoc_command
      %w[pandoc --from html --to plain]
    end

    # Argv for listing (-l) the unknown words of UTF-8 input (-i UTF-8) using
    # the dictionary for +lang+ and, if configured, the personal dictionary.
    def hunspell_command(lang)
      ['hunspell', '-d', translate(lang), *@personal_dictionary_args, '-i', 'UTF-8', '-l']
    end

    # The W3C [recommends](https://www.w3.org/International/questions/qa-html-language-declarations)
    # to specify language using identifiers as per [RFC 5646](https://tools.ietf.org/html/rfc5646)
    # which uses dashes. Hunspell, however, uses underscores. This method translates RFC-style identifiers
    # to hunspell-style.
    def translate(lang)
      lang.tr('-', '_')
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'open3'
4
+ require 'addressable/uri'
5
+ require 'English'
6
+
7
module HttpSpell
  # Crawls the HTML pages reachable from a starting point, restricted to URLs
  # that begin with a base URL, and hands every fetched page to a callback.
  class Spider
    # todo: URLs still waiting to be fetched.
    # done: URLs that have already been attempted, whether or not the fetch succeeded.
    attr_reader :todo, :done

    # @param starting_point [String] URL of the first page to fetch
    # @param base_url [String] only links starting with this prefix are followed
    def initialize(starting_point, base_url = starting_point)
      @todo = [Addressable::URI.parse(starting_point)]
      @done = []
      @base_url = Addressable::URI.parse(base_url)
    end

    # Fetches pages until the queue is empty.
    #
    # @yield [url, doc] the page's URL and its parsed document, once per HTML page
    # @return [nil]
    def start
      while todo.any?
        url = todo.pop
        next if done.include?(url)

        # Record the URL before fetching so that a page which fails to load is
        # not queued again every time another page links to it.
        done << url

        begin
          extracted = links(url) do |page_url, doc|
            yield page_url, doc if block_given?
          rescue StandardError => e
            # A failing callback must not abort the crawl.
            warn "Callback error for #{url}: #{e.message}"
          end

          # uniq: a page often links to the same target more than once.
          todo.concat(extracted.uniq - done - todo)
        rescue StandardError => e
          warn "Could not fetch #{url}: #{e.message}"
        end
      end
    end

    private

    # Fetches +uri+ and returns the in-scope links found on the page.
    #
    # Responses whose content type is not text/html yield no links and are not
    # passed to the block.
    #
    # @yield [uri, doc] the fetched URI and its parsed document
    # @return [Array<Addressable::URI>]
    def links(uri)
      # We are using open-uri, which follows redirects and also provides the content-type.
      response = URI(uri).read
      return [] unless response.content_type == 'text/html'

      doc = Nokogiri::HTML(response)

      # Collect the links before yielding: the callback may remove nodes from doc.
      found = doc.css('a[href]').map { |anchor| resolve(uri, anchor['href']) }.compact

      yield uri, doc if block_given?
      found
    end

    # Turns an href found on +page_uri+ into an absolute URI without fragment.
    #
    # @return [Addressable::URI, nil] nil when the link points outside the base
    #   URL or cannot be parsed
    def resolve(page_uri, href)
      link = Addressable::URI.parse(href)
      link = page_uri.join(link) if link.relative?
      # "page#a" and "page#b" are the same document; drop the fragment so it is
      # fetched and checked only once.
      link.fragment = nil
      link if link.to_s.start_with?(@base_url.to_s)
    rescue StandardError => e
      warn e.message
      nil
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

# Top-level namespace of the httpspell gem.
module HttpSpell
  # Version of the gem; read by httpspell.gemspec as HttpSpell::VERSION.
  VERSION = '1.0.0'
end
metadata ADDED
@@ -0,0 +1,213 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: httpspell
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Steffen Uhlig
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-05-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: guard
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard-bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: guard-rspec
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pry
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry-byebug
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rake
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: rspec
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: rubocop
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ description: |-
168
+ httpspell is a spellchecker that recursively fetches
169
+ HTML pages, converts them to plain text using pandoc, and
170
+ spellchecks them with hunspell.
171
+ email:
172
+ - steffen@familie-uhlig.net
173
+ executables:
174
+ - httpspell
175
+ extensions: []
176
+ extra_rdoc_files: []
177
+ files:
178
+ - ".gitignore"
179
+ - ".rubocop.yml"
180
+ - Gemfile
181
+ - Gemfile.lock
182
+ - README.markdown
183
+ - Rakefile
184
+ - exe/httpspell
185
+ - httpspell.gemspec
186
+ - lib/httpspell/spellchecker.rb
187
+ - lib/httpspell/spider.rb
188
+ - lib/httpspell/version.rb
189
+ homepage:
190
+ licenses:
191
+ - MIT
192
+ metadata: {}
193
+ post_install_message:
194
+ rdoc_options: []
195
+ require_paths:
196
+ - lib
197
+ required_ruby_version: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ">="
200
+ - !ruby/object:Gem::Version
201
+ version: '0'
202
+ required_rubygems_version: !ruby/object:Gem::Requirement
203
+ requirements:
204
+ - - ">="
205
+ - !ruby/object:Gem::Version
206
+ version: '0'
207
+ requirements: []
208
+ rubyforge_project:
209
+ rubygems_version: 2.7.6
210
+ signing_key:
211
+ specification_version: 4
212
+ summary: HTTP spellchecker
213
+ test_files: []