cw 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/cw.gemspec +2 -2
- data/lib/cw.rb +1 -0
- data/lib/cw/rss.rb +5 -3
- data/lib/cw/rss_clean.rb +41 -0
- metadata +4 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f32cecabf34fb0f987f2c33b7678f5d1de000655
|
4
|
+
data.tar.gz: 38ea73121579a8b0e194021e8997b191939e3bc8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3a9c6b660fba1f14a7429d73879cef6cc28790ecb8136576d580e3ad388a35cfdd12d4f8fdcee7d67b0b5427bc0a290db942359d73fa73faec06f2cc119cc2d3
|
7
|
+
data.tar.gz: a0bd05411135bfdbb77dd51091445bf5d3e556c1f05a7afbe5bc60787ec8c2c276e487ec3bd60807a8f4b89b3853f25ae631de76648498f45d05e3aa8c1c3478
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.2
|
1
|
+
0.3.3
|
data/cw.gemspec
CHANGED
@@ -19,16 +19,16 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.require_paths = ["lib", "audio", "data/text", "test"]
|
20
20
|
|
21
21
|
spec.required_ruby_version = '>= 2.0.0'
|
22
|
-
spec.add_runtime_dependency 'oga', '~>2.
|
22
|
+
spec.add_runtime_dependency 'oga', '~> 2.8'
|
23
23
|
spec.add_runtime_dependency 'httpclient'
|
24
24
|
spec.add_runtime_dependency 'htmlentities', '>= 4.3.4'
|
25
25
|
spec.add_runtime_dependency 'paint', '>= 1.0.1'
|
26
26
|
spec.add_runtime_dependency 'rake', '>= 11.2.2'
|
27
27
|
spec.add_runtime_dependency 'ruby-progressbar', '>= 1.8.1'
|
28
|
-
spec.add_runtime_dependency 'sanitize', '~> 4.4.0'
|
29
28
|
spec.add_runtime_dependency 'wavefile', '>= 0.7.0'
|
30
29
|
spec.add_runtime_dependency 'parseconfig', '~> 1.0.8'
|
31
30
|
spec.add_runtime_dependency 'rubyserial', '~> 0.4.0'
|
31
|
+
|
32
32
|
spec.add_dependency 'os', '~> 0.9.6'
|
33
33
|
|
34
34
|
spec.add_development_dependency 'version', '>= 1.0.0'
|
data/lib/cw.rb
CHANGED
data/lib/cw/rss.rb
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
require 'oga'
|
4
4
|
require 'httpclient'
|
5
5
|
require "htmlentities"
|
6
|
-
require 'sanitize'
|
7
6
|
|
8
7
|
module CWG
|
9
8
|
|
@@ -41,8 +40,11 @@ module CWG
|
|
41
40
|
unless(title.include?('VIDEO:') ||
|
42
41
|
title.include?('In pictures:') ||
|
43
42
|
title.include?('Morning business round-up'))
|
44
|
-
|
45
|
-
|
43
|
+
clean_title = CWG::RSSClean.new(title).scrub
|
44
|
+
clean_desc = CWG::RSSClean.new(description).scrub
|
45
|
+
# @rss_articles << Sanitize.clean(coder.decode(title)) + '. ' +
|
46
|
+
# Sanitize.clean(coder.decode(description))
|
47
|
+
@rss_articles << clean_title + '. ' + clean_desc
|
46
48
|
count += 1
|
47
49
|
break if count >= article_count
|
48
50
|
end
|
data/lib/cw/rss_clean.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'oga'
|
2
|
+
|
3
|
+
module CWG
|
4
|
+
class RSSClean
|
5
|
+
def initialize(html_fragment)
|
6
|
+
@html_fragment = html_fragment
|
7
|
+
end
|
8
|
+
|
9
|
+
def scrub(options = {})
|
10
|
+
blacklisted_tags = NON_CONTENT_TAGS + options.fetch(:blacklist, [])
|
11
|
+
|
12
|
+
sanitize(Oga.parse_html(html_fragment).children, blacklisted_tags)
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
attr_reader :html_fragment
|
18
|
+
|
19
|
+
NON_CONTENT_TAGS = %w(script style)
|
20
|
+
WHITESPACE_CONTENT_TAGS = %w(address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5 h6 header hgroup hr li nav ol p pre section ul)
|
21
|
+
|
22
|
+
def sanitize(node_set, blacklisted_tags)
|
23
|
+
node_set.reject { |node| !text?(node) && blacklisted_tags.include?(node.name) }
|
24
|
+
.flat_map { |node| [whitespace(node, :prefix), text(node, blacklisted_tags), whitespace(node, :suffix)] }.join
|
25
|
+
end
|
26
|
+
|
27
|
+
def text?(node)
|
28
|
+
node.is_a?(Oga::XML::Text)
|
29
|
+
end
|
30
|
+
|
31
|
+
def whitespace(node, _position)
|
32
|
+
return ' ' if !text?(node) && WHITESPACE_CONTENT_TAGS.include?(node.name)
|
33
|
+
''
|
34
|
+
end
|
35
|
+
|
36
|
+
def text(node, blacklisted_tags)
|
37
|
+
return node.text if text?(node)
|
38
|
+
sanitize(node.children, blacklisted_tags)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Martyn Jago
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '2.
|
19
|
+
version: '2.8'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '2.
|
26
|
+
version: '2.8'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: httpclient
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,20 +94,6 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: 1.8.1
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: sanitize
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - "~>"
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: 4.4.0
|
104
|
-
type: :runtime
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - "~>"
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: 4.4.0
|
111
97
|
- !ruby/object:Gem::Dependency
|
112
98
|
name: wavefile
|
113
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -310,6 +296,7 @@ files:
|
|
310
296
|
- lib/cw/repeat_word.rb
|
311
297
|
- lib/cw/reveal.rb
|
312
298
|
- lib/cw/rss.rb
|
299
|
+
- lib/cw/rss_clean.rb
|
313
300
|
- lib/cw/sentence.rb
|
314
301
|
- lib/cw/speak.rb
|
315
302
|
- lib/cw/spoken.rb
|