chupa-text-decomposer-html 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/chupa-text-decomposer-html.gemspec +1 -1
- data/doc/text/news.md +15 -3
- data/lib/chupa-text/decomposers/html.rb +29 -3
- data/test/test-html.rb +90 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b340a505f73aa5bcc6613a55b6fcec5e7988bc4
|
4
|
+
data.tar.gz: 3f4e1ef2b30decee069f76521753f6533a51024b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 51f94a9bd3eb45765aa4518f8415ec82ec235f53e99e75e4ec94afb45721869587e9ad558fc7bdfeb05fca39ed76f24d08d770428b0a775c613f8b0768e60b7c
|
7
|
+
data.tar.gz: 409267fc2e80bc9cbc119443b6dee928ab72a7e1eb747c69cb8c781654fcd6977cf2d16663622c29f46e6c86923da0a9e8f911b82f73c8cb63e0722e9a130cda
|
@@ -22,7 +22,7 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-html"
|
25
|
-
spec.version = "1.0.2"
|
25
|
+
spec.version = "1.0.3"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
data/doc/text/news.md
CHANGED
@@ -1,14 +1,26 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.0.3: 2017-07-10
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Supported ignoring topic path content.
|
8
|
+
|
9
|
+
* Supported ignoring aside content.
|
10
|
+
|
11
|
+
### Fixes
|
12
|
+
|
13
|
+
* Fixed an infinite loop bug.
|
14
|
+
|
3
15
|
## 1.0.2: 2017-07-05
|
4
16
|
|
5
|
-
*
|
17
|
+
* Supported content based HTML detection.
|
6
18
|
|
7
|
-
*
|
19
|
+
* Supported ignoring common contents.
|
8
20
|
|
9
21
|
## 1.0.1: 2014-02-18
|
10
22
|
|
11
|
-
*
|
23
|
+
* Supported chupa-text 1.0.4.
|
12
24
|
|
13
25
|
## 1.0.0: 2014-01-05
|
14
26
|
|
@@ -28,6 +28,10 @@ module ChupaText
|
|
28
28
|
"application/xhtml+xml",
|
29
29
|
]
|
30
30
|
def target?(data)
|
31
|
+
(data["source-mime-types"] || []).each do |source_mime_type|
|
32
|
+
return false if TARGET_MIME_TYPES.include?(source_mime_type)
|
33
|
+
end
|
34
|
+
|
31
35
|
return true if TARGET_EXTENSIONS.include?(data.extension)
|
32
36
|
return true if TARGET_MIME_TYPES.include?(data.mime_type)
|
33
37
|
|
@@ -112,6 +116,7 @@ module ChupaText
|
|
112
116
|
return text if header_element?(element, name, classes)
|
113
117
|
return text if footer_element?(element, name, classes)
|
114
118
|
return text if navigation_element?(element, name, classes)
|
119
|
+
return text if aside_element?(element, name, classes)
|
115
120
|
|
116
121
|
element.children.each do |child|
|
117
122
|
case child
|
@@ -143,7 +148,7 @@ module ChupaText
|
|
143
148
|
|
144
149
|
def header_element?(element, name, classes)
|
145
150
|
case name
|
146
|
-
when "header"
|
151
|
+
when "header"
|
147
152
|
return true
|
148
153
|
end
|
149
154
|
|
@@ -191,13 +196,34 @@ module ChupaText
|
|
191
196
|
|
192
197
|
classes.each do |klass|
|
193
198
|
case klass
|
194
|
-
when "nav", "menu"
|
199
|
+
when "nav", "menu", /\Atopic[-_]?path\z/i
|
200
|
+
return true
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
case element["id"]
|
205
|
+
when "nav", "menu", /\Atopic[-_]?path\z/i
|
206
|
+
return true
|
207
|
+
end
|
208
|
+
|
209
|
+
false
|
210
|
+
end
|
211
|
+
|
212
|
+
def aside_element?(element, name, classes)
|
213
|
+
case name
|
214
|
+
when "aside"
|
215
|
+
return true
|
216
|
+
end
|
217
|
+
|
218
|
+
classes.each do |klass|
|
219
|
+
case klass
|
220
|
+
when "aside"
|
195
221
|
return true
|
196
222
|
end
|
197
223
|
end
|
198
224
|
|
199
225
|
case element["id"]
|
200
|
-
when "nav", "menu"
|
226
|
+
when "aside"
|
201
227
|
return true
|
202
228
|
end
|
203
229
|
|
data/test/test-html.rb
CHANGED
@@ -28,6 +28,30 @@ class TestHTML < Test::Unit::TestCase
|
|
28
28
|
end
|
29
29
|
|
30
30
|
sub_test_case("target?") do
|
31
|
+
sub_test_case("source-mime-type") do
|
32
|
+
def create_data(uri, mime_type)
|
33
|
+
data = ChupaText::Data.new
|
34
|
+
data.body = ""
|
35
|
+
data.uri = uri
|
36
|
+
data["source-mime-types"] = [mime_type]
|
37
|
+
data
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_text_html
|
41
|
+
data = create_data("index.html", "text/html")
|
42
|
+
assert do
|
43
|
+
not @decomposer.target?(data)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_application_xhtml_xml
|
48
|
+
data = create_data("index.html", "application/xhtml+xml")
|
49
|
+
assert do
|
50
|
+
not @decomposer.target?(data)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
31
55
|
sub_test_case("extension") do
|
32
56
|
def create_data(uri)
|
33
57
|
data = ChupaText::Data.new
|
@@ -436,6 +460,23 @@ class TestHTML < Test::Unit::TestCase
|
|
436
460
|
decompose(@data))
|
437
461
|
end
|
438
462
|
|
463
|
+
def test_topic_path_class
|
464
|
+
@data.body = <<-HTML
|
465
|
+
<html>
|
466
|
+
<body>
|
467
|
+
Before
|
468
|
+
<div class="topic-path">topic-path</div>
|
469
|
+
<div class="topic_path">topic_path</div>
|
470
|
+
<div class="topicpath">topicpath</div>
|
471
|
+
<div class="TopicPath">TopicPath</div>
|
472
|
+
After
|
473
|
+
</body>
|
474
|
+
</html>
|
475
|
+
HTML
|
476
|
+
assert_equal(["Before\nAfter"],
|
477
|
+
decompose(@data))
|
478
|
+
end
|
479
|
+
|
439
480
|
def test_nav_id
|
440
481
|
@data.body = <<-HTML
|
441
482
|
<html>
|
@@ -450,6 +491,55 @@ class TestHTML < Test::Unit::TestCase
|
|
450
491
|
@data.body = <<-HTML
|
451
492
|
<html>
|
452
493
|
<body>Before<div id="menu">nav</div>After</body>
|
494
|
+
</html>
|
495
|
+
HTML
|
496
|
+
assert_equal(["BeforeAfter"],
|
497
|
+
decompose(@data))
|
498
|
+
end
|
499
|
+
|
500
|
+
def test_topic_path_id
|
501
|
+
@data.body = <<-HTML
|
502
|
+
<html>
|
503
|
+
<body>
|
504
|
+
Before
|
505
|
+
<div id="topic-path">topic-path</div>
|
506
|
+
<div id="topic_path">topic_path</div>
|
507
|
+
<div id="topicpath">topicpath</div>
|
508
|
+
<div id="TopicPath">TopicPath</div>
|
509
|
+
After
|
510
|
+
</body>
|
511
|
+
</html>
|
512
|
+
HTML
|
513
|
+
assert_equal(["Before\nAfter"],
|
514
|
+
decompose(@data))
|
515
|
+
end
|
516
|
+
end
|
517
|
+
|
518
|
+
sub_test_case("aside") do
|
519
|
+
def test_aside_tag
|
520
|
+
@data.body = <<-HTML
|
521
|
+
<html>
|
522
|
+
<body>Before<aside>aside</aside>After</body>
|
523
|
+
</html>
|
524
|
+
HTML
|
525
|
+
assert_equal(["BeforeAfter"],
|
526
|
+
decompose(@data))
|
527
|
+
end
|
528
|
+
|
529
|
+
def test_aside_class
|
530
|
+
@data.body = <<-HTML
|
531
|
+
<html>
|
532
|
+
<body>Before<div class="aside">aside</div>After</body>
|
533
|
+
</html>
|
534
|
+
HTML
|
535
|
+
assert_equal(["BeforeAfter"],
|
536
|
+
decompose(@data))
|
537
|
+
end
|
538
|
+
|
539
|
+
def test_aside_id
|
540
|
+
@data.body = <<-HTML
|
541
|
+
<html>
|
542
|
+
<body>Before<div id="aside">aside</div>After</body>
|
453
543
|
</html>
|
454
544
|
HTML
|
455
545
|
assert_equal(["BeforeAfter"],
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-05 00:00:00.000000000 Z
|
11
|
+
date: 2017-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chupa-text
|