chupa-text-decomposer-html 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/chupa-text-decomposer-html.gemspec +1 -1
- data/doc/text/news.md +15 -3
- data/lib/chupa-text/decomposers/html.rb +29 -3
- data/test/test-html.rb +90 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b340a505f73aa5bcc6613a55b6fcec5e7988bc4
|
4
|
+
data.tar.gz: 3f4e1ef2b30decee069f76521753f6533a51024b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 51f94a9bd3eb45765aa4518f8415ec82ec235f53e99e75e4ec94afb45721869587e9ad558fc7bdfeb05fca39ed76f24d08d770428b0a775c613f8b0768e60b7c
|
7
|
+
data.tar.gz: 409267fc2e80bc9cbc119443b6dee928ab72a7e1eb747c69cb8c781654fcd6977cf2d16663622c29f46e6c86923da0a9e8f911b82f73c8cb63e0722e9a130cda
|
@@ -22,7 +22,7 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-html"
|
25
|
-
spec.version = "1.0.2"
|
25
|
+
spec.version = "1.0.3"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
data/doc/text/news.md
CHANGED
@@ -1,14 +1,26 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.0.3: 2017-07-10
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Supported ignoring topic path content.
|
8
|
+
|
9
|
+
* Supported ignoring aside content.
|
10
|
+
|
11
|
+
### Fixes
|
12
|
+
|
13
|
+
* Fixed an infinite loop bug.
|
14
|
+
|
3
15
|
## 1.0.2: 2017-07-05
|
4
16
|
|
5
|
-
*
|
17
|
+
* Supported content based HTML detection.
|
6
18
|
|
7
|
-
*
|
19
|
+
* Supported ignoring common contents.
|
8
20
|
|
9
21
|
## 1.0.1: 2014-02-18
|
10
22
|
|
11
|
-
*
|
23
|
+
* Supported chupa-text 1.0.4.
|
12
24
|
|
13
25
|
## 1.0.0: 2014-01-05
|
14
26
|
|
@@ -28,6 +28,10 @@ module ChupaText
|
|
28
28
|
"application/xhtml+xml",
|
29
29
|
]
|
30
30
|
def target?(data)
|
31
|
+
(data["source-mime-types"] || []).each do |source_mime_type|
|
32
|
+
return false if TARGET_MIME_TYPES.include?(source_mime_type)
|
33
|
+
end
|
34
|
+
|
31
35
|
return true if TARGET_EXTENSIONS.include?(data.extension)
|
32
36
|
return true if TARGET_MIME_TYPES.include?(data.mime_type)
|
33
37
|
|
@@ -112,6 +116,7 @@ module ChupaText
|
|
112
116
|
return text if header_element?(element, name, classes)
|
113
117
|
return text if footer_element?(element, name, classes)
|
114
118
|
return text if navigation_element?(element, name, classes)
|
119
|
+
return text if aside_element?(element, name, classes)
|
115
120
|
|
116
121
|
element.children.each do |child|
|
117
122
|
case child
|
@@ -143,7 +148,7 @@ module ChupaText
|
|
143
148
|
|
144
149
|
def header_element?(element, name, classes)
|
145
150
|
case name
|
146
|
-
when "header"
|
151
|
+
when "header"
|
147
152
|
return true
|
148
153
|
end
|
149
154
|
|
@@ -191,13 +196,34 @@ module ChupaText
|
|
191
196
|
|
192
197
|
classes.each do |klass|
|
193
198
|
case klass
|
194
|
-
when "nav", "menu"
|
199
|
+
when "nav", "menu", /\Atopic[-_]?path\z/i
|
200
|
+
return true
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
case element["id"]
|
205
|
+
when "nav", "menu", /\Atopic[-_]?path\z/i
|
206
|
+
return true
|
207
|
+
end
|
208
|
+
|
209
|
+
false
|
210
|
+
end
|
211
|
+
|
212
|
+
def aside_element?(element, name, classes)
|
213
|
+
case name
|
214
|
+
when "aside"
|
215
|
+
return true
|
216
|
+
end
|
217
|
+
|
218
|
+
classes.each do |klass|
|
219
|
+
case klass
|
220
|
+
when "aside"
|
195
221
|
return true
|
196
222
|
end
|
197
223
|
end
|
198
224
|
|
199
225
|
case element["id"]
|
200
|
-
when "nav", "menu"
|
226
|
+
when "aside"
|
201
227
|
return true
|
202
228
|
end
|
203
229
|
|
data/test/test-html.rb
CHANGED
@@ -28,6 +28,30 @@ class TestHTML < Test::Unit::TestCase
|
|
28
28
|
end
|
29
29
|
|
30
30
|
sub_test_case("target?") do
|
31
|
+
sub_test_case("source-mime-type") do
|
32
|
+
def create_data(uri, mime_type)
|
33
|
+
data = ChupaText::Data.new
|
34
|
+
data.body = ""
|
35
|
+
data.uri = uri
|
36
|
+
data["source-mime-types"] = [mime_type]
|
37
|
+
data
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_text_html
|
41
|
+
data = create_data("index.html", "text/html")
|
42
|
+
assert do
|
43
|
+
not @decomposer.target?(data)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_application_xhtml_xml
|
48
|
+
data = create_data("index.html", "application/xhtml+xml")
|
49
|
+
assert do
|
50
|
+
not @decomposer.target?(data)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
31
55
|
sub_test_case("extension") do
|
32
56
|
def create_data(uri)
|
33
57
|
data = ChupaText::Data.new
|
@@ -436,6 +460,23 @@ class TestHTML < Test::Unit::TestCase
|
|
436
460
|
decompose(@data))
|
437
461
|
end
|
438
462
|
|
463
|
+
def test_topic_path_class
|
464
|
+
@data.body = <<-HTML
|
465
|
+
<html>
|
466
|
+
<body>
|
467
|
+
Before
|
468
|
+
<div class="topic-path">topic-path</div>
|
469
|
+
<div class="topic_path">topic_path</div>
|
470
|
+
<div class="topicpath">topicpath</div>
|
471
|
+
<div class="TopicPath">TopicPath</div>
|
472
|
+
After
|
473
|
+
</body>
|
474
|
+
</html>
|
475
|
+
HTML
|
476
|
+
assert_equal(["Before\nAfter"],
|
477
|
+
decompose(@data))
|
478
|
+
end
|
479
|
+
|
439
480
|
def test_nav_id
|
440
481
|
@data.body = <<-HTML
|
441
482
|
<html>
|
@@ -450,6 +491,55 @@ class TestHTML < Test::Unit::TestCase
|
|
450
491
|
@data.body = <<-HTML
|
451
492
|
<html>
|
452
493
|
<body>Before<div id="menu">nav</div>After</body>
|
494
|
+
</html>
|
495
|
+
HTML
|
496
|
+
assert_equal(["BeforeAfter"],
|
497
|
+
decompose(@data))
|
498
|
+
end
|
499
|
+
|
500
|
+
def test_topic_path_id
|
501
|
+
@data.body = <<-HTML
|
502
|
+
<html>
|
503
|
+
<body>
|
504
|
+
Before
|
505
|
+
<div id="topic-path">topic-path</div>
|
506
|
+
<div id="topic_path">topic_path</div>
|
507
|
+
<div id="topicpath">topicpath</div>
|
508
|
+
<div id="TopicPath">TopicPath</div>
|
509
|
+
After
|
510
|
+
</body>
|
511
|
+
</html>
|
512
|
+
HTML
|
513
|
+
assert_equal(["Before\nAfter"],
|
514
|
+
decompose(@data))
|
515
|
+
end
|
516
|
+
end
|
517
|
+
|
518
|
+
sub_test_case("aside") do
|
519
|
+
def test_aside_tag
|
520
|
+
@data.body = <<-HTML
|
521
|
+
<html>
|
522
|
+
<body>Before<aside>aside</aside>After</body>
|
523
|
+
</html>
|
524
|
+
HTML
|
525
|
+
assert_equal(["BeforeAfter"],
|
526
|
+
decompose(@data))
|
527
|
+
end
|
528
|
+
|
529
|
+
def test_aside_class
|
530
|
+
@data.body = <<-HTML
|
531
|
+
<html>
|
532
|
+
<body>Before<div class="aside">aside</div>After</body>
|
533
|
+
</html>
|
534
|
+
HTML
|
535
|
+
assert_equal(["BeforeAfter"],
|
536
|
+
decompose(@data))
|
537
|
+
end
|
538
|
+
|
539
|
+
def test_aside_id
|
540
|
+
@data.body = <<-HTML
|
541
|
+
<html>
|
542
|
+
<body>Before<div id="aside">aside</div>After</body>
|
453
543
|
</html>
|
454
544
|
HTML
|
455
545
|
assert_equal(["BeforeAfter"],
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chupa-text
|