chupa-text-decomposer-html 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Gemfile +8 -2
- data/chupa-text-decomposer-html.gemspec +2 -8
- data/doc/text/news.md +21 -3
- data/lib/chupa-text/decomposers/html.rb +48 -9
- data/test/run-test.rb +1 -1
- data/test/test-html.rb +92 -2
- metadata +3 -77
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d18136db785fa061ef0bea3f17f8826a1ff55ed5020a591926e1700b91b9df38
|
4
|
+
data.tar.gz: 2c173149ac68d34756944ce98caa32f1dbc5bba4f86dab0461505bb90a5d406f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 692141e0ed3d3d92729de8c47d62fa78ad6bc571070d293cdd8a7865e2d2366d82d68ef83121300dd92a82193b3db3ca83b7865ad6e0212c7e1b26e698830b13
|
7
|
+
data.tar.gz: 061e659f770c63f304cc7b697b6f3c512ea0c975eac35d55a9de9ad434820f056477275e4a240e058a42ace0187b1240290050bf83984049593003e309694411
|
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
# -*- mode: ruby
|
1
|
+
# -*- mode: ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013 Kouhei
|
3
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -20,6 +20,12 @@ source "https://rubygems.org/"
|
|
20
20
|
|
21
21
|
gemspec
|
22
22
|
|
23
|
+
gem "bundler"
|
24
|
+
gem "packnga"
|
25
|
+
gem "rake"
|
26
|
+
gem "redcarpet"
|
27
|
+
gem "test-unit"
|
28
|
+
|
23
29
|
base_dir = File.dirname(__FILE__)
|
24
30
|
local_chupa_text_dir = File.join(base_dir, "..", "chupa-text")
|
25
31
|
if File.exist?(local_chupa_text_dir)
|
data/chupa-text-decomposer-html.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013-
|
3
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -22,7 +22,7 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-html"
|
25
|
-
spec.version = "1.0.2"
|
25
|
+
spec.version = "1.0.4"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
@@ -41,10 +41,4 @@ Gem::Specification.new do |spec|
|
|
41
41
|
|
42
42
|
spec.add_runtime_dependency("chupa-text")
|
43
43
|
spec.add_runtime_dependency("nokogiri")
|
44
|
-
|
45
|
-
spec.add_development_dependency("bundler")
|
46
|
-
spec.add_development_dependency("rake")
|
47
|
-
spec.add_development_dependency("test-unit")
|
48
|
-
spec.add_development_dependency("packnga")
|
49
|
-
spec.add_development_dependency("redcarpet")
|
50
44
|
end
|
data/doc/text/news.md
CHANGED
@@ -1,14 +1,32 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.0.4: 2024-09-22
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Removed NKF dependency.
|
8
|
+
|
9
|
+
## 1.0.3: 2017-07-10
|
10
|
+
|
11
|
+
### Improvements
|
12
|
+
|
13
|
+
* Supported ignoring topic path content.
|
14
|
+
|
15
|
+
* Supported ignoring aside content.
|
16
|
+
|
17
|
+
### Fixes
|
18
|
+
|
19
|
+
* Fixed an infinite loop bug.
|
20
|
+
|
3
21
|
## 1.0.2: 2017-07-05
|
4
22
|
|
5
|
-
*
|
23
|
+
* Supported content based HTML detection.
|
6
24
|
|
7
|
-
*
|
25
|
+
* Supported ignoring common contents.
|
8
26
|
|
9
27
|
## 1.0.1: 2014-02-18
|
10
28
|
|
11
|
-
*
|
29
|
+
* Supported chupa-text 1.0.4.
|
12
30
|
|
13
31
|
## 1.0.0: 2014-01-05
|
14
32
|
|
data/lib/chupa-text/decomposers/html.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,7 +14,6 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "nkf"
|
18
17
|
require "nokogiri"
|
19
18
|
|
20
19
|
module ChupaText
|
@@ -28,6 +27,10 @@ module ChupaText
|
|
28
27
|
"application/xhtml+xml",
|
29
28
|
]
|
30
29
|
def target?(data)
|
30
|
+
(data["source-mime-types"] || []).each do |source_mime_type|
|
31
|
+
return false if TARGET_MIME_TYPES.include?(source_mime_type)
|
32
|
+
end
|
33
|
+
|
31
34
|
return true if TARGET_EXTENSIONS.include?(data.extension)
|
32
35
|
return true if TARGET_MIME_TYPES.include?(data.mime_type)
|
33
36
|
|
@@ -45,7 +48,7 @@ module ChupaText
|
|
45
48
|
doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
|
46
49
|
body_element = (doc % "body")
|
47
50
|
if body_element
|
48
|
-
body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
|
51
|
+
body = extract_text(body_element, +"").scrub.gsub(/^\s+|\s+$/, '')
|
49
52
|
else
|
50
53
|
body = ""
|
51
54
|
end
|
@@ -85,7 +88,7 @@ module ChupaText
|
|
85
88
|
if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
|
86
89
|
text.encoding.to_s
|
87
90
|
else
|
88
|
-
|
91
|
+
guess_encoding_heuristic(text)
|
89
92
|
end
|
90
93
|
end
|
91
94
|
end
|
@@ -101,8 +104,22 @@ module ChupaText
|
|
101
104
|
end
|
102
105
|
end
|
103
106
|
|
104
|
-
def
|
105
|
-
|
107
|
+
def guess_encoding_heuristic(text)
|
108
|
+
candidates = [
|
109
|
+
Encoding::EUC_JP,
|
110
|
+
Encoding::WINDOWS_31J,
|
111
|
+
Encoding::UTF16_BE,
|
112
|
+
Encoding::UTF16_LE,
|
113
|
+
]
|
114
|
+
candidates.each do |candidate|
|
115
|
+
begin
|
116
|
+
text.encode(Encoding::UTF_8, candidate)
|
117
|
+
rescue EncodingError
|
118
|
+
else
|
119
|
+
return candidate.name
|
120
|
+
end
|
121
|
+
end
|
122
|
+
"UTF-8"
|
106
123
|
end
|
107
124
|
|
108
125
|
def extract_text(element, text)
|
@@ -112,6 +129,7 @@ module ChupaText
|
|
112
129
|
return text if header_element?(element, name, classes)
|
113
130
|
return text if footer_element?(element, name, classes)
|
114
131
|
return text if navigation_element?(element, name, classes)
|
132
|
+
return text if aside_element?(element, name, classes)
|
115
133
|
|
116
134
|
element.children.each do |child|
|
117
135
|
case child
|
@@ -143,7 +161,7 @@ module ChupaText
|
|
143
161
|
|
144
162
|
def header_element?(element, name, classes)
|
145
163
|
case name
|
146
|
-
when "header"
|
164
|
+
when "header"
|
147
165
|
return true
|
148
166
|
end
|
149
167
|
|
@@ -191,13 +209,34 @@ module ChupaText
|
|
191
209
|
|
192
210
|
classes.each do |klass|
|
193
211
|
case klass
|
194
|
-
when "nav", "menu"
|
212
|
+
when "nav", "menu", /\Atopic[-_]?path\z/i
|
213
|
+
return true
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
case element["id"]
|
218
|
+
when "nav", "menu", /\Atopic[-_]?path\z/i
|
219
|
+
return true
|
220
|
+
end
|
221
|
+
|
222
|
+
false
|
223
|
+
end
|
224
|
+
|
225
|
+
def aside_element?(element, name, classes)
|
226
|
+
case name
|
227
|
+
when "aside"
|
228
|
+
return true
|
229
|
+
end
|
230
|
+
|
231
|
+
classes.each do |klass|
|
232
|
+
case klass
|
233
|
+
when "aside"
|
195
234
|
return true
|
196
235
|
end
|
197
236
|
end
|
198
237
|
|
199
238
|
case element["id"]
|
200
|
-
when "
|
239
|
+
when "aside"
|
201
240
|
return true
|
202
241
|
end
|
203
242
|
|
data/test/run-test.rb
CHANGED
data/test/test-html.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -28,6 +28,30 @@ class TestHTML < Test::Unit::TestCase
|
|
28
28
|
end
|
29
29
|
|
30
30
|
sub_test_case("target?") do
|
31
|
+
sub_test_case("source-mime-type") do
|
32
|
+
def create_data(uri, mime_type)
|
33
|
+
data = ChupaText::Data.new
|
34
|
+
data.body = ""
|
35
|
+
data.uri = uri
|
36
|
+
data["source-mime-types"] = [mime_type]
|
37
|
+
data
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_text_html
|
41
|
+
data = create_data("index.html", "text/html")
|
42
|
+
assert do
|
43
|
+
not @decomposer.target?(data)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_application_xhtml_xml
|
48
|
+
data = create_data("index.html", "application/xhtml+xml")
|
49
|
+
assert do
|
50
|
+
not @decomposer.target?(data)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
31
55
|
sub_test_case("extension") do
|
32
56
|
def create_data(uri)
|
33
57
|
data = ChupaText::Data.new
|
@@ -164,7 +188,7 @@ class TestHTML < Test::Unit::TestCase
|
|
164
188
|
|
165
189
|
sub_test_case("detect") do
|
166
190
|
def test_nothing
|
167
|
-
@data.body = <<-HTML.force_encoding("UTF-8")
|
191
|
+
@data.body = <<-HTML.dup.force_encoding("UTF-8")
|
168
192
|
<html>
|
169
193
|
<body>Hello</body>
|
170
194
|
</html>
|
@@ -436,6 +460,23 @@ class TestHTML < Test::Unit::TestCase
|
|
436
460
|
decompose(@data))
|
437
461
|
end
|
438
462
|
|
463
|
+
def test_topic_path_class
|
464
|
+
@data.body = <<-HTML
|
465
|
+
<html>
|
466
|
+
<body>
|
467
|
+
Before
|
468
|
+
<div class="topic-path">topic-path</div>
|
469
|
+
<div class="topic_path">topic_path</div>
|
470
|
+
<div class="topicpath">topicpath</div>
|
471
|
+
<div class="TopicPath">TopicPath</div>
|
472
|
+
After
|
473
|
+
</body>
|
474
|
+
</html>
|
475
|
+
HTML
|
476
|
+
assert_equal(["Before\nAfter"],
|
477
|
+
decompose(@data))
|
478
|
+
end
|
479
|
+
|
439
480
|
def test_nav_id
|
440
481
|
@data.body = <<-HTML
|
441
482
|
<html>
|
@@ -450,6 +491,55 @@ class TestHTML < Test::Unit::TestCase
|
|
450
491
|
@data.body = <<-HTML
|
451
492
|
<html>
|
452
493
|
<body>Before<div id="menu">nav</div>After</body>
|
494
|
+
</html>
|
495
|
+
HTML
|
496
|
+
assert_equal(["BeforeAfter"],
|
497
|
+
decompose(@data))
|
498
|
+
end
|
499
|
+
|
500
|
+
def test_topic_path_id
|
501
|
+
@data.body = <<-HTML
|
502
|
+
<html>
|
503
|
+
<body>
|
504
|
+
Before
|
505
|
+
<div id="topic-path">topic-path</div>
|
506
|
+
<div id="topic_path">topic_path</div>
|
507
|
+
<div id="topicpath">topicpath</div>
|
508
|
+
<div id="TopicPath">TopicPath</div>
|
509
|
+
After
|
510
|
+
</body>
|
511
|
+
</html>
|
512
|
+
HTML
|
513
|
+
assert_equal(["Before\nAfter"],
|
514
|
+
decompose(@data))
|
515
|
+
end
|
516
|
+
end
|
517
|
+
|
518
|
+
sub_test_case("aside") do
|
519
|
+
def test_aside_tag
|
520
|
+
@data.body = <<-HTML
|
521
|
+
<html>
|
522
|
+
<body>Before<aside>aside</aside>After</body>
|
523
|
+
</html>
|
524
|
+
HTML
|
525
|
+
assert_equal(["BeforeAfter"],
|
526
|
+
decompose(@data))
|
527
|
+
end
|
528
|
+
|
529
|
+
def test_aside_class
|
530
|
+
@data.body = <<-HTML
|
531
|
+
<html>
|
532
|
+
<body>Before<div class="aside">aside</div>After</body>
|
533
|
+
</html>
|
534
|
+
HTML
|
535
|
+
assert_equal(["BeforeAfter"],
|
536
|
+
decompose(@data))
|
537
|
+
end
|
538
|
+
|
539
|
+
def test_aside_id
|
540
|
+
@data.body = <<-HTML
|
541
|
+
<html>
|
542
|
+
<body>Before<div id="aside">aside</div>After</body>
|
453
543
|
</html>
|
454
544
|
HTML
|
455
545
|
assert_equal(["BeforeAfter"],
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2024-09-22 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: chupa-text
|
@@ -38,76 +37,6 @@ dependencies:
|
|
38
37
|
- - ">="
|
39
38
|
- !ruby/object:Gem::Version
|
40
39
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: bundler
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rake
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: test-unit
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: packnga
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: redcarpet
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :development
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
40
|
description: |
|
112
41
|
This is a ChupaText decomposer plugin to extract text and
|
113
42
|
meta-data from HTML.
|
@@ -133,7 +62,6 @@ homepage: https://github.com/ranguba/chupa-text-decomposer-html
|
|
133
62
|
licenses:
|
134
63
|
- LGPL-2.1+
|
135
64
|
metadata: {}
|
136
|
-
post_install_message:
|
137
65
|
rdoc_options: []
|
138
66
|
require_paths:
|
139
67
|
- lib
|
@@ -148,9 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
148
76
|
- !ruby/object:Gem::Version
|
149
77
|
version: '0'
|
150
78
|
requirements: []
|
151
|
-
|
152
|
-
rubygems_version: 2.5.2
|
153
|
-
signing_key:
|
79
|
+
rubygems_version: 3.6.0.dev
|
154
80
|
specification_version: 4
|
155
81
|
summary: This is a ChupaText decomposer plugin to extract text and meta-data from
|
156
82
|
HTML.
|