chupa-text-decomposer-html 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Gemfile +8 -2
- data/chupa-text-decomposer-html.gemspec +2 -8
- data/doc/text/news.md +12 -0
- data/lib/chupa-text/decomposers/html.rb +19 -6
- data/test/run-test.rb +1 -1
- data/test/test-html.rb +2 -2
- metadata +3 -77
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e5d82df61c624d47a350142691c76d2dbab98fbd90123501b0a6693cd0a3c496
|
4
|
+
data.tar.gz: 1c7df05103ac4d448ce48927b991cd750048754f2a5a236b79a2c61e87603691
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 47fb20db31734d563d250dab59f43d81afafd7cb6df2b2e717bc6130dd74d6d0e110783f14f66c8d4ec2ee4c93150fe943238b78d9b7702e8e9bf634174138f7
|
7
|
+
data.tar.gz: d52756b3cd3ff79fd471851a38e7774527850a48b42b7333e56bcadb1c8e5b3e952d100b91f3103feb4ba09d082900efb6fa394c94431e03cd8be7f9cc4c48ea
|
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
# -*- mode: ruby
|
1
|
+
# -*- mode: ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013 Kouhei
|
3
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -20,6 +20,12 @@ source "https://rubygems.org/"
|
|
20
20
|
|
21
21
|
gemspec
|
22
22
|
|
23
|
+
gem "bundler"
|
24
|
+
gem "packnga"
|
25
|
+
gem "rake"
|
26
|
+
gem "redcarpet"
|
27
|
+
gem "test-unit"
|
28
|
+
|
23
29
|
base_dir = File.dirname(__FILE__)
|
24
30
|
local_chupa_text_dir = File.join(base_dir, "..", "chupa-text")
|
25
31
|
if File.exist?(local_chupa_text_dir)
|
data/chupa-text-decomposer-html.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013-
|
3
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -22,7 +22,7 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-html"
|
25
|
-
spec.version = "1.0.
|
25
|
+
spec.version = "1.0.5"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
@@ -41,10 +41,4 @@ Gem::Specification.new do |spec|
|
|
41
41
|
|
42
42
|
spec.add_runtime_dependency("chupa-text")
|
43
43
|
spec.add_runtime_dependency("nokogiri")
|
44
|
-
|
45
|
-
spec.add_development_dependency("bundler")
|
46
|
-
spec.add_development_dependency("rake")
|
47
|
-
spec.add_development_dependency("test-unit")
|
48
|
-
spec.add_development_dependency("packnga")
|
49
|
-
spec.add_development_dependency("redcarpet")
|
50
44
|
end
|
data/doc/text/news.md
CHANGED
data/lib/chupa-text/decomposers/html.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,7 +14,6 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "nkf"
|
18
17
|
require "nokogiri"
|
19
18
|
|
20
19
|
module ChupaText
|
@@ -49,7 +48,7 @@ module ChupaText
|
|
49
48
|
doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
|
50
49
|
body_element = (doc % "body")
|
51
50
|
if body_element
|
52
|
-
body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
|
51
|
+
body = extract_text(body_element, +"").scrub.gsub(/^\s+|\s+$/, '')
|
53
52
|
else
|
54
53
|
body = ""
|
55
54
|
end
|
@@ -89,7 +88,7 @@ module ChupaText
|
|
89
88
|
if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
|
90
89
|
text.encoding.to_s
|
91
90
|
else
|
92
|
-
|
91
|
+
guess_encoding_heuristic(text)
|
93
92
|
end
|
94
93
|
end
|
95
94
|
end
|
@@ -105,8 +104,22 @@ module ChupaText
|
|
105
104
|
end
|
106
105
|
end
|
107
106
|
|
108
|
-
def
|
109
|
-
|
107
|
+
def guess_encoding_heuristic(text)
|
108
|
+
candidates = [
|
109
|
+
Encoding::EUC_JP,
|
110
|
+
Encoding::WINDOWS_31J,
|
111
|
+
Encoding::UTF_16BE,
|
112
|
+
Encoding::UTF_16LE,
|
113
|
+
]
|
114
|
+
candidates.each do |candidate|
|
115
|
+
begin
|
116
|
+
text.encode(Encoding::UTF_8, candidate)
|
117
|
+
rescue EncodingError
|
118
|
+
else
|
119
|
+
return candidate.name
|
120
|
+
end
|
121
|
+
end
|
122
|
+
"UTF-8"
|
110
123
|
end
|
111
124
|
|
112
125
|
def extract_text(element, text)
|
data/test/run-test.rb
CHANGED
data/test/test-html.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -188,7 +188,7 @@ class TestHTML < Test::Unit::TestCase
|
|
188
188
|
|
189
189
|
sub_test_case("detect") do
|
190
190
|
def test_nothing
|
191
|
-
@data.body = <<-HTML.force_encoding("UTF-8")
|
191
|
+
@data.body = <<-HTML.dup.force_encoding("UTF-8")
|
192
192
|
<html>
|
193
193
|
<body>Hello</body>
|
194
194
|
</html>
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2024-09-22 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: chupa-text
|
@@ -38,76 +37,6 @@ dependencies:
|
|
38
37
|
- - ">="
|
39
38
|
- !ruby/object:Gem::Version
|
40
39
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: bundler
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rake
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: test-unit
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: packnga
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: redcarpet
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :development
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
40
|
description: |
|
112
41
|
This is a ChupaText decomposer plugin for to extract text and
|
113
42
|
meta-data from HTML.
|
@@ -133,7 +62,6 @@ homepage: https://github.com/ranguba/chupa-text-decomposer-html
|
|
133
62
|
licenses:
|
134
63
|
- LGPL-2.1+
|
135
64
|
metadata: {}
|
136
|
-
post_install_message:
|
137
65
|
rdoc_options: []
|
138
66
|
require_paths:
|
139
67
|
- lib
|
@@ -148,9 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
148
76
|
- !ruby/object:Gem::Version
|
149
77
|
version: '0'
|
150
78
|
requirements: []
|
151
|
-
|
152
|
-
rubygems_version: 2.5.2
|
153
|
-
signing_key:
|
79
|
+
rubygems_version: 3.6.0.dev
|
154
80
|
specification_version: 4
|
155
81
|
summary: This is a ChupaText decomposer plugin for to extract text and meta-data from
|
156
82
|
HTML.
|