chupa-text-decomposer-html 1.0.3 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Gemfile +8 -2
- data/chupa-text-decomposer-html.gemspec +2 -8
- data/doc/text/news.md +6 -0
- data/lib/chupa-text/decomposers/html.rb +19 -6
- data/test/run-test.rb +1 -1
- data/test/test-html.rb +2 -2
- metadata +3 -77
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d18136db785fa061ef0bea3f17f8826a1ff55ed5020a591926e1700b91b9df38
|
4
|
+
data.tar.gz: 2c173149ac68d34756944ce98caa32f1dbc5bba4f86dab0461505bb90a5d406f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 692141e0ed3d3d92729de8c47d62fa78ad6bc571070d293cdd8a7865e2d2366d82d68ef83121300dd92a82193b3db3ca83b7865ad6e0212c7e1b26e698830b13
|
7
|
+
data.tar.gz: 061e659f770c63f304cc7b697b6f3c512ea0c975eac35d55a9de9ad434820f056477275e4a240e058a42ace0187b1240290050bf83984049593003e309694411
|
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
# -*- mode: ruby
|
1
|
+
# -*- mode: ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013 Kouhei
|
3
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -20,6 +20,12 @@ source "https://rubygems.org/"
|
|
20
20
|
|
21
21
|
gemspec
|
22
22
|
|
23
|
+
gem "bundler"
|
24
|
+
gem "packnga"
|
25
|
+
gem "rake"
|
26
|
+
gem "redcarpet"
|
27
|
+
gem "test-unit"
|
28
|
+
|
23
29
|
base_dir = File.dirname(__FILE__)
|
24
30
|
local_chupa_text_dir = File.join(base_dir, "..", "chupa-text")
|
25
31
|
if File.exist?(local_chupa_text_dir)
|
data/chupa-text-decomposer-html.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013-
|
3
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -22,7 +22,7 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-html"
|
25
|
-
spec.version = "1.0.3"
|
25
|
+
spec.version = "1.0.4"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
@@ -41,10 +41,4 @@ Gem::Specification.new do |spec|
|
|
41
41
|
|
42
42
|
spec.add_runtime_dependency("chupa-text")
|
43
43
|
spec.add_runtime_dependency("nokogiri")
|
44
|
-
|
45
|
-
spec.add_development_dependency("bundler")
|
46
|
-
spec.add_development_dependency("rake")
|
47
|
-
spec.add_development_dependency("test-unit")
|
48
|
-
spec.add_development_dependency("packnga")
|
49
|
-
spec.add_development_dependency("redcarpet")
|
50
44
|
end
|
data/doc/text/news.md
CHANGED
data/lib/chupa-text/decomposers/html.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,7 +14,6 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "nkf"
|
18
17
|
require "nokogiri"
|
19
18
|
|
20
19
|
module ChupaText
|
@@ -49,7 +48,7 @@ module ChupaText
|
|
49
48
|
doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
|
50
49
|
body_element = (doc % "body")
|
51
50
|
if body_element
|
52
|
-
body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
|
51
|
+
body = extract_text(body_element, +"").scrub.gsub(/^\s+|\s+$/, '')
|
53
52
|
else
|
54
53
|
body = ""
|
55
54
|
end
|
@@ -89,7 +88,7 @@ module ChupaText
|
|
89
88
|
if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
|
90
89
|
text.encoding.to_s
|
91
90
|
else
|
92
|
-
|
91
|
+
guess_encoding_heuristic(text)
|
93
92
|
end
|
94
93
|
end
|
95
94
|
end
|
@@ -105,8 +104,22 @@ module ChupaText
|
|
105
104
|
end
|
106
105
|
end
|
107
106
|
|
108
|
-
def
|
109
|
-
|
107
|
+
def guess_encoding_heuristic(text)
|
108
|
+
candidates = [
|
109
|
+
Encoding::EUC_JP,
|
110
|
+
Encoding::WINDOWS_31J,
|
111
|
+
Encoding::UTF16_BE,
|
112
|
+
Encoding::UTF16_LE,
|
113
|
+
]
|
114
|
+
candidates.each do |candidate|
|
115
|
+
begin
|
116
|
+
text.encode(Encoding::UTF_8, candidate)
|
117
|
+
rescue EncodingError
|
118
|
+
else
|
119
|
+
return candidate.name
|
120
|
+
end
|
121
|
+
end
|
122
|
+
"UTF-8"
|
110
123
|
end
|
111
124
|
|
112
125
|
def extract_text(element, text)
|
data/test/run-test.rb
CHANGED
data/test/test-html.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -188,7 +188,7 @@ class TestHTML < Test::Unit::TestCase
|
|
188
188
|
|
189
189
|
sub_test_case("detect") do
|
190
190
|
def test_nothing
|
191
|
-
@data.body = <<-HTML.force_encoding("UTF-8")
|
191
|
+
@data.body = <<-HTML.dup.force_encoding("UTF-8")
|
192
192
|
<html>
|
193
193
|
<body>Hello</body>
|
194
194
|
</html>
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2024-09-22 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: chupa-text
|
@@ -38,76 +37,6 @@ dependencies:
|
|
38
37
|
- - ">="
|
39
38
|
- !ruby/object:Gem::Version
|
40
39
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: bundler
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rake
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: test-unit
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: packnga
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: redcarpet
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :development
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
40
|
description: |
|
112
41
|
This is a ChupaText decomposer plugin for to extract text and
|
113
42
|
meta-data from HTML.
|
@@ -133,7 +62,6 @@ homepage: https://github.com/ranguba/chupa-text-decomposer-html
|
|
133
62
|
licenses:
|
134
63
|
- LGPL-2.1+
|
135
64
|
metadata: {}
|
136
|
-
post_install_message:
|
137
65
|
rdoc_options: []
|
138
66
|
require_paths:
|
139
67
|
- lib
|
@@ -148,9 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
148
76
|
- !ruby/object:Gem::Version
|
149
77
|
version: '0'
|
150
78
|
requirements: []
|
151
|
-
|
152
|
-
rubygems_version: 2.5.2
|
153
|
-
signing_key:
|
79
|
+
rubygems_version: 3.6.0.dev
|
154
80
|
specification_version: 4
|
155
81
|
summary: This is a ChupaText decomposer plugin for to extract text and meta-data from
|
156
82
|
HTML.
|