chupa-text-decomposer-html 1.0.3 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5b340a505f73aa5bcc6613a55b6fcec5e7988bc4
4
- data.tar.gz: 3f4e1ef2b30decee069f76521753f6533a51024b
2
+ SHA256:
3
+ metadata.gz: d18136db785fa061ef0bea3f17f8826a1ff55ed5020a591926e1700b91b9df38
4
+ data.tar.gz: 2c173149ac68d34756944ce98caa32f1dbc5bba4f86dab0461505bb90a5d406f
5
5
  SHA512:
6
- metadata.gz: 51f94a9bd3eb45765aa4518f8415ec82ec235f53e99e75e4ec94afb45721869587e9ad558fc7bdfeb05fca39ed76f24d08d770428b0a775c613f8b0768e60b7c
7
- data.tar.gz: 409267fc2e80bc9cbc119443b6dee928ab72a7e1eb747c69cb8c781654fcd6977cf2d16663622c29f46e6c86923da0a9e8f911b82f73c8cb63e0722e9a130cda
6
+ metadata.gz: 692141e0ed3d3d92729de8c47d62fa78ad6bc571070d293cdd8a7865e2d2366d82d68ef83121300dd92a82193b3db3ca83b7865ad6e0212c7e1b26e698830b13
7
+ data.tar.gz: 061e659f770c63f304cc7b697b6f3c512ea0c975eac35d55a9de9ad434820f056477275e4a240e058a42ace0187b1240290050bf83984049593003e309694411
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
- # -*- mode: ruby; coding: utf-8 -*-
1
+ # -*- mode: ruby -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -20,6 +20,12 @@ source "https://rubygems.org/"
20
20
 
21
21
  gemspec
22
22
 
23
+ gem "bundler"
24
+ gem "packnga"
25
+ gem "rake"
26
+ gem "redcarpet"
27
+ gem "test-unit"
28
+
23
29
  base_dir = File.dirname(__FILE__)
24
30
  local_chupa_text_dir = File.join(base_dir, "..", "chupa-text")
25
31
  if File.exist?(local_chupa_text_dir)
@@ -1,6 +1,6 @@
1
1
  # -*- ruby -*-
2
2
  #
3
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -22,7 +22,7 @@ end
22
22
 
23
23
  Gem::Specification.new do |spec|
24
24
  spec.name = "chupa-text-decomposer-html"
25
- spec.version = "1.0.3"
25
+ spec.version = "1.0.4"
26
26
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
27
27
  spec.authors = ["Kouhei Sutou"]
28
28
  spec.email = ["kou@clear-code.com"]
@@ -41,10 +41,4 @@ Gem::Specification.new do |spec|
41
41
 
42
42
  spec.add_runtime_dependency("chupa-text")
43
43
  spec.add_runtime_dependency("nokogiri")
44
-
45
- spec.add_development_dependency("bundler")
46
- spec.add_development_dependency("rake")
47
- spec.add_development_dependency("test-unit")
48
- spec.add_development_dependency("packnga")
49
- spec.add_development_dependency("redcarpet")
50
44
  end
data/doc/text/news.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # News
2
2
 
3
+ ## 1.0.4: 2024-09-22
4
+
5
+ ### Improvements
6
+
7
+ * Removed NKF dependency.
8
+
3
9
  ## 1.0.3: 2017-07-10
4
10
 
5
11
  ### Improvements
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -14,7 +14,6 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "nkf"
18
17
  require "nokogiri"
19
18
 
20
19
  module ChupaText
@@ -49,7 +48,7 @@ module ChupaText
49
48
  doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
50
49
  body_element = (doc % "body")
51
50
  if body_element
52
- body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
51
+ body = extract_text(body_element, +"").scrub.gsub(/^\s+|\s+$/, '')
53
52
  else
54
53
  body = ""
55
54
  end
@@ -89,7 +88,7 @@ module ChupaText
89
88
  if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
90
89
  text.encoding.to_s
91
90
  else
92
- guess_encoding_nkf(text)
91
+ guess_encoding_heuristic(text)
93
92
  end
94
93
  end
95
94
  end
@@ -105,8 +104,22 @@ module ChupaText
105
104
  end
106
105
  end
107
106
 
108
- def guess_encoding_nkf(text)
109
- NKF.guess(text).name
107
+ def guess_encoding_heuristic(text)
108
+ candidates = [
109
+ Encoding::EUC_JP,
110
+ Encoding::WINDOWS_31J,
111
+ Encoding::UTF16_BE,
112
+ Encoding::UTF16_LE,
113
+ ]
114
+ candidates.each do |candidate|
115
+ begin
116
+ text.encode(Encoding::UTF_8, candidate)
117
+ rescue EncodingError
118
+ else
119
+ return candidate.name
120
+ end
121
+ end
122
+ "UTF-8"
110
123
  end
111
124
 
112
125
  def extract_text(element, text)
data/test/run-test.rb CHANGED
@@ -26,4 +26,4 @@ require "chupa-text"
26
26
 
27
27
  ChupaText::Decomposers.load
28
28
 
29
- exit(Test::Unit::AutoRunner.run(true))
29
+ exit(Test::Unit::AutoRunner.run(true, __dir__))
data/test/test-html.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -188,7 +188,7 @@ class TestHTML < Test::Unit::TestCase
188
188
 
189
189
  sub_test_case("detect") do
190
190
  def test_nothing
191
- @data.body = <<-HTML.force_encoding("UTF-8")
191
+ @data.body = <<-HTML.dup.force_encoding("UTF-8")
192
192
  <html>
193
193
  <body>Hello</body>
194
194
  </html>
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text-decomposer-html
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2017-07-10 00:00:00.000000000 Z
10
+ date: 2024-09-22 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: chupa-text
@@ -38,76 +37,6 @@ dependencies:
38
37
  - - ">="
39
38
  - !ruby/object:Gem::Version
40
39
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: bundler
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: rake
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: test-unit
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: packnga
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :development
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- - !ruby/object:Gem::Dependency
98
- name: redcarpet
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :development
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
40
  description: |
112
41
  This is a ChupaText decomposer plugin to extract text and
113
42
  meta-data from HTML.
@@ -133,7 +62,6 @@ homepage: https://github.com/ranguba/chupa-text-decomposer-html
133
62
  licenses:
134
63
  - LGPL-2.1+
135
64
  metadata: {}
136
- post_install_message:
137
65
  rdoc_options: []
138
66
  require_paths:
139
67
  - lib
@@ -148,9 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
148
76
  - !ruby/object:Gem::Version
149
77
  version: '0'
150
78
  requirements: []
151
- rubyforge_project:
152
- rubygems_version: 2.5.2
153
- signing_key:
79
+ rubygems_version: 3.6.0.dev
154
80
  specification_version: 4
155
81
  summary: This is a ChupaText decomposer plugin to extract text and meta-data from
156
82
  HTML.