chupa-text-decomposer-html 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5b340a505f73aa5bcc6613a55b6fcec5e7988bc4
4
- data.tar.gz: 3f4e1ef2b30decee069f76521753f6533a51024b
2
+ SHA256:
3
+ metadata.gz: e5d82df61c624d47a350142691c76d2dbab98fbd90123501b0a6693cd0a3c496
4
+ data.tar.gz: 1c7df05103ac4d448ce48927b991cd750048754f2a5a236b79a2c61e87603691
5
5
  SHA512:
6
- metadata.gz: 51f94a9bd3eb45765aa4518f8415ec82ec235f53e99e75e4ec94afb45721869587e9ad558fc7bdfeb05fca39ed76f24d08d770428b0a775c613f8b0768e60b7c
7
- data.tar.gz: 409267fc2e80bc9cbc119443b6dee928ab72a7e1eb747c69cb8c781654fcd6977cf2d16663622c29f46e6c86923da0a9e8f911b82f73c8cb63e0722e9a130cda
6
+ metadata.gz: 47fb20db31734d563d250dab59f43d81afafd7cb6df2b2e717bc6130dd74d6d0e110783f14f66c8d4ec2ee4c93150fe943238b78d9b7702e8e9bf634174138f7
7
+ data.tar.gz: d52756b3cd3ff79fd471851a38e7774527850a48b42b7333e56bcadb1c8e5b3e952d100b91f3103feb4ba09d082900efb6fa394c94431e03cd8be7f9cc4c48ea
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
- # -*- mode: ruby; coding: utf-8 -*-
1
+ # -*- mode: ruby -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -20,6 +20,12 @@ source "https://rubygems.org/"
20
20
 
21
21
  gemspec
22
22
 
23
+ gem "bundler"
24
+ gem "packnga"
25
+ gem "rake"
26
+ gem "redcarpet"
27
+ gem "test-unit"
28
+
23
29
  base_dir = File.dirname(__FILE__)
24
30
  local_chupa_text_dir = File.join(base_dir, "..", "chupa-text")
25
31
  if File.exist?(local_chupa_text_dir)
@@ -1,6 +1,6 @@
1
1
  # -*- ruby -*-
2
2
  #
3
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -22,7 +22,7 @@ end
22
22
 
23
23
  Gem::Specification.new do |spec|
24
24
  spec.name = "chupa-text-decomposer-html"
25
- spec.version = "1.0.3"
25
+ spec.version = "1.0.5"
26
26
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
27
27
  spec.authors = ["Kouhei Sutou"]
28
28
  spec.email = ["kou@clear-code.com"]
@@ -41,10 +41,4 @@ Gem::Specification.new do |spec|
41
41
 
42
42
  spec.add_runtime_dependency("chupa-text")
43
43
  spec.add_runtime_dependency("nokogiri")
44
-
45
- spec.add_development_dependency("bundler")
46
- spec.add_development_dependency("rake")
47
- spec.add_development_dependency("test-unit")
48
- spec.add_development_dependency("packnga")
49
- spec.add_development_dependency("redcarpet")
50
44
  end
data/doc/text/news.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # News
2
2
 
3
+ ## 1.0.5: 2024-09-22
4
+
5
+ ### Fixes
6
+
7
+ * Fixed typos.
8
+
9
+ ## 1.0.4: 2024-09-22
10
+
11
+ ### Improvements
12
+
13
+ * Removed NKF dependency.
14
+
3
15
  ## 1.0.3: 2017-07-10
4
16
 
5
17
  ### Improvements
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -14,7 +14,6 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "nkf"
18
17
  require "nokogiri"
19
18
 
20
19
  module ChupaText
@@ -49,7 +48,7 @@ module ChupaText
49
48
  doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
50
49
  body_element = (doc % "body")
51
50
  if body_element
52
- body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
51
+ body = extract_text(body_element, +"").scrub.gsub(/^\s+|\s+$/, '')
53
52
  else
54
53
  body = ""
55
54
  end
@@ -89,7 +88,7 @@ module ChupaText
89
88
  if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
90
89
  text.encoding.to_s
91
90
  else
92
- guess_encoding_nkf(text)
91
+ guess_encoding_heuristic(text)
93
92
  end
94
93
  end
95
94
  end
@@ -105,8 +104,22 @@ module ChupaText
105
104
  end
106
105
  end
107
106
 
108
- def guess_encoding_nkf(text)
109
- NKF.guess(text).name
107
+ def guess_encoding_heuristic(text)
108
+ candidates = [
109
+ Encoding::EUC_JP,
110
+ Encoding::WINDOWS_31J,
111
+ Encoding::UTF_16BE,
112
+ Encoding::UTF_16LE,
113
+ ]
114
+ candidates.each do |candidate|
115
+ begin
116
+ text.encode(Encoding::UTF_8, candidate)
117
+ rescue EncodingError
118
+ else
119
+ return candidate.name
120
+ end
121
+ end
122
+ "UTF-8"
110
123
  end
111
124
 
112
125
  def extract_text(element, text)
data/test/run-test.rb CHANGED
@@ -26,4 +26,4 @@ require "chupa-text"
26
26
 
27
27
  ChupaText::Decomposers.load
28
28
 
29
- exit(Test::Unit::AutoRunner.run(true))
29
+ exit(Test::Unit::AutoRunner.run(true, __dir__))
data/test/test-html.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -188,7 +188,7 @@ class TestHTML < Test::Unit::TestCase
188
188
 
189
189
  sub_test_case("detect") do
190
190
  def test_nothing
191
- @data.body = <<-HTML.force_encoding("UTF-8")
191
+ @data.body = <<-HTML.dup.force_encoding("UTF-8")
192
192
  <html>
193
193
  <body>Hello</body>
194
194
  </html>
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text-decomposer-html
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2017-07-10 00:00:00.000000000 Z
10
+ date: 2024-09-22 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: chupa-text
@@ -38,76 +37,6 @@ dependencies:
38
37
  - - ">="
39
38
  - !ruby/object:Gem::Version
40
39
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: bundler
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: rake
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: test-unit
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: packnga
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :development
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- - !ruby/object:Gem::Dependency
98
- name: redcarpet
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :development
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
40
  description: |
112
41
  This is a ChupaText decomposer plugin for to extract text and
113
42
  meta-data from HTML.
@@ -133,7 +62,6 @@ homepage: https://github.com/ranguba/chupa-text-decomposer-html
133
62
  licenses:
134
63
  - LGPL-2.1+
135
64
  metadata: {}
136
- post_install_message:
137
65
  rdoc_options: []
138
66
  require_paths:
139
67
  - lib
@@ -148,9 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
148
76
  - !ruby/object:Gem::Version
149
77
  version: '0'
150
78
  requirements: []
151
- rubyforge_project:
152
- rubygems_version: 2.5.2
153
- signing_key:
79
+ rubygems_version: 3.6.0.dev
154
80
  specification_version: 4
155
81
  summary: This is a ChupaText decomposer plugin for to extract text and meta-data from
156
82
  HTML.