chupa-text-decomposer-html 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: '068db7f654ca70a8f65209c3d2f0d2fa1edf45ff'
4
- data.tar.gz: 715d8630a7e69c100ed38a8bb090c5282b62fe2d
2
+ SHA256:
3
+ metadata.gz: d18136db785fa061ef0bea3f17f8826a1ff55ed5020a591926e1700b91b9df38
4
+ data.tar.gz: 2c173149ac68d34756944ce98caa32f1dbc5bba4f86dab0461505bb90a5d406f
5
5
  SHA512:
6
- metadata.gz: ab46f697f57427a940bd7968391648f54e2edbfab77974bc651ebe155f8dbfb6bbb9f8922d7153ee306ccb0402d30957a05fefe839437ba3ff28037f9a9e6ab0
7
- data.tar.gz: 932251f709b54256f6478d4e7103f1b483c9c83166a277c6ee4b67b24c1528389ef1602ec960536de9b727cfcfa771e323d3467e9573911486f752e715ca5fcb
6
+ metadata.gz: 692141e0ed3d3d92729de8c47d62fa78ad6bc571070d293cdd8a7865e2d2366d82d68ef83121300dd92a82193b3db3ca83b7865ad6e0212c7e1b26e698830b13
7
+ data.tar.gz: 061e659f770c63f304cc7b697b6f3c512ea0c975eac35d55a9de9ad434820f056477275e4a240e058a42ace0187b1240290050bf83984049593003e309694411
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
- # -*- mode: ruby; coding: utf-8 -*-
1
+ # -*- mode: ruby -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -20,6 +20,12 @@ source "https://rubygems.org/"
20
20
 
21
21
  gemspec
22
22
 
23
+ gem "bundler"
24
+ gem "packnga"
25
+ gem "rake"
26
+ gem "redcarpet"
27
+ gem "test-unit"
28
+
23
29
  base_dir = File.dirname(__FILE__)
24
30
  local_chupa_text_dir = File.join(base_dir, "..", "chupa-text")
25
31
  if File.exist?(local_chupa_text_dir)
@@ -1,6 +1,6 @@
1
1
  # -*- ruby -*-
2
2
  #
3
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -22,7 +22,7 @@ end
22
22
 
23
23
  Gem::Specification.new do |spec|
24
24
  spec.name = "chupa-text-decomposer-html"
25
- spec.version = "1.0.2"
25
+ spec.version = "1.0.4"
26
26
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
27
27
  spec.authors = ["Kouhei Sutou"]
28
28
  spec.email = ["kou@clear-code.com"]
@@ -41,10 +41,4 @@ Gem::Specification.new do |spec|
41
41
 
42
42
  spec.add_runtime_dependency("chupa-text")
43
43
  spec.add_runtime_dependency("nokogiri")
44
-
45
- spec.add_development_dependency("bundler")
46
- spec.add_development_dependency("rake")
47
- spec.add_development_dependency("test-unit")
48
- spec.add_development_dependency("packnga")
49
- spec.add_development_dependency("redcarpet")
50
44
  end
data/doc/text/news.md CHANGED
@@ -1,14 +1,32 @@
1
1
  # News
2
2
 
3
+ ## 1.0.4: 2024-09-22
4
+
5
+ ### Improvements
6
+
7
+ * Removed NKF dependency.
8
+
9
+ ## 1.0.3: 2017-07-10
10
+
11
+ ### Improvements
12
+
13
+ * Supported ignoring topic path content.
14
+
15
+ * Supported ignoring aside content.
16
+
17
+ ### Fixes
18
+
19
+ * Fixed an infinite loop bug.
20
+
3
21
  ## 1.0.2: 2017-07-05
4
22
 
5
- * Support content based HTML detection.
23
+ * Supported content based HTML detection.
6
24
 
7
- * Ignore common contents.
25
+ * Supported ignoring common contents.
8
26
 
9
27
  ## 1.0.1: 2014-02-18
10
28
 
11
- * Support chupa-text 1.0.4.
29
+ * Supported chupa-text 1.0.4.
12
30
 
13
31
  ## 1.0.0: 2014-01-05
14
32
 
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -14,7 +14,6 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "nkf"
18
17
  require "nokogiri"
19
18
 
20
19
  module ChupaText
@@ -28,6 +27,10 @@ module ChupaText
28
27
  "application/xhtml+xml",
29
28
  ]
30
29
  def target?(data)
30
+ (data["source-mime-types"] || []).each do |source_mime_type|
31
+ return false if TARGET_MIME_TYPES.include?(source_mime_type)
32
+ end
33
+
31
34
  return true if TARGET_EXTENSIONS.include?(data.extension)
32
35
  return true if TARGET_MIME_TYPES.include?(data.mime_type)
33
36
 
@@ -45,7 +48,7 @@ module ChupaText
45
48
  doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
46
49
  body_element = (doc % "body")
47
50
  if body_element
48
- body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
51
+ body = extract_text(body_element, +"").scrub.gsub(/^\s+|\s+$/, '')
49
52
  else
50
53
  body = ""
51
54
  end
@@ -85,7 +88,7 @@ module ChupaText
85
88
  if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
86
89
  text.encoding.to_s
87
90
  else
88
- guess_encoding_nkf(text)
91
+ guess_encoding_heuristic(text)
89
92
  end
90
93
  end
91
94
  end
@@ -101,8 +104,22 @@ module ChupaText
101
104
  end
102
105
  end
103
106
 
104
- def guess_encoding_nkf(text)
105
- NKF.guess(text).name
107
+ def guess_encoding_heuristic(text)
108
+ candidates = [
109
+ Encoding::EUC_JP,
110
+ Encoding::WINDOWS_31J,
111
+ Encoding::UTF16_BE,
112
+ Encoding::UTF16_LE,
113
+ ]
114
+ candidates.each do |candidate|
115
+ begin
116
+ text.encode(Encoding::UTF_8, candidate)
117
+ rescue EncodingError
118
+ else
119
+ return candidate.name
120
+ end
121
+ end
122
+ "UTF-8"
106
123
  end
107
124
 
108
125
  def extract_text(element, text)
@@ -112,6 +129,7 @@ module ChupaText
112
129
  return text if header_element?(element, name, classes)
113
130
  return text if footer_element?(element, name, classes)
114
131
  return text if navigation_element?(element, name, classes)
132
+ return text if aside_element?(element, name, classes)
115
133
 
116
134
  element.children.each do |child|
117
135
  case child
@@ -143,7 +161,7 @@ module ChupaText
143
161
 
144
162
  def header_element?(element, name, classes)
145
163
  case name
146
- when "header", "nav"
164
+ when "header"
147
165
  return true
148
166
  end
149
167
 
@@ -191,13 +209,34 @@ module ChupaText
191
209
 
192
210
  classes.each do |klass|
193
211
  case klass
194
- when "nav", "menu"
212
+ when "nav", "menu", /\Atopic[-_]?path\z/i
213
+ return true
214
+ end
215
+ end
216
+
217
+ case element["id"]
218
+ when "nav", "menu", /\Atopic[-_]?path\z/i
219
+ return true
220
+ end
221
+
222
+ false
223
+ end
224
+
225
+ def aside_element?(element, name, classes)
226
+ case name
227
+ when "aside"
228
+ return true
229
+ end
230
+
231
+ classes.each do |klass|
232
+ case klass
233
+ when "aside"
195
234
  return true
196
235
  end
197
236
  end
198
237
 
199
238
  case element["id"]
200
- when "nav", "menu"
239
+ when "aside"
201
240
  return true
202
241
  end
203
242
 
data/test/run-test.rb CHANGED
@@ -26,4 +26,4 @@ require "chupa-text"
26
26
 
27
27
  ChupaText::Decomposers.load
28
28
 
29
- exit(Test::Unit::AutoRunner.run(true))
29
+ exit(Test::Unit::AutoRunner.run(true, __dir__))
data/test/test-html.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -28,6 +28,30 @@ class TestHTML < Test::Unit::TestCase
28
28
  end
29
29
 
30
30
  sub_test_case("target?") do
31
+ sub_test_case("source-mime-type") do
32
+ def create_data(uri, mime_type)
33
+ data = ChupaText::Data.new
34
+ data.body = ""
35
+ data.uri = uri
36
+ data["source-mime-types"] = [mime_type]
37
+ data
38
+ end
39
+
40
+ def test_text_html
41
+ data = create_data("index.html", "text/html")
42
+ assert do
43
+ not @decomposer.target?(data)
44
+ end
45
+ end
46
+
47
+ def test_application_xhtml_xml
48
+ data = create_data("index.html", "application/xhtml+xml")
49
+ assert do
50
+ not @decomposer.target?(data)
51
+ end
52
+ end
53
+ end
54
+
31
55
  sub_test_case("extension") do
32
56
  def create_data(uri)
33
57
  data = ChupaText::Data.new
@@ -164,7 +188,7 @@ class TestHTML < Test::Unit::TestCase
164
188
 
165
189
  sub_test_case("detect") do
166
190
  def test_nothing
167
- @data.body = <<-HTML.force_encoding("UTF-8")
191
+ @data.body = <<-HTML.dup.force_encoding("UTF-8")
168
192
  <html>
169
193
  <body>Hello</body>
170
194
  </html>
@@ -436,6 +460,23 @@ class TestHTML < Test::Unit::TestCase
436
460
  decompose(@data))
437
461
  end
438
462
 
463
+ def test_topic_path_class
464
+ @data.body = <<-HTML
465
+ <html>
466
+ <body>
467
+ Before
468
+ <div class="topic-path">topic-path</div>
469
+ <div class="topic_path">topic_path</div>
470
+ <div class="topicpath">topicpath</div>
471
+ <div class="TopicPath">TopicPath</div>
472
+ After
473
+ </body>
474
+ </html>
475
+ HTML
476
+ assert_equal(["Before\nAfter"],
477
+ decompose(@data))
478
+ end
479
+
439
480
  def test_nav_id
440
481
  @data.body = <<-HTML
441
482
  <html>
@@ -450,6 +491,55 @@ class TestHTML < Test::Unit::TestCase
450
491
  @data.body = <<-HTML
451
492
  <html>
452
493
  <body>Before<div id="menu">nav</div>After</body>
494
+ </html>
495
+ HTML
496
+ assert_equal(["BeforeAfter"],
497
+ decompose(@data))
498
+ end
499
+
500
+ def test_topic_path_id
501
+ @data.body = <<-HTML
502
+ <html>
503
+ <body>
504
+ Before
505
+ <div id="topic-path">topic-path</div>
506
+ <div id="topic_path">topic_path</div>
507
+ <div id="topicpath">topicpath</div>
508
+ <div id="TopicPath">TopicPath</div>
509
+ After
510
+ </body>
511
+ </html>
512
+ HTML
513
+ assert_equal(["Before\nAfter"],
514
+ decompose(@data))
515
+ end
516
+ end
517
+
518
+ sub_test_case("aside") do
519
+ def test_aside_tag
520
+ @data.body = <<-HTML
521
+ <html>
522
+ <body>Before<aside>aside</aside>After</body>
523
+ </html>
524
+ HTML
525
+ assert_equal(["BeforeAfter"],
526
+ decompose(@data))
527
+ end
528
+
529
+ def test_aside_class
530
+ @data.body = <<-HTML
531
+ <html>
532
+ <body>Before<div class="aside">aside</div>After</body>
533
+ </html>
534
+ HTML
535
+ assert_equal(["BeforeAfter"],
536
+ decompose(@data))
537
+ end
538
+
539
+ def test_aside_id
540
+ @data.body = <<-HTML
541
+ <html>
542
+ <body>Before<div id="aside">aside</div>After</body>
453
543
  </html>
454
544
  HTML
455
545
  assert_equal(["BeforeAfter"],
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text-decomposer-html
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2017-07-05 00:00:00.000000000 Z
10
+ date: 2024-09-22 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: chupa-text
@@ -38,76 +37,6 @@ dependencies:
38
37
  - - ">="
39
38
  - !ruby/object:Gem::Version
40
39
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: bundler
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: rake
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: test-unit
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: packnga
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :development
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- - !ruby/object:Gem::Dependency
98
- name: redcarpet
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :development
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
40
  description: |
112
41
  This is a ChupaText decomposer plugin to extract text and
113
42
  meta-data from HTML.
@@ -133,7 +62,6 @@ homepage: https://github.com/ranguba/chupa-text-decomposer-html
133
62
  licenses:
134
63
  - LGPL-2.1+
135
64
  metadata: {}
136
- post_install_message:
137
65
  rdoc_options: []
138
66
  require_paths:
139
67
  - lib
@@ -148,9 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
148
76
  - !ruby/object:Gem::Version
149
77
  version: '0'
150
78
  requirements: []
151
- rubyforge_project:
152
- rubygems_version: 2.5.2
153
- signing_key:
79
+ rubygems_version: 3.6.0.dev
154
80
  specification_version: 4
155
81
  summary: This is a ChupaText decomposer plugin to extract text and meta-data from
156
82
  HTML.