chupa-text-decomposer-html 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 75be13613ba030b332f565e1e569c9e15a75e111
4
- data.tar.gz: 0ad922b353bffb19819340d6ffb70ce739e51fc0
3
+ metadata.gz: '068db7f654ca70a8f65209c3d2f0d2fa1edf45ff'
4
+ data.tar.gz: 715d8630a7e69c100ed38a8bb090c5282b62fe2d
5
5
  SHA512:
6
- metadata.gz: 5f00a62cb2156eab85ab87e4a1f50aee301fe17d831965cc1a2b7fe0b6bbde6b67f8b108a16999d6ecad3df7349b84a1d8ef7103901cfdfde0ba1ad45c5c8235
7
- data.tar.gz: 975b69746801761f2872921e83c95bd1c309f01a6ae9703be404ec81b1b92cb359049dab3aa74cc37d603d0c71734fa30ed29715489298c0c70514f8b7534e6f
6
+ metadata.gz: ab46f697f57427a940bd7968391648f54e2edbfab77974bc651ebe155f8dbfb6bbb9f8922d7153ee306ccb0402d30957a05fefe839437ba3ff28037f9a9e6ab0
7
+ data.tar.gz: 932251f709b54256f6478d4e7103f1b483c9c83166a277c6ee4b67b24c1528389ef1602ec960536de9b727cfcfa771e323d3467e9573911486f752e715ca5fcb
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  # -*- mode: ruby; coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -34,7 +34,9 @@ end
34
34
  helper.install
35
35
  spec = helper.gemspec
36
36
 
37
- Packnga::DocumentTask.new(spec) do
37
+ Packnga::DocumentTask.new(spec) do |task|
38
+ task.original_language = "en"
39
+ task.translate_language = "ja"
38
40
  end
39
41
 
40
42
  Packnga::ReleaseTask.new(spec) do
@@ -1,6 +1,6 @@
1
- # -*- mode: ruby; coding: utf-8 -*-
1
+ # -*- ruby -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -22,7 +22,7 @@ end
22
22
 
23
23
  Gem::Specification.new do |spec|
24
24
  spec.name = "chupa-text-decomposer-html"
25
- spec.version = "1.0.1"
25
+ spec.version = "1.0.2"
26
26
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
27
27
  spec.authors = ["Kouhei Sutou"]
28
28
  spec.email = ["kou@clear-code.com"]
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  description = clean_white_space.call(entries[entries.index("Description") + 1])
32
32
  spec.summary = description.split(/\n\n+/, 2).first
33
33
  spec.description = description
34
- spec.license = "LGPLv2.1 or later"
34
+ spec.license = "LGPL-2.1+"
35
35
  spec.files = ["#{spec.name}.gemspec"]
36
36
  spec.files += ["README.md", "LICENSE.txt", "Rakefile", "Gemfile"]
37
37
  spec.files += [".yardopts"]
@@ -1,5 +1,11 @@
1
1
  # News
2
2
 
3
+ ## 1.0.2: 2017-07-05
4
+
5
+ * Support content based HTML detection.
6
+
7
+ * Ignore common contents.
8
+
3
9
  ## 1.0.1: 2014-02-18
4
10
 
5
11
  * Support chupa-text 1.0.4.
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -28,8 +28,16 @@ module ChupaText
28
28
  "application/xhtml+xml",
29
29
  ]
30
30
  def target?(data)
31
- TARGET_EXTENSIONS.include?(data.extension) or
32
- TARGET_MIME_TYPES.include?(data.mime_type)
31
+ return true if TARGET_EXTENSIONS.include?(data.extension)
32
+ return true if TARGET_MIME_TYPES.include?(data.mime_type)
33
+
34
+ body = data.body
35
+ return false if body.nil?
36
+
37
+ return true if body.start_with?("<!DOCTYPE html ")
38
+ return true if body.start_with?("<html")
39
+
40
+ false
33
41
  end
34
42
 
35
43
  def decompose(data)
@@ -37,13 +45,11 @@ module ChupaText
37
45
  doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
38
46
  body_element = (doc % "body")
39
47
  if body_element
40
- body = body_element.text.gsub(/^\s+|\s+$/, '')
48
+ body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
41
49
  else
42
50
  body = ""
43
51
  end
44
- decomposed_data = TextData.new(body)
45
- decomposed_data.uri = data.uri
46
-
52
+ decomposed_data = TextData.new(body, :source_data => data)
47
53
  attributes = decomposed_data.attributes
48
54
  title_element = (doc % "head/title")
49
55
  attributes.title = title_element.text if title_element
@@ -55,6 +61,10 @@ module ChupaText
55
61
 
56
62
  private
57
63
  def guess_encoding(text)
64
+ unless text.encoding.ascii_compatible?
65
+ return text.encoding.name
66
+ end
67
+
58
68
  case text
59
69
  when /\A<\?xml.+?encoding=(['"])([a-zA-Z0-9_-]+)\1/
60
70
  $2
@@ -72,7 +82,11 @@ module ChupaText
72
82
  charset = $2
73
83
  normalize_charset(charset)
74
84
  else
75
- guess_encoding_nkf(text)
85
+ if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
86
+ text.encoding.to_s
87
+ else
88
+ guess_encoding_nkf(text)
89
+ end
76
90
  end
77
91
  end
78
92
 
@@ -90,6 +104,105 @@ module ChupaText
90
104
  def guess_encoding_nkf(text)
91
105
  NKF.guess(text).name
92
106
  end
107
+
108
+ def extract_text(element, text)
109
+ name = element.name.downcase
110
+ classes = (element["class"] || "").split
111
+ return text if noindex_element?(element, name, classes)
112
+ return text if header_element?(element, name, classes)
113
+ return text if footer_element?(element, name, classes)
114
+ return text if navigation_element?(element, name, classes)
115
+
116
+ element.children.each do |child|
117
+ case child
118
+ when Nokogiri::XML::Text
119
+ text << child.text
120
+ when Nokogiri::XML::Element
121
+ extract_text(child, text)
122
+ end
123
+ end
124
+
125
+ text
126
+ end
127
+
128
+ def noindex_element?(element, name, classes)
129
+ case name
130
+ when "script", "noscript", "link", "style"
131
+ return true
132
+ end
133
+
134
+ classes.each do |klass|
135
+ case klass
136
+ when "noindex", "robots-noindex"
137
+ return true
138
+ end
139
+ end
140
+
141
+ false
142
+ end
143
+
144
+ def header_element?(element, name, classes)
145
+ case name
146
+ when "header", "nav"
147
+ return true
148
+ end
149
+
150
+ classes.each do |klass|
151
+ case klass
152
+ when "header"
153
+ return true
154
+ end
155
+ end
156
+
157
+ case element["id"]
158
+ when "header"
159
+ return true
160
+ end
161
+
162
+ false
163
+ end
164
+
165
+ def footer_element?(element, name, classes)
166
+ case name
167
+ when "footer"
168
+ return true
169
+ end
170
+
171
+ classes.each do |klass|
172
+ case klass
173
+ when "footer"
174
+ return true
175
+ end
176
+ end
177
+
178
+ case element["id"]
179
+ when "footer"
180
+ return true
181
+ end
182
+
183
+ false
184
+ end
185
+
186
+ def navigation_element?(element, name, classes)
187
+ case name
188
+ when "nav"
189
+ return true
190
+ end
191
+
192
+ classes.each do |klass|
193
+ case klass
194
+ when "nav", "menu"
195
+ return true
196
+ end
197
+ end
198
+
199
+ case element["id"]
200
+ when "nav", "menu"
201
+ return true
202
+ end
203
+
204
+ false
205
+ end
93
206
  end
94
207
  end
95
208
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2014 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -51,6 +51,12 @@ class TestHTML < Test::Unit::TestCase
51
51
  def test_txt
52
52
  assert_false(@decomposer.target?(create_data("index.txt")))
53
53
  end
54
+
55
+ def test_php
56
+ assert do
57
+ not @decomposer.target?(create_data("index.php"))
58
+ end
59
+ end
54
60
  end
55
61
 
56
62
  sub_test_case("mime-type") do
@@ -72,6 +78,26 @@ class TestHTML < Test::Unit::TestCase
72
78
  assert_false(@decomposer.target?(create_data("text/plain")))
73
79
  end
74
80
  end
81
+
82
+ sub_test_case("content") do
83
+ def create_data(body)
84
+ data = ChupaText::Data.new
85
+ data.body = body
86
+ data
87
+ end
88
+
89
+ def test_doctype_html
90
+ assert do
91
+ @decomposer.target?(create_data("<!DOCTYPE html "))
92
+ end
93
+ end
94
+
95
+ def test_html
96
+ assert do
97
+ @decomposer.target?(create_data("<html"))
98
+ end
99
+ end
100
+ end
75
101
  end
76
102
 
77
103
  sub_test_case("decompose") do
@@ -138,12 +164,12 @@ class TestHTML < Test::Unit::TestCase
138
164
 
139
165
  sub_test_case("detect") do
140
166
  def test_nothing
141
- @data.body = <<-HTML
167
+ @data.body = <<-HTML.force_encoding("UTF-8")
142
168
  <html>
143
169
  <body>Hello</body>
144
170
  </html>
145
171
  HTML
146
- assert_equal([Encoding::US_ASCII], decompose(@data))
172
+ assert_equal([Encoding::UTF_8], decompose(@data))
147
173
  end
148
174
 
149
175
  def test_xml_declaration
@@ -181,6 +207,44 @@ class TestHTML < Test::Unit::TestCase
181
207
  HTML5
182
208
  assert_equal([Encoding::EUC_JP], decompose(@data))
183
209
  end
210
+
211
+ sub_test_case("not ascii_compatible?") do
212
+ def test_iso_2022_jp
213
+ @data.body = <<-ISO_2022_JP_HTML.encode("ISO-2022-JP")
214
+ <html>
215
+ <head>
216
+ <title>タイトル</title>
217
+ </head>
218
+ <body>Hello</body>
219
+ </html>
220
+ ISO_2022_JP_HTML
221
+ assert_equal([Encoding::ISO_2022_JP], decompose(@data))
222
+ end
223
+
224
+ def test_utf_32
225
+ @data.body = <<-UTF_32_HTML.encode("UTF-32")
226
+ <html>
227
+ <head>
228
+ <title>タイトル</title>
229
+ </head>
230
+ <body>Hello</body>
231
+ </html>
232
+ UTF_32_HTML
233
+ assert_equal([Encoding::UTF_32], decompose(@data))
234
+ end
235
+
236
+ def test_koi8_r
237
+ @data.body = <<-KOI8_R_HTML.encode("KOI8-R")
238
+ <html>
239
+ <head>
240
+ <title>название</title>
241
+ </head>
242
+ <body>Hello</body>
243
+ </html>
244
+ KOI8_R_HTML
245
+ assert_equal([Encoding::KOI8_R], decompose(@data))
246
+ end
247
+ end
184
248
  end
185
249
 
186
250
  sub_test_case("normalize") do
@@ -209,5 +273,189 @@ class TestHTML < Test::Unit::TestCase
209
273
  end
210
274
  end
211
275
  end
276
+
277
+ sub_test_case("body") do
278
+ def normalize_decomposed_data(decomposed_data)
279
+ decomposed_data.body
280
+ end
281
+
282
+ sub_test_case("noindex") do
283
+ def test_script
284
+ @data.body = <<-HTML
285
+ <html>
286
+ <body>Before<script>var x;</script>After</body>
287
+ </html>
288
+ HTML
289
+ assert_equal(["BeforeAfter"],
290
+ decompose(@data))
291
+ end
292
+
293
+ def test_noscript
294
+ @data.body = <<-HTML
295
+ <html>
296
+ <body>Before<noscript>Enable JavaScript!</noscript>After</body>
297
+ </html>
298
+ HTML
299
+ assert_equal(["BeforeAfter"],
300
+ decompose(@data))
301
+ end
302
+
303
+ def test_link
304
+ @data.body = <<-HTML
305
+ <html>
306
+ <body>Before<link rel="stylehseet">After</body>
307
+ </html>
308
+ HTML
309
+ assert_equal(["BeforeAfter"],
310
+ decompose(@data))
311
+ end
312
+
313
+ def test_style
314
+ @data.body = <<-HTML
315
+ <html>
316
+ <body>Before<style>a {color: "red";}</style>After</body>
317
+ </html>
318
+ HTML
319
+ assert_equal(["BeforeAfter"],
320
+ decompose(@data))
321
+ end
322
+
323
+ def test_noindex
324
+ @data.body = <<-HTML
325
+ <html>
326
+ <body>Before<div class="noindex">header</div>After</body>
327
+ </html>
328
+ HTML
329
+ assert_equal(["BeforeAfter"],
330
+ decompose(@data))
331
+ end
332
+
333
+ def test_robots_noindex
334
+ @data.body = <<-HTML
335
+ <html>
336
+ <body>Before<div class="robots-noindex">header</div>After</body>
337
+ </html>
338
+ HTML
339
+ assert_equal(["BeforeAfter"],
340
+ decompose(@data))
341
+ end
342
+ end
343
+
344
+ sub_test_case("header") do
345
+ def test_tag
346
+ @data.body = <<-HTML
347
+ <html>
348
+ <body>Before<header>header</header>After</body>
349
+ </html>
350
+ HTML
351
+ assert_equal(["BeforeAfter"],
352
+ decompose(@data))
353
+ end
354
+
355
+ def test_class
356
+ @data.body = <<-HTML
357
+ <html>
358
+ <body>Before<div class="header">header</div>After</body>
359
+ </html>
360
+ HTML
361
+ assert_equal(["BeforeAfter"],
362
+ decompose(@data))
363
+ end
364
+
365
+ def test_id
366
+ @data.body = <<-HTML
367
+ <html>
368
+ <body>Before<div id="header">header</div>After</body>
369
+ </html>
370
+ HTML
371
+ assert_equal(["BeforeAfter"],
372
+ decompose(@data))
373
+ end
374
+ end
375
+
376
+ sub_test_case("footer") do
377
+ def test_tag
378
+ @data.body = <<-HTML
379
+ <html>
380
+ <body>Before<footer>footer</footer>After</body>
381
+ </html>
382
+ HTML
383
+ assert_equal(["BeforeAfter"],
384
+ decompose(@data))
385
+ end
386
+
387
+ def test_class
388
+ @data.body = <<-HTML
389
+ <html>
390
+ <body>Before<div class="footer">footer</div>After</body>
391
+ </html>
392
+ HTML
393
+ assert_equal(["BeforeAfter"],
394
+ decompose(@data))
395
+ end
396
+
397
+ def test_id
398
+ @data.body = <<-HTML
399
+ <html>
400
+ <body>Before<div id="footer">footer</div>After</body>
401
+ </html>
402
+ HTML
403
+ assert_equal(["BeforeAfter"],
404
+ decompose(@data))
405
+ end
406
+ end
407
+
408
+ sub_test_case("navigation") do
409
+ def test_nav_tag
410
+ @data.body = <<-HTML
411
+ <html>
412
+ <body>Before<nav>nav</nav>After</body>
413
+ </html>
414
+ HTML
415
+ assert_equal(["BeforeAfter"],
416
+ decompose(@data))
417
+ end
418
+
419
+ def test_nav_class
420
+ @data.body = <<-HTML
421
+ <html>
422
+ <body>Before<div class="nav">nav</div>After</body>
423
+ </html>
424
+ HTML
425
+ assert_equal(["BeforeAfter"],
426
+ decompose(@data))
427
+ end
428
+
429
+ def test_menu_class
430
+ @data.body = <<-HTML
431
+ <html>
432
+ <body>Before<div class="menu">nav</div>After</body>
433
+ </html>
434
+ HTML
435
+ assert_equal(["BeforeAfter"],
436
+ decompose(@data))
437
+ end
438
+
439
+ def test_nav_id
440
+ @data.body = <<-HTML
441
+ <html>
442
+ <body>Before<div id="nav">nav</div>After</body>
443
+ </html>
444
+ HTML
445
+ assert_equal(["BeforeAfter"],
446
+ decompose(@data))
447
+ end
448
+
449
+ def test_menu_id
450
+ @data.body = <<-HTML
451
+ <html>
452
+ <body>Before<div id="menu">nav</div>After</body>
453
+ </html>
454
+ HTML
455
+ assert_equal(["BeforeAfter"],
456
+ decompose(@data))
457
+ end
458
+ end
459
+ end
212
460
  end
213
461
  end
metadata CHANGED
@@ -1,111 +1,111 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text-decomposer-html
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-17 00:00:00.000000000 Z
11
+ date: 2017-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: chupa-text
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '>='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: test-unit
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>='
73
+ - - ">="
74
74
  - !ruby/object:Gem::Version
75
75
  version: '0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: packnga
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - '>='
87
+ - - ">="
88
88
  - !ruby/object:Gem::Version
89
89
  version: '0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - '>='
94
+ - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: redcarpet
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - '>='
101
+ - - ">="
102
102
  - !ruby/object:Gem::Version
103
103
  version: '0'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - '>='
108
+ - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
111
  description: |
@@ -119,19 +119,19 @@ executables: []
119
119
  extensions: []
120
120
  extra_rdoc_files: []
121
121
  files:
122
- - chupa-text-decomposer-html.gemspec
123
- - README.md
122
+ - ".yardopts"
123
+ - Gemfile
124
124
  - LICENSE.txt
125
+ - README.md
125
126
  - Rakefile
126
- - Gemfile
127
- - .yardopts
128
- - lib/chupa-text/decomposers/html.rb
127
+ - chupa-text-decomposer-html.gemspec
129
128
  - doc/text/news.md
129
+ - lib/chupa-text/decomposers/html.rb
130
130
  - test/run-test.rb
131
131
  - test/test-html.rb
132
132
  homepage: https://github.com/ranguba/chupa-text-decomposer-html
133
133
  licenses:
134
- - LGPLv2.1 or later
134
+ - LGPL-2.1+
135
135
  metadata: {}
136
136
  post_install_message:
137
137
  rdoc_options: []
@@ -139,20 +139,19 @@ require_paths:
139
139
  - lib
140
140
  required_ruby_version: !ruby/object:Gem::Requirement
141
141
  requirements:
142
- - - '>='
142
+ - - ">="
143
143
  - !ruby/object:Gem::Version
144
144
  version: '0'
145
145
  required_rubygems_version: !ruby/object:Gem::Requirement
146
146
  requirements:
147
- - - '>='
147
+ - - ">="
148
148
  - !ruby/object:Gem::Version
149
149
  version: '0'
150
150
  requirements: []
151
151
  rubyforge_project:
152
- rubygems_version: 2.0.14
152
+ rubygems_version: 2.5.2
153
153
  signing_key:
154
154
  specification_version: 4
155
155
  summary: This is a ChupaText decomposer plugin for to extract text and meta-data from
156
156
  HTML.
157
157
  test_files: []
158
- has_rdoc: