chupa-text-decomposer-html 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/chupa-text-decomposer-html.gemspec +3 -2
- data/doc/text/news.md +4 -0
- data/lib/chupa-text/decomposers/html.rb +5 -2
- data/test/test-html.rb +8 -8
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 75be13613ba030b332f565e1e569c9e15a75e111
|
4
|
+
data.tar.gz: 0ad922b353bffb19819340d6ffb70ce739e51fc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f00a62cb2156eab85ab87e4a1f50aee301fe17d831965cc1a2b7fe0b6bbde6b67f8b108a16999d6ecad3df7349b84a1d8ef7103901cfdfde0ba1ad45c5c8235
|
7
|
+
data.tar.gz: 975b69746801761f2872921e83c95bd1c309f01a6ae9703be404ec81b1b92cb359049dab3aa74cc37d603d0c71734fa30ed29715489298c0c70514f8b7534e6f
|
@@ -22,14 +22,15 @@ end
|
|
22
22
|
|
23
23
|
Gem::Specification.new do |spec|
|
24
24
|
spec.name = "chupa-text-decomposer-html"
|
25
|
-
spec.version = "1.0.
|
25
|
+
spec.version = "1.0.1"
|
26
26
|
spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-html"
|
27
27
|
spec.authors = ["Kouhei Sutou"]
|
28
28
|
spec.email = ["kou@clear-code.com"]
|
29
29
|
readme = File.read("README.md", :encoding => "UTF-8")
|
30
30
|
entries = readme.split(/^\#\#\s(.*)$/)
|
31
31
|
description = clean_white_space.call(entries[entries.index("Description") + 1])
|
32
|
-
spec.summary
|
32
|
+
spec.summary = description.split(/\n\n+/, 2).first
|
33
|
+
spec.description = description
|
33
34
|
spec.license = "LGPLv2.1 or later"
|
34
35
|
spec.files = ["#{spec.name}.gemspec"]
|
35
36
|
spec.files += ["README.md", "LICENSE.txt", "Rakefile", "Gemfile"]
|
data/doc/text/news.md
CHANGED
@@ -43,10 +43,13 @@ module ChupaText
|
|
43
43
|
end
|
44
44
|
decomposed_data = TextData.new(body)
|
45
45
|
decomposed_data.uri = data.uri
|
46
|
+
|
47
|
+
attributes = decomposed_data.attributes
|
46
48
|
title_element = (doc % "head/title")
|
47
|
-
|
49
|
+
attributes.title = title_element.text if title_element
|
48
50
|
encoding = doc.encoding
|
49
|
-
|
51
|
+
attributes.encoding = encoding if encoding
|
52
|
+
|
50
53
|
yield(decomposed_data)
|
51
54
|
end
|
52
55
|
|
data/test/test-html.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2014 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -143,7 +143,7 @@ class TestHTML < Test::Unit::TestCase
|
|
143
143
|
<body>Hello</body>
|
144
144
|
</html>
|
145
145
|
HTML
|
146
|
-
assert_equal([
|
146
|
+
assert_equal([Encoding::US_ASCII], decompose(@data))
|
147
147
|
end
|
148
148
|
|
149
149
|
def test_xml_declaration
|
@@ -155,7 +155,7 @@ class TestHTML < Test::Unit::TestCase
|
|
155
155
|
<body>Hello</body>
|
156
156
|
</html>
|
157
157
|
XHTML
|
158
|
-
assert_equal([
|
158
|
+
assert_equal([Encoding::Shift_JIS], decompose(@data))
|
159
159
|
end
|
160
160
|
|
161
161
|
def test_content_type
|
@@ -167,7 +167,7 @@ class TestHTML < Test::Unit::TestCase
|
|
167
167
|
<body>Hello</body>
|
168
168
|
</html>
|
169
169
|
HTML
|
170
|
-
assert_equal([
|
170
|
+
assert_equal([Encoding::EUC_JP], decompose(@data))
|
171
171
|
end
|
172
172
|
|
173
173
|
def test_meta_charset
|
@@ -179,7 +179,7 @@ class TestHTML < Test::Unit::TestCase
|
|
179
179
|
<body>Hello</body>
|
180
180
|
</html>
|
181
181
|
HTML5
|
182
|
-
assert_equal([
|
182
|
+
assert_equal([Encoding::EUC_JP], decompose(@data))
|
183
183
|
end
|
184
184
|
end
|
185
185
|
|
@@ -197,15 +197,15 @@ class TestHTML < Test::Unit::TestCase
|
|
197
197
|
end
|
198
198
|
|
199
199
|
def test_x_sjis
|
200
|
-
assert_equal([
|
200
|
+
assert_equal([Encoding::WINDOWS_31J], decompose("x-sjis"))
|
201
201
|
end
|
202
202
|
|
203
203
|
def test_shift_jis_hyphen
|
204
|
-
assert_equal([
|
204
|
+
assert_equal([Encoding::WINDOWS_31J], decompose("Shift-JIS"))
|
205
205
|
end
|
206
206
|
|
207
207
|
def test_shift_jis_under_score
|
208
|
-
assert_equal([
|
208
|
+
assert_equal([Encoding::WINDOWS_31J], decompose("Shift_JIS"))
|
209
209
|
end
|
210
210
|
end
|
211
211
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text-decomposer-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-02-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chupa-text
|
@@ -109,6 +109,9 @@ dependencies:
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
description: |
|
112
|
+
This is a ChupaText decomposer plugin for to extract text and
|
113
|
+
meta-data from HTML.
|
114
|
+
|
112
115
|
You can use `html` decomposer.
|
113
116
|
email:
|
114
117
|
- kou@clear-code.com
|