rubysl-nkf 1.1.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +3 -2
- data/ext/rubysl/nkf/nkf-utf8/config.h +20 -57
- data/ext/rubysl/nkf/nkf-utf8/nkf.c +5705 -5028
- data/ext/rubysl/nkf/nkf-utf8/nkf.h +192 -0
- data/ext/rubysl/nkf/nkf-utf8/utf8tbl.c +863 -609
- data/ext/rubysl/nkf/nkf-utf8/utf8tbl.h +27 -0
- data/ext/rubysl/nkf/nkf.c +127 -279
- data/lib/kconv.rb +101 -186
- data/lib/rubysl/nkf/version.rb +1 -1
- data/rubysl-nkf.gemspec +3 -1
- metadata +21 -19
data/lib/kconv.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# kconv.rb - Kanji Converter.
|
3
3
|
#
|
4
|
-
# $Id: kconv.rb
|
4
|
+
# $Id: kconv.rb 30112 2010-12-07 11:47:39Z naruse $
|
5
5
|
#
|
6
6
|
# ----
|
7
7
|
#
|
@@ -18,9 +18,9 @@ module Kconv
|
|
18
18
|
#
|
19
19
|
# Public Constants
|
20
20
|
#
|
21
|
-
|
21
|
+
|
22
22
|
#Constant of Encoding
|
23
|
-
|
23
|
+
|
24
24
|
# Auto-Detect
|
25
25
|
AUTO = NKF::AUTO
|
26
26
|
# ISO-2022-JP
|
@@ -44,85 +44,19 @@ module Kconv
|
|
44
44
|
# UNKNOWN
|
45
45
|
UNKNOWN = NKF::UNKNOWN
|
46
46
|
|
47
|
-
#
|
48
|
-
# Private Constants
|
49
|
-
#
|
50
|
-
|
51
|
-
# Revision of kconv.rb
|
52
|
-
REVISION = %q$Revision: 11708 $
|
53
|
-
|
54
|
-
#Regexp of Encoding
|
55
|
-
|
56
|
-
# Regexp of Shift_JIS string (private constant)
|
57
|
-
RegexpShiftjis = /\A(?:
|
58
|
-
[\x00-\x7f\xa1-\xdf] |
|
59
|
-
[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]
|
60
|
-
)*\z/nx
|
61
|
-
|
62
|
-
# Regexp of EUC-JP string (private constant)
|
63
|
-
RegexpEucjp = /\A(?:
|
64
|
-
[\x00-\x7f] |
|
65
|
-
\x8e [\xa1-\xdf] |
|
66
|
-
\x8f [\xa1-\xfe] [\xa1-\xfe] |
|
67
|
-
[\xa1-\xfe] [\xa1-\xfe]
|
68
|
-
)*\z/nx
|
69
|
-
|
70
|
-
# Regexp of UTF-8 string (private constant)
|
71
|
-
RegexpUtf8 = /\A(?:
|
72
|
-
[\x00-\x7f] |
|
73
|
-
[\xc2-\xdf] [\x80-\xbf] |
|
74
|
-
\xe0 [\xa0-\xbf] [\x80-\xbf] |
|
75
|
-
[\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
|
76
|
-
\xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
|
77
|
-
[\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
|
78
|
-
\xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
|
79
|
-
)*\z/nx
|
80
|
-
|
81
47
|
#
|
82
48
|
# Public Methods
|
83
49
|
#
|
84
|
-
|
50
|
+
|
85
51
|
# call-seq:
|
86
|
-
# Kconv.kconv(str,
|
87
|
-
#
|
88
|
-
# Convert <code>str</code> to
|
89
|
-
# <code>
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
# If you don't want to decode them, use NKF.nkf.
|
95
|
-
def kconv(str, out_code, in_code = AUTO)
|
96
|
-
opt = '-'
|
97
|
-
case in_code
|
98
|
-
when ::NKF::JIS
|
99
|
-
opt << 'J'
|
100
|
-
when ::NKF::EUC
|
101
|
-
opt << 'E'
|
102
|
-
when ::NKF::SJIS
|
103
|
-
opt << 'S'
|
104
|
-
when ::NKF::UTF8
|
105
|
-
opt << 'W'
|
106
|
-
when ::NKF::UTF16
|
107
|
-
opt << 'W16'
|
108
|
-
end
|
109
|
-
|
110
|
-
case out_code
|
111
|
-
when ::NKF::JIS
|
112
|
-
opt << 'j'
|
113
|
-
when ::NKF::EUC
|
114
|
-
opt << 'e'
|
115
|
-
when ::NKF::SJIS
|
116
|
-
opt << 's'
|
117
|
-
when ::NKF::UTF8
|
118
|
-
opt << 'w'
|
119
|
-
when ::NKF::UTF16
|
120
|
-
opt << 'w16'
|
121
|
-
when ::NKF::NOCONV
|
122
|
-
return str
|
123
|
-
end
|
124
|
-
|
125
|
-
opt = '' if opt == '-'
|
52
|
+
# Kconv.kconv(str, to_enc, from_enc=nil)
|
53
|
+
#
|
54
|
+
# Convert <code>str</code> to <code>to_enc</code>.
|
55
|
+
# <code>to_enc</code> and <code>from_enc</code> are given as constants of Kconv or Encoding objects.
|
56
|
+
def kconv(str, to_enc, from_enc=nil)
|
57
|
+
opt = ''
|
58
|
+
opt += ' --ic=' + from_enc.to_s if from_enc
|
59
|
+
opt += ' --oc=' + to_enc.to_s if to_enc
|
126
60
|
|
127
61
|
::NKF::nkf(opt, str)
|
128
62
|
end
|
@@ -133,235 +67,216 @@ module Kconv
|
|
133
67
|
#
|
134
68
|
|
135
69
|
# call-seq:
|
136
|
-
# Kconv.tojis(str)
|
70
|
+
# Kconv.tojis(str) => string
|
137
71
|
#
|
138
72
|
# Convert <code>str</code> to ISO-2022-JP
|
139
|
-
#
|
140
|
-
# *Note*
|
141
|
-
# This method decode MIME encoded string and
|
142
|
-
# convert halfwidth katakana to fullwidth katakana.
|
143
|
-
# If you don't want it, use NKF.nkf('-jxm0', str).
|
144
73
|
def tojis(str)
|
145
|
-
|
74
|
+
kconv(str, JIS)
|
146
75
|
end
|
147
76
|
module_function :tojis
|
148
77
|
|
149
78
|
# call-seq:
|
150
|
-
# Kconv.toeuc(str)
|
79
|
+
# Kconv.toeuc(str) => string
|
151
80
|
#
|
152
81
|
# Convert <code>str</code> to EUC-JP
|
153
|
-
#
|
154
|
-
# *Note*
|
155
|
-
# This method decode MIME encoded string and
|
156
|
-
# convert halfwidth katakana to fullwidth katakana.
|
157
|
-
# If you don't want it, use NKF.nkf('-exm0', str).
|
158
82
|
def toeuc(str)
|
159
|
-
|
83
|
+
kconv(str, EUC)
|
160
84
|
end
|
161
85
|
module_function :toeuc
|
162
86
|
|
163
87
|
# call-seq:
|
164
|
-
# Kconv.tosjis(str)
|
88
|
+
# Kconv.tosjis(str) => string
|
165
89
|
#
|
166
90
|
# Convert <code>str</code> to Shift_JIS
|
167
|
-
#
|
168
|
-
# *Note*
|
169
|
-
# This method decode MIME encoded string and
|
170
|
-
# convert halfwidth katakana to fullwidth katakana.
|
171
|
-
# If you don't want it, use NKF.nkf('-sxm0', str).
|
172
91
|
def tosjis(str)
|
173
|
-
|
92
|
+
kconv(str, SJIS)
|
174
93
|
end
|
175
94
|
module_function :tosjis
|
176
95
|
|
177
96
|
# call-seq:
|
178
|
-
# Kconv.toutf8(str)
|
97
|
+
# Kconv.toutf8(str) => string
|
179
98
|
#
|
180
99
|
# Convert <code>str</code> to UTF-8
|
181
|
-
#
|
182
|
-
# *Note*
|
183
|
-
# This method decode MIME encoded string and
|
184
|
-
# convert halfwidth katakana to fullwidth katakana.
|
185
|
-
# If you don't want it, use NKF.nkf('-wxm0', str).
|
186
100
|
def toutf8(str)
|
187
|
-
|
101
|
+
kconv(str, UTF8)
|
188
102
|
end
|
189
103
|
module_function :toutf8
|
190
104
|
|
191
105
|
# call-seq:
|
192
|
-
# Kconv.toutf16(str)
|
106
|
+
# Kconv.toutf16(str) => string
|
193
107
|
#
|
194
108
|
# Convert <code>str</code> to UTF-16
|
195
|
-
#
|
196
|
-
# *Note*
|
197
|
-
# This method decode MIME encoded string and
|
198
|
-
# convert halfwidth katakana to fullwidth katakana.
|
199
|
-
# If you don't want it, use NKF.nkf('-w16xm0', str).
|
200
109
|
def toutf16(str)
|
201
|
-
|
110
|
+
kconv(str, UTF16)
|
202
111
|
end
|
203
112
|
module_function :toutf16
|
204
113
|
|
114
|
+
# call-seq:
|
115
|
+
# Kconv.toutf32(str) => string
|
116
|
+
#
|
117
|
+
# Convert <code>str</code> to UTF-32
|
118
|
+
def toutf32(str)
|
119
|
+
kconv(str, UTF32)
|
120
|
+
end
|
121
|
+
module_function :toutf32
|
122
|
+
|
123
|
+
# call-seq:
|
124
|
+
# Kconv.tolocale => string
|
125
|
+
#
|
126
|
+
# Convert <code>self</code> to locale encoding
|
127
|
+
def tolocale(str)
|
128
|
+
kconv(str, Encoding.locale_charmap)
|
129
|
+
end
|
130
|
+
module_function :tolocale
|
131
|
+
|
205
132
|
#
|
206
133
|
# guess
|
207
134
|
#
|
208
135
|
|
209
136
|
# call-seq:
|
210
|
-
# Kconv.guess(str)
|
137
|
+
# Kconv.guess(str) => encoding
|
211
138
|
#
|
212
|
-
# Guess input encoding by NKF.
|
139
|
+
# Guess input encoding by NKF.guess
|
213
140
|
def guess(str)
|
214
141
|
::NKF::guess(str)
|
215
142
|
end
|
216
143
|
module_function :guess
|
217
144
|
|
218
|
-
# call-seq:
|
219
|
-
# Kconv.guess_old(str) -> integer
|
220
|
-
#
|
221
|
-
# Guess input encoding by NKF.guess1
|
222
|
-
def guess_old(str)
|
223
|
-
::NKF::guess1(str)
|
224
|
-
end
|
225
|
-
module_function :guess_old
|
226
|
-
|
227
145
|
#
|
228
146
|
# isEncoding
|
229
147
|
#
|
230
148
|
|
231
149
|
# call-seq:
|
232
|
-
# Kconv.iseuc(str)
|
150
|
+
# Kconv.iseuc(str) => true or false
|
233
151
|
#
|
234
152
|
# Returns whether input encoding is EUC-JP or not.
|
235
153
|
#
|
236
154
|
# *Note* don't expect this return value is MatchData.
|
237
155
|
def iseuc(str)
|
238
|
-
|
156
|
+
str.dup.force_encoding(EUC).valid_encoding?
|
239
157
|
end
|
240
158
|
module_function :iseuc
|
241
159
|
|
242
160
|
# call-seq:
|
243
|
-
# Kconv.issjis(str)
|
161
|
+
# Kconv.issjis(str) => true or false
|
244
162
|
#
|
245
163
|
# Returns whether input encoding is Shift_JIS or not.
|
246
|
-
#
|
247
|
-
# *Note* don't expect this return value is MatchData.
|
248
164
|
def issjis(str)
|
249
|
-
|
165
|
+
str.dup.force_encoding(SJIS).valid_encoding?
|
250
166
|
end
|
251
167
|
module_function :issjis
|
252
168
|
|
253
169
|
# call-seq:
|
254
|
-
# Kconv.
|
170
|
+
# Kconv.isjis(str) => true or false
|
171
|
+
#
|
172
|
+
# Returns whether input encoding is ISO-2022-JP or not.
|
173
|
+
def isjis(str)
|
174
|
+
/\A [\t\n\r\x20-\x7E]*
|
175
|
+
(?:
|
176
|
+
(?:\x1b \x28 I [\x21-\x7E]*
|
177
|
+
|\x1b \x28 J [\x21-\x7E]*
|
178
|
+
|\x1b \x24 @ (?:[\x21-\x7E]{2})*
|
179
|
+
|\x1b \x24 B (?:[\x21-\x7E]{2})*
|
180
|
+
|\x1b \x24 \x28 D (?:[\x21-\x7E]{2})*
|
181
|
+
)*
|
182
|
+
\x1b \x28 B [\t\n\r\x20-\x7E]*
|
183
|
+
)*
|
184
|
+
\z/nox =~ str.dup.force_encoding('BINARY') ? true : false
|
185
|
+
end
|
186
|
+
module_function :isjis
|
187
|
+
|
188
|
+
# call-seq:
|
189
|
+
# Kconv.isutf8(str) => true or false
|
255
190
|
#
|
256
191
|
# Returns whether input encoding is UTF-8 or not.
|
257
|
-
#
|
258
|
-
# *Note* don't expect this return value is MatchData.
|
259
192
|
def isutf8(str)
|
260
|
-
|
193
|
+
str.dup.force_encoding(UTF8).valid_encoding?
|
261
194
|
end
|
262
195
|
module_function :isutf8
|
263
|
-
|
264
196
|
end
|
265
197
|
|
266
198
|
class String
|
267
199
|
# call-seq:
|
268
|
-
# String#kconv(
|
200
|
+
# String#kconv(to_enc, from_enc)
|
269
201
|
#
|
270
|
-
# Convert <code>self</code> to
|
271
|
-
# <code>
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
# convert halfwidth katakana to fullwidth katakana.
|
276
|
-
# If you don't want to decode them, use NKF.nkf.
|
277
|
-
def kconv(out_code, in_code=Kconv::AUTO)
|
278
|
-
Kconv::kconv(self, out_code, in_code)
|
202
|
+
# Convert <code>self</code> to <code>to_enc</code>.
|
203
|
+
# <code>to_enc</code> and <code>from_enc</code> are given as constants of Kconv or Encoding objects.
|
204
|
+
def kconv(to_enc, from_enc=nil)
|
205
|
+
from_enc = self.encoding if !from_enc && self.encoding != Encoding.list[0]
|
206
|
+
Kconv::kconv(self, to_enc, from_enc)
|
279
207
|
end
|
280
|
-
|
208
|
+
|
281
209
|
#
|
282
210
|
# to Encoding
|
283
211
|
#
|
284
|
-
|
212
|
+
|
285
213
|
# call-seq:
|
286
|
-
# String#tojis
|
214
|
+
# String#tojis => string
|
287
215
|
#
|
288
216
|
# Convert <code>self</code> to ISO-2022-JP
|
289
|
-
#
|
290
|
-
# *Note*
|
291
|
-
# This method decode MIME encoded string and
|
292
|
-
# convert halfwidth katakana to fullwidth katakana.
|
293
|
-
# If you don't want it, use NKF.nkf('-jxm0', str).
|
294
217
|
def tojis; Kconv.tojis(self) end
|
295
218
|
|
296
219
|
# call-seq:
|
297
|
-
# String#toeuc
|
220
|
+
# String#toeuc => string
|
298
221
|
#
|
299
222
|
# Convert <code>self</code> to EUC-JP
|
300
|
-
#
|
301
|
-
# *Note*
|
302
|
-
# This method decode MIME encoded string and
|
303
|
-
# convert halfwidth katakana to fullwidth katakana.
|
304
|
-
# If you don't want it, use NKF.nkf('-exm0', str).
|
305
223
|
def toeuc; Kconv.toeuc(self) end
|
306
224
|
|
307
225
|
# call-seq:
|
308
|
-
# String#tosjis
|
226
|
+
# String#tosjis => string
|
309
227
|
#
|
310
228
|
# Convert <code>self</code> to Shift_JIS
|
311
|
-
#
|
312
|
-
# *Note*
|
313
|
-
# This method decode MIME encoded string and
|
314
|
-
# convert halfwidth katakana to fullwidth katakana.
|
315
|
-
# If you don't want it, use NKF.nkf('-sxm0', str).
|
316
229
|
def tosjis; Kconv.tosjis(self) end
|
317
230
|
|
318
231
|
# call-seq:
|
319
|
-
# String#toutf8
|
232
|
+
# String#toutf8 => string
|
320
233
|
#
|
321
234
|
# Convert <code>self</code> to UTF-8
|
322
|
-
#
|
323
|
-
# *Note*
|
324
|
-
# This method decode MIME encoded string and
|
325
|
-
# convert halfwidth katakana to fullwidth katakana.
|
326
|
-
# If you don't want it, use NKF.nkf('-wxm0', str).
|
327
235
|
def toutf8; Kconv.toutf8(self) end
|
328
236
|
|
329
237
|
# call-seq:
|
330
|
-
# String#toutf16
|
238
|
+
# String#toutf16 => string
|
331
239
|
#
|
332
240
|
# Convert <code>self</code> to UTF-16
|
333
|
-
#
|
334
|
-
# *Note*
|
335
|
-
# This method decode MIME encoded string and
|
336
|
-
# convert halfwidth katakana to fullwidth katakana.
|
337
|
-
# If you don't want it, use NKF.nkf('-w16xm0', str).
|
338
241
|
def toutf16; Kconv.toutf16(self) end
|
339
242
|
|
243
|
+
# call-seq:
|
244
|
+
# String#toutf32 => string
|
245
|
+
#
|
246
|
+
# Convert <code>self</code> to UTF-32
|
247
|
+
def toutf32; Kconv.toutf32(self) end
|
248
|
+
|
249
|
+
# call-seq:
|
250
|
+
# String#tolocale => string
|
251
|
+
#
|
252
|
+
# Convert <code>self</code> to locale encoding
|
253
|
+
def tolocale; Kconv.tolocale(self) end
|
254
|
+
|
340
255
|
#
|
341
256
|
# is Encoding
|
342
257
|
#
|
343
258
|
|
344
259
|
# call-seq:
|
345
|
-
# String#iseuc
|
260
|
+
# String#iseuc => true or false
|
346
261
|
#
|
347
262
|
# Returns whether <code>self</code>'s encoding is EUC-JP or not.
|
348
|
-
#
|
349
|
-
# *Note* don't expect this return value is MatchData.
|
350
263
|
def iseuc; Kconv.iseuc(self) end
|
351
264
|
|
352
265
|
# call-seq:
|
353
|
-
# String#issjis
|
266
|
+
# String#issjis => true or false
|
354
267
|
#
|
355
268
|
# Returns whether <code>self</code>'s encoding is Shift_JIS or not.
|
356
|
-
#
|
357
|
-
# *Note* don't expect this return value is MatchData.
|
358
269
|
def issjis; Kconv.issjis(self) end
|
359
270
|
|
360
271
|
# call-seq:
|
361
|
-
# String#
|
272
|
+
# String#isjis => true or false
|
362
273
|
#
|
363
|
-
# Returns whether <code>self</code>'s encoding is
|
274
|
+
# Returns whether <code>self</code>'s encoding is ISO-2022-JP or not.
|
275
|
+
def isjis; Kconv.isjis(self) end
|
276
|
+
|
277
|
+
# call-seq:
|
278
|
+
# String#isutf8 => true or false
|
364
279
|
#
|
365
|
-
#
|
280
|
+
# Returns whether <code>self</code>'s encoding is UTF-8 or not.
|
366
281
|
def isutf8; Kconv.isutf8(self) end
|
367
282
|
end
|
data/lib/rubysl/nkf/version.rb
CHANGED
data/rubysl-nkf.gemspec
CHANGED
@@ -17,8 +17,10 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
|
+
spec.required_ruby_version = "~> 2.0"
|
21
|
+
|
20
22
|
spec.add_development_dependency "bundler", "~> 1.3"
|
21
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
22
24
|
spec.add_development_dependency "mspec", "~> 1.5"
|
23
|
-
spec.add_development_dependency "rubysl-prettyprint", "~>
|
25
|
+
spec.add_development_dependency "rubysl-prettyprint", "~> 2.0"
|
24
26
|
end
|