rubysl-nkf 1.1.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -2
- data/ext/rubysl/nkf/nkf-utf8/config.h +20 -57
- data/ext/rubysl/nkf/nkf-utf8/nkf.c +5705 -5028
- data/ext/rubysl/nkf/nkf-utf8/nkf.h +192 -0
- data/ext/rubysl/nkf/nkf-utf8/utf8tbl.c +863 -609
- data/ext/rubysl/nkf/nkf-utf8/utf8tbl.h +27 -0
- data/ext/rubysl/nkf/nkf.c +127 -279
- data/lib/kconv.rb +101 -186
- data/lib/rubysl/nkf/version.rb +1 -1
- data/rubysl-nkf.gemspec +3 -1
- metadata +21 -19
@@ -1,28 +1,55 @@
|
|
1
|
+
/*
|
2
|
+
* utf8tbl.h - Header file for Conversion Table
|
3
|
+
*
|
4
|
+
* $Id: utf8tbl.h 25189 2009-10-02 12:04:37Z akr $
|
5
|
+
*/
|
6
|
+
|
1
7
|
#ifndef _UTF8TBL_H_
|
2
8
|
#define _UTF8TBL_H_
|
3
9
|
|
4
10
|
#ifdef UTF8_OUTPUT_ENABLE
|
11
|
+
#define sizeof_euc_to_utf8_1byte 94
|
12
|
+
#define sizeof_euc_to_utf8_2bytes 94
|
5
13
|
extern const unsigned short euc_to_utf8_1byte[];
|
6
14
|
extern const unsigned short *const euc_to_utf8_2bytes[];
|
7
15
|
extern const unsigned short *const euc_to_utf8_2bytes_ms[];
|
16
|
+
extern const unsigned short *const euc_to_utf8_2bytes_mac[];
|
8
17
|
extern const unsigned short *const x0212_to_utf8_2bytes[];
|
9
18
|
#endif /* UTF8_OUTPUT_ENABLE */
|
10
19
|
|
11
20
|
#ifdef UTF8_INPUT_ENABLE
|
21
|
+
#define sizeof_utf8_to_euc_C2 64
|
22
|
+
#define sizeof_utf8_to_euc_E5B8 64
|
23
|
+
#define sizeof_utf8_to_euc_2bytes 112
|
24
|
+
#define sizeof_utf8_to_euc_3bytes 16
|
12
25
|
extern const unsigned short *const utf8_to_euc_2bytes[];
|
13
26
|
extern const unsigned short *const utf8_to_euc_2bytes_ms[];
|
14
27
|
extern const unsigned short *const utf8_to_euc_2bytes_932[];
|
28
|
+
extern const unsigned short *const utf8_to_euc_2bytes_mac[];
|
15
29
|
extern const unsigned short *const *const utf8_to_euc_3bytes[];
|
16
30
|
extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
|
17
31
|
extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
|
32
|
+
extern const unsigned short *const *const utf8_to_euc_3bytes_mac[];
|
18
33
|
#endif /* UTF8_INPUT_ENABLE */
|
19
34
|
|
20
35
|
#ifdef UNICODE_NORMALIZATION
|
36
|
+
|
37
|
+
#define NORMALIZATION_TABLE_LENGTH 942
|
38
|
+
#define NORMALIZATION_TABLE_NFC_LENGTH 3
|
39
|
+
#define NORMALIZATION_TABLE_NFD_LENGTH 9
|
40
|
+
struct normalization_pair {
|
41
|
+
const unsigned char nfc[NORMALIZATION_TABLE_NFC_LENGTH];
|
42
|
+
const unsigned char nfd[NORMALIZATION_TABLE_NFD_LENGTH];
|
43
|
+
};
|
21
44
|
extern const struct normalization_pair normalization_table[];
|
22
45
|
#endif
|
23
46
|
|
24
47
|
#ifdef SHIFTJIS_CP932
|
48
|
+
#define CP932_TABLE_BEGIN 0xFA
|
49
|
+
#define CP932_TABLE_END 0xFC
|
25
50
|
extern const unsigned short shiftjis_cp932[3][189];
|
51
|
+
#define CP932INV_TABLE_BEGIN 0xED
|
52
|
+
#define CP932INV_TABLE_END 0xEE
|
26
53
|
extern const unsigned short cp932inv[2][189];
|
27
54
|
#endif /* SHIFTJIS_CP932 */
|
28
55
|
|
data/ext/rubysl/nkf/nkf.c
CHANGED
@@ -3,31 +3,15 @@
|
|
3
3
|
*
|
4
4
|
* original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/
|
5
5
|
*
|
6
|
-
* $Id: nkf.c
|
6
|
+
* $Id: nkf.c 27947 2010-05-21 10:11:44Z nobu $
|
7
7
|
*
|
8
8
|
*/
|
9
9
|
|
10
|
-
#define RUBY_NKF_REVISION "$Revision:
|
10
|
+
#define RUBY_NKF_REVISION "$Revision: 27947 $"
|
11
11
|
#define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
|
12
12
|
|
13
|
-
#include "ruby.h"
|
14
|
-
|
15
|
-
/* Encoding Constants */
|
16
|
-
#define _AUTO 0
|
17
|
-
#define _JIS 1
|
18
|
-
#define _EUC 2
|
19
|
-
#define _SJIS 3
|
20
|
-
#define _BINARY 4
|
21
|
-
#define _NOCONV 4
|
22
|
-
#define _ASCII 5
|
23
|
-
/* 0b011x is reserved for UTF-8 Family */
|
24
|
-
#define _UTF8 6
|
25
|
-
/* 0b10xx is reserved for UTF-16 Family */
|
26
|
-
#define _UTF16 8
|
27
|
-
/* 0b11xx is reserved for UTF-32 Family */
|
28
|
-
#define _UTF32 12
|
29
|
-
#define _OTHER 16
|
30
|
-
#define _UNKNOWN _AUTO
|
13
|
+
#include "ruby/ruby.h"
|
14
|
+
#include "ruby/encoding.h"
|
31
15
|
|
32
16
|
/* Replace nkf's getchar/putchar for variable modification */
|
33
17
|
/* we never use getc, ungetc */
|
@@ -56,14 +40,13 @@ static int incsize;
|
|
56
40
|
static VALUE result;
|
57
41
|
|
58
42
|
static int
|
59
|
-
rb_nkf_putchar(c)
|
60
|
-
unsigned int c;
|
43
|
+
rb_nkf_putchar(unsigned int c)
|
61
44
|
{
|
62
45
|
if (output_ctr >= o_len) {
|
63
46
|
o_len += incsize;
|
64
47
|
rb_str_resize(result, o_len);
|
65
48
|
incsize *= 2;
|
66
|
-
output = (unsigned char *)
|
49
|
+
output = (unsigned char *)RSTRING_PTR(result);
|
67
50
|
}
|
68
51
|
output[output_ctr++] = c;
|
69
52
|
|
@@ -78,11 +61,23 @@ rb_nkf_putchar(c)
|
|
78
61
|
#include "nkf-utf8/utf8tbl.c"
|
79
62
|
#include "nkf-utf8/nkf.c"
|
80
63
|
|
81
|
-
|
82
|
-
|
64
|
+
rb_encoding* rb_nkf_enc_get(const char *name)
|
65
|
+
{
|
66
|
+
int idx = rb_enc_find_index(name);
|
67
|
+
if (idx < 0) {
|
68
|
+
nkf_encoding *nkf_enc = nkf_enc_find(name);
|
69
|
+
idx = rb_enc_find_index(nkf_enc_name(nkf_enc_to_base_encoding(nkf_enc)));
|
70
|
+
if (idx < 0) {
|
71
|
+
idx = rb_define_dummy_encoding(name);
|
72
|
+
}
|
73
|
+
}
|
74
|
+
return rb_enc_from_index(idx);
|
75
|
+
}
|
76
|
+
|
77
|
+
int nkf_split_options(const char *arg)
|
83
78
|
{
|
84
79
|
int count = 0;
|
85
|
-
char option[256];
|
80
|
+
unsigned char option[256];
|
86
81
|
int i = 0, j = 0;
|
87
82
|
int is_escaped = FALSE;
|
88
83
|
int is_single_quoted = FALSE;
|
@@ -113,7 +108,7 @@ int nkf_split_options(arg)
|
|
113
108
|
is_double_quoted = TRUE;
|
114
109
|
}else if(arg[i] == ' '){
|
115
110
|
option[j] = '\0';
|
116
|
-
options(
|
111
|
+
options(option);
|
117
112
|
j = 0;
|
118
113
|
}else{
|
119
114
|
option[j++] = arg[i];
|
@@ -121,252 +116,99 @@ int nkf_split_options(arg)
|
|
121
116
|
}
|
122
117
|
if(j){
|
123
118
|
option[j] = '\0';
|
124
|
-
options(
|
119
|
+
options(option);
|
125
120
|
}
|
126
121
|
return count;
|
127
122
|
}
|
128
123
|
|
129
124
|
/*
|
130
125
|
* call-seq:
|
131
|
-
* NKF.nkf(opt, str)
|
126
|
+
* NKF.nkf(opt, str) => string
|
132
127
|
*
|
133
128
|
* Convert _str_ and return converted result.
|
134
129
|
* Conversion details are specified by _opt_ as String.
|
135
130
|
*
|
136
131
|
* require 'nkf'
|
137
132
|
* output = NKF.nkf("-s", input)
|
138
|
-
*
|
139
|
-
* *Note*
|
140
|
-
* By default, nkf decodes MIME encoded string.
|
141
|
-
* If you want not to decode input, use NKF.nkf with <b>-m0</b> flag.
|
142
133
|
*/
|
143
134
|
|
144
135
|
static VALUE
|
145
|
-
|
146
|
-
VALUE obj, opt, src;
|
136
|
+
rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
|
147
137
|
{
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
i_len = RSTRING(src)->len;
|
163
|
-
result = rb_str_new(0, i_len*3 + 10);
|
164
|
-
v = result;
|
165
|
-
|
166
|
-
output_ctr = 0;
|
167
|
-
output = (unsigned char *)RSTRING(result)->ptr;
|
168
|
-
o_len = RSTRING(result)->len;
|
169
|
-
*output = '\0';
|
170
|
-
|
171
|
-
if(x0201_f == WISH_TRUE)
|
172
|
-
x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
|
173
|
-
|
174
|
-
kanji_convert(NULL);
|
175
|
-
RSTRING(result)->ptr[output_ctr] = '\0';
|
176
|
-
RSTRING(result)->len = output_ctr;
|
177
|
-
OBJ_INFECT(result, src);
|
178
|
-
|
179
|
-
return result;
|
180
|
-
}
|
138
|
+
volatile VALUE tmp;
|
139
|
+
reinit();
|
140
|
+
StringValue(opt);
|
141
|
+
nkf_split_options(RSTRING_PTR(opt));
|
142
|
+
if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
|
143
|
+
|
144
|
+
switch (nkf_enc_to_index(output_encoding)) {
|
145
|
+
case UTF_8_BOM: output_encoding = nkf_enc_from_index(UTF_8); break;
|
146
|
+
case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break;
|
147
|
+
case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break;
|
148
|
+
case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break;
|
149
|
+
case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break;
|
150
|
+
}
|
151
|
+
output_bom_f = FALSE;
|
181
152
|
|
153
|
+
incsize = INCSIZE;
|
182
154
|
|
183
|
-
|
184
|
-
|
185
|
-
*
|
186
|
-
|
187
|
-
|
188
|
-
*
|
189
|
-
* Algorithm described in:
|
190
|
-
* Ken Lunde. `Understanding Japanese Information Processing'
|
191
|
-
* Sebastopol, CA: O'Reilly & Associates.
|
192
|
-
*
|
193
|
-
* case NKF.guess1(input)
|
194
|
-
* when NKF::JIS
|
195
|
-
* "ISO-2022-JP"
|
196
|
-
* when NKF::SJIS
|
197
|
-
* "Shift_JIS"
|
198
|
-
* when NKF::EUC
|
199
|
-
* "EUC-JP"
|
200
|
-
* when NKF::UNKNOWN
|
201
|
-
* "UNKNOWN(ASCII)"
|
202
|
-
* when NKF::BINARY
|
203
|
-
* "BINARY"
|
204
|
-
* end
|
205
|
-
*/
|
155
|
+
input_ctr = 0;
|
156
|
+
StringValue(src);
|
157
|
+
input = (unsigned char *)RSTRING_PTR(src);
|
158
|
+
i_len = RSTRING_LEN(src);
|
159
|
+
tmp = result = rb_str_new(0, i_len*3 + 10);
|
206
160
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
if (p==pend) return INT2FIX(_UNKNOWN);\
|
223
|
-
sequence_counter++;\
|
224
|
-
if (sequence_counter % 2 == 1 && *p != 0xa4)\
|
225
|
-
sequence_counter = 0;\
|
226
|
-
if (6 <= sequence_counter) {\
|
227
|
-
sequence_counter = 0;\
|
228
|
-
return INT2FIX(_EUC);\
|
229
|
-
}\
|
230
|
-
} while (0)
|
231
|
-
|
232
|
-
if (*p == 0xa4)
|
233
|
-
sequence_counter = 1;
|
234
|
-
|
235
|
-
while (p<pend) {
|
236
|
-
if (*p == '\033') {
|
237
|
-
return INT2FIX(_JIS);
|
238
|
-
}
|
239
|
-
if (*p < '\006' || *p == 0x7f || *p == 0xff) {
|
240
|
-
return INT2FIX(_BINARY);
|
241
|
-
}
|
242
|
-
if (0x81 <= *p && *p <= 0x8d) {
|
243
|
-
return INT2FIX(_SJIS);
|
244
|
-
}
|
245
|
-
if (0x8f <= *p && *p <= 0x9f) {
|
246
|
-
return INT2FIX(_SJIS);
|
247
|
-
}
|
248
|
-
if (*p == 0x8e) { /* SS2 */
|
249
|
-
INCR;
|
250
|
-
if ((0x40 <= *p && *p <= 0x7e) ||
|
251
|
-
(0x80 <= *p && *p <= 0xa0) ||
|
252
|
-
(0xe0 <= *p && *p <= 0xfc))
|
253
|
-
return INT2FIX(_SJIS);
|
254
|
-
}
|
255
|
-
else if (0xa1 <= *p && *p <= 0xdf) {
|
256
|
-
INCR;
|
257
|
-
if (0xf0 <= *p && *p <= 0xfe)
|
258
|
-
return INT2FIX(_EUC);
|
259
|
-
if (0xe0 <= *p && *p <= 0xef) {
|
260
|
-
while (p < pend && *p >= 0x40) {
|
261
|
-
if (*p >= 0x81) {
|
262
|
-
if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) {
|
263
|
-
return INT2FIX(_SJIS);
|
264
|
-
}
|
265
|
-
else if (0xfd <= *p && *p <= 0xfe) {
|
266
|
-
return INT2FIX(_EUC);
|
267
|
-
}
|
268
|
-
}
|
269
|
-
INCR;
|
270
|
-
}
|
271
|
-
}
|
272
|
-
else if (*p <= 0x9f) {
|
273
|
-
return INT2FIX(_SJIS);
|
274
|
-
}
|
275
|
-
}
|
276
|
-
else if (0xf0 <= *p && *p <= 0xfe) {
|
277
|
-
return INT2FIX(_EUC);
|
278
|
-
}
|
279
|
-
else if (0xe0 <= *p && *p <= 0xef) {
|
280
|
-
INCR;
|
281
|
-
if ((0x40 <= *p && *p <= 0x7e) ||
|
282
|
-
(0x80 <= *p && *p <= 0xa0)) {
|
283
|
-
return INT2FIX(_SJIS);
|
284
|
-
}
|
285
|
-
if (0xfd <= *p && *p <= 0xfe) {
|
286
|
-
return INT2FIX(_EUC);
|
287
|
-
}
|
288
|
-
}
|
289
|
-
INCR;
|
290
|
-
}
|
291
|
-
return INT2FIX(_UNKNOWN);
|
161
|
+
output_ctr = 0;
|
162
|
+
output = (unsigned char *)RSTRING_PTR(result);
|
163
|
+
o_len = RSTRING_LEN(result);
|
164
|
+
*output = '\0';
|
165
|
+
|
166
|
+
kanji_convert(NULL);
|
167
|
+
rb_str_set_len(result, output_ctr);
|
168
|
+
OBJ_INFECT(result, src);
|
169
|
+
|
170
|
+
if (mimeout_f)
|
171
|
+
rb_enc_associate(result, rb_usascii_encoding());
|
172
|
+
else
|
173
|
+
rb_enc_associate(result, rb_nkf_enc_get(nkf_enc_name(output_encoding)));
|
174
|
+
|
175
|
+
return result;
|
292
176
|
}
|
293
177
|
|
294
178
|
|
295
179
|
/*
|
296
180
|
* call-seq:
|
297
|
-
* NKF.
|
298
|
-
*
|
299
|
-
* Returns guessed encoding of _str_
|
300
|
-
*
|
301
|
-
* case NKF.guess(input)
|
302
|
-
* when NKF::ASCII
|
303
|
-
* "ASCII"
|
304
|
-
* when NKF::JIS
|
305
|
-
* "ISO-2022-JP"
|
306
|
-
* when NKF::SJIS
|
307
|
-
* "Shift_JIS"
|
308
|
-
* when NKF::EUC
|
309
|
-
* "EUC-JP"
|
310
|
-
* when NKF::UTF8
|
311
|
-
* "UTF-8"
|
312
|
-
* when NKF::UTF16
|
313
|
-
* "UTF-16"
|
314
|
-
* when NKF::UNKNOWN
|
315
|
-
* "UNKNOWN"
|
316
|
-
* when NKF::BINARY
|
317
|
-
* "BINARY"
|
318
|
-
* end
|
181
|
+
* NKF.guess(str) => encoding
|
182
|
+
*
|
183
|
+
* Returns guessed encoding of _str_ by nkf routine.
|
184
|
+
*
|
319
185
|
*/
|
320
186
|
|
321
187
|
static VALUE
|
322
|
-
|
323
|
-
VALUE obj, src;
|
188
|
+
rb_nkf_guess(VALUE obj, VALUE src)
|
324
189
|
{
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
input = (unsigned char *)RSTRING(src)->ptr;
|
332
|
-
i_len = RSTRING(src)->len;
|
333
|
-
|
334
|
-
if(x0201_f == WISH_TRUE)
|
335
|
-
x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
|
336
|
-
|
337
|
-
guess_f = TRUE;
|
338
|
-
kanji_convert( NULL );
|
339
|
-
guess_f = FALSE;
|
340
|
-
|
341
|
-
if (!is_inputcode_mixed) {
|
342
|
-
if (strcmp(input_codename, "") == 0) {
|
343
|
-
code = _ASCII;
|
344
|
-
} else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
|
345
|
-
code = _JIS;
|
346
|
-
} else if (strcmp(input_codename, "EUC-JP") == 0) {
|
347
|
-
code = _EUC;
|
348
|
-
} else if (strcmp(input_codename, "Shift_JIS") == 0) {
|
349
|
-
code = _SJIS;
|
350
|
-
} else if (strcmp(input_codename, "UTF-8") == 0) {
|
351
|
-
code = _UTF8;
|
352
|
-
} else if (strcmp(input_codename, "UTF-16") == 0) {
|
353
|
-
code = _UTF16;
|
354
|
-
} else if (strlen(input_codename) > 0) {
|
355
|
-
code = _UNKNOWN;
|
356
|
-
}
|
357
|
-
}
|
190
|
+
reinit();
|
191
|
+
|
192
|
+
input_ctr = 0;
|
193
|
+
StringValue(src);
|
194
|
+
input = (unsigned char *)RSTRING_PTR(src);
|
195
|
+
i_len = RSTRING_LEN(src);
|
358
196
|
|
359
|
-
|
197
|
+
guess_f = TRUE;
|
198
|
+
kanji_convert( NULL );
|
199
|
+
guess_f = FALSE;
|
200
|
+
|
201
|
+
return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code()));
|
360
202
|
}
|
361
203
|
|
362
204
|
|
363
205
|
/*
|
364
|
-
* NKF - Ruby extension for Network Kanji Filter
|
206
|
+
* NKF - Ruby extension for Network Kanji Filter
|
365
207
|
*
|
366
208
|
* == Description
|
367
209
|
*
|
368
|
-
* This is a Ruby Extension version of nkf (
|
369
|
-
* It converts the first argument and
|
210
|
+
* This is a Ruby Extension version of nkf (Network Kanji Filter).
|
211
|
+
* It converts the first argument and returns converted result. Conversion
|
370
212
|
* details are specified by flags as the first argument.
|
371
213
|
*
|
372
214
|
* *Nkf* is yet another kanji code converter among networks, hosts and terminals.
|
@@ -388,16 +230,16 @@ rb_nkf_guess2(obj, src)
|
|
388
230
|
*
|
389
231
|
* Output is buffered (DEFAULT), Output is unbuffered.
|
390
232
|
*
|
391
|
-
* === -j -s -e -w -w16
|
233
|
+
* === -j -s -e -w -w16 -w32
|
392
234
|
*
|
393
235
|
* Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
|
394
|
-
* UTF-8N, UTF-16BE.
|
236
|
+
* UTF-8N, UTF-16BE, UTF-32BE.
|
395
237
|
* Without this option and compile option, ISO-2022-JP is assumed.
|
396
238
|
*
|
397
|
-
* === -J -S -E -W -W16
|
239
|
+
* === -J -S -E -W -W16 -W32
|
398
240
|
*
|
399
241
|
* Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
|
400
|
-
* UTF-8, UTF-
|
242
|
+
* UTF-8, UTF-16, UTF-32.
|
401
243
|
*
|
402
244
|
* ==== -J
|
403
245
|
*
|
@@ -499,7 +341,7 @@ rb_nkf_guess2(obj, src)
|
|
499
341
|
* To see ISO8859-1 (Latin-1) -l is necessary.
|
500
342
|
*
|
501
343
|
* [-mB] Decode MIME base64 encoded stream. Remove header or other part before
|
502
|
-
* conversion.
|
344
|
+
* conversion.
|
503
345
|
*
|
504
346
|
* [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space.
|
505
347
|
*
|
@@ -562,7 +404,7 @@ rb_nkf_guess2(obj, src)
|
|
562
404
|
*
|
563
405
|
* [Shift_JIS] SJIS, MS-Kanji
|
564
406
|
*
|
565
|
-
* [
|
407
|
+
* [Windows-31J] a.k.a. CP932
|
566
408
|
*
|
567
409
|
* [UTF-8] same as UTF-8N
|
568
410
|
*
|
@@ -580,6 +422,16 @@ rb_nkf_guess2(obj, src)
|
|
580
422
|
*
|
581
423
|
* [UTF-16LE-BOM] UTF-16 Little Endian with BOM
|
582
424
|
*
|
425
|
+
* [UTF-32] same as UTF-32BE
|
426
|
+
*
|
427
|
+
* [UTF-32BE] UTF-32 Big Endian without BOM
|
428
|
+
*
|
429
|
+
* [UTF-32BE-BOM] UTF-32 Big Endian with BOM
|
430
|
+
*
|
431
|
+
* [UTF-32LE] UTF-32 Little Endian without BOM
|
432
|
+
*
|
433
|
+
* [UTF-32LE-BOM] UTF-32 Little Endian with BOM
|
434
|
+
*
|
583
435
|
* [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
|
584
436
|
*
|
585
437
|
* === --fb-{skip, html, xml, perl, java, subchar}
|
@@ -593,10 +445,20 @@ rb_nkf_guess2(obj, src)
|
|
593
445
|
* nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
|
594
446
|
* 1st byte of argument is the escape character and following bytes are target characters.
|
595
447
|
*
|
596
|
-
* === --
|
448
|
+
* === --no-cp932ext
|
597
449
|
*
|
598
450
|
* Handle the characters extended in CP932 as unassigned characters.
|
599
451
|
*
|
452
|
+
* === --no-best-fit-chars
|
453
|
+
*
|
454
|
+
* When Unicode to Encoded byte conversion,
|
455
|
+
* don't convert characters which are not round-trip safe.
|
456
|
+
* When Unicode to Unicode conversion,
|
457
|
+
* with this and -x option, nkf can be used as UTF converter.
|
458
|
+
* (In other words, without this and -x option, nkf doesn't save some characters)
|
459
|
+
*
|
460
|
+
* When nkf converts a string related to a path, you should use this option.
|
461
|
+
*
|
600
462
|
* === --cap-input
|
601
463
|
*
|
602
464
|
* Decode hex encoded characters.
|
@@ -613,42 +475,28 @@ rb_nkf_guess2(obj, src)
|
|
613
475
|
void
|
614
476
|
Init_nkf()
|
615
477
|
{
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
rb_define_const(
|
628
|
-
|
629
|
-
rb_define_const(
|
630
|
-
|
631
|
-
rb_define_const(
|
632
|
-
|
633
|
-
|
634
|
-
/* BINARY */
|
635
|
-
rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY));
|
636
|
-
/* No conversion */
|
637
|
-
rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV));
|
638
|
-
/* ASCII */
|
639
|
-
rb_define_const(mKconv, "ASCII", INT2FIX(_ASCII));
|
640
|
-
/* UTF-8 */
|
641
|
-
rb_define_const(mKconv, "UTF8", INT2FIX(_UTF8));
|
642
|
-
/* UTF-16 */
|
643
|
-
rb_define_const(mKconv, "UTF16", INT2FIX(_UTF16));
|
644
|
-
/* UTF-32 */
|
645
|
-
rb_define_const(mKconv, "UTF32", INT2FIX(_UTF32));
|
646
|
-
/* UNKNOWN */
|
647
|
-
rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN));
|
478
|
+
VALUE mNKF = rb_define_module("NKF");
|
479
|
+
|
480
|
+
rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
|
481
|
+
rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
|
482
|
+
rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
|
483
|
+
|
484
|
+
rb_define_const(mNKF, "AUTO", Qnil);
|
485
|
+
rb_define_const(mNKF, "NOCONV", Qnil);
|
486
|
+
rb_define_const(mNKF, "UNKNOWN", Qnil);
|
487
|
+
rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
|
488
|
+
rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
|
489
|
+
rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
|
490
|
+
rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
|
491
|
+
rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
|
492
|
+
rb_define_const(mNKF, "UTF8", rb_enc_from_encoding(rb_utf8_encoding()));
|
493
|
+
rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
|
494
|
+
rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
|
495
|
+
|
648
496
|
/* Full version string of nkf */
|
649
|
-
rb_define_const(
|
497
|
+
rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
|
650
498
|
/* Version of nkf */
|
651
|
-
rb_define_const(
|
499
|
+
rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
|
652
500
|
/* Release date of nkf */
|
653
|
-
rb_define_const(
|
501
|
+
rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
|
654
502
|
}
|