nkf 0.2.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.git-blame-ignore-revs +7 -0
- data/.github/dependabot.yml +6 -0
- data/.github/workflows/test.yml +29 -0
- data/.gitignore +14 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +38 -0
- data/Rakefile +24 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/java/org/jruby/ext/nkf/Command.java +58 -0
- data/ext/java/org/jruby/ext/nkf/CommandParser.java +70 -0
- data/ext/java/org/jruby/ext/nkf/NKFLibrary.java +13 -0
- data/ext/java/org/jruby/ext/nkf/Option.java +80 -0
- data/ext/java/org/jruby/ext/nkf/Options.java +109 -0
- data/ext/java/org/jruby/ext/nkf/RubyNKF.java +601 -0
- data/ext/nkf/extconf.rb +3 -0
- data/ext/nkf/nkf-utf8/config.h +51 -0
- data/ext/nkf/nkf-utf8/nkf.c +7205 -0
- data/ext/nkf/nkf-utf8/nkf.h +189 -0
- data/ext/nkf/nkf-utf8/utf8tbl.c +14638 -0
- data/ext/nkf/nkf-utf8/utf8tbl.h +72 -0
- data/ext/nkf/nkf.c +506 -0
- data/lib/kconv.rb +283 -0
- data/lib/nkf.jar +0 -0
- data/lib/nkf.rb +6 -0
- data/nkf.gemspec +43 -0
- metadata +77 -0
@@ -0,0 +1,601 @@
|
|
1
|
+
/***** BEGIN LICENSE BLOCK *****
|
2
|
+
* Version: EPL 2.0/LGPL 2.1
|
3
|
+
*
|
4
|
+
* The contents of this file are subject to the Eclipse Public
|
5
|
+
* License Version 2.0 (the "License"); you may not use this file
|
6
|
+
* except in compliance with the License. You may obtain a copy of
|
7
|
+
* the License at http://www.eclipse.org/legal/epl-v20.html
|
8
|
+
*
|
9
|
+
* Software distributed under the License is distributed on an "AS
|
10
|
+
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
11
|
+
* implied. See the License for the specific language governing
|
12
|
+
* rights and limitations under the License.
|
13
|
+
*
|
14
|
+
* Copyright (C) 2007-2011 Koichiro Ohba <koichiro@meadowy.org>
|
15
|
+
*
|
16
|
+
* Alternatively, the contents of this file may be used under the terms of
|
17
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
18
|
+
* in which case the provisions of the LGPL are applicable instead
|
19
|
+
* of those above. If you wish to allow use of your version of this file only
|
20
|
+
* under the terms of either the LGPL, and not to allow others to
|
21
|
+
* use your version of this file under the terms of the EPL, indicate your
|
22
|
+
* decision by deleting the provisions above and replace them with the notice
|
23
|
+
* and other provisions required by the LGPL. If you do not delete
|
24
|
+
* the provisions above, a recipient may use your version of this file under
|
25
|
+
* the terms of any one of the EPL, the LGPL.
|
26
|
+
***** END LICENSE BLOCK *****/
|
27
|
+
|
28
|
+
package org.jruby.ext.nkf;
|
29
|
+
|
30
|
+
import java.nio.ByteBuffer;
|
31
|
+
import java.nio.CharBuffer;
|
32
|
+
import java.nio.charset.CharacterCodingException;
|
33
|
+
import java.nio.charset.Charset;
|
34
|
+
import java.nio.charset.CharsetDecoder;
|
35
|
+
import java.nio.charset.CharsetEncoder;
|
36
|
+
import java.nio.charset.UnsupportedCharsetException;
|
37
|
+
import java.util.ArrayList;
|
38
|
+
import java.util.Map;
|
39
|
+
import java.util.HashMap;
|
40
|
+
|
41
|
+
import org.jcodings.Encoding;
|
42
|
+
import org.jcodings.specific.ASCIIEncoding;
|
43
|
+
import org.jcodings.specific.UTF8Encoding;
|
44
|
+
import org.jcodings.transcode.EConv;
|
45
|
+
import org.jcodings.transcode.EConvFlags;
|
46
|
+
import org.jruby.Ruby;
|
47
|
+
import org.jruby.RubyArray;
|
48
|
+
import org.jruby.RubyModule;
|
49
|
+
import org.jruby.RubyString;
|
50
|
+
|
51
|
+
import org.jruby.anno.JRubyMethod;
|
52
|
+
import org.jruby.anno.JRubyModule;
|
53
|
+
import org.jruby.runtime.Helpers;
|
54
|
+
import org.jruby.runtime.ThreadContext;
|
55
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
56
|
+
import org.jruby.util.ByteList;
|
57
|
+
import org.jruby.util.KCode;
|
58
|
+
import org.jruby.util.Pack;
|
59
|
+
import org.jruby.util.io.EncodingUtils;
|
60
|
+
|
61
|
+
@JRubyModule(name="NKF")
|
62
|
+
public class RubyNKF {
|
63
|
+
/**
 * Character sets and MIME transfer modes that the NKF option handling works with.
 *
 * Each constant carries an integer code and, where one applies, a charset
 * name. Several constants deliberately share a code (AUTO/UNKNOWN use 0,
 * BINARY/NOCONV use 4), and constants without a charset name carry
 * {@code null}.
 */
public enum NKFCharset {
    AUTO(0, "x-JISAutoDetect"),
    // no ISO-2022-JP in jcodings
    JIS(1, "ISO-2022-JP"),
    EUC(2, "EUC-JP"),
    SJIS(3, "Shift_JIS"),
    BINARY(4, null),
    NOCONV(4, null),
    UNKNOWN(0, null),
    ASCII(5, "iso-8859-1"),
    UTF8(6, "UTF-8"),
    UTF16(8, "UTF-16"),
    UTF32(12, "UTF-32"),
    OTHER(16, null),
    BASE64(20, "base64"),
    QENCODE(21, "qencode"),
    MIME_DETECT(22, "MimeAutoDetect");

    /** Integer code of this constant; not unique across constants. */
    private final int value;

    /** Charset name attached to this constant, or {@code null} when it has none. */
    private final String charset;

    private NKFCharset(int value, String charset) {
        this.value = value;
        this.charset = charset;
    }

    /**
     * @return the integer code of this constant
     */
    public int getValue() {
        return this.value;
    }

    /**
     * @return the charset name of this constant, or {@code null} if none is attached
     */
    public String getCharset() {
        return this.charset;
    }
}
|
97
|
+
|
98
|
+
private static final ByteList BEGIN_MIME_STRING = new ByteList(ByteList.plain("=?"));
|
99
|
+
private static final ByteList END_MIME_STRING = new ByteList(ByteList.plain("?="));
|
100
|
+
private static final ByteList PACK_BASE64 = new ByteList(ByteList.plain("m"));
|
101
|
+
private static final ByteList PACK_QENCODE = new ByteList(ByteList.plain("M"));
|
102
|
+
|
103
|
+
public static final Map<Integer, String> NKFCharsetMap = new HashMap<Integer, String>(20, 1);
|
104
|
+
|
105
|
+
public static void load(Ruby runtime) {
|
106
|
+
createNKF(runtime);
|
107
|
+
}
|
108
|
+
|
109
|
+
public static void createNKF(Ruby runtime) {
|
110
|
+
final RubyModule NKF = runtime.defineModule("NKF");
|
111
|
+
final String version = "2.1.2";
|
112
|
+
final String relDate = "2011-09-08";
|
113
|
+
|
114
|
+
NKF.defineConstant("NKF_VERSION", runtime.newString(version));
|
115
|
+
NKF.defineConstant("NKF_RELEASE_DATE", runtime.newString(relDate));
|
116
|
+
NKF.defineConstant("VERSION", runtime.newString(version + ' ' + '(' + "JRuby" + '_' + relDate + ')'));
|
117
|
+
|
118
|
+
for ( NKFCharset charset : NKFCharset.values() ) {
|
119
|
+
NKFCharsetMap.put(charset.value, charset.name());
|
120
|
+
|
121
|
+
if (charset.value > 12 ) continue;
|
122
|
+
NKF.defineConstant(charset.name(), charsetMappedValue(runtime, charset));
|
123
|
+
}
|
124
|
+
|
125
|
+
NKF.defineAnnotatedMethods(RubyNKF.class);
|
126
|
+
}
|
127
|
+
|
128
|
+
@JRubyMethod(name = "guess", module = true)
|
129
|
+
public static IRubyObject guess(ThreadContext context, IRubyObject recv, IRubyObject s) {
|
130
|
+
return charsetMappedValue(context.runtime, guess(context, s));
|
131
|
+
}
|
132
|
+
|
133
|
+
public static NKFCharset guess(ThreadContext context, IRubyObject s) {
|
134
|
+
// TODO: Fix charset usage for JRUBY-4553
|
135
|
+
Ruby runtime = context.runtime;
|
136
|
+
if (!s.respondsTo("to_str")) {
|
137
|
+
throw runtime.newTypeError("can't convert " + s.getMetaClass() + " into String");
|
138
|
+
}
|
139
|
+
ByteList bytes = s.convertToString().getByteList();
|
140
|
+
ByteBuffer buf = ByteBuffer.wrap(bytes.getUnsafeBytes(), bytes.begin(), bytes.length());
|
141
|
+
CharsetDecoder decoder;
|
142
|
+
try {
|
143
|
+
decoder = Charset.forName("x-JISAutoDetect").newDecoder();
|
144
|
+
} catch (UnsupportedCharsetException e) {
|
145
|
+
throw runtime.newStandardError("charsets.jar is required to use NKF#guess. Please install JRE which supports m17n.");
|
146
|
+
}
|
147
|
+
try {
|
148
|
+
decoder.decode(buf);
|
149
|
+
|
150
|
+
if ( ! decoder.isCharsetDetected() ) {
|
151
|
+
return NKFCharset.UNKNOWN;
|
152
|
+
}
|
153
|
+
Charset charset = decoder.detectedCharset();
|
154
|
+
String name = charset.name();
|
155
|
+
if ("Shift_JIS".equals(name)) {
|
156
|
+
return NKFCharset.SJIS;
|
157
|
+
}
|
158
|
+
if ("Windows-31j".equalsIgnoreCase(name)) {
|
159
|
+
return NKFCharset.JIS;
|
160
|
+
}
|
161
|
+
if ("EUC-JP".equals(name)) {
|
162
|
+
return NKFCharset.EUC;
|
163
|
+
}
|
164
|
+
if ("ISO-2022-JP".equals(name)) {
|
165
|
+
return NKFCharset.JIS;
|
166
|
+
}
|
167
|
+
}
|
168
|
+
catch (CharacterCodingException e) {
|
169
|
+
// fall through and try direct encoding
|
170
|
+
}
|
171
|
+
|
172
|
+
if (bytes.getEncoding() == UTF8Encoding.INSTANCE) {
|
173
|
+
return NKFCharset.UTF8;
|
174
|
+
}
|
175
|
+
if (bytes.getEncoding().toString().startsWith("UTF-16")) {
|
176
|
+
return NKFCharset.UTF16;
|
177
|
+
}
|
178
|
+
if (bytes.getEncoding().toString().startsWith("UTF-32")) {
|
179
|
+
return NKFCharset.UTF32;
|
180
|
+
}
|
181
|
+
return NKFCharset.UNKNOWN;
|
182
|
+
}
|
183
|
+
|
184
|
+
private static IRubyObject charsetMappedValue(final Ruby runtime, final NKFCharset charset) {
|
185
|
+
final Encoding encoding;
|
186
|
+
switch (charset) {
|
187
|
+
case AUTO: case NOCONV: case UNKNOWN: return runtime.getNil();
|
188
|
+
case BINARY:
|
189
|
+
encoding = runtime.getEncodingService().getAscii8bitEncoding();
|
190
|
+
return runtime.getEncodingService().convertEncodingToRubyEncoding(encoding);
|
191
|
+
}
|
192
|
+
|
193
|
+
encoding = runtime.getEncodingService().getEncodingFromString(charset.getCharset());
|
194
|
+
return runtime.getEncodingService().convertEncodingToRubyEncoding(encoding);
|
195
|
+
}
|
196
|
+
|
197
|
+
@JRubyMethod(name = "guess1", module = true)
|
198
|
+
public static IRubyObject guess1(ThreadContext context, IRubyObject recv, IRubyObject str) {
|
199
|
+
return guess(context, recv, str);
|
200
|
+
}
|
201
|
+
|
202
|
+
@JRubyMethod(name = "guess2", module = true)
|
203
|
+
public static IRubyObject guess2(ThreadContext context, IRubyObject recv, IRubyObject str) {
|
204
|
+
return guess(context, recv, str);
|
205
|
+
}
|
206
|
+
|
207
|
+
@JRubyMethod(name = "nkf", module = true)
|
208
|
+
public static IRubyObject nkf(ThreadContext context, IRubyObject recv, IRubyObject opt, IRubyObject str) {
|
209
|
+
Ruby runtime = context.runtime;
|
210
|
+
|
211
|
+
if (!opt.respondsTo("to_str")) {
|
212
|
+
throw runtime.newTypeError("can't convert " + opt.getMetaClass() + " into String");
|
213
|
+
}
|
214
|
+
|
215
|
+
if (!str.respondsTo("to_str")) {
|
216
|
+
throw runtime.newTypeError("can't convert " + str.getMetaClass() + " into String");
|
217
|
+
}
|
218
|
+
|
219
|
+
Map<String, NKFCharset> options = parseOpt(opt.convertToString().toString());
|
220
|
+
|
221
|
+
if (options.get("input").getValue() == NKFCharset.AUTO.getValue()) {
|
222
|
+
options.put("input", guess(context, str));
|
223
|
+
}
|
224
|
+
|
225
|
+
ByteList bstr = str.convertToString().getByteList();
|
226
|
+
final Converter converter;
|
227
|
+
if (Converter.isMimeText(bstr, options)) {
|
228
|
+
converter = new MimeConverter(context, options);
|
229
|
+
} else {
|
230
|
+
converter = new DefaultConverter(context, options);
|
231
|
+
}
|
232
|
+
|
233
|
+
RubyString result = converter.convert(bstr);
|
234
|
+
|
235
|
+
if (options.get("mime-encode") == NKFCharset.BASE64) {
|
236
|
+
result = Converter.encodeMimeString(runtime, result, PACK_BASE64);
|
237
|
+
} else if (options.get("mime-encode") == NKFCharset.QENCODE) {
|
238
|
+
result = Converter.encodeMimeString(runtime, result, PACK_QENCODE);
|
239
|
+
}
|
240
|
+
|
241
|
+
return result;
|
242
|
+
}
|
243
|
+
|
244
|
+
public static Command parseOption(String s) {
|
245
|
+
Options options = new Options();
|
246
|
+
options.addOption("b");
|
247
|
+
options.addOption("u");
|
248
|
+
options.addOption("j", "jis");
|
249
|
+
options.addOption("s", "sjis");
|
250
|
+
options.addOption("e", "euc");
|
251
|
+
options.addOption("w", null, "[0-9][0-9]");
|
252
|
+
options.addOption("J", "jis-input");
|
253
|
+
options.addOption("S", "sjis-input");
|
254
|
+
options.addOption("E", "euc-input");
|
255
|
+
options.addOption("W", null, "[0-9][0-9]");
|
256
|
+
options.addOption("t");
|
257
|
+
options.addOption("i_");
|
258
|
+
options.addOption("o_");
|
259
|
+
options.addOption("r");
|
260
|
+
options.addOption("h1", "hiragana");
|
261
|
+
options.addOption("h2", "katakana");
|
262
|
+
options.addOption("h3", "katakana-hiragana");
|
263
|
+
options.addOption("T");
|
264
|
+
options.addOption("l");
|
265
|
+
options.addOption("f", null, "[0-9]+-[0-9]*");
|
266
|
+
options.addOption("F");
|
267
|
+
options.addOption("Z", null, "[0-3]");
|
268
|
+
options.addOption("X");
|
269
|
+
options.addOption("x");
|
270
|
+
options.addOption("B", null, "[0-2]");
|
271
|
+
options.addOption("I");
|
272
|
+
options.addOption("L", null, "[uwm]");
|
273
|
+
options.addOption("d");
|
274
|
+
options.addOption("c");
|
275
|
+
options.addOption("m", null, "[BQN0]");
|
276
|
+
options.addOption("M", null, "[BQ]");
|
277
|
+
options.addOption(null, "fj");
|
278
|
+
options.addOption(null, "unix");
|
279
|
+
options.addOption(null, "mac");
|
280
|
+
options.addOption(null, "msdos");
|
281
|
+
options.addOption(null, "windows");
|
282
|
+
options.addOption(null, "mime");
|
283
|
+
options.addOption(null, "base64");
|
284
|
+
options.addOption(null, "mime-input");
|
285
|
+
options.addOption(null, "base64-input");
|
286
|
+
options.addOption(null, "ic", "ic=(.*)");
|
287
|
+
options.addOption(null, "oc", "oc=(.*)");
|
288
|
+
options.addOption(null, "fb-skip");
|
289
|
+
options.addOption(null, "fb-html");
|
290
|
+
options.addOption(null, "fb-xml");
|
291
|
+
options.addOption(null, "fb-perl");
|
292
|
+
options.addOption(null, "fb-java");
|
293
|
+
options.addOption(null, "fb-subchar", "fb-subchar=(.*)");
|
294
|
+
options.addOption(null, "no-cp932ext");
|
295
|
+
options.addOption(null, "cap-input");
|
296
|
+
options.addOption(null, "url-input");
|
297
|
+
options.addOption(null, "numchar-input");
|
298
|
+
options.addOption(null, "no-best-fit-chars");
|
299
|
+
|
300
|
+
CommandParser parser = new CommandParser();
|
301
|
+
Command cmd = parser.parse(options, s);
|
302
|
+
return cmd;
|
303
|
+
}
|
304
|
+
|
305
|
+
private static Map<String, NKFCharset> parseOpt(String s) {
|
306
|
+
Map<String, NKFCharset> options = new HashMap<String, NKFCharset>();
|
307
|
+
|
308
|
+
// default options
|
309
|
+
options.put("input", NKFCharset.AUTO);
|
310
|
+
options.put("output", NKFCharset.JIS);
|
311
|
+
options.put("mime-decode", NKFCharset.MIME_DETECT);
|
312
|
+
options.put("mime-encode", NKFCharset.NOCONV);
|
313
|
+
|
314
|
+
Command cmd = parseOption(s);
|
315
|
+
if (cmd.hasOption("j")) {
|
316
|
+
options.put("output", NKFCharset.JIS);
|
317
|
+
}
|
318
|
+
if (cmd.hasOption("s")) {
|
319
|
+
options.put("output", NKFCharset.SJIS);
|
320
|
+
}
|
321
|
+
if (cmd.hasOption("e")) {
|
322
|
+
options.put("output", NKFCharset.EUC);
|
323
|
+
}
|
324
|
+
if (cmd.hasOption("w")) {
|
325
|
+
Option opt = cmd.getOption("w");
|
326
|
+
if ("32".equals(opt.getValue())) {
|
327
|
+
options.put("output", NKFCharset.UTF32);
|
328
|
+
} else if("16".equals(opt.getValue())) {
|
329
|
+
options.put("output", NKFCharset.UTF16);
|
330
|
+
} else {
|
331
|
+
options.put("output", NKFCharset.UTF8);
|
332
|
+
}
|
333
|
+
}
|
334
|
+
if (cmd.hasOption("J")) {
|
335
|
+
options.put("input", NKFCharset.JIS);
|
336
|
+
}
|
337
|
+
if (cmd.hasOption("S")) {
|
338
|
+
options.put("input", NKFCharset.SJIS);
|
339
|
+
}
|
340
|
+
if (cmd.hasOption("E")) {
|
341
|
+
options.put("input", NKFCharset.EUC);
|
342
|
+
}
|
343
|
+
if (cmd.hasOption("W")) {
|
344
|
+
Option opt = cmd.getOption("W");
|
345
|
+
if ("32".equals(opt.getValue())) {
|
346
|
+
options.put("input", NKFCharset.UTF32);
|
347
|
+
} else if("16".equals(opt.getValue())) {
|
348
|
+
options.put("input", NKFCharset.UTF16);
|
349
|
+
} else {
|
350
|
+
options.put("input", NKFCharset.UTF8);
|
351
|
+
}
|
352
|
+
}
|
353
|
+
if (cmd.hasOption("m")) {
|
354
|
+
Option opt = cmd.getOption("m");
|
355
|
+
if (opt.getValue() == null) {
|
356
|
+
options.put("mime-decode", NKFCharset.MIME_DETECT);
|
357
|
+
} else if ("B".equals(opt.getValue())) {
|
358
|
+
options.put("mime-decode", NKFCharset.BASE64);
|
359
|
+
} else if ("Q".equals(opt.getValue())) {
|
360
|
+
options.put("mime-decode", NKFCharset.QENCODE);
|
361
|
+
} else if ("N".equals(opt.getValue())) {
|
362
|
+
// TODO: non-strict option
|
363
|
+
} else if ("0".equals(opt.getValue())) {
|
364
|
+
options.put("mime-decode", NKFCharset.NOCONV);
|
365
|
+
}
|
366
|
+
}
|
367
|
+
if (cmd.hasOption("M")) {
|
368
|
+
Option opt = cmd.getOption("M");
|
369
|
+
if (opt.getValue() == null) {
|
370
|
+
options.put("mime-encode", NKFCharset.NOCONV);
|
371
|
+
} else if ("B".equals(opt.getValue())) {
|
372
|
+
options.put("mime-encode", NKFCharset.BASE64);
|
373
|
+
} else if ("Q".equals(opt.getValue())) {
|
374
|
+
options.put("mime-encode", NKFCharset.QENCODE);
|
375
|
+
}
|
376
|
+
}
|
377
|
+
if (cmd.hasOption("base64")) {
|
378
|
+
options.put("mime-encode", NKFCharset.BASE64);
|
379
|
+
}
|
380
|
+
if (cmd.hasOption("oc")) {
|
381
|
+
Option opt = cmd.getOption("oc");
|
382
|
+
if ("ISO-2022-JP".compareToIgnoreCase(opt.getValue()) == 0) {
|
383
|
+
options.put("output", NKFCharset.JIS);
|
384
|
+
} else if ("EUC-JP".compareToIgnoreCase(opt.getValue()) == 0) {
|
385
|
+
options.put("output", NKFCharset.EUC);
|
386
|
+
} else if ("CP932".compareToIgnoreCase(opt.getValue()) == 0) {
|
387
|
+
options.put("output", NKFCharset.SJIS);
|
388
|
+
} else if ("Shift_JIS".compareToIgnoreCase(opt.getValue()) == 0) {
|
389
|
+
options.put("output", NKFCharset.SJIS);
|
390
|
+
} else if ("Windows-31J".compareToIgnoreCase(opt.getValue()) == 0) {
|
391
|
+
options.put("output", NKFCharset.JIS);
|
392
|
+
} else if ("UTF-8".compareToIgnoreCase(opt.getValue()) == 0) {
|
393
|
+
options.put("output", NKFCharset.UTF8);
|
394
|
+
} else if ("UTF-8N".compareToIgnoreCase(opt.getValue()) == 0) {
|
395
|
+
options.put("output", NKFCharset.UTF8);
|
396
|
+
} else if ("UTF-16".compareToIgnoreCase(opt.getValue()) == 0) {
|
397
|
+
options.put("output", NKFCharset.UTF16);
|
398
|
+
} else if ("UTF-16BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
|
399
|
+
options.put("output", NKFCharset.UTF16);
|
400
|
+
} else if ("UTF-32".compareToIgnoreCase(opt.getValue()) == 0) {
|
401
|
+
options.put("output", NKFCharset.UTF32);
|
402
|
+
} else if ("UTF-32BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
|
403
|
+
options.put("output", NKFCharset.UTF32);
|
404
|
+
}
|
405
|
+
}
|
406
|
+
if (cmd.hasOption("ic")) {
|
407
|
+
Option opt = cmd.getOption("ic");
|
408
|
+
if ("ISO-2022-JP".compareToIgnoreCase(opt.getValue()) == 0) {
|
409
|
+
options.put("input", NKFCharset.JIS);
|
410
|
+
} else if ("EUC-JP".compareToIgnoreCase(opt.getValue()) == 0) {
|
411
|
+
options.put("input", NKFCharset.EUC);
|
412
|
+
} else if ("CP932".compareToIgnoreCase(opt.getValue()) == 0) {
|
413
|
+
options.put("input", NKFCharset.SJIS);
|
414
|
+
} else if ("Shift_JIS".compareToIgnoreCase(opt.getValue()) == 0) {
|
415
|
+
options.put("input", NKFCharset.SJIS);
|
416
|
+
} else if ("Windows-31J".compareToIgnoreCase(opt.getValue()) == 0) {
|
417
|
+
options.put("input", NKFCharset.SJIS);
|
418
|
+
} else if ("UTF-8".compareToIgnoreCase(opt.getValue()) == 0) {
|
419
|
+
options.put("input", NKFCharset.UTF8);
|
420
|
+
} else if ("UTF-8N".compareToIgnoreCase(opt.getValue()) == 0) {
|
421
|
+
options.put("input", NKFCharset.UTF8);
|
422
|
+
} else if ("UTF-16".compareToIgnoreCase(opt.getValue()) == 0) {
|
423
|
+
options.put("input", NKFCharset.UTF16);
|
424
|
+
} else if ("UTF-16BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
|
425
|
+
options.put("input", NKFCharset.UTF16);
|
426
|
+
} else if ("UTF-32".compareToIgnoreCase(opt.getValue()) == 0) {
|
427
|
+
options.put("input", NKFCharset.UTF32);
|
428
|
+
} else if ("UTF-32BE-BOM".compareToIgnoreCase(opt.getValue()) == 0) {
|
429
|
+
options.put("input", NKFCharset.UTF32);
|
430
|
+
}
|
431
|
+
}
|
432
|
+
|
433
|
+
return options;
|
434
|
+
}
|
435
|
+
|
436
|
+
static abstract class Converter {
|
437
|
+
|
438
|
+
protected final ThreadContext context;
|
439
|
+
protected final Map<String, NKFCharset> options;
|
440
|
+
|
441
|
+
public Converter(ThreadContext ctx, Map<String, NKFCharset> opt) {
|
442
|
+
context = ctx;
|
443
|
+
options = opt;
|
444
|
+
}
|
445
|
+
|
446
|
+
static boolean isMimeText(ByteList str, Map<String, NKFCharset> options) {
|
447
|
+
if (str.length() <= 6) {
|
448
|
+
return false;
|
449
|
+
}
|
450
|
+
if (options.get("mime-decode") == NKFCharset.NOCONV) {
|
451
|
+
return false;
|
452
|
+
}
|
453
|
+
if (str.indexOf(BEGIN_MIME_STRING) < 0) {
|
454
|
+
return false;
|
455
|
+
}
|
456
|
+
if (str.lastIndexOf(END_MIME_STRING) < 0) {
|
457
|
+
return false;
|
458
|
+
}
|
459
|
+
return true;
|
460
|
+
}
|
461
|
+
|
462
|
+
private static RubyString encodeMimeString(Ruby runtime, RubyString str, ByteList format) {
|
463
|
+
RubyArray array = RubyArray.newArray(runtime, str);
|
464
|
+
return Pack.pack(runtime, array, format).chomp(runtime.getCurrentContext());
|
465
|
+
}
|
466
|
+
|
467
|
+
abstract RubyString convert(ByteList str);
|
468
|
+
|
469
|
+
ByteList convert_byte(ByteList str, String inputCharset, NKFCharset output) {
|
470
|
+
String outputCharset = output.getCharset();
|
471
|
+
|
472
|
+
if (inputCharset == null) {
|
473
|
+
inputCharset = str.getEncoding().toString();
|
474
|
+
}
|
475
|
+
|
476
|
+
if (outputCharset.equals(inputCharset)) {
|
477
|
+
return str.dup();
|
478
|
+
}
|
479
|
+
|
480
|
+
byte[] outCharsetBytes = outputCharset.getBytes();
|
481
|
+
|
482
|
+
EConv ec = EncodingUtils.econvOpenOpts(context, inputCharset.getBytes(), outCharsetBytes, 0, context.nil);
|
483
|
+
|
484
|
+
if (ec == null) {
|
485
|
+
throw context.runtime.newArgumentError("invalid encoding pair: " + inputCharset + " to " + outputCharset);
|
486
|
+
}
|
487
|
+
|
488
|
+
ByteList converted = EncodingUtils.econvStrConvert(context, ec, str, EConvFlags.INVALID_REPLACE);
|
489
|
+
|
490
|
+
converted.setEncoding(context.runtime.getEncodingService().findEncodingOrAliasEntry(outCharsetBytes).getEncoding());
|
491
|
+
|
492
|
+
return converted;
|
493
|
+
}
|
494
|
+
}
|
495
|
+
|
496
|
+
static class DefaultConverter extends Converter {
|
497
|
+
|
498
|
+
public DefaultConverter(ThreadContext ctx, Map<String, NKFCharset> opt) {
|
499
|
+
super(ctx, opt);
|
500
|
+
}
|
501
|
+
|
502
|
+
RubyString convert(ByteList str) {
|
503
|
+
NKFCharset input = options.get("input");
|
504
|
+
NKFCharset output = options.get("output");
|
505
|
+
ByteList b = convert_byte(str,
|
506
|
+
input.getCharset(),
|
507
|
+
output);
|
508
|
+
return context.runtime.newString(b);
|
509
|
+
}
|
510
|
+
}
|
511
|
+
|
512
|
+
static class MimeConverter extends Converter {
|
513
|
+
|
514
|
+
public MimeConverter(ThreadContext ctx, Map<String, NKFCharset> opt) {
|
515
|
+
super(ctx, opt);
|
516
|
+
}
|
517
|
+
|
518
|
+
private String detectCharset(String charset) {
|
519
|
+
if (charset.compareToIgnoreCase(NKFCharset.UTF8.getCharset()) == 0) {
|
520
|
+
return NKFCharset.UTF8.getCharset();
|
521
|
+
} else if (charset.compareToIgnoreCase(NKFCharset.JIS.getCharset()) == 0) {
|
522
|
+
return NKFCharset.JIS.getCharset();
|
523
|
+
} else if (charset.compareToIgnoreCase(NKFCharset.EUC.getCharset()) == 0) {
|
524
|
+
return NKFCharset.EUC.getCharset();
|
525
|
+
} else {
|
526
|
+
return NKFCharset.ASCII.getCharset();
|
527
|
+
}
|
528
|
+
}
|
529
|
+
|
530
|
+
private ByteList decodeMimeString(String str) {
|
531
|
+
String[] mime = str.split("^=\\?|\\?|\\?=$");
|
532
|
+
String charset = detectCharset(mime[1]);
|
533
|
+
int encode = mime[2].charAt(0);
|
534
|
+
RubyString body = EncodingUtils.newExternalStringWithEncoding(context.runtime, mime[3], ASCIIEncoding.INSTANCE);
|
535
|
+
|
536
|
+
final RubyArray<?> array;
|
537
|
+
if ('B' == encode || 'b' == encode) { // BASE64
|
538
|
+
array = Pack.unpack(context, body, PACK_BASE64);
|
539
|
+
} else { // Qencode
|
540
|
+
array = Pack.unpack(context, body, PACK_QENCODE);
|
541
|
+
}
|
542
|
+
RubyString s = (RubyString) array.entry(0);
|
543
|
+
ByteList decodeStr = s.asString().getByteList();
|
544
|
+
|
545
|
+
return convert_byte(decodeStr, charset, options.get("output"));
|
546
|
+
}
|
547
|
+
|
548
|
+
RubyString makeRubyString(ArrayList<ByteList> list) {
|
549
|
+
ByteList r = new ByteList();
|
550
|
+
for (ByteList l : list) {
|
551
|
+
r.append(l);
|
552
|
+
}
|
553
|
+
return context.runtime.newString(r);
|
554
|
+
}
|
555
|
+
|
556
|
+
RubyString convert(ByteList str) {
|
557
|
+
String s = Helpers.decodeByteList(context.runtime, str);
|
558
|
+
String[] token = s.split("\\s");
|
559
|
+
ArrayList<ByteList> raw_data = new ArrayList<ByteList>();
|
560
|
+
|
561
|
+
for (int i = 0; i < token.length; i++) {
|
562
|
+
raw_data.add(decodeMimeString(token[i]));
|
563
|
+
}
|
564
|
+
|
565
|
+
return makeRubyString(raw_data);
|
566
|
+
}
|
567
|
+
|
568
|
+
}
|
569
|
+
|
570
|
+
@Deprecated
|
571
|
+
public static final NKFCharset AUTO = NKFCharset.AUTO;
|
572
|
+
// no ISO-2022-JP in jcodings
|
573
|
+
@Deprecated
|
574
|
+
public static final NKFCharset JIS = NKFCharset.JIS;
|
575
|
+
@Deprecated
|
576
|
+
public static final NKFCharset EUC = NKFCharset.EUC;
|
577
|
+
@Deprecated
|
578
|
+
public static final NKFCharset SJIS = NKFCharset.SJIS;
|
579
|
+
@Deprecated
|
580
|
+
public static final NKFCharset BINARY = NKFCharset.BINARY;
|
581
|
+
@Deprecated
|
582
|
+
public static final NKFCharset NOCONV = NKFCharset.NOCONV;
|
583
|
+
@Deprecated
|
584
|
+
public static final NKFCharset UNKNOWN = NKFCharset.UNKNOWN;
|
585
|
+
@Deprecated
|
586
|
+
public static final NKFCharset ASCII = NKFCharset.ASCII;
|
587
|
+
@Deprecated
|
588
|
+
public static final NKFCharset UTF8 = NKFCharset.UTF8;
|
589
|
+
@Deprecated
|
590
|
+
public static final NKFCharset UTF16 = NKFCharset.UTF16;
|
591
|
+
@Deprecated
|
592
|
+
public static final NKFCharset UTF32 = NKFCharset.UTF32;
|
593
|
+
@Deprecated
|
594
|
+
public static final NKFCharset OTHER = NKFCharset.OTHER;
|
595
|
+
@Deprecated
|
596
|
+
public static final NKFCharset BASE64 = NKFCharset.BASE64;
|
597
|
+
@Deprecated
|
598
|
+
public static final NKFCharset QENCODE = NKFCharset.QENCODE;
|
599
|
+
@Deprecated
|
600
|
+
public static final NKFCharset MIME_DETECT = NKFCharset.MIME_DETECT;
|
601
|
+
}
|
data/ext/nkf/extconf.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#ifndef _CONFIG_H_
|
2
|
+
#define _CONFIG_H_
|
3
|
+
|
4
|
+
/* UTF8 input and output */
|
5
|
+
#define UTF8_INPUT_ENABLE
|
6
|
+
#define UTF8_OUTPUT_ENABLE
|
7
|
+
|
8
|
+
/* convert characters invalid in Shift_JIS to CP932 */
|
9
|
+
#define SHIFTJIS_CP932
|
10
|
+
|
11
|
+
/* fix input encoding when given by option */
|
12
|
+
#define INPUT_CODE_FIX
|
13
|
+
|
14
|
+
/* --overwrite option */
|
15
|
+
/* by Satoru Takabayashi <ccsatoru@vega.aichi-u.ac.jp> */
|
16
|
+
#define OVERWRITE
|
17
|
+
|
18
|
+
/* --cap-input, --url-input option */
|
19
|
+
#define INPUT_OPTION
|
20
|
+
|
21
|
+
/* --numchar-input option */
|
22
|
+
#define NUMCHAR_OPTION
|
23
|
+
|
24
|
+
/* --debug, --no-output option */
|
25
|
+
#define CHECK_OPTION
|
26
|
+
|
27
|
+
/* JIS X0212 */
|
28
|
+
#define X0212_ENABLE
|
29
|
+
|
30
|
+
/* --exec-in, --exec-out option
|
31
|
+
* require pipe, fork, execvp and so on.
|
32
|
+
* please undef this on MS-DOS, MinGW
|
33
|
+
* this is still buggy around child process
|
34
|
+
*/
|
35
|
+
/* #define EXEC_IO */
|
36
|
+
|
37
|
+
/* Unicode Normalization */
|
38
|
+
#define UNICODE_NORMALIZATION
|
39
|
+
|
40
|
+
/*
|
41
|
+
* Select Default Output Encoding
|
42
|
+
*
|
43
|
+
*/
|
44
|
+
|
45
|
+
/* #define DEFAULT_CODE_JIS */
|
46
|
+
/* #define DEFAULT_CODE_SJIS */
|
47
|
+
/* #define DEFAULT_CODE_WINDOWS_31J */
|
48
|
+
/* #define DEFAULT_CODE_EUC */
|
49
|
+
/* #define DEFAULT_CODE_UTF8 */
|
50
|
+
|
51
|
+
#endif /* _CONFIG_H_ */
|