RubyGems - rubysl-nkf - Versions diffs - 1.1.0 → 2.0.1 - Mend

rubysl-nkf 1.1.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/.travis.yml +3 -2
data/ext/rubysl/nkf/nkf-utf8/config.h +20 -57
data/ext/rubysl/nkf/nkf-utf8/nkf.c +5705 -5028
data/ext/rubysl/nkf/nkf-utf8/nkf.h +192 -0
data/ext/rubysl/nkf/nkf-utf8/utf8tbl.c +863 -609
data/ext/rubysl/nkf/nkf-utf8/utf8tbl.h +27 -0
data/ext/rubysl/nkf/nkf.c +127 -279
data/lib/kconv.rb +101 -186
data/lib/rubysl/nkf/version.rb +1 -1
data/rubysl-nkf.gemspec +3 -1
metadata +21 -19

data/ext/rubysl/nkf/nkf-utf8/utf8tbl.h CHANGED

@@ -1,28 +1,55 @@
+/*
+ * utf8tbl.h - Header file for Conversion Table
+ *
+ * $Id: utf8tbl.h 25189 2009-10-02 12:04:37Z akr $
+ */
 #ifndef _UTF8TBL_H_
 #define _UTF8TBL_H_
 #ifdef UTF8_OUTPUT_ENABLE
+#define sizeof_euc_to_utf8_1byte 94
+#define sizeof_euc_to_utf8_2bytes 94
 extern const unsigned short euc_to_utf8_1byte[];
 extern const unsigned short *const euc_to_utf8_2bytes[];
 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
+extern const unsigned short *const euc_to_utf8_2bytes_mac[];
 extern const unsigned short *const x0212_to_utf8_2bytes[];
 #endif /* UTF8_OUTPUT_ENABLE */
 #ifdef UTF8_INPUT_ENABLE
+#define sizeof_utf8_to_euc_C2 64
+#define sizeof_utf8_to_euc_E5B8 64
+#define sizeof_utf8_to_euc_2bytes 112
+#define sizeof_utf8_to_euc_3bytes 16
 extern const unsigned short *const utf8_to_euc_2bytes[];
 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
 extern const unsigned short *const utf8_to_euc_2bytes_932[];
+extern const unsigned short *const utf8_to_euc_2bytes_mac[];
 extern const unsigned short *const *const utf8_to_euc_3bytes[];
 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
+extern const unsigned short *const *const utf8_to_euc_3bytes_mac[];
 #endif /* UTF8_INPUT_ENABLE */
 #ifdef UNICODE_NORMALIZATION
+#define NORMALIZATION_TABLE_LENGTH 942
+#define NORMALIZATION_TABLE_NFC_LENGTH 3
+#define NORMALIZATION_TABLE_NFD_LENGTH 9
+struct normalization_pair {
+    const unsigned char nfc[NORMALIZATION_TABLE_NFC_LENGTH];
+    const unsigned char nfd[NORMALIZATION_TABLE_NFD_LENGTH];
+};
 extern const struct normalization_pair normalization_table[];
 #endif
 #ifdef SHIFTJIS_CP932
+#define CP932_TABLE_BEGIN 0xFA
+#define CP932_TABLE_END   0xFC
 extern const unsigned short shiftjis_cp932[3][189];
+#define CP932INV_TABLE_BEGIN 0xED
+#define CP932INV_TABLE_END   0xEE
 extern const unsigned short cp932inv[2][189];
 #endif /* SHIFTJIS_CP932 */

data/ext/rubysl/nkf/nkf.c CHANGED

@@ -3,31 +3,15 @@
  *
  *  original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/
  *
- *  $Id: nkf.c 11905 2007-02-27 10:38:32Z knu $
+ *  $Id: nkf.c 27947 2010-05-21 10:11:44Z nobu $
  *
  */
-#define RUBY_NKF_REVISION "$Revision: 11905 $"
+#define RUBY_NKF_REVISION "$Revision: 27947 $"
 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
-#include "ruby.h"
-/* Encoding Constants */
-#define	_AUTO		0
-#define	_JIS		1
-#define	_EUC		2
-#define	_SJIS		3
-#define	_BINARY		4
-#define	_NOCONV		4
-#define	_ASCII		5
-/* 0b011x is reserved for UTF-8 Family */
-#define	_UTF8		6
-/* 0b10xx is reserved for UTF-16 Family */
-#define	_UTF16		8
-/* 0b11xx is reserved for UTF-32 Family */
-#define	_UTF32		12
-#define	_OTHER		16
-#define	_UNKNOWN	_AUTO
+#include "ruby/ruby.h"
+#include "ruby/encoding.h"
 /* Replace nkf's getchar/putchar for variable modification */
 /* we never use getc, ungetc */
@@ -56,14 +40,13 @@ static int incsize;
 static VALUE result;
 static int
-rb_nkf_putchar(c)
-  unsigned int c;
+rb_nkf_putchar(unsigned int c)
 {
   if (output_ctr >= o_len) {
     o_len += incsize;
     rb_str_resize(result, o_len);
     incsize *= 2;
-    output = (unsigned char *)RSTRING(result)->ptr;
+    output = (unsigned char *)RSTRING_PTR(result);
   }
   output[output_ctr++] = c;
@@ -78,11 +61,23 @@ rb_nkf_putchar(c)
 #include "nkf-utf8/utf8tbl.c"
 #include "nkf-utf8/nkf.c"
-int nkf_split_options(arg)
-    const char* arg;
+rb_encoding* rb_nkf_enc_get(const char *name)
+{
+    int idx = rb_enc_find_index(name);
+    if (idx < 0) {
+	nkf_encoding *nkf_enc = nkf_enc_find(name);
+	idx = rb_enc_find_index(nkf_enc_name(nkf_enc_to_base_encoding(nkf_enc)));
+	if (idx < 0) {
+	    idx = rb_define_dummy_encoding(name);
+	}
+    }
+    return rb_enc_from_index(idx);
+}
+int nkf_split_options(const char *arg)
 {
     int count = 0;
-    char option[256];
+    unsigned char option[256];
     int i = 0, j = 0;
     int is_escaped = FALSE;
     int is_single_quoted = FALSE;
@@ -113,7 +108,7 @@ int nkf_split_options(arg)
 	    is_double_quoted = TRUE;
 	}else if(arg[i] == ' '){
 	    option[j] = '\0';
-	    options((unsigned char*)option);
+	    options(option);
 	    j = 0;
 	}else{
 	    option[j++] = arg[i];
@@ -121,252 +116,99 @@ int nkf_split_options(arg)
     }
     if(j){
 	option[j] = '\0';
-	options((unsigned char*)option);
+	options(option);
     }
     return count;
 }
 /*
  *  call-seq:
- *     NKF.nkf(opt, str)   -> string
+ *     NKF.nkf(opt, str)   => string
  *
  *  Convert _str_ and return converted result.
  *  Conversion details are specified by _opt_ as String.
  *
  *     require 'nkf'
  *     output = NKF.nkf("-s", input)
- *
- *  *Note*
- *  By default, nkf decodes MIME encoded string.
- *  If you want not to decode input, use NKF.nkf with <b>-m0</b> flag.
  */
 static VALUE
-rb_nkf_kconv(obj, opt, src)
-  VALUE obj, opt, src;
+rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
 {
-  char *opt_ptr, *opt_end;
-  volatile VALUE v;
-  reinit();
-  StringValue(opt);
-  opt_ptr = RSTRING(opt)->ptr;
-  opt_end = opt_ptr + RSTRING(opt)->len;
-  nkf_split_options(opt_ptr);
-  incsize = INCSIZE;
-  input_ctr = 0;
-  StringValue(src);
-  input = (unsigned char *)RSTRING(src)->ptr;
-  i_len = RSTRING(src)->len;
-  result = rb_str_new(0, i_len*3 + 10);
-  v = result;
-  output_ctr = 0;
-  output     = (unsigned char *)RSTRING(result)->ptr;
-  o_len      = RSTRING(result)->len;
-  *output    = '\0';
-  if(x0201_f == WISH_TRUE)
-    x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
-  kanji_convert(NULL);
-  RSTRING(result)->ptr[output_ctr] = '\0';
-  RSTRING(result)->len = output_ctr;
-  OBJ_INFECT(result, src);
-  return result;
-}
+    volatile VALUE tmp;
+    reinit();
+    StringValue(opt);
+    nkf_split_options(RSTRING_PTR(opt));
+    if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
+    switch (nkf_enc_to_index(output_encoding)) {
+    case UTF_8_BOM:    output_encoding = nkf_enc_from_index(UTF_8); break;
+    case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break;
+    case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break;
+    case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break;
+    case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break;
+    }
+    output_bom_f = FALSE;
+    incsize = INCSIZE;
-/*
- *  call-seq:
- *     NKF.guess1(str)  -> integer
- *
- *  Returns guessed encoding of _str_ as integer.
- *
- *  Algorithm described in:
- *  Ken Lunde. `Understanding Japanese Information Processing'
- *  Sebastopol, CA: O'Reilly & Associates.
- *
- *      case NKF.guess1(input)
- *      when NKF::JIS
- *        "ISO-2022-JP"
- *      when NKF::SJIS
- *        "Shift_JIS"
- *      when NKF::EUC
- *        "EUC-JP"
- *      when NKF::UNKNOWN
- *        "UNKNOWN(ASCII)"
- *      when NKF::BINARY
- *        "BINARY"
- *      end
- */
+    input_ctr = 0;
+    StringValue(src);
+    input = (unsigned char *)RSTRING_PTR(src);
+    i_len = RSTRING_LEN(src);
+    tmp = result = rb_str_new(0, i_len*3 + 10);
-static VALUE
-rb_nkf_guess1(obj, src)
-  VALUE obj, src;
-{
-  unsigned char *p;
-  unsigned char *pend;
-  int sequence_counter = 0;
-  StringValue(src);
-  p = (unsigned char *)RSTRING(src)->ptr;
-  pend = p + RSTRING(src)->len;
-  if (p == pend) return INT2FIX(_UNKNOWN);
-#define INCR do {\
-      p++;\
-      if (p==pend) return INT2FIX(_UNKNOWN);\
-      sequence_counter++;\
-      if (sequence_counter % 2 == 1 && *p != 0xa4)\
-	sequence_counter = 0;\
-      if (6 <= sequence_counter) {\
-	  sequence_counter = 0;\
-	  return INT2FIX(_EUC);\
-      }\
-  } while (0)
-  if (*p == 0xa4)
-    sequence_counter = 1;
-  while (p<pend) {
-    if (*p == '\033') {
-      return INT2FIX(_JIS);
-    }
-    if (*p < '\006' || *p == 0x7f || *p == 0xff) {
-      return INT2FIX(_BINARY);
-    }
-    if (0x81 <= *p && *p <= 0x8d) {
-      return INT2FIX(_SJIS);
-    }
-    if (0x8f <= *p && *p <= 0x9f) {
-      return INT2FIX(_SJIS);
-    }
-    if (*p == 0x8e) {	/* SS2 */
-      INCR;
-      if ((0x40 <= *p && *p <= 0x7e) ||
-	  (0x80 <= *p && *p <= 0xa0) ||
-	  (0xe0 <= *p && *p <= 0xfc))
-	return INT2FIX(_SJIS);
-    }
-    else if (0xa1 <= *p && *p <= 0xdf) {
-      INCR;
-      if (0xf0 <= *p && *p <= 0xfe)
-	return INT2FIX(_EUC);
-      if (0xe0 <= *p && *p <= 0xef) {
-	while (p < pend && *p >= 0x40) {
-	  if (*p >= 0x81) {
-	    if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) {
-	      return INT2FIX(_SJIS);
-	    }
-	    else if (0xfd <= *p && *p <= 0xfe) {
-	      return INT2FIX(_EUC);
-	    }
-	  }
-	  INCR;
-	}
-      }
-      else if (*p <= 0x9f) {
-	return INT2FIX(_SJIS);
-      }
-    }
-    else if (0xf0 <= *p && *p <= 0xfe) {
-      return INT2FIX(_EUC);
-    }
-    else if (0xe0 <= *p && *p <= 0xef) {
-      INCR;
-      if ((0x40 <= *p && *p <= 0x7e) ||
-	  (0x80 <= *p && *p <= 0xa0)) {
-	return INT2FIX(_SJIS);
-      }
-      if (0xfd <= *p && *p <= 0xfe) {
-	return INT2FIX(_EUC);
-      }
-    }
-    INCR;
-  }
-  return INT2FIX(_UNKNOWN);
+    output_ctr = 0;
+    output     = (unsigned char *)RSTRING_PTR(result);
+    o_len      = RSTRING_LEN(result);
+    *output    = '\0';
+    kanji_convert(NULL);
+    rb_str_set_len(result, output_ctr);
+    OBJ_INFECT(result, src);
+    if (mimeout_f)
+	rb_enc_associate(result, rb_usascii_encoding());
+    else
+	rb_enc_associate(result, rb_nkf_enc_get(nkf_enc_name(output_encoding)));
+    return result;
 }
 /*
  *  call-seq:
- *     NKF.guess2(str)  -> integer
- *
- *  Returns guessed encoding of _str_ as integer by nkf routine.
- *
- *     case NKF.guess(input)
- *     when NKF::ASCII
- *       "ASCII"
- *     when NKF::JIS
- *       "ISO-2022-JP"
- *     when NKF::SJIS
- *       "Shift_JIS"
- *     when NKF::EUC
- *       "EUC-JP"
- *     when NKF::UTF8
- *       "UTF-8"
- *     when NKF::UTF16
- *       "UTF-16"
- *     when NKF::UNKNOWN
- *       "UNKNOWN"
- *     when NKF::BINARY
- *       "BINARY"
- *     end
+ *     NKF.guess(str)  => encoding
+ *
+ *  Returns guessed encoding of _str_ by nkf routine.
+ *
  */
 static VALUE
-rb_nkf_guess2(obj, src)
-  VALUE obj, src;
+rb_nkf_guess(VALUE obj, VALUE src)
 {
-  int code = _BINARY;
-  reinit();
-  input_ctr = 0;
-  StringValue(src);
-  input = (unsigned char *)RSTRING(src)->ptr;
-  i_len = RSTRING(src)->len;
-  if(x0201_f == WISH_TRUE)
-    x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
-  guess_f = TRUE;
-  kanji_convert( NULL );
-  guess_f = FALSE;
-  if (!is_inputcode_mixed) {
-    if (strcmp(input_codename, "") == 0) {
-      code = _ASCII;
-    } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
-      code = _JIS;
-    } else if (strcmp(input_codename, "EUC-JP") == 0) {
-      code = _EUC;
-    } else if (strcmp(input_codename, "Shift_JIS") == 0) {
-      code = _SJIS;
-    } else if (strcmp(input_codename, "UTF-8") == 0) {
-      code = _UTF8;
-    } else if (strcmp(input_codename, "UTF-16") == 0) {
-      code = _UTF16;
-    } else if (strlen(input_codename) > 0) {
-      code = _UNKNOWN;
-    }
-  }
+    reinit();
+    input_ctr = 0;
+    StringValue(src);
+    input = (unsigned char *)RSTRING_PTR(src);
+    i_len = RSTRING_LEN(src);
-  return INT2FIX( code );
+    guess_f = TRUE;
+    kanji_convert( NULL );
+    guess_f = FALSE;
+    return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code()));
 }
 /*
- *  NKF - Ruby extension for Network Kanji Filter
+ *  NKF - Ruby extension for Network Kanji Filter
  *
  *  == Description
  *
- *  This is a Ruby Extension version of nkf (Netowrk Kanji Filter).
- *  It converts the first argument and return converted result. Conversion
+ *  This is a Ruby Extension version of nkf (Network Kanji Filter).
+ *  It converts the second argument and returns the converted result. Conversion
  *  details are specified by flags as the first argument.
  *
 *  *Nkf* is yet another kanji code converter among networks, hosts and terminals.
@@ -388,16 +230,16 @@ rb_nkf_guess2(obj, src)
  *
  *  Output is buffered (DEFAULT), Output is unbuffered.
  *
- *  === -j -s -e -w -w16
+ *  === -j -s -e -w -w16 -w32
  *
  *  Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
- *  UTF-8N, UTF-16BE.
+ *  UTF-8N, UTF-16BE, UTF-32BE.
  *  Without this option and compile option, ISO-2022-JP is assumed.
  *
- *  === -J -S -E -W -W16
+ *  === -J -S -E -W -W16 -W32
  *
  *  Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
- *  UTF-8, UTF-16LE.
+ *  UTF-8, UTF-16, UTF-32.
  *
  *  ==== -J
  *
@@ -499,7 +341,7 @@ rb_nkf_guess2(obj, src)
  *  To see ISO8859-1 (Latin-1) -l is necessary.
  *
  *  [-mB] Decode MIME base64 encoded stream. Remove header or other part before
- *  conversion.
+ *  conversion.
  *
  *  [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space.
  *
@@ -562,7 +404,7 @@ rb_nkf_guess2(obj, src)
  *
  *  [Shift_JIS] SJIS, MS-Kanji
  *
- *  [CP932] a.k.a. Windows-31J
+ *  [Windows-31J] a.k.a. CP932
  *
  *  [UTF-8] same as UTF-8N
  *
@@ -580,6 +422,16 @@ rb_nkf_guess2(obj, src)
  *
  *  [UTF-16LE-BOM] UTF-16 Little Endian with BOM
  *
+ *  [UTF-32] same as UTF-32BE
+ *
+ *  [UTF-32BE] UTF-32 Big Endian without BOM
+ *
+ *  [UTF-32BE-BOM] UTF-32 Big Endian with BOM
+ *
+ *  [UTF-32LE] UTF-32 Little Endian without BOM
+ *
+ *  [UTF-32LE-BOM] UTF-32 Little Endian with BOM
+ *
  *  [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
  *
  *  === --fb-{skip, html, xml, perl, java, subchar}
@@ -593,10 +445,20 @@ rb_nkf_guess2(obj, src)
  *  nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
  *  1st byte of argument is the escape character and following bytes are target characters.
  *
- *  === --disable-cp932ext
+ *  === --no-cp932ext
  *
  *  Handle the characters extended in CP932 as unassigned characters.
  *
+ *  === --no-best-fit-chars
+ *
+ *  When converting from Unicode to an encoded byte sequence,
+ *  don't convert characters which are not round-trip safe.
+ *  When converting from Unicode to Unicode,
+ *  with this and the -x option, nkf can be used as a UTF converter.
+ *  (In other words, without this and the -x option, nkf doesn't preserve some characters.)
+ *
+ *  When nkf converts a string related to a path, you should use this option.
+ *
  *  === --cap-input
  *
  *  Decode hex encoded characters.
@@ -613,42 +475,28 @@ rb_nkf_guess2(obj, src)
 void
 Init_nkf()
 {
-    /* hoge */
-    VALUE mKconv = rb_define_module("NKF");
-    /* hoge */
-    rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2);
-    rb_define_module_function(mKconv, "guess1", rb_nkf_guess1, 1);
-    rb_define_module_function(mKconv, "guess2", rb_nkf_guess2, 1);
-    rb_define_alias(mKconv, "guess", "guess2");
-    rb_define_alias(rb_singleton_class(mKconv), "guess", "guess2");
-    /* Auto-Detect */
-    rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO));
-    /* ISO-2022-JP */
-    rb_define_const(mKconv, "JIS", INT2FIX(_JIS));
-    /* EUC-JP */
-    rb_define_const(mKconv, "EUC", INT2FIX(_EUC));
-    /* Shift_JIS */
-    rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS));
-    /* BINARY */
-    rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY));
-    /* No conversion */
-    rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV));
-    /* ASCII */
-    rb_define_const(mKconv, "ASCII", INT2FIX(_ASCII));
-    /* UTF-8 */
-    rb_define_const(mKconv, "UTF8", INT2FIX(_UTF8));
-    /* UTF-16 */
-    rb_define_const(mKconv, "UTF16", INT2FIX(_UTF16));
-    /* UTF-32 */
-    rb_define_const(mKconv, "UTF32", INT2FIX(_UTF32));
-    /* UNKNOWN */
-    rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN));
+    VALUE mNKF = rb_define_module("NKF");
+    rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
+    rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
+    rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
+    rb_define_const(mNKF, "AUTO",	Qnil);
+    rb_define_const(mNKF, "NOCONV",	Qnil);
+    rb_define_const(mNKF, "UNKNOWN",	Qnil);
+    rb_define_const(mNKF, "BINARY",	rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
+    rb_define_const(mNKF, "ASCII",	rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
+    rb_define_const(mNKF, "JIS",	rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
+    rb_define_const(mNKF, "EUC",	rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
+    rb_define_const(mNKF, "SJIS",	rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
+    rb_define_const(mNKF, "UTF8",	rb_enc_from_encoding(rb_utf8_encoding()));
+    rb_define_const(mNKF, "UTF16",	rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
+    rb_define_const(mNKF, "UTF32",	rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
     /* Full version string of nkf */
-    rb_define_const(mKconv, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
+    rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
     /* Version of nkf */
-    rb_define_const(mKconv, "NKF_VERSION", rb_str_new2(NKF_VERSION));
+    rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
     /* Release date of nkf */
-    rb_define_const(mKconv, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
+    rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
 }