RubyGems - icu4r_19 - Versions diffs - 1.0 - Mend

icu4r_19 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

data/ChangeLog +87 -0
data/MIT-LICENSE +20 -0
data/README +156 -0
data/Rakefile +32 -0
data/calendar.c +636 -0
data/collator.c +233 -0
data/converter.c +322 -0
data/docs/FORMATTING +131 -0
data/docs/UNICODE_REGEXPS +204 -0
data/extconf.rb +17 -0
data/fmt.cpp +156 -0
data/icu4r.c +18 -0
data/icu_common.h +45 -0
data/lib/dummy +0 -0
data/samples/demo_each.rb +23 -0
data/samples/demo_locales.rb +16 -0
data/samples/demo_regexp.rb +11 -0
data/samples/resbundle/appmsg/root.res +0 -0
data/samples/resbundle/appmsg/ru.res +0 -0
data/samples/resbundle/demo_bundle.rb +4 -0
data/samples/resbundle/mkres.sh +4 -0
data/samples/resbundle/root.txt +10 -0
data/samples/resbundle/ru.txt +4 -0
data/test/test_calendar.rb +123 -0
data/test/test_collator.rb +33 -0
data/test/test_converter.rb +72 -0
data/test/test_ustring.rb +508 -0
data/tools/doc.sh +2 -0
data/tools/km.rb +425 -0
data/ubundle.c +223 -0
data/ucore_ext.c +168 -0
data/uregex.c +697 -0
data/uregex.h +27 -0
data/ustring.c +3039 -0
metadata +164 -0

data/ucore_ext.c ADDED

@@ -0,0 +1,168 @@
+#include "icu_common.h"
+extern VALUE rb_cUString;
+extern  VALUE icu_ustr_new_set(const UChar * str, long len, long capa);
+/**
+ * call-seq:
+ *     ary.to_u => anUString
+ *
+ * Creates UString from array of fixnums, representing Unicode codepoints.
+ * (inversion of UString#codepoints)
+ *
+ *      a = "поддержка".to_u.codepoints  # => [1087, 1086, 1076, 1076, 1077, 1088, 1078, 1082, 1072]
+ *      a.to_u                           # => "поддержка"
+ *
+ */
+VALUE icu_ustr_from_array(obj)
+	VALUE		obj;
+{
+	int		i, n;
+	VALUE		*p;
+	VALUE		ret, temp;
+	UChar32		* src , *pos, chr;
+	UChar		* buf;
+	int32_t		len, capa;
+	UErrorCode	status = U_ZERO_ERROR;
+	n = RARRAY_LEN(obj);
+	p = RARRAY_PTR(obj);
+	src = ALLOC_N(UChar32, n);
+	pos = src;
+	for ( i = 0; i < n; i++){
+	    temp = p[i];
+	    if(TYPE(temp) != T_FIXNUM) {
+	    	free(src);
+	    	rb_raise(rb_eTypeError, "Can't convert from %s", rb_class2name(CLASS_OF(temp)));
+	    }
+	    chr = (UChar32) FIX2INT(temp);
+	    // invalid codepoints are converted to U+FFFD
+	    if( ! (U_IS_UNICODE_CHAR(chr)) ) {
+	    	chr = 0xFFFD;
+	    }
+	    *pos = chr;
+	    pos ++;
+	}
+	capa = n+1;
+	buf = ALLOC_N(UChar, capa);
+	u_strFromUTF32(buf, capa, &len, src, n, &status);
+	if( U_BUFFER_OVERFLOW_ERROR == status ){
+		capa = len+1;
+		REALLOC_N(buf, UChar, capa);
+		status = U_ZERO_ERROR;
+		u_strFromUTF32(buf, capa, &len, src, n, &status);
+	}
+	if (U_FAILURE(status) ) {
+		free(src);
+		free(buf);
+		rb_raise(rb_eRuntimeError, u_errorName(status));
+	}
+	if( capa <= len ){
+		++capa;
+		REALLOC_N(buf, UChar, capa);
+	}
+	ret = icu_ustr_new_set(buf, len, capa);
+	free(src);
+	return ret;
+}
+/**
+ * call-seq:
+ *    str.to_u(encoding = 'utf8') => String
+ *
+ * Converts String value in  given encoding to UString.
+ * When no encoding is given, utf8 is assumed. If string is not valid UTF8,
+ * and no encoding is given, exception is raised.
+ *
+ * When explicit encoding is given, converter will replace incorrect codepoints
+ * with <U+FFFD> - replacement character.
+ */
+VALUE
+icu_from_rstr(argc, argv, str)
+     int             argc;
+     VALUE          *argv,
+                     str;
+{
+    VALUE           enc;
+    char           *encoding = 0;	/* default */
+    UErrorCode      error = 0;
+    int32_t         capa, len;
+    VALUE s;
+    UChar * buf;
+    UConverter * conv;
+    if (rb_scan_args(argc, argv, "01", &enc) == 1) {
+	Check_Type(enc, T_STRING);
+	encoding = RSTRING_PTR(enc);
+    }
+    capa = RSTRING_LEN(str) + 1;
+    buf = ALLOC_N(UChar, capa);
+    if(! encoding || !strncmp(encoding, "utf8", 4) ) {
+      /* from UTF8 */
+        u_strFromUTF8(buf, capa-1, &len, RSTRING_PTR(str), RSTRING_LEN(str), &error);
+	if( U_FAILURE(error)) {
+	   free(buf);
+	   rb_raise(rb_eArgError, u_errorName(error));
+	}
+	s = icu_ustr_new_set(buf, len, capa);
+    } else {
+          conv = ucnv_open(encoding, &error);
+          if (U_FAILURE(error)) {
+              ucnv_close(conv);
+              rb_raise(rb_eArgError, u_errorName(error));
+          }
+          len =  ucnv_toUChars(conv, buf, capa-1, RSTRING_PTR(str),
+              	      RSTRING_LEN(str), &error);
+          if (U_BUFFER_OVERFLOW_ERROR == error) {
+	      capa = len+1;
+              REALLOC_N(buf, UChar, capa);
+              error = 0;
+              len = ucnv_toUChars(conv, buf, capa-1, RSTRING_PTR(str),
+              	      RSTRING_LEN(str), &error);
+              if (U_FAILURE(error)) {
+		  free(buf);
+                  rb_raise(rb_eArgError, u_errorName(error));
+	      }
+          }
+          s = icu_ustr_new_set(buf, len, capa);
+          ucnv_close(conv);
+    }
+    return s;
+}
+/**
+ * call-seq:
+ *   u(str, enc = 'utf8') => UString
+ *
+ * Global function to convert from String to UString
+ */
+VALUE
+icu_f_rb_str(argc, argv, obj)
+     int             argc;
+     VALUE          *argv;
+     VALUE           obj;
+{
+    VALUE           enc;
+    VALUE           str;
+    if (rb_scan_args(argc, argv, "11", &str, &enc) == 2) {
+	Check_Type(enc, T_STRING);
+    	Check_Type(str, T_STRING);
+	return icu_from_rstr(1, &enc, str);
+    } else {
+    	Check_Type(str, T_STRING);
+	return icu_from_rstr(0, NULL, str);
+    }
+}
+void initialize_ucore_ext(void)
+{
+    /* conversion from String to UString */
+    rb_define_method(rb_cString, "to_u", icu_from_rstr, -1);
+    rb_define_alias(rb_cString, "u", "to_u");
+    rb_define_global_function("u", icu_f_rb_str, -1);
+    /* conversion from Array to UString */
+    rb_define_method(rb_cArray, "to_u", icu_ustr_from_array, 0);
+}

data/uregex.c ADDED

@@ -0,0 +1,697 @@
+/**
+ * Document-class: URegexp
+ *
+ * See [docs/UNICODE_REGEXPS] for details of patterns.
+ *
+ *
+ * Replacement Text
+ *
+ * The replacement text for find-and-replace operations may contain references to
+ * capture-group text from the find. References are of the form $n, where n is the
+ * number of the capture group.
+ *
+ *      Character      Descriptions
+ *      $n     The text of capture group n will be substituted for $n. n must be >= 0 and not
+ *      greater than the number of capture groups. A $ not followed by a digit has no special meaning,
+ *      and will appear in the substitution text as itself, a $.
+ *      \     Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
+ *      substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
+ *
+ *
+ * Valid URegexp options are:  COMMENTS, MULTILINE, DOTALL, IGNORECASE, which can be OR'ed.
+ */
+#include "icu_common.h"
+extern VALUE rb_cURegexp;
+extern VALUE rb_cUString;
+extern VALUE rb_cUMatch;
+VALUE           icu_umatch_aref(VALUE match, VALUE idx);
+VALUE           icu_umatch_new (VALUE re);
+extern VALUE icu_ustr_new(const UChar * ptr, long len);
+extern VALUE icu_ustr_new2(const UChar * ptr);
+extern void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len);
+extern VALUE icu_from_rstr(int, VALUE *, VALUE);
+/* --------- regular expressions */
+void icu_regex_free( ICURegexp      *ptr)
+{
+    if (ptr->pattern)
+	uregex_close(ptr->pattern);
+    ptr->pattern = 0;
+    free(ptr);
+}
+VALUE
+icu_reg_s_alloc(klass)
+     VALUE           klass;
+{
+    ICURegexp      *ptr = ALLOC_N(ICURegexp, 1);
+    ptr->pattern = 0;
+    return Data_Wrap_Struct(klass, 0, icu_regex_free, ptr);
+}
+void
+icu_reg_initialize(obj, s, len, options)
+     VALUE           obj;
+     const UChar    *s;
+     long            len;
+     int             options;
+{
+    UParseError     pe;
+    UErrorCode      status = 0;
+    ICURegexp      *re = UREGEX(obj);
+    if (re->pattern)
+	uregex_close(re->pattern);
+    re->pattern = uregex_open(s, len, options, &pe, &status);
+    re->options = options;
+    if (U_FAILURE(status))
+	rb_raise(rb_eArgError,
+		 "Wrong regexp: %s line %d column %d flags %d",
+		 u_errorName(status), pe.line, pe.offset, options);
+}
+const UChar *
+icu_reg_get_pattern(ptr, len)
+     ICURegexp      *ptr;
+     int32_t        *len;
+{
+    UErrorCode      error = U_ZERO_ERROR;
+    *len = 0;
+    return uregex_pattern(ptr->pattern, len, &error);
+}
+/**
+ * call-seq:
+ *     URegexp.new(str [,options])
+ *     URegexp.new(regexp)
+ *
+ *  Constructs a new regular expression from <i>pattern</i>, which can be either
+ *  a <code>UString</code> or a <code>URegexp</code>.
+ * */
+VALUE
+icu_reg_initialize_m(argc, argv, self)
+     int             argc;
+     VALUE          *argv;
+     VALUE           self;
+{
+    const UChar    *s;
+    int32_t         len = 0;
+    int             flags = 0;
+    if (argc == 0 || argc > 2) {
+	rb_raise(rb_eArgError, "wrong number of arguments");
+    }
+    if (CLASS_OF(argv[0]) == rb_cURegexp) {
+	if (argc > 1) {
+	    rb_warn("flags ignored");
+	}
+	flags = UREGEX(argv[0])->options;
+	s = icu_reg_get_pattern(UREGEX(argv[0]), &len);
+    } else {
+	Check_Class(argv[0], rb_cUString);
+	if (argc == 2) {
+	    if (FIXNUM_P(argv[1]))
+		flags = FIX2INT(argv[1]);
+	    else if (RTEST(argv[1]))
+		flags = UREGEX_CASE_INSENSITIVE;
+	}
+	s = ICU_PTR(argv[0]);
+	len = ICU_LEN(argv[0]);
+    }
+    icu_reg_initialize(self, s, len, flags);
+    return self;
+}
+VALUE
+icu_reg_new(s, len, options)
+     const UChar    *s;
+     long            len;
+     int             options;
+{
+    VALUE           re = icu_reg_s_alloc(rb_cURegexp);
+    icu_reg_initialize(re, s, len, options);
+    return (VALUE) re;
+}
+VALUE
+icu_reg_clone(obj)
+      VALUE       obj;
+{
+      ICURegexp      *regex = UREGEX(obj);
+      URegularExpression *old_pattern = UREGEX(obj)->pattern;
+      VALUE           ret ;
+      UErrorCode      status = U_ZERO_ERROR;
+      URegularExpression * new_pattern = uregex_clone(regex->pattern, &status);
+       if(U_FAILURE(status) ){
+        rb_raise(rb_eArgError, u_errorName(status));
+      }
+      ret = icu_reg_s_alloc(rb_cURegexp);
+      regex = UREGEX(ret);
+      regex->pattern = old_pattern;
+      UREGEX(obj)->pattern = new_pattern;
+      return ret;
+}
+VALUE
+icu_reg_comp(str)
+     VALUE           str;
+{
+    return icu_reg_new(USTRING(str)->ptr, USTRING(str)->len, 0);
+}
+/**
+ * call-seq:
+ *     regexp.to_u => URegexp
+ *
+ * Converts Ruby Regexp to unicode URegexp, assuming it is in UTF8 encoding.
+ * $KCODE must be set to 'u' to work reliably
+ */
+VALUE icu_reg_from_rb_reg(re)
+      VALUE          re;
+{
+    return icu_reg_comp(icu_from_rstr(0, NULL, rb_funcall(re, rb_intern("to_s"), 0)));
+}
+/**
+ * call-seq:
+ *     uregex.to_u
+ *
+ * Returns UString of this URegexp pattern.
+ * */
+VALUE
+icu_reg_to_u(self)
+     VALUE           self;
+{
+    int32_t         len = 0;
+    const UChar    *s = icu_reg_get_pattern(UREGEX(self), &len);
+    return icu_ustr_new(s, len);
+}
+/**
+ * call-seq:
+ *     uregex.split(str, limit)
+ *
+ *  Divides <i>str</i> into substrings based on a regexp pattern,
+ *  returning an array of these substrings. <i>str</i> is divided where the
+ *  pattern matches.
+ * */
+VALUE
+icu_reg_split(self, str, limit)
+     VALUE           self,
+                     str,
+                     limit;
+{
+    VALUE splits;
+    URegularExpression *theRegEx = UREGEX(self)->pattern;
+    UErrorCode      error = U_ZERO_ERROR;
+    UChar * dest_buf, **dest_fields;
+    int32_t limt, req_cap, total, i;
+    Check_Class(str, rb_cUString);
+    if (limit != Qnil)
+	Check_Type(limit, T_FIXNUM);
+    dest_buf = ALLOC_N(UChar, USTRING(str)->len * 2 + 2);
+    limt = (limit == Qnil ? USTRING(str)->len + 1 : FIX2INT(limit));
+    dest_fields = ALLOC_N(UChar *, limt);
+    uregex_setText(theRegEx, USTRING(str)->ptr, USTRING(str)->len, &error);
+    if (U_FAILURE(error)) {
+	free(dest_buf);
+	free(dest_fields);
+	rb_raise(rb_eArgError, u_errorName(error));
+    }
+    req_cap = 0;
+    total =
+	uregex_split(theRegEx, dest_buf, USTRING(str)->len * 2, &req_cap,
+		     dest_fields, limt, &error);
+    if (U_BUFFER_OVERFLOW_ERROR == error) {
+       error = U_ZERO_ERROR;
+       REALLOC_N( dest_buf, UChar, req_cap);
+       total = uregex_split(theRegEx, dest_buf, req_cap, &req_cap,  dest_fields, limt, &error);
+    }
+    if (U_FAILURE(error) ) {
+	free(dest_buf);
+	free(dest_fields);
+	rb_raise(rb_eArgError, u_errorName(error));
+    }
+    splits = rb_ary_new();
+    for (i = 0; i < total; i++)
+	rb_ary_push(splits, icu_ustr_new2(dest_fields[i]));
+    	free(dest_buf);
+	free(dest_fields);
+    return splits;
+}
+long
+icu_reg_search(re, str, pos, reverse)
+     VALUE           re,
+                     str;
+     long            pos,
+                     reverse;
+{
+    UErrorCode      error = U_ZERO_ERROR;
+    long            cur_pos = 0;
+    long            start,
+                    last;
+    if (!reverse) {
+	start = pos;
+    } else {
+	start = 0;
+    }
+    uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
+		   USTRING(str)->len, &error);
+    if (U_FAILURE(error))
+	rb_raise(rb_eArgError, u_errorName(error));
+    if (!uregex_find(UREGEX(re)->pattern, start, &error))
+	return -1;
+    if (U_FAILURE(error))
+	rb_raise(rb_eArgError, u_errorName(error));
+    cur_pos = uregex_start(UREGEX(re)->pattern, 0, &error);
+    if (reverse) {
+	while (uregex_findNext(UREGEX(re)->pattern, &error)) {
+	    last = uregex_start(UREGEX(re)->pattern, 0, &error);
+	    error = U_ZERO_ERROR;
+	    if (reverse && last > pos)
+		break;
+	    cur_pos = last;
+	}
+    }
+    if (reverse && cur_pos > pos)
+	return -1;
+    return cur_pos;
+}
+long
+icu_group_count(re)
+     VALUE           re;
+{
+    UErrorCode      error = U_ZERO_ERROR;
+    return uregex_groupCount(UREGEX(re)->pattern, &error);
+}
+VALUE
+icu_reg_nth_match(re, nth)
+     VALUE           re;
+     long            nth;
+{
+    URegularExpression *the_expr = UREGEX(re)->pattern;
+    UErrorCode      error = U_ZERO_ERROR;
+    long            start, end;
+    int32_t len;
+    if( nth < 0 ) {
+    	nth += icu_group_count(re) + 1;
+	if(nth<=0) return Qnil;
+    }
+    start = uregex_start(the_expr, nth, &error);
+    if (U_FAILURE(error)) {
+	return Qnil;
+    }
+    end = uregex_end(the_expr, nth, &error);
+    len = 0;
+    return icu_ustr_new(uregex_getText(the_expr, &len, &error) + start,
+			end - start);
+}
+VALUE
+icu_reg_range(re, nth, start, end)
+     VALUE           re;
+     int             nth;
+     long           *start;
+     long           *end;
+{
+    URegularExpression *the_expr = UREGEX(re)->pattern;
+    UErrorCode      error = U_ZERO_ERROR;
+    if(nth < 0) {
+       nth += icu_group_count(re) + 1;
+       if(nth <= 0) return Qnil;
+    }
+    *start = uregex_start(the_expr, nth, &error);
+    if (U_FAILURE(error))
+	return Qnil;
+    *end = uregex_end(the_expr, nth, &error);
+    return Qtrue;
+}
+/**
+ *  call-seq:
+ *     uregex.match(str)   => matchdata or nil
+ *     uregex =~ (str)   => matchdata or nil
+ *
+ *  Returns a <code>UMatch</code> object describing the match, or
+ *  <code>nil</code> if there was no match.
+ *
+ *     ure("(.)(.)(.)").match("abc".u)[2]   #=> "b"
+ */
+VALUE
+icu_reg_match(re, str)
+     VALUE           re,
+                     str;
+{
+    UErrorCode      error = U_ZERO_ERROR;
+    Check_Class(str, rb_cUString);
+    uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
+		   USTRING(str)->len, &error);
+    if (U_FAILURE(error))
+	rb_raise(rb_eArgError, u_errorName(error));
+    if (uregex_find(UREGEX(re)->pattern, 0, &error)) {
+	return icu_umatch_new(re);
+    }
+    return Qnil;
+}
+/**
+ *  call-seq:
+ *     rxp === str   => true or false
+ *
+ *  Case Equality---Synonym for <code>URegexp#=~</code> used in case statements.
+ *
+ *     a = "HELLO".u
+ *     case a
+ *     when ure("^[a-z]*$"); print "Lower case\n"
+ *     when ure("^[A-Z]*$"); print "Upper case\n"
+ *     else;            print "Mixed case\n"
+ *     end
+ *
+ *  <em>produces:</em>
+ *
+ *     Upper case
+ */
+VALUE
+icu_reg_eqq(re, str)
+     VALUE           re,
+                     str;
+{
+    long            start;
+    Check_Class(str, rb_cUString);
+    start = icu_reg_search(re, str, 0, 0);
+    return start < 0 ? Qfalse : Qtrue;
+}
+int
+icu_reg_find_next(pat)
+     VALUE           pat;
+{
+    URegularExpression *the_expr = UREGEX(pat)->pattern;
+    UErrorCode      error = U_ZERO_ERROR;
+    return uregex_findNext(the_expr, &error);
+}
+static const UChar BACKSLASH  = 0x5c;
+static const UChar DOLLARSIGN = 0x24;
+VALUE
+icu_reg_get_replacement(pat, repl_text, prev_end)
+     VALUE           pat,
+                     repl_text;
+     long            prev_end;
+{
+    UErrorCode      error = U_ZERO_ERROR;
+    URegularExpression *the_expr = UREGEX(pat)->pattern;
+    VALUE 	    ret = icu_ustr_new(0, 0);
+    /* scan the replacement text, looking for substitutions ($n) and \escapes. */
+    int32_t  replIdx = 0;
+    int32_t  replacementLength = ICU_LEN(repl_text);
+    UChar    *replacementText = ICU_PTR(repl_text);
+    int32_t numDigits = 0;
+    int32_t groupNum  = 0, g_start, g_end;
+    UChar32 digitC;
+    int32_t len;
+    /* following code is rewritten version of code found  */
+    /* in ICU sources : i18n/regexp.cpp */
+    while (replIdx < replacementLength) {
+        UChar  c = replacementText[replIdx];
+        replIdx++;
+        if (c != DOLLARSIGN && c != BACKSLASH) {
+            /* Common case, no substitution, no escaping,  */
+            /*  just copy the char to the dest buf. */
+            ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx-1, 1);
+            continue;
+        }
+        if (c == BACKSLASH) {
+            /* Backslash Escape.  Copy the following char out without further checks. */
+            /*                    Note:  Surrogate pairs don't need any special handling */
+            /*                           The second half wont be a '$' or a '\', and */
+            /*                           will move to the dest normally on the next */
+            /*                           loop iteration. */
+            if (replIdx >= replacementLength) {
+                break;
+            }
+            /* ICU4R : \uxxxx case is removed for simplicity : if (c==0x55 || c==0x75) { */
+            /* Plain backslash escape.  Just put out the escaped character. */
+	    ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx, 1);
+            replIdx++;
+            continue;
+        }
+        /* We've got a $.  Pick up a capture group number if one follows. */
+        /* Consume at most the number of digits necessary for the largest capture */
+        /* number that is valid for this pattern. */
+        numDigits = 0;
+        groupNum  = 0;
+        for (;;) {
+            if (replIdx >= replacementLength) {
+                break;
+            }
+            U16_GET(replacementText, 0, replIdx, replacementLength, digitC); /* care surrogates */
+            if (u_isdigit(digitC) == FALSE) {
+                break;
+            }
+            U16_FWD_1(replacementText, replIdx, replacementLength); /* care surrogates */
+            groupNum=groupNum*10 + u_charDigitValue(digitC);
+            numDigits++;
+            if (numDigits >= 3) { /* limit 999 groups */
+                break;
+            }
+        }
+        if (numDigits == 0) {
+            /* The $ didn't introduce a group number at all. */
+            /* Treat it as just part of the substitution text. */
+	    ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, &DOLLARSIGN, 1);
+            continue;
+        }
+        /* Finally, append the capture group data to the destination. */
+	error = U_ZERO_ERROR;
+	g_start = uregex_start(the_expr, groupNum, &error);
+	g_end   = uregex_end  (the_expr, groupNum, &error);
+	if(U_SUCCESS(error) && g_start != -1  ) {
+	   ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0,
+	    uregex_getText(the_expr, &len, &error) + g_start, g_end - g_start);
+	}
+    }
+    return ret;
+}
+VALUE
+icu_reg_get_prematch(pat, prev_end)
+     VALUE           pat;
+     long            prev_end;
+{
+    URegularExpression *the_expr = UREGEX(pat)->pattern;
+    UErrorCode      error = U_ZERO_ERROR;
+    int32_t         len = 0;
+    int32_t         cur_start = uregex_start(the_expr, 0, &error);
+    const UChar    *temp = uregex_getText(the_expr, &len, &error);
+    VALUE           pm =
+	icu_ustr_new(temp + prev_end, cur_start - prev_end);
+    return pm;
+}
+VALUE
+icu_reg_get_tail(pat, prev_end)
+     VALUE           pat;
+     long            prev_end;
+{
+    UErrorCode      error = U_ZERO_ERROR;
+    URegularExpression *the_expr = UREGEX(pat)->pattern;
+    int32_t         len = 0;
+    const UChar    *temp = uregex_getText(the_expr, &len, &error);
+    VALUE           pm = icu_ustr_new(temp + prev_end, len - prev_end);
+    return pm;
+}
+/**
+ * call-seq:
+ *    ure(str[, options]) => URegexp
+ *
+ * Creates URegexp object from UString.
+ * */
+VALUE
+icu_reg_from_rb_str(argc, argv, obj)
+     int             argc;
+     VALUE          *argv;
+     VALUE           obj;
+{
+    VALUE           pat,
+                    options = Qnil;
+    int             reg_opts = 0;
+    if (rb_scan_args(argc, argv, "11", &pat, &options) == 1) {
+	reg_opts = 0;
+    } else {
+	if (options != Qnil) {
+	    Check_Type(options, T_FIXNUM);
+	    reg_opts = FIX2INT(options);
+	}
+    }
+    if (TYPE(pat) == T_STRING)
+	pat = icu_from_rstr(0, NULL, pat);
+    if (CLASS_OF(pat) != rb_cUString)
+	rb_raise(rb_eArgError, "Expected String or UString");
+    return icu_reg_new(ICU_PTR(pat), ICU_LEN(pat), reg_opts);
+}
+/**
+ * call-seq:
+ *     umatch[idx] => string
+ *
+ * Returns capture group. Group 0 is for full match.
+ * */
+VALUE
+icu_umatch_aref(match, index)
+     VALUE           match,
+                     index;
+{
+    long idx;
+    VALUE cg;
+    Check_Type(index, T_FIXNUM);
+    idx = FIX2LONG(index);
+    cg = rb_iv_get(match, "@cg");
+    return rb_ary_entry(cg, idx);
+}
+/**
+ * call-seq:
+ *     umatch.range(idx) => range
+ *
+ * Returns range (start, end) of capture group. Group 0 is for full match.
+ *
+ * NOTE: this method returns <b>code unit</b> indexes. To convert this range
+ * to <b>code point</b> range use UString#conv_unit_range. If your chars don't
+ * require surrogate UTF16 pairs, range will be the same.
+ * */
+VALUE
+icu_umatch_range(match, index)
+     VALUE           match,
+                     index;
+{
+    long idx;
+    VALUE cg;
+    Check_Type(index, T_FIXNUM);
+    idx = FIX2LONG(index);
+    cg = rb_iv_get(match, "@ranges");
+    return rb_ary_entry(cg, idx);
+}
+/**
+ * call-seq:
+ *     umatch.size => fixnum
+ *
+ * Returns number of capture groups.
+ * */
+VALUE
+icu_umatch_size(match)
+     VALUE           match;
+{
+    VALUE           cg = rb_iv_get(match, "@cg");
+    return LONG2NUM(RARRAY_LEN(cg) - 1);
+}
+VALUE
+icu_umatch_init( self, re)
+     VALUE            self, re;
+{
+    UErrorCode           status = U_ZERO_ERROR;
+    long                 count, i, cu_start, cu_end;
+    URegularExpression * the_regex;
+    VALUE                obj, groups, ranges;
+    Check_Class(re, rb_cURegexp);
+    the_regex = UREGEX(re)->pattern;
+    count = icu_group_count(re);
+    if (U_FAILURE(status)) {
+	rb_raise(rb_eArgError, u_errorName(status));
+    }
+    groups = rb_ary_new2(count);
+    rb_iv_set(self, "@cg", groups);
+    for (i = 0; i <= count; i++) {
+	obj = icu_reg_nth_match(re, i);
+	rb_obj_freeze(obj);
+	rb_ary_store(groups, i, obj);
+    }
+    ranges = rb_ary_new2(count);
+    for ( i = 0; i <= count; i++){
+        cu_start = uregex_start(the_regex, i, &status);
+        cu_end  = uregex_end(the_regex, i, &status);
+	if( cu_start == -1) rb_ary_store(ranges, i, Qnil);
+	else rb_ary_store(ranges, i, rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0));
+    }
+    rb_iv_set(self, "@ranges", ranges);
+    return self;
+}
+VALUE icu_umatch_new(re)
+	VALUE 		re;
+{
+	return icu_umatch_init(rb_class_new_instance(0, NULL, rb_cUMatch), re);
+}
+void initialize_uregexp (void)
+{
+/* regular expressions */
+    rb_cURegexp = rb_define_class("URegexp", rb_cObject);
+    rb_define_alloc_func(rb_cURegexp, icu_reg_s_alloc);
+    rb_define_method(rb_cURegexp, "initialize", icu_reg_initialize_m, -1);
+    rb_define_method(rb_cURegexp, "to_u", icu_reg_to_u, 0);
+    rb_define_method(rb_cURegexp, "match", icu_reg_match, 1);
+    rb_define_method(rb_cURegexp, "split", icu_reg_split, 2);
+    rb_define_method(rb_cURegexp, "=~", icu_reg_match, 1);
+    rb_define_method(rb_cURegexp, "===", icu_reg_eqq, 1);
+    /* Enable case insensitive matching.  */
+    rb_define_const(rb_cURegexp, "IGNORECASE", INT2FIX(UREGEX_CASE_INSENSITIVE));
+    /* Allow white space and comments within patterns   */
+    rb_define_const(rb_cURegexp, "COMMENTS", INT2FIX(UREGEX_COMMENTS));
+    /* Control behavior of "$" and "^" If set, recognize line terminators  within string, otherwise, match only at start and end of input  string. */
+    rb_define_const(rb_cURegexp, "MULTILINE", INT2FIX(UREGEX_MULTILINE));
+    /* If set, '.' matches line terminators, otherwise '.' matching stops at line end.   */
+    rb_define_const(rb_cURegexp, "DOTALL", INT2FIX(UREGEX_DOTALL));
+    rb_define_global_function("ure", icu_reg_from_rb_str, -1);
+/**
+ * Document-class: UMatch
+ *
+ * Class to store information about capturing
+ * groups. Used in UString#sub, UString#gsub methods, as parameter to
+ * passed block.
+ */
+    rb_cUMatch = rb_define_class("UMatch", rb_cObject);
+    rb_define_method(rb_cUMatch, "[]", icu_umatch_aref, 1);
+    rb_define_method(rb_cUMatch, "size", icu_umatch_size, 0);
+    rb_define_method(rb_cUMatch, "range", icu_umatch_range, 1);
+    rb_define_method(rb_cRegexp, "to_u", icu_reg_from_rb_reg, 0);
+    rb_define_alias (rb_cRegexp, "U", "to_u");
+    rb_define_alias (rb_cRegexp, "ur", "to_u");
+}