icu4r 0.1.3.2006.01.26
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +153 -0
- data/calendar.c +576 -0
- data/docs/FORMATTING +131 -0
- data/docs/UNICODE_REGEXPS +204 -0
- data/extconf.rb +15 -0
- data/fmt.cpp +150 -0
- data/icu4r.c +14 -0
- data/icu_common.h +45 -0
- data/samples/demo_each.rb +23 -0
- data/samples/demo_locales.rb +16 -0
- data/samples/demo_regexp.rb +11 -0
- data/samples/resbundle/appmsg/root.res +0 -0
- data/samples/resbundle/appmsg/ru.res +0 -0
- data/samples/resbundle/demo_bundle.rb +4 -0
- data/samples/resbundle/mkres.sh +4 -0
- data/samples/resbundle/root.txt +10 -0
- data/samples/resbundle/ru.txt +4 -0
- data/test/test_calendar.rb +109 -0
- data/test/test_ustring.rb +381 -0
- data/tools/doc.sh +2 -0
- data/tools/km.rb +425 -0
- data/ubundle.c +209 -0
- data/ucore_ext.c +168 -0
- data/uregex.c +673 -0
- data/uregex.h +27 -0
- data/ustring.c +3042 -0
- metadata +81 -0
@@ -0,0 +1,204 @@
|
|
1
|
+
This is a one-stop reference on the Unicode regular expression syntax available in ICU.
|
2
|
+
This text is compiled from several sources in ICU userguide and code docs.
|
3
|
+
|
4
|
+
=== Regular Expression Metacharacters
|
5
|
+
|
6
|
+
Character Description
|
7
|
+
\a Match a BELL, \u0007
|
8
|
+
\A Match at the beginning of the input. Differs from ^ in that \A will not match after a new line within the input.
|
9
|
+
\b, outside of a [Set] Match if the current position is a word boundary. Boundaries occur
|
10
|
+
at the transitions between word (\w) and non-word (\W) characters,
|
11
|
+
with combining marks ignored. For better word boundaries, see ICU Boundary Analysis.
|
12
|
+
\b, within a [Set] Match a BACKSPACE, \u0008.
|
13
|
+
\B Match if the current position is not a word boundary.
|
14
|
+
\cX Match a control-X character.
|
15
|
+
\d Match any character with the Unicode General Category of Nd (Number, Decimal Digit.)
|
16
|
+
\D Match any character that is not a decimal digit.
|
17
|
+
\e Match an ESCAPE, \u001B.
|
18
|
+
\E Terminates a \Q ... \E quoted sequence.
|
19
|
+
\f Match a FORM FEED, \u000C.
|
20
|
+
\G Match if the current position is at the end of the previous match.
|
21
|
+
\n Match a LINE FEED, \u000A.
|
22
|
+
\N{UNICODE CHARACTER NAME} Match the named character.
|
23
|
+
\p{UNICODE PROPERTY NAME} Match any character with the specified Unicode Property.
|
24
|
+
\P{UNICODE PROPERTY NAME} Match any character not having the specified Unicode Property.
|
25
|
+
\Q Quotes all following characters until \E.
|
26
|
+
\r Match a CARRIAGE RETURN, \u000D.
|
27
|
+
\s Match a white space character. White space is defined as [\t\n\f\r\p{Z}].
|
28
|
+
\S Match a non-white space character.
|
29
|
+
\t Match a HORIZONTAL TABULATION, \u0009.
|
30
|
+
\uhhhh Match the character with the hex value hhhh.
|
31
|
+
\Uhhhhhhhh Match the character with the hex value hhhhhhhh. Exactly eight hex digits must be
|
32
|
+
provided, even though the largest Unicode code point is \U0010ffff.
|
33
|
+
\w Match a word character. Word characters are [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}].
|
34
|
+
\W Match a non-word character.
|
35
|
+
\x{hhhh} Match the character with hex value hhhh. From one to six hex digits may be supplied.
|
36
|
+
\xhh Match the character with two digit hex value hh
|
37
|
+
\X Match a Grapheme Cluster .
|
38
|
+
\Z Match if the current position is at the end of input, but before the final line terminator, if one exists.
|
39
|
+
\z Match if the current position is at the end of input.
|
40
|
+
\n Back Reference. Match whatever the nth capturing group matched. n must be a number >= 1 and <= total number
|
41
|
+
of capture groups in the pattern. Note: Octal escapes, such as \012, are not supported in ICU regular expressions
|
42
|
+
[pattern] Match any one character from the set. See UnicodeSet for a full description of what may appear in the pattern
|
43
|
+
. Match any character.
|
44
|
+
^ Match at the beginning of a line.
|
45
|
+
$ Match at the end of a line.
|
46
|
+
\ Quotes the following character. Characters that must be quoted to be treated as literals are * ? + [ ( ) { } ^ $ | \ . /
|
47
|
+
|
48
|
+
=== Regular Expression Operators
|
49
|
+
|
50
|
+
Operator Description
|
51
|
+
| Alternation. A|B matches either A or B.
|
52
|
+
* Match 0 or more times. Match as many times as possible.
|
53
|
+
+ Match 1 or more times. Match as many times as possible.
|
54
|
+
? Match zero or one times. Prefer one.
|
55
|
+
{n} Match exactly n times
|
56
|
+
{n,} Match at least n times. Match as many times as possible.
|
57
|
+
{n,m} Match between n and m times. Match as many times as possible, but not more than m.
|
58
|
+
*? Match 0 or more times. Match as few times as possible.
|
59
|
+
+? Match 1 or more times. Match as few times as possible.
|
60
|
+
?? Match zero or one times. Prefer zero.
|
61
|
+
{n}? Match exactly n times
|
62
|
+
{n,}? Match at least n times, but no more than required for an overall pattern match
|
63
|
+
{n,m}? Match between n and m times. Match as few times as possible, but not less than n.
|
64
|
+
*+ Match 0 or more times. Match as many times as possible when first encountered, do not retry with
|
65
|
+
fewer even if overall match fails (Possessive Match)
|
66
|
+
++ Match 1 or more times. Possessive match.
|
67
|
+
?+ Match zero or one times. Possessive match.
|
68
|
+
{n}+ Match exactly n times
|
69
|
+
{n,}+ Match at least n times. Possessive Match.
|
70
|
+
{n,m}+ Match between n and m times. Possessive Match.
|
71
|
+
( ... ) Capturing parentheses. Range of input that matched the parenthesized subexpression is
|
72
|
+
available after the match.
|
73
|
+
(?: ... ) Non-capturing parentheses. Groups the included pattern, but does not provide
|
74
|
+
capturing of matching text. Somewhat more efficient than capturing parentheses.
|
75
|
+
(?> ... ) Atomic-match parentheses. First match of the parenthesized subexpression is the only
|
76
|
+
one tried; if it does not lead to an overall pattern match, back up the search for a
|
77
|
+
match to a position before the "(?>"
|
78
|
+
(?# ... ) Free-format comment (?# comment ).
|
79
|
+
(?= ... ) Look-ahead assertion. True if the parenthesized pattern matches at the current input position,
|
80
|
+
but does not advance the input position.
|
81
|
+
(?! ... ) Negative look-ahead assertion. True if the parenthesized pattern does not match at the current
|
82
|
+
input position. Does not advance the input position.
|
83
|
+
(?<= ... ) Look-behind assertion. True if the parenthesized pattern matches text preceding the current
|
84
|
+
input position, with the last character of the match being the input character just before
|
85
|
+
the current position. Does not alter the input position. The length of possible strings matched
|
86
|
+
by the look-behind pattern must not be unbounded (no * or + operators.)
|
87
|
+
(?<! ... ) Negative Look-behind assertion. True if the parenthesized pattern does not
|
88
|
+
match text preceding the current input position, with the last character of the
|
89
|
+
match being the input character just before the current position. Does not alter
|
90
|
+
the input position. The length of possible strings matched by the look-behind pattern
|
91
|
+
must not be unbounded (no * or + operators.)
|
92
|
+
(?ismx-ismx: ... ) Flag settings. Evaluate the parenthesized expression with the specified flags enabled or -disabled.
|
93
|
+
(?ismx-ismx) Flag settings. Change the flag settings. Changes apply to the portion of the pattern
|
94
|
+
following the setting. For example, (?i) changes to a case insensitive match.
|
95
|
+
|
96
|
+
=== Replacement Text
|
97
|
+
|
98
|
+
The replacement text for find-and-replace operations may contain references to capture-group text from the find. References are of the form $n, where n is the number of the capture group.
|
99
|
+
|
100
|
+
Character Descriptions
|
101
|
+
$n The text of capture group n will be substituted for $n. n must be >= 0 and not
|
102
|
+
greater than the number of capture groups. A $ not followed by a digit has no special meaning,
|
103
|
+
and will appear in the substitution text as itself, a $.
|
104
|
+
\ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
|
105
|
+
substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
|
106
|
+
|
107
|
+
=== UnicodeSet Pattern syntax
|
108
|
+
|
109
|
+
These patterns follow a syntax similar to that employed by version 8 regular expression character classes.
|
110
|
+
|
111
|
+
Here are some simple examples:
|
112
|
+
|
113
|
+
[] No characters
|
114
|
+
[a] The character 'a'
|
115
|
+
[ae] The characters 'a' and 'e'
|
116
|
+
[a-e] The characters 'a' through 'e' inclusive, in Unicode code point order
|
117
|
+
[\u4E01] The character U+4E01
|
118
|
+
[a{ab}{ac}] The character 'a' and the multicharacter strings "ab" and "ac"
|
119
|
+
[\p{Lu}] All characters in the general category Uppercase Letter
|
120
|
+
|
121
|
+
Any character may be preceded by a backslash in order to remove any special meaning.
|
122
|
+
White space characters are ignored unless they are escaped.
|
123
|
+
|
124
|
+
Property patterns specify a set of characters having a certain property as defined by the
|
125
|
+
Unicode standard. Both the POSIX-like "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.
|
126
|
+
|
127
|
+
Patterns specify individual characters, ranges of characters, and Unicode property sets.
|
128
|
+
When elements are concatenated, they specify their union. To complement a set, place a '^'
|
129
|
+
immediately after the opening '['. Property patterns are inverted by modifying their delimiters;
|
130
|
+
"[:^foo:]" and "\\P{foo}". In any other location, '^' has no special meaning.
|
131
|
+
|
132
|
+
Ranges are indicated by placing a '-' between two characters, as in "a-z".
|
133
|
+
This specifies the range of all characters from the left to the right, in Unicode order.
|
134
|
+
If the left character is greater than or equal to the right character it is a syntax error.
|
135
|
+
If a '-' occurs as the first character after the opening '[' or '[^', or if it occurs as the
|
136
|
+
last character before the closing ']', then it is taken as a literal. Thus "[a\-b]", "[-ab]",
|
137
|
+
and "[ab-]" all indicate the same set of three characters, 'a', 'b', and '-'.
|
138
|
+
|
139
|
+
Sets may be intersected using the '&' operator or the asymmetric set difference may be taken using
|
140
|
+
the '-' operator, for example, "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all
|
141
|
+
Unicode letters with values less than 4096. Operators ('&' and '-') have equal precedence
|
142
|
+
and bind left-to-right. Thus "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
|
143
|
+
"[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for difference; intersection
|
144
|
+
is commutative.
|
145
|
+
|
146
|
+
[a] The set containing 'a'
|
147
|
+
[a-z] The set containing 'a' through 'z' and all letters in between, in Unicode order
|
148
|
+
[^a-z] The set containing all characters but 'a' through 'z', that is, U+0000 through 'a'-1
|
149
|
+
and 'z'+1 through U+10FFFF
|
150
|
+
[[pat1][pat2]] The union of sets specified by pat1 and pat2
|
151
|
+
[[pat1]&[pat2]] The intersection of sets specified by pat1 and pat2
|
152
|
+
[[pat1]-[pat2]] The asymmetric difference of sets specified by pat1 and pat2
|
153
|
+
[:Lu:] or \p{Lu} The set of characters having the specified Unicode property; in this case, Unicode uppercase letters
|
154
|
+
[:^Lu:] or \P{Lu} The set of characters not having the given Unicode property
|
155
|
+
|
156
|
+
Warning: you cannot add an empty string ("") to a UnicodeSet.
|
157
|
+
|
158
|
+
Formal syntax
|
159
|
+
|
160
|
+
pattern := ('[' '^'? item* ']') | property
|
161
|
+
item := char | (char '-' char) | pattern-expr
|
162
|
+
pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern
|
163
|
+
op := '&' | '-'
|
164
|
+
special := '[' | ']' | '-'
|
165
|
+
char := any character that is not special | ('\' any character) | ('\u' hex hex hex hex)
|
166
|
+
hex := any character for which Character.digit(c, 16) returns a non-negative result
|
167
|
+
property := a Unicode property set pattern
|
168
|
+
|
169
|
+
Legend:
|
170
|
+
a := b a may be replaced by b
|
171
|
+
a? zero or one instance of a
|
172
|
+
a* zero or more instances of a
|
173
|
+
a | b either a or b
|
174
|
+
'a' the literal string between the quotes
|
175
|
+
|
176
|
+
The C/POSIX character classes are also available in UnicodeSet patterns, using patterns like [:graph:] or \p{graph}.
|
177
|
+
|
178
|
+
alpha , lower , upper , punct , digit , xdigit, alnum , space , blank , cntrl , graph , print
|
179
|
+
|
180
|
+
=== General Categories
|
181
|
+
|
182
|
+
Lu = Letter, uppercase Mn = Mark, nonspacing Nd = Number, decimal digit
|
183
|
+
Ll = Letter, lowercase Mc = Mark, spacing combining Nl = Number, letter
|
184
|
+
Lt = Letter, titlecase Me = Mark, enclosing No = Number, other
|
185
|
+
Lm = Letter, modifier
|
186
|
+
Lo = Letter, other
|
187
|
+
|
188
|
+
Zs = Separator, space Cc = Other, control
|
189
|
+
Zl = Separator, line Cf = Other, format
|
190
|
+
Zp = Separator, paragraph Cs = Other, surrogate
|
191
|
+
Co = Other, private use
|
192
|
+
Cn = Other, not assigned (including noncharacters)
|
193
|
+
|
194
|
+
Pc = Punctuation, connector Sm = Symbol, math
|
195
|
+
Pd = Punctuation, dash Sc = Symbol, currency
|
196
|
+
Ps = Punctuation, open Sk = Symbol, modifier
|
197
|
+
Pe = Punctuation, close So = Symbol, other
|
198
|
+
Pi = Punctuation, initial quote (may behave like Ps or Pe depending on usage)
|
199
|
+
Pf = Punctuation, final quote (may behave like Ps or Pe depending on usage)
|
200
|
+
Po = Punctuation, other
|
201
|
+
|
202
|
+
|
203
|
+
See also http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt and
|
204
|
+
http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt for additional info.
|
data/extconf.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Build configuration for the icu4r extension: verifies that ICU 3.4 is
# available, generates the Makefile, and appends a `check` target that runs
# the two test files.
require 'mkmf'

# Extend the flags mkmf already configured instead of discarding them.
$LDFLAGS = "#{$LDFLAGS} -licuuc -licui18n -licudata -lstdc++ "
$CFLAGS = "#{$CFLAGS} -Wall"

# u_init_3_4 is the versioned symbol, so this only succeeds against ICU 3.4.
unless have_library('icui18n', 'u_init_3_4')
  abort "ICU v3.4 required -- not found."
end

create_makefile('icu4r')

# Block form guarantees the Makefile handle is flushed and closed.
# Make recipe lines must begin with a tab, hence the explicit \t escapes.
File.open("Makefile", "a") do |makefile|
  makefile << <<-EOT

check: $(DLLIB)
\t@$(RUBY) $(srcdir)/test/test_ustring.rb
\t@$(RUBY) $(srcdir)/test/test_calendar.rb

  EOT
end
|
data/fmt.cpp
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "icu_common.h"
|
3
|
+
#include <unicode/msgfmt.h>
|
4
|
+
#include <unicode/translit.h>
|
5
|
+
#include <unicode/smpdtfmt.h>
|
6
|
+
#include <unicode/calendar.h>
|
7
|
+
#include <unicode/ucal.h>
|
8
|
+
/* This file contains various C-C++ wrappers, to ease my life
|
9
|
+
*/
|
10
|
+
extern "C" {
|
11
|
+
extern VALUE rb_cUString;
|
12
|
+
extern VALUE rb_cUCalendar;
|
13
|
+
extern VALUE icu_ustr_new(const UChar * str, long len);
|
14
|
+
extern VALUE icu_ustr_new_set(const UChar * str, long len, long capa);
|
15
|
+
|
16
|
+
/* Formats an ICU MessageFormat pattern with Ruby arguments.
 *
 *   pattern, len  - the pattern text and its length in UChars
 *   args, arg_len - Ruby array of arguments and how many of them to use
 *   locale        - locale id handed to icu::Locale
 *
 * Accepted argument types: Fixnum and Float (passed as double), UString
 * and Time. Any other type raises ArgumentError ("wrong arg type").
 * ICU failures raise ArgumentError ("Can't format: <ICU error name>").
 * Returns the formatted text as a new UString.
 */
VALUE icu_format(UChar * pattern, int32_t len, VALUE args, int32_t arg_len, char * locale)
{
    Formattable * arguments = new Formattable[arg_len];
    for (int32_t i = 0; i < arg_len; i++) {
        VALUE obj = rb_ary_entry(args, i);
        int type = TYPE(obj);
        if (type == T_FIXNUM || type == T_FLOAT) {
            arguments[i].setDouble(rb_num2dbl(obj));
        } else if (CLASS_OF(obj) == rb_cUString) {
            arguments[i].setString(UnicodeString(ICU_PTR(obj), ICU_LEN(obj)));
        } else if (CLASS_OF(obj) == rb_cTime) {
            // ICU expects milliseconds since 01.01.1970
            arguments[i].setDate(rb_num2dbl(rb_funcall(obj, rb_intern("to_f"), 0))*1000);
        } else {
            delete [] arguments;
            rb_raise(rb_eArgError, "wrong arg type: %s", rb_class2name(CLASS_OF(obj)));
        }
    }

    UErrorCode status = U_ZERO_ERROR;
    UChar * buf = NULL;
    int32_t blen = 0;
    {
        /* rb_raise() does not run C++ destructors, so every ICU object lives
         * in this inner scope and is destroyed before any error is raised. */
        UnicodeString patString(pattern, len);
        UnicodeString resultStr;
        FieldPosition fieldPosition(0);
        MessageFormat fmt(patString, Locale(locale), status);
        if (U_SUCCESS(status)) {
            fmt.format(arguments, arg_len, resultStr, fieldPosition, status);
        }
        if (U_SUCCESS(status)) {
            blen = resultStr.length();
            buf = ALLOC_N(UChar, blen + 1);
            /* capacity blen + 1 lets ICU write the terminating NUL */
            resultStr.extract(buf, blen + 1, status);
        }
    }
    delete [] arguments;

    if (U_FAILURE(status)) {
        if (buf != NULL) xfree(buf);
        rb_raise(rb_eArgError, "Can't format: %s", u_errorName(status));
    }

    VALUE ret = icu_ustr_new(buf, blen);
    /* buf came from ALLOC_N, so it is released with xfree rather than free */
    xfree(buf);
    return ret;
}
|
66
|
+
/* Parses a date string with an ICU SimpleDateFormat pattern.
 *
 *   str, str_len - the date pattern and its length in UChars
 *   locale       - locale id handed to icu::Locale
 *   val, len     - the text to parse and its length in UChars
 *
 * Returns a Ruby Time built from the parsed value; the sub-second part is
 * discarded (the value is cast to whole seconds, microseconds are 0).
 * Raises ArgumentError carrying the ICU error name when the pattern is
 * rejected or the text cannot be parsed.
 */
VALUE icu_date_parse(UChar * str, int32_t str_len, char * locale, UChar * val, int32_t len)
{
    UErrorCode status = U_ZERO_ERROR;
    UDate p_time = 0;
    {
        /* The formatter lives in its own scope so that it is destroyed
         * before rb_raise(), which does not run C++ destructors. */
        SimpleDateFormat formatter(UnicodeString(str, str_len), Locale(locale), status);
        if (U_SUCCESS(status)) {
            status = U_ZERO_ERROR; /* drop any warning left by the constructor */
            p_time = formatter.parse(UnicodeString(val, len), status);
        }
    }
    if (U_FAILURE(status)) {
        /* "%s" keeps the error name from being interpreted as a format string */
        rb_raise(rb_eArgError, "%s", u_errorName(status));
    }
    return rb_time_new((time_t) (p_time/1000.0), 0);
}
|
80
|
+
/* Transliterates a string with an ICU Transliterator.
 *
 *   str, str_len    - the text to transliterate and its length in UChars
 *   id, id_len      - transliterator id and its length in UChars
 *   rules, rule_len - optional rule text; when NULL the transliterator is
 *                     looked up by id, otherwise it is built from the rules
 *
 * Always runs in the forward direction. Returns the result through
 * icu_ustr_new_set(). Raises RuntimeError carrying the ICU error name
 * when the transliterator cannot be created.
 */
VALUE icu_transliterate(UChar * str, int32_t str_len, UChar * id, int32_t id_len, UChar * rules, int32_t rule_len)
{
    UErrorCode status = U_ZERO_ERROR;
    UParseError p_error;
    Transliterator * t;
    if (rules != NULL) {
        t = Transliterator::createFromRules(UnicodeString(id, id_len), UnicodeString(rules, rule_len),
                                            UTRANS_FORWARD, p_error, status);
    } else {
        t = Transliterator::createInstance(UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status);
    }
    if (U_FAILURE(status)) {
        delete t; /* harmless when creation returned NULL */
        rb_raise(rb_eRuntimeError, "%s", u_errorName(status));
    }

    UChar * buf = NULL;
    int32_t blen = 0;
    {
        /* Stack object: no heap UnicodeString to leak, and it is destroyed
         * before the result is handed to Ruby. */
        UnicodeString src(str, str_len);
        t->transliterate(src);
        blen = src.length();
        buf = ALLOC_N(UChar, blen + 1);
        /* capacity blen + 1 so the reserved last slot receives the NUL */
        src.extract(buf, blen + 1, status);
    }
    delete t;

    /* NOTE(review): buf is not freed here, as in the original; presumably
     * icu_ustr_new_set() takes ownership of it -- confirm in ustring.c. */
    return icu_ustr_new_set(buf, blen, blen + 1);
}
|
105
|
+
extern void icu4r_cal_free(UCalendar *);
|
106
|
+
|
107
|
+
/* Returns a new UCalendar object wrapping a clone() of the Calendar held
 * by `cal`. The wrapper has no mark function and is released through
 * icu4r_cal_free. */
VALUE icu4r_cal_clone(VALUE cal)
{
    Calendar * source = (Calendar *) DATA_PTR(cal);
    Calendar * copy = source->clone();
    return Data_Wrap_Struct(rb_cUCalendar, 0, icu4r_cal_free, copy);
}
|
113
|
+
#define CPP_CALENDAR(obj) ((Calendar*)DATA_PTR(obj))
|
114
|
+
/* Returns Qtrue when calendar `cal` is before calendar `obj` according to
 * Calendar::before(), Qfalse otherwise.
 * Raises TypeError (via Check_Class) when `obj` is not a UCalendar, and
 * ArgumentError carrying the ICU error name when the comparison fails. */
VALUE icu4r_cal_before(VALUE cal, VALUE obj)
{
    UErrorCode status = U_ZERO_ERROR;
    UBool answer;
    Check_Class( obj, rb_cUCalendar);
    Calendar *other = CPP_CALENDAR(obj);
    answer = CPP_CALENDAR(cal)->before(*other, status);
    /* "%s" keeps the error name from being interpreted as a format string */
    if( U_FAILURE(status) ) rb_raise(rb_eArgError, "%s", u_errorName(status));
    return answer ? Qtrue : Qfalse;
}
|
124
|
+
|
125
|
+
/* Returns Qtrue when Calendar::equals() reports `cal` and `obj` as equal,
 * Qfalse otherwise.
 * Raises TypeError (via Check_Class) when `obj` is not a UCalendar, and
 * ArgumentError carrying the ICU error name when the comparison fails. */
VALUE icu4r_cal_time_equals(VALUE cal, VALUE obj)
{
    UErrorCode status = U_ZERO_ERROR;
    UBool answer;
    Check_Class( obj, rb_cUCalendar);
    Calendar *other = CPP_CALENDAR(obj);
    answer = CPP_CALENDAR(cal)->equals(*other, status);
    /* "%s" keeps the error name from being interpreted as a format string */
    if( U_FAILURE(status) ) rb_raise(rb_eArgError, "%s", u_errorName(status));
    return answer ? Qtrue : Qfalse;
}
|
135
|
+
|
136
|
+
/* Returns Qtrue when `cal` is after `obj`. Implemented by asking the
 * mirrored question: is `obj` before `cal`?
 * Raises TypeError (via Check_Class) when `obj` is not a UCalendar. */
VALUE icu4r_cal_after(VALUE cal, VALUE obj)
{
    Check_Class(obj, rb_cUCalendar);
    VALUE obj_precedes_cal = icu4r_cal_before(obj, cal);
    return obj_precedes_cal;
}
|
141
|
+
|
142
|
+
/* Compares two calendars with Calendar::operator== and returns Qtrue or
 * Qfalse. Raises TypeError (via Check_Class) when `obj` is not a UCalendar. */
VALUE icu4r_cal_equal(VALUE cal, VALUE obj)
{
    Check_Class(obj, rb_cUCalendar);
    Calendar * lhs = CPP_CALENDAR(cal);
    Calendar * rhs = CPP_CALENDAR(obj);
    if ((*lhs) == (*rhs)) {
        return Qtrue;
    }
    return Qfalse;
}
|
149
|
+
}
|
150
|
+
|
data/icu4r.c
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
extern void initialize_ustring(void);
|
2
|
+
extern void initialize_calendar(void);
|
3
|
+
extern void initialize_uregexp(void);
|
4
|
+
extern void initialize_ucore_ext(void);
|
5
|
+
extern void initialize_ubundle(void);
|
6
|
+
/* Extension entry point: runs the initializer of each icu4r component.
 * The call order is kept exactly as it was. */
void Init_icu4r (void)
{
    initialize_ustring();
    initialize_uregexp();
    initialize_ucore_ext();
    initialize_ubundle();
    initialize_calendar();
}
|
data/icu_common.h
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <unicode/utypes.h>
|
3
|
+
#include <unicode/ustring.h>
|
4
|
+
#include <unicode/ustdio.h>
|
5
|
+
#include <unicode/uchar.h>
|
6
|
+
#include <unicode/uclean.h>
|
7
|
+
#include <unicode/uregex.h>
|
8
|
+
#include <unicode/unorm.h>
|
9
|
+
#include <unicode/ubrk.h>
|
10
|
+
#include <unicode/ucnv.h>
|
11
|
+
#include <unicode/uset.h>
|
12
|
+
#include <unicode/uenum.h>
|
13
|
+
#include <unicode/utrans.h>
|
14
|
+
#include <unicode/ucol.h>
|
15
|
+
#include <unicode/usearch.h>
|
16
|
+
#include <unicode/ures.h>
|
17
|
+
#include <unicode/unum.h>
|
18
|
+
|
19
|
+
#ifdef HAVE_UNISTD_H
|
20
|
+
#include <unistd.h>
|
21
|
+
#endif
|
22
|
+
typedef struct {
|
23
|
+
long len;
|
24
|
+
long capa;
|
25
|
+
UChar *ptr;
|
26
|
+
char busy;
|
27
|
+
} ICUString ;
|
28
|
+
#define USTRING(obj) ((ICUString *)DATA_PTR(obj))
|
29
|
+
#define UREGEX(obj) ((ICURegexp *)DATA_PTR(obj))
|
30
|
+
#define ICU_PTR(str) USTRING(str)->ptr
|
31
|
+
#define ICU_LEN(str) USTRING(str)->len
|
32
|
+
#define ICU_CAPA(str) USTRING(str)->capa
|
33
|
+
#define ICU_RESIZE(str,capacity) REALLOC_N(ICU_PTR(str), UChar, (capacity)+1);
|
34
|
+
|
35
|
+
typedef struct {
|
36
|
+
URegularExpression *pattern;
|
37
|
+
int options;
|
38
|
+
} ICURegexp;
|
39
|
+
|
40
|
+
|
41
|
+
#define Check_Class(obj, klass) if(CLASS_OF(obj) != klass) rb_raise(rb_eTypeError, "Wrong type: expected %s, got %s", rb_class2name(klass), rb_class2name(rb_obj_class(obj)));
|
42
|
+
|
43
|
+
|
44
|
+
/* Raise RuntimeError carrying the ICU error name when `status` is a failure.
 * The name is passed through "%s" so it is never treated as a format string. */
#define ICU_RAISE(status) if(U_FAILURE(status)) rb_raise(rb_eRuntimeError, "%s", u_errorName(status));
|
45
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Demo: splits a multilingual sample text with each_line_break, each_char,
# each_sentence and each_word under several locales and prints the pieces
# joined by "|".
require 'icu4r'

sample = <<-EOT
外国語の勉強と教え
Изучение и обучение иностранных языков
Enseñanza y estudio de idiomas
'læŋɡwidʒ 'lɘr:niŋ ænd 'ti:ʃiŋ
ללמוד וללמד את השֵפה
L'enseignement et l'étude des langues
Γλωσσική Εκμὰθηση και Διδασκαλία
เรียนและสอนภาษา
EOT
src = sample.u

res = {}
%w[line_break char sentence word].each do |brk|
  res[brk] = {}
  %w[ja en th].each do |loc|
    pieces = []
    # Dispatches to each_line_break / each_char / each_sentence / each_word.
    src.send(:"each_#{brk}", loc) { |piece| pieces << piece }
    res[brk][loc] = pieces.join("|")
    puts "---------#{brk}-------#{loc}---------"
    puts pieces.join("|")
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Demo: for every locale reported by UString::list_locales, prints one
# tab-separated row: the locale id, its country/language names from the
# "en" bundle, the same names from the locale's own bundle, a formatted
# date and currency sample, and the locale's exemplar characters.
require 'icu4r'

root  = UResourceBundle.open(nil, "en")
today = Time.now

UString::list_locales.each do |locale|
  b = UResourceBundle.open(nil, locale)
  lang, ctry, var = locale.split('_', 3)
  # When a third part is present it is used as the country key.
  ctry = var || ctry

  in_english  = "(" + root["Countries"][ctry].to_s + " : " + root["Languages"][lang].to_s + ")"
  in_native   = "(" + b["Countries"][ctry].to_s + " : " + b["Languages"][lang].to_s + ")"
  date_amount = "[{0,date,long}]({1,number,currency})".u.fmt(locale, today, 123.45)

  row = [locale, in_english, in_native, date_amount, b["ExemplarCharacters"]]
  puts row.join("\t")
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Demo: matches a mixed Latin/Japanese string against Unicode property
# patterns and prints the result of each =~ match, then scans for numbers.
require 'icu4r'

str = " abcあいうえおアイウエオアイウエオ漢字,0123スクリプト".u
puts str.inspect_names

patterns = [
  '[\p{Script=Latin}]+',
  '[\p{Script=Hiragana}]+',
  '[\p{Script=Katakana}]+',
  '[\p{Script=Hiragana}\p{Script=Katakana}]+',
  '[\p{blk=CJKUnifiedIdeographs}]+',
  '[\p{L}]+',
  '\u3042' # あ
]
patterns.each do |pattern|
  p(str =~ ure(pattern))
end

p str.scan(ure('[\p{N}]'))
|
Binary file
|
Binary file
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require './icu4r'
|
2
|
+
require 'test/unit'
|
3
|
+
# these tests are ICU 3.4 dependent
|
4
|
+
# Unit tests for UCalendar. Expected values (zone count, zone lists,
# formatted output) are tied to the data shipped with ICU 3.4.
class UCalendarTest < Test::Unit::TestCase

  def test_time_zones
    v = UCalendar.time_zones
    assert_equal(577, v.size)
    assert_equal(UString, v[0].class)
    assert_equal("ACT".u, v[0])
  end

  def test_default
    saved_tz = UCalendar.default_tz
    UCalendar.default_tz = "Europe/Paris".u
    assert_equal("Europe/Paris".u, UCalendar.default_tz)
    c = UCalendar.new
    assert_equal("GMT+01:00".u, c.time_zone)
  ensure
    # The default zone is set on UCalendar itself, so put the previous value
    # back to keep the remaining tests independent of this one.
    UCalendar.default_tz = saved_tz if saved_tz
  end

  def test_dst
    assert_equal(3600000, UCalendar.dst_savings("America/Detroit".u))
    assert_equal(1800000, UCalendar.dst_savings("Australia/Lord_Howe".u))
  end

  def test_tz_for_country
    zones = %w{Europe/Kiev Europe/Simferopol Europe/Uzhgorod Europe/Zaporozhye}.collect {|s| s.to_u}
    assert_equal(zones, UCalendar.tz_for_country("UA"))
  end

  def test_time_now
    # Compared at a 100-second granularity: seconds/100 vs milliseconds/100000.
    assert_equal(Time.now.to_i/100, UCalendar.now.to_i/100000)
  end

  def test_in_daylight
    t = UCalendar.new
    t.set_date(2006, 8, 22)
    t.time_zone = "US/Hawaii".u
    assert_equal(false, t.in_daylight_time?)
    t.time_zone = "Europe/Berlin".u
    assert_equal(true, t.in_daylight_time?)
  end

  def test_set_date
    t = UCalendar.new
    t.set_date(2006, 0, 22)
    assert_equal(2006, t[:year])
    assert_equal(0, t[:month])
    assert_equal(22, t[:date])
    t[:year] = 2007
    t[:month] = 2
    t[:date] = 23
    assert_equal(2007, t[:year])
    assert_equal(2, t[:month])
    assert_equal(23, t[:date])
  end

  def test_set_date_time
    t = UCalendar.new
    t.set_date_time(2006, 0, 22, 11, 22, 33)
    assert_equal(11, t[:hour])
    assert_equal(22, t[:minute])
    assert_equal(33, t[:second])
  end

  def test_millis
    m = UCalendar.now
    t = UCalendar.new
    assert(m <= t.millis)
    n = Time.now.to_i
    t.millis = n * 1000.0
    assert_equal(n*1000.0, t.millis)
  end

  def test_add_time
    t = UCalendar.new
    t.set_date_time(2006, 0, 22, 11, 22, 33)
    t.add(:week_of_year, 1)
    assert_equal(29, t[:date])
    t.add(:hour, 48)
    assert_equal(31, t[:date])
  end

  def test_format
    t = UCalendar.new
    t.set_date_time(2006, 0, 22, 11, 22, 33)
    t.time_zone = "Europe/London".u
    assert_equal("2006/01/22 11:22:33 GMT AD".u, t.format("yyyy/MM/dd HH:mm:ss z G".u, "en"))
  end

  def test_clone_and_compare
    c = UCalendar.new
    d = c.clone
    assert(c == d)
    assert(! (c < d) )
    assert(! (c > d) )
    assert(c.eql?(d))
    c.add(:date, 1)
    assert(c != d)
    assert(! (c < d) )
    assert( (c > d) )
    assert(!c.eql?(d))
    d.add(:date, 1)
    assert(c.eql?(d))
    # After only the zone changes, eql? fails while == still holds.
    d.time_zone = "Europe/Kiev".u
    assert(!c.eql?(d))
    assert(c == d)
  end
end
|