icu4r_19 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +87 -0
- data/MIT-LICENSE +20 -0
- data/README +156 -0
- data/Rakefile +32 -0
- data/calendar.c +636 -0
- data/collator.c +233 -0
- data/converter.c +322 -0
- data/docs/FORMATTING +131 -0
- data/docs/UNICODE_REGEXPS +204 -0
- data/extconf.rb +17 -0
- data/fmt.cpp +156 -0
- data/icu4r.c +18 -0
- data/icu_common.h +45 -0
- data/lib/dummy +0 -0
- data/samples/demo_each.rb +23 -0
- data/samples/demo_locales.rb +16 -0
- data/samples/demo_regexp.rb +11 -0
- data/samples/resbundle/appmsg/root.res +0 -0
- data/samples/resbundle/appmsg/ru.res +0 -0
- data/samples/resbundle/demo_bundle.rb +4 -0
- data/samples/resbundle/mkres.sh +4 -0
- data/samples/resbundle/root.txt +10 -0
- data/samples/resbundle/ru.txt +4 -0
- data/test/test_calendar.rb +123 -0
- data/test/test_collator.rb +33 -0
- data/test/test_converter.rb +72 -0
- data/test/test_ustring.rb +508 -0
- data/tools/doc.sh +2 -0
- data/tools/km.rb +425 -0
- data/ubundle.c +223 -0
- data/ucore_ext.c +168 -0
- data/uregex.c +697 -0
- data/uregex.h +27 -0
- data/ustring.c +3039 -0
- metadata +164 -0
data/docs/FORMATTING
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
=== Locale-sensitive Message Formatting
|
2
|
+
|
3
|
+
Basic pattern rules are:
|
4
|
+
|
5
|
+
messageFormatPattern := string ( "{" messageFormatElement "}" string )*
|
6
|
+
|
7
|
+
messageFormatElement := argument { "," elementFormat }
|
8
|
+
|
9
|
+
elementFormat := "time" { "," datetimeStyle }
|
10
|
+
| "date" { "," datetimeStyle }
|
11
|
+
| "number" { "," numberStyle }
|
12
|
+
| "spellout"
|
13
|
+
| "ordinal"
|
14
|
+
| "duration"
|
15
|
+
| "choice" "," choiceStyle
|
16
|
+
|
17
|
+
datetimeStyle := "short"
|
18
|
+
| "medium"
|
19
|
+
| "long"
|
20
|
+
| "full"
|
21
|
+
| dateFormatPattern
|
22
|
+
|
23
|
+
numberStyle := "currency"
|
24
|
+
| "percent"
|
25
|
+
| "integer"
|
26
|
+
| numberFormatPattern
|
27
|
+
|
28
|
+
choiceStyle := choiceFormatPattern
|
29
|
+
|
30
|
+
=== numberFormatPattern
|
31
|
+
|
32
|
+
pattern := subpattern{;subpattern}
|
33
|
+
subpattern := {prefix}integer{.fraction}{suffix}
|
34
|
+
|
35
|
+
prefix := '\\u0000'..'\\uFFFD' - specialCharacters
|
36
|
+
suffix := '\\u0000'..'\\uFFFD' - specialCharacters
|
37
|
+
integer := '#'* '0'* '0'
|
38
|
+
fraction := '0'* '#'*
|
39
|
+
|
40
|
+
Notation:
|
41
|
+
X* 0 or more instances of X
|
42
|
+
(X | Y) either X or Y.
|
43
|
+
X..Y any character from X up to Y, inclusive.
|
44
|
+
S - T characters in S, except those in T
|
45
|
+
|
46
|
+
The first subpattern is for positive numbers. The second (optional)
|
47
|
+
subpattern is used for negative numbers. (In both cases, ',' can
|
48
|
+
occur inside the integer portion--it is just too messy to indicate
|
49
|
+
in BNF.) For the second subpattern, only the PREFIX and SUFFIX are
|
50
|
+
noted; other attributes are taken only from the first subpattern.
|
51
|
+
|
52
|
+
Here are the special characters used in the parts of the
|
53
|
+
subpattern, with notes on their usage.
|
54
|
+
|
55
|
+
Symbol Meaning
|
56
|
+
0 a digit; shown as 0 when the digit is zero
|
57
|
+
# a digit, suppressed if zero
|
58
|
+
. placeholder for decimal separator
|
59
|
+
, placeholder for grouping separator.
|
60
|
+
E separates mantissa and exponent for exponential formats.
|
61
|
+
; separates formats.
|
62
|
+
- default negative prefix.
|
63
|
+
% multiply by 100 and show as percentage
|
64
|
+
\u2030 multiply by 1000 and show as per mille
|
65
|
+
\u00A4 currency sign; replaced by currency symbol; if doubled, replaced by international currency symbol.
|
66
|
+
If present in a pattern, the monetary decimal separator
|
67
|
+
is used instead of the decimal separator.
|
68
|
+
X any other characters can be used in the prefix or suffix
|
69
|
+
' used to quote special characters in a prefix or suffix.
|
70
|
+
|
71
|
+
=== dateFormatPattern
|
72
|
+
|
73
|
+
Symbol Meaning Presentation Example
|
74
|
+
------ ------- ------------ -------
|
75
|
+
G era designator (Text) AD
|
76
|
+
y year (Number) 1996
|
77
|
+
Y year/week of year (Number) 1996
|
78
|
+
M month in year (Text & Number) July & 07
|
79
|
+
d day in month (Number) 10
|
80
|
+
h hour in am/pm (1~12) (Number) 12
|
81
|
+
H hour in day (0~23) (Number) 0
|
82
|
+
m minute in hour (Number) 30
|
83
|
+
s second in minute (Number) 55
|
84
|
+
S millisecond (Number) 978
|
85
|
+
E day of week (Text) Tuesday
|
86
|
+
e day of week/local (1~7) (Number) 2
|
87
|
+
D day of year (Number) 189
|
88
|
+
F day of week in month (Number) 2 (2nd Wed in July)
|
89
|
+
w week in year (Number) 27
|
90
|
+
W week in month (Number) 2
|
91
|
+
a am/pm marker (Text) PM
|
92
|
+
k hour in day (1~24) (Number) 24
|
93
|
+
K hour in am/pm (0~11) (Number) 0
|
94
|
+
z time zone (Text) Pacific Standard Time
|
95
|
+
' escape for text
|
96
|
+
'' single quote '
|
97
|
+
|
98
|
+
|
99
|
+
=== choiceFormatPattern
|
100
|
+
In most cases, the preferred way to define a ChoiceFormat is with a pattern. Here is an example of a ChoiceFormat pattern:
|
101
|
+
|
102
|
+
0≤are no files|1≤is one file|1<are many files
|
103
|
+
|
104
|
+
or equivalently,
|
105
|
+
|
106
|
+
0#are no files|1#is one file|1<are many files
|
107
|
+
|
108
|
+
The pattern consists of a number of range specifiers separated by vertical bars '|' (U+007C). There is no vertical bar after the last range. Each range specifier is of the form: Number Separator String
|
109
|
+
|
110
|
+
Number is a floating point number that can be parsed by a default
|
111
|
+
NumberFormat for the US locale. It gives the lower limit of this range.
|
112
|
+
The lower limit is either inclusive or exclusive, depending on the separator.
|
113
|
+
The upper limit is given by the lower limit of the next range. The Unicode infinity
|
114
|
+
sign ∞ (U+221E) is recognized for positive infinity. It may be preceded by '-' (U+002D)
|
115
|
+
to indicate negative infinity.
|
116
|
+
|
117
|
+
String is the format string for this range, with special characters enclosed in single
|
118
|
+
quotes ('The # sign'). Single quotes themselves are indicated by two single quotes in a
|
119
|
+
row ('o''clock').
|
120
|
+
|
121
|
+
Separator is one of the following single characters:
|
122
|
+
* '≤' (U+2264) or '#' (U+0023) indicates that the lower limit given by
|
123
|
+
Number is inclusive. (The two characters are equivalent to ChoiceFormat.)
|
124
|
+
This means that the limit value Number belongs to this range. Another way of
|
125
|
+
saying this is that the corresponding closure is FALSE.
|
126
|
+
|
127
|
+
* '<' (U+003C) indicates that the lower limit given by Number is exclusive.
|
128
|
+
This means that the value Number belongs to the prior range. Another way of saying
|
129
|
+
this is that the corresponding closure is TRUE.
|
130
|
+
|
131
|
+
See ICU docs for more info and examples.
|
@@ -0,0 +1,204 @@
|
|
1
|
+
This is a one-stop reference on the Unicode regular expression syntax available in ICU.
|
2
|
+
This text is compiled from several sources in ICU userguide and code docs.
|
3
|
+
|
4
|
+
=== Regular Expression Metacharacters
|
5
|
+
|
6
|
+
Character Description
|
7
|
+
\a Match a BELL, \u0007
|
8
|
+
\A Match at the beginning of the input. Differs from ^ in that \A will not match after a new line within the input.
|
9
|
+
\b, outside of a [Set] Match if the current position is a word boundary. Boundaries occur
|
10
|
+
at the transitions between word (\w) and non-word (\W) characters,
|
11
|
+
with combining marks ignored. For better word boundaries, see ICU Boundary Analysis.
|
12
|
+
\b, within a [Set] Match a BACKSPACE, \u0008.
|
13
|
+
\B Match if the current position is not a word boundary.
|
14
|
+
\cX Match a control-X character.
|
15
|
+
\d Match any character with the Unicode General Category of Nd (Number, Decimal Digit.)
|
16
|
+
\D Match any character that is not a decimal digit.
|
17
|
+
\e Match an ESCAPE, \u001B.
|
18
|
+
\E Terminates a \Q ... \E quoted sequence.
|
19
|
+
\f Match a FORM FEED, \u000C.
|
20
|
+
\G Match if the current position is at the end of the previous match.
|
21
|
+
\n Match a LINE FEED, \u000A.
|
22
|
+
\N{UNICODE CHARACTER NAME} Match the named character.
|
23
|
+
\p{UNICODE PROPERTY NAME} Match any character with the specified Unicode Property.
|
24
|
+
\P{UNICODE PROPERTY NAME} Match any character not having the specified Unicode Property.
|
25
|
+
\Q Quotes all following characters until \E.
|
26
|
+
\r Match a CARRIAGE RETURN, \u000D.
|
27
|
+
\s Match a white space character. White space is defined as [\t\n\f\r\p{Z}].
|
28
|
+
\S Match a non-white space character.
|
29
|
+
\t Match a HORIZONTAL TABULATION, \u0009.
|
30
|
+
\uhhhh Match the character with the hex value hhhh.
|
31
|
+
\Uhhhhhhhh Match the character with the hex value hhhhhhhh. Exactly eight hex digits must be
|
32
|
+
provided, even though the largest Unicode code point is \U0010ffff.
|
33
|
+
\w Match a word character. Word characters are [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}].
|
34
|
+
\W Match a non-word character.
|
35
|
+
\x{hhhh} Match the character with hex value hhhh. From one to six hex digits may be supplied.
|
36
|
+
\xhh Match the character with two digit hex value hh
|
37
|
+
\X Match a Grapheme Cluster.
|
38
|
+
\Z Match if the current position is at the end of input, but before the final line terminator, if one exists.
|
39
|
+
\z Match if the current position is at the end of input.
|
40
|
+
\n Back Reference. Match whatever the nth capturing group matched. n must be a number >= 1 and <= total number
|
41
|
+
of capture groups in the pattern. Note: Octal escapes, such as \012, are not supported in ICU regular expressions
|
42
|
+
[pattern] Match any one character from the set. See UnicodeSet for a full description of what may appear in the pattern
|
43
|
+
. Match any character.
|
44
|
+
^ Match at the beginning of a line.
|
45
|
+
$ Match at the end of a line.
|
46
|
+
\ Quotes the following character. Characters that must be quoted to be treated as literals are * ? + [ ( ) { } ^ $ | \ . /
|
47
|
+
|
48
|
+
=== Regular Expression Operators
|
49
|
+
|
50
|
+
Operator Description
|
51
|
+
| Alternation. A|B matches either A or B.
|
52
|
+
* Match 0 or more times. Match as many times as possible.
|
53
|
+
+ Match 1 or more times. Match as many times as possible.
|
54
|
+
? Match zero or one times. Prefer one.
|
55
|
+
{n} Match exactly n times
|
56
|
+
{n,} Match at least n times. Match as many times as possible.
|
57
|
+
{n,m} Match between n and m times. Match as many times as possible, but not more than m.
|
58
|
+
*? Match 0 or more times. Match as few times as possible.
|
59
|
+
+? Match 1 or more times. Match as few times as possible.
|
60
|
+
?? Match zero or one times. Prefer zero.
|
61
|
+
{n}? Match exactly n times
|
62
|
+
{n,}? Match at least n times, but no more than required for an overall pattern match
|
63
|
+
{n,m}? Match between n and m times. Match as few times as possible, but not less than n.
|
64
|
+
*+ Match 0 or more times. Match as many times as possible when first encountered, do not retry with
|
65
|
+
fewer even if overall match fails (Possessive Match)
|
66
|
+
++ Match 1 or more times. Possessive match.
|
67
|
+
?+ Match zero or one times. Possessive match.
|
68
|
+
{n}+ Match exactly n times
|
69
|
+
{n,}+ Match at least n times. Possessive Match.
|
70
|
+
{n,m}+ Match between n and m times. Possessive Match.
|
71
|
+
( ... ) Capturing parentheses. Range of input that matched the parenthesized subexpression is
|
72
|
+
available after the match.
|
73
|
+
(?: ... ) Non-capturing parentheses. Groups the included pattern, but does not provide
|
74
|
+
capturing of matching text. Somewhat more efficient than capturing parentheses.
|
75
|
+
(?> ... ) Atomic-match parentheses. First match of the parenthesized subexpression is the only
|
76
|
+
one tried; if it does not lead to an overall pattern match, back up the search for a
|
77
|
+
match to a position before the "(?>"
|
78
|
+
(?# ... ) Free-format comment (?# comment ).
|
79
|
+
(?= ... ) Look-ahead assertion. True if the parenthesized pattern matches at the current input position,
|
80
|
+
but does not advance the input position.
|
81
|
+
(?! ... ) Negative look-ahead assertion. True if the parenthesized pattern does not match at the current
|
82
|
+
input position. Does not advance the input position.
|
83
|
+
(?<= ... ) Look-behind assertion. True if the parenthesized pattern matches text preceding the current
|
84
|
+
input position, with the last character of the match being the input character just before
|
85
|
+
the current position. Does not alter the input position. The length of possible strings matched
|
86
|
+
by the look-behind pattern must not be unbounded (no * or + operators.)
|
87
|
+
(?<! ... ) Negative Look-behind assertion. True if the parenthesized pattern does not
|
88
|
+
match text preceding the current input position, with the last character of the
|
89
|
+
match being the input character just before the current position. Does not alter
|
90
|
+
the input position. The length of possible strings matched by the look-behind pattern
|
91
|
+
must not be unbounded (no * or + operators.)
|
92
|
+
(?ismx-ismx: ... ) Flag settings. Evaluate the parenthesized expression with the specified flags enabled or -disabled.
|
93
|
+
(?ismx-ismx) Flag settings. Change the flag settings. Changes apply to the portion of the pattern
|
94
|
+
following the setting. For example, (?i) changes to a case insensitive match.
|
95
|
+
|
96
|
+
=== Replacement Text
|
97
|
+
|
98
|
+
The replacement text for find-and-replace operations may contain references to capture-group text from the find. References are of the form $n, where n is the number of the capture group.
|
99
|
+
|
100
|
+
Character Descriptions
|
101
|
+
$n The text of capture group n will be substituted for $n. n must be >= 0 and not
|
102
|
+
greater than the number of capture groups. A $ not followed by a digit has no special meaning,
|
103
|
+
and will appear in the substitution text as itself, a $.
|
104
|
+
\ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
|
105
|
+
substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
|
106
|
+
|
107
|
+
=== UnicodeSet Pattern syntax
|
108
|
+
|
109
|
+
These patterns follow a syntax similar to that employed by version 8 regular expression character classes.
|
110
|
+
|
111
|
+
Here are some simple examples:
|
112
|
+
|
113
|
+
[] No characters
|
114
|
+
[a] The character 'a'
|
115
|
+
[ae] The characters 'a' and 'e'
|
116
|
+
[a-e] The characters 'a' through 'e' inclusive, in Unicode code point order
|
117
|
+
[\u4E01] The character U+4E01
|
118
|
+
[a{ab}{ac}] The character 'a' and the multicharacter strings "ab" and "ac"
|
119
|
+
[\p{Lu}] All characters in the general category Uppercase Letter
|
120
|
+
|
121
|
+
Any character may be preceded by a backslash in order to remove any special meaning.
|
122
|
+
White space characters are ignored, unless they are escaped.
|
123
|
+
|
124
|
+
Property patterns specify a set of characters having a certain property as defined by the
|
125
|
+
Unicode standard. Both the POSIX-like "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.
|
126
|
+
|
127
|
+
Patterns specify individual characters, ranges of characters, and Unicode property sets.
|
128
|
+
When elements are concatenated, they specify their union. To complement a set, place a '^'
|
129
|
+
immediately after the opening '['. Property patterns are inverted by modifying their delimiters;
|
130
|
+
"[:^foo:]" and "\\P{foo}". In any other location, '^' has no special meaning.
|
131
|
+
|
132
|
+
Ranges are indicated by placing a '-' between two characters, as in "a-z".
|
133
|
+
This specifies the range of all characters from the left to the right, in Unicode order.
|
134
|
+
If the left character is greater than or equal to the right character it is a syntax error.
|
135
|
+
If a '-' occurs as the first character after the opening '[' or '[^', or if it occurs as the
|
136
|
+
last character before the closing ']', then it is taken as a literal. Thus "[a\-b]", "[-ab]",
|
137
|
+
and "[ab-]" all indicate the same set of three characters, 'a', 'b', and '-'.
|
138
|
+
|
139
|
+
Sets may be intersected using the '&' operator or the asymmetric set difference may be taken using
|
140
|
+
the '-' operator, for example, "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all
|
141
|
+
Unicode letters with values less than 4096. Operators ('&' and '-') have equal precedence
|
142
|
+
and bind left-to-right. Thus "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
|
143
|
+
"[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for difference; intersection
|
144
|
+
is commutative.
|
145
|
+
|
146
|
+
[a] The set containing 'a'
|
147
|
+
[a-z] The set containing 'a' through 'z' and all letters in between, in Unicode order
|
148
|
+
[^a-z] The set containing all characters but 'a' through 'z', that is, U+0000 through 'a'-1
|
149
|
+
and 'z'+1 through U+10FFFF
|
150
|
+
[[pat1][pat2]] The union of sets specified by pat1 and pat2
|
151
|
+
[[pat1]&[pat2]] The intersection of sets specified by pat1 and pat2
|
152
|
+
[[pat1]-[pat2]] The asymmetric difference of sets specified by pat1 and pat2
|
153
|
+
[:Lu:] or \p{Lu} The set of characters having the specified Unicode property; in this case, Unicode uppercase letters
|
154
|
+
[:^Lu:] or \P{Lu} The set of characters not having the given Unicode property
|
155
|
+
|
156
|
+
Warning: you cannot add an empty string ("") to a UnicodeSet.
|
157
|
+
|
158
|
+
Formal syntax
|
159
|
+
|
160
|
+
pattern := ('[' '^'? item* ']') | property
|
161
|
+
item := char | (char '-' char) | pattern-expr
|
162
|
+
pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern
|
163
|
+
op := '&' | '-'
|
164
|
+
special := '[' | ']' | '-'
|
165
|
+
char := any character that is not special | ('\' any character) | ('\u' hex hex hex hex)
|
166
|
+
hex := any character for which Character.digit(c, 16) returns a non-negative result
|
167
|
+
property := a Unicode property set pattern
|
168
|
+
|
169
|
+
Legend:
|
170
|
+
a := b a may be replaced by b
|
171
|
+
a? zero or one instance of a
|
172
|
+
a* zero or more instances of a
|
173
|
+
a | b either a or b
|
174
|
+
'a' the literal string between the quotes
|
175
|
+
|
176
|
+
The C/POSIX character classes are also available in UnicodeSet patterns, using patterns like [:graph:] or \p{graph}.
|
177
|
+
|
178
|
+
alpha , lower , upper , punct , digit , xdigit, alnum , space , blank , cntrl , graph , print
|
179
|
+
|
180
|
+
=== General Categories
|
181
|
+
|
182
|
+
Lu = Letter, uppercase Mn = Mark, nonspacing Nd = Number, decimal digit
|
183
|
+
Ll = Letter, lowercase Mc = Mark, spacing combining Nl = Number, letter
|
184
|
+
Lt = Letter, titlecase Me = Mark, enclosing No = Number, other
|
185
|
+
Lm = Letter, modifier
|
186
|
+
Lo = Letter, other
|
187
|
+
|
188
|
+
Zs = Separator, space Cc = Other, control
|
189
|
+
Zl = Separator, line Cf = Other, format
|
190
|
+
Zp = Separator, paragraph Cs = Other, surrogate
|
191
|
+
Co = Other, private use
|
192
|
+
Cn = Other, not assigned (including noncharacters)
|
193
|
+
|
194
|
+
Pc = Punctuation, connector Sm = Symbol, math
|
195
|
+
Pd = Punctuation, dash Sc = Symbol, currency
|
196
|
+
Ps = Punctuation, open Sk = Symbol, modifier
|
197
|
+
Pe = Punctuation, close So = Symbol, other
|
198
|
+
Pi = Punctuation, initial quote (may behave like Ps or Pe depending on usage)
|
199
|
+
Pf = Punctuation, final quote (may behave like Ps or Pe depending on usage)
|
200
|
+
Po = Punctuation, other
|
201
|
+
|
202
|
+
|
203
|
+
See also http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt and
|
204
|
+
http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt for additional info.
|
data/extconf.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Build configuration for the icu4r extension: checks that the ICU i18n
# library is linkable, generates the Makefile, then appends a `check`
# target that runs the test scripts against the built library.
require 'mkmf'

$LDFLAGS = "-licuuc -licui18n -licudata -lstdc++ "
$CFLAGS = "-Wall"

# u_init_46 is the version-suffixed ICU entry point probed for; the "_46"
# suffix corresponds to ICU 4.6, so the message names that version.
unless have_library('icui18n', 'u_init_46')
  puts "ICU v4.6 required -- not found."
  exit 1
end

create_makefile('icu4r')

# Test scripts run by `make check`, in execution order.
check_scripts = %w[test_ustring test_calendar test_converter test_collator]

# Recipe lines must begin with a tab for make; build them explicitly so the
# tab cannot be lost to editor or whitespace mangling.
check_recipe = check_scripts.map { |name| "\t@$(RUBY) $(srcdir)/test/#{name}.rb\n" }.join

# Block form guarantees the Makefile is flushed and closed once the target
# has been appended.
File.open("Makefile", "a") do |makefile|
  makefile << "\ncheck: $(DLLIB)\n" << check_recipe << "\n"
end
|
data/fmt.cpp
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "icu_common.h"
|
3
|
+
#include <unicode/msgfmt.h>
|
4
|
+
#include <unicode/translit.h>
|
5
|
+
#include <unicode/smpdtfmt.h>
|
6
|
+
#include <unicode/calendar.h>
|
7
|
+
#include <unicode/ucal.h>
|
8
|
+
/* This file contains various C-C++ wrappers, to ease my life
|
9
|
+
*/
|
10
|
+
extern "C" {
|
11
|
+
extern VALUE rb_cUString;
|
12
|
+
extern VALUE rb_cUCalendar;
|
13
|
+
extern VALUE icu_ustr_new(const UChar * str, long len);
|
14
|
+
extern VALUE icu_ustr_new_set(const UChar * str, long len, long capa);
|
15
|
+
|
16
|
+
/*
 * Formats an ICU MessageFormat pattern with arguments taken from a Ruby array.
 *
 * pattern, len  - UTF-16 pattern text and its length in UChars
 * args, arg_len - Ruby array holding the arguments and the number of entries
 *                 to read from it
 * locale        - locale id passed to the ICU Locale constructor
 *
 * Accepted argument types: Fixnum, Bignum and Float (all passed to ICU as
 * doubles), UString (passed as string) and Time (passed as date).
 *
 * Returns a new UString with the formatted text.
 * Raises ArgumentError for any other argument class, or when ICU reports a
 * failure while building the formatter or while formatting.
 */
VALUE icu_format(UChar * pattern, int32_t len, VALUE args, int32_t arg_len, char * locale)
{
    Formattable * arguments = new Formattable[arg_len];

    for (int32_t i = 0; i < arg_len; i++) {
        VALUE obj = rb_ary_entry(args, i);

        switch (TYPE(obj)) {
            case T_FIXNUM:
            case T_BIGNUM:
            case T_FLOAT:
                arguments[i].setDouble(rb_num2dbl(obj));
                continue;   /* next argument */
            default:
                break;
        }

        if (CLASS_OF(obj) == rb_cUString) {
            arguments[i].setString(UnicodeString(ICU_PTR(obj), ICU_LEN(obj)));
        } else if (CLASS_OF(obj) == rb_cTime) {
            /* ICU expects milliseconds since 01.01.1970 */
            arguments[i].setDate(rb_num2dbl(rb_funcall(obj, rb_intern("to_f"), 0)) * 1000);
        } else {
            /* rb_raise does not return, so release the array first */
            delete [] arguments;
            rb_raise(rb_eArgError, "wrong arg type: %s", rb_class2name(CLASS_OF(obj)));
        }
    }

    /*
     * The ICU objects live on the heap and are deleted by hand before any
     * rb_raise: the raise leaves this function without returning, so
     * nothing after it would get a chance to clean up.
     */
    UErrorCode status = U_ZERO_ERROR;
    VALUE ret = Qnil;
    UnicodeString * patString = new UnicodeString(pattern, len);
    UnicodeString * resultStr = new UnicodeString();
    FieldPosition * fieldPosition = new FieldPosition(0);
    Locale * loc = new Locale(locale);
    MessageFormat * fmt = new MessageFormat(*patString, *loc, status);

    if (U_SUCCESS(status)) {
        fmt->format(arguments, arg_len, *resultStr, *fieldPosition, status);
    }

    if (U_SUCCESS(status)) {
        int32_t blen = resultStr->length();
        UChar * buf = ALLOC_N(UChar, blen + 1);
        resultStr->extract(buf, blen, status);
        ret = icu_ustr_new(buf, blen);
        /* buf came from Ruby's allocator (ALLOC_N), so it must go back
           through xfree rather than the C library free(). */
        xfree(buf);
    }

    delete fmt;
    delete [] arguments;
    delete patString;
    delete resultStr;
    delete fieldPosition;
    delete loc;

    if (U_FAILURE(status)) {
        rb_raise(rb_eArgError, "Can't format: %s", u_errorName(status));
    }
    return ret;
}
|
82
|
+
/*
 * Parses a date string with a SimpleDateFormat pattern and returns it as a
 * newly opened Gregorian UCalendar set to the parsed time.
 *
 * str, str_len - UTF-16 date format pattern and its length
 * locale       - locale id passed to the ICU Locale constructor
 * val, len     - UTF-16 text to parse and its length
 *
 * Parsing is strict (lenient mode is switched off).
 * The returned calendar is open; nothing in this function closes it on the
 * success path.
 * Raises ArgumentError when the calendar cannot be opened, the formatter
 * cannot be created, or the text cannot be parsed.
 */
UCalendar * icu_date_parse(UChar * str, int32_t str_len, char * locale, UChar * val, int32_t len)
{
    UErrorCode status = U_ZERO_ERROR;
    UCalendar * c = ucal_open(NULL, -1, NULL, UCAL_GREGORIAN, &status);
    if (U_FAILURE(status)) {
        /* never hand a runtime string to rb_raise as the format itself */
        rb_raise(rb_eArgError, "%s", u_errorName(status));
    }

    UnicodeString * temp = new UnicodeString(str, str_len);
    Locale * loc = new Locale(locale);
    SimpleDateFormat * formatter = new SimpleDateFormat(*temp, *loc, status);
    if (U_FAILURE(status)) {
        delete formatter;
        delete temp;
        delete loc;
        /* the calendar opened above must not outlive this failure */
        ucal_close(c);
        rb_raise(rb_eArgError, "Can't create formatter:%s", u_errorName(status));
    }

    formatter->setLenient(false);

    UnicodeString * val_str = new UnicodeString(val, len);
    UDate p_time = formatter->parse(*val_str, status);
    ucal_setMillis(c, p_time, &status);

    delete formatter;
    delete temp;
    delete loc;
    delete val_str;

    if (U_FAILURE(status)) {
        ucal_close(c);
        rb_raise(rb_eArgError, "Can't parse date:%s", u_errorName(status));
    }
    return c;
}
|
114
|
+
/*
 * Transliterates a UTF-16 string and returns the result as a new UString.
 *
 * str, str_len   - text to transliterate and its length
 * id, id_len     - transliterator id and its length
 * rules, rule_len - optional rule text; when rules is non-NULL a
 *                   transliterator is built from these rules under the given
 *                   id, otherwise the id is looked up with createInstance
 *
 * The transliteration always runs in the forward direction.
 * Raises RuntimeError when the transliterator cannot be created.
 */
VALUE icu_transliterate(UChar * str, int32_t str_len, UChar * id, int32_t id_len, UChar * rules, int32_t rule_len)
{
    UErrorCode status = U_ZERO_ERROR;
    UParseError p_error;
    Transliterator * t;

    if (rules != NULL) {
        t = Transliterator::createFromRules(UnicodeString(id, id_len), UnicodeString(rules, rule_len),
                                            UTRANS_FORWARD, p_error, status);
    } else {
        t = Transliterator::createInstance(UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status);
    }

    if (U_FAILURE(status)) {
        /* deleting a null pointer is a no-op, so this is safe whether or
           not the factory handed anything back */
        delete t;
        /* never hand a runtime string to rb_raise as the format itself */
        rb_raise(rb_eRuntimeError, "%s", u_errorName(status));
    }

    UnicodeString * src = new UnicodeString(str, str_len);
    t->transliterate(*src);

    int32_t blen = src->length();
    UChar * buf = ALLOC_N(UChar, blen + 1);
    /* Pass the real capacity so extract() can write the terminating NUL into
       the extra slot; the buffer is handed on with capacity blen + 1 and
       would otherwise carry an uninitialised last element. */
    src->extract(buf, blen + 1, status);

    /* the ICU objects are no longer needed once the text has been copied out */
    delete src;
    delete t;

    /* NOTE(review): buf is not freed here, so icu_ustr_new_set presumably
       takes ownership of it -- confirm against ustring.c */
    return icu_ustr_new_set(buf, blen, blen + 1);
}
|
139
|
+
extern void icu4r_cal_free(UCalendar *);
|
140
|
+
|
141
|
+
VALUE icu4r_cal_clone(VALUE cal)
|
142
|
+
{
|
143
|
+
Calendar * clon;
|
144
|
+
clon = ((Calendar *)(DATA_PTR(cal)))->clone();
|
145
|
+
return Data_Wrap_Struct(rb_cUCalendar, 0, icu4r_cal_free, clon);
|
146
|
+
}
|
147
|
+
#define CPP_CALENDAR(obj) ((Calendar*)DATA_PTR(obj))
|
148
|
+
|
149
|
+
VALUE icu4r_cal_equal(VALUE cal, VALUE obj)
|
150
|
+
{
|
151
|
+
UBool answer;
|
152
|
+
Check_Class( obj, rb_cUCalendar);
|
153
|
+
answer = (*CPP_CALENDAR(cal)) == (*CPP_CALENDAR(obj));
|
154
|
+
return answer ? Qtrue : Qfalse;
|
155
|
+
}
|
156
|
+
}
|