byk 0.4.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/ext/byk/byk.c CHANGED
@@ -1,314 +1,380 @@
1
- #include <stdio.h>
2
1
  #include <ruby.h>
3
2
  #include <ruby/encoding.h>
4
3
 
5
- #ifndef rb_check_arity
6
- #define rb_check_arity rb_check_arity
7
-
8
- NORETURN(void rb_error_arity(int, int, int));
4
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
9
5
 
10
6
  static inline void
11
- rb_check_arity(int argc, int min, int max)
7
+ _str_cat_char(VALUE str, unsigned c, rb_encoding *enc)
12
8
  {
13
- if ((argc < min) || (max != -1 && argc > max))
14
- rb_error_arity(argc, min, max);
9
+ char s[16];
10
+ int n = rb_enc_codelen(c, enc);
11
+ rb_enc_mbcput(c, s, enc);
12
+ rb_str_buf_cat(str, s, n);
15
13
  }
16
- #endif
17
-
18
- #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
19
-
20
- #define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc) \
21
- ascii ? rb_enc_str_buf_cat(dest, chr, len, enc) \
22
- : str_cat_char(dest, ascii_chr, enc)
23
14
 
24
15
  enum {
25
- LAT_CAP_TJ=262,
26
- LAT_TJ,
27
- LAT_CAP_CH=268,
28
- LAT_CH,
29
- LAT_CAP_DJ=272,
30
- LAT_DJ,
31
- LAT_CAP_SH=352,
32
- LAT_SH,
33
- LAT_CAP_ZH=381,
34
- LAT_ZH,
35
- CYR_CAP_DJ=1026,
36
- CYR_CAP_J=1032,
37
- CYR_CAP_LJ,
38
- CYR_CAP_NJ,
39
- CYR_CAP_TJ,
40
- CYR_CAP_DZ=1039,
41
- CYR_CAP_A,
42
- CYR_CAP_B,
43
- CYR_CAP_V,
44
- CYR_CAP_G,
45
- CYR_CAP_D,
46
- CYR_CAP_E,
47
- CYR_CAP_ZH,
48
- CYR_CAP_Z,
49
- CYR_CAP_I,
50
- CYR_CAP_K=1050,
51
- CYR_CAP_L,
52
- CYR_CAP_M,
53
- CYR_CAP_N,
54
- CYR_CAP_O,
55
- CYR_CAP_P,
56
- CYR_CAP_R,
57
- CYR_CAP_S,
58
- CYR_CAP_T,
59
- CYR_CAP_U,
60
- CYR_CAP_F,
61
- CYR_CAP_H,
62
- CYR_CAP_C,
63
- CYR_CAP_CH,
64
- CYR_CAP_SH,
65
- CYR_A=1072,
66
- CYR_B,
67
- CYR_V,
68
- CYR_G,
69
- CYR_D,
70
- CYR_E,
71
- CYR_ZH,
72
- CYR_Z,
73
- CYR_I,
74
- CYR_K=1082,
75
- CYR_L,
76
- CYR_M,
77
- CYR_N,
78
- CYR_O,
79
- CYR_P,
80
- CYR_R,
81
- CYR_S,
82
- CYR_T,
83
- CYR_U,
84
- CYR_F,
85
- CYR_H,
86
- CYR_C,
87
- CYR_CH,
88
- CYR_SH,
89
- CYR_DJ=1106,
90
- CYR_J=1112,
91
- CYR_LJ,
92
- CYR_NJ,
93
- CYR_TJ,
94
- CYR_DZ=1119
16
+ LAT_CAP_TJ=262, LAT_TJ, LAT_CAP_CH=268, LAT_CH,
17
+ LAT_CAP_DJ=272, LAT_DJ, LAT_CAP_SH=352, LAT_SH,
18
+ LAT_CAP_ZH=381, LAT_ZH, CYR_CAP_DJ=1026, CYR_CAP_J=1032,
19
+ CYR_CAP_LJ, CYR_CAP_NJ, CYR_CAP_TJ, CYR_CAP_DZ=1039,
20
+ CYR_CAP_A, CYR_CAP_B, CYR_CAP_V, CYR_CAP_G,
21
+ CYR_CAP_D, CYR_CAP_E, CYR_CAP_ZH, CYR_CAP_Z,
22
+ CYR_CAP_I, CYR_CAP_K=1050, CYR_CAP_L, CYR_CAP_M,
23
+ CYR_CAP_N, CYR_CAP_O, CYR_CAP_P, CYR_CAP_R,
24
+ CYR_CAP_S, CYR_CAP_T, CYR_CAP_U, CYR_CAP_F,
25
+ CYR_CAP_H, CYR_CAP_C, CYR_CAP_CH, CYR_CAP_SH,
26
+ CYR_A=1072, CYR_B, CYR_V, CYR_G, CYR_D,
27
+ CYR_E, CYR_ZH, CYR_Z, CYR_I, CYR_K=1082,
28
+ CYR_L, CYR_M, CYR_N, CYR_O, CYR_P,
29
+ CYR_R, CYR_S, CYR_T, CYR_U, CYR_F,
30
+ CYR_H, CYR_C, CYR_CH, CYR_SH, CYR_DJ=1106,
31
+ CYR_J=1112, CYR_LJ, CYR_NJ, CYR_TJ, CYR_DZ=1119
95
32
  };
96
33
 
97
- static inline unsigned int
98
- is_upper_case(unsigned int c)
34
+ static inline unsigned
35
+ is_cap(unsigned codepoint)
99
36
  {
100
- return ((c >= 65 && c <= 90)
101
- || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
102
- || c == LAT_CAP_TJ
103
- || c == LAT_CAP_CH
104
- || c == LAT_CAP_DJ
105
- || c == LAT_CAP_SH
106
- || c == LAT_CAP_ZH);
37
+ if (codepoint >= 65 && codepoint <= 90) return 1;
38
+ if (codepoint >= CYR_CAP_DJ && codepoint <= CYR_CAP_SH) return 1;
39
+
40
+ switch(codepoint) {
41
+ case LAT_CAP_TJ:
42
+ case LAT_CAP_CH:
43
+ case LAT_CAP_DJ:
44
+ case LAT_CAP_SH:
45
+ case LAT_CAP_ZH:
46
+ return 1;
47
+ default:
48
+ return 0;
49
+ }
107
50
  }
108
51
 
109
- static void
110
- str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
52
+ static inline unsigned
53
+ is_digraph(unsigned codepoint)
111
54
  {
112
- char s[16];
113
- int n = rb_enc_codelen(c, enc);
114
- rb_enc_mbcput(c, s, enc);
115
- rb_enc_str_buf_cat(str, s, n, enc);
55
+ switch(codepoint) {
56
+ case CYR_LJ:
57
+ case CYR_NJ:
58
+ case CYR_DZ:
59
+ case CYR_CAP_LJ:
60
+ case CYR_CAP_NJ:
61
+ case CYR_CAP_DZ:
62
+ return 1;
63
+ default:
64
+ return 0;
65
+ }
66
+ }
67
+
68
+ static unsigned
69
+ digraph_to_cyr(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
70
+ {
71
+ static unsigned CYR_MAP[] = {
72
+ CYR_A, CYR_B, CYR_C, CYR_D, CYR_E, CYR_F,
73
+ CYR_G, CYR_H, CYR_I, CYR_J, CYR_K, CYR_L,
74
+ CYR_M, CYR_N, CYR_O, CYR_P, 0, CYR_R,
75
+ CYR_S, CYR_T, CYR_U, CYR_V, 0, 0, 0, CYR_Z
76
+ };
77
+
78
+ static unsigned CYR_CAPS_MAP[] = {
79
+ CYR_CAP_A, CYR_CAP_B, CYR_CAP_C, CYR_CAP_D, CYR_CAP_E, CYR_CAP_F,
80
+ CYR_CAP_G, CYR_CAP_H, CYR_CAP_I, CYR_CAP_J, CYR_CAP_K, CYR_CAP_L,
81
+ CYR_CAP_M, CYR_CAP_N, CYR_CAP_O, CYR_CAP_P, 0, CYR_CAP_R,
82
+ CYR_CAP_S, CYR_CAP_T, CYR_CAP_U, CYR_CAP_V, 0, 0, 0, CYR_CAP_Z
83
+ };
84
+
85
+ if (codepoint2 == LAT_CAP_ZH || codepoint2 == LAT_ZH) {
86
+ switch (codepoint) {
87
+ case 'd': return CYR_DZ;
88
+ case 'D': return CYR_CAP_DZ;
89
+ }
90
+ }
91
+
92
+ if (codepoint2 == 'j' || codepoint2 == 'J') {
93
+ switch (codepoint) {
94
+ case 'l': return CYR_LJ;
95
+ case 'n': return CYR_NJ;
96
+ case 'L': return CYR_CAP_LJ;
97
+ case 'N': return CYR_CAP_NJ;
98
+ }
99
+ }
100
+
101
+ if (codepoint >= 'a' && codepoint <= 'z') return CYR_MAP[codepoint - 'a'];
102
+ if (codepoint >= 'A' && codepoint <= 'Z') return CYR_CAPS_MAP[codepoint - 'A'];
103
+
104
+ switch (codepoint) {
105
+ case LAT_CH: return CYR_CH;
106
+ case LAT_DJ: return CYR_DJ;
107
+ case LAT_SH: return CYR_SH;
108
+ case LAT_TJ: return CYR_TJ;
109
+ case LAT_ZH: return CYR_ZH;
110
+ case LAT_CAP_CH: return CYR_CAP_CH;
111
+ case LAT_CAP_DJ: return CYR_CAP_DJ;
112
+ case LAT_CAP_SH: return CYR_CAP_SH;
113
+ case LAT_CAP_TJ: return CYR_CAP_TJ;
114
+ case LAT_CAP_ZH: return CYR_CAP_ZH;
115
+ }
116
+
117
+ return 0;
118
+ }
119
+
120
+ static unsigned
121
+ digraph_to_latin(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
122
+ {
123
+ static char LAT_MAP[] = {
124
+ 'a', 'b', 'v', 'g', 'd', 'e', 0, 'z', 'i', 0, 'k', 'l',
125
+ 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
126
+ };
127
+
128
+ static char LAT_CAPS_MAP[] = {
129
+ 'A', 'B', 'V', 'G', 'D', 'E', 0, 'Z', 'I', 0, 'K', 'L',
130
+ 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
131
+ };
132
+
133
+ if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) return 0;
134
+
135
+ switch (codepoint) {
136
+ case CYR_ZH: return LAT_ZH;
137
+ case CYR_CAP_ZH: return LAT_CAP_ZH;
138
+ }
139
+
140
+ if (codepoint >= CYR_A && codepoint <= CYR_C)
141
+ return LAT_MAP[codepoint - CYR_A];
142
+
143
+ if (codepoint >= CYR_CAP_A && codepoint <= CYR_CAP_C)
144
+ return LAT_CAPS_MAP[codepoint - CYR_CAP_A];
145
+
146
+ if (codepoint >= CYR_A) {
147
+ switch (codepoint) {
148
+ case CYR_J: return 'j';
149
+ case CYR_TJ: return LAT_TJ;
150
+ case CYR_CH: return LAT_CH;
151
+ case CYR_SH: return LAT_SH;
152
+ case CYR_DJ: return LAT_DJ;
153
+ case CYR_LJ: *next_out = 'j'; return 'l';
154
+ case CYR_NJ: *next_out = 'j'; return 'n';
155
+ case CYR_DZ: *next_out = LAT_ZH; return 'd';
156
+ }
157
+ }
158
+ else {
159
+ switch (codepoint) {
160
+ case CYR_CAP_J: return 'J';
161
+ case CYR_CAP_TJ: return LAT_CAP_TJ;
162
+ case CYR_CAP_CH: return LAT_CAP_CH;
163
+ case CYR_CAP_SH: return LAT_CAP_SH;
164
+ case CYR_CAP_DJ: return LAT_CAP_DJ;
165
+ case CYR_CAP_LJ: *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'L';
166
+ case CYR_CAP_NJ: *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'N';
167
+ case CYR_CAP_DZ: *next_out = (capitalize || is_cap(codepoint2)) ? LAT_CAP_ZH : LAT_ZH; return 'D';
168
+ }
169
+ }
170
+
171
+ return 0;
172
+ }
173
+
174
+ static unsigned
175
+ digraph_to_ascii(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
176
+ {
177
+ switch (codepoint) {
178
+ case LAT_TJ:
179
+ case LAT_CH:
180
+ case CYR_TJ:
181
+ case CYR_CH: return 'c';
182
+ case LAT_SH:
183
+ case CYR_SH: return 's';
184
+ case LAT_ZH:
185
+ case CYR_ZH: return 'z';
186
+ case LAT_DJ:
187
+ case CYR_DJ: *next_out = 'j'; return 'd';
188
+ case LAT_CAP_TJ:
189
+ case LAT_CAP_CH:
190
+ case CYR_CAP_TJ:
191
+ case CYR_CAP_CH: return 'C';
192
+ case LAT_CAP_SH:
193
+ case CYR_CAP_SH: return 'S';
194
+ case LAT_CAP_ZH:
195
+ case CYR_CAP_ZH: return 'Z';
196
+ case LAT_CAP_DJ:
197
+ case CYR_CAP_DJ:
198
+ *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'D';
199
+ case CYR_DZ:
200
+ *next_out = (capitalize || is_cap(codepoint2)) ? 'Z' : 'z'; return 'd';
201
+ case CYR_CAP_DZ:
202
+ *next_out = (capitalize || is_cap(codepoint2)) ? 'Z' : 'z'; return 'D';
203
+ default:
204
+ return digraph_to_latin(codepoint, codepoint2, capitalize, next_out);
205
+ }
116
206
  }
117
207
 
118
208
  static VALUE
119
- str_to_latin(int argc, VALUE *argv, VALUE str, int ascii, int bang)
209
+ str_to_srb(VALUE str, int strategy, int bang)
120
210
  {
121
211
  VALUE dest;
122
- long dest_len;
123
- char *pos, *end;
124
212
  rb_encoding *enc;
213
+
125
214
  int len, next_len;
126
- int seen_upper = 0;
127
- int force_upper = 0;
128
- unsigned int codepoint = 0;
129
- unsigned int next_codepoint = 0;
215
+ unsigned in, in2, out, out2, seen_cap = 0;
216
+ char *pos, *end, *seq_start = 0;
130
217
 
131
- rb_check_arity(argc, 0, 1);
218
+ unsigned (*method)(unsigned, unsigned, unsigned, unsigned*);
219
+
220
+ switch(strategy) {
221
+ case 0: method = &digraph_to_cyr; break;
222
+ case 1: method = &digraph_to_latin; break;
223
+ default: method = &digraph_to_ascii;
224
+ }
132
225
 
226
+ StringValue(str);
133
227
  pos = RSTRING_PTR(str);
134
228
  if (!pos || RSTRING_LEN(str) == 0) return str;
135
229
 
136
230
  end = RSTRING_END(str);
137
231
  enc = STR_ENC_GET(str);
138
- dest_len = RSTRING_LEN(str) + 30;
139
- dest = rb_str_buf_new(dest_len);
232
+ dest = rb_str_buf_new(RSTRING_LEN(str) + 30);
140
233
  rb_enc_associate(dest, enc);
141
234
 
142
- codepoint = rb_enc_codepoint_len(pos, end, &len, enc);
235
+ in = rb_enc_codepoint_len(pos, end, &len, enc);
143
236
 
144
237
  while (pos < end) {
145
- if (pos + len < end) {
146
- next_codepoint = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
147
- }
238
+ in2 = out2 = 0;
148
239
 
149
- force_upper = seen_upper || is_upper_case(next_codepoint);
150
- seen_upper = is_upper_case(codepoint);
151
-
152
- /* Latin -> "ASCII Latin" conversion */
153
- if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
154
- switch (codepoint) {
155
- case LAT_TJ:
156
- case LAT_CH: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
157
- case LAT_DJ: rb_enc_str_buf_cat(dest, "dj", 2, enc); break;
158
- case LAT_SH: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
159
- case LAT_ZH: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
160
- case LAT_CAP_TJ:
161
- case LAT_CAP_CH: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
162
- case LAT_CAP_SH: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
163
- case LAT_CAP_ZH: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
164
-
165
- case LAT_CAP_DJ:
166
- force_upper ? rb_enc_str_buf_cat(dest, "DJ", 2, enc)
167
- : rb_enc_str_buf_cat(dest, "Dj", 2, enc);
168
- break;
169
- default:
170
- rb_enc_str_buf_cat(dest, pos, len, enc);
171
- }
172
- }
240
+ if (pos + len < end)
241
+ in2 = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
173
242
 
174
- /* Non-Cyrillic codepoints */
175
- else if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
176
- rb_enc_str_buf_cat(dest, pos, len, enc);
177
- }
243
+ out = (*method)(in, in2, seen_cap, &out2);
178
244
 
179
- /* Cyrillic -> Latin conversion */
180
- else if (codepoint >= CYR_A) {
181
- switch (codepoint) {
182
- case CYR_A: rb_enc_str_buf_cat(dest, "a", 1, enc); break;
183
- case CYR_B: rb_enc_str_buf_cat(dest, "b", 1, enc); break;
184
- case CYR_V: rb_enc_str_buf_cat(dest, "v", 1, enc); break;
185
- case CYR_G: rb_enc_str_buf_cat(dest, "g", 1, enc); break;
186
- case CYR_D: rb_enc_str_buf_cat(dest, "d", 1, enc); break;
187
- case CYR_E: rb_enc_str_buf_cat(dest, "e", 1, enc); break;
188
- case CYR_Z: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
189
- case CYR_I: rb_enc_str_buf_cat(dest, "i", 1, enc); break;
190
- case CYR_K: rb_enc_str_buf_cat(dest, "k", 1, enc); break;
191
- case CYR_L: rb_enc_str_buf_cat(dest, "l", 1, enc); break;
192
- case CYR_M: rb_enc_str_buf_cat(dest, "m", 1, enc); break;
193
- case CYR_N: rb_enc_str_buf_cat(dest, "n", 1, enc); break;
194
- case CYR_O: rb_enc_str_buf_cat(dest, "o", 1, enc); break;
195
- case CYR_P: rb_enc_str_buf_cat(dest, "p", 1, enc); break;
196
- case CYR_R: rb_enc_str_buf_cat(dest, "r", 1, enc); break;
197
- case CYR_S: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
198
- case CYR_T: rb_enc_str_buf_cat(dest, "t", 1, enc); break;
199
- case CYR_U: rb_enc_str_buf_cat(dest, "u", 1, enc); break;
200
- case CYR_F: rb_enc_str_buf_cat(dest, "f", 1, enc); break;
201
- case CYR_H: rb_enc_str_buf_cat(dest, "h", 1, enc); break;
202
- case CYR_C: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
203
- case CYR_J: rb_enc_str_buf_cat(dest, "j", 1, enc); break;
204
- case CYR_LJ: rb_enc_str_buf_cat(dest, "lj", 2, enc); break;
205
- case CYR_NJ: rb_enc_str_buf_cat(dest, "nj", 2, enc); break;
206
- case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
207
- case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
208
- case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
209
- case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
210
- case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
211
- case CYR_DZ:
212
- rb_enc_str_buf_cat(dest, "d", 1, enc);
213
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
214
- break;
215
- default:
216
- rb_enc_str_buf_cat(dest, pos, len, enc);
245
+ if (out) {
246
+ /* flush previous untranslatable sequence */
247
+ if (seq_start) {
248
+ rb_str_buf_cat(dest, seq_start, pos - seq_start);
249
+ seq_start = 0;
217
250
  }
251
+
252
+ _str_cat_char(dest, out, enc);
253
+ if (out2) _str_cat_char(dest, out2, enc);
254
+ }
255
+ else if (!seq_start) {
256
+ /* mark the beginning of an untranslatable sequence */
257
+ seq_start = pos;
218
258
  }
219
259
 
220
- /* Cyrillic -> Latin conversion, caps */
221
- else {
222
- switch (codepoint) {
223
- case CYR_CAP_J: rb_enc_str_buf_cat(dest, "J", 1, enc); break;
224
- case CYR_CAP_A: rb_enc_str_buf_cat(dest, "A", 1, enc); break;
225
- case CYR_CAP_B: rb_enc_str_buf_cat(dest, "B", 1, enc); break;
226
- case CYR_CAP_V: rb_enc_str_buf_cat(dest, "V", 1, enc); break;
227
- case CYR_CAP_G: rb_enc_str_buf_cat(dest, "G", 1, enc); break;
228
- case CYR_CAP_D: rb_enc_str_buf_cat(dest, "D", 1, enc); break;
229
- case CYR_CAP_E: rb_enc_str_buf_cat(dest, "E", 1, enc); break;
230
- case CYR_CAP_Z: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
231
- case CYR_CAP_I: rb_enc_str_buf_cat(dest, "I", 1, enc); break;
232
- case CYR_CAP_K: rb_enc_str_buf_cat(dest, "K", 1, enc); break;
233
- case CYR_CAP_L: rb_enc_str_buf_cat(dest, "L", 1, enc); break;
234
- case CYR_CAP_M: rb_enc_str_buf_cat(dest, "M", 1, enc); break;
235
- case CYR_CAP_N: rb_enc_str_buf_cat(dest, "N", 1, enc); break;
236
- case CYR_CAP_O: rb_enc_str_buf_cat(dest, "O", 1, enc); break;
237
- case CYR_CAP_P: rb_enc_str_buf_cat(dest, "P", 1, enc); break;
238
- case CYR_CAP_R: rb_enc_str_buf_cat(dest, "R", 1, enc); break;
239
- case CYR_CAP_S: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
240
- case CYR_CAP_T: rb_enc_str_buf_cat(dest, "T", 1, enc); break;
241
- case CYR_CAP_U: rb_enc_str_buf_cat(dest, "U", 1, enc); break;
242
- case CYR_CAP_F: rb_enc_str_buf_cat(dest, "F", 1, enc); break;
243
- case CYR_CAP_H: rb_enc_str_buf_cat(dest, "H", 1, enc); break;
244
- case CYR_CAP_C: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
245
- case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
246
- case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
247
- case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
248
- case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
249
- case CYR_CAP_LJ:
250
- rb_enc_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2, enc);
251
- break;
252
- case CYR_CAP_NJ:
253
- rb_enc_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2, enc);
254
- break;
255
- case CYR_CAP_DJ:
256
- STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc);
257
- break;
258
- case CYR_CAP_DZ:
259
- rb_enc_str_buf_cat(dest, "D", 1, enc);
260
- if (force_upper) {
261
- STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
262
- }
263
- else {
264
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
265
- }
266
- break;
267
- default:
268
- rb_enc_str_buf_cat(dest, pos, len, enc);
269
- }
260
+ /* for cyrillic output, skip the second half of an input digraph */
261
+ if (strategy == 0 && is_digraph(out)) {
262
+ pos += next_len;
263
+ if (pos + len < end)
264
+ in2 = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
270
265
  }
266
+
267
+ seen_cap = is_cap(in);
268
+
271
269
  pos += len;
272
270
  len = next_len;
273
- codepoint = next_codepoint;
274
- next_codepoint = 0;
271
+ in = in2;
275
272
  }
276
273
 
274
+ /* flush final sequence */
275
+ if (seq_start) rb_str_buf_cat(dest, seq_start, pos - seq_start);
276
+
277
277
  if (bang) {
278
278
  rb_str_shared_replace(str, dest);
279
279
  }
280
280
  else {
281
- OBJ_INFECT(dest, str);
282
- str = dest;
281
+ str = dest;
283
282
  }
284
283
 
285
284
  return str;
286
285
  }
287
286
 
287
+ /**
288
+ * Returns a copy of <i>str</i> with Latin characters transliterated
289
+ * into Serbian Cyrillic.
290
+ *
291
+ * @overload to_cyrillic(str)
292
+ * @param [String] str text to be transliterated
293
+ * @return [String] transliterated text
294
+ */
295
+ static VALUE
296
+ rb_str_to_cyrillic(VALUE self, VALUE str)
297
+ {
298
+ return str_to_srb(str, 0, 0);
299
+ }
300
+
301
+ /**
302
+ * Performs transliteration of <code>Byk.to_cyrillic</code> in place,
303
+ * returning <i>str</i>, whether any changes were made or not.
304
+ *
305
+ * @overload to_cyrillic!(str)
306
+ * @param [String] str text to be transliterated
307
+ * @return [String] transliterated text
308
+ */
288
309
  static VALUE
289
- rb_str_to_latin(int argc, VALUE *argv, VALUE str) {
290
- return str_to_latin(argc, argv, str, 0, 0);
310
+ rb_str_to_cyrillic_bang(VALUE self, VALUE str)
311
+ {
312
+ return str_to_srb(str, 0, 1);
291
313
  }
292
314
 
315
+ /**
316
+ * Returns a copy of <i>str</i> with Serbian Cyrillic characters
317
+ * transliterated into Latin.
318
+ *
319
+ * @overload to_latin(str)
320
+ * @param [String] str text to be transliterated
321
+ * @return [String] transliterated text
322
+ */
293
323
  static VALUE
294
- rb_str_to_latin_bang(int argc, VALUE *argv, VALUE str) {
295
- return str_to_latin(argc, argv, str, 0, 1);
324
+ rb_str_to_latin(VALUE self, VALUE str)
325
+ {
326
+ return str_to_srb(str, 1, 0);
296
327
  }
297
328
 
329
+ /**
330
+ * Performs transliteration of <code>Byk.to_latin</code> in place,
331
+ * returning <i>str</i>, whether any changes were made or not.
332
+ *
333
+ * @overload to_latin!(str)
334
+ * @param [String] str text to be transliterated
335
+ * @return [String] transliterated text
336
+ */
298
337
  static VALUE
299
- rb_str_to_ascii_latin(int argc, VALUE *argv, VALUE str) {
300
- return str_to_latin(argc, argv, str, 1, 0);
338
+ rb_str_to_latin_bang(VALUE self, VALUE str)
339
+ {
340
+ return str_to_srb(str, 1, 1);
301
341
  }
302
342
 
343
+ /**
344
+ * Returns a copy of <i>str</i> with Serbian characters transliterated
345
+ * into ASCII Latin.
346
+ *
347
+ * @overload to_ascii_latin(str)
348
+ * @param [String] str text to be transliterated
349
+ * @return [String] transliterated text
350
+ */
303
351
  static VALUE
304
- rb_str_to_ascii_latin_bang(int argc, VALUE *argv, VALUE str) {
305
- return str_to_latin(argc, argv, str, 1, 1);
352
+ rb_str_to_ascii_latin(VALUE self, VALUE str)
353
+ {
354
+ return str_to_srb(str, 2, 0);
355
+ }
356
+
357
+ /**
358
+ * Performs transliteration of <code>Byk.to_ascii_latin</code> in
359
+ * place, returning <i>str</i>, whether any changes were made or not.
360
+ *
361
+ * @overload to_ascii_latin!(str)
362
+ * @param [String] str text to be transliterated
363
+ * @return [String] transliterated text
364
+ */
365
+ static VALUE
366
+ rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
367
+ {
368
+ return str_to_srb(str, 2, 1);
306
369
  }
307
370
 
308
371
  void Init_byk_native(void)
309
372
  {
310
- rb_define_method(rb_cString, "to_latin", rb_str_to_latin, -1);
311
- rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, -1);
312
- rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, -1);
313
- rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, -1);
373
+ VALUE Byk = rb_define_module("Byk");
374
+ rb_define_singleton_method(Byk, "to_cyrillic", rb_str_to_cyrillic, 1);
375
+ rb_define_singleton_method(Byk, "to_cyrillic!", rb_str_to_cyrillic_bang, 1);
376
+ rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
377
+ rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
378
+ rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
379
+ rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
314
380
  }
data/lib/byk/core_ext/string.rb ADDED
@@ -0,0 +1,8 @@
1
+ class String
2
+
3
+ Byk.singleton_methods.each do |method|
4
+ define_method(method) do
5
+ Byk.send(method, self)
6
+ end
7
+ end
8
+ end
data/lib/byk/safe.rb ADDED
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+
3
+ require "byk_native"
4
+ require "byk/version"
5
+
6
+ module Byk
7
+
8
+ AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
+ AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
+
11
+ ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
+ ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
+
14
+ end
data/lib/byk/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Byk
2
- VERSION = "0.4.0"
2
+ VERSION = "1.1.0"
3
3
  end
data/lib/byk.rb CHANGED
@@ -1,14 +1,2 @@
1
- # coding: utf-8
2
-
3
- require "byk_native"
4
- require "byk/version"
5
-
6
- module Byk
7
-
8
- AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
- AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
-
11
- ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
- ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
-
14
- end
1
+ require "byk/safe"
2
+ require "byk/core_ext/string"