byk 0.4.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/byk/byk.c CHANGED
@@ -1,314 +1,380 @@
1
- #include <stdio.h>
2
1
  #include <ruby.h>
3
2
  #include <ruby/encoding.h>
4
3
 
5
- #ifndef rb_check_arity
6
- #define rb_check_arity rb_check_arity
7
-
8
- NORETURN(void rb_error_arity(int, int, int));
4
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
9
5
 
10
6
  static inline void
11
- rb_check_arity(int argc, int min, int max)
7
+ _str_cat_char(VALUE str, unsigned c, rb_encoding *enc)
12
8
  {
13
- if ((argc < min) || (max != -1 && argc > max))
14
- rb_error_arity(argc, min, max);
9
+ char s[16];
10
+ int n = rb_enc_codelen(c, enc);
11
+ rb_enc_mbcput(c, s, enc);
12
+ rb_str_buf_cat(str, s, n);
15
13
  }
16
- #endif
17
-
18
- #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
19
-
20
- #define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc) \
21
- ascii ? rb_enc_str_buf_cat(dest, chr, len, enc) \
22
- : str_cat_char(dest, ascii_chr, enc)
23
14
 
24
15
  enum {
25
- LAT_CAP_TJ=262,
26
- LAT_TJ,
27
- LAT_CAP_CH=268,
28
- LAT_CH,
29
- LAT_CAP_DJ=272,
30
- LAT_DJ,
31
- LAT_CAP_SH=352,
32
- LAT_SH,
33
- LAT_CAP_ZH=381,
34
- LAT_ZH,
35
- CYR_CAP_DJ=1026,
36
- CYR_CAP_J=1032,
37
- CYR_CAP_LJ,
38
- CYR_CAP_NJ,
39
- CYR_CAP_TJ,
40
- CYR_CAP_DZ=1039,
41
- CYR_CAP_A,
42
- CYR_CAP_B,
43
- CYR_CAP_V,
44
- CYR_CAP_G,
45
- CYR_CAP_D,
46
- CYR_CAP_E,
47
- CYR_CAP_ZH,
48
- CYR_CAP_Z,
49
- CYR_CAP_I,
50
- CYR_CAP_K=1050,
51
- CYR_CAP_L,
52
- CYR_CAP_M,
53
- CYR_CAP_N,
54
- CYR_CAP_O,
55
- CYR_CAP_P,
56
- CYR_CAP_R,
57
- CYR_CAP_S,
58
- CYR_CAP_T,
59
- CYR_CAP_U,
60
- CYR_CAP_F,
61
- CYR_CAP_H,
62
- CYR_CAP_C,
63
- CYR_CAP_CH,
64
- CYR_CAP_SH,
65
- CYR_A=1072,
66
- CYR_B,
67
- CYR_V,
68
- CYR_G,
69
- CYR_D,
70
- CYR_E,
71
- CYR_ZH,
72
- CYR_Z,
73
- CYR_I,
74
- CYR_K=1082,
75
- CYR_L,
76
- CYR_M,
77
- CYR_N,
78
- CYR_O,
79
- CYR_P,
80
- CYR_R,
81
- CYR_S,
82
- CYR_T,
83
- CYR_U,
84
- CYR_F,
85
- CYR_H,
86
- CYR_C,
87
- CYR_CH,
88
- CYR_SH,
89
- CYR_DJ=1106,
90
- CYR_J=1112,
91
- CYR_LJ,
92
- CYR_NJ,
93
- CYR_TJ,
94
- CYR_DZ=1119
16
+ LAT_CAP_TJ=262, LAT_TJ, LAT_CAP_CH=268, LAT_CH,
17
+ LAT_CAP_DJ=272, LAT_DJ, LAT_CAP_SH=352, LAT_SH,
18
+ LAT_CAP_ZH=381, LAT_ZH, CYR_CAP_DJ=1026, CYR_CAP_J=1032,
19
+ CYR_CAP_LJ, CYR_CAP_NJ, CYR_CAP_TJ, CYR_CAP_DZ=1039,
20
+ CYR_CAP_A, CYR_CAP_B, CYR_CAP_V, CYR_CAP_G,
21
+ CYR_CAP_D, CYR_CAP_E, CYR_CAP_ZH, CYR_CAP_Z,
22
+ CYR_CAP_I, CYR_CAP_K=1050, CYR_CAP_L, CYR_CAP_M,
23
+ CYR_CAP_N, CYR_CAP_O, CYR_CAP_P, CYR_CAP_R,
24
+ CYR_CAP_S, CYR_CAP_T, CYR_CAP_U, CYR_CAP_F,
25
+ CYR_CAP_H, CYR_CAP_C, CYR_CAP_CH, CYR_CAP_SH,
26
+ CYR_A=1072, CYR_B, CYR_V, CYR_G, CYR_D,
27
+ CYR_E, CYR_ZH, CYR_Z, CYR_I, CYR_K=1082,
28
+ CYR_L, CYR_M, CYR_N, CYR_O, CYR_P,
29
+ CYR_R, CYR_S, CYR_T, CYR_U, CYR_F,
30
+ CYR_H, CYR_C, CYR_CH, CYR_SH, CYR_DJ=1106,
31
+ CYR_J=1112, CYR_LJ, CYR_NJ, CYR_TJ, CYR_DZ=1119
95
32
  };
96
33
 
97
- static inline unsigned int
98
- is_upper_case(unsigned int c)
34
+ static inline unsigned
35
+ is_cap(unsigned codepoint)
99
36
  {
100
- return ((c >= 65 && c <= 90)
101
- || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
102
- || c == LAT_CAP_TJ
103
- || c == LAT_CAP_CH
104
- || c == LAT_CAP_DJ
105
- || c == LAT_CAP_SH
106
- || c == LAT_CAP_ZH);
37
+ if (codepoint >= 65 && codepoint <= 90) return 1;
38
+ if (codepoint >= CYR_CAP_DJ && codepoint <= CYR_CAP_SH) return 1;
39
+
40
+ switch(codepoint) {
41
+ case LAT_CAP_TJ:
42
+ case LAT_CAP_CH:
43
+ case LAT_CAP_DJ:
44
+ case LAT_CAP_SH:
45
+ case LAT_CAP_ZH:
46
+ return 1;
47
+ default:
48
+ return 0;
49
+ }
107
50
  }
108
51
 
109
- static void
110
- str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
52
+ static inline unsigned
53
+ is_digraph(unsigned codepoint)
111
54
  {
112
- char s[16];
113
- int n = rb_enc_codelen(c, enc);
114
- rb_enc_mbcput(c, s, enc);
115
- rb_enc_str_buf_cat(str, s, n, enc);
55
+ switch(codepoint) {
56
+ case CYR_LJ:
57
+ case CYR_NJ:
58
+ case CYR_DZ:
59
+ case CYR_CAP_LJ:
60
+ case CYR_CAP_NJ:
61
+ case CYR_CAP_DZ:
62
+ return 1;
63
+ default:
64
+ return 0;
65
+ }
66
+ }
67
+
68
+ static unsigned
69
+ digraph_to_cyr(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
70
+ {
71
+ static unsigned CYR_MAP[] = {
72
+ CYR_A, CYR_B, CYR_C, CYR_D, CYR_E, CYR_F,
73
+ CYR_G, CYR_H, CYR_I, CYR_J, CYR_K, CYR_L,
74
+ CYR_M, CYR_N, CYR_O, CYR_P, 0, CYR_R,
75
+ CYR_S, CYR_T, CYR_U, CYR_V, 0, 0, 0, CYR_Z
76
+ };
77
+
78
+ static unsigned CYR_CAPS_MAP[] = {
79
+ CYR_CAP_A, CYR_CAP_B, CYR_CAP_C, CYR_CAP_D, CYR_CAP_E, CYR_CAP_F,
80
+ CYR_CAP_G, CYR_CAP_H, CYR_CAP_I, CYR_CAP_J, CYR_CAP_K, CYR_CAP_L,
81
+ CYR_CAP_M, CYR_CAP_N, CYR_CAP_O, CYR_CAP_P, 0, CYR_CAP_R,
82
+ CYR_CAP_S, CYR_CAP_T, CYR_CAP_U, CYR_CAP_V, 0, 0, 0, CYR_CAP_Z
83
+ };
84
+
85
+ if (codepoint2 == LAT_CAP_ZH || codepoint2 == LAT_ZH) {
86
+ switch (codepoint) {
87
+ case 'd': return CYR_DZ;
88
+ case 'D': return CYR_CAP_DZ;
89
+ }
90
+ }
91
+
92
+ if (codepoint2 == 'j' || codepoint2 == 'J') {
93
+ switch (codepoint) {
94
+ case 'l': return CYR_LJ;
95
+ case 'n': return CYR_NJ;
96
+ case 'L': return CYR_CAP_LJ;
97
+ case 'N': return CYR_CAP_NJ;
98
+ }
99
+ }
100
+
101
+ if (codepoint >= 'a' && codepoint <= 'z') return CYR_MAP[codepoint - 'a'];
102
+ if (codepoint >= 'A' && codepoint <= 'Z') return CYR_CAPS_MAP[codepoint - 'A'];
103
+
104
+ switch (codepoint) {
105
+ case LAT_CH: return CYR_CH;
106
+ case LAT_DJ: return CYR_DJ;
107
+ case LAT_SH: return CYR_SH;
108
+ case LAT_TJ: return CYR_TJ;
109
+ case LAT_ZH: return CYR_ZH;
110
+ case LAT_CAP_CH: return CYR_CAP_CH;
111
+ case LAT_CAP_DJ: return CYR_CAP_DJ;
112
+ case LAT_CAP_SH: return CYR_CAP_SH;
113
+ case LAT_CAP_TJ: return CYR_CAP_TJ;
114
+ case LAT_CAP_ZH: return CYR_CAP_ZH;
115
+ }
116
+
117
+ return 0;
118
+ }
119
+
120
+ static unsigned
121
+ digraph_to_latin(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
122
+ {
123
+ static char LAT_MAP[] = {
124
+ 'a', 'b', 'v', 'g', 'd', 'e', 0, 'z', 'i', 0, 'k', 'l',
125
+ 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
126
+ };
127
+
128
+ static char LAT_CAPS_MAP[] = {
129
+ 'A', 'B', 'V', 'G', 'D', 'E', 0, 'Z', 'I', 0, 'K', 'L',
130
+ 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
131
+ };
132
+
133
+ if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) return 0;
134
+
135
+ switch (codepoint) {
136
+ case CYR_ZH: return LAT_ZH;
137
+ case CYR_CAP_ZH: return LAT_CAP_ZH;
138
+ }
139
+
140
+ if (codepoint >= CYR_A && codepoint <= CYR_C)
141
+ return LAT_MAP[codepoint - CYR_A];
142
+
143
+ if (codepoint >= CYR_CAP_A && codepoint <= CYR_CAP_C)
144
+ return LAT_CAPS_MAP[codepoint - CYR_CAP_A];
145
+
146
+ if (codepoint >= CYR_A) {
147
+ switch (codepoint) {
148
+ case CYR_J: return 'j';
149
+ case CYR_TJ: return LAT_TJ;
150
+ case CYR_CH: return LAT_CH;
151
+ case CYR_SH: return LAT_SH;
152
+ case CYR_DJ: return LAT_DJ;
153
+ case CYR_LJ: *next_out = 'j'; return 'l';
154
+ case CYR_NJ: *next_out = 'j'; return 'n';
155
+ case CYR_DZ: *next_out = LAT_ZH; return 'd';
156
+ }
157
+ }
158
+ else {
159
+ switch (codepoint) {
160
+ case CYR_CAP_J: return 'J';
161
+ case CYR_CAP_TJ: return LAT_CAP_TJ;
162
+ case CYR_CAP_CH: return LAT_CAP_CH;
163
+ case CYR_CAP_SH: return LAT_CAP_SH;
164
+ case CYR_CAP_DJ: return LAT_CAP_DJ;
165
+ case CYR_CAP_LJ: *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'L';
166
+ case CYR_CAP_NJ: *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'N';
167
+ case CYR_CAP_DZ: *next_out = (capitalize || is_cap(codepoint2)) ? LAT_CAP_ZH : LAT_ZH; return 'D';
168
+ }
169
+ }
170
+
171
+ return 0;
172
+ }
173
+
174
+ static unsigned
175
+ digraph_to_ascii(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
176
+ {
177
+ switch (codepoint) {
178
+ case LAT_TJ:
179
+ case LAT_CH:
180
+ case CYR_TJ:
181
+ case CYR_CH: return 'c';
182
+ case LAT_SH:
183
+ case CYR_SH: return 's';
184
+ case LAT_ZH:
185
+ case CYR_ZH: return 'z';
186
+ case LAT_DJ:
187
+ case CYR_DJ: *next_out = 'j'; return 'd';
188
+ case LAT_CAP_TJ:
189
+ case LAT_CAP_CH:
190
+ case CYR_CAP_TJ:
191
+ case CYR_CAP_CH: return 'C';
192
+ case LAT_CAP_SH:
193
+ case CYR_CAP_SH: return 'S';
194
+ case LAT_CAP_ZH:
195
+ case CYR_CAP_ZH: return 'Z';
196
+ case LAT_CAP_DJ:
197
+ case CYR_CAP_DJ:
198
+ *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'D';
199
+ case CYR_DZ:
200
+ *next_out = (capitalize || is_cap(codepoint2)) ? 'Z' : 'z'; return 'd';
201
+ case CYR_CAP_DZ:
202
+ *next_out = (capitalize || is_cap(codepoint2)) ? 'Z' : 'z'; return 'D';
203
+ default:
204
+ return digraph_to_latin(codepoint, codepoint2, capitalize, next_out);
205
+ }
116
206
  }
117
207
 
118
208
  static VALUE
119
- str_to_latin(int argc, VALUE *argv, VALUE str, int ascii, int bang)
209
+ str_to_srb(VALUE str, int strategy, int bang)
120
210
  {
121
211
  VALUE dest;
122
- long dest_len;
123
- char *pos, *end;
124
212
  rb_encoding *enc;
213
+
125
214
  int len, next_len;
126
- int seen_upper = 0;
127
- int force_upper = 0;
128
- unsigned int codepoint = 0;
129
- unsigned int next_codepoint = 0;
215
+ unsigned in, in2, out, out2, seen_cap = 0;
216
+ char *pos, *end, *seq_start = 0;
130
217
 
131
- rb_check_arity(argc, 0, 1);
218
+ unsigned (*method)(unsigned, unsigned, unsigned, unsigned*);
219
+
220
+ switch(strategy) {
221
+ case 0: method = &digraph_to_cyr; break;
222
+ case 1: method = &digraph_to_latin; break;
223
+ default: method = &digraph_to_ascii;
224
+ }
132
225
 
226
+ StringValue(str);
133
227
  pos = RSTRING_PTR(str);
134
228
  if (!pos || RSTRING_LEN(str) == 0) return str;
135
229
 
136
230
  end = RSTRING_END(str);
137
231
  enc = STR_ENC_GET(str);
138
- dest_len = RSTRING_LEN(str) + 30;
139
- dest = rb_str_buf_new(dest_len);
232
+ dest = rb_str_buf_new(RSTRING_LEN(str) + 30);
140
233
  rb_enc_associate(dest, enc);
141
234
 
142
- codepoint = rb_enc_codepoint_len(pos, end, &len, enc);
235
+ in = rb_enc_codepoint_len(pos, end, &len, enc);
143
236
 
144
237
  while (pos < end) {
145
- if (pos + len < end) {
146
- next_codepoint = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
147
- }
238
+ in2 = out2 = 0;
148
239
 
149
- force_upper = seen_upper || is_upper_case(next_codepoint);
150
- seen_upper = is_upper_case(codepoint);
151
-
152
- /* Latin -> "ASCII Latin" conversion */
153
- if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
154
- switch (codepoint) {
155
- case LAT_TJ:
156
- case LAT_CH: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
157
- case LAT_DJ: rb_enc_str_buf_cat(dest, "dj", 2, enc); break;
158
- case LAT_SH: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
159
- case LAT_ZH: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
160
- case LAT_CAP_TJ:
161
- case LAT_CAP_CH: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
162
- case LAT_CAP_SH: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
163
- case LAT_CAP_ZH: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
164
-
165
- case LAT_CAP_DJ:
166
- force_upper ? rb_enc_str_buf_cat(dest, "DJ", 2, enc)
167
- : rb_enc_str_buf_cat(dest, "Dj", 2, enc);
168
- break;
169
- default:
170
- rb_enc_str_buf_cat(dest, pos, len, enc);
171
- }
172
- }
240
+ if (pos + len < end)
241
+ in2 = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
173
242
 
174
- /* Non-Cyrillic codepoints */
175
- else if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
176
- rb_enc_str_buf_cat(dest, pos, len, enc);
177
- }
243
+ out = (*method)(in, in2, seen_cap, &out2);
178
244
 
179
- /* Cyrillic -> Latin conversion */
180
- else if (codepoint >= CYR_A) {
181
- switch (codepoint) {
182
- case CYR_A: rb_enc_str_buf_cat(dest, "a", 1, enc); break;
183
- case CYR_B: rb_enc_str_buf_cat(dest, "b", 1, enc); break;
184
- case CYR_V: rb_enc_str_buf_cat(dest, "v", 1, enc); break;
185
- case CYR_G: rb_enc_str_buf_cat(dest, "g", 1, enc); break;
186
- case CYR_D: rb_enc_str_buf_cat(dest, "d", 1, enc); break;
187
- case CYR_E: rb_enc_str_buf_cat(dest, "e", 1, enc); break;
188
- case CYR_Z: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
189
- case CYR_I: rb_enc_str_buf_cat(dest, "i", 1, enc); break;
190
- case CYR_K: rb_enc_str_buf_cat(dest, "k", 1, enc); break;
191
- case CYR_L: rb_enc_str_buf_cat(dest, "l", 1, enc); break;
192
- case CYR_M: rb_enc_str_buf_cat(dest, "m", 1, enc); break;
193
- case CYR_N: rb_enc_str_buf_cat(dest, "n", 1, enc); break;
194
- case CYR_O: rb_enc_str_buf_cat(dest, "o", 1, enc); break;
195
- case CYR_P: rb_enc_str_buf_cat(dest, "p", 1, enc); break;
196
- case CYR_R: rb_enc_str_buf_cat(dest, "r", 1, enc); break;
197
- case CYR_S: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
198
- case CYR_T: rb_enc_str_buf_cat(dest, "t", 1, enc); break;
199
- case CYR_U: rb_enc_str_buf_cat(dest, "u", 1, enc); break;
200
- case CYR_F: rb_enc_str_buf_cat(dest, "f", 1, enc); break;
201
- case CYR_H: rb_enc_str_buf_cat(dest, "h", 1, enc); break;
202
- case CYR_C: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
203
- case CYR_J: rb_enc_str_buf_cat(dest, "j", 1, enc); break;
204
- case CYR_LJ: rb_enc_str_buf_cat(dest, "lj", 2, enc); break;
205
- case CYR_NJ: rb_enc_str_buf_cat(dest, "nj", 2, enc); break;
206
- case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
207
- case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
208
- case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
209
- case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
210
- case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
211
- case CYR_DZ:
212
- rb_enc_str_buf_cat(dest, "d", 1, enc);
213
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
214
- break;
215
- default:
216
- rb_enc_str_buf_cat(dest, pos, len, enc);
245
+ if (out) {
246
+ /* flush previous untranslatable sequence */
247
+ if (seq_start) {
248
+ rb_str_buf_cat(dest, seq_start, pos - seq_start);
249
+ seq_start = 0;
217
250
  }
251
+
252
+ _str_cat_char(dest, out, enc);
253
+ if (out2) _str_cat_char(dest, out2, enc);
254
+ }
255
+ else if (!seq_start) {
256
+ /* mark the beginning of an untranslatable sequence */
257
+ seq_start = pos;
218
258
  }
219
259
 
220
- /* Cyrillic -> Latin conversion, caps */
221
- else {
222
- switch (codepoint) {
223
- case CYR_CAP_J: rb_enc_str_buf_cat(dest, "J", 1, enc); break;
224
- case CYR_CAP_A: rb_enc_str_buf_cat(dest, "A", 1, enc); break;
225
- case CYR_CAP_B: rb_enc_str_buf_cat(dest, "B", 1, enc); break;
226
- case CYR_CAP_V: rb_enc_str_buf_cat(dest, "V", 1, enc); break;
227
- case CYR_CAP_G: rb_enc_str_buf_cat(dest, "G", 1, enc); break;
228
- case CYR_CAP_D: rb_enc_str_buf_cat(dest, "D", 1, enc); break;
229
- case CYR_CAP_E: rb_enc_str_buf_cat(dest, "E", 1, enc); break;
230
- case CYR_CAP_Z: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
231
- case CYR_CAP_I: rb_enc_str_buf_cat(dest, "I", 1, enc); break;
232
- case CYR_CAP_K: rb_enc_str_buf_cat(dest, "K", 1, enc); break;
233
- case CYR_CAP_L: rb_enc_str_buf_cat(dest, "L", 1, enc); break;
234
- case CYR_CAP_M: rb_enc_str_buf_cat(dest, "M", 1, enc); break;
235
- case CYR_CAP_N: rb_enc_str_buf_cat(dest, "N", 1, enc); break;
236
- case CYR_CAP_O: rb_enc_str_buf_cat(dest, "O", 1, enc); break;
237
- case CYR_CAP_P: rb_enc_str_buf_cat(dest, "P", 1, enc); break;
238
- case CYR_CAP_R: rb_enc_str_buf_cat(dest, "R", 1, enc); break;
239
- case CYR_CAP_S: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
240
- case CYR_CAP_T: rb_enc_str_buf_cat(dest, "T", 1, enc); break;
241
- case CYR_CAP_U: rb_enc_str_buf_cat(dest, "U", 1, enc); break;
242
- case CYR_CAP_F: rb_enc_str_buf_cat(dest, "F", 1, enc); break;
243
- case CYR_CAP_H: rb_enc_str_buf_cat(dest, "H", 1, enc); break;
244
- case CYR_CAP_C: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
245
- case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
246
- case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
247
- case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
248
- case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
249
- case CYR_CAP_LJ:
250
- rb_enc_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2, enc);
251
- break;
252
- case CYR_CAP_NJ:
253
- rb_enc_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2, enc);
254
- break;
255
- case CYR_CAP_DJ:
256
- STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc);
257
- break;
258
- case CYR_CAP_DZ:
259
- rb_enc_str_buf_cat(dest, "D", 1, enc);
260
- if (force_upper) {
261
- STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
262
- }
263
- else {
264
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
265
- }
266
- break;
267
- default:
268
- rb_enc_str_buf_cat(dest, pos, len, enc);
269
- }
260
+ /* for cyrillic output, skip the second half of an input digraph */
261
+ if (strategy == 0 && is_digraph(out)) {
262
+ pos += next_len;
263
+ if (pos + len < end)
264
+ in2 = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
270
265
  }
266
+
267
+ seen_cap = is_cap(in);
268
+
271
269
  pos += len;
272
270
  len = next_len;
273
- codepoint = next_codepoint;
274
- next_codepoint = 0;
271
+ in = in2;
275
272
  }
276
273
 
274
+ /* flush final sequence */
275
+ if (seq_start) rb_str_buf_cat(dest, seq_start, pos - seq_start);
276
+
277
277
  if (bang) {
278
278
  rb_str_shared_replace(str, dest);
279
279
  }
280
280
  else {
281
- OBJ_INFECT(dest, str);
282
- str = dest;
281
+ str = dest;
283
282
  }
284
283
 
285
284
  return str;
286
285
  }
287
286
 
287
+ /**
288
+ * Returns a copy of <i>str</i> with Latin characters transliterated
289
+ * into Serbian Cyrillic.
290
+ *
291
+ * @overload to_cyrillic(str)
292
+ * @param [String] str text to be transliterated
293
+ * @return [String] transliterated text
294
+ */
295
+ static VALUE
296
+ rb_str_to_cyrillic(VALUE self, VALUE str)
297
+ {
298
+ return str_to_srb(str, 0, 0);
299
+ }
300
+
301
+ /**
302
+ * Performs transliteration of <code>Byk.to_cyrillic</code> in place,
303
+ * returning <i>str</i>, whether any changes were made or not.
304
+ *
305
+ * @overload to_cyrillic!(str)
306
+ * @param [String] str text to be transliterated
307
+ * @return [String] transliterated text
308
+ */
288
309
  static VALUE
289
- rb_str_to_latin(int argc, VALUE *argv, VALUE str) {
290
- return str_to_latin(argc, argv, str, 0, 0);
310
+ rb_str_to_cyrillic_bang(VALUE self, VALUE str)
311
+ {
312
+ return str_to_srb(str, 0, 1);
291
313
  }
292
314
 
315
+ /**
316
+ * Returns a copy of <i>str</i> with Serbian Cyrillic characters
317
+ * transliterated into Latin.
318
+ *
319
+ * @overload to_latin(str)
320
+ * @param [String] str text to be transliterated
321
+ * @return [String] transliterated text
322
+ */
293
323
  static VALUE
294
- rb_str_to_latin_bang(int argc, VALUE *argv, VALUE str) {
295
- return str_to_latin(argc, argv, str, 0, 1);
324
+ rb_str_to_latin(VALUE self, VALUE str)
325
+ {
326
+ return str_to_srb(str, 1, 0);
296
327
  }
297
328
 
329
+ /**
330
+ * Performs transliteration of <code>Byk.to_latin</code> in place,
331
+ * returning <i>str</i>, whether any changes were made or not.
332
+ *
333
+ * @overload to_latin!(str)
334
+ * @param [String] str text to be transliterated
335
+ * @return [String] transliterated text
336
+ */
298
337
  static VALUE
299
- rb_str_to_ascii_latin(int argc, VALUE *argv, VALUE str) {
300
- return str_to_latin(argc, argv, str, 1, 0);
338
+ rb_str_to_latin_bang(VALUE self, VALUE str)
339
+ {
340
+ return str_to_srb(str, 1, 1);
301
341
  }
302
342
 
343
+ /**
344
+ * Returns a copy of <i>str</i> with Serbian characters transliterated
345
+ * into ASCII Latin.
346
+ *
347
+ * @overload to_ascii_latin(str)
348
+ * @param [String] str text to be transliterated
349
+ * @return [String] transliterated text
350
+ */
303
351
  static VALUE
304
- rb_str_to_ascii_latin_bang(int argc, VALUE *argv, VALUE str) {
305
- return str_to_latin(argc, argv, str, 1, 1);
352
+ rb_str_to_ascii_latin(VALUE self, VALUE str)
353
+ {
354
+ return str_to_srb(str, 2, 0);
355
+ }
356
+
357
+ /**
358
+ * Performs transliteration of <code>Byk.to_ascii_latin</code> in
359
+ * place, returning <i>str</i>, whether any changes were made or not.
360
+ *
361
+ * @overload to_ascii_latin!(str)
362
+ * @param [String] str text to be transliterated
363
+ * @return [String] transliterated text
364
+ */
365
+ static VALUE
366
+ rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
367
+ {
368
+ return str_to_srb(str, 2, 1);
306
369
  }
307
370
 
308
371
  void Init_byk_native(void)
309
372
  {
310
- rb_define_method(rb_cString, "to_latin", rb_str_to_latin, -1);
311
- rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, -1);
312
- rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, -1);
313
- rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, -1);
373
+ VALUE Byk = rb_define_module("Byk");
374
+ rb_define_singleton_method(Byk, "to_cyrillic", rb_str_to_cyrillic, 1);
375
+ rb_define_singleton_method(Byk, "to_cyrillic!", rb_str_to_cyrillic_bang, 1);
376
+ rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
377
+ rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
378
+ rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
379
+ rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
314
380
  }
@@ -0,0 +1,8 @@
1
+ class String
2
+
3
+ Byk.singleton_methods.each do |method|
4
+ define_method(method) do
5
+ Byk.send(method, self)
6
+ end
7
+ end
8
+ end
data/lib/byk/safe.rb ADDED
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+
3
+ require "byk_native"
4
+ require "byk/version"
5
+
6
+ module Byk
7
+
8
+ AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
+ AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
+
11
+ ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
+ ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
+
14
+ end
data/lib/byk/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Byk
2
- VERSION = "0.4.0"
2
+ VERSION = "1.1.0"
3
3
  end
data/lib/byk.rb CHANGED
@@ -1,14 +1,2 @@
1
- # coding: utf-8
2
-
3
- require "byk_native"
4
- require "byk/version"
5
-
6
- module Byk
7
-
8
- AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
- AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
-
11
- ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
- ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
-
14
- end
1
+ require "byk/safe"
2
+ require "byk/core_ext/string"