byk 0.4.0 → 1.1.0
This diff compares the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +22 -2
- data/README.md +96 -45
- data/exe/byk +51 -0
- data/ext/byk/byk.c +312 -246
- data/lib/byk/core_ext/string.rb +8 -0
- data/lib/byk/safe.rb +14 -0
- data/lib/byk/version.rb +1 -1
- data/lib/byk.rb +2 -14
- data/spec/byk_spec.rb +186 -72
- metadata +48 -17
data/ext/byk/byk.c
CHANGED
@@ -1,314 +1,380 @@
|
|
1
|
-
#include <stdio.h>
|
2
1
|
#include <ruby.h>
|
3
2
|
#include <ruby/encoding.h>
|
4
3
|
|
5
|
-
#
|
6
|
-
#define rb_check_arity rb_check_arity
|
7
|
-
|
8
|
-
NORETURN(void rb_error_arity(int, int, int));
|
4
|
+
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
|
9
5
|
|
10
6
|
static inline void
|
11
|
-
|
7
|
+
_str_cat_char(VALUE str, unsigned c, rb_encoding *enc)
|
12
8
|
{
|
13
|
-
|
14
|
-
|
9
|
+
char s[16];
|
10
|
+
int n = rb_enc_codelen(c, enc);
|
11
|
+
rb_enc_mbcput(c, s, enc);
|
12
|
+
rb_str_buf_cat(str, s, n);
|
15
13
|
}
|
16
|
-
#endif
|
17
|
-
|
18
|
-
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
|
19
|
-
|
20
|
-
#define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc) \
|
21
|
-
ascii ? rb_enc_str_buf_cat(dest, chr, len, enc) \
|
22
|
-
: str_cat_char(dest, ascii_chr, enc)
|
23
14
|
|
24
15
|
enum {
|
25
|
-
LAT_CAP_TJ=262,
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
CYR_CAP_A,
|
42
|
-
CYR_CAP_B,
|
43
|
-
CYR_CAP_V,
|
44
|
-
CYR_CAP_G,
|
45
|
-
CYR_CAP_D,
|
46
|
-
CYR_CAP_E,
|
47
|
-
CYR_CAP_ZH,
|
48
|
-
CYR_CAP_Z,
|
49
|
-
CYR_CAP_I,
|
50
|
-
CYR_CAP_K=1050,
|
51
|
-
CYR_CAP_L,
|
52
|
-
CYR_CAP_M,
|
53
|
-
CYR_CAP_N,
|
54
|
-
CYR_CAP_O,
|
55
|
-
CYR_CAP_P,
|
56
|
-
CYR_CAP_R,
|
57
|
-
CYR_CAP_S,
|
58
|
-
CYR_CAP_T,
|
59
|
-
CYR_CAP_U,
|
60
|
-
CYR_CAP_F,
|
61
|
-
CYR_CAP_H,
|
62
|
-
CYR_CAP_C,
|
63
|
-
CYR_CAP_CH,
|
64
|
-
CYR_CAP_SH,
|
65
|
-
CYR_A=1072,
|
66
|
-
CYR_B,
|
67
|
-
CYR_V,
|
68
|
-
CYR_G,
|
69
|
-
CYR_D,
|
70
|
-
CYR_E,
|
71
|
-
CYR_ZH,
|
72
|
-
CYR_Z,
|
73
|
-
CYR_I,
|
74
|
-
CYR_K=1082,
|
75
|
-
CYR_L,
|
76
|
-
CYR_M,
|
77
|
-
CYR_N,
|
78
|
-
CYR_O,
|
79
|
-
CYR_P,
|
80
|
-
CYR_R,
|
81
|
-
CYR_S,
|
82
|
-
CYR_T,
|
83
|
-
CYR_U,
|
84
|
-
CYR_F,
|
85
|
-
CYR_H,
|
86
|
-
CYR_C,
|
87
|
-
CYR_CH,
|
88
|
-
CYR_SH,
|
89
|
-
CYR_DJ=1106,
|
90
|
-
CYR_J=1112,
|
91
|
-
CYR_LJ,
|
92
|
-
CYR_NJ,
|
93
|
-
CYR_TJ,
|
94
|
-
CYR_DZ=1119
|
16
|
+
LAT_CAP_TJ=262, LAT_TJ, LAT_CAP_CH=268, LAT_CH,
|
17
|
+
LAT_CAP_DJ=272, LAT_DJ, LAT_CAP_SH=352, LAT_SH,
|
18
|
+
LAT_CAP_ZH=381, LAT_ZH, CYR_CAP_DJ=1026, CYR_CAP_J=1032,
|
19
|
+
CYR_CAP_LJ, CYR_CAP_NJ, CYR_CAP_TJ, CYR_CAP_DZ=1039,
|
20
|
+
CYR_CAP_A, CYR_CAP_B, CYR_CAP_V, CYR_CAP_G,
|
21
|
+
CYR_CAP_D, CYR_CAP_E, CYR_CAP_ZH, CYR_CAP_Z,
|
22
|
+
CYR_CAP_I, CYR_CAP_K=1050, CYR_CAP_L, CYR_CAP_M,
|
23
|
+
CYR_CAP_N, CYR_CAP_O, CYR_CAP_P, CYR_CAP_R,
|
24
|
+
CYR_CAP_S, CYR_CAP_T, CYR_CAP_U, CYR_CAP_F,
|
25
|
+
CYR_CAP_H, CYR_CAP_C, CYR_CAP_CH, CYR_CAP_SH,
|
26
|
+
CYR_A=1072, CYR_B, CYR_V, CYR_G, CYR_D,
|
27
|
+
CYR_E, CYR_ZH, CYR_Z, CYR_I, CYR_K=1082,
|
28
|
+
CYR_L, CYR_M, CYR_N, CYR_O, CYR_P,
|
29
|
+
CYR_R, CYR_S, CYR_T, CYR_U, CYR_F,
|
30
|
+
CYR_H, CYR_C, CYR_CH, CYR_SH, CYR_DJ=1106,
|
31
|
+
CYR_J=1112, CYR_LJ, CYR_NJ, CYR_TJ, CYR_DZ=1119
|
95
32
|
};
|
96
33
|
|
97
|
-
static inline unsigned
|
98
|
-
|
34
|
+
static inline unsigned
|
35
|
+
is_cap(unsigned codepoint)
|
99
36
|
{
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
37
|
+
if (codepoint >= 65 && codepoint <= 90) return 1;
|
38
|
+
if (codepoint >= CYR_CAP_DJ && codepoint <= CYR_CAP_SH) return 1;
|
39
|
+
|
40
|
+
switch(codepoint) {
|
41
|
+
case LAT_CAP_TJ:
|
42
|
+
case LAT_CAP_CH:
|
43
|
+
case LAT_CAP_DJ:
|
44
|
+
case LAT_CAP_SH:
|
45
|
+
case LAT_CAP_ZH:
|
46
|
+
return 1;
|
47
|
+
default:
|
48
|
+
return 0;
|
49
|
+
}
|
107
50
|
}
|
108
51
|
|
109
|
-
static
|
110
|
-
|
52
|
+
static inline unsigned
|
53
|
+
is_digraph(unsigned codepoint)
|
111
54
|
{
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
55
|
+
switch(codepoint) {
|
56
|
+
case CYR_LJ:
|
57
|
+
case CYR_NJ:
|
58
|
+
case CYR_DZ:
|
59
|
+
case CYR_CAP_LJ:
|
60
|
+
case CYR_CAP_NJ:
|
61
|
+
case CYR_CAP_DZ:
|
62
|
+
return 1;
|
63
|
+
default:
|
64
|
+
return 0;
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
static unsigned
|
69
|
+
digraph_to_cyr(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
|
70
|
+
{
|
71
|
+
static unsigned CYR_MAP[] = {
|
72
|
+
CYR_A, CYR_B, CYR_C, CYR_D, CYR_E, CYR_F,
|
73
|
+
CYR_G, CYR_H, CYR_I, CYR_J, CYR_K, CYR_L,
|
74
|
+
CYR_M, CYR_N, CYR_O, CYR_P, 0, CYR_R,
|
75
|
+
CYR_S, CYR_T, CYR_U, CYR_V, 0, 0, 0, CYR_Z
|
76
|
+
};
|
77
|
+
|
78
|
+
static unsigned CYR_CAPS_MAP[] = {
|
79
|
+
CYR_CAP_A, CYR_CAP_B, CYR_CAP_C, CYR_CAP_D, CYR_CAP_E, CYR_CAP_F,
|
80
|
+
CYR_CAP_G, CYR_CAP_H, CYR_CAP_I, CYR_CAP_J, CYR_CAP_K, CYR_CAP_L,
|
81
|
+
CYR_CAP_M, CYR_CAP_N, CYR_CAP_O, CYR_CAP_P, 0, CYR_CAP_R,
|
82
|
+
CYR_CAP_S, CYR_CAP_T, CYR_CAP_U, CYR_CAP_V, 0, 0, 0, CYR_CAP_Z
|
83
|
+
};
|
84
|
+
|
85
|
+
if (codepoint2 == LAT_CAP_ZH || codepoint2 == LAT_ZH) {
|
86
|
+
switch (codepoint) {
|
87
|
+
case 'd': return CYR_DZ;
|
88
|
+
case 'D': return CYR_CAP_DZ;
|
89
|
+
}
|
90
|
+
}
|
91
|
+
|
92
|
+
if (codepoint2 == 'j' || codepoint2 == 'J') {
|
93
|
+
switch (codepoint) {
|
94
|
+
case 'l': return CYR_LJ;
|
95
|
+
case 'n': return CYR_NJ;
|
96
|
+
case 'L': return CYR_CAP_LJ;
|
97
|
+
case 'N': return CYR_CAP_NJ;
|
98
|
+
}
|
99
|
+
}
|
100
|
+
|
101
|
+
if (codepoint >= 'a' && codepoint <= 'z') return CYR_MAP[codepoint - 'a'];
|
102
|
+
if (codepoint >= 'A' && codepoint <= 'Z') return CYR_CAPS_MAP[codepoint - 'A'];
|
103
|
+
|
104
|
+
switch (codepoint) {
|
105
|
+
case LAT_CH: return CYR_CH;
|
106
|
+
case LAT_DJ: return CYR_DJ;
|
107
|
+
case LAT_SH: return CYR_SH;
|
108
|
+
case LAT_TJ: return CYR_TJ;
|
109
|
+
case LAT_ZH: return CYR_ZH;
|
110
|
+
case LAT_CAP_CH: return CYR_CAP_CH;
|
111
|
+
case LAT_CAP_DJ: return CYR_CAP_DJ;
|
112
|
+
case LAT_CAP_SH: return CYR_CAP_SH;
|
113
|
+
case LAT_CAP_TJ: return CYR_CAP_TJ;
|
114
|
+
case LAT_CAP_ZH: return CYR_CAP_ZH;
|
115
|
+
}
|
116
|
+
|
117
|
+
return 0;
|
118
|
+
}
|
119
|
+
|
120
|
+
static unsigned
|
121
|
+
digraph_to_latin(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
|
122
|
+
{
|
123
|
+
static char LAT_MAP[] = {
|
124
|
+
'a', 'b', 'v', 'g', 'd', 'e', 0, 'z', 'i', 0, 'k', 'l',
|
125
|
+
'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
|
126
|
+
};
|
127
|
+
|
128
|
+
static char LAT_CAPS_MAP[] = {
|
129
|
+
'A', 'B', 'V', 'G', 'D', 'E', 0, 'Z', 'I', 0, 'K', 'L',
|
130
|
+
'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
|
131
|
+
};
|
132
|
+
|
133
|
+
if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) return 0;
|
134
|
+
|
135
|
+
switch (codepoint) {
|
136
|
+
case CYR_ZH: return LAT_ZH;
|
137
|
+
case CYR_CAP_ZH: return LAT_CAP_ZH;
|
138
|
+
}
|
139
|
+
|
140
|
+
if (codepoint >= CYR_A && codepoint <= CYR_C)
|
141
|
+
return LAT_MAP[codepoint - CYR_A];
|
142
|
+
|
143
|
+
if (codepoint >= CYR_CAP_A && codepoint <= CYR_CAP_C)
|
144
|
+
return LAT_CAPS_MAP[codepoint - CYR_CAP_A];
|
145
|
+
|
146
|
+
if (codepoint >= CYR_A) {
|
147
|
+
switch (codepoint) {
|
148
|
+
case CYR_J: return 'j';
|
149
|
+
case CYR_TJ: return LAT_TJ;
|
150
|
+
case CYR_CH: return LAT_CH;
|
151
|
+
case CYR_SH: return LAT_SH;
|
152
|
+
case CYR_DJ: return LAT_DJ;
|
153
|
+
case CYR_LJ: *next_out = 'j'; return 'l';
|
154
|
+
case CYR_NJ: *next_out = 'j'; return 'n';
|
155
|
+
case CYR_DZ: *next_out = LAT_ZH; return 'd';
|
156
|
+
}
|
157
|
+
}
|
158
|
+
else {
|
159
|
+
switch (codepoint) {
|
160
|
+
case CYR_CAP_J: return 'J';
|
161
|
+
case CYR_CAP_TJ: return LAT_CAP_TJ;
|
162
|
+
case CYR_CAP_CH: return LAT_CAP_CH;
|
163
|
+
case CYR_CAP_SH: return LAT_CAP_SH;
|
164
|
+
case CYR_CAP_DJ: return LAT_CAP_DJ;
|
165
|
+
case CYR_CAP_LJ: *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'L';
|
166
|
+
case CYR_CAP_NJ: *next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'N';
|
167
|
+
case CYR_CAP_DZ: *next_out = (capitalize || is_cap(codepoint2)) ? LAT_CAP_ZH : LAT_ZH; return 'D';
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
return 0;
|
172
|
+
}
|
173
|
+
|
174
|
+
static unsigned
|
175
|
+
digraph_to_ascii(unsigned codepoint, unsigned codepoint2, unsigned capitalize, unsigned *next_out)
|
176
|
+
{
|
177
|
+
switch (codepoint) {
|
178
|
+
case LAT_TJ:
|
179
|
+
case LAT_CH:
|
180
|
+
case CYR_TJ:
|
181
|
+
case CYR_CH: return 'c';
|
182
|
+
case LAT_SH:
|
183
|
+
case CYR_SH: return 's';
|
184
|
+
case LAT_ZH:
|
185
|
+
case CYR_ZH: return 'z';
|
186
|
+
case LAT_DJ:
|
187
|
+
case CYR_DJ: *next_out = 'j'; return 'd';
|
188
|
+
case LAT_CAP_TJ:
|
189
|
+
case LAT_CAP_CH:
|
190
|
+
case CYR_CAP_TJ:
|
191
|
+
case CYR_CAP_CH: return 'C';
|
192
|
+
case LAT_CAP_SH:
|
193
|
+
case CYR_CAP_SH: return 'S';
|
194
|
+
case LAT_CAP_ZH:
|
195
|
+
case CYR_CAP_ZH: return 'Z';
|
196
|
+
case LAT_CAP_DJ:
|
197
|
+
case CYR_CAP_DJ:
|
198
|
+
*next_out = (capitalize || is_cap(codepoint2)) ? 'J' : 'j'; return 'D';
|
199
|
+
case CYR_DZ:
|
200
|
+
*next_out = (capitalize || is_cap(codepoint2)) ? 'Z' : 'z'; return 'd';
|
201
|
+
case CYR_CAP_DZ:
|
202
|
+
*next_out = (capitalize || is_cap(codepoint2)) ? 'Z' : 'z'; return 'D';
|
203
|
+
default:
|
204
|
+
return digraph_to_latin(codepoint, codepoint2, capitalize, next_out);
|
205
|
+
}
|
116
206
|
}
|
117
207
|
|
118
208
|
static VALUE
|
119
|
-
|
209
|
+
str_to_srb(VALUE str, int strategy, int bang)
|
120
210
|
{
|
121
211
|
VALUE dest;
|
122
|
-
long dest_len;
|
123
|
-
char *pos, *end;
|
124
212
|
rb_encoding *enc;
|
213
|
+
|
125
214
|
int len, next_len;
|
126
|
-
|
127
|
-
|
128
|
-
unsigned int codepoint = 0;
|
129
|
-
unsigned int next_codepoint = 0;
|
215
|
+
unsigned in, in2, out, out2, seen_cap = 0;
|
216
|
+
char *pos, *end, *seq_start = 0;
|
130
217
|
|
131
|
-
|
218
|
+
unsigned (*method)(unsigned, unsigned, unsigned, unsigned*);
|
219
|
+
|
220
|
+
switch(strategy) {
|
221
|
+
case 0: method = &digraph_to_cyr; break;
|
222
|
+
case 1: method = &digraph_to_latin; break;
|
223
|
+
default: method = &digraph_to_ascii;
|
224
|
+
}
|
132
225
|
|
226
|
+
StringValue(str);
|
133
227
|
pos = RSTRING_PTR(str);
|
134
228
|
if (!pos || RSTRING_LEN(str) == 0) return str;
|
135
229
|
|
136
230
|
end = RSTRING_END(str);
|
137
231
|
enc = STR_ENC_GET(str);
|
138
|
-
|
139
|
-
dest = rb_str_buf_new(dest_len);
|
232
|
+
dest = rb_str_buf_new(RSTRING_LEN(str) + 30);
|
140
233
|
rb_enc_associate(dest, enc);
|
141
234
|
|
142
|
-
|
235
|
+
in = rb_enc_codepoint_len(pos, end, &len, enc);
|
143
236
|
|
144
237
|
while (pos < end) {
|
145
|
-
|
146
|
-
next_codepoint = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
|
147
|
-
}
|
238
|
+
in2 = out2 = 0;
|
148
239
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
/* Latin -> "ASCII Latin" conversion */
|
153
|
-
if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
|
154
|
-
switch (codepoint) {
|
155
|
-
case LAT_TJ:
|
156
|
-
case LAT_CH: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
|
157
|
-
case LAT_DJ: rb_enc_str_buf_cat(dest, "dj", 2, enc); break;
|
158
|
-
case LAT_SH: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
|
159
|
-
case LAT_ZH: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
|
160
|
-
case LAT_CAP_TJ:
|
161
|
-
case LAT_CAP_CH: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
|
162
|
-
case LAT_CAP_SH: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
|
163
|
-
case LAT_CAP_ZH: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
|
164
|
-
|
165
|
-
case LAT_CAP_DJ:
|
166
|
-
force_upper ? rb_enc_str_buf_cat(dest, "DJ", 2, enc)
|
167
|
-
: rb_enc_str_buf_cat(dest, "Dj", 2, enc);
|
168
|
-
break;
|
169
|
-
default:
|
170
|
-
rb_enc_str_buf_cat(dest, pos, len, enc);
|
171
|
-
}
|
172
|
-
}
|
240
|
+
if (pos + len < end)
|
241
|
+
in2 = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
|
173
242
|
|
174
|
-
|
175
|
-
else if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
|
176
|
-
rb_enc_str_buf_cat(dest, pos, len, enc);
|
177
|
-
}
|
243
|
+
out = (*method)(in, in2, seen_cap, &out2);
|
178
244
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
case CYR_V: rb_enc_str_buf_cat(dest, "v", 1, enc); break;
|
185
|
-
case CYR_G: rb_enc_str_buf_cat(dest, "g", 1, enc); break;
|
186
|
-
case CYR_D: rb_enc_str_buf_cat(dest, "d", 1, enc); break;
|
187
|
-
case CYR_E: rb_enc_str_buf_cat(dest, "e", 1, enc); break;
|
188
|
-
case CYR_Z: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
|
189
|
-
case CYR_I: rb_enc_str_buf_cat(dest, "i", 1, enc); break;
|
190
|
-
case CYR_K: rb_enc_str_buf_cat(dest, "k", 1, enc); break;
|
191
|
-
case CYR_L: rb_enc_str_buf_cat(dest, "l", 1, enc); break;
|
192
|
-
case CYR_M: rb_enc_str_buf_cat(dest, "m", 1, enc); break;
|
193
|
-
case CYR_N: rb_enc_str_buf_cat(dest, "n", 1, enc); break;
|
194
|
-
case CYR_O: rb_enc_str_buf_cat(dest, "o", 1, enc); break;
|
195
|
-
case CYR_P: rb_enc_str_buf_cat(dest, "p", 1, enc); break;
|
196
|
-
case CYR_R: rb_enc_str_buf_cat(dest, "r", 1, enc); break;
|
197
|
-
case CYR_S: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
|
198
|
-
case CYR_T: rb_enc_str_buf_cat(dest, "t", 1, enc); break;
|
199
|
-
case CYR_U: rb_enc_str_buf_cat(dest, "u", 1, enc); break;
|
200
|
-
case CYR_F: rb_enc_str_buf_cat(dest, "f", 1, enc); break;
|
201
|
-
case CYR_H: rb_enc_str_buf_cat(dest, "h", 1, enc); break;
|
202
|
-
case CYR_C: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
|
203
|
-
case CYR_J: rb_enc_str_buf_cat(dest, "j", 1, enc); break;
|
204
|
-
case CYR_LJ: rb_enc_str_buf_cat(dest, "lj", 2, enc); break;
|
205
|
-
case CYR_NJ: rb_enc_str_buf_cat(dest, "nj", 2, enc); break;
|
206
|
-
case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
|
207
|
-
case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
|
208
|
-
case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
|
209
|
-
case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
|
210
|
-
case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
|
211
|
-
case CYR_DZ:
|
212
|
-
rb_enc_str_buf_cat(dest, "d", 1, enc);
|
213
|
-
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
214
|
-
break;
|
215
|
-
default:
|
216
|
-
rb_enc_str_buf_cat(dest, pos, len, enc);
|
245
|
+
if (out) {
|
246
|
+
/* flush previous untranslatable sequence */
|
247
|
+
if (seq_start) {
|
248
|
+
rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
249
|
+
seq_start = 0;
|
217
250
|
}
|
251
|
+
|
252
|
+
_str_cat_char(dest, out, enc);
|
253
|
+
if (out2) _str_cat_char(dest, out2, enc);
|
254
|
+
}
|
255
|
+
else if (!seq_start) {
|
256
|
+
/* mark the beginning of an untranslatable sequence */
|
257
|
+
seq_start = pos;
|
218
258
|
}
|
219
259
|
|
220
|
-
/*
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
case CYR_CAP_B: rb_enc_str_buf_cat(dest, "B", 1, enc); break;
|
226
|
-
case CYR_CAP_V: rb_enc_str_buf_cat(dest, "V", 1, enc); break;
|
227
|
-
case CYR_CAP_G: rb_enc_str_buf_cat(dest, "G", 1, enc); break;
|
228
|
-
case CYR_CAP_D: rb_enc_str_buf_cat(dest, "D", 1, enc); break;
|
229
|
-
case CYR_CAP_E: rb_enc_str_buf_cat(dest, "E", 1, enc); break;
|
230
|
-
case CYR_CAP_Z: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
|
231
|
-
case CYR_CAP_I: rb_enc_str_buf_cat(dest, "I", 1, enc); break;
|
232
|
-
case CYR_CAP_K: rb_enc_str_buf_cat(dest, "K", 1, enc); break;
|
233
|
-
case CYR_CAP_L: rb_enc_str_buf_cat(dest, "L", 1, enc); break;
|
234
|
-
case CYR_CAP_M: rb_enc_str_buf_cat(dest, "M", 1, enc); break;
|
235
|
-
case CYR_CAP_N: rb_enc_str_buf_cat(dest, "N", 1, enc); break;
|
236
|
-
case CYR_CAP_O: rb_enc_str_buf_cat(dest, "O", 1, enc); break;
|
237
|
-
case CYR_CAP_P: rb_enc_str_buf_cat(dest, "P", 1, enc); break;
|
238
|
-
case CYR_CAP_R: rb_enc_str_buf_cat(dest, "R", 1, enc); break;
|
239
|
-
case CYR_CAP_S: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
|
240
|
-
case CYR_CAP_T: rb_enc_str_buf_cat(dest, "T", 1, enc); break;
|
241
|
-
case CYR_CAP_U: rb_enc_str_buf_cat(dest, "U", 1, enc); break;
|
242
|
-
case CYR_CAP_F: rb_enc_str_buf_cat(dest, "F", 1, enc); break;
|
243
|
-
case CYR_CAP_H: rb_enc_str_buf_cat(dest, "H", 1, enc); break;
|
244
|
-
case CYR_CAP_C: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
|
245
|
-
case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
|
246
|
-
case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
|
247
|
-
case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
|
248
|
-
case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
|
249
|
-
case CYR_CAP_LJ:
|
250
|
-
rb_enc_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2, enc);
|
251
|
-
break;
|
252
|
-
case CYR_CAP_NJ:
|
253
|
-
rb_enc_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2, enc);
|
254
|
-
break;
|
255
|
-
case CYR_CAP_DJ:
|
256
|
-
STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc);
|
257
|
-
break;
|
258
|
-
case CYR_CAP_DZ:
|
259
|
-
rb_enc_str_buf_cat(dest, "D", 1, enc);
|
260
|
-
if (force_upper) {
|
261
|
-
STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
|
262
|
-
}
|
263
|
-
else {
|
264
|
-
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
265
|
-
}
|
266
|
-
break;
|
267
|
-
default:
|
268
|
-
rb_enc_str_buf_cat(dest, pos, len, enc);
|
269
|
-
}
|
260
|
+
/* for cyrillic output, skip the second half of an input digraph */
|
261
|
+
if (strategy == 0 && is_digraph(out)) {
|
262
|
+
pos += next_len;
|
263
|
+
if (pos + len < end)
|
264
|
+
in2 = rb_enc_codepoint_len(pos + len, end, &next_len, enc);
|
270
265
|
}
|
266
|
+
|
267
|
+
seen_cap = is_cap(in);
|
268
|
+
|
271
269
|
pos += len;
|
272
270
|
len = next_len;
|
273
|
-
|
274
|
-
next_codepoint = 0;
|
271
|
+
in = in2;
|
275
272
|
}
|
276
273
|
|
274
|
+
/* flush final sequence */
|
275
|
+
if (seq_start) rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
276
|
+
|
277
277
|
if (bang) {
|
278
278
|
rb_str_shared_replace(str, dest);
|
279
279
|
}
|
280
280
|
else {
|
281
|
-
|
282
|
-
str = dest;
|
281
|
+
str = dest;
|
283
282
|
}
|
284
283
|
|
285
284
|
return str;
|
286
285
|
}
|
287
286
|
|
287
|
+
/**
|
288
|
+
* Returns a copy of <i>str</i> with Latin characters transliterated
|
289
|
+
* into Serbian Cyrillic.
|
290
|
+
*
|
291
|
+
* @overload to_cyrillic(str)
|
292
|
+
* @param [String] str text to be transliterated
|
293
|
+
* @return [String] transliterated text
|
294
|
+
*/
|
295
|
+
static VALUE
|
296
|
+
rb_str_to_cyrillic(VALUE self, VALUE str)
|
297
|
+
{
|
298
|
+
return str_to_srb(str, 0, 0);
|
299
|
+
}
|
300
|
+
|
301
|
+
/**
|
302
|
+
* Performs transliteration of <code>Byk.to_cyrillic</code> in place,
|
303
|
+
* returning <i>str</i>, whether any changes were made or not.
|
304
|
+
*
|
305
|
+
* @overload to_cyrillic!(str)
|
306
|
+
* @param [String] str text to be transliterated
|
307
|
+
* @return [String] transliterated text
|
308
|
+
*/
|
288
309
|
static VALUE
|
289
|
-
|
290
|
-
|
310
|
+
rb_str_to_cyrillic_bang(VALUE self, VALUE str)
|
311
|
+
{
|
312
|
+
return str_to_srb(str, 0, 1);
|
291
313
|
}
|
292
314
|
|
315
|
+
/**
|
316
|
+
* Returns a copy of <i>str</i> with Serbian Cyrillic characters
|
317
|
+
* transliterated into Latin.
|
318
|
+
*
|
319
|
+
* @overload to_latin(str)
|
320
|
+
* @param [String] str text to be transliterated
|
321
|
+
* @return [String] transliterated text
|
322
|
+
*/
|
293
323
|
static VALUE
|
294
|
-
|
295
|
-
|
324
|
+
rb_str_to_latin(VALUE self, VALUE str)
|
325
|
+
{
|
326
|
+
return str_to_srb(str, 1, 0);
|
296
327
|
}
|
297
328
|
|
329
|
+
/**
|
330
|
+
* Performs transliteration of <code>Byk.to_latin</code> in place,
|
331
|
+
* returning <i>str</i>, whether any changes were made or not.
|
332
|
+
*
|
333
|
+
* @overload to_latin!(str)
|
334
|
+
* @param [String] str text to be transliterated
|
335
|
+
* @return [String] transliterated text
|
336
|
+
*/
|
298
337
|
static VALUE
|
299
|
-
|
300
|
-
|
338
|
+
rb_str_to_latin_bang(VALUE self, VALUE str)
|
339
|
+
{
|
340
|
+
return str_to_srb(str, 1, 1);
|
301
341
|
}
|
302
342
|
|
343
|
+
/**
|
344
|
+
* Returns a copy of <i>str</i> with Serbian characters transliterated
|
345
|
+
* into ASCII Latin.
|
346
|
+
*
|
347
|
+
* @overload to_ascii_latin(str)
|
348
|
+
* @param [String] str text to be transliterated
|
349
|
+
* @return [String] transliterated text
|
350
|
+
*/
|
303
351
|
static VALUE
|
304
|
-
|
305
|
-
|
352
|
+
rb_str_to_ascii_latin(VALUE self, VALUE str)
|
353
|
+
{
|
354
|
+
return str_to_srb(str, 2, 0);
|
355
|
+
}
|
356
|
+
|
357
|
+
/**
|
358
|
+
* Performs transliteration of <code>Byk.to_ascii_latin</code> in
|
359
|
+
* place, returning <i>str</i>, whether any changes were made or not.
|
360
|
+
*
|
361
|
+
* @overload to_ascii_latin!(str)
|
362
|
+
* @param [String] str text to be transliterated
|
363
|
+
* @return [String] transliterated text
|
364
|
+
*/
|
365
|
+
static VALUE
|
366
|
+
rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
|
367
|
+
{
|
368
|
+
return str_to_srb(str, 2, 1);
|
306
369
|
}
|
307
370
|
|
308
371
|
void Init_byk_native(void)
|
309
372
|
{
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
373
|
+
VALUE Byk = rb_define_module("Byk");
|
374
|
+
rb_define_singleton_method(Byk, "to_cyrillic", rb_str_to_cyrillic, 1);
|
375
|
+
rb_define_singleton_method(Byk, "to_cyrillic!", rb_str_to_cyrillic_bang, 1);
|
376
|
+
rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
|
377
|
+
rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
|
378
|
+
rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
|
379
|
+
rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
|
314
380
|
}
|
data/lib/byk/safe.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "byk_native"
|
4
|
+
require "byk/version"
|
5
|
+
|
6
|
+
module Byk
|
7
|
+
|
8
|
+
AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
|
9
|
+
AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
|
10
|
+
|
11
|
+
ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
|
12
|
+
ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
|
13
|
+
|
14
|
+
end
|
data/lib/byk/version.rb
CHANGED
data/lib/byk.rb
CHANGED
@@ -1,14 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require "byk_native"
|
4
|
-
require "byk/version"
|
5
|
-
|
6
|
-
module Byk
|
7
|
-
|
8
|
-
AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
|
9
|
-
AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
|
10
|
-
|
11
|
-
ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
|
12
|
-
ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
|
13
|
-
|
14
|
-
end
|
1
|
+
require "byk/safe"
|
2
|
+
require "byk/core_ext/string"
|