byk 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +31 -6
- data/ext/byk/byk.c +153 -163
- data/lib/byk.rb +2 -14
- data/lib/byk/core_ext/string.rb +8 -0
- data/lib/byk/safe.rb +14 -0
- data/lib/byk/version.rb +1 -1
- data/spec/byk_spec.rb +126 -69
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 34203e0b4291cde495d17da65522df586de7e712
|
4
|
+
data.tar.gz: 290d743dab23c58241520252bd81d4ae4115ce98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f11d00e9ac1057a5596804e03c6c4a6c41841bedc21030a9ed776cfbaaabba85a341a62de71c990c8deadc8f8384bf263b41d477b33b299af26b55acef47fe0c
|
7
|
+
data.tar.gz: 335ddfeca9f6793f2887c1cc93cfc916e011f7dd01fd97073162871148d0fe61395bdb1115c5ed4f7583ff207f6b6d27462c8920a7a70b016b9427420796bc28
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -61,13 +61,40 @@ text # => "Zvazbuka"
|
|
61
61
|
```
|
62
62
|
|
63
63
|
Note that these methods take into account the
|
64
|
-
[
|
64
|
+
[digraph capitalization rules](http://sr.wikipedia.org/wiki/Гајица#.D0.94.D0.B8.D0.B3.D1.80.D0.B0.D1.84.D0.B8):
|
65
65
|
|
66
66
|
```ruby
|
67
67
|
"ЉИЉА Љиљановић".to_latin # => "LJILJA Ljiljanović"
|
68
68
|
"ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
|
69
69
|
```
|
70
70
|
|
71
|
+
If you prefer not to monkey patch your strings, you can use the "safe"
|
72
|
+
require:
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
require "byk/safe"
|
76
|
+
```
|
77
|
+
|
78
|
+
and then call the module methods:
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
text = "Вук"
|
82
|
+
Byk.to_latin(text) # => "Vuk"
|
83
|
+
text # => "Вук"
|
84
|
+
Byk.to_latin!(text) # => "Vuk"
|
85
|
+
text # => "Vuk"
|
86
|
+
```
|
87
|
+
|
88
|
+
|
89
|
+
## Testing
|
90
|
+
|
91
|
+
To test the gem, clone the repo and run:
|
92
|
+
|
93
|
+
```
|
94
|
+
$ bundle
|
95
|
+
$ bundle exec rake
|
96
|
+
```
|
97
|
+
|
71
98
|
|
72
99
|
## How fast is fast?
|
73
100
|
|
@@ -84,7 +111,7 @@ projects, e.g. sites supporting dual script content. Remember,
|
|
84
111
|
|
85
112
|
I found transliteration to be a straightforward little problem that
|
86
113
|
lends itself well to optimization. It also gave me an excuse to play
|
87
|
-
with Ruby extensions, so there :
|
114
|
+
with Ruby extensions, so there :smirk_cat:
|
88
115
|
|
89
116
|
|
90
117
|
## Compatibility
|
@@ -92,10 +119,8 @@ with Ruby extensions, so there :smile_cat:
|
|
92
119
|
Byk is supported under MRI Ruby >= 1.9.2.
|
93
120
|
|
94
121
|
I don't plan to support 1.8.7 or older due to substantial C API
|
95
|
-
changes between 1.8 and 1.9.
|
96
|
-
|
97
|
-
It doesn't build under Rubinius currently, but I intend to support it
|
98
|
-
in future releases.
|
122
|
+
changes between 1.8 and 1.9. It doesn't build under Rubinius
|
123
|
+
currently, but I intend to support it in future releases.
|
99
124
|
|
100
125
|
|
101
126
|
## License
|
data/ext/byk/byk.c
CHANGED
@@ -5,91 +5,67 @@
|
|
5
5
|
|
6
6
|
#define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc) \
|
7
7
|
ascii ? rb_str_buf_cat(dest, chr, len) \
|
8
|
-
|
8
|
+
: str_cat_char(dest, ascii_chr, enc)
|
9
9
|
|
10
10
|
enum {
|
11
|
-
LAT_CAP_TJ=
|
11
|
+
LAT_CAP_TJ = 0x106,
|
12
12
|
LAT_TJ,
|
13
|
-
LAT_CAP_CH=
|
13
|
+
LAT_CAP_CH = 0x10c,
|
14
14
|
LAT_CH,
|
15
|
-
LAT_CAP_DJ=
|
15
|
+
LAT_CAP_DJ = 0x110,
|
16
16
|
LAT_DJ,
|
17
|
-
LAT_CAP_SH=
|
17
|
+
LAT_CAP_SH = 0x160,
|
18
18
|
LAT_SH,
|
19
|
-
LAT_CAP_ZH=
|
19
|
+
LAT_CAP_ZH = 0x17d,
|
20
20
|
LAT_ZH,
|
21
|
-
CYR_CAP_DJ=
|
22
|
-
CYR_CAP_J=
|
21
|
+
CYR_CAP_DJ = 0x402,
|
22
|
+
CYR_CAP_J = 0x408,
|
23
23
|
CYR_CAP_LJ,
|
24
24
|
CYR_CAP_NJ,
|
25
25
|
CYR_CAP_TJ,
|
26
|
-
CYR_CAP_DZ=
|
26
|
+
CYR_CAP_DZ = 0x40f,
|
27
27
|
CYR_CAP_A,
|
28
|
-
|
29
|
-
|
30
|
-
CYR_CAP_G,
|
31
|
-
CYR_CAP_D,
|
32
|
-
CYR_CAP_E,
|
33
|
-
CYR_CAP_ZH,
|
34
|
-
CYR_CAP_Z,
|
35
|
-
CYR_CAP_I,
|
36
|
-
CYR_CAP_K=1050,
|
37
|
-
CYR_CAP_L,
|
38
|
-
CYR_CAP_M,
|
39
|
-
CYR_CAP_N,
|
40
|
-
CYR_CAP_O,
|
41
|
-
CYR_CAP_P,
|
42
|
-
CYR_CAP_R,
|
43
|
-
CYR_CAP_S,
|
44
|
-
CYR_CAP_T,
|
45
|
-
CYR_CAP_U,
|
46
|
-
CYR_CAP_F,
|
47
|
-
CYR_CAP_H,
|
48
|
-
CYR_CAP_C,
|
28
|
+
CYR_CAP_ZH = 0x416,
|
29
|
+
CYR_CAP_C = 0x426,
|
49
30
|
CYR_CAP_CH,
|
50
31
|
CYR_CAP_SH,
|
51
|
-
CYR_A=
|
52
|
-
|
53
|
-
|
54
|
-
CYR_G,
|
55
|
-
CYR_D,
|
56
|
-
CYR_E,
|
57
|
-
CYR_ZH,
|
58
|
-
CYR_Z,
|
59
|
-
CYR_I,
|
60
|
-
CYR_K=1082,
|
61
|
-
CYR_L,
|
62
|
-
CYR_M,
|
63
|
-
CYR_N,
|
64
|
-
CYR_O,
|
65
|
-
CYR_P,
|
66
|
-
CYR_R,
|
67
|
-
CYR_S,
|
68
|
-
CYR_T,
|
69
|
-
CYR_U,
|
70
|
-
CYR_F,
|
71
|
-
CYR_H,
|
72
|
-
CYR_C,
|
32
|
+
CYR_A = 0x430,
|
33
|
+
CYR_ZH = 0x436,
|
34
|
+
CYR_C = 0x446,
|
73
35
|
CYR_CH,
|
74
36
|
CYR_SH,
|
75
|
-
CYR_DJ=
|
76
|
-
CYR_J=
|
37
|
+
CYR_DJ = 0x452,
|
38
|
+
CYR_J = 0x458,
|
77
39
|
CYR_LJ,
|
78
40
|
CYR_NJ,
|
79
41
|
CYR_TJ,
|
80
|
-
CYR_DZ=
|
42
|
+
CYR_DZ = 0x45f
|
81
43
|
};
|
82
44
|
|
83
45
|
static inline unsigned int
|
84
|
-
|
46
|
+
is_cyrillic(unsigned int c)
|
85
47
|
{
|
86
|
-
return
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
48
|
+
return c >= CYR_CAP_DJ && c <= CYR_DZ;
|
49
|
+
}
|
50
|
+
|
51
|
+
static inline unsigned int
|
52
|
+
is_upper(unsigned int c)
|
53
|
+
{
|
54
|
+
return (c >= 65 && c <= 90)
|
55
|
+
|| (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
|
56
|
+
|| c == LAT_CAP_TJ
|
57
|
+
|| c == LAT_CAP_CH
|
58
|
+
|| c == LAT_CAP_DJ
|
59
|
+
|| c == LAT_CAP_SH
|
60
|
+
|| c == LAT_CAP_ZH;
|
61
|
+
}
|
62
|
+
|
63
|
+
static inline unsigned int
|
64
|
+
maps_directly(unsigned int c)
|
65
|
+
{
|
66
|
+
return c != CYR_ZH
|
67
|
+
&& c != CYR_CAP_ZH
|
68
|
+
&& ((c >= CYR_A && c <= CYR_C) || (c >= CYR_CAP_A && c <= CYR_CAP_C));
|
93
69
|
}
|
94
70
|
|
95
71
|
static void
|
@@ -109,12 +85,24 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
109
85
|
int len, next_len;
|
110
86
|
int seen_upper = 0;
|
111
87
|
int force_upper = 0;
|
112
|
-
char *pos =
|
113
|
-
char
|
88
|
+
char *pos, *end, *seq_start = 0;
|
89
|
+
char cyr;
|
114
90
|
unsigned int codepoint = 0;
|
115
91
|
unsigned int next_codepoint = 0;
|
116
92
|
rb_encoding *enc;
|
117
93
|
|
94
|
+
char CYR_MAP[] = {
|
95
|
+
'a', 'b', 'v', 'g', 'd', 'e', '\0', 'z', 'i', '\0', 'k',
|
96
|
+
'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
|
97
|
+
};
|
98
|
+
|
99
|
+
char CYR_CAPS_MAP[] = {
|
100
|
+
'A', 'B', 'V', 'G', 'D', 'E', '\0', 'Z', 'I', '\0', 'K',
|
101
|
+
'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
|
102
|
+
};
|
103
|
+
|
104
|
+
StringValue(str);
|
105
|
+
pos = RSTRING_PTR(str);
|
118
106
|
if (!pos || RSTRING_LEN(str) == 0) return str;
|
119
107
|
|
120
108
|
end = RSTRING_END(str);
|
@@ -133,10 +121,10 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
133
121
|
/* Latin -> "ASCII Latin" conversion */
|
134
122
|
if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
|
135
123
|
if (seq_start) {
|
136
|
-
/* flush the sequence */
|
137
124
|
rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
138
125
|
seq_start = 0;
|
139
126
|
}
|
127
|
+
|
140
128
|
switch (codepoint) {
|
141
129
|
case LAT_TJ:
|
142
130
|
case LAT_CH: rb_str_buf_cat(dest, "c", 1); break;
|
@@ -148,7 +136,7 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
148
136
|
case LAT_CAP_SH: rb_str_buf_cat(dest, "S", 1); break;
|
149
137
|
case LAT_CAP_ZH: rb_str_buf_cat(dest, "Z", 1); break;
|
150
138
|
case LAT_CAP_DJ:
|
151
|
-
(seen_upper ||
|
139
|
+
(seen_upper || is_upper(next_codepoint))
|
152
140
|
? rb_str_buf_cat(dest, "DJ", 2)
|
153
141
|
: rb_str_buf_cat(dest, "Dj", 2);
|
154
142
|
break;
|
@@ -157,108 +145,73 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
157
145
|
}
|
158
146
|
}
|
159
147
|
|
160
|
-
/*
|
161
|
-
else if (codepoint
|
162
|
-
if (!seq_start)
|
163
|
-
seq_start = pos;
|
164
|
-
}
|
165
|
-
|
166
|
-
/* Cyrillic -> Latin conversion */
|
167
|
-
else {
|
148
|
+
/* Cyrillic coderange */
|
149
|
+
else if (is_cyrillic(codepoint)) {
|
168
150
|
if (seq_start) {
|
169
|
-
/* flush the sequence */
|
170
151
|
rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
171
152
|
seq_start = 0;
|
172
153
|
}
|
173
154
|
|
174
155
|
if (codepoint >= CYR_A) {
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
case CYR_C: rb_str_buf_cat(dest, "c", 1); break;
|
198
|
-
case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
|
199
|
-
case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
|
200
|
-
case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
|
201
|
-
case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
|
202
|
-
case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
|
203
|
-
case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
|
204
|
-
case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
|
205
|
-
case CYR_DZ:
|
206
|
-
rb_str_buf_cat(dest, "d", 1);
|
207
|
-
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
208
|
-
break;
|
209
|
-
default:
|
210
|
-
rb_str_buf_cat(dest, pos, len);
|
156
|
+
if (maps_directly(codepoint)) {
|
157
|
+
cyr = CYR_MAP[codepoint - CYR_A];
|
158
|
+
cyr ? rb_str_buf_cat(dest, &cyr, 1)
|
159
|
+
: rb_str_buf_cat(dest, pos, len);
|
160
|
+
}
|
161
|
+
else {
|
162
|
+
switch (codepoint) {
|
163
|
+
case CYR_J: rb_str_buf_cat(dest, "j", 1); break;
|
164
|
+
case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
|
165
|
+
case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
|
166
|
+
case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
|
167
|
+
case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
|
168
|
+
case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
|
169
|
+
case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
|
170
|
+
case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
|
171
|
+
case CYR_DZ:
|
172
|
+
rb_str_buf_cat(dest, "d", 1);
|
173
|
+
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
174
|
+
break;
|
175
|
+
default:
|
176
|
+
rb_str_buf_cat(dest, pos, len);
|
177
|
+
}
|
211
178
|
}
|
212
179
|
}
|
213
180
|
else {
|
214
|
-
|
181
|
+
if (maps_directly(codepoint)) {
|
182
|
+
cyr = CYR_CAPS_MAP[codepoint - CYR_CAP_A];
|
183
|
+
cyr ? rb_str_buf_cat(dest, &cyr, 1)
|
184
|
+
: rb_str_buf_cat(dest, pos, len);
|
185
|
+
}
|
186
|
+
else {
|
187
|
+
force_upper = seen_upper || is_upper(next_codepoint);
|
215
188
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
case CYR_CAP_R: rb_str_buf_cat(dest, "R", 1); break;
|
233
|
-
case CYR_CAP_S: rb_str_buf_cat(dest, "S", 1); break;
|
234
|
-
case CYR_CAP_T: rb_str_buf_cat(dest, "T", 1); break;
|
235
|
-
case CYR_CAP_U: rb_str_buf_cat(dest, "U", 1); break;
|
236
|
-
case CYR_CAP_F: rb_str_buf_cat(dest, "F", 1); break;
|
237
|
-
case CYR_CAP_H: rb_str_buf_cat(dest, "H", 1); break;
|
238
|
-
case CYR_CAP_C: rb_str_buf_cat(dest, "C", 1); break;
|
239
|
-
case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
|
240
|
-
case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
|
241
|
-
case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
|
242
|
-
case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
|
243
|
-
case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
|
244
|
-
case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
|
245
|
-
case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
|
246
|
-
case CYR_CAP_DZ:
|
247
|
-
rb_str_buf_cat(dest, "D", 1);
|
248
|
-
if (force_upper) {
|
249
|
-
STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
|
250
|
-
}
|
251
|
-
else {
|
252
|
-
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
189
|
+
switch (codepoint) {
|
190
|
+
case CYR_CAP_J: rb_str_buf_cat(dest, "J", 1); break;
|
191
|
+
case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
|
192
|
+
case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
|
193
|
+
case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
|
194
|
+
case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
|
195
|
+
case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
|
196
|
+
case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
|
197
|
+
case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
|
198
|
+
case CYR_CAP_DZ:
|
199
|
+
rb_str_buf_cat(dest, "D", 1);
|
200
|
+
force_upper ? STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc)
|
201
|
+
: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
202
|
+
break;
|
203
|
+
default:
|
204
|
+
rb_str_buf_cat(dest, pos, len);
|
253
205
|
}
|
254
|
-
break;
|
255
|
-
default:
|
256
|
-
rb_str_buf_cat(dest, pos, len);
|
257
206
|
}
|
258
207
|
}
|
259
208
|
}
|
209
|
+
else {
|
210
|
+
/* Mark the start of a copyable sequence */
|
211
|
+
if (!seq_start) seq_start = pos;
|
212
|
+
}
|
260
213
|
|
261
|
-
seen_upper =
|
214
|
+
seen_upper = is_upper(codepoint);
|
262
215
|
|
263
216
|
pos += len;
|
264
217
|
len = next_len;
|
@@ -267,8 +220,8 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
267
220
|
next_codepoint = 0;
|
268
221
|
}
|
269
222
|
|
223
|
+
/* Flush the last sequence, if any */
|
270
224
|
if (seq_start) {
|
271
|
-
/* flush the last sequence */
|
272
225
|
rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
273
226
|
}
|
274
227
|
|
@@ -283,30 +236,67 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
283
236
|
return str;
|
284
237
|
}
|
285
238
|
|
239
|
+
/**
|
240
|
+
* Returns a copy of <i>str</i> with the Serbian Cyrillic characters
|
241
|
+
* transliterated into Latin.
|
242
|
+
*
|
243
|
+
* @overload to_latin(str)
|
244
|
+
* @param [String] str text to be transliterated
|
245
|
+
* @return [String] transliterated text
|
246
|
+
*/
|
286
247
|
static VALUE
|
287
|
-
rb_str_to_latin(VALUE str)
|
248
|
+
rb_str_to_latin(VALUE self, VALUE str)
|
249
|
+
{
|
288
250
|
return str_to_latin(str, 0, 0);
|
289
251
|
}
|
290
252
|
|
253
|
+
/**
|
254
|
+
* Performs the transliteration of <code>Byk.to_latin</code> in place,
|
255
|
+
* returning <i>str</i>, whether changes were made or not.
|
256
|
+
*
|
257
|
+
* @overload to_latin!(str)
|
258
|
+
* @param [String] str text to be transliterated
|
259
|
+
* @return [String] transliterated text
|
260
|
+
*/
|
291
261
|
static VALUE
|
292
|
-
rb_str_to_latin_bang(VALUE str)
|
262
|
+
rb_str_to_latin_bang(VALUE self, VALUE str)
|
263
|
+
{
|
293
264
|
return str_to_latin(str, 0, 1);
|
294
265
|
}
|
295
266
|
|
267
|
+
/**
|
268
|
+
* Returns a copy of <i>str</i> with the Serbian Cyrillic
|
269
|
+
* characters transliterated into ASCII Latin.
|
270
|
+
*
|
271
|
+
* @overload to_ascii_latin(str)
|
272
|
+
* @param [String] str text to be transliterated
|
273
|
+
* @return [String] transliterated text
|
274
|
+
*/
|
296
275
|
static VALUE
|
297
|
-
rb_str_to_ascii_latin(VALUE str)
|
276
|
+
rb_str_to_ascii_latin(VALUE self, VALUE str)
|
277
|
+
{
|
298
278
|
return str_to_latin(str, 1, 0);
|
299
279
|
}
|
300
280
|
|
281
|
+
/**
|
282
|
+
* Performs the transliteration of <code>Byk.to_ascii_latin</code> in
|
283
|
+
* place, returning <i>str</i>, whether changes were made or not.
|
284
|
+
*
|
285
|
+
* @overload to_ascii_latin!(str)
|
286
|
+
* @param [String] str text to be transliterated
|
287
|
+
* @return [String] transliterated text
|
288
|
+
*/
|
301
289
|
static VALUE
|
302
|
-
rb_str_to_ascii_latin_bang(VALUE str)
|
290
|
+
rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
|
291
|
+
{
|
303
292
|
return str_to_latin(str, 1, 1);
|
304
293
|
}
|
305
294
|
|
306
295
|
void Init_byk_native(void)
|
307
296
|
{
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
297
|
+
VALUE Byk = rb_define_module("Byk");
|
298
|
+
rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
|
299
|
+
rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
|
300
|
+
rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
|
301
|
+
rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
|
312
302
|
}
|
data/lib/byk.rb
CHANGED
@@ -1,14 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require "byk_native"
|
4
|
-
require "byk/version"
|
5
|
-
|
6
|
-
module Byk
|
7
|
-
|
8
|
-
AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
|
9
|
-
AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
|
10
|
-
|
11
|
-
ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
|
12
|
-
ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
|
13
|
-
|
14
|
-
end
|
1
|
+
require "byk/safe"
|
2
|
+
require "byk/core_ext/string"
|
data/lib/byk/safe.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "byk_native"
|
4
|
+
require "byk/version"
|
5
|
+
|
6
|
+
module Byk
|
7
|
+
|
8
|
+
AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
|
9
|
+
AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
|
10
|
+
|
11
|
+
ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
|
12
|
+
ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
|
13
|
+
|
14
|
+
end
|
data/lib/byk/version.rb
CHANGED
data/spec/byk_spec.rb
CHANGED
@@ -4,123 +4,180 @@ require "spec_helper"
|
|
4
4
|
|
5
5
|
describe Byk do
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
|
7
|
+
it "has a version number" do
|
8
|
+
expect(Byk::VERSION).not_to be nil
|
9
|
+
end
|
11
10
|
|
12
|
-
|
13
|
-
let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
|
14
|
-
let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
|
11
|
+
shared_examples :base do |method|
|
15
12
|
|
16
|
-
|
17
|
-
|
13
|
+
# See http://sr.wikipedia.org/wiki/Панграм
|
14
|
+
let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
|
15
|
+
let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
|
16
|
+
let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
|
19
|
+
let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
|
20
|
+
let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
|
22
21
|
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
let(:full_cyrillic_coderange) { (0x400..0x4ff).map { |i| i.chr(Encoding::UTF_8) } }
|
23
|
+
let(:non_serbian_cyrillic_coderange) { full_cyrillic_coderange - Byk::AZBUKA - Byk::AZBUKA_CAPS }
|
24
|
+
let(:non_serbian_cyrillic) { non_serbian_cyrillic_coderange.join }
|
25
|
+
|
26
|
+
let(:ascii) { "The quick brown fox jumps over the lazy dog." }
|
27
|
+
let(:other) { "संस्कृतम् saṃskṛtam" }
|
26
28
|
|
27
|
-
|
29
|
+
let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
|
30
|
+
let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
|
31
|
+
let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
|
28
32
|
|
29
|
-
it "doesn't
|
30
|
-
expect(""
|
33
|
+
it "doesn't convert an empty string" do
|
34
|
+
expect(Byk.send(method, "")).to eq ""
|
31
35
|
end
|
32
36
|
|
33
|
-
it "doesn't
|
34
|
-
expect(ascii
|
37
|
+
it "doesn't convert ASCII text" do
|
38
|
+
expect(Byk.send(method, ascii)).to eq ascii
|
35
39
|
end
|
36
40
|
|
37
|
-
it "doesn't
|
38
|
-
expect(
|
41
|
+
it "doesn't convert non-Serbian Cyrillic" do
|
42
|
+
expect(Byk.send(method, non_serbian_cyrillic)).to eq non_serbian_cyrillic
|
39
43
|
end
|
40
44
|
|
41
|
-
it "doesn't
|
42
|
-
expect(other
|
45
|
+
it "doesn't convert other coderanges" do
|
46
|
+
expect(Byk.send(method, other)).to eq other
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
shared_examples :latinization_method do |method|
|
51
|
+
include_examples :base, method
|
52
|
+
|
53
|
+
let(:edge_cases) {
|
54
|
+
[
|
55
|
+
["Њ", "Nj"],
|
56
|
+
["Љ", "Lj"],
|
57
|
+
["Џ", "Dž"],
|
58
|
+
["ЊЊ", "NJNJ"],
|
59
|
+
["ЉЉ", "LJLJ"],
|
60
|
+
["ЏЏ", "DŽDŽ"]
|
61
|
+
]
|
62
|
+
}
|
63
|
+
|
64
|
+
it "doesn't convert Latin" do
|
65
|
+
expect(Byk.send(method, pangram_latin)).to eq pangram_latin
|
43
66
|
end
|
44
67
|
|
45
|
-
it "converts
|
46
|
-
expect(pangram
|
68
|
+
it "converts Cyrillic to Latin" do
|
69
|
+
expect(Byk.send(method, pangram)).to eq pangram_latin
|
47
70
|
end
|
48
71
|
|
49
|
-
it "converts
|
50
|
-
expect(pangram_caps
|
72
|
+
it "converts Cyrillic caps to Latin caps" do
|
73
|
+
expect(Byk.send(method, pangram_caps)).to eq pangram_latin_caps
|
51
74
|
end
|
52
75
|
|
53
76
|
it "converts mixed text properly" do
|
54
|
-
expect(mixed
|
77
|
+
expect(Byk.send(method, mixed)).to eq mixed_latin
|
78
|
+
end
|
79
|
+
|
80
|
+
it "converts edge cases properly" do
|
81
|
+
edge_cases.each do |input, output|
|
82
|
+
expect(Byk.send(method, input)).to eq output
|
83
|
+
end
|
55
84
|
end
|
56
85
|
|
57
86
|
it "converts AZBUKA to ABECEDA" do
|
58
|
-
expect(Byk::AZBUKA.map(
|
87
|
+
expect(Byk::AZBUKA.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA)
|
59
88
|
end
|
60
89
|
|
61
90
|
it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
|
62
|
-
expect(Byk::AZBUKA_CAPS.map(
|
91
|
+
expect(Byk::AZBUKA_CAPS.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA_CAPS)
|
63
92
|
end
|
64
93
|
end
|
65
94
|
|
66
|
-
|
95
|
+
shared_examples :ascii_latinization_method do |method|
|
96
|
+
include_examples :base, method
|
67
97
|
|
68
|
-
# Special care for Њ, Љ, Ђ, Đ
|
69
98
|
let(:edge_cases) {
|
70
|
-
|
71
|
-
"Њ"
|
72
|
-
"Љ"
|
73
|
-
"
|
74
|
-
"
|
75
|
-
"
|
76
|
-
"
|
77
|
-
"
|
78
|
-
"
|
79
|
-
"
|
80
|
-
"
|
81
|
-
"ЂУРАЂ"
|
82
|
-
"ĐURAĐ"
|
83
|
-
|
84
|
-
"ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
|
85
|
-
}
|
99
|
+
[
|
100
|
+
["Њ", "Nj"],
|
101
|
+
["Љ", "Lj"],
|
102
|
+
["Џ", "Dz"],
|
103
|
+
["Ђ", "Dj"],
|
104
|
+
["Đ", "Dj"],
|
105
|
+
["ЊЊ", "NJNJ"],
|
106
|
+
["ЉЉ", "LJLJ"],
|
107
|
+
["ЏЏ", "DZDZ"],
|
108
|
+
["ЂЂ", "DJDJ"],
|
109
|
+
["ĐĐ", "DJDJ"],
|
110
|
+
["ЂУРАЂ Ђорђевић", "DJURADJ Djordjevic"],
|
111
|
+
["ĐURAĐ Đorđević", "DJURADJ Djordjevic"]
|
112
|
+
]
|
86
113
|
}
|
87
114
|
|
88
|
-
it "
|
89
|
-
expect(
|
115
|
+
it "converts Cyrillic to ASCII Latin" do
|
116
|
+
expect(Byk.send(method, pangram)).to eq pangram_ascii_latin
|
90
117
|
end
|
91
118
|
|
92
|
-
it "
|
93
|
-
expect(
|
119
|
+
it "converts Cyrillic caps to ASCII Latin caps" do
|
120
|
+
expect(Byk.send(method, pangram_caps)).to eq pangram_ascii_latin_caps
|
94
121
|
end
|
95
122
|
|
96
|
-
it "
|
97
|
-
expect(
|
123
|
+
it "converts Latin to ASCII Latin" do
|
124
|
+
expect(Byk.send(method, pangram_latin)).to eq pangram_ascii_latin
|
98
125
|
end
|
99
126
|
|
100
|
-
it "converts
|
101
|
-
expect(
|
127
|
+
it "converts Latin caps to ASCII Latin caps" do
|
128
|
+
expect(Byk.send(method, pangram_latin_caps)).to eq pangram_ascii_latin_caps
|
102
129
|
end
|
103
130
|
|
104
|
-
it "converts
|
105
|
-
expect(
|
131
|
+
it "converts mixed text properly" do
|
132
|
+
expect(Byk.send(method, mixed)).to eq mixed_ascii_latin
|
106
133
|
end
|
107
134
|
|
108
|
-
it "converts
|
109
|
-
|
135
|
+
it "converts edge cases properly" do
|
136
|
+
edge_cases.each do |input, output|
|
137
|
+
expect(Byk.send(method, input)).to eq output
|
138
|
+
end
|
110
139
|
end
|
140
|
+
end
|
111
141
|
|
112
|
-
|
113
|
-
|
142
|
+
shared_examples :non_destructive_method do |method|
|
143
|
+
it "doesn't modify the arg" do
|
144
|
+
str = "Ж"
|
145
|
+
expect { Byk.send(method, str) }.to_not change { str }
|
114
146
|
end
|
147
|
+
end
|
115
148
|
|
116
|
-
|
117
|
-
|
149
|
+
shared_examples :destructive_method do |method|
|
150
|
+
it "modifies the arg" do
|
151
|
+
str = "Ж"
|
152
|
+
expect { Byk.send(method, str) }.to change { str }
|
118
153
|
end
|
154
|
+
end
|
119
155
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
156
|
+
describe ".to_latin" do
|
157
|
+
it_behaves_like :latinization_method, :to_latin
|
158
|
+
it_behaves_like :non_destructive_method, :to_latin
|
159
|
+
end
|
160
|
+
|
161
|
+
describe ".to_latin!" do
|
162
|
+
it_behaves_like :latinization_method, :to_latin!
|
163
|
+
it_behaves_like :destructive_method, :to_latin!
|
164
|
+
end
|
165
|
+
|
166
|
+
describe ".to_ascii_latin" do
|
167
|
+
it_behaves_like :ascii_latinization_method, :to_ascii_latin
|
168
|
+
it_behaves_like :non_destructive_method, :to_ascii_latin
|
169
|
+
end
|
170
|
+
|
171
|
+
describe ".to_ascii_latin!" do
|
172
|
+
it_behaves_like :ascii_latinization_method, :to_ascii_latin!
|
173
|
+
it_behaves_like :destructive_method, :to_ascii_latin!
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
describe String do
|
178
|
+
it "responds to Byk methods" do
|
179
|
+
Byk.instance_methods.each do |method|
|
180
|
+
expect("").to respond_to(method)
|
124
181
|
end
|
125
182
|
end
|
126
183
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: byk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikola Topalović
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake-compiler
|
@@ -52,6 +52,8 @@ files:
|
|
52
52
|
- ext/byk/byk.c
|
53
53
|
- ext/byk/extconf.rb
|
54
54
|
- lib/byk.rb
|
55
|
+
- lib/byk/core_ext/string.rb
|
56
|
+
- lib/byk/safe.rb
|
55
57
|
- lib/byk/version.rb
|
56
58
|
- spec/byk_spec.rb
|
57
59
|
homepage: https://github.com/topalovic/byk
|
@@ -74,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
74
76
|
version: '0'
|
75
77
|
requirements: []
|
76
78
|
rubyforge_project:
|
77
|
-
rubygems_version: 2.
|
79
|
+
rubygems_version: 2.4.5
|
78
80
|
signing_key:
|
79
81
|
specification_version: 4
|
80
82
|
summary: Fast transliteration of Serbian Cyrillic into Latin.
|