byk 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +31 -6
- data/ext/byk/byk.c +153 -163
- data/lib/byk.rb +2 -14
- data/lib/byk/core_ext/string.rb +8 -0
- data/lib/byk/safe.rb +14 -0
- data/lib/byk/version.rb +1 -1
- data/spec/byk_spec.rb +126 -69
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 34203e0b4291cde495d17da65522df586de7e712
|
4
|
+
data.tar.gz: 290d743dab23c58241520252bd81d4ae4115ce98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f11d00e9ac1057a5596804e03c6c4a6c41841bedc21030a9ed776cfbaaabba85a341a62de71c990c8deadc8f8384bf263b41d477b33b299af26b55acef47fe0c
|
7
|
+
data.tar.gz: 335ddfeca9f6793f2887c1cc93cfc916e011f7dd01fd97073162871148d0fe61395bdb1115c5ed4f7583ff207f6b6d27462c8920a7a70b016b9427420796bc28
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -61,13 +61,40 @@ text # => "Zvazbuka"
|
|
61
61
|
```
|
62
62
|
|
63
63
|
Note that these methods take into account the
|
64
|
-
[
|
64
|
+
[digraph capitalization rules](http://sr.wikipedia.org/wiki/Гајица#.D0.94.D0.B8.D0.B3.D1.80.D0.B0.D1.84.D0.B8):
|
65
65
|
|
66
66
|
```ruby
|
67
67
|
"ЉИЉА Љиљановић".to_latin # => "LJILJA Ljiljanović"
|
68
68
|
"ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
|
69
69
|
```
|
70
70
|
|
71
|
+
If you prefer not to monkey patch your strings, you can use the "safe"
|
72
|
+
require:
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
require "byk/safe"
|
76
|
+
```
|
77
|
+
|
78
|
+
and then call the module methods:
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
text = "Вук"
|
82
|
+
Byk.to_latin(text) # => "Vuk"
|
83
|
+
text # => "Вук"
|
84
|
+
Byk.to_latin!(text) # => "Vuk"
|
85
|
+
text # => "Vuk"
|
86
|
+
```
|
87
|
+
|
88
|
+
|
89
|
+
## Testing
|
90
|
+
|
91
|
+
To test the gem, clone the repo and run:
|
92
|
+
|
93
|
+
```
|
94
|
+
$ bundle
|
95
|
+
$ bundle exec rake
|
96
|
+
```
|
97
|
+
|
71
98
|
|
72
99
|
## How fast is fast?
|
73
100
|
|
@@ -84,7 +111,7 @@ projects, e.g. sites supporting dual script content. Remember,
|
|
84
111
|
|
85
112
|
I found transliteration to be a straightforward little problem that
|
86
113
|
lends itself well to optimization. It also gave me an excuse to play
|
87
|
-
with Ruby extensions, so there :
|
114
|
+
with Ruby extensions, so there :smirk_cat:
|
88
115
|
|
89
116
|
|
90
117
|
## Compatibility
|
@@ -92,10 +119,8 @@ with Ruby extensions, so there :smile_cat:
|
|
92
119
|
Byk is supported under MRI Ruby >= 1.9.2.
|
93
120
|
|
94
121
|
I don't plan to support 1.8.7 or older due to substantial C API
|
95
|
-
changes between 1.8 and 1.9.
|
96
|
-
|
97
|
-
It doesn't build under Rubinius currently, but I intend to support it
|
98
|
-
in future releases.
|
122
|
+
changes between 1.8 and 1.9. It doesn't build under Rubinius
|
123
|
+
currently, but I intend to support it in future releases.
|
99
124
|
|
100
125
|
|
101
126
|
## License
|
data/ext/byk/byk.c
CHANGED
@@ -5,91 +5,67 @@
|
|
5
5
|
|
6
6
|
#define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc) \
|
7
7
|
ascii ? rb_str_buf_cat(dest, chr, len) \
|
8
|
-
|
8
|
+
: str_cat_char(dest, ascii_chr, enc)
|
9
9
|
|
10
10
|
enum {
|
11
|
-
LAT_CAP_TJ=
|
11
|
+
LAT_CAP_TJ = 0x106,
|
12
12
|
LAT_TJ,
|
13
|
-
LAT_CAP_CH=
|
13
|
+
LAT_CAP_CH = 0x10c,
|
14
14
|
LAT_CH,
|
15
|
-
LAT_CAP_DJ=
|
15
|
+
LAT_CAP_DJ = 0x110,
|
16
16
|
LAT_DJ,
|
17
|
-
LAT_CAP_SH=
|
17
|
+
LAT_CAP_SH = 0x160,
|
18
18
|
LAT_SH,
|
19
|
-
LAT_CAP_ZH=
|
19
|
+
LAT_CAP_ZH = 0x17d,
|
20
20
|
LAT_ZH,
|
21
|
-
CYR_CAP_DJ=
|
22
|
-
CYR_CAP_J=
|
21
|
+
CYR_CAP_DJ = 0x402,
|
22
|
+
CYR_CAP_J = 0x408,
|
23
23
|
CYR_CAP_LJ,
|
24
24
|
CYR_CAP_NJ,
|
25
25
|
CYR_CAP_TJ,
|
26
|
-
CYR_CAP_DZ=
|
26
|
+
CYR_CAP_DZ = 0x40f,
|
27
27
|
CYR_CAP_A,
|
28
|
-
|
29
|
-
|
30
|
-
CYR_CAP_G,
|
31
|
-
CYR_CAP_D,
|
32
|
-
CYR_CAP_E,
|
33
|
-
CYR_CAP_ZH,
|
34
|
-
CYR_CAP_Z,
|
35
|
-
CYR_CAP_I,
|
36
|
-
CYR_CAP_K=1050,
|
37
|
-
CYR_CAP_L,
|
38
|
-
CYR_CAP_M,
|
39
|
-
CYR_CAP_N,
|
40
|
-
CYR_CAP_O,
|
41
|
-
CYR_CAP_P,
|
42
|
-
CYR_CAP_R,
|
43
|
-
CYR_CAP_S,
|
44
|
-
CYR_CAP_T,
|
45
|
-
CYR_CAP_U,
|
46
|
-
CYR_CAP_F,
|
47
|
-
CYR_CAP_H,
|
48
|
-
CYR_CAP_C,
|
28
|
+
CYR_CAP_ZH = 0x416,
|
29
|
+
CYR_CAP_C = 0x426,
|
49
30
|
CYR_CAP_CH,
|
50
31
|
CYR_CAP_SH,
|
51
|
-
CYR_A=
|
52
|
-
|
53
|
-
|
54
|
-
CYR_G,
|
55
|
-
CYR_D,
|
56
|
-
CYR_E,
|
57
|
-
CYR_ZH,
|
58
|
-
CYR_Z,
|
59
|
-
CYR_I,
|
60
|
-
CYR_K=1082,
|
61
|
-
CYR_L,
|
62
|
-
CYR_M,
|
63
|
-
CYR_N,
|
64
|
-
CYR_O,
|
65
|
-
CYR_P,
|
66
|
-
CYR_R,
|
67
|
-
CYR_S,
|
68
|
-
CYR_T,
|
69
|
-
CYR_U,
|
70
|
-
CYR_F,
|
71
|
-
CYR_H,
|
72
|
-
CYR_C,
|
32
|
+
CYR_A = 0x430,
|
33
|
+
CYR_ZH = 0x436,
|
34
|
+
CYR_C = 0x446,
|
73
35
|
CYR_CH,
|
74
36
|
CYR_SH,
|
75
|
-
CYR_DJ=
|
76
|
-
CYR_J=
|
37
|
+
CYR_DJ = 0x452,
|
38
|
+
CYR_J = 0x458,
|
77
39
|
CYR_LJ,
|
78
40
|
CYR_NJ,
|
79
41
|
CYR_TJ,
|
80
|
-
CYR_DZ=
|
42
|
+
CYR_DZ = 0x45f
|
81
43
|
};
|
82
44
|
|
83
45
|
static inline unsigned int
|
84
|
-
|
46
|
+
is_cyrillic(unsigned int c)
|
85
47
|
{
|
86
|
-
return
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
48
|
+
return c >= CYR_CAP_DJ && c <= CYR_DZ;
|
49
|
+
}
|
50
|
+
|
51
|
+
static inline unsigned int
|
52
|
+
is_upper(unsigned int c)
|
53
|
+
{
|
54
|
+
return (c >= 65 && c <= 90)
|
55
|
+
|| (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
|
56
|
+
|| c == LAT_CAP_TJ
|
57
|
+
|| c == LAT_CAP_CH
|
58
|
+
|| c == LAT_CAP_DJ
|
59
|
+
|| c == LAT_CAP_SH
|
60
|
+
|| c == LAT_CAP_ZH;
|
61
|
+
}
|
62
|
+
|
63
|
+
static inline unsigned int
|
64
|
+
maps_directly(unsigned int c)
|
65
|
+
{
|
66
|
+
return c != CYR_ZH
|
67
|
+
&& c != CYR_CAP_ZH
|
68
|
+
&& ((c >= CYR_A && c <= CYR_C) || (c >= CYR_CAP_A && c <= CYR_CAP_C));
|
93
69
|
}
|
94
70
|
|
95
71
|
static void
|
@@ -109,12 +85,24 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
109
85
|
int len, next_len;
|
110
86
|
int seen_upper = 0;
|
111
87
|
int force_upper = 0;
|
112
|
-
char *pos =
|
113
|
-
char
|
88
|
+
char *pos, *end, *seq_start = 0;
|
89
|
+
char cyr;
|
114
90
|
unsigned int codepoint = 0;
|
115
91
|
unsigned int next_codepoint = 0;
|
116
92
|
rb_encoding *enc;
|
117
93
|
|
94
|
+
char CYR_MAP[] = {
|
95
|
+
'a', 'b', 'v', 'g', 'd', 'e', '\0', 'z', 'i', '\0', 'k',
|
96
|
+
'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
|
97
|
+
};
|
98
|
+
|
99
|
+
char CYR_CAPS_MAP[] = {
|
100
|
+
'A', 'B', 'V', 'G', 'D', 'E', '\0', 'Z', 'I', '\0', 'K',
|
101
|
+
'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
|
102
|
+
};
|
103
|
+
|
104
|
+
StringValue(str);
|
105
|
+
pos = RSTRING_PTR(str);
|
118
106
|
if (!pos || RSTRING_LEN(str) == 0) return str;
|
119
107
|
|
120
108
|
end = RSTRING_END(str);
|
@@ -133,10 +121,10 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
133
121
|
/* Latin -> "ASCII Latin" conversion */
|
134
122
|
if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
|
135
123
|
if (seq_start) {
|
136
|
-
/* flush the sequence */
|
137
124
|
rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
138
125
|
seq_start = 0;
|
139
126
|
}
|
127
|
+
|
140
128
|
switch (codepoint) {
|
141
129
|
case LAT_TJ:
|
142
130
|
case LAT_CH: rb_str_buf_cat(dest, "c", 1); break;
|
@@ -148,7 +136,7 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
148
136
|
case LAT_CAP_SH: rb_str_buf_cat(dest, "S", 1); break;
|
149
137
|
case LAT_CAP_ZH: rb_str_buf_cat(dest, "Z", 1); break;
|
150
138
|
case LAT_CAP_DJ:
|
151
|
-
(seen_upper ||
|
139
|
+
(seen_upper || is_upper(next_codepoint))
|
152
140
|
? rb_str_buf_cat(dest, "DJ", 2)
|
153
141
|
: rb_str_buf_cat(dest, "Dj", 2);
|
154
142
|
break;
|
@@ -157,108 +145,73 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
157
145
|
}
|
158
146
|
}
|
159
147
|
|
160
|
-
/*
|
161
|
-
else if (codepoint
|
162
|
-
if (!seq_start)
|
163
|
-
seq_start = pos;
|
164
|
-
}
|
165
|
-
|
166
|
-
/* Cyrillic -> Latin conversion */
|
167
|
-
else {
|
148
|
+
/* Cyrillic coderange */
|
149
|
+
else if (is_cyrillic(codepoint)) {
|
168
150
|
if (seq_start) {
|
169
|
-
/* flush the sequence */
|
170
151
|
rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
171
152
|
seq_start = 0;
|
172
153
|
}
|
173
154
|
|
174
155
|
if (codepoint >= CYR_A) {
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
case CYR_C: rb_str_buf_cat(dest, "c", 1); break;
|
198
|
-
case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
|
199
|
-
case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
|
200
|
-
case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
|
201
|
-
case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
|
202
|
-
case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
|
203
|
-
case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
|
204
|
-
case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
|
205
|
-
case CYR_DZ:
|
206
|
-
rb_str_buf_cat(dest, "d", 1);
|
207
|
-
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
208
|
-
break;
|
209
|
-
default:
|
210
|
-
rb_str_buf_cat(dest, pos, len);
|
156
|
+
if (maps_directly(codepoint)) {
|
157
|
+
cyr = CYR_MAP[codepoint - CYR_A];
|
158
|
+
cyr ? rb_str_buf_cat(dest, &cyr, 1)
|
159
|
+
: rb_str_buf_cat(dest, pos, len);
|
160
|
+
}
|
161
|
+
else {
|
162
|
+
switch (codepoint) {
|
163
|
+
case CYR_J: rb_str_buf_cat(dest, "j", 1); break;
|
164
|
+
case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
|
165
|
+
case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
|
166
|
+
case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
|
167
|
+
case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
|
168
|
+
case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
|
169
|
+
case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
|
170
|
+
case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
|
171
|
+
case CYR_DZ:
|
172
|
+
rb_str_buf_cat(dest, "d", 1);
|
173
|
+
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
174
|
+
break;
|
175
|
+
default:
|
176
|
+
rb_str_buf_cat(dest, pos, len);
|
177
|
+
}
|
211
178
|
}
|
212
179
|
}
|
213
180
|
else {
|
214
|
-
|
181
|
+
if (maps_directly(codepoint)) {
|
182
|
+
cyr = CYR_CAPS_MAP[codepoint - CYR_CAP_A];
|
183
|
+
cyr ? rb_str_buf_cat(dest, &cyr, 1)
|
184
|
+
: rb_str_buf_cat(dest, pos, len);
|
185
|
+
}
|
186
|
+
else {
|
187
|
+
force_upper = seen_upper || is_upper(next_codepoint);
|
215
188
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
case CYR_CAP_R: rb_str_buf_cat(dest, "R", 1); break;
|
233
|
-
case CYR_CAP_S: rb_str_buf_cat(dest, "S", 1); break;
|
234
|
-
case CYR_CAP_T: rb_str_buf_cat(dest, "T", 1); break;
|
235
|
-
case CYR_CAP_U: rb_str_buf_cat(dest, "U", 1); break;
|
236
|
-
case CYR_CAP_F: rb_str_buf_cat(dest, "F", 1); break;
|
237
|
-
case CYR_CAP_H: rb_str_buf_cat(dest, "H", 1); break;
|
238
|
-
case CYR_CAP_C: rb_str_buf_cat(dest, "C", 1); break;
|
239
|
-
case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
|
240
|
-
case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
|
241
|
-
case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
|
242
|
-
case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
|
243
|
-
case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
|
244
|
-
case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
|
245
|
-
case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
|
246
|
-
case CYR_CAP_DZ:
|
247
|
-
rb_str_buf_cat(dest, "D", 1);
|
248
|
-
if (force_upper) {
|
249
|
-
STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
|
250
|
-
}
|
251
|
-
else {
|
252
|
-
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
189
|
+
switch (codepoint) {
|
190
|
+
case CYR_CAP_J: rb_str_buf_cat(dest, "J", 1); break;
|
191
|
+
case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
|
192
|
+
case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
|
193
|
+
case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
|
194
|
+
case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
|
195
|
+
case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
|
196
|
+
case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
|
197
|
+
case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
|
198
|
+
case CYR_CAP_DZ:
|
199
|
+
rb_str_buf_cat(dest, "D", 1);
|
200
|
+
force_upper ? STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc)
|
201
|
+
: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
202
|
+
break;
|
203
|
+
default:
|
204
|
+
rb_str_buf_cat(dest, pos, len);
|
253
205
|
}
|
254
|
-
break;
|
255
|
-
default:
|
256
|
-
rb_str_buf_cat(dest, pos, len);
|
257
206
|
}
|
258
207
|
}
|
259
208
|
}
|
209
|
+
else {
|
210
|
+
/* Mark the start of a copyable sequence */
|
211
|
+
if (!seq_start) seq_start = pos;
|
212
|
+
}
|
260
213
|
|
261
|
-
seen_upper =
|
214
|
+
seen_upper = is_upper(codepoint);
|
262
215
|
|
263
216
|
pos += len;
|
264
217
|
len = next_len;
|
@@ -267,8 +220,8 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
267
220
|
next_codepoint = 0;
|
268
221
|
}
|
269
222
|
|
223
|
+
/* Flush the last sequence, if any */
|
270
224
|
if (seq_start) {
|
271
|
-
/* flush the last sequence */
|
272
225
|
rb_str_buf_cat(dest, seq_start, pos - seq_start);
|
273
226
|
}
|
274
227
|
|
@@ -283,30 +236,67 @@ str_to_latin(VALUE str, int ascii, int bang)
|
|
283
236
|
return str;
|
284
237
|
}
|
285
238
|
|
239
|
+
/**
|
240
|
+
* Returns a copy of <i>str</i> with the Serbian Cyrillic characters
|
241
|
+
* transliterated into Latin.
|
242
|
+
*
|
243
|
+
* @overload to_latin(str)
|
244
|
+
* @param [String] str text to be transliterated
|
245
|
+
* @return [String] transliterated text
|
246
|
+
*/
|
286
247
|
static VALUE
|
287
|
-
rb_str_to_latin(VALUE str)
|
248
|
+
rb_str_to_latin(VALUE self, VALUE str)
|
249
|
+
{
|
288
250
|
return str_to_latin(str, 0, 0);
|
289
251
|
}
|
290
252
|
|
253
|
+
/**
|
254
|
+
* Performs the transliteration of <code>Byk.to_latin</code> in place,
|
255
|
+
* returning <i>str</i>, whether changes were made or not.
|
256
|
+
*
|
257
|
+
* @overload to_latin!(str)
|
258
|
+
* @param [String] str text to be transliterated
|
259
|
+
* @return [String] transliterated text
|
260
|
+
*/
|
291
261
|
static VALUE
|
292
|
-
rb_str_to_latin_bang(VALUE str)
|
262
|
+
rb_str_to_latin_bang(VALUE self, VALUE str)
|
263
|
+
{
|
293
264
|
return str_to_latin(str, 0, 1);
|
294
265
|
}
|
295
266
|
|
267
|
+
/**
|
268
|
+
* Returns a copy of <i>str</i> with the Serbian Cyrillic
|
269
|
+
* characters transliterated into ASCII Latin.
|
270
|
+
*
|
271
|
+
* @overload to_ascii_latin(str)
|
272
|
+
* @param [String] str text to be transliterated
|
273
|
+
* @return [String] transliterated text
|
274
|
+
*/
|
296
275
|
static VALUE
|
297
|
-
rb_str_to_ascii_latin(VALUE str)
|
276
|
+
rb_str_to_ascii_latin(VALUE self, VALUE str)
|
277
|
+
{
|
298
278
|
return str_to_latin(str, 1, 0);
|
299
279
|
}
|
300
280
|
|
281
|
+
/**
|
282
|
+
* Performs the transliteration of <code>Byk.to_ascii_latin</code> in
|
283
|
+
* place, returning <i>str</i>, whether changes were made or not.
|
284
|
+
*
|
285
|
+
* @overload to_ascii_latin!(str)
|
286
|
+
* @param [String] str text to be transliterated
|
287
|
+
* @return [String] transliterated text
|
288
|
+
*/
|
301
289
|
static VALUE
|
302
|
-
rb_str_to_ascii_latin_bang(VALUE str)
|
290
|
+
rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
|
291
|
+
{
|
303
292
|
return str_to_latin(str, 1, 1);
|
304
293
|
}
|
305
294
|
|
306
295
|
void Init_byk_native(void)
|
307
296
|
{
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
297
|
+
VALUE Byk = rb_define_module("Byk");
|
298
|
+
rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
|
299
|
+
rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
|
300
|
+
rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
|
301
|
+
rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
|
312
302
|
}
|
data/lib/byk.rb
CHANGED
@@ -1,14 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require "byk_native"
|
4
|
-
require "byk/version"
|
5
|
-
|
6
|
-
module Byk
|
7
|
-
|
8
|
-
AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
|
9
|
-
AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
|
10
|
-
|
11
|
-
ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
|
12
|
-
ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
|
13
|
-
|
14
|
-
end
|
1
|
+
require "byk/safe"
|
2
|
+
require "byk/core_ext/string"
|
data/lib/byk/safe.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "byk_native"
|
4
|
+
require "byk/version"
|
5
|
+
|
6
|
+
module Byk
|
7
|
+
|
8
|
+
AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
|
9
|
+
AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
|
10
|
+
|
11
|
+
ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
|
12
|
+
ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
|
13
|
+
|
14
|
+
end
|
data/lib/byk/version.rb
CHANGED
data/spec/byk_spec.rb
CHANGED
@@ -4,123 +4,180 @@ require "spec_helper"
|
|
4
4
|
|
5
5
|
describe Byk do
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
|
7
|
+
it "has a version number" do
|
8
|
+
expect(Byk::VERSION).not_to be nil
|
9
|
+
end
|
11
10
|
|
12
|
-
|
13
|
-
let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
|
14
|
-
let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
|
11
|
+
shared_examples :base do |method|
|
15
12
|
|
16
|
-
|
17
|
-
|
13
|
+
# See http://sr.wikipedia.org/wiki/Панграм
|
14
|
+
let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
|
15
|
+
let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
|
16
|
+
let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
|
19
|
+
let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
|
20
|
+
let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
|
22
21
|
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
let(:full_cyrillic_coderange) { (0x400..0x4ff).map { |i| i.chr(Encoding::UTF_8) } }
|
23
|
+
let(:non_serbian_cyrillic_coderange) { full_cyrillic_coderange - Byk::AZBUKA - Byk::AZBUKA_CAPS }
|
24
|
+
let(:non_serbian_cyrillic) { non_serbian_cyrillic_coderange.join }
|
25
|
+
|
26
|
+
let(:ascii) { "The quick brown fox jumps over the lazy dog." }
|
27
|
+
let(:other) { "संस्कृतम् saṃskṛtam" }
|
26
28
|
|
27
|
-
|
29
|
+
let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
|
30
|
+
let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
|
31
|
+
let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
|
28
32
|
|
29
|
-
it "doesn't
|
30
|
-
expect(""
|
33
|
+
it "doesn't convert an empty string" do
|
34
|
+
expect(Byk.send(method, "")).to eq ""
|
31
35
|
end
|
32
36
|
|
33
|
-
it "doesn't
|
34
|
-
expect(ascii
|
37
|
+
it "doesn't convert ASCII text" do
|
38
|
+
expect(Byk.send(method, ascii)).to eq ascii
|
35
39
|
end
|
36
40
|
|
37
|
-
it "doesn't
|
38
|
-
expect(
|
41
|
+
it "doesn't convert non-Serbian Cyrillic" do
|
42
|
+
expect(Byk.send(method, non_serbian_cyrillic)).to eq non_serbian_cyrillic
|
39
43
|
end
|
40
44
|
|
41
|
-
it "doesn't
|
42
|
-
expect(other
|
45
|
+
it "doesn't convert other coderanges" do
|
46
|
+
expect(Byk.send(method, other)).to eq other
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
shared_examples :latinization_method do |method|
|
51
|
+
include_examples :base, method
|
52
|
+
|
53
|
+
let(:edge_cases) {
|
54
|
+
[
|
55
|
+
["Њ", "Nj"],
|
56
|
+
["Љ", "Lj"],
|
57
|
+
["Џ", "Dž"],
|
58
|
+
["ЊЊ", "NJNJ"],
|
59
|
+
["ЉЉ", "LJLJ"],
|
60
|
+
["ЏЏ", "DŽDŽ"]
|
61
|
+
]
|
62
|
+
}
|
63
|
+
|
64
|
+
it "doesn't convert Latin" do
|
65
|
+
expect(Byk.send(method, pangram_latin)).to eq pangram_latin
|
43
66
|
end
|
44
67
|
|
45
|
-
it "converts
|
46
|
-
expect(pangram
|
68
|
+
it "converts Cyrillic to Latin" do
|
69
|
+
expect(Byk.send(method, pangram)).to eq pangram_latin
|
47
70
|
end
|
48
71
|
|
49
|
-
it "converts
|
50
|
-
expect(pangram_caps
|
72
|
+
it "converts Cyrillic caps to Latin caps" do
|
73
|
+
expect(Byk.send(method, pangram_caps)).to eq pangram_latin_caps
|
51
74
|
end
|
52
75
|
|
53
76
|
it "converts mixed text properly" do
|
54
|
-
expect(mixed
|
77
|
+
expect(Byk.send(method, mixed)).to eq mixed_latin
|
78
|
+
end
|
79
|
+
|
80
|
+
it "converts edge cases properly" do
|
81
|
+
edge_cases.each do |input, output|
|
82
|
+
expect(Byk.send(method, input)).to eq output
|
83
|
+
end
|
55
84
|
end
|
56
85
|
|
57
86
|
it "converts AZBUKA to ABECEDA" do
|
58
|
-
expect(Byk::AZBUKA.map(
|
87
|
+
expect(Byk::AZBUKA.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA)
|
59
88
|
end
|
60
89
|
|
61
90
|
it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
|
62
|
-
expect(Byk::AZBUKA_CAPS.map(
|
91
|
+
expect(Byk::AZBUKA_CAPS.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA_CAPS)
|
63
92
|
end
|
64
93
|
end
|
65
94
|
|
66
|
-
|
95
|
+
shared_examples :ascii_latinization_method do |method|
|
96
|
+
include_examples :base, method
|
67
97
|
|
68
|
-
# Special care for Њ, Љ, Ђ, Đ
|
69
98
|
let(:edge_cases) {
|
70
|
-
|
71
|
-
"Њ"
|
72
|
-
"Љ"
|
73
|
-
"
|
74
|
-
"
|
75
|
-
"
|
76
|
-
"
|
77
|
-
"
|
78
|
-
"
|
79
|
-
"
|
80
|
-
"
|
81
|
-
"ЂУРАЂ"
|
82
|
-
"ĐURAĐ"
|
83
|
-
|
84
|
-
"ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
|
85
|
-
}
|
99
|
+
[
|
100
|
+
["Њ", "Nj"],
|
101
|
+
["Љ", "Lj"],
|
102
|
+
["Џ", "Dz"],
|
103
|
+
["Ђ", "Dj"],
|
104
|
+
["Đ", "Dj"],
|
105
|
+
["ЊЊ", "NJNJ"],
|
106
|
+
["ЉЉ", "LJLJ"],
|
107
|
+
["ЏЏ", "DZDZ"],
|
108
|
+
["ЂЂ", "DJDJ"],
|
109
|
+
["ĐĐ", "DJDJ"],
|
110
|
+
["ЂУРАЂ Ђорђевић", "DJURADJ Djordjevic"],
|
111
|
+
["ĐURAĐ Đorđević", "DJURADJ Djordjevic"]
|
112
|
+
]
|
86
113
|
}
|
87
114
|
|
88
|
-
it "
|
89
|
-
expect(
|
115
|
+
it "converts Cyrillic to ASCII Latin" do
|
116
|
+
expect(Byk.send(method, pangram)).to eq pangram_ascii_latin
|
90
117
|
end
|
91
118
|
|
92
|
-
it "
|
93
|
-
expect(
|
119
|
+
it "converts Cyrillic caps to ASCII Latin caps" do
|
120
|
+
expect(Byk.send(method, pangram_caps)).to eq pangram_ascii_latin_caps
|
94
121
|
end
|
95
122
|
|
96
|
-
it "
|
97
|
-
expect(
|
123
|
+
it "converts Latin to ASCII Latin" do
|
124
|
+
expect(Byk.send(method, pangram_latin)).to eq pangram_ascii_latin
|
98
125
|
end
|
99
126
|
|
100
|
-
it "converts
|
101
|
-
expect(
|
127
|
+
it "converts Latin caps to ASCII Latin caps" do
|
128
|
+
expect(Byk.send(method, pangram_latin_caps)).to eq pangram_ascii_latin_caps
|
102
129
|
end
|
103
130
|
|
104
|
-
it "converts
|
105
|
-
expect(
|
131
|
+
it "converts mixed text properly" do
|
132
|
+
expect(Byk.send(method, mixed)).to eq mixed_ascii_latin
|
106
133
|
end
|
107
134
|
|
108
|
-
it "converts
|
109
|
-
|
135
|
+
it "converts edge cases properly" do
|
136
|
+
edge_cases.each do |input, output|
|
137
|
+
expect(Byk.send(method, input)).to eq output
|
138
|
+
end
|
110
139
|
end
|
140
|
+
end
|
111
141
|
|
112
|
-
|
113
|
-
|
142
|
+
shared_examples :non_destructive_method do |method|
|
143
|
+
it "doesn't modify the arg" do
|
144
|
+
str = "Ж"
|
145
|
+
expect { Byk.send(method, str) }.to_not change { str }
|
114
146
|
end
|
147
|
+
end
|
115
148
|
|
116
|
-
|
117
|
-
|
149
|
+
shared_examples :destructive_method do |method|
|
150
|
+
it "modifies the arg" do
|
151
|
+
str = "Ж"
|
152
|
+
expect { Byk.send(method, str) }.to change { str }
|
118
153
|
end
|
154
|
+
end
|
119
155
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
156
|
+
describe ".to_latin" do
|
157
|
+
it_behaves_like :latinization_method, :to_latin
|
158
|
+
it_behaves_like :non_destructive_method, :to_latin
|
159
|
+
end
|
160
|
+
|
161
|
+
describe ".to_latin!" do
|
162
|
+
it_behaves_like :latinization_method, :to_latin!
|
163
|
+
it_behaves_like :destructive_method, :to_latin!
|
164
|
+
end
|
165
|
+
|
166
|
+
describe ".to_ascii_latin" do
|
167
|
+
it_behaves_like :ascii_latinization_method, :to_ascii_latin
|
168
|
+
it_behaves_like :non_destructive_method, :to_ascii_latin
|
169
|
+
end
|
170
|
+
|
171
|
+
describe ".to_ascii_latin!" do
|
172
|
+
it_behaves_like :ascii_latinization_method, :to_ascii_latin!
|
173
|
+
it_behaves_like :destructive_method, :to_ascii_latin!
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
describe String do
|
178
|
+
it "responds to Byk methods" do
|
179
|
+
Byk.instance_methods.each do |method|
|
180
|
+
expect("").to respond_to(method)
|
124
181
|
end
|
125
182
|
end
|
126
183
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: byk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikola Topalović
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake-compiler
|
@@ -52,6 +52,8 @@ files:
|
|
52
52
|
- ext/byk/byk.c
|
53
53
|
- ext/byk/extconf.rb
|
54
54
|
- lib/byk.rb
|
55
|
+
- lib/byk/core_ext/string.rb
|
56
|
+
- lib/byk/safe.rb
|
55
57
|
- lib/byk/version.rb
|
56
58
|
- spec/byk_spec.rb
|
57
59
|
homepage: https://github.com/topalovic/byk
|
@@ -74,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
74
76
|
version: '0'
|
75
77
|
requirements: []
|
76
78
|
rubyforge_project:
|
77
|
-
rubygems_version: 2.
|
79
|
+
rubygems_version: 2.4.5
|
78
80
|
signing_key:
|
79
81
|
specification_version: 4
|
80
82
|
summary: Fast transliteration of Serbian Cyrillic into Latin.
|