byk 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +83 -0
- data/ext/byk/byk.c +303 -0
- data/ext/byk/extconf.rb +2 -0
- data/lib/byk.rb +14 -0
- data/lib/byk/version.rb +3 -0
- data/spec/byk_spec.rb +126 -0
- metadata +81 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 18a9ea704a27dfc6a2ca4f7fd9b83e05c1f5a57b
|
4
|
+
data.tar.gz: b6490d7a249f5ceb6378d92d6ee63af2424074ab
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 809f4204b60ec626f15aff3e32b8ca4368e0bcecbda1700b039b3edf086853841a6e94c7ef3e9a47003623b9cde0635d555d1d19ccca28220aa165f7a28c8327
|
7
|
+
data.tar.gz: 9a103df7e2976d6ab2574a372479e18f6fccd3c2e005d49e5c464f82befb6942ca19a80091387d67ee85aa98980b2bfe21bf9efada48ad1fec5e817a60d827f8
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Nikola Topalović
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
Byk
|
2
|
+
===
|
3
|
+
|
4
|
+
Fast transliteration of Serbian Cyrillic into Latin.
|
5
|
+
|
6
|
+
This package was inspired by @dejan's
|
7
|
+
[nice little gem](https://github.com/dejan/srbovanje), but this one
|
8
|
+
comes with a C-optimized twist.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
gem "byk"
|
16
|
+
```
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
```
|
21
|
+
$ bundle
|
22
|
+
```
|
23
|
+
|
24
|
+
Or install it yourself as:
|
25
|
+
```
|
26
|
+
$ gem install byk
|
27
|
+
```
|
28
|
+
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
First, make sure to require the gem in your initializer:
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
require "byk"
|
35
|
+
```
|
36
|
+
|
37
|
+
This will extend `String` with a couple of simple methods:
|
38
|
+
|
39
|
+
```ruby
|
40
|
+
"Шеширџија".to_latin # => "Šeširdžija"
|
41
|
+
"Шеширџија".to_ascii_latin # => "Sesirdzija"
|
42
|
+
"Šeširdžija".to_ascii_latin # => "Sesirdzija"
|
43
|
+
```
|
44
|
+
|
45
|
+
There's also a destructive version of each:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
text = "Жвазбука"
|
49
|
+
text.to_latin! # => "Žvazbuka"
|
50
|
+
text # => "Žvazbuka"
|
51
|
+
text.to_ascii_latin! # => "Zvazbuka"
|
52
|
+
text # => "Zvazbuka"
|
53
|
+
```
|
54
|
+
|
55
|
+
Note that these methods will take into account the
|
56
|
+
[special two-letter rules](http://sr.wikipedia.org/wiki/Gajica#Abeceda):
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
"ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
|
60
|
+
```
|
61
|
+
|
62
|
+
## Notes
|
63
|
+
|
64
|
+
### How fast is fast?
|
65
|
+
|
66
|
+
About [7x](benchmark) faster than the baseline Ruby implementation on
|
67
|
+
my hardware. YMMV of course.
|
68
|
+
|
69
|
+
### Compatibility
|
70
|
+
|
71
|
+
Byk is supported under MRI Ruby 1.9.3, 2.0, 2.1 and 2.2. Earlier
|
72
|
+
versions of MRI are untested.
|
73
|
+
|
74
|
+
### Raison d'être
|
75
|
+
|
76
|
+
For massive transliteration (e.g. sites supporting dual script
|
77
|
+
output), this kind of speed-up might be worthwhile, even with caching.
|
78
|
+
|
79
|
+
Also, it's a well-defined problem with hard-set rules which makes it a
|
80
|
+
natural target for optimization. Plus, it gave me an excuse to play
|
81
|
+
with Ruby extensions, so there :smile_cat:
|
82
|
+
|
83
|
+
Уздравље!
|
data/ext/byk/byk.c
ADDED
@@ -0,0 +1,303 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ruby.h>
|
3
|
+
#include <ruby/encoding.h>
|
4
|
+
|
5
|
+
/* Resolve the encoding index stored on `str` into its rb_encoding handle. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
|
6
|
+
|
7
|
+
/*
 * Append one transliterated letter to `dest`.
 *
 *   force_ascii - when non-zero, append the ASCII spelling `ascii_str`
 *                 (`len` bytes); otherwise append the single code point
 *                 `latin_cp` via str_cat_char().
 *   enc         - encoding passed through to the append call.
 *
 * Written as a do/while(0) statement rather than a conditional expression:
 * rb_enc_str_buf_cat() yields a VALUE while str_cat_char() returns void, and
 * ISO C does not allow a conditional operator with only one void operand.
 * Every argument is parenthesized so compound expressions can be passed.
 * Use it as a statement, terminated with a semicolon.
 */
#define STR_CAT_COND_ASCII(force_ascii, dest, ascii_str, latin_cp, len, enc) \
  do {                                                                       \
    if (force_ascii)                                                         \
      rb_enc_str_buf_cat((dest), (ascii_str), (len), (enc));                 \
    else                                                                     \
      str_cat_char((dest), (latin_cp), (enc));                               \
  } while (0)
|
10
|
+
|
11
|
+
/*
 * Code points of the letters the transliterator recognizes, spelled out
 * explicitly in hexadecimal. LAT_* are the accented Latin letters, CYR_*
 * the Cyrillic ones; a CAP infix marks the capital form.
 */
enum {
  /* Accented Latin letters */
  LAT_CAP_TJ = 0x0106,  /* Ć */
  LAT_TJ     = 0x0107,  /* ć */
  LAT_CAP_CH = 0x010C,  /* Č */
  LAT_CH     = 0x010D,  /* č */
  LAT_CAP_DJ = 0x0110,  /* Đ */
  LAT_DJ     = 0x0111,  /* đ */
  LAT_CAP_SH = 0x0160,  /* Š */
  LAT_SH     = 0x0161,  /* š */
  LAT_CAP_ZH = 0x017D,  /* Ž */
  LAT_ZH     = 0x017E,  /* ž */

  /* Cyrillic capitals */
  CYR_CAP_DJ = 0x0402,  /* Ђ */
  CYR_CAP_J  = 0x0408,  /* Ј */
  CYR_CAP_LJ = 0x0409,  /* Љ */
  CYR_CAP_NJ = 0x040A,  /* Њ */
  CYR_CAP_TJ = 0x040B,  /* Ћ */
  CYR_CAP_DZ = 0x040F,  /* Џ */
  CYR_CAP_A  = 0x0410,  /* А */
  CYR_CAP_B  = 0x0411,  /* Б */
  CYR_CAP_V  = 0x0412,  /* В */
  CYR_CAP_G  = 0x0413,  /* Г */
  CYR_CAP_D  = 0x0414,  /* Д */
  CYR_CAP_E  = 0x0415,  /* Е */
  CYR_CAP_ZH = 0x0416,  /* Ж */
  CYR_CAP_Z  = 0x0417,  /* З */
  CYR_CAP_I  = 0x0418,  /* И */
  CYR_CAP_K  = 0x041A,  /* К */
  CYR_CAP_L  = 0x041B,  /* Л */
  CYR_CAP_M  = 0x041C,  /* М */
  CYR_CAP_N  = 0x041D,  /* Н */
  CYR_CAP_O  = 0x041E,  /* О */
  CYR_CAP_P  = 0x041F,  /* П */
  CYR_CAP_R  = 0x0420,  /* Р */
  CYR_CAP_S  = 0x0421,  /* С */
  CYR_CAP_T  = 0x0422,  /* Т */
  CYR_CAP_U  = 0x0423,  /* У */
  CYR_CAP_F  = 0x0424,  /* Ф */
  CYR_CAP_H  = 0x0425,  /* Х */
  CYR_CAP_C  = 0x0426,  /* Ц */
  CYR_CAP_CH = 0x0427,  /* Ч */
  CYR_CAP_SH = 0x0428,  /* Ш */

  /* Cyrillic small letters */
  CYR_A  = 0x0430,  /* а */
  CYR_B  = 0x0431,  /* б */
  CYR_V  = 0x0432,  /* в */
  CYR_G  = 0x0433,  /* г */
  CYR_D  = 0x0434,  /* д */
  CYR_E  = 0x0435,  /* е */
  CYR_ZH = 0x0436,  /* ж */
  CYR_Z  = 0x0437,  /* з */
  CYR_I  = 0x0438,  /* и */
  CYR_K  = 0x043A,  /* к */
  CYR_L  = 0x043B,  /* л */
  CYR_M  = 0x043C,  /* м */
  CYR_N  = 0x043D,  /* н */
  CYR_O  = 0x043E,  /* о */
  CYR_P  = 0x043F,  /* п */
  CYR_R  = 0x0440,  /* р */
  CYR_S  = 0x0441,  /* с */
  CYR_T  = 0x0442,  /* т */
  CYR_U  = 0x0443,  /* у */
  CYR_F  = 0x0444,  /* ф */
  CYR_H  = 0x0445,  /* х */
  CYR_C  = 0x0446,  /* ц */
  CYR_CH = 0x0447,  /* ч */
  CYR_SH = 0x0448,  /* ш */
  CYR_DJ = 0x0452,  /* ђ */
  CYR_J  = 0x0458,  /* ј */
  CYR_LJ = 0x0459,  /* љ */
  CYR_NJ = 0x045A,  /* њ */
  CYR_TJ = 0x045B,  /* ћ */
  CYR_DZ = 0x045F   /* џ */
};
|
83
|
+
|
84
|
+
static inline unsigned int
|
85
|
+
is_upper_case(unsigned int c)
|
86
|
+
{
|
87
|
+
return ((c >= 65 && c <= 90)
|
88
|
+
|| (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
|
89
|
+
|| c == LAT_CAP_TJ
|
90
|
+
|| c == LAT_CAP_CH
|
91
|
+
|| c == LAT_CAP_DJ
|
92
|
+
|| c == LAT_CAP_SH
|
93
|
+
|| c == LAT_CAP_ZH);
|
94
|
+
}
|
95
|
+
|
96
|
+
|
97
|
+
static void
|
98
|
+
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
|
99
|
+
{
|
100
|
+
char s[16];
|
101
|
+
int n = rb_enc_codelen(c, enc);
|
102
|
+
rb_enc_mbcput(c, s, enc);
|
103
|
+
rb_enc_str_buf_cat(str, s, n, enc);
|
104
|
+
}
|
105
|
+
|
106
|
+
static VALUE
|
107
|
+
str_to_latin(int argc, VALUE *argv, VALUE str, int ascii, int bang)
|
108
|
+
{
|
109
|
+
VALUE dest;
|
110
|
+
long dest_len;
|
111
|
+
char *pos, *end;
|
112
|
+
rb_encoding *enc;
|
113
|
+
unsigned int codepoint = 0;
|
114
|
+
unsigned int prev_codepoint = 0;
|
115
|
+
unsigned int next_codepoint = 0;
|
116
|
+
|
117
|
+
rb_check_arity(argc, 0, 1);
|
118
|
+
|
119
|
+
pos = RSTRING_PTR(str);
|
120
|
+
if (!pos || RSTRING_LEN(str) == 0) return str;
|
121
|
+
|
122
|
+
end = RSTRING_END(str);
|
123
|
+
enc = STR_ENC_GET(str);
|
124
|
+
dest_len = RSTRING_LEN(str) + 30; /* TODO len + margin */
|
125
|
+
dest = rb_str_buf_new(dest_len);
|
126
|
+
rb_enc_associate(dest, enc);
|
127
|
+
|
128
|
+
while (pos < end) {
|
129
|
+
int len, force_upper = 0;
|
130
|
+
|
131
|
+
prev_codepoint = codepoint;
|
132
|
+
codepoint = rb_enc_codepoint_len(pos, end, &len, enc);
|
133
|
+
next_codepoint = 0;
|
134
|
+
|
135
|
+
force_upper = prev_codepoint && is_upper_case(prev_codepoint);
|
136
|
+
|
137
|
+
if (!force_upper && (pos + len < end)) {
|
138
|
+
/* TODO Trim down to one rb_enc_codepoint call per iter. */
|
139
|
+
next_codepoint = rb_enc_codepoint(pos + len, end, enc);
|
140
|
+
force_upper = is_upper_case(next_codepoint);
|
141
|
+
}
|
142
|
+
|
143
|
+
/* Latin -> "ASCII latin" conversion */
|
144
|
+
if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
|
145
|
+
switch (codepoint) {
|
146
|
+
case LAT_TJ:
|
147
|
+
case LAT_CH: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
|
148
|
+
case LAT_DJ: rb_enc_str_buf_cat(dest, "dj", 2, enc); break;
|
149
|
+
case LAT_SH: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
|
150
|
+
case LAT_ZH: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
|
151
|
+
case LAT_CAP_TJ:
|
152
|
+
case LAT_CAP_CH: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
|
153
|
+
case LAT_CAP_SH: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
|
154
|
+
case LAT_CAP_ZH: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
|
155
|
+
|
156
|
+
case LAT_CAP_DJ:
|
157
|
+
force_upper ? rb_enc_str_buf_cat(dest, "DJ", 2, enc)
|
158
|
+
: rb_enc_str_buf_cat(dest, "Dj", 2, enc);
|
159
|
+
break;
|
160
|
+
default:
|
161
|
+
rb_enc_str_buf_cat(dest, pos, len, enc);
|
162
|
+
}
|
163
|
+
pos += len;
|
164
|
+
continue;
|
165
|
+
}
|
166
|
+
|
167
|
+
/* Short-circuit for non-cyrillic codepoints */
|
168
|
+
if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
|
169
|
+
rb_enc_str_buf_cat(dest, pos, len, enc);
|
170
|
+
pos += len;
|
171
|
+
continue;
|
172
|
+
}
|
173
|
+
|
174
|
+
/* Cyrillic -> latin conversion */
|
175
|
+
switch (codepoint) {
|
176
|
+
case CYR_CAP_J: rb_enc_str_buf_cat(dest, "J", 1, enc); break;
|
177
|
+
case CYR_CAP_A: rb_enc_str_buf_cat(dest, "A", 1, enc); break;
|
178
|
+
case CYR_CAP_B: rb_enc_str_buf_cat(dest, "B", 1, enc); break;
|
179
|
+
case CYR_CAP_V: rb_enc_str_buf_cat(dest, "V", 1, enc); break;
|
180
|
+
case CYR_CAP_G: rb_enc_str_buf_cat(dest, "G", 1, enc); break;
|
181
|
+
case CYR_CAP_D: rb_enc_str_buf_cat(dest, "D", 1, enc); break;
|
182
|
+
case CYR_CAP_E: rb_enc_str_buf_cat(dest, "E", 1, enc); break;
|
183
|
+
case CYR_CAP_Z: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
|
184
|
+
case CYR_CAP_I: rb_enc_str_buf_cat(dest, "I", 1, enc); break;
|
185
|
+
case CYR_CAP_K: rb_enc_str_buf_cat(dest, "K", 1, enc); break;
|
186
|
+
case CYR_CAP_L: rb_enc_str_buf_cat(dest, "L", 1, enc); break;
|
187
|
+
case CYR_CAP_M: rb_enc_str_buf_cat(dest, "M", 1, enc); break;
|
188
|
+
case CYR_CAP_N: rb_enc_str_buf_cat(dest, "N", 1, enc); break;
|
189
|
+
case CYR_CAP_O: rb_enc_str_buf_cat(dest, "O", 1, enc); break;
|
190
|
+
case CYR_CAP_P: rb_enc_str_buf_cat(dest, "P", 1, enc); break;
|
191
|
+
case CYR_CAP_R: rb_enc_str_buf_cat(dest, "R", 1, enc); break;
|
192
|
+
case CYR_CAP_S: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
|
193
|
+
case CYR_CAP_T: rb_enc_str_buf_cat(dest, "T", 1, enc); break;
|
194
|
+
case CYR_CAP_U: rb_enc_str_buf_cat(dest, "U", 1, enc); break;
|
195
|
+
case CYR_CAP_F: rb_enc_str_buf_cat(dest, "F", 1, enc); break;
|
196
|
+
case CYR_CAP_H: rb_enc_str_buf_cat(dest, "H", 1, enc); break;
|
197
|
+
case CYR_CAP_C: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
|
198
|
+
case CYR_A: rb_enc_str_buf_cat(dest, "a", 1, enc); break;
|
199
|
+
case CYR_B: rb_enc_str_buf_cat(dest, "b", 1, enc); break;
|
200
|
+
case CYR_V: rb_enc_str_buf_cat(dest, "v", 1, enc); break;
|
201
|
+
case CYR_G: rb_enc_str_buf_cat(dest, "g", 1, enc); break;
|
202
|
+
case CYR_D: rb_enc_str_buf_cat(dest, "d", 1, enc); break;
|
203
|
+
case CYR_E: rb_enc_str_buf_cat(dest, "e", 1, enc); break;
|
204
|
+
case CYR_Z: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
|
205
|
+
case CYR_I: rb_enc_str_buf_cat(dest, "i", 1, enc); break;
|
206
|
+
case CYR_K: rb_enc_str_buf_cat(dest, "k", 1, enc); break;
|
207
|
+
case CYR_L: rb_enc_str_buf_cat(dest, "l", 1, enc); break;
|
208
|
+
case CYR_M: rb_enc_str_buf_cat(dest, "m", 1, enc); break;
|
209
|
+
case CYR_N: rb_enc_str_buf_cat(dest, "n", 1, enc); break;
|
210
|
+
case CYR_O: rb_enc_str_buf_cat(dest, "o", 1, enc); break;
|
211
|
+
case CYR_P: rb_enc_str_buf_cat(dest, "p", 1, enc); break;
|
212
|
+
case CYR_R: rb_enc_str_buf_cat(dest, "r", 1, enc); break;
|
213
|
+
case CYR_S: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
|
214
|
+
case CYR_T: rb_enc_str_buf_cat(dest, "t", 1, enc); break;
|
215
|
+
case CYR_U: rb_enc_str_buf_cat(dest, "u", 1, enc); break;
|
216
|
+
case CYR_F: rb_enc_str_buf_cat(dest, "f", 1, enc); break;
|
217
|
+
case CYR_H: rb_enc_str_buf_cat(dest, "h", 1, enc); break;
|
218
|
+
case CYR_C: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
|
219
|
+
case CYR_J: rb_enc_str_buf_cat(dest, "j", 1, enc); break;
|
220
|
+
case CYR_LJ: rb_enc_str_buf_cat(dest, "lj", 2, enc); break;
|
221
|
+
case CYR_NJ: rb_enc_str_buf_cat(dest, "nj", 2, enc); break;
|
222
|
+
case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
|
223
|
+
case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
|
224
|
+
case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
|
225
|
+
case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
|
226
|
+
case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
|
227
|
+
case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
|
228
|
+
case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
|
229
|
+
case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
|
230
|
+
case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
|
231
|
+
|
232
|
+
/* Several special cases */
|
233
|
+
case CYR_CAP_LJ:
|
234
|
+
rb_enc_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2, enc);
|
235
|
+
break;
|
236
|
+
|
237
|
+
case CYR_CAP_NJ:
|
238
|
+
rb_enc_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2, enc);
|
239
|
+
break;
|
240
|
+
|
241
|
+
case CYR_CAP_DJ:
|
242
|
+
STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc);
|
243
|
+
break;
|
244
|
+
|
245
|
+
case CYR_CAP_DZ:
|
246
|
+
rb_enc_str_buf_cat(dest, "D", 1, enc);
|
247
|
+
if (force_upper) {
|
248
|
+
STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
|
249
|
+
}
|
250
|
+
else {
|
251
|
+
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
252
|
+
}
|
253
|
+
break;
|
254
|
+
|
255
|
+
case CYR_DZ:
|
256
|
+
rb_enc_str_buf_cat(dest, "d", 1, enc);
|
257
|
+
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
258
|
+
break;
|
259
|
+
|
260
|
+
default:
|
261
|
+
rb_enc_str_buf_cat(dest, pos, len, enc);
|
262
|
+
}
|
263
|
+
pos += len;
|
264
|
+
}
|
265
|
+
|
266
|
+
if (bang) {
|
267
|
+
rb_str_shared_replace(str, dest);
|
268
|
+
}
|
269
|
+
else {
|
270
|
+
OBJ_INFECT(dest, str);
|
271
|
+
str = dest;
|
272
|
+
}
|
273
|
+
|
274
|
+
return str;
|
275
|
+
}
|
276
|
+
|
277
|
+
static VALUE
|
278
|
+
rb_str_to_latin(int argc, VALUE *argv, VALUE str) {
|
279
|
+
return str_to_latin(argc, argv, str, 0, 0);
|
280
|
+
}
|
281
|
+
|
282
|
+
static VALUE
|
283
|
+
rb_str_to_latin_bang(int argc, VALUE *argv, VALUE str) {
|
284
|
+
return str_to_latin(argc, argv, str, 0, 1);
|
285
|
+
}
|
286
|
+
|
287
|
+
static VALUE
|
288
|
+
rb_str_to_ascii_latin(int argc, VALUE *argv, VALUE str) {
|
289
|
+
return str_to_latin(argc, argv, str, 1, 0);
|
290
|
+
}
|
291
|
+
|
292
|
+
static VALUE
|
293
|
+
rb_str_to_ascii_latin_bang(int argc, VALUE *argv, VALUE str) {
|
294
|
+
return str_to_latin(argc, argv, str, 1, 1);
|
295
|
+
}
|
296
|
+
|
297
|
+
void Init_byk_native(void)
|
298
|
+
{
|
299
|
+
rb_define_method(rb_cString, "to_latin", rb_str_to_latin, -1);
|
300
|
+
rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, -1);
|
301
|
+
rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, -1);
|
302
|
+
rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, -1);
|
303
|
+
}
|
data/ext/byk/extconf.rb
ADDED
data/lib/byk.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "byk_native"
|
4
|
+
require "byk/version"
|
5
|
+
|
6
|
+
module Byk

  # Cyrillic alphabet, small and capital letters, in alphabetical order.
  AZBUKA      = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
  AZBUKA_CAPS = %w[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]

  # Latin alphabet, small and capital letters, in alphabetical order.
  ABECEDA      = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
  ABECEDA_CAPS = %w[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]

end
|
data/lib/byk/version.rb
ADDED
data/spec/byk_spec.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe Byk do

  # Pangram fixtures in each script, lower and upper case.
  # See http://sr.wikipedia.org/wiki/Панграм
  let(:pangram)             { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
  let(:pangram_latin)       { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
  let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }

  let(:pangram_caps)             { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
  let(:pangram_latin_caps)       { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
  let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }

  # Text that must pass through untouched.
  let(:ascii) { "The quick brown fox jumps over the lazy dog." }
  let(:other) { "संस्कृतम् saṃskṛtam" }

  # Other scripts mixed with Cyrillic and Latin.
  let(:mixed)             { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
  let(:mixed_latin)       { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
  let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }

  it "has a version number" do
    expect(Byk::VERSION).not_to be(nil)
  end

  describe "#to_latin" do

    it "doesn't modify an empty string" do
      expect("".to_latin).to eq("")
    end

    it "doesn't modify ASCII text" do
      expect(ascii.to_latin).to eq(ascii)
    end

    it "doesn't modify latin" do
      expect(pangram_latin.to_latin).to eq(pangram_latin)
    end

    it "doesn't modify other scripts" do
      expect(other.to_latin).to eq(other)
    end

    it "converts cyrillic to latin" do
      expect(pangram.to_latin).to eq(pangram_latin)
    end

    it "converts cyrillic caps to latin caps" do
      expect(pangram_caps.to_latin).to eq(pangram_latin_caps)
    end

    it "converts mixed text properly" do
      expect(mixed.to_latin).to eq(mixed_latin)
    end

    # The two alphabets are ordered differently, so compare as sets.
    it "converts AZBUKA to ABECEDA" do
      converted = Byk::AZBUKA.map { |letter| letter.to_latin }
      expect(converted).to match_array(Byk::ABECEDA)
    end

    it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
      converted = Byk::AZBUKA_CAPS.map { |letter| letter.to_latin }
      expect(converted).to match_array(Byk::ABECEDA_CAPS)
    end
  end

  describe "#to_ascii_latin" do

    # Special care for Њ, Љ, Ђ, Đ
    let(:edge_cases) do
      {
        "Њ"  => "Nj",
        "Љ"  => "Lj",
        "Ђ"  => "Dj",
        "Đ"  => "Dj",
        "ЊЊ" => "NJNJ",
        "ЉЉ" => "LJLJ",
        "ЂЂ" => "DJDJ",
        "ĐĐ" => "DJDJ",
        "ГУЊ"   => "GUNJ",
        "ПАСУЉ" => "PASULJ",
        "ЂУРАЂ" => "DJURADJ",
        "ĐURAĐ" => "DJURADJ",
        "ĐURAĐ Đorđević"  => "DJURADJ Djordjevic",
        "ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
      }
    end

    it "doesn't modify an empty string" do
      expect("".to_ascii_latin).to eq("")
    end

    it "doesn't modify ASCII text" do
      expect(ascii.to_ascii_latin).to eq(ascii)
    end

    it "doesn't modify other scripts" do
      expect(other.to_ascii_latin).to eq(other)
    end

    it "converts cyrillic to ASCII latin" do
      expect(pangram.to_ascii_latin).to eq(pangram_ascii_latin)
    end

    it "converts cyrillic caps to ASCII latin caps" do
      expect(pangram_caps.to_ascii_latin).to eq(pangram_ascii_latin_caps)
    end

    it "converts latin to ASCII latin" do
      expect(pangram_latin.to_ascii_latin).to eq(pangram_ascii_latin)
    end

    it "converts latin caps to ASCII latin caps" do
      expect(pangram_latin_caps.to_ascii_latin).to eq(pangram_ascii_latin_caps)
    end

    it "converts mixed text properly" do
      expect(mixed.to_ascii_latin).to eq(mixed_ascii_latin)
    end

    it "converts edge cases properly" do
      edge_cases.each_pair do |input, output|
        expect(input.to_ascii_latin).to eq(output)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: byk
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nikola Topalović
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake-compiler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.9'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.9'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.2'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.2'
|
41
|
+
description: Provides C-optimized methods for transliteration of Serbian Cyrillic
|
42
|
+
into Latin.
|
43
|
+
email: nikola.topalovic@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions:
|
46
|
+
- ext/byk/extconf.rb
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- LICENSE
|
50
|
+
- README.md
|
51
|
+
- ext/byk/byk.c
|
52
|
+
- ext/byk/extconf.rb
|
53
|
+
- lib/byk.rb
|
54
|
+
- lib/byk/version.rb
|
55
|
+
- spec/byk_spec.rb
|
56
|
+
homepage: https://github.com/topalovic/byk
|
57
|
+
licenses:
|
58
|
+
- MIT
|
59
|
+
metadata: {}
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: '0'
|
74
|
+
requirements: []
|
75
|
+
rubyforge_project:
|
76
|
+
rubygems_version: 2.2.2
|
77
|
+
signing_key:
|
78
|
+
specification_version: 4
|
79
|
+
summary: Fast transliteration of Serbian Cyrillic into Latin.
|
80
|
+
test_files:
|
81
|
+
- spec/byk_spec.rb
|