byk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +83 -0
- data/ext/byk/byk.c +303 -0
- data/ext/byk/extconf.rb +2 -0
- data/lib/byk.rb +14 -0
- data/lib/byk/version.rb +3 -0
- data/spec/byk_spec.rb +126 -0
- metadata +81 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 18a9ea704a27dfc6a2ca4f7fd9b83e05c1f5a57b
|
4
|
+
data.tar.gz: b6490d7a249f5ceb6378d92d6ee63af2424074ab
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 809f4204b60ec626f15aff3e32b8ca4368e0bcecbda1700b039b3edf086853841a6e94c7ef3e9a47003623b9cde0635d555d1d19ccca28220aa165f7a28c8327
|
7
|
+
data.tar.gz: 9a103df7e2976d6ab2574a372479e18f6fccd3c2e005d49e5c464f82befb6942ca19a80091387d67ee85aa98980b2bfe21bf9efada48ad1fec5e817a60d827f8
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Nikola Topalović
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
Byk
|
2
|
+
===
|
3
|
+
|
4
|
+
Fast transliteration of Serbian Cyrillic into Latin.
|
5
|
+
|
6
|
+
This package was inspired by @dejan's
|
7
|
+
[nice little gem](https://github.com/dejan/srbovanje), but this one
|
8
|
+
comes with a C-optimized twist.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
gem "byk"
|
16
|
+
```
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
```
|
21
|
+
$ bundle
|
22
|
+
```
|
23
|
+
|
24
|
+
Or install it yourself as:
|
25
|
+
```
|
26
|
+
$ gem install byk
|
27
|
+
```
|
28
|
+
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
First, make sure to require the gem in your initializer:
|
32
|
+
|
33
|
+
```
|
34
|
+
require "byk"
|
35
|
+
```
|
36
|
+
|
37
|
+
This will extend `String` with a couple of simple methods:
|
38
|
+
|
39
|
+
```ruby
|
40
|
+
"Шеширџија".to_latin # => "Šeširdžija"
|
41
|
+
"Шеширџија".to_ascii_latin # => "Sesirdzija"
|
42
|
+
"Šeširdžija".to_ascii_latin # => "Sesirdzija"
|
43
|
+
```
|
44
|
+
|
45
|
+
There's also a destructive version of each:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
text = "Жвазбука"
|
49
|
+
text.to_latin! # => "Žvazbuka"
|
50
|
+
text # => "Žvazbuka"
|
51
|
+
text.to_ascii_latin! # => "Zvazbuka"
|
52
|
+
text # => "Zvazbuka"
|
53
|
+
```
|
54
|
+
|
55
|
+
Note that these methods will take into account the
|
56
|
+
[special two-letter rules](http://sr.wikipedia.org/wiki/Gajica#Abeceda):
|
57
|
+
|
58
|
+
```
|
59
|
+
"ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
|
60
|
+
```
|
61
|
+
|
62
|
+
## Notes
|
63
|
+
|
64
|
+
### How fast is fast?
|
65
|
+
|
66
|
+
About [7x](benchmark) faster than the baseline Ruby implementation on
|
67
|
+
my hardware. YMMV of course.
|
68
|
+
|
69
|
+
### Compatibility
|
70
|
+
|
71
|
+
Byk is supported under MRI Ruby 1.9.3, 2.0, 2.1 and 2.2. Earlier
|
72
|
+
versions of MRI are untested.
|
73
|
+
|
74
|
+
### Raison d'être
|
75
|
+
|
76
|
+
For massive transliteration (e.g. sites supporting dual script
|
77
|
+
output), this kind of speed-up might be worthwhile, even with caching.
|
78
|
+
|
79
|
+
Also, it's a well-defined problem with hard-set rules which makes it a
|
80
|
+
natural target for optimization. Plus, it gave me an excuse to play
|
81
|
+
with Ruby extensions, so there :smile_cat:
|
82
|
+
|
83
|
+
Уздравље!
|
data/ext/byk/byk.c
ADDED
@@ -0,0 +1,303 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ruby.h>
|
3
|
+
#include <ruby/encoding.h>
|
4
|
+
|
5
|
+
/* Look up the rb_encoding of a Ruby string through its encoding index. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))

/*
 * Append one transliterated letter to `dest`, choosing between two spellings.
 *
 *   force_ascii - when non-zero, append the `len`-byte string `chr`
 *   dest        - destination Ruby string buffer
 *   chr         - byte string appended when `force_ascii` is non-zero
 *   ascii_chr   - codepoint appended (via str_cat_char) when `force_ascii`
 *                 is zero; note that, despite the parameter name, this is
 *                 the value used on the NON-forced path
 *   len         - length of `chr` in bytes
 *   enc         - encoding passed through to both append routines
 *
 * Written as a do/while(0) statement rather than a bare `?:` expression:
 * the two arms have different types (one call yields a value, the other is
 * void), which a conditional expression does not allow in standard C, and
 * the statement form is also safe inside an unbraced if/else.
 */
#define STR_CAT_COND_ASCII(force_ascii, dest, chr, ascii_chr, len, enc) \
    do {                                                                \
        if (force_ascii) {                                              \
            rb_enc_str_buf_cat((dest), (chr), (len), (enc));            \
        }                                                               \
        else {                                                          \
            str_cat_char((dest), (ascii_chr), (enc));                   \
        }                                                               \
    } while (0)
|
10
|
+
|
11
|
+
/*
 * Codepoints of the letters handled by the transliteration, written as
 * explicit hexadecimal values. LAT_* are Latin letters with diacritics,
 * CYR_* are Cyrillic letters; *_CAP_* are the capital forms.
 */
enum {
    /* Latin letters with diacritics (capital, then small) */
    LAT_CAP_TJ = 0x106,
    LAT_TJ     = 0x107,
    LAT_CAP_CH = 0x10C,
    LAT_CH     = 0x10D,
    LAT_CAP_DJ = 0x110,
    LAT_DJ     = 0x111,
    LAT_CAP_SH = 0x160,
    LAT_SH     = 0x161,
    LAT_CAP_ZH = 0x17D,
    LAT_ZH     = 0x17E,

    /* Capital Cyrillic letters */
    CYR_CAP_DJ = 0x402,
    CYR_CAP_J  = 0x408,
    CYR_CAP_LJ = 0x409,
    CYR_CAP_NJ = 0x40A,
    CYR_CAP_TJ = 0x40B,
    CYR_CAP_DZ = 0x40F,
    CYR_CAP_A  = 0x410,
    CYR_CAP_B  = 0x411,
    CYR_CAP_V  = 0x412,
    CYR_CAP_G  = 0x413,
    CYR_CAP_D  = 0x414,
    CYR_CAP_E  = 0x415,
    CYR_CAP_ZH = 0x416,
    CYR_CAP_Z  = 0x417,
    CYR_CAP_I  = 0x418,
    CYR_CAP_K  = 0x41A,
    CYR_CAP_L  = 0x41B,
    CYR_CAP_M  = 0x41C,
    CYR_CAP_N  = 0x41D,
    CYR_CAP_O  = 0x41E,
    CYR_CAP_P  = 0x41F,
    CYR_CAP_R  = 0x420,
    CYR_CAP_S  = 0x421,
    CYR_CAP_T  = 0x422,
    CYR_CAP_U  = 0x423,
    CYR_CAP_F  = 0x424,
    CYR_CAP_H  = 0x425,
    CYR_CAP_C  = 0x426,
    CYR_CAP_CH = 0x427,
    CYR_CAP_SH = 0x428,

    /* Small Cyrillic letters */
    CYR_A      = 0x430,
    CYR_B      = 0x431,
    CYR_V      = 0x432,
    CYR_G      = 0x433,
    CYR_D      = 0x434,
    CYR_E      = 0x435,
    CYR_ZH     = 0x436,
    CYR_Z      = 0x437,
    CYR_I      = 0x438,
    CYR_K      = 0x43A,
    CYR_L      = 0x43B,
    CYR_M      = 0x43C,
    CYR_N      = 0x43D,
    CYR_O      = 0x43E,
    CYR_P      = 0x43F,
    CYR_R      = 0x440,
    CYR_S      = 0x441,
    CYR_T      = 0x442,
    CYR_U      = 0x443,
    CYR_F      = 0x444,
    CYR_H      = 0x445,
    CYR_C      = 0x446,
    CYR_CH     = 0x447,
    CYR_SH     = 0x448,
    CYR_DJ     = 0x452,
    CYR_J      = 0x458,
    CYR_LJ     = 0x459,
    CYR_NJ     = 0x45A,
    CYR_TJ     = 0x45B,
    CYR_DZ     = 0x45F
};
|
83
|
+
|
84
|
+
static inline unsigned int
|
85
|
+
is_upper_case(unsigned int c)
|
86
|
+
{
|
87
|
+
return ((c >= 65 && c <= 90)
|
88
|
+
|| (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
|
89
|
+
|| c == LAT_CAP_TJ
|
90
|
+
|| c == LAT_CAP_CH
|
91
|
+
|| c == LAT_CAP_DJ
|
92
|
+
|| c == LAT_CAP_SH
|
93
|
+
|| c == LAT_CAP_ZH);
|
94
|
+
}
|
95
|
+
|
96
|
+
|
97
|
+
static void
|
98
|
+
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
|
99
|
+
{
|
100
|
+
char s[16];
|
101
|
+
int n = rb_enc_codelen(c, enc);
|
102
|
+
rb_enc_mbcput(c, s, enc);
|
103
|
+
rb_enc_str_buf_cat(str, s, n, enc);
|
104
|
+
}
|
105
|
+
|
106
|
+
static VALUE
|
107
|
+
str_to_latin(int argc, VALUE *argv, VALUE str, int ascii, int bang)
|
108
|
+
{
|
109
|
+
VALUE dest;
|
110
|
+
long dest_len;
|
111
|
+
char *pos, *end;
|
112
|
+
rb_encoding *enc;
|
113
|
+
unsigned int codepoint = 0;
|
114
|
+
unsigned int prev_codepoint = 0;
|
115
|
+
unsigned int next_codepoint = 0;
|
116
|
+
|
117
|
+
rb_check_arity(argc, 0, 1);
|
118
|
+
|
119
|
+
pos = RSTRING_PTR(str);
|
120
|
+
if (!pos || RSTRING_LEN(str) == 0) return str;
|
121
|
+
|
122
|
+
end = RSTRING_END(str);
|
123
|
+
enc = STR_ENC_GET(str);
|
124
|
+
dest_len = RSTRING_LEN(str) + 30; /* TODO len + margin */
|
125
|
+
dest = rb_str_buf_new(dest_len);
|
126
|
+
rb_enc_associate(dest, enc);
|
127
|
+
|
128
|
+
while (pos < end) {
|
129
|
+
int len, force_upper = 0;
|
130
|
+
|
131
|
+
prev_codepoint = codepoint;
|
132
|
+
codepoint = rb_enc_codepoint_len(pos, end, &len, enc);
|
133
|
+
next_codepoint = 0;
|
134
|
+
|
135
|
+
force_upper = prev_codepoint && is_upper_case(prev_codepoint);
|
136
|
+
|
137
|
+
if (!force_upper && (pos + len < end)) {
|
138
|
+
/* TODO Trim down to one rb_enc_codepoint call per iter. */
|
139
|
+
next_codepoint = rb_enc_codepoint(pos + len, end, enc);
|
140
|
+
force_upper = is_upper_case(next_codepoint);
|
141
|
+
}
|
142
|
+
|
143
|
+
/* Latin -> "ASCII latin" conversion */
|
144
|
+
if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
|
145
|
+
switch (codepoint) {
|
146
|
+
case LAT_TJ:
|
147
|
+
case LAT_CH: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
|
148
|
+
case LAT_DJ: rb_enc_str_buf_cat(dest, "dj", 2, enc); break;
|
149
|
+
case LAT_SH: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
|
150
|
+
case LAT_ZH: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
|
151
|
+
case LAT_CAP_TJ:
|
152
|
+
case LAT_CAP_CH: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
|
153
|
+
case LAT_CAP_SH: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
|
154
|
+
case LAT_CAP_ZH: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
|
155
|
+
|
156
|
+
case LAT_CAP_DJ:
|
157
|
+
force_upper ? rb_enc_str_buf_cat(dest, "DJ", 2, enc)
|
158
|
+
: rb_enc_str_buf_cat(dest, "Dj", 2, enc);
|
159
|
+
break;
|
160
|
+
default:
|
161
|
+
rb_enc_str_buf_cat(dest, pos, len, enc);
|
162
|
+
}
|
163
|
+
pos += len;
|
164
|
+
continue;
|
165
|
+
}
|
166
|
+
|
167
|
+
/* Short-circuit for non-cyrillic codepoints */
|
168
|
+
if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
|
169
|
+
rb_enc_str_buf_cat(dest, pos, len, enc);
|
170
|
+
pos += len;
|
171
|
+
continue;
|
172
|
+
}
|
173
|
+
|
174
|
+
/* Cyrillic -> latin conversion */
|
175
|
+
switch (codepoint) {
|
176
|
+
case CYR_CAP_J: rb_enc_str_buf_cat(dest, "J", 1, enc); break;
|
177
|
+
case CYR_CAP_A: rb_enc_str_buf_cat(dest, "A", 1, enc); break;
|
178
|
+
case CYR_CAP_B: rb_enc_str_buf_cat(dest, "B", 1, enc); break;
|
179
|
+
case CYR_CAP_V: rb_enc_str_buf_cat(dest, "V", 1, enc); break;
|
180
|
+
case CYR_CAP_G: rb_enc_str_buf_cat(dest, "G", 1, enc); break;
|
181
|
+
case CYR_CAP_D: rb_enc_str_buf_cat(dest, "D", 1, enc); break;
|
182
|
+
case CYR_CAP_E: rb_enc_str_buf_cat(dest, "E", 1, enc); break;
|
183
|
+
case CYR_CAP_Z: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
|
184
|
+
case CYR_CAP_I: rb_enc_str_buf_cat(dest, "I", 1, enc); break;
|
185
|
+
case CYR_CAP_K: rb_enc_str_buf_cat(dest, "K", 1, enc); break;
|
186
|
+
case CYR_CAP_L: rb_enc_str_buf_cat(dest, "L", 1, enc); break;
|
187
|
+
case CYR_CAP_M: rb_enc_str_buf_cat(dest, "M", 1, enc); break;
|
188
|
+
case CYR_CAP_N: rb_enc_str_buf_cat(dest, "N", 1, enc); break;
|
189
|
+
case CYR_CAP_O: rb_enc_str_buf_cat(dest, "O", 1, enc); break;
|
190
|
+
case CYR_CAP_P: rb_enc_str_buf_cat(dest, "P", 1, enc); break;
|
191
|
+
case CYR_CAP_R: rb_enc_str_buf_cat(dest, "R", 1, enc); break;
|
192
|
+
case CYR_CAP_S: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
|
193
|
+
case CYR_CAP_T: rb_enc_str_buf_cat(dest, "T", 1, enc); break;
|
194
|
+
case CYR_CAP_U: rb_enc_str_buf_cat(dest, "U", 1, enc); break;
|
195
|
+
case CYR_CAP_F: rb_enc_str_buf_cat(dest, "F", 1, enc); break;
|
196
|
+
case CYR_CAP_H: rb_enc_str_buf_cat(dest, "H", 1, enc); break;
|
197
|
+
case CYR_CAP_C: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
|
198
|
+
case CYR_A: rb_enc_str_buf_cat(dest, "a", 1, enc); break;
|
199
|
+
case CYR_B: rb_enc_str_buf_cat(dest, "b", 1, enc); break;
|
200
|
+
case CYR_V: rb_enc_str_buf_cat(dest, "v", 1, enc); break;
|
201
|
+
case CYR_G: rb_enc_str_buf_cat(dest, "g", 1, enc); break;
|
202
|
+
case CYR_D: rb_enc_str_buf_cat(dest, "d", 1, enc); break;
|
203
|
+
case CYR_E: rb_enc_str_buf_cat(dest, "e", 1, enc); break;
|
204
|
+
case CYR_Z: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
|
205
|
+
case CYR_I: rb_enc_str_buf_cat(dest, "i", 1, enc); break;
|
206
|
+
case CYR_K: rb_enc_str_buf_cat(dest, "k", 1, enc); break;
|
207
|
+
case CYR_L: rb_enc_str_buf_cat(dest, "l", 1, enc); break;
|
208
|
+
case CYR_M: rb_enc_str_buf_cat(dest, "m", 1, enc); break;
|
209
|
+
case CYR_N: rb_enc_str_buf_cat(dest, "n", 1, enc); break;
|
210
|
+
case CYR_O: rb_enc_str_buf_cat(dest, "o", 1, enc); break;
|
211
|
+
case CYR_P: rb_enc_str_buf_cat(dest, "p", 1, enc); break;
|
212
|
+
case CYR_R: rb_enc_str_buf_cat(dest, "r", 1, enc); break;
|
213
|
+
case CYR_S: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
|
214
|
+
case CYR_T: rb_enc_str_buf_cat(dest, "t", 1, enc); break;
|
215
|
+
case CYR_U: rb_enc_str_buf_cat(dest, "u", 1, enc); break;
|
216
|
+
case CYR_F: rb_enc_str_buf_cat(dest, "f", 1, enc); break;
|
217
|
+
case CYR_H: rb_enc_str_buf_cat(dest, "h", 1, enc); break;
|
218
|
+
case CYR_C: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
|
219
|
+
case CYR_J: rb_enc_str_buf_cat(dest, "j", 1, enc); break;
|
220
|
+
case CYR_LJ: rb_enc_str_buf_cat(dest, "lj", 2, enc); break;
|
221
|
+
case CYR_NJ: rb_enc_str_buf_cat(dest, "nj", 2, enc); break;
|
222
|
+
case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
|
223
|
+
case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
|
224
|
+
case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
|
225
|
+
case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
|
226
|
+
case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
|
227
|
+
case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
|
228
|
+
case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
|
229
|
+
case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
|
230
|
+
case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
|
231
|
+
|
232
|
+
/* Several special cases */
|
233
|
+
case CYR_CAP_LJ:
|
234
|
+
rb_enc_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2, enc);
|
235
|
+
break;
|
236
|
+
|
237
|
+
case CYR_CAP_NJ:
|
238
|
+
rb_enc_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2, enc);
|
239
|
+
break;
|
240
|
+
|
241
|
+
case CYR_CAP_DJ:
|
242
|
+
STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc);
|
243
|
+
break;
|
244
|
+
|
245
|
+
case CYR_CAP_DZ:
|
246
|
+
rb_enc_str_buf_cat(dest, "D", 1, enc);
|
247
|
+
if (force_upper) {
|
248
|
+
STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
|
249
|
+
}
|
250
|
+
else {
|
251
|
+
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
252
|
+
}
|
253
|
+
break;
|
254
|
+
|
255
|
+
case CYR_DZ:
|
256
|
+
rb_enc_str_buf_cat(dest, "d", 1, enc);
|
257
|
+
STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
|
258
|
+
break;
|
259
|
+
|
260
|
+
default:
|
261
|
+
rb_enc_str_buf_cat(dest, pos, len, enc);
|
262
|
+
}
|
263
|
+
pos += len;
|
264
|
+
}
|
265
|
+
|
266
|
+
if (bang) {
|
267
|
+
rb_str_shared_replace(str, dest);
|
268
|
+
}
|
269
|
+
else {
|
270
|
+
OBJ_INFECT(dest, str);
|
271
|
+
str = dest;
|
272
|
+
}
|
273
|
+
|
274
|
+
return str;
|
275
|
+
}
|
276
|
+
|
277
|
+
static VALUE
|
278
|
+
rb_str_to_latin(int argc, VALUE *argv, VALUE str) {
|
279
|
+
return str_to_latin(argc, argv, str, 0, 0);
|
280
|
+
}
|
281
|
+
|
282
|
+
static VALUE
|
283
|
+
rb_str_to_latin_bang(int argc, VALUE *argv, VALUE str) {
|
284
|
+
return str_to_latin(argc, argv, str, 0, 1);
|
285
|
+
}
|
286
|
+
|
287
|
+
static VALUE
|
288
|
+
rb_str_to_ascii_latin(int argc, VALUE *argv, VALUE str) {
|
289
|
+
return str_to_latin(argc, argv, str, 1, 0);
|
290
|
+
}
|
291
|
+
|
292
|
+
static VALUE
|
293
|
+
rb_str_to_ascii_latin_bang(int argc, VALUE *argv, VALUE str) {
|
294
|
+
return str_to_latin(argc, argv, str, 1, 1);
|
295
|
+
}
|
296
|
+
|
297
|
+
void Init_byk_native(void)
|
298
|
+
{
|
299
|
+
rb_define_method(rb_cString, "to_latin", rb_str_to_latin, -1);
|
300
|
+
rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, -1);
|
301
|
+
rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, -1);
|
302
|
+
rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, -1);
|
303
|
+
}
|
data/ext/byk/extconf.rb
ADDED
data/lib/byk.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "byk_native"
|
4
|
+
require "byk/version"
|
5
|
+
|
6
|
+
module Byk

  # Cyrillic letters, lower case.
  AZBUKA = %w[
    а б в г д ђ е ж з и
    ј к л љ м н њ о п р
    с т ћ у ф х ц ч џ ш
  ]

  # Cyrillic letters, upper case.
  AZBUKA_CAPS = %w[
    А Б В Г Д Ђ Е Ж З И
    Ј К Л Љ М Н Њ О П Р
    С Т Ћ У Ф Х Ц Ч Џ Ш
  ]

  # Latin letters and digraphs, lower case.
  ABECEDA = %w[
    a b c č ć d dž đ e f
    g h i j k l lj m n nj
    o p r s š t u v z ž
  ]

  # Latin letters and digraphs, capitalised (digraphs in title case).
  ABECEDA_CAPS = %w[
    A B C Č Ć D Dž Đ E F
    G H I J K L Lj M N Nj
    O P R S Š T U V Z Ž
  ]

end
|
data/lib/byk/version.rb
ADDED
data/spec/byk_spec.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe Byk do

  # Pangram fixtures; see http://sr.wikipedia.org/wiki/Панграм
  let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
  let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
  let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }

  let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
  let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
  let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }

  # Text that must pass through untouched
  let(:ascii) { "The quick brown fox jumps over the lazy dog." }
  let(:other) { "संस्कृतम् saṃskṛtam" }

  # Several scripts in one string
  let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
  let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
  let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }

  it "has a version number" do
    expect(Byk::VERSION).not_to be nil
  end

  describe "#to_latin" do

    it "doesn't modify an empty string" do
      expect("".to_latin).to eq ""
    end

    it "doesn't modify ASCII text" do
      expect(ascii.to_latin).to eq ascii
    end

    it "doesn't modify latin" do
      expect(pangram_latin.to_latin).to eq pangram_latin
    end

    it "doesn't modify other scripts" do
      expect(other.to_latin).to eq other
    end

    it "converts cyrillic to latin" do
      expect(pangram.to_latin).to eq pangram_latin
    end

    it "converts cyrillic caps to latin caps" do
      expect(pangram_caps.to_latin).to eq pangram_latin_caps
    end

    it "converts mixed text properly" do
      expect(mixed.to_latin).to eq mixed_latin
    end

    it "converts AZBUKA to ABECEDA" do
      expect(Byk::AZBUKA.map(&:to_latin)).to match_array(Byk::ABECEDA)
    end

    it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
      expect(Byk::AZBUKA_CAPS.map(&:to_latin)).to match_array(Byk::ABECEDA_CAPS)
    end

    it "leaves the receiver untouched" do
      text = pangram.dup
      text.to_latin
      expect(text).to eq pangram
    end
  end

  describe "#to_latin!" do

    it "converts the receiver in place" do
      text = pangram.dup
      text.to_latin!
      expect(text).to eq pangram_latin
    end

    it "returns the receiver" do
      text = pangram_caps.dup
      expect(text.to_latin!).to equal text
    end
  end

  describe "#to_ascii_latin" do

    # Special care for Њ, Љ, Ђ, Đ
    let(:edge_cases) {
      {
        "Њ" => "Nj",
        "Љ" => "Lj",
        "Ђ" => "Dj",
        "Đ" => "Dj",
        "ЊЊ" => "NJNJ",
        "ЉЉ" => "LJLJ",
        "ЂЂ" => "DJDJ",
        "ĐĐ" => "DJDJ",
        "ГУЊ" => "GUNJ",
        "ПАСУЉ" => "PASULJ",
        "ЂУРАЂ" => "DJURADJ",
        "ĐURAĐ" => "DJURADJ",
        "ĐURAĐ Đorđević" => "DJURADJ Djordjevic",
        "ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
      }
    }

    it "doesn't modify an empty string" do
      expect("".to_ascii_latin).to eq ""
    end

    it "doesn't modify ASCII text" do
      expect(ascii.to_ascii_latin).to eq ascii
    end

    it "doesn't modify other scripts" do
      expect(other.to_ascii_latin).to eq other
    end

    it "converts cyrillic to ASCII latin" do
      expect(pangram.to_ascii_latin).to eq pangram_ascii_latin
    end

    it "converts cyrillic caps to ASCII latin caps" do
      expect(pangram_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
    end

    it "converts latin to ASCII latin" do
      expect(pangram_latin.to_ascii_latin).to eq pangram_ascii_latin
    end

    it "converts latin caps to ASCII latin caps" do
      expect(pangram_latin_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
    end

    it "converts mixed text properly" do
      expect(mixed.to_ascii_latin).to eq mixed_ascii_latin
    end

    it "converts edge cases properly" do
      edge_cases.each do |input, output|
        expect(input.to_ascii_latin).to eq output
      end
    end

    it "leaves the receiver untouched" do
      text = pangram.dup
      text.to_ascii_latin
      expect(text).to eq pangram
    end
  end

  describe "#to_ascii_latin!" do

    it "converts the receiver in place" do
      text = pangram.dup
      text.to_ascii_latin!
      expect(text).to eq pangram_ascii_latin
    end

    it "returns the receiver" do
      text = pangram_latin_caps.dup
      expect(text.to_ascii_latin!).to equal text
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: byk
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nikola Topalović
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake-compiler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.9'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.9'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.2'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.2'
|
41
|
+
description: Provides C-optimized methods for transliteration of Serbian Cyrillic
|
42
|
+
into Latin.
|
43
|
+
email: nikola.topalovic@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions:
|
46
|
+
- ext/byk/extconf.rb
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- LICENSE
|
50
|
+
- README.md
|
51
|
+
- ext/byk/byk.c
|
52
|
+
- ext/byk/extconf.rb
|
53
|
+
- lib/byk.rb
|
54
|
+
- lib/byk/version.rb
|
55
|
+
- spec/byk_spec.rb
|
56
|
+
homepage: https://github.com/topalovic/byk
|
57
|
+
licenses:
|
58
|
+
- MIT
|
59
|
+
metadata: {}
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: '0'
|
74
|
+
requirements: []
|
75
|
+
rubyforge_project:
|
76
|
+
rubygems_version: 2.2.2
|
77
|
+
signing_key:
|
78
|
+
specification_version: 4
|
79
|
+
summary: Fast transliteration of Serbian Cyrillic into Latin.
|
80
|
+
test_files:
|
81
|
+
- spec/byk_spec.rb
|