byk 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 18a9ea704a27dfc6a2ca4f7fd9b83e05c1f5a57b
4
+ data.tar.gz: b6490d7a249f5ceb6378d92d6ee63af2424074ab
5
+ SHA512:
6
+ metadata.gz: 809f4204b60ec626f15aff3e32b8ca4368e0bcecbda1700b039b3edf086853841a6e94c7ef3e9a47003623b9cde0635d555d1d19ccca28220aa165f7a28c8327
7
+ data.tar.gz: 9a103df7e2976d6ab2574a372479e18f6fccd3c2e005d49e5c464f82befb6942ca19a80091387d67ee85aa98980b2bfe21bf9efada48ad1fec5e817a60d827f8
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Nikola Topalović
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,83 @@
1
+ Byk
2
+ ===
3
+
4
+ Fast transliteration of Serbian Cyrillic into Latin.
5
+
6
+ This package was inspired by @dejan's
7
+ [nice little gem](https://github.com/dejan/srbovanje), but this one
8
+ comes with a C-optimized twist.
9
+
10
+ ## Installation
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ ```ruby
15
+ gem "byk"
16
+ ```
17
+
18
+ And then execute:
19
+
20
+ ```
21
+ $ bundle
22
+ ```
23
+
24
+ Or install it yourself as:
25
+ ```
26
+ $ gem install byk
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ First, make sure to require the gem in your initializer:
32
+
33
+ ```ruby
34
+ require "byk"
35
+ ```
36
+
37
+ This will extend `String` with a couple of simple methods:
38
+
39
+ ```ruby
40
+ "Шеширџија".to_latin # => "Šeširdžija"
41
+ "Шеширџија".to_ascii_latin # => "Sesirdzija"
42
+ "Šeširdžija".to_ascii_latin # => "Sesirdzija"
43
+ ```
44
+
45
+ There's also a destructive version of each:
46
+
47
+ ```ruby
48
+ text = "Жвазбука"
49
+ text.to_latin! # => "Žvazbuka"
50
+ text # => "Žvazbuka"
51
+ text.to_ascii_latin! # => "Zvazbuka"
52
+ text # => "Zvazbuka"
53
+ ```
54
+
55
+ Note that these methods will take into account the
56
+ [special two-letter rules](http://sr.wikipedia.org/wiki/Gajica#Abeceda):
57
+
58
+ ```ruby
59
+ "ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
60
+ ```
61
+
62
+ ## Notes
63
+
64
+ ### How fast is fast?
65
+
66
+ About [7x](benchmark) faster than the baseline Ruby implementation on
67
+ my hardware. YMMV of course.
68
+
69
+ ### Compatibility
70
+
71
+ Byk is supported under MRI Ruby 1.9.3, 2.0, 2.1 and 2.2. Earlier
72
+ versions of MRI are untested.
73
+
74
+ ### Raison d'être
75
+
76
+ For massive transliteration (e.g. sites supporting dual script
77
+ output), this kind of speed-up might be worthwhile, even with caching.
78
+
79
+ Also, it's a well-defined problem with hard-set rules which makes it a
80
+ natural target for optimization. Plus, it gave me an excuse to play
81
+ with Ruby extensions, so there :smile_cat:
82
+
83
+ Уздравље!
@@ -0,0 +1,303 @@
1
+ #include <stdio.h>
2
+ #include <ruby.h>
3
+ #include <ruby/encoding.h>
4
+ /* Look up the rb_encoding of a Ruby string through its encoding index. */
5
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
6
+ /* Append the `len`-byte literal `chr` to `dest` when `force_ascii` is set, otherwise append the codepoint `ascii_chr`. */
7
+ #define STR_CAT_COND_ASCII(force_ascii, dest, chr, ascii_chr, len, enc) /* statement-like; yields void */ \
8
+ ((force_ascii) ? (void)rb_enc_str_buf_cat((dest), (chr), (len), (enc)) /* cast keeps both arms void, as ISO C requires */ \
9
+ : str_cat_char((dest), (ascii_chr), (enc))) /* despite its name, callers pass the accented Latin codepoint as ascii_chr */
10
+ /* Decimal code points of the Latin and Cyrillic letters the transliterator compares against. */
11
+ enum {
12
+ LAT_CAP_TJ=262, /* 0x0106; the lower-case form follows at +1 */
13
+ LAT_TJ,
14
+ LAT_CAP_CH=268, /* 0x010C */
15
+ LAT_CH,
16
+ LAT_CAP_DJ=272, /* 0x0110 */
17
+ LAT_DJ,
18
+ LAT_CAP_SH=352, /* 0x0160 */
19
+ LAT_SH,
20
+ LAT_CAP_ZH=381, /* 0x017D */
21
+ LAT_ZH,
22
+ CYR_CAP_DJ=1026, /* 0x0402; lowest Cyrillic value handled */
23
+ CYR_CAP_J=1032, /* 0x0408 */
24
+ CYR_CAP_LJ,
25
+ CYR_CAP_NJ,
26
+ CYR_CAP_TJ,
27
+ CYR_CAP_DZ=1039, /* 0x040F */
28
+ CYR_CAP_A, /* 1040: start of the contiguous capital run */
29
+ CYR_CAP_B,
30
+ CYR_CAP_V,
31
+ CYR_CAP_G,
32
+ CYR_CAP_D,
33
+ CYR_CAP_E,
34
+ CYR_CAP_ZH,
35
+ CYR_CAP_Z,
36
+ CYR_CAP_I,
37
+ CYR_CAP_K=1050, /* 0x041A; explicit value skips 1049 */
38
+ CYR_CAP_L,
39
+ CYR_CAP_M,
40
+ CYR_CAP_N,
41
+ CYR_CAP_O,
42
+ CYR_CAP_P,
43
+ CYR_CAP_R,
44
+ CYR_CAP_S,
45
+ CYR_CAP_T,
46
+ CYR_CAP_U,
47
+ CYR_CAP_F,
48
+ CYR_CAP_H,
49
+ CYR_CAP_C,
50
+ CYR_CAP_CH,
51
+ CYR_CAP_SH, /* 1064: upper bound of the range test in is_upper_case() */
52
+ CYR_A=1072, /* 0x0430: start of the lower-case run */
53
+ CYR_B,
54
+ CYR_V,
55
+ CYR_G,
56
+ CYR_D,
57
+ CYR_E,
58
+ CYR_ZH,
59
+ CYR_Z,
60
+ CYR_I,
61
+ CYR_K=1082, /* 0x043A; explicit value skips 1081 */
62
+ CYR_L,
63
+ CYR_M,
64
+ CYR_N,
65
+ CYR_O,
66
+ CYR_P,
67
+ CYR_R,
68
+ CYR_S,
69
+ CYR_T,
70
+ CYR_U,
71
+ CYR_F,
72
+ CYR_H,
73
+ CYR_C,
74
+ CYR_CH,
75
+ CYR_SH,
76
+ CYR_DJ=1106, /* 0x0452 */
77
+ CYR_J=1112, /* 0x0458 */
78
+ CYR_LJ,
79
+ CYR_NJ,
80
+ CYR_TJ,
81
+ CYR_DZ=1119 /* 0x045F; highest Cyrillic value handled */
82
+ };
83
+ /* Return 1 when c is a capital the transliterator knows about, 0 otherwise. */
84
+ static inline unsigned int
85
+ is_upper_case(unsigned int c)
86
+ {
87
+     if (c >= 65 && c <= 90) return 1; /* ASCII 'A'..'Z' */
88
+     if (c >= CYR_CAP_DJ && c <= CYR_CAP_SH) return 1; /* Cyrillic capitals range */
89
+     switch (c) {
90
+     case LAT_CAP_TJ: case LAT_CAP_CH: case LAT_CAP_DJ:
91
+     case LAT_CAP_SH: case LAT_CAP_ZH: return 1; /* accented Latin capitals */
92
+     default: return 0;
93
+     }
94
+ }
95
+
96
+ /* Encode codepoint c in enc and append the resulting bytes to str. */
97
+ static void
98
+ str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
99
+ {
100
+     char encoded[16]; /* scratch buffer for one encoded character */
101
+     const int width = rb_enc_codelen(c, enc); /* byte length of c in enc */
102
+     rb_enc_mbcput(c, encoded, enc);
103
+     rb_enc_str_buf_cat(str, encoded, width, enc);
104
+ }
105
+ /* Shared worker behind String#to_latin, #to_latin!, #to_ascii_latin and #to_ascii_latin!. */
106
+ static VALUE
107
+ str_to_latin(int argc, VALUE *argv, VALUE str, int ascii, int bang) /* ascii: ASCII-only output; bang: replace str in place */
108
+ {
109
+ VALUE dest;
110
+ long dest_len;
111
+ char *pos, *end;
112
+ rb_encoding *enc;
113
+ unsigned int codepoint = 0;
114
+ unsigned int prev_codepoint = 0;
115
+ unsigned int next_codepoint = 0;
116
+
117
+ rb_check_arity(argc, 0, 1); /* at most one argument is accepted; argv itself is never read */
118
+
119
+ pos = RSTRING_PTR(str);
120
+ if (!pos || RSTRING_LEN(str) == 0) return str; /* empty input: hand back the receiver itself */
121
+
122
+ end = RSTRING_END(str);
123
+ enc = STR_ENC_GET(str);
124
+ dest_len = RSTRING_LEN(str) + 30; /* TODO len + margin */
125
+ dest = rb_str_buf_new(dest_len);
126
+ rb_enc_associate(dest, enc); /* output keeps the receiver's encoding */
127
+
128
+ while (pos < end) { /* one iteration per source character */
129
+ int len, force_upper = 0;
130
+
131
+ prev_codepoint = codepoint;
132
+ codepoint = rb_enc_codepoint_len(pos, end, &len, enc); /* len receives the byte width of this character */
133
+ next_codepoint = 0;
134
+
135
+ force_upper = prev_codepoint && is_upper_case(prev_codepoint); /* a preceding capital makes digraphs fully upper-case */
136
+
137
+ if (!force_upper && (pos + len < end)) { /* otherwise peek at the following character, if any */
138
+ /* TODO Trim down to one rb_enc_codepoint call per iter. */
139
+ next_codepoint = rb_enc_codepoint(pos + len, end, enc);
140
+ force_upper = is_upper_case(next_codepoint);
141
+ }
142
+
143
+ /* Latin -> "ASCII latin" conversion */
144
+ if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
145
+ switch (codepoint) {
146
+ case LAT_TJ:
147
+ case LAT_CH: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
148
+ case LAT_DJ: rb_enc_str_buf_cat(dest, "dj", 2, enc); break;
149
+ case LAT_SH: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
150
+ case LAT_ZH: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
151
+ case LAT_CAP_TJ:
152
+ case LAT_CAP_CH: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
153
+ case LAT_CAP_SH: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
154
+ case LAT_CAP_ZH: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
155
+
156
+ case LAT_CAP_DJ:
157
+ force_upper ? rb_enc_str_buf_cat(dest, "DJ", 2, enc)
158
+ : rb_enc_str_buf_cat(dest, "Dj", 2, enc);
159
+ break;
160
+ default: /* any other codepoint in the 262..382 window is copied verbatim */
161
+ rb_enc_str_buf_cat(dest, pos, len, enc);
162
+ }
163
+ pos += len;
164
+ continue;
165
+ }
166
+
167
+ /* Short-circuit for non-cyrillic codepoints */
168
+ if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
169
+ rb_enc_str_buf_cat(dest, pos, len, enc);
170
+ pos += len;
171
+ continue;
172
+ }
173
+
174
+ /* Cyrillic -> latin conversion */
175
+ switch (codepoint) {
176
+ case CYR_CAP_J: rb_enc_str_buf_cat(dest, "J", 1, enc); break;
177
+ case CYR_CAP_A: rb_enc_str_buf_cat(dest, "A", 1, enc); break;
178
+ case CYR_CAP_B: rb_enc_str_buf_cat(dest, "B", 1, enc); break;
179
+ case CYR_CAP_V: rb_enc_str_buf_cat(dest, "V", 1, enc); break;
180
+ case CYR_CAP_G: rb_enc_str_buf_cat(dest, "G", 1, enc); break;
181
+ case CYR_CAP_D: rb_enc_str_buf_cat(dest, "D", 1, enc); break;
182
+ case CYR_CAP_E: rb_enc_str_buf_cat(dest, "E", 1, enc); break;
183
+ case CYR_CAP_Z: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
184
+ case CYR_CAP_I: rb_enc_str_buf_cat(dest, "I", 1, enc); break;
185
+ case CYR_CAP_K: rb_enc_str_buf_cat(dest, "K", 1, enc); break;
186
+ case CYR_CAP_L: rb_enc_str_buf_cat(dest, "L", 1, enc); break;
187
+ case CYR_CAP_M: rb_enc_str_buf_cat(dest, "M", 1, enc); break;
188
+ case CYR_CAP_N: rb_enc_str_buf_cat(dest, "N", 1, enc); break;
189
+ case CYR_CAP_O: rb_enc_str_buf_cat(dest, "O", 1, enc); break;
190
+ case CYR_CAP_P: rb_enc_str_buf_cat(dest, "P", 1, enc); break;
191
+ case CYR_CAP_R: rb_enc_str_buf_cat(dest, "R", 1, enc); break;
192
+ case CYR_CAP_S: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
193
+ case CYR_CAP_T: rb_enc_str_buf_cat(dest, "T", 1, enc); break;
194
+ case CYR_CAP_U: rb_enc_str_buf_cat(dest, "U", 1, enc); break;
195
+ case CYR_CAP_F: rb_enc_str_buf_cat(dest, "F", 1, enc); break;
196
+ case CYR_CAP_H: rb_enc_str_buf_cat(dest, "H", 1, enc); break;
197
+ case CYR_CAP_C: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
198
+ case CYR_A: rb_enc_str_buf_cat(dest, "a", 1, enc); break;
199
+ case CYR_B: rb_enc_str_buf_cat(dest, "b", 1, enc); break;
200
+ case CYR_V: rb_enc_str_buf_cat(dest, "v", 1, enc); break;
201
+ case CYR_G: rb_enc_str_buf_cat(dest, "g", 1, enc); break;
202
+ case CYR_D: rb_enc_str_buf_cat(dest, "d", 1, enc); break;
203
+ case CYR_E: rb_enc_str_buf_cat(dest, "e", 1, enc); break;
204
+ case CYR_Z: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
205
+ case CYR_I: rb_enc_str_buf_cat(dest, "i", 1, enc); break;
206
+ case CYR_K: rb_enc_str_buf_cat(dest, "k", 1, enc); break;
207
+ case CYR_L: rb_enc_str_buf_cat(dest, "l", 1, enc); break;
208
+ case CYR_M: rb_enc_str_buf_cat(dest, "m", 1, enc); break;
209
+ case CYR_N: rb_enc_str_buf_cat(dest, "n", 1, enc); break;
210
+ case CYR_O: rb_enc_str_buf_cat(dest, "o", 1, enc); break;
211
+ case CYR_P: rb_enc_str_buf_cat(dest, "p", 1, enc); break;
212
+ case CYR_R: rb_enc_str_buf_cat(dest, "r", 1, enc); break;
213
+ case CYR_S: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
214
+ case CYR_T: rb_enc_str_buf_cat(dest, "t", 1, enc); break;
215
+ case CYR_U: rb_enc_str_buf_cat(dest, "u", 1, enc); break;
216
+ case CYR_F: rb_enc_str_buf_cat(dest, "f", 1, enc); break;
217
+ case CYR_H: rb_enc_str_buf_cat(dest, "h", 1, enc); break;
218
+ case CYR_C: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
219
+ case CYR_J: rb_enc_str_buf_cat(dest, "j", 1, enc); break;
220
+ case CYR_LJ: rb_enc_str_buf_cat(dest, "lj", 2, enc); break;
221
+ case CYR_NJ: rb_enc_str_buf_cat(dest, "nj", 2, enc); break;
222
+ case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
223
+ case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
224
+ case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
225
+ case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
226
+ case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
227
+ case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
228
+ case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
229
+ case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
230
+ case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
231
+
232
+ /* Several special cases */
233
+ case CYR_CAP_LJ:
234
+ rb_enc_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2, enc);
235
+ break;
236
+
237
+ case CYR_CAP_NJ:
238
+ rb_enc_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2, enc);
239
+ break;
240
+
241
+ case CYR_CAP_DJ:
242
+ STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc);
243
+ break;
244
+
245
+ case CYR_CAP_DZ:
246
+ rb_enc_str_buf_cat(dest, "D", 1, enc); /* leading D first; the second letter's case depends on force_upper */
247
+ if (force_upper) {
248
+ STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
249
+ }
250
+ else {
251
+ STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
252
+ }
253
+ break;
254
+
255
+ case CYR_DZ:
256
+ rb_enc_str_buf_cat(dest, "d", 1, enc);
257
+ STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
258
+ break;
259
+
260
+ default: /* codepoints in the Cyrillic window with no mapping are copied verbatim */
261
+ rb_enc_str_buf_cat(dest, pos, len, enc);
262
+ }
263
+ pos += len;
264
+ }
265
+
266
+ if (bang) {
267
+ rb_str_shared_replace(str, dest); /* in-place variant: the receiver takes over dest's contents */
268
+ }
269
+ else {
270
+ OBJ_INFECT(dest, str); /* propagate the receiver's taint state to the new string */
271
+ str = dest;
272
+ }
273
+
274
+ return str;
275
+ }
276
+ /* String#to_latin: non-destructive Cyrillic -> Latin (ascii = 0, bang = 0). */
277
+ static VALUE
278
+ rb_str_to_latin(int argc, VALUE *argv, VALUE str) {
279
+ return str_to_latin(argc, argv, str, 0, 0);
280
+ }
281
+ /* String#to_latin!: Cyrillic -> Latin applied to the receiver itself (ascii = 0, bang = 1). */
282
+ static VALUE
283
+ rb_str_to_latin_bang(int argc, VALUE *argv, VALUE str) {
284
+ return str_to_latin(argc, argv, str, 0, 1);
285
+ }
286
+ /* String#to_ascii_latin: non-destructive conversion to ASCII-only Latin (ascii = 1, bang = 0). */
287
+ static VALUE
288
+ rb_str_to_ascii_latin(int argc, VALUE *argv, VALUE str) {
289
+ return str_to_latin(argc, argv, str, 1, 0);
290
+ }
291
+ /* String#to_ascii_latin!: ASCII-only Latin conversion applied to the receiver (ascii = 1, bang = 1). */
292
+ static VALUE
293
+ rb_str_to_ascii_latin_bang(int argc, VALUE *argv, VALUE str) {
294
+ return str_to_latin(argc, argv, str, 1, 1);
295
+ }
296
+ /* Extension entry point: registers the four transliteration methods on String. */
297
+ void Init_byk_native(void)
298
+ {
299
+ rb_define_method(rb_cString, "to_latin", rb_str_to_latin, -1); /* -1: C function takes (argc, argv, self) */
300
+ rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, -1);
301
+ rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, -1);
302
+ rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, -1);
303
+ }
@@ -0,0 +1,2 @@
1
+ require "mkmf" # Makefile generator for C extensions
2
+ create_makefile "byk_native" # target name matches Init_byk_native in the C source
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+
3
+ require "byk_native"
4
+ require "byk/version"
5
+ # Alphabet tables for Serbian Cyrillic (AZBUKA) and Latin (ABECEDA); the specs map one onto the other.
6
+ module Byk
7
+ # Cyrillic letters, lower case then capitals.
8
+ AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
+ AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
+ # Latin letters; ordered differently from AZBUKA, so entries do not pair up by index.
11
+ ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
+ ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
+
14
+ end
@@ -0,0 +1,3 @@
1
+ module Byk
2
+ VERSION = "0.1.0" # gem release version; the spec only checks that it is not nil
3
+ end
@@ -0,0 +1,126 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+ # Specs for the String#to_latin and String#to_ascii_latin extensions.
5
+ describe Byk do
6
+
7
+ # See http://sr.wikipedia.org/wiki/Панграм
8
+ let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
9
+ let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
10
+ let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
11
+
12
+ let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
13
+ let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
14
+ let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
15
+ # Inputs that contain no Serbian Cyrillic and must pass through unchanged.
16
+ let(:ascii) { "The quick brown fox jumps over the lazy dog." }
17
+ let(:other) { "संस्कृतम् saṃskṛtam" }
18
+ # Text mixing other scripts, Latin and Cyrillic; only the Cyrillic parts change.
19
+ let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
20
+ let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
21
+ let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
22
+
23
+ it "has a version number" do
24
+ expect(Byk::VERSION).not_to be nil
25
+ end
26
+
27
+ describe "#to_latin" do
28
+
29
+ it "doesn't modify an empty string" do
30
+ expect("".to_latin).to eq ""
31
+ end
32
+
33
+ it "doesn't modify ASCII text" do
34
+ expect(ascii.to_latin).to eq ascii
35
+ end
36
+
37
+ it "doesn't modify latin" do
38
+ expect(pangram_latin.to_latin).to eq pangram_latin
39
+ end
40
+
41
+ it "doesn't modify other scripts" do
42
+ expect(other.to_latin).to eq other
43
+ end
44
+
45
+ it "converts cyrillic to latin" do
46
+ expect(pangram.to_latin).to eq pangram_latin
47
+ end
48
+
49
+ it "converts cyrillic caps to latin caps" do
50
+ expect(pangram_caps.to_latin).to eq pangram_latin_caps
51
+ end
52
+
53
+ it "converts mixed text properly" do
54
+ expect(mixed.to_latin).to eq mixed_latin
55
+ end
56
+
57
+ it "converts AZBUKA to ABECEDA" do
58
+ expect(Byk::AZBUKA.map(&:to_latin)).to match_array(Byk::ABECEDA) # match_array ignores order; the two tables are sorted differently
59
+ end
60
+
61
+ it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
62
+ expect(Byk::AZBUKA_CAPS.map(&:to_latin)).to match_array(Byk::ABECEDA_CAPS)
63
+ end
64
+ end
65
+
66
+ describe "#to_ascii_latin" do
67
+
68
+ # Special care for Њ, Љ, Ђ, Đ
69
+ let(:edge_cases) {
70
+ {
71
+ "Њ" => "Nj",
72
+ "Љ" => "Lj",
73
+ "Ђ" => "Dj",
74
+ "Đ" => "Dj",
75
+ "ЊЊ" => "NJNJ",
76
+ "ЉЉ" => "LJLJ",
77
+ "ЂЂ" => "DJDJ",
78
+ "ĐĐ" => "DJDJ",
79
+ "ГУЊ" => "GUNJ",
80
+ "ПАСУЉ" => "PASULJ",
81
+ "ЂУРАЂ" => "DJURADJ",
82
+ "ĐURAĐ" => "DJURADJ",
83
+ "ĐURAĐ Đorđević" => "DJURADJ Djordjevic",
84
+ "ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
85
+ }
86
+ }
87
+
88
+ it "doesn't modify an empty string" do
89
+ expect("".to_ascii_latin).to eq ""
90
+ end
91
+
92
+ it "doesn't modify ASCII text" do
93
+ expect(ascii.to_ascii_latin).to eq ascii
94
+ end
95
+
96
+ it "doesn't modify other scripts" do
97
+ expect(other.to_ascii_latin).to eq other
98
+ end
99
+
100
+ it "converts cyrillic to ASCII latin" do
101
+ expect(pangram.to_ascii_latin).to eq pangram_ascii_latin
102
+ end
103
+
104
+ it "converts cyrillic caps to ASCII latin caps" do
105
+ expect(pangram_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
106
+ end
107
+
108
+ it "converts latin to ASCII latin" do
109
+ expect(pangram_latin.to_ascii_latin).to eq pangram_ascii_latin
110
+ end
111
+
112
+ it "converts latin caps to ASCII latin caps" do
113
+ expect(pangram_latin_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
114
+ end
115
+
116
+ it "converts mixed text properly" do
117
+ expect(mixed.to_ascii_latin).to eq mixed_ascii_latin
118
+ end
119
+
120
+ it "converts edge cases properly" do
121
+ edge_cases.each do |input, output| # every input => expected-output pair from the table above
122
+ expect(input.to_ascii_latin).to eq output
123
+ end
124
+ end
125
+ end
126
+ end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: byk
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Nikola Topalović
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake-compiler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.2'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.2'
41
+ description: Provides C-optimized methods for transliteration of Serbian Cyrillic
42
+ into Latin.
43
+ email: nikola.topalovic@gmail.com
44
+ executables: []
45
+ extensions:
46
+ - ext/byk/extconf.rb
47
+ extra_rdoc_files: []
48
+ files:
49
+ - LICENSE
50
+ - README.md
51
+ - ext/byk/byk.c
52
+ - ext/byk/extconf.rb
53
+ - lib/byk.rb
54
+ - lib/byk/version.rb
55
+ - spec/byk_spec.rb
56
+ homepage: https://github.com/topalovic/byk
57
+ licenses:
58
+ - MIT
59
+ metadata: {}
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ requirements: []
75
+ rubyforge_project:
76
+ rubygems_version: 2.2.2
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Fast transliteration of Serbian Cyrillic into Latin.
80
+ test_files:
81
+ - spec/byk_spec.rb