byk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 18a9ea704a27dfc6a2ca4f7fd9b83e05c1f5a57b
4
+ data.tar.gz: b6490d7a249f5ceb6378d92d6ee63af2424074ab
5
+ SHA512:
6
+ metadata.gz: 809f4204b60ec626f15aff3e32b8ca4368e0bcecbda1700b039b3edf086853841a6e94c7ef3e9a47003623b9cde0635d555d1d19ccca28220aa165f7a28c8327
7
+ data.tar.gz: 9a103df7e2976d6ab2574a372479e18f6fccd3c2e005d49e5c464f82befb6942ca19a80091387d67ee85aa98980b2bfe21bf9efada48ad1fec5e817a60d827f8
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Nikola Topalović
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,83 @@
1
+ Byk
2
+ ===
3
+
4
+ Fast transliteration of Serbian Cyrillic into Latin.
5
+
6
+ This package was inspired by @dejan's
7
+ [nice little gem](https://github.com/dejan/srbovanje), but this one
8
+ comes with a C-optimized twist.
9
+
10
+ ## Installation
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ ```ruby
15
+ gem "byk"
16
+ ```
17
+
18
+ And then execute:
19
+
20
+ ```
21
+ $ bundle
22
+ ```
23
+
24
+ Or install it yourself as:
25
+ ```
26
+ $ gem install byk
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ First, make sure to require the gem in your initializer:
32
+
33
+ ```ruby
34
+ require "byk"
35
+ ```
36
+
37
+ This will extend `String` with a couple of simple methods:
38
+
39
+ ```ruby
40
+ "Шеширџија".to_latin # => "Šeširdžija"
41
+ "Шеширџија".to_ascii_latin # => "Sesirdzija"
42
+ "Šeširdžija".to_ascii_latin # => "Sesirdzija"
43
+ ```
44
+
45
+ There's also a destructive version of each:
46
+
47
+ ```ruby
48
+ text = "Жвазбука"
49
+ text.to_latin! # => "Žvazbuka"
50
+ text # => "Žvazbuka"
51
+ text.to_ascii_latin! # => "Zvazbuka"
52
+ text # => "Zvazbuka"
53
+ ```
54
+
55
+ Note that these methods will take into account the
56
+ [special two-letter rules](http://sr.wikipedia.org/wiki/Gajica#Abeceda):
57
+
58
+ ```ruby
59
+ "ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
60
+ ```
61
+
62
+ ## Notes
63
+
64
+ ### How fast is fast?
65
+
66
+ About [7x](benchmark) faster than the baseline Ruby implementation on
67
+ my hardware. YMMV of course.
68
+
69
+ ### Compatibility
70
+
71
+ Byk is supported under MRI Ruby 1.9.3, 2.0, 2.1 and 2.2. Earlier
72
+ versions of MRI are untested.
73
+
74
+ ### Raison d'être
75
+
76
+ For massive transliteration (e.g. sites supporting dual script
77
+ output), this kind of speed-up might be worthwhile, even with caching.
78
+
79
+ Also, it's a well-defined problem with hard-set rules which makes it a
80
+ natural target for optimization. Plus, it gave me an excuse to play
81
+ with Ruby extensions, so there :smile_cat:
82
+
83
+ Уздравље!
@@ -0,0 +1,303 @@
1
+ #include <stdio.h>
2
+ #include <ruby.h>
3
+ #include <ruby/encoding.h>
4
+
5
/* Looks up the rb_encoding for the encoding index stored on string `s`. */
#define STR_ENC_GET(s) rb_enc_from_index(ENCODING_GET(s))
6
+
7
/*
 * Appends one transliterated character to `dest`.
 *
 * When `force_ascii` is true the `len`-byte string `chr` is appended
 * verbatim; otherwise the character with code point `ascii_chr` is encoded
 * in `enc` and appended via str_cat_char().  (Despite its name, `ascii_chr`
 * is the code point used when ASCII output is NOT requested.)
 *
 * This is a statement macro: the two arms call functions with different
 * return types (VALUE vs. void), so they cannot portably be the operands of
 * a `?:` expression.  The do/while(0) wrapper keeps it safe inside unbraced
 * if/else bodies; all arguments are parenthesized.
 */
#define STR_CAT_COND_ASCII(force_ascii, dest, chr, ascii_chr, len, enc) \
    do {                                                                \
        if (force_ascii) {                                              \
            rb_enc_str_buf_cat((dest), (chr), (len), (enc));            \
        }                                                               \
        else {                                                          \
            str_cat_char((dest), (ascii_chr), (enc));                   \
        }                                                               \
    } while (0)
10
+
11
/*
 * Unicode code points of the Serbian letters handled by the transliterator.
 * Only the first member of each consecutive run carries an explicit value;
 * the rest follow by enum auto-increment.
 */
enum {
    /* Latin letters with diacritics */
    LAT_CAP_TJ = 262,   /* U+0106 */
    LAT_TJ,
    LAT_CAP_CH = 268,   /* U+010C */
    LAT_CH,
    LAT_CAP_DJ = 272,   /* U+0110 */
    LAT_DJ,
    LAT_CAP_SH = 352,   /* U+0160 */
    LAT_SH,
    LAT_CAP_ZH = 381,   /* U+017D */
    LAT_ZH,

    /* Cyrillic capitals */
    CYR_CAP_DJ = 1026,  /* U+0402 */
    CYR_CAP_J = 1032,   /* U+0408 */
    CYR_CAP_LJ,
    CYR_CAP_NJ,
    CYR_CAP_TJ,
    CYR_CAP_DZ = 1039,  /* U+040F */
    CYR_CAP_A,
    CYR_CAP_B,
    CYR_CAP_V,
    CYR_CAP_G,
    CYR_CAP_D,
    CYR_CAP_E,
    CYR_CAP_ZH,
    CYR_CAP_Z,
    CYR_CAP_I,
    CYR_CAP_K = 1050,   /* U+041A */
    CYR_CAP_L,
    CYR_CAP_M,
    CYR_CAP_N,
    CYR_CAP_O,
    CYR_CAP_P,
    CYR_CAP_R,
    CYR_CAP_S,
    CYR_CAP_T,
    CYR_CAP_U,
    CYR_CAP_F,
    CYR_CAP_H,
    CYR_CAP_C,
    CYR_CAP_CH,
    CYR_CAP_SH,

    /* Cyrillic small letters */
    CYR_A = 1072,       /* U+0430 */
    CYR_B,
    CYR_V,
    CYR_G,
    CYR_D,
    CYR_E,
    CYR_ZH,
    CYR_Z,
    CYR_I,
    CYR_K = 1082,       /* U+043A */
    CYR_L,
    CYR_M,
    CYR_N,
    CYR_O,
    CYR_P,
    CYR_R,
    CYR_S,
    CYR_T,
    CYR_U,
    CYR_F,
    CYR_H,
    CYR_C,
    CYR_CH,
    CYR_SH,
    CYR_DJ = 1106,      /* U+0452 */
    CYR_J = 1112,       /* U+0458 */
    CYR_LJ,
    CYR_NJ,
    CYR_TJ,
    CYR_DZ = 1119       /* U+045F */
};

/*
 * Returns 1 when code point `c` is treated as an upper-case letter by the
 * transliterator, 0 otherwise.
 *
 * Recognized: ASCII 'A'..'Z', every code point in the inclusive range
 * CYR_CAP_DJ..CYR_CAP_SH, and the five capital Latin letters with
 * diacritics listed in the enum above.
 */
static inline unsigned int
is_upper_case(unsigned int c)
{
    if (c >= 'A' && c <= 'Z') {
        return 1;
    }
    if (c >= CYR_CAP_DJ && c <= CYR_CAP_SH) {
        return 1;
    }

    switch (c) {
    case LAT_CAP_TJ:
    case LAT_CAP_CH:
    case LAT_CAP_DJ:
    case LAT_CAP_SH:
    case LAT_CAP_ZH:
        return 1;
    default:
        return 0;
    }
}
95
+
96
+
97
+ static void
98
+ str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
99
+ {
100
+ char s[16];
101
+ int n = rb_enc_codelen(c, enc);
102
+ rb_enc_mbcput(c, s, enc);
103
+ rb_enc_str_buf_cat(str, s, n, enc);
104
+ }
105
+
106
+ static VALUE
107
+ str_to_latin(int argc, VALUE *argv, VALUE str, int ascii, int bang)
108
+ {
109
+ VALUE dest;
110
+ long dest_len;
111
+ char *pos, *end;
112
+ rb_encoding *enc;
113
+ unsigned int codepoint = 0;
114
+ unsigned int prev_codepoint = 0;
115
+ unsigned int next_codepoint = 0;
116
+
117
+ rb_check_arity(argc, 0, 1);
118
+
119
+ pos = RSTRING_PTR(str);
120
+ if (!pos || RSTRING_LEN(str) == 0) return str;
121
+
122
+ end = RSTRING_END(str);
123
+ enc = STR_ENC_GET(str);
124
+ dest_len = RSTRING_LEN(str) + 30; /* TODO len + margin */
125
+ dest = rb_str_buf_new(dest_len);
126
+ rb_enc_associate(dest, enc);
127
+
128
+ while (pos < end) {
129
+ int len, force_upper = 0;
130
+
131
+ prev_codepoint = codepoint;
132
+ codepoint = rb_enc_codepoint_len(pos, end, &len, enc);
133
+ next_codepoint = 0;
134
+
135
+ force_upper = prev_codepoint && is_upper_case(prev_codepoint);
136
+
137
+ if (!force_upper && (pos + len < end)) {
138
+ /* TODO Trim down to one rb_enc_codepoint call per iter. */
139
+ next_codepoint = rb_enc_codepoint(pos + len, end, enc);
140
+ force_upper = is_upper_case(next_codepoint);
141
+ }
142
+
143
+ /* Latin -> "ASCII latin" conversion */
144
+ if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
145
+ switch (codepoint) {
146
+ case LAT_TJ:
147
+ case LAT_CH: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
148
+ case LAT_DJ: rb_enc_str_buf_cat(dest, "dj", 2, enc); break;
149
+ case LAT_SH: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
150
+ case LAT_ZH: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
151
+ case LAT_CAP_TJ:
152
+ case LAT_CAP_CH: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
153
+ case LAT_CAP_SH: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
154
+ case LAT_CAP_ZH: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
155
+
156
+ case LAT_CAP_DJ:
157
+ force_upper ? rb_enc_str_buf_cat(dest, "DJ", 2, enc)
158
+ : rb_enc_str_buf_cat(dest, "Dj", 2, enc);
159
+ break;
160
+ default:
161
+ rb_enc_str_buf_cat(dest, pos, len, enc);
162
+ }
163
+ pos += len;
164
+ continue;
165
+ }
166
+
167
+ /* Short-circuit for non-cyrillic codepoints */
168
+ if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
169
+ rb_enc_str_buf_cat(dest, pos, len, enc);
170
+ pos += len;
171
+ continue;
172
+ }
173
+
174
+ /* Cyrillic -> latin conversion */
175
+ switch (codepoint) {
176
+ case CYR_CAP_J: rb_enc_str_buf_cat(dest, "J", 1, enc); break;
177
+ case CYR_CAP_A: rb_enc_str_buf_cat(dest, "A", 1, enc); break;
178
+ case CYR_CAP_B: rb_enc_str_buf_cat(dest, "B", 1, enc); break;
179
+ case CYR_CAP_V: rb_enc_str_buf_cat(dest, "V", 1, enc); break;
180
+ case CYR_CAP_G: rb_enc_str_buf_cat(dest, "G", 1, enc); break;
181
+ case CYR_CAP_D: rb_enc_str_buf_cat(dest, "D", 1, enc); break;
182
+ case CYR_CAP_E: rb_enc_str_buf_cat(dest, "E", 1, enc); break;
183
+ case CYR_CAP_Z: rb_enc_str_buf_cat(dest, "Z", 1, enc); break;
184
+ case CYR_CAP_I: rb_enc_str_buf_cat(dest, "I", 1, enc); break;
185
+ case CYR_CAP_K: rb_enc_str_buf_cat(dest, "K", 1, enc); break;
186
+ case CYR_CAP_L: rb_enc_str_buf_cat(dest, "L", 1, enc); break;
187
+ case CYR_CAP_M: rb_enc_str_buf_cat(dest, "M", 1, enc); break;
188
+ case CYR_CAP_N: rb_enc_str_buf_cat(dest, "N", 1, enc); break;
189
+ case CYR_CAP_O: rb_enc_str_buf_cat(dest, "O", 1, enc); break;
190
+ case CYR_CAP_P: rb_enc_str_buf_cat(dest, "P", 1, enc); break;
191
+ case CYR_CAP_R: rb_enc_str_buf_cat(dest, "R", 1, enc); break;
192
+ case CYR_CAP_S: rb_enc_str_buf_cat(dest, "S", 1, enc); break;
193
+ case CYR_CAP_T: rb_enc_str_buf_cat(dest, "T", 1, enc); break;
194
+ case CYR_CAP_U: rb_enc_str_buf_cat(dest, "U", 1, enc); break;
195
+ case CYR_CAP_F: rb_enc_str_buf_cat(dest, "F", 1, enc); break;
196
+ case CYR_CAP_H: rb_enc_str_buf_cat(dest, "H", 1, enc); break;
197
+ case CYR_CAP_C: rb_enc_str_buf_cat(dest, "C", 1, enc); break;
198
+ case CYR_A: rb_enc_str_buf_cat(dest, "a", 1, enc); break;
199
+ case CYR_B: rb_enc_str_buf_cat(dest, "b", 1, enc); break;
200
+ case CYR_V: rb_enc_str_buf_cat(dest, "v", 1, enc); break;
201
+ case CYR_G: rb_enc_str_buf_cat(dest, "g", 1, enc); break;
202
+ case CYR_D: rb_enc_str_buf_cat(dest, "d", 1, enc); break;
203
+ case CYR_E: rb_enc_str_buf_cat(dest, "e", 1, enc); break;
204
+ case CYR_Z: rb_enc_str_buf_cat(dest, "z", 1, enc); break;
205
+ case CYR_I: rb_enc_str_buf_cat(dest, "i", 1, enc); break;
206
+ case CYR_K: rb_enc_str_buf_cat(dest, "k", 1, enc); break;
207
+ case CYR_L: rb_enc_str_buf_cat(dest, "l", 1, enc); break;
208
+ case CYR_M: rb_enc_str_buf_cat(dest, "m", 1, enc); break;
209
+ case CYR_N: rb_enc_str_buf_cat(dest, "n", 1, enc); break;
210
+ case CYR_O: rb_enc_str_buf_cat(dest, "o", 1, enc); break;
211
+ case CYR_P: rb_enc_str_buf_cat(dest, "p", 1, enc); break;
212
+ case CYR_R: rb_enc_str_buf_cat(dest, "r", 1, enc); break;
213
+ case CYR_S: rb_enc_str_buf_cat(dest, "s", 1, enc); break;
214
+ case CYR_T: rb_enc_str_buf_cat(dest, "t", 1, enc); break;
215
+ case CYR_U: rb_enc_str_buf_cat(dest, "u", 1, enc); break;
216
+ case CYR_F: rb_enc_str_buf_cat(dest, "f", 1, enc); break;
217
+ case CYR_H: rb_enc_str_buf_cat(dest, "h", 1, enc); break;
218
+ case CYR_C: rb_enc_str_buf_cat(dest, "c", 1, enc); break;
219
+ case CYR_J: rb_enc_str_buf_cat(dest, "j", 1, enc); break;
220
+ case CYR_LJ: rb_enc_str_buf_cat(dest, "lj", 2, enc); break;
221
+ case CYR_NJ: rb_enc_str_buf_cat(dest, "nj", 2, enc); break;
222
+ case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
223
+ case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
224
+ case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
225
+ case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
226
+ case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
227
+ case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
228
+ case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
229
+ case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
230
+ case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
231
+
232
+ /* Several special cases */
233
+ case CYR_CAP_LJ:
234
+ rb_enc_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2, enc);
235
+ break;
236
+
237
+ case CYR_CAP_NJ:
238
+ rb_enc_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2, enc);
239
+ break;
240
+
241
+ case CYR_CAP_DJ:
242
+ STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc);
243
+ break;
244
+
245
+ case CYR_CAP_DZ:
246
+ rb_enc_str_buf_cat(dest, "D", 1, enc);
247
+ if (force_upper) {
248
+ STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
249
+ }
250
+ else {
251
+ STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
252
+ }
253
+ break;
254
+
255
+ case CYR_DZ:
256
+ rb_enc_str_buf_cat(dest, "d", 1, enc);
257
+ STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
258
+ break;
259
+
260
+ default:
261
+ rb_enc_str_buf_cat(dest, pos, len, enc);
262
+ }
263
+ pos += len;
264
+ }
265
+
266
+ if (bang) {
267
+ rb_str_shared_replace(str, dest);
268
+ }
269
+ else {
270
+ OBJ_INFECT(dest, str);
271
+ str = dest;
272
+ }
273
+
274
+ return str;
275
+ }
276
+
277
+ static VALUE
278
+ rb_str_to_latin(int argc, VALUE *argv, VALUE str) {
279
+ return str_to_latin(argc, argv, str, 0, 0);
280
+ }
281
+
282
+ static VALUE
283
+ rb_str_to_latin_bang(int argc, VALUE *argv, VALUE str) {
284
+ return str_to_latin(argc, argv, str, 0, 1);
285
+ }
286
+
287
+ static VALUE
288
+ rb_str_to_ascii_latin(int argc, VALUE *argv, VALUE str) {
289
+ return str_to_latin(argc, argv, str, 1, 0);
290
+ }
291
+
292
+ static VALUE
293
+ rb_str_to_ascii_latin_bang(int argc, VALUE *argv, VALUE str) {
294
+ return str_to_latin(argc, argv, str, 1, 1);
295
+ }
296
+
297
+ void Init_byk_native(void)
298
+ {
299
+ rb_define_method(rb_cString, "to_latin", rb_str_to_latin, -1);
300
+ rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, -1);
301
+ rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, -1);
302
+ rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, -1);
303
+ }
@@ -0,0 +1,2 @@
1
# Generates the Makefile for the "byk_native" extension.
require "mkmf"

create_makefile("byk_native")
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+
3
+ require "byk_native"
4
+ require "byk/version"
5
+
6
module Byk
  # Serbian Cyrillic alphabet, lower-case and upper-case.
  AZBUKA      = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
  AZBUKA_CAPS = %w[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]

  # Serbian Latin alphabet, lower-case and capitalized (digraphs included).
  ABECEDA      = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
  ABECEDA_CAPS = %w[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
end
@@ -0,0 +1,3 @@
1
module Byk
  # Gem version string.
  VERSION = '0.1.0'
end
@@ -0,0 +1,126 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
describe Byk do
  # Serbian pangram fixtures. See http://sr.wikipedia.org/wiki/Панграм
  let(:pangram)             { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
  let(:pangram_latin)       { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
  let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }

  # The same pangram in capitals.
  let(:pangram_caps)             { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
  let(:pangram_latin_caps)       { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
  let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }

  # Text that must pass through unchanged.
  let(:ascii) { "The quick brown fox jumps over the lazy dog." }
  let(:other) { "संस्कृतम् saṃskṛtam" }

  # Several scripts in one string.
  let(:mixed)             { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
  let(:mixed_latin)       { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
  let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }

  it "has a version number" do
    expect(Byk::VERSION).not_to be(nil)
  end

  describe "#to_latin" do
    it "doesn't modify an empty string" do
      expect("".to_latin).to eq("")
    end

    it "doesn't modify ASCII text" do
      expect(ascii.to_latin).to eq(ascii)
    end

    it "doesn't modify latin" do
      expect(pangram_latin.to_latin).to eq(pangram_latin)
    end

    it "doesn't modify other scripts" do
      expect(other.to_latin).to eq(other)
    end

    it "converts cyrillic to latin" do
      expect(pangram.to_latin).to eq(pangram_latin)
    end

    it "converts cyrillic caps to latin caps" do
      expect(pangram_caps.to_latin).to eq(pangram_latin_caps)
    end

    it "converts mixed text properly" do
      expect(mixed.to_latin).to eq(mixed_latin)
    end

    # The two alphabets are ordered differently, hence the order-insensitive matcher.
    it "converts AZBUKA to ABECEDA" do
      converted = Byk::AZBUKA.map(&:to_latin)
      expect(converted).to match_array(Byk::ABECEDA)
    end

    it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
      converted = Byk::AZBUKA_CAPS.map(&:to_latin)
      expect(converted).to match_array(Byk::ABECEDA_CAPS)
    end
  end

  describe "#to_ascii_latin" do
    # Special care for Њ, Љ, Ђ, Đ
    let(:edge_cases) do
      {
        "Њ" => "Nj",
        "Љ" => "Lj",
        "Ђ" => "Dj",
        "Đ" => "Dj",
        "ЊЊ" => "NJNJ",
        "ЉЉ" => "LJLJ",
        "ЂЂ" => "DJDJ",
        "ĐĐ" => "DJDJ",
        "ГУЊ" => "GUNJ",
        "ПАСУЉ" => "PASULJ",
        "ЂУРАЂ" => "DJURADJ",
        "ĐURAĐ" => "DJURADJ",
        "ĐURAĐ Đorđević" => "DJURADJ Djordjevic",
        "ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
      }
    end

    it "doesn't modify an empty string" do
      expect("".to_ascii_latin).to eq("")
    end

    it "doesn't modify ASCII text" do
      expect(ascii.to_ascii_latin).to eq(ascii)
    end

    it "doesn't modify other scripts" do
      expect(other.to_ascii_latin).to eq(other)
    end

    it "converts cyrillic to ASCII latin" do
      expect(pangram.to_ascii_latin).to eq(pangram_ascii_latin)
    end

    it "converts cyrillic caps to ASCII latin caps" do
      expect(pangram_caps.to_ascii_latin).to eq(pangram_ascii_latin_caps)
    end

    it "converts latin to ASCII latin" do
      expect(pangram_latin.to_ascii_latin).to eq(pangram_ascii_latin)
    end

    it "converts latin caps to ASCII latin caps" do
      expect(pangram_latin_caps.to_ascii_latin).to eq(pangram_ascii_latin_caps)
    end

    it "converts mixed text properly" do
      expect(mixed.to_ascii_latin).to eq(mixed_ascii_latin)
    end

    it "converts edge cases properly" do
      edge_cases.each do |source, expected|
        expect(source.to_ascii_latin).to eq(expected)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: byk
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Nikola Topalović
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake-compiler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.2'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.2'
41
+ description: Provides C-optimized methods for transliteration of Serbian Cyrillic
42
+ into Latin.
43
+ email: nikola.topalovic@gmail.com
44
+ executables: []
45
+ extensions:
46
+ - ext/byk/extconf.rb
47
+ extra_rdoc_files: []
48
+ files:
49
+ - LICENSE
50
+ - README.md
51
+ - ext/byk/byk.c
52
+ - ext/byk/extconf.rb
53
+ - lib/byk.rb
54
+ - lib/byk/version.rb
55
+ - spec/byk_spec.rb
56
+ homepage: https://github.com/topalovic/byk
57
+ licenses:
58
+ - MIT
59
+ metadata: {}
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ requirements: []
75
+ rubyforge_project:
76
+ rubygems_version: 2.2.2
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Fast transliteration of Serbian Cyrillic into Latin.
80
+ test_files:
81
+ - spec/byk_spec.rb