byk 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 70b44d7687698d589bad7ddd888824bb58d4146d
4
- data.tar.gz: a9c6c155c85533e0d16e8515176929faaf456613
3
+ metadata.gz: 34203e0b4291cde495d17da65522df586de7e712
4
+ data.tar.gz: 290d743dab23c58241520252bd81d4ae4115ce98
5
5
  SHA512:
6
- metadata.gz: 4543d2bc442e1bfcbb5bc5030a4cbb3cfdc456e68a7a4da2b623039109f0faa52aad4f335c1856f574d4b51a82f9ba5fb4f97beee8dc364126064df6b3cf70b5
7
- data.tar.gz: a852ce68e4635d1af3f48e0e29c7b987c53f216d63fcbe9102c05ba825eb3c74cc7317bf807d9f9ec687312a5962f73533eaa12e822407a326d3bfcb3d0901b7
6
+ metadata.gz: f11d00e9ac1057a5596804e03c6c4a6c41841bedc21030a9ed776cfbaaabba85a341a62de71c990c8deadc8f8384bf263b41d477b33b299af26b55acef47fe0c
7
+ data.tar.gz: 335ddfeca9f6793f2887c1cc93cfc916e011f7dd01fd97073162871148d0fe61395bdb1115c5ed4f7583ff207f6b6d27462c8920a7a70b016b9427420796bc28
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ### Byk 0.6.0 (2015-04-25)
4
+
5
+ * Introduced module methods and the optional safe require
6
+ * Documented the methods
7
+ * Upgraded spec suite
8
+
3
9
  ### Byk 0.5.0 (2015-04-18)
4
10
 
5
11
  * Performance tuning and refactoring, up to 5x speedup
data/README.md CHANGED
@@ -61,13 +61,40 @@ text # => "Zvazbuka"
61
61
  ```
62
62
 
63
63
  Note that these methods take into account the
64
- [two-letter capitalization rules](http://sr.wikipedia.org/wiki/Gajica#Abeceda):
64
+ [digraph capitalization rules](http://sr.wikipedia.org/wiki/Гајица#.D0.94.D0.B8.D0.B3.D1.80.D0.B0.D1.84.D0.B8):
65
65
 
66
66
  ```ruby
67
67
  "ЉИЉА Љиљановић".to_latin # => "LJILJA Ljiljanović"
68
68
  "ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
69
69
  ```
70
70
 
71
+ If you prefer not to monkey patch your strings, you can use the "safe"
72
+ require:
73
+
74
+ ```ruby
75
+ require "byk/safe"
76
+ ```
77
+
78
+ and then call the module methods:
79
+
80
+ ```ruby
81
+ text = "Вук"
82
+ Byk.to_latin(text) # => "Vuk"
83
+ text # => "Вук"
84
+ Byk.to_latin!(text) # => "Vuk"
85
+ text # => "Vuk"
86
+ ```
87
+
88
+
89
+ ## Testing
90
+
91
+ To test the gem, clone the repo and run:
92
+
93
+ ```
94
+ $ bundle
95
+ $ bundle exec rake
96
+ ```
97
+
71
98
 
72
99
  ## How fast is fast?
73
100
 
@@ -84,7 +111,7 @@ projects, e.g. sites supporting dual script content. Remember,
84
111
 
85
112
  I found transliteration to be a straightforward little problem that
86
113
  lends itself well to optimization. It also gave me an excuse to play
87
- with Ruby extensions, so there :smile_cat:
114
+ with Ruby extensions, so there :smirk_cat:
88
115
 
89
116
 
90
117
  ## Compatibility
@@ -92,10 +119,8 @@ with Ruby extensions, so there :smile_cat:
92
119
  Byk is supported under MRI Ruby >= 1.9.2.
93
120
 
94
121
  I don't plan to support 1.8.7 or older due to substantial C API
95
- changes between 1.8 and 1.9.
96
-
97
- It doesn't build under Rubinius currently, but I intend to support it
98
- in future releases.
122
+ changes between 1.8 and 1.9. It doesn't build under Rubinius
123
+ currently, but I intend to support it in future releases.
99
124
 
100
125
 
101
126
  ## License
@@ -5,91 +5,67 @@
5
5
 
6
6
  #define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc) \
7
7
  ascii ? rb_str_buf_cat(dest, chr, len) \
8
- : str_cat_char(dest, ascii_chr, enc)
8
+ : str_cat_char(dest, ascii_chr, enc)
9
9
 
10
10
  enum {
11
- LAT_CAP_TJ=262,
11
+ LAT_CAP_TJ = 0x106,
12
12
  LAT_TJ,
13
- LAT_CAP_CH=268,
13
+ LAT_CAP_CH = 0x10c,
14
14
  LAT_CH,
15
- LAT_CAP_DJ=272,
15
+ LAT_CAP_DJ = 0x110,
16
16
  LAT_DJ,
17
- LAT_CAP_SH=352,
17
+ LAT_CAP_SH = 0x160,
18
18
  LAT_SH,
19
- LAT_CAP_ZH=381,
19
+ LAT_CAP_ZH = 0x17d,
20
20
  LAT_ZH,
21
- CYR_CAP_DJ=1026,
22
- CYR_CAP_J=1032,
21
+ CYR_CAP_DJ = 0x402,
22
+ CYR_CAP_J = 0x408,
23
23
  CYR_CAP_LJ,
24
24
  CYR_CAP_NJ,
25
25
  CYR_CAP_TJ,
26
- CYR_CAP_DZ=1039,
26
+ CYR_CAP_DZ = 0x40f,
27
27
  CYR_CAP_A,
28
- CYR_CAP_B,
29
- CYR_CAP_V,
30
- CYR_CAP_G,
31
- CYR_CAP_D,
32
- CYR_CAP_E,
33
- CYR_CAP_ZH,
34
- CYR_CAP_Z,
35
- CYR_CAP_I,
36
- CYR_CAP_K=1050,
37
- CYR_CAP_L,
38
- CYR_CAP_M,
39
- CYR_CAP_N,
40
- CYR_CAP_O,
41
- CYR_CAP_P,
42
- CYR_CAP_R,
43
- CYR_CAP_S,
44
- CYR_CAP_T,
45
- CYR_CAP_U,
46
- CYR_CAP_F,
47
- CYR_CAP_H,
48
- CYR_CAP_C,
28
+ CYR_CAP_ZH = 0x416,
29
+ CYR_CAP_C = 0x426,
49
30
  CYR_CAP_CH,
50
31
  CYR_CAP_SH,
51
- CYR_A=1072,
52
- CYR_B,
53
- CYR_V,
54
- CYR_G,
55
- CYR_D,
56
- CYR_E,
57
- CYR_ZH,
58
- CYR_Z,
59
- CYR_I,
60
- CYR_K=1082,
61
- CYR_L,
62
- CYR_M,
63
- CYR_N,
64
- CYR_O,
65
- CYR_P,
66
- CYR_R,
67
- CYR_S,
68
- CYR_T,
69
- CYR_U,
70
- CYR_F,
71
- CYR_H,
72
- CYR_C,
32
+ CYR_A = 0x430,
33
+ CYR_ZH = 0x436,
34
+ CYR_C = 0x446,
73
35
  CYR_CH,
74
36
  CYR_SH,
75
- CYR_DJ=1106,
76
- CYR_J=1112,
37
+ CYR_DJ = 0x452,
38
+ CYR_J = 0x458,
77
39
  CYR_LJ,
78
40
  CYR_NJ,
79
41
  CYR_TJ,
80
- CYR_DZ=1119
42
+ CYR_DZ = 0x45f
81
43
  };
82
44
 
83
45
  static inline unsigned int
84
- is_upper_case(unsigned int c)
46
+ is_cyrillic(unsigned int c)
85
47
  {
86
- return ((c >= 65 && c <= 90)
87
- || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
88
- || c == LAT_CAP_TJ
89
- || c == LAT_CAP_CH
90
- || c == LAT_CAP_DJ
91
- || c == LAT_CAP_SH
92
- || c == LAT_CAP_ZH);
48
+ return c >= CYR_CAP_DJ && c <= CYR_DZ;
49
+ }
50
+
51
+ static inline unsigned int
52
+ is_upper(unsigned int c)
53
+ {
54
+ return (c >= 65 && c <= 90)
55
+ || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
56
+ || c == LAT_CAP_TJ
57
+ || c == LAT_CAP_CH
58
+ || c == LAT_CAP_DJ
59
+ || c == LAT_CAP_SH
60
+ || c == LAT_CAP_ZH;
61
+ }
62
+
63
+ static inline unsigned int
64
+ maps_directly(unsigned int c)
65
+ {
66
+ return c != CYR_ZH
67
+ && c != CYR_CAP_ZH
68
+ && ((c >= CYR_A && c <= CYR_C) || (c >= CYR_CAP_A && c <= CYR_CAP_C));
93
69
  }
94
70
 
95
71
  static void
@@ -109,12 +85,24 @@ str_to_latin(VALUE str, int ascii, int bang)
109
85
  int len, next_len;
110
86
  int seen_upper = 0;
111
87
  int force_upper = 0;
112
- char *pos = RSTRING_PTR(str);
113
- char *end, *seq_start = 0;
88
+ char *pos, *end, *seq_start = 0;
89
+ char cyr;
114
90
  unsigned int codepoint = 0;
115
91
  unsigned int next_codepoint = 0;
116
92
  rb_encoding *enc;
117
93
 
94
+ char CYR_MAP[] = {
95
+ 'a', 'b', 'v', 'g', 'd', 'e', '\0', 'z', 'i', '\0', 'k',
96
+ 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
97
+ };
98
+
99
+ char CYR_CAPS_MAP[] = {
100
+ 'A', 'B', 'V', 'G', 'D', 'E', '\0', 'Z', 'I', '\0', 'K',
101
+ 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
102
+ };
103
+
104
+ StringValue(str);
105
+ pos = RSTRING_PTR(str);
118
106
  if (!pos || RSTRING_LEN(str) == 0) return str;
119
107
 
120
108
  end = RSTRING_END(str);
@@ -133,10 +121,10 @@ str_to_latin(VALUE str, int ascii, int bang)
133
121
  /* Latin -> "ASCII Latin" conversion */
134
122
  if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
135
123
  if (seq_start) {
136
- /* flush the sequence */
137
124
  rb_str_buf_cat(dest, seq_start, pos - seq_start);
138
125
  seq_start = 0;
139
126
  }
127
+
140
128
  switch (codepoint) {
141
129
  case LAT_TJ:
142
130
  case LAT_CH: rb_str_buf_cat(dest, "c", 1); break;
@@ -148,7 +136,7 @@ str_to_latin(VALUE str, int ascii, int bang)
148
136
  case LAT_CAP_SH: rb_str_buf_cat(dest, "S", 1); break;
149
137
  case LAT_CAP_ZH: rb_str_buf_cat(dest, "Z", 1); break;
150
138
  case LAT_CAP_DJ:
151
- (seen_upper || is_upper_case(next_codepoint))
139
+ (seen_upper || is_upper(next_codepoint))
152
140
  ? rb_str_buf_cat(dest, "DJ", 2)
153
141
  : rb_str_buf_cat(dest, "Dj", 2);
154
142
  break;
@@ -157,108 +145,73 @@ str_to_latin(VALUE str, int ascii, int bang)
157
145
  }
158
146
  }
159
147
 
160
- /* Mark a start of inconsequential sequence */
161
- else if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
162
- if (!seq_start)
163
- seq_start = pos;
164
- }
165
-
166
- /* Cyrillic -> Latin conversion */
167
- else {
148
+ /* Cyrillic coderange */
149
+ else if (is_cyrillic(codepoint)) {
168
150
  if (seq_start) {
169
- /* flush the sequence */
170
151
  rb_str_buf_cat(dest, seq_start, pos - seq_start);
171
152
  seq_start = 0;
172
153
  }
173
154
 
174
155
  if (codepoint >= CYR_A) {
175
- switch (codepoint) {
176
- case CYR_A: rb_str_buf_cat(dest, "a", 1); break;
177
- case CYR_B: rb_str_buf_cat(dest, "b", 1); break;
178
- case CYR_V: rb_str_buf_cat(dest, "v", 1); break;
179
- case CYR_G: rb_str_buf_cat(dest, "g", 1); break;
180
- case CYR_D: rb_str_buf_cat(dest, "d", 1); break;
181
- case CYR_E: rb_str_buf_cat(dest, "e", 1); break;
182
- case CYR_Z: rb_str_buf_cat(dest, "z", 1); break;
183
- case CYR_I: rb_str_buf_cat(dest, "i", 1); break;
184
- case CYR_J: rb_str_buf_cat(dest, "j", 1); break;
185
- case CYR_K: rb_str_buf_cat(dest, "k", 1); break;
186
- case CYR_L: rb_str_buf_cat(dest, "l", 1); break;
187
- case CYR_M: rb_str_buf_cat(dest, "m", 1); break;
188
- case CYR_N: rb_str_buf_cat(dest, "n", 1); break;
189
- case CYR_O: rb_str_buf_cat(dest, "o", 1); break;
190
- case CYR_P: rb_str_buf_cat(dest, "p", 1); break;
191
- case CYR_R: rb_str_buf_cat(dest, "r", 1); break;
192
- case CYR_S: rb_str_buf_cat(dest, "s", 1); break;
193
- case CYR_T: rb_str_buf_cat(dest, "t", 1); break;
194
- case CYR_U: rb_str_buf_cat(dest, "u", 1); break;
195
- case CYR_F: rb_str_buf_cat(dest, "f", 1); break;
196
- case CYR_H: rb_str_buf_cat(dest, "h", 1); break;
197
- case CYR_C: rb_str_buf_cat(dest, "c", 1); break;
198
- case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
199
- case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
200
- case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
201
- case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
202
- case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
203
- case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
204
- case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
205
- case CYR_DZ:
206
- rb_str_buf_cat(dest, "d", 1);
207
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
208
- break;
209
- default:
210
- rb_str_buf_cat(dest, pos, len);
156
+ if (maps_directly(codepoint)) {
157
+ cyr = CYR_MAP[codepoint - CYR_A];
158
+ cyr ? rb_str_buf_cat(dest, &cyr, 1)
159
+ : rb_str_buf_cat(dest, pos, len);
160
+ }
161
+ else {
162
+ switch (codepoint) {
163
+ case CYR_J: rb_str_buf_cat(dest, "j", 1); break;
164
+ case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
165
+ case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
166
+ case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
167
+ case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
168
+ case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
169
+ case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
170
+ case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
171
+ case CYR_DZ:
172
+ rb_str_buf_cat(dest, "d", 1);
173
+ STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
174
+ break;
175
+ default:
176
+ rb_str_buf_cat(dest, pos, len);
177
+ }
211
178
  }
212
179
  }
213
180
  else {
214
- force_upper = seen_upper || is_upper_case(next_codepoint);
181
+ if (maps_directly(codepoint)) {
182
+ cyr = CYR_CAPS_MAP[codepoint - CYR_CAP_A];
183
+ cyr ? rb_str_buf_cat(dest, &cyr, 1)
184
+ : rb_str_buf_cat(dest, pos, len);
185
+ }
186
+ else {
187
+ force_upper = seen_upper || is_upper(next_codepoint);
215
188
 
216
- switch (codepoint) {
217
- case CYR_CAP_A: rb_str_buf_cat(dest, "A", 1); break;
218
- case CYR_CAP_B: rb_str_buf_cat(dest, "B", 1); break;
219
- case CYR_CAP_V: rb_str_buf_cat(dest, "V", 1); break;
220
- case CYR_CAP_G: rb_str_buf_cat(dest, "G", 1); break;
221
- case CYR_CAP_D: rb_str_buf_cat(dest, "D", 1); break;
222
- case CYR_CAP_E: rb_str_buf_cat(dest, "E", 1); break;
223
- case CYR_CAP_Z: rb_str_buf_cat(dest, "Z", 1); break;
224
- case CYR_CAP_I: rb_str_buf_cat(dest, "I", 1); break;
225
- case CYR_CAP_J: rb_str_buf_cat(dest, "J", 1); break;
226
- case CYR_CAP_K: rb_str_buf_cat(dest, "K", 1); break;
227
- case CYR_CAP_L: rb_str_buf_cat(dest, "L", 1); break;
228
- case CYR_CAP_M: rb_str_buf_cat(dest, "M", 1); break;
229
- case CYR_CAP_N: rb_str_buf_cat(dest, "N", 1); break;
230
- case CYR_CAP_O: rb_str_buf_cat(dest, "O", 1); break;
231
- case CYR_CAP_P: rb_str_buf_cat(dest, "P", 1); break;
232
- case CYR_CAP_R: rb_str_buf_cat(dest, "R", 1); break;
233
- case CYR_CAP_S: rb_str_buf_cat(dest, "S", 1); break;
234
- case CYR_CAP_T: rb_str_buf_cat(dest, "T", 1); break;
235
- case CYR_CAP_U: rb_str_buf_cat(dest, "U", 1); break;
236
- case CYR_CAP_F: rb_str_buf_cat(dest, "F", 1); break;
237
- case CYR_CAP_H: rb_str_buf_cat(dest, "H", 1); break;
238
- case CYR_CAP_C: rb_str_buf_cat(dest, "C", 1); break;
239
- case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
240
- case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
241
- case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
242
- case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
243
- case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
244
- case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
245
- case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
246
- case CYR_CAP_DZ:
247
- rb_str_buf_cat(dest, "D", 1);
248
- if (force_upper) {
249
- STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
250
- }
251
- else {
252
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
189
+ switch (codepoint) {
190
+ case CYR_CAP_J: rb_str_buf_cat(dest, "J", 1); break;
191
+ case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
192
+ case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
193
+ case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
194
+ case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
195
+ case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
196
+ case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
197
+ case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
198
+ case CYR_CAP_DZ:
199
+ rb_str_buf_cat(dest, "D", 1);
200
+ force_upper ? STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc)
201
+ : STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
202
+ break;
203
+ default:
204
+ rb_str_buf_cat(dest, pos, len);
253
205
  }
254
- break;
255
- default:
256
- rb_str_buf_cat(dest, pos, len);
257
206
  }
258
207
  }
259
208
  }
209
+ else {
210
+ /* Mark the start of a copyable sequence */
211
+ if (!seq_start) seq_start = pos;
212
+ }
260
213
 
261
- seen_upper = is_upper_case(codepoint);
214
+ seen_upper = is_upper(codepoint);
262
215
 
263
216
  pos += len;
264
217
  len = next_len;
@@ -267,8 +220,8 @@ str_to_latin(VALUE str, int ascii, int bang)
267
220
  next_codepoint = 0;
268
221
  }
269
222
 
223
+ /* Flush the last sequence, if any */
270
224
  if (seq_start) {
271
- /* flush the last sequence */
272
225
  rb_str_buf_cat(dest, seq_start, pos - seq_start);
273
226
  }
274
227
 
@@ -283,30 +236,67 @@ str_to_latin(VALUE str, int ascii, int bang)
283
236
  return str;
284
237
  }
285
238
 
239
+ /**
240
+ * Returns a copy of <i>str</i> with the Serbian Cyrillic characters
241
+ * transliterated into Latin.
242
+ *
243
+ * @overload to_latin(str)
244
+ * @param [String] str text to be transliterated
245
+ * @return [String] transliterated text
246
+ */
286
247
  static VALUE
287
- rb_str_to_latin(VALUE str) {
248
+ rb_str_to_latin(VALUE self, VALUE str)
249
+ {
288
250
  return str_to_latin(str, 0, 0);
289
251
  }
290
252
 
253
+ /**
254
+ * Performs the transliteration of <code>Byk.to_latin</code> in place,
255
+ * returning <i>str</i>, whether changes were made or not.
256
+ *
257
+ * @overload to_latin!(str)
258
+ * @param [String] str text to be transliterated
259
+ * @return [String] transliterated text
260
+ */
291
261
  static VALUE
292
- rb_str_to_latin_bang(VALUE str) {
262
+ rb_str_to_latin_bang(VALUE self, VALUE str)
263
+ {
293
264
  return str_to_latin(str, 0, 1);
294
265
  }
295
266
 
267
+ /**
268
+ * Returns a copy of <i>str</i> with the Serbian Cyrillic
269
+ * characters transliterated into ASCII Latin.
270
+ *
271
+ * @overload to_ascii_latin(str)
272
+ * @param [String] str text to be transliterated
273
+ * @return [String] transliterated text
274
+ */
296
275
  static VALUE
297
- rb_str_to_ascii_latin(VALUE str) {
276
+ rb_str_to_ascii_latin(VALUE self, VALUE str)
277
+ {
298
278
  return str_to_latin(str, 1, 0);
299
279
  }
300
280
 
281
+ /**
282
+ * Performs the transliteration of <code>Byk.to_ascii_latin</code> in
283
+ * place, returning <i>str</i>, whether changes were made or not.
284
+ *
285
+ * @overload to_ascii_latin!(str)
286
+ * @param [String] str text to be transliterated
287
+ * @return [String] transliterated text
288
+ */
301
289
  static VALUE
302
- rb_str_to_ascii_latin_bang(VALUE str) {
290
+ rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
291
+ {
303
292
  return str_to_latin(str, 1, 1);
304
293
  }
305
294
 
306
295
  void Init_byk_native(void)
307
296
  {
308
- rb_define_method(rb_cString, "to_latin", rb_str_to_latin, 0);
309
- rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, 0);
310
- rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, 0);
311
- rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 0);
297
+ VALUE Byk = rb_define_module("Byk");
298
+ rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
299
+ rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
300
+ rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
301
+ rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
312
302
  }
data/lib/byk.rb CHANGED
@@ -1,14 +1,2 @@
1
- # coding: utf-8
2
-
3
- require "byk_native"
4
- require "byk/version"
5
-
6
- module Byk
7
-
8
- AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
- AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
-
11
- ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
- ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
-
14
- end
1
+ require "byk/safe"
2
+ require "byk/core_ext/string"
@@ -0,0 +1,8 @@
1
+ class String
2
+
3
+ Byk.singleton_methods.each do |method|
4
+ define_method(method) do
5
+ Byk.send(method, self)
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+
3
+ require "byk_native"
4
+ require "byk/version"
5
+
6
+ module Byk
7
+
8
+ AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
+ AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
+
11
+ ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
+ ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
+
14
+ end
@@ -1,3 +1,3 @@
1
1
  module Byk
2
- VERSION = "0.5.0"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -4,123 +4,180 @@ require "spec_helper"
4
4
 
5
5
  describe Byk do
6
6
 
7
- # See http://sr.wikipedia.org/wiki/Панграм
8
- let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
9
- let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
10
- let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
7
+ it "has a version number" do
8
+ expect(Byk::VERSION).not_to be nil
9
+ end
11
10
 
12
- let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
13
- let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
14
- let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
11
+ shared_examples :base do |method|
15
12
 
16
- let(:ascii) { "The quick brown fox jumps over the lazy dog." }
17
- let(:other) { "संस्कृतम् saṃskṛtam" }
13
+ # See http://sr.wikipedia.org/wiki/Панграм
14
+ let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
15
+ let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
16
+ let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
18
17
 
19
- let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
20
- let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
21
- let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
18
+ let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
19
+ let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
20
+ let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
22
21
 
23
- it "has a version number" do
24
- expect(Byk::VERSION).not_to be nil
25
- end
22
+ let(:full_cyrillic_coderange) { (0x400..0x4ff).map { |i| i.chr(Encoding::UTF_8) } }
23
+ let(:non_serbian_cyrillic_coderange) { full_cyrillic_coderange - Byk::AZBUKA - Byk::AZBUKA_CAPS }
24
+ let(:non_serbian_cyrillic) { non_serbian_cyrillic_coderange.join }
25
+
26
+ let(:ascii) { "The quick brown fox jumps over the lazy dog." }
27
+ let(:other) { "संस्कृतम् saṃskṛtam" }
26
28
 
27
- describe "#to_latin" do
29
+ let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
30
+ let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
31
+ let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
28
32
 
29
- it "doesn't modify an empty string" do
30
- expect("".to_latin).to eq ""
33
+ it "doesn't convert an empty string" do
34
+ expect(Byk.send(method, "")).to eq ""
31
35
  end
32
36
 
33
- it "doesn't modify ASCII text" do
34
- expect(ascii.to_latin).to eq ascii
37
+ it "doesn't convert ASCII text" do
38
+ expect(Byk.send(method, ascii)).to eq ascii
35
39
  end
36
40
 
37
- it "doesn't modify latin" do
38
- expect(pangram_latin.to_latin).to eq pangram_latin
41
+ it "doesn't convert non-Serbian Cyrillic" do
42
+ expect(Byk.send(method, non_serbian_cyrillic)).to eq non_serbian_cyrillic
39
43
  end
40
44
 
41
- it "doesn't modify other scripts" do
42
- expect(other.to_latin).to eq other
45
+ it "doesn't convert other coderanges" do
46
+ expect(Byk.send(method, other)).to eq other
47
+ end
48
+ end
49
+
50
+ shared_examples :latinization_method do |method|
51
+ include_examples :base, method
52
+
53
+ let(:edge_cases) {
54
+ [
55
+ ["Њ", "Nj"],
56
+ ["Љ", "Lj"],
57
+ ["Џ", "Dž"],
58
+ ["ЊЊ", "NJNJ"],
59
+ ["ЉЉ", "LJLJ"],
60
+ ["ЏЏ", "DŽDŽ"]
61
+ ]
62
+ }
63
+
64
+ it "doesn't convert Latin" do
65
+ expect(Byk.send(method, pangram_latin)).to eq pangram_latin
43
66
  end
44
67
 
45
- it "converts cyrillic to latin" do
46
- expect(pangram.to_latin).to eq pangram_latin
68
+ it "converts Cyrillic to Latin" do
69
+ expect(Byk.send(method, pangram)).to eq pangram_latin
47
70
  end
48
71
 
49
- it "converts cyrillic caps to latin caps" do
50
- expect(pangram_caps.to_latin).to eq pangram_latin_caps
72
+ it "converts Cyrillic caps to Latin caps" do
73
+ expect(Byk.send(method, pangram_caps)).to eq pangram_latin_caps
51
74
  end
52
75
 
53
76
  it "converts mixed text properly" do
54
- expect(mixed.to_latin).to eq mixed_latin
77
+ expect(Byk.send(method, mixed)).to eq mixed_latin
78
+ end
79
+
80
+ it "converts edge cases properly" do
81
+ edge_cases.each do |input, output|
82
+ expect(Byk.send(method, input)).to eq output
83
+ end
55
84
  end
56
85
 
57
86
  it "converts AZBUKA to ABECEDA" do
58
- expect(Byk::AZBUKA.map(&:to_latin)).to match_array(Byk::ABECEDA)
87
+ expect(Byk::AZBUKA.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA)
59
88
  end
60
89
 
61
90
  it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
62
- expect(Byk::AZBUKA_CAPS.map(&:to_latin)).to match_array(Byk::ABECEDA_CAPS)
91
+ expect(Byk::AZBUKA_CAPS.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA_CAPS)
63
92
  end
64
93
  end
65
94
 
66
- describe "#to_ascii_latin" do
95
+ shared_examples :ascii_latinization_method do |method|
96
+ include_examples :base, method
67
97
 
68
- # Special care for Њ, Љ, Ђ, Đ
69
98
  let(:edge_cases) {
70
- {
71
- "Њ" => "Nj",
72
- "Љ" => "Lj",
73
- "Ђ" => "Dj",
74
- "Đ" => "Dj",
75
- "ЊЊ" => "NJNJ",
76
- "ЉЉ" => "LJLJ",
77
- "ЂЂ" => "DJDJ",
78
- "ĐĐ" => "DJDJ",
79
- "ГУЊ" => "GUNJ",
80
- "ПАСУЉ" => "PASULJ",
81
- "ЂУРАЂ" => "DJURADJ",
82
- "ĐURAĐ" => "DJURADJ",
83
- "ĐURAĐ Đorđević" => "DJURADJ Djordjevic",
84
- "ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
85
- }
99
+ [
100
+ ["Њ", "Nj"],
101
+ ["Љ", "Lj"],
102
+ ["Џ", "Dz"],
103
+ ["Ђ", "Dj"],
104
+ ["Đ", "Dj"],
105
+ ["ЊЊ", "NJNJ"],
106
+ ["ЉЉ", "LJLJ"],
107
+ ["ЏЏ", "DZDZ"],
108
+ ["ЂЂ", "DJDJ"],
109
+ ["ĐĐ", "DJDJ"],
110
+ ["ЂУРАЂ Ђорђевић", "DJURADJ Djordjevic"],
111
+ ["ĐURAĐ Đorđević", "DJURADJ Djordjevic"]
112
+ ]
86
113
  }
87
114
 
88
- it "doesn't modify an empty string" do
89
- expect("".to_ascii_latin).to eq ""
115
+ it "converts Cyrillic to ASCII Latin" do
116
+ expect(Byk.send(method, pangram)).to eq pangram_ascii_latin
90
117
  end
91
118
 
92
- it "doesn't modify ASCII text" do
93
- expect(ascii.to_ascii_latin).to eq ascii
119
+ it "converts Cyrillic caps to ASCII Latin caps" do
120
+ expect(Byk.send(method, pangram_caps)).to eq pangram_ascii_latin_caps
94
121
  end
95
122
 
96
- it "doesn't modify other scripts" do
97
- expect(other.to_ascii_latin).to eq other
123
+ it "converts Latin to ASCII Latin" do
124
+ expect(Byk.send(method, pangram_latin)).to eq pangram_ascii_latin
98
125
  end
99
126
 
100
- it "converts cyrillic to ASCII latin" do
101
- expect(pangram.to_ascii_latin).to eq pangram_ascii_latin
127
+ it "converts Latin caps to ASCII Latin caps" do
128
+ expect(Byk.send(method, pangram_latin_caps)).to eq pangram_ascii_latin_caps
102
129
  end
103
130
 
104
- it "converts cyrillic caps to ASCII latin caps" do
105
- expect(pangram_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
131
+ it "converts mixed text properly" do
132
+ expect(Byk.send(method, mixed)).to eq mixed_ascii_latin
106
133
  end
107
134
 
108
- it "converts latin to ASCII latin" do
109
- expect(pangram_latin.to_ascii_latin).to eq pangram_ascii_latin
135
+ it "converts edge cases properly" do
136
+ edge_cases.each do |input, output|
137
+ expect(Byk.send(method, input)).to eq output
138
+ end
110
139
  end
140
+ end
111
141
 
112
- it "converts latin caps to ASCII latin caps" do
113
- expect(pangram_latin_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
142
+ shared_examples :non_destructive_method do |method|
143
+ it "doesn't modify the arg" do
144
+ str = "Ж"
145
+ expect { Byk.send(method, str) }.to_not change { str }
114
146
  end
147
+ end
115
148
 
116
- it "converts mixed text properly" do
117
- expect(mixed.to_ascii_latin).to eq mixed_ascii_latin
149
+ shared_examples :destructive_method do |method|
150
+ it "modifies the arg" do
151
+ str = "Ж"
152
+ expect { Byk.send(method, str) }.to change { str }
118
153
  end
154
+ end
119
155
 
120
- it "converts edge cases properly" do
121
- edge_cases.each do |input, output|
122
- expect(input.to_ascii_latin).to eq output
123
- end
156
+ describe ".to_latin" do
157
+ it_behaves_like :latinization_method, :to_latin
158
+ it_behaves_like :non_destructive_method, :to_latin
159
+ end
160
+
161
+ describe ".to_latin!" do
162
+ it_behaves_like :latinization_method, :to_latin!
163
+ it_behaves_like :destructive_method, :to_latin!
164
+ end
165
+
166
+ describe ".to_ascii_latin" do
167
+ it_behaves_like :ascii_latinization_method, :to_ascii_latin
168
+ it_behaves_like :non_destructive_method, :to_ascii_latin
169
+ end
170
+
171
+ describe ".to_ascii_latin!" do
172
+ it_behaves_like :ascii_latinization_method, :to_ascii_latin!
173
+ it_behaves_like :destructive_method, :to_ascii_latin!
174
+ end
175
+ end
176
+
177
+ describe String do
178
+ it "responds to Byk methods" do
179
+ Byk.instance_methods.each do |method|
180
+ expect("").to respond_to(method)
124
181
  end
125
182
  end
126
183
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: byk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nikola Topalović
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-18 00:00:00.000000000 Z
11
+ date: 2015-04-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler
@@ -52,6 +52,8 @@ files:
52
52
  - ext/byk/byk.c
53
53
  - ext/byk/extconf.rb
54
54
  - lib/byk.rb
55
+ - lib/byk/core_ext/string.rb
56
+ - lib/byk/safe.rb
55
57
  - lib/byk/version.rb
56
58
  - spec/byk_spec.rb
57
59
  homepage: https://github.com/topalovic/byk
@@ -74,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
74
76
  version: '0'
75
77
  requirements: []
76
78
  rubyforge_project:
77
- rubygems_version: 2.2.2
79
+ rubygems_version: 2.4.5
78
80
  signing_key:
79
81
  specification_version: 4
80
82
  summary: Fast transliteration of Serbian Cyrillic into Latin.