byk 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 70b44d7687698d589bad7ddd888824bb58d4146d
4
- data.tar.gz: a9c6c155c85533e0d16e8515176929faaf456613
3
+ metadata.gz: 34203e0b4291cde495d17da65522df586de7e712
4
+ data.tar.gz: 290d743dab23c58241520252bd81d4ae4115ce98
5
5
  SHA512:
6
- metadata.gz: 4543d2bc442e1bfcbb5bc5030a4cbb3cfdc456e68a7a4da2b623039109f0faa52aad4f335c1856f574d4b51a82f9ba5fb4f97beee8dc364126064df6b3cf70b5
7
- data.tar.gz: a852ce68e4635d1af3f48e0e29c7b987c53f216d63fcbe9102c05ba825eb3c74cc7317bf807d9f9ec687312a5962f73533eaa12e822407a326d3bfcb3d0901b7
6
+ metadata.gz: f11d00e9ac1057a5596804e03c6c4a6c41841bedc21030a9ed776cfbaaabba85a341a62de71c990c8deadc8f8384bf263b41d477b33b299af26b55acef47fe0c
7
+ data.tar.gz: 335ddfeca9f6793f2887c1cc93cfc916e011f7dd01fd97073162871148d0fe61395bdb1115c5ed4f7583ff207f6b6d27462c8920a7a70b016b9427420796bc28
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ### Byk 0.6.0 (2015-04-25)
4
+
5
+ * Introduced module methods and the optional safe require
6
+ * Documented the methods
7
+ * Upgraded spec suite
8
+
3
9
  ### Byk 0.5.0 (2015-04-18)
4
10
 
5
11
  * Performance tuning and refactoring, up to 5x speedup
data/README.md CHANGED
@@ -61,13 +61,40 @@ text # => "Zvazbuka"
61
61
  ```
62
62
 
63
63
  Note that these methods take into account the
64
- [two-letter capitalization rules](http://sr.wikipedia.org/wiki/Gajica#Abeceda):
64
+ [digraph capitalization rules](http://sr.wikipedia.org/wiki/Гајица#.D0.94.D0.B8.D0.B3.D1.80.D0.B0.D1.84.D0.B8):
65
65
 
66
66
  ```ruby
67
67
  "ЉИЉА Љиљановић".to_latin # => "LJILJA Ljiljanović"
68
68
  "ĐORĐE Đorđević".to_ascii_latin # => "DJORDJE Djordjevic"
69
69
  ```
70
70
 
71
+ If you prefer not to monkey patch your strings, you can use the "safe"
72
+ require:
73
+
74
+ ```ruby
75
+ require "byk/safe"
76
+ ```
77
+
78
+ and then call the module methods:
79
+
80
+ ```ruby
81
+ text = "Вук"
82
+ Byk.to_latin(text) # => "Vuk"
83
+ text # => "Вук"
84
+ Byk.to_latin!(text) # => "Vuk"
85
+ text # => "Vuk"
86
+ ```
87
+
88
+
89
+ ## Testing
90
+
91
+ To test the gem, clone the repo and run:
92
+
93
+ ```
94
+ $ bundle
95
+ $ bundle exec rake
96
+ ```
97
+
71
98
 
72
99
  ## How fast is fast?
73
100
 
@@ -84,7 +111,7 @@ projects, e.g. sites supporting dual script content. Remember,
84
111
 
85
112
  I found transliteration to be a straightforward little problem that
86
113
  lends itself well to optimization. It also gave me an excuse to play
87
- with Ruby extensions, so there :smile_cat:
114
+ with Ruby extensions, so there :smirk_cat:
88
115
 
89
116
 
90
117
  ## Compatibility
@@ -92,10 +119,8 @@ with Ruby extensions, so there :smile_cat:
92
119
  Byk is supported under MRI Ruby >= 1.9.2.
93
120
 
94
121
  I don't plan to support 1.8.7 or older due to substantial C API
95
- changes between 1.8 and 1.9.
96
-
97
- It doesn't build under Rubinius currently, but I intend to support it
98
- in future releases.
122
+ changes between 1.8 and 1.9. It doesn't build under Rubinius
123
+ currently, but I intend to support it in future releases.
99
124
 
100
125
 
101
126
  ## License
@@ -5,91 +5,67 @@
5
5
 
6
6
  #define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc) \
7
7
  ascii ? rb_str_buf_cat(dest, chr, len) \
8
- : str_cat_char(dest, ascii_chr, enc)
8
+ : str_cat_char(dest, ascii_chr, enc)
9
9
 
10
10
  enum {
11
- LAT_CAP_TJ=262,
11
+ LAT_CAP_TJ = 0x106,
12
12
  LAT_TJ,
13
- LAT_CAP_CH=268,
13
+ LAT_CAP_CH = 0x10c,
14
14
  LAT_CH,
15
- LAT_CAP_DJ=272,
15
+ LAT_CAP_DJ = 0x110,
16
16
  LAT_DJ,
17
- LAT_CAP_SH=352,
17
+ LAT_CAP_SH = 0x160,
18
18
  LAT_SH,
19
- LAT_CAP_ZH=381,
19
+ LAT_CAP_ZH = 0x17d,
20
20
  LAT_ZH,
21
- CYR_CAP_DJ=1026,
22
- CYR_CAP_J=1032,
21
+ CYR_CAP_DJ = 0x402,
22
+ CYR_CAP_J = 0x408,
23
23
  CYR_CAP_LJ,
24
24
  CYR_CAP_NJ,
25
25
  CYR_CAP_TJ,
26
- CYR_CAP_DZ=1039,
26
+ CYR_CAP_DZ = 0x40f,
27
27
  CYR_CAP_A,
28
- CYR_CAP_B,
29
- CYR_CAP_V,
30
- CYR_CAP_G,
31
- CYR_CAP_D,
32
- CYR_CAP_E,
33
- CYR_CAP_ZH,
34
- CYR_CAP_Z,
35
- CYR_CAP_I,
36
- CYR_CAP_K=1050,
37
- CYR_CAP_L,
38
- CYR_CAP_M,
39
- CYR_CAP_N,
40
- CYR_CAP_O,
41
- CYR_CAP_P,
42
- CYR_CAP_R,
43
- CYR_CAP_S,
44
- CYR_CAP_T,
45
- CYR_CAP_U,
46
- CYR_CAP_F,
47
- CYR_CAP_H,
48
- CYR_CAP_C,
28
+ CYR_CAP_ZH = 0x416,
29
+ CYR_CAP_C = 0x426,
49
30
  CYR_CAP_CH,
50
31
  CYR_CAP_SH,
51
- CYR_A=1072,
52
- CYR_B,
53
- CYR_V,
54
- CYR_G,
55
- CYR_D,
56
- CYR_E,
57
- CYR_ZH,
58
- CYR_Z,
59
- CYR_I,
60
- CYR_K=1082,
61
- CYR_L,
62
- CYR_M,
63
- CYR_N,
64
- CYR_O,
65
- CYR_P,
66
- CYR_R,
67
- CYR_S,
68
- CYR_T,
69
- CYR_U,
70
- CYR_F,
71
- CYR_H,
72
- CYR_C,
32
+ CYR_A = 0x430,
33
+ CYR_ZH = 0x436,
34
+ CYR_C = 0x446,
73
35
  CYR_CH,
74
36
  CYR_SH,
75
- CYR_DJ=1106,
76
- CYR_J=1112,
37
+ CYR_DJ = 0x452,
38
+ CYR_J = 0x458,
77
39
  CYR_LJ,
78
40
  CYR_NJ,
79
41
  CYR_TJ,
80
- CYR_DZ=1119
42
+ CYR_DZ = 0x45f
81
43
  };
82
44
 
83
45
  static inline unsigned int
84
- is_upper_case(unsigned int c)
46
+ is_cyrillic(unsigned int c)
85
47
  {
86
- return ((c >= 65 && c <= 90)
87
- || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
88
- || c == LAT_CAP_TJ
89
- || c == LAT_CAP_CH
90
- || c == LAT_CAP_DJ
91
- || c == LAT_CAP_SH
92
- || c == LAT_CAP_ZH);
48
+ return c >= CYR_CAP_DJ && c <= CYR_DZ;
49
+ }
50
+
51
+ static inline unsigned int
52
+ is_upper(unsigned int c)
53
+ {
54
+ return (c >= 65 && c <= 90)
55
+ || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
56
+ || c == LAT_CAP_TJ
57
+ || c == LAT_CAP_CH
58
+ || c == LAT_CAP_DJ
59
+ || c == LAT_CAP_SH
60
+ || c == LAT_CAP_ZH;
61
+ }
62
+
63
+ static inline unsigned int
64
+ maps_directly(unsigned int c)
65
+ {
66
+ return c != CYR_ZH
67
+ && c != CYR_CAP_ZH
68
+ && ((c >= CYR_A && c <= CYR_C) || (c >= CYR_CAP_A && c <= CYR_CAP_C));
93
69
  }
94
70
 
95
71
  static void
@@ -109,12 +85,24 @@ str_to_latin(VALUE str, int ascii, int bang)
109
85
  int len, next_len;
110
86
  int seen_upper = 0;
111
87
  int force_upper = 0;
112
- char *pos = RSTRING_PTR(str);
113
- char *end, *seq_start = 0;
88
+ char *pos, *end, *seq_start = 0;
89
+ char cyr;
114
90
  unsigned int codepoint = 0;
115
91
  unsigned int next_codepoint = 0;
116
92
  rb_encoding *enc;
117
93
 
94
+ char CYR_MAP[] = {
95
+ 'a', 'b', 'v', 'g', 'd', 'e', '\0', 'z', 'i', '\0', 'k',
96
+ 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
97
+ };
98
+
99
+ char CYR_CAPS_MAP[] = {
100
+ 'A', 'B', 'V', 'G', 'D', 'E', '\0', 'Z', 'I', '\0', 'K',
101
+ 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
102
+ };
103
+
104
+ StringValue(str);
105
+ pos = RSTRING_PTR(str);
118
106
  if (!pos || RSTRING_LEN(str) == 0) return str;
119
107
 
120
108
  end = RSTRING_END(str);
@@ -133,10 +121,10 @@ str_to_latin(VALUE str, int ascii, int bang)
133
121
  /* Latin -> "ASCII Latin" conversion */
134
122
  if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
135
123
  if (seq_start) {
136
- /* flush the sequence */
137
124
  rb_str_buf_cat(dest, seq_start, pos - seq_start);
138
125
  seq_start = 0;
139
126
  }
127
+
140
128
  switch (codepoint) {
141
129
  case LAT_TJ:
142
130
  case LAT_CH: rb_str_buf_cat(dest, "c", 1); break;
@@ -148,7 +136,7 @@ str_to_latin(VALUE str, int ascii, int bang)
148
136
  case LAT_CAP_SH: rb_str_buf_cat(dest, "S", 1); break;
149
137
  case LAT_CAP_ZH: rb_str_buf_cat(dest, "Z", 1); break;
150
138
  case LAT_CAP_DJ:
151
- (seen_upper || is_upper_case(next_codepoint))
139
+ (seen_upper || is_upper(next_codepoint))
152
140
  ? rb_str_buf_cat(dest, "DJ", 2)
153
141
  : rb_str_buf_cat(dest, "Dj", 2);
154
142
  break;
@@ -157,108 +145,73 @@ str_to_latin(VALUE str, int ascii, int bang)
157
145
  }
158
146
  }
159
147
 
160
- /* Mark a start of inconsequential sequence */
161
- else if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
162
- if (!seq_start)
163
- seq_start = pos;
164
- }
165
-
166
- /* Cyrillic -> Latin conversion */
167
- else {
148
+ /* Cyrillic coderange */
149
+ else if (is_cyrillic(codepoint)) {
168
150
  if (seq_start) {
169
- /* flush the sequence */
170
151
  rb_str_buf_cat(dest, seq_start, pos - seq_start);
171
152
  seq_start = 0;
172
153
  }
173
154
 
174
155
  if (codepoint >= CYR_A) {
175
- switch (codepoint) {
176
- case CYR_A: rb_str_buf_cat(dest, "a", 1); break;
177
- case CYR_B: rb_str_buf_cat(dest, "b", 1); break;
178
- case CYR_V: rb_str_buf_cat(dest, "v", 1); break;
179
- case CYR_G: rb_str_buf_cat(dest, "g", 1); break;
180
- case CYR_D: rb_str_buf_cat(dest, "d", 1); break;
181
- case CYR_E: rb_str_buf_cat(dest, "e", 1); break;
182
- case CYR_Z: rb_str_buf_cat(dest, "z", 1); break;
183
- case CYR_I: rb_str_buf_cat(dest, "i", 1); break;
184
- case CYR_J: rb_str_buf_cat(dest, "j", 1); break;
185
- case CYR_K: rb_str_buf_cat(dest, "k", 1); break;
186
- case CYR_L: rb_str_buf_cat(dest, "l", 1); break;
187
- case CYR_M: rb_str_buf_cat(dest, "m", 1); break;
188
- case CYR_N: rb_str_buf_cat(dest, "n", 1); break;
189
- case CYR_O: rb_str_buf_cat(dest, "o", 1); break;
190
- case CYR_P: rb_str_buf_cat(dest, "p", 1); break;
191
- case CYR_R: rb_str_buf_cat(dest, "r", 1); break;
192
- case CYR_S: rb_str_buf_cat(dest, "s", 1); break;
193
- case CYR_T: rb_str_buf_cat(dest, "t", 1); break;
194
- case CYR_U: rb_str_buf_cat(dest, "u", 1); break;
195
- case CYR_F: rb_str_buf_cat(dest, "f", 1); break;
196
- case CYR_H: rb_str_buf_cat(dest, "h", 1); break;
197
- case CYR_C: rb_str_buf_cat(dest, "c", 1); break;
198
- case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
199
- case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
200
- case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
201
- case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
202
- case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
203
- case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
204
- case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
205
- case CYR_DZ:
206
- rb_str_buf_cat(dest, "d", 1);
207
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
208
- break;
209
- default:
210
- rb_str_buf_cat(dest, pos, len);
156
+ if (maps_directly(codepoint)) {
157
+ cyr = CYR_MAP[codepoint - CYR_A];
158
+ cyr ? rb_str_buf_cat(dest, &cyr, 1)
159
+ : rb_str_buf_cat(dest, pos, len);
160
+ }
161
+ else {
162
+ switch (codepoint) {
163
+ case CYR_J: rb_str_buf_cat(dest, "j", 1); break;
164
+ case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
165
+ case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
166
+ case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
167
+ case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_TJ, 1, enc); break;
168
+ case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c", LAT_CH, 1, enc); break;
169
+ case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s", LAT_SH, 1, enc); break;
170
+ case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc); break;
171
+ case CYR_DZ:
172
+ rb_str_buf_cat(dest, "d", 1);
173
+ STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
174
+ break;
175
+ default:
176
+ rb_str_buf_cat(dest, pos, len);
177
+ }
211
178
  }
212
179
  }
213
180
  else {
214
- force_upper = seen_upper || is_upper_case(next_codepoint);
181
+ if (maps_directly(codepoint)) {
182
+ cyr = CYR_CAPS_MAP[codepoint - CYR_CAP_A];
183
+ cyr ? rb_str_buf_cat(dest, &cyr, 1)
184
+ : rb_str_buf_cat(dest, pos, len);
185
+ }
186
+ else {
187
+ force_upper = seen_upper || is_upper(next_codepoint);
215
188
 
216
- switch (codepoint) {
217
- case CYR_CAP_A: rb_str_buf_cat(dest, "A", 1); break;
218
- case CYR_CAP_B: rb_str_buf_cat(dest, "B", 1); break;
219
- case CYR_CAP_V: rb_str_buf_cat(dest, "V", 1); break;
220
- case CYR_CAP_G: rb_str_buf_cat(dest, "G", 1); break;
221
- case CYR_CAP_D: rb_str_buf_cat(dest, "D", 1); break;
222
- case CYR_CAP_E: rb_str_buf_cat(dest, "E", 1); break;
223
- case CYR_CAP_Z: rb_str_buf_cat(dest, "Z", 1); break;
224
- case CYR_CAP_I: rb_str_buf_cat(dest, "I", 1); break;
225
- case CYR_CAP_J: rb_str_buf_cat(dest, "J", 1); break;
226
- case CYR_CAP_K: rb_str_buf_cat(dest, "K", 1); break;
227
- case CYR_CAP_L: rb_str_buf_cat(dest, "L", 1); break;
228
- case CYR_CAP_M: rb_str_buf_cat(dest, "M", 1); break;
229
- case CYR_CAP_N: rb_str_buf_cat(dest, "N", 1); break;
230
- case CYR_CAP_O: rb_str_buf_cat(dest, "O", 1); break;
231
- case CYR_CAP_P: rb_str_buf_cat(dest, "P", 1); break;
232
- case CYR_CAP_R: rb_str_buf_cat(dest, "R", 1); break;
233
- case CYR_CAP_S: rb_str_buf_cat(dest, "S", 1); break;
234
- case CYR_CAP_T: rb_str_buf_cat(dest, "T", 1); break;
235
- case CYR_CAP_U: rb_str_buf_cat(dest, "U", 1); break;
236
- case CYR_CAP_F: rb_str_buf_cat(dest, "F", 1); break;
237
- case CYR_CAP_H: rb_str_buf_cat(dest, "H", 1); break;
238
- case CYR_CAP_C: rb_str_buf_cat(dest, "C", 1); break;
239
- case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
240
- case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
241
- case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
242
- case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
243
- case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
244
- case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
245
- case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
246
- case CYR_CAP_DZ:
247
- rb_str_buf_cat(dest, "D", 1);
248
- if (force_upper) {
249
- STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
250
- }
251
- else {
252
- STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
189
+ switch (codepoint) {
190
+ case CYR_CAP_J: rb_str_buf_cat(dest, "J", 1); break;
191
+ case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
192
+ case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
193
+ case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
194
+ case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
195
+ case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
196
+ case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
197
+ case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
198
+ case CYR_CAP_DZ:
199
+ rb_str_buf_cat(dest, "D", 1);
200
+ force_upper ? STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc)
201
+ : STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
202
+ break;
203
+ default:
204
+ rb_str_buf_cat(dest, pos, len);
253
205
  }
254
- break;
255
- default:
256
- rb_str_buf_cat(dest, pos, len);
257
206
  }
258
207
  }
259
208
  }
209
+ else {
210
+ /* Mark the start of a copyable sequence */
211
+ if (!seq_start) seq_start = pos;
212
+ }
260
213
 
261
- seen_upper = is_upper_case(codepoint);
214
+ seen_upper = is_upper(codepoint);
262
215
 
263
216
  pos += len;
264
217
  len = next_len;
@@ -267,8 +220,8 @@ str_to_latin(VALUE str, int ascii, int bang)
267
220
  next_codepoint = 0;
268
221
  }
269
222
 
223
+ /* Flush the last sequence, if any */
270
224
  if (seq_start) {
271
- /* flush the last sequence */
272
225
  rb_str_buf_cat(dest, seq_start, pos - seq_start);
273
226
  }
274
227
 
@@ -283,30 +236,67 @@ str_to_latin(VALUE str, int ascii, int bang)
283
236
  return str;
284
237
  }
285
238
 
239
+ /**
240
+ * Returns a copy of <i>str</i> with the Serbian Cyrillic characters
241
+ * transliterated into Latin.
242
+ *
243
+ * @overload to_latin(str)
244
+ * @param [String] str text to be transliterated
245
+ * @return [String] transliterated text
246
+ */
286
247
  static VALUE
287
- rb_str_to_latin(VALUE str) {
248
+ rb_str_to_latin(VALUE self, VALUE str)
249
+ {
288
250
  return str_to_latin(str, 0, 0);
289
251
  }
290
252
 
253
+ /**
254
+ * Performs the transliteration of <code>Byk.to_latin</code> in place,
255
+ * returning <i>str</i>, whether changes were made or not.
256
+ *
257
+ * @overload to_latin!(str)
258
+ * @param [String] str text to be transliterated
259
+ * @return [String] transliterated text
260
+ */
291
261
  static VALUE
292
- rb_str_to_latin_bang(VALUE str) {
262
+ rb_str_to_latin_bang(VALUE self, VALUE str)
263
+ {
293
264
  return str_to_latin(str, 0, 1);
294
265
  }
295
266
 
267
+ /**
268
+ * Returns a copy of <i>str</i> with the Serbian Cyrillic
269
+ * characters transliterated into ASCII Latin.
270
+ *
271
+ * @overload to_ascii_latin(str)
272
+ * @param [String] str text to be transliterated
273
+ * @return [String] transliterated text
274
+ */
296
275
  static VALUE
297
- rb_str_to_ascii_latin(VALUE str) {
276
+ rb_str_to_ascii_latin(VALUE self, VALUE str)
277
+ {
298
278
  return str_to_latin(str, 1, 0);
299
279
  }
300
280
 
281
+ /**
282
+ * Performs the transliteration of <code>Byk.to_ascii_latin</code> in
283
+ * place, returning <i>str</i>, whether changes were made or not.
284
+ *
285
+ * @overload to_ascii_latin!(str)
286
+ * @param [String] str text to be transliterated
287
+ * @return [String] transliterated text
288
+ */
301
289
  static VALUE
302
- rb_str_to_ascii_latin_bang(VALUE str) {
290
+ rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
291
+ {
303
292
  return str_to_latin(str, 1, 1);
304
293
  }
305
294
 
306
295
  void Init_byk_native(void)
307
296
  {
308
- rb_define_method(rb_cString, "to_latin", rb_str_to_latin, 0);
309
- rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, 0);
310
- rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, 0);
311
- rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 0);
297
+ VALUE Byk = rb_define_module("Byk");
298
+ rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
299
+ rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
300
+ rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
301
+ rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
312
302
  }
data/lib/byk.rb CHANGED
@@ -1,14 +1,2 @@
1
- # coding: utf-8
2
-
3
- require "byk_native"
4
- require "byk/version"
5
-
6
- module Byk
7
-
8
- AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
- AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
-
11
- ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
- ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
-
14
- end
1
+ require "byk/safe"
2
+ require "byk/core_ext/string"
@@ -0,0 +1,8 @@
1
+ class String
2
+
3
+ Byk.singleton_methods.each do |method|
4
+ define_method(method) do
5
+ Byk.send(method, self)
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+
3
+ require "byk_native"
4
+ require "byk/version"
5
+
6
+ module Byk
7
+
8
+ AZBUKA = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
9
+ AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
10
+
11
+ ABECEDA = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
12
+ ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
13
+
14
+ end
@@ -1,3 +1,3 @@
1
1
  module Byk
2
- VERSION = "0.5.0"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -4,123 +4,180 @@ require "spec_helper"
4
4
 
5
5
  describe Byk do
6
6
 
7
- # See http://sr.wikipedia.org/wiki/Панграм
8
- let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
9
- let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
10
- let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
7
+ it "has a version number" do
8
+ expect(Byk::VERSION).not_to be nil
9
+ end
11
10
 
12
- let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
13
- let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
14
- let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
11
+ shared_examples :base do |method|
15
12
 
16
- let(:ascii) { "The quick brown fox jumps over the lazy dog." }
17
- let(:other) { "संस्कृतम् saṃskṛtam" }
13
+ # See http://sr.wikipedia.org/wiki/Панграм
14
+ let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
15
+ let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
16
+ let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
18
17
 
19
- let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
20
- let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
21
- let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
18
+ let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
19
+ let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
20
+ let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
22
21
 
23
- it "has a version number" do
24
- expect(Byk::VERSION).not_to be nil
25
- end
22
+ let(:full_cyrillic_coderange) { (0x400..0x4ff).map { |i| i.chr(Encoding::UTF_8) } }
23
+ let(:non_serbian_cyrillic_coderange) { full_cyrillic_coderange - Byk::AZBUKA - Byk::AZBUKA_CAPS }
24
+ let(:non_serbian_cyrillic) { non_serbian_cyrillic_coderange.join }
25
+
26
+ let(:ascii) { "The quick brown fox jumps over the lazy dog." }
27
+ let(:other) { "संस्कृतम् saṃskṛtam" }
26
28
 
27
- describe "#to_latin" do
29
+ let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
30
+ let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
31
+ let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
28
32
 
29
- it "doesn't modify an empty string" do
30
- expect("".to_latin).to eq ""
33
+ it "doesn't convert an empty string" do
34
+ expect(Byk.send(method, "")).to eq ""
31
35
  end
32
36
 
33
- it "doesn't modify ASCII text" do
34
- expect(ascii.to_latin).to eq ascii
37
+ it "doesn't convert ASCII text" do
38
+ expect(Byk.send(method, ascii)).to eq ascii
35
39
  end
36
40
 
37
- it "doesn't modify latin" do
38
- expect(pangram_latin.to_latin).to eq pangram_latin
41
+ it "doesn't convert non-Serbian Cyrillic" do
42
+ expect(Byk.send(method, non_serbian_cyrillic)).to eq non_serbian_cyrillic
39
43
  end
40
44
 
41
- it "doesn't modify other scripts" do
42
- expect(other.to_latin).to eq other
45
+ it "doesn't convert other coderanges" do
46
+ expect(Byk.send(method, other)).to eq other
47
+ end
48
+ end
49
+
50
+ shared_examples :latinization_method do |method|
51
+ include_examples :base, method
52
+
53
+ let(:edge_cases) {
54
+ [
55
+ ["Њ", "Nj"],
56
+ ["Љ", "Lj"],
57
+ ["Џ", "Dž"],
58
+ ["ЊЊ", "NJNJ"],
59
+ ["ЉЉ", "LJLJ"],
60
+ ["ЏЏ", "DŽDŽ"]
61
+ ]
62
+ }
63
+
64
+ it "doesn't convert Latin" do
65
+ expect(Byk.send(method, pangram_latin)).to eq pangram_latin
43
66
  end
44
67
 
45
- it "converts cyrillic to latin" do
46
- expect(pangram.to_latin).to eq pangram_latin
68
+ it "converts Cyrillic to Latin" do
69
+ expect(Byk.send(method, pangram)).to eq pangram_latin
47
70
  end
48
71
 
49
- it "converts cyrillic caps to latin caps" do
50
- expect(pangram_caps.to_latin).to eq pangram_latin_caps
72
+ it "converts Cyrillic caps to Latin caps" do
73
+ expect(Byk.send(method, pangram_caps)).to eq pangram_latin_caps
51
74
  end
52
75
 
53
76
  it "converts mixed text properly" do
54
- expect(mixed.to_latin).to eq mixed_latin
77
+ expect(Byk.send(method, mixed)).to eq mixed_latin
78
+ end
79
+
80
+ it "converts edge cases properly" do
81
+ edge_cases.each do |input, output|
82
+ expect(Byk.send(method, input)).to eq output
83
+ end
55
84
  end
56
85
 
57
86
  it "converts AZBUKA to ABECEDA" do
58
- expect(Byk::AZBUKA.map(&:to_latin)).to match_array(Byk::ABECEDA)
87
+ expect(Byk::AZBUKA.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA)
59
88
  end
60
89
 
61
90
  it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
62
- expect(Byk::AZBUKA_CAPS.map(&:to_latin)).to match_array(Byk::ABECEDA_CAPS)
91
+ expect(Byk::AZBUKA_CAPS.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA_CAPS)
63
92
  end
64
93
  end
65
94
 
66
- describe "#to_ascii_latin" do
95
+ shared_examples :ascii_latinization_method do |method|
96
+ include_examples :base, method
67
97
 
68
- # Special care for Њ, Љ, Ђ, Đ
69
98
  let(:edge_cases) {
70
- {
71
- "Њ" => "Nj",
72
- "Љ" => "Lj",
73
- "Ђ" => "Dj",
74
- "Đ" => "Dj",
75
- "ЊЊ" => "NJNJ",
76
- "ЉЉ" => "LJLJ",
77
- "ЂЂ" => "DJDJ",
78
- "ĐĐ" => "DJDJ",
79
- "ГУЊ" => "GUNJ",
80
- "ПАСУЉ" => "PASULJ",
81
- "ЂУРАЂ" => "DJURADJ",
82
- "ĐURAĐ" => "DJURADJ",
83
- "ĐURAĐ Đorđević" => "DJURADJ Djordjevic",
84
- "ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
85
- }
99
+ [
100
+ ["Њ", "Nj"],
101
+ ["Љ", "Lj"],
102
+ ["Џ", "Dz"],
103
+ ["Ђ", "Dj"],
104
+ ["Đ", "Dj"],
105
+ ["ЊЊ", "NJNJ"],
106
+ ["ЉЉ", "LJLJ"],
107
+ ["ЏЏ", "DZDZ"],
108
+ ["ЂЂ", "DJDJ"],
109
+ ["ĐĐ", "DJDJ"],
110
+ ["ЂУРАЂ Ђорђевић", "DJURADJ Djordjevic"],
111
+ ["ĐURAĐ Đorđević", "DJURADJ Djordjevic"]
112
+ ]
86
113
  }
87
114
 
88
- it "doesn't modify an empty string" do
89
- expect("".to_ascii_latin).to eq ""
115
+ it "converts Cyrillic to ASCII Latin" do
116
+ expect(Byk.send(method, pangram)).to eq pangram_ascii_latin
90
117
  end
91
118
 
92
- it "doesn't modify ASCII text" do
93
- expect(ascii.to_ascii_latin).to eq ascii
119
+ it "converts Cyrillic caps to ASCII Latin caps" do
120
+ expect(Byk.send(method, pangram_caps)).to eq pangram_ascii_latin_caps
94
121
  end
95
122
 
96
- it "doesn't modify other scripts" do
97
- expect(other.to_ascii_latin).to eq other
123
+ it "converts Latin to ASCII Latin" do
124
+ expect(Byk.send(method, pangram_latin)).to eq pangram_ascii_latin
98
125
  end
99
126
 
100
- it "converts cyrillic to ASCII latin" do
101
- expect(pangram.to_ascii_latin).to eq pangram_ascii_latin
127
+ it "converts Latin caps to ASCII Latin caps" do
128
+ expect(Byk.send(method, pangram_latin_caps)).to eq pangram_ascii_latin_caps
102
129
  end
103
130
 
104
- it "converts cyrillic caps to ASCII latin caps" do
105
- expect(pangram_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
131
+ it "converts mixed text properly" do
132
+ expect(Byk.send(method, mixed)).to eq mixed_ascii_latin
106
133
  end
107
134
 
108
- it "converts latin to ASCII latin" do
109
- expect(pangram_latin.to_ascii_latin).to eq pangram_ascii_latin
135
+ it "converts edge cases properly" do
136
+ edge_cases.each do |input, output|
137
+ expect(Byk.send(method, input)).to eq output
138
+ end
110
139
  end
140
+ end
111
141
 
112
- it "converts latin caps to ASCII latin caps" do
113
- expect(pangram_latin_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
142
+ shared_examples :non_destructive_method do |method|
143
+ it "doesn't modify the arg" do
144
+ str = "Ж"
145
+ expect { Byk.send(method, str) }.to_not change { str }
114
146
  end
147
+ end
115
148
 
116
- it "converts mixed text properly" do
117
- expect(mixed.to_ascii_latin).to eq mixed_ascii_latin
149
+ shared_examples :destructive_method do |method|
150
+ it "modifies the arg" do
151
+ str = "Ж"
152
+ expect { Byk.send(method, str) }.to change { str }
118
153
  end
154
+ end
119
155
 
120
- it "converts edge cases properly" do
121
- edge_cases.each do |input, output|
122
- expect(input.to_ascii_latin).to eq output
123
- end
156
+ describe ".to_latin" do
157
+ it_behaves_like :latinization_method, :to_latin
158
+ it_behaves_like :non_destructive_method, :to_latin
159
+ end
160
+
161
+ describe ".to_latin!" do
162
+ it_behaves_like :latinization_method, :to_latin!
163
+ it_behaves_like :destructive_method, :to_latin!
164
+ end
165
+
166
+ describe ".to_ascii_latin" do
167
+ it_behaves_like :ascii_latinization_method, :to_ascii_latin
168
+ it_behaves_like :non_destructive_method, :to_ascii_latin
169
+ end
170
+
171
+ describe ".to_ascii_latin!" do
172
+ it_behaves_like :ascii_latinization_method, :to_ascii_latin!
173
+ it_behaves_like :destructive_method, :to_ascii_latin!
174
+ end
175
+ end
176
+
177
+ describe String do
178
+ it "responds to Byk methods" do
179
+ Byk.singleton_methods.each do |method|
180
+ expect("").to respond_to(method)
124
181
  end
125
182
  end
126
183
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: byk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nikola Topalović
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-18 00:00:00.000000000 Z
11
+ date: 2015-04-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler
@@ -52,6 +52,8 @@ files:
52
52
  - ext/byk/byk.c
53
53
  - ext/byk/extconf.rb
54
54
  - lib/byk.rb
55
+ - lib/byk/core_ext/string.rb
56
+ - lib/byk/safe.rb
55
57
  - lib/byk/version.rb
56
58
  - spec/byk_spec.rb
57
59
  homepage: https://github.com/topalovic/byk
@@ -74,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
74
76
  version: '0'
75
77
  requirements: []
76
78
  rubyforge_project:
77
- rubygems_version: 2.2.2
79
+ rubygems_version: 2.4.5
78
80
  signing_key:
79
81
  specification_version: 4
80
82
  summary: Fast transliteration of Serbian Cyrillic into Latin.