RubyGems - byk - Versions diffs - 0.5.0 → 0.6.0 - Mend

byk 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 70b44d7687698d589bad7ddd888824bb58d4146d
-  data.tar.gz: a9c6c155c85533e0d16e8515176929faaf456613
+  metadata.gz: 34203e0b4291cde495d17da65522df586de7e712
+  data.tar.gz: 290d743dab23c58241520252bd81d4ae4115ce98
 SHA512:
-  metadata.gz: 4543d2bc442e1bfcbb5bc5030a4cbb3cfdc456e68a7a4da2b623039109f0faa52aad4f335c1856f574d4b51a82f9ba5fb4f97beee8dc364126064df6b3cf70b5
-  data.tar.gz: a852ce68e4635d1af3f48e0e29c7b987c53f216d63fcbe9102c05ba825eb3c74cc7317bf807d9f9ec687312a5962f73533eaa12e822407a326d3bfcb3d0901b7
+  metadata.gz: f11d00e9ac1057a5596804e03c6c4a6c41841bedc21030a9ed776cfbaaabba85a341a62de71c990c8deadc8f8384bf263b41d477b33b299af26b55acef47fe0c
+  data.tar.gz: 335ddfeca9f6793f2887c1cc93cfc916e011f7dd01fd97073162871148d0fe61395bdb1115c5ed4f7583ff207f6b6d27462c8920a7a70b016b9427420796bc28

data/CHANGELOG.md CHANGED

@@ -1,5 +1,11 @@
 # Changelog
+### Byk 0.6.0 (2015-04-25)
+* Introduced module methods and the optional safe require
+* Documented the methods
+* Upgraded spec suite
 ### Byk 0.5.0 (2015-04-18)
 * Performance tuning and refactoring, up to 5x speedup

data/README.md CHANGED

@@ -61,13 +61,40 @@ text                  # => "Zvazbuka"
 ```
 Note that these methods take into account the
-[two-letter capitalization rules](http://sr.wikipedia.org/wiki/Gajica#Abeceda):
+[digraph capitalization rules](http://sr.wikipedia.org/wiki/Гајица#.D0.94.D0.B8.D0.B3.D1.80.D0.B0.D1.84.D0.B8):
 ```ruby
 "ЉИЉА Љиљановић".to_latin        # => "LJILJA Ljiljanović"
 "ĐORĐE Đorđević".to_ascii_latin  # => "DJORDJE Djordjevic"
 ```
+If you prefer not to monkey patch your strings, you can use the "safe"
+require:
+```ruby
+require "byk/safe"
+```
+and then call the module methods:
+```ruby
+text = "Вук"
+Byk.to_latin(text)   # => "Vuk"
+text                 # => "Вук"
+Byk.to_latin!(text)  # => "Vuk"
+text                 # => "Vuk"
+```
+## Testing
+To test the gem, clone the repo and run:
+```
+$ bundle
+$ bundle exec rake
+```
 ## How fast is fast?
@@ -84,7 +111,7 @@ projects, e.g. sites supporting dual script content. Remember,
 I found transliteration to be a straightforward little problem that
 lends itself well to optimization. It also gave me an excuse to play
-with Ruby extensions, so there :smile_cat:
+with Ruby extensions, so there :smirk_cat:
 ## Compatibility
@@ -92,10 +119,8 @@ with Ruby extensions, so there :smile_cat:
 Byk is supported under MRI Ruby >= 1.9.2.
 I don't plan to support 1.8.7 or older due to substantial C API
-changes between 1.8 and 1.9.
-It doesn't build under Rubinius currently, but I intend to support it
-in future releases.
+changes between 1.8 and 1.9. It doesn't build under Rubinius
+currently, but I intend to support it in future releases.
 ## License

data/ext/byk/byk.c CHANGED

@@ -5,91 +5,67 @@
 #define STR_CAT_COND_ASCII(ascii, dest, chr, ascii_chr, len, enc)       \
     ascii ? rb_str_buf_cat(dest, chr, len)                              \
-    : str_cat_char(dest, ascii_chr, enc)
+          : str_cat_char(dest, ascii_chr, enc)
 enum {
-    LAT_CAP_TJ=262,
+    LAT_CAP_TJ = 0x106,
     LAT_TJ,
-    LAT_CAP_CH=268,
+    LAT_CAP_CH = 0x10c,
     LAT_CH,
-    LAT_CAP_DJ=272,
+    LAT_CAP_DJ = 0x110,
     LAT_DJ,
-    LAT_CAP_SH=352,
+    LAT_CAP_SH = 0x160,
     LAT_SH,
-    LAT_CAP_ZH=381,
+    LAT_CAP_ZH = 0x17d,
     LAT_ZH,
-    CYR_CAP_DJ=1026,
-    CYR_CAP_J=1032,
+    CYR_CAP_DJ = 0x402,
+    CYR_CAP_J  = 0x408,
     CYR_CAP_LJ,
     CYR_CAP_NJ,
     CYR_CAP_TJ,
-    CYR_CAP_DZ=1039,
+    CYR_CAP_DZ = 0x40f,
     CYR_CAP_A,
-    CYR_CAP_B,
-    CYR_CAP_V,
-    CYR_CAP_G,
-    CYR_CAP_D,
-    CYR_CAP_E,
-    CYR_CAP_ZH,
-    CYR_CAP_Z,
-    CYR_CAP_I,
-    CYR_CAP_K=1050,
-    CYR_CAP_L,
-    CYR_CAP_M,
-    CYR_CAP_N,
-    CYR_CAP_O,
-    CYR_CAP_P,
-    CYR_CAP_R,
-    CYR_CAP_S,
-    CYR_CAP_T,
-    CYR_CAP_U,
-    CYR_CAP_F,
-    CYR_CAP_H,
-    CYR_CAP_C,
+    CYR_CAP_ZH = 0x416,
+    CYR_CAP_C  = 0x426,
     CYR_CAP_CH,
     CYR_CAP_SH,
-    CYR_A=1072,
-    CYR_B,
-    CYR_V,
-    CYR_G,
-    CYR_D,
-    CYR_E,
-    CYR_ZH,
-    CYR_Z,
-    CYR_I,
-    CYR_K=1082,
-    CYR_L,
-    CYR_M,
-    CYR_N,
-    CYR_O,
-    CYR_P,
-    CYR_R,
-    CYR_S,
-    CYR_T,
-    CYR_U,
-    CYR_F,
-    CYR_H,
-    CYR_C,
+    CYR_A  = 0x430,
+    CYR_ZH = 0x436,
+    CYR_C  = 0x446,
     CYR_CH,
     CYR_SH,
-    CYR_DJ=1106,
-    CYR_J=1112,
+    CYR_DJ = 0x452,
+    CYR_J  = 0x458,
     CYR_LJ,
     CYR_NJ,
     CYR_TJ,
-    CYR_DZ=1119
+    CYR_DZ = 0x45f
 };
 static inline unsigned int
-is_upper_case(unsigned int c)
+is_cyrillic(unsigned int c)
 {
-    return ((c >= 65 && c <= 90)
-            || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
-            || c == LAT_CAP_TJ
-            || c == LAT_CAP_CH
-            || c == LAT_CAP_DJ
-            || c == LAT_CAP_SH
-            || c == LAT_CAP_ZH);
+    return c >= CYR_CAP_DJ && c <= CYR_DZ;
+}
+static inline unsigned int
+is_upper(unsigned int c)
+{
+    return (c >= 65 && c <= 90)
+        || (c >= CYR_CAP_DJ && c <= CYR_CAP_SH)
+        || c == LAT_CAP_TJ
+        || c == LAT_CAP_CH
+        || c == LAT_CAP_DJ
+        || c == LAT_CAP_SH
+        || c == LAT_CAP_ZH;
+}
+static inline unsigned int
+maps_directly(unsigned int c)
+{
+    return c != CYR_ZH
+        && c != CYR_CAP_ZH
+        && ((c >= CYR_A && c <= CYR_C) || (c >= CYR_CAP_A && c <= CYR_CAP_C));
 }
 static void
@@ -109,12 +85,24 @@ str_to_latin(VALUE str, int ascii, int bang)
     int len, next_len;
     int seen_upper = 0;
     int force_upper = 0;
-    char *pos = RSTRING_PTR(str);
-    char *end, *seq_start = 0;
+    char *pos, *end, *seq_start = 0;
+    char cyr;
     unsigned int codepoint = 0;
     unsigned int next_codepoint = 0;
     rb_encoding *enc;
+    char CYR_MAP[] = {
+        'a', 'b', 'v', 'g', 'd', 'e', '\0', 'z', 'i', '\0', 'k',
+        'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'h', 'c'
+    };
+    char CYR_CAPS_MAP[] = {
+        'A', 'B', 'V', 'G', 'D', 'E', '\0', 'Z', 'I', '\0', 'K',
+        'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'H', 'C'
+    };
+    StringValue(str);
+    pos = RSTRING_PTR(str);
     if (!pos || RSTRING_LEN(str) == 0) return str;
     end = RSTRING_END(str);
@@ -133,10 +121,10 @@ str_to_latin(VALUE str, int ascii, int bang)
         /* Latin -> "ASCII Latin" conversion */
         if (ascii && codepoint >= LAT_CAP_TJ && codepoint <= LAT_ZH) {
             if (seq_start) {
-                /* flush the sequence */
                 rb_str_buf_cat(dest, seq_start, pos - seq_start);
                 seq_start = 0;
             }
             switch (codepoint) {
             case LAT_TJ:
             case LAT_CH:     rb_str_buf_cat(dest, "c",  1); break;
@@ -148,7 +136,7 @@ str_to_latin(VALUE str, int ascii, int bang)
             case LAT_CAP_SH: rb_str_buf_cat(dest, "S",  1); break;
             case LAT_CAP_ZH: rb_str_buf_cat(dest, "Z",  1); break;
             case LAT_CAP_DJ:
-                (seen_upper || is_upper_case(next_codepoint))
+                (seen_upper || is_upper(next_codepoint))
                     ? rb_str_buf_cat(dest, "DJ", 2)
                     : rb_str_buf_cat(dest, "Dj", 2);
                 break;
@@ -157,108 +145,73 @@ str_to_latin(VALUE str, int ascii, int bang)
             }
         }
-        /* Mark a start of inconsequential sequence */
-        else if (codepoint < CYR_CAP_DJ || codepoint > CYR_DZ) {
-            if (!seq_start)
-                seq_start = pos;
-        }
-        /* Cyrillic -> Latin conversion */
-        else {
+        /* Cyrillic coderange */
+        else if (is_cyrillic(codepoint)) {
             if (seq_start) {
-                /* flush the sequence */
                 rb_str_buf_cat(dest, seq_start, pos - seq_start);
                 seq_start = 0;
             }
             if (codepoint >= CYR_A) {
-                switch (codepoint) {
-                case CYR_A:  rb_str_buf_cat(dest, "a",  1); break;
-                case CYR_B:  rb_str_buf_cat(dest, "b",  1); break;
-                case CYR_V:  rb_str_buf_cat(dest, "v",  1); break;
-                case CYR_G:  rb_str_buf_cat(dest, "g",  1); break;
-                case CYR_D:  rb_str_buf_cat(dest, "d",  1); break;
-                case CYR_E:  rb_str_buf_cat(dest, "e",  1); break;
-                case CYR_Z:  rb_str_buf_cat(dest, "z",  1); break;
-                case CYR_I:  rb_str_buf_cat(dest, "i",  1); break;
-                case CYR_J:  rb_str_buf_cat(dest, "j",  1); break;
-                case CYR_K:  rb_str_buf_cat(dest, "k",  1); break;
-                case CYR_L:  rb_str_buf_cat(dest, "l",  1); break;
-                case CYR_M:  rb_str_buf_cat(dest, "m",  1); break;
-                case CYR_N:  rb_str_buf_cat(dest, "n",  1); break;
-                case CYR_O:  rb_str_buf_cat(dest, "o",  1); break;
-                case CYR_P:  rb_str_buf_cat(dest, "p",  1); break;
-                case CYR_R:  rb_str_buf_cat(dest, "r",  1); break;
-                case CYR_S:  rb_str_buf_cat(dest, "s",  1); break;
-                case CYR_T:  rb_str_buf_cat(dest, "t",  1); break;
-                case CYR_U:  rb_str_buf_cat(dest, "u",  1); break;
-                case CYR_F:  rb_str_buf_cat(dest, "f",  1); break;
-                case CYR_H:  rb_str_buf_cat(dest, "h",  1); break;
-                case CYR_C:  rb_str_buf_cat(dest, "c",  1); break;
-                case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
-                case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
-                case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
-                case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c",  LAT_TJ, 1, enc); break;
-                case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c",  LAT_CH, 1, enc); break;
-                case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z",  LAT_ZH, 1, enc); break;
-                case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s",  LAT_SH, 1, enc); break;
-                case CYR_DZ:
-                    rb_str_buf_cat(dest, "d", 1);
-                    STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
-                    break;
-                default:
-                    rb_str_buf_cat(dest, pos, len);
+                if (maps_directly(codepoint)) {
+                    cyr = CYR_MAP[codepoint - CYR_A];
+                    cyr ? rb_str_buf_cat(dest, &cyr, 1)
+                        : rb_str_buf_cat(dest, pos, len);
+                }
+                else {
+                    switch (codepoint) {
+                    case CYR_J:  rb_str_buf_cat(dest, "j",  1); break;
+                    case CYR_LJ: rb_str_buf_cat(dest, "lj", 2); break;
+                    case CYR_NJ: rb_str_buf_cat(dest, "nj", 2); break;
+                    case CYR_DJ: STR_CAT_COND_ASCII(ascii, dest, "dj", LAT_DJ, 2, enc); break;
+                    case CYR_TJ: STR_CAT_COND_ASCII(ascii, dest, "c",  LAT_TJ, 1, enc); break;
+                    case CYR_CH: STR_CAT_COND_ASCII(ascii, dest, "c",  LAT_CH, 1, enc); break;
+                    case CYR_SH: STR_CAT_COND_ASCII(ascii, dest, "s",  LAT_SH, 1, enc); break;
+                    case CYR_ZH: STR_CAT_COND_ASCII(ascii, dest, "z",  LAT_ZH, 1, enc); break;
+                    case CYR_DZ:
+                        rb_str_buf_cat(dest, "d", 1);
+                        STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
+                        break;
+                    default:
+                        rb_str_buf_cat(dest, pos, len);
+                    }
                 }
             }
             else {
-                force_upper = seen_upper || is_upper_case(next_codepoint);
+                if (maps_directly(codepoint)) {
+                    cyr = CYR_CAPS_MAP[codepoint - CYR_CAP_A];
+                    cyr ? rb_str_buf_cat(dest, &cyr, 1)
+                        : rb_str_buf_cat(dest, pos, len);
+                }
+                else {
+                    force_upper = seen_upper || is_upper(next_codepoint);
-                switch (codepoint) {
-                case CYR_CAP_A:  rb_str_buf_cat(dest, "A",  1); break;
-                case CYR_CAP_B:  rb_str_buf_cat(dest, "B",  1); break;
-                case CYR_CAP_V:  rb_str_buf_cat(dest, "V",  1); break;
-                case CYR_CAP_G:  rb_str_buf_cat(dest, "G",  1); break;
-                case CYR_CAP_D:  rb_str_buf_cat(dest, "D",  1); break;
-                case CYR_CAP_E:  rb_str_buf_cat(dest, "E",  1); break;
-                case CYR_CAP_Z:  rb_str_buf_cat(dest, "Z",  1); break;
-                case CYR_CAP_I:  rb_str_buf_cat(dest, "I",  1); break;
-                case CYR_CAP_J:  rb_str_buf_cat(dest, "J",  1); break;
-                case CYR_CAP_K:  rb_str_buf_cat(dest, "K",  1); break;
-                case CYR_CAP_L:  rb_str_buf_cat(dest, "L",  1); break;
-                case CYR_CAP_M:  rb_str_buf_cat(dest, "M",  1); break;
-                case CYR_CAP_N:  rb_str_buf_cat(dest, "N",  1); break;
-                case CYR_CAP_O:  rb_str_buf_cat(dest, "O",  1); break;
-                case CYR_CAP_P:  rb_str_buf_cat(dest, "P",  1); break;
-                case CYR_CAP_R:  rb_str_buf_cat(dest, "R",  1); break;
-                case CYR_CAP_S:  rb_str_buf_cat(dest, "S",  1); break;
-                case CYR_CAP_T:  rb_str_buf_cat(dest, "T",  1); break;
-                case CYR_CAP_U:  rb_str_buf_cat(dest, "U",  1); break;
-                case CYR_CAP_F:  rb_str_buf_cat(dest, "F",  1); break;
-                case CYR_CAP_H:  rb_str_buf_cat(dest, "H",  1); break;
-                case CYR_CAP_C:  rb_str_buf_cat(dest, "C",  1); break;
-                case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
-                case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
-                case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
-                case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
-                case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
-                case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
-                case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
-                case CYR_CAP_DZ:
-                    rb_str_buf_cat(dest, "D", 1);
-                    if (force_upper) {
-                        STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc);
-                    }
-                    else {
-                        STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
+                    switch (codepoint) {
+                    case CYR_CAP_J:  rb_str_buf_cat(dest, "J", 1); break;
+                    case CYR_CAP_LJ: rb_str_buf_cat(dest, (force_upper ? "LJ" : "Lj"), 2); break;
+                    case CYR_CAP_NJ: rb_str_buf_cat(dest, (force_upper ? "NJ" : "Nj"), 2); break;
+                    case CYR_CAP_TJ: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_TJ, 1, enc); break;
+                    case CYR_CAP_CH: STR_CAT_COND_ASCII(ascii, dest, "C", LAT_CAP_CH, 1, enc); break;
+                    case CYR_CAP_SH: STR_CAT_COND_ASCII(ascii, dest, "S", LAT_CAP_SH, 1, enc); break;
+                    case CYR_CAP_ZH: STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc); break;
+                    case CYR_CAP_DJ: STR_CAT_COND_ASCII(ascii, dest, (force_upper ? "DJ" : "Dj"), LAT_CAP_DJ, 2, enc); break;
+                    case CYR_CAP_DZ:
+                        rb_str_buf_cat(dest, "D", 1);
+                        force_upper ? STR_CAT_COND_ASCII(ascii, dest, "Z", LAT_CAP_ZH, 1, enc)
+                                    : STR_CAT_COND_ASCII(ascii, dest, "z", LAT_ZH, 1, enc);
+                        break;
+                    default:
+                        rb_str_buf_cat(dest, pos, len);
                     }
-                    break;
-                default:
-                    rb_str_buf_cat(dest, pos, len);
                 }
             }
         }
+        else {
+            /* Mark the start of a copyable sequence */
+            if (!seq_start) seq_start = pos;
+        }
-        seen_upper = is_upper_case(codepoint);
+        seen_upper = is_upper(codepoint);
         pos += len;
         len = next_len;
@@ -267,8 +220,8 @@ str_to_latin(VALUE str, int ascii, int bang)
         next_codepoint = 0;
     }
+    /* Flush the last sequence, if any */
     if (seq_start) {
-        /* flush the last sequence */
         rb_str_buf_cat(dest, seq_start, pos - seq_start);
     }
@@ -283,30 +236,67 @@ str_to_latin(VALUE str, int ascii, int bang)
     return str;
 }
+/**
+ * Returns a copy of <i>str</i> with the Serbian Cyrillic characters
+ * transliterated into Latin.
+ *
+ * @overload to_latin(str)
+ *   @param  [String] str text to be transliterated
+ *   @return [String] transliterated text
+ */
 static VALUE
-rb_str_to_latin(VALUE str) {
+rb_str_to_latin(VALUE self, VALUE str)
+{
     return str_to_latin(str, 0, 0);
 }
+/**
+ * Performs the transliteration of <code>Byk.to_latin</code> in place,
+ * returning <i>str</i>, whether changes were made or not.
+ *
+ * @overload to_latin!(str)
+ *   @param  [String] str text to be transliterated
+ *   @return [String] transliterated text
+ */
 static VALUE
-rb_str_to_latin_bang(VALUE str) {
+rb_str_to_latin_bang(VALUE self, VALUE str)
+{
     return str_to_latin(str, 0, 1);
 }
+/**
+ * Returns a copy of <i>str</i> with the Serbian Cyrillic
+ * characters transliterated into ASCII Latin.
+ *
+ * @overload to_ascii_latin(str)
+ *   @param  [String] str text to be transliterated
+ *   @return [String] transliterated text
+ */
 static VALUE
-rb_str_to_ascii_latin(VALUE str) {
+rb_str_to_ascii_latin(VALUE self, VALUE str)
+{
     return str_to_latin(str, 1, 0);
 }
+/**
+ * Performs the transliteration of <code>Byk.to_ascii_latin</code> in
+ * place, returning <i>str</i>, whether changes were made or not.
+ *
+ * @overload to_ascii_latin!(str)
+ *   @param  [String] str text to be transliterated
+ *   @return [String] transliterated text
+ */
 static VALUE
-rb_str_to_ascii_latin_bang(VALUE str) {
+rb_str_to_ascii_latin_bang(VALUE self, VALUE str)
+{
     return str_to_latin(str, 1, 1);
 }
 void Init_byk_native(void)
 {
-    rb_define_method(rb_cString, "to_latin", rb_str_to_latin, 0);
-    rb_define_method(rb_cString, "to_latin!", rb_str_to_latin_bang, 0);
-    rb_define_method(rb_cString, "to_ascii_latin", rb_str_to_ascii_latin, 0);
-    rb_define_method(rb_cString, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 0);
+    VALUE Byk = rb_define_module("Byk");
+    rb_define_singleton_method(Byk, "to_latin", rb_str_to_latin, 1);
+    rb_define_singleton_method(Byk, "to_latin!", rb_str_to_latin_bang, 1);
+    rb_define_singleton_method(Byk, "to_ascii_latin", rb_str_to_ascii_latin, 1);
+    rb_define_singleton_method(Byk, "to_ascii_latin!", rb_str_to_ascii_latin_bang, 1);
 }

data/lib/byk.rb CHANGED

@@ -1,14 +1,2 @@
-# coding: utf-8
-require "byk_native"
-require "byk/version"
-module Byk
-  AZBUKA      = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
-  AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
-  ABECEDA      = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
-  ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
-end
+require "byk/safe"
+require "byk/core_ext/string"

data/lib/byk/core_ext/string.rb ADDED

@@ -0,0 +1,8 @@
+class String
+  Byk.singleton_methods.each do |method|
+    define_method(method) do
+      Byk.send(method, self)
+    end
+  end
+end

data/lib/byk/safe.rb ADDED

@@ -0,0 +1,14 @@
+# coding: utf-8
+require "byk_native"
+require "byk/version"
+module Byk
+  AZBUKA      = %w[а б в г д ђ е ж з и ј к л љ м н њ о п р с т ћ у ф х ц ч џ ш]
+  AZBUKA_CAPS = %W[А Б В Г Д Ђ Е Ж З И Ј К Л Љ М Н Њ О П Р С Т Ћ У Ф Х Ц Ч Џ Ш]
+  ABECEDA      = %w[a b c č ć d dž đ e f g h i j k l lj m n nj o p r s š t u v z ž]
+  ABECEDA_CAPS = %W[A B C Č Ć D Dž Đ E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž]
+end

data/lib/byk/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Byk
-  VERSION = "0.5.0"
+  VERSION = "0.6.0"
 end

data/spec/byk_spec.rb CHANGED

@@ -4,123 +4,180 @@ require "spec_helper"
 describe Byk do
-  # See http://sr.wikipedia.org/wiki/Панграм
-  let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
-  let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
-  let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
+  it "has a version number" do
+    expect(Byk::VERSION).not_to be nil
+  end
-  let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
-  let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
-  let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
+  shared_examples :base do |method|
-  let(:ascii) { "The quick brown fox jumps over the lazy dog." }
-  let(:other) { "संस्कृतम् saṃskṛtam" }
+    # See http://sr.wikipedia.org/wiki/Панграм
+    let(:pangram) { "фијуче ветар у шибљу, леди пасаже и куће иза њих и гунђа у оџацима." }
+    let(:pangram_latin) { "fijuče vetar u šiblju, ledi pasaže i kuće iza njih i gunđa u odžacima." }
+    let(:pangram_ascii_latin) { "fijuce vetar u siblju, ledi pasaze i kuce iza njih i gundja u odzacima." }
-  let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
-  let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
-  let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
+    let(:pangram_caps) { "ФИЈУЧЕ ВЕТАР У ШИБЉУ, ЛЕДИ ПАСАЖЕ И КУЋЕ ИЗА ЊИХ И ГУНЂА У ОЏАЦИМА." }
+    let(:pangram_latin_caps) { "FIJUČE VETAR U ŠIBLJU, LEDI PASAŽE I KUĆE IZA NJIH I GUNĐA U ODŽACIMA." }
+    let(:pangram_ascii_latin_caps) { "FIJUCE VETAR U SIBLJU, LEDI PASAZE I KUCE IZA NJIH I GUNDJA U ODZACIMA." }
-  it "has a version number" do
-    expect(Byk::VERSION).not_to be nil
-  end
+    let(:full_cyrillic_coderange) { (0x400..0x4ff).map { |i| i.chr(Encoding::UTF_8) } }
+    let(:non_serbian_cyrillic_coderange) { full_cyrillic_coderange - Byk::AZBUKA - Byk::AZBUKA_CAPS }
+    let(:non_serbian_cyrillic) { non_serbian_cyrillic_coderange.join }
+    let(:ascii) { "The quick brown fox jumps over the lazy dog." }
+    let(:other) { "संस्कृतम् saṃskṛtam" }
-  describe "#to_latin" do
+    let(:mixed) { "संस्कृतम् saṃskṛtam илити Sanskrit, obrati ПАЖЊУ." }
+    let(:mixed_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAŽNJU." }
+    let(:mixed_ascii_latin) { "संस्कृतम् saṃskṛtam iliti Sanskrit, obrati PAZNJU." }
-    it "doesn't modify an empty string" do
-      expect("".to_latin).to eq ""
+    it "doesn't convert an empty string" do
+      expect(Byk.send(method, "")).to eq ""
     end
-    it "doesn't modify ASCII text" do
-      expect(ascii.to_latin).to eq ascii
+    it "doesn't convert ASCII text" do
+      expect(Byk.send(method, ascii)).to eq ascii
     end
-    it "doesn't modify latin" do
-      expect(pangram_latin.to_latin).to eq pangram_latin
+    it "doesn't convert non-Serbian Cyrillic" do
+      expect(Byk.send(method, non_serbian_cyrillic)).to eq non_serbian_cyrillic
     end
-    it "doesn't modify other scripts" do
-      expect(other.to_latin).to eq other
+    it "doesn't convert other coderanges" do
+      expect(Byk.send(method, other)).to eq other
+    end
+  end
+  shared_examples :latinization_method do |method|
+    include_examples :base, method
+    let(:edge_cases) {
+      [
+        ["Њ", "Nj"],
+        ["Љ", "Lj"],
+        ["Џ", "Dž"],
+        ["ЊЊ", "NJNJ"],
+        ["ЉЉ", "LJLJ"],
+        ["ЏЏ", "DŽDŽ"]
+      ]
+    }
+    it "doesn't convert Latin" do
+      expect(Byk.send(method, pangram_latin)).to eq pangram_latin
     end
-    it "converts cyrillic to latin" do
-      expect(pangram.to_latin).to eq pangram_latin
+    it "converts Cyrillic to Latin" do
+      expect(Byk.send(method, pangram)).to eq pangram_latin
     end
-    it "converts cyrillic caps to latin caps" do
-      expect(pangram_caps.to_latin).to eq pangram_latin_caps
+    it "converts Cyrillic caps to Latin caps" do
+      expect(Byk.send(method, pangram_caps)).to eq pangram_latin_caps
     end
     it "converts mixed text properly" do
-      expect(mixed.to_latin).to eq mixed_latin
+      expect(Byk.send(method, mixed)).to eq mixed_latin
+    end
+    it "converts edge cases properly" do
+      edge_cases.each do |input, output|
+        expect(Byk.send(method, input)).to eq output
+      end
     end
     it "converts AZBUKA to ABECEDA" do
-      expect(Byk::AZBUKA.map(&:to_latin)).to match_array(Byk::ABECEDA)
+      expect(Byk::AZBUKA.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA)
     end
     it "converts AZBUKA_CAPS to ABECEDA_CAPS" do
-      expect(Byk::AZBUKA_CAPS.map(&:to_latin)).to match_array(Byk::ABECEDA_CAPS)
+      expect(Byk::AZBUKA_CAPS.map { |l| l.dup.send(method) }).to match_array(Byk::ABECEDA_CAPS)
     end
   end
-  describe "#to_ascii_latin" do
+  shared_examples :ascii_latinization_method do |method|
+    include_examples :base, method
-    # Special care for Њ, Љ, Ђ, Đ
     let(:edge_cases) {
-      {
-        "Њ" => "Nj",
-        "Љ" => "Lj",
-        "Ђ" => "Dj",
-        "Đ" => "Dj",
-        "ЊЊ" => "NJNJ",
-        "ЉЉ" => "LJLJ",
-        "ЂЂ" => "DJDJ",
-        "ĐĐ" => "DJDJ",
-        "ГУЊ" => "GUNJ",
-        "ПАСУЉ" => "PASULJ",
-        "ЂУРАЂ" => "DJURADJ",
-        "ĐURAĐ" => "DJURADJ",
-        "ĐURAĐ Đorđević" => "DJURADJ Djordjevic",
-        "ĐURAĐ. Đorđević" => "DJURADJ. Djordjevic"
-      }
+      [
+        ["Њ", "Nj"],
+        ["Љ", "Lj"],
+        ["Џ", "Dz"],
+        ["Ђ", "Dj"],
+        ["Đ", "Dj"],
+        ["ЊЊ", "NJNJ"],
+        ["ЉЉ", "LJLJ"],
+        ["ЏЏ", "DZDZ"],
+        ["ЂЂ", "DJDJ"],
+        ["ĐĐ", "DJDJ"],
+        ["ЂУРАЂ Ђорђевић", "DJURADJ Djordjevic"],
+        ["ĐURAĐ Đorđević", "DJURADJ Djordjevic"]
+      ]
     }
-    it "doesn't modify an empty string" do
-      expect("".to_ascii_latin).to eq ""
+    it "converts Cyrillic to ASCII Latin" do
+      expect(Byk.send(method, pangram)).to eq pangram_ascii_latin
     end
-    it "doesn't modify ASCII text" do
-      expect(ascii.to_ascii_latin).to eq ascii
+    it "converts Cyrillic caps to ASCII Latin caps" do
+      expect(Byk.send(method, pangram_caps)).to eq pangram_ascii_latin_caps
     end
-    it "doesn't modify other scripts" do
-      expect(other.to_ascii_latin).to eq other
+    it "converts Latin to ASCII Latin" do
+      expect(Byk.send(method, pangram_latin)).to eq pangram_ascii_latin
     end
-    it "converts cyrillic to ASCII latin" do
-      expect(pangram.to_ascii_latin).to eq pangram_ascii_latin
+    it "converts Latin caps to ASCII Latin caps" do
+      expect(Byk.send(method, pangram_latin_caps)).to eq pangram_ascii_latin_caps
     end
-    it "converts cyrillic caps to ASCII latin caps" do
-      expect(pangram_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
+    it "converts mixed text properly" do
+      expect(Byk.send(method, mixed)).to eq mixed_ascii_latin
     end
-    it "converts latin to ASCII latin" do
-      expect(pangram_latin.to_ascii_latin).to eq pangram_ascii_latin
+    it "converts edge cases properly" do
+      edge_cases.each do |input, output|
+        expect(Byk.send(method, input)).to eq output
+      end
     end
+  end
-    it "converts latin caps to ASCII latin caps" do
-      expect(pangram_latin_caps.to_ascii_latin).to eq pangram_ascii_latin_caps
+  shared_examples :non_destructive_method do |method|
+    it "doesn't modify the arg" do
+      str = "Ж"
+      expect { Byk.send(method, str) }.to_not change { str }
     end
+  end
-    it "converts mixed text properly" do
-      expect(mixed.to_ascii_latin).to eq mixed_ascii_latin
+  shared_examples :destructive_method do |method|
+    it "modifies the arg" do
+      str = "Ж"
+      expect { Byk.send(method, str) }.to change { str }
     end
+  end
-    it "converts edge cases properly" do
-      edge_cases.each do |input, output|
-        expect(input.to_ascii_latin).to eq output
-      end
+  describe ".to_latin" do
+    it_behaves_like :latinization_method, :to_latin
+    it_behaves_like :non_destructive_method, :to_latin
+  end
+  describe ".to_latin!" do
+    it_behaves_like :latinization_method, :to_latin!
+    it_behaves_like :destructive_method, :to_latin!
+  end
+  describe ".to_ascii_latin" do
+    it_behaves_like :ascii_latinization_method, :to_ascii_latin
+    it_behaves_like :non_destructive_method, :to_ascii_latin
+  end
+  describe ".to_ascii_latin!" do
+    it_behaves_like :ascii_latinization_method, :to_ascii_latin!
+    it_behaves_like :destructive_method, :to_ascii_latin!
+  end
+end
+describe String do
+  it "responds to Byk methods" do
+    Byk.instance_methods.each do |method|
+      expect("").to respond_to(method)
     end
   end
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: byk
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.6.0
 platform: ruby
 authors:
 - Nikola Topalović
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-18 00:00:00.000000000 Z
+date: 2015-04-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake-compiler
@@ -52,6 +52,8 @@ files:
 - ext/byk/byk.c
 - ext/byk/extconf.rb
 - lib/byk.rb
+- lib/byk/core_ext/string.rb
+- lib/byk/safe.rb
 - lib/byk/version.rb
 - spec/byk_spec.rb
 homepage: https://github.com/topalovic/byk
@@ -74,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Fast transliteration of Serbian Cyrillic into Latin.