RubyGems - edn_turbo - Versions diffs - 0.1.0 → 0.1.1 - Mend

edn_turbo 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

checksums.yaml +4 -4
data/README.md +52 -12
data/ext/edn_turbo/edn_parser.cc +504 -576
data/ext/edn_turbo/edn_parser.h +12 -14
data/ext/edn_turbo/edn_parser.rl +103 -175
data/ext/edn_turbo/edn_parser_def.cc +22 -24
data/ext/edn_turbo/edn_parser_unicode.cc +29 -0
data/ext/edn_turbo/extconf.rb +23 -0
data/ext/edn_turbo/main.cc +0 -1
data/lib/edn_turbo/version.rb +1 -1
data/test/test_output_diff.rb +9 -0
metadata +2 -1

data/ext/edn_turbo/edn_parser.h CHANGED Viewed

@@ -2,7 +2,7 @@
 #define EDN_RUBY_EXT_PARSER_H
 #include <string>
-#include <strstream>
+#include <sstream>
 #include <rice/Object.hpp>
 #include <rice/to_from_ruby.hpp>
@@ -21,17 +21,18 @@ namespace edn
         Rice::Object parse(const char* s, std::size_t len);
-        const char* EDN_parse_decimal(const char *p, const char *pe, Rice::Object& o);
-        const char* EDN_parse_integer(const char *p, const char *pe, Rice::Object& o);
-        const char* EDN_parse_keyword(const char *p, const char *pe, Rice::Object& o);
-        const char* EDN_parse_tagged (const char *p, const char *pe, Rice::Object& o, bool& dicard);
-        const char* EDN_parse_string (const char *p, const char *pe, Rice::Object& o);
-        const char* EDN_parse_value  (const char *p, const char *pe, Rice::Object& o);
-        const char* EDN_parse_vector (const char *p, const char *pe, Rice::Object& o);
-        const char* EDN_parse_map    (const char *p, const char *pe, Rice::Object& o);
-        const char* EDN_parse_list   (const char *p, const char *pe, Rice::Object& o);
+        const char* parse_decimal(const char *p, const char *pe, Rice::Object& o);
+        const char* parse_integer(const char *p, const char *pe, Rice::Object& o);
+        const char* parse_keyword(const char *p, const char *pe, Rice::Object& o);
+        const char* parse_tagged (const char *p, const char *pe, Rice::Object& o, bool& dicard);
+        const char* parse_string (const char *p, const char *pe, Rice::Object& o);
+        const char* parse_value  (const char *p, const char *pe, Rice::Object& o);
+        const char* parse_vector (const char *p, const char *pe, Rice::Object& o);
+        const char* parse_map    (const char *p, const char *pe, Rice::Object& o);
+        const char* parse_list   (const char *p, const char *pe, Rice::Object& o);
-        bool EDN_parse_byte_stream   (const char *p, const char *pe, Rice::String& s);
+        static bool parse_byte_stream(const char *p, const char *pe, Rice::String& s);
+        static bool unicode_to_utf8(const char *s, std::size_t len, std::string& rslt);
         void error(const std::string& err, char c) const;
         void error(char err_c) const { error("", err_c); }
@@ -52,9 +53,6 @@ namespace edn
         Rice::Object process(const std::string& data) { return parse(data.c_str(), data.length()); }
-        // handle file read from the c-side
-        Rice::Object open(const std::string& file);
     }; // Engine
 } // namespace

data/ext/edn_turbo/edn_parser.rl CHANGED Viewed

@@ -1,9 +1,6 @@
 #include <iostream>
 #include <string>
-#include <ruby/ruby.h>
-#include <ruby/encoding.h>
 #include <rice/Hash.hpp>
 #include <rice/Array.hpp>
 #include <rice/to_from_ruby.hpp>
@@ -13,6 +10,10 @@
 //
 // EDN spec at: https://github.com/edn-format/edn
 //
+//
+// many thanks to Florian Frank for json-ruby which was essential in
+// helping me learn about ragel
+//
 %%{
         machine EDN_common;
@@ -67,21 +68,21 @@
     }
     action parse_keyword {
-        const char *np = EDN_parse_keyword(fpc, pe, o);
+        const char *np = parse_keyword(fpc, pe, o);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
     action parse_string {
-        const char *np = EDN_parse_string(fpc, pe, o);
+        const char *np = parse_string(fpc, pe, o);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
     action parse_number {
         // try to parse a decimal first
-        const char *np = EDN_parse_decimal(fpc, pe, o);
+        const char *np = parse_decimal(fpc, pe, o);
         if (np == NULL) {
             // if we can't, try to parse it as an int
-            np = EDN_parse_integer(fpc, pe, o);
+            np = parse_integer(fpc, pe, o);
         }
         if (np) {
@@ -96,17 +97,17 @@
     }
     action parse_vector {
-        const char *np = EDN_parse_vector(fpc, pe, o);
+        const char *np = parse_vector(fpc, pe, o);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
     action parse_list {
-        const char *np = EDN_parse_list(fpc, pe, o);
+        const char *np = parse_list(fpc, pe, o);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
     action parse_map {
-        const char *np = EDN_parse_map(fpc, pe, o);
+        const char *np = parse_map(fpc, pe, o);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
@@ -126,7 +127,7 @@
 }%%
-const char *edn::Parser::EDN_parse_value(const char *p, const char *pe, Rice::Object& o)
+const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object& o)
 {
     int cs;
@@ -145,80 +146,6 @@ const char *edn::Parser::EDN_parse_value(const char *p, const char *pe, Rice::Ob
 }
-// ============================================================
-// tagged element parsing - any of #uuid, #inst, #{, #(some symbol)
-// discard (#_ <ident>) is handled by the top-level machine
-//
-%%{
-    machine EDN_dispatch;
-    include EDN_common;
-    begin_discard  = '_';
-    begin_set      = '{';
-    end_set        = '}';
-    write data;
-    action exit { fhold; fbreak; }
-    main := begin_dispatch (
-                            (begin_discard (space)? ([a-zA-Z0-9\-\.]*)) |
-                            ('inst ' string_delim ([0-9\-+:\.TZ])* string_delim) |
-                            ('uuid ' string_delim ([a-f0-9\-]* string_delim))
-                            )
-        (^[a-zA-Z0-9:\.\-+ ]* @exit);
-}%%
-const char* edn::Parser::EDN_parse_tagged(const char *p, const char *pe, Rice::Object& o, bool& discard)
-{
-    int cs;
-    Rice::String str;
-    %% write init;
-    p_save = p;
-    %% write exec;
-    if (cs >= EDN_dispatch_first_final) {
-        //is it a discard? if so, just drop the following token
-        if (*(p_save + 1) == '_')
-        {
-            discard = true;
-            return p + 1;
-        }
-        std::size_t len = p - p_save;
-        std::string buf;
-        buf.reserve(len);
-        if (len > 10)
-        {
-            // there's enough room to be #inst or #uuid, copy the
-            // string portion
-            if (std::strncmp(p_save + 1, "inst", 4) == 0) {
-                buf.append(p_save + 7, len - 8);
-            } else if (std::strncmp(p_save + 1, "uuid", 4) == 0) {
-                buf.append(p_save + 7, len - 8);
-            }
-            o = Rice::String(buf);
-            return p;
-        }
-        // tagged element
-        o = Rice::String(buf);
-        return p;
-    }
-    else if (cs == EDN_dispatch_error) {
-        error(*p);
-        return pe;
-    }
-    else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
-    return NULL;
-}
 // ============================================================
 // keyword parsing
@@ -237,7 +164,7 @@ const char* edn::Parser::EDN_parse_tagged(const char *p, const char *pe, Rice::O
 }%%
-const char* edn::Parser::EDN_parse_keyword(const char *p, const char *pe, Rice::Object& o)
+const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
 {
     int cs;
@@ -272,7 +199,7 @@ const char* edn::Parser::EDN_parse_keyword(const char *p, const char *pe, Rice::
     write data;
     action parse_string {
-        if (!EDN_parse_byte_stream(p_save + 1, p, s)) {
+        if (!parse_byte_stream(p_save + 1, p, s)) {
             fhold;
             fbreak;
         } else {
@@ -292,82 +219,7 @@ const char* edn::Parser::EDN_parse_keyword(const char *p, const char *pe, Rice::
 }%%
-//
-// copies the string data, unescaping any present values that need to be replaced
-//
-bool edn::Parser::EDN_parse_byte_stream(const char *p, const char *pe, Rice::String& s)
-{
-    if (pe > p) {
-        std::string buf;
-        std::size_t len = pe - p;
-        // pre-allocate storage needed
-        buf.reserve(len);
-        const char* cp = p;
-        std::size_t pos = 0;
-        char c, replacement;
-        while (cp < pe)
-        {
-            // append any other character that is not the escaping slash
-            if (*cp != '\\') {
-                buf.replace(pos++, 1, 1, *cp++);
-                continue;
-            }
-            // looking at a '\' - check what it escapes if there's a
-            // following character
-            if (++cp == pe)
-                break;
-            c = *cp++;
-            replacement = '?';
-            switch (c)
-            {
-              case 't':
-                  replacement = '\t';
-                  break;
-              case 'n':
-                  replacement = '\n';
-                  break;
-              case 'r':
-                  replacement = '\r';
-                  break;
-              case '\"':
-                  replacement = '\"';
-                  break;
-              case '\\':
-                  replacement = '\\';
-                  break;
-                  /* TODO: add support for this!
-              case 'u':
-                  replacement = '\u';
-                  break;
-                  */
-              default:
-                  std::cerr << "value must be unescaped but case is unhandled: '" << c << "'" << std::endl;
-                  break;
-            }
-            // substitute the escaped walue
-            if (replacement != '?')
-                buf.replace(pos++, 1, 1, replacement);
-        }
-        // utf-8 encode
-        VALUE vs = Rice::protect( rb_str_new2, buf.c_str() );
-        VALUE s_utf8 = Rice::protect( rb_enc_associate, vs, rb_utf8_encoding() );
-        s = Rice::String(s_utf8);
-        return true;
-    }
-    return false;
-}
-const char* edn::Parser::EDN_parse_string(const char *p, const char *pe, Rice::Object& o)
+const char* edn::Parser::parse_string(const char *p, const char *pe, Rice::Object& o)
 {
     static const char* EDN_TYPE = "string";
     int cs;
@@ -410,7 +262,7 @@ const char* edn::Parser::EDN_parse_string(const char *p, const char *pe, Rice::O
 }%%
-const char* edn::Parser::EDN_parse_decimal(const char *p, const char *pe, Rice::Object& o)
+const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Object& o)
 {
     int cs;
@@ -441,7 +293,7 @@ const char* edn::Parser::EDN_parse_decimal(const char *p, const char *pe, Rice::
     main := '-'? ('0' | [1-9][0-9]* [M]?) (^[0-9M]? @exit);
 }%%
-const char* edn::Parser::EDN_parse_integer(const char *p, const char *pe, Rice::Object& o)
+const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Object& o)
 {
     int cs;
@@ -469,7 +321,7 @@ const char* edn::Parser::EDN_parse_integer(const char *p, const char *pe, Rice::
     action parse_value {
         Rice::Object v;
-        const char *np = EDN_parse_value(fpc, pe, v);
+        const char *np = parse_value(fpc, pe, v);
         if (np == NULL) {
             fhold; fbreak;
         } else {
@@ -481,7 +333,7 @@ const char* edn::Parser::EDN_parse_integer(const char *p, const char *pe, Rice::
     action parse_dispatch {
         bool discard = false;
         Rice::Object v;
-        const char *np = EDN_parse_tagged(fpc, pe, v, discard);
+        const char *np = parse_tagged(fpc, pe, v, discard);
         if (np == NULL) {
             fhold; fbreak;
         } else {
@@ -520,7 +372,7 @@ const char* edn::Parser::EDN_parse_integer(const char *p, const char *pe, Rice::
 //
 // vector parsing
 //
-const char* edn::Parser::EDN_parse_vector(const char *p, const char *pe, Rice::Object& o)
+const char* edn::Parser::parse_vector(const char *p, const char *pe, Rice::Object& o)
 {
     static const char* EDN_TYPE = "vector";
@@ -563,7 +415,7 @@ const char* edn::Parser::EDN_parse_vector(const char *p, const char *pe, Rice::O
 //
 // list parsing
 //
-const char* edn::Parser::EDN_parse_list(const char *p, const char *pe, Rice::Object& o)
+const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object& o)
 {
     static const char* EDN_TYPE = "list";
@@ -597,7 +449,7 @@ const char* edn::Parser::EDN_parse_list(const char *p, const char *pe, Rice::Obj
     write data;
     action parse_key {
-        const char *np = EDN_parse_value(fpc, pe, k);
+        const char *np = parse_value(fpc, pe, k);
         if (np == NULL) {
             fhold; fbreak;
         } else {
@@ -606,7 +458,7 @@ const char* edn::Parser::EDN_parse_list(const char *p, const char *pe, Rice::Obj
     }
     action parse_value {
-        const char *np = EDN_parse_value(fpc, pe, v);
+        const char *np = parse_value(fpc, pe, v);
         if (np == NULL) {
             fhold; fbreak;
         } else {
@@ -634,7 +486,7 @@ const char* edn::Parser::EDN_parse_list(const char *p, const char *pe, Rice::Obj
 }%%
-const char* edn::Parser::EDN_parse_map(const char *p, const char *pe, Rice::Object& o)
+const char* edn::Parser::parse_map(const char *p, const char *pe, Rice::Object& o)
 {
     static const char* EDN_TYPE = "map";
@@ -659,6 +511,82 @@ const char* edn::Parser::EDN_parse_map(const char *p, const char *pe, Rice::Obje
+// ============================================================
+// tagged element parsing - any of #uuid, #inst, #{, #(some symbol)
+// discard (#_ <ident>) is handled by the top-level machine
+//
+// NOTE: this is not fully implemented yet
+//
+%%{
+    machine EDN_dispatch;
+    include EDN_common;
+    begin_discard  = '_';
+    begin_set      = '{';
+    end_set        = '}';
+    write data;
+    action exit { fhold; fbreak; }
+    main := begin_dispatch (
+                            (begin_discard (space)? ([a-zA-Z0-9\-\.]*)) |
+                            ('inst ' string_delim ([0-9\-+:\.TZ])* string_delim) |
+                            ('uuid ' string_delim ([a-f0-9\-]* string_delim))
+                            )
+        (^[a-zA-Z0-9:\.\-+ ]* @exit);
+}%%
+const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Object& o, bool& discard) // runs the EDN_dispatch machine from p; sets o or discard and returns where to resume, pe after reporting an error, or NULL otherwise
+{
+    int cs;
+    Rice::String str; // NOTE(review): not referenced anywhere in the visible body
+    %% write init;
+    p_save = p; // remember where the dispatch began (p_save is not declared in this function)
+    %% write exec;
+    if (cs >= EDN_dispatch_first_final) {
+        // discard form: the character right after the dispatch start is '_'; flag it so the caller can drop the token
+        if (*(p_save + 1) == '_')
+        {
+            discard = true;
+            return p + 1;
+        }
+        std::size_t len = p - p_save; // number of characters between the saved start and the current position
+        std::string buf;
+        buf.reserve(len);
+        if (len > 10)
+        {
+            // long enough to hold #inst or #uuid: copy from offset 7 (past the tag and, presumably, the
+            // opening string delimiter) for len - 8 characters, which also drops the final character
+            if (std::strncmp(p_save + 1, "inst", 4) == 0) {
+                buf.append(p_save + 7, len - 8);
+            } else if (std::strncmp(p_save + 1, "uuid", 4) == 0) { // NOTE(review): performs the same copy as the inst branch
+                buf.append(p_save + 7, len - 8);
+            }
+            o = Rice::String(buf); // buf is still empty here when neither tag matched
+            return p;
+        }
+        // any other match (len <= 10): buf was only reserved, never filled, so o becomes an empty string
+        o = Rice::String(buf);
+        return p;
+    }
+    else if (cs == EDN_dispatch_error) {
+        error(*p); // report the character the machine stopped on
+        return pe;
+    }
+    else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
+    return NULL;
+}
 // ============================================================
 // main parsing machine
 //
@@ -669,17 +597,17 @@ const char* edn::Parser::EDN_parse_map(const char *p, const char *pe, Rice::Obje
     write data nofinal;
     action parse_vector {
-        const char* np = EDN_parse_vector(fpc, pe, result);
+        const char* np = parse_vector(fpc, pe, result);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
     action parse_map {
-        const char *np = EDN_parse_map(fpc, pe, result);
+        const char *np = parse_map(fpc, pe, result);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
     action parse_list {
-        const char *np = EDN_parse_list(fpc, pe, result);
+        const char *np = parse_list(fpc, pe, result);
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }

data/ext/edn_turbo/edn_parser_def.cc CHANGED Viewed

@@ -1,41 +1,39 @@
 #include <iostream>
 #include <string>
 #include <fstream>
 #include <rice/Object.hpp>
+#include <ruby/ruby.h>
+#include <ruby/encoding.h>
 #include "edn_parser.h"
 namespace edn
 {
-    // ============================================================
-    // reads the contents of a file and begins the parsing process
     //
-    Rice::Object Parser::open(const std::string& file)
+    // copies the string data, unescaping any present values that need to be replaced
+    //
+    bool Parser::parse_byte_stream(const char *p_start, const char *p_end, Rice::String& s)
     {
-        Rice::Object rslt = Qnil;
-        std::ifstream f(file);
-        if (f.is_open())
-        {
-            // determine the length of the file
-            f.seekg(0, f.end);
-            long len = f.tellg();
-            f.seekg(0, f.beg);
-            // read its contents
-            char* buf = new char[len];
-            f.read(buf, len);
-            f.close();
-            // parse the buffer
-            rslt = parse(buf, len);
-            delete [] buf;
+        if (p_end > p_start) {
+            std::string buf;
+            std::size_t len = p_end - p_start;
+            if (unicode_to_utf8(p_start, len, buf))
+            {
+                // utf-8 encode
+                VALUE vs = Rice::protect( rb_str_new2, buf.c_str() );
+                VALUE s_utf8 = Rice::protect( rb_enc_associate, vs, rb_utf8_encoding() );
+                s = Rice::String(s_utf8);
+                return true;
+            }
         }
-        return rslt;
+        return false;
     }
     //
     // error reporting
     void Parser::error(const std::string& err, char c) const

data/ext/edn_turbo/edn_parser_unicode.cc ADDED Viewed

@@ -0,0 +1,29 @@
+#include <string>
+//
+// needed to define this in its own file because icu and ruby have
+// differing definitions for Uchar and the compiler complains
+//
+#include <unicode/utypes.h>
+#include <unicode/ustring.h>
+#include <unicode/ucnv.h>
+#include "edn_parser.h"
+namespace edn
+{
+    // Builds an ICU string from the `len` bytes at `s`, unescapes it, and writes the result into `rslt` as UTF-8.
+    // Returns false only when ICU flags the freshly built string as bogus; every other path returns true.
+    // NOTE(review): the result of unescape() is not inspected here - confirm how ICU reports a malformed escape.
+    bool Parser::unicode_to_utf8(const char *s, std::size_t len, std::string& rslt)
+    {
+        icu::UnicodeString ustr(s, len); // NOTE(review): len is a std::size_t - confirm which UnicodeString constructor (and length type) this selects
+        if (ustr.isBogus()) {
+            return false;
+        }
+        ustr.unescape().toUTF8String(rslt); // the unescaped value is a temporary; ustr itself is not modified or reused
+        return true;
+    }
+}

data/ext/edn_turbo/extconf.rb CHANGED Viewed

@@ -1,3 +1,26 @@
 require 'mkmf-rice'
+HEADER_DIRS = [ # directories handed to find_header below when looking for the ICU headers
+  '/usr/local/include',
+  '/usr/local/opt/icu4c/include',
+  '/usr/include'
+]
+LIB_DIRS = [ # only LIB_DIRS[1] is used by the active code (see $LOCAL_LIBS below)
+  '/usr/local/lib', # must be the first entry; add others after it
+  '/usr/local/opt/icu4c/lib'
+]
+unless find_header('unicode/uversion.h', *HEADER_DIRS) # stop the build early when the ICU headers cannot be located
+  abort "icu4c headers missing"
+end
+# NOTE(review): the have_library probe below is disabled because it never succeeded; ICU spells the function ucnv_close, so 'uconv_close' may simply be a misspelling - confirm before re-enabling
+#unless have_library('icuuc', 'uconv_close', *LIB_DIRS)
+#  abort "ic4c lib missing"
+#end
+# workaround: hand the icu4c lib dir (LIB_DIRS[1]) and -licuuc to the linker directly
+$LOCAL_LIBS="-L#{LIB_DIRS[1]} -licuuc"
 create_makefile("edn_turbo/edn_turbo") # writes the Makefile for the edn_turbo/edn_turbo extension target

data/ext/edn_turbo/main.cc CHANGED Viewed

@@ -41,7 +41,6 @@ void Init_edn_turbo(void)
         Rice::define_class_under<edn::Parser>(rb_mEDNT, "Parser")
         .define_constructor(Rice::Constructor<edn::Parser>())
         .define_method("ext_read", &edn::Parser::process, (Rice::Arg("data")))
-        .define_method("ext_open", &edn::Parser::open, (Rice::Arg("file")))
         ;
     // import whatever else we've defined in the ruby side

data/lib/edn_turbo/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module EDNT
-  VERSION = '0.1.0'
+  VERSION = '0.1.1'
 end

data/test/test_output_diff.rb CHANGED Viewed

@@ -38,6 +38,15 @@ class EDNT_Test < Minitest::Test
               )
   end
+  def test_unicode # presumably parses test/unicode.edn and compares it with the expected values below (check_file is defined outside this hunk)
+    check_file('test/unicode.edn',
+               [:text,
+                "Page \u0018, October 2009 TechTIPS", # NOTE(review): \u0018 is the CANCEL control character - confirm the fixture really contains it
+                "This should be an unfilled star: ☆"]
+              )
+  end
   def test_vector
     check_file('test/vector_1.edn',

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: edn_turbo
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Ed Porras
@@ -70,6 +70,7 @@ files:
 - ext/edn_turbo/edn_parser.h
 - ext/edn_turbo/edn_parser.rl
 - ext/edn_turbo/edn_parser_def.cc
+- ext/edn_turbo/edn_parser_unicode.cc
 - ext/edn_turbo/extconf.rb
 - ext/edn_turbo/main.cc
 - lib/edn_turbo.rb