ruby-sfst 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
| @@ -0,0 +1,179 @@ | |
| 1 | 
            +
            %option 8Bit batch yylineno noyywrap
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            /* the "incl" state is used to pick up the name of an include file */
         | 
| 4 | 
            +
            %x incl
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            %{
         | 
| 7 | 
            +
            /*******************************************************************/
         | 
| 8 | 
            +
            /*                                                                 */
         | 
| 9 | 
            +
            /*  FILE     scanner.ll                                            */
         | 
| 10 | 
            +
            /*  MODULE   scanner                                               */
         | 
| 11 | 
            +
            /*  PROGRAM  SFST                                                  */
         | 
| 12 | 
            +
            /*  AUTHOR   Helmut Schmid, IMS, University of Stuttgart           */
         | 
| 13 | 
            +
            /*                                                                 */
         | 
| 14 | 
            +
            /*******************************************************************/
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            #include <string.h>
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            #include "interface.h"
         | 
| 19 | 
            +
            #include "fst-compiler.h"
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            #define MAX_INCLUDE_DEPTH 10
         | 
| 22 | 
            +
              
         | 
| 23 | 
            +
            int Include_Stack_Ptr = 0;
         | 
| 24 | 
            +
            YY_BUFFER_STATE Include_Stack[MAX_INCLUDE_DEPTH];
         | 
| 25 | 
            +
            char *Name_Stack[MAX_INCLUDE_DEPTH];
         | 
| 26 | 
            +
            int  Lineno_Stack[MAX_INCLUDE_DEPTH];
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            char *FileName;
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            bool UTF8=true;
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            static char *unquote(char *string, bool del_quote=true) {
         | 
| 33 | 
            +
              char *s=string, *result=string;
         | 
| 34 | 
            +
              if (del_quote)
         | 
| 35 | 
            +
                string++;
         | 
| 36 | 
            +
             | 
| 37 | 
            +
              while (*string) {
         | 
| 38 | 
            +
                if (*string == '\\')
         | 
| 39 | 
            +
                  string++;
         | 
| 40 | 
            +
                *(s++) = *(string++);
         | 
| 41 | 
            +
              }
         | 
| 42 | 
            +
             | 
| 43 | 
            +
              if (del_quote)
         | 
| 44 | 
            +
                s--;
         | 
| 45 | 
            +
              *s = '\0';
         | 
| 46 | 
            +
             | 
| 47 | 
            +
              return fst_strdup(result);
         | 
| 48 | 
            +
            }
         | 
| 49 | 
            +
             | 
| 50 | 
            +
            static void print_lineno() {
         | 
| 51 | 
            +
              if (!Verbose)
         | 
| 52 | 
            +
                return;
         | 
| 53 | 
            +
              fputc('\r',stderr);
         | 
| 54 | 
            +
              for( int i=0; i<Include_Stack_Ptr; i++ )
         | 
| 55 | 
            +
                fputs("  ", stderr);
         | 
| 56 | 
            +
              fprintf(stderr,"%s: %d", FileName, yylineno);
         | 
| 57 | 
            +
            }
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            extern void yyerror(char *text);
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            %}
         | 
| 62 | 
            +
             | 
| 63 | 
            +
            CC	[\x80-\xbf]
         | 
| 64 | 
            +
            C1	[A-Za-z0-9._/\-]
         | 
| 65 | 
            +
            C2	[A-Za-z0-9._/\-&()+,=?\^|~]
         | 
| 66 | 
            +
            C3	[A-Za-z0-9._/\-&()+,=?\^|~#<>]
         | 
| 67 | 
            +
            C4	[A-Za-z0-9._/\-&()+,=?\^|~$<>]
         | 
| 68 | 
            +
            C5	[\!-;\?-\[\]-\~=]
         | 
| 69 | 
            +
            FN	[A-Za-z0-9._/\-*+]
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            %%
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            #include           BEGIN(incl);
         | 
| 74 | 
            +
            <incl>[ \t]*       /* eat the whitespace */
         | 
| 75 | 
            +
            <incl>{FN}+        { error2("Missing quotes",yytext); }
         | 
| 76 | 
            +
            <incl>\"{FN}+\"    { /* got the include file name */
                                 FILE *file;
                                 char *name;

                                 /* check the nesting depth first, so that no
                                    name is allocated when the limit is hit */
                                 if ( Include_Stack_Ptr >= MAX_INCLUDE_DEPTH )
                                   {
                                     fprintf( stderr, "Includes nested too deeply" );
                                     exit( 1 );
                                   }
                                 name = unquote(yytext);
                                 if (Verbose) fputc('\n', stderr);
                                 file = fopen( name, "rt" );
                                 if (!file)
                                   /* error2 is defined elsewhere; whether it
                                      returns is not visible from here */
                                   error2("Can't open include file",name);
                                 else
                                   {
                                     /* remember the state of the including file */
                                     Name_Stack[Include_Stack_Ptr] = FileName;
                                     FileName = name;
                                     Lineno_Stack[Include_Stack_Ptr] = yylineno;
                                     yylineno = 1;
                                     Include_Stack[Include_Stack_Ptr++]=YY_CURRENT_BUFFER;
                                     /* yyin must already be the new file when
                                        the buffer is created, so that the
                                        buffer is bound to the include file
                                        and not to the including one */
                                     yyin = file;
                                     yy_switch_to_buffer(
                                          yy_create_buffer(yyin, YY_BUF_SIZE));
                                     print_lineno();
                                     BEGIN(INITIAL);
                                   }
                              }
         | 
| 102 | 
            +
            <<EOF>>           {
                                 if (Verbose)
                                   fputc('\n', stderr);
                                 if ( Include_Stack_Ptr <= 0 )
                                   /* end of the top-level input: stop scanning
                                      and keep the depth counter at 0, so that
                                      the include stacks are never indexed
                                      with a negative value afterwards */
                                   yyterminate();
                                 else
                                   {
                                     /* end of an include file: go back to the
                                        including file */
                                     Include_Stack_Ptr--;
                                     /* yyin is the stream that was opened for
                                        the include file; release it */
                                     fclose(yyin);
                                     free(FileName);
                                     FileName = Name_Stack[Include_Stack_Ptr];
                                     yylineno = Lineno_Stack[Include_Stack_Ptr];
                                     yy_delete_buffer( YY_CURRENT_BUFFER );
                                     yy_switch_to_buffer(Include_Stack[Include_Stack_Ptr]);
                                   }
                              }
         | 
| 116 | 
            +
             | 
| 117 | 
            +
             | 
| 118 | 
            +
            ^[ \t]*\%.*\r?\n  { print_lineno();  /* ignore comments */ }
         | 
| 119 | 
            +
             | 
| 120 | 
            +
            \%.*\\[ \t]*\r?\n { print_lineno();  /* ignore comments */ }
         | 
| 121 | 
            +
             | 
| 122 | 
            +
            \%.*              { /* ignore comments */ }
         | 
| 123 | 
            +
             | 
| 124 | 
            +
             | 
| 125 | 
            +
            ^[ \t]*ALPHABET[ \t]*= { return ALPHA; }
         | 
| 126 | 
            +
             | 
| 127 | 
            +
            \|\|              { return COMPOSE; }
         | 
| 128 | 
            +
            "<=>"             { yylval.type = twol_both; return ARROW; }
         | 
| 129 | 
            +
            "=>"              { yylval.type = twol_right;return ARROW; }
         | 
| 130 | 
            +
            "<="              { yylval.type = twol_left; return ARROW; }
         | 
| 131 | 
            +
            "^->"             { yylval.rtype = repl_up;   return REPLACE; }
         | 
| 132 | 
            +
            "_->"             { yylval.rtype = repl_down; return REPLACE; }
         | 
| 133 | 
            +
            "/->"             { yylval.rtype = repl_right;return REPLACE; }
         | 
| 134 | 
            +
            "\\->"            { yylval.rtype = repl_left; return REPLACE; }
         | 
| 135 | 
            +
            ">>"              { return PRINT; }
         | 
| 136 | 
            +
            "<<"              { return INSERT; }
         | 
| 137 | 
            +
            "__"              { return POS; }
         | 
| 138 | 
            +
            "^_"              { return REV; }
         | 
| 139 | 
            +
             | 
| 140 | 
            +
            [.,{}\[\]()&!?|*+:=_\^\-] { return yytext[0]; }
         | 
| 141 | 
            +
             | 
| 142 | 
            +
            \$=({C3}|(\\.))+\$ { yylval.name = fst_strdup(yytext); return RVAR; }
         | 
| 143 | 
            +
             | 
| 144 | 
            +
            \$({C3}|(\\.))+\$ { yylval.name = fst_strdup(yytext); return VAR; }
         | 
| 145 | 
            +
             | 
| 146 | 
            +
            #=({C4}|(\\.))+# { yylval.name = fst_strdup(yytext); return RSVAR; }
         | 
| 147 | 
            +
             | 
| 148 | 
            +
            #({C4}|(\\.))+# { yylval.name = fst_strdup(yytext); return SVAR; }
         | 
| 149 | 
            +
             | 
| 150 | 
            +
            \<({C5}|\\.)*\>   { yylval.name = unquote(yytext,false); return SYMBOL; }
         | 
| 151 | 
            +
             | 
| 152 | 
            +
            \"<{FN}+>\" { 
         | 
| 153 | 
            +
                                yylval.value = unquote(yytext)+1;
         | 
| 154 | 
            +
            		    yylval.value[strlen(yylval.value)-1] = 0;
         | 
| 155 | 
            +
                                return STRING2;
         | 
| 156 | 
            +
                              }
         | 
| 157 | 
            +
             | 
| 158 | 
            +
            \"{FN}+\" { 
         | 
| 159 | 
            +
                                yylval.value = unquote(yytext);
         | 
| 160 | 
            +
                                return STRING;
         | 
| 161 | 
            +
                              }
         | 
| 162 | 
            +
             | 
| 163 | 
            +
            [ \t]             { /* ignored */ }
         | 
| 164 | 
            +
            \\[ \t]*([ \t]\%.*)?\r?\n { print_lineno(); /* ignored */ }
         | 
| 165 | 
            +
            \r?\n             { print_lineno(); return NEWLINE; }
         | 
| 166 | 
            +
             | 
| 167 | 
            +
            \\[0-9]+          { /* character given by its decimal code, e.g. \228 */
                                /* strtol saturates on overflow, so an overlong
                                   digit string is rejected by the range test */
                                long l=strtol(yytext+1, NULL, 10);
                                /* 0x10FFFF (1114111) is the largest Unicode
                                   code point */
                                if (l <= 0x10FFFF) { yylval.uchar=l; return CHARACTER; }
                                yyerror("invalid expression");
                              }
         | 
| 171 | 
            +
             | 
| 172 | 
            +
             | 
| 173 | 
            +
            \\.                { yylval.value=fst_strdup(yytext+1); return UTF8CHAR; }
         | 
| 174 | 
            +
            [\x00-\x7f]        { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
         | 
| 175 | 
            +
            [\xc0-\xdf]{CC}    { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
         | 
| 176 | 
            +
            [\xe0-\xef]{CC}{2} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
         | 
| 177 | 
            +
            [\xf0-\xff]{CC}{3} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
         | 
| 178 | 
            +
             | 
| 179 | 
            +
            %%
         | 
| @@ -0,0 +1,146 @@ | |
| 1 | 
            +
             | 
| 2 | 
            +
            /*******************************************************************/
         | 
| 3 | 
            +
            /*                                                                 */
         | 
| 4 | 
            +
            /*     File: utf8.C                                                */
         | 
| 5 | 
            +
            /*   Author: Helmut Schmid                                         */
         | 
| 6 | 
            +
            /*  Purpose:                                                       */
         | 
| 7 | 
            +
            /*  Created: Mon Sep  5 17:49:16 2005                              */
         | 
| 8 | 
            +
            /* Modified: Mon Mar  3 11:00:53 2008 (schmid)                     */
         | 
| 9 | 
            +
            /*                                                                 */
         | 
| 10 | 
            +
            /*******************************************************************/
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            #include "string.h"
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            #include "utf8.h"
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            const unsigned char get3LSbits=7;
         | 
| 17 | 
            +
            const unsigned char get4LSbits=15;
         | 
| 18 | 
            +
            const unsigned char get5LSbits=31;
         | 
| 19 | 
            +
            const unsigned char get6LSbits=63;
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            const unsigned char set1MSbits=128;
         | 
| 22 | 
            +
            const unsigned char set2MSbits=192;
         | 
| 23 | 
            +
            const unsigned char set3MSbits=224;
         | 
| 24 | 
            +
            const unsigned char set4MSbits=240;
         | 
| 25 | 
            +
             | 
| 26 | 
            +
             | 
| 27 | 
            +
             | 
| 28 | 
            +
            /*******************************************************************/
         | 
| 29 | 
            +
            /*                                                                 */
         | 
| 30 | 
            +
            /*  int2utf8                                                       */
         | 
| 31 | 
            +
            /*                                                                 */
         | 
| 32 | 
            +
            /*******************************************************************/
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            char *int2utf8( unsigned int sym )
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            {
         | 
| 37 | 
            +
              static unsigned char ch[5];
         | 
| 38 | 
            +
             | 
| 39 | 
            +
              if (sym < 128) {
         | 
| 40 | 
            +
                // 1-byte UTF8 symbol, 7 bits
         | 
| 41 | 
            +
                ch[0] = sym;
         | 
| 42 | 
            +
                ch[1] = 0;
         | 
| 43 | 
            +
              }
         | 
| 44 | 
            +
              
         | 
| 45 | 
            +
              else if (sym < 2048) {
         | 
| 46 | 
            +
                // 2-byte UTF8 symbol, 5+6 bits
         | 
| 47 | 
            +
                ch[0] = (sym >> 6) | set2MSbits;
         | 
| 48 | 
            +
                ch[1] = (sym & get6LSbits) | set1MSbits;
         | 
| 49 | 
            +
                ch[2] = 0;
         | 
| 50 | 
            +
              }
         | 
| 51 | 
            +
              
         | 
| 52 | 
            +
              else if (sym < 65536) {
         | 
| 53 | 
            +
                // 3-byte UTF8 symbol, 4+6+6 bits
         | 
| 54 | 
            +
                ch[0] = (sym >> 12) | set3MSbits;
         | 
| 55 | 
            +
                ch[1] = ((sym >> 6) & get6LSbits) | set1MSbits;
         | 
| 56 | 
            +
                ch[2] = (sym & get6LSbits) | set1MSbits;
         | 
| 57 | 
            +
                ch[3] = 0;
         | 
| 58 | 
            +
              }
         | 
| 59 | 
            +
              
         | 
| 60 | 
            +
              else if (sym < 2097152) {
         | 
| 61 | 
            +
                // 4-byte UTF8 symbol, 3+6+6+6 bits
         | 
| 62 | 
            +
                ch[0] = (sym >> 18) | set4MSbits;
         | 
| 63 | 
            +
                ch[1] = ((sym >> 12) & get6LSbits) | set1MSbits;
         | 
| 64 | 
            +
                ch[2] = ((sym >> 6) & get6LSbits) | set1MSbits;
         | 
| 65 | 
            +
                ch[3] = (sym & get6LSbits) | set1MSbits;
         | 
| 66 | 
            +
                ch[4] = 0;
         | 
| 67 | 
            +
              }
         | 
| 68 | 
            +
              
         | 
| 69 | 
            +
              else
         | 
| 70 | 
            +
                return NULL;
         | 
| 71 | 
            +
             | 
| 72 | 
            +
              return (char*)ch;
         | 
| 73 | 
            +
            }
         | 
| 74 | 
            +
             | 
| 75 | 
            +
             | 
| 76 | 
            +
            /*******************************************************************/
         | 
| 77 | 
            +
            /*                                                                 */
         | 
| 78 | 
            +
            /*  utf8toint                                                      */
         | 
| 79 | 
            +
            /*                                                                 */
         | 
| 80 | 
            +
            /*******************************************************************/
         | 
| 81 | 
            +
             | 
| 82 | 
            +
            unsigned int utf8toint( char **s )
         | 
| 83 | 
            +
             | 
| 84 | 
            +
            {
         | 
| 85 | 
            +
              int bytes_to_come;
         | 
| 86 | 
            +
              unsigned int result=0;
         | 
| 87 | 
            +
              unsigned char c=(unsigned char)**s;
         | 
| 88 | 
            +
             | 
| 89 | 
            +
              if (c >= (unsigned char)set4MSbits) { // 1111xxxx
         | 
| 90 | 
            +
                bytes_to_come = 3;
         | 
| 91 | 
            +
                result = (result << 3) | (c & get3LSbits);
         | 
| 92 | 
            +
              }
         | 
| 93 | 
            +
                  
         | 
| 94 | 
            +
              else if (c >= (unsigned char) set3MSbits) { // 1110xxxx
         | 
| 95 | 
            +
                // start of a three-byte symbol
         | 
| 96 | 
            +
                bytes_to_come = 2;
         | 
| 97 | 
            +
                result = (result << 4) | (c & get4LSbits);
         | 
| 98 | 
            +
              }
         | 
| 99 | 
            +
                  
         | 
| 100 | 
            +
              else if (c >= (unsigned char) set2MSbits) { // 1100xxxx
         | 
| 101 | 
            +
                // start of a two-byte symbol
         | 
| 102 | 
            +
                bytes_to_come = 1;
         | 
| 103 | 
            +
                result = (result << 5) | (c & get5LSbits);
         | 
| 104 | 
            +
              }
         | 
| 105 | 
            +
                  
         | 
| 106 | 
            +
              else if (c < (unsigned char) set1MSbits) { // 0100xxxx
         | 
| 107 | 
            +
                // one-byte symbol
         | 
| 108 | 
            +
                bytes_to_come = 0;
         | 
| 109 | 
            +
                result = c;
         | 
| 110 | 
            +
              }
         | 
| 111 | 
            +
             | 
| 112 | 
            +
              else
         | 
| 113 | 
            +
                return 0; // error
         | 
| 114 | 
            +
             | 
| 115 | 
            +
              while (bytes_to_come > 0) {
         | 
| 116 | 
            +
                bytes_to_come--;
         | 
| 117 | 
            +
                (*s)++;
         | 
| 118 | 
            +
                c = (unsigned char)**s;
         | 
| 119 | 
            +
                if (c < (unsigned char) set2MSbits &&
         | 
| 120 | 
            +
            	c >= (unsigned char) set1MSbits)    // 1000xxxx
         | 
| 121 | 
            +
                  {
         | 
| 122 | 
            +
            	result = (result << 6) | (c & get6LSbits);
         | 
| 123 | 
            +
                  }
         | 
| 124 | 
            +
                else
         | 
| 125 | 
            +
                  return 0;
         | 
| 126 | 
            +
              }
         | 
| 127 | 
            +
             | 
| 128 | 
            +
              (*s)++;
         | 
| 129 | 
            +
              return result;
         | 
| 130 | 
            +
            }
         | 
| 131 | 
            +
             | 
| 132 | 
            +
             | 
| 133 | 
            +
            /*******************************************************************/
         | 
| 134 | 
            +
            /*                                                                 */
         | 
| 135 | 
            +
            /*  utf8toint                                                      */
         | 
| 136 | 
            +
            /*                                                                 */
         | 
| 137 | 
            +
            /*******************************************************************/
         | 
| 138 | 
            +
             | 
| 139 | 
            +
            unsigned int utf8toint( char *s )
         | 
| 140 | 
            +
             | 
| 141 | 
            +
            {
         | 
| 142 | 
            +
              unsigned int result = utf8toint( &s );
         | 
| 143 | 
            +
              if (*s == 0) // all bytes converted?
         | 
| 144 | 
            +
                return result;
         | 
| 145 | 
            +
              return 0;
         | 
| 146 | 
            +
            }
         | 
| @@ -0,0 +1,19 @@ | |
| 1 | 
            +
             | 
| 2 | 
            +
            /*******************************************************************/
         | 
| 3 | 
            +
            /*                                                                 */
         | 
| 4 | 
            +
            /*     File: utf8.h                                                */
         | 
| 5 | 
            +
            /*   Author: Helmut Schmid                                         */
         | 
| 6 | 
            +
            /*  Purpose:                                                       */
         | 
| 7 | 
            +
            /*  Created: Mon Sep  5 17:49:16 2005                              */
         | 
| 8 | 
            +
            /* Modified: Mon Apr  7 08:26:39 2008 (schmid)                     */
         | 
| 9 | 
            +
            /*                                                                 */
         | 
| 10 | 
            +
            /*******************************************************************/
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            #ifndef _UTF8_H_
         | 
| 13 | 
            +
            #define _UTF8_H_
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            unsigned int utf8toint( char *s );  // code of the single UTF-8 symbol in s; 0 if s is not exactly one symbol
         | 
| 16 | 
            +
            unsigned int utf8toint( char **s );  // decodes the symbol at *s and advances *s; 0 on error
         | 
| 17 | 
            +
            char *int2utf8( unsigned int );  // UTF-8 bytes in a static buffer; NULL for values >= 2097152
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            #endif
         | 
    
        data/lib/sfst.rb
    ADDED
    
    | @@ -0,0 +1,99 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            # sfst.rb - SFST interface
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # Written by Marius L. Jøhndal, 2008.
         | 
| 6 | 
            +
            #
         | 
| 7 | 
            +
            require 'sfst_machine'
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            module SFST
         | 
| 10 | 
            +
              # Compiles an SFST transducer +source+ and saves it as +machine+.
         | 
| 11 | 
            +
              #
         | 
| 12 | 
            +
              # ==== Options
         | 
| 13 | 
            +
              # compact:: Compile a compact transducer.
         | 
| 14 | 
            +
              def self.compile(source, machine, options = {})
                # Dispatch on the +compact+ option. The two +_compile_*+
                # methods are not defined in this file (presumably they come
                # from the sfst_machine extension required above).
                if options[:compact]
                  _compile_compact(source, machine)
                else
                  _compile_regular(source, machine)
                end
              end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
              # A regular, i.e. not a compact, transducer.
         | 
| 23 | 
            +
              class RegularTransducer
         | 
| 24 | 
            +
                def initialize(file)
         | 
| 25 | 
            +
                  @fst = RegularTransducerMachine.new(file)
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                # Analyses a string +string+. Returns an array of analysed
         | 
| 29 | 
            +
                # strings if the string is accepted, or an empty array if not.
         | 
| 30 | 
            +
                #
         | 
| 31 | 
            +
                # ==== Options
         | 
| 32 | 
            +
                # * +symbol_sequence+ - Return each analysis as a sequence of symbols.
         | 
| 33 | 
            +
                #   Multicharacter symbols will be strings on the form +<symbol>+.
         | 
| 34 | 
            +
                def analyze(string, options = {})
         | 
| 35 | 
            +
                  x = []
         | 
| 36 | 
            +
                  @fst._analyze(string) do |a| 
         | 
| 37 | 
            +
                    if options[:symbol_sequence]
         | 
| 38 | 
            +
                      x << a.map { |s| s.match(/^<(.*)>$/) ? $1.to_sym : s }
         | 
| 39 | 
            +
                    else
         | 
| 40 | 
            +
                      x << a.join
         | 
| 41 | 
            +
                    end
         | 
| 42 | 
            +
                  end
         | 
| 43 | 
            +
                  x
         | 
| 44 | 
            +
                end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                # Checks if the string +string+ is accepted for analysis.
         | 
| 47 | 
            +
                def accepted_analysis?(string)
         | 
| 48 | 
            +
                  @fst._analyze(string)
         | 
| 49 | 
            +
                end
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                # Generates a string +string+. Returns an array of generated
         | 
| 52 | 
            +
                # strings if the string is accepted or an empty array if not.
         | 
| 53 | 
            +
                def generate(string)
         | 
| 54 | 
            +
                  x = []
         | 
| 55 | 
            +
                  @fst._generate(string) { |a| x << a.join }
         | 
| 56 | 
            +
                  x
         | 
| 57 | 
            +
                end
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                # Checks if the string +string+ is accepted for generating.
         | 
| 60 | 
            +
                def accepted_generating?(string)
         | 
| 61 | 
            +
                  @fst._generate(string)
         | 
| 62 | 
            +
                end
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                # Generates upper or lower level or both. This only works with
         | 
| 65 | 
            +
                # non-compact transducers.
         | 
| 66 | 
            +
                #
         | 
| 67 | 
            +
                # ==== Options
         | 
| 68 | 
            +
                # * +levels+ - if <tt>:upper</tt>, generates only upper level. If <tt>:lower</tt> generates 
         | 
| 69 | 
            +
                #   only lower level. If <tt>:both</tt>, generates both. Default is <tt>:both</tt>.
         | 
| 70 | 
            +
                # * +epsilons+ - if +true+, produces epsilons. Default is +false+.
         | 
| 71 | 
            +
                def generate_language(options = {}, &block)
                  # Use :both when no :levels option was supplied.
                  levels = options[:levels] || :both
                  # The wrapped machine is given a symbol, not a boolean, to
                  # select epsilon handling.
                  epsilon_mode = options[:epsilons] ? :all : :noepsilons
                  @fst._generate_language(levels, epsilon_mode, &block)
                end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                # Makes +analyse+ an alternative name for +analyze+.
                alias :analyse :analyze
         | 
| 76 | 
            +
              end
         | 
| 77 | 
            +
             | 
| 78 | 
            +
              # A compact transducer.
         | 
| 79 | 
            +
              class CompactTransducer < CompactTransducerMachine
                # Builds a CompactTransducerMachine from +file+ and keeps it as the
                # machine that the methods below delegate to.
                def initialize(file)
                  @fst = CompactTransducerMachine.new(file)
                end

                # Analyses +form+. Returns an array holding every result the
                # machine yields, in the order received; the array is empty when
                # nothing is yielded.
                def analyze(form)
                  analyses = []
                  @fst._analyze(form) do |analysis|
                    analyses << analysis
                  end
                  analyses
                end

                # Checks if +string+ is accepted for analysis by calling the
                # machine's +_analyze+ without a block and returning its result.
                def accepted_analysis?(string)
                  @fst._analyze(string)
                end

                # Makes +analyse+ an alternative name for +analyze+.
                alias :analyse :analyze
              end
         | 
| 99 | 
            +
            end
         | 
    
        data/ruby-sfst.gemspec
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
| 1 | 
            +
            Gem::Specification.new do |s|
         | 
| 2 | 
            +
              s.name = %q{ruby-sfst}
         | 
| 3 | 
            +
              s.version = "0.1.0"
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              s.required_rubygems_version = Gem::Requirement.new("= 1.2") if s.respond_to? :required_rubygems_version=
         | 
| 6 | 
            +
              s.authors = ["Marius L. J\303\270hndal"]
         | 
| 7 | 
            +
              s.date = %q{2008-08-04}
         | 
| 8 | 
            +
              s.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
         | 
| 9 | 
            +
              s.email = %q{mariuslj (at) ifi [dot] uio (dot) no}
         | 
| 10 | 
            +
              s.extensions = ["ext/sfst_machine/extconf.rb"]
         | 
| 11 | 
            +
              s.extra_rdoc_files = ["README.rdoc", "lib/sfst.rb"]
         | 
| 12 | 
            +
              s.files = ["README.rdoc", "Rakefile", "Manifest", "test/test_sfst.rb", "test/test_sfst.fst", "CHANGELOG", "ext/sfst_machine/fst-compiler.h", "ext/sfst_machine/utf8.C", "ext/sfst_machine/operators.C", "ext/sfst_machine/utf8-scanner.ll", "ext/sfst_machine/determinise.C", "ext/sfst_machine/interface.C", "ext/sfst_machine/compact.h", "ext/sfst_machine/basic.h", "ext/sfst_machine/fst.h", "ext/sfst_machine/make-compact.h", "ext/sfst_machine/fst-compiler.yy", "ext/sfst_machine/mem.h", "ext/sfst_machine/compact.C", "ext/sfst_machine/basic.C", "ext/sfst_machine/interface.h", "ext/sfst_machine/sfst_machine.cc", "ext/sfst_machine/extconf.rb", "ext/sfst_machine/alphabet.C", "ext/sfst_machine/fst.C", "ext/sfst_machine/alphabet.h", "ext/sfst_machine/make-compact.C", "ext/sfst_machine/fst-compiler.C", "ext/sfst_machine/utf8.h", "ext/sfst_machine/utf8-scanner.C", "lib/sfst.rb", "ruby-sfst.gemspec"]
         | 
| 13 | 
            +
              s.has_rdoc = true
         | 
| 14 | 
            +
              s.homepage = %q{http://github.com/mlj/ruby-sfst}
         | 
| 15 | 
            +
              s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Ruby-sfst", "--main", "README.rdoc"]
         | 
| 16 | 
            +
              s.require_paths = ["lib", "ext"]
         | 
| 17 | 
            +
              s.rubyforge_project = %q{sfst}
         | 
| 18 | 
            +
              s.rubygems_version = %q{1.2.0}
         | 
| 19 | 
            +
              s.summary = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
         | 
| 20 | 
            +
              s.test_files = ["test/test_sfst.rb"]
         | 
| 21 | 
            +
             | 
| 22 | 
            +
              if s.respond_to? :specification_version then
         | 
| 23 | 
            +
                current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
         | 
| 24 | 
            +
                s.specification_version = 2
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                if current_version >= 3 then
         | 
| 27 | 
            +
                  s.add_development_dependency(%q<echoe>, [">= 0"])
         | 
| 28 | 
            +
                else
         | 
| 29 | 
            +
                  s.add_dependency(%q<echoe>, [">= 0"])
         | 
| 30 | 
            +
                end
         | 
| 31 | 
            +
              else
         | 
| 32 | 
            +
                s.add_dependency(%q<echoe>, [">= 0"])
         | 
| 33 | 
            +
              end
         | 
| 34 | 
            +
            end
         | 
    
        data/test/test_sfst.fst
    ADDED
    
    
    
        data/test/test_sfst.rb
    ADDED
    
    | @@ -0,0 +1,119 @@ | |
| 1 | 
            +
            require 'sfst'
         | 
| 2 | 
            +
            require 'test/unit'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Absolute path of the directory containing this test file; every
            # path below is built from it.
            TEST_DIRECTORY = File.expand_path('..', __FILE__)
            # SFST script used as input by the tests.
            TEST_SCRIPT_FILE = File.join(TEST_DIRECTORY, 'test_sfst.fst')
            # Output paths passed to SFST::compile by the tests.
            TEST_COMPILED_FILE = File.join(TEST_DIRECTORY, 'test_sfst.a')
            TEST_COMPILED_COMPACT_FILE = File.join(TEST_DIRECTORY, 'test_sfst_compact.a')
            TEST_COMPILED_REGULAR_FILE = File.join(TEST_DIRECTORY, 'test_sfst_regular.a')
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # Exercises SFST::compile with default, non-compact and compact options.
            class SFSTTestCase < Test::Unit::TestCase
              # Compiles once with default options and once with :compact disabled.
              def test_sfst_compile_regular
                assert_compiles_to(TEST_COMPILED_FILE)
                assert_compiles_to(TEST_COMPILED_FILE, :compact => false)
              end

              # Compiles with :compact enabled, writing to the compact output path.
              def test_sfst_compile_compact
                assert_compiles_to(TEST_COMPILED_COMPACT_FILE, :compact => true)
              end

              private

              # Deletes any existing +target+ so a stale file cannot satisfy the
              # check, compiles the test script into +target+ (forwarding +args+ to
              # SFST::compile), and asserts that the output file now exists.
              def assert_compiles_to(target, *args)
                File.delete(target) if File.exist?(target)
                SFST::compile(TEST_SCRIPT_FILE, target, *args)
                assert File.exist?(target), "expected #{target} to be written by SFST::compile"
              end
            end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            # Tests SFST::RegularTransducer against the compiled test script.
            class RegularTransducerTestCase < Test::Unit::TestCase
              # Recompiles the non-compact machine before each test.
              def setup
                SFST::compile(TEST_SCRIPT_FILE, TEST_COMPILED_REGULAR_FILE, :compact => false)
              end

              def test_analyze_acceptance
                machine = new_transducer
                assert_equal true, machine.accepted_analysis?('foo')
                assert_equal false, machine.accepted_analysis?('fox')
              end

              def test_analyze
                assert_equal ['bar', 'baz'], new_transducer.analyse('foo').sort
                assert_equal [], new_transducer.analyse('fox').sort
              end

              def test_analyze_symbol_sequence
                accepted = new_transducer.analyse('foo', :symbol_sequence => true)
                assert_equal [['b', 'a', 'r'], ['b', 'a', 'z']], accepted.sort

                rejected = new_transducer.analyse('fox', :symbol_sequence => true)
                assert_equal [], rejected.sort
              end

              def test_generate_acceptance
                machine = new_transducer
                assert_equal true, machine.accepted_generating?('bar')
                assert_equal true, machine.accepted_generating?('baz')
                assert_equal false, machine.accepted_generating?('bax')
              end

              def test_generate
                assert_equal ['foo'], new_transducer.generate('bar').sort
                assert_equal ['foo'], new_transducer.generate('baz').sort
                assert_equal [], new_transducer.generate('bax').sort
              end

              def test_generate_language_default
                assert_equal ['b:fa:or:o', 'b:fa:oz:o'], paired_language
              end

              def test_generate_language_both
                assert_equal ['b:fa:or:o', 'b:fa:oz:o'], paired_language(:levels => :both)
              end

              def test_generate_language_upper
                assert_equal ['foo'], joined_language(:upper)
              end

              def test_generate_language_lower
                assert_equal ['bar', 'baz'], joined_language(:lower)
              end

              private

              # Loads a new transducer from the compiled file on every call, so
              # no instance is shared between assertions.
              def new_transducer
                SFST::RegularTransducer.new(TEST_COMPILED_REGULAR_FILE)
              end

              # Runs generate_language with +args+ on a new transducer, renders
              # each yielded sequence of pairs as "a:b" items joined together,
              # and returns the renderings sorted.
              def paired_language(*args)
                rendered = []
                new_transducer.generate_language(*args) do |pairs|
                  rendered << pairs.map { |pair| pair.join(':') }.join
                end
                rendered.sort
              end

              # Runs generate_language for a single level on a new transducer,
              # joins each yielded sequence, and returns the results sorted.
              def joined_language(levels)
                rendered = []
                new_transducer.generate_language(:levels => levels) do |symbols|
                  rendered << symbols.join
                end
                rendered.sort
              end
            end
         | 
| 102 | 
            +
             | 
| 103 | 
            +
            # Tests SFST::CompactTransducer against the compiled test script.
            class CompactTransducerTestCase < Test::Unit::TestCase
              # Recompiles the compact machine before each test.
              def setup
                SFST::compile(TEST_SCRIPT_FILE, TEST_COMPILED_COMPACT_FILE, :compact => true)
              end

              def test_analyze_acceptance
                machine = SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
                assert_equal true, machine.accepted_analysis?('foo')
                assert_equal false, machine.accepted_analysis?('fox')
              end

              # One transducer instance serves both inputs, checked in this order.
              def test_analyze
                machine = SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
                [['foo', ['bar', 'baz']], ['fox', []]].each do |input, expected|
                  assert_equal expected, machine.analyse(input).sort
                end
              end
            end
         |