ruby-sfst 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,281 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE alphabet.h */
4
+ /* MODULE alphabet */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE finite state tools */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _ALPHABET_H_
13
+ #define _ALPHABET_H_
14
+
15
+ #include <stdio.h>
16
+
17
+ #include "basic.h"
18
+
19
+ #include <set>
20
+ #include <vector>
21
+
22
+ #include <iostream>
23
+
24
// Integer type used for symbol codes. It is unsigned short by default;
// defining CODE_DATA_TYPE selects "unsigned CODE_DATA_TYPE" instead.
#ifdef CODE_DATA_TYPE
typedef unsigned CODE_DATA_TYPE Character;
#else
typedef unsigned short Character;
#endif
29
+
30
// Tells an operation which side of a label it should work on:
// the surface side (upper) or the analysis side (lower).
enum Level { upper, lower };
33
+
34
+ #ifdef SGIext
35
+
36
+ #include <ext/hash_set>
37
+ #include <ext/hash_map>
38
+
39
+ #else
40
+
41
+ #include <hash_set>
42
+ #include <hash_map>
43
+
44
+ #endif
45
+
46
+ extern char EpsilonString[]; // holds the symbol representing the empty string
47
+ // which is usually "<>"
48
+
49
+
50
+ /***************** class Label ***********************************/
51
+
52
+ class Label {
53
+
54
+ private:
55
+ // data structure where the two symbols are stored
56
+ struct {
57
+ Character lower;
58
+ Character upper;
59
+ } label;
60
+
61
+ public:
62
+ static const Character epsilon=0; // code of the empty symbol
63
+
64
+ // new label with two identical symbols
65
+ Label( Character c=epsilon ) { label.lower = label.upper = c; };
66
+
67
+ // new label with two different symbols
68
+ Label( Character c1, Character c2 )
69
+ { label.lower = c1; label.upper = c2; };
70
+
71
+ // returns the indicated symbol of the label
72
+ Character get_char( Level l ) const
73
+ { return ((l==upper)? label.upper: label.lower); };
74
+
75
+ // returns the "upper" symbol of the label (i.e. the surface symbol)
76
+ Character upper_char() const { return label.upper; };
77
+
78
+ // returns the "lower" symbol of the label (i.e. the analysis symbol)
79
+ Character lower_char() const { return label.lower; };
80
+
81
+ // replaces symbols in a label
82
+ Label replace_char( Character c, Character nc ) const {
83
+ Label l = *this;
84
+ if (l.label.lower == c)
85
+ l.label.lower = nc;
86
+ if (l.label.upper == c)
87
+ l.label.upper = nc;
88
+ return l;
89
+ };
90
+
91
+ // operators checking the equality of labels
92
+ int operator==( Label l ) const
93
+ { return (label.lower==l.label.lower && label.upper==l.label.upper); };
94
+ int operator!=( Label l ) const
95
+ { return !(l == *this); };
96
+
97
+ // comparison operator needed for sorting labels
98
+ int operator<( Label l ) const {
99
+ return (upper_char() < l.upper_char()); };
100
+
101
+ // check whether the label is epsilon (i.e. both symbols are epsilon)
102
+ // transitions with epsilon labels are epsilon transitions
103
+ int is_epsilon() const
104
+ { return (label.upper == epsilon && label.lower == epsilon); };
105
+
106
+ // check whether the "upper" symbol is epsilon
107
+ int upper_is_epsilon() const
108
+ { return (label.upper == epsilon); };
109
+
110
+ // check whether the "lower" symbol is epsilon
111
+ int lower_is_epsilon() const
112
+ { return (label.lower == epsilon); };
113
+
114
+ // hash function needed to store labels in a hash table
115
+ struct label_hash {
116
+ size_t operator() ( const Label l ) const {
117
+ return (size_t)l.lower_char() ^
118
+ ((size_t)l.upper_char() << 16) ^
119
+ ((size_t)l.upper_char() >> 16);
120
+ }
121
+ };
122
+
123
+ // hash function needed to store labels in a hash table
124
+ struct label_cmp {
125
+ bool operator() ( const Label l1, const Label l2 ) const {
126
+ return (l1.lower_char() < l2.lower_char() ||
127
+ (l1.lower_char() == l2.lower_char() &&
128
+ l1.upper_char() < l2.upper_char()));
129
+ }
130
+ };
131
+
132
+ // comparison operator needed to store labels in a hash table
133
+ struct label_eq {
134
+ bool operator() ( const Label l1, const Label l2 ) const {
135
+ return (l1.lower_char() == l2.lower_char() &&
136
+ l1.upper_char() == l2.upper_char());
137
+ }
138
+ };
139
+ };
140
+
141
+ typedef std::vector<Label> Analysis;
142
+
143
+
144
+ /***************** class Alphabet *******************************/
145
+
146
+ class Alphabet {
147
+
148
+ // string comparison operators needed to stored strings in a hash table
149
+ struct eqstr {
150
+ bool operator()(const char* s1, const char* s2) const {
151
+ return strcmp(s1, s2) == 0;
152
+ }
153
+ };
154
+
155
+ // data structure storing labels without repetitions (i.e. as a set)
156
+ typedef std::set<Label, Label::label_cmp> LabelSet;
157
+
158
+ // hash table used to map the symbols to their codes
159
+ typedef __gnu_cxx::hash_map<const char*, Character, __gnu_cxx::hash<const char*>,eqstr> SymbolMap;
160
+
161
+ // hash table used to map the codes back to the symbols
162
+ typedef __gnu_cxx::hash_map<Character, char*> CharMap;
163
+
164
+ private:
165
+ SymbolMap sm; // maps symbols to codes
166
+
167
+ CharMap cm; // maps codes to symbols
168
+ LabelSet ls; // set of labels known to the alphabet
169
+
170
+ // add a new symbol with symbol code c
171
+ void add( const char *symbol, Character c );
172
+
173
+ public:
174
+ bool utf8;
175
+
176
+ // iterators over the set of known labels
177
+ typedef LabelSet::iterator iterator;
178
+ typedef LabelSet::const_iterator const_iterator;
179
+ Alphabet();
180
+ ~Alphabet() { clear(); };
181
+ const_iterator begin() const { return ls.begin(); };
182
+ const_iterator end() const { return ls.end(); };
183
+ size_t size() const { return ls.size(); };
184
+
185
+ void clear();
186
+ void clear_char_pairs() { ls.clear(); };
187
+
188
+ // lookup a label in the alphabet
189
+ iterator find( Label l ) { return ls.find(l); };
190
+
191
+ // insert a label in the alphabet
192
+ void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
193
+
194
+ // insert the known symbols from another alphabet
195
+ void insert_symbols( const Alphabet& );
196
+
197
+ // insert the labels and known symbols from another alphabet
198
+ void copy( const Alphabet& );
199
+
200
+ // create the alphabet of a transducer obtained by a composition operation
201
+ void compose( const Alphabet &la, const Alphabet &ua );
202
+
203
+ // add a symbol to the alphabet and return its code
204
+ Character add_symbol(const char *symbol);
205
+
206
+ // add a symbol to the alphabet with a given code
207
+ void add_symbol(const char *symbol, Character c );
208
+
209
+ // create a new marker symbol and return its code
210
+ Character new_marker( void );
211
+ void delete_markers();
212
+
213
+ // compute the complement of a symbol set
214
+ void complement( std::vector<Character> &sym );
215
+
216
+ // return the code of the argument symbol
217
+ int symbol2code( const char *s ) const {
218
+ SymbolMap::const_iterator p = sm.find(s);
219
+ if (p != sm.end()) return p->second;
220
+ return EOF;
221
+ };
222
+
223
+ // return the symbol for the given symbol code
224
+ const char *code2symbol( Character c ) const {
225
+ CharMap::const_iterator p=cm.find(c);
226
+ if (p == cm.end())
227
+ return NULL;
228
+ else
229
+ return p->second;
230
+ };
231
+
232
+ // write the symbol for the given symbol code into a string
233
+ void write_char( Character c, char *buffer, int *pos,
234
+ bool with_brackets=true ) const;
235
+
236
+ // write the symbol pair of a given label into a string
237
+ void write_label( Label l, char *buffer, int *pos,
238
+ bool with_brackets=true ) const;
239
+
240
+ // write the symbol for the given symbol code into a buffer and return
241
+ // a pointer to it
242
+ // the flag "with_brackets" indicates whether the angle brackets
243
+ // surrounding multi-character symbols are to be printed or not
244
+ const char *write_char( Character c, bool with_brackets=true ) const;
245
+
246
+ // write the symbol pair of a given label into a string
247
+ // and return a pointer to it
248
+ const char *write_label( Label l, bool with_brackets=true ) const;
249
+
250
+ // scan the next multi-character symbol in the argument string
251
+ int next_mcsym( char*&, int extended=1 );
252
+
253
+ // scan the next symbol in the argument string
254
+ int next_code( char*&, int extended=1 );
255
+
256
+ // convert a character string into a symbol or label sequence
257
+ void string2symseq( char*, std::vector<Character>& );
258
+ void string2labelseq( char*, std::vector<Label>& );
259
+
260
+ // scan the next label in the argument string
261
+ Label next_label( char*&, int extended=1 );
262
+
263
+ // store the alphabet in the argument file (in binary form)
264
+ void store( FILE* ) const;
265
+
266
+ // read the alphabet from the argument file
267
+ void read( FILE* );
268
+
269
+ // disambiguation and printing of analyses
270
+ int compute_score( Analysis &ana );
271
+ void disambiguate( std::vector<Analysis> &analyses );
272
+ char *print_analysis( Analysis &ana, bool both_layers );
273
+
274
+ friend std::ostream &operator<<(std::ostream&, const Alphabet&);
275
+ };
276
+
277
+ // write the alphabet to the output stream (in readable form)
278
+ std::ostream &operator<<(std::ostream&, Alphabet&);
279
+
280
+
281
+ #endif
@@ -0,0 +1,84 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE basic.C */
5
+ /* MODULE basic */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /* PURPOSE */
10
+ /* */
11
+ /*******************************************************************/
12
+
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+
16
+ #include "basic.h"
17
+
18
// when set, read_num() reverses the byte order of the values it reads
bool Switch_Bytes = false;
19
+
20
+
21
+
22
+ /*******************************************************************/
23
+ /* */
24
+ /* fst_strdup */
25
+ /* */
26
+ /*******************************************************************/
27
+
28
// Returns a freshly malloc'ed copy of the NUL-terminated string pString.
// If the allocation fails, an error message is printed to stderr and the
// program is terminated with exit status 1, so the result is never NULL.
char* fst_strdup(const char* pString)

{
  const size_t nbytes = strlen(pString) + 1;  // text plus terminating NUL
  char* copy = (char*)malloc(nbytes);
  if (!copy) {
    fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
    exit(1);
  }
  memcpy(copy, pString, nbytes);
  return copy;
}
39
+
40
+
41
+ /*******************************************************************/
42
+ /* */
43
+ /* read_string */
44
+ /* */
45
+ /*******************************************************************/
46
+
47
// Reads a NUL-terminated string from file into buffer, which has room for
// size bytes.
//
// Returns 1 if a terminating NUL byte was read. Returns 0 if the end of the
// file was reached first, if size bytes were read without finding a NUL (the
// stored string is then cut off after size-1 characters), or if size is not
// positive. Except in the last case the buffer is always NUL-terminated.
int read_string( char *buffer, int size, FILE *file )

{
  // without room for at least the terminator nothing can be stored;
  // writing buffer[size-1] below would land in front of the buffer
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }

  // buffer is full and no terminator was seen: truncate and report failure
  buffer[size-1] = 0;
  return 0;
}
61
+
62
+
63
+ /*******************************************************************/
64
+ /* */
65
+ /* read_num */
66
+ /* */
67
+ /*******************************************************************/
68
+
69
+ size_t read_num( void *p, size_t n, FILE *file )
70
+
71
+ {
72
+ char *pp=(char*)p;
73
+ size_t result=fread( pp, 1, n, file );
74
+ if (Switch_Bytes) {
75
+ size_t e=n/2;
76
+ for( size_t i=0; i<e; i++ ) {
77
+ char tmp=pp[i];
78
+ pp[i] = pp[--n];
79
+ pp[n] = tmp;
80
+ }
81
+ }
82
+ return result;
83
+ }
84
+
@@ -0,0 +1,24 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE basic.h */
5
+ /* MODULE basic */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /* PURPOSE */
10
+ /* */
11
+ /*******************************************************************/
12
+
13
+ #ifndef _BASIC_H_
14
+ #define _BASIC_H_
15
+
16
+ #include <stdio.h>
17
+
18
// when set, read_num() reverses the byte order of the values it reads
extern bool Switch_Bytes;

// returns a malloc'ed copy of pString; the program is terminated
// if the memory cannot be allocated
char *fst_strdup( const char *pString );

// reads at most size bytes of a NUL-terminated string from file into buffer;
// returns 1 if the terminating NUL byte was read and 0 otherwise
int read_string( char *buffer, int size, FILE *file );

// reads size bytes from file into p (in reversed byte order if Switch_Bytes
// is set) and returns the number of bytes actually read
size_t read_num( void *p, size_t size, FILE *file );
23
+
24
+ #endif