ruby-sfst 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,281 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE alphabet.h */
4
+ /* MODULE alphabet */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE finite state tools */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _ALPHABET_H_
13
+ #define _ALPHABET_H_
14
+
15
#include <stdio.h>
#include <string.h>

#include "basic.h"

#include <set>
#include <vector>

#include <iostream>
23
+
24
+ #ifndef CODE_DATA_TYPE
25
+ typedef unsigned short Character; // data type of the symbol codes
26
+ #else
27
+ typedef unsigned CODE_DATA_TYPE Character;
28
+ #endif
29
+
30
+ // data type used to indicate whether some action is to be performed
31
+ // on the analysis level (lower) or the surface level (upper)
32
+ typedef enum {upper, lower} Level;
33
+
34
+ #ifdef SGIext
35
+
36
+ #include <ext/hash_set>
37
+ #include <ext/hash_map>
38
+
39
+ #else
40
+
41
+ #include <hash_set>
42
+ #include <hash_map>
43
+
44
+ #endif
45
+
46
+ extern char EpsilonString[]; // holds the symbol representing the empty string
47
+ // which is usually "<>"
48
+
49
+
50
+ /***************** class Label ***********************************/
51
+
52
+ class Label {
53
+
54
+ private:
55
+ // data structure where the two symbols are stored
56
+ struct {
57
+ Character lower;
58
+ Character upper;
59
+ } label;
60
+
61
+ public:
62
+ static const Character epsilon=0; // code of the empty symbol
63
+
64
+ // new label with two identical symbols
65
+ Label( Character c=epsilon ) { label.lower = label.upper = c; };
66
+
67
+ // new label with two different symbols
68
+ Label( Character c1, Character c2 )
69
+ { label.lower = c1; label.upper = c2; };
70
+
71
+ // returns the indicated symbol of the label
72
+ Character get_char( Level l ) const
73
+ { return ((l==upper)? label.upper: label.lower); };
74
+
75
+ // returns the "upper" symbol of the label (i.e. the surface symbol)
76
+ Character upper_char() const { return label.upper; };
77
+
78
+ // returns the "lower" symbol of the label (i.e. the analysis symbol)
79
+ Character lower_char() const { return label.lower; };
80
+
81
+ // replaces symbols in a label
82
+ Label replace_char( Character c, Character nc ) const {
83
+ Label l = *this;
84
+ if (l.label.lower == c)
85
+ l.label.lower = nc;
86
+ if (l.label.upper == c)
87
+ l.label.upper = nc;
88
+ return l;
89
+ };
90
+
91
+ // operators checking the equality of labels
92
+ int operator==( Label l ) const
93
+ { return (label.lower==l.label.lower && label.upper==l.label.upper); };
94
+ int operator!=( Label l ) const
95
+ { return !(l == *this); };
96
+
97
+ // comparison operator needed for sorting labels
98
+ int operator<( Label l ) const {
99
+ return (upper_char() < l.upper_char()); };
100
+
101
+ // check whether the label is epsilon (i.e. both symbols are epsilon)
102
+ // transitions with epsilon labels are epsilon transitions
103
+ int is_epsilon() const
104
+ { return (label.upper == epsilon && label.lower == epsilon); };
105
+
106
+ // check whether the "upper" symbol is epsilon
107
+ int upper_is_epsilon() const
108
+ { return (label.upper == epsilon); };
109
+
110
+ // check whether the "lower" symbol is epsilon
111
+ int lower_is_epsilon() const
112
+ { return (label.lower == epsilon); };
113
+
114
+ // hash function needed to store labels in a hash table
115
+ struct label_hash {
116
+ size_t operator() ( const Label l ) const {
117
+ return (size_t)l.lower_char() ^
118
+ ((size_t)l.upper_char() << 16) ^
119
+ ((size_t)l.upper_char() >> 16);
120
+ }
121
+ };
122
+
123
+ // hash function needed to store labels in a hash table
124
+ struct label_cmp {
125
+ bool operator() ( const Label l1, const Label l2 ) const {
126
+ return (l1.lower_char() < l2.lower_char() ||
127
+ (l1.lower_char() == l2.lower_char() &&
128
+ l1.upper_char() < l2.upper_char()));
129
+ }
130
+ };
131
+
132
+ // comparison operator needed to store labels in a hash table
133
+ struct label_eq {
134
+ bool operator() ( const Label l1, const Label l2 ) const {
135
+ return (l1.lower_char() == l2.lower_char() &&
136
+ l1.upper_char() == l2.upper_char());
137
+ }
138
+ };
139
+ };
140
+
141
+ typedef std::vector<Label> Analysis;
142
+
143
+
144
+ /***************** class Alphabet *******************************/
145
+
146
+ class Alphabet {
147
+
148
+ // string comparison operators needed to stored strings in a hash table
149
+ struct eqstr {
150
+ bool operator()(const char* s1, const char* s2) const {
151
+ return strcmp(s1, s2) == 0;
152
+ }
153
+ };
154
+
155
+ // data structure storing labels without repetitions (i.e. as a set)
156
+ typedef std::set<Label, Label::label_cmp> LabelSet;
157
+
158
+ // hash table used to map the symbols to their codes
159
+ typedef __gnu_cxx::hash_map<const char*, Character, __gnu_cxx::hash<const char*>,eqstr> SymbolMap;
160
+
161
+ // hash table used to map the codes back to the symbols
162
+ typedef __gnu_cxx::hash_map<Character, char*> CharMap;
163
+
164
+ private:
165
+ SymbolMap sm; // maps symbols to codes
166
+
167
+ CharMap cm; // maps codes to symbols
168
+ LabelSet ls; // set of labels known to the alphabet
169
+
170
+ // add a new symbol with symbol code c
171
+ void add( const char *symbol, Character c );
172
+
173
+ public:
174
+ bool utf8;
175
+
176
+ // iterators over the set of known labels
177
+ typedef LabelSet::iterator iterator;
178
+ typedef LabelSet::const_iterator const_iterator;
179
+ Alphabet();
180
+ ~Alphabet() { clear(); };
181
+ const_iterator begin() const { return ls.begin(); };
182
+ const_iterator end() const { return ls.end(); };
183
+ size_t size() const { return ls.size(); };
184
+
185
+ void clear();
186
+ void clear_char_pairs() { ls.clear(); };
187
+
188
+ // lookup a label in the alphabet
189
+ iterator find( Label l ) { return ls.find(l); };
190
+
191
+ // insert a label in the alphabet
192
+ void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
193
+
194
+ // insert the known symbols from another alphabet
195
+ void insert_symbols( const Alphabet& );
196
+
197
+ // insert the labels and known symbols from another alphabet
198
+ void copy( const Alphabet& );
199
+
200
+ // create the alphabet of a transducer obtained by a composition operation
201
+ void compose( const Alphabet &la, const Alphabet &ua );
202
+
203
+ // add a symbol to the alphabet and return its code
204
+ Character add_symbol(const char *symbol);
205
+
206
+ // add a symbol to the alphabet with a given code
207
+ void add_symbol(const char *symbol, Character c );
208
+
209
+ // create a new marker symbol and return its code
210
+ Character new_marker( void );
211
+ void delete_markers();
212
+
213
+ // compute the complement of a symbol set
214
+ void complement( std::vector<Character> &sym );
215
+
216
+ // return the code of the argument symbol
217
+ int symbol2code( const char *s ) const {
218
+ SymbolMap::const_iterator p = sm.find(s);
219
+ if (p != sm.end()) return p->second;
220
+ return EOF;
221
+ };
222
+
223
+ // return the symbol for the given symbol code
224
+ const char *code2symbol( Character c ) const {
225
+ CharMap::const_iterator p=cm.find(c);
226
+ if (p == cm.end())
227
+ return NULL;
228
+ else
229
+ return p->second;
230
+ };
231
+
232
+ // write the symbol for the given symbol code into a string
233
+ void write_char( Character c, char *buffer, int *pos,
234
+ bool with_brackets=true ) const;
235
+
236
+ // write the symbol pair of a given label into a string
237
+ void write_label( Label l, char *buffer, int *pos,
238
+ bool with_brackets=true ) const;
239
+
240
+ // write the symbol for the given symbol code into a buffer and return
241
+ // a pointer to it
242
+ // the flag "with_brackets" indicates whether the angle brackets
243
+ // surrounding multi-character symbols are to be printed or not
244
+ const char *write_char( Character c, bool with_brackets=true ) const;
245
+
246
+ // write the symbol pair of a given label into a string
247
+ // and return a pointer to it
248
+ const char *write_label( Label l, bool with_brackets=true ) const;
249
+
250
+ // scan the next multi-character symbol in the argument string
251
+ int next_mcsym( char*&, int extended=1 );
252
+
253
+ // scan the next symbol in the argument string
254
+ int next_code( char*&, int extended=1 );
255
+
256
+ // convert a character string into a symbol or label sequence
257
+ void string2symseq( char*, std::vector<Character>& );
258
+ void string2labelseq( char*, std::vector<Label>& );
259
+
260
+ // scan the next label in the argument string
261
+ Label next_label( char*&, int extended=1 );
262
+
263
+ // store the alphabet in the argument file (in binary form)
264
+ void store( FILE* ) const;
265
+
266
+ // read the alphabet from the argument file
267
+ void read( FILE* );
268
+
269
+ // disambiguation and printing of analyses
270
+ int compute_score( Analysis &ana );
271
+ void disambiguate( std::vector<Analysis> &analyses );
272
+ char *print_analysis( Analysis &ana, bool both_layers );
273
+
274
+ friend std::ostream &operator<<(std::ostream&, const Alphabet&);
275
+ };
276
+
277
+ // write the alphabet to the output stream (in readable form)
278
+ std::ostream &operator<<(std::ostream&, Alphabet&);
279
+
280
+
281
+ #endif
@@ -0,0 +1,84 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE basic.C */
5
+ /* MODULE basic */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /* PURPOSE */
10
+ /* */
11
+ /*******************************************************************/
12
+
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+
16
+ #include "basic.h"
17
+
18
// When set, read_num() reverses the byte order of every value it reads.
bool Switch_Bytes = false;
19
+
20
+
21
+
22
+ /*******************************************************************/
23
+ /* */
24
+ /* fst_strdup */
25
+ /* */
26
+ /*******************************************************************/
27
+
28
// Returns a malloc'ed copy of the NUL-terminated string pString.
// If the allocation fails, an error message is printed to stderr and the
// program exits with status 1, so the result is never NULL.
char* fst_strdup(const char* pString)
{
  const size_t nbytes = strlen(pString) + 1;  // includes the terminating NUL
  char* duplicate = (char*)malloc(nbytes);

  if (!duplicate) {
    fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
    exit(1);
  }

  memcpy(duplicate, pString, nbytes);
  return duplicate;
}
39
+
40
+
41
+ /*******************************************************************/
42
+ /* */
43
+ /* read_string */
44
+ /* */
45
+ /*******************************************************************/
46
+
47
// Reads a NUL-terminated string from file into buffer, which has room for
// size bytes. The stored string is always NUL-terminated when size > 0.
//
// Returns 1 if a terminating NUL byte was read from the file, and 0 if the
// end of the file was reached first, if the string did not fit into the
// buffer (it is then truncated to size-1 characters; the character read
// last is dropped), or if size is not positive (nothing is read or written).
int read_string( char *buffer, int size, FILE *file )

{
  // without this guard, the final "buffer[size-1] = 0" below would write
  // in front of the buffer when size is 0 or negative
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }

  // buffer is full and no terminator was seen: truncate
  buffer[size-1] = 0;
  return 0;
}
61
+
62
+
63
+ /*******************************************************************/
64
+ /* */
65
+ /* read_num */
66
+ /* */
67
+ /*******************************************************************/
68
+
69
+ size_t read_num( void *p, size_t n, FILE *file )
70
+
71
+ {
72
+ char *pp=(char*)p;
73
+ size_t result=fread( pp, 1, n, file );
74
+ if (Switch_Bytes) {
75
+ size_t e=n/2;
76
+ for( size_t i=0; i<e; i++ ) {
77
+ char tmp=pp[i];
78
+ pp[i] = pp[--n];
79
+ pp[n] = tmp;
80
+ }
81
+ }
82
+ return result;
83
+ }
84
+
@@ -0,0 +1,24 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE basic.h */
5
+ /* MODULE basic */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /* PURPOSE */
10
+ /* */
11
+ /*******************************************************************/
12
+
13
+ #ifndef _BASIC_H_
14
+ #define _BASIC_H_
15
+
16
+ #include <stdio.h>
17
+
18
+ extern bool Switch_Bytes;
19
+
20
+ char* fst_strdup(const char* pString);
21
+ int read_string( char *buffer, int size, FILE *file );
22
+ size_t read_num( void *p, size_t size, FILE *file );
23
+
24
+ #endif