ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -12,8 +12,11 @@
12
12
  #ifndef _UTF8_H_
13
13
  #define _UTF8_H_
14
14
 
15
- unsigned int utf8toint( char *s );
16
- unsigned int utf8toint( char **s );
17
- char *int2utf8( unsigned int );
18
-
15
+ namespace SFST {
16
+
17
+ unsigned int utf8toint( char *s );
18
+ unsigned int utf8toint( char **s );
19
+ char *int2utf8( unsigned int );
20
+
21
+ }
19
22
  #endif
@@ -4,7 +4,8 @@
4
4
  #
5
5
  # Written by Marius L. Jøhndal, 2008.
6
6
  #
7
- require 'sfst_machine'
7
+ require 'sfst/version'
8
+ require 'sfst/sfst'
8
9
 
9
10
  module SFST
10
11
  # Compiles an SFST transducer +source+ and saves it as +machine+.
@@ -1,3 +1,3 @@
1
1
  module SFST
2
- VERSION = "0.4.3" unless defined?(SFST::VERSION)
2
+ VERSION = "0.4.4" unless defined?(SFST::VERSION)
3
3
  end
@@ -3,28 +3,28 @@ lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'sfst/version'
5
5
 
6
- Gem::Specification.new do |s|
7
- s.authors = ["Marius L. Jøhndal"]
8
- s.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
9
- s.summary = %q{Stuttgart Finite State Transducer Tools interface}
10
- s.email = ['mariuslj (at) ifi [dot] uio (dot) no']
11
- s.files = %w(CHANGELOG.md README.md Rakefile ruby-sfst.gemspec)
12
- s.files += Dir.glob("ext/**/*.C")
13
- s.files += Dir.glob("ext/**/*.h")
14
- s.files += Dir.glob("ext/**/*.rb")
15
- s.files += Dir.glob("ext/**/*.cc")
16
- s.files += Dir.glob("lib/**/*.rb")
17
- s.files += Dir.glob("test/*.fst")
18
- s.files += Dir.glob("test/*.rb")
19
- s.homepage = "http://github.com/mlj/ruby-sfst"
20
- s.licenses = ['GPL2']
21
- s.name = "ruby-sfst"
22
- s.require_paths = ["lib"]
23
- s.required_rubygems_version = '>= 1.3.5'
24
- s.extensions = ["ext/sfst_machine/extconf.rb"]
25
- s.test_files += Dir.glob("test/*.rb")
26
- s.version = SFST::VERSION
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "ruby-sfst"
8
+ spec.version = SFST::VERSION
9
+ spec.authors = ["Marius L. Jøhndal"]
10
+ spec.email = ["mariuslj@ifi.uio.no"]
11
+ spec.license = 'GPL2'
27
12
 
28
- s.add_development_dependency 'bundler', '~> 1.0'
29
- s.add_development_dependency 'test-unit', '~> 3.0'
13
+ spec.summary = %q{Stuttgart Finite State Transducer Tools interface}
14
+ spec.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
15
+ spec.homepage = "http://github.com/mlj/ruby-sfst"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^spec/}) } - %w(.gitignore .rspec .travis.yml)
18
+ spec.require_paths = ["lib"]
19
+ spec.extensions = ["ext/sfst/extconf.rb"]
20
+
21
+ spec.required_ruby_version = '>= 1.9'
22
+
23
+ spec.add_development_dependency 'bundler', '~> 1.16'
24
+ spec.add_development_dependency 'rake', '~> 12.3'
25
+ spec.add_development_dependency 'rake-compiler', '~> 1.0'
26
+ spec.add_development_dependency 'rspec', '~> 3.7'
27
+ spec.add_development_dependency 'pry', '~> 0.11'
28
+ spec.add_development_dependency 'simplecov', '~> 0.15'
29
+ spec.add_development_dependency 'yard', '~> 0.9'
30
30
  end
metadata CHANGED
@@ -1,17 +1,45 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-sfst
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-18 00:00:00.000000000 Z
11
+ date: 2017-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '12.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '12.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
15
43
  requirement: !ruby/object:Gem::Requirement
16
44
  requirements:
17
45
  - - "~>"
@@ -25,55 +53,101 @@ dependencies:
25
53
  - !ruby/object:Gem::Version
26
54
  version: '1.0'
27
55
  - !ruby/object:Gem::Dependency
28
- name: test-unit
56
+ name: rspec
29
57
  requirement: !ruby/object:Gem::Requirement
30
58
  requirements:
31
59
  - - "~>"
32
60
  - !ruby/object:Gem::Version
33
- version: '3.0'
61
+ version: '3.7'
34
62
  type: :development
35
63
  prerelease: false
36
64
  version_requirements: !ruby/object:Gem::Requirement
37
65
  requirements:
38
66
  - - "~>"
39
67
  - !ruby/object:Gem::Version
40
- version: '3.0'
68
+ version: '3.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.11'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.11'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.15'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.15'
97
+ - !ruby/object:Gem::Dependency
98
+ name: yard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '0.9'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '0.9'
41
111
  description: A wrapper for the Stuttgart Finite State Transducer Tools (SFST).
42
112
  email:
43
- - mariuslj (at) ifi [dot] uio (dot) no
113
+ - mariuslj@ifi.uio.no
44
114
  executables: []
45
115
  extensions:
46
- - ext/sfst_machine/extconf.rb
116
+ - ext/sfst/extconf.rb
47
117
  extra_rdoc_files: []
48
118
  files:
49
119
  - CHANGELOG.md
120
+ - COPYING
121
+ - Gemfile
122
+ - Gemfile.lock
50
123
  - README.md
51
124
  - Rakefile
52
- - ext/sfst_machine/alphabet.cc
53
- - ext/sfst_machine/alphabet.h
54
- - ext/sfst_machine/basic.cc
55
- - ext/sfst_machine/basic.h
56
- - ext/sfst_machine/compact.cc
57
- - ext/sfst_machine/compact.h
58
- - ext/sfst_machine/determinise.cc
59
- - ext/sfst_machine/extconf.rb
60
- - ext/sfst_machine/fst.cc
61
- - ext/sfst_machine/fst.h
62
- - ext/sfst_machine/interface.cc
63
- - ext/sfst_machine/interface.h
64
- - ext/sfst_machine/make-compact.cc
65
- - ext/sfst_machine/make-compact.h
66
- - ext/sfst_machine/mem.h
67
- - ext/sfst_machine/operators.cc
68
- - ext/sfst_machine/sfst_machine.cc
69
- - ext/sfst_machine/sgi.h
70
- - ext/sfst_machine/utf8.cc
71
- - ext/sfst_machine/utf8.h
125
+ - bin/console
126
+ - bin/setup
127
+ - ext/sfst/alphabet.cc
128
+ - ext/sfst/alphabet.h
129
+ - ext/sfst/basic.cc
130
+ - ext/sfst/basic.h
131
+ - ext/sfst/compact.cc
132
+ - ext/sfst/compact.h
133
+ - ext/sfst/determinise.cc
134
+ - ext/sfst/extconf.rb
135
+ - ext/sfst/fst.cc
136
+ - ext/sfst/fst.h
137
+ - ext/sfst/hopcroft.cc
138
+ - ext/sfst/interface.cc
139
+ - ext/sfst/interface.h
140
+ - ext/sfst/make-compact.cc
141
+ - ext/sfst/make-compact.h
142
+ - ext/sfst/mem.h
143
+ - ext/sfst/operators.cc
144
+ - ext/sfst/sfst_machine.cc
145
+ - ext/sfst/sgi.h
146
+ - ext/sfst/utf8.cc
147
+ - ext/sfst/utf8.h
72
148
  - lib/sfst.rb
73
149
  - lib/sfst/version.rb
74
150
  - ruby-sfst.gemspec
75
- - test/test_sfst.fst
76
- - test/test_sfst.rb
77
151
  homepage: http://github.com/mlj/ruby-sfst
78
152
  licenses:
79
153
  - GPL2
@@ -86,18 +160,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
86
160
  requirements:
87
161
  - - ">="
88
162
  - !ruby/object:Gem::Version
89
- version: '0'
163
+ version: '1.9'
90
164
  required_rubygems_version: !ruby/object:Gem::Requirement
91
165
  requirements:
92
166
  - - ">="
93
167
  - !ruby/object:Gem::Version
94
- version: 1.3.5
168
+ version: '0'
95
169
  requirements: []
96
170
  rubyforge_project:
97
- rubygems_version: 2.4.5
171
+ rubygems_version: 2.6.14
98
172
  signing_key:
99
173
  specification_version: 4
100
174
  summary: Stuttgart Finite State Transducer Tools interface
101
- test_files:
102
- - test/test_sfst.rb
103
- has_rdoc:
175
+ test_files: []
@@ -1,812 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE alphabet.C */
5
- /* MODULE alphabet */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /* PURPOSE basic FST functions */
10
- /* */
11
- /*******************************************************************/
12
-
13
- #include <climits>
14
- #include <cstring>
15
-
16
- #include "utf8.h"
17
- #include "alphabet.h"
18
-
19
- using std::vector;
20
- using std::ostream;
21
-
22
- const int BUFFER_SIZE=100000;
23
-
24
- char EpsilonString[]="<>";
25
-
26
-
27
- /*******************************************************************/
28
- /* */
29
- /* Alphabet::add */
30
- /* */
31
- /*******************************************************************/
32
-
33
- void Alphabet::add( const char *symbol, Character c )
34
-
35
- {
36
- char *s = fst_strdup(symbol);
37
- cm[c] = s;
38
- sm[s] = c;
39
- }
40
-
41
-
42
- /*******************************************************************/
43
- /* */
44
- /* Alphabet::Alphabet */
45
- /* */
46
- /*******************************************************************/
47
-
48
- Alphabet::Alphabet()
49
-
50
- {
51
- utf8 = false;
52
- add(EpsilonString, Label::epsilon);
53
- }
54
-
55
-
56
- /*******************************************************************/
57
- /* */
58
- /* Alphabet::clear */
59
- /* */
60
- /*******************************************************************/
61
-
62
- void Alphabet::clear()
63
-
64
- {
65
- char **s=new char*[cm.size()];
66
- ls.clear();
67
- sm.clear();
68
-
69
- size_t i, n=0;
70
- for( CharMap::iterator it=cm.begin(); it!=cm.end(); it++ )
71
- s[n++] = it->second;
72
- cm.clear();
73
-
74
- for( i=0; i<n; i++ )
75
- free(s[i]);
76
- delete[] s;
77
- }
78
-
79
-
80
- /*******************************************************************/
81
- /* */
82
- /* Alphabet::new_marker */
83
- /* */
84
- /*******************************************************************/
85
-
86
- Character Alphabet::new_marker()
87
-
88
- {
89
- // find some unused character code
90
- for(Character i=1; i!=0; i++)
91
- if (cm.find(i) == cm.end()) {
92
- // create a unique identifier string
93
- char symbol[100];
94
- sprintf(symbol,">%ld<",(long)i);
95
- add(symbol, i);
96
- return i;
97
- }
98
-
99
- throw "Error: too many symbols in transducer definition";
100
- }
101
-
102
-
103
- /*******************************************************************/
104
- /* */
105
- /* is_marker_symbol */
106
- /* */
107
- /*******************************************************************/
108
-
109
- static bool is_marker_symbol( const char *s )
110
-
111
- {
112
- // recognize strings matching the expression ">[0-9]+<"
113
- if (s != NULL && *s == '>') {
114
- do { s++; } while (*s >= '0' && *s <= '9');
115
- if (*s=='<' && *(s+1) == 0 && *(s-1) != '>')
116
- return true;
117
- }
118
- return false;
119
- }
120
-
121
-
122
- /*******************************************************************/
123
- /* */
124
- /* Alphabet::delete_markers */
125
- /* */
126
- /*******************************************************************/
127
-
128
- void Alphabet::delete_markers()
129
-
130
- {
131
- vector<char*> sym;
132
- vector<Character> code;
133
- vector<Label> label;
134
-
135
- for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
136
- Character c=it->first;
137
- char *s=it->second;
138
- if (!is_marker_symbol(s)) {
139
- sym.push_back(fst_strdup(s));
140
- code.push_back(c);
141
- }
142
- }
143
-
144
- for( LabelSet::const_iterator it=begin(); it!=end(); it++ ) {
145
- Label l=*it;
146
- if (!is_marker_symbol(code2symbol(l.upper_char())) &&
147
- !is_marker_symbol(code2symbol(l.lower_char())))
148
- label.push_back(l);
149
- }
150
-
151
- clear();
152
-
153
- for( size_t i=0; i<sym.size(); i++ ) {
154
- add_symbol(sym[i], code[i]);
155
- free(sym[i]);
156
- }
157
- for( size_t i=0; i<label.size(); i++ )
158
- insert( label[i] );
159
- }
160
-
161
-
162
- /*******************************************************************/
163
- /* */
164
- /* Alphabet::add_symbol */
165
- /* */
166
- /*******************************************************************/
167
-
168
- Character Alphabet::add_symbol(const char *symbol)
169
-
170
- {
171
- if (sm.find(symbol) != sm.end())
172
- return sm[symbol];
173
-
174
- // assign the symbol to some unused character
175
- for(Character i=1; i!=0; i++)
176
- if (cm.find(i) == cm.end()) {
177
- add(symbol, i);
178
- return i;
179
- }
180
-
181
- throw "Error: too many symbols in transducer definition";
182
- }
183
-
184
-
185
- /*******************************************************************/
186
- /* */
187
- /* Alphabet::add_symbol */
188
- /* */
189
- /*******************************************************************/
190
-
191
- void Alphabet::add_symbol( const char *symbol, Character c )
192
-
193
- {
194
- // check whether the symbol was previously defined
195
- int sc=symbol2code(symbol);
196
- if (sc != EOF) {
197
- if ((Character)sc == c)
198
- return;
199
-
200
- if (strlen(symbol) < 60) {
201
- static char message[100];
202
- sprintf(message, "Error: reinserting symbol '%s' in alphabet with incompatible character value %u %u", symbol, (unsigned)sc, (unsigned)c);
203
- throw message;
204
- }
205
- else
206
- throw "reinserting symbol in alphabet with incompatible character value";
207
- }
208
-
209
- // check whether the character is already in use
210
- const char *s=code2symbol(c);
211
- if (s == NULL)
212
- add(symbol, c);
213
- else {
214
- if (strcmp(s, symbol) != 0) {
215
- static char message[100];
216
- if (strlen(symbol) < 70)
217
- sprintf(message,"Error: defining symbol %s as character %d (previously defined as %s)", symbol, (unsigned)c, s);
218
- else
219
- sprintf(message,"Error: defining a (very long) symbol with previously used character");
220
- throw message;
221
- }
222
- }
223
- }
224
-
225
-
226
- /*******************************************************************/
227
- /* */
228
- /* Alphabet::write_char */
229
- /* */
230
- /*******************************************************************/
231
-
232
- void Alphabet::write_char( Character c, char *buffer, int *pos,
233
- bool with_brackets) const
234
- {
235
- const char *s = code2symbol(c);
236
-
237
- if (s) {
238
- int i = 0;
239
- int l=strlen(s)-1;
240
- if (!with_brackets && s[i] == '<' && s[l] == '>') { i++; l--; }
241
- while (i <= l)
242
- buffer[(*pos)++] = s[i++];
243
- }
244
- else {
245
- unsigned int uc = c;
246
- if (uc>=32 && uc<256)
247
- buffer[(*pos)++] = (char)c;
248
- else {
249
- sprintf(buffer+(*pos),"\\%u", uc);
250
- *pos += strlen(buffer+(*pos));
251
- }
252
- }
253
- buffer[*pos] = '\0';
254
- }
255
-
256
-
257
- /*******************************************************************/
258
- /* */
259
- /* Alphabet::write_char */
260
- /* */
261
- /*******************************************************************/
262
-
263
- const char *Alphabet::write_char( Character c, bool with_brackets ) const
264
-
265
- {
266
- static char buffer[1000];
267
- int n=0;
268
-
269
- write_char( c, buffer, &n, with_brackets );
270
- return buffer;
271
- }
272
-
273
-
274
- /*******************************************************************/
275
- /* */
276
- /* Alphabet::write_label */
277
- /* */
278
- /*******************************************************************/
279
-
280
- void Alphabet::write_label( Label l, char *buffer, int *pos,
281
- bool with_brackets ) const
282
- {
283
- Character lc=l.lower_char();
284
- Character uc=l.upper_char();
285
- write_char( lc, buffer, pos, with_brackets );
286
- if (lc != uc) {
287
- buffer[(*pos)++] = ':';
288
- write_char( uc, buffer, pos, with_brackets );
289
- }
290
- }
291
-
292
-
293
- /*******************************************************************/
294
- /* */
295
- /* Alphabet::write_label */
296
- /* */
297
- /*******************************************************************/
298
-
299
- const char *Alphabet::write_label( Label l, bool with_brackets ) const
300
-
301
- {
302
- static char buffer[1000];
303
- int n=0;
304
- write_label( l, buffer, &n, with_brackets );
305
- return buffer;
306
- }
307
-
308
-
309
- /*******************************************************************/
310
- /* */
311
- /* Alphabet::insert_symbols */
312
- /* */
313
- /*******************************************************************/
314
-
315
- void Alphabet::insert_symbols( const Alphabet &a )
316
-
317
- {
318
- for( CharMap::const_iterator it=a.cm.begin(); it!=a.cm.end(); it++ )
319
- add_symbol(it->second, it->first);
320
- }
321
-
322
-
323
- /*******************************************************************/
324
- /* */
325
- /* Alphabet::complement */
326
- /* */
327
- /*******************************************************************/
328
-
329
- void Alphabet::complement( vector<Character> &sym )
330
-
331
- {
332
- vector<Character> result;
333
- for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
334
- Character c = it->first;
335
- if (c != Label::epsilon) {
336
- size_t i;
337
- for( i=0; i<sym.size(); i++ )
338
- if (sym[i] == c)
339
- break;
340
- if (i == sym.size())
341
- result.push_back(c);
342
- }
343
- }
344
- sym.swap(result);
345
- }
346
-
347
-
348
- /*******************************************************************/
349
- /* */
350
- /* Alphabet::copy */
351
- /* */
352
- /*******************************************************************/
353
-
354
- void Alphabet::copy( const Alphabet &a )
355
-
356
- {
357
- insert_symbols( a );
358
- utf8 = a.utf8;
359
- for( LabelSet::const_iterator it=a.begin(); it!=a.end(); it++ )
360
- ls.insert( *it );
361
- }
362
-
363
-
364
- /*******************************************************************/
365
- /* */
366
- /* Alphabet::compose */
367
- /* */
368
- /*******************************************************************/
369
-
370
- void Alphabet::compose( const Alphabet &la, const Alphabet &ua )
371
-
372
- {
373
- // insert the symbols
374
- insert_symbols(la);
375
- insert_symbols(ua);
376
- utf8 = la.utf8;
377
-
378
- hash_map<Character, hash_set<Character> > cs;
379
-
380
- // create a hash table for a quick lookup of the target characters
381
- for( iterator it=ua.begin(); it!=ua.end(); it++ ) {
382
- Character lc=it->lower_char();
383
- if (lc == Label::epsilon)
384
- insert(*it);
385
- else
386
- cs[lc].insert(it->upper_char());
387
- }
388
-
389
- for( iterator it=la.begin(); it!=la.end(); it++ ) {
390
- Character uc=it->upper_char();
391
- if (uc == Label::epsilon)
392
- insert(*it);
393
- else {
394
- if (cs.find(uc) != cs.end()) {
395
- hash_set<Character> s=cs[uc];
396
- Character lc=it->lower_char();
397
- for( hash_set<Character>::iterator it=s.begin(); it!=s.end(); it++)
398
- insert(Label(lc, *it));
399
- }
400
- }
401
- }
402
- }
403
-
404
-
405
- /*******************************************************************/
406
- /* */
407
- /* operator<<(Alphabet) */
408
- /* */
409
- /*******************************************************************/
410
-
411
- ostream &operator<<( ostream &s, const Alphabet &a )
412
-
413
- {
414
- for( Alphabet::CharMap::const_iterator it=a.cm.begin(); it!=a.cm.end(); it++ )
415
- s << it->first << " -> " << it->second << "\n";
416
- for( Alphabet::iterator it=a.begin(); it!=a.end(); it++ )
417
- s << a.write_label(*it) << " ";
418
- s << "\n";
419
- return s;
420
- }
421
-
422
-
423
- /*******************************************************************/
424
- /* */
425
- /* Alphabet::next_mcsym */
426
- /* */
427
- /* recognizes multi-character symbols which are enclosed with */
428
- /* angle brackets <...>. If the argument flag insert is false, */
429
- /* the multi-character symbol must be already in the lexicon in */
430
- /* order to be recognized. */
431
- /* */
432
- /*******************************************************************/
433
-
434
- int Alphabet::next_mcsym( char* &string, bool insert )
435
-
436
- {
437
- char *start=string;
438
-
439
- if (*start == '<')
440
- // symbol might start here
441
- for( char *end=start+1; *end; end++ )
442
- if (*end == '>') {
443
- // matching pair of angle brackets found
444
- // mark the end of the substring with \0
445
- char lastc = *(++end);
446
- *end = 0;
447
-
448
- int c;
449
- if (insert)
450
- c = add_symbol( start );
451
- else
452
- c = symbol2code(start);
453
- // restore the original string
454
- *end = lastc;
455
-
456
- if (c != EOF) {
457
- // symbol found
458
- // return its code
459
- string = end;
460
- return (Character)c;
461
- }
462
- else
463
- // not a complex character
464
- break;
465
- }
466
- return EOF;
467
- }
468
-
469
-
470
- /*******************************************************************/
471
- /* */
472
- /* Alphabet::next_code */
473
- /* */
474
- /*******************************************************************/
475
-
476
- int Alphabet::next_code( char* &string, bool extended, bool insert )
477
-
478
- {
479
- if (*string == 0)
480
- return EOF; // finished
481
-
482
- int c = next_mcsym(string, insert);
483
- if (c != EOF)
484
- return c;
485
-
486
- if (extended && *string == '\\')
487
- string++; // remove quotation
488
-
489
- if (utf8) {
490
- unsigned int c = utf8toint( &string );
491
- return (int)add_symbol(int2utf8(c));
492
- }
493
- else {
494
- char buffer[2];
495
- buffer[0] = *string;
496
- buffer[1] = 0;
497
- string++;
498
- return (int)add_symbol(buffer);
499
- }
500
- }
501
-
502
-
503
- /*******************************************************************/
504
- /* */
505
- /* Alphabet::next_label */
506
- /* */
507
- /*******************************************************************/
508
-
509
- Label Alphabet::next_label( char* &string, bool extended )
510
-
511
- {
512
- // read first character
513
- int c = next_code( string, extended );
514
- if (c == EOF)
515
- return Label(); // end of string reached
516
-
517
- Character lc=(Character)c;
518
- if (!extended || *string != ':') { // single character?
519
- if (lc == Label::epsilon)
520
- return next_label(string, extended); // ignore epsilon
521
- return Label(lc);
522
- }
523
-
524
- // read second character
525
- string++; // jump over ':'
526
- c = next_code( string );
527
- if (c == EOF) {
528
- static char buffer[1000];
529
- sprintf(buffer,"Error: incomplete symbol in input file: %s", string);
530
- throw buffer;
531
- }
532
-
533
- Label l(lc, (Character)c);
534
- if (l.is_epsilon())
535
- return next_label(string, extended); // ignore epsilon transitions
536
- return l;
537
- }
538
-
539
-
540
- /*******************************************************************/
541
- /* */
542
- /* Alphabet::string2symseq */
543
- /* */
544
- /*******************************************************************/
545
-
546
- void Alphabet::string2symseq( char *s, vector<Character> &ch )
547
-
548
- {
549
- int c;
550
- while ((c = next_code(s, false)) != EOF)
551
- ch.push_back((Character)c);
552
- }
553
-
554
-
555
- /*******************************************************************/
556
- /* */
557
- /* Alphabet::string2labelseq */
558
- /* */
559
- /*******************************************************************/
560
-
561
- void Alphabet::string2labelseq( char *s, vector<Label> &labels )
562
-
563
- {
564
- Label l;
565
- while ((l = next_label(s)) != Label::epsilon)
566
- labels.push_back(l);
567
- }
568
-
569
-
570
- /*******************************************************************/
571
- /* */
572
- /* Alphabet::store */
573
- /* */
574
- /*******************************************************************/
575
-
576
- void Alphabet::store( FILE *file ) const
577
-
578
- {
579
- char c=(utf8)? 1: 0;
580
- fputc(c, file);
581
-
582
- // write the symbol mapping
583
- Character n=cm.size();
584
- fwrite(&n, sizeof(n), 1, file);
585
- for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
586
- Character c=it->first;
587
- char *s=it->second;
588
- fwrite(&c, sizeof(c), 1, file);
589
- fwrite(s, sizeof(char), strlen(s)+1, file);
590
- }
591
-
592
- // write the character pairs
593
- n = size();
594
- fwrite(&n, sizeof(n), 1, file);
595
- for( LabelSet::const_iterator p=ls.begin(); p!=ls.end(); p++ ) {
596
- Character c=p->lower_char();
597
- fwrite(&c, sizeof(c), 1, file);
598
- c = p->upper_char();
599
- fwrite(&c, sizeof(c), 1, file);
600
- }
601
-
602
- if (ferror(file))
603
- throw "Error encountered while writing alphabet to file\n";
604
- }
605
-
606
-
607
- /*******************************************************************/
608
- /* */
609
- /* Alphabet::read */
610
- /* */
611
- /*******************************************************************/
612
-
613
- void Alphabet::read( FILE *file )
614
-
615
- {
616
- utf8 = (fgetc(file) != 0);
617
-
618
- // read the symbol mapping
619
- Character n=0;
620
- read_num(&n, sizeof(n), file);
621
- for( unsigned i=0; i<n; i++) {
622
- char buffer[BUFFER_SIZE];
623
- Character c;
624
- read_num(&c, sizeof(c), file);
625
- if (!read_string(buffer, BUFFER_SIZE, file) ||
626
- feof(file) || ferror(file))
627
- throw "Error1 occurred while reading alphabet!\n";
628
- add_symbol(buffer, c);
629
- }
630
-
631
- // read the character pairs
632
- read_num(&n, sizeof(n), file);
633
- if (ferror(file))
634
- throw "Error2 occurred while reading alphabet!\n";
635
- for( unsigned i=0; i<n; i++) {
636
- Character lc, uc;
637
- read_num(&lc, sizeof(lc), file);
638
- read_num(&uc, sizeof(uc), file);
639
- insert(Label(lc, uc));
640
- }
641
- if (ferror(file))
642
- throw "Error3 occurred while reading alphabet!\n";
643
- }
644
-
645
-
646
- /*******************************************************************/
647
- /* */
648
- /* Alphabet::compute_score */
649
- /* */
650
- /*******************************************************************/
651
-
652
- int Alphabet::compute_score( Analysis &ana )
653
-
654
- {
655
- // check whether the morpheme boundaries are explicitly marked
656
- // with <X> tags
657
- int score=0;
658
- for( size_t i=0; i<ana.size(); i++ ) {
659
-
660
- // get next symbol
661
- const char *sym=write_char(ana[i].lower_char());
662
-
663
- if (strcmp(sym,"<X>") == 0)
664
- score--;
665
- }
666
- if (score < 0)
667
- return score;
668
-
669
- // No explicit morpheme boundary markers have been found.
670
- // Count the number of part-of-speech and PREF tags.
671
- for( size_t i=0; i<ana.size(); i++ ) {
672
-
673
- // get next symbol
674
- const char *sym=write_char(ana[i].lower_char());
675
-
676
- // Skip symbols that are not multi-character symbols
677
- if (sym[0] != '<' || sym[1] == 0)
678
- continue;
679
-
680
- // Is it a POS tag starting with "+" like <+NN>?
681
- if (sym[1] == '+') {
682
- const char *t=sym+2;
683
- for( ; *t >= 'A' && *t <= 'Z'; t++) ;
684
- if (t > sym+2 && *t == '>')
685
- return score;
686
- }
687
-
688
- // Is it a potential POS tag (i.e. all uppercase)?
689
- const char *t = sym+1;
690
- for( ; *t >= 'A' && *t <= 'Z'; t++) ;
691
- if (t == sym+1 || *t != '>')
692
- continue;
693
-
694
- // uppercase symbol found
695
- if (strcmp(sym,"<SUFF>") == 0 ||
696
- strcmp(sym,"<OLDORTH>") == 0 ||
697
- strcmp(sym,"<NEWORTH>") == 0)
698
- continue; // not what we are looking for
699
-
700
- // disprefer nouns with prefixes
701
- if (strcmp(sym,"<PREF>") == 0)
702
- score-=2;
703
-
704
- if (strcmp(sym,"<V>") == 0 || strcmp(sym,"<ADJ>") == 0) {
705
- bool is_verb=(strcmp(sym,"<V>")==0);
706
- // get the next non-empty symbol
707
- Character c=Label::epsilon;
708
- size_t k;
709
- for( k=i+1; k<ana.size(); k++ )
710
- if ((c = ana[k].lower_char()) != Label::epsilon)
711
- break;
712
- // Is it a participle
713
- if (c != Label::epsilon) {
714
- sym = write_char(c);
715
- if (strcmp(sym,"<OLDORTH>") == 0 || strcmp(sym,"<NEWORTH>") == 0) {
716
- for( k++; k<ana.size(); k++ )
717
- if ((c = ana[k].lower_char()) != Label::epsilon)
718
- break;
719
- if (c != Label::epsilon)
720
- sym = write_char(c);
721
- }
722
- if (is_verb &&
723
- (strcmp(sym,"<PPres>") == 0 || strcmp(sym,"<PPast>") == 0))
724
- continue; // don't consider participles as complex
725
- if (!is_verb &&
726
- (strcmp(sym,"<Sup>") == 0 || strcmp(sym,"<Comp>") == 0))
727
- continue; // don't consider participles as complex
728
- }
729
- }
730
- score--;
731
- }
732
- return score;
733
- }
734
-
735
-
736
-
737
- /*******************************************************************/
738
- /* */
739
- /* Alphabet::disambiguate */
740
- /* */
741
- /*******************************************************************/
742
-
743
- void Alphabet::disambiguate( vector<Analysis> &analyses )
744
-
745
- {
746
- // compute the scores
747
- int bestscore=INT_MIN;
748
- vector<int> score;
749
-
750
- for( size_t i=0; i<analyses.size(); i++ ) {
751
- score.push_back(compute_score(analyses[i]));
752
- if (bestscore < score[i])
753
- bestscore = score[i];
754
- }
755
-
756
- // delete suboptimal analyses
757
- size_t k=0;
758
- for( size_t i=0; i<analyses.size(); i++ )
759
- if (score[i] == bestscore)
760
- analyses[k++] = analyses[i];
761
- analyses.resize(k);
762
- }
763
-
764
-
765
-
766
- /*******************************************************************/
767
- /* */
768
- /* Alphabet::print_analysis */
769
- /* */
770
- /*******************************************************************/
771
-
772
- char *Alphabet::print_analysis( Analysis &ana, bool both_layers )
773
-
774
- {
775
- vector<char> ch;
776
-
777
- // for each transition
778
- for( size_t i=0; i<ana.size(); i++ ) {
779
-
780
- // get the transition label
781
- Label l=ana[i];
782
- const char *s;
783
-
784
- // either print the analysis symbol or the whole label
785
- if (both_layers) {
786
- s = write_label(l);
787
- // quote colons
788
- if (strcmp(s,":") == 0)
789
- ch.push_back('\\');
790
- }
791
- else if (l.lower_char() != Label::epsilon)
792
- s = write_char(l.lower_char());
793
- else
794
- continue;
795
-
796
- // copy the characters to the character array
797
- while (*s)
798
- ch.push_back(*(s++));
799
- }
800
- ch.push_back(0); // terminate the string
801
-
802
- static char *result=NULL;
803
- if (result != NULL)
804
- delete[] result;
805
- result = new char[ch.size()];
806
- for( size_t i=0; i<ch.size(); i++ )
807
- result[i] = ch[i];
808
-
809
- return result;
810
- }
811
-
812
-