ruby-sfst 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -12,8 +12,11 @@
12
12
  #ifndef _UTF8_H_
13
13
  #define _UTF8_H_
14
14
 
15
- unsigned int utf8toint( char *s );
16
- unsigned int utf8toint( char **s );
17
- char *int2utf8( unsigned int );
18
-
15
+ namespace SFST {
16
+
17
+ unsigned int utf8toint( char *s );
18
+ unsigned int utf8toint( char **s );
19
+ char *int2utf8( unsigned int );
20
+
21
+ }
19
22
  #endif
@@ -4,7 +4,8 @@
4
4
  #
5
5
  # Written by Marius L. Jøhndal, 2008.
6
6
  #
7
- require 'sfst_machine'
7
+ require 'sfst/version'
8
+ require 'sfst/sfst'
8
9
 
9
10
  module SFST
10
11
  # Compiles an SFST transducer +source+ and saves it as +machine+.
@@ -1,3 +1,3 @@
1
1
  module SFST
2
- VERSION = "0.4.3" unless defined?(SFST::VERSION)
2
+ VERSION = "0.4.4" unless defined?(SFST::VERSION)
3
3
  end
@@ -3,28 +3,28 @@ lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'sfst/version'
5
5
 
6
- Gem::Specification.new do |s|
7
- s.authors = ["Marius L. Jøhndal"]
8
- s.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
9
- s.summary = %q{Stuttgart Finite State Transducer Tools interface}
10
- s.email = ['mariuslj (at) ifi [dot] uio (dot) no']
11
- s.files = %w(CHANGELOG.md README.md Rakefile ruby-sfst.gemspec)
12
- s.files += Dir.glob("ext/**/*.C")
13
- s.files += Dir.glob("ext/**/*.h")
14
- s.files += Dir.glob("ext/**/*.rb")
15
- s.files += Dir.glob("ext/**/*.cc")
16
- s.files += Dir.glob("lib/**/*.rb")
17
- s.files += Dir.glob("test/*.fst")
18
- s.files += Dir.glob("test/*.rb")
19
- s.homepage = "http://github.com/mlj/ruby-sfst"
20
- s.licenses = ['GPL2']
21
- s.name = "ruby-sfst"
22
- s.require_paths = ["lib"]
23
- s.required_rubygems_version = '>= 1.3.5'
24
- s.extensions = ["ext/sfst_machine/extconf.rb"]
25
- s.test_files += Dir.glob("test/*.rb")
26
- s.version = SFST::VERSION
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "ruby-sfst"
8
+ spec.version = SFST::VERSION
9
+ spec.authors = ["Marius L. Jøhndal"]
10
+ spec.email = ["mariuslj@ifi.uio.no"]
11
+ spec.license = 'GPL2'
27
12
 
28
- s.add_development_dependency 'bundler', '~> 1.0'
29
- s.add_development_dependency 'test-unit', '~> 3.0'
13
+ spec.summary = %q{Stuttgart Finite State Transducer Tools interface}
14
+ spec.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
15
+ spec.homepage = "http://github.com/mlj/ruby-sfst"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^spec/}) } - %w(.gitignore .rspec .travis.yml)
18
+ spec.require_paths = ["lib"]
19
+ spec.extensions = ["ext/sfst/extconf.rb"]
20
+
21
+ spec.required_ruby_version = '>= 1.9'
22
+
23
+ spec.add_development_dependency 'bundler', '~> 1.16'
24
+ spec.add_development_dependency 'rake', '~> 12.3'
25
+ spec.add_development_dependency 'rake-compiler', '~> 1.0'
26
+ spec.add_development_dependency 'rspec', '~> 3.7'
27
+ spec.add_development_dependency 'pry', '~> 0.11'
28
+ spec.add_development_dependency 'simplecov', '~> 0.15'
29
+ spec.add_development_dependency 'yard', '~> 0.9'
30
30
  end
metadata CHANGED
@@ -1,17 +1,45 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-sfst
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-18 00:00:00.000000000 Z
11
+ date: 2017-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '12.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '12.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
15
43
  requirement: !ruby/object:Gem::Requirement
16
44
  requirements:
17
45
  - - "~>"
@@ -25,55 +53,101 @@ dependencies:
25
53
  - !ruby/object:Gem::Version
26
54
  version: '1.0'
27
55
  - !ruby/object:Gem::Dependency
28
- name: test-unit
56
+ name: rspec
29
57
  requirement: !ruby/object:Gem::Requirement
30
58
  requirements:
31
59
  - - "~>"
32
60
  - !ruby/object:Gem::Version
33
- version: '3.0'
61
+ version: '3.7'
34
62
  type: :development
35
63
  prerelease: false
36
64
  version_requirements: !ruby/object:Gem::Requirement
37
65
  requirements:
38
66
  - - "~>"
39
67
  - !ruby/object:Gem::Version
40
- version: '3.0'
68
+ version: '3.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.11'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.11'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.15'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.15'
97
+ - !ruby/object:Gem::Dependency
98
+ name: yard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '0.9'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '0.9'
41
111
  description: A wrapper for the Stuttgart Finite State Transducer Tools (SFST).
42
112
  email:
43
- - mariuslj (at) ifi [dot] uio (dot) no
113
+ - mariuslj@ifi.uio.no
44
114
  executables: []
45
115
  extensions:
46
- - ext/sfst_machine/extconf.rb
116
+ - ext/sfst/extconf.rb
47
117
  extra_rdoc_files: []
48
118
  files:
49
119
  - CHANGELOG.md
120
+ - COPYING
121
+ - Gemfile
122
+ - Gemfile.lock
50
123
  - README.md
51
124
  - Rakefile
52
- - ext/sfst_machine/alphabet.cc
53
- - ext/sfst_machine/alphabet.h
54
- - ext/sfst_machine/basic.cc
55
- - ext/sfst_machine/basic.h
56
- - ext/sfst_machine/compact.cc
57
- - ext/sfst_machine/compact.h
58
- - ext/sfst_machine/determinise.cc
59
- - ext/sfst_machine/extconf.rb
60
- - ext/sfst_machine/fst.cc
61
- - ext/sfst_machine/fst.h
62
- - ext/sfst_machine/interface.cc
63
- - ext/sfst_machine/interface.h
64
- - ext/sfst_machine/make-compact.cc
65
- - ext/sfst_machine/make-compact.h
66
- - ext/sfst_machine/mem.h
67
- - ext/sfst_machine/operators.cc
68
- - ext/sfst_machine/sfst_machine.cc
69
- - ext/sfst_machine/sgi.h
70
- - ext/sfst_machine/utf8.cc
71
- - ext/sfst_machine/utf8.h
125
+ - bin/console
126
+ - bin/setup
127
+ - ext/sfst/alphabet.cc
128
+ - ext/sfst/alphabet.h
129
+ - ext/sfst/basic.cc
130
+ - ext/sfst/basic.h
131
+ - ext/sfst/compact.cc
132
+ - ext/sfst/compact.h
133
+ - ext/sfst/determinise.cc
134
+ - ext/sfst/extconf.rb
135
+ - ext/sfst/fst.cc
136
+ - ext/sfst/fst.h
137
+ - ext/sfst/hopcroft.cc
138
+ - ext/sfst/interface.cc
139
+ - ext/sfst/interface.h
140
+ - ext/sfst/make-compact.cc
141
+ - ext/sfst/make-compact.h
142
+ - ext/sfst/mem.h
143
+ - ext/sfst/operators.cc
144
+ - ext/sfst/sfst_machine.cc
145
+ - ext/sfst/sgi.h
146
+ - ext/sfst/utf8.cc
147
+ - ext/sfst/utf8.h
72
148
  - lib/sfst.rb
73
149
  - lib/sfst/version.rb
74
150
  - ruby-sfst.gemspec
75
- - test/test_sfst.fst
76
- - test/test_sfst.rb
77
151
  homepage: http://github.com/mlj/ruby-sfst
78
152
  licenses:
79
153
  - GPL2
@@ -86,18 +160,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
86
160
  requirements:
87
161
  - - ">="
88
162
  - !ruby/object:Gem::Version
89
- version: '0'
163
+ version: '1.9'
90
164
  required_rubygems_version: !ruby/object:Gem::Requirement
91
165
  requirements:
92
166
  - - ">="
93
167
  - !ruby/object:Gem::Version
94
- version: 1.3.5
168
+ version: '0'
95
169
  requirements: []
96
170
  rubyforge_project:
97
- rubygems_version: 2.4.5
171
+ rubygems_version: 2.6.14
98
172
  signing_key:
99
173
  specification_version: 4
100
174
  summary: Stuttgart Finite State Transducer Tools interface
101
- test_files:
102
- - test/test_sfst.rb
103
- has_rdoc:
175
+ test_files: []
@@ -1,812 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE alphabet.C */
5
- /* MODULE alphabet */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /* PURPOSE basic FST functions */
10
- /* */
11
- /*******************************************************************/
12
-
13
- #include <climits>
14
- #include <cstring>
15
-
16
- #include "utf8.h"
17
- #include "alphabet.h"
18
-
19
- using std::vector;
20
- using std::ostream;
21
-
22
- const int BUFFER_SIZE=100000;
23
-
24
- char EpsilonString[]="<>";
25
-
26
-
27
- /*******************************************************************/
28
- /* */
29
- /* Alphabet::add */
30
- /* */
31
- /*******************************************************************/
32
-
33
- void Alphabet::add( const char *symbol, Character c )
34
-
35
- {
36
- char *s = fst_strdup(symbol);
37
- cm[c] = s;
38
- sm[s] = c;
39
- }
40
-
41
-
42
- /*******************************************************************/
43
- /* */
44
- /* Alphabet::Alphabet */
45
- /* */
46
- /*******************************************************************/
47
-
48
- Alphabet::Alphabet()
49
-
50
- {
51
- utf8 = false;
52
- add(EpsilonString, Label::epsilon);
53
- }
54
-
55
-
56
- /*******************************************************************/
57
- /* */
58
- /* Alphabet::clear */
59
- /* */
60
- /*******************************************************************/
61
-
62
- void Alphabet::clear()
63
-
64
- {
65
- char **s=new char*[cm.size()];
66
- ls.clear();
67
- sm.clear();
68
-
69
- size_t i, n=0;
70
- for( CharMap::iterator it=cm.begin(); it!=cm.end(); it++ )
71
- s[n++] = it->second;
72
- cm.clear();
73
-
74
- for( i=0; i<n; i++ )
75
- free(s[i]);
76
- delete[] s;
77
- }
78
-
79
-
80
- /*******************************************************************/
81
- /* */
82
- /* Alphabet::new_marker */
83
- /* */
84
- /*******************************************************************/
85
-
86
- Character Alphabet::new_marker()
87
-
88
- {
89
- // find some unused character code
90
- for(Character i=1; i!=0; i++)
91
- if (cm.find(i) == cm.end()) {
92
- // create a unique identifier string
93
- char symbol[100];
94
- sprintf(symbol,">%ld<",(long)i);
95
- add(symbol, i);
96
- return i;
97
- }
98
-
99
- throw "Error: too many symbols in transducer definition";
100
- }
101
-
102
-
103
- /*******************************************************************/
104
- /* */
105
- /* is_marker_symbol */
106
- /* */
107
- /*******************************************************************/
108
-
109
- static bool is_marker_symbol( const char *s )
110
-
111
- {
112
- // recogize strings matching the expression ">[0-9]+<"
113
- if (s != NULL && *s == '>') {
114
- do { s++; } while (*s >= '0' && *s <= '9');
115
- if (*s=='<' && *(s+1) == 0 && *(s-1) != '>')
116
- return true;
117
- }
118
- return false;
119
- }
120
-
121
-
122
- /*******************************************************************/
123
- /* */
124
- /* Alphabet::delete_markers */
125
- /* */
126
- /*******************************************************************/
127
-
128
- void Alphabet::delete_markers()
129
-
130
- {
131
- vector<char*> sym;
132
- vector<Character> code;
133
- vector<Label> label;
134
-
135
- for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
136
- Character c=it->first;
137
- char *s=it->second;
138
- if (!is_marker_symbol(s)) {
139
- sym.push_back(fst_strdup(s));
140
- code.push_back(c);
141
- }
142
- }
143
-
144
- for( LabelSet::const_iterator it=begin(); it!=end(); it++ ) {
145
- Label l=*it;
146
- if (!is_marker_symbol(code2symbol(l.upper_char())) &&
147
- !is_marker_symbol(code2symbol(l.lower_char())))
148
- label.push_back(l);
149
- }
150
-
151
- clear();
152
-
153
- for( size_t i=0; i<sym.size(); i++ ) {
154
- add_symbol(sym[i], code[i]);
155
- free(sym[i]);
156
- }
157
- for( size_t i=0; i<label.size(); i++ )
158
- insert( label[i] );
159
- }
160
-
161
-
162
- /*******************************************************************/
163
- /* */
164
- /* Alphabet::add_symbol */
165
- /* */
166
- /*******************************************************************/
167
-
168
- Character Alphabet::add_symbol(const char *symbol)
169
-
170
- {
171
- if (sm.find(symbol) != sm.end())
172
- return sm[symbol];
173
-
174
- // assign the symbol to some unused character
175
- for(Character i=1; i!=0; i++)
176
- if (cm.find(i) == cm.end()) {
177
- add(symbol, i);
178
- return i;
179
- }
180
-
181
- throw "Error: too many symbols in transducer definition";
182
- }
183
-
184
-
185
- /*******************************************************************/
186
- /* */
187
- /* Alphabet::add_symbol */
188
- /* */
189
- /*******************************************************************/
190
-
191
- void Alphabet::add_symbol( const char *symbol, Character c )
192
-
193
- {
194
- // check whether the symbol was previously defined
195
- int sc=symbol2code(symbol);
196
- if (sc != EOF) {
197
- if ((Character)sc == c)
198
- return;
199
-
200
- if (strlen(symbol) < 60) {
201
- static char message[100];
202
- sprintf(message, "Error: reinserting symbol '%s' in alphabet with incompatible character value %u %u", symbol, (unsigned)sc, (unsigned)c);
203
- throw message;
204
- }
205
- else
206
- throw "reinserting symbol in alphabet with incompatible character value";
207
- }
208
-
209
- // check whether the character is already in use
210
- const char *s=code2symbol(c);
211
- if (s == NULL)
212
- add(symbol, c);
213
- else {
214
- if (strcmp(s, symbol) != 0) {
215
- static char message[100];
216
- if (strlen(symbol) < 70)
217
- sprintf(message,"Error: defining symbol %s as character %d (previously defined as %s)", symbol, (unsigned)c, s);
218
- else
219
- sprintf(message,"Error: defining a (very long) symbol with previously used character");
220
- throw message;
221
- }
222
- }
223
- }
224
-
225
-
226
- /*******************************************************************/
227
- /* */
228
- /* Alphabet::write_char */
229
- /* */
230
- /*******************************************************************/
231
-
232
- void Alphabet::write_char( Character c, char *buffer, int *pos,
233
- bool with_brackets) const
234
- {
235
- const char *s = code2symbol(c);
236
-
237
- if (s) {
238
- int i = 0;
239
- int l=strlen(s)-1;
240
- if (!with_brackets && s[i] == '<' && s[l] == '>') { i++; l--; }
241
- while (i <= l)
242
- buffer[(*pos)++] = s[i++];
243
- }
244
- else {
245
- unsigned int uc = c;
246
- if (uc>=32 && uc<256)
247
- buffer[(*pos)++] = (char)c;
248
- else {
249
- sprintf(buffer+(*pos),"\\%u", uc);
250
- *pos += strlen(buffer+(*pos));
251
- }
252
- }
253
- buffer[*pos] = '\0';
254
- }
255
-
256
-
257
- /*******************************************************************/
258
- /* */
259
- /* Alphabet::write_char */
260
- /* */
261
- /*******************************************************************/
262
-
263
- const char *Alphabet::write_char( Character c, bool with_brackets ) const
264
-
265
- {
266
- static char buffer[1000];
267
- int n=0;
268
-
269
- write_char( c, buffer, &n, with_brackets );
270
- return buffer;
271
- }
272
-
273
-
274
- /*******************************************************************/
275
- /* */
276
- /* Alphabet::write_label */
277
- /* */
278
- /*******************************************************************/
279
-
280
- void Alphabet::write_label( Label l, char *buffer, int *pos,
281
- bool with_brackets ) const
282
- {
283
- Character lc=l.lower_char();
284
- Character uc=l.upper_char();
285
- write_char( lc, buffer, pos, with_brackets );
286
- if (lc != uc) {
287
- buffer[(*pos)++] = ':';
288
- write_char( uc, buffer, pos, with_brackets );
289
- }
290
- }
291
-
292
-
293
- /*******************************************************************/
294
- /* */
295
- /* Alphabet::write_label */
296
- /* */
297
- /*******************************************************************/
298
-
299
- const char *Alphabet::write_label( Label l, bool with_brackets ) const
300
-
301
- {
302
- static char buffer[1000];
303
- int n=0;
304
- write_label( l, buffer, &n, with_brackets );
305
- return buffer;
306
- }
307
-
308
-
309
- /*******************************************************************/
310
- /* */
311
- /* Alphabet::insert_symbols */
312
- /* */
313
- /*******************************************************************/
314
-
315
- void Alphabet::insert_symbols( const Alphabet &a )
316
-
317
- {
318
- for( CharMap::const_iterator it=a.cm.begin(); it!=a.cm.end(); it++ )
319
- add_symbol(it->second, it->first);
320
- }
321
-
322
-
323
- /*******************************************************************/
324
- /* */
325
- /* Alphabet::complement */
326
- /* */
327
- /*******************************************************************/
328
-
329
- void Alphabet::complement( vector<Character> &sym )
330
-
331
- {
332
- vector<Character> result;
333
- for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
334
- Character c = it->first;
335
- if (c != Label::epsilon) {
336
- size_t i;
337
- for( i=0; i<sym.size(); i++ )
338
- if (sym[i] == c)
339
- break;
340
- if (i == sym.size())
341
- result.push_back(c);
342
- }
343
- }
344
- sym.swap(result);
345
- }
346
-
347
-
348
- /*******************************************************************/
349
- /* */
350
- /* Alphabet::copy */
351
- /* */
352
- /*******************************************************************/
353
-
354
- void Alphabet::copy( const Alphabet &a )
355
-
356
- {
357
- insert_symbols( a );
358
- utf8 = a.utf8;
359
- for( LabelSet::const_iterator it=a.begin(); it!=a.end(); it++ )
360
- ls.insert( *it );
361
- }
362
-
363
-
364
- /*******************************************************************/
365
- /* */
366
- /* Alphabet::compose */
367
- /* */
368
- /*******************************************************************/
369
-
370
- void Alphabet::compose( const Alphabet &la, const Alphabet &ua )
371
-
372
- {
373
- // insert the symbols
374
- insert_symbols(la);
375
- insert_symbols(ua);
376
- utf8 = la.utf8;
377
-
378
- hash_map<Character, hash_set<Character> > cs;
379
-
380
- // create a hash table for a quick lookup of the target characters
381
- for( iterator it=ua.begin(); it!=ua.end(); it++ ) {
382
- Character lc=it->lower_char();
383
- if (lc == Label::epsilon)
384
- insert(*it);
385
- else
386
- cs[lc].insert(it->upper_char());
387
- }
388
-
389
- for( iterator it=la.begin(); it!=la.end(); it++ ) {
390
- Character uc=it->upper_char();
391
- if (uc == Label::epsilon)
392
- insert(*it);
393
- else {
394
- if (cs.find(uc) != cs.end()) {
395
- hash_set<Character> s=cs[uc];
396
- Character lc=it->lower_char();
397
- for( hash_set<Character>::iterator it=s.begin(); it!=s.end(); it++)
398
- insert(Label(lc, *it));
399
- }
400
- }
401
- }
402
- }
403
-
404
-
405
- /*******************************************************************/
406
- /* */
407
- /* operator<<(Alphabet) */
408
- /* */
409
- /*******************************************************************/
410
-
411
- ostream &operator<<( ostream &s, const Alphabet &a )
412
-
413
- {
414
- for( Alphabet::CharMap::const_iterator it=a.cm.begin(); it!=a.cm.end(); it++ )
415
- s << it->first << " -> " << it->second << "\n";
416
- for( Alphabet::iterator it=a.begin(); it!=a.end(); it++ )
417
- s << a.write_label(*it) << " ";
418
- s << "\n";
419
- return s;
420
- }
421
-
422
-
423
- /*******************************************************************/
424
- /* */
425
- /* Alphabet::next_mcsym */
426
- /* */
427
- /* recognizes multi-character symbols which are enclosed with */
428
- /* angle brackets <...>. If the argument flag insert is true, */
429
- /* the multi-character symbol must be already in the lexicon in */
430
- /* order to be recognized. */
431
- /* */
432
- /*******************************************************************/
433
-
434
- int Alphabet::next_mcsym( char* &string, bool insert )
435
-
436
- {
437
- char *start=string;
438
-
439
- if (*start == '<')
440
- // symbol might start here
441
- for( char *end=start+1; *end; end++ )
442
- if (*end == '>') {
443
- // matching pair of angle brackets found
444
- // mark the end of the substring with \0
445
- char lastc = *(++end);
446
- *end = 0;
447
-
448
- int c;
449
- if (insert)
450
- c = add_symbol( start );
451
- else
452
- c = symbol2code(start);
453
- // restore the original string
454
- *end = lastc;
455
-
456
- if (c != EOF) {
457
- // symbol found
458
- // return its code
459
- string = end;
460
- return (Character)c;
461
- }
462
- else
463
- // not a complex character
464
- break;
465
- }
466
- return EOF;
467
- }
468
-
469
-
470
- /*******************************************************************/
471
- /* */
472
- /* Alphabet::next_code */
473
- /* */
474
- /*******************************************************************/
475
-
476
- int Alphabet::next_code( char* &string, bool extended, bool insert )
477
-
478
- {
479
- if (*string == 0)
480
- return EOF; // finished
481
-
482
- int c = next_mcsym(string, insert);
483
- if (c != EOF)
484
- return c;
485
-
486
- if (extended && *string == '\\')
487
- string++; // remove quotation
488
-
489
- if (utf8) {
490
- unsigned int c = utf8toint( &string );
491
- return (int)add_symbol(int2utf8(c));
492
- }
493
- else {
494
- char buffer[2];
495
- buffer[0] = *string;
496
- buffer[1] = 0;
497
- string++;
498
- return (int)add_symbol(buffer);
499
- }
500
- }
501
-
502
-
503
- /*******************************************************************/
504
- /* */
505
- /* Alphabet::next_label */
506
- /* */
507
- /*******************************************************************/
508
-
509
- Label Alphabet::next_label( char* &string, bool extended )
510
-
511
- {
512
- // read first character
513
- int c = next_code( string, extended );
514
- if (c == EOF)
515
- return Label(); // end of string reached
516
-
517
- Character lc=(Character)c;
518
- if (!extended || *string != ':') { // single character?
519
- if (lc == Label::epsilon)
520
- return next_label(string, extended); // ignore epsilon
521
- return Label(lc);
522
- }
523
-
524
- // read second character
525
- string++; // jump over ':'
526
- c = next_code( string );
527
- if (c == EOF) {
528
- static char buffer[1000];
529
- sprintf(buffer,"Error: incomplete symbol in input file: %s", string);
530
- throw buffer;
531
- }
532
-
533
- Label l(lc, (Character)c);
534
- if (l.is_epsilon())
535
- return next_label(string, extended); // ignore epsilon transitions
536
- return l;
537
- }
538
-
539
-
540
- /*******************************************************************/
541
- /* */
542
- /* Alphabet::string2symseq */
543
- /* */
544
- /*******************************************************************/
545
-
546
- void Alphabet::string2symseq( char *s, vector<Character> &ch )
547
-
548
- {
549
- int c;
550
- while ((c = next_code(s, false)) != EOF)
551
- ch.push_back((Character)c);
552
- }
553
-
554
-
555
- /*******************************************************************/
556
- /* */
557
- /* Alphabet::string2labelseq */
558
- /* */
559
- /*******************************************************************/
560
-
561
- void Alphabet::string2labelseq( char *s, vector<Label> &labels )
562
-
563
- {
564
- Label l;
565
- while ((l = next_label(s)) != Label::epsilon)
566
- labels.push_back(l);
567
- }
568
-
569
-
570
- /*******************************************************************/
571
- /* */
572
- /* Alphabet::store */
573
- /* */
574
- /*******************************************************************/
575
-
576
- void Alphabet::store( FILE *file ) const
577
-
578
- {
579
- char c=(utf8)? 1: 0;
580
- fputc(c, file);
581
-
582
- // write the symbol mapping
583
- Character n=cm.size();
584
- fwrite(&n, sizeof(n), 1, file);
585
- for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
586
- Character c=it->first;
587
- char *s=it->second;
588
- fwrite(&c, sizeof(c), 1, file);
589
- fwrite(s, sizeof(char), strlen(s)+1, file);
590
- }
591
-
592
- // write the character pairs
593
- n = size();
594
- fwrite(&n, sizeof(n), 1, file);
595
- for( LabelSet::const_iterator p=ls.begin(); p!=ls.end(); p++ ) {
596
- Character c=p->lower_char();
597
- fwrite(&c, sizeof(c), 1, file);
598
- c = p->upper_char();
599
- fwrite(&c, sizeof(c), 1, file);
600
- }
601
-
602
- if (ferror(file))
603
- throw "Error encountered while writing alphabet to file\n";
604
- }
605
-
606
-
607
- /*******************************************************************/
608
- /* */
609
- /* Alphabet::read */
610
- /* */
611
- /*******************************************************************/
612
-
613
- void Alphabet::read( FILE *file )
614
-
615
- {
616
- utf8 = (fgetc(file) != 0);
617
-
618
- // read the symbol mapping
619
- Character n=0;
620
- read_num(&n, sizeof(n), file);
621
- for( unsigned i=0; i<n; i++) {
622
- char buffer[BUFFER_SIZE];
623
- Character c;
624
- read_num(&c, sizeof(c), file);
625
- if (!read_string(buffer, BUFFER_SIZE, file) ||
626
- feof(file) || ferror(file))
627
- throw "Error1 occurred while reading alphabet!\n";
628
- add_symbol(buffer, c);
629
- }
630
-
631
- // read the character pairs
632
- read_num(&n, sizeof(n), file);
633
- if (ferror(file))
634
- throw "Error2 occurred while reading alphabet!\n";
635
- for( unsigned i=0; i<n; i++) {
636
- Character lc, uc;
637
- read_num(&lc, sizeof(lc), file);
638
- read_num(&uc, sizeof(uc), file);
639
- insert(Label(lc, uc));
640
- }
641
- if (ferror(file))
642
- throw "Error3 occurred while reading alphabet!\n";
643
- }
644
-
645
-
646
- /*******************************************************************/
647
- /* */
648
- /* Alphabet::compute_score */
649
- /* */
650
- /*******************************************************************/
651
-
652
- int Alphabet::compute_score( Analysis &ana )
653
-
654
- {
655
- // check whether the morpheme boundaries are explicitly marked
656
- // with <X> tags
657
- int score=0;
658
- for( size_t i=0; i<ana.size(); i++ ) {
659
-
660
- // get next symbol
661
- const char *sym=write_char(ana[i].lower_char());
662
-
663
- if (strcmp(sym,"<X>") == 0)
664
- score--;
665
- }
666
- if (score < 0)
667
- return score;
668
-
669
- // No explicit morphome boundary markers have been found.
670
- // Count the number of part-of-speech and PREF tags.
671
- for( size_t i=0; i<ana.size(); i++ ) {
672
-
673
- // get next symbol
674
- const char *sym=write_char(ana[i].lower_char());
675
-
676
- // Is it not a multi-character symbol
677
- if (sym[0] != '<' || sym[1] == 0)
678
- continue;
679
-
680
- // Is it a POS tag starting with "+" like <+NN>?
681
- if (sym[1] == '+') {
682
- const char *t=sym+2;
683
- for( ; *t >= 'A' && *t <= 'Z'; t++) ;
684
- if (t > sym+2 && *t == '>')
685
- return score;
686
- }
687
-
688
- // Is it a potential POS tag (i.e. all uppercase)?
689
- const char *t = sym+1;
690
- for( ; *t >= 'A' && *t <= 'Z'; t++) ;
691
- if (t == sym+1 || *t != '>')
692
- continue;
693
-
694
- // uppercase symbol found
695
- if (strcmp(sym,"<SUFF>") == 0 ||
696
- strcmp(sym,"<OLDORTH>") == 0 ||
697
- strcmp(sym,"<NEWORTH>") == 0)
698
- continue; // not what we are looking for
699
-
700
- // disprefer nouns with prefixes
701
- if (strcmp(sym,"<PREF>") == 0)
702
- score-=2;
703
-
704
- if (strcmp(sym,"<V>") == 0 || strcmp(sym,"<ADJ>") == 0) {
705
- bool is_verb=(strcmp(sym,"<V>")==0);
706
- // get the next non-empty symbol
707
- Character c=Label::epsilon;
708
- size_t k;
709
- for( k=i+1; k<ana.size(); k++ )
710
- if ((c = ana[k].lower_char()) != Label::epsilon)
711
- break;
712
- // Is it a participle
713
- if (c != Label::epsilon) {
714
- sym = write_char(c);
715
- if (strcmp(sym,"<OLDORTH>") == 0 || strcmp(sym,"<NEWORTH>") == 0) {
716
- for( k++; k<ana.size(); k++ )
717
- if ((c = ana[k].lower_char()) != Label::epsilon)
718
- break;
719
- if (c != Label::epsilon)
720
- sym = write_char(c);
721
- }
722
- if (is_verb &&
723
- (strcmp(sym,"<PPres>") == 0 || strcmp(sym,"<PPast>") == 0))
724
- continue; // don't consider participles as complex
725
- if (!is_verb &&
726
- (strcmp(sym,"<Sup>") == 0 || strcmp(sym,"<Comp>") == 0))
727
- continue; // don't consider participles as complex
728
- }
729
- }
730
- score--;
731
- }
732
- return score;
733
- }
734
-
735
-
736
-
737
- /*******************************************************************/
738
- /* */
739
- /* Alphabet::disambiguate */
740
- /* */
741
- /*******************************************************************/
742
-
743
- void Alphabet::disambiguate( vector<Analysis> &analyses )
744
-
745
- {
746
- // compute the scores
747
- int bestscore=INT_MIN;
748
- vector<int> score;
749
-
750
- for( size_t i=0; i<analyses.size(); i++ ) {
751
- score.push_back(compute_score(analyses[i]));
752
- if (bestscore < score[i])
753
- bestscore = score[i];
754
- }
755
-
756
- // delete suboptimal analyses
757
- size_t k=0;
758
- for( size_t i=0; i<analyses.size(); i++ )
759
- if (score[i] == bestscore)
760
- analyses[k++] = analyses[i];
761
- analyses.resize(k);
762
- }
763
-
764
-
765
-
766
- /*******************************************************************/
767
- /* */
768
- /* Alphabet::print_analysis */
769
- /* */
770
- /*******************************************************************/
771
-
772
- char *Alphabet::print_analysis( Analysis &ana, bool both_layers )
773
-
774
- {
775
- vector<char> ch;
776
-
777
- // for each transition
778
- for( size_t i=0; i<ana.size(); i++ ) {
779
-
780
- // get the transition label
781
- Label l=ana[i];
782
- const char *s;
783
-
784
- // either print the analysis symbol or the whole label
785
- if (both_layers) {
786
- s = write_label(l);
787
- // quote colons
788
- if (strcmp(s,":") == 0)
789
- ch.push_back('\\');
790
- }
791
- else if (l.lower_char() != Label::epsilon)
792
- s = write_char(l.lower_char());
793
- else
794
- continue;
795
-
796
- // copy the characters to the character array
797
- while (*s)
798
- ch.push_back(*(s++));
799
- }
800
- ch.push_back(0); // terminate the string
801
-
802
- static char *result=NULL;
803
- if (result != NULL)
804
- delete[] result;
805
- result = new char[ch.size()];
806
- for( size_t i=0; i<ch.size(); i++ )
807
- result[i] = ch[i];
808
-
809
- return result;
810
- }
811
-
812
-