ruby-sfst 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
@@ -12,8 +12,11 @@
|
|
12
12
|
#ifndef _UTF8_H_
|
13
13
|
#define _UTF8_H_
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
15
|
+
namespace SFST {
|
16
|
+
|
17
|
+
unsigned int utf8toint( char *s );
|
18
|
+
unsigned int utf8toint( char **s );
|
19
|
+
char *int2utf8( unsigned int );
|
20
|
+
|
21
|
+
}
|
19
22
|
#endif
|
data/lib/sfst.rb
CHANGED
data/lib/sfst/version.rb
CHANGED
data/ruby-sfst.gemspec
CHANGED
@@ -3,28 +3,28 @@ lib = File.expand_path('../lib', __FILE__)
|
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'sfst/version'
|
5
5
|
|
6
|
-
Gem::Specification.new do |s|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
s.files += Dir.glob("ext/**/*.C")
|
13
|
-
s.files += Dir.glob("ext/**/*.h")
|
14
|
-
s.files += Dir.glob("ext/**/*.rb")
|
15
|
-
s.files += Dir.glob("ext/**/*.cc")
|
16
|
-
s.files += Dir.glob("lib/**/*.rb")
|
17
|
-
s.files += Dir.glob("test/*.fst")
|
18
|
-
s.files += Dir.glob("test/*.rb")
|
19
|
-
s.homepage = "http://github.com/mlj/ruby-sfst"
|
20
|
-
s.licenses = ['GPL2']
|
21
|
-
s.name = "ruby-sfst"
|
22
|
-
s.require_paths = ["lib"]
|
23
|
-
s.required_rubygems_version = '>= 1.3.5'
|
24
|
-
s.extensions = ["ext/sfst_machine/extconf.rb"]
|
25
|
-
s.test_files += Dir.glob("test/*.rb")
|
26
|
-
s.version = SFST::VERSION
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "ruby-sfst"
|
8
|
+
spec.version = SFST::VERSION
|
9
|
+
spec.authors = ["Marius L. Jøhndal"]
|
10
|
+
spec.email = ["mariuslj@ifi.uio.no"]
|
11
|
+
spec.license = 'GPL2'
|
27
12
|
|
28
|
-
|
29
|
-
|
13
|
+
spec.summary = %q{Stuttgart Finite State Transducer Tools interface}
|
14
|
+
spec.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
|
15
|
+
spec.homepage = "http://github.com/mlj/ruby-sfst"
|
16
|
+
|
17
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^spec/}) } - %w(.gitignore .rspec .travis.yml)
|
18
|
+
spec.require_paths = ["lib"]
|
19
|
+
spec.extensions = ["ext/sfst/extconf.rb"]
|
20
|
+
|
21
|
+
spec.required_ruby_version = '>= 1.9'
|
22
|
+
|
23
|
+
spec.add_development_dependency 'bundler', '~> 1.16'
|
24
|
+
spec.add_development_dependency 'rake', '~> 12.3'
|
25
|
+
spec.add_development_dependency 'rake-compiler', '~> 1.0'
|
26
|
+
spec.add_development_dependency 'rspec', '~> 3.7'
|
27
|
+
spec.add_development_dependency 'pry', '~> 0.11'
|
28
|
+
spec.add_development_dependency 'simplecov', '~> 0.15'
|
29
|
+
spec.add_development_dependency 'yard', '~> 0.9'
|
30
30
|
end
|
metadata
CHANGED
@@ -1,17 +1,45 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-sfst
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.3
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-12-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.16'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.16'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '12.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '12.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
15
43
|
requirement: !ruby/object:Gem::Requirement
|
16
44
|
requirements:
|
17
45
|
- - "~>"
|
@@ -25,55 +53,101 @@ dependencies:
|
|
25
53
|
- !ruby/object:Gem::Version
|
26
54
|
version: '1.0'
|
27
55
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
56
|
+
name: rspec
|
29
57
|
requirement: !ruby/object:Gem::Requirement
|
30
58
|
requirements:
|
31
59
|
- - "~>"
|
32
60
|
- !ruby/object:Gem::Version
|
33
|
-
version: '3.
|
61
|
+
version: '3.7'
|
34
62
|
type: :development
|
35
63
|
prerelease: false
|
36
64
|
version_requirements: !ruby/object:Gem::Requirement
|
37
65
|
requirements:
|
38
66
|
- - "~>"
|
39
67
|
- !ruby/object:Gem::Version
|
40
|
-
version: '3.
|
68
|
+
version: '3.7'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.11'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.11'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.15'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.15'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: yard
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.9'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.9'
|
41
111
|
description: A wrapper for the Stuttgart Finite State Transducer Tools (SFST).
|
42
112
|
email:
|
43
|
-
- mariuslj
|
113
|
+
- mariuslj@ifi.uio.no
|
44
114
|
executables: []
|
45
115
|
extensions:
|
46
|
-
- ext/sfst_machine/extconf.rb
|
116
|
+
- ext/sfst/extconf.rb
|
47
117
|
extra_rdoc_files: []
|
48
118
|
files:
|
49
119
|
- CHANGELOG.md
|
120
|
+
- COPYING
|
121
|
+
- Gemfile
|
122
|
+
- Gemfile.lock
|
50
123
|
- README.md
|
51
124
|
- Rakefile
|
52
|
-
-
|
53
|
-
-
|
54
|
-
- ext/
|
55
|
-
- ext/
|
56
|
-
- ext/
|
57
|
-
- ext/
|
58
|
-
- ext/
|
59
|
-
- ext/
|
60
|
-
- ext/
|
61
|
-
- ext/
|
62
|
-
- ext/
|
63
|
-
- ext/
|
64
|
-
- ext/
|
65
|
-
- ext/
|
66
|
-
- ext/
|
67
|
-
- ext/
|
68
|
-
- ext/
|
69
|
-
- ext/
|
70
|
-
- ext/
|
71
|
-
- ext/sfst_machine
|
125
|
+
- bin/console
|
126
|
+
- bin/setup
|
127
|
+
- ext/sfst/alphabet.cc
|
128
|
+
- ext/sfst/alphabet.h
|
129
|
+
- ext/sfst/basic.cc
|
130
|
+
- ext/sfst/basic.h
|
131
|
+
- ext/sfst/compact.cc
|
132
|
+
- ext/sfst/compact.h
|
133
|
+
- ext/sfst/determinise.cc
|
134
|
+
- ext/sfst/extconf.rb
|
135
|
+
- ext/sfst/fst.cc
|
136
|
+
- ext/sfst/fst.h
|
137
|
+
- ext/sfst/hopcroft.cc
|
138
|
+
- ext/sfst/interface.cc
|
139
|
+
- ext/sfst/interface.h
|
140
|
+
- ext/sfst/make-compact.cc
|
141
|
+
- ext/sfst/make-compact.h
|
142
|
+
- ext/sfst/mem.h
|
143
|
+
- ext/sfst/operators.cc
|
144
|
+
- ext/sfst/sfst_machine.cc
|
145
|
+
- ext/sfst/sgi.h
|
146
|
+
- ext/sfst/utf8.cc
|
147
|
+
- ext/sfst/utf8.h
|
72
148
|
- lib/sfst.rb
|
73
149
|
- lib/sfst/version.rb
|
74
150
|
- ruby-sfst.gemspec
|
75
|
-
- test/test_sfst.fst
|
76
|
-
- test/test_sfst.rb
|
77
151
|
homepage: http://github.com/mlj/ruby-sfst
|
78
152
|
licenses:
|
79
153
|
- GPL2
|
@@ -86,18 +160,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
86
160
|
requirements:
|
87
161
|
- - ">="
|
88
162
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
163
|
+
version: '1.9'
|
90
164
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
165
|
requirements:
|
92
166
|
- - ">="
|
93
167
|
- !ruby/object:Gem::Version
|
94
|
-
version: 1.3.5
|
168
|
+
version: '0'
|
95
169
|
requirements: []
|
96
170
|
rubyforge_project:
|
97
|
-
rubygems_version: 2.
|
171
|
+
rubygems_version: 2.6.14
|
98
172
|
signing_key:
|
99
173
|
specification_version: 4
|
100
174
|
summary: Stuttgart Finite State Transducer Tools interface
|
101
|
-
test_files:
|
102
|
-
- test/test_sfst.rb
|
103
|
-
has_rdoc:
|
175
|
+
test_files: []
|
@@ -1,812 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* FILE alphabet.C */
|
5
|
-
/* MODULE alphabet */
|
6
|
-
/* PROGRAM SFST */
|
7
|
-
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
-
/* */
|
9
|
-
/* PURPOSE basic FST functions */
|
10
|
-
/* */
|
11
|
-
/*******************************************************************/
|
12
|
-
|
13
|
-
#include <climits>
|
14
|
-
#include <cstring>
|
15
|
-
|
16
|
-
#include "utf8.h"
|
17
|
-
#include "alphabet.h"
|
18
|
-
|
19
|
-
using std::vector;
|
20
|
-
using std::ostream;
|
21
|
-
|
22
|
-
const int BUFFER_SIZE=100000;
|
23
|
-
|
24
|
-
char EpsilonString[]="<>";
|
25
|
-
|
26
|
-
|
27
|
-
/*******************************************************************/
|
28
|
-
/* */
|
29
|
-
/* Alphabet::add */
|
30
|
-
/* */
|
31
|
-
/*******************************************************************/
|
32
|
-
|
33
|
-
void Alphabet::add( const char *symbol, Character c )
|
34
|
-
|
35
|
-
{
|
36
|
-
char *s = fst_strdup(symbol);
|
37
|
-
cm[c] = s;
|
38
|
-
sm[s] = c;
|
39
|
-
}
|
40
|
-
|
41
|
-
|
42
|
-
/*******************************************************************/
|
43
|
-
/* */
|
44
|
-
/* Alphabet::Alphabet */
|
45
|
-
/* */
|
46
|
-
/*******************************************************************/
|
47
|
-
|
48
|
-
Alphabet::Alphabet()
|
49
|
-
|
50
|
-
{
|
51
|
-
utf8 = false;
|
52
|
-
add(EpsilonString, Label::epsilon);
|
53
|
-
}
|
54
|
-
|
55
|
-
|
56
|
-
/*******************************************************************/
|
57
|
-
/* */
|
58
|
-
/* Alphabet::clear */
|
59
|
-
/* */
|
60
|
-
/*******************************************************************/
|
61
|
-
|
62
|
-
void Alphabet::clear()
|
63
|
-
|
64
|
-
{
|
65
|
-
char **s=new char*[cm.size()];
|
66
|
-
ls.clear();
|
67
|
-
sm.clear();
|
68
|
-
|
69
|
-
size_t i, n=0;
|
70
|
-
for( CharMap::iterator it=cm.begin(); it!=cm.end(); it++ )
|
71
|
-
s[n++] = it->second;
|
72
|
-
cm.clear();
|
73
|
-
|
74
|
-
for( i=0; i<n; i++ )
|
75
|
-
free(s[i]);
|
76
|
-
delete[] s;
|
77
|
-
}
|
78
|
-
|
79
|
-
|
80
|
-
/*******************************************************************/
|
81
|
-
/* */
|
82
|
-
/* Alphabet::new_marker */
|
83
|
-
/* */
|
84
|
-
/*******************************************************************/
|
85
|
-
|
86
|
-
Character Alphabet::new_marker()
|
87
|
-
|
88
|
-
{
|
89
|
-
// find some unused character code
|
90
|
-
for(Character i=1; i!=0; i++)
|
91
|
-
if (cm.find(i) == cm.end()) {
|
92
|
-
// create a unique identifier string
|
93
|
-
char symbol[100];
|
94
|
-
sprintf(symbol,">%ld<",(long)i);
|
95
|
-
add(symbol, i);
|
96
|
-
return i;
|
97
|
-
}
|
98
|
-
|
99
|
-
throw "Error: too many symbols in transducer definition";
|
100
|
-
}
|
101
|
-
|
102
|
-
|
103
|
-
/*******************************************************************/
|
104
|
-
/* */
|
105
|
-
/* is_marker_symbol */
|
106
|
-
/* */
|
107
|
-
/*******************************************************************/
|
108
|
-
|
109
|
-
static bool is_marker_symbol( const char *s )
|
110
|
-
|
111
|
-
{
|
112
|
-
// recognize strings matching the expression ">[0-9]+<"
|
113
|
-
if (s != NULL && *s == '>') {
|
114
|
-
do { s++; } while (*s >= '0' && *s <= '9');
|
115
|
-
if (*s=='<' && *(s+1) == 0 && *(s-1) != '>')
|
116
|
-
return true;
|
117
|
-
}
|
118
|
-
return false;
|
119
|
-
}
|
120
|
-
|
121
|
-
|
122
|
-
/*******************************************************************/
|
123
|
-
/* */
|
124
|
-
/* Alphabet::delete_markers */
|
125
|
-
/* */
|
126
|
-
/*******************************************************************/
|
127
|
-
|
128
|
-
void Alphabet::delete_markers()
|
129
|
-
|
130
|
-
{
|
131
|
-
vector<char*> sym;
|
132
|
-
vector<Character> code;
|
133
|
-
vector<Label> label;
|
134
|
-
|
135
|
-
for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
|
136
|
-
Character c=it->first;
|
137
|
-
char *s=it->second;
|
138
|
-
if (!is_marker_symbol(s)) {
|
139
|
-
sym.push_back(fst_strdup(s));
|
140
|
-
code.push_back(c);
|
141
|
-
}
|
142
|
-
}
|
143
|
-
|
144
|
-
for( LabelSet::const_iterator it=begin(); it!=end(); it++ ) {
|
145
|
-
Label l=*it;
|
146
|
-
if (!is_marker_symbol(code2symbol(l.upper_char())) &&
|
147
|
-
!is_marker_symbol(code2symbol(l.lower_char())))
|
148
|
-
label.push_back(l);
|
149
|
-
}
|
150
|
-
|
151
|
-
clear();
|
152
|
-
|
153
|
-
for( size_t i=0; i<sym.size(); i++ ) {
|
154
|
-
add_symbol(sym[i], code[i]);
|
155
|
-
free(sym[i]);
|
156
|
-
}
|
157
|
-
for( size_t i=0; i<label.size(); i++ )
|
158
|
-
insert( label[i] );
|
159
|
-
}
|
160
|
-
|
161
|
-
|
162
|
-
/*******************************************************************/
|
163
|
-
/* */
|
164
|
-
/* Alphabet::add_symbol */
|
165
|
-
/* */
|
166
|
-
/*******************************************************************/
|
167
|
-
|
168
|
-
Character Alphabet::add_symbol(const char *symbol)
|
169
|
-
|
170
|
-
{
|
171
|
-
if (sm.find(symbol) != sm.end())
|
172
|
-
return sm[symbol];
|
173
|
-
|
174
|
-
// assign the symbol to some unused character
|
175
|
-
for(Character i=1; i!=0; i++)
|
176
|
-
if (cm.find(i) == cm.end()) {
|
177
|
-
add(symbol, i);
|
178
|
-
return i;
|
179
|
-
}
|
180
|
-
|
181
|
-
throw "Error: too many symbols in transducer definition";
|
182
|
-
}
|
183
|
-
|
184
|
-
|
185
|
-
/*******************************************************************/
|
186
|
-
/* */
|
187
|
-
/* Alphabet::add_symbol */
|
188
|
-
/* */
|
189
|
-
/*******************************************************************/
|
190
|
-
|
191
|
-
void Alphabet::add_symbol( const char *symbol, Character c )
|
192
|
-
|
193
|
-
{
|
194
|
-
// check whether the symbol was previously defined
|
195
|
-
int sc=symbol2code(symbol);
|
196
|
-
if (sc != EOF) {
|
197
|
-
if ((Character)sc == c)
|
198
|
-
return;
|
199
|
-
|
200
|
-
if (strlen(symbol) < 60) {
|
201
|
-
static char message[100];
|
202
|
-
sprintf(message, "Error: reinserting symbol '%s' in alphabet with incompatible character value %u %u", symbol, (unsigned)sc, (unsigned)c);
|
203
|
-
throw message;
|
204
|
-
}
|
205
|
-
else
|
206
|
-
throw "reinserting symbol in alphabet with incompatible character value";
|
207
|
-
}
|
208
|
-
|
209
|
-
// check whether the character is already in use
|
210
|
-
const char *s=code2symbol(c);
|
211
|
-
if (s == NULL)
|
212
|
-
add(symbol, c);
|
213
|
-
else {
|
214
|
-
if (strcmp(s, symbol) != 0) {
|
215
|
-
static char message[100];
|
216
|
-
if (strlen(symbol) < 70)
|
217
|
-
sprintf(message,"Error: defining symbol %s as character %d (previously defined as %s)", symbol, (unsigned)c, s);
|
218
|
-
else
|
219
|
-
sprintf(message,"Error: defining a (very long) symbol with previously used character");
|
220
|
-
throw message;
|
221
|
-
}
|
222
|
-
}
|
223
|
-
}
|
224
|
-
|
225
|
-
|
226
|
-
/*******************************************************************/
|
227
|
-
/* */
|
228
|
-
/* Alphabet::write_char */
|
229
|
-
/* */
|
230
|
-
/*******************************************************************/
|
231
|
-
|
232
|
-
void Alphabet::write_char( Character c, char *buffer, int *pos,
|
233
|
-
bool with_brackets) const
|
234
|
-
{
|
235
|
-
const char *s = code2symbol(c);
|
236
|
-
|
237
|
-
if (s) {
|
238
|
-
int i = 0;
|
239
|
-
int l=strlen(s)-1;
|
240
|
-
if (!with_brackets && s[i] == '<' && s[l] == '>') { i++; l--; }
|
241
|
-
while (i <= l)
|
242
|
-
buffer[(*pos)++] = s[i++];
|
243
|
-
}
|
244
|
-
else {
|
245
|
-
unsigned int uc = c;
|
246
|
-
if (uc>=32 && uc<256)
|
247
|
-
buffer[(*pos)++] = (char)c;
|
248
|
-
else {
|
249
|
-
sprintf(buffer+(*pos),"\\%u", uc);
|
250
|
-
*pos += strlen(buffer+(*pos));
|
251
|
-
}
|
252
|
-
}
|
253
|
-
buffer[*pos] = '\0';
|
254
|
-
}
|
255
|
-
|
256
|
-
|
257
|
-
/*******************************************************************/
|
258
|
-
/* */
|
259
|
-
/* Alphabet::write_char */
|
260
|
-
/* */
|
261
|
-
/*******************************************************************/
|
262
|
-
|
263
|
-
const char *Alphabet::write_char( Character c, bool with_brackets ) const
|
264
|
-
|
265
|
-
{
|
266
|
-
static char buffer[1000];
|
267
|
-
int n=0;
|
268
|
-
|
269
|
-
write_char( c, buffer, &n, with_brackets );
|
270
|
-
return buffer;
|
271
|
-
}
|
272
|
-
|
273
|
-
|
274
|
-
/*******************************************************************/
|
275
|
-
/* */
|
276
|
-
/* Alphabet::write_label */
|
277
|
-
/* */
|
278
|
-
/*******************************************************************/
|
279
|
-
|
280
|
-
void Alphabet::write_label( Label l, char *buffer, int *pos,
|
281
|
-
bool with_brackets ) const
|
282
|
-
{
|
283
|
-
Character lc=l.lower_char();
|
284
|
-
Character uc=l.upper_char();
|
285
|
-
write_char( lc, buffer, pos, with_brackets );
|
286
|
-
if (lc != uc) {
|
287
|
-
buffer[(*pos)++] = ':';
|
288
|
-
write_char( uc, buffer, pos, with_brackets );
|
289
|
-
}
|
290
|
-
}
|
291
|
-
|
292
|
-
|
293
|
-
/*******************************************************************/
|
294
|
-
/* */
|
295
|
-
/* Alphabet::write_label */
|
296
|
-
/* */
|
297
|
-
/*******************************************************************/
|
298
|
-
|
299
|
-
const char *Alphabet::write_label( Label l, bool with_brackets ) const
|
300
|
-
|
301
|
-
{
|
302
|
-
static char buffer[1000];
|
303
|
-
int n=0;
|
304
|
-
write_label( l, buffer, &n, with_brackets );
|
305
|
-
return buffer;
|
306
|
-
}
|
307
|
-
|
308
|
-
|
309
|
-
/*******************************************************************/
|
310
|
-
/* */
|
311
|
-
/* Alphabet::insert_symbols */
|
312
|
-
/* */
|
313
|
-
/*******************************************************************/
|
314
|
-
|
315
|
-
void Alphabet::insert_symbols( const Alphabet &a )
|
316
|
-
|
317
|
-
{
|
318
|
-
for( CharMap::const_iterator it=a.cm.begin(); it!=a.cm.end(); it++ )
|
319
|
-
add_symbol(it->second, it->first);
|
320
|
-
}
|
321
|
-
|
322
|
-
|
323
|
-
/*******************************************************************/
|
324
|
-
/* */
|
325
|
-
/* Alphabet::complement */
|
326
|
-
/* */
|
327
|
-
/*******************************************************************/
|
328
|
-
|
329
|
-
void Alphabet::complement( vector<Character> &sym )
|
330
|
-
|
331
|
-
{
|
332
|
-
vector<Character> result;
|
333
|
-
for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
|
334
|
-
Character c = it->first;
|
335
|
-
if (c != Label::epsilon) {
|
336
|
-
size_t i;
|
337
|
-
for( i=0; i<sym.size(); i++ )
|
338
|
-
if (sym[i] == c)
|
339
|
-
break;
|
340
|
-
if (i == sym.size())
|
341
|
-
result.push_back(c);
|
342
|
-
}
|
343
|
-
}
|
344
|
-
sym.swap(result);
|
345
|
-
}
|
346
|
-
|
347
|
-
|
348
|
-
/*******************************************************************/
|
349
|
-
/* */
|
350
|
-
/* Alphabet::copy */
|
351
|
-
/* */
|
352
|
-
/*******************************************************************/
|
353
|
-
|
354
|
-
void Alphabet::copy( const Alphabet &a )
|
355
|
-
|
356
|
-
{
|
357
|
-
insert_symbols( a );
|
358
|
-
utf8 = a.utf8;
|
359
|
-
for( LabelSet::const_iterator it=a.begin(); it!=a.end(); it++ )
|
360
|
-
ls.insert( *it );
|
361
|
-
}
|
362
|
-
|
363
|
-
|
364
|
-
/*******************************************************************/
|
365
|
-
/* */
|
366
|
-
/* Alphabet::compose */
|
367
|
-
/* */
|
368
|
-
/*******************************************************************/
|
369
|
-
|
370
|
-
void Alphabet::compose( const Alphabet &la, const Alphabet &ua )
|
371
|
-
|
372
|
-
{
|
373
|
-
// insert the symbols
|
374
|
-
insert_symbols(la);
|
375
|
-
insert_symbols(ua);
|
376
|
-
utf8 = la.utf8;
|
377
|
-
|
378
|
-
hash_map<Character, hash_set<Character> > cs;
|
379
|
-
|
380
|
-
// create a hash table for a quick lookup of the target characters
|
381
|
-
for( iterator it=ua.begin(); it!=ua.end(); it++ ) {
|
382
|
-
Character lc=it->lower_char();
|
383
|
-
if (lc == Label::epsilon)
|
384
|
-
insert(*it);
|
385
|
-
else
|
386
|
-
cs[lc].insert(it->upper_char());
|
387
|
-
}
|
388
|
-
|
389
|
-
for( iterator it=la.begin(); it!=la.end(); it++ ) {
|
390
|
-
Character uc=it->upper_char();
|
391
|
-
if (uc == Label::epsilon)
|
392
|
-
insert(*it);
|
393
|
-
else {
|
394
|
-
if (cs.find(uc) != cs.end()) {
|
395
|
-
hash_set<Character> s=cs[uc];
|
396
|
-
Character lc=it->lower_char();
|
397
|
-
for( hash_set<Character>::iterator it=s.begin(); it!=s.end(); it++)
|
398
|
-
insert(Label(lc, *it));
|
399
|
-
}
|
400
|
-
}
|
401
|
-
}
|
402
|
-
}
|
403
|
-
|
404
|
-
|
405
|
-
/*******************************************************************/
|
406
|
-
/* */
|
407
|
-
/* operator<<(Alphabet) */
|
408
|
-
/* */
|
409
|
-
/*******************************************************************/
|
410
|
-
|
411
|
-
ostream &operator<<( ostream &s, const Alphabet &a )
|
412
|
-
|
413
|
-
{
|
414
|
-
for( Alphabet::CharMap::const_iterator it=a.cm.begin(); it!=a.cm.end(); it++ )
|
415
|
-
s << it->first << " -> " << it->second << "\n";
|
416
|
-
for( Alphabet::iterator it=a.begin(); it!=a.end(); it++ )
|
417
|
-
s << a.write_label(*it) << " ";
|
418
|
-
s << "\n";
|
419
|
-
return s;
|
420
|
-
}
|
421
|
-
|
422
|
-
|
423
|
-
/*******************************************************************/
|
424
|
-
/* */
|
425
|
-
/* Alphabet::next_mcsym */
|
426
|
-
/* */
|
427
|
-
/* recognizes multi-character symbols which are enclosed with */
|
428
|
-
/* angle brackets <...>. If the argument flag insert is false, */
|
429
|
-
/* the multi-character symbol must be already in the lexicon in */
|
430
|
-
/* order to be recognized. */
|
431
|
-
/* */
|
432
|
-
/*******************************************************************/
|
433
|
-
|
434
|
-
int Alphabet::next_mcsym( char* &string, bool insert )
|
435
|
-
|
436
|
-
{
|
437
|
-
char *start=string;
|
438
|
-
|
439
|
-
if (*start == '<')
|
440
|
-
// symbol might start here
|
441
|
-
for( char *end=start+1; *end; end++ )
|
442
|
-
if (*end == '>') {
|
443
|
-
// matching pair of angle brackets found
|
444
|
-
// mark the end of the substring with \0
|
445
|
-
char lastc = *(++end);
|
446
|
-
*end = 0;
|
447
|
-
|
448
|
-
int c;
|
449
|
-
if (insert)
|
450
|
-
c = add_symbol( start );
|
451
|
-
else
|
452
|
-
c = symbol2code(start);
|
453
|
-
// restore the original string
|
454
|
-
*end = lastc;
|
455
|
-
|
456
|
-
if (c != EOF) {
|
457
|
-
// symbol found
|
458
|
-
// return its code
|
459
|
-
string = end;
|
460
|
-
return (Character)c;
|
461
|
-
}
|
462
|
-
else
|
463
|
-
// not a complex character
|
464
|
-
break;
|
465
|
-
}
|
466
|
-
return EOF;
|
467
|
-
}
|
468
|
-
|
469
|
-
|
470
|
-
/*******************************************************************/
|
471
|
-
/* */
|
472
|
-
/* Alphabet::next_code */
|
473
|
-
/* */
|
474
|
-
/*******************************************************************/
|
475
|
-
|
476
|
-
int Alphabet::next_code( char* &string, bool extended, bool insert )
|
477
|
-
|
478
|
-
{
|
479
|
-
if (*string == 0)
|
480
|
-
return EOF; // finished
|
481
|
-
|
482
|
-
int c = next_mcsym(string, insert);
|
483
|
-
if (c != EOF)
|
484
|
-
return c;
|
485
|
-
|
486
|
-
if (extended && *string == '\\')
|
487
|
-
string++; // remove quotation
|
488
|
-
|
489
|
-
if (utf8) {
|
490
|
-
unsigned int c = utf8toint( &string );
|
491
|
-
return (int)add_symbol(int2utf8(c));
|
492
|
-
}
|
493
|
-
else {
|
494
|
-
char buffer[2];
|
495
|
-
buffer[0] = *string;
|
496
|
-
buffer[1] = 0;
|
497
|
-
string++;
|
498
|
-
return (int)add_symbol(buffer);
|
499
|
-
}
|
500
|
-
}
|
501
|
-
|
502
|
-
|
503
|
-
/*******************************************************************/
|
504
|
-
/* */
|
505
|
-
/* Alphabet::next_label */
|
506
|
-
/* */
|
507
|
-
/*******************************************************************/
|
508
|
-
|
509
|
-
Label Alphabet::next_label( char* &string, bool extended )
|
510
|
-
|
511
|
-
{
|
512
|
-
// read first character
|
513
|
-
int c = next_code( string, extended );
|
514
|
-
if (c == EOF)
|
515
|
-
return Label(); // end of string reached
|
516
|
-
|
517
|
-
Character lc=(Character)c;
|
518
|
-
if (!extended || *string != ':') { // single character?
|
519
|
-
if (lc == Label::epsilon)
|
520
|
-
return next_label(string, extended); // ignore epsilon
|
521
|
-
return Label(lc);
|
522
|
-
}
|
523
|
-
|
524
|
-
// read second character
|
525
|
-
string++; // jump over ':'
|
526
|
-
c = next_code( string );
|
527
|
-
if (c == EOF) {
|
528
|
-
static char buffer[1000];
|
529
|
-
sprintf(buffer,"Error: incomplete symbol in input file: %s", string);
|
530
|
-
throw buffer;
|
531
|
-
}
|
532
|
-
|
533
|
-
Label l(lc, (Character)c);
|
534
|
-
if (l.is_epsilon())
|
535
|
-
return next_label(string, extended); // ignore epsilon transitions
|
536
|
-
return l;
|
537
|
-
}
|
538
|
-
|
539
|
-
|
540
|
-
/*******************************************************************/
|
541
|
-
/* */
|
542
|
-
/* Alphabet::string2symseq */
|
543
|
-
/* */
|
544
|
-
/*******************************************************************/
|
545
|
-
|
546
|
-
void Alphabet::string2symseq( char *s, vector<Character> &ch )
|
547
|
-
|
548
|
-
{
|
549
|
-
int c;
|
550
|
-
while ((c = next_code(s, false)) != EOF)
|
551
|
-
ch.push_back((Character)c);
|
552
|
-
}
|
553
|
-
|
554
|
-
|
555
|
-
/*******************************************************************/
|
556
|
-
/* */
|
557
|
-
/* Alphabet::string2labelseq */
|
558
|
-
/* */
|
559
|
-
/*******************************************************************/
|
560
|
-
|
561
|
-
void Alphabet::string2labelseq( char *s, vector<Label> &labels )
|
562
|
-
|
563
|
-
{
|
564
|
-
Label l;
|
565
|
-
while ((l = next_label(s)) != Label::epsilon)
|
566
|
-
labels.push_back(l);
|
567
|
-
}
|
568
|
-
|
569
|
-
|
570
|
-
/*******************************************************************/
|
571
|
-
/* */
|
572
|
-
/* Alphabet::store */
|
573
|
-
/* */
|
574
|
-
/*******************************************************************/
|
575
|
-
|
576
|
-
void Alphabet::store( FILE *file ) const
|
577
|
-
|
578
|
-
{
|
579
|
-
char c=(utf8)? 1: 0;
|
580
|
-
fputc(c, file);
|
581
|
-
|
582
|
-
// write the symbol mapping
|
583
|
-
Character n=cm.size();
|
584
|
-
fwrite(&n, sizeof(n), 1, file);
|
585
|
-
for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
|
586
|
-
Character c=it->first;
|
587
|
-
char *s=it->second;
|
588
|
-
fwrite(&c, sizeof(c), 1, file);
|
589
|
-
fwrite(s, sizeof(char), strlen(s)+1, file);
|
590
|
-
}
|
591
|
-
|
592
|
-
// write the character pairs
|
593
|
-
n = size();
|
594
|
-
fwrite(&n, sizeof(n), 1, file);
|
595
|
-
for( LabelSet::const_iterator p=ls.begin(); p!=ls.end(); p++ ) {
|
596
|
-
Character c=p->lower_char();
|
597
|
-
fwrite(&c, sizeof(c), 1, file);
|
598
|
-
c = p->upper_char();
|
599
|
-
fwrite(&c, sizeof(c), 1, file);
|
600
|
-
}
|
601
|
-
|
602
|
-
if (ferror(file))
|
603
|
-
throw "Error encountered while writing alphabet to file\n";
|
604
|
-
}
|
605
|
-
|
606
|
-
|
607
|
-
/*******************************************************************/
|
608
|
-
/* */
|
609
|
-
/* Alphabet::read */
|
610
|
-
/* */
|
611
|
-
/*******************************************************************/
|
612
|
-
|
613
|
-
void Alphabet::read( FILE *file )
|
614
|
-
|
615
|
-
{
|
616
|
-
utf8 = (fgetc(file) != 0);
|
617
|
-
|
618
|
-
// read the symbol mapping
|
619
|
-
Character n=0;
|
620
|
-
read_num(&n, sizeof(n), file);
|
621
|
-
for( unsigned i=0; i<n; i++) {
|
622
|
-
char buffer[BUFFER_SIZE];
|
623
|
-
Character c;
|
624
|
-
read_num(&c, sizeof(c), file);
|
625
|
-
if (!read_string(buffer, BUFFER_SIZE, file) ||
|
626
|
-
feof(file) || ferror(file))
|
627
|
-
throw "Error1 occurred while reading alphabet!\n";
|
628
|
-
add_symbol(buffer, c);
|
629
|
-
}
|
630
|
-
|
631
|
-
// read the character pairs
|
632
|
-
read_num(&n, sizeof(n), file);
|
633
|
-
if (ferror(file))
|
634
|
-
throw "Error2 occurred while reading alphabet!\n";
|
635
|
-
for( unsigned i=0; i<n; i++) {
|
636
|
-
Character lc, uc;
|
637
|
-
read_num(&lc, sizeof(lc), file);
|
638
|
-
read_num(&uc, sizeof(uc), file);
|
639
|
-
insert(Label(lc, uc));
|
640
|
-
}
|
641
|
-
if (ferror(file))
|
642
|
-
throw "Error3 occurred while reading alphabet!\n";
|
643
|
-
}
|
644
|
-
|
645
|
-
|
646
|
-
/*******************************************************************/
|
647
|
-
/* */
|
648
|
-
/* Alphabet::compute_score */
|
649
|
-
/* */
|
650
|
-
/*******************************************************************/
|
651
|
-
|
652
|
-
int Alphabet::compute_score( Analysis &ana )
|
653
|
-
|
654
|
-
{
|
655
|
-
// check whether the morpheme boundaries are explicitly marked
|
656
|
-
// with <X> tags
|
657
|
-
int score=0;
|
658
|
-
for( size_t i=0; i<ana.size(); i++ ) {
|
659
|
-
|
660
|
-
// get next symbol
|
661
|
-
const char *sym=write_char(ana[i].lower_char());
|
662
|
-
|
663
|
-
if (strcmp(sym,"<X>") == 0)
|
664
|
-
score--;
|
665
|
-
}
|
666
|
-
if (score < 0)
|
667
|
-
return score;
|
668
|
-
|
669
|
-
// No explicit morpheme boundary markers have been found.
|
670
|
-
// Count the number of part-of-speech and PREF tags.
|
671
|
-
for( size_t i=0; i<ana.size(); i++ ) {
|
672
|
-
|
673
|
-
// get next symbol
|
674
|
-
const char *sym=write_char(ana[i].lower_char());
|
675
|
-
|
676
|
-
// Is it not a multi-character symbol
|
677
|
-
if (sym[0] != '<' || sym[1] == 0)
|
678
|
-
continue;
|
679
|
-
|
680
|
-
// Is it a POS tag starting with "+" like <+NN>?
|
681
|
-
if (sym[1] == '+') {
|
682
|
-
const char *t=sym+2;
|
683
|
-
for( ; *t >= 'A' && *t <= 'Z'; t++) ;
|
684
|
-
if (t > sym+2 && *t == '>')
|
685
|
-
return score;
|
686
|
-
}
|
687
|
-
|
688
|
-
// Is it a potential POS tag (i.e. all uppercase)?
|
689
|
-
const char *t = sym+1;
|
690
|
-
for( ; *t >= 'A' && *t <= 'Z'; t++) ;
|
691
|
-
if (t == sym+1 || *t != '>')
|
692
|
-
continue;
|
693
|
-
|
694
|
-
// uppercase symbol found
|
695
|
-
if (strcmp(sym,"<SUFF>") == 0 ||
|
696
|
-
strcmp(sym,"<OLDORTH>") == 0 ||
|
697
|
-
strcmp(sym,"<NEWORTH>") == 0)
|
698
|
-
continue; // not what we are looking for
|
699
|
-
|
700
|
-
// disprefer nouns with prefixes
|
701
|
-
if (strcmp(sym,"<PREF>") == 0)
|
702
|
-
score-=2;
|
703
|
-
|
704
|
-
if (strcmp(sym,"<V>") == 0 || strcmp(sym,"<ADJ>") == 0) {
|
705
|
-
bool is_verb=(strcmp(sym,"<V>")==0);
|
706
|
-
// get the next non-empty symbol
|
707
|
-
Character c=Label::epsilon;
|
708
|
-
size_t k;
|
709
|
-
for( k=i+1; k<ana.size(); k++ )
|
710
|
-
if ((c = ana[k].lower_char()) != Label::epsilon)
|
711
|
-
break;
|
712
|
-
// Is it a participle
|
713
|
-
if (c != Label::epsilon) {
|
714
|
-
sym = write_char(c);
|
715
|
-
if (strcmp(sym,"<OLDORTH>") == 0 || strcmp(sym,"<NEWORTH>") == 0) {
|
716
|
-
for( k++; k<ana.size(); k++ )
|
717
|
-
if ((c = ana[k].lower_char()) != Label::epsilon)
|
718
|
-
break;
|
719
|
-
if (c != Label::epsilon)
|
720
|
-
sym = write_char(c);
|
721
|
-
}
|
722
|
-
if (is_verb &&
|
723
|
-
(strcmp(sym,"<PPres>") == 0 || strcmp(sym,"<PPast>") == 0))
|
724
|
-
continue; // don't consider participles as complex
|
725
|
-
if (!is_verb &&
|
726
|
-
(strcmp(sym,"<Sup>") == 0 || strcmp(sym,"<Comp>") == 0))
|
727
|
-
continue; // don't consider participles as complex
|
728
|
-
}
|
729
|
-
}
|
730
|
-
score--;
|
731
|
-
}
|
732
|
-
return score;
|
733
|
-
}
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
/*******************************************************************/
|
738
|
-
/* */
|
739
|
-
/* Alphabet::disambiguate */
|
740
|
-
/* */
|
741
|
-
/*******************************************************************/
|
742
|
-
|
743
|
-
// Keeps only the best-scoring analyses (according to compute_score) and
// removes all others from the vector. The relative order of the surviving
// analyses is preserved. An empty input vector stays empty.
void Alphabet::disambiguate( vector<Analysis> &analyses )

{
  // Score every analysis once and remember the maximum.
  vector<int> scores;
  int best = INT_MIN;

  for( size_t n=0; n<analyses.size(); n++ ) {
    scores.push_back(compute_score(analyses[n]));
    if (scores[n] > best)
      best = scores[n];
  }

  // Compact the vector in place: move each optimal analysis to the front,
  // then cut off the tail holding the suboptimal ones.
  size_t kept = 0;
  for( size_t n=0; n<analyses.size(); n++ ) {
    if (scores[n] != best)
      continue;
    analyses[kept] = analyses[n];
    kept++;
  }
  analyses.resize(kept);
}
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
/*******************************************************************/
|
767
|
-
/* */
|
768
|
-
/* Alphabet::print_analysis */
|
769
|
-
/* */
|
770
|
-
/*******************************************************************/
|
771
|
-
|
772
|
-
// Converts an analysis (a sequence of transition labels) to a C string.
//
// ana:         the labels to print
// both_layers: if true, each label is printed in full via write_label and
//              a label that prints as ":" is preceded by a backslash;
//              if false, only the lower-layer symbol is printed and labels
//              whose lower symbol is epsilon are skipped.
//
// Returns a pointer to a function-static buffer. The buffer is freed and
// reallocated on the next call, so the caller must copy the string if it is
// needed beyond that point and must not delete it.
char *Alphabet::print_analysis( Analysis &ana, bool both_layers )

{
  vector<char> text;

  // for each transition label
  for( size_t pos=0; pos<ana.size(); pos++ ) {
    Label label = ana[pos];
    const char *sym;

    if (!both_layers) {
      // analysis layer only: epsilon symbols produce no output
      if (label.lower_char() == Label::epsilon)
        continue;
      sym = write_char(label.lower_char());
    }
    else {
      sym = write_label(label);
      // quote colons
      if (strcmp(sym,":") == 0)
        text.push_back('\\');
    }

    // append the symbol's characters
    for( ; *sym; sym++ )
      text.push_back(*sym);
  }
  text.push_back(0); // terminate the string

  // Release the buffer handed out by the previous call (deleting a null
  // pointer is a no-op) and copy the new string into a fresh one.
  static char *result=NULL;
  delete[] result;
  const size_t total = text.size();
  result = new char[total];
  for( size_t n=0; n<total; n++ )
    result[n] = text[n];

  return result;
}
|
811
|
-
|
812
|
-
|