pronounce 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +19 -0
- data/data/beep/ACKNOWLEDGEMENTS +36 -0
- data/data/beep/ANNOUNCE-1.0 +27 -0
- data/data/beep/README +39 -0
- data/data/beep/addparan +22 -0
- data/data/beep/beep-1.0 +257070 -0
- data/data/beep/case.txt +166944 -0
- data/data/beep/lexicode.doc +47 -0
- data/data/beep/phoncode.doc +48 -0
- data/data/beep/phone45.tab +45 -0
- data/data/beep/sayTimit.doc +130 -0
- data/data/beep/sayTimit.pl +174 -0
- data/data/cmudict/00README_FIRST.txt +36 -0
- data/data/cmudict/README.developer +50 -0
- data/data/cmudict/README.old +79 -0
- data/data/cmudict/README.weide +67 -0
- data/data/cmudict/cmudict.0.6d +129511 -0
- data/data/cmudict/cmudict.0.7a +133369 -0
- data/data/cmudict/cmudict.0.7a.phones +39 -0
- data/data/cmudict/cmudict.0.7a.symbols +84 -0
- data/data/cmudict/scripts/CompileDictionary.sh +36 -0
- data/data/cmudict/scripts/README.txt +27 -0
- data/data/cmudict/scripts/make_baseform.pl +172 -0
- data/data/cmudict/scripts/sort_cmudict.pl +141 -0
- data/data/cmudict/scripts/test_cmudict.pl +166 -0
- data/data/cmudict/scripts/test_dict.pl +119 -0
- data/data/cmudict/sphinxdict/README.txt +19 -0
- data/data/cmudict/sphinxdict/SphinxPhones_40 +40 -0
- data/data/cmudict/sphinxdict/cmudict.0.7a_SPHINX_40 +133012 -0
- data/data/cmudict/sphinxdict/cmudict_SPHINX_40 +133012 -0
- data/lib/pronounce.rb +33 -0
- metadata +104 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
AA vowel
|
2
|
+
AE vowel
|
3
|
+
AH vowel
|
4
|
+
AO vowel
|
5
|
+
AW vowel
|
6
|
+
AY vowel
|
7
|
+
B stop
|
8
|
+
CH affricate
|
9
|
+
D stop
|
10
|
+
DH fricative
|
11
|
+
EH vowel
|
12
|
+
ER vowel
|
13
|
+
EY vowel
|
14
|
+
F fricative
|
15
|
+
G stop
|
16
|
+
HH aspirate
|
17
|
+
IH vowel
|
18
|
+
IY vowel
|
19
|
+
JH affricate
|
20
|
+
K stop
|
21
|
+
L liquid
|
22
|
+
M nasal
|
23
|
+
N nasal
|
24
|
+
NG nasal
|
25
|
+
OW vowel
|
26
|
+
OY vowel
|
27
|
+
P stop
|
28
|
+
R liquid
|
29
|
+
S fricative
|
30
|
+
SH fricative
|
31
|
+
T stop
|
32
|
+
TH fricative
|
33
|
+
UH vowel
|
34
|
+
UW vowel
|
35
|
+
V fricative
|
36
|
+
W semivowel
|
37
|
+
Y semivowel
|
38
|
+
Z fricative
|
39
|
+
ZH fricative
|
@@ -0,0 +1,84 @@
|
|
1
|
+
AA
|
2
|
+
AA0
|
3
|
+
AA1
|
4
|
+
AA2
|
5
|
+
AE
|
6
|
+
AE0
|
7
|
+
AE1
|
8
|
+
AE2
|
9
|
+
AH
|
10
|
+
AH0
|
11
|
+
AH1
|
12
|
+
AH2
|
13
|
+
AO
|
14
|
+
AO0
|
15
|
+
AO1
|
16
|
+
AO2
|
17
|
+
AW
|
18
|
+
AW0
|
19
|
+
AW1
|
20
|
+
AW2
|
21
|
+
AY
|
22
|
+
AY0
|
23
|
+
AY1
|
24
|
+
AY2
|
25
|
+
B
|
26
|
+
CH
|
27
|
+
D
|
28
|
+
DH
|
29
|
+
EH
|
30
|
+
EH0
|
31
|
+
EH1
|
32
|
+
EH2
|
33
|
+
ER
|
34
|
+
ER0
|
35
|
+
ER1
|
36
|
+
ER2
|
37
|
+
EY
|
38
|
+
EY0
|
39
|
+
EY1
|
40
|
+
EY2
|
41
|
+
F
|
42
|
+
G
|
43
|
+
HH
|
44
|
+
IH
|
45
|
+
IH0
|
46
|
+
IH1
|
47
|
+
IH2
|
48
|
+
IY
|
49
|
+
IY0
|
50
|
+
IY1
|
51
|
+
IY2
|
52
|
+
JH
|
53
|
+
K
|
54
|
+
L
|
55
|
+
M
|
56
|
+
N
|
57
|
+
NG
|
58
|
+
OW
|
59
|
+
OW0
|
60
|
+
OW1
|
61
|
+
OW2
|
62
|
+
OY
|
63
|
+
OY0
|
64
|
+
OY1
|
65
|
+
OY2
|
66
|
+
P
|
67
|
+
R
|
68
|
+
S
|
69
|
+
SH
|
70
|
+
T
|
71
|
+
TH
|
72
|
+
UH
|
73
|
+
UH0
|
74
|
+
UH1
|
75
|
+
UH2
|
76
|
+
UW
|
77
|
+
UW0
|
78
|
+
UW1
|
79
|
+
UW2
|
80
|
+
V
|
81
|
+
W
|
82
|
+
Y
|
83
|
+
Z
|
84
|
+
ZH
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#!sh
# [20080422] (air) Compile cmudict into SPHINX_40 form
# [20100118] (air)
#
# Builds the stress-free Sphinx dictionary from $DICTIONARY, validates it, and
# on success installs it under two names in $DIR: a generic one
# (${DICT_BASE}_SPHINX_40) and a versioned one (${DICTIONARY}_SPHINX_40).
# On failure both installed copies are removed.
#
# All paths are relative, so the current directory must contain the
# dictionary, scripts/ and $DIR.
#
# Exit status: 0 when the dictionary was compiled and passed the test,
# 1 otherwise.

DIR=sphinxdict
DICT_BASE=cmudict
DICTIONARY=${DICT_BASE}.0.7a

# Scratch output file; $$ (this shell's PID) keeps the name unique per run.
TMP_DICT="$DIR/$$_SPHINX_40"
STATUS=0


echo "Compiling $DICTIONARY..."

# make_baseform.pl removes stress marks and eliminates resulting duplicates
perl ./scripts/make_baseform.pl "$DICTIONARY" "$TMP_DICT" || STATUS=1


echo ""
echo "Testing sphinx cmudict... "
# Only test the scratch file if the conversion step itself succeeded.
if [ "$STATUS" -eq 0 ] && ./scripts/test_dict.pl -p "$DIR/SphinxPhones_40" "$TMP_DICT"
then
    cp -p "$TMP_DICT" "$DIR/${DICT_BASE}_SPHINX_40"
    cp -p "$TMP_DICT" "$DIR/${DICTIONARY}_SPHINX_40"
    echo "Dictionary successfully compiled"
else
    STATUS=1
    # Do not leave stale compiled dictionaries behind after a failed build.
    rm -f "$DIR/${DICT_BASE}_SPHINX_40"
    rm -f "$DIR/${DICTIONARY}_SPHINX_40"
    echo ""
    echo "$0 encountered errors"
    echo "dictionary compilation not completed"
fi

# -f: the scratch file may not exist if make_baseform.pl failed early.
rm -f "$TMP_DICT"

echo "Done"

exit "$STATUS"
#
|
@@ -0,0 +1,27 @@
|
|
1
|
+
Maintenance scripts for cmudict
|
2
|
+
-------------------------------
|
3
|
+
[20100118] (air)
|
4
|
+
|
5
|
+
Use these scripts for checking and compiling the dictionary.
|
6
|
+
|
7
|
+
The process is the following:
|
8
|
+
|
9
|
+
1) make changes to the dictionary
|
10
|
+
- it's assumed that the changes are manual
|
11
|
+
- check your work by doing a svn diff with the previous version
|
12
|
+
|
13
|
+
2) run scripts/test_cmudict.pl
|
14
|
+
EG: ./scripts/test_cmudict.pl -p cmudict.0.7a.symbols cmudict.0.7a
|
15
|
+
- this checks for collation order, legal entry format and phonetic symbols
|
16
|
+
- if necessary fix problems then repeat this step until no errors
|
17
|
+
|
18
|
+
3) run CompileDictionary*
|
19
|
+
[converts cmudict to the Sphinx format using make_baseform.pl]
|
20
|
+
[checks for consistency using test_dict.pl]
|
21
|
+
- produces two *_SPHINX_40 files; one generic the other major-versioned
|
22
|
+
|
23
|
+
4) use svn to update cmudict; be sure to add a proper logging message
|
24
|
+
|
25
|
+
That's it!
|
26
|
+
|
27
|
+
|
@@ -0,0 +1,172 @@
|
|
1
|
+
#!perl -w
|
2
|
+
|
3
|
+
#
|
4
|
+
# ====================================================================
|
5
|
+
# Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
|
6
|
+
# Rudnicky. All rights reserved.
|
7
|
+
#
|
8
|
+
# Redistribution and use in source and binary forms, with or without
|
9
|
+
# modification, are permitted provided that the following conditions
|
10
|
+
# are met:
|
11
|
+
#
|
12
|
+
# 1. Redistributions of source code must retain the above copyright
|
13
|
+
# notice, this list of conditions and the following disclaimer.
|
14
|
+
#
|
15
|
+
# 2. Redistributions in binary form must reproduce the above copyright
|
16
|
+
# notice, this list of conditions and the following disclaimer in
|
17
|
+
# the documentation and/or other materials provided with the
|
18
|
+
# distribution.
|
19
|
+
#
|
20
|
+
# This work was supported in part by funding from the Defense Advanced
|
21
|
+
# Research Projects Agency, the Office of Naval Research and the National
|
22
|
+
# Science Foundation of the United States of America, and by member
|
23
|
+
# companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
|
24
|
+
# the contributions of many volunteers to the expansion and improvement of
|
25
|
+
# this dictionary.
|
26
|
+
#
|
27
|
+
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
28
|
+
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
29
|
+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
30
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
31
|
+
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
32
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
33
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
34
|
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
35
|
+
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
36
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
37
|
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
38
|
+
#
|
39
|
+
# ====================================================================
|
40
|
+
#
|
41
|
+
|
42
|
+
# [20050309] (air) Created.
|
43
|
+
# strip out stress marks from a cmudict, producing a "SphinxPhones_40" dictionary
|
44
|
+
# [20080420] (air) Changed to pass comments.
|
45
|
+
# Fixed output collation sequence; DOS eol's
|
46
|
+
# [20090309] (air) fixed duplicate pron and collation bugs
|
47
|
+
# [20090331] (air) restored standard collation order (since other stuff depends on it)
|
48
|
+
# [20090629] (air) do not put comments into SPHINX_40 version; not all software deals with them
|
49
|
+
# [20100118] (air) added $VERBOSE; this should really be a cmdline flag...
|
50
|
+
#
|
51
|
+
|
52
|
+
|
53
|
+
# Main program: read a cmudict-style dictionary, strip stress marks, drop the
# duplicates that result, and write the surviving entries tab-separated to the
# output file.  Statistics go to STDERR.
#
# usage: make_baseform <input> <output>

# Non-zero makes get_dict() report every duplicate pronunciation on STDERR.
# Package variable (not "my") because get_dict() reads it by name.
our $VERBOSE = 0;

# Counters updated by get_dict().
my $basecount = 0;    # entries processed
my $dupl      = 0;    # pronunciations dropped as duplicates
my $base      = 0;    # distinct base forms
my $varia     = 0;    # additional variants kept

if ( @ARGV != 2 ) { die "usage: make_baseform <input> <output>\n"; }

# Three-argument open: the file names come from the command line and must not
# be interpreted as open modes or pipes.
open( my $in,  '<', $ARGV[0] ) || die "can't open $ARGV[0] for reading!\n";
open( my $out, '>', $ARGV[1] ) || die "can't open $ARGV[1] for writing!\n";

my @header = ();    # header comment lines (collected, currently not emitted)
my %dict   = ();    # words end up in here: headword => [ pronunciations ]
our %histo = ();    # variant-count statistics, filled in by get_dict()

get_dict( \%dict, \@header, $in );    # process the entries

# what have we got?
print STDERR "$basecount forms processed\n";
print STDERR "$base baseforms, $varia variants and $dupl duplicates found.\n";
print STDERR "variant distribution:\n";
# Keys are variant counts, so order them numerically.
foreach my $count ( sort { $a <=> $b } keys %histo ) {
    print STDERR "$count\t$histo{$count}\n";
}

# print special comments (copyright, etc.)
# removed since it messes some things up...
# foreach $h (@header) { print OUT "$h\n"; }

# Build the output entries: the first pronunciation keeps the bare headword,
# later ones are numbered "(2)", "(3)", ...
my %dict_out = ();
foreach my $w ( sort keys %dict ) {
    my $var = 1;    # variants will number starting with 2
    foreach my $p ( @{ $dict{$w} } ) {
        my $key = ( $var == 1 ) ? $w : "$w($var)";
        $dict_out{$key} = $p;
        $var++;
    }
}

# Emit in the collation order of the final entry strings.
foreach my $entry ( sort keys %dict_out ) {
    print {$out} "$entry\t$dict_out{$entry}\n";
}

close($in);
# Buffered write errors only surface at close, so check it.
close($out) || die "can't finish writing $ARGV[1]: $!\n";
|
104
|
+
|
105
|
+
#
|
106
|
+
#
|
107
|
+
# read in a dictionary
|
108
|
+
# Read a cmudict-style dictionary from a file handle.
#
# Arguments:
#   $dict   - hash ref filled in as headword => array ref of stress-free
#             pronunciations (first element is the base form)
#   $header - array ref; lines starting with ";;; #" are appended to it
#   $target - input file handle
#
# Side effects: updates the file-level counters $basecount, $base, $dupl and
# $varia and the %histo table; reports malformed entries (and, when $VERBOSE
# is set, duplicates) on STDERR.
sub get_dict {
    my $dict   = shift;    # data structure with dictionary entries
    my $header = shift;
    my $target = shift;    # input file handle

    while ( my $line = <$target> ) {
        $line =~ s/[\r\n]+$//;    # DOS-robust chomp

        # process comments; blank lines ignored
        # presume that ";;; #" will be collected and emitted at the top
        if ( $line =~ /^;;; \#/ ) { push @$header, $line; next; }    # save header info
        next if $line =~ /^;;;/;                                     # ignore plain comments
        next if $line =~ /^\s*$/;                                    # ignore blank lines

        # extract the word,pron pair and prepare for processing
        my ( $word, $pron ) = $line =~ /(.+?)\s+(.+?)$/;
        if ( !defined $word ) { print STDERR "bad entry (no head word): $line\n"; next; }

        $basecount++;

        # A trailing "(...)" marks a variant.  Fall back to the whole word as
        # the root when it merely ends in ")" without such a suffix, so that
        # $root is never left undefined.
        my ( $root, $variant ) = ( $word, 0 );
        if ( $word =~ /\)$/ && $word =~ m/(.+?)\((.+?)\)/ ) {
            ( $root, $variant ) = ( $1, $2 );
        }
        $pron = strip_stress($pron);

        # found a new baseform; set it up
        if ( !defined $dict->{$root} ) {
            $dict->{$root}[0] = $pron;
            $base++;
            next;
        }

        # old baseform; see if, after removed stress, pron is a duplicate
        my $is_duplicate = 0;
        foreach my $known ( @{ $dict->{$root} } ) {
            next if $known ne $pron;
            if ($VERBOSE) { print STDERR "duplicate entry: $root ($variant) $pron\n"; }
            $dupl++;
            $is_duplicate = 1;
            last;
        }

        # An empty pronunciation is never stored as an extra variant.
        next if $is_duplicate || $pron eq "";

        # it's a new variant on an existing baseform, keep it
        push @{ $dict->{$root} }, $pron;
        $varia++;
        my $count = scalar @{ $dict->{$root} };
        $histo{$count}++;    # track variant stats
        if ( $count > 4 ) { print STDERR "$root -- ", $count, "\n"; }
    }
    return;
}
|
162
|
+
|
163
|
+
|
164
|
+
# strip stress marks from phonetic symbols
|
165
|
+
# Remove trailing stress digits from every phone in a pronunciation.
#
# Takes one whitespace-separated pronunciation string (e.g. "AH0 B AW1 T") and
# returns it with each symbol's trailing digits removed and the symbols joined
# by single spaces ("AH B AW T").  Leading/trailing whitespace is discarded.
sub strip_stress {
    # Lexical list: the sub must not clobber a package-level @pron.
    my @phones = split " ", $_[0];
    s/\d+$// foreach @phones;    # no-op for symbols without a stress mark
    return join( " ", @phones );
}
|
171
|
+
|
172
|
+
#
|
@@ -0,0 +1,141 @@
|
|
1
|
+
#!perl -w
|
2
|
+
|
3
|
+
#
|
4
|
+
# ====================================================================
|
5
|
+
# Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
|
6
|
+
# Rudnicky. All rights reserved.
|
7
|
+
#
|
8
|
+
# Redistribution and use in source and binary forms, with or without
|
9
|
+
# modification, are permitted provided that the following conditions
|
10
|
+
# are met:
|
11
|
+
#
|
12
|
+
# 1. Redistributions of source code must retain the above copyright
|
13
|
+
# notice, this list of conditions and the following disclaimer.
|
14
|
+
#
|
15
|
+
# 2. Redistributions in binary form must reproduce the above copyright
|
16
|
+
# notice, this list of conditions and the following disclaimer in
|
17
|
+
# the documentation and/or other materials provided with the
|
18
|
+
# distribution.
|
19
|
+
#
|
20
|
+
# This work was supported in part by funding from the Defense Advanced
|
21
|
+
# Research Projects Agency, the Office of Naval Research and the National
|
22
|
+
# Science Foundation of the United States of America, and by member
|
23
|
+
# companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
|
24
|
+
# the contributions of many volunteers to the expansion and improvement of
|
25
|
+
# this dictionary.
|
26
|
+
#
|
27
|
+
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
28
|
+
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
29
|
+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
30
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
31
|
+
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
32
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
33
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
34
|
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
35
|
+
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
36
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
37
|
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
38
|
+
#
|
39
|
+
# ====================================================================
|
40
|
+
#
|
41
|
+
|
42
|
+
# Sort cmudict according to head entry collating sequence
|
43
|
+
|
44
|
+
# [20090331] (air) Created.
|
45
|
+
|
46
|
+
use strict;

# Main program: read a cmudict-style dictionary, then write it back out with
# all collected comment/blank lines first, followed by the entries sorted by
# headword with variants renumbered from (2).
#
# usage: sort_cmudict <input> <output>

if ( @ARGV != 2 ) { die "usage: sort_cmudict <input> <output>\n"; }

# Three-argument open: the file names come from the command line and must not
# be interpreted as open modes or pipes.
open( my $in,  '<', $ARGV[0] ) || die "can't open $ARGV[0] for reading!\n";
open( my $out, '>', $ARGV[1] ) || die "can't open $ARGV[1] for writing!\n";

my %header = ();    # header comment lines (passed through)
my %histo  = ();    # some statistics on variants

# words end up in here; the "" entry has no variants, so it prints nothing
my %dict = ( "" => { VARIANT => [], COMMENT => "" } );
my $last = "";      # the last word processed

get_dict( \%dict, \%header, $in );    # process the entries
close($in);

# print special comments (copyright, etc.)
foreach my $h ( sort keys %header ) { print {$out} $header{$h}; }

# print out each entry
# NOTE(review): delimiter reproduced exactly as it appears here; cmudict
# releases conventionally use two spaces between headword and pronunciation --
# confirm against the upstream file before relying on this.
my $DELIMITER = ' ';
foreach my $w ( sort keys %dict ) {
    my $var = 1;    # number variants from 2 (this is different from original)
    foreach my $p ( @{ $dict{$w}{VARIANT} } ) {
        my $head = ( $var == 1 ) ? $w : "$w($var)";
        print {$out} "$head$DELIMITER$p\n";
        $var++;
    }
}

# Buffered write errors only surface at close, so check it.
close($out) || die "can't finish writing $ARGV[1]: $!\n";
|
77
|
+
|
78
|
+
|
79
|
+
|
80
|
+
# read in a dictionary
|
81
|
+
# Read a cmudict-style dictionary from a file handle.
#
# Arguments:
#   $dict   - hash ref filled in as headword => { VARIANT => [ prons ] }
#   $header - hash ref; every comment (";;;") or blank line is appended,
#             newline-terminated, to the value keyed by the most recently
#             read entry word ("" before the first entry)
#   $target - input file handle
#
# Malformed entries and duplicate pronunciations are reported on STDERR.
# Variant counts are tallied in the file-level %histo table.
sub get_dict {
    my $dict   = shift;    # data structure with dictionary entries
    my $header = shift;
    my $target = shift;    # input file handle
    my $last   = "";       # the last entry word read

    while ( my $line = <$target> ) {
        $line =~ s/[\r\n]+$//;    # DOS-robust chomp

        # Comments (header or plain) and blank lines are all kept, attached
        # to the entry that precedes them.
        if ( $line =~ /^;;;/ || $line =~ /^\s*$/ ) {
            $header->{$last} .= "$line\n";
            next;
        }

        # extract the word,pron pair and prepare for processing
        my ( $word, $pron ) = $line =~ /(.+?)\s+(.+?)$/;
        if ( !defined $word ) { print STDERR "bad entry (no head word): $line\n"; next; }

        # Remember which token we just did.  Set here so that it also covers
        # new base forms, which leave the loop early below.
        $last = $word;

        # A trailing "(N)" marks a variant.  Anchored so that only the final
        # parenthesised number is split off the headword.
        my ( $root, $variant ) = ( $word, 0 );
        if ( $word =~ /^(.+)\((\d+)\)$/ ) {
            ( $root, $variant ) = ( $1, $2 );
        }

        # found a new baseform; set it up
        if ( !defined $dict->{$root} ) {
            $dict->{$root}{VARIANT}[0] = $pron;
            next;
        }

        # already-seen baseform; see if pron is a duplicate
        my $is_duplicate = 0;
        foreach my $known ( @{ $dict->{$root}{VARIANT} } ) {
            next if $known ne $pron;
            print STDERR "duplicate entry: $root ($variant) $pron!\n";
            $is_duplicate = 1;
            last;
        }
        next if $is_duplicate;

        # it's a new variant on an existing baseform, keep it
        push @{ $dict->{$root}{VARIANT} }, $pron;
        my $count = scalar @{ $dict->{$root}{VARIANT} };
        $histo{$count}++;    # track variant stats
        if ( $count >= 4 ) {    # numeric test; a string "ge" misorders 10 and up
            print STDERR "$root -- ", $count, "\n";
        }
    }
    return;
}
|