pronounce 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +19 -0
- data/data/beep/ACKNOWLEDGEMENTS +36 -0
- data/data/beep/ANNOUNCE-1.0 +27 -0
- data/data/beep/README +39 -0
- data/data/beep/addparan +22 -0
- data/data/beep/beep-1.0 +257070 -0
- data/data/beep/case.txt +166944 -0
- data/data/beep/lexicode.doc +47 -0
- data/data/beep/phoncode.doc +48 -0
- data/data/beep/phone45.tab +45 -0
- data/data/beep/sayTimit.doc +130 -0
- data/data/beep/sayTimit.pl +174 -0
- data/data/cmudict/00README_FIRST.txt +36 -0
- data/data/cmudict/README.developer +50 -0
- data/data/cmudict/README.old +79 -0
- data/data/cmudict/README.weide +67 -0
- data/data/cmudict/cmudict.0.6d +129511 -0
- data/data/cmudict/cmudict.0.7a +133369 -0
- data/data/cmudict/cmudict.0.7a.phones +39 -0
- data/data/cmudict/cmudict.0.7a.symbols +84 -0
- data/data/cmudict/scripts/CompileDictionary.sh +36 -0
- data/data/cmudict/scripts/README.txt +27 -0
- data/data/cmudict/scripts/make_baseform.pl +172 -0
- data/data/cmudict/scripts/sort_cmudict.pl +141 -0
- data/data/cmudict/scripts/test_cmudict.pl +166 -0
- data/data/cmudict/scripts/test_dict.pl +119 -0
- data/data/cmudict/sphinxdict/README.txt +19 -0
- data/data/cmudict/sphinxdict/SphinxPhones_40 +40 -0
- data/data/cmudict/sphinxdict/cmudict.0.7a_SPHINX_40 +133012 -0
- data/data/cmudict/sphinxdict/cmudict_SPHINX_40 +133012 -0
- data/lib/pronounce.rb +33 -0
- metadata +104 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
AA vowel
|
2
|
+
AE vowel
|
3
|
+
AH vowel
|
4
|
+
AO vowel
|
5
|
+
AW vowel
|
6
|
+
AY vowel
|
7
|
+
B stop
|
8
|
+
CH affricate
|
9
|
+
D stop
|
10
|
+
DH fricative
|
11
|
+
EH vowel
|
12
|
+
ER vowel
|
13
|
+
EY vowel
|
14
|
+
F fricative
|
15
|
+
G stop
|
16
|
+
HH aspirate
|
17
|
+
IH vowel
|
18
|
+
IY vowel
|
19
|
+
JH affricate
|
20
|
+
K stop
|
21
|
+
L liquid
|
22
|
+
M nasal
|
23
|
+
N nasal
|
24
|
+
NG nasal
|
25
|
+
OW vowel
|
26
|
+
OY vowel
|
27
|
+
P stop
|
28
|
+
R liquid
|
29
|
+
S fricative
|
30
|
+
SH fricative
|
31
|
+
T stop
|
32
|
+
TH fricative
|
33
|
+
UH vowel
|
34
|
+
UW vowel
|
35
|
+
V fricative
|
36
|
+
W semivowel
|
37
|
+
Y semivowel
|
38
|
+
Z fricative
|
39
|
+
ZH fricative
|
@@ -0,0 +1,84 @@
|
|
1
|
+
AA
|
2
|
+
AA0
|
3
|
+
AA1
|
4
|
+
AA2
|
5
|
+
AE
|
6
|
+
AE0
|
7
|
+
AE1
|
8
|
+
AE2
|
9
|
+
AH
|
10
|
+
AH0
|
11
|
+
AH1
|
12
|
+
AH2
|
13
|
+
AO
|
14
|
+
AO0
|
15
|
+
AO1
|
16
|
+
AO2
|
17
|
+
AW
|
18
|
+
AW0
|
19
|
+
AW1
|
20
|
+
AW2
|
21
|
+
AY
|
22
|
+
AY0
|
23
|
+
AY1
|
24
|
+
AY2
|
25
|
+
B
|
26
|
+
CH
|
27
|
+
D
|
28
|
+
DH
|
29
|
+
EH
|
30
|
+
EH0
|
31
|
+
EH1
|
32
|
+
EH2
|
33
|
+
ER
|
34
|
+
ER0
|
35
|
+
ER1
|
36
|
+
ER2
|
37
|
+
EY
|
38
|
+
EY0
|
39
|
+
EY1
|
40
|
+
EY2
|
41
|
+
F
|
42
|
+
G
|
43
|
+
HH
|
44
|
+
IH
|
45
|
+
IH0
|
46
|
+
IH1
|
47
|
+
IH2
|
48
|
+
IY
|
49
|
+
IY0
|
50
|
+
IY1
|
51
|
+
IY2
|
52
|
+
JH
|
53
|
+
K
|
54
|
+
L
|
55
|
+
M
|
56
|
+
N
|
57
|
+
NG
|
58
|
+
OW
|
59
|
+
OW0
|
60
|
+
OW1
|
61
|
+
OW2
|
62
|
+
OY
|
63
|
+
OY0
|
64
|
+
OY1
|
65
|
+
OY2
|
66
|
+
P
|
67
|
+
R
|
68
|
+
S
|
69
|
+
SH
|
70
|
+
T
|
71
|
+
TH
|
72
|
+
UH
|
73
|
+
UH0
|
74
|
+
UH1
|
75
|
+
UH2
|
76
|
+
UW
|
77
|
+
UW0
|
78
|
+
UW1
|
79
|
+
UW2
|
80
|
+
V
|
81
|
+
W
|
82
|
+
Y
|
83
|
+
Z
|
84
|
+
ZH
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#!sh
# [20080422] (air) Compile cmudict into SPHINX_40 form
# [20100118] (air)
#
# Expects to be started from the directory that holds the dictionary, the
# scripts/ directory and the sphinxdict/ directory (all paths are relative).
# Steps:
#   1. make_baseform.pl writes a stress-free copy of the dictionary to a
#      scratch file inside $DIR;
#   2. test_dict.pl checks that scratch file against SphinxPhones_40;
#   3. on success the scratch file is installed under the generic and the
#      versioned *_SPHINX_40 names, on failure both of those are removed.
# Exit status: 0 when the dictionary was compiled, 1 when the check failed.

DIR=sphinxdict
DICT_BASE=cmudict
DICTIONARY=${DICT_BASE}.0.7a

# scratch output file; $$ (this shell's PID) keeps the name unique per run
TMP_DICT="$DIR/$$_SPHINX_40"
status=0

echo "Compiling $DICTIONARY..."

# make_baseform.pl removes stress marks and eliminates resulting duplicates
perl ./scripts/make_baseform.pl "$DICTIONARY" "$TMP_DICT"

echo ""
echo "Testing sphinx cmudict... "
if ./scripts/test_dict.pl -p "$DIR/SphinxPhones_40" "$TMP_DICT"
then
    cp -p "$TMP_DICT" "$DIR/${DICT_BASE}_SPHINX_40"
    cp -p "$TMP_DICT" "$DIR/${DICTIONARY}_SPHINX_40"
    echo "Dictionary successfully compiled"
else
    # do not leave a previously compiled dictionary lying around next to a
    # source dictionary that no longer passes the check
    rm -f "$DIR/${DICT_BASE}_SPHINX_40" "$DIR/${DICTIONARY}_SPHINX_40"
    echo ""
    echo "$0 encountered errors"
    echo "dictionary compilation not completed"
    status=1
fi

# -f: the scratch file may not exist if make_baseform.pl could not write it
rm -f "$TMP_DICT"

echo "Done"

# report failure to the caller instead of always exiting 0
exit $status
#
|
@@ -0,0 +1,27 @@
|
|
1
|
+
Maintenance scripts for cmudict
|
2
|
+
-------------------------------
|
3
|
+
[20100118] (air)
|
4
|
+
|
5
|
+
Use these scripts for checking and compiling the dictionary.
|
6
|
+
|
7
|
+
The process is the following:
|
8
|
+
|
9
|
+
1) make changes to the dictionary
|
10
|
+
- it's assumed that the changes are manual
|
11
|
+
- check your work by doing a svn diff with the previous version
|
12
|
+
|
13
|
+
2) run scripts/test_cmudict.pl
|
14
|
+
EG: ./scripts/test_cmudict.pl -p cmudict.0.7a.symbols cmudict.0.7a
|
15
|
+
- this checks for collation order, legal entry format and phonetic symbols
|
16
|
+
- if necessary fix problems then repeat this step until no errors
|
17
|
+
|
18
|
+
3) run CompileDictionary*
|
19
|
+
[converts cmudict to the Sphinx format using make_baseform.pl]
|
20
|
+
[checks for consistency using test_dict.pl]
|
21
|
+
- produces two *_SPHINX_40 files: one generic, the other major-versioned
|
22
|
+
|
23
|
+
4) use svn to update cmudict; be sure to add a proper logging message
|
24
|
+
|
25
|
+
That's it!
|
26
|
+
|
27
|
+
|
@@ -0,0 +1,172 @@
|
|
1
|
+
#!perl -w
|
2
|
+
|
3
|
+
#
|
4
|
+
# ====================================================================
|
5
|
+
# Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
|
6
|
+
# Rudnicky. All rights reserved.
|
7
|
+
#
|
8
|
+
# Redistribution and use in source and binary forms, with or without
|
9
|
+
# modification, are permitted provided that the following conditions
|
10
|
+
# are met:
|
11
|
+
#
|
12
|
+
# 1. Redistributions of source code must retain the above copyright
|
13
|
+
# notice, this list of conditions and the following disclaimer.
|
14
|
+
#
|
15
|
+
# 2. Redistributions in binary form must reproduce the above copyright
|
16
|
+
# notice, this list of conditions and the following disclaimer in
|
17
|
+
# the documentation and/or other materials provided with the
|
18
|
+
# distribution.
|
19
|
+
#
|
20
|
+
# This work was supported in part by funding from the Defense Advanced
|
21
|
+
# Research Projects Agency, the Office of Naval Research and the National
|
22
|
+
# Science Foundation of the United States of America, and by member
|
23
|
+
# companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
|
24
|
+
# the contributions of many volunteers to the expansion and improvement of
|
25
|
+
# this dictionary.
|
26
|
+
#
|
27
|
+
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
28
|
+
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
29
|
+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
30
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
31
|
+
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
32
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
33
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
34
|
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
35
|
+
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
36
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
37
|
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
38
|
+
#
|
39
|
+
# ====================================================================
|
40
|
+
#
|
41
|
+
|
42
|
+
# [20050309] (air) Created.
|
43
|
+
# strip out stress marks from a cmudict, producing a "SphinxPhones_40" dictionary
|
44
|
+
# [20080420] (air) Changed to pass comments.
|
45
|
+
# Fixed output collation sequence; DOS eol's
|
46
|
+
# [20090309] (air) fixed duplicate pron and collation bugs
|
47
|
+
# [20090331] (air) restored standard collation order (since other stuff depends on it)
|
48
|
+
# [20090629] (air) do not put comments into SPHINX_40 version; not all software deals with them
|
49
|
+
# [20100118] (air) added $VERBOSE; this should really be a cmdline flag...
|
50
|
+
#
|
51
|
+
|
52
|
+
|
53
|
+
# Main program: read a cmudict-format dictionary, strip the stress marks,
# drop pronunciations that become duplicates, renumber the remaining variants
# and write the result in sorted order.
#
# NOTE: this file does not run under "use strict"; get_dict relies on the
# package globals $VERBOSE and %histo and on the file-level counters below.

$VERBOSE = 0;   # non-zero: get_dict reports every duplicate entry on STDERR

# running totals, updated by get_dict
my $basecount = 0;   # entries read (baseforms and variants)
my $dupl = 0;        # pronunciations dropped as duplicates
my $base = 0;        # distinct head words
my $varia = 0;       # additional pronunciations kept for a head word

# the argument count is a number, so compare numerically
if ( scalar(@ARGV) != 2 ) { die "usage: make_baseform <input> <output>\n"; }

# three-argument open: characters in the file names can never be mistaken
# for an open mode
open(IN, '<', $ARGV[0]) || die "can't open $ARGV[0] for reading!\n";
open(OUT, '>', $ARGV[1]) || die "can't open $ARGV[1] for writing!\n";

@header = ();  # header comment lines (collected, currently not emitted)
%dict = ();    # words end up in here: head word => [ pronunciations ]
%histo = ();   # some statistics on variants: variant count => occurrences

get_dict(\%dict,\@header,\*IN);  # process the entries

# what have we got?
print STDERR "$basecount forms processed\n";
print STDERR "$base baseforms, $varia variants and $dupl duplicates found.\n";
print STDERR "variant distribution:\n";
# keys are counts: sort numerically so that 10 does not come before 2
foreach my $count ( sort { $a <=> $b } keys %histo ) {
    print STDERR "$count\t$histo{$count}\n";
}

# print special comments (copyright, etc.)
# removed since it messes some things up...
# foreach $h (@header) { print OUT "$h\n"; }

# Build the output entries. The first pronunciation keeps the bare head
# word; later ones are renumbered consecutively as WORD(2), WORD(3), ...
my %dict_out = ();
foreach my $w (sort keys %dict) {
    my $var = 1;  # variants will number starting with 2
    foreach my $p ( @{$dict{$w}} ) {
        my $head = ( $var == 1 ) ? $w : "$w($var)";
        $dict_out{$head} = $p;
        $var++;
    }
}

# print out each entry, in the standard (string) collation order
foreach my $entry ( sort keys %dict_out ) {
    print OUT "$entry\t$dict_out{$entry}\n";
}

close(IN);
# buffered write errors (e.g. a full disk) only show up when closing
close(OUT) || die "can't finish writing $ARGV[1]: $!\n";
|
104
|
+
|
105
|
+
#
|
106
|
+
#
|
107
|
+
# read in a dictionary
#
# get_dict($dict, $header, $target)
#   $dict   - hash ref, filled in as: head word => array ref of distinct
#             stress-stripped pronunciations, in order of first appearance
#   $header - array ref; every ";;; #" comment line is appended to it
#   $target - input file handle to read the entries from
#
# Side effects: updates the file-level counters $basecount, $base, $varia and
# $dupl and the %histo variant statistics, and writes diagnostics to STDERR.
# The return value is not meaningful.
sub get_dict {
    my $dict = shift;    # data structure with dictionary entries
    my $header = shift;
    my $target = shift;  # input file handle

    while (<$target>) {
        s/[\r\n]+$//g;  # DOS-robust chomp;

        # process comments; blank lines ignored
        # presume that ";;; #" will be collected and emitted at the top
        if ($_ =~ /^;;; \#/) { push @$header, $_; next; }  # save header info
        elsif ( $_ =~ /^;;;/ ) { next; }                   # ignore plain comments
        elsif ( $_ =~ /^\s*$/ ) { next; }                  # ignore blank lines

        # extract the word,pron pair and prepare for processing
        my ($word,$pron) = /(.+?)\s+(.+?)$/;
        if (! defined $word) { print STDERR "bad entry (no head word): $_\n"; next; }

        $basecount++;

        # A head word ending in ")" is a variant such as WORD(2). Only split
        # it when the "(tag)" part is really there; otherwise keep the whole
        # word as the root instead of ending up with an undefined root.
        my ($root,$variant);
        if ( $word =~ /\)$/ && $word =~ m/(.+?)\((.+?)\)/ ) {
            ($root,$variant) = ($1,$2);
        } else {
            $root = $word;
            $variant = 0;
        }
        $pron = strip_stress($pron);

        # found a new baseform; set it up
        if ( ! defined $dict->{$root} ) {
            $dict->{$root}[0] = $pron;
            $base++;
            next;
        }

        # old baseform; see if, after removed stress, pron is a duplicate
        if ( grep { $_ eq $pron } @{$dict->{$root}} ) {
            if ($VERBOSE) {print STDERR "duplicate entry: $root ($variant) $pron\n";}
            $dupl++;
            next;
        }

        # it's a new variant on an existing baseform, keep it
        push @{$dict->{$root}}, $pron;
        $varia++;
        my $count = scalar @{$dict->{$root}};
        $histo{$count}++;  # track variant stats
        # point out head words with unusually many pronunciations
        if ( $count > 4 ) { print STDERR "$root -- ",$count,"\n"; }
    }
}
|
162
|
+
|
163
|
+
|
164
|
+
# strip stress marks from phonetic symbols
#
# strip_stress($pron)
#   $pron - one pronunciation: phone symbols separated by whitespace
# Returns the pronunciation with the trailing digits removed from every
# symbol and the symbols re-joined by single spaces, e.g. "AH0 B AW1 T"
# becomes "AH B AW T". Leading and trailing whitespace is dropped.
sub strip_stress {
    # lexical list: the original filled a package-global @pron on every call
    my @pron = split " ", $_[0];
    # s/// is a no-op on symbols without a trailing digit, so no pre-check
    s/\d+$// foreach @pron;
    return join(" ", @pron);
}
|
171
|
+
|
172
|
+
#
|
@@ -0,0 +1,141 @@
|
|
1
|
+
#!perl -w
|
2
|
+
|
3
|
+
#
|
4
|
+
# ====================================================================
|
5
|
+
# Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
|
6
|
+
# Rudnicky. All rights reserved.
|
7
|
+
#
|
8
|
+
# Redistribution and use in source and binary forms, with or without
|
9
|
+
# modification, are permitted provided that the following conditions
|
10
|
+
# are met:
|
11
|
+
#
|
12
|
+
# 1. Redistributions of source code must retain the above copyright
|
13
|
+
# notice, this list of conditions and the following disclaimer.
|
14
|
+
#
|
15
|
+
# 2. Redistributions in binary form must reproduce the above copyright
|
16
|
+
# notice, this list of conditions and the following disclaimer in
|
17
|
+
# the documentation and/or other materials provided with the
|
18
|
+
# distribution.
|
19
|
+
#
|
20
|
+
# This work was supported in part by funding from the Defense Advanced
|
21
|
+
# Research Projects Agency, the Office of Naval Research and the National
|
22
|
+
# Science Foundation of the United States of America, and by member
|
23
|
+
# companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
|
24
|
+
# the contributions of many volunteers to the expansion and improvement of
|
25
|
+
# this dictionary.
|
26
|
+
#
|
27
|
+
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
28
|
+
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
29
|
+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
30
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
31
|
+
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
32
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
33
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
34
|
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
35
|
+
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
36
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
37
|
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
38
|
+
#
|
39
|
+
# ====================================================================
|
40
|
+
#
|
41
|
+
|
42
|
+
# Sort cmudict according to head entry collating sequence
|
43
|
+
|
44
|
+
# [20090331] (air) Created.
|
45
|
+
|
46
|
+
use strict;
|
47
|
+
|
48
|
+
# Main program: read a cmudict-format dictionary, then write it back with the
# collected comment lines first and the entries sorted by head word.

# the argument count is a number, so compare numerically
if ( scalar(@ARGV) != 2 ) { die "usage: sort_cmudict <input> <output>\n"; }

# three-argument open: characters in the file names can never be mistaken
# for an open mode
open(IN, '<', $ARGV[0]) || die "can't open $ARGV[0] for reading!\n";
open(OUT, '>', $ARGV[1]) || die "can't open $ARGV[1] for writing!\n";

# These file-level lexicals are also used directly by get_dict.
my %header = ();  # comment and blank lines, keyed by the value of $last
my %histo = ();   # some statistics on variants

# words end up in here; the "" entry has no variants, so it prints nothing
my %dict = ("" => {VARIANT => [], COMMENT => ""} );
my $last = "";    # the last word processed

get_dict(\%dict,\%header,\*IN);  # process the entries

# print special comments (copyright, etc.)
foreach my $h (sort keys %header) { print OUT "$header{$h}"; }

# print out each entry
# NOTE(review): confirm this separator literal against the released script;
# the whitespace inside the quotes may have been collapsed in this copy.
my $DELIMITER = ' ';
foreach my $w (sort keys %dict) {
    my $var = 1;  # number variants from 2 (this is different from original)
    foreach my $p ( @{$dict{$w}{VARIANT}} ) {
        # first pronunciation keeps the bare head word, later ones get (n)
        my $head = ( $var == 1 ) ? $w : "$w($var)";
        print OUT "$head$DELIMITER$p\n";
        $var++;
    }
}

close(IN);
# buffered write errors (e.g. a full disk) only show up when closing
close(OUT) || die "can't finish writing $ARGV[1]: $!\n";
|
77
|
+
|
78
|
+
|
79
|
+
|
80
|
+
# read in a dictionary
#
# get_dict($dict, $header, $target)
#   $dict   - hash ref, filled in as:
#             head word => { VARIANT => [ distinct pronunciations ] }
#   $header - hash ref; every comment or blank line is appended (with a
#             newline) to the entry keyed by the current value of $last
#   $target - input file handle to read the entries from
#
# Uses the file-level lexicals $last and %histo, and writes diagnostics to
# STDERR. The return value is not meaningful.
sub get_dict {
    my $dict = shift;    # data structure with dictionary entries
    my $header = shift;
    my $target = shift;  # input file handle
    my ($word,$pron,$root,$variant);

    while (<$target>) {
        s/[\r\n]+$//g;  # DOS-robust chomp;

        # Comment lines (";;;", including ";;; #" header lines) and blank
        # lines are not entries: keep their text so it can be emitted at the
        # top of the output.
        if ( $_ =~ /^;;;/ || $_ =~ /^\s*$/ ) {
            $header->{$last} .= "$_\n";
            next;
        }

        # extract the word,pron pair and prepare for processing
        ($word,$pron) = /(.+?)\s+(.+?)$/;
        if (! defined $word) { print STDERR "bad entry (no head word): $_\n"; next; }

        # WORD(n) is variant n of WORD; n may have more than one digit
        if ( $word =~ /^(.+?)\((\d+)\)$/ ) {
            ($root,$variant) = ($1,$2);
        } else {
            $root = $word;
            $variant = 0;
        }

        # found a new baseform; set it up
        # NOTE(review): this path skips the "$last = $word" below, so $last
        # only advances on entries for an already-seen head word; confirm
        # whether that is intended.
        if ( ! defined $dict->{$root} ) {
            $dict->{$root}{VARIANT}[0] = $pron;
            next;
        }

        if ( grep { $_ eq $pron } @{$dict->{$root}{VARIANT}} ) {
            # already-seen baseform and the pron is a duplicate; drop it
            print STDERR "duplicate entry: $root ($variant) $pron!\n";
        } else {
            # it's a new variant on an existing baseform, keep it
            push @{$dict->{$root}{VARIANT}}, $pron;
            my $count = scalar @{$dict->{$root}{VARIANT}};
            $histo{$count}++;  # track variant stats
            # numeric test: the string operator "ge" is false for 10 ge 4
            if ( $count >= 4 ) {
                print STDERR "$root -- ",$count,"\n";
            }
        }
        $last = $word;  # remember which token we just did
    }
}
|