pronounce 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,39 @@
1
+ AA vowel
2
+ AE vowel
3
+ AH vowel
4
+ AO vowel
5
+ AW vowel
6
+ AY vowel
7
+ B stop
8
+ CH affricate
9
+ D stop
10
+ DH fricative
11
+ EH vowel
12
+ ER vowel
13
+ EY vowel
14
+ F fricative
15
+ G stop
16
+ HH aspirate
17
+ IH vowel
18
+ IY vowel
19
+ JH affricate
20
+ K stop
21
+ L liquid
22
+ M nasal
23
+ N nasal
24
+ NG nasal
25
+ OW vowel
26
+ OY vowel
27
+ P stop
28
+ R liquid
29
+ S fricative
30
+ SH fricative
31
+ T stop
32
+ TH fricative
33
+ UH vowel
34
+ UW vowel
35
+ V fricative
36
+ W semivowel
37
+ Y semivowel
38
+ Z fricative
39
+ ZH fricative
@@ -0,0 +1,84 @@
1
+ AA
2
+ AA0
3
+ AA1
4
+ AA2
5
+ AE
6
+ AE0
7
+ AE1
8
+ AE2
9
+ AH
10
+ AH0
11
+ AH1
12
+ AH2
13
+ AO
14
+ AO0
15
+ AO1
16
+ AO2
17
+ AW
18
+ AW0
19
+ AW1
20
+ AW2
21
+ AY
22
+ AY0
23
+ AY1
24
+ AY2
25
+ B
26
+ CH
27
+ D
28
+ DH
29
+ EH
30
+ EH0
31
+ EH1
32
+ EH2
33
+ ER
34
+ ER0
35
+ ER1
36
+ ER2
37
+ EY
38
+ EY0
39
+ EY1
40
+ EY2
41
+ F
42
+ G
43
+ HH
44
+ IH
45
+ IH0
46
+ IH1
47
+ IH2
48
+ IY
49
+ IY0
50
+ IY1
51
+ IY2
52
+ JH
53
+ K
54
+ L
55
+ M
56
+ N
57
+ NG
58
+ OW
59
+ OW0
60
+ OW1
61
+ OW2
62
+ OY
63
+ OY0
64
+ OY1
65
+ OY2
66
+ P
67
+ R
68
+ S
69
+ SH
70
+ T
71
+ TH
72
+ UH
73
+ UH0
74
+ UH1
75
+ UH2
76
+ UW
77
+ UW0
78
+ UW1
79
+ UW2
80
+ V
81
+ W
82
+ Y
83
+ Z
84
+ ZH
@@ -0,0 +1,36 @@
1
#!/bin/sh
# [20080422] (air) Compile cmudict into SPHINX_40 form
# [20100118] (air)
#
# Converts the versioned cmudict into the stress-free Sphinx format, checks
# the result against the Sphinx phone set and, only when that check passes,
# installs it under both a generic and a versioned name.
# All paths are relative: run this from the directory that holds the
# dictionary, scripts/ and sphinxdict/.

DIR=sphinxdict
DICT_BASE=cmudict
DICTIONARY=${DICT_BASE}.0.7a

# Scratch copy of the compiled dictionary; $$ (this shell's PID) keeps the
# name from colliding with a previous or concurrent run.
TMP_DICT="$DIR/$$_SPHINX_40"

echo "Compiling $DICTIONARY..."

# make_baseform.pl removes stress marks and eliminates resulting duplicates
perl ./scripts/make_baseform.pl "$DICTIONARY" "$TMP_DICT"

echo ""
echo "Testing sphinx cmudict... "
if ./scripts/test_dict.pl -p "$DIR/SphinxPhones_40" "$TMP_DICT"
then
    cp -p "$TMP_DICT" "$DIR/${DICT_BASE}_SPHINX_40"
    cp -p "$TMP_DICT" "$DIR/${DICTIONARY}_SPHINX_40"
    echo "Dictionary successfully compiled"
else
    # do not leave a dictionary from an earlier run next to a failed build
    rm -f "$DIR/${DICT_BASE}_SPHINX_40" "$DIR/${DICTIONARY}_SPHINX_40"
    echo ""
    echo "$0 encountered errors"
    echo "dictionary compilation not completed"
fi

# -f: the scratch file may be missing if make_baseform.pl could not write it
rm -f "$TMP_DICT"

echo "Done"

#
@@ -0,0 +1,27 @@
1
+ Maintenance scripts for cmudict
2
+ -------------------------------
3
+ [20100118] (air)
4
+
5
+ Use these scripts for checking and compiling the dictionary.
6
+
7
+ The process is the following:
8
+
9
+ 1) make changes to the dictionary
10
+ - it's assumed that the changes are manual
11
+ - check your work by running "svn diff" against the previous version
12
+
13
+ 2) run scripts/test_cmudict.pl
14
+ EG: ./scripts/test_cmudict.pl -p cmudict.0.7a.symbols cmudict.0.7a
15
+ - this checks for collation order, legal entry format and phonetic symbols
16
+ - if necessary, fix the problems and repeat this step until there are no errors
17
+
18
+ 3) run CompileDictionary*
19
+ [converts cmudict to the Sphinx format using make_baseform.pl]
20
+ [checks for consistency using test_dict.pl]
21
+ - produces two *_SPHINX_40 files: one generic, the other carrying the dictionary version
22
+
23
+ 4) use svn to update cmudict; be sure to add a proper logging message
24
+
25
+ That's it!
26
+
27
+
@@ -0,0 +1,172 @@
1
+ #!perl -w
2
+
3
+ #
4
+ # ====================================================================
5
+ # Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
6
+ # Rudnicky. All rights reserved.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without
9
+ # modification, are permitted provided that the following conditions
10
+ # are met:
11
+ #
12
+ # 1. Redistributions of source code must retain the above copyright
13
+ # notice, this list of conditions and the following disclaimer.
14
+ #
15
+ # 2. Redistributions in binary form must reproduce the above copyright
16
+ # notice, this list of conditions and the following disclaimer in
17
+ # the documentation and/or other materials provided with the
18
+ # distribution.
19
+ #
20
+ # This work was supported in part by funding from the Defense Advanced
21
+ # Research Projects Agency, the Office of Naval Research and the National
22
+ # Science Foundation of the United States of America, and by member
23
+ # companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
24
+ # the contributions of many volunteers to the expansion and improvement of
25
+ # this dictionary.
26
+ #
27
+ # THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
28
+ # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
29
+ # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
31
+ # NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
+ #
39
+ # ====================================================================
40
+ #
41
+
42
+ # [20050309] (air) Created.
43
+ # strip out stress marks from a cmudict, producing a "SphinxPhones_40" dictionary
44
+ # [20080420] (air) Changed to pass comments.
45
+ # Fixed output collation sequence; DOS eol's
46
+ # [20090309] (air) fixed duplicate pron and collation bugs
47
+ # [20090331] (air) restored standard collation order (since other stuff depends on it)
48
+ # [20090629] (air) do not put comments into SPHINX_40 version; not all software deals with them
49
+ # [20100118] (air) added $VERBOSE; this should really be a cmdline flag...
50
+ #
51
+
52
+
53
# Set to a true value to have get_dict() report each duplicate on STDERR.
my $VERBOSE = 0;

# Running totals, updated by get_dict() while it reads the input.
my $basecount = 0;    # entries read
my $dupl      = 0;    # pronunciations dropped as duplicates
my $base      = 0;    # distinct head words
my $varia     = 0;    # additional pronunciations kept

if ( @ARGV != 2 ) { die "usage: make_baseform <input> <output>\n"; }
my ( $in_path, $out_path ) = @ARGV;

# Three-argument open with lexical handles: the file names come from the
# command line and must never be interpreted as an open mode or a pipe.
open( my $in,  '<', $in_path )  or die "can't open $in_path for reading! ($!)\n";
open( my $out, '>', $out_path ) or die "can't open $out_path for writing! ($!)\n";

my @header = ();    # header comment lines (collected, currently not emitted)
my %dict   = ();    # words end up in here
my %histo  = ();    # some statistics on variants

get_dict( \%dict, \@header, $in );    # process the entries

# what have we got?
print STDERR "$basecount forms processed\n";
print STDERR "$base baseforms, $varia variants and $dupl duplicates found.\n";
print STDERR "variant distribution:\n";
foreach my $count ( sort { $a <=> $b } keys %histo ) {    # keys are counts: sort numerically
    print STDERR "$count\t$histo{$count}\n";
}

# print special comments (copyright, etc.)
# removed since it messes some things up...
# foreach $h (@header) { print OUT "$h\n"; }

# Build the final head entries first, then sort them as a whole, so that
# the numbered variants ("WORD(2)") collate together with plain words.
my %dict_out = ();
foreach my $w ( keys %dict ) {
    my $var = 1;    # variants will number starting with 2
    foreach my $p ( @{ $dict{$w} } ) {
        my $entry = ( $var == 1 ) ? $w : "$w($var)";
        $dict_out{$entry} = $p;
        $var++;
    }
}

foreach my $entry ( sort keys %dict_out ) {
    print {$out} "$entry\t$dict_out{$entry}\n";
}

close($in);
# buffered write errors only surface when the output handle is closed
close($out) or die "can't finish writing $out_path: $!\n";
104
+
105
+ #
106
+ #
107
+ # read in a dictionary
108
# Collect the stress-free pronunciations of every head word in a cmudict file.
#
# Arguments:
#   $dict   - hash ref, filled in place: head word => array ref holding its
#             distinct pronunciations (stress digits removed), in input order
#   $header - array ref; every ";;; #" comment line is appended to it
#   $target - input file handle
#
# Side effects: updates the file-level counters $basecount, $base, $varia and
# $dupl and the %histo table; reports on STDERR (duplicates only if $VERBOSE).
sub get_dict {
    my $dict   = shift;
    my $header = shift;
    my $target = shift;

    while ( my $line = <$target> ) {
        $line =~ s/[\r\n]+$//;    # DOS-robust chomp

        # ";;; #" lines are kept for the caller; other comments and blank
        # lines are skipped
        if ( $line =~ /^;;; \#/ ) { push @$header, $line; next; }
        next if $line =~ /^;;;/;
        next if $line =~ /^\s*$/;

        # extract the word,pron pair and prepare for processing
        my ( $word, $pron ) = $line =~ /(.+?)\s+(.+?)$/;
        if ( !defined $word ) { print STDERR "bad entry (no head word): $line\n"; next; }

        $basecount++;

        # "ROOT(n)" marks a variant; anything that does not have that shape
        # is treated as a plain head word instead of yielding an undefined root
        my ( $root, $variant ) = ( $word, 0 );
        if ( $word =~ /\)$/ && $word =~ m/(.+?)\((.+?)\)/ ) {
            ( $root, $variant ) = ( $1, $2 );
        }
        $pron = strip_stress($pron);

        # found a new baseform; set it up
        if ( !defined $dict->{$root} ) {
            $dict->{$root}[0] = $pron;
            $base++;
            next;
        }

        # old baseform; see if, after removed stress, pron is a duplicate
        if ( grep { $_ eq $pron } @{ $dict->{$root} } ) {
            if ($VERBOSE) { print STDERR "duplicate entry: $root ($variant) $pron\n"; }
            $dupl++;
            next;
        }

        # a pronunciation that is empty once the stress digits are gone is
        # not recorded as a variant
        next if $pron eq "";

        # it's a new variant on an existing baseform, keep it
        push @{ $dict->{$root} }, $pron;
        $varia++;
        my $count = scalar @{ $dict->{$root} };
        $histo{$count}++;    # track variant stats
        if ( $count > 4 ) { print STDERR "$root -- ", $count, "\n"; }
    }
}
162
+
163
+
164
+ # strip stress marks from phonetic symbols
165
# Takes one pronunciation (phone symbols separated by whitespace) and returns
# it with the trailing digits removed from every symbol; the symbols are
# re-joined with single spaces.
sub strip_stress {
    my @phones = split " ", $_[0];    # lexical: no package-level @pron is touched
    s/\d+$// for @phones;
    return join( " ", @phones );
}
171
+
172
+ #
@@ -0,0 +1,141 @@
1
+ #!perl -w
2
+
3
+ #
4
+ # ====================================================================
5
+ # Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
6
+ # Rudnicky. All rights reserved.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without
9
+ # modification, are permitted provided that the following conditions
10
+ # are met:
11
+ #
12
+ # 1. Redistributions of source code must retain the above copyright
13
+ # notice, this list of conditions and the following disclaimer.
14
+ #
15
+ # 2. Redistributions in binary form must reproduce the above copyright
16
+ # notice, this list of conditions and the following disclaimer in
17
+ # the documentation and/or other materials provided with the
18
+ # distribution.
19
+ #
20
+ # This work was supported in part by funding from the Defense Advanced
21
+ # Research Projects Agency, the Office of Naval Research and the National
22
+ # Science Foundation of the United States of America, and by member
23
+ # companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
24
+ # the contributions of many volunteers to the expansion and improvement of
25
+ # this dictionary.
26
+ #
27
+ # THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
28
+ # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
29
+ # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
31
+ # NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
+ #
39
+ # ====================================================================
40
+ #
41
+
42
+ # Sort cmudict according to head entry collating sequence
43
+
44
+ # [20090331] (air) Created.
45
+
46
+ use strict;
47
+
48
if ( @ARGV != 2 ) { die "usage: sort_cmudict <input> <output>\n"; }
my ( $in_path, $out_path ) = @ARGV;

# Three-argument open with lexical handles: the file names come from the
# command line and must never be interpreted as an open mode or a pipe.
open( my $in,  '<', $in_path )  or die "can't open $in_path for reading! ($!)\n";
open( my $out, '>', $out_path ) or die "can't open $out_path for writing! ($!)\n";

my %header = ();    # comment and blank lines, keyed by the value of $last when read
my %histo  = ();    # some statistics on variants

# words end up in here; the "" placeholder has no variants, so it prints nothing
my %dict = ( "" => { VARIANT => [], COMMENT => "" } );
my $last = "";      # the last word processed

get_dict( \%dict, \%header, $in );    # process the entries
close($in);

# print special comments (copyright, etc.)
foreach my $h ( sort keys %header ) { print {$out} $header{$h}; }

# print out each entry
my $DELIMITER = ' ';
foreach my $w ( sort keys %dict ) {
    my $var = 1;    # number variants from 2 (this is different from original)
    foreach my $p ( @{ $dict{$w}{VARIANT} } ) {
        my $head = ( $var == 1 ) ? $w : "$w($var)";
        print {$out} "$head$DELIMITER$p\n";
        $var++;
    }
}

# buffered write errors only surface when the output handle is closed
close($out) or die "can't finish writing $out_path: $!\n";
77
+
78
+
79
+
80
+ # read in a dictionary
81
# Load a cmudict file, grouping pronunciations under their head word.
#
# Arguments:
#   $dict   - hash ref, filled in place:
#             head word => { VARIANT => [ pronunciations, in input order ] }
#   $header - hash ref; every comment or blank line is appended (with "\n")
#             to the entry keyed by the most recently read head word
#             (whatever $last holds before the first word is read)
#   $target - input file handle
#
# Side effects: updates the file-level $last and %histo; reports bad entries,
# duplicates and words with four or more pronunciations on STDERR.
sub get_dict {
    my $dict   = shift;
    my $header = shift;
    my $target = shift;

    while ( my $line = <$target> ) {
        $line =~ s/[\r\n]+$//;    # DOS-robust chomp

        # Comments (";;;" of either kind) and blank lines are not dropped:
        # they are stored under the word that preceded them.
        if ( $line =~ /^;;;/ || $line =~ /^\s*$/ ) {
            $header->{$last} .= "$line\n";
            next;
        }

        # extract the word,pron pair and prepare for processing
        my ( $word, $pron ) = $line =~ /(.+?)\s+(.+?)$/;
        if ( !defined $word ) { print STDERR "bad entry (no head word): $line\n"; next; }

        # Remember which token we just did. This is recorded here, before any
        # "next" below, so that new base forms are remembered as well.
        $last = $word;

        # "ROOT(n)" marks a variant (n may have several digits); anything
        # else is a plain head word
        my ( $root, $variant ) = ( $word, 0 );
        if ( $word =~ /\(\d+\)$/ && $word =~ m/(.+?)\((.+?)\)/ ) {
            ( $root, $variant ) = ( $1, $2 );
        }

        # found a new baseform; set it up
        if ( !defined $dict->{$root} ) {
            $dict->{$root}{VARIANT}[0] = $pron;
            next;
        }

        # already-seen baseform; see if pron is a duplicate
        my $variants = $dict->{$root}{VARIANT};
        if ( grep { $_ eq $pron } @$variants ) {
            print STDERR "duplicate entry: $root ($variant) $pron!\n";
            next;
        }

        # it's a new variant on an existing baseform, keep it
        push @$variants, $pron;
        $histo{ scalar @$variants }++;    # track variant stats
        if ( @$variants >= 4 ) {          # numeric test: "ge" would miss 10 and up
            print STDERR "$root -- ", scalar @$variants, "\n";
        }
    }
}