pronounce 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ AA vowel
2
+ AE vowel
3
+ AH vowel
4
+ AO vowel
5
+ AW vowel
6
+ AY vowel
7
+ B stop
8
+ CH affricate
9
+ D stop
10
+ DH fricative
11
+ EH vowel
12
+ ER vowel
13
+ EY vowel
14
+ F fricative
15
+ G stop
16
+ HH aspirate
17
+ IH vowel
18
+ IY vowel
19
+ JH affricate
20
+ K stop
21
+ L liquid
22
+ M nasal
23
+ N nasal
24
+ NG nasal
25
+ OW vowel
26
+ OY vowel
27
+ P stop
28
+ R liquid
29
+ S fricative
30
+ SH fricative
31
+ T stop
32
+ TH fricative
33
+ UH vowel
34
+ UW vowel
35
+ V fricative
36
+ W semivowel
37
+ Y semivowel
38
+ Z fricative
39
+ ZH fricative
@@ -0,0 +1,84 @@
1
+ AA
2
+ AA0
3
+ AA1
4
+ AA2
5
+ AE
6
+ AE0
7
+ AE1
8
+ AE2
9
+ AH
10
+ AH0
11
+ AH1
12
+ AH2
13
+ AO
14
+ AO0
15
+ AO1
16
+ AO2
17
+ AW
18
+ AW0
19
+ AW1
20
+ AW2
21
+ AY
22
+ AY0
23
+ AY1
24
+ AY2
25
+ B
26
+ CH
27
+ D
28
+ DH
29
+ EH
30
+ EH0
31
+ EH1
32
+ EH2
33
+ ER
34
+ ER0
35
+ ER1
36
+ ER2
37
+ EY
38
+ EY0
39
+ EY1
40
+ EY2
41
+ F
42
+ G
43
+ HH
44
+ IH
45
+ IH0
46
+ IH1
47
+ IH2
48
+ IY
49
+ IY0
50
+ IY1
51
+ IY2
52
+ JH
53
+ K
54
+ L
55
+ M
56
+ N
57
+ NG
58
+ OW
59
+ OW0
60
+ OW1
61
+ OW2
62
+ OY
63
+ OY0
64
+ OY1
65
+ OY2
66
+ P
67
+ R
68
+ S
69
+ SH
70
+ T
71
+ TH
72
+ UH
73
+ UH0
74
+ UH1
75
+ UH2
76
+ UW
77
+ UW0
78
+ UW1
79
+ UW2
80
+ V
81
+ W
82
+ Y
83
+ Z
84
+ ZH
@@ -0,0 +1,36 @@
1
+ #!sh
2
+ # [20080422] (air) Compile cmudict into SPHINX_40 form
3
+ # [20100118] (air)
4
+ # Strip stress marks from $DICTIONARY, test the result, and publish it only if the test passes.
5
+ DIR=sphinxdict
6
+ DICT_BASE=cmudict
7
+ DICTIONARY=${DICT_BASE}.0.7a
8
+
9
+ # scratch output file, named after this shell's process id ($$); removed again below
10
+ TMP="$DIR/$$_SPHINX_40"
11
+ echo "Compiling $DICTIONARY..."
12
+
13
+ # make_baseform.pl removes stress marks and eliminates resulting duplicates
14
+ perl ./scripts/make_baseform.pl "$DICTIONARY" "$TMP"
15
+
16
+
17
+ echo ""
18
+ echo "Testing sphinx cmudict... "
19
+ if ./scripts/test_dict.pl -p "$DIR/SphinxPhones_40" "$TMP"
20
+ then
21
+     cp -p "$TMP" "$DIR/${DICT_BASE}_SPHINX_40"
22
+     cp -p "$TMP" "$DIR/${DICTIONARY}_SPHINX_40"
23
+     echo "Dictionary successfully compiled"
24
+ else
25
+     # remove any copies published by an earlier run so a stale dictionary is not left behind
26
+     rm -f "$DIR/${DICT_BASE}_SPHINX_40" "$DIR/${DICTIONARY}_SPHINX_40"
27
+     echo ""
28
+     echo "$0 encountered errors"
29
+     echo "dictionary compilation not completed"
30
+ fi
31
+
32
+ rm -f "$TMP"
33
+
34
+ echo "Done"
35
+
36
+ #
@@ -0,0 +1,27 @@
1
+ Maintenance scripts for cmudict
2
+ -------------------------------
3
+ [20100118] (air)
4
+
5
+ Use these scripts for checking and compiling the dictionary.
6
+
7
+ The process is the following:
8
+
9
+ 1) make changes to the dictionary
10
+ - it's assumed that the changes are manual
11
+ - check your work by running an svn diff against the previous version
12
+
13
+ 2) run scripts/test_cmudict.pl
14
+ EG: ./scripts/test_cmudict.pl -p cmudict.0.7a.symbols cmudict.0.7a
15
+ - this checks for collation order, legal entry format and phonetic symbols
16
+ - if necessary fix problems then repeat this step until no errors
17
+
18
+ 3) run CompileDictionary*
19
+ [converts cmudict to the Sphinx format using make_baseform.pl]
20
+ [checks for consistency using test_dict.pl]
21
+ - produces two *_SPHINX_40 files: one generic (cmudict_SPHINX_40), the other versioned (e.g. cmudict.0.7a_SPHINX_40)
22
+
23
+ 4) use svn to update cmudict; be sure to add a proper logging message
24
+
25
+ That's it!
26
+
27
+
@@ -0,0 +1,172 @@
1
+ #!perl -w
2
+
3
+ #
4
+ # ====================================================================
5
+ # Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
6
+ # Rudnicky. All rights reserved.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without
9
+ # modification, are permitted provided that the following conditions
10
+ # are met:
11
+ #
12
+ # 1. Redistributions of source code must retain the above copyright
13
+ # notice, this list of conditions and the following disclaimer.
14
+ #
15
+ # 2. Redistributions in binary form must reproduce the above copyright
16
+ # notice, this list of conditions and the following disclaimer in
17
+ # the documentation and/or other materials provided with the
18
+ # distribution.
19
+ #
20
+ # This work was supported in part by funding from the Defense Advanced
21
+ # Research Projects Agency, the Office of Naval Research and the National
22
+ # Science Foundation of the United States of America, and by member
23
+ # companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
24
+ # the contributions of many volunteers to the expansion and improvement of
25
+ # this dictionary.
26
+ #
27
+ # THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
28
+ # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
29
+ # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
31
+ # NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
+ #
39
+ # ====================================================================
40
+ #
41
+
42
+ # [20050309] (air) Created.
43
+ # strip out stress marks from a cmudict, producing a "SphinxPhones_40" dictionary
44
+ # [20080420] (air) Changed to pass comments.
45
+ # Fixed output collation sequence; DOS eol's
46
+ # [20090309] (air) fixed duplicate pron and collation bugs
47
+ # [20090331] (air) restored standard collation order (since other stuff depends on it)
48
+ # [20090629] (air) do not put comments into SPHINX_40 version; not all software deals with them
49
+ # [20100118] (air) added $VERBOSE; this should really be a cmdline flag...
50
+ #
51
+
52
+
53
+ $VERBOSE = 0;
54
+ # running totals, updated by get_dict() and reported on STDERR below
55
+ my $basecount = 0;
56
+ my $dupl = 0;
57
+ my $base = 0;
58
+ my $varia = 0;
59
+
60
+ if ( scalar @ARGV != 2 ) { die "usage: make_baseform <input> <output>\n"; }
61
+
62
+ open(IN, '<', $ARGV[0]) || die "can't open $ARGV[0] for reading!\n";
63
+ open(OUT, '>', $ARGV[1]) || die "can't open $ARGV[1] for writing!\n";
64
+
65
+ @header = (); # header comment lines (collected, but no longer written out)
66
+ %dict = (); # words end up in here
67
+ %histo = (); # some statistics on variants
68
+
69
+ get_dict(\%dict,\@header,\*IN); # process the entries
70
+
71
+ # what have we got?
72
+ print STDERR "$basecount forms processed\n";
73
+ print STDERR "$base baseforms, $varia variants and $dupl duplicates found.\n";
74
+ print STDERR "variant distribution:\n";
75
+ foreach $var ( sort { $a <=> $b } keys %histo ) { # keys are counts, so order them numerically
76
+     print STDERR "$var\t$histo{$var}\n";
77
+ }
78
+
79
+ # print special comments (copyright, etc.)
80
+ # removed since it messes some things up...
81
+ # foreach $h (@header) { print OUT "$h\n"; }
82
+
83
+ # print out each entry
84
+ %dict_out = ();
85
+ foreach $w (sort keys %dict) {
86
+     $var=1; # variants will number starting with 2
87
+     foreach $p ( @{$dict{$w}} ) {
88
+         if ($var == 1) {
89
+             $dict_out{$w} = $p;
90
+             $var++;
91
+         } else {
92
+             $dict_out{"$w($var)"} = $p;
93
+             $var++;
94
+         }
95
+     }
96
+ }
97
+ # re-sort with the "(n)" suffixes in place so the output follows plain string collation
98
+ foreach $entry ( sort keys %dict_out ) {
99
+     print OUT "$entry\t$dict_out{$entry}\n";
100
+ }
101
+
102
+ close(IN);
103
+ close(OUT) || die "error closing $ARGV[1]: $!\n";
104
+
105
+ # get_dict($dict, $header, $target): read dictionary entries from file handle $target.
106
+ # Fills %$dict (root word => list of distinct stress-free pronunciations) and @$header (";;; #" lines).
107
+ # Side effects: updates the file-level $basecount, $base, $varia, $dupl and %histo.
108
+ sub get_dict {
109
+     my $dict = shift; # data structure with dictionary entries
110
+     my $header = shift;
111
+     my $target = shift; # input file handle
112
+     my ($word,$pron,$root,$variant); # per-entry scratch, kept out of the package namespace
113
+     while (<$target>) {
114
+         s/[\r\n]+$//g; # DOS-robust chomp;
115
+
116
+         # process comments; blank lines ignored
117
+         # presume that ";;; #" will be collected and emitted at the top
118
+         if ($_ =~ /^;;; \#/) { push @$header, $_; next; } # save header info
119
+         elsif ( $_ =~ /^;;;/ ) { next; } # ignore plain comments
120
+         elsif ( $_ =~ /^\s*$/ ) { next; } # ignore blank lines
121
+
122
+         # extract the word,pron pair and prepare for processing
123
+         ($word,$pron) = /(.+?)\s+(.+?)$/;
124
+         if (! defined $word) { print STDERR "bad entry (no head word): $_\n"; next; }
125
+
126
+         $basecount++;
127
+
128
+         if ($word =~ /\)$/ && $word =~ m/(.+?)\((.+?)\)/) { # variant, e.g. WORD(2)
129
+             ($root,$variant) = ($1,$2);
130
+         } else { # plain word, or one that merely ends in ")" without a "(...)" suffix
131
+             $root = $word;
132
+             $variant = 0;
133
+         }
134
+         $pron = strip_stress($pron);
135
+
136
+         # found a new baseform; set it up
137
+         if ( ! defined $dict->{$root} ) {
138
+             $dict->{$root}[0] = $pron;
139
+             $base++;
140
+             next;
141
+         }
142
+
143
+         # old baseform; see if, after removed stress, pron is a duplicate
144
+         foreach my $var ( @{$dict->{$root}} ) {
145
+             if ( $var eq $pron ) {
146
+                 if ($VERBOSE) {print STDERR "duplicate entry: $root ($variant) $pron\n";}
147
+                 $dupl++;
148
+                 $pron = ""; # empty string marks "already present" for the test below
149
+                 last;
150
+             }
151
+         }
152
+
153
+         # it's a new variant on an existing baseform, keep it
154
+         if ( $pron ne "" ) {
155
+             push @{$dict->{$root}}, $pron;
156
+             $varia++;
157
+             $histo{scalar @{$dict->{$root}}}++; # track variant stats
158
+             if ( scalar @{$dict->{$root}} > 4 ) { print STDERR "$root -- ",scalar @{$dict->{$root}},"\n"; }
159
+         }
160
+     }
161
+ }
162
+
163
+
164
+ # strip_stress($pron): return $pron with the trailing stress digits removed from each phone
165
+ sub strip_stress {
166
+     my @phones = split " ", $_[0]; # lexical, so no package-wide @pron is left behind
167
+     # the substitution is simply a no-op for phones that carry no stress digit
168
+     s/\d+$// foreach @phones;
169
+     return join(" ", @phones);
170
+ }
171
+
172
+ #
@@ -0,0 +1,141 @@
1
+ #!perl -w
2
+
3
+ #
4
+ # ====================================================================
5
+ # Copyright (C) 1999-2008 Carnegie Mellon University and Alexander
6
+ # Rudnicky. All rights reserved.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without
9
+ # modification, are permitted provided that the following conditions
10
+ # are met:
11
+ #
12
+ # 1. Redistributions of source code must retain the above copyright
13
+ # notice, this list of conditions and the following disclaimer.
14
+ #
15
+ # 2. Redistributions in binary form must reproduce the above copyright
16
+ # notice, this list of conditions and the following disclaimer in
17
+ # the documentation and/or other materials provided with the
18
+ # distribution.
19
+ #
20
+ # This work was supported in part by funding from the Defense Advanced
21
+ # Research Projects Agency, the Office of Naval Research and the National
22
+ # Science Foundation of the United States of America, and by member
23
+ # companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
24
+ # the contributions of many volunteers to the expansion and improvement of
25
+ # this dictionary.
26
+ #
27
+ # THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
28
+ # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
29
+ # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
31
+ # NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
+ #
39
+ # ====================================================================
40
+ #
41
+
42
+ # Sort cmudict according to head entry collating sequence
43
+
44
+ # [20090331] (air) Created.
45
+
46
+ use strict;
47
+
48
+ if ( scalar @ARGV != 2 ) { die "usage: sort_cmudict <input> <output>\n"; }
49
+
50
+ open(IN, '<', $ARGV[0]) || die "can't open $ARGV[0] for reading!\n";
51
+ open(OUT, '>', $ARGV[1]) || die "can't open $ARGV[1] for writing!\n";
52
+
53
+ my %header = (); # comment and blank lines, keyed by the value of $last when they were read
54
+ my %histo = (); # some statistics on variants
55
+
56
+ my %dict = ("" => {VARIANT => [], COMMENT => ""} ); # words end up in here
57
+ my $last = ""; # the last word processed
58
+
59
+ get_dict(\%dict,\%header,\*IN); # process the entries
60
+
61
+ # print special comments (copyright, etc.)
62
+ foreach my $h (sort keys %header) { print OUT "$header{$h}"; }
63
+
64
+ # print out each entry
65
+ my $DELIMITER = ' ';
66
+ foreach my $w (sort keys %dict) {
67
+     my $var=1; # number variants from 2 (this is different from original)
68
+     foreach my $p ( @{$dict{$w}{VARIANT}} ) {
69
+         if ($var == 1) {
70
+             print OUT "$w$DELIMITER$p\n";
71
+         } else {
72
+             print OUT "$w($var)$DELIMITER$p\n";
73
+         }
74
+         $var++;
75
+     }
76
+ }
77
+ close(IN);
78
+ close(OUT) || die "error closing $ARGV[1]: $!\n";
79
+
80
+ # get_dict($dict, $header, $target): read cmudict entries from file handle $target
81
+ sub get_dict {
82
+     my $dict = shift; # data structure with dictionary entries
83
+     my $header = shift; # hash ref: value of $last => comment/blank lines read while it was current
84
+     my $target = shift; # input file handle
85
+     my ($word,$pron,$root,$variant);
86
+     my ($basecount,$base,$dupl,$varia); # tallies; incremented below but not reported
87
+     # NB: $last and %histo are the file-level lexicals declared ahead of this sub
88
+     while (<$target>) {
89
+         s/[\r\n]+$//g; # DOS-robust chomp;
90
+
91
+         # comments and blank lines are kept, not dropped: each is appended to the
92
+         # entry for the most recent head word ("" before the first entry is read)
93
+         if ($_ =~ /^;;; \#/) { # save header info
94
+             $header->{$last} .= "$_\n";
95
+             next;
96
+         }
97
+         elsif ( $_ =~ /^;;;/ ) { $header->{$last} .= "$_\n"; next; } # keep plain comments
98
+         elsif ( $_ =~ /^\s*$/ ) { $header->{$last} .= "$_\n"; next; } # keep blank lines
99
+
100
+         # extract the word,pron pair and prepare for processing
101
+         ($word,$pron) = /(.+?)\s+(.+?)$/;
102
+         if (! defined $word) { print STDERR "bad entry (no head word): $_\n"; next; }
103
+
104
+         $basecount++;
105
+         $last = $word; # remember which token we just did; set here so the "next" below cannot skip it
106
+         if ($word =~ /\(\d\)$/) { # variant
107
+             ($root,$variant) = ($word =~ m/(.+?)\((.+?)\)/);
108
+         } else {
109
+             $root = $word;
110
+             $variant = 0;
111
+         }
112
+
113
+         # found a new baseform; set it up
114
+         if ( ! defined $dict->{$root} ) {
115
+             $dict->{$root}{VARIANT}[0] = $pron;
116
+             $base++;
117
+             next;
118
+         }
119
+
120
+         # already-seen baseform; see if pron is a duplicate
121
+         foreach my $var ( @{$dict->{$root}{VARIANT}} ) {
122
+             if ( $var eq $pron ) {
123
+                 print STDERR "duplicate entry: $root ($variant) $pron!\n";
124
+                 $dupl++;
125
+                 $pron = "";
126
+                 last;
127
+             }
128
+         }
129
+
130
+         # it's a new variant on an existing baseform, keep it
131
+         if ( $pron ne "" ) {
132
+             my $variants = $dict->{$root}{VARIANT};
133
+             push @$variants, $pron;
134
+             $varia++;
135
+             $histo{scalar @$variants}++; # track variant stats
136
+             if ( scalar @$variants >= 4 ) { # numeric test; string "ge" misjudges counts of 10 and up
137
+                 print STDERR "$root -- ",scalar @$variants,"\n";
138
+             }
139
+         }
140
+     }
141
+ }