noyes 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/c_impl/n_matrix.c +32 -0
- data/lib/c_impl/n_speech_trimmer.c +31 -2
- data/lib/c_impl/noyes.h +23 -20
- data/lib/c_impl/rnoyes.h +1 -0
- data/lib/c_impl/speech_trimmer.c +26 -4
- data/lib/common/noyes_dsl.rb +13 -3
- data/lib/java_impl/speech_trimmer.rb +6 -2
- data/lib/ruby_impl/bent_cent_marker.rb +8 -1
- data/lib/ruby_impl/speech_trimmer.rb +14 -1
- metadata +5 -6
- data/README +0 -171
- data/ship/noyes.jar +0 -0
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.9.
|
1
|
+
0.9.2
|
data/lib/c_impl/n_matrix.c
CHANGED
@@ -38,3 +38,35 @@ void free_nmatrix1(NMatrix1 *M) {
|
|
38
38
|
free(M);
|
39
39
|
}
|
40
40
|
}
|
41
|
+
|
42
|
+
// Converts a square matrix to a list of one dimensional matrices.
|
43
|
+
// Simultaneously frees the original square matrix.
|
44
|
+
NMatrix1 ** nmatrix_2_nmatrix1s(NMatrix *M) {
|
45
|
+
NMatrix1 **single = malloc(sizeof(NMatrix1*) * M->rows);
|
46
|
+
int i;
|
47
|
+
for (i=0;i<M->rows;++i) {
|
48
|
+
single[i] = malloc(sizeof(NMatrix1));
|
49
|
+
single[i]->data = M->data[i];
|
50
|
+
single[i]->rows = M->cols;
|
51
|
+
}
|
52
|
+
free(M->data);
|
53
|
+
free(M);
|
54
|
+
return single;
|
55
|
+
}
|
56
|
+
|
57
|
+
// Converts an array of one dimensional arrays into a square matrix. It frees
|
58
|
+
// these arrays in the process.
|
59
|
+
NMatrix * nmatrix1_2_nmatrix(NMatrix1 **array, int size) {
|
60
|
+
if (size ==0)
|
61
|
+
return NULL;
|
62
|
+
NMatrix *result = malloc(sizeof(NMatrix));
|
63
|
+
result->data = malloc(sizeof(double*) * size);
|
64
|
+
result->rows = size;
|
65
|
+
int i;
|
66
|
+
for (i=0; i<size; ++i) {
|
67
|
+
result->data[i] = array[i]->data;
|
68
|
+
free(array[i]);
|
69
|
+
}
|
70
|
+
|
71
|
+
return result;
|
72
|
+
}
|
@@ -4,7 +4,7 @@
|
|
4
4
|
#undef FALSE
|
5
5
|
#define FALSE 0
|
6
6
|
|
7
|
-
SpeechTrimmer * new_speech_trimmer() {
|
7
|
+
SpeechTrimmer * new_speech_trimmer(int frequency) {
|
8
8
|
SpeechTrimmer *self = malloc(sizeof(SpeechTrimmer));
|
9
9
|
self->leader = 5;
|
10
10
|
self->trailer = 5;
|
@@ -16,6 +16,7 @@ SpeechTrimmer * new_speech_trimmer() {
|
|
16
16
|
self->eos_reached = FALSE;
|
17
17
|
self->scs = 20;
|
18
18
|
self->ecs = 50;
|
19
|
+
self->seg = new_segmenter(frequency/100, frequency/100);
|
19
20
|
return self;
|
20
21
|
}
|
21
22
|
|
@@ -25,6 +26,33 @@ void free_speech_trimmer(SpeechTrimmer *self) {
|
|
25
26
|
free(self);
|
26
27
|
}
|
27
28
|
|
29
|
+
NMatrix * speech_trimmer_apply(SpeechTrimmer *self, NMatrix1* pcm) {
|
30
|
+
if (self->eos_reached)
|
31
|
+
return NULL;
|
32
|
+
|
33
|
+
NMatrix *segment_matrix = segmenter_apply(self->seg, pcm);
|
34
|
+
int centisecond_count = segment_matrix->rows;
|
35
|
+
NMatrix1 **segments = nmatrix_2_nmatrix1s(segment_matrix);
|
36
|
+
NMatrix1 ** speech_segments = malloc(sizeof(NMatrix*) * segment_matrix->rows);
|
37
|
+
int speech_count = 0, i;
|
38
|
+
for (i=0; i<centisecond_count ;++i) {
|
39
|
+
speech_trimmer_enqueue(self, segments[i]);
|
40
|
+
NMatrix1 *centispeech = speech_trimmer_dequeue(self);
|
41
|
+
while (centispeech != NULL) {
|
42
|
+
speech_segments[speech_count++] = centispeech;
|
43
|
+
centispeech = speech_trimmer_dequeue(self);
|
44
|
+
}
|
45
|
+
if (speech_trimmer_eos(self))
|
46
|
+
break;
|
47
|
+
}
|
48
|
+
|
49
|
+
if (speech_trimmer_eos(self) && speech_count == 0)
|
50
|
+
return NULL;
|
51
|
+
|
52
|
+
return nmatrix1_2_nmatrix(speech_segments, speech_count);
|
53
|
+
}
|
54
|
+
|
55
|
+
|
28
56
|
void speech_trimmer_enqueue(SpeechTrimmer *self, NMatrix1* pcm) {
|
29
57
|
if (self->eos_reached)
|
30
58
|
return;
|
@@ -55,13 +83,14 @@ NMatrix1 * speech_trimmer_dequeue(SpeechTrimmer *self) {
|
|
55
83
|
if (n_list_size(self->queue) == 0)
|
56
84
|
return NULL;
|
57
85
|
if (self->eos_reached || (self->speech_started &&
|
58
|
-
|
86
|
+
n_list_size(self->queue) > self->ecs)) {
|
59
87
|
NMatrix1 * N = n_list_get(self->queue, 0);
|
60
88
|
n_list_remove(self->queue, 0, 1);
|
61
89
|
return N;
|
62
90
|
}
|
63
91
|
return NULL;
|
64
92
|
}
|
93
|
+
|
65
94
|
int speech_trimmer_eos(SpeechTrimmer *self) {
|
66
95
|
return self->eos_reached;
|
67
96
|
}
|
data/lib/c_impl/noyes.h
CHANGED
@@ -30,6 +30,8 @@ void free_nmatrix(NMatrix *);
|
|
30
30
|
|
31
31
|
NMatrix1 *new_nmatrix1(int rows);
|
32
32
|
void free_nmatrix1(NMatrix1 *);
|
33
|
+
NMatrix1 ** nmatrix_2_nmatrix1s(NMatrix *M);
|
34
|
+
NMatrix * nmatrix1_2_nmatrix(NMatrix1 **array, int size);
|
33
35
|
|
34
36
|
// Preemphasizer
|
35
37
|
typedef struct {
|
@@ -126,25 +128,6 @@ LiveCMN * new_live_cmn(int dimensions, double init_mean, int window_size, int sh
|
|
126
128
|
void free_live_cmn(LiveCMN *lcmn);
|
127
129
|
NMatrix *live_cmn_apply(LiveCMN *self, NMatrix *data);
|
128
130
|
|
129
|
-
|
130
|
-
// Fast 8k mfcc
|
131
|
-
// This strings together all the algorithms necessary to make mfcc's from an 8k
|
132
|
-
// signal so you don't have to.
|
133
|
-
typedef struct {
|
134
|
-
Preemphasizer *pre;
|
135
|
-
Segmenter *seg;
|
136
|
-
HammingWindow *ham;
|
137
|
-
PowerSpectrum *pow;
|
138
|
-
MelFilter *mel;
|
139
|
-
LogCompressor *log;
|
140
|
-
DiscreteCosineTransform *dct;
|
141
|
-
LiveCMN *cmn;
|
142
|
-
} Fast8kMfcc;
|
143
|
-
|
144
|
-
Fast8kMfcc* new_fast_8k_mfcc();
|
145
|
-
void free_fast_8k_mfcc(Fast8kMfcc *self);
|
146
|
-
NMatrix *fast_8k_mfcc_apply(Fast8kMfcc *self, NMatrix1 *data);
|
147
|
-
|
148
131
|
// Silence removal with BentCentMarker and SpeechTrimmer
|
149
132
|
typedef struct {
|
150
133
|
double adjustment;
|
@@ -171,15 +154,35 @@ typedef struct {
|
|
171
154
|
int scs;
|
172
155
|
int ecs;
|
173
156
|
BentCentMarker *bcm;
|
157
|
+
Segmenter *seg;
|
174
158
|
NList *queue;
|
175
159
|
int eos_reached;
|
176
160
|
} SpeechTrimmer;
|
177
|
-
|
161
|
+
|
178
162
|
SpeechTrimmer * new_speech_trimmer();
|
179
163
|
void free_speech_trimmer(SpeechTrimmer *self);
|
180
164
|
void speech_trimmer_enqueue(SpeechTrimmer *self, NMatrix1* pcm);
|
181
165
|
NMatrix1 * speech_trimmer_dequeue(SpeechTrimmer *self);
|
182
166
|
int speech_trimmer_eos(SpeechTrimmer *self);
|
167
|
+
NMatrix * speech_trimmer_apply(SpeechTrimmer *self, NMatrix1* pcm);
|
168
|
+
|
169
|
+
// Fast 8k mfcc
|
170
|
+
// This strings together all the algorithms necessary to make mfcc's from an 8k
|
171
|
+
// signal so you don't have to.
|
172
|
+
typedef struct {
|
173
|
+
Preemphasizer *pre;
|
174
|
+
Segmenter *seg;
|
175
|
+
HammingWindow *ham;
|
176
|
+
PowerSpectrum *pow;
|
177
|
+
MelFilter *mel;
|
178
|
+
LogCompressor *log;
|
179
|
+
DiscreteCosineTransform *dct;
|
180
|
+
LiveCMN *cmn;
|
181
|
+
} Fast8kMfcc;
|
182
|
+
|
183
|
+
Fast8kMfcc* new_fast_8k_mfcc();
|
184
|
+
void free_fast_8k_mfcc(Fast8kMfcc *self);
|
185
|
+
NMatrix *fast_8k_mfcc_apply(Fast8kMfcc *self, NMatrix1 *data);
|
183
186
|
|
184
187
|
#ifdef __cplusplus
|
185
188
|
}
|
data/lib/c_impl/rnoyes.h
CHANGED
data/lib/c_impl/speech_trimmer.c
CHANGED
@@ -10,8 +10,14 @@ static void speech_trimmer_free(void *p) {
|
|
10
10
|
free_speech_trimmer(p);
|
11
11
|
}
|
12
12
|
|
13
|
-
static VALUE t_init(VALUE self) {
|
14
|
-
|
13
|
+
static VALUE t_init(VALUE self, VALUE args) {
|
14
|
+
int len = RARRAY_LEN(args);
|
15
|
+
SpeechTrimmer *st;
|
16
|
+
if (len == 1)
|
17
|
+
st = new_speech_trimmer(NUM2INT(rb_ary_entry(args, 0)));
|
18
|
+
else
|
19
|
+
st = new_speech_trimmer(16000);
|
20
|
+
|
15
21
|
VALUE stv = Data_Wrap_Struct(cSpeechTrimmer, 0, speech_trimmer_free, st);
|
16
22
|
rb_iv_set(self, "@speech_trimmer", stv);
|
17
23
|
return self;
|
@@ -38,15 +44,31 @@ static VALUE t_eos(VALUE self) {
|
|
38
44
|
SpeechTrimmer *st;
|
39
45
|
VALUE stv = rb_iv_get(self, "@speech_trimmer");
|
40
46
|
Data_Get_Struct(stv, SpeechTrimmer, st);
|
41
|
-
return
|
47
|
+
return speech_trimmer_eos(st) ? Qtrue : Qfalse;
|
48
|
+
}
|
49
|
+
|
50
|
+
static VALUE t_left_shift(VALUE self, VALUE obj) {
|
51
|
+
NMatrix1 *M = v_2_nmatrix1(obj);
|
52
|
+
SpeechTrimmer *st;
|
53
|
+
Data_Get_Struct(rb_iv_get(self, "@speech_trimmer"), SpeechTrimmer, st);
|
54
|
+
NMatrix *R = speech_trimmer_apply(st, M);
|
55
|
+
if (!R) {
|
56
|
+
free_nmatrix1(M);
|
57
|
+
return Qnil;
|
58
|
+
}
|
59
|
+
VALUE result = nmatrix_2_v(R);
|
60
|
+
free_nmatrix1(M);
|
61
|
+
free_nmatrix(R);
|
62
|
+
return result;
|
42
63
|
}
|
43
64
|
|
44
65
|
void Init_speech_trimmer() {
|
45
66
|
VALUE m_noyes_c = rb_define_module("NoyesC");
|
46
67
|
cSpeechTrimmer = rb_define_class_under(m_noyes_c, "SpeechTrimmer", rb_cObject);
|
47
|
-
rb_define_method(cSpeechTrimmer, "initialize", t_init,
|
68
|
+
rb_define_method(cSpeechTrimmer, "initialize", t_init, -2);
|
48
69
|
rb_define_method(cSpeechTrimmer, "enqueue", t_enqueue, 1);
|
49
70
|
rb_define_method(cSpeechTrimmer, "dequeue", t_dequeue, 0);
|
50
71
|
rb_define_method(cSpeechTrimmer, "eos?", t_eos, 0);
|
72
|
+
rb_define_method(cSpeechTrimmer, "<<", t_left_shift, 1);
|
51
73
|
id_push = rb_intern("push");
|
52
74
|
}
|
data/lib/common/noyes_dsl.rb
CHANGED
@@ -1,17 +1,27 @@
|
|
1
1
|
class Array
|
2
|
-
#
|
3
|
-
|
4
|
-
|
2
|
+
# Run this array through a filter or anything that implements the '<<'
|
3
|
+
# operator. Returns whatever the filter returns.
|
4
|
+
def >> filter
|
5
|
+
filter << self
|
5
6
|
end
|
6
7
|
end
|
7
8
|
|
8
9
|
# This portion is still highly experimental. It allows filters to be combined
|
9
10
|
# in complicated ways using a syntax similar to Backus Naur Form.
|
10
11
|
module NoyesFilterDSL
|
12
|
+
# Combines two filters into a single serial filter. That is A + B
|
13
|
+
# results in a filter S such that filtering through S is identical
|
14
|
+
# to filtering through A and then B.
|
11
15
|
def + other
|
12
16
|
other_filters = other.kind_of?(SerialFilter) ? other.filters.clone : other
|
13
17
|
SerialFilter.new [self, other].flatten
|
14
18
|
end
|
19
|
+
|
20
|
+
# Combines two filters into a single parallel filter. That is A | B creates
|
21
|
+
# a new filter P such that filtering through P is identical to filtering row
|
22
|
+
# 0 of an array through filter A and row 1 of an array through filter B.
|
23
|
+
# Typically P would be used with an array of arrays. This filter can be used
|
24
|
+
# with more than two filters.
|
15
25
|
def | other
|
16
26
|
other_filters = other.kind_of?(ParallelFilter) ? other.filtes.clone : other
|
17
27
|
ParallelFilter.new [self, other].flatten
|
@@ -1,7 +1,11 @@
|
|
1
1
|
module NoyesJava
|
2
2
|
class SpeechTrimmer
|
3
|
-
def initialize
|
4
|
-
@st = Java::talkhouse.SpeechTrimmer.new
|
3
|
+
def initialize frequency = 16000
|
4
|
+
@st = Java::talkhouse.SpeechTrimmer.new frequency
|
5
|
+
end
|
6
|
+
def << pcm
|
7
|
+
result = @st.apply(pcm.to_java(Java::double))
|
8
|
+
result.to_a if result
|
5
9
|
end
|
6
10
|
def enqueue pcm
|
7
11
|
@st.enqueue pcm.to_java(Java::double)
|
@@ -1,7 +1,9 @@
|
|
1
1
|
module Noyes
|
2
2
|
# Determines whether a PCM frame is speech or not using Bent
|
3
3
|
# Schmidt-Nielsen's algorithm. Basically, it's an energy-based detector
|
4
|
-
# where the background noise level is constantly estimated.
|
4
|
+
# where the background noise level is constantly estimated. You probably
|
5
|
+
# don't want to use this class directly. Most of the time you'll want
|
6
|
+
# to use SpeechTrimmer, which uses this class.
|
5
7
|
#
|
6
8
|
# The pcm data should be in 10 millisecond chunks. For example,
|
7
9
|
# At 8000 Hz there should be 80 frames of pcm.
|
@@ -14,6 +16,8 @@ module Noyes
|
|
14
16
|
@min_signal = 0.0
|
15
17
|
@threshold = 10.0
|
16
18
|
end
|
19
|
+
|
20
|
+
# Take the log rms of an array of pcm values.
|
17
21
|
def logrms pcm
|
18
22
|
sum_of_squares = 0.0
|
19
23
|
pcm.each {|sample| sum_of_squares += sample * sample}
|
@@ -21,6 +25,9 @@ module Noyes
|
|
21
25
|
rms = Math.max rms, 1
|
22
26
|
Math.log(rms) * 20
|
23
27
|
end
|
28
|
+
|
29
|
+
# Takes a centisecond worth of pcm values and indicates whether it looks
|
30
|
+
# like speech. This information is typically used by SpeechTrimmer.
|
24
31
|
def << pcm
|
25
32
|
is_speech = false
|
26
33
|
current = logrms pcm
|
@@ -4,7 +4,7 @@ module Noyes
|
|
4
4
|
# returns nil. Then check for eos. If eos is true you are done.
|
5
5
|
# SpeechTrimmer is designed to work efficiently with live audio.
|
6
6
|
class SpeechTrimmer
|
7
|
-
def initialize
|
7
|
+
def initialize frequency=16000
|
8
8
|
@leader = 5 # Cents of leading silence to retain.
|
9
9
|
@trailer = 5 # Cents of trailing silence to retain.
|
10
10
|
@speech_started = false
|
@@ -15,6 +15,19 @@ module Noyes
|
|
15
15
|
@eos_reached = false
|
16
16
|
@scs = 20 # Centiseconds of speech before detection of utterance.
|
17
17
|
@ecs = 50 # Centiseconds of silence before end detection.
|
18
|
+
@segmenter = Segmenter.new(frequency/100, frequency/100)
|
19
|
+
end
|
20
|
+
|
21
|
+
def << pcm
|
22
|
+
return if eos?
|
23
|
+
(@segmenter << pcm).inject [] do |memo, centisec|
|
24
|
+
enqueue centisec unless eos?
|
25
|
+
while x = dequeue
|
26
|
+
memo << x
|
27
|
+
end
|
28
|
+
break memo if eos?
|
29
|
+
memo
|
30
|
+
end
|
18
31
|
end
|
19
32
|
|
20
33
|
def enqueue pcm
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: noyes
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Woelfel
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-06-
|
12
|
+
date: 2010-06-30 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,7 +22,9 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: 1.0.0
|
24
24
|
version:
|
25
|
-
description:
|
25
|
+
description: |-
|
26
|
+
A fast portable signal processing library sufficient for creating features for
|
27
|
+
speech recognition, etc.
|
26
28
|
email: joe@talkhouse.com
|
27
29
|
executables:
|
28
30
|
- mock_noyes_server
|
@@ -34,7 +36,6 @@ extensions:
|
|
34
36
|
extra_rdoc_files:
|
35
37
|
- COPYING
|
36
38
|
- FAQ
|
37
|
-
- README
|
38
39
|
files:
|
39
40
|
- VERSION
|
40
41
|
- lib/c_impl/array_list.c
|
@@ -108,10 +109,8 @@ files:
|
|
108
109
|
- lib/ruby_impl/preemphasis.rb
|
109
110
|
- lib/ruby_impl/segment.rb
|
110
111
|
- lib/ruby_impl/speech_trimmer.rb
|
111
|
-
- ship/noyes.jar
|
112
112
|
- COPYING
|
113
113
|
- FAQ
|
114
|
-
- README
|
115
114
|
has_rdoc: true
|
116
115
|
homepage: http://github.com/talkhouse/noyes
|
117
116
|
licenses: []
|
data/README
DELETED
@@ -1,171 +0,0 @@
|
|
1
|
-
Noyes is a signal processing library. It currently has just enough signal
|
2
|
-
processing to produce features suitable for speech recognition.
|
3
|
-
|
4
|
-
Pronunciation: Typically pronounced the same as 'noise'. But "NO!... YES!" is
|
5
|
-
considered acceptable if you say it with sufficient conviction to make people
|
6
|
-
think you have truly changed your mind.
|
7
|
-
|
8
|
-
Noyes is a general purpose signal processing tool that is flexible enough for
|
9
|
-
many purposes. However, it exists because there is a need for low-latency high
|
10
|
-
quality speech recognition on portable wireless devices. The most powerful
|
11
|
-
speech recognizers are very large with huge models running on powerful cloud
|
12
|
-
based systems. But transmitting raw audio to these recognizers creates too
|
13
|
-
much latency because raw audio uses too much bandwidth. By sending compressed
|
14
|
-
features instead of raw audio the bandwidth can be greatly reduced without
|
15
|
-
compromising recognition accuracy. In some cases the effect of inadequate
|
16
|
-
bandwidth on latency can be reduced to zero.
|
17
|
-
|
18
|
-
Because hand sets require different implementations the Noyes library is
|
19
|
-
designed to quickly and efficiently work with and develop multiple underlying
|
20
|
-
implementations. All implementations are accessible via a high level dynamic
|
21
|
-
language that includes a very expressive domain specific language for handling
|
22
|
-
signal processing routines. In addition, all implementations share unit tests
|
23
|
-
written in a high level dynamic language.
|
24
|
-
|
25
|
-
Noyes is implemented entirely in Ruby. It's also implemented entirely in Java.
|
26
|
-
The Java version has Ruby bindings too. So you can have Java's speed from
|
27
|
-
Ruby. If you need a pure Java version you can use the generated jar. There is
|
28
|
-
a lot of flexibility without a lot of overhead. All versions share the same
|
29
|
-
unit tests, which are written in Ruby.
|
30
|
-
|
31
|
-
The design goal is to have signal processing routines that are so simple and so
|
32
|
-
disentangled from the overall system that anyone could extract any of the
|
33
|
-
routines and use them elsewhere with little trouble. Benchmarks are included.
|
34
|
-
|
35
|
-
This library places an emphasis on expressiveness without sacrificing ultimate
|
36
|
-
performance. It does so by supporting multiple implementations each with Ruby
|
37
|
-
bindings. The pure Ruby version, while not fast, is often adequate for
|
38
|
-
development and is the best place to add new routines.
|
39
|
-
|
40
|
-
For examples of how to link with different implementations see the test section
|
41
|
-
of the Rakefile. At present only the pure Ruby implementation is exposed via
|
42
|
-
the gem.
|
43
|
-
|
44
|
-
Requirements:
|
45
|
-
Almost any version of ruby & rake.
|
46
|
-
Java, if you want to use the Java implementation instead of the default pure
|
47
|
-
ruby implementation.
|
48
|
-
|
49
|
-
Some of the utility scripts such as nrec and jrec may use sox, but
|
50
|
-
none of the core routines use it.
|
51
|
-
|
52
|
-
Build instructions
|
53
|
-
rake -T
|
54
|
-
|
55
|
-
|
56
|
-
= USAGE
|
57
|
-
|
58
|
-
All signal processing routines use a simple DSL style interface. Below are some
|
59
|
-
examples.
|
60
|
-
|
61
|
-
== Filter operator example.
|
62
|
-
The '>>=' operator is called the filter operator. It modifies the data on the
|
63
|
-
left using the filter on the right. This is similar to the way the += operator
|
64
|
-
works for numbers. Note that the >>= actually looks like a filter making it easy
|
65
|
-
to remember.
|
66
|
-
|
67
|
-
require 'noyes'
|
68
|
-
data = (1..12).to_a # An array of nonsense data.
|
69
|
-
segmenter = Segmenter.new 4, 2 # window size, window shift
|
70
|
-
hamming_filter = HammingWindow.new 4 # window size
|
71
|
-
power_spec_filter = PowerSpectrumFilter.new 8 # number of ffts
|
72
|
-
|
73
|
-
data >>= segmenter
|
74
|
-
data >>= hamming_filter
|
75
|
-
data >>= power_spec_filter
|
76
|
-
data >>= dct_filter
|
77
|
-
|
78
|
-
You can expand the >>= operator out, but I think the flow is worse and there is
|
79
|
-
more repetition, particularly when you have a lot of filters in sequence. This
|
80
|
-
is perfectly valid syntax though. Also, this is very useful if you don't want
|
81
|
-
to keep a reference to your original data.
|
82
|
-
|
83
|
-
require 'noyes'
|
84
|
-
pcm_data = (1..12).to_a
|
85
|
-
segmenter = Segmenter.new
|
86
|
-
hamming_filter = HammingWindow.new 4
|
87
|
-
segmented_data = segmenter << pcm_data, 4, 2
|
88
|
-
hamming_data = hamming_filter << segmented_data
|
89
|
-
power_spectrum data = power_spec_filter hamminging_data, 8
|
90
|
-
dct_data = dct_filter << power_spectrum_data
|
91
|
-
|
92
|
-
== Advanced filter DSLs
|
93
|
-
For most things, the filter operator is simple, easy to remember, and
|
94
|
-
very concise. But sometimes you want to build more elaborate combinations
|
95
|
-
of filters and use them as if you had a single filter. In this case
|
96
|
-
making a new class for every possible combination creates an explosion
|
97
|
-
of new classes and a maintenance nightmare. Instead, there is a simple
|
98
|
-
graph notation you can use to combine filters. In the following example
|
99
|
-
we'll combine all the filters from a previous example and then use them
|
100
|
-
as if they were a single filter.
|
101
|
-
|
102
|
-
serial_filter = segmenter & hamming_filter & power_spec_filter & dct_filter
|
103
|
-
data >>= serial_filter
|
104
|
-
|
105
|
-
It's also possible to take parallel data streams and pipe them through
|
106
|
-
parallel filters as if you had only one data stream and only one filter.
|
107
|
-
|
108
|
-
data = [stream_1,stream_2]
|
109
|
-
parallel_filter = filter_1 | filter_2
|
110
|
-
data >>= parallel_filter
|
111
|
-
|
112
|
-
It is not necessary for the data to be synchronous when using parallel filters.
|
113
|
-
When using parallel filters the number of elements going through one filter
|
114
|
-
does not have to equal the number of elements going through the second filter.
|
115
|
-
|
116
|
-
You can see that you can make arbitrarily complicated graphs of filters by
|
117
|
-
combined use of the '&' and '|' operators. Almost identical notation is used
|
118
|
-
to specify graphs for context free grammars. Keep in mind that '&' takes
|
119
|
-
precedence over '|'. In the example below stream 1 goes through filter 1 and
|
120
|
-
filter 2 while stream 2 goes through filters 3, 4, and 5.
|
121
|
-
|
122
|
-
parallel_data = [stream_1,stream_2]
|
123
|
-
big_filter = filter_1 & filter_2 | filter_3 & filter_4 & filter_5
|
124
|
-
parallel_data >>= big_filter
|
125
|
-
|
126
|
-
== Command Line Utilities
|
127
|
-
|
128
|
-
The program nrec will process almost any format of audio file into speech
|
129
|
-
features and send the data to a cloud hosted speech recognizer. The resulting
|
130
|
-
transcript will be sent back and printed out. The nrec program uses whatever
|
131
|
-
version of Ruby is on the path of your current environment. It is compatible
|
132
|
-
with ruby 1.9, ruby 1.8x, and JRuby. When run under JRuby it can
|
133
|
-
optionally use a Java implementation, which is very fast. See nrec --help for
|
134
|
-
more information.
|
135
|
-
|
136
|
-
== Assessing Performance for Wireless Devices
|
137
|
-
|
138
|
-
It's important to note that the performance characteristics of live data and
|
139
|
-
recorded data are different. Any delay experienced by a user starts from the
|
140
|
-
time they stop speaking. In contrast, any delay experienced when processing a
|
141
|
-
file starts from the time a file starts processing. For that reason file
|
142
|
-
processing always seems slower. Modern recognizers are easily capable of
|
143
|
-
exceeding real time performance so that is not a factor. The delay experienced
|
144
|
-
by a user is typically due to the time required to transmit the audio to the
|
145
|
-
recognizer and the time required to detect end of utterance, assuming end of
|
146
|
-
utterance detection is used.
|
147
|
-
|
148
|
-
If end of utterance detection is used the recognizer must wait until it has
|
149
|
-
sufficient evidence to be reasonably sure the user has stopped talking. This
|
150
|
-
could mean that a suitable period of silence has passed which means the user
|
151
|
-
incurs a slight but unavoidable delay. End of utterance detection also could
|
152
|
-
mean the grammar or language model does not allow for any other reasonable
|
153
|
-
possibility even if more data were available, which may mean no delay at all
|
154
|
-
(or even a negative delay in some cases).
|
155
|
-
|
156
|
-
If the bandwidth of the network is low enough, which is often the case for the
|
157
|
-
data channel of portable wireless handsets, it will take time for raw
|
158
|
-
uncompressed audio to traverse the network. By computing features on the
|
159
|
-
handset it is possible to have significant reduction in bandwidth requirements
|
160
|
-
eliminating much of the latency. These features in turn may then be compressed
|
161
|
-
for further bandwidth reduction. This method exceeds what is possible with
|
162
|
-
alternative methods of audio compression. Further, it eliminates many of the
|
163
|
-
distortion components that may compromise recognition accuracy.
|
164
|
-
|
165
|
-
If all you want is a rough feeling of how responsive speech recognition will be
|
166
|
-
over your network try speaking an utterance at the same time you enter a
|
167
|
-
command to have a prerecorded utterance recognized. You'll probably be
|
168
|
-
surprised by how quickly the network is able to respond. You may find that the
|
169
|
-
Java implementation feels like instant response even though it takes time for
|
170
|
-
the JVM to launch. Ruby 1.9 is actually surprisingly quick on a reasonably
|
171
|
-
powerful laptop.
|
data/ship/noyes.jar
DELETED
Binary file
|