noyes 0.9.0 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/c_impl/n_matrix.c +32 -0
- data/lib/c_impl/n_speech_trimmer.c +31 -2
- data/lib/c_impl/noyes.h +23 -20
- data/lib/c_impl/rnoyes.h +1 -0
- data/lib/c_impl/speech_trimmer.c +26 -4
- data/lib/common/noyes_dsl.rb +13 -3
- data/lib/java_impl/speech_trimmer.rb +6 -2
- data/lib/ruby_impl/bent_cent_marker.rb +8 -1
- data/lib/ruby_impl/speech_trimmer.rb +14 -1
- metadata +5 -6
- data/README +0 -171
- data/ship/noyes.jar +0 -0
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.9.
|
1
|
+
0.9.2
|
data/lib/c_impl/n_matrix.c
CHANGED
@@ -38,3 +38,35 @@ void free_nmatrix1(NMatrix1 *M) {
|
|
38
38
|
free(M);
|
39
39
|
}
|
40
40
|
}
|
41
|
+
|
42
|
+
// Converts a square matrix to a list of one dimensional matrices.
|
43
|
+
// Simultaneously frees the original square matrix.
|
44
|
+
NMatrix1 ** nmatrix_2_nmatrix1s(NMatrix *M) {
|
45
|
+
NMatrix1 **single = malloc(sizeof(NMatrix1*) * M->rows);
|
46
|
+
int i;
|
47
|
+
for (i=0;i<M->rows;++i) {
|
48
|
+
single[i] = malloc(sizeof(NMatrix1));
|
49
|
+
single[i]->data = M->data[i];
|
50
|
+
single[i]->rows = M->cols;
|
51
|
+
}
|
52
|
+
free(M->data);
|
53
|
+
free(M);
|
54
|
+
return single;
|
55
|
+
}
|
56
|
+
|
57
|
+
// Converts an array of one dimensional arrays into a square matrix. It frees
|
58
|
+
// these arrays in the process.
|
59
|
+
NMatrix * nmatrix1_2_nmatrix(NMatrix1 **array, int size) {
|
60
|
+
if (size ==0)
|
61
|
+
return NULL;
|
62
|
+
NMatrix *result = malloc(sizeof(NMatrix));
|
63
|
+
result->data = malloc(sizeof(double*) * size);
|
64
|
+
result->rows = size;
|
65
|
+
int i;
|
66
|
+
for (i=0; i<size; ++i) {
|
67
|
+
result->data[i] = array[i]->data;
|
68
|
+
free(array[i]);
|
69
|
+
}
|
70
|
+
|
71
|
+
return result;
|
72
|
+
}
|
@@ -4,7 +4,7 @@
|
|
4
4
|
#undef FALSE
|
5
5
|
#define FALSE 0
|
6
6
|
|
7
|
-
SpeechTrimmer * new_speech_trimmer() {
|
7
|
+
SpeechTrimmer * new_speech_trimmer(int frequency) {
|
8
8
|
SpeechTrimmer *self = malloc(sizeof(SpeechTrimmer));
|
9
9
|
self->leader = 5;
|
10
10
|
self->trailer = 5;
|
@@ -16,6 +16,7 @@ SpeechTrimmer * new_speech_trimmer() {
|
|
16
16
|
self->eos_reached = FALSE;
|
17
17
|
self->scs = 20;
|
18
18
|
self->ecs = 50;
|
19
|
+
self->seg = new_segmenter(frequency/100, frequency/100);
|
19
20
|
return self;
|
20
21
|
}
|
21
22
|
|
@@ -25,6 +26,33 @@ void free_speech_trimmer(SpeechTrimmer *self) {
|
|
25
26
|
free(self);
|
26
27
|
}
|
27
28
|
|
29
|
+
// Runs a buffer of raw pcm through the trimmer in one call.  The pcm is cut
// into segments by self->seg, each segment is enqueued, and every segment the
// trimmer releases is collected into the returned matrix (one row per
// segment).  Returns NULL when end of speech had already been reached before
// the call, or when this call released no segments.  pcm is not freed here;
// the returned matrix belongs to the caller.
NMatrix * speech_trimmer_apply(SpeechTrimmer *self, NMatrix1* pcm) {
  if (self->eos_reached)
    return NULL;

  NMatrix *segment_matrix = segmenter_apply(self->seg, pcm);
  // NOTE(review): presumably segmenter_apply can return NULL when pcm is too
  // short to form a segment -- confirm.  Treated here as "nothing to emit".
  if (segment_matrix == NULL)
    return NULL;

  // nmatrix_2_nmatrix1s frees segment_matrix, so the row count must be
  // captured first and segment_matrix must not be read afterwards.
  int centisecond_count = segment_matrix->rows;
  NMatrix1 **segments = nmatrix_2_nmatrix1s(segment_matrix);

  // A dequeue pass can drain segments queued by earlier calls, so the output
  // may need more than centisecond_count slots.  Size it for everything
  // already queued plus everything enqueued below (assumes
  // speech_trimmer_enqueue adds at most one entry per call -- TODO confirm).
  int capacity = n_list_size(self->queue) + centisecond_count;
  NMatrix1 **speech_segments =
      malloc(sizeof(NMatrix1*) * (capacity > 0 ? capacity : 1));
  int speech_count = 0, i;
  for (i=0; i<centisecond_count; ++i) {
    // Presumably the queue takes ownership of the segment here -- confirm.
    speech_trimmer_enqueue(self, segments[i]);
    NMatrix1 *centispeech = speech_trimmer_dequeue(self);
    while (centispeech != NULL) {
      speech_segments[speech_count++] = centispeech;
      centispeech = speech_trimmer_dequeue(self);
    }
    if (speech_trimmer_eos(self)) {
      ++i;  // segment i was handed to the trimmer; the ones after it were not
      break;
    }
  }

  // Segments after end of speech were never enqueued; release them and the
  // pointer table that held them.
  for (; i<centisecond_count; ++i)
    free_nmatrix1(segments[i]);
  free(segments);

  // nmatrix1_2_nmatrix returns NULL for an empty list and frees each wrapper
  // it consumes, but not the pointer array, so that is released here.
  NMatrix *speech = nmatrix1_2_nmatrix(speech_segments, speech_count);
  free(speech_segments);
  return speech;
}
|
54
|
+
|
55
|
+
|
28
56
|
void speech_trimmer_enqueue(SpeechTrimmer *self, NMatrix1* pcm) {
|
29
57
|
if (self->eos_reached)
|
30
58
|
return;
|
@@ -55,13 +83,14 @@ NMatrix1 * speech_trimmer_dequeue(SpeechTrimmer *self) {
|
|
55
83
|
if (n_list_size(self->queue) == 0)
|
56
84
|
return NULL;
|
57
85
|
if (self->eos_reached || (self->speech_started &&
|
58
|
-
|
86
|
+
n_list_size(self->queue) > self->ecs)) {
|
59
87
|
NMatrix1 * N = n_list_get(self->queue, 0);
|
60
88
|
n_list_remove(self->queue, 0, 1);
|
61
89
|
return N;
|
62
90
|
}
|
63
91
|
return NULL;
|
64
92
|
}
|
93
|
+
|
65
94
|
// Reports the trimmer's end-of-speech flag: returns self->eos_reached, which
// stays 0 (FALSE) until end of speech has been flagged.
int speech_trimmer_eos(SpeechTrimmer *self) {
  const int reached = self->eos_reached;
  return reached;
}
|
data/lib/c_impl/noyes.h
CHANGED
@@ -30,6 +30,8 @@ void free_nmatrix(NMatrix *);
|
|
30
30
|
|
31
31
|
NMatrix1 *new_nmatrix1(int rows);
|
32
32
|
void free_nmatrix1(NMatrix1 *);
|
33
|
+
NMatrix1 ** nmatrix_2_nmatrix1s(NMatrix *M);
|
34
|
+
NMatrix * nmatrix1_2_nmatrix(NMatrix1 **array, int size);
|
33
35
|
|
34
36
|
// Preemphasizer
|
35
37
|
typedef struct {
|
@@ -126,25 +128,6 @@ LiveCMN * new_live_cmn(int dimensions, double init_mean, int window_size, int sh
|
|
126
128
|
void free_live_cmn(LiveCMN *lcmn);
|
127
129
|
NMatrix *live_cmn_apply(LiveCMN *self, NMatrix *data);
|
128
130
|
|
129
|
-
|
130
|
-
// Fast 8k mfcc
|
131
|
-
// This strings together all the algorithms necessary to make mfcc's from an 8k
|
132
|
-
// signal so you don't have to.
|
133
|
-
typedef struct {
|
134
|
-
Preemphasizer *pre;
|
135
|
-
Segmenter *seg;
|
136
|
-
HammingWindow *ham;
|
137
|
-
PowerSpectrum *pow;
|
138
|
-
MelFilter *mel;
|
139
|
-
LogCompressor *log;
|
140
|
-
DiscreteCosineTransform *dct;
|
141
|
-
LiveCMN *cmn;
|
142
|
-
} Fast8kMfcc;
|
143
|
-
|
144
|
-
Fast8kMfcc* new_fast_8k_mfcc();
|
145
|
-
void free_fast_8k_mfcc(Fast8kMfcc *self);
|
146
|
-
NMatrix *fast_8k_mfcc_apply(Fast8kMfcc *self, NMatrix1 *data);
|
147
|
-
|
148
131
|
// Silence removal with BentCentMarker and SpeechTrimmer
|
149
132
|
typedef struct {
|
150
133
|
double adjustment;
|
@@ -171,15 +154,35 @@ typedef struct {
|
|
171
154
|
int scs;
|
172
155
|
int ecs;
|
173
156
|
BentCentMarker *bcm;
|
157
|
+
Segmenter *seg;
|
174
158
|
NList *queue;
|
175
159
|
int eos_reached;
|
176
160
|
} SpeechTrimmer;
|
177
|
-
|
161
|
+
|
178
162
|
// Allocates a SpeechTrimmer.  `frequency` is used to size the internal
// segmenter: new_segmenter is called with frequency/100 for both arguments.
SpeechTrimmer * new_speech_trimmer(int frequency);
|
179
163
|
void free_speech_trimmer(SpeechTrimmer *self);
|
180
164
|
void speech_trimmer_enqueue(SpeechTrimmer *self, NMatrix1* pcm);
|
181
165
|
NMatrix1 * speech_trimmer_dequeue(SpeechTrimmer *self);
|
182
166
|
int speech_trimmer_eos(SpeechTrimmer *self);
|
167
|
+
NMatrix * speech_trimmer_apply(SpeechTrimmer *self, NMatrix1* pcm);
|
168
|
+
|
169
|
+
// Fast 8k mfcc
|
170
|
+
// This strings together all the algorithms necessary to make mfcc's from an 8k
|
171
|
+
// signal so you don't have to.
|
172
|
+
typedef struct {
|
173
|
+
Preemphasizer *pre;
|
174
|
+
Segmenter *seg;
|
175
|
+
HammingWindow *ham;
|
176
|
+
PowerSpectrum *pow;
|
177
|
+
MelFilter *mel;
|
178
|
+
LogCompressor *log;
|
179
|
+
DiscreteCosineTransform *dct;
|
180
|
+
LiveCMN *cmn;
|
181
|
+
} Fast8kMfcc;
|
182
|
+
|
183
|
+
Fast8kMfcc* new_fast_8k_mfcc();
|
184
|
+
void free_fast_8k_mfcc(Fast8kMfcc *self);
|
185
|
+
NMatrix *fast_8k_mfcc_apply(Fast8kMfcc *self, NMatrix1 *data);
|
183
186
|
|
184
187
|
#ifdef __cplusplus
|
185
188
|
}
|
data/lib/c_impl/rnoyes.h
CHANGED
data/lib/c_impl/speech_trimmer.c
CHANGED
@@ -10,8 +10,14 @@ static void speech_trimmer_free(void *p) {
|
|
10
10
|
free_speech_trimmer(p);
|
11
11
|
}
|
12
12
|
|
13
|
-
static VALUE t_init(VALUE self) {
|
14
|
-
|
13
|
+
static VALUE t_init(VALUE self, VALUE args) {
|
14
|
+
int len = RARRAY_LEN(args);
|
15
|
+
SpeechTrimmer *st;
|
16
|
+
if (len == 1)
|
17
|
+
st = new_speech_trimmer(NUM2INT(rb_ary_entry(args, 0)));
|
18
|
+
else
|
19
|
+
st = new_speech_trimmer(16000);
|
20
|
+
|
15
21
|
VALUE stv = Data_Wrap_Struct(cSpeechTrimmer, 0, speech_trimmer_free, st);
|
16
22
|
rb_iv_set(self, "@speech_trimmer", stv);
|
17
23
|
return self;
|
@@ -38,15 +44,31 @@ static VALUE t_eos(VALUE self) {
|
|
38
44
|
SpeechTrimmer *st;
|
39
45
|
VALUE stv = rb_iv_get(self, "@speech_trimmer");
|
40
46
|
Data_Get_Struct(stv, SpeechTrimmer, st);
|
41
|
-
return
|
47
|
+
return speech_trimmer_eos(st) ? Qtrue : Qfalse;
|
48
|
+
}
|
49
|
+
|
50
|
+
// Ruby binding for SpeechTrimmer#<<.  Converts obj to an NMatrix1, passes it
// to speech_trimmer_apply, and returns the result converted to a Ruby value,
// or nil when speech_trimmer_apply returns NULL.  Both temporaries created
// here are released before returning.
static VALUE t_left_shift(VALUE self, VALUE obj) {
  NMatrix1 *pcm = v_2_nmatrix1(obj);
  SpeechTrimmer *trimmer;
  Data_Get_Struct(rb_iv_get(self, "@speech_trimmer"), SpeechTrimmer, trimmer);

  NMatrix *speech = speech_trimmer_apply(trimmer, pcm);
  VALUE result = Qnil;
  if (speech != NULL) {
    result = nmatrix_2_v(speech);
    free_nmatrix(speech);
  }
  free_nmatrix1(pcm);
  return result;
}
|
43
64
|
|
44
65
|
void Init_speech_trimmer() {
|
45
66
|
VALUE m_noyes_c = rb_define_module("NoyesC");
|
46
67
|
cSpeechTrimmer = rb_define_class_under(m_noyes_c, "SpeechTrimmer", rb_cObject);
|
47
|
-
rb_define_method(cSpeechTrimmer, "initialize", t_init,
|
68
|
+
rb_define_method(cSpeechTrimmer, "initialize", t_init, -2);
|
48
69
|
rb_define_method(cSpeechTrimmer, "enqueue", t_enqueue, 1);
|
49
70
|
rb_define_method(cSpeechTrimmer, "dequeue", t_dequeue, 0);
|
50
71
|
rb_define_method(cSpeechTrimmer, "eos?", t_eos, 0);
|
72
|
+
rb_define_method(cSpeechTrimmer, "<<", t_left_shift, 1);
|
51
73
|
id_push = rb_intern("push");
|
52
74
|
}
|
data/lib/common/noyes_dsl.rb
CHANGED
@@ -1,17 +1,27 @@
|
|
1
1
|
class Array
  # Pipes this array into +filter+, which may be anything that implements
  # the '<<' operator.  The return value is whatever +filter+ returns.
  def >>(filter)
    filter.<<(self)
  end
end
|
7
8
|
|
8
9
|
# This portion is still highly experimental. It allows filters to be combined
|
9
10
|
# in complicated ways using a syntax similar to Backus Naur Form.
|
10
11
|
module NoyesFilterDSL
|
12
|
+
# Combines two filters into a single serial filter.  That is, A + B yields a
# filter S such that filtering through S is identical to filtering through A
# and then through B.  When +other+ is already a SerialFilter, a copy of its
# member filters is spliced in rather than nesting the SerialFilter itself.
def + other
  other_filters = other.kind_of?(SerialFilter) ? other.filters.clone : other
  # Use the unpacked list; previously it was computed and then ignored.
  SerialFilter.new [self, other_filters].flatten
end
|
19
|
+
|
20
|
+
# Combines two filters into a single parallel filter. That is A | B creates
|
21
|
+
# a new filter P such that filtering through P is identical to filtering row
|
22
|
+
# 0 of an array through filter A and row 1 of an array through filter B.
|
23
|
+
# Typically P would be used with an array of arrays. This filter can be used
|
24
|
+
# with more than two filters.
|
15
25
|
def | other
|
16
26
|
other_filters = other.kind_of?(ParallelFilter) ? other.filtes.clone : other
|
17
27
|
ParallelFilter.new [self, other].flatten
|
@@ -1,7 +1,11 @@
|
|
1
1
|
module NoyesJava
|
2
2
|
class SpeechTrimmer
|
3
|
-
def initialize
|
4
|
-
@st = Java::talkhouse.SpeechTrimmer.new
|
3
|
+
# Wraps a Java talkhouse.SpeechTrimmer built for the given frequency
# (16000 when omitted).
def initialize(frequency = 16000)
  @st = Java::talkhouse.SpeechTrimmer.new(frequency)
end
|
6
|
+
# Converts pcm to a Java double array, hands it to the Java trimmer's apply,
# and returns the result as a Ruby array, or nil when apply returns nothing.
def << pcm
  trimmed = @st.apply(pcm.to_java(Java::double))
  trimmed ? trimmed.to_a : nil
end
|
6
10
|
def enqueue pcm
|
7
11
|
@st.enqueue pcm.to_java(Java::double)
|
@@ -1,7 +1,9 @@
|
|
1
1
|
module Noyes
|
2
2
|
# Determines whether a PCM frame is speech or not using Bent
|
3
3
|
# Schmidt-Nielsen's algorithm. Basically, it's an energy-based detector
|
4
|
-
# where the background noise level is constantly estimated.
|
4
|
+
# where the background noise level is constantly estimated. You probably
|
5
|
+
# don't want to use this class directly. Most of the time you'll want
|
6
|
+
# to use SpeechTrimmer, which uses this class.
|
5
7
|
#
|
6
8
|
# The pcm data should be in 10 millisecond (centisecond) chunks.  For example,
|
7
9
|
# At 8000 Hz there should be 80 frames of pcm.
|
@@ -14,6 +16,8 @@ module Noyes
|
|
14
16
|
@min_signal = 0.0
|
15
17
|
@threshold = 10.0
|
16
18
|
end
|
19
|
+
|
20
|
+
# Take the log rms of an array of pcm values.
|
17
21
|
def logrms pcm
|
18
22
|
sum_of_squares = 0.0
|
19
23
|
pcm.each {|sample| sum_of_squares += sample * sample}
|
@@ -21,6 +25,9 @@ module Noyes
|
|
21
25
|
rms = Math.max rms, 1
|
22
26
|
Math.log(rms) * 20
|
23
27
|
end
|
28
|
+
|
29
|
+
# Takes a centisecond worth of pcm values and indicates whether it looks
|
30
|
+
# like speech. This information is typically used by SpeechTrimmer.
|
24
31
|
def << pcm
|
25
32
|
is_speech = false
|
26
33
|
current = logrms pcm
|
@@ -4,7 +4,7 @@ module Noyes
|
|
4
4
|
# returns nil. Then check for eos. If eos is true you are done.
|
5
5
|
# SpeechTrimmer is designed to work efficiently with live audio.
|
6
6
|
class SpeechTrimmer
|
7
|
-
def initialize
|
7
|
+
def initialize frequency=16000
|
8
8
|
@leader = 5 # Cents of leading silence to retain.
|
9
9
|
@trailer = 5 # Cents of trailing silence to retain.
|
10
10
|
@speech_started = false
|
@@ -15,6 +15,19 @@ module Noyes
|
|
15
15
|
@eos_reached = false
|
16
16
|
@scs = 20 # Centiseconds of speech before detection of utterance.
|
17
17
|
@ecs = 50 # Centiseconds of silence before end detection.
|
18
|
+
@segmenter = Segmenter.new(frequency/100, frequency/100)
|
19
|
+
end
|
20
|
+
|
21
|
+
# Segments pcm, enqueues each segment, and collects every segment the trimmer
# releases.  Returns nil if end of speech was already reached before the
# call; otherwise returns the (possibly empty) array of released segments.
# Stops consuming segments as soon as end of speech is detected.
def << pcm
  return if eos?
  released = []
  (@segmenter << pcm).each do |centisec|
    enqueue centisec unless eos?
    while (segment = dequeue)
      released << segment
    end
    break if eos?
  end
  released
end
|
19
32
|
|
20
33
|
def enqueue pcm
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: noyes
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Woelfel
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-06-
|
12
|
+
date: 2010-06-30 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,7 +22,9 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: 1.0.0
|
24
24
|
version:
|
25
|
-
description:
|
25
|
+
description: |-
|
26
|
+
A fast portable signal processing library sufficient for creating features for
|
27
|
+
speech recognition, etc.
|
26
28
|
email: joe@talkhouse.com
|
27
29
|
executables:
|
28
30
|
- mock_noyes_server
|
@@ -34,7 +36,6 @@ extensions:
|
|
34
36
|
extra_rdoc_files:
|
35
37
|
- COPYING
|
36
38
|
- FAQ
|
37
|
-
- README
|
38
39
|
files:
|
39
40
|
- VERSION
|
40
41
|
- lib/c_impl/array_list.c
|
@@ -108,10 +109,8 @@ files:
|
|
108
109
|
- lib/ruby_impl/preemphasis.rb
|
109
110
|
- lib/ruby_impl/segment.rb
|
110
111
|
- lib/ruby_impl/speech_trimmer.rb
|
111
|
-
- ship/noyes.jar
|
112
112
|
- COPYING
|
113
113
|
- FAQ
|
114
|
-
- README
|
115
114
|
has_rdoc: true
|
116
115
|
homepage: http://github.com/talkhouse/noyes
|
117
116
|
licenses: []
|
data/README
DELETED
@@ -1,171 +0,0 @@
|
|
1
|
-
Noyes is a signal processing library. It currently has just enough signal
|
2
|
-
processing to produce features suitable for speech recognition.
|
3
|
-
|
4
|
-
Pronunciation: Typically pronounced the same as 'noise'. But "NO!... YES!" is
|
5
|
-
considered acceptable if you say it with sufficient conviction to make people
|
6
|
-
think you have truly changed your mind.
|
7
|
-
|
8
|
-
Noyes is a general purpose signal processing tool that is flexible enough for
|
9
|
-
many purposes. However, it exists because there is a need for low-latency high
|
10
|
-
quality speech recognition on portable wireless devices. The most powerful
|
11
|
-
speech recognizers are very large with huge models running on powerful cloud
|
12
|
-
based systems. But transmitting raw audio to these recognizers creates too
|
13
|
-
much latency because raw audio uses too much bandwidth. By sending compressed
|
14
|
-
features instead of raw audio the bandwidth can be greatly reduced without
|
15
|
-
compromising recognition accuracy. In some cases the effect of inadequate
|
16
|
-
bandwidth on latency can be reduced to zero.
|
17
|
-
|
18
|
-
Because handsets require different implementations, the Noyes library is
|
19
|
-
designed to quickly and efficiently work with and develop multiple underlying
|
20
|
-
implementations. All implementations are accessible via a high level dynamic
|
21
|
-
language that includes a very expressive domain specific language for handling
|
22
|
-
signal processing routines. In addition, all implementations share unit tests
|
23
|
-
written in a high level dynamic language.
|
24
|
-
|
25
|
-
Noyes is implemented entirely in Ruby. It's also implemented entirely in Java.
|
26
|
-
The Java version has Ruby bindings too. So you can have Java's speed from
|
27
|
-
Ruby. If you need a pure Java version you can use the generated jar. There is
|
28
|
-
a lot of flexibility without a lot of overhead. All versions share the same
|
29
|
-
unit tests, which are written in Ruby.
|
30
|
-
|
31
|
-
The design goal is to have signal processing routines that are so simple and so
|
32
|
-
disentangled from the overall system that anyone could extract any of the
|
33
|
-
routines and use them elsewhere with little trouble. Benchmarks are included.
|
34
|
-
|
35
|
-
This library places an emphasis on expressiveness without sacrificing ultimate
|
36
|
-
performance. It does so by supporting multiple implementations each with Ruby
|
37
|
-
bindings. The pure Ruby version, while not fast, is often adequate for
|
38
|
-
development and is the best place to add new routines.
|
39
|
-
|
40
|
-
For examples of how to link with different implementations see the test section
|
41
|
-
of the Rakefile. At present only the pure Ruby implementation is exposed via
|
42
|
-
the gem.
|
43
|
-
|
44
|
-
Requirements:
|
45
|
-
Almost any version of ruby & rake.
|
46
|
-
Java, if you want to use the Java implementation instead of the default pure
|
47
|
-
ruby implementation.
|
48
|
-
|
49
|
-
Some of the utility scripts such as nrec and jrec may use sox, but
|
50
|
-
none of the core routines use it.
|
51
|
-
|
52
|
-
Build instructions
|
53
|
-
rake -T
|
54
|
-
|
55
|
-
|
56
|
-
= USAGE
|
57
|
-
|
58
|
-
All signal processing routines use a simple DSL style interface. Below are some
|
59
|
-
examples.
|
60
|
-
|
61
|
-
== Filter operator example.
|
62
|
-
The '>>=' operator is called the filter operator. It modifies the data on the
|
63
|
-
left using the filter on the right. This is similar to the way the += operator
|
64
|
-
works for numbers. Note that the >>= actually looks like a filter making it easy
|
65
|
-
to remember.
|
66
|
-
|
67
|
-
require 'noyes'
|
68
|
-
data = (1..12).to_a # An array of nonsense data.
|
69
|
-
segmenter = Segmenter.new 4, 2 # window size, window shift
|
70
|
-
hamming_filter = HammingWindow.new 4 # window size
|
71
|
-
power_spec_filter = PowerSpectrumFilter.new 8 # number of ffts
|
72
|
-
|
73
|
-
data >>= segmenter
|
74
|
-
data >>= hamming_filter
|
75
|
-
data >>= power_spec_filter
|
76
|
-
data >>= dct_filter
|
77
|
-
|
78
|
-
You can expand the >>= operator out, but I think the flow is worse and there is
|
79
|
-
more repetition, particularly when you have a lot of filters in sequence. This
|
80
|
-
is perfectly valid syntax though. Also, this is very useful if you don't want
|
81
|
-
to keep a reference to your original data.
|
82
|
-
|
83
|
-
require 'noyes'
|
84
|
-
pcm_data = (1..12).to_a
|
85
|
-
segmenter = Segmenter.new
|
86
|
-
hamming_filter = HammingWindow.new 4
|
87
|
-
segmented_data = segmenter << pcm_data, 4, 2
|
88
|
-
hamming_data = hamming_filter << segmented_data
|
89
|
-
power_spectrum_data = power_spec_filter << hamming_data
|
90
|
-
dct_data = dct_filter << power_spectrum_data
|
91
|
-
|
92
|
-
== Advanced filter DSLs
|
93
|
-
For most things, the filter operator is simple, easy to remember, and
|
94
|
-
very concise. But sometimes you want to build more elaborate combinations
|
95
|
-
of filters and use them as if you had a single filter. In this case
|
96
|
-
making a new class for every possible combination creates an explosion
|
97
|
-
of new classes and a maintenance nightmare. Instead, there is a simple
|
98
|
-
graph notation you can use to combine filters. In the following example
|
99
|
-
we'll combine all the filters from a previous example and then use them
|
100
|
-
as if they were a single filter.
|
101
|
-
|
102
|
-
serial_filter = segmenter & hamming_filter & power_spec_filter & dct_filter
|
103
|
-
data >>= serial_filter
|
104
|
-
|
105
|
-
It's also possible to take parallel data streams and pipe them through
|
106
|
-
parallel filters as if you had only one data stream and only one filter.
|
107
|
-
|
108
|
-
data = [stream_1,stream_2]
|
109
|
-
parallel_filter = filter_1 | filter_2
|
110
|
-
data >>= parallel_filter
|
111
|
-
|
112
|
-
It is not necessary for the data to be synchronous when using parallel filters.
|
113
|
-
When using parallel filters the number of elements going through one filter
|
114
|
-
does not have to equal the number of elements going through the second filter.
|
115
|
-
|
116
|
-
You can see that you can make arbitrarily complicated graphs of filters by
|
117
|
-
combined use of the '&' and '|' operators. Almost identical notation is used
|
118
|
-
to specify graphs for context free grammars. Keep in mind that '&' takes
|
119
|
-
precedence over '|'. In the example below stream 1 goes through filter 1 and
|
120
|
-
filter 2 while stream 2 goes through filters 3, 4, and 5.
|
121
|
-
|
122
|
-
parallel_data = [stream_1,stream_2]
|
123
|
-
big_filter = filter_1 & filter_2 | filter_3 & filter_4 & filter_5
|
124
|
-
parallel_data >>= big_filter
|
125
|
-
|
126
|
-
== Command Line Utilities
|
127
|
-
|
128
|
-
The program nrec will process almost any format of audio file into speech
|
129
|
-
features and send the data to a cloud hosted speech recognizer. The resulting
|
130
|
-
transcript will be sent back and printed out. The nrec program uses whatever
|
131
|
-
version of Ruby is on the path of your current environment. It is compatible
|
132
|
-
with ruby 1.9, ruby 1.8.x, and JRuby. When run under JRuby it can
|
133
|
-
optionally use a Java implementation, which is very fast. See nrec --help for
|
134
|
-
more information.
|
135
|
-
|
136
|
-
== Assessing Performance for Wireless Devices
|
137
|
-
|
138
|
-
It's important to note that the performance characteristics of live data and
|
139
|
-
recorded data are different. Any delay experienced by a user starts from the
|
140
|
-
time they stop speaking. In contrast, any delay experienced when processing a
|
141
|
-
file starts from the time a file starts processing. For that reason file
|
142
|
-
processing always seems slower. Modern recognizers are easily capable of
|
143
|
-
exceeding real time performance so that is not a factor. The delay experienced
|
144
|
-
by a user is typically due to the time required to transmit the audio to the
|
145
|
-
recognizer and the time required to detect end of utterance, assuming end of
|
146
|
-
utterance detection is used.
|
147
|
-
|
148
|
-
If end of utterance detection is used the recognizer must wait until it has
|
149
|
-
sufficient evidence to be reasonably sure the user has stopped talking. This
|
150
|
-
could mean that a suitable period of silence has passed which means the user
|
151
|
-
incurs a slight but unavoidable delay. End of utterance detection also could
|
152
|
-
mean the grammar or language model does not allow for any other reasonable
|
153
|
-
possibility even if more data were available, which may mean no delay at all
|
154
|
-
(or even a negative delay in some cases).
|
155
|
-
|
156
|
-
If the bandwidth of the network is low enough, which is often the case for the
|
157
|
-
data channel of portable wireless handsets, it will take time for raw
|
158
|
-
uncompressed audio to traverse the network. By computing features on the
|
159
|
-
handset it is possible to have significant reduction in bandwidth requirements
|
160
|
-
eliminating much of the latency. These features in turn may then be compressed
|
161
|
-
for further bandwidth reduction. This method exceeds what is possible with
|
162
|
-
alternative methods of audio compression. Further, it eliminates many of the
|
163
|
-
distortion components that may compromise recognition accuracy.
|
164
|
-
|
165
|
-
If all you want is a rough feeling of how responsive speech recognition will be
|
166
|
-
over your network try speaking an utterance at the same time you enter a
|
167
|
-
command to have a prerecorded utterance recognized. You'll probably be
|
168
|
-
surprised by how quickly the network is able to respond. You may find that the
|
169
|
-
Java implementation feels like instant response even though it takes time for
|
170
|
-
the JVM to launch. Ruby 1.9 is actually surprisingly quick on a reasonably
|
171
|
-
powerful laptop.
|
data/ship/noyes.jar
DELETED
Binary file
|