sip-lab 1.27.1 → 1.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -1
- package/binding.gyp +8 -0
- package/build_deps.sh +9 -0
- package/package.json +1 -1
- package/prebuilds/linux-x64/sip-lab.node +0 -0
- package/samples/play_wav_and_speech_recog.bad_transcript.pcmu8000.js +4 -4
- package/samples/speech_synth_and_recog.speex16000.js +6 -6
- package/samples/text_to_speech.js +5 -12
- package/src/event_templates.cpp +11 -4
- package/src/event_templates.hpp +4 -2
- package/src/pjmedia/include/pjmedia/flite_port.h +2 -2
- package/src/pjmedia/include/pjmedia/ws_speech_port.h +37 -0
- package/src/pjmedia/src/pjmedia/flite_port.c +16 -11
- package/src/pjmedia/src/pjmedia/ws_speech_port.cpp +377 -0
- package/src/sip.cpp +311 -95
package/README.md
CHANGED
|
@@ -14,10 +14,10 @@ It permits to:
|
|
|
14
14
|
- send/receive audio using SRTP
|
|
15
15
|
- do speech synth using flite
|
|
16
16
|
- do speech recog using pocketsphinx (but only works well with sampling rate of 16000)
|
|
17
|
+
- do speech synth/recog using [ws_speech_server](https://github.com/MayamaTakeshi/ws_speech_server) (this makes it possible to use Google/Amazon/Azure/etc. speech services)
|
|
17
18
|
|
|
18
19
|
TODO:
|
|
19
20
|
- add support for video playing/recording from/to file
|
|
20
|
-
- add support for speech synth/recog via websocket server to permit to access Google Speech, Whisper, Amazon Poly etc.
|
|
21
21
|
- add support for T.38 fax
|
|
22
22
|
- add support for SIP over WebSocket
|
|
23
23
|
- add support for WebRTC
|
|
@@ -47,6 +47,34 @@ The above script has detailed comments.
|
|
|
47
47
|
|
|
48
48
|
Please read it to understand how to write your own test scripts.
|
|
49
49
|
|
|
50
|
+
|
|
51
|
+
### Samples
|
|
52
|
+
|
|
53
|
+
See general sample scripts in folder samples.
|
|
54
|
+
|
|
55
|
+
There are additional sample scripts in folder samples_extra, but they require [ws_speech_server](https://github.com/MayamaTakeshi/ws_speech_server) to be running locally (and it should be started with GOOGLE_APPLICATION_CREDENTIALS set).
|
|
56
|
+
|
|
57
|
+
To run ws_speech_server, do this:
|
|
58
|
+
```
|
|
59
|
+
git clone https://github.com/MayamaTakeshi/ws_speech_server
|
|
60
|
+
cd ws_speech_server
|
|
61
|
+
npm i
|
|
62
|
+
npm run build
|
|
63
|
+
cp config/default.js.sample config/default.js
|
|
64
|
+
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/credentials/file
|
|
65
|
+
node src/App.bs.js
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Then you should be able to test with dtmf language:
|
|
69
|
+
```
|
|
70
|
+
node samples_extra/ws_speech_server.dtmf.js
|
|
71
|
+
```
|
|
72
|
+
or with google speech service:
|
|
73
|
+
```
|
|
74
|
+
node samples_extra/ws_speech_server.google.js
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
|
|
50
78
|
### About the code
|
|
51
79
|
|
|
52
80
|
Although the code is written in *.cpp/*.hpp named files, this is not actually a C++ project.
|
package/binding.gyp
CHANGED
|
@@ -19,10 +19,12 @@
|
|
|
19
19
|
"3rdParty/spandsp/src",
|
|
20
20
|
"3rdParty/pocketsphinx/include",
|
|
21
21
|
"3rdParty/pocketsphinx/build/include",
|
|
22
|
+
"3rdParty/pjwebsock/websock",
|
|
22
23
|
"<!@(node -p \"require('node-addon-api').include\")",
|
|
23
24
|
],
|
|
24
25
|
'conditions': [
|
|
25
26
|
[ 'OS!="win"', {
|
|
27
|
+
'cflags': ['-g'],
|
|
26
28
|
'cflags_cc': [
|
|
27
29
|
'-g',
|
|
28
30
|
'-fexceptions',
|
|
@@ -120,6 +122,12 @@
|
|
|
120
122
|
'src/pjmedia/src/pjmedia/fax_port.c',
|
|
121
123
|
'src/pjmedia/src/pjmedia/flite_port.c',
|
|
122
124
|
'src/pjmedia/src/pjmedia/pocketsphinx_port.c',
|
|
125
|
+
'src/pjmedia/src/pjmedia/ws_speech_port.cpp',
|
|
126
|
+
'3rdParty/pjwebsock/websock/http.c',
|
|
127
|
+
'3rdParty/pjwebsock/websock/websock_transport_tcp.c',
|
|
128
|
+
'3rdParty/pjwebsock/websock/websock_transport_tls.c',
|
|
129
|
+
'3rdParty/pjwebsock/websock/websock.c',
|
|
130
|
+
'3rdParty/pjwebsock/websock/websock_transport.c',
|
|
123
131
|
],
|
|
124
132
|
},
|
|
125
133
|
],
|
package/build_deps.sh
CHANGED
|
@@ -102,6 +102,15 @@ then
|
|
|
102
102
|
fi
|
|
103
103
|
|
|
104
104
|
|
|
105
|
+
cd $START_DIR/3rdParty
|
|
106
|
+
if [[ ! -d pjwebsock ]]
|
|
107
|
+
then
|
|
108
|
+
git clone https://github.com/jimying/pjwebsock
|
|
109
|
+
cd pjwebsock
|
|
110
|
+
git checkout a0616ea27f01d5e3bdfd5b801fb1499473a0b0cb
|
|
111
|
+
fi
|
|
112
|
+
|
|
113
|
+
|
|
105
114
|
#cd $START_DIR/3rdParty
|
|
106
115
|
#if [[ ! -d openssl ]]
|
|
107
116
|
#then
|
package/package.json
CHANGED
|
Binary file
|
|
@@ -115,7 +115,7 @@ async function test() {
|
|
|
115
115
|
sip.call.start_speech_recog(oc.id)
|
|
116
116
|
sip.call.start_speech_recog(ic.id)
|
|
117
117
|
|
|
118
|
-
await z.sleep(
|
|
118
|
+
await z.sleep(200)
|
|
119
119
|
|
|
120
120
|
sip.call.start_play_wav(oc.id, {file: 'samples/artifacts/hello_good_morning.wav', end_of_file_event: true, no_loop: true})
|
|
121
121
|
sip.call.start_play_wav(ic.id, {file: 'samples/artifacts/hello_good_morning.wav', end_of_file_event: true, no_loop: true})
|
|
@@ -130,16 +130,16 @@ async function test() {
|
|
|
130
130
|
call_id: oc.id,
|
|
131
131
|
},
|
|
132
132
|
{
|
|
133
|
-
event: '
|
|
133
|
+
event: 'speech',
|
|
134
134
|
call_id: oc.id,
|
|
135
135
|
//transcript: 'hello good morning', // bad transcript (will not match)
|
|
136
136
|
},
|
|
137
137
|
{
|
|
138
|
-
event: '
|
|
138
|
+
event: 'speech',
|
|
139
139
|
call_id: ic.id,
|
|
140
140
|
//transcript: 'hello good morning', // bad transcript (will not match)
|
|
141
141
|
},
|
|
142
|
-
],
|
|
142
|
+
], 5000)
|
|
143
143
|
|
|
144
144
|
sip.call.stop_record_wav(oc.id)
|
|
145
145
|
sip.call.stop_record_wav(ic.id)
|
|
@@ -117,25 +117,25 @@ async function test() {
|
|
|
117
117
|
|
|
118
118
|
await z.sleep(100)
|
|
119
119
|
|
|
120
|
-
sip.call.start_speech_synth(oc.id, {voice: 'kal16', text: 'Good morning.'
|
|
121
|
-
sip.call.start_speech_synth(ic.id, {voice: 'kal16', text: 'How are you?'
|
|
120
|
+
sip.call.start_speech_synth(oc.id, {voice: 'kal16', text: 'Good morning.'})
|
|
121
|
+
sip.call.start_speech_synth(ic.id, {voice: 'kal16', text: 'How are you?'})
|
|
122
122
|
|
|
123
123
|
await z.wait([
|
|
124
124
|
{
|
|
125
|
-
event: '
|
|
125
|
+
event: 'speech_synth_complete',
|
|
126
126
|
call_id: ic.id,
|
|
127
127
|
},
|
|
128
128
|
{
|
|
129
|
-
event: '
|
|
129
|
+
event: 'speech_synth_complete',
|
|
130
130
|
call_id: oc.id,
|
|
131
131
|
},
|
|
132
132
|
{
|
|
133
|
-
event: '
|
|
133
|
+
event: 'speech',
|
|
134
134
|
call_id: oc.id,
|
|
135
135
|
transcript: 'how are you',
|
|
136
136
|
},
|
|
137
137
|
{
|
|
138
|
-
event: '
|
|
138
|
+
event: 'speech',
|
|
139
139
|
call_id: ic.id,
|
|
140
140
|
transcript: 'good morning',
|
|
141
141
|
},
|
|
@@ -130,26 +130,19 @@ async function test() {
|
|
|
130
130
|
},
|
|
131
131
|
], 3000)
|
|
132
132
|
|
|
133
|
-
sip.call.start_speech_synth(oc.id, {voice: 'slt', text: 'Hello World.'
|
|
134
|
-
sip.call.start_speech_synth(ic.id, {voice: 'kal', text: 'How are you?'
|
|
133
|
+
sip.call.start_speech_synth(oc.id, {voice: 'slt', text: 'Hello World.'})
|
|
134
|
+
sip.call.start_speech_synth(ic.id, {voice: 'kal', text: 'How are you?'})
|
|
135
135
|
|
|
136
136
|
await z.wait([
|
|
137
137
|
{
|
|
138
|
-
event: '
|
|
138
|
+
event: 'speech_synth_complete',
|
|
139
139
|
call_id: ic.id,
|
|
140
140
|
},
|
|
141
141
|
{
|
|
142
|
-
event: '
|
|
142
|
+
event: 'speech_synth_complete',
|
|
143
143
|
call_id: oc.id,
|
|
144
144
|
},
|
|
145
|
-
],
|
|
146
|
-
|
|
147
|
-
await z.wait([
|
|
148
|
-
{
|
|
149
|
-
event: 'end_of_speech',
|
|
150
|
-
call_id: oc.id,
|
|
151
|
-
},
|
|
152
|
-
], 2000)
|
|
145
|
+
], 3000)
|
|
153
146
|
|
|
154
147
|
sip.call.stop_speech_synth(oc.id) // this is not actually necessary. It is used just to confirm the command works
|
|
155
148
|
sip.call.stop_speech_synth(ic.id) // this is not actually necessary. It is used just to confirm the command works
|
package/src/event_templates.cpp
CHANGED
|
@@ -105,16 +105,16 @@ int make_evt_end_of_file(char *dest, int size, long call_id) {
|
|
|
105
105
|
"{\"event\": \"end_of_file\", \"call_id\": %ld}", call_id);
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
-
int
|
|
108
|
+
int make_evt_speech_synth_complete(char *dest, int size, long call_id) {
|
|
109
109
|
return snprintf(
|
|
110
110
|
dest, size,
|
|
111
|
-
"{\"event\": \"
|
|
111
|
+
"{\"event\": \"speech_synth_complete\", \"call_id\": %ld}", call_id);
|
|
112
112
|
}
|
|
113
113
|
|
|
114
|
-
int
|
|
114
|
+
int make_evt_speech(char *dest, int size, long call_id, char* transcript) {
|
|
115
115
|
return snprintf(
|
|
116
116
|
dest, size,
|
|
117
|
-
"{\"event\": \"
|
|
117
|
+
"{\"event\": \"speech\", \"call_id\": %ld, \"transcript\": \"%s\"}", call_id, transcript);
|
|
118
118
|
}
|
|
119
119
|
|
|
120
120
|
int make_evt_tcp_msg(char *dest, int size, long call_id, const char *protocol, char *data, int data_len) {
|
|
@@ -122,3 +122,10 @@ int make_evt_tcp_msg(char *dest, int size, long call_id, const char *protocol, c
|
|
|
122
122
|
dest, size,
|
|
123
123
|
"{\"event\": \"%s_msg\", \"call_id\": %ld}\n%.*s", protocol, call_id, data_len, data);
|
|
124
124
|
}
|
|
125
|
+
|
|
126
|
+
int make_evt_ws_speech_event(char *dest, int size, long call_id, char *data, int data_len) {
|
|
127
|
+
return snprintf(
|
|
128
|
+
dest, size,
|
|
129
|
+
"{\"event\": \"ws_speech_event\", \"call_id\": %ld, \"data\": %.*s}", call_id, data_len, data);
|
|
130
|
+
}
|
|
131
|
+
|
package/src/event_templates.hpp
CHANGED
|
@@ -36,10 +36,12 @@ int make_evt_fax_result(char *dest, int size, long call_id, int result);
|
|
|
36
36
|
|
|
37
37
|
int make_evt_end_of_file(char *dest, int size, long call_id);
|
|
38
38
|
|
|
39
|
-
int
|
|
39
|
+
int make_evt_speech_synth_complete(char *dest, int size, long call_id);
|
|
40
40
|
|
|
41
|
-
int
|
|
41
|
+
int make_evt_speech(char *dest, int size, long call_id, char* transcript);
|
|
42
42
|
|
|
43
43
|
int make_evt_tcp_msg(char *dest, int size, long call_id, const char *protocol, char *data, int data_len);
|
|
44
44
|
|
|
45
|
+
int make_evt_ws_speech_event(char *dest, int size, long call_id, char *data, int data_len);
|
|
46
|
+
|
|
45
47
|
#endif
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
PJ_BEGIN_DECL
|
|
7
7
|
|
|
8
|
-
enum
|
|
8
|
+
enum pjmedia_flite_option
|
|
9
9
|
{
|
|
10
10
|
PJMEDIA_SPEECH_NO_LOOP = 1
|
|
11
11
|
};
|
|
@@ -25,7 +25,7 @@ PJ_DEF(pj_status_t) pjmedia_flite_port_set_eof_cb(pjmedia_port *port,
|
|
|
25
25
|
|
|
26
26
|
PJ_DEF(pj_status_t) pjmedia_flite_port_speak( pjmedia_port *port,
|
|
27
27
|
const char *text,
|
|
28
|
-
|
|
28
|
+
int times);
|
|
29
29
|
|
|
30
30
|
PJ_END_DECL
|
|
31
31
|
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#ifndef __WS_SPEECH_PORT_H__
|
|
2
|
+
#define __WS_SPEECH_PORT_H__
|
|
3
|
+
|
|
4
|
+
#include <pjmedia/port.h>
|
|
5
|
+
#include "websock.h"
|
|
6
|
+
|
|
7
|
+
PJ_BEGIN_DECL
|
|
8
|
+
|
|
9
|
+
enum ws_speech_event
|
|
10
|
+
{
|
|
11
|
+
WS_SPEECH_EVENT_CONNECTED,
|
|
12
|
+
WS_SPEECH_EVENT_CONNECTION_ERROR,
|
|
13
|
+
WS_SPEECH_EVENT_DISCONNECTED,
|
|
14
|
+
WS_SPEECH_EVENT_TEXT_MSG
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
PJ_DEF(pj_status_t) pjmedia_ws_speech_port_create( pj_pool_t *pool,
|
|
18
|
+
unsigned clock_rate,
|
|
19
|
+
unsigned channel_count,
|
|
20
|
+
unsigned samples_per_frame,
|
|
21
|
+
unsigned bits_per_sample,
|
|
22
|
+
pj_websock_endpoint *ws_endpt,
|
|
23
|
+
const char *server_url,
|
|
24
|
+
const char *ss_engine,
|
|
25
|
+
const char *ss_voice,
|
|
26
|
+
const char *ss_language,
|
|
27
|
+
const char *ss_text,
|
|
28
|
+
int ss_times,
|
|
29
|
+
const char *sr_engine,
|
|
30
|
+
const char *sr_language,
|
|
31
|
+
void (*cb)(pjmedia_port*, void *user_data, enum ws_speech_event, char *data, int len),
|
|
32
|
+
void *cb_user_data,
|
|
33
|
+
pjmedia_port **p_port);
|
|
34
|
+
|
|
35
|
+
PJ_END_DECL
|
|
36
|
+
|
|
37
|
+
#endif /* __WS_SPEECH_PORT_H__ */
|
|
@@ -53,7 +53,6 @@ static struct {
|
|
|
53
53
|
|
|
54
54
|
struct flite_t {
|
|
55
55
|
struct pjmedia_port base;
|
|
56
|
-
unsigned options;
|
|
57
56
|
|
|
58
57
|
cst_voice *v;
|
|
59
58
|
unsigned written_samples;
|
|
@@ -61,6 +60,8 @@ struct flite_t {
|
|
|
61
60
|
|
|
62
61
|
pj_bool_t subscribed;
|
|
63
62
|
void (*cb)(pjmedia_port*, void*);
|
|
63
|
+
|
|
64
|
+
int times;
|
|
64
65
|
};
|
|
65
66
|
|
|
66
67
|
#define free_wave(w) if (w) {delete_wave(w) ; w = NULL; }
|
|
@@ -112,6 +113,7 @@ PJ_DEF(pj_status_t) pjmedia_flite_port_create( pj_pool_t *pool,
|
|
|
112
113
|
const char *voice,
|
|
113
114
|
pjmedia_port **p_port)
|
|
114
115
|
{
|
|
116
|
+
printf("pjmedia_flite_port_create\n");
|
|
115
117
|
struct flite_t *flite;
|
|
116
118
|
const pj_str_t name = pj_str("flite_data");
|
|
117
119
|
|
|
@@ -164,13 +166,14 @@ PJ_DEF(pj_status_t) pjmedia_flite_port_create( pj_pool_t *pool,
|
|
|
164
166
|
|
|
165
167
|
PJ_DEF(pj_status_t) pjmedia_flite_port_speak( pjmedia_port *port,
|
|
166
168
|
const char *text,
|
|
167
|
-
|
|
169
|
+
int times) {
|
|
170
|
+
printf("pjmedia_flite_port_speak. text=%s times=%i\n", text, times);
|
|
168
171
|
struct flite_t *flite = (struct flite_t*)port;
|
|
169
172
|
if(flite->w) {
|
|
170
173
|
free_wave(flite->w);
|
|
171
174
|
}
|
|
172
175
|
|
|
173
|
-
flite->
|
|
176
|
+
flite->times = times;
|
|
174
177
|
|
|
175
178
|
flite->w = flite_text_to_wave(text, flite->v);
|
|
176
179
|
if ((unsigned)flite->w->sample_rate != PJMEDIA_PIA_SRATE(&port->info)) {
|
|
@@ -185,22 +188,25 @@ PJ_DEF(pj_status_t) pjmedia_flite_port_speak( pjmedia_port *port,
|
|
|
185
188
|
// called when pjmedia needs data to be sent out
|
|
186
189
|
static pj_status_t flite_get_frame(pjmedia_port *port,
|
|
187
190
|
pjmedia_frame *frame) {
|
|
191
|
+
printf("flite_get_frame\n");
|
|
188
192
|
|
|
189
193
|
PJ_ASSERT_RETURN(port && frame, PJ_EINVAL);
|
|
190
194
|
|
|
191
195
|
struct flite_t *flite = (struct flite_t*)port;
|
|
192
196
|
|
|
193
|
-
if(!flite->w) {
|
|
194
|
-
|
|
197
|
+
if(flite->times <= 0 || !flite->w) {
|
|
198
|
+
printf("flite no data\n");
|
|
195
199
|
frame->type = PJMEDIA_FRAME_TYPE_NONE;
|
|
196
200
|
return PJ_SUCCESS;
|
|
197
201
|
}
|
|
198
202
|
|
|
199
|
-
|
|
203
|
+
printf("written_samples=%i num_samples=%i\n", flite->written_samples, flite->w->num_samples);
|
|
200
204
|
if (flite->written_samples + PJMEDIA_PIA_SPF(&port->info) > (unsigned)flite->w->num_samples) {
|
|
201
205
|
printf("flite end of speech\n");
|
|
202
206
|
|
|
203
|
-
|
|
207
|
+
flite->times--;
|
|
208
|
+
|
|
209
|
+
if(flite->times <= 0 && flite->cb) {
|
|
204
210
|
if (!flite->subscribed) {
|
|
205
211
|
pj_status_t status = pjmedia_event_subscribe(NULL, &speech_on_event,
|
|
206
212
|
flite, flite);
|
|
@@ -218,10 +224,9 @@ static pj_status_t flite_get_frame(pjmedia_port *port,
|
|
|
218
224
|
}
|
|
219
225
|
}
|
|
220
226
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
if(no_loop) {
|
|
227
|
+
if(flite->times <= 0) {
|
|
224
228
|
free_wave(flite->w);
|
|
229
|
+
flite->w = NULL;
|
|
225
230
|
frame->type = PJMEDIA_FRAME_TYPE_NONE;
|
|
226
231
|
return PJ_SUCCESS;
|
|
227
232
|
} else {
|
|
@@ -232,7 +237,7 @@ static pj_status_t flite_get_frame(pjmedia_port *port,
|
|
|
232
237
|
memcpy(frame->buf, flite->w->samples + flite->written_samples, PJMEDIA_PIA_SPF(&port->info)*2);
|
|
233
238
|
flite->written_samples += PJMEDIA_PIA_SPF(&port->info);
|
|
234
239
|
frame->type = PJMEDIA_FRAME_TYPE_AUDIO;
|
|
235
|
-
|
|
240
|
+
printf("flite data written samples=%i\n", PJMEDIA_PIA_SPF(&port->info));
|
|
236
241
|
|
|
237
242
|
return PJ_SUCCESS;
|
|
238
243
|
}
|