google-cloud-speech 0.23.0 → 0.24.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +0 -2
- data/README.md +4 -2
- data/lib/google-cloud-speech.rb +6 -2
- data/lib/google/cloud/speech.rb +37 -23
- data/lib/google/cloud/speech/audio.rb +73 -44
- data/lib/google/cloud/speech/credentials.rb +2 -2
- data/lib/google/cloud/speech/operation.rb +262 -0
- data/lib/google/cloud/speech/project.rb +186 -83
- data/lib/google/cloud/speech/result.rb +14 -8
- data/lib/google/cloud/speech/service.rb +12 -6
- data/lib/google/cloud/speech/stream.rb +128 -131
- data/lib/google/cloud/speech/{v1beta1.rb → v1.rb} +2 -3
- data/lib/google/cloud/speech/v1/cloud_speech_pb.rb +116 -0
- data/lib/google/cloud/speech/{v1beta1 → v1}/cloud_speech_services_pb.rb +11 -11
- data/lib/google/cloud/speech/{v1beta1/doc/google/cloud/speech/v1beta1 → v1/doc/google/cloud/speech/v1}/cloud_speech.rb +157 -161
- data/lib/google/cloud/speech/{v1beta1 → v1}/doc/google/protobuf/any.rb +0 -0
- data/lib/google/cloud/speech/{v1beta1 → v1}/doc/google/rpc/status.rb +0 -0
- data/lib/google/cloud/speech/{v1beta1 → v1}/speech_client.rb +71 -58
- data/lib/google/cloud/speech/{v1beta1 → v1}/speech_client_config.json +8 -8
- data/lib/google/cloud/speech/version.rb +1 -1
- metadata +13 -13
- data/lib/google/cloud/speech/job.rb +0 -159
- data/lib/google/cloud/speech/v1beta1/cloud_speech_pb.rb +0 -116
@@ -1,5 +1,4 @@
|
|
1
|
-
# Copyright
|
2
|
-
#
|
1
|
+
# Copyright 2017, Google Inc. All rights reserved.
|
3
2
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
3
|
# you may not use this file except in compliance with the License.
|
5
4
|
# You may obtain a copy of the License at
|
@@ -12,4 +11,4 @@
|
|
12
11
|
# See the License for the specific language governing permissions and
|
13
12
|
# limitations under the License.
|
14
13
|
|
15
|
-
require "google/cloud/speech/
|
14
|
+
require "google/cloud/speech/v1/speech_client"
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
2
|
+
# source: google/cloud/speech/v1/cloud_speech.proto
|
3
|
+
|
4
|
+
require 'google/protobuf'
|
5
|
+
|
6
|
+
require 'google/api/annotations_pb'
|
7
|
+
require 'google/longrunning/operations_pb'
|
8
|
+
require 'google/protobuf/any_pb'
|
9
|
+
require 'google/protobuf/duration_pb'
|
10
|
+
require 'google/protobuf/timestamp_pb'
|
11
|
+
require 'google/rpc/status_pb'
|
12
|
+
Google::Protobuf::DescriptorPool.generated_pool.build do
|
13
|
+
add_message "google.cloud.speech.v1.RecognizeRequest" do
|
14
|
+
optional :config, :message, 1, "google.cloud.speech.v1.RecognitionConfig"
|
15
|
+
optional :audio, :message, 2, "google.cloud.speech.v1.RecognitionAudio"
|
16
|
+
end
|
17
|
+
add_message "google.cloud.speech.v1.LongRunningRecognizeRequest" do
|
18
|
+
optional :config, :message, 1, "google.cloud.speech.v1.RecognitionConfig"
|
19
|
+
optional :audio, :message, 2, "google.cloud.speech.v1.RecognitionAudio"
|
20
|
+
end
|
21
|
+
add_message "google.cloud.speech.v1.StreamingRecognizeRequest" do
|
22
|
+
oneof :streaming_request do
|
23
|
+
optional :streaming_config, :message, 1, "google.cloud.speech.v1.StreamingRecognitionConfig"
|
24
|
+
optional :audio_content, :bytes, 2
|
25
|
+
end
|
26
|
+
end
|
27
|
+
add_message "google.cloud.speech.v1.StreamingRecognitionConfig" do
|
28
|
+
optional :config, :message, 1, "google.cloud.speech.v1.RecognitionConfig"
|
29
|
+
optional :single_utterance, :bool, 2
|
30
|
+
optional :interim_results, :bool, 3
|
31
|
+
end
|
32
|
+
add_message "google.cloud.speech.v1.RecognitionConfig" do
|
33
|
+
optional :encoding, :enum, 1, "google.cloud.speech.v1.RecognitionConfig.AudioEncoding"
|
34
|
+
optional :sample_rate_hertz, :int32, 2
|
35
|
+
optional :language_code, :string, 3
|
36
|
+
optional :max_alternatives, :int32, 4
|
37
|
+
optional :profanity_filter, :bool, 5
|
38
|
+
repeated :speech_contexts, :message, 6, "google.cloud.speech.v1.SpeechContext"
|
39
|
+
end
|
40
|
+
add_enum "google.cloud.speech.v1.RecognitionConfig.AudioEncoding" do
|
41
|
+
value :ENCODING_UNSPECIFIED, 0
|
42
|
+
value :LINEAR16, 1
|
43
|
+
value :FLAC, 2
|
44
|
+
value :MULAW, 3
|
45
|
+
value :AMR, 4
|
46
|
+
value :AMR_WB, 5
|
47
|
+
value :OGG_OPUS, 6
|
48
|
+
value :SPEEX_WITH_HEADER_BYTE, 7
|
49
|
+
end
|
50
|
+
add_message "google.cloud.speech.v1.SpeechContext" do
|
51
|
+
repeated :phrases, :string, 1
|
52
|
+
end
|
53
|
+
add_message "google.cloud.speech.v1.RecognitionAudio" do
|
54
|
+
oneof :audio_source do
|
55
|
+
optional :content, :bytes, 1
|
56
|
+
optional :uri, :string, 2
|
57
|
+
end
|
58
|
+
end
|
59
|
+
add_message "google.cloud.speech.v1.RecognizeResponse" do
|
60
|
+
repeated :results, :message, 2, "google.cloud.speech.v1.SpeechRecognitionResult"
|
61
|
+
end
|
62
|
+
add_message "google.cloud.speech.v1.LongRunningRecognizeResponse" do
|
63
|
+
repeated :results, :message, 2, "google.cloud.speech.v1.SpeechRecognitionResult"
|
64
|
+
end
|
65
|
+
add_message "google.cloud.speech.v1.LongRunningRecognizeMetadata" do
|
66
|
+
optional :progress_percent, :int32, 1
|
67
|
+
optional :start_time, :message, 2, "google.protobuf.Timestamp"
|
68
|
+
optional :last_update_time, :message, 3, "google.protobuf.Timestamp"
|
69
|
+
end
|
70
|
+
add_message "google.cloud.speech.v1.StreamingRecognizeResponse" do
|
71
|
+
optional :error, :message, 1, "google.rpc.Status"
|
72
|
+
repeated :results, :message, 2, "google.cloud.speech.v1.StreamingRecognitionResult"
|
73
|
+
optional :speech_event_type, :enum, 4, "google.cloud.speech.v1.StreamingRecognizeResponse.SpeechEventType"
|
74
|
+
end
|
75
|
+
add_enum "google.cloud.speech.v1.StreamingRecognizeResponse.SpeechEventType" do
|
76
|
+
value :SPEECH_EVENT_UNSPECIFIED, 0
|
77
|
+
value :END_OF_SINGLE_UTTERANCE, 1
|
78
|
+
end
|
79
|
+
add_message "google.cloud.speech.v1.StreamingRecognitionResult" do
|
80
|
+
repeated :alternatives, :message, 1, "google.cloud.speech.v1.SpeechRecognitionAlternative"
|
81
|
+
optional :is_final, :bool, 2
|
82
|
+
optional :stability, :float, 3
|
83
|
+
end
|
84
|
+
add_message "google.cloud.speech.v1.SpeechRecognitionResult" do
|
85
|
+
repeated :alternatives, :message, 1, "google.cloud.speech.v1.SpeechRecognitionAlternative"
|
86
|
+
end
|
87
|
+
add_message "google.cloud.speech.v1.SpeechRecognitionAlternative" do
|
88
|
+
optional :transcript, :string, 1
|
89
|
+
optional :confidence, :float, 2
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
module Google
|
94
|
+
module Cloud
|
95
|
+
module Speech
|
96
|
+
module V1
|
97
|
+
RecognizeRequest = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognizeRequest").msgclass
|
98
|
+
LongRunningRecognizeRequest = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.LongRunningRecognizeRequest").msgclass
|
99
|
+
StreamingRecognizeRequest = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognizeRequest").msgclass
|
100
|
+
StreamingRecognitionConfig = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognitionConfig").msgclass
|
101
|
+
RecognitionConfig = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionConfig").msgclass
|
102
|
+
RecognitionConfig::AudioEncoding = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionConfig.AudioEncoding").enummodule
|
103
|
+
SpeechContext = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.SpeechContext").msgclass
|
104
|
+
RecognitionAudio = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionAudio").msgclass
|
105
|
+
RecognizeResponse = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognizeResponse").msgclass
|
106
|
+
LongRunningRecognizeResponse = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.LongRunningRecognizeResponse").msgclass
|
107
|
+
LongRunningRecognizeMetadata = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.LongRunningRecognizeMetadata").msgclass
|
108
|
+
StreamingRecognizeResponse = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognizeResponse").msgclass
|
109
|
+
StreamingRecognizeResponse::SpeechEventType = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognizeResponse.SpeechEventType").enummodule
|
110
|
+
StreamingRecognitionResult = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognitionResult").msgclass
|
111
|
+
SpeechRecognitionResult = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.SpeechRecognitionResult").msgclass
|
112
|
+
SpeechRecognitionAlternative = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.SpeechRecognitionAlternative").msgclass
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
2
|
-
# Source: google/cloud/speech/
|
2
|
+
# Source: google/cloud/speech/v1/cloud_speech.proto for package 'google.cloud.speech.v1'
|
3
3
|
# Original file comments:
|
4
|
-
# Copyright
|
4
|
+
# Copyright 2017 Google Inc.
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
7
|
# you may not use this file except in compliance with the License.
|
@@ -17,12 +17,12 @@
|
|
17
17
|
#
|
18
18
|
|
19
19
|
require 'grpc'
|
20
|
-
require 'google/cloud/speech/
|
20
|
+
require 'google/cloud/speech/v1/cloud_speech_pb'
|
21
21
|
|
22
22
|
module Google
|
23
23
|
module Cloud
|
24
24
|
module Speech
|
25
|
-
module
|
25
|
+
module V1
|
26
26
|
module Speech
|
27
27
|
# Service that implements Google Cloud Speech API.
|
28
28
|
class Service
|
@@ -31,17 +31,17 @@ module Google
|
|
31
31
|
|
32
32
|
self.marshal_class_method = :encode
|
33
33
|
self.unmarshal_class_method = :decode
|
34
|
-
self.service_name = 'google.cloud.speech.
|
34
|
+
self.service_name = 'google.cloud.speech.v1.Speech'
|
35
35
|
|
36
|
-
#
|
36
|
+
# Performs synchronous speech recognition: receive results after all audio
|
37
37
|
# has been sent and processed.
|
38
|
-
rpc :
|
39
|
-
#
|
38
|
+
rpc :Recognize, RecognizeRequest, RecognizeResponse
|
39
|
+
# Performs asynchronous speech recognition: receive results via the
|
40
40
|
# google.longrunning.Operations interface. Returns either an
|
41
41
|
# `Operation.error` or an `Operation.response` which contains
|
42
|
-
#
|
43
|
-
rpc :
|
44
|
-
#
|
42
|
+
# a `LongRunningRecognizeResponse` message.
|
43
|
+
rpc :LongRunningRecognize, LongRunningRecognizeRequest, Google::Longrunning::Operation
|
44
|
+
# Performs bidirectional streaming speech recognition: receive results while
|
45
45
|
# sending audio. This method is only available via the gRPC API (not REST).
|
46
46
|
rpc :StreamingRecognize, stream(StreamingRecognizeRequest), stream(StreamingRecognizeResponse)
|
47
47
|
end
|
@@ -15,40 +15,37 @@
|
|
15
15
|
module Google
|
16
16
|
module Cloud
|
17
17
|
module Speech
|
18
|
-
module
|
19
|
-
#
|
20
|
-
# the +SyncRecognize+ method.
|
18
|
+
module V1
|
19
|
+
# The top-level message sent by the client for the +Recognize+ method.
|
21
20
|
# @!attribute [rw] config
|
22
|
-
# @return [Google::Cloud::Speech::
|
23
|
-
#
|
24
|
-
#
|
21
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
22
|
+
# *Required* Provides information to the recognizer that specifies how to
|
23
|
+
# process the request.
|
25
24
|
# @!attribute [rw] audio
|
26
|
-
# @return [Google::Cloud::Speech::
|
27
|
-
#
|
28
|
-
class
|
25
|
+
# @return [Google::Cloud::Speech::V1::RecognitionAudio]
|
26
|
+
# *Required* The audio data to be recognized.
|
27
|
+
class RecognizeRequest; end
|
29
28
|
|
30
|
-
#
|
31
|
-
#
|
29
|
+
# The top-level message sent by the client for the +LongRunningRecognize+
|
30
|
+
# method.
|
32
31
|
# @!attribute [rw] config
|
33
|
-
# @return [Google::Cloud::Speech::
|
34
|
-
#
|
35
|
-
#
|
32
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
33
|
+
# *Required* Provides information to the recognizer that specifies how to
|
34
|
+
# process the request.
|
36
35
|
# @!attribute [rw] audio
|
37
|
-
# @return [Google::Cloud::Speech::
|
38
|
-
#
|
39
|
-
class
|
36
|
+
# @return [Google::Cloud::Speech::V1::RecognitionAudio]
|
37
|
+
# *Required* The audio data to be recognized.
|
38
|
+
class LongRunningRecognizeRequest; end
|
40
39
|
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
40
|
+
# The top-level message sent by the client for the +StreamingRecognize+ method.
|
41
|
+
# Multiple +StreamingRecognizeRequest+ messages are sent. The first message
|
42
|
+
# must contain a +streaming_config+ message and must not contain +audio+ data.
|
43
|
+
# All subsequent messages must contain +audio+ data and must not contain a
|
44
|
+
# +streaming_config+ message.
|
46
45
|
# @!attribute [rw] streaming_config
|
47
|
-
# @return [Google::Cloud::Speech::
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
# The first +StreamingRecognizeRequest+ message must contain a
|
46
|
+
# @return [Google::Cloud::Speech::V1::StreamingRecognitionConfig]
|
47
|
+
# Provides information to the recognizer that specifies how to process the
|
48
|
+
# request. The first +StreamingRecognizeRequest+ message must contain a
|
52
49
|
# +streaming_config+ message.
|
53
50
|
# @!attribute [rw] audio_content
|
54
51
|
# @return [String]
|
@@ -62,68 +59,69 @@ module Google
|
|
62
59
|
# {audio limits}[https://cloud.google.com/speech/limits#content].
|
63
60
|
class StreamingRecognizeRequest; end
|
64
61
|
|
65
|
-
#
|
66
|
-
#
|
62
|
+
# Provides information to the recognizer that specifies how to process the
|
63
|
+
# request.
|
67
64
|
# @!attribute [rw] config
|
68
|
-
# @return [Google::Cloud::Speech::
|
69
|
-
#
|
70
|
-
#
|
65
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
66
|
+
# *Required* Provides information to the recognizer that specifies how to
|
67
|
+
# process the request.
|
71
68
|
# @!attribute [rw] single_utterance
|
72
69
|
# @return [true, false]
|
73
|
-
#
|
74
|
-
# recognition (continuing to process audio even if the user
|
75
|
-
# until the client closes the
|
76
|
-
# time limit has been reached.
|
77
|
-
# the +is_final+ flag set to +true
|
70
|
+
# *Optional* If +false+ or omitted, the recognizer will perform continuous
|
71
|
+
# recognition (continuing to wait for and process audio even if the user
|
72
|
+
# pauses speaking) until the client closes the input stream (gRPC API) or
|
73
|
+
# until the maximum time limit has been reached. May return multiple
|
74
|
+
# +StreamingRecognitionResult+s with the +is_final+ flag set to +true+.
|
78
75
|
#
|
79
76
|
# If +true+, the recognizer will detect a single spoken utterance. When it
|
80
77
|
# detects that the user has paused or stopped speaking, it will return an
|
81
|
-
# +
|
82
|
-
# one +StreamingRecognitionResult+ with the +is_final+ flag set to
|
78
|
+
# +END_OF_SINGLE_UTTERANCE+ event and cease recognition. It will return no
|
79
|
+
# more than one +StreamingRecognitionResult+ with the +is_final+ flag set to
|
80
|
+
# +true+.
|
83
81
|
# @!attribute [rw] interim_results
|
84
82
|
# @return [true, false]
|
85
|
-
#
|
83
|
+
# *Optional* If +true+, interim results (tentative hypotheses) may be
|
86
84
|
# returned as they become available (these interim results are indicated with
|
87
85
|
# the +is_final=false+ flag).
|
88
86
|
# If +false+ or omitted, only +is_final=true+ result(s) are returned.
|
89
87
|
class StreamingRecognitionConfig; end
|
90
88
|
|
91
|
-
#
|
92
|
-
#
|
89
|
+
# Provides information to the recognizer that specifies how to process the
|
90
|
+
# request.
|
93
91
|
# @!attribute [rw] encoding
|
94
|
-
# @return [Google::Cloud::Speech::
|
95
|
-
#
|
96
|
-
# @!attribute [rw]
|
92
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig::AudioEncoding]
|
93
|
+
# *Required* Encoding of audio data sent in all +RecognitionAudio+ messages.
|
94
|
+
# @!attribute [rw] sample_rate_hertz
|
97
95
|
# @return [Integer]
|
98
|
-
#
|
96
|
+
# *Required* Sample rate in Hertz of the audio data sent in all
|
99
97
|
# +RecognitionAudio+ messages. Valid values are: 8000-48000.
|
100
98
|
# 16000 is optimal. For best results, set the sampling rate of the audio
|
101
99
|
# source to 16000 Hz. If that's not possible, use the native sample rate of
|
102
100
|
# the audio source (instead of re-sampling).
|
103
101
|
# @!attribute [rw] language_code
|
104
102
|
# @return [String]
|
105
|
-
#
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# {Language Support}[https://cloud.google.com/speech/docs/
|
103
|
+
# *Required* The language of the supplied audio as a
|
104
|
+
# {BCP-47}[https://www.rfc-editor.org/rfc/bcp/bcp47.txt] language tag.
|
105
|
+
# Example: "en-US".
|
106
|
+
# See {Language Support}[https://cloud.google.com/speech/docs/languages]
|
109
107
|
# for a list of the currently supported language codes.
|
110
108
|
# @!attribute [rw] max_alternatives
|
111
109
|
# @return [Integer]
|
112
|
-
#
|
110
|
+
# *Optional* Maximum number of recognition hypotheses to be returned.
|
113
111
|
# Specifically, the maximum number of +SpeechRecognitionAlternative+ messages
|
114
112
|
# within each +SpeechRecognitionResult+.
|
115
113
|
# The server may return fewer than +max_alternatives+.
|
116
114
|
# Valid values are +0+-+30+. A value of +0+ or +1+ will return a maximum of
|
117
|
-
#
|
115
|
+
# one. If omitted, will return a maximum of one.
|
118
116
|
# @!attribute [rw] profanity_filter
|
119
117
|
# @return [true, false]
|
120
|
-
#
|
118
|
+
# *Optional* If set to +true+, the server will attempt to filter out
|
121
119
|
# profanities, replacing all but the initial character in each filtered word
|
122
120
|
# with asterisks, e.g. "f***". If set to +false+ or omitted, profanities
|
123
121
|
# won't be filtered out.
|
124
|
-
# @!attribute [rw]
|
125
|
-
# @return [Google::Cloud::Speech::
|
126
|
-
#
|
122
|
+
# @!attribute [rw] speech_contexts
|
123
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechContext>]
|
124
|
+
# *Optional* A means to provide context to assist the speech recognition.
|
127
125
|
class RecognitionConfig
|
128
126
|
# Audio encoding of the data sent in the audio message. All encodings support
|
129
127
|
# only 1 channel (mono) audio. Only +FLAC+ includes a header that describes
|
@@ -132,34 +130,52 @@ module Google
|
|
132
130
|
#
|
133
131
|
# For best results, the audio source should be captured and transmitted using
|
134
132
|
# a lossless encoding (+FLAC+ or +LINEAR16+). Recognition accuracy may be
|
135
|
-
# reduced if lossy codecs
|
136
|
-
# or transmit the audio, particularly if
|
133
|
+
# reduced if lossy codecs, which include the other codecs listed in
|
134
|
+
# this section, are used to capture or transmit the audio, particularly if
|
135
|
+
# background noise is present.
|
137
136
|
module AudioEncoding
|
138
137
|
# Not specified. Will return result Google::Rpc::Code::INVALID_ARGUMENT.
|
139
138
|
ENCODING_UNSPECIFIED = 0
|
140
139
|
|
141
140
|
# Uncompressed 16-bit signed little-endian samples (Linear PCM).
|
142
|
-
# This is the only encoding that may be used by +AsyncRecognize+.
|
143
141
|
LINEAR16 = 1
|
144
142
|
|
145
|
-
#
|
146
|
-
#
|
147
|
-
# recognition
|
148
|
-
#
|
149
|
-
#
|
150
|
-
#
|
151
|
-
# 16-bit and 24-bit samples are supported.
|
152
|
-
# Not all fields in STREAMINFO are supported.
|
143
|
+
# {+FLAC+}[https://xiph.org/flac/documentation.html] (Free Lossless Audio
|
144
|
+
# Codec) is the recommended encoding because it is
|
145
|
+
# lossless--therefore recognition is not compromised--and
|
146
|
+
# requires only about half the bandwidth of +LINEAR16+. +FLAC+ stream
|
147
|
+
# encoding supports 16-bit and 24-bit samples; however, not all fields in
|
148
|
+
# +STREAMINFO+ are supported.
|
153
149
|
FLAC = 2
|
154
150
|
|
155
151
|
# 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
|
156
152
|
MULAW = 3
|
157
153
|
|
158
|
-
# Adaptive Multi-Rate Narrowband codec. +
|
154
|
+
# Adaptive Multi-Rate Narrowband codec. +sample_rate_hertz+ must be 8000.
|
159
155
|
AMR = 4
|
160
156
|
|
161
|
-
# Adaptive Multi-Rate Wideband codec. +
|
157
|
+
# Adaptive Multi-Rate Wideband codec. +sample_rate_hertz+ must be 16000.
|
162
158
|
AMR_WB = 5
|
159
|
+
|
160
|
+
# Opus-encoded audio frames in an Ogg container
|
161
|
+
# ({OggOpus}[https://wiki.xiph.org/OggOpus]).
|
162
|
+
# +sample_rate_hertz+ must be 16000.
|
163
|
+
OGG_OPUS = 6
|
164
|
+
|
165
|
+
# Although the use of lossy encodings is not recommended, if a very low
|
166
|
+
# bitrate encoding is required, +OGG_OPUS+ is highly preferred over
|
167
|
+
# Speex encoding. The {Speex}[https://speex.org/] encoding supported by
|
168
|
+
# Cloud Speech API has a header byte in each block, as in MIME type
|
169
|
+
# +audio/x-speex-with-header-byte+.
|
170
|
+
# It is a variant of the RTP Speex encoding defined in
|
171
|
+
# {RFC 5574}[https://tools.ietf.org/html/rfc5574].
|
172
|
+
# The stream is a sequence of blocks, one block per RTP packet. Each block
|
173
|
+
# starts with a byte containing the length of the block, in bytes, followed
|
174
|
+
# by one or more frames of Speex data, padded to an integral number of
|
175
|
+
# bytes (octets) as specified in RFC 5574. In other words, each RTP header
|
176
|
+
# is replaced with a single byte containing the block length. Only Speex
|
177
|
+
# wideband is supported. +sample_rate_hertz+ must be 16000.
|
178
|
+
SPEEX_WITH_HEADER_BYTE = 7
|
163
179
|
end
|
164
180
|
end
|
165
181
|
|
@@ -167,7 +183,7 @@ module Google
|
|
167
183
|
# in the results.
|
168
184
|
# @!attribute [rw] phrases
|
169
185
|
# @return [Array<String>]
|
170
|
-
#
|
186
|
+
# *Optional* A list of strings containing words and phrases "hints" so that
|
171
187
|
# the speech recognition is more likely to recognize them. This can be used
|
172
188
|
# to improve the accuracy for specific words and phrases, for example, if
|
173
189
|
# specific commands are typically spoken by the user. This can also be used
|
@@ -194,30 +210,29 @@ module Google
|
|
194
210
|
# {Request URIs}[https://cloud.google.com/storage/docs/reference-uris].
|
195
211
|
class RecognitionAudio; end
|
196
212
|
|
197
|
-
#
|
198
|
-
#
|
199
|
-
#
|
213
|
+
# The only message returned to the client by the +Recognize+ method. It
|
214
|
+
# contains the result as zero or more sequential +SpeechRecognitionResult+
|
215
|
+
# messages.
|
200
216
|
# @!attribute [rw] results
|
201
|
-
# @return [Array<Google::Cloud::Speech::
|
202
|
-
#
|
217
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionResult>]
|
218
|
+
# *Output-only* Sequential list of transcription results corresponding to
|
203
219
|
# sequential portions of audio.
|
204
|
-
class
|
220
|
+
class RecognizeResponse; end
|
205
221
|
|
206
|
-
#
|
207
|
-
#
|
208
|
-
#
|
209
|
-
#
|
210
|
-
#
|
222
|
+
# The only message returned to the client by the +LongRunningRecognize+ method.
|
223
|
+
# It contains the result as zero or more sequential +SpeechRecognitionResult+
|
224
|
+
# messages. It is included in the +result.response+ field of the +Operation+
|
225
|
+
# returned by the +GetOperation+ call of the +google::longrunning::Operations+
|
226
|
+
# service.
|
211
227
|
# @!attribute [rw] results
|
212
|
-
# @return [Array<Google::Cloud::Speech::
|
213
|
-
#
|
228
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionResult>]
|
229
|
+
# *Output-only* Sequential list of transcription results corresponding to
|
214
230
|
# sequential portions of audio.
|
215
|
-
class
|
231
|
+
class LongRunningRecognizeResponse; end
|
216
232
|
|
217
|
-
#
|
218
|
-
#
|
219
|
-
# +
|
220
|
-
# +google::longrunning::Operations+ service.
|
233
|
+
# Describes the progress of a long-running +LongRunningRecognize+ call. It is
|
234
|
+
# included in the +metadata+ field of the +Operation+ returned by the
|
235
|
+
# +GetOperation+ call of the +google::longrunning::Operations+ service.
|
221
236
|
# @!attribute [rw] progress_percent
|
222
237
|
# @return [Integer]
|
223
238
|
# Approximate percentage of audio processed thus far. Guaranteed to be 100
|
@@ -228,7 +243,7 @@ module Google
|
|
228
243
|
# @!attribute [rw] last_update_time
|
229
244
|
# @return [Google::Protobuf::Timestamp]
|
230
245
|
# Time of the most recent processing update.
|
231
|
-
class
|
246
|
+
class LongRunningRecognizeMetadata; end
|
232
247
|
|
233
248
|
# +StreamingRecognizeResponse+ is the only message returned to the client by
|
234
249
|
# +StreamingRecognize+. A series of one or more +StreamingRecognizeResponse+
|
@@ -237,139 +252,120 @@ module Google
|
|
237
252
|
# Here's an example of a series of ten +StreamingRecognizeResponse+s that might
|
238
253
|
# be returned while processing audio:
|
239
254
|
#
|
240
|
-
# 1.
|
255
|
+
# 1. results { alternatives { transcript: "tube" } stability: 0.01 }
|
241
256
|
#
|
242
|
-
# 2. results { alternatives { transcript: "
|
243
|
-
# result_index: 0
|
257
|
+
# 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
|
244
258
|
#
|
245
|
-
# 3. results { alternatives { transcript: "to be
|
246
|
-
# result_index: 0
|
247
|
-
#
|
248
|
-
# 4. results { alternatives { transcript: "to be" } stability: 0.9 }
|
259
|
+
# 3. results { alternatives { transcript: "to be" } stability: 0.9 }
|
249
260
|
# results { alternatives { transcript: " or not to be" } stability: 0.01 }
|
250
|
-
# result_index: 0
|
251
261
|
#
|
252
|
-
#
|
262
|
+
# 4. results { alternatives { transcript: "to be or not to be"
|
253
263
|
# confidence: 0.92 }
|
254
264
|
# alternatives { transcript: "to bee or not to bee" }
|
255
265
|
# is_final: true }
|
256
|
-
# result_index: 0
|
257
266
|
#
|
258
|
-
#
|
259
|
-
# result_index: 1
|
267
|
+
# 5. results { alternatives { transcript: " that's" } stability: 0.01 }
|
260
268
|
#
|
261
|
-
#
|
269
|
+
# 6. results { alternatives { transcript: " that is" } stability: 0.9 }
|
262
270
|
# results { alternatives { transcript: " the question" } stability: 0.01 }
|
263
|
-
# result_index: 1
|
264
271
|
#
|
265
|
-
#
|
272
|
+
# 7. speech_event_type: END_OF_SINGLE_UTTERANCE
|
266
273
|
#
|
267
|
-
#
|
274
|
+
# 8. results { alternatives { transcript: " that is the question"
|
268
275
|
# confidence: 0.98 }
|
269
276
|
# alternatives { transcript: " that was the question" }
|
270
277
|
# is_final: true }
|
271
|
-
# result_index: 1
|
272
|
-
#
|
273
|
-
# 10. endpointer_type: END_OF_AUDIO
|
274
278
|
#
|
275
279
|
# Notes:
|
276
280
|
#
|
277
|
-
# - Only two of the above responses #
|
281
|
+
# - Only two of the above responses, #4 and #8, contain final results; they are
|
278
282
|
# indicated by +is_final: true+. Concatenating these together generates the
|
279
283
|
# full transcript: "to be or not to be that is the question".
|
280
284
|
#
|
281
|
-
# - The others contain interim +results+. #
|
282
|
-
# +results
|
283
|
-
# change
|
285
|
+
# - The others contain interim +results+. #3 and #6 contain two interim
|
286
|
+
# +results+: the first portion has a high stability and is less likely to
|
287
|
+
# change; the second portion has a low stability and is very likely to
|
284
288
|
# change. A UI designer might choose to show only high stability +results+.
|
285
289
|
#
|
286
|
-
# - The +
|
287
|
-
#
|
288
|
-
#
|
289
|
-
#
|
290
|
+
# - The specific +stability+ and +confidence+ values shown above are only for
|
291
|
+
# illustrative purposes. Actual values may vary.
|
292
|
+
#
|
293
|
+
# - In each response, only one of these fields will be set:
|
294
|
+
# +error+,
|
295
|
+
# +speech_event_type+, or
|
296
|
+
# one or more (repeated) +results+.
|
290
297
|
# @!attribute [rw] error
|
291
298
|
# @return [Google::Rpc::Status]
|
292
|
-
#
|
299
|
+
# *Output-only* If set, returns a Google::Rpc::Status message that
|
293
300
|
# specifies the error for the operation.
|
294
301
|
# @!attribute [rw] results
|
295
|
-
# @return [Array<Google::Cloud::Speech::
|
296
|
-
#
|
302
|
+
# @return [Array<Google::Cloud::Speech::V1::StreamingRecognitionResult>]
|
303
|
+
# *Output-only* This repeated list contains zero or more results that
|
297
304
|
# correspond to consecutive portions of the audio currently being processed.
|
298
305
|
# It contains zero or one +is_final=true+ result (the newly settled portion),
|
299
306
|
# followed by zero or more +is_final=false+ results.
|
300
|
-
# @!attribute [rw]
|
301
|
-
# @return [
|
302
|
-
#
|
303
|
-
# changed. The repeated +StreamingRecognitionResult+ results overwrite past
|
304
|
-
# results at this index and higher.
|
305
|
-
# @!attribute [rw] endpointer_type
|
306
|
-
# @return [Google::Cloud::Speech::V1beta1::StreamingRecognizeResponse::EndpointerType]
|
307
|
-
# [Output-only] Indicates the type of endpointer event.
|
307
|
+
# @!attribute [rw] speech_event_type
|
308
|
+
# @return [Google::Cloud::Speech::V1::StreamingRecognizeResponse::SpeechEventType]
|
309
|
+
# *Output-only* Indicates the type of speech event.
|
308
310
|
class StreamingRecognizeResponse
|
309
|
-
# Indicates the type of
|
310
|
-
module
|
311
|
-
# No
|
312
|
-
|
313
|
-
|
314
|
-
# Speech has been detected in the audio stream.
|
315
|
-
START_OF_SPEECH = 1
|
316
|
-
|
317
|
-
# Speech has ceased to be detected in the audio stream.
|
318
|
-
END_OF_SPEECH = 2
|
319
|
-
|
320
|
-
# The end of the audio stream has been reached. and it is being processed.
|
321
|
-
END_OF_AUDIO = 3
|
311
|
+
# Indicates the type of speech event.
|
312
|
+
module SpeechEventType
|
313
|
+
# No speech event specified.
|
314
|
+
SPEECH_EVENT_UNSPECIFIED = 0
|
322
315
|
|
323
|
-
# This event
|
324
|
-
#
|
325
|
-
#
|
326
|
-
# additional
|
327
|
-
|
316
|
+
# This event indicates that the server has detected the end of the user's
|
317
|
+
# speech utterance and expects no additional speech. Therefore, the server
|
318
|
+
# will not process additional audio (although it may subsequently return
|
319
|
+
# additional results). The client should stop sending additional audio
|
320
|
+
# data, half-close the gRPC connection, and wait for any additional results
|
321
|
+
# until the server closes the gRPC connection. This event is only sent if
|
322
|
+
# +single_utterance+ was set to +true+, and is not used otherwise.
|
323
|
+
END_OF_SINGLE_UTTERANCE = 1
|
328
324
|
end
|
329
325
|
end
|
330
326
|
|
331
327
|
# A streaming speech recognition result corresponding to a portion of the audio
|
332
328
|
# that is currently being processed.
|
333
329
|
# @!attribute [rw] alternatives
|
334
|
-
# @return [Array<Google::Cloud::Speech::
|
335
|
-
#
|
330
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionAlternative>]
|
331
|
+
# *Output-only* May contain one or more recognition hypotheses (up to the
|
336
332
|
# maximum specified in +max_alternatives+).
|
337
333
|
# @!attribute [rw] is_final
|
338
334
|
# @return [true, false]
|
339
|
-
#
|
335
|
+
# *Output-only* If +false+, this +StreamingRecognitionResult+ represents an
|
340
336
|
# interim result that may change. If +true+, this is the final time the
|
341
337
|
# speech service will return this particular +StreamingRecognitionResult+;
|
342
338
|
# the recognizer will not return any further hypotheses for this portion of
|
343
339
|
# the transcript and corresponding audio.
|
344
340
|
# @!attribute [rw] stability
|
345
341
|
# @return [Float]
|
346
|
-
#
|
342
|
+
# *Output-only* An estimate of the likelihood that the recognizer will not
|
347
343
|
# change its guess about this interim result. Values range from 0.0
|
348
|
-
# (completely unstable) to 1.0 (completely stable).
|
349
|
-
# same as +confidence+, which estimates the probability that a recognition
|
350
|
-
# result is correct.
|
344
|
+
# (completely unstable) to 1.0 (completely stable).
|
351
345
|
# This field is only provided for interim results (+is_final=false+).
|
352
|
-
# The default of 0.0 is a sentinel value indicating stability was not set.
|
346
|
+
# The default of 0.0 is a sentinel value indicating +stability+ was not set.
|
353
347
|
class StreamingRecognitionResult; end
|
354
348
|
|
355
349
|
# A speech recognition result corresponding to a portion of the audio.
|
356
350
|
# @!attribute [rw] alternatives
|
357
|
-
# @return [Array<Google::Cloud::Speech::
|
358
|
-
#
|
351
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionAlternative>]
|
352
|
+
# *Output-only* May contain one or more recognition hypotheses (up to the
|
359
353
|
# maximum specified in +max_alternatives+).
|
360
354
|
class SpeechRecognitionResult; end
|
361
355
|
|
362
356
|
# Alternative hypotheses (a.k.a. n-best list).
|
363
357
|
# @!attribute [rw] transcript
|
364
358
|
# @return [String]
|
365
|
-
#
|
359
|
+
# *Output-only* Transcript text representing the words that the user spoke.
|
366
360
|
# @!attribute [rw] confidence
|
367
361
|
# @return [Float]
|
368
|
-
#
|
369
|
-
#
|
370
|
-
# This field is typically provided only for the top hypothesis, and
|
371
|
-
# +is_final=true+ results.
|
372
|
-
#
|
362
|
+
# *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
|
363
|
+
# indicates an estimated greater likelihood that the recognized words are
|
364
|
+
# correct. This field is typically provided only for the top hypothesis, and
|
365
|
+
# only for +is_final=true+ results. Clients should not rely on the
|
366
|
+
# +confidence+ field as it is not guaranteed to be accurate, or even set, in
|
367
|
+
# any of the results.
|
368
|
+
# The default of 0.0 is a sentinel value indicating +confidence+ was not set.
|
373
369
|
class SpeechRecognitionAlternative; end
|
374
370
|
end
|
375
371
|
end
|