google-cloud-speech 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +0 -2
- data/README.md +4 -2
- data/lib/google-cloud-speech.rb +6 -2
- data/lib/google/cloud/speech.rb +37 -23
- data/lib/google/cloud/speech/audio.rb +73 -44
- data/lib/google/cloud/speech/credentials.rb +2 -2
- data/lib/google/cloud/speech/operation.rb +262 -0
- data/lib/google/cloud/speech/project.rb +186 -83
- data/lib/google/cloud/speech/result.rb +14 -8
- data/lib/google/cloud/speech/service.rb +12 -6
- data/lib/google/cloud/speech/stream.rb +128 -131
- data/lib/google/cloud/speech/{v1beta1.rb → v1.rb} +2 -3
- data/lib/google/cloud/speech/v1/cloud_speech_pb.rb +116 -0
- data/lib/google/cloud/speech/{v1beta1 → v1}/cloud_speech_services_pb.rb +11 -11
- data/lib/google/cloud/speech/{v1beta1/doc/google/cloud/speech/v1beta1 → v1/doc/google/cloud/speech/v1}/cloud_speech.rb +157 -161
- data/lib/google/cloud/speech/{v1beta1 → v1}/doc/google/protobuf/any.rb +0 -0
- data/lib/google/cloud/speech/{v1beta1 → v1}/doc/google/rpc/status.rb +0 -0
- data/lib/google/cloud/speech/{v1beta1 → v1}/speech_client.rb +71 -58
- data/lib/google/cloud/speech/{v1beta1 → v1}/speech_client_config.json +8 -8
- data/lib/google/cloud/speech/version.rb +1 -1
- metadata +13 -13
- data/lib/google/cloud/speech/job.rb +0 -159
- data/lib/google/cloud/speech/v1beta1/cloud_speech_pb.rb +0 -116
@@ -1,5 +1,4 @@
|
|
1
|
-
# Copyright
|
2
|
-
#
|
1
|
+
# Copyright 2017, Google Inc. All rights reserved.
|
3
2
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
3
|
# you may not use this file except in compliance with the License.
|
5
4
|
# You may obtain a copy of the License at
|
@@ -12,4 +11,4 @@
|
|
12
11
|
# See the License for the specific language governing permissions and
|
13
12
|
# limitations under the License.
|
14
13
|
|
15
|
-
require "google/cloud/speech/
|
14
|
+
require "google/cloud/speech/v1/speech_client"
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
2
|
+
# source: google/cloud/speech/v1/cloud_speech.proto
|
3
|
+
|
4
|
+
require 'google/protobuf'
|
5
|
+
|
6
|
+
require 'google/api/annotations_pb'
|
7
|
+
require 'google/longrunning/operations_pb'
|
8
|
+
require 'google/protobuf/any_pb'
|
9
|
+
require 'google/protobuf/duration_pb'
|
10
|
+
require 'google/protobuf/timestamp_pb'
|
11
|
+
require 'google/rpc/status_pb'
|
12
|
+
Google::Protobuf::DescriptorPool.generated_pool.build do
|
13
|
+
add_message "google.cloud.speech.v1.RecognizeRequest" do
|
14
|
+
optional :config, :message, 1, "google.cloud.speech.v1.RecognitionConfig"
|
15
|
+
optional :audio, :message, 2, "google.cloud.speech.v1.RecognitionAudio"
|
16
|
+
end
|
17
|
+
add_message "google.cloud.speech.v1.LongRunningRecognizeRequest" do
|
18
|
+
optional :config, :message, 1, "google.cloud.speech.v1.RecognitionConfig"
|
19
|
+
optional :audio, :message, 2, "google.cloud.speech.v1.RecognitionAudio"
|
20
|
+
end
|
21
|
+
add_message "google.cloud.speech.v1.StreamingRecognizeRequest" do
|
22
|
+
oneof :streaming_request do
|
23
|
+
optional :streaming_config, :message, 1, "google.cloud.speech.v1.StreamingRecognitionConfig"
|
24
|
+
optional :audio_content, :bytes, 2
|
25
|
+
end
|
26
|
+
end
|
27
|
+
add_message "google.cloud.speech.v1.StreamingRecognitionConfig" do
|
28
|
+
optional :config, :message, 1, "google.cloud.speech.v1.RecognitionConfig"
|
29
|
+
optional :single_utterance, :bool, 2
|
30
|
+
optional :interim_results, :bool, 3
|
31
|
+
end
|
32
|
+
add_message "google.cloud.speech.v1.RecognitionConfig" do
|
33
|
+
optional :encoding, :enum, 1, "google.cloud.speech.v1.RecognitionConfig.AudioEncoding"
|
34
|
+
optional :sample_rate_hertz, :int32, 2
|
35
|
+
optional :language_code, :string, 3
|
36
|
+
optional :max_alternatives, :int32, 4
|
37
|
+
optional :profanity_filter, :bool, 5
|
38
|
+
repeated :speech_contexts, :message, 6, "google.cloud.speech.v1.SpeechContext"
|
39
|
+
end
|
40
|
+
add_enum "google.cloud.speech.v1.RecognitionConfig.AudioEncoding" do
|
41
|
+
value :ENCODING_UNSPECIFIED, 0
|
42
|
+
value :LINEAR16, 1
|
43
|
+
value :FLAC, 2
|
44
|
+
value :MULAW, 3
|
45
|
+
value :AMR, 4
|
46
|
+
value :AMR_WB, 5
|
47
|
+
value :OGG_OPUS, 6
|
48
|
+
value :SPEEX_WITH_HEADER_BYTE, 7
|
49
|
+
end
|
50
|
+
add_message "google.cloud.speech.v1.SpeechContext" do
|
51
|
+
repeated :phrases, :string, 1
|
52
|
+
end
|
53
|
+
add_message "google.cloud.speech.v1.RecognitionAudio" do
|
54
|
+
oneof :audio_source do
|
55
|
+
optional :content, :bytes, 1
|
56
|
+
optional :uri, :string, 2
|
57
|
+
end
|
58
|
+
end
|
59
|
+
add_message "google.cloud.speech.v1.RecognizeResponse" do
|
60
|
+
repeated :results, :message, 2, "google.cloud.speech.v1.SpeechRecognitionResult"
|
61
|
+
end
|
62
|
+
add_message "google.cloud.speech.v1.LongRunningRecognizeResponse" do
|
63
|
+
repeated :results, :message, 2, "google.cloud.speech.v1.SpeechRecognitionResult"
|
64
|
+
end
|
65
|
+
add_message "google.cloud.speech.v1.LongRunningRecognizeMetadata" do
|
66
|
+
optional :progress_percent, :int32, 1
|
67
|
+
optional :start_time, :message, 2, "google.protobuf.Timestamp"
|
68
|
+
optional :last_update_time, :message, 3, "google.protobuf.Timestamp"
|
69
|
+
end
|
70
|
+
add_message "google.cloud.speech.v1.StreamingRecognizeResponse" do
|
71
|
+
optional :error, :message, 1, "google.rpc.Status"
|
72
|
+
repeated :results, :message, 2, "google.cloud.speech.v1.StreamingRecognitionResult"
|
73
|
+
optional :speech_event_type, :enum, 4, "google.cloud.speech.v1.StreamingRecognizeResponse.SpeechEventType"
|
74
|
+
end
|
75
|
+
add_enum "google.cloud.speech.v1.StreamingRecognizeResponse.SpeechEventType" do
|
76
|
+
value :SPEECH_EVENT_UNSPECIFIED, 0
|
77
|
+
value :END_OF_SINGLE_UTTERANCE, 1
|
78
|
+
end
|
79
|
+
add_message "google.cloud.speech.v1.StreamingRecognitionResult" do
|
80
|
+
repeated :alternatives, :message, 1, "google.cloud.speech.v1.SpeechRecognitionAlternative"
|
81
|
+
optional :is_final, :bool, 2
|
82
|
+
optional :stability, :float, 3
|
83
|
+
end
|
84
|
+
add_message "google.cloud.speech.v1.SpeechRecognitionResult" do
|
85
|
+
repeated :alternatives, :message, 1, "google.cloud.speech.v1.SpeechRecognitionAlternative"
|
86
|
+
end
|
87
|
+
add_message "google.cloud.speech.v1.SpeechRecognitionAlternative" do
|
88
|
+
optional :transcript, :string, 1
|
89
|
+
optional :confidence, :float, 2
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
module Google
|
94
|
+
module Cloud
|
95
|
+
module Speech
|
96
|
+
module V1
|
97
|
+
RecognizeRequest = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognizeRequest").msgclass
|
98
|
+
LongRunningRecognizeRequest = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.LongRunningRecognizeRequest").msgclass
|
99
|
+
StreamingRecognizeRequest = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognizeRequest").msgclass
|
100
|
+
StreamingRecognitionConfig = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognitionConfig").msgclass
|
101
|
+
RecognitionConfig = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionConfig").msgclass
|
102
|
+
RecognitionConfig::AudioEncoding = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionConfig.AudioEncoding").enummodule
|
103
|
+
SpeechContext = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.SpeechContext").msgclass
|
104
|
+
RecognitionAudio = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionAudio").msgclass
|
105
|
+
RecognizeResponse = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognizeResponse").msgclass
|
106
|
+
LongRunningRecognizeResponse = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.LongRunningRecognizeResponse").msgclass
|
107
|
+
LongRunningRecognizeMetadata = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.LongRunningRecognizeMetadata").msgclass
|
108
|
+
StreamingRecognizeResponse = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognizeResponse").msgclass
|
109
|
+
StreamingRecognizeResponse::SpeechEventType = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognizeResponse.SpeechEventType").enummodule
|
110
|
+
StreamingRecognitionResult = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognitionResult").msgclass
|
111
|
+
SpeechRecognitionResult = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.SpeechRecognitionResult").msgclass
|
112
|
+
SpeechRecognitionAlternative = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.SpeechRecognitionAlternative").msgclass
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
2
|
-
# Source: google/cloud/speech/
|
2
|
+
# Source: google/cloud/speech/v1/cloud_speech.proto for package 'google.cloud.speech.v1'
|
3
3
|
# Original file comments:
|
4
|
-
# Copyright
|
4
|
+
# Copyright 2017 Google Inc.
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
7
|
# you may not use this file except in compliance with the License.
|
@@ -17,12 +17,12 @@
|
|
17
17
|
#
|
18
18
|
|
19
19
|
require 'grpc'
|
20
|
-
require 'google/cloud/speech/
|
20
|
+
require 'google/cloud/speech/v1/cloud_speech_pb'
|
21
21
|
|
22
22
|
module Google
|
23
23
|
module Cloud
|
24
24
|
module Speech
|
25
|
-
module
|
25
|
+
module V1
|
26
26
|
module Speech
|
27
27
|
# Service that implements Google Cloud Speech API.
|
28
28
|
class Service
|
@@ -31,17 +31,17 @@ module Google
|
|
31
31
|
|
32
32
|
self.marshal_class_method = :encode
|
33
33
|
self.unmarshal_class_method = :decode
|
34
|
-
self.service_name = 'google.cloud.speech.
|
34
|
+
self.service_name = 'google.cloud.speech.v1.Speech'
|
35
35
|
|
36
|
-
#
|
36
|
+
# Performs synchronous speech recognition: receive results after all audio
|
37
37
|
# has been sent and processed.
|
38
|
-
rpc :
|
39
|
-
#
|
38
|
+
rpc :Recognize, RecognizeRequest, RecognizeResponse
|
39
|
+
# Performs asynchronous speech recognition: receive results via the
|
40
40
|
# google.longrunning.Operations interface. Returns either an
|
41
41
|
# `Operation.error` or an `Operation.response` which contains
|
42
|
-
#
|
43
|
-
rpc :
|
44
|
-
#
|
42
|
+
# a `LongRunningRecognizeResponse` message.
|
43
|
+
rpc :LongRunningRecognize, LongRunningRecognizeRequest, Google::Longrunning::Operation
|
44
|
+
# Performs bidirectional streaming speech recognition: receive results while
|
45
45
|
# sending audio. This method is only available via the gRPC API (not REST).
|
46
46
|
rpc :StreamingRecognize, stream(StreamingRecognizeRequest), stream(StreamingRecognizeResponse)
|
47
47
|
end
|
@@ -15,40 +15,37 @@
|
|
15
15
|
module Google
|
16
16
|
module Cloud
|
17
17
|
module Speech
|
18
|
-
module
|
19
|
-
#
|
20
|
-
# the +SyncRecognize+ method.
|
18
|
+
module V1
|
19
|
+
# The top-level message sent by the client for the +Recognize+ method.
|
21
20
|
# @!attribute [rw] config
|
22
|
-
# @return [Google::Cloud::Speech::
|
23
|
-
#
|
24
|
-
#
|
21
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
22
|
+
# *Required* Provides information to the recognizer that specifies how to
|
23
|
+
# process the request.
|
25
24
|
# @!attribute [rw] audio
|
26
|
-
# @return [Google::Cloud::Speech::
|
27
|
-
#
|
28
|
-
class
|
25
|
+
# @return [Google::Cloud::Speech::V1::RecognitionAudio]
|
26
|
+
# *Required* The audio data to be recognized.
|
27
|
+
class RecognizeRequest; end
|
29
28
|
|
30
|
-
#
|
31
|
-
#
|
29
|
+
# The top-level message sent by the client for the +LongRunningRecognize+
|
30
|
+
# method.
|
32
31
|
# @!attribute [rw] config
|
33
|
-
# @return [Google::Cloud::Speech::
|
34
|
-
#
|
35
|
-
#
|
32
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
33
|
+
# *Required* Provides information to the recognizer that specifies how to
|
34
|
+
# process the request.
|
36
35
|
# @!attribute [rw] audio
|
37
|
-
# @return [Google::Cloud::Speech::
|
38
|
-
#
|
39
|
-
class
|
36
|
+
# @return [Google::Cloud::Speech::V1::RecognitionAudio]
|
37
|
+
# *Required* The audio data to be recognized.
|
38
|
+
class LongRunningRecognizeRequest; end
|
40
39
|
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
40
|
+
# The top-level message sent by the client for the +StreamingRecognize+ method.
|
41
|
+
# Multiple +StreamingRecognizeRequest+ messages are sent. The first message
|
42
|
+
# must contain a +streaming_config+ message and must not contain +audio+ data.
|
43
|
+
# All subsequent messages must contain +audio+ data and must not contain a
|
44
|
+
# +streaming_config+ message.
|
46
45
|
# @!attribute [rw] streaming_config
|
47
|
-
# @return [Google::Cloud::Speech::
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
# The first +StreamingRecognizeRequest+ message must contain a
|
46
|
+
# @return [Google::Cloud::Speech::V1::StreamingRecognitionConfig]
|
47
|
+
# Provides information to the recognizer that specifies how to process the
|
48
|
+
# request. The first +StreamingRecognizeRequest+ message must contain a
|
52
49
|
# +streaming_config+ message.
|
53
50
|
# @!attribute [rw] audio_content
|
54
51
|
# @return [String]
|
@@ -62,68 +59,69 @@ module Google
|
|
62
59
|
# {audio limits}[https://cloud.google.com/speech/limits#content].
|
63
60
|
class StreamingRecognizeRequest; end
|
64
61
|
|
65
|
-
#
|
66
|
-
#
|
62
|
+
# Provides information to the recognizer that specifies how to process the
|
63
|
+
# request.
|
67
64
|
# @!attribute [rw] config
|
68
|
-
# @return [Google::Cloud::Speech::
|
69
|
-
#
|
70
|
-
#
|
65
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
66
|
+
# *Required* Provides information to the recognizer that specifies how to
|
67
|
+
# process the request.
|
71
68
|
# @!attribute [rw] single_utterance
|
72
69
|
# @return [true, false]
|
73
|
-
#
|
74
|
-
# recognition (continuing to process audio even if the user
|
75
|
-
# until the client closes the
|
76
|
-
# time limit has been reached.
|
77
|
-
# the +is_final+ flag set to +true
|
70
|
+
# *Optional* If +false+ or omitted, the recognizer will perform continuous
|
71
|
+
# recognition (continuing to wait for and process audio even if the user
|
72
|
+
# pauses speaking) until the client closes the input stream (gRPC API) or
|
73
|
+
# until the maximum time limit has been reached. May return multiple
|
74
|
+
# +StreamingRecognitionResult+s with the +is_final+ flag set to +true+.
|
78
75
|
#
|
79
76
|
# If +true+, the recognizer will detect a single spoken utterance. When it
|
80
77
|
# detects that the user has paused or stopped speaking, it will return an
|
81
|
-
# +
|
82
|
-
# one +StreamingRecognitionResult+ with the +is_final+ flag set to
|
78
|
+
# +END_OF_SINGLE_UTTERANCE+ event and cease recognition. It will return no
|
79
|
+
# more than one +StreamingRecognitionResult+ with the +is_final+ flag set to
|
80
|
+
# +true+.
|
83
81
|
# @!attribute [rw] interim_results
|
84
82
|
# @return [true, false]
|
85
|
-
#
|
83
|
+
# *Optional* If +true+, interim results (tentative hypotheses) may be
|
86
84
|
# returned as they become available (these interim results are indicated with
|
87
85
|
# the +is_final=false+ flag).
|
88
86
|
# If +false+ or omitted, only +is_final=true+ result(s) are returned.
|
89
87
|
class StreamingRecognitionConfig; end
|
90
88
|
|
91
|
-
#
|
92
|
-
#
|
89
|
+
# Provides information to the recognizer that specifies how to process the
|
90
|
+
# request.
|
93
91
|
# @!attribute [rw] encoding
|
94
|
-
# @return [Google::Cloud::Speech::
|
95
|
-
#
|
96
|
-
# @!attribute [rw]
|
92
|
+
# @return [Google::Cloud::Speech::V1::RecognitionConfig::AudioEncoding]
|
93
|
+
# *Required* Encoding of audio data sent in all +RecognitionAudio+ messages.
|
94
|
+
# @!attribute [rw] sample_rate_hertz
|
97
95
|
# @return [Integer]
|
98
|
-
#
|
96
|
+
# *Required* Sample rate in Hertz of the audio data sent in all
|
99
97
|
# +RecognitionAudio+ messages. Valid values are: 8000-48000.
|
100
98
|
# 16000 is optimal. For best results, set the sampling rate of the audio
|
101
99
|
# source to 16000 Hz. If that's not possible, use the native sample rate of
|
102
100
|
# the audio source (instead of re-sampling).
|
103
101
|
# @!attribute [rw] language_code
|
104
102
|
# @return [String]
|
105
|
-
#
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# {Language Support}[https://cloud.google.com/speech/docs/
|
103
|
+
# *Required* The language of the supplied audio as a
|
104
|
+
# {BCP-47}[https://www.rfc-editor.org/rfc/bcp/bcp47.txt] language tag.
|
105
|
+
# Example: "en-US".
|
106
|
+
# See {Language Support}[https://cloud.google.com/speech/docs/languages]
|
109
107
|
# for a list of the currently supported language codes.
|
110
108
|
# @!attribute [rw] max_alternatives
|
111
109
|
# @return [Integer]
|
112
|
-
#
|
110
|
+
# *Optional* Maximum number of recognition hypotheses to be returned.
|
113
111
|
# Specifically, the maximum number of +SpeechRecognitionAlternative+ messages
|
114
112
|
# within each +SpeechRecognitionResult+.
|
115
113
|
# The server may return fewer than +max_alternatives+.
|
116
114
|
# Valid values are +0+-+30+. A value of +0+ or +1+ will return a maximum of
|
117
|
-
#
|
115
|
+
# one. If omitted, will return a maximum of one.
|
118
116
|
# @!attribute [rw] profanity_filter
|
119
117
|
# @return [true, false]
|
120
|
-
#
|
118
|
+
# *Optional* If set to +true+, the server will attempt to filter out
|
121
119
|
# profanities, replacing all but the initial character in each filtered word
|
122
120
|
# with asterisks, e.g. "f***". If set to +false+ or omitted, profanities
|
123
121
|
# won't be filtered out.
|
124
|
-
# @!attribute [rw]
|
125
|
-
# @return [Google::Cloud::Speech::
|
126
|
-
#
|
122
|
+
# @!attribute [rw] speech_contexts
|
123
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechContext>]
|
124
|
+
# *Optional* A means to provide context to assist the speech recognition.
|
127
125
|
class RecognitionConfig
|
128
126
|
# Audio encoding of the data sent in the audio message. All encodings support
|
129
127
|
# only 1 channel (mono) audio. Only +FLAC+ includes a header that describes
|
@@ -132,34 +130,52 @@ module Google
|
|
132
130
|
#
|
133
131
|
# For best results, the audio source should be captured and transmitted using
|
134
132
|
# a lossless encoding (+FLAC+ or +LINEAR16+). Recognition accuracy may be
|
135
|
-
# reduced if lossy codecs
|
136
|
-
# or transmit the audio, particularly if
|
133
|
+
# reduced if lossy codecs, which include the other codecs listed in
|
134
|
+
# this section, are used to capture or transmit the audio, particularly if
|
135
|
+
# background noise is present.
|
137
136
|
module AudioEncoding
|
138
137
|
# Not specified. Will return result Google::Rpc::Code::INVALID_ARGUMENT.
|
139
138
|
ENCODING_UNSPECIFIED = 0
|
140
139
|
|
141
140
|
# Uncompressed 16-bit signed little-endian samples (Linear PCM).
|
142
|
-
# This is the only encoding that may be used by +AsyncRecognize+.
|
143
141
|
LINEAR16 = 1
|
144
142
|
|
145
|
-
#
|
146
|
-
#
|
147
|
-
# recognition
|
148
|
-
#
|
149
|
-
#
|
150
|
-
#
|
151
|
-
# 16-bit and 24-bit samples are supported.
|
152
|
-
# Not all fields in STREAMINFO are supported.
|
143
|
+
# {+FLAC+}[https://xiph.org/flac/documentation.html] (Free Lossless Audio
|
144
|
+
# Codec) is the recommended encoding because it is
|
145
|
+
# lossless--therefore recognition is not compromised--and
|
146
|
+
# requires only about half the bandwidth of +LINEAR16+. +FLAC+ stream
|
147
|
+
# encoding supports 16-bit and 24-bit samples; however, not all fields in
|
148
|
+
# +STREAMINFO+ are supported.
|
153
149
|
FLAC = 2
|
154
150
|
|
155
151
|
# 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
|
156
152
|
MULAW = 3
|
157
153
|
|
158
|
-
# Adaptive Multi-Rate Narrowband codec. +
|
154
|
+
# Adaptive Multi-Rate Narrowband codec. +sample_rate_hertz+ must be 8000.
|
159
155
|
AMR = 4
|
160
156
|
|
161
|
-
# Adaptive Multi-Rate Wideband codec. +
|
157
|
+
# Adaptive Multi-Rate Wideband codec. +sample_rate_hertz+ must be 16000.
|
162
158
|
AMR_WB = 5
|
159
|
+
|
160
|
+
# Opus encoded audio frames in Ogg container
|
161
|
+
# ({OggOpus}[https://wiki.xiph.org/OggOpus]).
|
162
|
+
# +sample_rate_hertz+ must be 16000.
|
163
|
+
OGG_OPUS = 6
|
164
|
+
|
165
|
+
# Although the use of lossy encodings is not recommended, if a very low
|
166
|
+
# bitrate encoding is required, +OGG_OPUS+ is highly preferred over
|
167
|
+
# Speex encoding. The {Speex}[https://speex.org/] encoding supported by
|
168
|
+
# Cloud Speech API has a header byte in each block, as in MIME type
|
169
|
+
# +audio/x-speex-with-header-byte+.
|
170
|
+
# It is a variant of the RTP Speex encoding defined in
|
171
|
+
# {RFC 5574}[https://tools.ietf.org/html/rfc5574].
|
172
|
+
# The stream is a sequence of blocks, one block per RTP packet. Each block
|
173
|
+
# starts with a byte containing the length of the block, in bytes, followed
|
174
|
+
# by one or more frames of Speex data, padded to an integral number of
|
175
|
+
# bytes (octets) as specified in RFC 5574. In other words, each RTP header
|
176
|
+
# is replaced with a single byte containing the block length. Only Speex
|
177
|
+
# wideband is supported. +sample_rate_hertz+ must be 16000.
|
178
|
+
SPEEX_WITH_HEADER_BYTE = 7
|
163
179
|
end
|
164
180
|
end
|
165
181
|
|
@@ -167,7 +183,7 @@ module Google
|
|
167
183
|
# in the results.
|
168
184
|
# @!attribute [rw] phrases
|
169
185
|
# @return [Array<String>]
|
170
|
-
#
|
186
|
+
# *Optional* A list of strings containing words and phrases "hints" so that
|
171
187
|
# the speech recognition is more likely to recognize them. This can be used
|
172
188
|
# to improve the accuracy for specific words and phrases, for example, if
|
173
189
|
# specific commands are typically spoken by the user. This can also be used
|
@@ -194,30 +210,29 @@ module Google
|
|
194
210
|
# {Request URIs}[https://cloud.google.com/storage/docs/reference-uris].
|
195
211
|
class RecognitionAudio; end
|
196
212
|
|
197
|
-
#
|
198
|
-
#
|
199
|
-
#
|
213
|
+
# The only message returned to the client by the +Recognize+ method. It
|
214
|
+
# contains the result as zero or more sequential +SpeechRecognitionResult+
|
215
|
+
# messages.
|
200
216
|
# @!attribute [rw] results
|
201
|
-
# @return [Array<Google::Cloud::Speech::
|
202
|
-
#
|
217
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionResult>]
|
218
|
+
# *Output-only* Sequential list of transcription results corresponding to
|
203
219
|
# sequential portions of audio.
|
204
|
-
class
|
220
|
+
class RecognizeResponse; end
|
205
221
|
|
206
|
-
#
|
207
|
-
#
|
208
|
-
#
|
209
|
-
#
|
210
|
-
#
|
222
|
+
# The only message returned to the client by the +LongRunningRecognize+ method.
|
223
|
+
# It contains the result as zero or more sequential +SpeechRecognitionResult+
|
224
|
+
# messages. It is included in the +result.response+ field of the +Operation+
|
225
|
+
# returned by the +GetOperation+ call of the +google::longrunning::Operations+
|
226
|
+
# service.
|
211
227
|
# @!attribute [rw] results
|
212
|
-
# @return [Array<Google::Cloud::Speech::
|
213
|
-
#
|
228
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionResult>]
|
229
|
+
# *Output-only* Sequential list of transcription results corresponding to
|
214
230
|
# sequential portions of audio.
|
215
|
-
class
|
231
|
+
class LongRunningRecognizeResponse; end
|
216
232
|
|
217
|
-
#
|
218
|
-
#
|
219
|
-
# +
|
220
|
-
# +google::longrunning::Operations+ service.
|
233
|
+
# Describes the progress of a long-running +LongRunningRecognize+ call. It is
|
234
|
+
# included in the +metadata+ field of the +Operation+ returned by the
|
235
|
+
# +GetOperation+ call of the +google::longrunning::Operations+ service.
|
221
236
|
# @!attribute [rw] progress_percent
|
222
237
|
# @return [Integer]
|
223
238
|
# Approximate percentage of audio processed thus far. Guaranteed to be 100
|
@@ -228,7 +243,7 @@ module Google
|
|
228
243
|
# @!attribute [rw] last_update_time
|
229
244
|
# @return [Google::Protobuf::Timestamp]
|
230
245
|
# Time of the most recent processing update.
|
231
|
-
class
|
246
|
+
class LongRunningRecognizeMetadata; end
|
232
247
|
|
233
248
|
# +StreamingRecognizeResponse+ is the only message returned to the client by
|
234
249
|
# +StreamingRecognize+. A series of one or more +StreamingRecognizeResponse+
|
@@ -237,139 +252,120 @@ module Google
|
|
237
252
|
# Here's an example of a series of eight +StreamingRecognizeResponse+s that might
|
238
253
|
# be returned while processing audio:
|
239
254
|
#
|
240
|
-
# 1.
|
255
|
+
# 1. results { alternatives { transcript: "tube" } stability: 0.01 }
|
241
256
|
#
|
242
|
-
# 2. results { alternatives { transcript: "
|
243
|
-
# result_index: 0
|
257
|
+
# 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
|
244
258
|
#
|
245
|
-
# 3. results { alternatives { transcript: "to be
|
246
|
-
# result_index: 0
|
247
|
-
#
|
248
|
-
# 4. results { alternatives { transcript: "to be" } stability: 0.9 }
|
259
|
+
# 3. results { alternatives { transcript: "to be" } stability: 0.9 }
|
249
260
|
# results { alternatives { transcript: " or not to be" } stability: 0.01 }
|
250
|
-
# result_index: 0
|
251
261
|
#
|
252
|
-
#
|
262
|
+
# 4. results { alternatives { transcript: "to be or not to be"
|
253
263
|
# confidence: 0.92 }
|
254
264
|
# alternatives { transcript: "to bee or not to bee" }
|
255
265
|
# is_final: true }
|
256
|
-
# result_index: 0
|
257
266
|
#
|
258
|
-
#
|
259
|
-
# result_index: 1
|
267
|
+
# 5. results { alternatives { transcript: " that's" } stability: 0.01 }
|
260
268
|
#
|
261
|
-
#
|
269
|
+
# 6. results { alternatives { transcript: " that is" } stability: 0.9 }
|
262
270
|
# results { alternatives { transcript: " the question" } stability: 0.01 }
|
263
|
-
# result_index: 1
|
264
271
|
#
|
265
|
-
#
|
272
|
+
# 7. speech_event_type: END_OF_SINGLE_UTTERANCE
|
266
273
|
#
|
267
|
-
#
|
274
|
+
# 8. results { alternatives { transcript: " that is the question"
|
268
275
|
# confidence: 0.98 }
|
269
276
|
# alternatives { transcript: " that was the question" }
|
270
277
|
# is_final: true }
|
271
|
-
# result_index: 1
|
272
|
-
#
|
273
|
-
# 10. endpointer_type: END_OF_AUDIO
|
274
278
|
#
|
275
279
|
# Notes:
|
276
280
|
#
|
277
|
-
# - Only two of the above responses #
|
281
|
+
# - Only two of the above responses, #4 and #8, contain final results; they are
|
278
282
|
# indicated by +is_final: true+. Concatenating these together generates the
|
279
283
|
# full transcript: "to be or not to be that is the question".
|
280
284
|
#
|
281
|
-
# - The others contain interim +results+. #
|
282
|
-
# +results
|
283
|
-
# change
|
285
|
+
# - The others contain interim +results+. #3 and #6 contain two interim
|
286
|
+
# +results+: the first portion has a high stability and is less likely to
|
287
|
+
# change; the second portion has a low stability and is very likely to
|
284
288
|
# change. A UI designer might choose to show only high stability +results+.
|
285
289
|
#
|
286
|
-
# - The +
|
287
|
-
#
|
288
|
-
#
|
289
|
-
#
|
290
|
+
# - The specific +stability+ and +confidence+ values shown above are only for
|
291
|
+
# illustrative purposes. Actual values may vary.
|
292
|
+
#
|
293
|
+
# - In each response, only one of these fields will be set:
|
294
|
+
# +error+,
|
295
|
+
# +speech_event_type+, or
|
296
|
+
# one or more (repeated) +results+.
|
290
297
|
# @!attribute [rw] error
|
291
298
|
# @return [Google::Rpc::Status]
|
292
|
-
#
|
299
|
+
# *Output-only* If set, returns a Google::Rpc::Status message that
|
293
300
|
# specifies the error for the operation.
|
294
301
|
# @!attribute [rw] results
|
295
|
-
# @return [Array<Google::Cloud::Speech::
|
296
|
-
#
|
302
|
+
# @return [Array<Google::Cloud::Speech::V1::StreamingRecognitionResult>]
|
303
|
+
# *Output-only* This repeated list contains zero or more results that
|
297
304
|
# correspond to consecutive portions of the audio currently being processed.
|
298
305
|
# It contains zero or one +is_final=true+ result (the newly settled portion),
|
299
306
|
# followed by zero or more +is_final=false+ results.
|
300
|
-
# @!attribute [rw]
|
301
|
-
# @return [
|
302
|
-
#
|
303
|
-
# changed. The repeated +StreamingRecognitionResult+ results overwrite past
|
304
|
-
# results at this index and higher.
|
305
|
-
# @!attribute [rw] endpointer_type
|
306
|
-
# @return [Google::Cloud::Speech::V1beta1::StreamingRecognizeResponse::EndpointerType]
|
307
|
-
# [Output-only] Indicates the type of endpointer event.
|
307
|
+
# @!attribute [rw] speech_event_type
|
308
|
+
# @return [Google::Cloud::Speech::V1::StreamingRecognizeResponse::SpeechEventType]
|
309
|
+
# *Output-only* Indicates the type of speech event.
|
308
310
|
class StreamingRecognizeResponse
|
309
|
-
# Indicates the type of
|
310
|
-
module
|
311
|
-
# No
|
312
|
-
|
313
|
-
|
314
|
-
# Speech has been detected in the audio stream.
|
315
|
-
START_OF_SPEECH = 1
|
316
|
-
|
317
|
-
# Speech has ceased to be detected in the audio stream.
|
318
|
-
END_OF_SPEECH = 2
|
319
|
-
|
320
|
-
# The end of the audio stream has been reached. and it is being processed.
|
321
|
-
END_OF_AUDIO = 3
|
311
|
+
# Indicates the type of speech event.
|
312
|
+
module SpeechEventType
|
313
|
+
# No speech event specified.
|
314
|
+
SPEECH_EVENT_UNSPECIFIED = 0
|
322
315
|
|
323
|
-
# This event
|
324
|
-
#
|
325
|
-
#
|
326
|
-
# additional
|
327
|
-
|
316
|
+
# This event indicates that the server has detected the end of the user's
|
317
|
+
# speech utterance and expects no additional speech. Therefore, the server
|
318
|
+
# will not process additional audio (although it may subsequently return
|
319
|
+
# additional results). The client should stop sending additional audio
|
320
|
+
# data, half-close the gRPC connection, and wait for any additional results
|
321
|
+
# until the server closes the gRPC connection. This event is only sent if
|
322
|
+
# +single_utterance+ was set to +true+, and is not used otherwise.
|
323
|
+
END_OF_SINGLE_UTTERANCE = 1
|
328
324
|
end
|
329
325
|
end
|
330
326
|
|
331
327
|
# A streaming speech recognition result corresponding to a portion of the audio
|
332
328
|
# that is currently being processed.
|
333
329
|
# @!attribute [rw] alternatives
|
334
|
-
# @return [Array<Google::Cloud::Speech::
|
335
|
-
#
|
330
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionAlternative>]
|
331
|
+
# *Output-only* May contain one or more recognition hypotheses (up to the
|
336
332
|
# maximum specified in +max_alternatives+).
|
337
333
|
# @!attribute [rw] is_final
|
338
334
|
# @return [true, false]
|
339
|
-
#
|
335
|
+
# *Output-only* If +false+, this +StreamingRecognitionResult+ represents an
|
340
336
|
# interim result that may change. If +true+, this is the final time the
|
341
337
|
# speech service will return this particular +StreamingRecognitionResult+;
|
342
338
|
# the recognizer will not return any further hypotheses for this portion of
|
343
339
|
# the transcript and corresponding audio.
|
344
340
|
# @!attribute [rw] stability
|
345
341
|
# @return [Float]
|
346
|
-
#
|
342
|
+
# *Output-only* An estimate of the likelihood that the recognizer will not
|
347
343
|
# change its guess about this interim result. Values range from 0.0
|
348
|
-
# (completely unstable) to 1.0 (completely stable).
|
349
|
-
# same as +confidence+, which estimates the probability that a recognition
|
350
|
-
# result is correct.
|
344
|
+
# (completely unstable) to 1.0 (completely stable).
|
351
345
|
# This field is only provided for interim results (+is_final=false+).
|
352
|
-
# The default of 0.0 is a sentinel value indicating stability was not set.
|
346
|
+
# The default of 0.0 is a sentinel value indicating +stability+ was not set.
|
353
347
|
class StreamingRecognitionResult; end
|
354
348
|
|
355
349
|
# A speech recognition result corresponding to a portion of the audio.
|
356
350
|
# @!attribute [rw] alternatives
|
357
|
-
# @return [Array<Google::Cloud::Speech::
|
358
|
-
#
|
351
|
+
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionAlternative>]
|
352
|
+
# *Output-only* May contain one or more recognition hypotheses (up to the
|
359
353
|
# maximum specified in +max_alternatives+).
|
360
354
|
class SpeechRecognitionResult; end
|
361
355
|
|
362
356
|
# Alternative hypotheses (a.k.a. n-best list).
|
363
357
|
# @!attribute [rw] transcript
|
364
358
|
# @return [String]
|
365
|
-
#
|
359
|
+
# *Output-only* Transcript text representing the words that the user spoke.
|
366
360
|
# @!attribute [rw] confidence
|
367
361
|
# @return [Float]
|
368
|
-
#
|
369
|
-
#
|
370
|
-
# This field is typically provided only for the top hypothesis, and
|
371
|
-
# +is_final=true+ results.
|
372
|
-
#
|
362
|
+
# *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
|
363
|
+
# indicates an estimated greater likelihood that the recognized words are
|
364
|
+
# correct. This field is typically provided only for the top hypothesis, and
|
365
|
+
# only for +is_final=true+ results. Clients should not rely on the
|
366
|
+
# +confidence+ field as it is not guaranteed to be accurate, or even set, in
|
367
|
+
# any of the results.
|
368
|
+
# The default of 0.0 is a sentinel value indicating +confidence+ was not set.
|
373
369
|
class SpeechRecognitionAlternative; end
|
374
370
|
end
|
375
371
|
end
|