google-cloud-speech 0.37.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/README.md +5 -5
- data/lib/google/cloud/speech.rb +4 -4
- data/lib/google/cloud/speech/v1.rb +4 -4
- data/lib/google/cloud/speech/v1/cloud_speech_pb.rb +11 -0
- data/lib/google/cloud/speech/v1/cloud_speech_services_pb.rb +3 -1
- data/lib/google/cloud/speech/v1/doc/google/cloud/speech/v1/cloud_speech.rb +102 -61
- data/lib/google/cloud/speech/v1/speech_client.rb +6 -4
- data/lib/google/cloud/speech/v1/speech_client_config.json +8 -8
- data/lib/google/cloud/speech/v1p1beta1.rb +5 -4
- data/lib/google/cloud/speech/v1p1beta1/cloud_speech_pb.rb +2 -1
- data/lib/google/cloud/speech/v1p1beta1/cloud_speech_services_pb.rb +3 -1
- data/lib/google/cloud/speech/v1p1beta1/doc/google/cloud/speech/v1p1beta1/cloud_speech.rb +82 -87
- data/lib/google/cloud/speech/v1p1beta1/speech_client.rb +6 -4
- data/lib/google/cloud/speech/v1p1beta1/speech_client_config.json +8 -8
- data/lib/google/cloud/speech/version.rb +1 -1
- metadata +5 -4
|
@@ -209,12 +209,12 @@ module Google
|
|
|
209
209
|
# has been sent and processed.
|
|
210
210
|
#
|
|
211
211
|
# @param config [Google::Cloud::Speech::V1::RecognitionConfig | Hash]
|
|
212
|
-
#
|
|
212
|
+
# Required. Provides information to the recognizer that specifies how to
|
|
213
213
|
# process the request.
|
|
214
214
|
# A hash of the same form as `Google::Cloud::Speech::V1::RecognitionConfig`
|
|
215
215
|
# can also be provided.
|
|
216
216
|
# @param audio [Google::Cloud::Speech::V1::RecognitionAudio | Hash]
|
|
217
|
-
#
|
|
217
|
+
# Required. The audio data to be recognized.
|
|
218
218
|
# A hash of the same form as `Google::Cloud::Speech::V1::RecognitionAudio`
|
|
219
219
|
# can also be provided.
|
|
220
220
|
# @param options [Google::Gax::CallOptions]
|
|
@@ -258,14 +258,16 @@ module Google
|
|
|
258
258
|
# google.longrunning.Operations interface. Returns either an
|
|
259
259
|
# `Operation.error` or an `Operation.response` which contains
|
|
260
260
|
# a `LongRunningRecognizeResponse` message.
|
|
261
|
+
# For more information on asynchronous speech recognition, see the
|
|
262
|
+
# [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
|
|
261
263
|
#
|
|
262
264
|
# @param config [Google::Cloud::Speech::V1::RecognitionConfig | Hash]
|
|
263
|
-
#
|
|
265
|
+
# Required. Provides information to the recognizer that specifies how to
|
|
264
266
|
# process the request.
|
|
265
267
|
# A hash of the same form as `Google::Cloud::Speech::V1::RecognitionConfig`
|
|
266
268
|
# can also be provided.
|
|
267
269
|
# @param audio [Google::Cloud::Speech::V1::RecognitionAudio | Hash]
|
|
268
|
-
#
|
|
270
|
+
# Required. The audio data to be recognized.
|
|
269
271
|
# A hash of the same form as `Google::Cloud::Speech::V1::RecognitionAudio`
|
|
270
272
|
# can also be provided.
|
|
271
273
|
# @param options [Google::Gax::CallOptions]
|
|
@@ -13,26 +13,26 @@
|
|
|
13
13
|
"initial_retry_delay_millis": 100,
|
|
14
14
|
"retry_delay_multiplier": 1.3,
|
|
15
15
|
"max_retry_delay_millis": 60000,
|
|
16
|
-
"initial_rpc_timeout_millis":
|
|
16
|
+
"initial_rpc_timeout_millis": 20000,
|
|
17
17
|
"rpc_timeout_multiplier": 1.0,
|
|
18
|
-
"max_rpc_timeout_millis":
|
|
19
|
-
"total_timeout_millis":
|
|
18
|
+
"max_rpc_timeout_millis": 20000,
|
|
19
|
+
"total_timeout_millis": 600000
|
|
20
20
|
}
|
|
21
21
|
},
|
|
22
22
|
"methods": {
|
|
23
23
|
"Recognize": {
|
|
24
|
-
"timeout_millis":
|
|
25
|
-
"retry_codes_name": "
|
|
24
|
+
"timeout_millis": 60000,
|
|
25
|
+
"retry_codes_name": "non_idempotent",
|
|
26
26
|
"retry_params_name": "default"
|
|
27
27
|
},
|
|
28
28
|
"LongRunningRecognize": {
|
|
29
|
-
"timeout_millis":
|
|
29
|
+
"timeout_millis": 60000,
|
|
30
30
|
"retry_codes_name": "non_idempotent",
|
|
31
31
|
"retry_params_name": "default"
|
|
32
32
|
},
|
|
33
33
|
"StreamingRecognize": {
|
|
34
|
-
"timeout_millis":
|
|
35
|
-
"retry_codes_name": "
|
|
34
|
+
"timeout_millis": 60000,
|
|
35
|
+
"retry_codes_name": "non_idempotent",
|
|
36
36
|
"retry_params_name": "default"
|
|
37
37
|
}
|
|
38
38
|
}
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
|
|
16
16
|
require "google/cloud/speech/v1p1beta1/speech_client"
|
|
17
17
|
require "google/cloud/speech/v1p1beta1/helpers"
|
|
18
|
+
require "google/cloud/speech/v1p1beta1/cloud_speech_pb"
|
|
18
19
|
|
|
19
20
|
module Google
|
|
20
21
|
module Cloud
|
|
@@ -22,9 +23,9 @@ module Google
|
|
|
22
23
|
# rubocop:disable LineLength
|
|
23
24
|
|
|
24
25
|
##
|
|
25
|
-
# # Ruby Client for Cloud Speech API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
|
26
|
+
# # Ruby Client for Cloud Speech-to-Text API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
|
26
27
|
#
|
|
27
|
-
# [Cloud Speech API][Product Documentation]:
|
|
28
|
+
# [Cloud Speech-to-Text API][Product Documentation]:
|
|
28
29
|
# Converts audio to text by applying powerful neural network models.
|
|
29
30
|
# - [Product Documentation][]
|
|
30
31
|
#
|
|
@@ -34,7 +35,7 @@ module Google
|
|
|
34
35
|
#
|
|
35
36
|
# 1. [Select or create a Cloud Platform project.](https://console.cloud.google.com/project)
|
|
36
37
|
# 2. [Enable billing for your project.](https://cloud.google.com/billing/docs/how-to/modify-project#enable_billing_for_a_project)
|
|
37
|
-
# 3. [Enable the Cloud Speech API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
|
38
|
+
# 3. [Enable the Cloud Speech-to-Text API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
|
38
39
|
# 4. [Setup Authentication.](https://googleapis.dev/ruby/google-cloud-speech/latest/file.AUTHENTICATION.html)
|
|
39
40
|
#
|
|
40
41
|
# ### Installation
|
|
@@ -62,7 +63,7 @@ module Google
|
|
|
62
63
|
# ```
|
|
63
64
|
#
|
|
64
65
|
# ### Next Steps
|
|
65
|
-
# - Read the [Cloud Speech API Product documentation][Product Documentation]
|
|
66
|
+
# - Read the [Cloud Speech-to-Text API Product documentation][Product Documentation]
|
|
66
67
|
# to learn more about the product and see How-to Guides.
|
|
67
68
|
# - View this [repository's main README](https://github.com/googleapis/google-cloud-ruby/blob/master/README.md)
|
|
68
69
|
# to see the full list of Cloud APIs that we cover.
|
|
@@ -5,10 +5,11 @@
|
|
|
5
5
|
require 'google/protobuf'
|
|
6
6
|
|
|
7
7
|
require 'google/api/annotations_pb'
|
|
8
|
+
require 'google/api/client_pb'
|
|
9
|
+
require 'google/api/field_behavior_pb'
|
|
8
10
|
require 'google/longrunning/operations_pb'
|
|
9
11
|
require 'google/protobuf/any_pb'
|
|
10
12
|
require 'google/protobuf/duration_pb'
|
|
11
|
-
require 'google/protobuf/empty_pb'
|
|
12
13
|
require 'google/protobuf/timestamp_pb'
|
|
13
14
|
require 'google/rpc/status_pb'
|
|
14
15
|
Google::Protobuf::DescriptorPool.generated_pool.build do
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
2
2
|
# Source: google/cloud/speech/v1p1beta1/cloud_speech.proto for package 'google.cloud.speech.v1p1beta1'
|
|
3
3
|
# Original file comments:
|
|
4
|
-
# Copyright
|
|
4
|
+
# Copyright 2019 Google LLC.
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -42,6 +42,8 @@ module Google
|
|
|
42
42
|
# google.longrunning.Operations interface. Returns either an
|
|
43
43
|
# `Operation.error` or an `Operation.response` which contains
|
|
44
44
|
# a `LongRunningRecognizeResponse` message.
|
|
45
|
+
# For more information on asynchronous speech recognition, see the
|
|
46
|
+
# [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
|
|
45
47
|
rpc :LongRunningRecognize, LongRunningRecognizeRequest, Google::Longrunning::Operation
|
|
46
48
|
# Performs bidirectional streaming speech recognition: receive results while
|
|
47
49
|
# sending audio. This method is only available via the gRPC API (not REST).
|
|
@@ -20,29 +20,29 @@ module Google
|
|
|
20
20
|
# The top-level message sent by the client for the `Recognize` method.
|
|
21
21
|
# @!attribute [rw] config
|
|
22
22
|
# @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig]
|
|
23
|
-
#
|
|
23
|
+
# Required. Provides information to the recognizer that specifies how to
|
|
24
24
|
# process the request.
|
|
25
25
|
# @!attribute [rw] audio
|
|
26
26
|
# @return [Google::Cloud::Speech::V1p1beta1::RecognitionAudio]
|
|
27
|
-
#
|
|
27
|
+
# Required. The audio data to be recognized.
|
|
28
28
|
class RecognizeRequest; end
|
|
29
29
|
|
|
30
30
|
# The top-level message sent by the client for the `LongRunningRecognize`
|
|
31
31
|
# method.
|
|
32
32
|
# @!attribute [rw] config
|
|
33
33
|
# @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig]
|
|
34
|
-
#
|
|
34
|
+
# Required. Provides information to the recognizer that specifies how to
|
|
35
35
|
# process the request.
|
|
36
36
|
# @!attribute [rw] audio
|
|
37
37
|
# @return [Google::Cloud::Speech::V1p1beta1::RecognitionAudio]
|
|
38
|
-
#
|
|
38
|
+
# Required. The audio data to be recognized.
|
|
39
39
|
class LongRunningRecognizeRequest; end
|
|
40
40
|
|
|
41
41
|
# The top-level message sent by the client for the `StreamingRecognize` method.
|
|
42
42
|
# Multiple `StreamingRecognizeRequest` messages are sent. The first message
|
|
43
|
-
# must contain a `streaming_config` message and must not contain
|
|
44
|
-
# All subsequent messages must contain `
|
|
45
|
-
# `streaming_config` message.
|
|
43
|
+
# must contain a `streaming_config` message and must not contain
|
|
44
|
+
# `audio_content`. All subsequent messages must contain `audio_content` and
|
|
45
|
+
# must not contain a `streaming_config` message.
|
|
46
46
|
# @!attribute [rw] streaming_config
|
|
47
47
|
# @return [Google::Cloud::Speech::V1p1beta1::StreamingRecognitionConfig]
|
|
48
48
|
# Provides information to the recognizer that specifies how to process the
|
|
@@ -55,7 +55,7 @@ module Google
|
|
|
55
55
|
# `StreamingRecognizeRequest` message must not contain `audio_content` data
|
|
56
56
|
# and all subsequent `StreamingRecognizeRequest` messages must contain
|
|
57
57
|
# `audio_content` data. The audio bytes must be encoded as specified in
|
|
58
|
-
# `RecognitionConfig`. Note: as with all bytes fields,
|
|
58
|
+
# `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
|
|
59
59
|
# pure binary representation (not base64). See
|
|
60
60
|
# [content limits](https://cloud.google.com/speech-to-text/quotas#content).
|
|
61
61
|
class StreamingRecognizeRequest; end
|
|
@@ -64,11 +64,11 @@ module Google
|
|
|
64
64
|
# request.
|
|
65
65
|
# @!attribute [rw] config
|
|
66
66
|
# @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig]
|
|
67
|
-
#
|
|
67
|
+
# Required. Provides information to the recognizer that specifies how to
|
|
68
68
|
# process the request.
|
|
69
69
|
# @!attribute [rw] single_utterance
|
|
70
70
|
# @return [true, false]
|
|
71
|
-
#
|
|
71
|
+
# If `false` or omitted, the recognizer will perform continuous
|
|
72
72
|
# recognition (continuing to wait for and process audio even if the user
|
|
73
73
|
# pauses speaking) until the client closes the input stream (gRPC API) or
|
|
74
74
|
# until the maximum time limit has been reached. May return multiple
|
|
@@ -81,7 +81,7 @@ module Google
|
|
|
81
81
|
# `true`.
|
|
82
82
|
# @!attribute [rw] interim_results
|
|
83
83
|
# @return [true, false]
|
|
84
|
-
#
|
|
84
|
+
# If `true`, interim results (tentative hypotheses) may be
|
|
85
85
|
# returned as they become available (these interim results are indicated with
|
|
86
86
|
# the `is_final=false` flag).
|
|
87
87
|
# If `false` or omitted, only `is_final=true` result(s) are returned.
|
|
@@ -93,8 +93,7 @@ module Google
|
|
|
93
93
|
# @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding]
|
|
94
94
|
# Encoding of audio data sent in all `RecognitionAudio` messages.
|
|
95
95
|
# This field is optional for `FLAC` and `WAV` audio files and required
|
|
96
|
-
# for all other audio formats. For details, see
|
|
97
|
-
# {Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
|
96
|
+
# for all other audio formats. For details, see {Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
|
98
97
|
# @!attribute [rw] sample_rate_hertz
|
|
99
98
|
# @return [Integer]
|
|
100
99
|
# Sample rate in Hertz of the audio data sent in all
|
|
@@ -102,12 +101,11 @@ module Google
|
|
|
102
101
|
# 16000 is optimal. For best results, set the sampling rate of the audio
|
|
103
102
|
# source to 16000 Hz. If that's not possible, use the native sample rate of
|
|
104
103
|
# the audio source (instead of re-sampling).
|
|
105
|
-
# This field is optional for
|
|
106
|
-
# for all other audio formats. For details, see
|
|
107
|
-
# {Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
|
104
|
+
# This field is optional for FLAC and WAV audio files, but is
|
|
105
|
+
# required for all other audio formats. For details, see {Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
|
108
106
|
# @!attribute [rw] audio_channel_count
|
|
109
107
|
# @return [Integer]
|
|
110
|
-
#
|
|
108
|
+
# The number of channels in the input audio data.
|
|
111
109
|
# ONLY set this for MULTI-CHANNEL recognition.
|
|
112
110
|
# Valid values for LINEAR16 and FLAC are `1`-`8`.
|
|
113
111
|
# Valid values for OGG_OPUS are '1'-'254'.
|
|
@@ -118,7 +116,7 @@ module Google
|
|
|
118
116
|
# `enable_separate_recognition_per_channel` to 'true'.
|
|
119
117
|
# @!attribute [rw] enable_separate_recognition_per_channel
|
|
120
118
|
# @return [true, false]
|
|
121
|
-
# This needs to be set to
|
|
119
|
+
# This needs to be set to `true` explicitly and `audio_channel_count` > 1
|
|
122
120
|
# to get each channel recognized separately. The recognition result will
|
|
123
121
|
# contain a `channel_tag` field to state which channel that result belongs
|
|
124
122
|
# to. If this is not true, we will only recognize the first channel. The
|
|
@@ -126,28 +124,29 @@ module Google
|
|
|
126
124
|
# `audio_channel_count` multiplied by the length of the audio.
|
|
127
125
|
# @!attribute [rw] language_code
|
|
128
126
|
# @return [String]
|
|
129
|
-
#
|
|
127
|
+
# Required. The language of the supplied audio as a
|
|
130
128
|
# [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
|
|
131
129
|
# Example: "en-US".
|
|
132
|
-
# See [Language
|
|
133
|
-
# for a list
|
|
130
|
+
# See [Language
|
|
131
|
+
# Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
|
|
132
|
+
# of the currently supported language codes.
|
|
134
133
|
# @!attribute [rw] alternative_language_codes
|
|
135
134
|
# @return [Array<String>]
|
|
136
|
-
#
|
|
135
|
+
# A list of up to 3 additional
|
|
137
136
|
# [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
|
|
138
137
|
# listing possible alternative languages of the supplied audio.
|
|
139
|
-
# See [Language
|
|
140
|
-
# for a list
|
|
141
|
-
#
|
|
142
|
-
# recognition in the most likely
|
|
143
|
-
# language_code. The recognition result
|
|
144
|
-
# of the language detected in the audio.
|
|
145
|
-
#
|
|
146
|
-
#
|
|
138
|
+
# See [Language
|
|
139
|
+
# Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
|
|
140
|
+
# of the currently supported language codes. If alternative languages are
|
|
141
|
+
# listed, recognition result will contain recognition in the most likely
|
|
142
|
+
# language detected including the main language_code. The recognition result
|
|
143
|
+
# will include the language tag of the language detected in the audio. Note:
|
|
144
|
+
# This feature is only supported for Voice Command and Voice Search use cases
|
|
145
|
+
# and performance may vary for other use cases (e.g., phone call
|
|
147
146
|
# transcription).
|
|
148
147
|
# @!attribute [rw] max_alternatives
|
|
149
148
|
# @return [Integer]
|
|
150
|
-
#
|
|
149
|
+
# Maximum number of recognition hypotheses to be returned.
|
|
151
150
|
# Specifically, the maximum number of `SpeechRecognitionAlternative` messages
|
|
152
151
|
# within each `SpeechRecognitionResult`.
|
|
153
152
|
# The server may return fewer than `max_alternatives`.
|
|
@@ -155,30 +154,31 @@ module Google
|
|
|
155
154
|
# one. If omitted, will return a maximum of one.
|
|
156
155
|
# @!attribute [rw] profanity_filter
|
|
157
156
|
# @return [true, false]
|
|
158
|
-
#
|
|
157
|
+
# If set to `true`, the server will attempt to filter out
|
|
159
158
|
# profanities, replacing all but the initial character in each filtered word
|
|
160
159
|
# with asterisks, e.g. "f***". If set to `false` or omitted, profanities
|
|
161
160
|
# won't be filtered out.
|
|
162
161
|
# @!attribute [rw] speech_contexts
|
|
163
162
|
# @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechContext>]
|
|
164
|
-
#
|
|
165
|
-
#
|
|
166
|
-
#
|
|
167
|
-
# [
|
|
163
|
+
# Array of {Google::Cloud::Speech::V1p1beta1::SpeechContext SpeechContext}.
|
|
164
|
+
# A means to provide context to assist the speech recognition. For more
|
|
165
|
+
# information, see
|
|
166
|
+
# [speech
|
|
167
|
+
# adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
|
|
168
168
|
# @!attribute [rw] enable_word_time_offsets
|
|
169
169
|
# @return [true, false]
|
|
170
|
-
#
|
|
170
|
+
# If `true`, the top result includes a list of words and
|
|
171
171
|
# the start and end time offsets (timestamps) for those words. If
|
|
172
172
|
# `false`, no word-level time offset information is returned. The default is
|
|
173
173
|
# `false`.
|
|
174
174
|
# @!attribute [rw] enable_word_confidence
|
|
175
175
|
# @return [true, false]
|
|
176
|
-
#
|
|
176
|
+
# If `true`, the top result includes a list of words and the
|
|
177
177
|
# confidence for those words. If `false`, no word-level confidence
|
|
178
178
|
# information is returned. The default is `false`.
|
|
179
179
|
# @!attribute [rw] enable_automatic_punctuation
|
|
180
180
|
# @return [true, false]
|
|
181
|
-
#
|
|
181
|
+
# If 'true', adds punctuation to recognition result hypotheses.
|
|
182
182
|
# This feature is only available in select languages. Setting this for
|
|
183
183
|
# requests in other languages has no effect at all.
|
|
184
184
|
# The default 'false' value does not add punctuation to result hypotheses.
|
|
@@ -187,19 +187,18 @@ module Google
|
|
|
187
187
|
# premium feature.
|
|
188
188
|
# @!attribute [rw] enable_speaker_diarization
|
|
189
189
|
# @return [true, false]
|
|
190
|
-
#
|
|
190
|
+
# If 'true', enables speaker detection for each recognized word in
|
|
191
191
|
# the top alternative of the recognition result using a speaker_tag provided
|
|
192
192
|
# in the WordInfo.
|
|
193
193
|
# Note: Use diarization_config instead.
|
|
194
194
|
# @!attribute [rw] diarization_speaker_count
|
|
195
195
|
# @return [Integer]
|
|
196
|
-
# *Optional*
|
|
197
196
|
# If set, specifies the estimated number of speakers in the conversation.
|
|
198
197
|
# Defaults to '2'. Ignored unless enable_speaker_diarization is set to true.
|
|
199
198
|
# Note: Use diarization_config instead.
|
|
200
199
|
# @!attribute [rw] diarization_config
|
|
201
200
|
# @return [Google::Cloud::Speech::V1p1beta1::SpeakerDiarizationConfig]
|
|
202
|
-
#
|
|
201
|
+
# Config to enable speaker diarization and set additional
|
|
203
202
|
# parameters to make diarization better suited for your application.
|
|
204
203
|
# Note: When this is enabled, we send all the words from the beginning of the
|
|
205
204
|
# audio for the top alternative in every consecutive STREAMING responses.
|
|
@@ -209,10 +208,10 @@ module Google
|
|
|
209
208
|
# in the top alternative of the FINAL SpeechRecognitionResult.
|
|
210
209
|
# @!attribute [rw] metadata
|
|
211
210
|
# @return [Google::Cloud::Speech::V1p1beta1::RecognitionMetadata]
|
|
212
|
-
#
|
|
211
|
+
# Metadata regarding this request.
|
|
213
212
|
# @!attribute [rw] model
|
|
214
213
|
# @return [String]
|
|
215
|
-
#
|
|
214
|
+
# Which model to select for the given request. Select the model
|
|
216
215
|
# best suited to your domain to get best results. If a model is not
|
|
217
216
|
# explicitly specified, then we auto-select a model based on the parameters
|
|
218
217
|
# in the RecognitionConfig.
|
|
@@ -246,7 +245,7 @@ module Google
|
|
|
246
245
|
# </table>
|
|
247
246
|
# @!attribute [rw] use_enhanced
|
|
248
247
|
# @return [true, false]
|
|
249
|
-
#
|
|
248
|
+
# Set to true to use an enhanced model for speech recognition.
|
|
250
249
|
# If `use_enhanced` is set to true and the `model` field is not set, then
|
|
251
250
|
# an appropriate enhanced model is chosen if an enhanced model exists for
|
|
252
251
|
# the audio.
|
|
@@ -257,13 +256,15 @@ module Google
|
|
|
257
256
|
class RecognitionConfig
|
|
258
257
|
# The encoding of the audio data sent in the request.
|
|
259
258
|
#
|
|
260
|
-
# All encodings support only 1 channel (mono) audio
|
|
259
|
+
# All encodings support only 1 channel (mono) audio, unless the
|
|
260
|
+
# `audio_channel_count` and `enable_separate_recognition_per_channel` fields
|
|
261
|
+
# are set.
|
|
261
262
|
#
|
|
262
263
|
# For best results, the audio source should be captured and transmitted using
|
|
263
264
|
# a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
|
|
264
265
|
# recognition can be reduced if lossy codecs are used to capture or transmit
|
|
265
266
|
# audio, particularly if background noise is present. Lossy codecs include
|
|
266
|
-
# `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `
|
|
267
|
+
# `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
|
|
267
268
|
#
|
|
268
269
|
# The `FLAC` and `WAV` audio file formats include a header that describes the
|
|
269
270
|
# included audio content. You can request recognition for `WAV` files that
|
|
@@ -274,8 +275,7 @@ module Google
|
|
|
274
275
|
# an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
|
|
275
276
|
# encoding configuration must match the encoding described in the audio
|
|
276
277
|
# header; otherwise the request returns an
|
|
277
|
-
# {Google::Rpc::Code::INVALID_ARGUMENT} error
|
|
278
|
-
# code.
|
|
278
|
+
# {Google::Rpc::Code::INVALID_ARGUMENT} error code.
|
|
279
279
|
module AudioEncoding
|
|
280
280
|
# Not specified.
|
|
281
281
|
ENCODING_UNSPECIFIED = 0
|
|
@@ -327,21 +327,19 @@ module Google
|
|
|
327
327
|
end
|
|
328
328
|
end
|
|
329
329
|
|
|
330
|
-
#
|
|
330
|
+
# Config to enable speaker diarization.
|
|
331
331
|
# @!attribute [rw] enable_speaker_diarization
|
|
332
332
|
# @return [true, false]
|
|
333
|
-
#
|
|
333
|
+
# If 'true', enables speaker detection for each recognized word in
|
|
334
334
|
# the top alternative of the recognition result using a speaker_tag provided
|
|
335
335
|
# in the WordInfo.
|
|
336
336
|
# @!attribute [rw] min_speaker_count
|
|
337
337
|
# @return [Integer]
|
|
338
|
-
# *Optional*
|
|
339
338
|
# Minimum number of speakers in the conversation. This range gives you more
|
|
340
339
|
# flexibility by allowing the system to automatically determine the correct
|
|
341
340
|
# number of speakers. If not set, the default value is 2.
|
|
342
341
|
# @!attribute [rw] max_speaker_count
|
|
343
342
|
# @return [Integer]
|
|
344
|
-
# *Optional*
|
|
345
343
|
# Maximum number of speakers in the conversation. This range gives you more
|
|
346
344
|
# flexibility by allowing the system to automatically determine the correct
|
|
347
345
|
# number of speakers. If not set, the default value is 6.
|
|
@@ -482,7 +480,7 @@ module Google
|
|
|
482
480
|
# in the results.
|
|
483
481
|
# @!attribute [rw] phrases
|
|
484
482
|
# @return [Array<String>]
|
|
485
|
-
#
|
|
483
|
+
# A list of strings containing words and phrases "hints" so that
|
|
486
484
|
# the speech recognition is more likely to recognize them. This can be used
|
|
487
485
|
# to improve the accuracy for specific words and phrases, for example, if
|
|
488
486
|
# specific commands are typically spoken by the user. This can also be used
|
|
@@ -508,12 +506,12 @@ module Google
|
|
|
508
506
|
|
|
509
507
|
# Contains audio data in the encoding specified in the `RecognitionConfig`.
|
|
510
508
|
# Either `content` or `uri` must be supplied. Supplying both or neither
|
|
511
|
-
# returns {Google::Rpc::Code::INVALID_ARGUMENT}.
|
|
512
|
-
#
|
|
509
|
+
# returns {Google::Rpc::Code::INVALID_ARGUMENT}. See
|
|
510
|
+
# [content limits](https://cloud.google.com/speech-to-text/quotas#content).
|
|
513
511
|
# @!attribute [rw] content
|
|
514
512
|
# @return [String]
|
|
515
513
|
# The audio data bytes encoded as specified in
|
|
516
|
-
# `RecognitionConfig`. Note: as with all bytes fields,
|
|
514
|
+
# `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
|
|
517
515
|
# pure binary representation, whereas JSON representations use base64.
|
|
518
516
|
# @!attribute [rw] uri
|
|
519
517
|
# @return [String]
|
|
@@ -522,9 +520,8 @@ module Google
|
|
|
522
520
|
# Currently, only Google Cloud Storage URIs are
|
|
523
521
|
# supported, which must be specified in the following format:
|
|
524
522
|
# `gs://bucket_name/object_name` (other URI formats return
|
|
525
|
-
# {Google::Rpc::Code::INVALID_ARGUMENT}).
|
|
526
|
-
#
|
|
527
|
-
# URIs](https://cloud.google.com/storage/docs/reference-uris).
|
|
523
|
+
# {Google::Rpc::Code::INVALID_ARGUMENT}). For more information, see
|
|
524
|
+
# [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
|
|
528
525
|
class RecognitionAudio; end
|
|
529
526
|
|
|
530
527
|
# The only message returned to the client by the `Recognize` method. It
|
|
@@ -532,7 +529,7 @@ module Google
|
|
|
532
529
|
# messages.
|
|
533
530
|
# @!attribute [rw] results
|
|
534
531
|
# @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionResult>]
|
|
535
|
-
#
|
|
532
|
+
# Sequential list of transcription results corresponding to
|
|
536
533
|
# sequential portions of audio.
|
|
537
534
|
class RecognizeResponse; end
|
|
538
535
|
|
|
@@ -543,7 +540,7 @@ module Google
|
|
|
543
540
|
# service.
|
|
544
541
|
# @!attribute [rw] results
|
|
545
542
|
# @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionResult>]
|
|
546
|
-
#
|
|
543
|
+
# Sequential list of transcription results corresponding to
|
|
547
544
|
# sequential portions of audio.
|
|
548
545
|
class LongRunningRecognizeResponse; end
|
|
549
546
|
|
|
@@ -613,17 +610,17 @@ module Google
|
|
|
613
610
|
# one or more (repeated) `results`.
|
|
614
611
|
# @!attribute [rw] error
|
|
615
612
|
# @return [Google::Rpc::Status]
|
|
616
|
-
#
|
|
617
|
-
#
|
|
613
|
+
# If set, returns a {Google::Rpc::Status} message that
|
|
614
|
+
# specifies the error for the operation.
|
|
618
615
|
# @!attribute [rw] results
|
|
619
616
|
# @return [Array<Google::Cloud::Speech::V1p1beta1::StreamingRecognitionResult>]
|
|
620
|
-
#
|
|
617
|
+
# This repeated list contains zero or more results that
|
|
621
618
|
# correspond to consecutive portions of the audio currently being processed.
|
|
622
619
|
# It contains zero or one `is_final=true` result (the newly settled portion),
|
|
623
620
|
# followed by zero or more `is_final=false` results (the interim results).
|
|
624
621
|
# @!attribute [rw] speech_event_type
|
|
625
622
|
# @return [Google::Cloud::Speech::V1p1beta1::StreamingRecognizeResponse::SpeechEventType]
|
|
626
|
-
#
|
|
623
|
+
# Indicates the type of speech event.
|
|
627
624
|
class StreamingRecognizeResponse
|
|
628
625
|
# Indicates the type of speech event.
|
|
629
626
|
module SpeechEventType
|
|
@@ -645,27 +642,27 @@ module Google
|
|
|
645
642
|
# that is currently being processed.
|
|
646
643
|
# @!attribute [rw] alternatives
|
|
647
644
|
# @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionAlternative>]
|
|
648
|
-
#
|
|
645
|
+
# May contain one or more recognition hypotheses (up to the
|
|
649
646
|
# maximum specified in `max_alternatives`).
|
|
650
647
|
# These alternatives are ordered in terms of accuracy, with the top (first)
|
|
651
648
|
# alternative being the most probable, as ranked by the recognizer.
|
|
652
649
|
# @!attribute [rw] is_final
|
|
653
650
|
# @return [true, false]
|
|
654
|
-
#
|
|
651
|
+
# If `false`, this `StreamingRecognitionResult` represents an
|
|
655
652
|
# interim result that may change. If `true`, this is the final time the
|
|
656
653
|
# speech service will return this particular `StreamingRecognitionResult`,
|
|
657
654
|
# the recognizer will not return any further hypotheses for this portion of
|
|
658
655
|
# the transcript and corresponding audio.
|
|
659
656
|
# @!attribute [rw] stability
|
|
660
657
|
# @return [Float]
|
|
661
|
-
#
|
|
658
|
+
# An estimate of the likelihood that the recognizer will not
|
|
662
659
|
# change its guess about this interim result. Values range from 0.0
|
|
663
660
|
# (completely unstable) to 1.0 (completely stable).
|
|
664
661
|
# This field is only provided for interim results (`is_final=false`).
|
|
665
662
|
# The default of 0.0 is a sentinel value indicating `stability` was not set.
|
|
666
663
|
# @!attribute [rw] result_end_time
|
|
667
664
|
# @return [Google::Protobuf::Duration]
|
|
668
|
-
#
|
|
665
|
+
# Time offset of the end of this result relative to the
|
|
669
666
|
# beginning of the audio.
|
|
670
667
|
# @!attribute [rw] channel_tag
|
|
671
668
|
# @return [Integer]
|
|
@@ -674,16 +671,15 @@ module Google
|
|
|
674
671
|
# For audio_channel_count = N, its output values can range from '1' to 'N'.
|
|
675
672
|
# @!attribute [rw] language_code
|
|
676
673
|
# @return [String]
|
|
677
|
-
#
|
|
678
|
-
#
|
|
679
|
-
#
|
|
680
|
-
# likelihood of being spoken in the audio.
|
|
674
|
+
# The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
|
|
675
|
+
# of the language in this result. This language code was detected to have
|
|
676
|
+
# the most likelihood of being spoken in the audio.
|
|
681
677
|
class StreamingRecognitionResult; end
|
|
682
678
|
|
|
683
679
|
# A speech recognition result corresponding to a portion of the audio.
|
|
684
680
|
# @!attribute [rw] alternatives
|
|
685
681
|
# @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionAlternative>]
|
|
686
|
-
#
|
|
682
|
+
# May contain one or more recognition hypotheses (up to the
|
|
687
683
|
# maximum specified in `max_alternatives`).
|
|
688
684
|
# These alternatives are ordered in terms of accuracy, with the top (first)
|
|
689
685
|
# alternative being the most probable, as ranked by the recognizer.
|
|
@@ -694,19 +690,18 @@ module Google
|
|
|
694
690
|
# For audio_channel_count = N, its output values can range from '1' to 'N'.
|
|
695
691
|
# @!attribute [rw] language_code
|
|
696
692
|
# @return [String]
|
|
697
|
-
#
|
|
698
|
-
#
|
|
699
|
-
#
|
|
700
|
-
# likelihood of being spoken in the audio.
|
|
693
|
+
# The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
|
|
694
|
+
# of the language in this result. This language code was detected to have
|
|
695
|
+
# the most likelihood of being spoken in the audio.
|
|
701
696
|
class SpeechRecognitionResult; end
|
|
702
697
|
|
|
703
698
|
# Alternative hypotheses (a.k.a. n-best list).
|
|
704
699
|
# @!attribute [rw] transcript
|
|
705
700
|
# @return [String]
|
|
706
|
-
#
|
|
701
|
+
# Transcript text representing the words that the user spoke.
|
|
707
702
|
# @!attribute [rw] confidence
|
|
708
703
|
# @return [Float]
|
|
709
|
-
#
|
|
704
|
+
# The confidence estimate between 0.0 and 1.0. A higher number
|
|
710
705
|
# indicates an estimated greater likelihood that the recognized words are
|
|
711
706
|
# correct. This field is set only for the top alternative of a non-streaming
|
|
712
707
|
# result or, of a streaming result where `is_final=true`.
|
|
@@ -715,7 +710,7 @@ module Google
|
|
|
715
710
|
# The default of 0.0 is a sentinel value indicating `confidence` was not set.
|
|
716
711
|
# @!attribute [rw] words
|
|
717
712
|
# @return [Array<Google::Cloud::Speech::V1p1beta1::WordInfo>]
|
|
718
|
-
#
|
|
713
|
+
# A list of word-specific information for each recognized word.
|
|
719
714
|
# Note: When `enable_speaker_diarization` is true, you will see all the words
|
|
720
715
|
# from the beginning of the audio.
|
|
721
716
|
class SpeechRecognitionAlternative; end
|
|
@@ -723,7 +718,7 @@ module Google
|
|
|
723
718
|
# Word-specific information for recognized words.
|
|
724
719
|
# @!attribute [rw] start_time
|
|
725
720
|
# @return [Google::Protobuf::Duration]
|
|
726
|
-
#
|
|
721
|
+
# Time offset relative to the beginning of the audio,
|
|
727
722
|
# and corresponding to the start of the spoken word.
|
|
728
723
|
# This field is only set if `enable_word_time_offsets=true` and only
|
|
729
724
|
# in the top hypothesis.
|
|
@@ -731,7 +726,7 @@ module Google
|
|
|
731
726
|
# vary.
|
|
732
727
|
# @!attribute [rw] end_time
|
|
733
728
|
# @return [Google::Protobuf::Duration]
|
|
734
|
-
#
|
|
729
|
+
# Time offset relative to the beginning of the audio,
|
|
735
730
|
# and corresponding to the end of the spoken word.
|
|
736
731
|
# This field is only set if `enable_word_time_offsets=true` and only
|
|
737
732
|
# in the top hypothesis.
|
|
@@ -739,10 +734,10 @@ module Google
|
|
|
739
734
|
# vary.
|
|
740
735
|
# @!attribute [rw] word
|
|
741
736
|
# @return [String]
|
|
742
|
-
#
|
|
737
|
+
# The word corresponding to this set of information.
|
|
743
738
|
# @!attribute [rw] confidence
|
|
744
739
|
# @return [Float]
|
|
745
|
-
#
|
|
740
|
+
# The confidence estimate between 0.0 and 1.0. A higher number
|
|
746
741
|
# indicates an estimated greater likelihood that the recognized words are
|
|
747
742
|
# correct. This field is set only for the top alternative of a non-streaming
|
|
748
743
|
# result or, of a streaming result where `is_final=true`.
|
|
@@ -751,7 +746,7 @@ module Google
|
|
|
751
746
|
# The default of 0.0 is a sentinel value indicating `confidence` was not set.
|
|
752
747
|
# @!attribute [rw] speaker_tag
|
|
753
748
|
# @return [Integer]
|
|
754
|
-
#
|
|
749
|
+
# A distinct integer value is assigned for every speaker within
|
|
755
750
|
# the audio. This field specifies which one of those speakers was detected to
|
|
756
751
|
# have spoken this word. Value ranges from '1' to diarization_speaker_count.
|
|
757
752
|
# speaker_tag is set if enable_speaker_diarization = 'true' and only in the
|