google-cloud-speech 0.37.0 → 0.38.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/README.md +5 -5
- data/lib/google/cloud/speech.rb +4 -4
- data/lib/google/cloud/speech/v1.rb +4 -4
- data/lib/google/cloud/speech/v1/cloud_speech_pb.rb +11 -0
- data/lib/google/cloud/speech/v1/cloud_speech_services_pb.rb +3 -1
- data/lib/google/cloud/speech/v1/doc/google/cloud/speech/v1/cloud_speech.rb +102 -61
- data/lib/google/cloud/speech/v1/speech_client.rb +6 -4
- data/lib/google/cloud/speech/v1/speech_client_config.json +8 -8
- data/lib/google/cloud/speech/v1p1beta1.rb +5 -4
- data/lib/google/cloud/speech/v1p1beta1/cloud_speech_pb.rb +2 -1
- data/lib/google/cloud/speech/v1p1beta1/cloud_speech_services_pb.rb +3 -1
- data/lib/google/cloud/speech/v1p1beta1/doc/google/cloud/speech/v1p1beta1/cloud_speech.rb +82 -87
- data/lib/google/cloud/speech/v1p1beta1/speech_client.rb +6 -4
- data/lib/google/cloud/speech/v1p1beta1/speech_client_config.json +8 -8
- data/lib/google/cloud/speech/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 418c1c3abd5c10d1bb15b06d09f9f31bf8c9f48a7ba40b6d9c01f6699d2b920e
|
4
|
+
data.tar.gz: 5b01b710d26d3d6bfcba2eb4231b8a535f86c6306fb62a5daf71db406c8a140d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b45ab7560fea6d5472532e3894328f80673e2498481bbdbcf2d5ab9b9b6fd83f541b7505562052bfb2c08f42e279845615d42100d885bab68695e425447151ad
|
7
|
+
data.tar.gz: 2ed95a11fb97cdc6840d8e241d4c12db07cbaa8de94f1d8d749745a696a8c4de8cac0b25e3fa574ee4e4ba490c2743ad1f590b175b0771e23aed7765b1e2e6dd
|
data/.yardopts
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
# Ruby Client for Cloud Speech API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
1
|
+
# Ruby Client for Cloud Speech-to-Text API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
2
2
|
|
3
|
-
[Cloud Speech API][Product Documentation]:
|
3
|
+
[Cloud Speech-to-Text API][Product Documentation]:
|
4
4
|
Converts audio to text by applying powerful neural network models.
|
5
5
|
- [Client Library Documentation][]
|
6
6
|
- [Product Documentation][]
|
@@ -11,7 +11,7 @@ steps:
|
|
11
11
|
|
12
12
|
1. [Select or create a Cloud Platform project.](https://console.cloud.google.com/project)
|
13
13
|
2. [Enable billing for your project.](https://cloud.google.com/billing/docs/how-to/modify-project#enable_billing_for_a_project)
|
14
|
-
3. [Enable the Cloud Speech API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
14
|
+
3. [Enable the Cloud Speech-to-Text API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
15
15
|
4. [Setup Authentication.](https://googleapis.dev/ruby/google-cloud-speech/latest/file.AUTHENTICATION.html)
|
16
16
|
|
17
17
|
### Installation
|
@@ -46,9 +46,9 @@ response = speech_client.recognize(config, audio)
|
|
46
46
|
```
|
47
47
|
|
48
48
|
### Next Steps
|
49
|
-
- Read the [Client Library Documentation][] for Cloud Speech API
|
49
|
+
- Read the [Client Library Documentation][] for Cloud Speech-to-Text API
|
50
50
|
to see other available methods on the client.
|
51
|
-
- Read the [Cloud Speech API Product documentation][Product Documentation]
|
51
|
+
- Read the [Cloud Speech-to-Text API Product documentation][Product Documentation]
|
52
52
|
to learn more about the product and see How-to Guides.
|
53
53
|
- View this [repository's main README](https://github.com/googleapis/google-cloud-ruby/blob/master/README.md)
|
54
54
|
to see the full list of Cloud APIs that we cover.
|
data/lib/google/cloud/speech.rb
CHANGED
@@ -21,9 +21,9 @@ module Google
|
|
21
21
|
# rubocop:disable LineLength
|
22
22
|
|
23
23
|
##
|
24
|
-
# # Ruby Client for Cloud Speech API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
24
|
+
# # Ruby Client for Cloud Speech-to-Text API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
25
25
|
#
|
26
|
-
# [Cloud Speech API][Product Documentation]:
|
26
|
+
# [Cloud Speech-to-Text API][Product Documentation]:
|
27
27
|
# Converts audio to text by applying powerful neural network models.
|
28
28
|
# - [Product Documentation][]
|
29
29
|
#
|
@@ -33,7 +33,7 @@ module Google
|
|
33
33
|
#
|
34
34
|
# 1. [Select or create a Cloud Platform project.](https://console.cloud.google.com/project)
|
35
35
|
# 2. [Enable billing for your project.](https://cloud.google.com/billing/docs/how-to/modify-project#enable_billing_for_a_project)
|
36
|
-
# 3. [Enable the Cloud Speech API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
36
|
+
# 3. [Enable the Cloud Speech-to-Text API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
37
37
|
# 4. [Setup Authentication.](https://googleapis.dev/ruby/google-cloud-speech/latest/file.AUTHENTICATION.html)
|
38
38
|
#
|
39
39
|
# ### Installation
|
@@ -68,7 +68,7 @@ module Google
|
|
68
68
|
# ```
|
69
69
|
#
|
70
70
|
# ### Next Steps
|
71
|
-
# - Read the [Cloud Speech API Product documentation][Product Documentation]
|
71
|
+
# - Read the [Cloud Speech-to-Text API Product documentation][Product Documentation]
|
72
72
|
# to learn more about the product and see How-to Guides.
|
73
73
|
# - View this [repository's main README](https://github.com/googleapis/google-cloud-ruby/blob/master/README.md)
|
74
74
|
# to see the full list of Cloud APIs that we cover.
|
@@ -23,9 +23,9 @@ module Google
|
|
23
23
|
# rubocop:disable LineLength
|
24
24
|
|
25
25
|
##
|
26
|
-
# # Ruby Client for Cloud Speech API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
26
|
+
# # Ruby Client for Cloud Speech-to-Text API ([Alpha](https://github.com/googleapis/google-cloud-ruby#versioning))
|
27
27
|
#
|
28
|
-
# [Cloud Speech API][Product Documentation]:
|
28
|
+
# [Cloud Speech-to-Text API][Product Documentation]:
|
29
29
|
# Converts audio to text by applying powerful neural network models.
|
30
30
|
# - [Product Documentation][]
|
31
31
|
#
|
@@ -35,7 +35,7 @@ module Google
|
|
35
35
|
#
|
36
36
|
# 1. [Select or create a Cloud Platform project.](https://console.cloud.google.com/project)
|
37
37
|
# 2. [Enable billing for your project.](https://cloud.google.com/billing/docs/how-to/modify-project#enable_billing_for_a_project)
|
38
|
-
# 3. [Enable the Cloud Speech API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
38
|
+
# 3. [Enable the Cloud Speech-to-Text API.](https://console.cloud.google.com/apis/library/speech.googleapis.com)
|
39
39
|
# 4. [Setup Authentication.](https://googleapis.dev/ruby/google-cloud-speech/latest/file.AUTHENTICATION.html)
|
40
40
|
#
|
41
41
|
# ### Installation
|
@@ -63,7 +63,7 @@ module Google
|
|
63
63
|
# ```
|
64
64
|
#
|
65
65
|
# ### Next Steps
|
66
|
-
# - Read the [Cloud Speech API Product documentation][Product Documentation]
|
66
|
+
# - Read the [Cloud Speech-to-Text API Product documentation][Product Documentation]
|
67
67
|
# to learn more about the product and see How-to Guides.
|
68
68
|
# - View this [repository's main README](https://github.com/googleapis/google-cloud-ruby/blob/master/README.md)
|
69
69
|
# to see the full list of Cloud APIs that we cover.
|
@@ -5,7 +5,10 @@
|
|
5
5
|
require 'google/protobuf'
|
6
6
|
|
7
7
|
require 'google/api/annotations_pb'
|
8
|
+
require 'google/api/client_pb'
|
9
|
+
require 'google/api/field_behavior_pb'
|
8
10
|
require 'google/longrunning/operations_pb'
|
11
|
+
require 'google/protobuf/any_pb'
|
9
12
|
require 'google/protobuf/duration_pb'
|
10
13
|
require 'google/protobuf/timestamp_pb'
|
11
14
|
require 'google/rpc/status_pb'
|
@@ -40,6 +43,7 @@ Google::Protobuf::DescriptorPool.generated_pool.build do
|
|
40
43
|
repeated :speech_contexts, :message, 6, "google.cloud.speech.v1.SpeechContext"
|
41
44
|
optional :enable_word_time_offsets, :bool, 8
|
42
45
|
optional :enable_automatic_punctuation, :bool, 11
|
46
|
+
optional :diarization_config, :message, 19, "google.cloud.speech.v1.SpeakerDiarizationConfig"
|
43
47
|
optional :metadata, :message, 9, "google.cloud.speech.v1.RecognitionMetadata"
|
44
48
|
optional :model, :string, 13
|
45
49
|
optional :use_enhanced, :bool, 14
|
@@ -54,6 +58,12 @@ Google::Protobuf::DescriptorPool.generated_pool.build do
|
|
54
58
|
value :OGG_OPUS, 6
|
55
59
|
value :SPEEX_WITH_HEADER_BYTE, 7
|
56
60
|
end
|
61
|
+
add_message "google.cloud.speech.v1.SpeakerDiarizationConfig" do
|
62
|
+
optional :enable_speaker_diarization, :bool, 1
|
63
|
+
optional :min_speaker_count, :int32, 2
|
64
|
+
optional :max_speaker_count, :int32, 3
|
65
|
+
optional :speaker_tag, :int32, 5
|
66
|
+
end
|
57
67
|
add_message "google.cloud.speech.v1.RecognitionMetadata" do
|
58
68
|
optional :interaction_type, :enum, 1, "google.cloud.speech.v1.RecognitionMetadata.InteractionType"
|
59
69
|
optional :industry_naics_code_of_audio, :uint32, 3
|
@@ -158,6 +168,7 @@ module Google
|
|
158
168
|
StreamingRecognitionConfig = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.StreamingRecognitionConfig").msgclass
|
159
169
|
RecognitionConfig = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionConfig").msgclass
|
160
170
|
RecognitionConfig::AudioEncoding = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionConfig.AudioEncoding").enummodule
|
171
|
+
SpeakerDiarizationConfig = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.SpeakerDiarizationConfig").msgclass
|
161
172
|
RecognitionMetadata = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionMetadata").msgclass
|
162
173
|
RecognitionMetadata::InteractionType = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionMetadata.InteractionType").enummodule
|
163
174
|
RecognitionMetadata::MicrophoneDistance = Google::Protobuf::DescriptorPool.generated_pool.lookup("google.cloud.speech.v1.RecognitionMetadata.MicrophoneDistance").enummodule
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
2
2
|
# Source: google/cloud/speech/v1/cloud_speech.proto for package 'google.cloud.speech.v1'
|
3
3
|
# Original file comments:
|
4
|
-
# Copyright
|
4
|
+
# Copyright 2019 Google LLC.
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
7
|
# you may not use this file except in compliance with the License.
|
@@ -42,6 +42,8 @@ module Google
|
|
42
42
|
# google.longrunning.Operations interface. Returns either an
|
43
43
|
# `Operation.error` or an `Operation.response` which contains
|
44
44
|
# a `LongRunningRecognizeResponse` message.
|
45
|
+
# For more information on asynchronous speech recognition, see the
|
46
|
+
# [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
|
45
47
|
rpc :LongRunningRecognize, LongRunningRecognizeRequest, Google::Longrunning::Operation
|
46
48
|
# Performs bidirectional streaming speech recognition: receive results while
|
47
49
|
# sending audio. This method is only available via the gRPC API (not REST).
|
@@ -20,29 +20,29 @@ module Google
|
|
20
20
|
# The top-level message sent by the client for the `Recognize` method.
|
21
21
|
# @!attribute [rw] config
|
22
22
|
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
23
|
-
#
|
23
|
+
# Required. Provides information to the recognizer that specifies how to
|
24
24
|
# process the request.
|
25
25
|
# @!attribute [rw] audio
|
26
26
|
# @return [Google::Cloud::Speech::V1::RecognitionAudio]
|
27
|
-
#
|
27
|
+
# Required. The audio data to be recognized.
|
28
28
|
class RecognizeRequest; end
|
29
29
|
|
30
30
|
# The top-level message sent by the client for the `LongRunningRecognize`
|
31
31
|
# method.
|
32
32
|
# @!attribute [rw] config
|
33
33
|
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
34
|
-
#
|
34
|
+
# Required. Provides information to the recognizer that specifies how to
|
35
35
|
# process the request.
|
36
36
|
# @!attribute [rw] audio
|
37
37
|
# @return [Google::Cloud::Speech::V1::RecognitionAudio]
|
38
|
-
#
|
38
|
+
# Required. The audio data to be recognized.
|
39
39
|
class LongRunningRecognizeRequest; end
|
40
40
|
|
41
41
|
# The top-level message sent by the client for the `StreamingRecognize` method.
|
42
42
|
# Multiple `StreamingRecognizeRequest` messages are sent. The first message
|
43
|
-
# must contain a `streaming_config` message and must not contain
|
44
|
-
# All subsequent messages must contain `
|
45
|
-
# `streaming_config` message.
|
43
|
+
# must contain a `streaming_config` message and must not contain
|
44
|
+
# `audio_content`. All subsequent messages must contain `audio_content` and
|
45
|
+
# must not contain a `streaming_config` message.
|
46
46
|
# @!attribute [rw] streaming_config
|
47
47
|
# @return [Google::Cloud::Speech::V1::StreamingRecognitionConfig]
|
48
48
|
# Provides information to the recognizer that specifies how to process the
|
@@ -55,7 +55,7 @@ module Google
|
|
55
55
|
# `StreamingRecognizeRequest` message must not contain `audio_content` data
|
56
56
|
# and all subsequent `StreamingRecognizeRequest` messages must contain
|
57
57
|
# `audio_content` data. The audio bytes must be encoded as specified in
|
58
|
-
# `RecognitionConfig`. Note: as with all bytes fields,
|
58
|
+
# `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
|
59
59
|
# pure binary representation (not base64). See
|
60
60
|
# [content limits](https://cloud.google.com/speech-to-text/quotas#content).
|
61
61
|
class StreamingRecognizeRequest; end
|
@@ -64,11 +64,11 @@ module Google
|
|
64
64
|
# request.
|
65
65
|
# @!attribute [rw] config
|
66
66
|
# @return [Google::Cloud::Speech::V1::RecognitionConfig]
|
67
|
-
#
|
67
|
+
# Required. Provides information to the recognizer that specifies how to
|
68
68
|
# process the request.
|
69
69
|
# @!attribute [rw] single_utterance
|
70
70
|
# @return [true, false]
|
71
|
-
#
|
71
|
+
# If `false` or omitted, the recognizer will perform continuous
|
72
72
|
# recognition (continuing to wait for and process audio even if the user
|
73
73
|
# pauses speaking) until the client closes the input stream (gRPC API) or
|
74
74
|
# until the maximum time limit has been reached. May return multiple
|
@@ -81,7 +81,7 @@ module Google
|
|
81
81
|
# `true`.
|
82
82
|
# @!attribute [rw] interim_results
|
83
83
|
# @return [true, false]
|
84
|
-
#
|
84
|
+
# If `true`, interim results (tentative hypotheses) may be
|
85
85
|
# returned as they become available (these interim results are indicated with
|
86
86
|
# the `is_final=false` flag).
|
87
87
|
# If `false` or omitted, only `is_final=true` result(s) are returned.
|
@@ -93,8 +93,7 @@ module Google
|
|
93
93
|
# @return [Google::Cloud::Speech::V1::RecognitionConfig::AudioEncoding]
|
94
94
|
# Encoding of audio data sent in all `RecognitionAudio` messages.
|
95
95
|
# This field is optional for `FLAC` and `WAV` audio files and required
|
96
|
-
# for all other audio formats. For details, see
|
97
|
-
# {Google::Cloud::Speech::V1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
96
|
+
# for all other audio formats. For details, see {Google::Cloud::Speech::V1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
98
97
|
# @!attribute [rw] sample_rate_hertz
|
99
98
|
# @return [Integer]
|
100
99
|
# Sample rate in Hertz of the audio data sent in all
|
@@ -102,12 +101,11 @@ module Google
|
|
102
101
|
# 16000 is optimal. For best results, set the sampling rate of the audio
|
103
102
|
# source to 16000 Hz. If that's not possible, use the native sample rate of
|
104
103
|
# the audio source (instead of re-sampling).
|
105
|
-
# This field is optional for
|
106
|
-
# for all other audio formats. For details, see
|
107
|
-
# {Google::Cloud::Speech::V1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
104
|
+
# This field is optional for FLAC and WAV audio files, but is
|
105
|
+
# required for all other audio formats. For details, see {Google::Cloud::Speech::V1::RecognitionConfig::AudioEncoding AudioEncoding}.
|
108
106
|
# @!attribute [rw] audio_channel_count
|
109
107
|
# @return [Integer]
|
110
|
-
#
|
108
|
+
# The number of channels in the input audio data.
|
111
109
|
# ONLY set this for MULTI-CHANNEL recognition.
|
112
110
|
# Valid values for LINEAR16 and FLAC are `1`-`8`.
|
113
111
|
# Valid values for OGG_OPUS are '1'-'254'.
|
@@ -126,14 +124,15 @@ module Google
|
|
126
124
|
# `audio_channel_count` multiplied by the length of the audio.
|
127
125
|
# @!attribute [rw] language_code
|
128
126
|
# @return [String]
|
129
|
-
#
|
127
|
+
# Required. The language of the supplied audio as a
|
130
128
|
# [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
|
131
129
|
# Example: "en-US".
|
132
|
-
# See [Language
|
133
|
-
# for a list
|
130
|
+
# See [Language
|
131
|
+
# Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
|
132
|
+
# of the currently supported language codes.
|
134
133
|
# @!attribute [rw] max_alternatives
|
135
134
|
# @return [Integer]
|
136
|
-
#
|
135
|
+
# Maximum number of recognition hypotheses to be returned.
|
137
136
|
# Specifically, the maximum number of `SpeechRecognitionAlternative` messages
|
138
137
|
# within each `SpeechRecognitionResult`.
|
139
138
|
# The server may return fewer than `max_alternatives`.
|
@@ -141,36 +140,48 @@ module Google
|
|
141
140
|
# one. If omitted, will return a maximum of one.
|
142
141
|
# @!attribute [rw] profanity_filter
|
143
142
|
# @return [true, false]
|
144
|
-
#
|
143
|
+
# If set to `true`, the server will attempt to filter out
|
145
144
|
# profanities, replacing all but the initial character in each filtered word
|
146
145
|
# with asterisks, e.g. "f***". If set to `false` or omitted, profanities
|
147
146
|
# won't be filtered out.
|
148
147
|
# @!attribute [rw] speech_contexts
|
149
148
|
# @return [Array<Google::Cloud::Speech::V1::SpeechContext>]
|
150
|
-
#
|
149
|
+
# Array of {Google::Cloud::Speech::V1::SpeechContext SpeechContext}.
|
151
150
|
# A means to provide context to assist the speech recognition. For more
|
152
|
-
# information, see
|
151
|
+
# information, see
|
152
|
+
# [speech
|
153
|
+
# adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
|
153
154
|
# @!attribute [rw] enable_word_time_offsets
|
154
155
|
# @return [true, false]
|
155
|
-
#
|
156
|
+
# If `true`, the top result includes a list of words and
|
156
157
|
# the start and end time offsets (timestamps) for those words. If
|
157
158
|
# `false`, no word-level time offset information is returned. The default is
|
158
159
|
# `false`.
|
159
160
|
# @!attribute [rw] enable_automatic_punctuation
|
160
161
|
# @return [true, false]
|
161
|
-
#
|
162
|
+
# If 'true', adds punctuation to recognition result hypotheses.
|
162
163
|
# This feature is only available in select languages. Setting this for
|
163
164
|
# requests in other languages has no effect at all.
|
164
165
|
# The default 'false' value does not add punctuation to result hypotheses.
|
165
166
|
# Note: This is currently offered as an experimental service, complimentary
|
166
167
|
# to all users. In the future this may be exclusively available as a
|
167
168
|
# premium feature.
|
169
|
+
# @!attribute [rw] diarization_config
|
170
|
+
# @return [Google::Cloud::Speech::V1::SpeakerDiarizationConfig]
|
171
|
+
# Config to enable speaker diarization and set additional
|
172
|
+
# parameters to make diarization better suited for your application.
|
173
|
+
# Note: When this is enabled, we send all the words from the beginning of the
|
174
|
+
# audio for the top alternative in every consecutive STREAMING response.
|
175
|
+
# This is done in order to improve our speaker tags as our models learn to
|
176
|
+
# identify the speakers in the conversation over time.
|
177
|
+
# For non-streaming requests, the diarization results will be provided only
|
178
|
+
# in the top alternative of the FINAL SpeechRecognitionResult.
|
168
179
|
# @!attribute [rw] metadata
|
169
180
|
# @return [Google::Cloud::Speech::V1::RecognitionMetadata]
|
170
|
-
#
|
181
|
+
# Metadata regarding this request.
|
171
182
|
# @!attribute [rw] model
|
172
183
|
# @return [String]
|
173
|
-
#
|
184
|
+
# Which model to select for the given request. Select the model
|
174
185
|
# best suited to your domain to get best results. If a model is not
|
175
186
|
# explicitly specified, then we auto-select a model based on the parameters
|
176
187
|
# in the RecognitionConfig.
|
@@ -204,7 +215,7 @@ module Google
|
|
204
215
|
# </table>
|
205
216
|
# @!attribute [rw] use_enhanced
|
206
217
|
# @return [true, false]
|
207
|
-
#
|
218
|
+
# Set to true to use an enhanced model for speech recognition.
|
208
219
|
# If `use_enhanced` is set to true and the `model` field is not set, then
|
209
220
|
# an appropriate enhanced model is chosen if an enhanced model exists for
|
210
221
|
# the audio.
|
@@ -215,13 +226,15 @@ module Google
|
|
215
226
|
class RecognitionConfig
|
216
227
|
# The encoding of the audio data sent in the request.
|
217
228
|
#
|
218
|
-
# All encodings support only 1 channel (mono) audio
|
229
|
+
# All encodings support only 1 channel (mono) audio, unless the
|
230
|
+
# `audio_channel_count` and `enable_separate_recognition_per_channel` fields
|
231
|
+
# are set.
|
219
232
|
#
|
220
233
|
# For best results, the audio source should be captured and transmitted using
|
221
234
|
# a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
|
222
235
|
# recognition can be reduced if lossy codecs are used to capture or transmit
|
223
236
|
# audio, particularly if background noise is present. Lossy codecs include
|
224
|
-
# `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `
|
237
|
+
# `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
|
225
238
|
#
|
226
239
|
# The `FLAC` and `WAV` audio file formats include a header that describes the
|
227
240
|
# included audio content. You can request recognition for `WAV` files that
|
@@ -232,8 +245,7 @@ module Google
|
|
232
245
|
# an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
|
233
246
|
# encoding configuration must match the encoding described in the audio
|
234
247
|
# header; otherwise the request returns an
|
235
|
-
# {Google::Rpc::Code::INVALID_ARGUMENT} error
|
236
|
-
# code.
|
248
|
+
# {Google::Rpc::Code::INVALID_ARGUMENT} error code.
|
237
249
|
module AudioEncoding
|
238
250
|
# Not specified.
|
239
251
|
ENCODING_UNSPECIFIED = 0
|
@@ -280,6 +292,31 @@ module Google
|
|
280
292
|
end
|
281
293
|
end
|
282
294
|
|
295
|
+
# Config to enable speaker diarization.
|
296
|
+
# @!attribute [rw] enable_speaker_diarization
|
297
|
+
# @return [true, false]
|
298
|
+
# If 'true', enables speaker detection for each recognized word in
|
299
|
+
# the top alternative of the recognition result using a speaker_tag provided
|
300
|
+
# in the WordInfo.
|
301
|
+
# @!attribute [rw] min_speaker_count
|
302
|
+
# @return [Integer]
|
303
|
+
# Minimum number of speakers in the conversation. This range gives you more
|
304
|
+
# flexibility by allowing the system to automatically determine the correct
|
305
|
+
# number of speakers. If not set, the default value is 2.
|
306
|
+
# @!attribute [rw] max_speaker_count
|
307
|
+
# @return [Integer]
|
308
|
+
# Maximum number of speakers in the conversation. This range gives you more
|
309
|
+
# flexibility by allowing the system to automatically determine the correct
|
310
|
+
# number of speakers. If not set, the default value is 6.
|
311
|
+
# @!attribute [rw] speaker_tag
|
312
|
+
# @return [Integer]
|
313
|
+
# A distinct integer value is assigned for every speaker within
|
314
|
+
# the audio. This field specifies which one of those speakers was detected to
|
315
|
+
# have spoken this word. Value ranges from '1' to diarization_speaker_count.
|
316
|
+
# speaker_tag is set if enable_speaker_diarization = 'true' and only in the
|
317
|
+
# top alternative.
|
318
|
+
class SpeakerDiarizationConfig; end
|
319
|
+
|
283
320
|
# Description of audio data to be recognized.
|
284
321
|
# @!attribute [rw] interaction_type
|
285
322
|
# @return [Google::Cloud::Speech::V1::RecognitionMetadata::InteractionType]
|
@@ -411,22 +448,28 @@ module Google
|
|
411
448
|
# in the results.
|
412
449
|
# @!attribute [rw] phrases
|
413
450
|
# @return [Array<String>]
|
414
|
-
#
|
451
|
+
# A list of strings containing words and phrases "hints" so that
|
415
452
|
# the speech recognition is more likely to recognize them. This can be used
|
416
453
|
# to improve the accuracy for specific words and phrases, for example, if
|
417
454
|
# specific commands are typically spoken by the user. This can also be used
|
418
455
|
# to add additional words to the vocabulary of the recognizer. See
|
419
456
|
# [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
|
457
|
+
#
|
458
|
+
# List items can also be set to classes for groups of words that represent
|
459
|
+
# common concepts that occur in natural language. For example, rather than
|
460
|
+
# providing phrase hints for every month of the year, using the $MONTH class
|
461
|
+
# improves the likelihood of correctly transcribing audio that includes
|
462
|
+
# months.
|
420
463
|
class SpeechContext; end
|
421
464
|
|
422
465
|
# Contains audio data in the encoding specified in the `RecognitionConfig`.
|
423
466
|
# Either `content` or `uri` must be supplied. Supplying both or neither
|
424
|
-
# returns {Google::Rpc::Code::INVALID_ARGUMENT}.
|
425
|
-
#
|
467
|
+
# returns {Google::Rpc::Code::INVALID_ARGUMENT}. See
|
468
|
+
# [content limits](https://cloud.google.com/speech-to-text/quotas#content).
|
426
469
|
# @!attribute [rw] content
|
427
470
|
# @return [String]
|
428
471
|
# The audio data bytes encoded as specified in
|
429
|
-
# `RecognitionConfig`. Note: as with all bytes fields,
|
472
|
+
# `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
|
430
473
|
# pure binary representation, whereas JSON representations use base64.
|
431
474
|
# @!attribute [rw] uri
|
432
475
|
# @return [String]
|
@@ -435,9 +478,8 @@ module Google
|
|
435
478
|
# Currently, only Google Cloud Storage URIs are
|
436
479
|
# supported, which must be specified in the following format:
|
437
480
|
# `gs://bucket_name/object_name` (other URI formats return
|
438
|
-
# {Google::Rpc::Code::INVALID_ARGUMENT}).
|
439
|
-
#
|
440
|
-
# URIs](https://cloud.google.com/storage/docs/reference-uris).
|
481
|
+
# {Google::Rpc::Code::INVALID_ARGUMENT}). For more information, see
|
482
|
+
# [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
|
441
483
|
class RecognitionAudio; end
|
442
484
|
|
443
485
|
# The only message returned to the client by the `Recognize` method. It
|
@@ -445,7 +487,7 @@ module Google
|
|
445
487
|
# messages.
|
446
488
|
# @!attribute [rw] results
|
447
489
|
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionResult>]
|
448
|
-
#
|
490
|
+
# Sequential list of transcription results corresponding to
|
449
491
|
# sequential portions of audio.
|
450
492
|
class RecognizeResponse; end
|
451
493
|
|
@@ -456,7 +498,7 @@ module Google
|
|
456
498
|
# service.
|
457
499
|
# @!attribute [rw] results
|
458
500
|
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionResult>]
|
459
|
-
#
|
501
|
+
# Sequential list of transcription results corresponding to
|
460
502
|
# sequential portions of audio.
|
461
503
|
class LongRunningRecognizeResponse; end
|
462
504
|
|
@@ -526,17 +568,17 @@ module Google
|
|
526
568
|
# one or more (repeated) `results`.
|
527
569
|
# @!attribute [rw] error
|
528
570
|
# @return [Google::Rpc::Status]
|
529
|
-
#
|
530
|
-
#
|
571
|
+
# If set, returns a {Google::Rpc::Status} message that
|
572
|
+
# specifies the error for the operation.
|
531
573
|
# @!attribute [rw] results
|
532
574
|
# @return [Array<Google::Cloud::Speech::V1::StreamingRecognitionResult>]
|
533
|
-
#
|
575
|
+
# This repeated list contains zero or more results that
|
534
576
|
# correspond to consecutive portions of the audio currently being processed.
|
535
577
|
# It contains zero or one `is_final=true` result (the newly settled portion),
|
536
578
|
# followed by zero or more `is_final=false` results (the interim results).
|
537
579
|
# @!attribute [rw] speech_event_type
|
538
580
|
# @return [Google::Cloud::Speech::V1::StreamingRecognizeResponse::SpeechEventType]
|
539
|
-
#
|
581
|
+
# Indicates the type of speech event.
|
540
582
|
class StreamingRecognizeResponse
|
541
583
|
# Indicates the type of speech event.
|
542
584
|
module SpeechEventType
|
@@ -558,27 +600,27 @@ module Google
|
|
558
600
|
# that is currently being processed.
|
559
601
|
# @!attribute [rw] alternatives
|
560
602
|
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionAlternative>]
|
561
|
-
#
|
603
|
+
# May contain one or more recognition hypotheses (up to the
|
562
604
|
# maximum specified in `max_alternatives`).
|
563
605
|
# These alternatives are ordered in terms of accuracy, with the top (first)
|
564
606
|
# alternative being the most probable, as ranked by the recognizer.
|
565
607
|
# @!attribute [rw] is_final
|
566
608
|
# @return [true, false]
|
567
|
-
#
|
609
|
+
# If `false`, this `StreamingRecognitionResult` represents an
|
568
610
|
# interim result that may change. If `true`, this is the final time the
|
569
611
|
# speech service will return this particular `StreamingRecognitionResult`,
|
570
612
|
# the recognizer will not return any further hypotheses for this portion of
|
571
613
|
# the transcript and corresponding audio.
|
572
614
|
# @!attribute [rw] stability
|
573
615
|
# @return [Float]
|
574
|
-
#
|
616
|
+
# An estimate of the likelihood that the recognizer will not
|
575
617
|
# change its guess about this interim result. Values range from 0.0
|
576
618
|
# (completely unstable) to 1.0 (completely stable).
|
577
619
|
# This field is only provided for interim results (`is_final=false`).
|
578
620
|
# The default of 0.0 is a sentinel value indicating `stability` was not set.
|
579
621
|
# @!attribute [rw] result_end_time
|
580
622
|
# @return [Google::Protobuf::Duration]
|
581
|
-
#
|
623
|
+
# Time offset of the end of this result relative to the
|
582
624
|
# beginning of the audio.
|
583
625
|
# @!attribute [rw] channel_tag
|
584
626
|
# @return [Integer]
|
@@ -587,16 +629,15 @@ module Google
|
|
587
629
|
# For audio_channel_count = N, its output values can range from '1' to 'N'.
|
588
630
|
# @!attribute [rw] language_code
|
589
631
|
# @return [String]
|
590
|
-
#
|
591
|
-
#
|
592
|
-
#
|
593
|
-
# likelihood of being spoken in the audio.
|
632
|
+
# The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
|
633
|
+
# the language in this result. This language code was detected to have the
|
634
|
+
# highest likelihood of being spoken in the audio.
|
594
635
|
class StreamingRecognitionResult; end
|
595
636
|
|
596
637
|
# A speech recognition result corresponding to a portion of the audio.
|
597
638
|
# @!attribute [rw] alternatives
|
598
639
|
# @return [Array<Google::Cloud::Speech::V1::SpeechRecognitionAlternative>]
|
599
|
-
#
|
640
|
+
# May contain one or more recognition hypotheses (up to the
|
600
641
|
# maximum specified in `max_alternatives`).
|
601
642
|
# These alternatives are ordered in terms of accuracy, with the top (first)
|
602
643
|
# alternative being the most probable, as ranked by the recognizer.
|
@@ -610,10 +651,10 @@ module Google
|
|
610
651
|
# Alternative hypotheses (a.k.a. n-best list).
|
611
652
|
# @!attribute [rw] transcript
|
612
653
|
# @return [String]
|
613
|
-
#
|
654
|
+
# Transcript text representing the words that the user spoke.
|
614
655
|
# @!attribute [rw] confidence
|
615
656
|
# @return [Float]
|
616
|
-
#
|
657
|
+
# The confidence estimate between 0.0 and 1.0. A higher number
|
617
658
|
# indicates an estimated greater likelihood that the recognized words are
|
618
659
|
# correct. This field is set only for the top alternative of a non-streaming
|
619
660
|
# result, or of a streaming result where `is_final=true`.
|
@@ -622,7 +663,7 @@ module Google
|
|
622
663
|
# The default of 0.0 is a sentinel value indicating `confidence` was not set.
|
623
664
|
# @!attribute [rw] words
|
624
665
|
# @return [Array<Google::Cloud::Speech::V1::WordInfo>]
|
625
|
-
#
|
666
|
+
# A list of word-specific information for each recognized word.
|
626
667
|
# Note: When `enable_speaker_diarization` is true, you will see all the words
|
627
668
|
# from the beginning of the audio.
|
628
669
|
class SpeechRecognitionAlternative; end
|
@@ -630,7 +671,7 @@ module Google
|
|
630
671
|
# Word-specific information for recognized words.
|
631
672
|
# @!attribute [rw] start_time
|
632
673
|
# @return [Google::Protobuf::Duration]
|
633
|
-
#
|
674
|
+
# Time offset relative to the beginning of the audio,
|
634
675
|
# and corresponding to the start of the spoken word.
|
635
676
|
# This field is only set if `enable_word_time_offsets=true` and only
|
636
677
|
# in the top hypothesis.
|
@@ -638,7 +679,7 @@ module Google
|
|
638
679
|
# vary.
|
639
680
|
# @!attribute [rw] end_time
|
640
681
|
# @return [Google::Protobuf::Duration]
|
641
|
-
#
|
682
|
+
# Time offset relative to the beginning of the audio,
|
642
683
|
# and corresponding to the end of the spoken word.
|
643
684
|
# This field is only set if `enable_word_time_offsets=true` and only
|
644
685
|
# in the top hypothesis.
|
@@ -646,7 +687,7 @@ module Google
|
|
646
687
|
# vary.
|
647
688
|
# @!attribute [rw] word
|
648
689
|
# @return [String]
|
649
|
-
#
|
690
|
+
# The word corresponding to this set of information.
|
650
691
|
class WordInfo; end
|
651
692
|
end
|
652
693
|
end
|