google-cloud-speech 0.40.1 → 1.1.1

This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/.yardopts +2 -1
  3. data/AUTHENTICATION.md +51 -59
  4. data/LICENSE.md +203 -0
  5. data/MIGRATING.md +367 -0
  6. data/README.md +35 -49
  7. data/lib/google-cloud-speech.rb +19 -0
  8. data/lib/google/cloud/speech.rb +88 -143
  9. data/lib/google/cloud/speech/version.rb +1 -1
  10. metadata +76 -68
  11. data/LICENSE +0 -201
  12. data/lib/google/cloud/speech/v1.rb +0 -166
  13. data/lib/google/cloud/speech/v1/cloud_speech_pb.rb +0 -192
  14. data/lib/google/cloud/speech/v1/cloud_speech_services_pb.rb +0 -58
  15. data/lib/google/cloud/speech/v1/credentials.rb +0 -41
  16. data/lib/google/cloud/speech/v1/doc/google/cloud/speech/v1/cloud_speech.rb +0 -698
  17. data/lib/google/cloud/speech/v1/doc/google/longrunning/operations.rb +0 -51
  18. data/lib/google/cloud/speech/v1/doc/google/protobuf/any.rb +0 -131
  19. data/lib/google/cloud/speech/v1/doc/google/protobuf/duration.rb +0 -91
  20. data/lib/google/cloud/speech/v1/doc/google/rpc/status.rb +0 -87
  21. data/lib/google/cloud/speech/v1/helpers.rb +0 -136
  22. data/lib/google/cloud/speech/v1/speech_client.rb +0 -343
  23. data/lib/google/cloud/speech/v1/speech_client_config.json +0 -41
  24. data/lib/google/cloud/speech/v1/stream.rb +0 -615
  25. data/lib/google/cloud/speech/v1p1beta1.rb +0 -166
  26. data/lib/google/cloud/speech/v1p1beta1/cloud_speech_pb.rb +0 -200
  27. data/lib/google/cloud/speech/v1p1beta1/cloud_speech_services_pb.rb +0 -58
  28. data/lib/google/cloud/speech/v1p1beta1/credentials.rb +0 -41
  29. data/lib/google/cloud/speech/v1p1beta1/doc/google/cloud/speech/v1p1beta1/cloud_speech.rb +0 -758
  30. data/lib/google/cloud/speech/v1p1beta1/doc/google/longrunning/operations.rb +0 -51
  31. data/lib/google/cloud/speech/v1p1beta1/doc/google/protobuf/any.rb +0 -131
  32. data/lib/google/cloud/speech/v1p1beta1/doc/google/protobuf/duration.rb +0 -91
  33. data/lib/google/cloud/speech/v1p1beta1/doc/google/rpc/status.rb +0 -87
  34. data/lib/google/cloud/speech/v1p1beta1/helpers.rb +0 -136
  35. data/lib/google/cloud/speech/v1p1beta1/speech_client.rb +0 -343
  36. data/lib/google/cloud/speech/v1p1beta1/speech_client_config.json +0 -41
  37. data/lib/google/cloud/speech/v1p1beta1/stream.rb +0 -615
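Items 11-41 remove the entire hand-written 0.x surface (`speech_client.rb`, `stream.rb`, the `doc/` reference files shown below), which the added MIGRATING.md replaces with generated clients. A minimal before/after sketch of the client change, assuming the factory and keyword-argument patterns MIGRATING.md describes; the bucket URI is illustrative:

require "google/cloud/speech"

config = { encoding: :LINEAR16, sample_rate_hertz: 16_000, language_code: "en-US" }
audio  = { uri: "gs://bucket_name/object_name" } # illustrative bucket/object

# Before (0.40.1): hand-written client, positional arguments.
old_client = Google::Cloud::Speech.new version: :v1
old_response = old_client.recognize config, audio

# After (1.1.1): wrapper factory returning a generated client, keyword arguments.
new_client = Google::Cloud::Speech.speech
new_response = new_client.recognize config: config, audio: audio

new_response.results.each do |result|
  puts result.alternatives.first.transcript
end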
data/lib/google/cloud/speech/v1p1beta1/doc/google/cloud/speech/v1p1beta1/cloud_speech.rb (deleted)
@@ -1,758 +0,0 @@
- # Copyright 2019 Google LLC
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     https://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
-
- module Google
-   module Cloud
-     module Speech
-       module V1p1beta1
-         # The top-level message sent by the client for the `Recognize` method.
-         # @!attribute [rw] config
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig]
-         #     Required. Provides information to the recognizer that specifies how to
-         #     process the request.
-         # @!attribute [rw] audio
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionAudio]
-         #     Required. The audio data to be recognized.
-         class RecognizeRequest; end
-
-         # The top-level message sent by the client for the `LongRunningRecognize`
-         # method.
-         # @!attribute [rw] config
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig]
-         #     Required. Provides information to the recognizer that specifies how to
-         #     process the request.
-         # @!attribute [rw] audio
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionAudio]
-         #     Required. The audio data to be recognized.
-         class LongRunningRecognizeRequest; end
-
-         # The top-level message sent by the client for the `StreamingRecognize` method.
-         # Multiple `StreamingRecognizeRequest` messages are sent. The first message
-         # must contain a `streaming_config` message and must not contain
-         # `audio_content`. All subsequent messages must contain `audio_content` and
-         # must not contain a `streaming_config` message.
-         # @!attribute [rw] streaming_config
-         #   @return [Google::Cloud::Speech::V1p1beta1::StreamingRecognitionConfig]
-         #     Provides information to the recognizer that specifies how to process the
-         #     request. The first `StreamingRecognizeRequest` message must contain a
-         #     `streaming_config` message.
-         # @!attribute [rw] audio_content
-         #   @return [String]
-         #     The audio data to be recognized. Sequential chunks of audio data are sent
-         #     in sequential `StreamingRecognizeRequest` messages. The first
-         #     `StreamingRecognizeRequest` message must not contain `audio_content` data
-         #     and all subsequent `StreamingRecognizeRequest` messages must contain
-         #     `audio_content` data. The audio bytes must be encoded as specified in
-         #     `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
-         #     pure binary representation (not base64). See
-         #     [content limits](https://cloud.google.com/speech-to-text/quotas#content).
-         class StreamingRecognizeRequest; end
-
-         # Provides information to the recognizer that specifies how to process the
-         # request.
-         # @!attribute [rw] config
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig]
-         #     Required. Provides information to the recognizer that specifies how to
-         #     process the request.
-         # @!attribute [rw] single_utterance
-         #   @return [true, false]
-         #     If `false` or omitted, the recognizer will perform continuous
-         #     recognition (continuing to wait for and process audio even if the user
-         #     pauses speaking) until the client closes the input stream (gRPC API) or
-         #     until the maximum time limit has been reached. May return multiple
-         #     `StreamingRecognitionResult`s with the `is_final` flag set to `true`.
-         #
-         #     If `true`, the recognizer will detect a single spoken utterance. When it
-         #     detects that the user has paused or stopped speaking, it will return an
-         #     `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no
-         #     more than one `StreamingRecognitionResult` with the `is_final` flag set to
-         #     `true`.
-         # @!attribute [rw] interim_results
-         #   @return [true, false]
-         #     If `true`, interim results (tentative hypotheses) may be
-         #     returned as they become available (these interim results are indicated with
-         #     the `is_final=false` flag).
-         #     If `false` or omitted, only `is_final=true` result(s) are returned.
-         class StreamingRecognitionConfig; end
-
-         # Provides information to the recognizer that specifies how to process the
-         # request.
-         # @!attribute [rw] encoding
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding]
-         #     Encoding of audio data sent in all `RecognitionAudio` messages.
-         #     This field is optional for `FLAC` and `WAV` audio files and required
-         #     for all other audio formats. For details, see {Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding AudioEncoding}.
-         # @!attribute [rw] sample_rate_hertz
-         #   @return [Integer]
-         #     Sample rate in Hertz of the audio data sent in all
-         #     `RecognitionAudio` messages. Valid values are: 8000-48000.
-         #     16000 is optimal. For best results, set the sampling rate of the audio
-         #     source to 16000 Hz. If that's not possible, use the native sample rate of
-         #     the audio source (instead of re-sampling).
-         #     This field is optional for FLAC and WAV audio files, but is
-         #     required for all other audio formats. For details, see {Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding AudioEncoding}.
-         # @!attribute [rw] audio_channel_count
-         #   @return [Integer]
-         #     The number of channels in the input audio data.
-         #     ONLY set this for MULTI-CHANNEL recognition.
-         #     Valid values for LINEAR16 and FLAC are `1`-`8`.
-         #     Valid values for OGG_OPUS are '1'-'254'.
-         #     Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
-         #     If `0` or omitted, defaults to one channel (mono).
-         #     Note: We only recognize the first channel by default.
-         #     To perform independent recognition on each channel set
-         #     `enable_separate_recognition_per_channel` to 'true'.
-         # @!attribute [rw] enable_separate_recognition_per_channel
-         #   @return [true, false]
-         #     This needs to be set to `true` explicitly and `audio_channel_count` > 1
-         #     to get each channel recognized separately. The recognition result will
-         #     contain a `channel_tag` field to state which channel that result belongs
-         #     to. If this is not true, we will only recognize the first channel. The
-         #     request is billed cumulatively for all channels recognized:
-         #     `audio_channel_count` multiplied by the length of the audio.
-         # @!attribute [rw] language_code
-         #   @return [String]
-         #     Required. The language of the supplied audio as a
-         #     [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
-         #     Example: "en-US".
-         #     See [Language
-         #     Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
-         #     of the currently supported language codes.
-         # @!attribute [rw] alternative_language_codes
-         #   @return [Array<String>]
-         #     A list of up to 3 additional
-         #     [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
-         #     listing possible alternative languages of the supplied audio.
-         #     See [Language
-         #     Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
-         #     of the currently supported language codes. If alternative languages are
-         #     listed, the recognition result will contain recognition in the most likely
-         #     language detected, including the main language_code. The recognition result
-         #     will include the language tag of the language detected in the audio. Note:
-         #     This feature is only supported for Voice Command and Voice Search use cases
-         #     and performance may vary for other use cases (e.g., phone call
-         #     transcription).
-         # @!attribute [rw] max_alternatives
-         #   @return [Integer]
-         #     Maximum number of recognition hypotheses to be returned.
-         #     Specifically, the maximum number of `SpeechRecognitionAlternative` messages
-         #     within each `SpeechRecognitionResult`.
-         #     The server may return fewer than `max_alternatives`.
-         #     Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
-         #     one. If omitted, will return a maximum of one.
-         # @!attribute [rw] profanity_filter
-         #   @return [true, false]
-         #     If set to `true`, the server will attempt to filter out
-         #     profanities, replacing all but the initial character in each filtered word
-         #     with asterisks, e.g. "f***". If set to `false` or omitted, profanities
-         #     won't be filtered out.
-         # @!attribute [rw] speech_contexts
-         #   @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechContext>]
-         #     Array of {Google::Cloud::Speech::V1p1beta1::SpeechContext SpeechContext}.
-         #     A means to provide context to assist the speech recognition. For more
-         #     information, see
-         #     [speech
-         #     adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
-         # @!attribute [rw] enable_word_time_offsets
-         #   @return [true, false]
-         #     If `true`, the top result includes a list of words and
-         #     the start and end time offsets (timestamps) for those words. If
-         #     `false`, no word-level time offset information is returned. The default is
-         #     `false`.
-         # @!attribute [rw] enable_word_confidence
-         #   @return [true, false]
-         #     If `true`, the top result includes a list of words and the
-         #     confidence for those words. If `false`, no word-level confidence
-         #     information is returned. The default is `false`.
-         # @!attribute [rw] enable_automatic_punctuation
-         #   @return [true, false]
-         #     If 'true', adds punctuation to recognition result hypotheses.
-         #     This feature is only available in select languages. Setting this for
-         #     requests in other languages has no effect at all.
-         #     The default 'false' value does not add punctuation to result hypotheses.
-         #     Note: This is currently offered as an experimental service, complimentary
-         #     to all users. In the future this may be exclusively available as a
-         #     premium feature.
-         # @!attribute [rw] enable_speaker_diarization
-         #   @return [true, false]
-         #     If 'true', enables speaker detection for each recognized word in
-         #     the top alternative of the recognition result using a speaker_tag provided
-         #     in the WordInfo.
-         #     Note: Use diarization_config instead.
-         # @!attribute [rw] diarization_speaker_count
-         #   @return [Integer]
-         #     If set, specifies the estimated number of speakers in the conversation.
-         #     Defaults to '2'. Ignored unless enable_speaker_diarization is set to true.
-         #     Note: Use diarization_config instead.
-         # @!attribute [rw] diarization_config
-         #   @return [Google::Cloud::Speech::V1p1beta1::SpeakerDiarizationConfig]
-         #     Config to enable speaker diarization and set additional
-         #     parameters to make diarization better suited for your application.
-         #     Note: When this is enabled, we send all the words from the beginning of the
-         #     audio for the top alternative in every consecutive STREAMING response.
-         #     This is done in order to improve our speaker tags as our models learn to
-         #     identify the speakers in the conversation over time.
-         #     For non-streaming requests, the diarization results will be provided only
-         #     in the top alternative of the FINAL SpeechRecognitionResult.
-         # @!attribute [rw] metadata
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionMetadata]
-         #     Metadata regarding this request.
-         # @!attribute [rw] model
-         #   @return [String]
-         #     Which model to select for the given request. Select the model
-         #     best suited to your domain to get best results. If a model is not
-         #     explicitly specified, then we auto-select a model based on the parameters
-         #     in the RecognitionConfig.
-         #     <table>
-         #       <tr>
-         #         <td><b>Model</b></td>
-         #         <td><b>Description</b></td>
-         #       </tr>
-         #       <tr>
-         #         <td><code>command_and_search</code></td>
-         #         <td>Best for short queries such as voice commands or voice search.</td>
-         #       </tr>
-         #       <tr>
-         #         <td><code>phone_call</code></td>
-         #         <td>Best for audio that originated from a phone call (typically
-         #             recorded at an 8khz sampling rate).</td>
-         #       </tr>
-         #       <tr>
-         #         <td><code>video</code></td>
-         #         <td>Best for audio that originated from video or includes multiple
-         #             speakers. Ideally the audio is recorded at a 16khz or greater
-         #             sampling rate. This is a premium model that costs more than the
-         #             standard rate.</td>
-         #       </tr>
-         #       <tr>
-         #         <td><code>default</code></td>
-         #         <td>Best for audio that is not one of the specific audio models.
-         #             For example, long-form audio. Ideally the audio is high-fidelity,
-         #             recorded at a 16khz or greater sampling rate.</td>
-         #       </tr>
-         #     </table>
-         # @!attribute [rw] use_enhanced
-         #   @return [true, false]
-         #     Set to true to use an enhanced model for speech recognition.
-         #     If `use_enhanced` is set to true and the `model` field is not set, then
-         #     an appropriate enhanced model is chosen if an enhanced model exists for
-         #     the audio.
-         #
-         #     If `use_enhanced` is true and an enhanced version of the specified model
-         #     does not exist, then the speech is recognized using the standard version
-         #     of the specified model.
-         class RecognitionConfig
-           # The encoding of the audio data sent in the request.
-           #
-           # All encodings support only 1 channel (mono) audio, unless the
-           # `audio_channel_count` and `enable_separate_recognition_per_channel` fields
-           # are set.
-           #
-           # For best results, the audio source should be captured and transmitted using
-           # a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
-           # recognition can be reduced if lossy codecs are used to capture or transmit
-           # audio, particularly if background noise is present. Lossy codecs include
-           # `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
-           #
-           # The `FLAC` and `WAV` audio file formats include a header that describes the
-           # included audio content. You can request recognition for `WAV` files that
-           # contain either `LINEAR16` or `MULAW` encoded audio.
-           # If you send `FLAC` or `WAV` audio file format in
-           # your request, you do not need to specify an `AudioEncoding`; the audio
-           # encoding format is determined from the file header. If you specify
-           # an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
-           # encoding configuration must match the encoding described in the audio
-           # header; otherwise the request returns an
-           # {Google::Rpc::Code::INVALID_ARGUMENT} error code.
-           module AudioEncoding
-             # Not specified.
-             ENCODING_UNSPECIFIED = 0
-
-             # Uncompressed 16-bit signed little-endian samples (Linear PCM).
-             LINEAR16 = 1
-
-             # `FLAC` (Free Lossless Audio
-             # Codec) is the recommended encoding because it is
-             # lossless--therefore recognition is not compromised--and
-             # requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
-             # encoding supports 16-bit and 24-bit samples, however, not all fields in
-             # `STREAMINFO` are supported.
-             FLAC = 2
-
-             # 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
-             MULAW = 3
-
-             # Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
-             AMR = 4
-
-             # Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
-             AMR_WB = 5
-
-             # Opus encoded audio frames in Ogg container
-             # ([OggOpus](https://wiki.xiph.org/OggOpus)).
-             # `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
-             OGG_OPUS = 6
-
-             # Although the use of lossy encodings is not recommended, if a very low
-             # bitrate encoding is required, `OGG_OPUS` is highly preferred over
-             # Speex encoding. The [Speex](https://speex.org/) encoding supported by
-             # Cloud Speech API has a header byte in each block, as in MIME type
-             # `audio/x-speex-with-header-byte`.
-             # It is a variant of the RTP Speex encoding defined in
-             # [RFC 5574](https://tools.ietf.org/html/rfc5574).
-             # The stream is a sequence of blocks, one block per RTP packet. Each block
-             # starts with a byte containing the length of the block, in bytes, followed
-             # by one or more frames of Speex data, padded to an integral number of
-             # bytes (octets) as specified in RFC 5574. In other words, each RTP header
-             # is replaced with a single byte containing the block length. Only Speex
-             # wideband is supported. `sample_rate_hertz` must be 16000.
-             SPEEX_WITH_HEADER_BYTE = 7
-
-             # MP3 audio. Supports all standard MP3 bitrates (which range from 32-320
-             # kbps). When using this encoding, `sample_rate_hertz` can be optionally
-             # unset if not known.
-             MP3 = 8
-           end
-         end
-
-         # Config to enable speaker diarization.
-         # @!attribute [rw] enable_speaker_diarization
-         #   @return [true, false]
-         #     If 'true', enables speaker detection for each recognized word in
-         #     the top alternative of the recognition result using a speaker_tag provided
-         #     in the WordInfo.
-         # @!attribute [rw] min_speaker_count
-         #   @return [Integer]
-         #     Minimum number of speakers in the conversation. This range gives you more
-         #     flexibility by allowing the system to automatically determine the correct
-         #     number of speakers. If not set, the default value is 2.
-         # @!attribute [rw] max_speaker_count
-         #   @return [Integer]
-         #     Maximum number of speakers in the conversation. This range gives you more
-         #     flexibility by allowing the system to automatically determine the correct
-         #     number of speakers. If not set, the default value is 6.
-         class SpeakerDiarizationConfig; end
-
-         # Description of audio data to be recognized.
-         # @!attribute [rw] interaction_type
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionMetadata::InteractionType]
-         #     The use case most closely describing the audio content to be recognized.
-         # @!attribute [rw] industry_naics_code_of_audio
-         #   @return [Integer]
-         #     The industry vertical to which this speech recognition request most
-         #     closely applies. This is most indicative of the topics contained
-         #     in the audio. Use the 6-digit NAICS code to identify the industry
-         #     vertical - see https://www.naics.com/search/.
-         # @!attribute [rw] microphone_distance
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionMetadata::MicrophoneDistance]
-         #     The audio type that most closely describes the audio being recognized.
-         # @!attribute [rw] original_media_type
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionMetadata::OriginalMediaType]
-         #     The original media the speech was recorded on.
-         # @!attribute [rw] recording_device_type
-         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionMetadata::RecordingDeviceType]
-         #     The type of device the speech was recorded with.
-         # @!attribute [rw] recording_device_name
-         #   @return [String]
-         #     The device used to make the recording. Examples: 'Nexus 5X' or
-         #     'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
-         #     'Cardioid Microphone'.
-         # @!attribute [rw] original_mime_type
-         #   @return [String]
-         #     Mime type of the original audio file. For example `audio/m4a`,
-         #     `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
-         #     A list of possible audio mime types is maintained at
-         #     http://www.iana.org/assignments/media-types/media-types.xhtml#audio
-         # @!attribute [rw] obfuscated_id
-         #   @return [Integer]
-         #     Obfuscated (privacy-protected) ID of the user, to identify number of
-         #     unique users using the service.
-         # @!attribute [rw] audio_topic
-         #   @return [String]
-         #     Description of the content. Eg. "Recordings of federal supreme court
-         #     hearings from 2012".
-         class RecognitionMetadata
-           # Use case categories that the audio recognition request can be described
-           # by.
-           module InteractionType
-             # Use case is either unknown or is something other than one of the other
-             # values below.
-             INTERACTION_TYPE_UNSPECIFIED = 0
-
-             # Multiple people in a conversation or discussion. For example in a
-             # meeting with two or more people actively participating. Typically
-             # all the primary people speaking would be in the same room (if not,
-             # see PHONE_CALL)
-             DISCUSSION = 1
-
-             # One or more persons lecturing or presenting to others, mostly
-             # uninterrupted.
-             PRESENTATION = 2
-
-             # A phone-call or video-conference in which two or more people, who are
-             # not in the same room, are actively participating.
-             PHONE_CALL = 3
-
-             # A recorded message intended for another person to listen to.
-             VOICEMAIL = 4
-
-             # Professionally produced audio (eg. TV Show, Podcast).
-             PROFESSIONALLY_PRODUCED = 5
-
-             # Transcribe spoken questions and queries into text.
-             VOICE_SEARCH = 6
-
-             # Transcribe voice commands, such as for controlling a device.
-             VOICE_COMMAND = 7
-
-             # Transcribe speech to text to create a written document, such as a
-             # text-message, email or report.
-             DICTATION = 8
-           end
-
-           # Enumerates the types of capture settings describing an audio file.
-           module MicrophoneDistance
-             # Audio type is not known.
-             MICROPHONE_DISTANCE_UNSPECIFIED = 0
-
-             # The audio was captured from a closely placed microphone. Eg. phone,
-             # dictaphone, or handheld microphone. Generally, the speaker is within
-             # 1 meter of the microphone.
-             NEARFIELD = 1
-
-             # The speaker is within 3 meters of the microphone.
-             MIDFIELD = 2
-
-             # The speaker is more than 3 meters away from the microphone.
-             FARFIELD = 3
-           end
-
-           # The original media the speech was recorded on.
-           module OriginalMediaType
-             # Unknown original media type.
-             ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
-
-             # The speech data is an audio recording.
-             AUDIO = 1
-
-             # The speech data was originally recorded on a video.
-             VIDEO = 2
-           end
-
-           # The type of device the speech was recorded with.
-           module RecordingDeviceType
-             # The recording device is unknown.
-             RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
-
-             # Speech was recorded on a smartphone.
-             SMARTPHONE = 1
-
-             # Speech was recorded using a personal computer or tablet.
-             PC = 2
-
-             # Speech was recorded over a phone line.
-             PHONE_LINE = 3
-
-             # Speech was recorded in a vehicle.
-             VEHICLE = 4
-
-             # Speech was recorded outdoors.
-             OTHER_OUTDOOR_DEVICE = 5
-
-             # Speech was recorded indoors.
-             OTHER_INDOOR_DEVICE = 6
-           end
-         end
-
-         # Provides "hints" to the speech recognizer to favor specific words and phrases
-         # in the results.
-         # @!attribute [rw] phrases
-         #   @return [Array<String>]
-         #     A list of strings containing word and phrase "hints" so that
-         #     the speech recognition is more likely to recognize them. This can be used
-         #     to improve the accuracy for specific words and phrases, for example, if
-         #     specific commands are typically spoken by the user. This can also be used
-         #     to add additional words to the vocabulary of the recognizer. See
-         #     [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
-         #
-         #     List items can also be set to classes for groups of words that represent
-         #     common concepts that occur in natural language. For example, rather than
-         #     providing phrase hints for every month of the year, using the $MONTH class
-         #     improves the likelihood of correctly transcribing audio that includes
-         #     months.
-         # @!attribute [rw] boost
-         #   @return [Float]
-         #     Hint Boost. A positive value will increase the probability that a specific
-         #     phrase will be recognized over other similar sounding phrases. The higher
-         #     the boost, the higher the chance of false positive recognition as well.
-         #     Negative boost values would correspond to anti-biasing. Anti-biasing is not
-         #     enabled, so negative boost will simply be ignored. Though `boost` can
-         #     accept a wide range of positive values, most use cases are best served with
-         #     values between 0 and 20. We recommend using a binary search approach to
-         #     finding the optimal value for your use case.
-         class SpeechContext; end
-
-         # Contains audio data in the encoding specified in the `RecognitionConfig`.
-         # Either `content` or `uri` must be supplied. Supplying both or neither
-         # returns {Google::Rpc::Code::INVALID_ARGUMENT}. See
-         # [content limits](https://cloud.google.com/speech-to-text/quotas#content).
-         # @!attribute [rw] content
-         #   @return [String]
-         #     The audio data bytes encoded as specified in
-         #     `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
-         #     pure binary representation, whereas JSON representations use base64.
-         # @!attribute [rw] uri
-         #   @return [String]
-         #     URI that points to a file that contains audio data bytes as specified in
-         #     `RecognitionConfig`. The file must not be compressed (for example, gzip).
-         #     Currently, only Google Cloud Storage URIs are
-         #     supported, which must be specified in the following format:
-         #     `gs://bucket_name/object_name` (other URI formats return
-         #     {Google::Rpc::Code::INVALID_ARGUMENT}). For more information, see
-         #     [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
-         class RecognitionAudio; end
-
-         # The only message returned to the client by the `Recognize` method. It
-         # contains the result as zero or more sequential `SpeechRecognitionResult`
-         # messages.
-         # @!attribute [rw] results
-         #   @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionResult>]
-         #     Sequential list of transcription results corresponding to
-         #     sequential portions of audio.
-         class RecognizeResponse; end
-
-         # The only message returned to the client by the `LongRunningRecognize` method.
-         # It contains the result as zero or more sequential `SpeechRecognitionResult`
-         # messages. It is included in the `result.response` field of the `Operation`
-         # returned by the `GetOperation` call of the `google::longrunning::Operations`
-         # service.
-         # @!attribute [rw] results
-         #   @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionResult>]
-         #     Sequential list of transcription results corresponding to
-         #     sequential portions of audio.
-         class LongRunningRecognizeResponse; end
-
-         # Describes the progress of a long-running `LongRunningRecognize` call. It is
-         # included in the `metadata` field of the `Operation` returned by the
-         # `GetOperation` call of the `google::longrunning::Operations` service.
-         # @!attribute [rw] progress_percent
-         #   @return [Integer]
-         #     Approximate percentage of audio processed thus far. Guaranteed to be 100
-         #     when the audio is fully processed and the results are available.
-         # @!attribute [rw] start_time
-         #   @return [Google::Protobuf::Timestamp]
-         #     Time when the request was received.
-         # @!attribute [rw] last_update_time
-         #   @return [Google::Protobuf::Timestamp]
-         #     Time of the most recent processing update.
-         class LongRunningRecognizeMetadata; end
-
-         # `StreamingRecognizeResponse` is the only message returned to the client by
-         # `StreamingRecognize`. A series of zero or more `StreamingRecognizeResponse`
-         # messages are streamed back to the client. If there is no recognizable
-         # audio, and `single_utterance` is set to false, then no messages are streamed
-         # back to the client.
-         #
-         # Here's an example of a series of seven `StreamingRecognizeResponse`s that might
-         # be returned while processing audio:
-         #
-         # 1. results { alternatives { transcript: "tube" } stability: 0.01 }
-         #
-         # 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
-         #
-         # 3. results { alternatives { transcript: "to be" } stability: 0.9 }
-         #    results { alternatives { transcript: " or not to be" } stability: 0.01 }
-         #
-         # 4. results { alternatives { transcript: "to be or not to be"
-         #                             confidence: 0.92 }
-         #              alternatives { transcript: "to bee or not to bee" }
-         #              is_final: true }
-         #
-         # 5. results { alternatives { transcript: " that's" } stability: 0.01 }
-         #
-         # 6. results { alternatives { transcript: " that is" } stability: 0.9 }
-         #    results { alternatives { transcript: " the question" } stability: 0.01 }
-         #
-         # 7. results { alternatives { transcript: " that is the question"
-         #                             confidence: 0.98 }
-         #              alternatives { transcript: " that was the question" }
-         #              is_final: true }
-         #
-         # Notes:
-         #
-         # * Only two of the above responses, #4 and #7, contain final results; they are
-         #   indicated by `is_final: true`. Concatenating these together generates the
-         #   full transcript: "to be or not to be that is the question".
-         #
-         # * The others contain interim `results`. #3 and #6 contain two interim
-         #   `results`: the first portion has a high stability and is less likely to
-         #   change; the second portion has a low stability and is very likely to
-         #   change. A UI designer might choose to show only high stability `results`.
-         #
-         # * The specific `stability` and `confidence` values shown above are only for
-         #   illustrative purposes. Actual values may vary.
-         #
-         # * In each response, only one of these fields will be set:
-         #   `error`,
-         #   `speech_event_type`, or
-         #   one or more (repeated) `results`.
-         # @!attribute [rw] error
-         #   @return [Google::Rpc::Status]
-         #     If set, returns a {Google::Rpc::Status} message that
-         #     specifies the error for the operation.
-         # @!attribute [rw] results
-         #   @return [Array<Google::Cloud::Speech::V1p1beta1::StreamingRecognitionResult>]
-         #     This repeated list contains zero or more results that
-         #     correspond to consecutive portions of the audio currently being processed.
-         #     It contains zero or one `is_final=true` result (the newly settled portion),
-         #     followed by zero or more `is_final=false` results (the interim results).
-         # @!attribute [rw] speech_event_type
-         #   @return [Google::Cloud::Speech::V1p1beta1::StreamingRecognizeResponse::SpeechEventType]
-         #     Indicates the type of speech event.
-         class StreamingRecognizeResponse
-           # Indicates the type of speech event.
-           module SpeechEventType
-             # No speech event specified.
-             SPEECH_EVENT_UNSPECIFIED = 0
-
-             # This event indicates that the server has detected the end of the user's
-             # speech utterance and expects no additional speech. Therefore, the server
-             # will not process additional audio (although it may subsequently return
-             # additional results). The client should stop sending additional audio
-             # data, half-close the gRPC connection, and wait for any additional results
-             # until the server closes the gRPC connection. This event is only sent if
-             # `single_utterance` was set to `true`, and is not used otherwise.
-             END_OF_SINGLE_UTTERANCE = 1
-           end
-         end
-
-         # A streaming speech recognition result corresponding to a portion of the audio
-         # that is currently being processed.
-         # @!attribute [rw] alternatives
-         #   @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionAlternative>]
-         #     May contain one or more recognition hypotheses (up to the
-         #     maximum specified in `max_alternatives`).
-         #     These alternatives are ordered in terms of accuracy, with the top (first)
-         #     alternative being the most probable, as ranked by the recognizer.
-         # @!attribute [rw] is_final
-         #   @return [true, false]
-         #     If `false`, this `StreamingRecognitionResult` represents an
-         #     interim result that may change. If `true`, this is the final time the
-         #     speech service will return this particular `StreamingRecognitionResult`;
-         #     the recognizer will not return any further hypotheses for this portion of
-         #     the transcript and corresponding audio.
-         # @!attribute [rw] stability
-         #   @return [Float]
-         #     An estimate of the likelihood that the recognizer will not
-         #     change its guess about this interim result. Values range from 0.0
-         #     (completely unstable) to 1.0 (completely stable).
-         #     This field is only provided for interim results (`is_final=false`).
-         #     The default of 0.0 is a sentinel value indicating `stability` was not set.
-         # @!attribute [rw] result_end_time
-         #   @return [Google::Protobuf::Duration]
-         #     Time offset of the end of this result relative to the
-         #     beginning of the audio.
-         # @!attribute [rw] channel_tag
-         #   @return [Integer]
-         #     For multi-channel audio, this is the channel number corresponding to the
-         #     recognized result for the audio from that channel.
-         #     For audio_channel_count = N, its output values can range from '1' to 'N'.
-         # @!attribute [rw] language_code
-         #   @return [String]
-         #     The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
-         #     of the language in this result. This language code was detected to have
-         #     the most likelihood of being spoken in the audio.
-         class StreamingRecognitionResult; end
-
-         # A speech recognition result corresponding to a portion of the audio.
-         # @!attribute [rw] alternatives
-         #   @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionAlternative>]
-         #     May contain one or more recognition hypotheses (up to the
-         #     maximum specified in `max_alternatives`).
-         #     These alternatives are ordered in terms of accuracy, with the top (first)
-         #     alternative being the most probable, as ranked by the recognizer.
-         # @!attribute [rw] channel_tag
-         #   @return [Integer]
-         #     For multi-channel audio, this is the channel number corresponding to the
-         #     recognized result for the audio from that channel.
-         #     For audio_channel_count = N, its output values can range from '1' to 'N'.
-         # @!attribute [rw] language_code
-         #   @return [String]
-         #     The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
-         #     of the language in this result. This language code was detected to have
-         #     the most likelihood of being spoken in the audio.
-         class SpeechRecognitionResult; end
-
-         # Alternative hypotheses (a.k.a. n-best list).
-         # @!attribute [rw] transcript
-         #   @return [String]
-         #     Transcript text representing the words that the user spoke.
-         # @!attribute [rw] confidence
-         #   @return [Float]
-         #     The confidence estimate between 0.0 and 1.0. A higher number
-         #     indicates an estimated greater likelihood that the recognized words are
-         #     correct. This field is set only for the top alternative of a non-streaming
-         #     result or of a streaming result where `is_final=true`.
-         #     This field is not guaranteed to be accurate and users should not rely on it
-         #     to be always provided.
-         #     The default of 0.0 is a sentinel value indicating `confidence` was not set.
-         # @!attribute [rw] words
-         #   @return [Array<Google::Cloud::Speech::V1p1beta1::WordInfo>]
-         #     A list of word-specific information for each recognized word.
-         #     Note: When `enable_speaker_diarization` is true, you will see all the words
-         #     from the beginning of the audio.
-         class SpeechRecognitionAlternative; end
-
-         # Word-specific information for recognized words.
-         # @!attribute [rw] start_time
-         #   @return [Google::Protobuf::Duration]
-         #     Time offset relative to the beginning of the audio,
-         #     and corresponding to the start of the spoken word.
-         #     This field is only set if `enable_word_time_offsets=true` and only
-         #     in the top hypothesis.
-         #     This is an experimental feature and the accuracy of the time offset can
-         #     vary.
-         # @!attribute [rw] end_time
-         #   @return [Google::Protobuf::Duration]
-         #     Time offset relative to the beginning of the audio,
-         #     and corresponding to the end of the spoken word.
-         #     This field is only set if `enable_word_time_offsets=true` and only
-         #     in the top hypothesis.
-         #     This is an experimental feature and the accuracy of the time offset can
-         #     vary.
-         # @!attribute [rw] word
-         #   @return [String]
-         #     The word corresponding to this set of information.
-         # @!attribute [rw] confidence
-         #   @return [Float]
-         #     The confidence estimate between 0.0 and 1.0. A higher number
-         #     indicates an estimated greater likelihood that the recognized words are
-         #     correct. This field is set only for the top alternative of a non-streaming
-         #     result or of a streaming result where `is_final=true`.
-         #     This field is not guaranteed to be accurate and users should not rely on it
-         #     to be always provided.
-         #     The default of 0.0 is a sentinel value indicating `confidence` was not set.
-         # @!attribute [rw] speaker_tag
-         #   @return [Integer]
-         #     A distinct integer value is assigned for every speaker within
-         #     the audio. This field specifies which one of those speakers was detected to
-         #     have spoken this word. Value ranges from '1' to diarization_speaker_count.
-         #     speaker_tag is set if enable_speaker_diarization = 'true' and only in the
-         #     top alternative.
-         class WordInfo; end
-       end
-     end
-   end
- end
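The `StreamingRecognize` contract documented above (the first request carries only `streaming_config`, every later request carries only `audio_content`) also applies to the generated clients that replace this file in 1.1.1. A minimal sketch against the 1.x surface, assuming the `Google::Cloud::Speech.speech` factory and an illustrative local file name; not the authoritative migration path, for which see the added MIGRATING.md:

require "google/cloud/speech"

client = Google::Cloud::Speech.speech # generated V1 client via wrapper factory

config = { encoding: :LINEAR16, sample_rate_hertz: 16_000, language_code: "en-US" }

# Per the contract above: first request is streaming_config only,
# subsequent requests are audio_content only.
requests = [
  { streaming_config: { config: config, interim_results: true } },
  { audio_content: File.binread("audio.raw") } # illustrative file name
]

# The generated client takes an Enumerable of requests and returns an
# Enumerable of StreamingRecognizeResponse messages.
client.streaming_recognize(requests).each do |response|
  response.results.each do |result|
    puts result.alternatives.first.transcript if result.is_final
  end
end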