google-cloud-speech 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,7 @@ require "google/cloud/env"
18
18
  require "google/cloud/speech/service"
19
19
  require "google/cloud/speech/audio"
20
20
  require "google/cloud/speech/result"
21
- require "google/cloud/speech/job"
21
+ require "google/cloud/speech/operation"
22
22
  require "google/cloud/speech/stream"
23
23
 
24
24
  module Google
@@ -44,7 +44,9 @@ module Google
44
44
  # speech = Google::Cloud::Speech.new
45
45
  #
46
46
  # audio = speech.audio "path/to/audio.raw",
47
- # encoding: :raw, sample_rate: 16000
47
+ # encoding: :raw,
48
+ # language: "en-US",
49
+ # sample_rate: 16000
48
50
  # results = audio.recognize
49
51
  #
50
52
  # result = results.first
@@ -120,18 +122,27 @@ module Google
120
122
  # be 8000 Hz.) (AMR)
121
123
  # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
122
124
  # be 16000 Hz.) (AMR_WB)
123
- #
125
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
126
+ #
127
+ # Lossy codecs are not recommended, as they result in a lower-quality
128
+ # speech transcription.
129
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
130
+ #
131
+ # Lossy codecs are not recommended, as they result in a lower-quality
132
+ # speech transcription. If you must use a low-bitrate encoder,
133
+ # OGG_OPUS is preferred.
134
+ #
135
+ # @param [String,Symbol] language The language of the supplied audio as
136
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
137
+ # "en-US" for English (United States), "en-GB" for English (United
138
+ # Kingdom), "fr-FR" for French (France). See [Language
139
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
140
+ # of the currently supported language codes. Optional.
124
141
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
125
142
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
126
143
  # For best results, set the sampling rate of the audio source to 16000
127
144
  # Hz. If that's not possible, use the native sample rate of the audio
128
145
  # source (instead of re-sampling). Optional.
129
- # @param [String] language The language of the supplied audio as a
130
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
131
- # code. If not specified, the language defaults to "en-US". See
132
- # [Language
133
- # Support](https://cloud.google.com/speech/docs/languages)
134
- # for a list of the currently supported language codes. Optional.
135
146
  #
136
147
  # @return [Audio] The audio file to be recognized.
137
148
  #
@@ -141,7 +152,9 @@ module Google
141
152
  # speech = Google::Cloud::Speech.new
142
153
  #
143
154
  # audio = speech.audio "path/to/audio.raw",
144
- # encoding: :raw, sample_rate: 16000
155
+ # encoding: :raw,
156
+ # language: "en-US",
157
+ # sample_rate: 16000
145
158
  #
146
159
  # @example With a Google Cloud Storage URI:
147
160
  # require "google/cloud/speech"
@@ -149,7 +162,9 @@ module Google
149
162
  # speech = Google::Cloud::Speech.new
150
163
  #
151
164
  # audio = speech.audio "gs://bucket-name/path/to/audio.raw",
152
- # encoding: :raw, sample_rate: 16000
165
+ # encoding: :raw,
166
+ # language: "en-US",
167
+ # sample_rate: 16000
153
168
  #
154
169
  # @example With a Google Cloud Storage File object:
155
170
  # require "google/cloud/storage"
@@ -163,17 +178,20 @@ module Google
163
178
  #
164
179
  # speech = Google::Cloud::Speech.new
165
180
  #
166
- # audio = speech.audio file, encoding: :raw, sample_rate: 16000
181
+ # audio = speech.audio file,
182
+ # encoding: :raw,
183
+ # language: "en-US",
184
+ # sample_rate: 16000
167
185
  #
168
- def audio source, encoding: nil, sample_rate: nil, language: nil
186
+ def audio source, encoding: nil, language: nil, sample_rate: nil
169
187
  if source.is_a? Audio
170
188
  audio = source.dup
171
189
  else
172
190
  audio = Audio.from_source source, self
173
191
  end
174
192
  audio.encoding = encoding unless encoding.nil?
175
- audio.sample_rate = sample_rate unless sample_rate.nil?
176
193
  audio.language = language unless language.nil?
194
+ audio.sample_rate = sample_rate unless sample_rate.nil?
177
195
  audio
178
196
  end
179
197
 
@@ -216,18 +234,27 @@ module Google
216
234
  # be 8000 Hz.) (AMR)
217
235
  # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
218
236
  # be 16000 Hz.) (AMR_WB)
219
- #
237
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
238
+ #
239
+ # Lossy codecs are not recommended, as they result in a lower-quality
240
+ # speech transcription.
241
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
242
+ #
243
+ # Lossy codecs are not recommended, as they result in a lower-quality
244
+ # speech transcription. If you must use a low-bitrate encoder,
245
+ # OGG_OPUS is preferred.
246
+ #
247
+ # @param [String,Symbol] language The language of the supplied audio as
248
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
249
+ # "en-US" for English (United States), "en-GB" for English (United
250
+ # Kingdom), "fr-FR" for French (France). See [Language
251
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
252
+ # of the currently supported language codes. Optional.
220
253
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
221
254
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
222
255
  # For best results, set the sampling rate of the audio source to 16000
223
256
  # Hz. If that's not possible, use the native sample rate of the audio
224
257
  # source (instead of re-sampling). Optional.
225
- # @param [String] language The language of the supplied audio as a
226
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
227
- # code. If not specified, the language defaults to "en-US". See
228
- # [Language
229
- # Support](https://cloud.google.com/speech/docs/languages)
230
- # for a list of the currently supported language codes. Optional.
231
258
  # @param [Integer] max_alternatives The maximum number of recognition
232
259
  # hypotheses to be returned. Default is 1. The service may return
233
260
  # fewer. Valid values are 0-30. Defaults to 1. Optional.
@@ -248,7 +275,9 @@ module Google
248
275
  # speech = Google::Cloud::Speech.new
249
276
  #
250
277
  # results = speech.recognize "path/to/audio.raw",
251
- # encoding: :raw, sample_rate: 16000
278
+ # encoding: :raw,
279
+ # language: "en-US",
280
+ # sample_rate: 16000
252
281
  #
253
282
  # @example With a Google Cloud Storage URI:
254
283
  # require "google/cloud/speech"
@@ -256,7 +285,9 @@ module Google
256
285
  # speech = Google::Cloud::Speech.new
257
286
  #
258
287
  # results = speech.recognize "gs://bucket-name/path/to/audio.raw",
259
- # encoding: :raw, sample_rate: 16000
288
+ # encoding: :raw,
289
+ # language: "en-US",
290
+ # sample_rate: 16000
260
291
  #
261
292
  # @example With a Google Cloud Storage File object:
262
293
  # require "google/cloud/storage"
@@ -270,16 +301,18 @@ module Google
270
301
  #
271
302
  # speech = Google::Cloud::Speech.new
272
303
  #
273
- # results = speech.recognize file, encoding: :raw,
304
+ # results = speech.recognize file,
305
+ # encoding: :raw,
306
+ # language: "en-US",
274
307
  # sample_rate: 16000,
275
308
  # max_alternatives: 10
276
309
  #
277
- def recognize source, encoding: nil, sample_rate: nil, language: nil,
310
+ def recognize source, encoding: nil, language: nil, sample_rate: nil,
278
311
  max_alternatives: nil, profanity_filter: nil, phrases: nil
279
312
  ensure_service!
280
313
 
281
- audio_obj = audio source, encoding: encoding,
282
- sample_rate: sample_rate, language: language
314
+ audio_obj = audio source, encoding: encoding, language: language,
315
+ sample_rate: sample_rate
283
316
 
284
317
  config = audio_config(
285
318
  encoding: audio_obj.encoding, sample_rate: audio_obj.sample_rate,
@@ -294,9 +327,9 @@ module Google
294
327
 
295
328
  ##
296
329
  # Performs asynchronous speech recognition. Requests are processed
297
- # asynchronously, meaning a Job is returned once the audio data has been
298
- # sent, and can be refreshed to retrieve recognition results once the
299
- # audio data has been processed.
330
+ # asynchronously, meaning an Operation is returned once the audio data
331
+ # has been sent, and can be refreshed to retrieve recognition results
332
+ # once the audio data has been processed.
300
333
  #
301
334
  # @see https://cloud.google.com/speech/docs/basics#async-responses
302
335
  # Asynchronous Speech API Responses
@@ -309,22 +342,41 @@ module Google
309
342
  # @param [String, Symbol] encoding Encoding of audio data to be
310
343
  # recognized. Optional.
311
344
  #
312
- # Currently, the only acceptable value is:
345
+ # Acceptable values are:
313
346
  #
314
347
  # * `raw` - Uncompressed 16-bit signed little-endian samples.
315
348
  # (LINEAR16)
316
- #
349
+ # * `flac` - The [Free Lossless Audio
350
+ # Codec](http://flac.sourceforge.net/documentation.html) encoding.
351
+ # Only 16-bit samples are supported. Not all fields in STREAMINFO
352
+ # are supported. (FLAC)
353
+ # * `mulaw` - 8-bit samples that compand 14-bit audio samples using
354
+ # G.711 PCMU/mu-law. (MULAW)
355
+ # * `amr` - Adaptive Multi-Rate Narrowband codec. (`sample_rate` must
356
+ # be 8000 Hz.) (AMR)
357
+ # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
358
+ # be 16000 Hz.) (AMR_WB)
359
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
360
+ #
361
+ # Lossy codecs are not recommended, as they result in a lower-quality
362
+ # speech transcription.
363
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
364
+ #
365
+ # Lossy codecs are not recommended, as they result in a lower-quality
366
+ # speech transcription. If you must use a low-bitrate encoder,
367
+ # OGG_OPUS is preferred.
368
+ #
369
+ # @param [String,Symbol] language The language of the supplied audio as
370
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
371
+ # "en-US" for English (United States), "en-GB" for English (United
372
+ # Kingdom), "fr-FR" for French (France). See [Language
373
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
374
+ # of the currently supported language codes. Optional.
317
375
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
318
376
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
319
377
  # For best results, set the sampling rate of the audio source to 16000
320
378
  # Hz. If that's not possible, use the native sample rate of the audio
321
379
  # source (instead of re-sampling). Optional.
322
- # @param [String] language The language of the supplied audio as a
323
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
324
- # code. If not specified, the language defaults to "en-US". See
325
- # [Language
326
- # Support](https://cloud.google.com/speech/docs/languages)
327
- # for a list of the currently supported language codes. Optional.
328
380
  # @param [Integer] max_alternatives The maximum number of recognition
329
381
  # hypotheses to be returned. Default is 1. The service may return
330
382
  # fewer. Valid values are 0-30. Defaults to 1. Optional.
@@ -337,30 +389,34 @@ module Google
337
389
  # recognize them. See [usage
338
390
  # limits](https://cloud.google.com/speech/limits#content). Optional.
339
391
  #
340
- # @return [Job] A resource represents the long-running, asynchronous
341
- # processing of a speech-recognition operation.
392
+ # @return [Operation] A resource that represents the long-running,
393
+ # asynchronous processing of a speech-recognition operation.
342
394
  #
343
395
  # @example
344
396
  # require "google/cloud/speech"
345
397
  #
346
398
  # speech = Google::Cloud::Speech.new
347
399
  #
348
- # job = speech.recognize_job "path/to/audio.raw",
349
- # encoding: :raw, sample_rate: 16000
400
+ # op = speech.process "path/to/audio.raw",
401
+ # encoding: :raw,
402
+ # language: "en-US",
403
+ # sample_rate: 16000
350
404
  #
351
- # job.done? #=> false
352
- # job.reload!
405
+ # op.done? #=> false
406
+ # op.reload!
353
407
  #
354
408
  # @example With a Google Cloud Storage URI:
355
409
  # require "google/cloud/speech"
356
410
  #
357
411
  # speech = Google::Cloud::Speech.new
358
412
  #
359
- # job = speech.recognize_job "gs://bucket-name/path/to/audio.raw",
360
- # encoding: :raw, sample_rate: 16000
413
+ # op = speech.process "gs://bucket-name/path/to/audio.raw",
414
+ # encoding: :raw,
415
+ # language: "en-US",
416
+ # sample_rate: 16000
361
417
  #
362
- # job.done? #=> false
363
- # job.reload!
418
+ # op.done? #=> false
419
+ # op.reload!
364
420
  #
365
421
  # @example With a Google Cloud Storage File object:
366
422
  # require "google/cloud/storage"
@@ -374,20 +430,21 @@ module Google
374
430
  #
375
431
  # speech = Google::Cloud::Speech.new
376
432
  #
377
- # job = speech.recognize_job file, encoding: :raw,
433
+ # op = speech.process file,
434
+ # encoding: :raw,
435
+ # language: "en-US",
378
436
  # sample_rate: 16000,
379
437
  # max_alternatives: 10
380
438
  #
381
- # job.done? #=> false
382
- # job.reload!
439
+ # op.done? #=> false
440
+ # op.reload!
383
441
  #
384
- def recognize_job source, encoding: nil, sample_rate: nil,
385
- language: nil, max_alternatives: nil,
386
- profanity_filter: nil, phrases: nil
442
+ def process source, encoding: nil, sample_rate: nil, language: nil,
443
+ max_alternatives: nil, profanity_filter: nil, phrases: nil
387
444
  ensure_service!
388
445
 
389
- audio_obj = audio source, encoding: encoding,
390
- sample_rate: sample_rate, language: language
446
+ audio_obj = audio source, encoding: encoding, language: language,
447
+ sample_rate: sample_rate
391
448
 
392
449
  config = audio_config(
393
450
  encoding: audio_obj.encoding, sample_rate: audio_obj.sample_rate,
@@ -395,8 +452,10 @@ module Google
395
452
  profanity_filter: profanity_filter, phrases: phrases)
396
453
 
397
454
  grpc = service.recognize_async audio_obj.to_grpc, config
398
- Job.from_grpc grpc
455
+ Operation.from_grpc grpc
399
456
  end
457
+ alias_method :long_running_recognize, :process
458
+ alias_method :recognize_job, :process
400
459
 
401
460
  ##
402
461
  # Creates a Stream object to perform bidirectional streaming
@@ -422,18 +481,27 @@ module Google
422
481
  # be 8000 Hz.) (AMR)
423
482
  # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
424
483
  # be 16000 Hz.) (AMR_WB)
425
- #
484
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
485
+ #
486
+ # Lossy codecs are not recommended, as they result in a lower-quality
487
+ # speech transcription.
488
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
489
+ #
490
+ # Lossy codecs are not recommended, as they result in a lower-quality
491
+ # speech transcription. If you must use a low-bitrate encoder,
492
+ # OGG_OPUS is preferred.
493
+ #
494
+ # @param [String,Symbol] language The language of the supplied audio as
495
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
496
+ # "en-US" for English (United States), "en-GB" for English (United
497
+ # Kingdom), "fr-FR" for French (France). See [Language
498
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
499
+ # of the currently supported language codes. Optional.
426
500
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
427
501
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
428
502
  # For best results, set the sampling rate of the audio source to 16000
429
503
  # Hz. If that's not possible, use the native sample rate of the audio
430
504
  # source (instead of re-sampling). Optional.
431
- # @param [String] language The language of the supplied audio as a
432
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
433
- # code. If not specified, the language defaults to "en-US". See
434
- # [Language
435
- # Support](https://cloud.google.com/speech/docs/languages)
436
- # for a list of the currently supported language codes. Optional.
437
505
  # @param [Integer] max_alternatives The maximum number of recognition
438
506
  # hypotheses to be returned. Default is 1. The service may return
439
507
  # fewer. Valid values are 0-30. Defaults to 1. Optional.
@@ -462,14 +530,9 @@ module Google
462
530
  #
463
531
  # speech = Google::Cloud::Speech.new
464
532
  #
465
- # stream = speech.stream encoding: :raw, sample_rate: 16000
466
- #
467
- # # register callback for when a result is returned
468
- # stream.on_result do |results|
469
- # result = results.first
470
- # puts result.transcript # "how old is the Brooklyn Bridge"
471
- # puts result.confidence # 0.9826789498329163
472
- # end
533
+ # stream = speech.stream encoding: :raw,
534
+ # language: "en-US",
535
+ # sample_rate: 16000
473
536
  #
474
537
  # # Stream 5 seconds of audio from the microphone
475
538
  # # Actual implementation of microphone input varies by platform
@@ -478,18 +541,24 @@ module Google
478
541
  # end
479
542
  #
480
543
  # stream.stop
544
+ # stream.wait_until_complete!
545
+ #
546
+ # results = stream.results
547
+ # result = results.first
548
+ # result.transcript #=> "how old is the Brooklyn Bridge"
549
+ # result.confidence #=> 0.9826789498329163
481
550
  #
482
- def stream encoding: nil, sample_rate: nil, language: nil,
551
+ def stream encoding: nil, language: nil, sample_rate: nil,
483
552
  max_alternatives: nil, profanity_filter: nil, phrases: nil,
484
553
  utterance: nil, interim: nil
485
554
  ensure_service!
486
555
 
487
- grpc_req = V1beta1::StreamingRecognizeRequest.new(
488
- streaming_config: V1beta1::StreamingRecognitionConfig.new(
556
+ grpc_req = V1::StreamingRecognizeRequest.new(
557
+ streaming_config: V1::StreamingRecognitionConfig.new(
489
558
  {
490
559
  config: audio_config(encoding: convert_encoding(encoding),
491
- sample_rate: sample_rate,
492
560
  language: language,
561
+ sample_rate: sample_rate,
493
562
  max_alternatives: max_alternatives,
494
563
  profanity_filter: profanity_filter,
495
564
  phrases: phrases),
@@ -501,28 +570,62 @@ module Google
501
570
 
502
571
  Stream.new service, grpc_req
503
572
  end
573
+ alias_method :stream_recognize, :stream
574
+
575
+ ##
576
+ # Retrieves an existing long-running speech recognition Operation by
577
+ # its unique identifier. The returned Operation can be refreshed to
578
+ # retrieve recognition results once the audio data has been
579
+ # processed.
580
+ #
581
+ # @see https://cloud.google.com/speech/reference/rpc/google.longrunning#google.longrunning.Operations
582
+ # Long-running Operation
583
+ #
584
+ # @param [String] id The unique identifier for the long running
585
+ # operation. Required.
586
+ #
587
+ # @return [Operation] A resource that represents the long-running,
588
+ # asynchronous processing of a speech-recognition operation.
589
+ #
590
+ # @example
591
+ # require "google/cloud/speech"
592
+ #
593
+ # speech = Google::Cloud::Speech.new
594
+ #
595
+ # op = speech.operation "1234567890"
596
+ #
597
+ # op.done? #=> false
598
+ # op.reload!
599
+ #
600
+ def operation id
601
+ ensure_service!
602
+
603
+ grpc = service.get_op id
604
+ Operation.from_grpc grpc
605
+ end
504
606
 
505
607
  protected
506
608
 
507
- def audio_config encoding: nil, sample_rate: nil, language: nil,
609
+ def audio_config encoding: nil, language: nil, sample_rate: nil,
508
610
  max_alternatives: nil, profanity_filter: nil,
509
611
  phrases: nil
510
- context = nil
511
- context = V1beta1::SpeechContext.new(phrases: phrases) if phrases
612
+ contexts = nil
613
+ contexts = [V1::SpeechContext.new(phrases: phrases)] if phrases
512
614
  language = String(language) unless language.nil?
513
- V1beta1::RecognitionConfig.new({
615
+ V1::RecognitionConfig.new({
514
616
  encoding: convert_encoding(encoding),
515
- sample_rate: sample_rate,
516
617
  language_code: language,
618
+ sample_rate_hertz: sample_rate,
517
619
  max_alternatives: max_alternatives,
518
620
  profanity_filter: profanity_filter,
519
- speech_context: context
621
+ speech_contexts: contexts
520
622
  }.delete_if { |_, v| v.nil? })
521
623
  end
522
624
 
523
625
  def convert_encoding encoding
524
626
  mapping = { raw: :LINEAR16, linear: :LINEAR16, linear16: :LINEAR16,
525
- flac: :FLAC, mulaw: :MULAW, amr: :AMR, amr_wb: :AMR_WB }
627
+ flac: :FLAC, mulaw: :MULAW, amr: :AMR, amr_wb: :AMR_WB,
628
+ ogg_opus: :OGG_OPUS, speex: :SPEEX_WITH_HEADER_BYTE }
526
629
  mapping[encoding] || encoding
527
630
  end
528
631