google-cloud-speech 0.23.0 → 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -18,7 +18,7 @@ require "google/cloud/env"
18
18
  require "google/cloud/speech/service"
19
19
  require "google/cloud/speech/audio"
20
20
  require "google/cloud/speech/result"
21
- require "google/cloud/speech/job"
21
+ require "google/cloud/speech/operation"
22
22
  require "google/cloud/speech/stream"
23
23
 
24
24
  module Google
@@ -44,7 +44,9 @@ module Google
44
44
  # speech = Google::Cloud::Speech.new
45
45
  #
46
46
  # audio = speech.audio "path/to/audio.raw",
47
- # encoding: :raw, sample_rate: 16000
47
+ # encoding: :raw,
48
+ # language: "en-US",
49
+ # sample_rate: 16000
48
50
  # results = audio.recognize
49
51
  #
50
52
  # result = results.first
@@ -120,18 +122,27 @@ module Google
120
122
  # be 8000 Hz.) (AMR)
121
123
  # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
122
124
  # be 16000 Hz.) (AMR_WB)
123
- #
125
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
126
+ #
127
+ # Lossy codecs are not recommended, as they result in a lower-quality
128
+ # speech transcription.
129
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
130
+ #
131
+ # Lossy codecs are not recommended, as they result in a lower-quality
132
+ # speech transcription. If you must use a low-bitrate encoder,
133
+ # OGG_OPUS is preferred.
134
+ #
135
+ # @param [String,Symbol] language The language of the supplied audio as
136
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
137
+ # "en-US" for English (United States), "en-GB" for English (United
138
+ # Kingdom), "fr-FR" for French (France). See [Language
139
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
140
+ # of the currently supported language codes. Optional.
124
141
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
125
142
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
126
143
  # For best results, set the sampling rate of the audio source to 16000
127
144
  # Hz. If that's not possible, use the native sample rate of the audio
128
145
  # source (instead of re-sampling). Optional.
129
- # @param [String] language The language of the supplied audio as a
130
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
131
- # code. If not specified, the language defaults to "en-US". See
132
- # [Language
133
- # Support](https://cloud.google.com/speech/docs/languages)
134
- # for a list of the currently supported language codes. Optional.
135
146
  #
136
147
  # @return [Audio] The audio file to be recognized.
137
148
  #
@@ -141,7 +152,9 @@ module Google
141
152
  # speech = Google::Cloud::Speech.new
142
153
  #
143
154
  # audio = speech.audio "path/to/audio.raw",
144
- # encoding: :raw, sample_rate: 16000
155
+ # encoding: :raw,
156
+ # language: "en-US",
157
+ # sample_rate: 16000
145
158
  #
146
159
  # @example With a Google Cloud Storage URI:
147
160
  # require "google/cloud/speech"
@@ -149,7 +162,9 @@ module Google
149
162
  # speech = Google::Cloud::Speech.new
150
163
  #
151
164
  # audio = speech.audio "gs://bucket-name/path/to/audio.raw",
152
- # encoding: :raw, sample_rate: 16000
165
+ # encoding: :raw,
166
+ # language: "en-US",
167
+ # sample_rate: 16000
153
168
  #
154
169
  # @example With a Google Cloud Storage File object:
155
170
  # require "google/cloud/storage"
@@ -163,17 +178,20 @@ module Google
163
178
  #
164
179
  # speech = Google::Cloud::Speech.new
165
180
  #
166
- # audio = speech.audio file, encoding: :raw, sample_rate: 16000
181
+ # audio = speech.audio file,
182
+ # encoding: :raw,
183
+ # language: "en-US",
184
+ # sample_rate: 16000
167
185
  #
168
- def audio source, encoding: nil, sample_rate: nil, language: nil
186
+ def audio source, encoding: nil, language: nil, sample_rate: nil
169
187
  if source.is_a? Audio
170
188
  audio = source.dup
171
189
  else
172
190
  audio = Audio.from_source source, self
173
191
  end
174
192
  audio.encoding = encoding unless encoding.nil?
175
- audio.sample_rate = sample_rate unless sample_rate.nil?
176
193
  audio.language = language unless language.nil?
194
+ audio.sample_rate = sample_rate unless sample_rate.nil?
177
195
  audio
178
196
  end
179
197
 
@@ -216,18 +234,27 @@ module Google
216
234
  # be 8000 Hz.) (AMR)
217
235
  # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
218
236
  # be 16000 Hz.) (AMR_WB)
219
- #
237
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
238
+ #
239
+ # Lossy codecs are not recommended, as they result in a lower-quality
240
+ # speech transcription.
241
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
242
+ #
243
+ # Lossy codecs are not recommended, as they result in a lower-quality
244
+ # speech transcription. If you must use a low-bitrate encoder,
245
+ # OGG_OPUS is preferred.
246
+ #
247
+ # @param [String,Symbol] language The language of the supplied audio as
248
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
249
+ # "en-US" for English (United States), "en-GB" for English (United
250
+ # Kingdom), "fr-FR" for French (France). See [Language
251
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
252
+ # of the currently supported language codes. Optional.
220
253
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
221
254
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
222
255
  # For best results, set the sampling rate of the audio source to 16000
223
256
  # Hz. If that's not possible, use the native sample rate of the audio
224
257
  # source (instead of re-sampling). Optional.
225
- # @param [String] language The language of the supplied audio as a
226
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
227
- # code. If not specified, the language defaults to "en-US". See
228
- # [Language
229
- # Support](https://cloud.google.com/speech/docs/languages)
230
- # for a list of the currently supported language codes. Optional.
231
258
  # @param [String] max_alternatives The Maximum number of recognition
232
259
  # hypotheses to be returned. Default is 1. The service may return
233
260
  # fewer. Valid values are 0-30. Defaults to 1. Optional.
@@ -248,7 +275,9 @@ module Google
248
275
  # speech = Google::Cloud::Speech.new
249
276
  #
250
277
  # results = speech.recognize "path/to/audio.raw",
251
- # encoding: :raw, sample_rate: 16000
278
+ # encoding: :raw,
279
+ # language: "en-US",
280
+ # sample_rate: 16000
252
281
  #
253
282
  # @example With a Google Cloud Storage URI:
254
283
  # require "google/cloud/speech"
@@ -256,7 +285,9 @@ module Google
256
285
  # speech = Google::Cloud::Speech.new
257
286
  #
258
287
  # results = speech.recognize "gs://bucket-name/path/to/audio.raw",
259
- # encoding: :raw, sample_rate: 16000
288
+ # encoding: :raw,
289
+ # language: "en-US",
290
+ # sample_rate: 16000
260
291
  #
261
292
  # @example With a Google Cloud Storage File object:
262
293
  # require "google/cloud/storage"
@@ -270,16 +301,18 @@ module Google
270
301
  #
271
302
  # speech = Google::Cloud::Speech.new
272
303
  #
273
- # results = speech.recognize file, encoding: :raw,
304
+ # results = speech.recognize file,
305
+ # encoding: :raw,
306
+ # language: "en-US",
274
307
  # sample_rate: 16000,
275
308
  # max_alternatives: 10
276
309
  #
277
- def recognize source, encoding: nil, sample_rate: nil, language: nil,
310
+ def recognize source, encoding: nil, language: nil, sample_rate: nil,
278
311
  max_alternatives: nil, profanity_filter: nil, phrases: nil
279
312
  ensure_service!
280
313
 
281
- audio_obj = audio source, encoding: encoding,
282
- sample_rate: sample_rate, language: language
314
+ audio_obj = audio source, encoding: encoding, language: language,
315
+ sample_rate: sample_rate
283
316
 
284
317
  config = audio_config(
285
318
  encoding: audio_obj.encoding, sample_rate: audio_obj.sample_rate,
@@ -294,9 +327,9 @@ module Google
294
327
 
295
328
  ##
296
329
  # Performs asynchronous speech recognition. Requests are processed
297
- # asynchronously, meaning a Job is returned once the audio data has been
298
- # sent, and can be refreshed to retrieve recognition results once the
299
- # audio data has been processed.
330
+ # asynchronously, meaning an Operation is returned once the audio data
331
+ # has been sent, and can be refreshed to retrieve recognition results
332
+ # once the audio data has been processed.
300
333
  #
301
334
  # @see https://cloud.google.com/speech/docs/basics#async-responses
302
335
  # Asynchronous Speech API Responses
@@ -309,22 +342,41 @@ module Google
309
342
  # @param [String, Symbol] encoding Encoding of audio data to be
310
343
  # recognized. Optional.
311
344
  #
312
- # Currently, the only acceptable value is:
345
+ # Acceptable values are:
313
346
  #
314
347
  # * `raw` - Uncompressed 16-bit signed little-endian samples.
315
348
  # (LINEAR16)
316
- #
349
+ # * `flac` - The [Free Lossless Audio
350
+ # Codec](http://flac.sourceforge.net/documentation.html) encoding.
351
+ # Only 16-bit samples are supported. Not all fields in STREAMINFO
352
+ # are supported. (FLAC)
353
+ # * `mulaw` - 8-bit samples that compand 14-bit audio samples using
354
+ # G.711 PCMU/mu-law. (MULAW)
355
+ # * `amr` - Adaptive Multi-Rate Narrowband codec. (`sample_rate` must
356
+ # be 8000 Hz.) (AMR)
357
+ # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
358
+ # be 16000 Hz.) (AMR_WB)
359
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
360
+ #
361
+ # Lossy codecs are not recommended, as they result in a lower-quality
362
+ # speech transcription.
363
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
364
+ #
365
+ # Lossy codecs are not recommended, as they result in a lower-quality
366
+ # speech transcription. If you must use a low-bitrate encoder,
367
+ # OGG_OPUS is preferred.
368
+ #
369
+ # @param [String,Symbol] language The language of the supplied audio as
370
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
371
+ # "en-US" for English (United States), "en-GB" for English (United
372
+ # Kingdom), "fr-FR" for French (France). See [Language
373
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
374
+ # of the currently supported language codes. Optional.
317
375
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
318
376
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
319
377
  # For best results, set the sampling rate of the audio source to 16000
320
378
  # Hz. If that's not possible, use the native sample rate of the audio
321
379
  # source (instead of re-sampling). Optional.
322
- # @param [String] language The language of the supplied audio as a
323
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
324
- # code. If not specified, the language defaults to "en-US". See
325
- # [Language
326
- # Support](https://cloud.google.com/speech/docs/languages)
327
- # for a list of the currently supported language codes. Optional.
328
380
  # @param [String] max_alternatives The Maximum number of recognition
329
381
  # hypotheses to be returned. Default is 1. The service may return
330
382
  # fewer. Valid values are 0-30. Defaults to 1. Optional.
@@ -337,30 +389,34 @@ module Google
337
389
  # recognize them. See [usage
338
390
  # limits](https://cloud.google.com/speech/limits#content). Optional.
339
391
  #
340
- # @return [Job] A resource represents the long-running, asynchronous
341
- # processing of a speech-recognition operation.
392
+ # @return [Operation] A resource that represents the long-running,
393
+ # asynchronous processing of a speech-recognition operation.
342
394
  #
343
395
  # @example
344
396
  # require "google/cloud/speech"
345
397
  #
346
398
  # speech = Google::Cloud::Speech.new
347
399
  #
348
- # job = speech.recognize_job "path/to/audio.raw",
349
- # encoding: :raw, sample_rate: 16000
400
+ # op = speech.process "path/to/audio.raw",
401
+ # encoding: :raw,
402
+ # language: "en-US",
403
+ # sample_rate: 16000
350
404
  #
351
- # job.done? #=> false
352
- # job.reload!
405
+ # op.done? #=> false
406
+ # op.reload!
353
407
  #
354
408
  # @example With a Google Cloud Storage URI:
355
409
  # require "google/cloud/speech"
356
410
  #
357
411
  # speech = Google::Cloud::Speech.new
358
412
  #
359
- # job = speech.recognize_job "gs://bucket-name/path/to/audio.raw",
360
- # encoding: :raw, sample_rate: 16000
413
+ # op = speech.process "gs://bucket-name/path/to/audio.raw",
414
+ # encoding: :raw,
415
+ # language: "en-US",
416
+ # sample_rate: 16000
361
417
  #
362
- # job.done? #=> false
363
- # job.reload!
418
+ # op.done? #=> false
419
+ # op.reload!
364
420
  #
365
421
  # @example With a Google Cloud Storage File object:
366
422
  # require "google/cloud/storage"
@@ -374,20 +430,21 @@ module Google
374
430
  #
375
431
  # speech = Google::Cloud::Speech.new
376
432
  #
377
- # job = speech.recognize_job file, encoding: :raw,
433
+ # op = speech.process file,
434
+ # encoding: :raw,
435
+ # language: "en-US",
378
436
  # sample_rate: 16000,
379
437
  # max_alternatives: 10
380
438
  #
381
- # job.done? #=> false
382
- # job.reload!
439
+ # op.done? #=> false
440
+ # op.reload!
383
441
  #
384
- def recognize_job source, encoding: nil, sample_rate: nil,
385
- language: nil, max_alternatives: nil,
386
- profanity_filter: nil, phrases: nil
442
+ def process source, encoding: nil, sample_rate: nil, language: nil,
443
+ max_alternatives: nil, profanity_filter: nil, phrases: nil
387
444
  ensure_service!
388
445
 
389
- audio_obj = audio source, encoding: encoding,
390
- sample_rate: sample_rate, language: language
446
+ audio_obj = audio source, encoding: encoding, language: language,
447
+ sample_rate: sample_rate
391
448
 
392
449
  config = audio_config(
393
450
  encoding: audio_obj.encoding, sample_rate: audio_obj.sample_rate,
@@ -395,8 +452,10 @@ module Google
395
452
  profanity_filter: profanity_filter, phrases: phrases)
396
453
 
397
454
  grpc = service.recognize_async audio_obj.to_grpc, config
398
- Job.from_grpc grpc
455
+ Operation.from_grpc grpc
399
456
  end
457
+ alias_method :long_running_recognize, :process
458
+ alias_method :recognize_job, :process
400
459
 
401
460
  ##
402
461
  # Creates a Stream object to perform bidirectional streaming
@@ -422,18 +481,27 @@ module Google
422
481
  # be 8000 Hz.) (AMR)
423
482
  # * `amr_wb` - Adaptive Multi-Rate Wideband codec. (`sample_rate` must
424
483
  # be 16000 Hz.) (AMR_WB)
425
- #
484
+ # * `ogg_opus` - Ogg Mapping for Opus. (OGG_OPUS)
485
+ #
486
+ # Lossy codecs are not recommended, as they result in a lower-quality
487
+ # speech transcription.
488
+ # * `speex` - Speex with header byte. (SPEEX_WITH_HEADER_BYTE)
489
+ #
490
+ # Lossy codecs are not recommended, as they result in a lower-quality
491
+ # speech transcription. If you must use a low-bitrate encoder,
492
+ # OGG_OPUS is preferred.
493
+ #
494
+ # @param [String,Symbol] language The language of the supplied audio as
495
+ # a [BCP-47](https://tools.ietf.org/html/bcp47) language code. e.g.
496
+ # "en-US" for English (United States), "en-GB" for English (United
497
+ # Kingdom), "fr-FR" for French (France). See [Language
498
+ # Support](https://cloud.google.com/speech/docs/languages) for a list
499
+ # of the currently supported language codes. Optional.
426
500
  # @param [Integer] sample_rate Sample rate in Hertz of the audio data
427
501
  # to be recognized. Valid values are: 8000-48000. 16000 is optimal.
428
502
  # For best results, set the sampling rate of the audio source to 16000
429
503
  # Hz. If that's not possible, use the native sample rate of the audio
430
504
  # source (instead of re-sampling). Optional.
431
- # @param [String] language The language of the supplied audio as a
432
- # [BCP-47](https://tools.ietf.org/html/bcp47) language
433
- # code. If not specified, the language defaults to "en-US". See
434
- # [Language
435
- # Support](https://cloud.google.com/speech/docs/languages)
436
- # for a list of the currently supported language codes. Optional.
437
505
  # @param [String] max_alternatives The Maximum number of recognition
438
506
  # hypotheses to be returned. Default is 1. The service may return
439
507
  # fewer. Valid values are 0-30. Defaults to 1. Optional.
@@ -462,14 +530,9 @@ module Google
462
530
  #
463
531
  # speech = Google::Cloud::Speech.new
464
532
  #
465
- # stream = speech.stream encoding: :raw, sample_rate: 16000
466
- #
467
- # # register callback for when a result is returned
468
- # stream.on_result do |results|
469
- # result = results.first
470
- # puts result.transcript # "how old is the Brooklyn Bridge"
471
- # puts result.confidence # 0.9826789498329163
472
- # end
533
+ # stream = speech.stream encoding: :raw,
534
+ # language: "en-US",
535
+ # sample_rate: 16000
473
536
  #
474
537
  # # Stream 5 seconds of audio from the microphone
475
538
  # # Actual implementation of microphone input varies by platform
@@ -478,18 +541,24 @@ module Google
478
541
  # end
479
542
  #
480
543
  # stream.stop
544
+ # stream.wait_until_complete!
545
+ #
546
+ # results = stream.results
547
+ # result = results.first
548
+ # result.transcript #=> "how old is the Brooklyn Bridge"
549
+ # result.confidence #=> 0.9826789498329163
481
550
  #
482
- def stream encoding: nil, sample_rate: nil, language: nil,
551
+ def stream encoding: nil, language: nil, sample_rate: nil,
483
552
  max_alternatives: nil, profanity_filter: nil, phrases: nil,
484
553
  utterance: nil, interim: nil
485
554
  ensure_service!
486
555
 
487
- grpc_req = V1beta1::StreamingRecognizeRequest.new(
488
- streaming_config: V1beta1::StreamingRecognitionConfig.new(
556
+ grpc_req = V1::StreamingRecognizeRequest.new(
557
+ streaming_config: V1::StreamingRecognitionConfig.new(
489
558
  {
490
559
  config: audio_config(encoding: convert_encoding(encoding),
491
- sample_rate: sample_rate,
492
560
  language: language,
561
+ sample_rate: sample_rate,
493
562
  max_alternatives: max_alternatives,
494
563
  profanity_filter: profanity_filter,
495
564
  phrases: phrases),
@@ -501,28 +570,62 @@ module Google
501
570
 
502
571
  Stream.new service, grpc_req
503
572
  end
573
+ alias_method :stream_recognize, :stream
574
+
575
+ ##
576
+ # Retrieves an existing long-running speech recognition operation by
577
+ # its unique identifier. The returned Operation can be refreshed to
578
+ # retrieve recognition results once the audio data has been
579
+ # processed.
580
+ #
581
+ # @see https://cloud.google.com/speech/reference/rpc/google.longrunning#google.longrunning.Operations
582
+ # Long-running Operation
583
+ #
584
+ # @param [String] id The unique identifier for the long running
585
+ # operation. Required.
586
+ #
587
+ # @return [Operation] A resource that represents the long-running,
588
+ # asynchronous processing of a speech-recognition operation.
589
+ #
590
+ # @example
591
+ # require "google/cloud/speech"
592
+ #
593
+ # speech = Google::Cloud::Speech.new
594
+ #
595
+ # op = speech.operation "1234567890"
596
+ #
597
+ # op.done? #=> false
598
+ # op.reload!
599
+ #
600
+ def operation id
601
+ ensure_service!
602
+
603
+ grpc = service.get_op id
604
+ Operation.from_grpc grpc
605
+ end
504
606
 
505
607
  protected
506
608
 
507
- def audio_config encoding: nil, sample_rate: nil, language: nil,
609
+ def audio_config encoding: nil, language: nil, sample_rate: nil,
508
610
  max_alternatives: nil, profanity_filter: nil,
509
611
  phrases: nil
510
- context = nil
511
- context = V1beta1::SpeechContext.new(phrases: phrases) if phrases
612
+ contexts = nil
613
+ contexts = [V1::SpeechContext.new(phrases: phrases)] if phrases
512
614
  language = String(language) unless language.nil?
513
- V1beta1::RecognitionConfig.new({
615
+ V1::RecognitionConfig.new({
514
616
  encoding: convert_encoding(encoding),
515
- sample_rate: sample_rate,
516
617
  language_code: language,
618
+ sample_rate_hertz: sample_rate,
517
619
  max_alternatives: max_alternatives,
518
620
  profanity_filter: profanity_filter,
519
- speech_context: context
621
+ speech_contexts: contexts
520
622
  }.delete_if { |_, v| v.nil? })
521
623
  end
522
624
 
523
625
  def convert_encoding encoding
524
626
  mapping = { raw: :LINEAR16, linear: :LINEAR16, linear16: :LINEAR16,
525
- flac: :FLAC, mulaw: :MULAW, amr: :AMR, amr_wb: :AMR_WB }
627
+ flac: :FLAC, mulaw: :MULAW, amr: :AMR, amr_wb: :AMR_WB,
628
+ ogg_opus: :OGG_OPUS, speex: :SPEEX_WITH_HEADER_BYTE }
526
629
  mapping[encoding] || encoding
527
630
  end
528
631