Anchor-annotator 0.0.11__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
anchor/models.py CHANGED
@@ -31,7 +31,7 @@ from montreal_forced_aligner.utils import mfa_open
 from PySide6 import QtCore
 from sqlalchemy.orm import joinedload
 
-from anchor import undo
+from anchor import undo, workers
 from anchor.settings import AnchorSettings
 
 
@@ -148,23 +148,766 @@ class TableModel(QtCore.QAbstractTableModel):
         return len(self._header_data)
 
 
+class FileUtterancesModel(QtCore.QAbstractListModel):
+    addCommand = QtCore.Signal(object)
+    selectionRequested = QtCore.Signal(object)
+
+    waveformReady = QtCore.Signal()
+    utterancesReady = QtCore.Signal()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.utterances = []
+        self.file = None
+        self.y = None
+        self.speakers = []
+        self._indices = []
+        self._speaker_indices = []
+        self.reversed_indices = {}
+        self.speaker_channel_mapping = {}
+        self.corpus_model: CorpusModel = None
+        self.waveform_worker = workers.WaveformWorker()
+        self.speaker_tier_worker = workers.SpeakerTierWorker()
+        self.speaker_tier_worker.signals.result.connect(self.finalize_loading_utterances)
+        self.waveform_worker.signals.result.connect(self.finalize_loading_wave_form)
+
+    def get_utterance(self, utterance_id: int) -> Utterance:
+        try:
+            return self.utterances[self.reversed_indices[utterance_id]]
+        except KeyError:
+            return None
+
+    def set_corpus_model(self, corpus_model: CorpusModel):
+        self.corpus_model = corpus_model
+
+    def clean_up_for_close(self):
+        self.waveform_worker.stop()
+        self.speaker_tier_worker.stop()
+
+    def set_file(self, file_id):
+        self.file = (
+            self.corpus_model.session.query(File).options(joinedload(File.sound_file)).get(file_id)
+        )
+        self.y = None
+        self.get_utterances()
+        self.waveform_worker.stop()
+        self.waveform_worker.set_params(self.file.sound_file.sound_file_path)
+        self.waveform_worker.start()
+
+    def finalize_loading_utterances(self, results):
+        utterances, file_id = results
+        if file_id != self.file.id:
+            return
+        self.utterances = utterances
+        for i, u in enumerate(utterances):
+            if u.speaker_id not in self.speakers:
+                self.speakers.append(u.speaker_id)
+            self._speaker_indices.append(u.speaker_id)
+            self.reversed_indices[u.id] = i
+            self._indices.append(u.id)
+            if self.file.num_channels > 1 and u.speaker_id not in self.speaker_channel_mapping:
+                self.speaker_channel_mapping[u.speaker_id] = u.channel
+        self.utterancesReady.emit()
+
+    def finalize_loading_wave_form(self, results):
+        y, file_path = results
+        if self.file is None or file_path != self.file.sound_file.sound_file_path:
+            return
+        self.y = y
+        self.waveformReady.emit()
+
+    def get_utterances(self):
+        parent_index = self.index(0, 0)
+        self.beginRemoveRows(parent_index, 0, len(self.utterances))
+        self.utterances = []
+        self.speakers = []
+        self._indices = []
+        self._speaker_indices = []
+        self.speaker_channel_mapping = {}
+        self.reversed_indices = {}
+        self.endRemoveRows()
+        if self.file is None:
+            return
+        self.speaker_tier_worker.stop()
+        self.speaker_tier_worker.query_alignment = (
+            self.corpus_model.has_alignments
+            or self.corpus_model.has_reference_alignments
+            or self.corpus_model.has_transcribed_alignments
+        )
+        self.speaker_tier_worker.session = self.corpus_model.session
+        self.speaker_tier_worker.set_params(self.file.id)
+        self.speaker_tier_worker.start()
+
+    def create_utterance(self, speaker_id: Optional[int], begin: float, end: float):
+        if not self.corpus_model.editable:
+            return
+        channel = 0
+        if speaker_id is None:
+            speaker_id = self.corpus_model.corpus.add_speaker(
+                "speech", session=self.corpus_model.session
+            ).id
+        if self.file.num_channels > 1:
+            if speaker_id not in self.speaker_channel_mapping:
+                self.speaker_channel_mapping[speaker_id] = 0
+            channel = self.speaker_channel_mapping[speaker_id]
+        begin = round(begin, 4)
+        end = round(end, 4)
+        text = ""
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        new_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            file=self.file,
+            begin=begin,
+            end=end,
+            channel=channel,
+            text=text,
+            normalized_text=text,
+            oovs=text,
+        )
+        print(new_utt.id, new_utt.speaker_id, new_utt.file_id, new_utt.begin, new_utt.end)
+        self.addCommand.emit(undo.CreateUtteranceCommand(new_utt, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+
+    def add_table_utterances(self, utterances: typing.List[Utterance]):
+        for utterance in utterances:
+            if len(self.utterances) > 0:
+                for i, u in enumerate(self.utterances):
+                    if u.begin < utterance.begin:
+                        continue
+                    break
+                else:
+                    i = len(self.utterances) - 1
+            else:
+                i = 0
+            parent_index = self.index(i, 0)
+            self.beginInsertRows(parent_index, i, i + 1)
+            self.utterances.insert(i, utterance)
+            self._indices.insert(i, utterance.id)
+            self._speaker_indices.insert(i, utterance.speaker_id)
+            self.endInsertRows()
+        self.reversed_indices = {u: j for j, u in enumerate(self._indices)}
+        self.selectionRequested.emit(utterances)
+
+    def delete_table_utterances(self, utterances: typing.List[Utterance]):
+        for utterance in utterances:
+            try:
+                index = self.reversed_indices.pop(utterance.id)
+            except KeyError:
+                continue
+            parent_index = self.index(index, 0)
+            self.beginRemoveRows(parent_index, index, index + 1)
+            _ = self.utterances.pop(index)
+            _ = self._indices.pop(index)
+            _ = self._speaker_indices.pop(index)
+            self.reversed_indices = {u: j for j, u in enumerate(self._indices)}
+            self.endRemoveRows()
+        self.selectionRequested.emit(None)
+
+    def change_speaker_table_utterances(self, utterances: typing.List[Utterance]):
+        for utterance in utterances:
+            try:
+                index = self.reversed_indices[utterance.id]
+            except KeyError:
+                continue
+            if utterance.speaker_id not in self.speakers:
+                self.speakers.append(utterance.speaker_id)
+                self.speaker_channel_mapping[utterance.speaker_id] = utterance.channel
+            self._speaker_indices[index] = utterance.speaker_id
+
+    def merge_table_utterances(
+        self, merged_utterance: Utterance, split_utterances: typing.List[Utterance]
+    ):
+        self.delete_table_utterances(split_utterances)
+        self.add_table_utterances([merged_utterance])
+
+    def split_table_utterances(
+        self, merged_utterance: Utterance, split_utterances: typing.List[Utterance]
+    ):
+        self.delete_table_utterances([merged_utterance])
+        self.add_table_utterances(split_utterances)
+
+    def update_utterance_text(self, utterance: Utterance, text):
+        if not self.corpus_model.editable:
+            return
+        if text != utterance.text:
+            self.addCommand.emit(undo.UpdateUtteranceTextCommand(utterance, text, self))
+            self.corpus_model.set_file_modified(self.file.id)
+
+    def refresh_utterances(self):
+        for utterance in self.utterances:
+            self.corpus_model.session.refresh(utterance)
+
+    def update_utterance_speaker(self, utterance: Utterance, speaker_id: int):
+        if not self.corpus_model.editable:
+            return
+        old_speaker_id = utterance.speaker_id
+        if old_speaker_id == speaker_id:
+            return
+        self.addCommand.emit(undo.UpdateUtteranceSpeakerCommand(utterance, speaker_id, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+        self.corpus_model.set_speaker_modified(old_speaker_id)
+
+    def update_utterance_times(
+        self, utterance: Utterance, begin: Optional[float] = None, end: Optional[float] = None
+    ):
+        if not self.corpus_model.editable:
+            return
+        if utterance.begin == begin and utterance.end == end:
+            return
+        self.addCommand.emit(undo.UpdateUtteranceTimesCommand(utterance, begin, end, self))
+        self.corpus_model.set_file_modified(self.file.id)
+
+    def split_vad_utterance(
+        self, original_utterance_id, replacement_utterance_data: typing.List[KalpyUtterance]
+    ):
+        if not replacement_utterance_data:
+            return
+        utt = self.utterances[self.reversed_indices[original_utterance_id]]
+        replacement_utterances = []
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        speaker_id = utt.speaker_id
+        for new_utt in replacement_utterance_data:
+            replacement_utterances.append(
+                Utterance(
+                    id=next_pk,
+                    begin=new_utt.segment.begin,
+                    end=new_utt.segment.end,
+                    speaker_id=speaker_id,
+                    file_id=self.file.id,
+                    text=new_utt.transcript,
+                    normalized_text=new_utt.transcript,
+                    features="",
+                    in_subset=False,
+                    ignored=False,
+                    channel=new_utt.segment.channel,
+                )
+            )
+            next_pk += 1
+        self.addCommand.emit(
+            undo.SplitUtteranceCommand(utt, replacement_utterances, self, update_table=False)
+        )
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+
+    def split_utterances(self, utterance: Utterance):
+        if not self.corpus_model.editable:
+            return
+        beg = utterance.begin
+        end = utterance.end
+        duration = end - beg
+        first_text = []
+        second_text = []
+        speaker_id = utterance.speaker_id
+        if (
+            utterance.text
+            and utterance.normalized_text
+            and " " not in utterance.text
+            and " " in utterance.normalized_text
+        ):
+            t = utterance.normalized_text.split()
+            mid_ind = int(len(t) / 2)
+            first_text = t[:mid_ind]
+            second_text = t[mid_ind:]
+        elif utterance.text:
+            t = utterance.text.split()
+            mid_ind = int(len(t) / 2)
+            first_text = t[:mid_ind]
+            second_text = t[mid_ind:]
+        split_time = beg + (duration / 2)
+        oovs = set()
+        for w in first_text:
+            if not self.corpus_model.dictionary_model.check_word(w, speaker_id):
+                oovs.add(w)
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        first_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            begin=beg,
+            end=split_time,
+            channel=utterance.channel,
+            text=" ".join(first_text),
+            normalized_text=" ".join(first_text),
+            oovs=" ".join(oovs),
+        )
+        next_pk += 1
+        oovs = set()
+        for w in second_text:
+            if not self.corpus_model.dictionary_model.check_word(w, utterance.speaker_id):
+                oovs.add(w)
+        second_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            begin=split_time,
+            end=end,
+            channel=utterance.channel,
+            text=" ".join(second_text),
+            normalized_text=" ".join(second_text),
+            oovs=" ".join(oovs),
+        )
+        self.addCommand.emit(undo.SplitUtteranceCommand(utterance, [first_utt, second_utt], self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+        self.selectionRequested.emit([first_utt, second_utt])
+
+    def merge_utterances(self, utterances: list[Utterance]):
+        if not self.corpus_model.editable:
+            return
+        if not utterances:
+            return
+        min_begin = 1000000000
+        max_end = 0
+        text = ""
+        normalized_text = ""
+        speaker_id = None
+        channel = None
+        for old_utt in sorted(utterances, key=lambda x: x.begin):
+            if speaker_id is None:
+                speaker_id = old_utt.speaker_id
+            if channel is None:
+                channel = old_utt.channel
+            if old_utt.begin < min_begin:
+                min_begin = old_utt.begin
+            if old_utt.end > max_end:
+                max_end = old_utt.end
+            utt_text = old_utt.text
+            if utt_text == "speech" and text.strip() == "speech":
+                continue
+            text += utt_text + " "
+            normalized_text += old_utt.normalized_text + " "
+        text = text[:-1]
+        normalized_text = normalized_text[:-1]
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        oovs = set()
+        for w in text.split():
+            if not self.corpus_model.dictionary_model.check_word(w, speaker_id):
+                oovs.add(w)
+        new_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            begin=min_begin,
+            end=max_end,
+            channel=channel,
+            text=text,
+            normalized_text=normalized_text,
+            oovs=" ".join(oovs),
+        )
+        self.addCommand.emit(undo.MergeUtteranceCommand(utterances, new_utt, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+        self.selectionRequested.emit([new_utt])
+
+    def delete_utterances(self, utterances: typing.List[Utterance]):
+        if not self.corpus_model.editable:
+            return
+        if not utterances:
+            return
+        speaker_ids = set(x.speaker_id for x in utterances)
+        self.addCommand.emit(undo.DeleteUtteranceCommand(utterances, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        for speaker_id in speaker_ids:
+            self.corpus_model.set_speaker_modified(speaker_id)
+
+    def rowCount(self, parent=None):
+        return len(self.utterances)
+
+    def data(self, index, role=QtCore.Qt.ItemDataRole.DisplayRole):
+        if role == QtCore.Qt.ItemDataRole.DisplayRole:
+            return self.utterances[index.row()]
+
+
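Note: FileUtterancesModel is new in 0.2.0 and takes over the per-file state (utterances, speakers, waveform samples) that CorpusModel and CorpusSelectionModel carried in 0.0.11, loading it off the UI thread through the new anchor.workers classes. A minimal sketch of how a caller might wire it up — the function and instance names here are illustrative, not part of the package:

```python
from anchor import models


def open_file(corpus_model, file_id):
    file_model = models.FileUtterancesModel()
    file_model.set_corpus_model(corpus_model)

    # set_file() is asynchronous: it starts the SpeakerTierWorker and the
    # WaveformWorker, and these signals fire once the finalize_* slots run.
    file_model.utterancesReady.connect(lambda: print(file_model.rowCount(), "utterances"))
    file_model.waveformReady.connect(lambda: print("samples loaded:", file_model.y.shape))

    file_model.set_file(file_id)
    return file_model
```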
+class FileSelectionModel(QtCore.QItemSelectionModel):
+    fileAboutToChange = QtCore.Signal()
+    fileChanged = QtCore.Signal()
+    channelChanged = QtCore.Signal()
+    resetView = QtCore.Signal()
+    viewChanged = QtCore.Signal(object, object)
+    selectionAudioChanged = QtCore.Signal()
+    currentTimeChanged = QtCore.Signal(object)
+    currentUtteranceChanged = QtCore.Signal()
+    speakerRequested = QtCore.Signal(object)
+
+    spectrogramReady = QtCore.Signal()
+    waveformReady = QtCore.Signal()
+    pitchTrackReady = QtCore.Signal()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.settings = AnchorSettings()
+        self.min_time = 0
+        self.max_time = 10
+        self.selected_min_time = None
+        self.selected_max_time = None
+        self.x = None
+        self.y = None
+        self.top_point = 2
+        self.bottom_point = 0
+        self.separator_point = 1
+        self.selected_channel = 0
+        self.spectrogram = None
+        self.min_db = None
+        self.max_db = None
+        self.pitch_track_x = None
+        self.pitch_track_y = None
+        self.waveform_x = None
+        self.waveform_y = None
+        self.requested_utterance_id = None
+        self.auto_waveform_worker = workers.AutoWaveformWorker()
+        self.spectrogram_worker = workers.SpectrogramWorker()
+        self.pitch_track_worker = workers.PitchWorker()
+        self.auto_waveform_worker.signals.result.connect(self.finalize_loading_auto_wave_form)
+        self.spectrogram_worker.signals.result.connect(self.finalize_loading_spectrogram)
+        self.pitch_track_worker.signals.result.connect(self.finalize_loading_pitch_track)
+        self.model().waveformReady.connect(self.load_audio_selection)
+        self.model().utterancesReady.connect(self.finalize_set_new_file)
+        self.viewChanged.connect(self.load_audio_selection)
+        self.model().selectionRequested.connect(self.update_selected_utterances)
+
+    def selected_utterances(self):
+        utts = []
+        m = self.model()
+        for index in self.selectedRows(0):
+            utt = m.utterances[index.row()]
+            utts.append(utt)
+        return utts
+
+    def load_audio_selection(self):
+        if self.model().y is None:
+            return
+        begin_samp = int(self.min_time * self.model().file.sample_rate)
+        end_samp = int(self.max_time * self.model().file.sample_rate)
+        if len(self.model().y.shape) > 1:
+            y = self.model().y[begin_samp:end_samp, self.selected_channel]
+        else:
+            y = self.model().y[begin_samp:end_samp]
+        self.spectrogram_worker.stop()
+        self.spectrogram_worker.set_params(
+            y,
+            self.model().file.sound_file.sample_rate,
+            self.min_time,
+            self.max_time,
+            self.selected_channel,
+        )
+        self.spectrogram_worker.start()
+        if self.max_time - self.min_time <= 10:
+            self.pitch_track_worker.stop()
+            self.pitch_track_worker.set_params(
+                y,
+                self.model().file.sound_file.sample_rate,
+                self.min_time,
+                self.max_time,
+                self.selected_channel,
+                self.bottom_point,
+                self.separator_point,
+            )
+            self.pitch_track_worker.start()
+        self.auto_waveform_worker.stop()
+        self.auto_waveform_worker.set_params(
+            y,
+            self.separator_point,
+            self.top_point,
+            self.min_time,
+            self.max_time,
+            self.selected_channel,
+        )
+        self.auto_waveform_worker.start()
+
+    def clean_up_for_close(self):
+        self.spectrogram_worker.stop()
+        self.pitch_track_worker.stop()
+        self.auto_waveform_worker.stop()
+
+    @property
+    def plot_min(self):
+        if self.settings.right_to_left:
+            return -self.max_time
+        return self.min_time
+
+    @property
+    def plot_max(self):
+        if self.settings.right_to_left:
+            return -self.min_time
+        return self.max_time
+
+    def finalize_loading_spectrogram(self, results):
+        stft, channel, begin, end, min_db, max_db = results
+        if self.settings.right_to_left:
+            stft = np.flip(stft, 1)
+            begin, end = -end, -begin
+        if begin != self.plot_min or end != self.plot_max:
+            return
+        self.spectrogram = stft
+        self.min_db = self.min_db
+        self.max_db = self.max_db
+        self.spectrogramReady.emit()
+
+    def finalize_loading_pitch_track(self, results):
+        pitch_track, voicing_track, channel, begin, end, min_f0, max_f0 = results
+        if self.settings.right_to_left:
+            pitch_track = np.flip(pitch_track, 0)
+            begin, end = -end, -begin
+        if begin != self.plot_min or end != self.plot_max:
+            return
+        self.pitch_track_y = pitch_track
+        if pitch_track is None:
+            return
+        x = np.linspace(
+            start=self.plot_min,
+            stop=self.plot_max,
+            num=pitch_track.shape[0],
+        )
+        self.pitch_track_x = x
+        self.pitchTrackReady.emit()
+
+    def finalize_loading_auto_wave_form(self, results):
+        y, begin, end, channel = results
+        if self.settings.right_to_left:
+            y = np.flip(y, 0)
+            begin, end = -end, -begin
+        if begin != self.plot_min or end != self.plot_max:
+            return
+        x = np.linspace(start=self.plot_min, stop=self.plot_max, num=y.shape[0])
+        self.waveform_x = x
+        self.waveform_y = y
+        self.waveformReady.emit()
+
+    def select_audio(self, begin, end):
+        if end is not None and end - begin < 0.025:
+            end = None
+        self.selected_min_time = begin
+        self.selected_max_time = end
+        self.selectionAudioChanged.emit()
+
+    def request_start_time(self, start_time):
+        if start_time >= self.max_time:
+            return
+        if start_time < self.min_time:
+            return
+        self.selected_min_time = start_time
+        self.selected_max_time = None
+        self.selectionAudioChanged.emit()
+
+    def set_current_channel(self, channel):
+        if channel == self.selected_channel:
+            return
+        self.selected_channel = channel
+        self.load_audio_selection()
+
+    def get_selected_wave_form(self):
+        if self.y is None:
+            return None, None
+        if len(self.y.shape) > 1 and self.y.shape[0] == 2:
+            return self.x, self.y[self.selected_channel, :]
+        return self.x, self.y
+
+    def zoom(self, factor, mid_point=None):
+        if factor == 0 or self.min_time is None:
+            return
+        cur_duration = self.max_time - self.min_time
+        if mid_point is None:
+            mid_point = self.min_time + (cur_duration / 2)
+        new_duration = cur_duration / factor
+        new_begin = mid_point - (mid_point - self.min_time) / factor
+        new_begin = max(new_begin, 0)
+        new_end = min(new_begin + new_duration, self.model().file.duration)
+        if new_end - new_begin <= 0.025:
+            return
+        self.set_view_times(new_begin, new_end)
+
+    def pan(self, factor):
+        if self.min_time is None:
+            return
+        if factor < 1:
+            factor = 1 - factor
+            right = True
+        else:
+            right = False
+            factor = factor - 1
+        if right and self.max_time == self.model().file.duration:
+            return
+        if not right and self.min_time == 0:
+            return
+        cur_duration = self.max_time - self.min_time
+        shift = factor * cur_duration
+        if right:
+            new_begin = self.min_time + shift
+            new_end = self.max_time + shift
+        else:
+            new_begin = self.min_time - shift
+            new_end = self.max_time - shift
+        if new_begin < 0:
+            new_end = new_end + abs(new_begin)
+            new_begin = 0
+        if new_end > self.model().file.duration:
+            new_begin -= self.model().file.duration - new_end
+            new_end = self.model().file.duration
+        self.set_view_times(new_begin, new_end)
+
+    def zoom_in(self):
+        if self.model().file is None:
+            return
+        self.zoom(1.5)
+
+    def zoom_out(self):
+        if self.model().file is None:
+            return
+        self.zoom(0.5)
+
+    def zoom_to_selection(self):
+        if self.selected_min_time is not None and self.selected_max_time is not None:
+            self.set_view_times(self.selected_min_time, self.selected_max_time)
+
+    def update_from_slider(self, value):
+        if not self.max_time:
+            return
+        cur_window = self.max_time - self.min_time
+        self.set_view_times(value, value + cur_window)
+
+    def update_selection_audio(self, begin, end):
+        if begin < self.min_time:
+            begin = self.min_time
+        if end > self.max_time:
+            end = self.max_time
+        self.selected_min_time = begin
+        self.selected_max_time = end
+        self.selectionAudioChanged.emit()
+
+    def visible_utterances(self) -> typing.List[Utterance]:
+        file_utts = []
+        if not self.model().file:
+            return file_utts
+        if self.model().rowCount() > 1:
+            for u in self.model().utterances:
+                if u.begin >= self.max_time:
+                    break
+                if u.end <= self.min_time:
+                    continue
+                file_utts.append(u)
+        else:
+            file_utts.extend(self.model().utterances)
+        return file_utts
+
+    def model(self) -> FileUtterancesModel:
+        return super().model()
+
+    def set_view_times(self, begin, end):
+        begin = max(begin, 0)
+        end = min(end, self.model().file.duration)
+        if (begin, end) == (self.min_time, self.max_time):
+            return
+        self.min_time = begin
+        self.max_time = end
+        if (
+            self.selected_max_time is not None
+            and not self.min_time <= self.selected_min_time <= self.max_time
+        ):
+            self.selected_min_time = self.min_time
+        if (
+            self.selected_max_time is not None
+            and not self.min_time <= self.selected_max_time <= self.max_time
+        ):
+            self.selected_max_time = None
+        self.viewChanged.emit(self.min_time, self.max_time)
+
+    def set_current_file(self, info, force_update=False):
+        file_id, begin, end, utterance_id, speaker_id = info
+        try:
+            new_file = self.model().file is None or self.model().file.id != file_id
+        except sqlalchemy.orm.exc.DetachedInstanceError:
+            new_file = True
+        self.requested_utterance_id = utterance_id
+        if new_file:
+            self.fileAboutToChange.emit()
+            self.model().set_file(file_id)
+            self.speakerRequested.emit(speaker_id)
+        else:
+            self.finalize_set_new_file()
+            self.speakerRequested.emit(speaker_id)
+        self.set_view_times(begin, end)
+
+    def finalize_set_new_file(self):
+        if self.requested_utterance_id is None:
+            return
+        utterance = self.model().get_utterance(self.requested_utterance_id)
+        if utterance is None:
+            return
+        self.update_select(self.requested_utterance_id, reset=True)
+        self.selected_channel = 0
+        if utterance is not None and utterance.channel is not None:
+            self.selected_channel = utterance.channel
+        self.fileChanged.emit()
+
+    def checkSelected(self, utterance_id: int):
+        m = self.model()
+        for index in self.selectedRows(0):
+            if utterance_id == m._indices[index.row()]:
+                return True
+        return False
+
+    def update_selected_utterances(self, utterances):
+        super().clearSelection()
+        super().clearCurrentIndex()
+        if not utterances:
+            return
+        flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+        flags |= QtCore.QItemSelectionModel.SelectionFlag.Select
+        for u in utterances:
+            if u.id not in self.model().reversed_indices:
+                continue
+            row = self.model().reversed_indices[u.id]
+
+            index = self.model().index(row, 0)
+            if not index.isValid():
+                return
+            self.select(index, flags)
+        self.currentUtteranceChanged.emit()
+
+    def update_select(self, utterance_id: int, deselect=False, reset=False):
+        if reset and [x.id for x in self.selected_utterances()] == [utterance_id]:
+            return
+        flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+        if reset:
+            flags |= QtCore.QItemSelectionModel.SelectionFlag.ClearAndSelect
+        elif deselect:
+            flags |= QtCore.QItemSelectionModel.SelectionFlag.Deselect
+        else:
+            flags |= QtCore.QItemSelectionModel.SelectionFlag.Select
+        if utterance_id not in self.model().reversed_indices:
+            return
+        row = self.model().reversed_indices[utterance_id]
+
+        index = self.model().index(row, 0)
+        if not index.isValid():
+            return
+        self.select(index, flags)
+        if not deselect:
+            self.select_audio(self.model().utterances[row].begin, self.model().utterances[row].end)
+        self.currentUtteranceChanged.emit()
+
+
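Note: FileSelectionModel pairs one-to-one with a FileUtterancesModel and owns the visible time window; each viewChanged re-runs the spectrogram, pitch, and waveform workers on just the visible slice. Its zoom() keeps the chosen midpoint fixed while scaling the window. A standalone restatement of that arithmetic, assuming nothing beyond plain Python:

```python
def zoomed_window(min_time, max_time, factor, duration, mid_point=None):
    # Mirrors FileSelectionModel.zoom(): factor > 1 zooms in, factor < 1
    # zooms out; the result is clamped to [0, duration].
    cur_duration = max_time - min_time
    if mid_point is None:
        mid_point = min_time + cur_duration / 2
    new_begin = max(mid_point - (mid_point - min_time) / factor, 0)
    new_end = min(new_begin + cur_duration / factor, duration)
    return new_begin, new_end


# Zooming in 2x on a 0-10 s window of a 60 s file halves the window
# around the 5 s midpoint:
assert zoomed_window(0.0, 10.0, 2.0, 60.0) == (2.5, 7.5)
```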
 class CorpusSelectionModel(QtCore.QItemSelectionModel):
     fileChanged = QtCore.Signal()
     channelChanged = QtCore.Signal()
     resetView = QtCore.Signal()
     fileAboutToChange = QtCore.Signal()
-    viewChanged = QtCore.Signal(object, object)
+    fileViewRequested = QtCore.Signal(object)
     selectionAudioChanged = QtCore.Signal()
     currentTimeChanged = QtCore.Signal(object)
     currentUtteranceChanged = QtCore.Signal()
 
     def __init__(self, *args, **kwargs):
-        super(CorpusSelectionModel, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
+        self.settings = AnchorSettings()
         self.min_time = 0
         self.max_time = 10
         self.selected_min_time = None
         self.selected_max_time = None
-        self.current_file: Optional[File] = None
         self.x = None
         self.y = None
         self.current_utterance_id = None
@@ -175,10 +918,8 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         # self.selectionChanged.connect(self.update_selection_audio)
         # self.selectionChanged.connect(self.update_selection_audio)
         # self.model().changeCommandFired.connect(self.expire_current)
-        self.selectionChanged.connect(self._update_selection)
         self.model().layoutChanged.connect(self.check_selection)
         self.model().unlockCorpus.connect(self.fileChanged.emit)
-        self.model().selectionRequested.connect(self.update_select_rows)
 
     def set_current_utterance(self, utterance_id):
         self.current_utterance_id = utterance_id
@@ -190,13 +931,8 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         elif self.model().rowCount() == 0:
             self.clearSelection()
 
-    def set_current_channel(self, channel):
-        self.selected_channel = channel
-        self.channelChanged.emit()
-
     def clearSelection(self) -> None:
         self.fileAboutToChange.emit()
-        self.current_file = None
         self.current_utterance_id = None
         self.min_time = None
         self.max_time = None
@@ -206,22 +942,6 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         super(CorpusSelectionModel, self).clearSelection()
         self.fileChanged.emit()
 
-    def update_selected_wavform(self, *args):
-        if self.min_time is None or self.current_file is None:
-            self.x = None
-            self.y = None
-        else:
-            self.x, self.y = self.current_file.sound_file.normalized_waveform(
-                self.min_time, self.max_time
-            )
-
-    def get_selected_wave_form(self):
-        if self.y is None:
-            return None, None
-        if len(self.y.shape) > 1 and self.y.shape[0] == 2:
-            return self.x, self.y[self.selected_channel, :]
-        return self.x, self.y
-
     def update_select_rows(self, rows: list[int]):
         super(CorpusSelectionModel, self).clearCurrentIndex()
         super(CorpusSelectionModel, self).clearSelection()
@@ -237,8 +957,29 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
                 | QtCore.QItemSelectionModel.SelectionFlag.Rows,
             )
 
+    def update_selected_utterances(self, utterances):
+        if not utterances:
+            return
+        first = True
+        for u in utterances:
+            if u.id not in self.model().reversed_indices:
+                continue
+            row = self.model().reversed_indices[u.id]
+
+            index = self.model().index(row, 0)
+            if not index.isValid():
+                return
+            if not first:
+                flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+                flags |= QtCore.QItemSelectionModel.SelectionFlag.Select
+            else:
+                flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+                flags |= QtCore.QItemSelectionModel.SelectionFlag.ClearAndSelect
+            first = False
+            self.select(index, flags)
+
     def update_select(self, utterance_id: int, deselect=False, reset=False, focus=False):
-        if reset and [x.id for x in self.selectedUtterances()] == [utterance_id]:
+        if reset and self.selected_utterances() == [utterance_id]:
             return
         flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
         if reset:
@@ -253,58 +994,13 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         if focus:
             flags |= QtCore.QItemSelectionModel.SelectionFlag.Current
         if row == self.currentIndex().row():
-            self.update_view_times(force_update=True)
+            self.fileViewRequested.emit(self.model().audio_info_for_utterance(row))
 
         index = self.model().index(row, 0)
         if not index.isValid():
             return
         self.select(index, flags)
 
-    def select_audio(self, begin, end):
-        if end is not None and end - begin < 0.025:
-            end = None
-        self.selected_min_time = begin
-        self.selected_max_time = end
-        self.selectionAudioChanged.emit()
-
-    def request_start_time(self, start_time):
-        if start_time >= self.max_time:
-            return
-        if start_time < self.min_time:
-            return
-        self.selected_min_time = start_time
-        self.selected_max_time = None
-        self.selectionAudioChanged.emit()
-
-    def visible_utts(self) -> typing.List[Utterance]:
-        file_utts = []
-        if not self.current_file:
-            return file_utts
-        if self.current_file.num_utterances > 1:
-            for u in sorted(self.current_file.utterances, key=lambda x: x.begin):
-                if u.begin >= self.max_time:
-                    break
-                if u.end <= self.min_time:
-                    continue
-                file_utts.append(u)
-        else:
-            file_utts.extend(self.current_file.utterances)
-        return file_utts
-
-    def currentUtterance(self) -> Optional[Utterance]:
-        if self.current_utterance_id is not None:
-            return
-        m = self.model()
-        utterance = (
-            m.session.query(Utterance)
-            .options(
-                joinedload(Utterance.file).joinedload(File.sound_file),
-                joinedload(Utterance.file).subqueryload(File.speakers),
-            )
-            .get(self.current_utterance_id)
-        )
-        return utterance
-
     def _update_selection(self):
         index = self.currentIndex()
         if not index.isValid():
@@ -313,20 +1009,20 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         self.current_utterance_id = m._indices[index.row()]
         self.currentUtteranceChanged.emit()
 
-    def selectedUtterances(self):
-        utts = []
+    def selected_utterances(self):
+        current_utterance = self.current_utterance_id
+        if current_utterance is None:
+            return []
+        utts = [current_utterance]
         m = self.model()
-        current_utterance = m.utteranceAt(self.currentIndex())
         for index in self.selectedRows(1):
-            if current_utterance is not None and m._indices[index.row()] == current_utterance.id:
+            if current_utterance is not None and m._indices[index.row()] == current_utterance:
                 continue
-            utt = m.utteranceAt(index)
+            utt = m.utterance_id_at(index)
             if utt is None:
                 continue
             if current_utterance is None:
                 current_utterance = utt
-            if utt.file_id != current_utterance.file_id:
-                continue
             utts.append(utt)
         return utts
 
@@ -341,140 +1037,23 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         text = m.data(m.index(index.row(), m.text_column), QtCore.Qt.ItemDataRole.DisplayRole)
         return text
 
-    def zoom(self, factor, mid_point=None):
-        if factor == 0:
-            return
-        cur_duration = self.max_time - self.min_time
-        if mid_point is None:
-            mid_point = self.min_time + (cur_duration / 2)
-        new_duration = cur_duration / factor
-        new_begin = mid_point - (mid_point - self.min_time) / factor
-        new_begin = max(new_begin, 0)
-        new_end = min(new_begin + new_duration, self.current_file.duration)
-        if new_end - new_begin <= 0.025:
-            return
-        self.set_view_times(new_begin, new_end)
-
-    def pan(self, factor):
-        if factor < 1:
-            factor = 1 - factor
-            right = True
-        else:
-            right = False
-            factor = factor - 1
-        if right and self.max_time == self.current_file.duration:
-            return
-        if not right and self.min_time == 0:
-            return
-        cur_duration = self.max_time - self.min_time
-        shift = factor * cur_duration
-        if right:
-            new_begin = self.min_time + shift
-            new_end = self.max_time + shift
-        else:
-            new_begin = self.min_time - shift
-            new_end = self.max_time - shift
-        if new_begin < 0:
-            new_end = new_end + abs(new_begin)
-            new_begin = 0
-        if new_end > self.current_file.duration:
-            new_begin -= self.current_file.duration - new_end
-            new_end = self.current_file.duration
-        self.set_view_times(new_begin, new_end)
-
-    def zoom_in(self):
-        if self.current_file is None:
-            return
-        self.zoom(1.5)
-
-    def zoom_out(self):
-        if self.current_file is None:
-            return
-        self.zoom(0.5)
-
-    def zoom_to_selection(self):
-        if self.selected_min_time is None or self.selected_max_time is None:
-            rows = self.selectedRows(1)
-            if not rows:
-                return
-            begin = None
-            end = None
-            for r in rows:
-                u = self.model().utteranceAt(r)
-                if u is None:
-                    continue
-                if u.file_id != self.current_file.id:
-                    continue
-                if begin is None or begin > u.begin:
-                    begin = u.begin
-                if end is None or end < u.end:
-                    end = u.end
-            self.set_view_times(begin, end)
-        else:
-            self.set_view_times(self.selected_min_time, self.selected_max_time)
-
-    def update_from_slider(self, value):
-        if not self.max_time:
-            return
-        cur_window = self.max_time - self.min_time
-        self.set_view_times(value, value + cur_window)
-
-    def update_selection_audio(self):
-        begins = self.selectedRows(self.model().begin_column)
-        ends = self.selectedRows(self.model().end_column)
-        begin = None
-        end = None
-        if len(begins) > 0:
-            for i, b in enumerate(begins):
-                b = self.model().data(b, QtCore.Qt.ItemDataRole.DisplayRole)
-                e = self.model().data(ends[i], QtCore.Qt.ItemDataRole.DisplayRole)
-                if begin is None or begin > b:
-                    begin = b
-                if end is None or end < e:
-                    end = e
-            if self.current_file is None or begin > self.current_file.duration:
-                begin = None
-                end = None
-            elif end > self.current_file.duration:
-                end = self.current_file.duration
-        self.selected_min_time = begin
-        self.selected_max_time = end
-        self.selectionAudioChanged.emit()
-
     def switch_utterance(self, new_index, old_index):
+        if not self.model().fully_loaded:
+            return
         if not isinstance(new_index, QtCore.QModelIndex):
             row = 0
         else:
             if not new_index.isValid():
                 return
             row = new_index.row()
-        utt = self.model().utteranceAt(row)
+        utt = self.model().utterance_id_at(row)
         if utt is None:
             return
-        if utt.id == self.current_utterance_id:
+        if utt == self.current_utterance_id:
             return
-        self.current_utterance_id = utt.id
+        self.current_utterance_id = utt
         self.currentUtteranceChanged.emit()
-        self.set_current_file(
-            utt.file_id, utt.begin, utt.end, channel=utt.channel, force_update=True
-        )
-
-    def update_view_times(self, *args, force_update=False):
-        utts = self.selectedUtterances()
-        if len(utts) == 0:
-            self.resetView.emit()
-            return
-        if len(utts) == 1:
-            force_update = True
-        begin = utts[0].begin
-        f_id = utts[0].file_id
-        end_ind = -1
-        while True:
-            if utts[end_ind].file_id == f_id:
-                end = utts[end_ind].end
-                break
-        self.set_current_file(f_id, begin, end, channel=utts[0].channel, force_update=force_update)
-        self.selected_min_time = self.min_time
+        self.fileViewRequested.emit(self.model().audio_info_for_utterance(row))
 
     def model(self) -> CorpusModel:
         return super(CorpusSelectionModel, self).model()
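Note: after this refactor CorpusSelectionModel traffics in utterance ids rather than ORM objects: utterance_id_at() returns a primary key, and audio_info_for_utterance() (added near the end of this diff) returns the (file_id, begin, end, utterance_id, speaker_id) tuple that fileViewRequested carries. A sketch of the intended hand-off, with illustrative instance names:

```python
# corpus_selection: CorpusSelectionModel over the corpus-wide utterance table
# file_selection: FileSelectionModel over the currently open file
# FileSelectionModel.set_current_file() unpacks the emitted tuple as
# (file_id, begin, end, utterance_id, speaker_id).
corpus_selection.fileViewRequested.connect(file_selection.set_current_file)
```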
@@ -486,43 +1065,6 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
             return True
         return False
 
-    def set_current_file(self, file_id, begin=None, end=None, channel=None, force_update=False):
-        try:
-            new_file = self.current_file is None or self.current_file.id != file_id
-        except sqlalchemy.orm.exc.DetachedInstanceError:
-            new_file = True
-        if new_file:
-            self.selected_min_time = None
-            self.selected_max_time = None
-            self.fileAboutToChange.emit()
-            self.selected_channel = 0 if channel is None else channel
-            self.current_file = (
-                self.model().session.query(File).options(joinedload(File.sound_file)).get(file_id)
-            )
-            self.min_time = begin
-            self.max_time = end
-            self.fileChanged.emit()
-        elif (
-            self.current_file is not None
-            and begin is not None
-            and end is not None
-            and force_update
-        ):
-            self.selected_channel = channel
-            self.set_view_times(begin, end)
-
-    def set_view_times(self, begin, end):
-        begin = max(begin, 0)
-        end = min(end, self.current_file.duration)
-        if (begin, end) == (self.min_time, self.max_time):
-            return
-        self.min_time = begin
-        self.max_time = end
-        self.selected_min_time = self.min_time
-        if self.selected_max_time is not None and self.selected_max_time > self.max_time:
-            self.selected_max_time = None
-        self.viewChanged.emit(self.min_time, self.max_time)
-
     def focusUtterance(self, index):
         m = self.model()
         u = m.utteranceAt(index)
@@ -530,10 +1072,8 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
             self.min_time = 0
             self.max_time = 1
             self.fileAboutToChange()
-            self.current_file = None
             self.fileChanged.emit()
             return
-        self.current_file = u.file
         begin = u.begin
         end = u.end
         padding = 1
@@ -1043,6 +1583,8 @@ class SpeakerModel(TableModel):
         self.mds_speaker_utterances()
 
     def finish_load_ivectors(self, result, *args, **kwargs):
+        if result is None:
+            return
         speaker_ids, utterance_ids, utt2spk, ivectors = result
         if speaker_ids != self.current_speakers:
             return
@@ -1431,6 +1973,7 @@ class CorpusModel(TableModel):
         self.speakers = {}
         self.speaker_id_mapping = {}
         self.utterances = None
+        self.session: sqlalchemy.orm.scoped_session = None
         self.utterance_count = 0
         self.speaker_count = 0
         self.file_count = 0
@@ -1475,29 +2018,46 @@ class CorpusModel(TableModel):
             return True
         return False
 
-    def update_utterance_table_row(self, utterance_id: int):
-        if utterance_id not in self.reversed_indices:
-            return
-        utterance = self.session.query(Utterance).get(utterance_id)
+    def update_utterance_table_row(self, utterance: typing.Union[int, Utterance]):
+        if isinstance(utterance, int):
+            utterance_id = utterance
+            if utterance_id not in self.reversed_indices:
+                return
+            utterance = self.session.query(Utterance).get(utterance_id)
+        else:
+            utterance_id = utterance.id
+            if utterance_id not in self.reversed_indices:
+                return
         index = self.reversed_indices[utterance_id]
         self.layoutAboutToBeChanged.emit()
         self._data[index][self.text_column] = utterance.text
         self._data[index][self.begin_column] = utterance.begin
         self._data[index][self.end_column] = utterance.end
-        self._data[index][self.duration_column] = utterance.duration
+        self._data[index][self.duration_column] = utterance.end - utterance.begin
+        self.layoutChanged.emit()
+
+    def change_speaker_table_utterances(self, utterances: typing.List[Utterance]):
+        self.layoutAboutToBeChanged.emit()
+        for u in utterances:
+            if u.id not in self.reversed_indices:
+                continue
+            index = self.reversed_indices[u.id]
+            self._speaker_indices[index] = u.speaker_id
+            self._data[index][self.speaker_column] = self.get_speaker_name(u.speaker_id)
         self.layoutChanged.emit()
 
     def add_table_utterances(self, utterances: typing.List[Utterance]):
         self.layoutAboutToBeChanged.emit()
         rows = []
         for utterance in utterances:
+            speaker_name = self.get_speaker_name(utterance.speaker_id)
             row_data = [
                 utterance.oovs,
                 utterance.file_name,
-                utterance.speaker_name,
+                speaker_name,
                 utterance.begin,
                 utterance.end,
-                utterance.duration,
+                utterance.end - utterance.begin,
                 utterance.text,
             ]
             self._data.append(row_data)
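Note: update_utterance_table_row() now accepts either a primary key or an already-loaded Utterance, and the duration column is computed as end - begin instead of reading the ORM duration property, which avoids a database round trip for detached objects. Illustrative calls (corpus_model and utt are assumed to exist; they are not defined in this diff):

```python
corpus_model.update_utterance_table_row(42)   # int: fetched via self.session
corpus_model.update_utterance_table_row(utt)  # Utterance object: used directly
```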
@@ -1512,7 +2072,10 @@ class CorpusModel(TableModel):
     def delete_table_utterances(self, utterances: typing.List[Utterance]):
         self.layoutAboutToBeChanged.emit()
         for utterance in utterances:
-            index = self.reversed_indices.pop(utterance.id)
+            try:
+                index = self.reversed_indices.pop(utterance.id)
+            except KeyError:
+                continue
             _ = self._data.pop(index)
             _ = self._indices.pop(index)
             _ = self._file_indices.pop(index)
@@ -1533,7 +2096,6 @@ class CorpusModel(TableModel):
 
         self.layoutAboutToBeChanged.emit()
         first = split_utterances[0]
-        self.session.merge(first)
         file_name = self._data[index][1]
         speaker_name = self._data[index][2]
         row_data = [
@@ -1542,7 +2104,7 @@ class CorpusModel(TableModel):
             speaker_name,
             first.begin,
             first.end,
-            first.duration,
+            first.end - first.begin,
            first.text,
         ]
         self._data[index] = row_data
@@ -1552,7 +2114,6 @@ class CorpusModel(TableModel):
         self.reversed_indices[first.id] = index
         rows = [index]
         for utterance in split_utterances[1:]:
-            self.session.merge(utterance)
             index += 1
             rows.append(index)
             self.reversed_indices = {
@@ -1565,7 +2126,7 @@ class CorpusModel(TableModel):
                 speaker_name,
                 utterance.begin,
                 utterance.end,
-                utterance.duration,
+                utterance.end - utterance.begin,
                 utterance.text,
             ]
             self.reversed_indices[utterance.id] = index
@@ -1584,14 +2145,13 @@ class CorpusModel(TableModel):
         except KeyError:
             return
         self.layoutAboutToBeChanged.emit()
-        self.session.merge(merged_utterance)
         row_data = [
             merged_utterance.oovs,
             merged_utterance.file_name,
             merged_utterance.speaker_name,
             merged_utterance.begin,
             merged_utterance.end,
-            merged_utterance.duration,
+            merged_utterance.end - merged_utterance.begin,
             merged_utterance.text,
         ]
         first = split_utterances[0]
@@ -1640,32 +2200,6 @@ class CorpusModel(TableModel):
         self.language_model = language_model
         self.languageModelChanged.emit()
 
-    def create_utterance(self, file: File, speaker: Optional[Speaker], begin: float, end: float):
-        if not self.editable:
-            return
-        channel = 0
-        if file.num_channels > 1:
-            ind = file.speaker_ordering.index(speaker)
-            if ind >= len(file.speaker_ordering) / 2:
-                channel = 1
-        if speaker is None:
-            speaker = self.corpus.add_speaker("speech", session=self.session)
-        begin = round(begin, 4)
-        end = round(end, 4)
-        text = ""
-        next_pk = self.corpus.get_next_primary_key(Utterance)
-        new_utt = Utterance(
-            id=next_pk,
-            speaker_id=speaker.id,
-            file_id=file.id,
-            begin=begin,
-            end=end,
-            channel=channel,
-            text=text,
-        )
-        self.addCommand.emit(undo.CreateUtteranceCommand(new_utt, self))
-        self.unsaved_files.add(file.id)
-
     def set_file_modified(self, file_id: typing.Union[int, typing.List[int]]):
         if isinstance(file_id, int):
             file_id = [file_id]
@@ -1680,32 +2214,6 @@ class CorpusModel(TableModel):
         )
         self.session.commit()
 
-    def update_utterance_text(self, utterance: Utterance, text):
-        if text != utterance.text:
-            self.addCommand.emit(undo.UpdateUtteranceTextCommand(utterance, text, self))
-            self.set_file_modified(utterance.file_id)
-
-    def update_utterance_times(
-        self, utterance: Utterance, begin: Optional[float] = None, end: Optional[float] = None
-    ):
-        if not self.editable:
-            return
-        self.addCommand.emit(undo.UpdateUtteranceTimesCommand(utterance, begin, end, self))
-        self.set_file_modified(utterance.file_id)
-
-    def update_utterance_speaker(self, utterance: Utterance, speaker: Speaker):
-        if not self.editable:
-            return
-        self.addCommand.emit(undo.UpdateUtteranceSpeakerCommand(utterance, speaker, self))
-
-    def delete_utterances(self, utterances: list[Utterance]):
-        if not self.editable:
-            return
-        for u in utterances:
-            self.set_file_modified(u.file_id)
-            self.set_speaker_modified(u.speaker_id)
-        self.addCommand.emit(undo.DeleteUtteranceCommand(utterances, self))
-
     def check_align_lexicon_compiler(self):
         if self.acoustic_model is None:
             return
@@ -1724,150 +2232,13 @@ class CorpusModel(TableModel):
             dictionary_id, self.acoustic_model, disambiguation=True
         )
 
-    def split_vad_utterance(
-        self, original_utterance_id, replacement_utterance_data: typing.List[KalpyUtterance]
-    ):
-        utt = self.session.get(Utterance, original_utterance_id)
-        replacement_utterances = []
-        speaker_id = utt.speaker_id
-        file_id = utt.file_id
-        next_pk = self.corpus.get_next_primary_key(Utterance)
-        for new_utt in replacement_utterance_data:
-            replacement_utterances.append(
-                Utterance(
-                    id=next_pk,
-                    begin=new_utt.segment.begin,
-                    end=new_utt.segment.end,
-                    speaker_id=speaker_id,
-                    file_id=file_id,
-                    text=new_utt.transcript,
-                    normalized_text=new_utt.transcript,
-                    features="",
-                    in_subset=False,
-                    ignored=False,
-                    channel=new_utt.segment.channel,
-                )
-            )
-            next_pk += 1
-        splitting_utterances = [[utt, *replacement_utterances]]
-        self.addCommand.emit(
-            undo.SplitUtteranceCommand(splitting_utterances, self, update_table=False)
-        )
-        self.requestFileView.emit(utt.file_name)
-        self.set_file_modified(file_id)
-        self.set_speaker_modified(speaker_id)
-
-    def split_utterances(self, utterances: list[Utterance]):
-        if not self.editable:
-            return
-        splitting_utterances = []
-        for utt in utterances:
-            duration = utt.duration
-            beg = utt.begin
-            end = utt.end
-            first_text = ""
-            second_text = ""
-            if " " not in utt.text and " " in utt.normalized_text:
-                t = utt.normalized_text.split()
-                mid_ind = int(len(t) / 2)
-                first_text = t[:mid_ind]
-                second_text = t[mid_ind:]
-            elif utt.text:
-                t = utt.text.split()
-                mid_ind = int(len(t) / 2)
-                first_text = t[:mid_ind]
-                second_text = t[mid_ind:]
-            split_time = beg + (duration / 2)
-            oovs = set()
-            for w in first_text:
-                if not self.dictionary_model.check_word(w, utt.speaker_id):
-                    oovs.add(w)
-            next_pk = self.corpus.get_next_primary_key(Utterance)
-            first_utt = Utterance(
-                id=next_pk,
-                speaker_id=utt.speaker_id,
-                file_id=utt.file_id,
-                begin=beg,
-                end=split_time,
-                channel=utt.channel,
-                text=" ".join(first_text),
-                normalized_text=" ".join(first_text),
-                oovs=" ".join(oovs),
-            )
-            next_pk += 1
-            oovs = set()
-            for w in second_text:
-                if not self.dictionary_model.check_word(w, utt.speaker_id):
-                    oovs.add(w)
-            second_utt = Utterance(
-                id=next_pk,
-                speaker_id=utt.speaker_id,
-                file_id=utt.file_id,
-                begin=split_time,
-                end=end,
-                channel=utt.channel,
-                text=" ".join(second_text),
-                normalized_text=" ".join(second_text),
-                oovs=" ".join(oovs),
-            )
-            splitting_utterances.append([utt, first_utt, second_utt])
-        self.addCommand.emit(undo.SplitUtteranceCommand(splitting_utterances, self))
-        self.set_file_modified([utt[0].file_id for utt in splitting_utterances])
-
     def merge_speakers(self, speakers: list[int]):
         self.addCommand.emit(undo.MergeSpeakersCommand(speakers, self))
 
-    def merge_utterances(self, utterances: list[Utterance]):
-        if not self.editable:
-            return
-        min_begin = 1000000000
-        max_end = 0
-        text = ""
-        normalized_text = ""
-        speaker = None
-        file = None
-        channel = None
-        for old_utt in sorted(utterances, key=lambda x: x.begin):
-            if speaker is None:
-                speaker = old_utt.speaker
-            if file is None:
-                file = old_utt.file
-            if channel is None:
-                channel = old_utt.channel
-            if old_utt.begin < min_begin:
-                min_begin = old_utt.begin
-            if old_utt.end > max_end:
-                max_end = old_utt.end
-            utt_text = old_utt.text
-            if utt_text == "speech" and text.strip() == "speech":
-                continue
-            text += utt_text + " "
-            normalized_text += old_utt.normalized_text + " "
-        text = text[:-1]
-        normalized_text = normalized_text[:-1]
-        next_pk = self.corpus.get_next_primary_key(Utterance)
-        oovs = set()
-        for w in text.split():
-            if not self.dictionary_model.check_word(w, speaker.id):
-                oovs.add(w)
-        new_utt = Utterance(
-            id=next_pk,
-            speaker=speaker,
-            file=file,
-            begin=min_begin,
-            end=max_end,
-            channel=channel,
-            text=text,
-            normalized_text=normalized_text,
-            oovs=" ".join(oovs),
-        )
-        self.set_file_modified(file.id)
-        self.addCommand.emit(undo.MergeUtteranceCommand(utterances, new_utt, self))
-
     def replace_all(self, search_query: TextFilterQuery, replacement: str):
         self.addCommand.emit(undo.ReplaceAllCommand(search_query, replacement, self))
 
-    def utteranceAt(self, index) -> Optional[Utterance]:
+    def utterance_id_at(self, index) -> Optional[Utterance]:
         if not isinstance(index, int):
             if not index.isValid():
                 return None
@@ -1876,15 +2247,16 @@ class CorpusModel(TableModel):
             return None
         if len(self._indices) == 0:
             return None
-        utterance = (
-            self.session.query(Utterance)
-            .options(
-                joinedload(Utterance.file).joinedload(File.sound_file),
-                joinedload(Utterance.file).subqueryload(File.speakers),
-            )
-            .get(self._indices[index])
+        return self._indices[index]
+
+    def audio_info_for_utterance(self, row: int):
+        return (
+            self._file_indices[row],
+            self._data[row][self.begin_column],
+            self._data[row][self.end_column],
+            self._indices[row],
+            self._speaker_indices[row],
         )
-        return utterance
 
     def fileAt(self, index) -> int:
         if not isinstance(index, int):