Anchor-annotator 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anchor/models.py CHANGED
@@ -31,7 +31,7 @@ from montreal_forced_aligner.utils import mfa_open
 from PySide6 import QtCore
 from sqlalchemy.orm import joinedload
 
-from anchor import undo
+from anchor import undo, workers
 from anchor.settings import AnchorSettings
 
 
@@ -148,24 +148,766 @@ class TableModel(QtCore.QAbstractTableModel):
         return len(self._header_data)
 
 
+class FileUtterancesModel(QtCore.QAbstractListModel):
+    addCommand = QtCore.Signal(object)
+    selectionRequested = QtCore.Signal(object)
+
+    waveformReady = QtCore.Signal()
+    utterancesReady = QtCore.Signal()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.utterances = []
+        self.file = None
+        self.y = None
+        self.speakers = []
+        self._indices = []
+        self._speaker_indices = []
+        self.reversed_indices = {}
+        self.speaker_channel_mapping = {}
+        self.corpus_model: CorpusModel = None
+        self.waveform_worker = workers.WaveformWorker()
+        self.speaker_tier_worker = workers.SpeakerTierWorker()
+        self.speaker_tier_worker.signals.result.connect(self.finalize_loading_utterances)
+        self.waveform_worker.signals.result.connect(self.finalize_loading_wave_form)
+
+    def get_utterance(self, utterance_id: int) -> Utterance:
+        try:
+            return self.utterances[self.reversed_indices[utterance_id]]
+        except KeyError:
+            return None
+
+    def set_corpus_model(self, corpus_model: CorpusModel):
+        self.corpus_model = corpus_model
+
+    def clean_up_for_close(self):
+        self.waveform_worker.stop()
+        self.speaker_tier_worker.stop()
+
+    def set_file(self, file_id):
+        self.file = (
+            self.corpus_model.session.query(File).options(joinedload(File.sound_file)).get(file_id)
+        )
+        self.y = None
+        self.get_utterances()
+        self.waveform_worker.stop()
+        self.waveform_worker.set_params(self.file.sound_file.sound_file_path)
+        self.waveform_worker.start()
+
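
set_file() hands the audio read to a background worker and returns immediately; the decoded samples arrive later through the worker's result signal, which is connected to finalize_loading_wave_form() above. The real WaveformWorker and SpeakerTierWorker live in anchor.workers and are not shown in this diff, so the following is only a sketch of the assumed pattern, with hypothetical names:

import numpy as np
from PySide6 import QtCore


class WorkerSignals(QtCore.QObject):
    result = QtCore.Signal(object)


class SketchWaveformWorker(QtCore.QRunnable):
    """Illustrative stand-in for workers.WaveformWorker, not the real class."""

    def __init__(self):
        super().__init__()
        self.setAutoDelete(False)  # the model reuses the worker across set_file() calls
        self.signals = WorkerSignals()
        self.path = None
        self._stopped = False

    def set_params(self, path):
        self.path = path
        self._stopped = False

    def stop(self):
        self._stopped = True

    def start(self):
        QtCore.QThreadPool.globalInstance().start(self)

    def run(self):
        # Stand-in for the real audio decode that happens off the GUI thread.
        y = np.zeros(16000, dtype="float32")
        if not self._stopped:
            # The receiving slot unpacks this as finalize_loading_wave_form((y, path)).
            self.signals.result.emit((y, self.path))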
+    def finalize_loading_utterances(self, results):
+        utterances, file_id = results
+        if file_id != self.file.id:
+            return
+        self.utterances = utterances
+        for i, u in enumerate(utterances):
+            if u.speaker_id not in self.speakers:
+                self.speakers.append(u.speaker_id)
+            self._speaker_indices.append(u.speaker_id)
+            self.reversed_indices[u.id] = i
+            self._indices.append(u.id)
+            if self.file.num_channels > 1 and u.speaker_id not in self.speaker_channel_mapping:
+                self.speaker_channel_mapping[u.speaker_id] = u.channel
+        self.utterancesReady.emit()
+
+    def finalize_loading_wave_form(self, results):
+        y, file_path = results
+        if self.file is None or file_path != self.file.sound_file.sound_file_path:
+            return
+        self.y = y
+        self.waveformReady.emit()
+
+    def get_utterances(self):
+        parent_index = self.index(0, 0)
+        self.beginRemoveRows(parent_index, 0, len(self.utterances))
+        self.utterances = []
+        self.speakers = []
+        self._indices = []
+        self._speaker_indices = []
+        self.speaker_channel_mapping = {}
+        self.reversed_indices = {}
+        self.endRemoveRows()
+        if self.file is None:
+            return
+        self.speaker_tier_worker.stop()
+        self.speaker_tier_worker.query_alignment = (
+            self.corpus_model.has_alignments
+            or self.corpus_model.has_reference_alignments
+            or self.corpus_model.has_transcribed_alignments
+        )
+        self.speaker_tier_worker.session = self.corpus_model.session
+        self.speaker_tier_worker.set_params(self.file.id)
+        self.speaker_tier_worker.start()
+
+    def create_utterance(self, speaker_id: Optional[int], begin: float, end: float):
+        if not self.corpus_model.editable:
+            return
+        channel = 0
+        if speaker_id is None:
+            speaker_id = self.corpus_model.corpus.add_speaker(
+                "speech", session=self.corpus_model.session
+            ).id
+        if self.file.num_channels > 1:
+            if speaker_id not in self.speaker_channel_mapping:
+                self.speaker_channel_mapping[speaker_id] = 0
+            channel = self.speaker_channel_mapping[speaker_id]
+        begin = round(begin, 4)
+        end = round(end, 4)
+        text = ""
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        new_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            file=self.file,
+            begin=begin,
+            end=end,
+            channel=channel,
+            text=text,
+            normalized_text=text,
+            oovs=text,
+        )
+        print(new_utt.id, new_utt.speaker_id, new_utt.file_id, new_utt.begin, new_utt.end)
+        self.addCommand.emit(undo.CreateUtteranceCommand(new_utt, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+
+    def add_table_utterances(self, utterances: typing.List[Utterance]):
+        for utterance in utterances:
+            if len(self.utterances) > 0:
+                for i, u in enumerate(self.utterances):
+                    if u.begin < utterance.begin:
+                        continue
+                    break
+                else:
+                    i = len(self.utterances) - 1
+            else:
+                i = 0
+            parent_index = self.index(i, 0)
+            self.beginInsertRows(parent_index, i, i)  # first/last are inclusive row indices
+            self.utterances.insert(i, utterance)
+            self._indices.insert(i, utterance.id)
+            self._speaker_indices.insert(i, utterance.speaker_id)
+            self.endInsertRows()
+        self.reversed_indices = {u: j for j, u in enumerate(self._indices)}
+        self.selectionRequested.emit(utterances)
+
+    def delete_table_utterances(self, utterances: typing.List[Utterance]):
+        for utterance in utterances:
+            try:
+                index = self.reversed_indices.pop(utterance.id)
+            except KeyError:
+                continue
+            parent_index = self.index(index, 0)
+            self.beginRemoveRows(parent_index, index, index)  # first/last are inclusive row indices
+            _ = self.utterances.pop(index)
+            _ = self._indices.pop(index)
+            _ = self._speaker_indices.pop(index)
+            self.reversed_indices = {u: j for j, u in enumerate(self._indices)}
+            self.endRemoveRows()
+        self.selectionRequested.emit(None)
+
+    def change_speaker_table_utterances(self, utterances: typing.List[Utterance]):
+        for utterance in utterances:
+            try:
+                index = self.reversed_indices[utterance.id]
+            except KeyError:
+                continue
+            if utterance.speaker_id not in self.speakers:
+                self.speakers.append(utterance.speaker_id)
+                self.speaker_channel_mapping[utterance.speaker_id] = utterance.channel
+            self._speaker_indices[index] = utterance.speaker_id
+
+    def merge_table_utterances(
+        self, merged_utterance: Utterance, split_utterances: typing.List[Utterance]
+    ):
+        self.delete_table_utterances(split_utterances)
+        self.add_table_utterances([merged_utterance])
+
+    def split_table_utterances(
+        self, merged_utterance: Utterance, split_utterances: typing.List[Utterance]
+    ):
+        self.delete_table_utterances([merged_utterance])
+        self.add_table_utterances(split_utterances)
+
+    def update_utterance_text(self, utterance: Utterance, text):
+        if not self.corpus_model.editable:
+            return
+        if text != utterance.text:
+            self.addCommand.emit(undo.UpdateUtteranceTextCommand(utterance, text, self))
+            self.corpus_model.set_file_modified(self.file.id)
+
+    def refresh_utterances(self):
+        for utterance in self.utterances:
+            self.corpus_model.session.refresh(utterance)
+
+    def update_utterance_speaker(self, utterance: Utterance, speaker_id: int):
+        if not self.corpus_model.editable:
+            return
+        old_speaker_id = utterance.speaker_id
+        if old_speaker_id == speaker_id:
+            return
+        self.addCommand.emit(undo.UpdateUtteranceSpeakerCommand(utterance, speaker_id, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+        self.corpus_model.set_speaker_modified(old_speaker_id)
+
+    def update_utterance_times(
+        self, utterance: Utterance, begin: Optional[float] = None, end: Optional[float] = None
+    ):
+        if not self.corpus_model.editable:
+            return
+        if utterance.begin == begin and utterance.end == end:
+            return
+        self.addCommand.emit(undo.UpdateUtteranceTimesCommand(utterance, begin, end, self))
+        self.corpus_model.set_file_modified(self.file.id)
+
+    def split_vad_utterance(
+        self, original_utterance_id, replacement_utterance_data: typing.List[KalpyUtterance]
+    ):
+        if not replacement_utterance_data:
+            return
+        utt = self.utterances[self.reversed_indices[original_utterance_id]]
+        replacement_utterances = []
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        speaker_id = utt.speaker_id
+        for new_utt in replacement_utterance_data:
+            replacement_utterances.append(
+                Utterance(
+                    id=next_pk,
+                    begin=new_utt.segment.begin,
+                    end=new_utt.segment.end,
+                    speaker_id=speaker_id,
+                    file_id=self.file.id,
+                    text=new_utt.transcript,
+                    normalized_text=new_utt.transcript,
+                    features="",
+                    in_subset=False,
+                    ignored=False,
+                    channel=new_utt.segment.channel,
+                )
+            )
+            next_pk += 1
+        self.addCommand.emit(
+            undo.SplitUtteranceCommand(utt, replacement_utterances, self, update_table=False)
+        )
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+
+    def split_utterances(self, utterance: Utterance):
+        if not self.corpus_model.editable:
+            return
+        beg = utterance.begin
+        end = utterance.end
+        duration = end - beg
+        first_text = []
+        second_text = []
+        speaker_id = utterance.speaker_id
+        if (
+            utterance.text
+            and utterance.normalized_text
+            and " " not in utterance.text
+            and " " in utterance.normalized_text
+        ):
+            t = utterance.normalized_text.split()
+            mid_ind = int(len(t) / 2)
+            first_text = t[:mid_ind]
+            second_text = t[mid_ind:]
+        elif utterance.text:
+            t = utterance.text.split()
+            mid_ind = int(len(t) / 2)
+            first_text = t[:mid_ind]
+            second_text = t[mid_ind:]
+        split_time = beg + (duration / 2)
+        oovs = set()
+        for w in first_text:
+            if not self.corpus_model.dictionary_model.check_word(w, speaker_id):
+                oovs.add(w)
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        first_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            begin=beg,
+            end=split_time,
+            channel=utterance.channel,
+            text=" ".join(first_text),
+            normalized_text=" ".join(first_text),
+            oovs=" ".join(oovs),
+        )
+        next_pk += 1
+        oovs = set()
+        for w in second_text:
+            if not self.corpus_model.dictionary_model.check_word(w, utterance.speaker_id):
+                oovs.add(w)
+        second_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            begin=split_time,
+            end=end,
+            channel=utterance.channel,
+            text=" ".join(second_text),
+            normalized_text=" ".join(second_text),
+            oovs=" ".join(oovs),
+        )
+        self.addCommand.emit(undo.SplitUtteranceCommand(utterance, [first_utt, second_utt], self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+        self.selectionRequested.emit([first_utt, second_utt])
+
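
The split above cuts the utterance at its temporal midpoint and divides the token list in half (preferring the normalized text when the raw text is unsegmented). Worked through by hand with plain values, no Qt or database objects involved:

# Hypothetical inputs for illustration only.
begin, end = 2.0, 6.0
tokens = "the quick brown fox jumps".split()

split_time = begin + (end - begin) / 2  # midpoint of the utterance
mid_ind = int(len(tokens) / 2)          # first half gets floor(n / 2) tokens
first_text, second_text = tokens[:mid_ind], tokens[mid_ind:]

assert split_time == 4.0
assert first_text == ["the", "quick"]
assert second_text == ["brown", "fox", "jumps"]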
+    def merge_utterances(self, utterances: list[Utterance]):
+        if not self.corpus_model.editable:
+            return
+        if not utterances:
+            return
+        min_begin = 1000000000
+        max_end = 0
+        text = ""
+        normalized_text = ""
+        speaker_id = None
+        channel = None
+        for old_utt in sorted(utterances, key=lambda x: x.begin):
+            if speaker_id is None:
+                speaker_id = old_utt.speaker_id
+            if channel is None:
+                channel = old_utt.channel
+            if old_utt.begin < min_begin:
+                min_begin = old_utt.begin
+            if old_utt.end > max_end:
+                max_end = old_utt.end
+            utt_text = old_utt.text
+            if utt_text == "speech" and text.strip() == "speech":
+                continue
+            text += utt_text + " "
+            normalized_text += old_utt.normalized_text + " "
+        text = text[:-1]
+        normalized_text = normalized_text[:-1]
+        next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
+        oovs = set()
+        for w in text.split():
+            if not self.corpus_model.dictionary_model.check_word(w, speaker_id):
+                oovs.add(w)
+        new_utt = Utterance(
+            id=next_pk,
+            speaker_id=speaker_id,
+            file_id=self.file.id,
+            begin=min_begin,
+            end=max_end,
+            channel=channel,
+            text=text,
+            normalized_text=normalized_text,
+            oovs=" ".join(oovs),
+        )
+        self.addCommand.emit(undo.MergeUtteranceCommand(utterances, new_utt, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        self.corpus_model.set_speaker_modified(speaker_id)
+        self.selectionRequested.emit([new_utt])
+
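
The merge keeps the earliest begin and latest end of the selected utterances and joins their texts in time order (with a special case that collapses consecutive "speech" placeholder transcripts). The bookkeeping by hand, with made-up segments:

segments = [(3.5, 5.0, "brown fox"), (1.0, 2.5, "the quick")]

min_begin, max_end, text = 1000000000, 0, ""
for begin, end, utt_text in sorted(segments):
    min_begin = min(min_begin, begin)
    max_end = max(max_end, end)
    text += utt_text + " "
text = text[:-1]  # drop the trailing space, as above

assert (min_begin, max_end) == (1.0, 5.0)
assert text == "the quick brown fox"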
+    def delete_utterances(self, utterances: typing.List[Utterance]):
+        if not self.corpus_model.editable:
+            return
+        if not utterances:
+            return
+        speaker_ids = set(x.speaker_id for x in utterances)
+        self.addCommand.emit(undo.DeleteUtteranceCommand(utterances, self))
+        self.corpus_model.set_file_modified(self.file.id)
+        for speaker_id in speaker_ids:
+            self.corpus_model.set_speaker_modified(speaker_id)
+
+    def rowCount(self, parent=None):
+        return len(self.utterances)
+
+    def data(self, index, role=QtCore.Qt.ItemDataRole.DisplayRole):
+        if role == QtCore.Qt.ItemDataRole.DisplayRole:
+            return self.utterances[index.row()]
+
+
+class FileSelectionModel(QtCore.QItemSelectionModel):
+    fileAboutToChange = QtCore.Signal()
+    fileChanged = QtCore.Signal()
+    channelChanged = QtCore.Signal()
+    resetView = QtCore.Signal()
+    viewChanged = QtCore.Signal(object, object)
+    selectionAudioChanged = QtCore.Signal()
+    currentTimeChanged = QtCore.Signal(object)
+    currentUtteranceChanged = QtCore.Signal()
+    speakerRequested = QtCore.Signal(object)
+
+    spectrogramReady = QtCore.Signal()
+    waveformReady = QtCore.Signal()
+    pitchTrackReady = QtCore.Signal()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.settings = AnchorSettings()
+        self.min_time = 0
+        self.max_time = 10
+        self.selected_min_time = None
+        self.selected_max_time = None
+        self.x = None
+        self.y = None
+        self.top_point = 2
+        self.bottom_point = 0
+        self.separator_point = 1
+        self.selected_channel = 0
+        self.spectrogram = None
+        self.min_db = None
+        self.max_db = None
+        self.pitch_track_x = None
+        self.pitch_track_y = None
+        self.waveform_x = None
+        self.waveform_y = None
+        self.requested_utterance_id = None
+        self.auto_waveform_worker = workers.AutoWaveformWorker()
+        self.spectrogram_worker = workers.SpectrogramWorker()
+        self.pitch_track_worker = workers.PitchWorker()
+        self.auto_waveform_worker.signals.result.connect(self.finalize_loading_auto_wave_form)
+        self.spectrogram_worker.signals.result.connect(self.finalize_loading_spectrogram)
+        self.pitch_track_worker.signals.result.connect(self.finalize_loading_pitch_track)
+        self.model().waveformReady.connect(self.load_audio_selection)
+        self.model().utterancesReady.connect(self.finalize_set_new_file)
+        self.viewChanged.connect(self.load_audio_selection)
+        self.model().selectionRequested.connect(self.update_selected_utterances)
+
+    def selected_utterances(self):
+        utts = []
+        m = self.model()
+        for index in self.selectedRows(0):
+            utt = m.utterances[index.row()]
+            utts.append(utt)
+        return utts
+
+    def load_audio_selection(self):
+        if self.model().y is None:
+            return
+        begin_samp = int(self.min_time * self.model().file.sample_rate)
+        end_samp = int(self.max_time * self.model().file.sample_rate)
+        if len(self.model().y.shape) > 1:
+            y = self.model().y[begin_samp:end_samp, self.selected_channel]
+        else:
+            y = self.model().y[begin_samp:end_samp]
+        self.spectrogram_worker.stop()
+        self.spectrogram_worker.set_params(
+            y,
+            self.model().file.sound_file.sample_rate,
+            self.min_time,
+            self.max_time,
+            self.selected_channel,
+        )
+        self.spectrogram_worker.start()
+        if self.max_time - self.min_time <= 10:
+            self.pitch_track_worker.stop()
+            self.pitch_track_worker.set_params(
+                y,
+                self.model().file.sound_file.sample_rate,
+                self.min_time,
+                self.max_time,
+                self.selected_channel,
+                self.bottom_point,
+                self.separator_point,
+            )
+            self.pitch_track_worker.start()
+        self.auto_waveform_worker.stop()
+        self.auto_waveform_worker.set_params(
+            y,
+            self.separator_point,
+            self.top_point,
+            self.min_time,
+            self.max_time,
+            self.selected_channel,
+        )
+        self.auto_waveform_worker.start()
+
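
load_audio_selection() converts the view window from seconds to sample indices before slicing the cached waveform. The arithmetic with illustrative numbers, mono case:

import numpy as np

sample_rate = 16000
y = np.zeros(10 * sample_rate)  # ten seconds of cached audio
min_time, max_time = 2.5, 4.0

begin_samp = int(min_time * sample_rate)  # 40000
end_samp = int(max_time * sample_rate)    # 64000
window = y[begin_samp:end_samp]

assert window.shape[0] == 24000  # 1.5 seconds' worth of samples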
+    def clean_up_for_close(self):
+        self.spectrogram_worker.stop()
+        self.pitch_track_worker.stop()
+        self.auto_waveform_worker.stop()
+
+    @property
+    def plot_min(self):
+        if self.settings.right_to_left:
+            return -self.max_time
+        return self.min_time
+
+    @property
+    def plot_max(self):
+        if self.settings.right_to_left:
+            return -self.min_time
+        return self.max_time
+
+    def finalize_loading_spectrogram(self, results):
+        stft, channel, begin, end, min_db, max_db = results
+        if self.settings.right_to_left:
+            stft = np.flip(stft, 1)
+            begin, end = -end, -begin
+        if begin != self.plot_min or end != self.plot_max:
+            return
+        self.spectrogram = stft
+        self.min_db = min_db
+        self.max_db = max_db
+        self.spectrogramReady.emit()
+
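
Because the workers race with fast pans and zooms, each finalize_* slot first checks that the result still matches the current view: the worker echoes back the begin/end it rendered, and anything that no longer equals plot_min/plot_max is discarded. The guard in isolation, with illustrative names:

def accept_result(results, plot_min, plot_max):
    payload, begin, end = results
    if begin != plot_min or end != plot_max:
        return None  # stale: the view moved while the worker was running
    return payload

assert accept_result(("stft", 0.0, 5.0), 0.0, 5.0) == "stft"
assert accept_result(("stft", 0.0, 5.0), 2.0, 7.0) is None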
+    def finalize_loading_pitch_track(self, results):
+        pitch_track, voicing_track, channel, begin, end, min_f0, max_f0 = results
+        if self.settings.right_to_left:
+            pitch_track = np.flip(pitch_track, 0)
+            begin, end = -end, -begin
+        if begin != self.plot_min or end != self.plot_max:
+            return
+        self.pitch_track_y = pitch_track
+        if pitch_track is None:
+            return
+        x = np.linspace(
+            start=self.plot_min,
+            stop=self.plot_max,
+            num=pitch_track.shape[0],
+        )
+        self.pitch_track_x = x
+        self.pitchTrackReady.emit()
+
+    def finalize_loading_auto_wave_form(self, results):
+        y, begin, end, channel = results
+        if self.settings.right_to_left:
+            y = np.flip(y, 0)
+            begin, end = -end, -begin
+        if begin != self.plot_min or end != self.plot_max:
+            return
+        x = np.linspace(start=self.plot_min, stop=self.plot_max, num=y.shape[0])
+        self.waveform_x = x
+        self.waveform_y = y
+        self.waveformReady.emit()
+
+    def select_audio(self, begin, end):
+        if end is not None and end - begin < 0.025:
+            end = None
+        self.selected_min_time = begin
+        self.selected_max_time = end
+        self.selectionAudioChanged.emit()
+
+    def request_start_time(self, start_time):
+        if start_time >= self.max_time:
+            return
+        if start_time < self.min_time:
+            return
+        self.selected_min_time = start_time
+        self.selected_max_time = None
+        self.selectionAudioChanged.emit()
+
+    def set_current_channel(self, channel):
+        if channel == self.selected_channel:
+            return
+        self.selected_channel = channel
+        self.load_audio_selection()
+
+    def get_selected_wave_form(self):
+        if self.y is None:
+            return None, None
+        if len(self.y.shape) > 1 and self.y.shape[0] == 2:
+            return self.x, self.y[self.selected_channel, :]
+        return self.x, self.y
+
+    def zoom(self, factor, mid_point=None):
+        if factor == 0 or self.min_time is None:
+            return
+        cur_duration = self.max_time - self.min_time
+        if mid_point is None:
+            mid_point = self.min_time + (cur_duration / 2)
+        new_duration = cur_duration / factor
+        new_begin = mid_point - (mid_point - self.min_time) / factor
+        new_begin = max(new_begin, 0)
+        new_end = min(new_begin + new_duration, self.model().file.duration)
+        if new_end - new_begin <= 0.025:
+            return
+        self.set_view_times(new_begin, new_end)
+
+    def pan(self, factor):
+        if self.min_time is None:
+            return
+        if factor < 1:
+            factor = 1 - factor
+            right = True
+        else:
+            right = False
+            factor = factor - 1
+        if right and self.max_time == self.model().file.duration:
+            return
+        if not right and self.min_time == 0:
+            return
+        cur_duration = self.max_time - self.min_time
+        shift = factor * cur_duration
+        if right:
+            new_begin = self.min_time + shift
+            new_end = self.max_time + shift
+        else:
+            new_begin = self.min_time - shift
+            new_end = self.max_time - shift
+        if new_begin < 0:
+            new_end = new_end + abs(new_begin)
+            new_begin = 0
+        if new_end > self.model().file.duration:
+            new_begin -= new_end - self.model().file.duration
+            new_end = self.model().file.duration
+        self.set_view_times(new_begin, new_end)
+
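
zoom() shrinks or grows the window by `factor` around a midpoint, so a factor of 2 halves the visible duration while keeping the midpoint fixed. With round numbers, ignoring the clamps to [0, file duration]:

min_time, max_time, factor = 10.0, 20.0, 2.0

cur_duration = max_time - min_time                       # 10.0
mid_point = min_time + cur_duration / 2                  # 15.0
new_begin = mid_point - (mid_point - min_time) / factor  # 12.5
new_end = new_begin + cur_duration / factor              # 17.5

assert (new_begin, new_end) == (12.5, 17.5)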
+    def zoom_in(self):
+        if self.model().file is None:
+            return
+        self.zoom(1.5)
+
+    def zoom_out(self):
+        if self.model().file is None:
+            return
+        self.zoom(0.5)
+
+    def zoom_to_selection(self):
+        if self.selected_min_time is not None and self.selected_max_time is not None:
+            self.set_view_times(self.selected_min_time, self.selected_max_time)
+
+    def update_from_slider(self, value):
+        if not self.max_time:
+            return
+        cur_window = self.max_time - self.min_time
+        self.set_view_times(value, value + cur_window)
+
+    def update_selection_audio(self, begin, end):
+        if begin < self.min_time:
+            begin = self.min_time
+        if end > self.max_time:
+            end = self.max_time
+        self.selected_min_time = begin
+        self.selected_max_time = end
+        self.selectionAudioChanged.emit()
+
+    def visible_utterances(self) -> typing.List[Utterance]:
+        file_utts = []
+        if not self.model().file:
+            return file_utts
+        if self.model().rowCount() > 1:
+            for u in self.model().utterances:
+                if u.begin >= self.max_time:
+                    break
+                if u.end <= self.min_time:
+                    continue
+                file_utts.append(u)
+        else:
+            file_utts.extend(self.model().utterances)
+        return file_utts
+
+    def model(self) -> FileUtterancesModel:
+        return super().model()
+
+    def set_view_times(self, begin, end):
+        begin = max(begin, 0)
+        end = min(end, self.model().file.duration)
+        if (begin, end) == (self.min_time, self.max_time):
+            return
+        self.min_time = begin
+        self.max_time = end
+        if (
+            self.selected_max_time is not None
+            and not self.min_time <= self.selected_min_time <= self.max_time
+        ):
+            self.selected_min_time = self.min_time
+        if (
+            self.selected_max_time is not None
+            and not self.min_time <= self.selected_max_time <= self.max_time
+        ):
+            self.selected_max_time = None
+        self.viewChanged.emit(self.min_time, self.max_time)
+
+    def set_current_file(self, info, force_update=False):
+        file_id, begin, end, utterance_id, speaker_id = info
+        try:
+            new_file = self.model().file is None or self.model().file.id != file_id
+        except sqlalchemy.orm.exc.DetachedInstanceError:
+            new_file = True
+        self.requested_utterance_id = utterance_id
+        if new_file:
+            self.fileAboutToChange.emit()
+            self.model().set_file(file_id)
+            self.speakerRequested.emit(speaker_id)
+        else:
+            self.finalize_set_new_file()
+            self.speakerRequested.emit(speaker_id)
+        self.set_view_times(begin, end)
+
+    def finalize_set_new_file(self):
+        if self.requested_utterance_id is None:
+            return
+        utterance = self.model().get_utterance(self.requested_utterance_id)
+        if utterance is None:
+            return
+        self.update_select(self.requested_utterance_id, reset=True)
+        self.selected_channel = 0
+        if utterance is not None and utterance.channel is not None:
+            self.selected_channel = utterance.channel
+        self.fileChanged.emit()
+
+    def checkSelected(self, utterance_id: int):
+        m = self.model()
+        for index in self.selectedRows(0):
+            if utterance_id == m._indices[index.row()]:
+                return True
+        return False
+
+    def update_selected_utterances(self, utterances):
+        super().clearSelection()
+        super().clearCurrentIndex()
+        if not utterances:
+            return
+        flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+        flags |= QtCore.QItemSelectionModel.SelectionFlag.Select
+        for u in utterances:
+            if u.id not in self.model().reversed_indices:
+                continue
+            row = self.model().reversed_indices[u.id]
+
+            index = self.model().index(row, 0)
+            if not index.isValid():
+                return
+            self.select(index, flags)
+        self.currentUtteranceChanged.emit()
+
+    def update_select(self, utterance_id: int, deselect=False, reset=False):
+        if reset and [x.id for x in self.selected_utterances()] == [utterance_id]:
+            return
+        flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+        if reset:
+            flags |= QtCore.QItemSelectionModel.SelectionFlag.ClearAndSelect
+        elif deselect:
+            flags |= QtCore.QItemSelectionModel.SelectionFlag.Deselect
+        else:
+            flags |= QtCore.QItemSelectionModel.SelectionFlag.Select
+        if utterance_id not in self.model().reversed_indices:
+            return
+        row = self.model().reversed_indices[utterance_id]
+
+        index = self.model().index(row, 0)
+        if not index.isValid():
+            return
+        self.select(index, flags)
+        if not deselect:
+            self.select_audio(self.model().utterances[row].begin, self.model().utterances[row].end)
+        self.currentUtteranceChanged.emit()
+
+
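
update_select() in the new FileSelectionModel composes QItemSelectionModel flags: Rows is always present, plus exactly one of ClearAndSelect, Deselect, or Select depending on the arguments. The composition in isolation, using the same PySide6 enums:

from PySide6 import QtCore


def selection_flags(deselect=False, reset=False):
    flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
    if reset:
        flags |= QtCore.QItemSelectionModel.SelectionFlag.ClearAndSelect
    elif deselect:
        flags |= QtCore.QItemSelectionModel.SelectionFlag.Deselect
    else:
        flags |= QtCore.QItemSelectionModel.SelectionFlag.Select
    return flags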
 class CorpusSelectionModel(QtCore.QItemSelectionModel):
     fileChanged = QtCore.Signal()
     channelChanged = QtCore.Signal()
     resetView = QtCore.Signal()
     fileAboutToChange = QtCore.Signal()
-    viewChanged = QtCore.Signal(object, object)
+    fileViewRequested = QtCore.Signal(object)
     selectionAudioChanged = QtCore.Signal()
     currentTimeChanged = QtCore.Signal(object)
     currentUtteranceChanged = QtCore.Signal()
 
     def __init__(self, *args, **kwargs):
-        super(CorpusSelectionModel, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self.settings = AnchorSettings()
         self.min_time = 0
         self.max_time = 10
         self.selected_min_time = None
         self.selected_max_time = None
-        self.current_file: Optional[File] = None
         self.x = None
         self.y = None
         self.current_utterance_id = None
@@ -176,22 +918,8 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         # self.selectionChanged.connect(self.update_selection_audio)
         # self.selectionChanged.connect(self.update_selection_audio)
         # self.model().changeCommandFired.connect(self.expire_current)
-        self.selectionChanged.connect(self._update_selection)
         self.model().layoutChanged.connect(self.check_selection)
         self.model().unlockCorpus.connect(self.fileChanged.emit)
-        self.model().selectionRequested.connect(self.update_select_rows)
-
-    @property
-    def plot_min(self):
-        if self.settings.right_to_left:
-            return -self.max_time
-        return self.min_time
-
-    @property
-    def plot_max(self):
-        if self.settings.right_to_left:
-            return -self.min_time
-        return self.max_time
 
     def set_current_utterance(self, utterance_id):
         self.current_utterance_id = utterance_id
@@ -203,13 +931,8 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         elif self.model().rowCount() == 0:
             self.clearSelection()
 
-    def set_current_channel(self, channel):
-        self.selected_channel = channel
-        self.channelChanged.emit()
-
     def clearSelection(self) -> None:
         self.fileAboutToChange.emit()
-        self.current_file = None
         self.current_utterance_id = None
         self.min_time = None
         self.max_time = None
@@ -219,22 +942,6 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         super(CorpusSelectionModel, self).clearSelection()
         self.fileChanged.emit()
 
-    def update_selected_wavform(self, *args):
-        if self.min_time is None or self.current_file is None:
-            self.x = None
-            self.y = None
-        else:
-            self.x, self.y = self.current_file.sound_file.normalized_waveform(
-                self.min_time, self.max_time
-            )
-
-    def get_selected_wave_form(self):
-        if self.y is None:
-            return None, None
-        if len(self.y.shape) > 1 and self.y.shape[0] == 2:
-            return self.x, self.y[self.selected_channel, :]
-        return self.x, self.y
-
     def update_select_rows(self, rows: list[int]):
         super(CorpusSelectionModel, self).clearCurrentIndex()
         super(CorpusSelectionModel, self).clearSelection()
@@ -250,8 +957,29 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
                 | QtCore.QItemSelectionModel.SelectionFlag.Rows,
             )
 
+    def update_selected_utterances(self, utterances):
+        if not utterances:
+            return
+        first = True
+        for u in utterances:
+            if u.id not in self.model().reversed_indices:
+                continue
+            row = self.model().reversed_indices[u.id]
+
+            index = self.model().index(row, 0)
+            if not index.isValid():
+                return
+            if not first:
+                flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+                flags |= QtCore.QItemSelectionModel.SelectionFlag.Select
+            else:
+                flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
+                flags |= QtCore.QItemSelectionModel.SelectionFlag.ClearAndSelect
+                first = False
+            self.select(index, flags)
+
     def update_select(self, utterance_id: int, deselect=False, reset=False, focus=False):
-        if reset and [x.id for x in self.selectedUtterances()] == [utterance_id]:
+        if reset and self.selected_utterances() == [utterance_id]:
             return
         flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
         if reset:
@@ -266,58 +994,13 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         if focus:
             flags |= QtCore.QItemSelectionModel.SelectionFlag.Current
         if row == self.currentIndex().row():
-            self.update_view_times(force_update=True)
+            self.fileViewRequested.emit(self.model().audio_info_for_utterance(row))
 
         index = self.model().index(row, 0)
         if not index.isValid():
             return
         self.select(index, flags)
 
-    def select_audio(self, begin, end):
-        if end is not None and end - begin < 0.025:
-            end = None
-        self.selected_min_time = begin
-        self.selected_max_time = end
-        self.selectionAudioChanged.emit()
-
-    def request_start_time(self, start_time):
-        if start_time >= self.max_time:
-            return
-        if start_time < self.min_time:
-            return
-        self.selected_min_time = start_time
-        self.selected_max_time = None
-        self.selectionAudioChanged.emit()
-
-    def visible_utts(self) -> typing.List[Utterance]:
-        file_utts = []
-        if not self.current_file:
-            return file_utts
-        if self.current_file.num_utterances > 1:
-            for u in sorted(self.current_file.utterances, key=lambda x: x.begin):
-                if u.begin >= self.max_time:
-                    break
-                if u.end <= self.min_time:
-                    continue
-                file_utts.append(u)
-        else:
-            file_utts.extend(self.current_file.utterances)
-        return file_utts
-
-    def currentUtterance(self) -> Optional[Utterance]:
-        if self.current_utterance_id is None:
-            return
-        m = self.model()
-        utterance = (
-            m.session.query(Utterance)
-            .options(
-                joinedload(Utterance.file).joinedload(File.sound_file),
-                joinedload(Utterance.file).subqueryload(File.speakers),
-            )
-            .get(self.current_utterance_id)
-        )
-        return utterance
-
     def _update_selection(self):
         index = self.currentIndex()
         if not index.isValid():
@@ -326,22 +1009,20 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         self.current_utterance_id = m._indices[index.row()]
         self.currentUtteranceChanged.emit()
 
-    def selectedUtterances(self):
-        current_utterance = self.currentUtterance()
+    def selected_utterances(self):
+        current_utterance = self.current_utterance_id
         if current_utterance is None:
             return []
         utts = [current_utterance]
         m = self.model()
         for index in self.selectedRows(1):
-            if current_utterance is not None and m._indices[index.row()] == current_utterance.id:
+            if current_utterance is not None and m._indices[index.row()] == current_utterance:
                 continue
-            utt = m.utteranceAt(index)
+            utt = m.utterance_id_at(index)
             if utt is None:
                 continue
             if current_utterance is None:
                 current_utterance = utt
-            if utt.file_id != current_utterance.file_id:
-                continue
             utts.append(utt)
         return utts
 
@@ -356,142 +1037,23 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
         text = m.data(m.index(index.row(), m.text_column), QtCore.Qt.ItemDataRole.DisplayRole)
         return text
 
-    def zoom(self, factor, mid_point=None):
-        if factor == 0 or self.min_time is None:
-            return
-        cur_duration = self.max_time - self.min_time
-        if mid_point is None:
-            mid_point = self.min_time + (cur_duration / 2)
-        new_duration = cur_duration / factor
-        new_begin = mid_point - (mid_point - self.min_time) / factor
-        new_begin = max(new_begin, 0)
-        new_end = min(new_begin + new_duration, self.current_file.duration)
-        if new_end - new_begin <= 0.025:
-            return
-        self.set_view_times(new_begin, new_end)
-
-    def pan(self, factor):
-        if self.min_time is None:
-            return
-        if factor < 1:
-            factor = 1 - factor
-            right = True
-        else:
-            right = False
-            factor = factor - 1
-        if right and self.max_time == self.current_file.duration:
-            return
-        if not right and self.min_time == 0:
-            return
-        cur_duration = self.max_time - self.min_time
-        shift = factor * cur_duration
-        if right:
-            new_begin = self.min_time + shift
-            new_end = self.max_time + shift
-        else:
-            new_begin = self.min_time - shift
-            new_end = self.max_time - shift
-        if new_begin < 0:
-            new_end = new_end + abs(new_begin)
-            new_begin = 0
-        if new_end > self.current_file.duration:
-            new_begin -= self.current_file.duration - new_end
-            new_end = self.current_file.duration
-        self.set_view_times(new_begin, new_end)
-
-    def zoom_in(self):
-        if self.current_file is None:
-            return
-        self.zoom(1.5)
-
-    def zoom_out(self):
-        if self.current_file is None:
-            return
-        self.zoom(0.5)
-
-    def zoom_to_selection(self):
-        if self.selected_min_time is None or self.selected_max_time is None:
-            rows = self.selectedRows(1)
-            if not rows:
-                return
-            begin = None
-            end = None
-            for r in rows:
-                u = self.model().utteranceAt(r)
-                if u is None:
-                    continue
-                if u.file_id != self.current_file.id:
-                    continue
-                if begin is None or begin > u.begin:
-                    begin = u.begin
-                if end is None or end < u.end:
-                    end = u.end
-            self.set_view_times(begin, end)
-        else:
-            self.set_view_times(self.selected_min_time, self.selected_max_time)
-
-    def update_from_slider(self, value):
-        if not self.max_time:
-            return
-        cur_window = self.max_time - self.min_time
-        self.set_view_times(value, value + cur_window)
-
-    def update_selection_audio(self):
-        begins = self.selectedRows(self.model().begin_column)
-        ends = self.selectedRows(self.model().end_column)
-        begin = None
-        end = None
-        if len(begins) > 0:
-            for i, b in enumerate(begins):
-                b = self.model().data(b, QtCore.Qt.ItemDataRole.DisplayRole)
-                e = self.model().data(ends[i], QtCore.Qt.ItemDataRole.DisplayRole)
-                if begin is None or begin > b:
-                    begin = b
-                if end is None or end < e:
-                    end = e
-            if self.current_file is None or begin > self.current_file.duration:
-                begin = None
-                end = None
-            elif end > self.current_file.duration:
-                end = self.current_file.duration
-        self.selected_min_time = begin
-        self.selected_max_time = end
-        self.selectionAudioChanged.emit()
-
     def switch_utterance(self, new_index, old_index):
+        if not self.model().fully_loaded:
+            return
         if not isinstance(new_index, QtCore.QModelIndex):
             row = 0
         else:
             if not new_index.isValid():
                 return
             row = new_index.row()
-        utt = self.model().utteranceAt(row)
+        utt = self.model().utterance_id_at(row)
         if utt is None:
             return
-        if utt.id == self.current_utterance_id:
+        if utt == self.current_utterance_id:
            return
-        self.current_utterance_id = utt.id
+        self.current_utterance_id = utt
         self.currentUtteranceChanged.emit()
-        self.set_current_file(
-            utt.file_id, utt.begin, utt.end, channel=utt.channel, force_update=True
-        )
-
-    def update_view_times(self, *args, force_update=False):
-        utts = self.selectedUtterances()
-        if len(utts) == 0:
-            self.resetView.emit()
-            return
-        if len(utts) == 1:
-            force_update = True
-        begin = utts[0].begin
-        f_id = utts[0].file_id
-        end_ind = -1
-        while True:
-            if utts[end_ind].file_id == f_id:
-                end = utts[end_ind].end
-                break
-        self.set_current_file(f_id, begin, end, channel=utts[0].channel, force_update=force_update)
-        self.selected_min_time = self.min_time
+        self.fileViewRequested.emit(self.model().audio_info_for_utterance(row))
 
     def model(self) -> CorpusModel:
         return super(CorpusSelectionModel, self).model()
@@ -503,43 +1065,6 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
             return True
         return False
 
-    def set_current_file(self, file_id, begin=None, end=None, channel=None, force_update=False):
-        try:
-            new_file = self.current_file is None or self.current_file.id != file_id
-        except sqlalchemy.orm.exc.DetachedInstanceError:
-            new_file = True
-        if new_file:
-            self.selected_min_time = None
-            self.selected_max_time = None
-            self.fileAboutToChange.emit()
-            self.selected_channel = 0 if channel is None else channel
-            self.current_file = (
-                self.model().session.query(File).options(joinedload(File.sound_file)).get(file_id)
-            )
-            self.min_time = begin
-            self.max_time = end
-            self.fileChanged.emit()
-        elif (
-            self.current_file is not None
-            and begin is not None
-            and end is not None
-            and force_update
-        ):
-            self.selected_channel = channel
-            self.set_view_times(begin, end)
-
-    def set_view_times(self, begin, end):
-        begin = max(begin, 0)
-        end = min(end, self.current_file.duration)
-        if (begin, end) == (self.min_time, self.max_time):
-            return
-        self.min_time = begin
-        self.max_time = end
-        self.selected_min_time = self.min_time
-        if self.selected_max_time is not None and self.selected_max_time > self.max_time:
-            self.selected_max_time = None
-        self.viewChanged.emit(self.min_time, self.max_time)
-
     def focusUtterance(self, index):
         m = self.model()
         u = m.utteranceAt(index)
@@ -547,10 +1072,8 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
             self.min_time = 0
             self.max_time = 1
             self.fileAboutToChange()
-            self.current_file = None
             self.fileChanged.emit()
             return
-        self.current_file = u.file
         begin = u.begin
         end = u.end
         padding = 1
@@ -1450,6 +1973,7 @@ class CorpusModel(TableModel):
         self.speakers = {}
         self.speaker_id_mapping = {}
         self.utterances = None
+        self.session: sqlalchemy.orm.scoped_session = None
         self.utterance_count = 0
         self.speaker_count = 0
         self.file_count = 0
@@ -1494,29 +2018,46 @@ class CorpusModel(TableModel):
             return True
         return False
 
-    def update_utterance_table_row(self, utterance_id: int):
-        if utterance_id not in self.reversed_indices:
-            return
-        utterance = self.session.query(Utterance).get(utterance_id)
+    def update_utterance_table_row(self, utterance: typing.Union[int, Utterance]):
+        if isinstance(utterance, int):
+            utterance_id = utterance
+            if utterance_id not in self.reversed_indices:
+                return
+            utterance = self.session.query(Utterance).get(utterance_id)
+        else:
+            utterance_id = utterance.id
+            if utterance_id not in self.reversed_indices:
+                return
         index = self.reversed_indices[utterance_id]
         self.layoutAboutToBeChanged.emit()
         self._data[index][self.text_column] = utterance.text
         self._data[index][self.begin_column] = utterance.begin
         self._data[index][self.end_column] = utterance.end
-        self._data[index][self.duration_column] = utterance.duration
+        self._data[index][self.duration_column] = utterance.end - utterance.begin
+        self.layoutChanged.emit()
+
+    def change_speaker_table_utterances(self, utterances: typing.List[Utterance]):
+        self.layoutAboutToBeChanged.emit()
+        for u in utterances:
+            if u.id not in self.reversed_indices:
+                continue
+            index = self.reversed_indices[u.id]
+            self._speaker_indices[index] = u.speaker_id
+            self._data[index][self.speaker_column] = self.get_speaker_name(u.speaker_id)
         self.layoutChanged.emit()
 
     def add_table_utterances(self, utterances: typing.List[Utterance]):
         self.layoutAboutToBeChanged.emit()
         rows = []
         for utterance in utterances:
+            speaker_name = self.get_speaker_name(utterance.speaker_id)
             row_data = [
                 utterance.oovs,
                 utterance.file_name,
-                utterance.speaker_name,
+                speaker_name,
                 utterance.begin,
                 utterance.end,
-                utterance.duration,
+                utterance.end - utterance.begin,
                 utterance.text,
             ]
             self._data.append(row_data)
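
The CorpusModel table edits in this hunk all follow the same Qt contract: emit layoutAboutToBeChanged, mutate the backing lists, then emit layoutChanged so attached views re-read the data. A toy model showing just that bracketing (illustrative, not code from the package):

from PySide6 import QtCore


class ToyListModel(QtCore.QAbstractListModel):
    def __init__(self):
        super().__init__()
        self.rows = []

    def rowCount(self, parent=None):
        return len(self.rows)

    def data(self, index, role=QtCore.Qt.ItemDataRole.DisplayRole):
        if role == QtCore.Qt.ItemDataRole.DisplayRole:
            return self.rows[index.row()]

    def replace_rows(self, new_rows):
        self.layoutAboutToBeChanged.emit()  # views snapshot their state
        self.rows = list(new_rows)          # mutate the backing store
        self.layoutChanged.emit()           # views re-read and repaint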
@@ -1531,7 +2072,10 @@ class CorpusModel(TableModel):
     def delete_table_utterances(self, utterances: typing.List[Utterance]):
         self.layoutAboutToBeChanged.emit()
         for utterance in utterances:
-            index = self.reversed_indices.pop(utterance.id)
+            try:
+                index = self.reversed_indices.pop(utterance.id)
+            except KeyError:
+                continue
             _ = self._data.pop(index)
             _ = self._indices.pop(index)
             _ = self._file_indices.pop(index)
@@ -1552,7 +2096,6 @@ class CorpusModel(TableModel):
 
         self.layoutAboutToBeChanged.emit()
         first = split_utterances[0]
-        self.session.merge(first)
         file_name = self._data[index][1]
         speaker_name = self._data[index][2]
         row_data = [
@@ -1561,7 +2104,7 @@ class CorpusModel(TableModel):
             speaker_name,
             first.begin,
             first.end,
-            first.duration,
+            first.end - first.begin,
             first.text,
         ]
         self._data[index] = row_data
@@ -1571,7 +2114,6 @@ class CorpusModel(TableModel):
         self.reversed_indices[first.id] = index
         rows = [index]
         for utterance in split_utterances[1:]:
-            self.session.merge(utterance)
            index += 1
            rows.append(index)
            self.reversed_indices = {
@@ -1584,7 +2126,7 @@ class CorpusModel(TableModel):
                 speaker_name,
                 utterance.begin,
                 utterance.end,
-                utterance.duration,
+                utterance.end - utterance.begin,
                 utterance.text,
             ]
             self.reversed_indices[utterance.id] = index
@@ -1603,14 +2145,13 @@ class CorpusModel(TableModel):
         except KeyError:
             return
         self.layoutAboutToBeChanged.emit()
-        self.session.merge(merged_utterance)
         row_data = [
             merged_utterance.oovs,
             merged_utterance.file_name,
             merged_utterance.speaker_name,
             merged_utterance.begin,
             merged_utterance.end,
-            merged_utterance.duration,
+            merged_utterance.end - merged_utterance.begin,
             merged_utterance.text,
         ]
         first = split_utterances[0]
@@ -1659,32 +2200,6 @@ class CorpusModel(TableModel):
         self.language_model = language_model
         self.languageModelChanged.emit()
 
-    def create_utterance(self, file: File, speaker: Optional[Speaker], begin: float, end: float):
-        if not self.editable:
-            return
-        channel = 0
-        if file.num_channels > 1:
-            ind = file.speaker_ordering.index(speaker)
-            if ind >= len(file.speaker_ordering) / 2:
-                channel = 1
-        if speaker is None:
-            speaker = self.corpus.add_speaker("speech", session=self.session)
-        begin = round(begin, 4)
-        end = round(end, 4)
-        text = ""
-        next_pk = self.corpus.get_next_primary_key(Utterance)
-        new_utt = Utterance(
-            id=next_pk,
-            speaker_id=speaker.id,
-            file_id=file.id,
-            begin=begin,
-            end=end,
-            channel=channel,
-            text=text,
-        )
-        self.addCommand.emit(undo.CreateUtteranceCommand(new_utt, self))
-        self.unsaved_files.add(file.id)
-
     def set_file_modified(self, file_id: typing.Union[int, typing.List[int]]):
         if isinstance(file_id, int):
             file_id = [file_id]
@@ -1699,32 +2214,6 @@ class CorpusModel(TableModel):
         )
         self.session.commit()
 
-    def update_utterance_text(self, utterance: Utterance, text):
-        if text != utterance.text:
-            self.addCommand.emit(undo.UpdateUtteranceTextCommand(utterance, text, self))
-            self.set_file_modified(utterance.file_id)
-
-    def update_utterance_times(
-        self, utterance: Utterance, begin: Optional[float] = None, end: Optional[float] = None
-    ):
-        if not self.editable:
-            return
-        self.addCommand.emit(undo.UpdateUtteranceTimesCommand(utterance, begin, end, self))
-        self.set_file_modified(utterance.file_id)
-
-    def update_utterance_speaker(self, utterance: Utterance, speaker: Speaker):
-        if not self.editable:
-            return
-        self.addCommand.emit(undo.UpdateUtteranceSpeakerCommand(utterance, speaker, self))
-
-    def delete_utterances(self, utterances: list[Utterance]):
-        if not self.editable:
-            return
-        for u in utterances:
-            self.set_file_modified(u.file_id)
-            self.set_speaker_modified(u.speaker_id)
-        self.addCommand.emit(undo.DeleteUtteranceCommand(utterances, self))
-
     def check_align_lexicon_compiler(self):
         if self.acoustic_model is None:
             return
@@ -1743,150 +2232,13 @@ class CorpusModel(TableModel):
             dictionary_id, self.acoustic_model, disambiguation=True
         )
 
-    def split_vad_utterance(
-        self, original_utterance_id, replacement_utterance_data: typing.List[KalpyUtterance]
-    ):
-        utt = self.session.get(Utterance, original_utterance_id)
-        replacement_utterances = []
-        speaker_id = utt.speaker_id
-        file_id = utt.file_id
-        next_pk = self.corpus.get_next_primary_key(Utterance)
-        for new_utt in replacement_utterance_data:
-            replacement_utterances.append(
-                Utterance(
-                    id=next_pk,
-                    begin=new_utt.segment.begin,
-                    end=new_utt.segment.end,
-                    speaker_id=speaker_id,
-                    file_id=file_id,
-                    text=new_utt.transcript,
-                    normalized_text=new_utt.transcript,
-                    features="",
-                    in_subset=False,
-                    ignored=False,
-                    channel=new_utt.segment.channel,
-                )
-            )
-            next_pk += 1
-        splitting_utterances = [[utt, *replacement_utterances]]
-        self.addCommand.emit(
-            undo.SplitUtteranceCommand(splitting_utterances, self, update_table=False)
-        )
-        self.requestFileView.emit(utt.file_name)
-        self.set_file_modified(file_id)
-        self.set_speaker_modified(speaker_id)
-
-    def split_utterances(self, utterances: list[Utterance]):
-        if not self.editable:
-            return
-        splitting_utterances = []
-        for utt in utterances:
-            duration = utt.duration
-            beg = utt.begin
-            end = utt.end
-            first_text = ""
-            second_text = ""
-            if " " not in utt.text and " " in utt.normalized_text:
-                t = utt.normalized_text.split()
-                mid_ind = int(len(t) / 2)
-                first_text = t[:mid_ind]
-                second_text = t[mid_ind:]
-            elif utt.text:
-                t = utt.text.split()
-                mid_ind = int(len(t) / 2)
-                first_text = t[:mid_ind]
-                second_text = t[mid_ind:]
-            split_time = beg + (duration / 2)
-            oovs = set()
-            for w in first_text:
-                if not self.dictionary_model.check_word(w, utt.speaker_id):
-                    oovs.add(w)
-            next_pk = self.corpus.get_next_primary_key(Utterance)
-            first_utt = Utterance(
-                id=next_pk,
-                speaker_id=utt.speaker_id,
-                file_id=utt.file_id,
-                begin=beg,
-                end=split_time,
-                channel=utt.channel,
-                text=" ".join(first_text),
-                normalized_text=" ".join(first_text),
-                oovs=" ".join(oovs),
-            )
-            next_pk += 1
-            oovs = set()
-            for w in second_text:
-                if not self.dictionary_model.check_word(w, utt.speaker_id):
-                    oovs.add(w)
-            second_utt = Utterance(
-                id=next_pk,
-                speaker_id=utt.speaker_id,
-                file_id=utt.file_id,
-                begin=split_time,
-                end=end,
-                channel=utt.channel,
-                text=" ".join(second_text),
-                normalized_text=" ".join(second_text),
-                oovs=" ".join(oovs),
-            )
-            splitting_utterances.append([utt, first_utt, second_utt])
-        self.addCommand.emit(undo.SplitUtteranceCommand(splitting_utterances, self))
-        self.set_file_modified([utt[0].file_id for utt in splitting_utterances])
-
     def merge_speakers(self, speakers: list[int]):
         self.addCommand.emit(undo.MergeSpeakersCommand(speakers, self))
 
-    def merge_utterances(self, utterances: list[Utterance]):
-        if not self.editable:
-            return
-        min_begin = 1000000000
-        max_end = 0
-        text = ""
-        normalized_text = ""
-        speaker = None
-        file = None
-        channel = None
-        for old_utt in sorted(utterances, key=lambda x: x.begin):
-            if speaker is None:
-                speaker = old_utt.speaker
-            if file is None:
-                file = old_utt.file
-            if channel is None:
-                channel = old_utt.channel
-            if old_utt.begin < min_begin:
-                min_begin = old_utt.begin
-            if old_utt.end > max_end:
-                max_end = old_utt.end
-            utt_text = old_utt.text
-            if utt_text == "speech" and text.strip() == "speech":
-                continue
-            text += utt_text + " "
-            normalized_text += old_utt.normalized_text + " "
-        text = text[:-1]
-        normalized_text = normalized_text[:-1]
-        next_pk = self.corpus.get_next_primary_key(Utterance)
-        oovs = set()
-        for w in text.split():
-            if not self.dictionary_model.check_word(w, speaker.id):
-                oovs.add(w)
-        new_utt = Utterance(
-            id=next_pk,
-            speaker=speaker,
-            file=file,
-            begin=min_begin,
-            end=max_end,
-            channel=channel,
-            text=text,
-            normalized_text=normalized_text,
-            oovs=" ".join(oovs),
-        )
-        self.set_file_modified(file.id)
-        self.addCommand.emit(undo.MergeUtteranceCommand(utterances, new_utt, self))
-
     def replace_all(self, search_query: TextFilterQuery, replacement: str):
         self.addCommand.emit(undo.ReplaceAllCommand(search_query, replacement, self))
 
-    def utteranceAt(self, index) -> Optional[Utterance]:
+    def utterance_id_at(self, index) -> Optional[int]:
         if not isinstance(index, int):
             if not index.isValid():
                 return None
@@ -1895,15 +2247,16 @@ class CorpusModel(TableModel):
             return None
         if len(self._indices) == 0:
             return None
-        utterance = (
-            self.session.query(Utterance)
-            .options(
-                joinedload(Utterance.file).joinedload(File.sound_file),
-                joinedload(Utterance.file).subqueryload(File.speakers),
-            )
-            .get(self._indices[index])
+        return self._indices[index]
+
+    def audio_info_for_utterance(self, row: int):
+        return (
+            self._file_indices[row],
+            self._data[row][self.begin_column],
+            self._data[row][self.end_column],
+            self._indices[row],
+            self._speaker_indices[row],
         )
-        return utterance
 
     def fileAt(self, index) -> int:
         if not isinstance(index, int):
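
Taken together, the new plumbing swaps ORM objects for plain ids and tuples: audio_info_for_utterance(row) packs (file_id, begin, end, utterance_id, speaker_id), CorpusSelectionModel emits it through fileViewRequested, and FileSelectionModel.set_current_file unpacks it to load the file and restore the view. A sketch of the payload with made-up values:

info = (3, 1.25, 4.75, 42, 7)  # (file_id, begin, end, utterance_id, speaker_id)

file_id, begin, end, utterance_id, speaker_id = info
assert file_id == 3 and utterance_id == 42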