Anchor-annotator 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anchor/models.py CHANGED
@@ -31,7 +31,7 @@ from montreal_forced_aligner.utils import mfa_open
31
31
  from PySide6 import QtCore
32
32
  from sqlalchemy.orm import joinedload
33
33
 
34
- from anchor import undo
34
+ from anchor import undo, workers
35
35
  from anchor.settings import AnchorSettings
36
36
 
37
37
 
@@ -148,38 +148,480 @@ class TableModel(QtCore.QAbstractTableModel):
148
148
  return len(self._header_data)
149
149
 
150
150
 
151
- class CorpusSelectionModel(QtCore.QItemSelectionModel):
151
+ class FileUtterancesModel(QtCore.QAbstractListModel):
152
+ addCommand = QtCore.Signal(object)
153
+ selectionRequested = QtCore.Signal(object)
154
+
155
+ waveformReady = QtCore.Signal()
156
+ utterancesReady = QtCore.Signal()
157
+
158
def __init__(self, *args, **kwargs):
    """Create an empty model and wire up the background loading workers."""
    super().__init__(*args, **kwargs)
    # State for the currently loaded file; filled in by the worker callbacks.
    self.file = None
    self.y = None
    self.utterances = []
    self.speakers = []
    # Row-aligned with ``utterances``: utterance ids and speaker ids.
    self._indices = []
    self._speaker_indices = []
    # Maps utterance id -> row in ``utterances``.
    self.reversed_indices = {}
    self.speaker_channel_mapping = {}
    self.corpus_model: "CorpusModel" = None
    self.waveform_worker = workers.WaveformWorker()
    self.waveform_worker.signals.result.connect(self.finalize_loading_wave_form)
    self.speaker_tier_worker = workers.SpeakerTierWorker()
    self.speaker_tier_worker.signals.result.connect(self.finalize_loading_utterances)
173
+
174
def get_utterance(self, utterance_id: int) -> typing.Optional["Utterance"]:
    """Return the loaded utterance with ``utterance_id``, or ``None`` if it is not indexed."""
    row = self.reversed_indices.get(utterance_id)
    if row is None:
        return None
    return self.utterances[row]
179
+
180
def set_corpus_model(self, corpus_model: "CorpusModel"):
    """Attach the corpus model that this file model reads its session and flags from."""
    self.corpus_model = corpus_model
182
+
183
def clean_up_for_close(self):
    """Stop the waveform worker and the speaker tier worker."""
    for worker in (self.waveform_worker, self.speaker_tier_worker):
        worker.stop()
186
+
187
def set_file(self, file_id):
    """Switch the model to the file with ``file_id``.

    The file is fetched (with its sound file eagerly joined), the cached waveform is
    dropped, utterance loading is restarted, and the waveform worker is restarted on
    the new sound file path.
    """
    query = self.corpus_model.session.query(File).options(joinedload(File.sound_file))
    self.file = query.get(file_id)
    self.y = None
    self.get_utterances()
    worker = self.waveform_worker
    worker.stop()
    worker.set_params(self.file.sound_file.sound_file_path)
    worker.start()
196
+
197
def finalize_loading_utterances(self, results):
    """Store and index the utterances delivered by the speaker tier worker.

    ``results`` is an ``(utterances, file_id)`` pair. Results are discarded when no
    file is loaded or when they belong to a file other than the current one, so a
    late result from a previous request cannot overwrite the current state.
    Emits ``utterancesReady`` once the indices are rebuilt.
    """
    utterances, file_id = results
    # Guard against results arriving after the file was cleared, mirroring
    # finalize_loading_wave_form.
    if self.file is None or file_id != self.file.id:
        return
    self.utterances = utterances
    for row, utt in enumerate(utterances):
        if utt.speaker_id not in self.speakers:
            self.speakers.append(utt.speaker_id)
        self._speaker_indices.append(utt.speaker_id)
        self.reversed_indices[utt.id] = row
        self._indices.append(utt.id)
        # The first utterance seen for a speaker decides that speaker's channel.
        if self.file.num_channels > 1 and utt.speaker_id not in self.speaker_channel_mapping:
            self.speaker_channel_mapping[utt.speaker_id] = utt.channel
    self.utterancesReady.emit()
211
+
212
def finalize_loading_wave_form(self, results):
    """Accept a loaded waveform if it belongs to the current file.

    ``results`` is a ``(y, file_path)`` pair; it is ignored when no file is loaded or
    the path differs from the current sound file path. Emits ``waveformReady`` on
    acceptance.
    """
    y, file_path = results
    current = self.file
    if current is None:
        return
    if file_path != current.sound_file.sound_file_path:
        return
    self.y = y
    self.waveformReady.emit()
218
+
219
def get_utterances(self):
    """Clear the loaded utterances and start reloading them for the current file.

    All per-file indices are reset. When a file is set, the speaker tier worker is
    restarted for it; alignments are queried only if the corpus model reports any
    kind of alignment.
    """
    row_count = len(self.utterances)
    # Rows live directly under the root of this list model, and the ``last``
    # argument of beginRemoveRows is inclusive. Nothing is announced when the
    # model is already empty.
    if row_count:
        self.beginRemoveRows(QtCore.QModelIndex(), 0, row_count - 1)
    self.utterances = []
    self.speakers = []
    self._indices = []
    self._speaker_indices = []
    self.speaker_channel_mapping = {}
    self.reversed_indices = {}
    if row_count:
        self.endRemoveRows()
    if self.file is None:
        return
    worker = self.speaker_tier_worker
    worker.stop()
    worker.query_alignment = (
        self.corpus_model.has_alignments
        or self.corpus_model.has_reference_alignments
        or self.corpus_model.has_transcribed_alignments
    )
    worker.session = self.corpus_model.session
    worker.set_params(self.file.id)
    worker.start()
240
+
241
def create_utterance(self, speaker_id: Optional[int], begin: float, end: float):
    """Queue creation of an empty utterance in the current file.

    Does nothing when the corpus is not editable. When ``speaker_id`` is ``None`` a
    speaker named "speech" is added and used. For multi-channel files the channel is
    taken from the speaker's channel mapping (channel 0 is recorded for speakers not
    seen before). Times are rounded to 4 decimal places. The change is emitted as an
    undoable command and the file and speaker are marked modified.
    """
    if not self.corpus_model.editable:
        return
    if speaker_id is None:
        speaker_id = self.corpus_model.corpus.add_speaker(
            "speech", session=self.corpus_model.session
        ).id
    channel = 0
    if self.file.num_channels > 1:
        channel = self.speaker_channel_mapping.setdefault(speaker_id, 0)
    text = ""
    new_utt = Utterance(
        id=self.corpus_model.corpus.get_next_primary_key(Utterance),
        speaker_id=speaker_id,
        file_id=self.file.id,
        file=self.file,
        begin=round(begin, 4),
        end=round(end, 4),
        channel=channel,
        text=text,
        normalized_text=text,
        oovs=text,
    )
    self.addCommand.emit(undo.CreateUtteranceCommand(new_utt, self))
    self.corpus_model.set_file_modified(self.file.id)
    self.corpus_model.set_speaker_modified(speaker_id)
273
+
274
def add_table_utterances(self, utterances: typing.List["Utterance"]):
    """Insert ``utterances`` into the model, keeping rows ordered by begin time.

    Each utterance goes before the first existing row whose begin time is not
    earlier than its own, or at the end when every row begins earlier. The
    id -> row index is rebuilt afterwards and a selection of the new utterances is
    requested.
    """
    for utterance in utterances:
        # Appending must use len(), not len() - 1, or the newest utterance would
        # land in front of the last row and break the ordering.
        row = next(
            (i for i, u in enumerate(self.utterances) if not u.begin < utterance.begin),
            len(self.utterances),
        )
        # One row is inserted under the root; first and last are both ``row``.
        self.beginInsertRows(QtCore.QModelIndex(), row, row)
        self.utterances.insert(row, utterance)
        self._indices.insert(row, utterance.id)
        self._speaker_indices.insert(row, utterance.speaker_id)
        self.endInsertRows()
    self.reversed_indices = {u: j for j, u in enumerate(self._indices)}
    self.selectionRequested.emit(utterances)
293
+
294
def delete_table_utterances(self, utterances: typing.List["Utterance"]):
    """Remove ``utterances`` from the model and clear the requested selection.

    Utterances that are not indexed in this model are skipped. The id -> row index
    is rebuilt after every removal because later rows shift up.
    """
    for utterance in utterances:
        try:
            row = self.reversed_indices.pop(utterance.id)
        except KeyError:
            continue
        # A single row under the root is removed; ``last`` is inclusive.
        self.beginRemoveRows(QtCore.QModelIndex(), row, row)
        self.utterances.pop(row)
        self._indices.pop(row)
        self._speaker_indices.pop(row)
        self.reversed_indices = {u: j for j, u in enumerate(self._indices)}
        self.endRemoveRows()
    self.selectionRequested.emit(None)
308
+
309
def change_speaker_table_utterances(self, utterances: typing.List["Utterance"]):
    """Sync the per-row speaker bookkeeping with the utterances' current speakers.

    Utterances not indexed in this model are ignored. A speaker seen for the first
    time is registered together with the utterance's channel.
    """
    for utt in utterances:
        row = self.reversed_indices.get(utt.id)
        if row is None:
            continue
        new_speaker = utt.speaker_id
        if new_speaker not in self.speakers:
            self.speakers.append(new_speaker)
            self.speaker_channel_mapping[new_speaker] = utt.channel
        self._speaker_indices[row] = new_speaker
319
+
320
def merge_table_utterances(
    self, merged_utterance: "Utterance", split_utterances: typing.List["Utterance"]
):
    """Replace the rows for ``split_utterances`` with a single ``merged_utterance`` row."""
    self.delete_table_utterances(split_utterances)
    replacement = [merged_utterance]
    self.add_table_utterances(replacement)
325
+
326
def split_table_utterances(
    self, merged_utterance: "Utterance", split_utterances: typing.List["Utterance"]
):
    """Replace the ``merged_utterance`` row with rows for ``split_utterances``."""
    removed = [merged_utterance]
    self.delete_table_utterances(removed)
    self.add_table_utterances(split_utterances)
331
+
332
def update_utterance_text(self, utterance: "Utterance", text):
    """Queue an undoable text change for ``utterance``.

    Nothing happens when the corpus is not editable or the text is unchanged;
    otherwise the command is emitted and the current file is marked modified.
    """
    if not self.corpus_model.editable:
        return
    if text == utterance.text:
        return
    command = undo.UpdateUtteranceTextCommand(utterance, text, self)
    self.addCommand.emit(command)
    self.corpus_model.set_file_modified(self.file.id)
338
+
339
def refresh_utterances(self):
    """Refresh every loaded utterance through the corpus model's session."""
    for loaded in self.utterances:
        self.corpus_model.session.refresh(loaded)
342
+
343
def update_utterance_speaker(self, utterance: "Utterance", speaker_id: int):
    """Queue an undoable speaker change for ``utterance``.

    Skipped when the corpus is not editable or the speaker is unchanged. Otherwise
    the file, the new speaker and the previous speaker are marked modified.
    """
    corpus_model = self.corpus_model
    if not corpus_model.editable:
        return
    previous_speaker_id = utterance.speaker_id
    if previous_speaker_id == speaker_id:
        return
    command = undo.UpdateUtteranceSpeakerCommand(utterance, speaker_id, self)
    self.addCommand.emit(command)
    corpus_model.set_file_modified(self.file.id)
    for changed_speaker in (speaker_id, previous_speaker_id):
        corpus_model.set_speaker_modified(changed_speaker)
353
+
354
def update_utterance_times(
    self, utterance: "Utterance", begin: Optional[float] = None, end: Optional[float] = None
):
    """Queue an undoable boundary change for ``utterance``.

    Skipped when the corpus is not editable or when both ``begin`` and ``end``
    already equal the utterance's times; otherwise the file is marked modified.
    """
    if not self.corpus_model.editable:
        return
    unchanged = utterance.begin == begin and utterance.end == end
    if unchanged:
        return
    command = undo.UpdateUtteranceTimesCommand(utterance, begin, end, self)
    self.addCommand.emit(command)
    self.corpus_model.set_file_modified(self.file.id)
363
+
364
def split_vad_utterance(
    self, original_utterance_id, replacement_utterance_data: typing.List["KalpyUtterance"]
):
    """Queue replacement of one utterance by the segments in ``replacement_utterance_data``.

    Does nothing when no replacement data is given. Each replacement keeps the
    original speaker and takes its times, channel and transcript from the
    corresponding item; primary keys are consecutive, starting from the corpus's
    next free key. The split is emitted as an undoable command with
    ``update_table=False`` and the file and speaker are marked modified.
    """
    if not replacement_utterance_data:
        return
    utt = self.utterances[self.reversed_indices[original_utterance_id]]
    speaker_id = utt.speaker_id
    first_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
    replacement_utterances = [
        Utterance(
            id=first_pk + offset,
            begin=new_utt.segment.begin,
            end=new_utt.segment.end,
            speaker_id=speaker_id,
            file_id=self.file.id,
            text=new_utt.transcript,
            normalized_text=new_utt.transcript,
            features="",
            in_subset=False,
            ignored=False,
            channel=new_utt.segment.channel,
        )
        for offset, new_utt in enumerate(replacement_utterance_data)
    ]
    command = undo.SplitUtteranceCommand(
        utt, replacement_utterances, self, update_table=False
    )
    self.addCommand.emit(command)
    self.corpus_model.set_file_modified(self.file.id)
    self.corpus_model.set_speaker_modified(speaker_id)
395
+
396
def split_utterances(self, utterance: "Utterance"):
    """Queue a split of ``utterance`` into two halves at its temporal midpoint.

    Does nothing when the corpus is not editable. Words are divided at the middle
    token. The normalized text is used as the word source only when the raw text
    contains no space but the normalized text does; otherwise the raw text is used.
    Words that fail the dictionary check for the speaker are recorded as OOVs of
    their half. The split is emitted as an undoable command, file and speaker are
    marked modified, and a selection of the two new utterances is requested.
    """
    if not self.corpus_model.editable:
        return
    beg = utterance.begin
    end = utterance.end
    speaker_id = utterance.speaker_id
    split_time = beg + ((end - beg) / 2)

    tokens = []
    if (
        utterance.text
        and utterance.normalized_text
        and " " not in utterance.text
        and " " in utterance.normalized_text
    ):
        tokens = utterance.normalized_text.split()
    elif utterance.text:
        tokens = utterance.text.split()
    mid_ind = len(tokens) // 2
    first_text, second_text = tokens[:mid_ind], tokens[mid_ind:]

    def find_oovs(words):
        # Collect the words the dictionary model does not know for this speaker.
        unknown = set()
        for w in words:
            if not self.corpus_model.dictionary_model.check_word(w, speaker_id):
                unknown.add(w)
        return unknown

    def build_half(pk, half_begin, half_end, words, oovs):
        # Both halves share speaker, file and channel with the original.
        joined = " ".join(words)
        return Utterance(
            id=pk,
            speaker_id=speaker_id,
            file_id=self.file.id,
            begin=half_begin,
            end=half_end,
            channel=utterance.channel,
            text=joined,
            normalized_text=joined,
            oovs=" ".join(oovs),
        )

    first_oovs = find_oovs(first_text)
    next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
    first_utt = build_half(next_pk, beg, split_time, first_text, first_oovs)
    second_oovs = find_oovs(second_text)
    second_utt = build_half(next_pk + 1, split_time, end, second_text, second_oovs)

    halves = [first_utt, second_utt]
    self.addCommand.emit(undo.SplitUtteranceCommand(utterance, halves, self))
    self.corpus_model.set_file_modified(self.file.id)
    self.corpus_model.set_speaker_modified(speaker_id)
    self.selectionRequested.emit([first_utt, second_utt])
457
+
458
def merge_utterances(self, utterances: list["Utterance"]):
    """Queue a merge of ``utterances`` into one utterance spanning all of them.

    Does nothing when the corpus is not editable or the list is empty. Utterances
    are processed in order of begin time; speaker and channel come from the first
    one with a non-``None`` value. Texts are joined with single spaces, except that
    a "speech" utterance is skipped while the merged text so far is exactly
    "speech". Words failing the dictionary check are stored as OOVs. The merge is
    emitted as an undoable command, file and speaker are marked modified, and a
    selection of the merged utterance is requested.
    """
    if not self.corpus_model.editable:
        return
    if not utterances:
        return
    min_begin, max_end = 1000000000, 0
    speaker_id = channel = None
    text_parts = []
    normalized_parts = []
    for old_utt in sorted(utterances, key=lambda x: x.begin):
        if speaker_id is None:
            speaker_id = old_utt.speaker_id
        if channel is None:
            channel = old_utt.channel
        min_begin = min(min_begin, old_utt.begin)
        max_end = max(max_end, old_utt.end)
        utt_text = old_utt.text
        # Avoid producing "speech speech" from consecutive placeholder utterances.
        if utt_text == "speech" and " ".join(text_parts).strip() == "speech":
            continue
        text_parts.append(utt_text)
        normalized_parts.append(old_utt.normalized_text)
    text = " ".join(text_parts)
    normalized_text = " ".join(normalized_parts)
    next_pk = self.corpus_model.corpus.get_next_primary_key(Utterance)
    oovs = set()
    for w in text.split():
        if not self.corpus_model.dictionary_model.check_word(w, speaker_id):
            oovs.add(w)
    new_utt = Utterance(
        id=next_pk,
        speaker_id=speaker_id,
        file_id=self.file.id,
        begin=min_begin,
        end=max_end,
        channel=channel,
        text=text,
        normalized_text=normalized_text,
        oovs=" ".join(oovs),
    )
    self.addCommand.emit(undo.MergeUtteranceCommand(utterances, new_utt, self))
    self.corpus_model.set_file_modified(self.file.id)
    self.corpus_model.set_speaker_modified(speaker_id)
    self.selectionRequested.emit([new_utt])
505
+
506
def delete_utterances(self, utterances: typing.List["Utterance"]):
    """Queue an undoable deletion of ``utterances``.

    Does nothing when the corpus is not editable or the list is empty. The file and
    every speaker of a deleted utterance are marked modified.
    """
    if not self.corpus_model.editable or not utterances:
        return
    affected_speakers = {utt.speaker_id for utt in utterances}
    self.addCommand.emit(undo.DeleteUtteranceCommand(utterances, self))
    self.corpus_model.set_file_modified(self.file.id)
    for speaker_id in affected_speakers:
        self.corpus_model.set_speaker_modified(speaker_id)
516
+
517
def rowCount(self, parent=None):
    """Return the number of loaded utterances; ``parent`` is not used."""
    loaded = self.utterances
    return len(loaded)
519
+
520
def data(self, index, role=QtCore.Qt.ItemDataRole.DisplayRole):
    """Return the utterance object at ``index`` for the display role, else ``None``."""
    if role != QtCore.Qt.ItemDataRole.DisplayRole:
        return None
    return self.utterances[index.row()]
523
+
524
+
525
+ class FileSelectionModel(QtCore.QItemSelectionModel):
526
+ fileAboutToChange = QtCore.Signal()
152
527
  fileChanged = QtCore.Signal()
153
528
  channelChanged = QtCore.Signal()
154
529
  resetView = QtCore.Signal()
155
- fileAboutToChange = QtCore.Signal()
156
530
  viewChanged = QtCore.Signal(object, object)
157
531
  selectionAudioChanged = QtCore.Signal()
158
532
  currentTimeChanged = QtCore.Signal(object)
159
533
  currentUtteranceChanged = QtCore.Signal()
534
+ speakerRequested = QtCore.Signal(object)
535
+
536
+ spectrogramReady = QtCore.Signal()
537
+ waveformReady = QtCore.Signal()
538
+ pitchTrackReady = QtCore.Signal()
160
539
 
161
540
  def __init__(self, *args, **kwargs):
162
- super(CorpusSelectionModel, self).__init__(*args, **kwargs)
541
+ super().__init__(*args, **kwargs)
163
542
  self.settings = AnchorSettings()
164
543
  self.min_time = 0
165
544
  self.max_time = 10
166
545
  self.selected_min_time = None
167
546
  self.selected_max_time = None
168
- self.current_file: Optional[File] = None
169
547
  self.x = None
170
548
  self.y = None
171
- self.current_utterance_id = None
549
+ self.top_point = 2
550
+ self.bottom_point = 0
551
+ self.separator_point = 1
172
552
  self.selected_channel = 0
173
- # self.viewChanged.connect(self.update_selected_waveform)
174
- # self.fileChanged.connect(self.update_selected_waveform)
175
- self.currentRowChanged.connect(self.switch_utterance)
176
- # self.selectionChanged.connect(self.update_selection_audio)
177
- # self.selectionChanged.connect(self.update_selection_audio)
178
- # self.model().changeCommandFired.connect(self.expire_current)
179
- self.selectionChanged.connect(self._update_selection)
180
- self.model().layoutChanged.connect(self.check_selection)
181
- self.model().unlockCorpus.connect(self.fileChanged.emit)
182
- self.model().selectionRequested.connect(self.update_select_rows)
553
+ self.spectrogram = None
554
+ self.min_db = None
555
+ self.max_db = None
556
+ self.pitch_track_x = None
557
+ self.pitch_track_y = None
558
+ self.waveform_x = None
559
+ self.waveform_y = None
560
+ self.requested_utterance_id = None
561
+ self.auto_waveform_worker = workers.AutoWaveformWorker()
562
+ self.spectrogram_worker = workers.SpectrogramWorker()
563
+ self.pitch_track_worker = workers.PitchWorker()
564
+ self.auto_waveform_worker.signals.result.connect(self.finalize_loading_auto_wave_form)
565
+ self.spectrogram_worker.signals.result.connect(self.finalize_loading_spectrogram)
566
+ self.pitch_track_worker.signals.result.connect(self.finalize_loading_pitch_track)
567
+ self.model().waveformReady.connect(self.load_audio_selection)
568
+ self.model().utterancesReady.connect(self.finalize_set_new_file)
569
+ self.viewChanged.connect(self.load_audio_selection)
570
+ self.model().selectionRequested.connect(self.update_selected_utterances)
571
+
572
def selected_utterances(self):
    """Return the model's utterances for the currently selected rows."""
    m = self.model()
    return [m.utterances[index.row()] for index in self.selectedRows(0)]
579
+
580
def load_audio_selection(self):
    """Restart the display workers for the audio inside the current view window.

    Does nothing until the model has a waveform. The samples between ``min_time``
    and ``max_time`` (for the selected channel when the waveform has more than one
    dimension) are handed to the spectrogram worker and the auto waveform worker.
    The pitch track worker is only restarted for windows of at most 10 seconds.
    """
    if self.model().y is None:
        return
    sample_rate = self.model().file.sample_rate
    begin_samp = int(self.min_time * sample_rate)
    end_samp = int(self.max_time * self.model().file.sample_rate)
    samples = self.model().y
    if len(samples.shape) > 1:
        y = samples[begin_samp:end_samp, self.selected_channel]
    else:
        y = samples[begin_samp:end_samp]

    spectrogram = self.spectrogram_worker
    spectrogram.stop()
    spectrogram.set_params(
        y,
        self.model().file.sound_file.sample_rate,
        self.min_time,
        self.max_time,
        self.selected_channel,
    )
    spectrogram.start()

    if self.max_time - self.min_time <= 10:
        pitch = self.pitch_track_worker
        pitch.stop()
        pitch.set_params(
            y,
            self.model().file.sound_file.sample_rate,
            self.min_time,
            self.max_time,
            self.selected_channel,
            self.bottom_point,
            self.separator_point,
        )
        pitch.start()

    waveform = self.auto_waveform_worker
    waveform.stop()
    waveform.set_params(
        y,
        self.separator_point,
        self.top_point,
        self.min_time,
        self.max_time,
        self.selected_channel,
    )
    waveform.start()
620
+
621
def clean_up_for_close(self):
    """Stop the spectrogram, pitch track and auto waveform workers."""
    for worker in (
        self.spectrogram_worker,
        self.pitch_track_worker,
        self.auto_waveform_worker,
    ):
        worker.stop()
183
625
 
184
626
  @property
185
627
  def plot_min(self):
@@ -193,6 +635,292 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
193
635
  return -self.min_time
194
636
  return self.max_time
195
637
 
638
def finalize_loading_spectrogram(self, results):
    """Store a computed spectrogram if it still matches the plotted window.

    ``results`` is ``(stft, channel, begin, end, min_db, max_db)``. In right-to-left
    mode the spectrogram is flipped along axis 1 and the time bounds are negated and
    swapped before comparison. Results for a window other than
    ``plot_min``..``plot_max`` are dropped. Emits ``spectrogramReady`` on acceptance.
    """
    stft, channel, begin, end, min_db, max_db = results
    if self.settings.right_to_left:
        stft = np.flip(stft, 1)
        begin, end = -end, -begin
    if begin != self.plot_min or end != self.plot_max:
        return
    self.spectrogram = stft
    # Keep the dB range reported by the worker; the values must come from
    # ``results``, not from the attributes being assigned.
    self.min_db = min_db
    self.max_db = max_db
    self.spectrogramReady.emit()
649
+
650
def finalize_loading_pitch_track(self, results):
    """Store a computed pitch track if it still matches the plotted window.

    ``results`` is ``(pitch_track, voicing_track, channel, begin, end, min_f0,
    max_f0)``; only the pitch track and the time bounds are used here. In
    right-to-left mode the track is flipped along axis 0 and the bounds are negated
    and swapped. A ``None`` track is stored without emitting ``pitchTrackReady``.
    """
    pitch_track, voicing_track, channel, begin, end, min_f0, max_f0 = results
    if self.settings.right_to_left:
        # The worker may deliver no track; only flip real arrays.
        if pitch_track is not None:
            pitch_track = np.flip(pitch_track, 0)
        begin, end = -end, -begin
    if begin != self.plot_min or end != self.plot_max:
        return
    self.pitch_track_y = pitch_track
    if pitch_track is None:
        return
    self.pitch_track_x = np.linspace(
        start=self.plot_min,
        stop=self.plot_max,
        num=pitch_track.shape[0],
    )
    self.pitchTrackReady.emit()
667
+
668
def finalize_loading_auto_wave_form(self, results):
    """Store the display waveform if it still matches the plotted window.

    ``results`` is ``(y, begin, end, channel)``. In right-to-left mode ``y`` is
    flipped along axis 0 and the bounds are negated and swapped. The x axis is an
    even spacing between ``plot_min`` and ``plot_max`` with one point per sample of
    ``y``. Emits ``waveformReady`` on acceptance.
    """
    y, begin, end, channel = results
    if self.settings.right_to_left:
        y = np.flip(y, 0)
        begin, end = -end, -begin
    if begin != self.plot_min or end != self.plot_max:
        return
    self.waveform_x = np.linspace(start=self.plot_min, stop=self.plot_max, num=y.shape[0])
    self.waveform_y = y
    self.waveformReady.emit()
679
+
680
def select_audio(self, begin, end):
    """Set the selected audio region and emit ``selectionAudioChanged``.

    A region shorter than 0.025 is collapsed to a start point (``end`` becomes
    ``None``).
    """
    too_short = end is not None and end - begin < 0.025
    self.selected_min_time = begin
    self.selected_max_time = None if too_short else end
    self.selectionAudioChanged.emit()
686
+
687
def request_start_time(self, start_time):
    """Move the selection start to ``start_time`` and clear the selection end.

    Requests outside the current view (before ``min_time`` or at/after
    ``max_time``) are ignored. Emits ``selectionAudioChanged`` when applied.
    """
    if start_time >= self.max_time or start_time < self.min_time:
        return
    self.selected_min_time, self.selected_max_time = start_time, None
    self.selectionAudioChanged.emit()
695
+
696
def set_current_channel(self, channel):
    """Switch the displayed channel and reload the audio views if it changed."""
    if channel != self.selected_channel:
        self.selected_channel = channel
        self.load_audio_selection()
701
+
702
def get_selected_wave_form(self):
    """Return ``(x, y)`` for the selected channel, or ``(None, None)`` without data.

    When ``y`` has more than one dimension and a first dimension of size 2, only the
    row for ``selected_channel`` is returned; otherwise ``y`` is returned whole.
    """
    y = self.y
    if y is None:
        return None, None
    two_channel = len(y.shape) > 1 and y.shape[0] == 2
    if two_channel:
        return self.x, y[self.selected_channel, :]
    return self.x, y
708
+
709
def zoom(self, factor, mid_point=None):
    """Scale the view window by ``1 / factor`` around ``mid_point``.

    ``mid_point`` defaults to the centre of the current window. The new window is
    clamped to start at 0 or later and to end at the file duration or earlier.
    Nothing happens for a zero factor, when no view is set, or when the resulting
    window would be 0.025 or shorter.
    """
    if factor == 0 or self.min_time is None:
        return
    view_begin = self.min_time
    cur_duration = self.max_time - view_begin
    if mid_point is None:
        mid_point = view_begin + (cur_duration / 2)
    new_begin = max(mid_point - (mid_point - view_begin) / factor, 0)
    new_end = min(new_begin + cur_duration / factor, self.model().file.duration)
    if new_end - new_begin <= 0.025:
        return
    self.set_view_times(new_begin, new_end)
722
+
723
def pan(self, factor):
    """Shift the view window sideways by a fraction of its width.

    A ``factor`` below 1 moves the window towards the end of the file by
    ``1 - factor`` window widths; any other factor moves it towards the start by
    ``factor - 1`` widths. The window is not moved when it already touches the edge
    it would move towards. A window pushed past either edge is moved back inside
    the file instead of being shrunk.
    """
    if self.min_time is None:
        return
    right = factor < 1
    factor = 1 - factor if right else factor - 1
    if right and self.max_time == self.model().file.duration:
        return
    if not right and self.min_time == 0:
        return
    shift = factor * (self.max_time - self.min_time)
    if right:
        new_begin = self.min_time + shift
        new_end = self.max_time + shift
    else:
        new_begin = self.min_time - shift
        new_end = self.max_time - shift
    if new_begin < 0:
        new_end = new_end + abs(new_begin)
        new_begin = 0
    duration = self.model().file.duration
    if new_end > duration:
        # Pull the start back by the overshoot so the window keeps its width.
        new_begin = max(new_begin - (new_end - duration), 0)
        new_end = duration
    self.set_view_times(new_begin, new_end)
751
+
752
def zoom_in(self):
    """Zoom the view in by a factor of 1.5 when a file is loaded."""
    if self.model().file is not None:
        self.zoom(1.5)
756
+
757
def zoom_out(self):
    """Zoom the view out by a factor of 0.5 when a file is loaded."""
    if self.model().file is not None:
        self.zoom(0.5)
761
+
762
def zoom_to_selection(self):
    """Set the view window to the selected region; needs both selection bounds."""
    begin, end = self.selected_min_time, self.selected_max_time
    if begin is None or end is None:
        return
    self.set_view_times(begin, end)
765
+
766
def update_from_slider(self, value):
    """Move the view window to start at ``value``, keeping its current width.

    Ignored while ``max_time`` is unset or zero.
    """
    if not self.max_time:
        return
    window = self.max_time - self.min_time
    self.set_view_times(value, value + window)
771
+
772
def update_selection_audio(self, begin, end):
    """Set the selected region, clamped to the current view window.

    Emits ``selectionAudioChanged``.
    """
    self.selected_min_time = max(begin, self.min_time)
    self.selected_max_time = min(end, self.max_time)
    self.selectionAudioChanged.emit()
780
+
781
def visible_utterances(self) -> typing.List["Utterance"]:
    """Return the utterances overlapping the current view window.

    Returns an empty list when no file is loaded. With at most one row, all
    utterances are returned without checking the window. Otherwise rows are scanned
    in order and the scan stops at the first utterance starting at or after
    ``max_time``.
    """
    model = self.model()
    if not model.file:
        return []
    if not model.rowCount() > 1:
        return list(model.utterances)
    visible = []
    for utt in model.utterances:
        if utt.begin >= self.max_time:
            break
        if utt.end > self.min_time:
            visible.append(utt)
    return visible
795
+
796
def model(self) -> "FileUtterancesModel":
    """Return the attached model; the override only narrows the return annotation."""
    attached = super().model()
    return attached
798
+
799
def set_view_times(self, begin, end):
    """Set the view window, clamped to ``[0, file duration]``.

    Nothing happens when the clamped window equals the current one. When a
    selection end is set, a selection start outside the new window is moved to the
    window start and a selection end outside the window is cleared. Emits
    ``viewChanged`` with the new bounds.
    """
    begin = max(begin, 0)
    end = min(end, self.model().file.duration)
    if (begin, end) == (self.min_time, self.max_time):
        return
    self.min_time, self.max_time = begin, end
    if self.selected_max_time is not None:
        # NOTE(review): the start clamp is gated on selected_max_time, so a lone
        # selection start is never clamped; confirm this is intended.
        if not begin <= self.selected_min_time <= end:
            self.selected_min_time = begin
        if not begin <= self.selected_max_time <= end:
            self.selected_max_time = None
    self.viewChanged.emit(self.min_time, self.max_time)
817
+
818
def set_current_file(self, info, force_update=False):
    """Show the file, utterance and time window described by ``info``.

    ``info`` is ``(file_id, begin, end, utterance_id, speaker_id)``. When the file
    differs from the loaded one (or the loaded file is detached from its session)
    ``fileAboutToChange`` is emitted and the model loads the new file; otherwise the
    requested utterance is selected right away. ``speakerRequested`` is emitted in
    both cases and the view is moved to ``begin``..``end``. ``force_update`` is not
    used.
    """
    file_id, begin, end, utterance_id, speaker_id = info
    try:
        new_file = self.model().file is None or self.model().file.id != file_id
    except sqlalchemy.orm.exc.DetachedInstanceError:
        new_file = True
    self.requested_utterance_id = utterance_id
    if not new_file:
        self.finalize_set_new_file()
    else:
        self.fileAboutToChange.emit()
        self.model().set_file(file_id)
    self.speakerRequested.emit(speaker_id)
    self.set_view_times(begin, end)
833
+
834
def finalize_set_new_file(self):
    """Select the requested utterance once the model can resolve it.

    Does nothing when no utterance was requested or the model does not know it.
    Otherwise the selection is reset to that utterance, the selected channel follows
    the utterance's channel (0 when it has none), and ``fileChanged`` is emitted.
    """
    requested_id = self.requested_utterance_id
    if requested_id is None:
        return
    utterance = self.model().get_utterance(requested_id)
    if utterance is None:
        return
    self.update_select(self.requested_utterance_id, reset=True)
    self.selected_channel = 0
    if utterance.channel is not None:
        self.selected_channel = utterance.channel
    self.fileChanged.emit()
845
+
846
def checkSelected(self, utterance_id: int):
    """Return ``True`` when the utterance with ``utterance_id`` is among the selected rows."""
    m = self.model()
    return any(utterance_id == m._indices[index.row()] for index in self.selectedRows(0))
852
+
853
def update_selected_utterances(self, utterances):
    """Replace the selection with the rows of ``utterances``.

    The selection and current index are always cleared first. Utterances unknown to
    the model are skipped; an invalid model index aborts the update without emitting.
    ``currentUtteranceChanged`` is emitted after the rows were selected.
    """
    super().clearSelection()
    super().clearCurrentIndex()
    if not utterances:
        return
    selection_flag = QtCore.QItemSelectionModel.SelectionFlag
    flags = selection_flag.Rows | selection_flag.Select
    for utt in utterances:
        row = self.model().reversed_indices.get(utt.id)
        if row is None:
            continue
        index = self.model().index(row, 0)
        if not index.isValid():
            return
        self.select(index, flags)
    self.currentUtteranceChanged.emit()
870
+
871
def update_select(self, utterance_id: int, deselect=False, reset=False):
    """Select, deselect or exclusively select the row of ``utterance_id``.

    ``reset`` clears the existing selection first and wins over ``deselect``. A
    reset to the utterance that is already the only selected one is a no-op, as is
    an utterance unknown to the model or an invalid index. Unless deselecting, the
    audio selection follows the utterance's begin and end. Emits
    ``currentUtteranceChanged`` after the selection changed.
    """
    if reset and [x.id for x in self.selected_utterances()] == [utterance_id]:
        return
    selection_flag = QtCore.QItemSelectionModel.SelectionFlag
    if reset:
        action = selection_flag.ClearAndSelect
    elif deselect:
        action = selection_flag.Deselect
    else:
        action = selection_flag.Select
    flags = selection_flag.Rows | action
    row = self.model().reversed_indices.get(utterance_id)
    if row is None:
        return
    index = self.model().index(row, 0)
    if not index.isValid():
        return
    self.select(index, flags)
    if not deselect:
        target = self.model().utterances[row]
        self.select_audio(target.begin, target.end)
    self.currentUtteranceChanged.emit()
892
+
893
+
894
+ class CorpusSelectionModel(QtCore.QItemSelectionModel):
895
+ fileChanged = QtCore.Signal()
896
+ channelChanged = QtCore.Signal()
897
+ resetView = QtCore.Signal()
898
+ fileAboutToChange = QtCore.Signal()
899
+ fileViewRequested = QtCore.Signal(object)
900
+ selectionAudioChanged = QtCore.Signal()
901
+ currentTimeChanged = QtCore.Signal(object)
902
+ currentUtteranceChanged = QtCore.Signal()
903
+
904
def __init__(self, *args, **kwargs):
    """Initialise view and selection state and hook into the model's signals."""
    super().__init__(*args, **kwargs)
    self.settings = AnchorSettings()
    # View window and selected audio region.
    self.min_time, self.max_time = 0, 10
    self.selected_min_time = None
    self.selected_max_time = None
    self.x = None
    self.y = None
    self.current_utterance_id = None
    self.selected_channel = 0
    # Waveform updates on view/file/selection changes are no longer wired here.
    self.currentRowChanged.connect(self.switch_utterance)
    model = self.model()
    model.layoutChanged.connect(self.check_selection)
    model.unlockCorpus.connect(self.fileChanged.emit)
923
+
196
924
  def set_current_utterance(self, utterance_id):
197
925
  self.current_utterance_id = utterance_id
198
926
  self.currentUtteranceChanged.emit()
@@ -203,13 +931,8 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
203
931
  elif self.model().rowCount() == 0:
204
932
  self.clearSelection()
205
933
 
206
- def set_current_channel(self, channel):
207
- self.selected_channel = channel
208
- self.channelChanged.emit()
209
-
210
934
  def clearSelection(self) -> None:
211
935
  self.fileAboutToChange.emit()
212
- self.current_file = None
213
936
  self.current_utterance_id = None
214
937
  self.min_time = None
215
938
  self.max_time = None
@@ -219,22 +942,6 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
219
942
  super(CorpusSelectionModel, self).clearSelection()
220
943
  self.fileChanged.emit()
221
944
 
222
- def update_selected_wavform(self, *args):
223
- if self.min_time is None or self.current_file is None:
224
- self.x = None
225
- self.y = None
226
- else:
227
- self.x, self.y = self.current_file.sound_file.normalized_waveform(
228
- self.min_time, self.max_time
229
- )
230
-
231
- def get_selected_wave_form(self):
232
- if self.y is None:
233
- return None, None
234
- if len(self.y.shape) > 1 and self.y.shape[0] == 2:
235
- return self.x, self.y[self.selected_channel, :]
236
- return self.x, self.y
237
-
238
945
  def update_select_rows(self, rows: list[int]):
239
946
  super(CorpusSelectionModel, self).clearCurrentIndex()
240
947
  super(CorpusSelectionModel, self).clearSelection()
@@ -250,8 +957,29 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
250
957
  | QtCore.QItemSelectionModel.SelectionFlag.Rows,
251
958
  )
252
959
 
960
def update_selected_utterances(self, utterances):
    """Select the table rows of ``utterances``, replacing the previous selection.

    The first utterance found in the model clears the old selection; later ones are
    added to it. Utterances unknown to the model are skipped, and an invalid index
    stops the update. An empty argument leaves the selection untouched.
    """
    if not utterances:
        return
    selection_flag = QtCore.QItemSelectionModel.SelectionFlag
    first = True
    for utt in utterances:
        row = self.model().reversed_indices.get(utt.id)
        if row is None:
            continue
        index = self.model().index(row, 0)
        if not index.isValid():
            return
        action = selection_flag.ClearAndSelect if first else selection_flag.Select
        first = False
        self.select(index, selection_flag.Rows | action)
980
+
253
981
  def update_select(self, utterance_id: int, deselect=False, reset=False, focus=False):
254
- if reset and [x.id for x in self.selectedUtterances()] == [utterance_id]:
982
+ if reset and self.selected_utterances() == [utterance_id]:
255
983
  return
256
984
  flags = QtCore.QItemSelectionModel.SelectionFlag.Rows
257
985
  if reset:
@@ -266,58 +994,13 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
266
994
  if focus:
267
995
  flags |= QtCore.QItemSelectionModel.SelectionFlag.Current
268
996
  if row == self.currentIndex().row():
269
- self.update_view_times(force_update=True)
997
+ self.fileViewRequested.emit(self.model().audio_info_for_utterance(row))
270
998
 
271
999
  index = self.model().index(row, 0)
272
1000
  if not index.isValid():
273
1001
  return
274
1002
  self.select(index, flags)
275
1003
 
276
- def select_audio(self, begin, end):
277
- if end is not None and end - begin < 0.025:
278
- end = None
279
- self.selected_min_time = begin
280
- self.selected_max_time = end
281
- self.selectionAudioChanged.emit()
282
-
283
- def request_start_time(self, start_time):
284
- if start_time >= self.max_time:
285
- return
286
- if start_time < self.min_time:
287
- return
288
- self.selected_min_time = start_time
289
- self.selected_max_time = None
290
- self.selectionAudioChanged.emit()
291
-
292
- def visible_utts(self) -> typing.List[Utterance]:
293
- file_utts = []
294
- if not self.current_file:
295
- return file_utts
296
- if self.current_file.num_utterances > 1:
297
- for u in sorted(self.current_file.utterances, key=lambda x: x.begin):
298
- if u.begin >= self.max_time:
299
- break
300
- if u.end <= self.min_time:
301
- continue
302
- file_utts.append(u)
303
- else:
304
- file_utts.extend(self.current_file.utterances)
305
- return file_utts
306
-
307
- def currentUtterance(self) -> Optional[Utterance]:
308
- if self.current_utterance_id is None:
309
- return
310
- m = self.model()
311
- utterance = (
312
- m.session.query(Utterance)
313
- .options(
314
- joinedload(Utterance.file).joinedload(File.sound_file),
315
- joinedload(Utterance.file).subqueryload(File.speakers),
316
- )
317
- .get(self.current_utterance_id)
318
- )
319
- return utterance
320
-
321
1004
  def _update_selection(self):
322
1005
  index = self.currentIndex()
323
1006
  if not index.isValid():
@@ -326,22 +1009,20 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
326
1009
  self.current_utterance_id = m._indices[index.row()]
327
1010
  self.currentUtteranceChanged.emit()
328
1011
 
329
- def selectedUtterances(self):
330
- current_utterance = self.currentUtterance()
1012
+ def selected_utterances(self):
1013
+ current_utterance = self.current_utterance_id
331
1014
  if current_utterance is None:
332
1015
  return []
333
1016
  utts = [current_utterance]
334
1017
  m = self.model()
335
1018
  for index in self.selectedRows(1):
336
- if current_utterance is not None and m._indices[index.row()] == current_utterance.id:
1019
+ if current_utterance is not None and m._indices[index.row()] == current_utterance:
337
1020
  continue
338
- utt = m.utteranceAt(index)
1021
+ utt = m.utterance_id_at(index)
339
1022
  if utt is None:
340
1023
  continue
341
1024
  if current_utterance is None:
342
1025
  current_utterance = utt
343
- if utt.file_id != current_utterance.file_id:
344
- continue
345
1026
  utts.append(utt)
346
1027
  return utts
347
1028
 
@@ -356,206 +1037,40 @@ class CorpusSelectionModel(QtCore.QItemSelectionModel):
356
1037
  text = m.data(m.index(index.row(), m.text_column), QtCore.Qt.ItemDataRole.DisplayRole)
357
1038
  return text
358
1039
 
359
- def zoom(self, factor, mid_point=None):
360
- if factor == 0 or self.min_time is None:
361
- return
362
- cur_duration = self.max_time - self.min_time
363
- if mid_point is None:
364
- mid_point = self.min_time + (cur_duration / 2)
365
- new_duration = cur_duration / factor
366
- new_begin = mid_point - (mid_point - self.min_time) / factor
367
- new_begin = max(new_begin, 0)
368
- new_end = min(new_begin + new_duration, self.current_file.duration)
369
- if new_end - new_begin <= 0.025:
370
- return
371
- self.set_view_times(new_begin, new_end)
372
-
373
- def pan(self, factor):
374
- if self.min_time is None:
375
- return
376
- if factor < 1:
377
- factor = 1 - factor
378
- right = True
379
- else:
380
- right = False
381
- factor = factor - 1
382
- if right and self.max_time == self.current_file.duration:
383
- return
384
- if not right and self.min_time == 0:
385
- return
386
- cur_duration = self.max_time - self.min_time
387
- shift = factor * cur_duration
388
- if right:
389
- new_begin = self.min_time + shift
390
- new_end = self.max_time + shift
391
- else:
392
- new_begin = self.min_time - shift
393
- new_end = self.max_time - shift
394
- if new_begin < 0:
395
- new_end = new_end + abs(new_begin)
396
- new_begin = 0
397
- if new_end > self.current_file.duration:
398
- new_begin -= self.current_file.duration - new_end
399
- new_end = self.current_file.duration
400
- self.set_view_times(new_begin, new_end)
401
-
402
- def zoom_in(self):
403
- if self.current_file is None:
404
- return
405
- self.zoom(1.5)
406
-
407
- def zoom_out(self):
408
- if self.current_file is None:
409
- return
410
- self.zoom(0.5)
411
-
412
- def zoom_to_selection(self):
413
- if self.selected_min_time is None or self.selected_max_time is None:
414
- rows = self.selectedRows(1)
415
- if not rows:
416
- return
417
- begin = None
418
- end = None
419
- for r in rows:
420
- u = self.model().utteranceAt(r)
421
- if u is None:
422
- continue
423
- if u.file_id != self.current_file.id:
424
- continue
425
- if begin is None or begin > u.begin:
426
- begin = u.begin
427
- if end is None or end < u.end:
428
- end = u.end
429
- self.set_view_times(begin, end)
430
- else:
431
- self.set_view_times(self.selected_min_time, self.selected_max_time)
432
-
433
- def update_from_slider(self, value):
434
- if not self.max_time:
435
- return
436
- cur_window = self.max_time - self.min_time
437
- self.set_view_times(value, value + cur_window)
438
-
439
- def update_selection_audio(self):
440
- begins = self.selectedRows(self.model().begin_column)
441
- ends = self.selectedRows(self.model().end_column)
442
- begin = None
443
- end = None
444
- if len(begins) > 0:
445
- for i, b in enumerate(begins):
446
- b = self.model().data(b, QtCore.Qt.ItemDataRole.DisplayRole)
447
- e = self.model().data(ends[i], QtCore.Qt.ItemDataRole.DisplayRole)
448
- if begin is None or begin > b:
449
- begin = b
450
- if end is None or end < e:
451
- end = e
452
- if self.current_file is None or begin > self.current_file.duration:
453
- begin = None
454
- end = None
455
- elif end > self.current_file.duration:
456
- end = self.current_file.duration
457
- self.selected_min_time = begin
458
- self.selected_max_time = end
459
- self.selectionAudioChanged.emit()
460
-
461
1040
  def switch_utterance(self, new_index, old_index):
1041
+ if not self.model().fully_loaded:
1042
+ return
462
1043
  if not isinstance(new_index, QtCore.QModelIndex):
463
1044
  row = 0
464
1045
  else:
465
1046
  if not new_index.isValid():
466
1047
  return
467
1048
  row = new_index.row()
468
- utt = self.model().utteranceAt(row)
1049
+ utt = self.model().utterance_id_at(row)
469
1050
  if utt is None:
470
1051
  return
471
- if utt.id == self.current_utterance_id:
1052
+ if utt == self.current_utterance_id:
472
1053
  return
473
- self.current_utterance_id = utt.id
1054
+ self.current_utterance_id = utt
474
1055
  self.currentUtteranceChanged.emit()
475
- self.set_current_file(
476
- utt.file_id, utt.begin, utt.end, channel=utt.channel, force_update=True
477
- )
478
-
479
- def update_view_times(self, *args, force_update=False):
480
- utts = self.selectedUtterances()
481
- if len(utts) == 0:
482
- self.resetView.emit()
483
- return
484
- if len(utts) == 1:
485
- force_update = True
486
- begin = utts[0].begin
487
- f_id = utts[0].file_id
488
- end_ind = -1
489
- while True:
490
- if utts[end_ind].file_id == f_id:
491
- end = utts[end_ind].end
492
- break
493
- self.set_current_file(f_id, begin, end, channel=utts[0].channel, force_update=force_update)
494
- self.selected_min_time = self.min_time
1056
+ self.fileViewRequested.emit(self.model().audio_info_for_utterance(row))
495
1057
 
496
1058
  def model(self) -> CorpusModel:
497
1059
  return super(CorpusSelectionModel, self).model()
498
1060
 
499
- def checkSelected(self, utterance: Utterance):
500
- m = self.model()
501
- for index in self.selectedRows(1):
502
- if utterance.id == m._indices[index.row()]:
503
- return True
504
- return False
505
-
506
- def set_current_file(self, file_id, begin=None, end=None, channel=None, force_update=False):
507
- try:
508
- new_file = self.current_file is None or self.current_file.id != file_id
509
- except sqlalchemy.orm.exc.DetachedInstanceError:
510
- new_file = True
511
- if new_file:
512
- self.selected_min_time = None
513
- self.selected_max_time = None
514
- self.fileAboutToChange.emit()
515
- self.selected_channel = 0 if channel is None else channel
516
- self.current_file = (
517
- self.model().session.query(File).options(joinedload(File.sound_file)).get(file_id)
518
- )
519
- self.min_time = begin
520
- self.max_time = end
521
- self.fileChanged.emit()
522
- elif (
523
- self.current_file is not None
524
- and begin is not None
525
- and end is not None
526
- and force_update
527
- ):
528
- self.selected_channel = channel
529
- self.set_view_times(begin, end)
530
-
531
- def set_view_times(self, begin, end):
532
- begin = max(begin, 0)
533
- end = min(end, self.current_file.duration)
534
- if (begin, end) == (self.min_time, self.max_time):
535
- return
536
- self.min_time = begin
537
- self.max_time = end
538
- self.selected_min_time = self.min_time
539
- if self.selected_max_time is not None and self.selected_max_time > self.max_time:
540
- self.selected_max_time = None
541
- self.viewChanged.emit(self.min_time, self.max_time)
542
-
543
- def focusUtterance(self, index):
1061
+ def focus_utterance(self, index):
544
1062
  m = self.model()
545
- u = m.utteranceAt(index)
546
- if u is None:
1063
+ row = index.row()
1064
+ utt_id = m.utterance_id_at(row)
1065
+ if utt_id is None:
547
1066
  self.min_time = 0
548
1067
  self.max_time = 1
549
1068
  self.fileAboutToChange()
550
- self.current_file = None
551
1069
  self.fileChanged.emit()
552
1070
  return
553
- self.current_file = u.file
554
- begin = u.begin
555
- end = u.end
556
- padding = 1
557
- self.set_view_times(begin - padding, end + padding)
558
- self.selectionAudioChanged.emit()
1071
+ self.current_utterance_id = utt_id
1072
+ self.currentUtteranceChanged.emit()
1073
+ self.fileViewRequested.emit(self.model().audio_info_for_utterance(row))
559
1074
 
560
1075
 
561
1076
  class OovModel(TableModel):
@@ -1450,6 +1965,7 @@ class CorpusModel(TableModel):
1450
1965
  self.speakers = {}
1451
1966
  self.speaker_id_mapping = {}
1452
1967
  self.utterances = None
1968
+ self.session: sqlalchemy.orm.scoped_session = None
1453
1969
  self.utterance_count = 0
1454
1970
  self.speaker_count = 0
1455
1971
  self.file_count = 0
@@ -1494,29 +2010,46 @@ class CorpusModel(TableModel):
1494
2010
  return True
1495
2011
  return False
1496
2012
 
1497
- def update_utterance_table_row(self, utterance_id: int):
1498
- if utterance_id not in self.reversed_indices:
1499
- return
1500
- utterance = self.session.query(Utterance).get(utterance_id)
2013
+ def update_utterance_table_row(self, utterance: typing.Union[int, Utterance]):
2014
+ if isinstance(utterance, int):
2015
+ utterance_id = utterance
2016
+ if utterance_id not in self.reversed_indices:
2017
+ return
2018
+ utterance = self.session.query(Utterance).get(utterance_id)
2019
+ else:
2020
+ utterance_id = utterance.id
2021
+ if utterance_id not in self.reversed_indices:
2022
+ return
1501
2023
  index = self.reversed_indices[utterance_id]
1502
2024
  self.layoutAboutToBeChanged.emit()
1503
2025
  self._data[index][self.text_column] = utterance.text
1504
2026
  self._data[index][self.begin_column] = utterance.begin
1505
2027
  self._data[index][self.end_column] = utterance.end
1506
- self._data[index][self.duration_column] = utterance.duration
2028
+ self._data[index][self.duration_column] = utterance.end - utterance.begin
2029
+ self.layoutChanged.emit()
2030
+
2031
+ def change_speaker_table_utterances(self, utterances: typing.List[Utterance]):
2032
+ self.layoutAboutToBeChanged.emit()
2033
+ for u in utterances:
2034
+ if u.id not in self.reversed_indices:
2035
+ continue
2036
+ index = self.reversed_indices[u.id]
2037
+ self._speaker_indices[index] = u.speaker_id
2038
+ self._data[index][self.speaker_column] = self.get_speaker_name(u.speaker_id)
1507
2039
  self.layoutChanged.emit()
1508
2040
 
1509
2041
  def add_table_utterances(self, utterances: typing.List[Utterance]):
1510
2042
  self.layoutAboutToBeChanged.emit()
1511
2043
  rows = []
1512
2044
  for utterance in utterances:
2045
+ speaker_name = self.get_speaker_name(utterance.speaker_id)
1513
2046
  row_data = [
1514
2047
  utterance.oovs,
1515
2048
  utterance.file_name,
1516
- utterance.speaker_name,
2049
+ speaker_name,
1517
2050
  utterance.begin,
1518
2051
  utterance.end,
1519
- utterance.duration,
2052
+ utterance.end - utterance.begin,
1520
2053
  utterance.text,
1521
2054
  ]
1522
2055
  self._data.append(row_data)
@@ -1531,7 +2064,10 @@ class CorpusModel(TableModel):
1531
2064
  def delete_table_utterances(self, utterances: typing.List[Utterance]):
1532
2065
  self.layoutAboutToBeChanged.emit()
1533
2066
  for utterance in utterances:
1534
- index = self.reversed_indices.pop(utterance.id)
2067
+ try:
2068
+ index = self.reversed_indices.pop(utterance.id)
2069
+ except KeyError:
2070
+ continue
1535
2071
  _ = self._data.pop(index)
1536
2072
  _ = self._indices.pop(index)
1537
2073
  _ = self._file_indices.pop(index)
@@ -1552,7 +2088,6 @@ class CorpusModel(TableModel):
1552
2088
 
1553
2089
  self.layoutAboutToBeChanged.emit()
1554
2090
  first = split_utterances[0]
1555
- self.session.merge(first)
1556
2091
  file_name = self._data[index][1]
1557
2092
  speaker_name = self._data[index][2]
1558
2093
  row_data = [
@@ -1561,7 +2096,7 @@ class CorpusModel(TableModel):
1561
2096
  speaker_name,
1562
2097
  first.begin,
1563
2098
  first.end,
1564
- first.duration,
2099
+ first.end - first.begin,
1565
2100
  first.text,
1566
2101
  ]
1567
2102
  self._data[index] = row_data
@@ -1571,7 +2106,6 @@ class CorpusModel(TableModel):
1571
2106
  self.reversed_indices[first.id] = index
1572
2107
  rows = [index]
1573
2108
  for utterance in split_utterances[1:]:
1574
- self.session.merge(utterance)
1575
2109
  index += 1
1576
2110
  rows.append(index)
1577
2111
  self.reversed_indices = {
@@ -1584,7 +2118,7 @@ class CorpusModel(TableModel):
1584
2118
  speaker_name,
1585
2119
  utterance.begin,
1586
2120
  utterance.end,
1587
- utterance.duration,
2121
+ utterance.end - utterance.begin,
1588
2122
  utterance.text,
1589
2123
  ]
1590
2124
  self.reversed_indices[utterance.id] = index
@@ -1603,14 +2137,13 @@ class CorpusModel(TableModel):
1603
2137
  except KeyError:
1604
2138
  return
1605
2139
  self.layoutAboutToBeChanged.emit()
1606
- self.session.merge(merged_utterance)
1607
2140
  row_data = [
1608
2141
  merged_utterance.oovs,
1609
2142
  merged_utterance.file_name,
1610
2143
  merged_utterance.speaker_name,
1611
2144
  merged_utterance.begin,
1612
2145
  merged_utterance.end,
1613
- merged_utterance.duration,
2146
+ merged_utterance.end - merged_utterance.begin,
1614
2147
  merged_utterance.text,
1615
2148
  ]
1616
2149
  first = split_utterances[0]
@@ -1659,32 +2192,6 @@ class CorpusModel(TableModel):
1659
2192
  self.language_model = language_model
1660
2193
  self.languageModelChanged.emit()
1661
2194
 
1662
- def create_utterance(self, file: File, speaker: Optional[Speaker], begin: float, end: float):
1663
- if not self.editable:
1664
- return
1665
- channel = 0
1666
- if file.num_channels > 1:
1667
- ind = file.speaker_ordering.index(speaker)
1668
- if ind >= len(file.speaker_ordering) / 2:
1669
- channel = 1
1670
- if speaker is None:
1671
- speaker = self.corpus.add_speaker("speech", session=self.session)
1672
- begin = round(begin, 4)
1673
- end = round(end, 4)
1674
- text = ""
1675
- next_pk = self.corpus.get_next_primary_key(Utterance)
1676
- new_utt = Utterance(
1677
- id=next_pk,
1678
- speaker_id=speaker.id,
1679
- file_id=file.id,
1680
- begin=begin,
1681
- end=end,
1682
- channel=channel,
1683
- text=text,
1684
- )
1685
- self.addCommand.emit(undo.CreateUtteranceCommand(new_utt, self))
1686
- self.unsaved_files.add(file.id)
1687
-
1688
2195
  def set_file_modified(self, file_id: typing.Union[int, typing.List[int]]):
1689
2196
  if isinstance(file_id, int):
1690
2197
  file_id = [file_id]
@@ -1699,32 +2206,6 @@ class CorpusModel(TableModel):
1699
2206
  )
1700
2207
  self.session.commit()
1701
2208
 
1702
- def update_utterance_text(self, utterance: Utterance, text):
1703
- if text != utterance.text:
1704
- self.addCommand.emit(undo.UpdateUtteranceTextCommand(utterance, text, self))
1705
- self.set_file_modified(utterance.file_id)
1706
-
1707
- def update_utterance_times(
1708
- self, utterance: Utterance, begin: Optional[float] = None, end: Optional[float] = None
1709
- ):
1710
- if not self.editable:
1711
- return
1712
- self.addCommand.emit(undo.UpdateUtteranceTimesCommand(utterance, begin, end, self))
1713
- self.set_file_modified(utterance.file_id)
1714
-
1715
- def update_utterance_speaker(self, utterance: Utterance, speaker: Speaker):
1716
- if not self.editable:
1717
- return
1718
- self.addCommand.emit(undo.UpdateUtteranceSpeakerCommand(utterance, speaker, self))
1719
-
1720
- def delete_utterances(self, utterances: list[Utterance]):
1721
- if not self.editable:
1722
- return
1723
- for u in utterances:
1724
- self.set_file_modified(u.file_id)
1725
- self.set_speaker_modified(u.speaker_id)
1726
- self.addCommand.emit(undo.DeleteUtteranceCommand(utterances, self))
1727
-
1728
2209
  def check_align_lexicon_compiler(self):
1729
2210
  if self.acoustic_model is None:
1730
2211
  return
@@ -1743,150 +2224,13 @@ class CorpusModel(TableModel):
1743
2224
  dictionary_id, self.acoustic_model, disambiguation=True
1744
2225
  )
1745
2226
 
1746
- def split_vad_utterance(
1747
- self, original_utterance_id, replacement_utterance_data: typing.List[KalpyUtterance]
1748
- ):
1749
- utt = self.session.get(Utterance, original_utterance_id)
1750
- replacement_utterances = []
1751
- speaker_id = utt.speaker_id
1752
- file_id = utt.file_id
1753
- next_pk = self.corpus.get_next_primary_key(Utterance)
1754
- for new_utt in replacement_utterance_data:
1755
- replacement_utterances.append(
1756
- Utterance(
1757
- id=next_pk,
1758
- begin=new_utt.segment.begin,
1759
- end=new_utt.segment.end,
1760
- speaker_id=speaker_id,
1761
- file_id=file_id,
1762
- text=new_utt.transcript,
1763
- normalized_text=new_utt.transcript,
1764
- features="",
1765
- in_subset=False,
1766
- ignored=False,
1767
- channel=new_utt.segment.channel,
1768
- )
1769
- )
1770
- next_pk += 1
1771
- splitting_utterances = [[utt, *replacement_utterances]]
1772
- self.addCommand.emit(
1773
- undo.SplitUtteranceCommand(splitting_utterances, self, update_table=False)
1774
- )
1775
- self.requestFileView.emit(utt.file_name)
1776
- self.set_file_modified(file_id)
1777
- self.set_speaker_modified(speaker_id)
1778
-
1779
- def split_utterances(self, utterances: list[Utterance]):
1780
- if not self.editable:
1781
- return
1782
- splitting_utterances = []
1783
- for utt in utterances:
1784
- duration = utt.duration
1785
- beg = utt.begin
1786
- end = utt.end
1787
- first_text = ""
1788
- second_text = ""
1789
- if " " not in utt.text and " " in utt.normalized_text:
1790
- t = utt.normalized_text.split()
1791
- mid_ind = int(len(t) / 2)
1792
- first_text = t[:mid_ind]
1793
- second_text = t[mid_ind:]
1794
- elif utt.text:
1795
- t = utt.text.split()
1796
- mid_ind = int(len(t) / 2)
1797
- first_text = t[:mid_ind]
1798
- second_text = t[mid_ind:]
1799
- split_time = beg + (duration / 2)
1800
- oovs = set()
1801
- for w in first_text:
1802
- if not self.dictionary_model.check_word(w, utt.speaker_id):
1803
- oovs.add(w)
1804
- next_pk = self.corpus.get_next_primary_key(Utterance)
1805
- first_utt = Utterance(
1806
- id=next_pk,
1807
- speaker_id=utt.speaker_id,
1808
- file_id=utt.file_id,
1809
- begin=beg,
1810
- end=split_time,
1811
- channel=utt.channel,
1812
- text=" ".join(first_text),
1813
- normalized_text=" ".join(first_text),
1814
- oovs=" ".join(oovs),
1815
- )
1816
- next_pk += 1
1817
- oovs = set()
1818
- for w in second_text:
1819
- if not self.dictionary_model.check_word(w, utt.speaker_id):
1820
- oovs.add(w)
1821
- second_utt = Utterance(
1822
- id=next_pk,
1823
- speaker_id=utt.speaker_id,
1824
- file_id=utt.file_id,
1825
- begin=split_time,
1826
- end=end,
1827
- channel=utt.channel,
1828
- text=" ".join(second_text),
1829
- normalized_text=" ".join(second_text),
1830
- oovs=" ".join(oovs),
1831
- )
1832
- splitting_utterances.append([utt, first_utt, second_utt])
1833
- self.addCommand.emit(undo.SplitUtteranceCommand(splitting_utterances, self))
1834
- self.set_file_modified([utt[0].file_id for utt in splitting_utterances])
1835
-
1836
2227
  def merge_speakers(self, speakers: list[int]):
1837
2228
  self.addCommand.emit(undo.MergeSpeakersCommand(speakers, self))
1838
2229
 
1839
- def merge_utterances(self, utterances: list[Utterance]):
1840
- if not self.editable:
1841
- return
1842
- min_begin = 1000000000
1843
- max_end = 0
1844
- text = ""
1845
- normalized_text = ""
1846
- speaker = None
1847
- file = None
1848
- channel = None
1849
- for old_utt in sorted(utterances, key=lambda x: x.begin):
1850
- if speaker is None:
1851
- speaker = old_utt.speaker
1852
- if file is None:
1853
- file = old_utt.file
1854
- if channel is None:
1855
- channel = old_utt.channel
1856
- if old_utt.begin < min_begin:
1857
- min_begin = old_utt.begin
1858
- if old_utt.end > max_end:
1859
- max_end = old_utt.end
1860
- utt_text = old_utt.text
1861
- if utt_text == "speech" and text.strip() == "speech":
1862
- continue
1863
- text += utt_text + " "
1864
- normalized_text += old_utt.normalized_text + " "
1865
- text = text[:-1]
1866
- normalized_text = normalized_text[:-1]
1867
- next_pk = self.corpus.get_next_primary_key(Utterance)
1868
- oovs = set()
1869
- for w in text.split():
1870
- if not self.dictionary_model.check_word(w, speaker.id):
1871
- oovs.add(w)
1872
- new_utt = Utterance(
1873
- id=next_pk,
1874
- speaker=speaker,
1875
- file=file,
1876
- begin=min_begin,
1877
- end=max_end,
1878
- channel=channel,
1879
- text=text,
1880
- normalized_text=normalized_text,
1881
- oovs=" ".join(oovs),
1882
- )
1883
- self.set_file_modified(file.id)
1884
- self.addCommand.emit(undo.MergeUtteranceCommand(utterances, new_utt, self))
1885
-
1886
2230
  def replace_all(self, search_query: TextFilterQuery, replacement: str):
1887
2231
  self.addCommand.emit(undo.ReplaceAllCommand(search_query, replacement, self))
1888
2232
 
1889
- def utteranceAt(self, index) -> Optional[Utterance]:
2233
+ def utterance_id_at(self, index) -> Optional[Utterance]:
1890
2234
  if not isinstance(index, int):
1891
2235
  if not index.isValid():
1892
2236
  return None
@@ -1895,15 +2239,16 @@ class CorpusModel(TableModel):
1895
2239
  return None
1896
2240
  if len(self._indices) == 0:
1897
2241
  return None
1898
- utterance = (
1899
- self.session.query(Utterance)
1900
- .options(
1901
- joinedload(Utterance.file).joinedload(File.sound_file),
1902
- joinedload(Utterance.file).subqueryload(File.speakers),
1903
- )
1904
- .get(self._indices[index])
2242
+ return self._indices[index]
2243
+
2244
+ def audio_info_for_utterance(self, row: int):
2245
+ return (
2246
+ self._file_indices[row],
2247
+ self._data[row][self.begin_column],
2248
+ self._data[row][self.end_column],
2249
+ self._indices[row],
2250
+ self._speaker_indices[row],
1905
2251
  )
1906
- return utterance
1907
2252
 
1908
2253
  def fileAt(self, index) -> int:
1909
2254
  if not isinstance(index, int):