Anchor-annotator 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anchor/workers.py CHANGED
@@ -39,11 +39,11 @@ from montreal_forced_aligner.corpus.acoustic_corpus import (
39
39
  )
40
40
  from montreal_forced_aligner.corpus.classes import FileData
41
41
  from montreal_forced_aligner.data import (
42
- CtmInterval,
43
42
  DatasetType,
44
43
  DistanceMetric,
45
44
  Language,
46
45
  ManifoldAlgorithm,
46
+ PhoneType,
47
47
  TextFileType,
48
48
  WordType,
49
49
  WorkflowType,
@@ -57,6 +57,8 @@ from montreal_forced_aligner.db import (
57
57
  Phone,
58
58
  PhoneInterval,
59
59
  Pronunciation,
60
+ ReferencePhoneInterval,
61
+ ReferenceWordInterval,
60
62
  SoundFile,
61
63
  Speaker,
62
64
  SpeakerOrdering,
@@ -94,7 +96,7 @@ from montreal_forced_aligner.vad.models import FOUND_SPEECHBRAIN, MfaVAD
94
96
  from montreal_forced_aligner.vad.segmenter import TranscriptionSegmenter, VadSegmenter
95
97
  from montreal_forced_aligner.validation.corpus_validator import PretrainedValidator
96
98
  from PySide6 import QtCore
97
- from sklearn import discriminant_analysis, metrics, preprocessing
99
+ from sklearn import discriminant_analysis
98
100
  from sqlalchemy.orm import joinedload, selectinload, subqueryload
99
101
 
100
102
  import anchor.db
@@ -106,26 +108,6 @@ if typing.TYPE_CHECKING:
106
108
  logger = logging.getLogger("anchor")
107
109
 
108
110
 
109
- @dataclassy.dataclass(slots=True)
110
- class UtteranceData:
111
- id: int
112
- begin: float
113
- end: float
114
- channel: int
115
- text: str
116
- normalized_text: str
117
- transcription_text: str
118
- speaker_id: int
119
- file_id: int
120
- reference_phone_intervals: typing.List[CtmInterval]
121
- aligned_word_intervals: typing.List[CtmInterval]
122
- aligned_phone_intervals: typing.List[CtmInterval]
123
- transcribed_word_intervals: typing.List[CtmInterval]
124
- transcribed_phone_intervals: typing.List[CtmInterval]
125
- per_speaker_transcribed_word_intervals: typing.List[CtmInterval]
126
- per_speaker_transcribed_phone_intervals: typing.List[CtmInterval]
127
-
128
-
129
111
  @dataclassy.dataclass
130
112
  class SpeakerPlda:
131
113
  test_ivectors: typing.List[DoubleVector]
@@ -664,6 +646,9 @@ class ExportFilesWorker(Worker):
664
646
  if self.progress_callback is not None:
665
647
  self.progress_callback.update_total(files.count())
666
648
  for f in files:
649
+ if not f.utterances:
650
+ logger.debug(f"Skipping {f.name} for no utterances")
651
+ continue
667
652
  if self.stopped.is_set():
668
653
  session.rollback()
669
654
  break
@@ -784,8 +769,8 @@ class ChangeSpeakerWorker(Worker):
784
769
  per_utterance = isinstance(self.utterance_ids[0], list)
785
770
  with self.session() as session:
786
771
  try:
787
- if (not per_utterance and self.new_speaker_id <= 0) or any(
788
- x[-1] <= 0 for x in self.utterance_ids
772
+ if (not per_utterance and self.new_speaker_id <= 0) or (
773
+ per_utterance and any(x[-1] <= 0 for x in self.utterance_ids)
789
774
  ):
790
775
  new_speaker_id = session.query(sqlalchemy.func.max(Speaker.id)).scalar() + 1
791
776
  speaker = session.query(Speaker).get(self.old_speaker_id)
@@ -805,6 +790,8 @@ class ChangeSpeakerWorker(Worker):
805
790
  )
806
791
  )
807
792
  session.flush()
793
+ else:
794
+ new_speaker_id = self.new_speaker_id
808
795
  if not per_utterance:
809
796
  utterance_ids = self.utterance_ids
810
797
  if not utterance_ids:
@@ -1064,32 +1051,32 @@ class QueryUtterancesWorker(Worker):
1064
1051
  c = session.query(Corpus).first()
1065
1052
  count_only = self.kwargs.get("count", False)
1066
1053
  has_ivectors = self.kwargs.get("has_ivectors", False)
1067
- if count_only:
1068
- columns = [Utterance.id]
1069
- else:
1070
- columns = [
1071
- Utterance.id,
1072
- Utterance.file_id,
1073
- Utterance.speaker_id,
1074
- Utterance.oovs,
1075
- File.name,
1076
- Speaker.name,
1077
- Utterance.begin,
1078
- Utterance.end,
1079
- Utterance.duration,
1080
- Utterance.text,
1081
- ]
1082
- columns.append(Utterance.alignment_log_likelihood)
1083
- columns.append(Utterance.speech_log_likelihood)
1084
- columns.append(Utterance.duration_deviation)
1085
- columns.append(Utterance.phone_error_rate)
1086
- columns.append(Utterance.alignment_score)
1087
- columns.append(Utterance.transcription_text)
1088
- columns.append(Utterance.word_error_rate)
1089
- if has_ivectors and c.utterance_ivector_column is not None:
1090
- columns.append(
1091
- c.utterance_ivector_column.cosine_distance(c.speaker_ivector_column)
1092
- )
1054
+ filter_nulls = self.kwargs.get("filter_nulls", [])
1055
+ columns = [
1056
+ Utterance.id,
1057
+ Utterance.file_id,
1058
+ Utterance.speaker_id,
1059
+ Utterance.oovs,
1060
+ File.name,
1061
+ Speaker.name,
1062
+ Utterance.begin,
1063
+ Utterance.end,
1064
+ Utterance.duration,
1065
+ Utterance.text,
1066
+ Utterance.alignment_log_likelihood,
1067
+ Utterance.speech_log_likelihood,
1068
+ Utterance.duration_deviation,
1069
+ Utterance.snr,
1070
+ Utterance.phone_error_rate,
1071
+ Utterance.alignment_score,
1072
+ Utterance.transcription_text,
1073
+ Utterance.word_error_rate,
1074
+ ]
1075
+ if has_ivectors and c.utterance_ivector_column is not None:
1076
+ columns.append(
1077
+ c.utterance_ivector_column.cosine_distance(c.speaker_ivector_column)
1078
+ )
1079
+ columns.append(Utterance.diarization_variance)
1093
1080
  speaker_filter = self.kwargs.get("speaker_filter", None)
1094
1081
  file_filter = self.kwargs.get("file_filter", None)
1095
1082
  text_filter: TextFilterQuery = self.kwargs.get("text_filter", None)
@@ -1115,6 +1102,10 @@ class QueryUtterancesWorker(Worker):
1115
1102
  text_column = Utterance.text
1116
1103
  filter_regex = text_filter.generate_expression(posix=True)
1117
1104
  utterances = utterances.filter(text_column.op("~")(filter_regex))
1105
+ for i, null_check in enumerate(filter_nulls):
1106
+ if null_check:
1107
+ column = columns[i + 3]
1108
+ utterances = utterances.filter(column != None) # noqa
1118
1109
  if count_only:
1119
1110
  try:
1120
1111
  return utterances.count()
@@ -1181,9 +1172,9 @@ class QuerySpeakersWorker(Worker):
1181
1172
  Speaker.num_utterances,
1182
1173
  Speaker.dictionary_id,
1183
1174
  ]
1184
- if speaker_filter is None:
1175
+ if not speaker_filter:
1185
1176
  columns.append(
1186
- sqlalchemy.func.avg(
1177
+ sqlalchemy.func.max(
1187
1178
  c.utterance_ivector_column.cosine_distance(c.speaker_ivector_column)
1188
1179
  )
1189
1180
  )
@@ -1257,18 +1248,29 @@ class ClusterSpeakerUtterancesWorker(Worker):
1257
1248
  c = session.query(Corpus).first()
1258
1249
  speaker_name, ivector, utt_count = (
1259
1250
  session.query(Speaker.name, c.speaker_ivector_column, Speaker.num_utterances)
1260
- .filter(Speaker.id == speaker_ids[0], c.utterance_ivector_column != None) # noqa
1251
+ .filter(
1252
+ Speaker.id == self.speaker_ids[0], c.utterance_ivector_column != None # noqa
1253
+ )
1261
1254
  .first()
1262
1255
  )
1263
1256
  if utt_count < 1:
1264
1257
  return None
1265
- query = session.query(Utterance.speaker_id).filter(
1266
- c.utterance_ivector_column != None # noqa
1258
+ query = (
1259
+ session.query(
1260
+ Utterance.speaker_id,
1261
+ c.utterance_ivector_column.cosine_distance(c.speaker_ivector_column),
1262
+ )
1263
+ .join(Utterance.speaker)
1264
+ .filter(c.utterance_ivector_column != None) # noqa
1267
1265
  )
1268
1266
  query = query.filter(Utterance.speaker_id.in_(self.speaker_ids))
1269
1267
  query = query.order_by(Utterance.id)
1270
1268
  additional_data = (
1271
- session.query(Utterance.speaker_id)
1269
+ session.query(
1270
+ Utterance.speaker_id,
1271
+ c.utterance_ivector_column.cosine_distance(c.speaker_ivector_column),
1272
+ )
1273
+ .join(Utterance.speaker)
1272
1274
  .filter(
1273
1275
  c.utterance_ivector_column != None, # noqa
1274
1276
  )
@@ -1281,8 +1283,18 @@ class ClusterSpeakerUtterancesWorker(Worker):
1281
1283
  additional_data = additional_data.order_by(
1282
1284
  c.utterance_ivector_column.cosine_distance(ivector)
1283
1285
  ).limit(min(query.count(), self.limit))
1284
- cluster_ids = np.array([x for x, in query] + [x for x, in additional_data])
1285
- return self.speaker_ids, cluster_ids
1286
+ cluster_ids = []
1287
+ distances = []
1288
+ for speaker_id, distance in query:
1289
+ cluster_ids.append(speaker_id)
1290
+ distances.append(distance)
1291
+ for speaker_id, distance in additional_data:
1292
+ cluster_ids.append(speaker_id)
1293
+ distances.append(distance)
1294
+ cluster_ids = np.array(cluster_ids)
1295
+ distances = np.array(distances)
1296
+ distances = (distances - distances.min()) / distances.max()
1297
+ return self.speaker_ids, cluster_ids, distances
1286
1298
 
1287
1299
 
1288
1300
  class CalculateSpeakerIvectorsWorker(Worker):
@@ -1308,7 +1320,9 @@ class CalculateSpeakerIvectorsWorker(Worker):
1308
1320
  c = session.query(Corpus).first()
1309
1321
  speaker_name, ivector, utt_count = (
1310
1322
  session.query(Speaker.name, c.speaker_ivector_column, Speaker.num_utterances)
1311
- .filter(Speaker.id == speaker_ids[0], c.utterance_ivector_column != None) # noqa
1323
+ .filter(
1324
+ Speaker.id == self.speaker_ids[0], c.utterance_ivector_column != None # noqa
1325
+ )
1312
1326
  .first()
1313
1327
  )
1314
1328
  if utt_count < 1:
@@ -1389,7 +1403,9 @@ class SpeakerMdsWorker(Worker):
1389
1403
  dim = IVECTOR_DIMENSION
1390
1404
  speaker_name, ivector, utt_count = (
1391
1405
  session.query(Speaker.name, c.speaker_ivector_column, Speaker.num_utterances)
1392
- .filter(Speaker.id == speaker_ids[0], c.utterance_ivector_column != None) # noqa
1406
+ .filter(
1407
+ Speaker.id == self.speaker_ids[0], c.utterance_ivector_column != None # noqa
1408
+ )
1393
1409
  .first()
1394
1410
  )
1395
1411
  query = (
@@ -1442,17 +1458,17 @@ class SpeakerMdsWorker(Worker):
1442
1458
  (num_utterances + additional_data.count() + self.limit,), dtype="int32"
1443
1459
  )
1444
1460
  ivectors = np.array(self.plda.transform_ivectors(ivectors, counts))
1445
- metric_type = DistanceMetric.cosine
1461
+ self.metric_type = DistanceMetric.cosine
1446
1462
  if ivectors.shape[0] <= self.perplexity:
1447
- perplexity = ivectors.shape[0] - 1
1463
+ self.perplexity = ivectors.shape[0] - 1
1448
1464
  if self.speaker_space is not None:
1449
1465
  points = self.speaker_space.transform(ivectors)
1450
1466
  else:
1451
1467
  points = visualize_clusters(
1452
1468
  ivectors,
1453
1469
  ManifoldAlgorithm.tsne,
1454
- metric_type,
1455
- perplexity,
1470
+ self.metric_type,
1471
+ self.perplexity,
1456
1472
  self.plda,
1457
1473
  quick=False,
1458
1474
  )
@@ -1460,37 +1476,42 @@ class SpeakerMdsWorker(Worker):
1460
1476
  return self.speaker_ids, points
1461
1477
 
1462
1478
 
1463
- class SpeakerDiarizationWorker(Worker):
1479
+ class AlignmentAnalysisWorker(Worker):
1464
1480
  def __init__(
1465
1481
  self,
1466
1482
  session,
1467
1483
  use_mp=False,
1468
- in_speakers=False,
1469
- use_silhouette: bool = False,
1470
- threshold: float = None,
1471
- metric: typing.Union[str, DistanceMetric] = DistanceMetric.cosine,
1472
- plda: Plda = None,
1473
- speaker_plda: SpeakerPlda = None,
1484
+ speaker_id: int = None,
1485
+ phone_id: int = None,
1486
+ word_filter: TextFilterQuery = None,
1487
+ less_than: float = None,
1488
+ greater_than: float = None,
1489
+ measure: str = "duration",
1490
+ exclude_manual: bool = False,
1491
+ word_mode: bool = False,
1492
+ relative_duration: bool = False,
1474
1493
  limit: int = 100,
1494
+ current_offset: int = 0,
1495
+ sort_index: int = None,
1496
+ sort_desc: bool = False,
1475
1497
  **kwargs,
1476
1498
  ):
1477
1499
  super().__init__(use_mp=use_mp, **kwargs)
1478
1500
  self.session = session
1479
- self.in_speakers = in_speakers
1480
- self.threshold = threshold
1481
- self.metric = metric
1482
- self.plda = plda
1483
- self.speaker_plda = speaker_plda
1484
- self.limit = limit
1485
- self.use_silhouette = use_silhouette
1501
+ self.speaker_id = speaker_id
1502
+ self.phone_id = phone_id
1503
+ self.less_than = less_than
1504
+ self.greater_than = greater_than
1505
+ self.measure = measure
1506
+ self.word_filter = word_filter
1507
+ self.exclude_manual = exclude_manual
1508
+ self.word_mode = word_mode
1509
+ self.relative_duration = relative_duration
1486
1510
 
1487
- if isinstance(self.metric, str):
1488
- self.metric = DistanceMetric[self.metric]
1489
- if self.use_silhouette:
1490
- self.metric = DistanceMetric.cosine
1491
- if self.metric is DistanceMetric.plda:
1492
- if self.plda is None:
1493
- self.metric = DistanceMetric.cosine
1511
+ self.limit = limit
1512
+ self.current_offset = current_offset
1513
+ self.sort_index = sort_index
1514
+ self.sort_desc = sort_desc
1494
1515
 
1495
1516
  def _run(self):
1496
1517
  count_only = self.kwargs.get("count", False)
@@ -1498,137 +1519,164 @@ class SpeakerDiarizationWorker(Worker):
1498
1519
  self.progress_callback.update_total(self.limit)
1499
1520
 
1500
1521
  with self.session() as session:
1501
- c = session.query(Corpus).first()
1502
- suggested_indices = []
1522
+ indices = []
1523
+ file_indices = []
1503
1524
  speaker_indices = []
1504
1525
  utterance_ids = []
1526
+ reversed_indices = {}
1505
1527
  data = []
1506
-
1507
- query = session.query(
1508
- Speaker.id, c.speaker_ivector_column, Speaker.name, Speaker.num_utterances
1509
- ).filter(
1510
- c.speaker_ivector_column != None # noqa
1511
- )
1512
- if self.use_silhouette:
1513
- query = query.filter(Speaker.num_utterances > 1)
1528
+ if not self.word_mode:
1529
+ if not count_only and self.relative_duration:
1530
+ duration_column = sqlalchemy.sql.label(
1531
+ "duration",
1532
+ (PhoneInterval.duration - Phone.mean_duration) / Phone.sd_duration,
1533
+ )
1534
+ else:
1535
+ duration_column = PhoneInterval.duration
1536
+ goodness_column = PhoneInterval.phone_goodness
1537
+ columns = [
1538
+ PhoneInterval.id,
1539
+ PhoneInterval.utterance_id,
1540
+ Utterance.file_id,
1541
+ Utterance.speaker_id,
1542
+ Utterance.begin,
1543
+ Utterance.end,
1544
+ File.name,
1545
+ Speaker.name,
1546
+ Phone.phone,
1547
+ duration_column,
1548
+ goodness_column,
1549
+ Word.word,
1550
+ ]
1551
+ query = (
1552
+ session.query(*columns)
1553
+ .join(PhoneInterval.utterance)
1554
+ .join(PhoneInterval.phone)
1555
+ .join(Utterance.speaker)
1556
+ .join(Utterance.file)
1557
+ .join(PhoneInterval.word_interval)
1558
+ .join(WordInterval.word)
1559
+ )
1514
1560
  else:
1515
- query = query.filter(Speaker.num_utterances > 0)
1516
- if count_only:
1517
- return query.count()
1518
- query = query.order_by(sqlalchemy.func.random())
1519
-
1520
- if self.threshold is None:
1521
- query = query.limit(self.limit).offset(self.kwargs.get("current_offset", 0))
1522
- found = set()
1523
- for speaker_id, ivector, speaker_name, num_utterances in query:
1524
- if self.stopped is not None and self.stopped.is_set():
1525
- break
1526
- if self.metric is DistanceMetric.plda:
1527
- kaldi_ivector = DoubleVector()
1528
- kaldi_ivector.from_numpy(ivector)
1529
- ivector_normalize_length(kaldi_ivector)
1530
- kaldi_ivector = self.plda.transform_ivector(kaldi_ivector, num_utterances)
1531
- index, distance = self.plda.classify_utterance(
1532
- kaldi_ivector, self.speaker_plda.test_ivectors, self.speaker_plda.counts
1561
+ if not count_only and self.relative_duration:
1562
+ duration_column = sqlalchemy.sql.label(
1563
+ "duration",
1564
+ sqlalchemy.func.sum(PhoneInterval.duration)
1565
+ / sqlalchemy.func.sum(Phone.mean_duration),
1533
1566
  )
1534
- suggested_name = self.speaker_plda.suggested_names[index]
1535
- suggested_count = self.speaker_plda.counts[index]
1536
- suggested_id = self.speaker_plda.suggested_ids[index]
1537
- if suggested_id == speaker_id:
1538
- continue
1539
- if self.threshold is not None and distance < self.threshold:
1540
- continue
1541
1567
  else:
1542
- suggested_speaker_query = session.query(
1543
- Speaker.id,
1568
+ duration_column = sqlalchemy.func.avg(PhoneInterval.duration)
1569
+ goodness_column = sqlalchemy.func.min(PhoneInterval.phone_goodness)
1570
+ columns = [
1571
+ WordInterval.id,
1572
+ WordInterval.utterance_id,
1573
+ Utterance.file_id,
1574
+ Utterance.speaker_id,
1575
+ Utterance.begin,
1576
+ Utterance.end,
1577
+ File.name,
1578
+ Speaker.name,
1579
+ sqlalchemy.func.string_agg(
1580
+ Phone.phone,
1581
+ sqlalchemy.dialects.postgresql.aggregate_order_by(
1582
+ sqlalchemy.literal_column("' '"), PhoneInterval.begin
1583
+ ),
1584
+ ),
1585
+ duration_column,
1586
+ goodness_column,
1587
+ Word.word,
1588
+ ]
1589
+ query = (
1590
+ session.query(*columns)
1591
+ .join(PhoneInterval.utterance)
1592
+ .join(PhoneInterval.phone)
1593
+ .join(Utterance.speaker)
1594
+ .join(Utterance.file)
1595
+ .join(PhoneInterval.word_interval)
1596
+ .join(WordInterval.word)
1597
+ .group_by(
1598
+ WordInterval.id,
1599
+ Utterance.id,
1600
+ Utterance.file_id,
1601
+ Utterance.speaker_id,
1602
+ Utterance.begin,
1603
+ Utterance.end,
1604
+ File.name,
1544
1605
  Speaker.name,
1545
- Speaker.num_utterances,
1546
- c.speaker_ivector_column.cosine_distance(ivector),
1547
- ).filter(
1548
- Speaker.id != speaker_id,
1549
- # Speaker.num_utterances <= 200
1606
+ Word.word,
1550
1607
  )
1551
- if self.use_silhouette:
1552
- suggested_speaker_query = suggested_speaker_query.filter(
1553
- Speaker.num_utterances > 1
1554
- )
1555
- suggested_speaker_query = suggested_speaker_query.order_by(
1556
- c.speaker_ivector_column.cosine_distance(ivector)
1557
- ).limit(1)
1558
- r = suggested_speaker_query.first()
1559
- if r is None:
1560
- continue
1561
- suggested_id, suggested_name, suggested_count, distance = r
1562
- if (suggested_id, speaker_id) in found or (speaker_id, suggested_id) in found:
1563
- continue
1564
- if self.use_silhouette:
1565
- utterance_query = (
1566
- session.query(Utterance.speaker_id, c.utterance_ivector_column)
1567
- .filter(Utterance.speaker_id.in_([speaker_id, suggested_id]))
1568
- .filter(c.utterance_ivector_column != None) # noqa
1569
- )
1570
- ivectors = []
1571
- labels = []
1572
- for speaker_id, utterance_ivector in utterance_query:
1573
- labels.append(speaker_id)
1574
- ivectors.append(utterance_ivector)
1575
- ivectors = np.array(ivectors)
1576
- if self.metric is DistanceMetric.cosine:
1577
- ivectors = preprocessing.normalize(ivectors, norm="l2")
1578
- self.metric = "euclidean"
1579
- distance = metrics.silhouette_score(ivectors, labels, metric=self.metric)
1580
- if self.threshold is not None:
1581
- if distance is not None and distance > self.threshold:
1582
- continue
1583
- if distance is None:
1584
- continue
1585
- if self.progress_callback is not None:
1586
- self.progress_callback.increment_progress(1)
1608
+ )
1587
1609
 
1588
- utterance_ids.append(None)
1589
- utterance_name = ""
1590
- if suggested_count >= num_utterances:
1591
- found.add((suggested_id, speaker_id))
1592
- suggested_indices.append(suggested_id)
1593
- speaker_indices.append(speaker_id)
1594
- data.append(
1595
- [
1596
- utterance_name,
1597
- suggested_name,
1598
- suggested_count,
1599
- speaker_name,
1600
- num_utterances,
1601
- distance,
1602
- ]
1603
- )
1610
+ if self.speaker_id is not None:
1611
+ if isinstance(self.speaker_id, int):
1612
+ query = query.filter(Utterance.speaker_id == self.speaker_id)
1604
1613
  else:
1605
- found.add((speaker_id, suggested_id))
1606
- suggested_indices.append(speaker_id)
1607
- speaker_indices.append(suggested_id)
1608
- data.append(
1609
- [
1610
- utterance_name,
1611
- speaker_name,
1612
- num_utterances,
1613
- suggested_name,
1614
- suggested_count,
1615
- distance,
1616
- ]
1617
- )
1618
- if len(data) >= self.limit:
1619
- break
1620
- d = np.array([x[-1] for x in data])
1621
- if self.metric is DistanceMetric.plda:
1622
- d *= -1
1623
- indices = np.argsort(d)
1624
- utterance_ids = [utterance_ids[x] for x in indices]
1625
- suggested_indices = [suggested_indices[x] for x in indices]
1626
- speaker_indices = [speaker_indices[x] for x in indices]
1627
- data = [data[x] for x in indices]
1628
- return data, utterance_ids, suggested_indices, speaker_indices
1614
+ query = query.filter(Speaker.name == self.speaker_id)
1615
+ if self.phone_id is not None:
1616
+ if isinstance(self.phone_id, int):
1617
+ query = query.filter(PhoneInterval.phone_id == self.phone_id)
1618
+ else:
1619
+ query = query.filter(Phone.phone == self.phone_id)
1620
+ else:
1621
+ query = query.filter(Phone.phone_type.in_([PhoneType.non_silence]))
1622
+ if self.exclude_manual:
1623
+ query = query.filter(Utterance.manual_alignments == False) # noqa
1624
+ if self.measure == "duration":
1625
+ measure_column = duration_column
1626
+ else:
1627
+ measure_column = goodness_column
1628
+ if self.less_than is not None or self.greater_than is not None:
1629
+ if self.less_than is not None:
1630
+ query = query.filter(measure_column < self.less_than)
1631
+ if self.greater_than is not None:
1632
+ query = query.filter(measure_column > self.greater_than)
1633
+ if self.word_filter is not None and self.word_filter.text:
1634
+ filter_regex = self.word_filter.generate_expression(posix=True)
1635
+ query = query.filter(Word.word.op("~")(filter_regex))
1636
+ if count_only:
1637
+ try:
1638
+ return query.count()
1639
+ except psycopg2.errors.InvalidRegularExpression:
1640
+ return 0
1641
+ if self.sort_index is not None and self.sort_index + 6 <= len(columns) - 1:
1642
+ sort_column = columns[self.sort_index + 6]
1643
+ if self.sort_desc:
1644
+ sort_column = sort_column.desc()
1645
+ query = query.order_by(sort_column, Utterance.id, PhoneInterval.begin)
1646
+ else:
1647
+ if self.word_mode:
1648
+ query = query.order_by(duration_column, Utterance.id, WordInterval.id)
1649
+ else:
1650
+ query = query.order_by(duration_column, Utterance.id, PhoneInterval.begin)
1651
+ query = query.limit(self.limit).offset(self.current_offset)
1652
+ try:
1653
+ for i, u in enumerate(query):
1654
+ if self.stopped is not None and self.stopped.is_set():
1655
+ return
1656
+ phone_interval_id = u[0]
1657
+ utterance_id = u[1]
1658
+ file_id = u[2]
1659
+ speaker_id = u[3]
1660
+ begin = u[4]
1661
+ end = u[5]
1662
+ file_name = u[6]
1663
+ indices.append(phone_interval_id)
1664
+ reversed_indices[phone_interval_id] = i
1665
+ file_indices.append(file_id)
1666
+ speaker_indices.append(speaker_id)
1629
1667
 
1668
+ utterance_ids.append(utterance_id)
1669
+ utterance_name = f"{file_name} ({begin:.3f}-{end:.3f})"
1670
+ data.append([utterance_name, *u[7:]])
1671
+ if self.progress_callback is not None:
1672
+ self.progress_callback.increment_progress(1)
1673
+
1674
+ except psycopg2.errors.InvalidRegularExpression:
1675
+ pass
1676
+ return data, indices, utterance_ids, file_indices, speaker_indices, reversed_indices
1630
1677
 
1631
- class SpeakerUtterancesWorker(Worker):
1678
+
1679
+ class SpeakerDiarizationWorker(Worker):
1632
1680
  def __init__(
1633
1681
  self,
1634
1682
  session,
@@ -1642,8 +1690,8 @@ class SpeakerUtterancesWorker(Worker):
1642
1690
  speaker_plda: SpeakerPlda = None,
1643
1691
  limit: int = 100,
1644
1692
  inverted: bool = False,
1693
+ utterance_based: bool = False,
1645
1694
  text_filter: TextFilterQuery = None,
1646
- in_speakers: bool = False,
1647
1695
  **kwargs,
1648
1696
  ):
1649
1697
  super().__init__(use_mp=use_mp, **kwargs)
@@ -1657,8 +1705,8 @@ class SpeakerUtterancesWorker(Worker):
1657
1705
  self.speaker_plda = speaker_plda
1658
1706
  self.limit = limit
1659
1707
  self.inverted = inverted
1708
+ self.utterance_based = utterance_based
1660
1709
  self.text_filter = text_filter
1661
- self.in_speakers = in_speakers
1662
1710
 
1663
1711
  if isinstance(self.metric, str):
1664
1712
  self.metric = DistanceMetric[self.metric]
@@ -1684,6 +1732,8 @@ class SpeakerUtterancesWorker(Worker):
1684
1732
  and self.speaker_plda is None
1685
1733
  ):
1686
1734
  speaker_plda = load_speaker_plda(session, self.plda, minimum_count=2)
1735
+ elif self.speaker_plda is not None:
1736
+ speaker_plda = self.speaker_plda
1687
1737
 
1688
1738
  if self.reference_utterance_id is not None:
1689
1739
  utterance_query = (
@@ -2098,7 +2148,6 @@ class SpeakerUtterancesWorker(Worker):
2098
2148
  query = query.filter(
2099
2149
  c.utterance_ivector_column.cosine_distance(ivector) <= self.threshold
2100
2150
  )
2101
-
2102
2151
  if count_only:
2103
2152
  return query.count()
2104
2153
  if self.text_filter is None or not self.text_filter.text:
@@ -2147,33 +2196,52 @@ class SpeakerUtterancesWorker(Worker):
2147
2196
  distance,
2148
2197
  ]
2149
2198
  )
2150
- elif self.in_speakers:
2199
+ else:
2151
2200
  query = (
2152
2201
  session.query(
2153
- Speaker.id, c.speaker_ivector_column, Speaker.name, Speaker.num_utterances
2202
+ Utterance.id,
2203
+ File.id,
2204
+ File.name,
2205
+ Utterance.begin,
2206
+ Utterance.end,
2207
+ c.utterance_ivector_column,
2208
+ Speaker.name,
2209
+ Speaker.id,
2210
+ Speaker.num_utterances,
2154
2211
  )
2155
- .filter(c.speaker_ivector_column != None) # noqa
2156
- .filter(Speaker.num_utterances > 0)
2212
+ .join(Utterance.file)
2213
+ .join(Utterance.speaker)
2214
+ .filter(c.utterance_ivector_column != None) # noqa
2215
+ .filter(Speaker.num_utterances == 1)
2157
2216
  )
2158
2217
  if self.text_filter is not None and self.text_filter.text:
2159
2218
  filter_regex = self.text_filter.generate_expression(posix=True)
2160
- query = query.join(Speaker.utterances)
2161
- query = query.filter(Utterance.text.op("~")(filter_regex)).distinct()
2219
+ query = query.filter(Utterance.text.op("~")(filter_regex))
2162
2220
  if count_only:
2163
2221
  return query.count()
2164
- if self.text_filter is None or not self.text_filter.text:
2165
- # query = query.order_by(c.utterance_ivector_column.cosine_distance(c.speaker_ivector_column).desc())
2166
- query = query.order_by(sqlalchemy.func.random())
2167
- # query = query.order_by(Utterance.duration.desc())
2222
+ query = query.order_by(sqlalchemy.func.random())
2168
2223
 
2169
2224
  if self.threshold is None:
2170
2225
  query = query.limit(self.limit).offset(self.kwargs.get("current_offset", 0))
2171
- for speaker_id, ivector, speaker_name, num_utterances in query:
2226
+ # else:
2227
+ # query = query.limit(limit*100)
2228
+ for (
2229
+ utt_id,
2230
+ file_id,
2231
+ file_name,
2232
+ begin,
2233
+ end,
2234
+ ivector,
2235
+ speaker_name,
2236
+ speaker_id,
2237
+ speaker_num_utterances,
2238
+ ) in query:
2172
2239
  if self.stopped is not None and self.stopped.is_set():
2173
2240
  break
2174
2241
  if self.metric is DistanceMetric.plda:
2175
2242
  kaldi_ivector = DoubleVector()
2176
2243
  kaldi_ivector.from_numpy(ivector)
2244
+ ivector_normalize_length(kaldi_ivector)
2177
2245
  kaldi_ivector = self.plda.transform_ivector(kaldi_ivector, 1)
2178
2246
  index, distance = self.plda.classify_utterance(
2179
2247
  kaldi_ivector, speaker_plda.test_ivectors, speaker_plda.counts
@@ -2186,67 +2254,257 @@ class SpeakerUtterancesWorker(Worker):
2186
2254
  if self.threshold is not None and distance < self.threshold:
2187
2255
  continue
2188
2256
  else:
2189
- suggested_speaker_query = session.query(
2190
- Speaker.id,
2191
- Speaker.name,
2192
- Speaker.num_utterances,
2193
- c.speaker_ivector_column.cosine_distance(ivector),
2194
- ).filter(
2195
- Speaker.id != speaker_id,
2196
- # Speaker.num_utterances <= 200
2197
- )
2198
- suggested_speaker_query = suggested_speaker_query.order_by(
2199
- c.speaker_ivector_column.cosine_distance(ivector)
2200
- ).limit(1)
2201
- r = suggested_speaker_query.first()
2202
- if r is None:
2257
+ if self.utterance_based:
2258
+ sub_query = (
2259
+ session.query(
2260
+ Speaker.id,
2261
+ Speaker.name,
2262
+ Speaker.num_utterances,
2263
+ c.speaker_ivector_column.cosine_distance(ivector).label(
2264
+ "distance"
2265
+ ),
2266
+ )
2267
+ .join(Speaker.utterances)
2268
+ .filter(
2269
+ Speaker.id != speaker_id,
2270
+ )
2271
+ .order_by(c.utterance_ivector_column.cosine_distance(ivector))
2272
+ .limit(100)
2273
+ .subquery()
2274
+ )
2275
+
2276
+ suggested_speaker_query = (
2277
+ session.query(
2278
+ sub_query.c.id,
2279
+ sub_query.c.name,
2280
+ sub_query.c.num_utterances,
2281
+ sub_query.c.distance,
2282
+ )
2283
+ .group_by(
2284
+ sub_query.c.id,
2285
+ sub_query.c.name,
2286
+ sub_query.c.num_utterances,
2287
+ sub_query.c.distance,
2288
+ )
2289
+ .order_by(sqlalchemy.func.count(sub_query.c.id).desc())
2290
+ )
2291
+
2292
+ else:
2293
+ suggested_speaker_query = session.query(
2294
+ Speaker.id,
2295
+ Speaker.name,
2296
+ Speaker.num_utterances,
2297
+ c.speaker_ivector_column.cosine_distance(ivector),
2298
+ ).filter(
2299
+ Speaker.id != speaker_id,
2300
+ )
2301
+ suggested_speaker_query = suggested_speaker_query.order_by(
2302
+ c.speaker_ivector_column.cosine_distance(ivector)
2303
+ ).limit(5)
2304
+ r = suggested_speaker_query.all()
2305
+ if not r:
2203
2306
  continue
2204
- suggested_id, suggested_name, suggested_count, distance = r
2307
+ suggested_id = []
2308
+ suggested_name = []
2309
+ suggested_count = []
2310
+ distance = []
2311
+ for s_id, s_name, s_count, d in r:
2312
+ suggested_id.append(s_id)
2313
+ suggested_name.append(s_name)
2314
+ suggested_count.append(s_count)
2315
+ distance.append(d)
2316
+ if len(suggested_id) == 1:
2317
+ suggested_id = suggested_id[0]
2318
+ suggested_name = suggested_name[0]
2319
+ suggested_count = suggested_count[0]
2320
+ distance = distance[0]
2205
2321
  if self.threshold is not None:
2206
- if distance is not None and distance > self.threshold:
2322
+ if isinstance(distance, list) and distance[0] > self.threshold:
2323
+ continue
2324
+ elif (
2325
+ not isinstance(distance, list)
2326
+ and distance is not None
2327
+ and distance > self.threshold
2328
+ ):
2207
2329
  continue
2208
2330
  if distance is None:
2209
2331
  continue
2210
2332
  if self.progress_callback is not None:
2211
2333
  self.progress_callback.increment_progress(1)
2212
2334
 
2213
- utterance_ids.append(None)
2335
+ utterance_ids.append(utt_id)
2214
2336
  suggested_indices.append(suggested_id)
2215
2337
  speaker_indices.append(speaker_id)
2216
- utterance_name = ""
2338
+ utterance_name = f"{file_name} ({begin:.3f}-{end:.3f})"
2217
2339
  data.append(
2218
2340
  [
2219
2341
  utterance_name,
2220
2342
  suggested_name,
2221
2343
  suggested_count,
2222
2344
  speaker_name,
2223
- num_utterances,
2345
+ speaker_num_utterances,
2224
2346
  distance,
2225
2347
  ]
2226
2348
  )
2227
2349
  if len(data) >= self.limit:
2228
2350
  break
2351
+ d = np.array([x[-1] if not isinstance(x[-1], list) else x[-1][0] for x in data])
2352
+ if self.metric is DistanceMetric.plda:
2353
+ d *= -1
2354
+ indices = np.argsort(d)
2355
+ utterance_ids = [utterance_ids[x] for x in indices]
2356
+ suggested_indices = [suggested_indices[x] for x in indices]
2357
+ speaker_indices = [speaker_indices[x] for x in indices]
2358
+ data = [data[x] for x in indices]
2359
+ return data, utterance_ids, suggested_indices, speaker_indices
2360
+
2361
+
2362
+ class SpeakerComparisonWorker(Worker):
2363
+ def __init__(
2364
+ self,
2365
+ session,
2366
+ use_mp=False,
2367
+ speaker_id: int = None,
2368
+ alternate_speaker_id: int = None,
2369
+ reference_utterance_id: int = None,
2370
+ threshold: float = None,
2371
+ metric: typing.Union[str, DistanceMetric] = DistanceMetric.cosine,
2372
+ plda: Plda = None,
2373
+ speaker_plda: SpeakerPlda = None,
2374
+ limit: int = 100,
2375
+ inverted: bool = False,
2376
+ text_filter: TextFilterQuery = None,
2377
+ **kwargs,
2378
+ ):
2379
+ super().__init__(use_mp=use_mp, **kwargs)
2380
+ self.session = session
2381
+ self.speaker_id = speaker_id
2382
+ self.alternate_speaker_id = alternate_speaker_id
2383
+ self.reference_utterance_id = reference_utterance_id
2384
+ self.threshold = threshold
2385
+ self.metric = metric
2386
+ self.plda = plda
2387
+ self.speaker_plda = speaker_plda
2388
+ self.limit = limit
2389
+ self.inverted = inverted
2390
+ self.text_filter = text_filter
2391
+
2392
+ if isinstance(self.metric, str):
2393
+ self.metric = DistanceMetric[self.metric]
2394
+ if self.metric is DistanceMetric.plda:
2395
+ if self.plda is None:
2396
+ self.metric = DistanceMetric.cosine
2397
+
2398
+ def _run(self):
2399
+ count_only = self.kwargs.get("count", False)
2400
+ if not count_only and self.progress_callback is not None:
2401
+ self.progress_callback.update_total(self.limit)
2402
+
2403
+ with self.session() as session:
2404
+ c = session.query(Corpus).first()
2405
+ suggested_indices = []
2406
+ speaker_indices = []
2407
+ utterance_ids = []
2408
+ data = []
2409
+ if self.inverted or self.speaker_id is None:
2410
+ if (
2411
+ self.metric is DistanceMetric.plda
2412
+ and not count_only
2413
+ and self.speaker_plda is None
2414
+ ):
2415
+ speaker_plda = load_speaker_plda(session, self.plda, minimum_count=2)
2416
+ elif self.speaker_plda is not None:
2417
+ speaker_plda = self.speaker_plda
2418
+ found_set = set()
2419
+ if self.speaker_id is not None:
2420
+ query = session.query(
2421
+ Speaker.name, c.speaker_ivector_column, Speaker.num_utterances
2422
+ )
2423
+ if isinstance(self.speaker_id, int):
2424
+ query = query.filter(Speaker.id == self.speaker_id)
2425
+ else:
2426
+ query = query.filter(Speaker.name == self.speaker_id)
2427
+ r = query.first()
2428
+ if r is None:
2429
+ return data, utterance_ids, suggested_indices
2430
+ suggested_name, ivector, utt_count = r
2431
+
2432
+ if self.metric is DistanceMetric.plda:
2433
+ kaldi_speaker_ivector = DoubleVector()
2434
+ kaldi_speaker_ivector.from_numpy(ivector)
2435
+ kaldi_speaker_ivector = self.plda.transform_ivector(
2436
+ kaldi_speaker_ivector, utt_count
2437
+ )
2438
+ query = session.query(
2439
+ Speaker.id,
2440
+ Speaker.name,
2441
+ Speaker.num_utterances,
2442
+ c.speaker_ivector_column,
2443
+ c.speaker_ivector_column.cosine_distance(ivector),
2444
+ ).filter(Speaker.id != self.speaker_id)
2445
+ if self.alternate_speaker_id is not None:
2446
+ query = query.filter(Speaker.id == self.alternate_speaker_id)
2447
+ if self.threshold is not None:
2448
+ query = query.filter(
2449
+ c.speaker_ivector_column.cosine_distance(ivector) <= self.threshold
2450
+ )
2451
+
2452
+ if count_only:
2453
+ return query.count()
2454
+ query = query.limit(self.limit).offset(self.kwargs.get("current_offset", 0))
2455
+ for (
2456
+ original_id,
2457
+ speaker_name,
2458
+ original_count,
2459
+ original_ivector,
2460
+ distance,
2461
+ ) in query:
2462
+ if self.stopped is not None and self.stopped.is_set():
2463
+ session.rollback()
2464
+ return
2465
+ if distance is None:
2466
+ continue
2467
+ if (self.speaker_id, original_id) in found_set:
2468
+ continue
2469
+ if self.progress_callback is not None:
2470
+ self.progress_callback.increment_progress(1)
2471
+ if self.metric is DistanceMetric.plda:
2472
+ kaldi_utterance_ivector = DoubleVector()
2473
+ kaldi_utterance_ivector.from_numpy(original_ivector)
2474
+ ivector_normalize_length(kaldi_utterance_ivector)
2475
+ kaldi_utterance_ivector = self.plda.transform_ivector(
2476
+ kaldi_utterance_ivector, original_count
2477
+ )
2478
+ distance = self.plda.LogLikelihoodRatio(
2479
+ kaldi_speaker_ivector, utt_count, kaldi_utterance_ivector
2480
+ )
2481
+ utterance_ids.append(None)
2482
+ suggested_indices.append(self.speaker_id)
2483
+ speaker_indices.append(original_id)
2484
+ found_set.add((self.speaker_id, original_id))
2485
+ utterance_name = ""
2486
+ data.append(
2487
+ [
2488
+ utterance_name,
2489
+ suggested_name,
2490
+ utt_count,
2491
+ speaker_name,
2492
+ original_count,
2493
+ distance,
2494
+ ]
2495
+ )
2229
2496
  else:
2230
2497
  query = (
2231
2498
  session.query(
2232
- Utterance.id,
2233
- File.id,
2234
- File.name,
2235
- Utterance.begin,
2236
- Utterance.end,
2237
- c.utterance_ivector_column,
2238
- Speaker.name,
2239
- Speaker.id,
2240
- Speaker.num_utterances,
2499
+ Speaker.id, c.speaker_ivector_column, Speaker.name, Speaker.num_utterances
2241
2500
  )
2242
- .join(Utterance.file)
2243
- .join(Utterance.speaker)
2244
- .filter(c.utterance_ivector_column != None) # noqa
2245
- .filter(Speaker.num_utterances == 1)
2501
+ .filter(c.speaker_ivector_column != None) # noqa
2502
+ .filter(Speaker.num_utterances > 0)
2246
2503
  )
2247
2504
  if self.text_filter is not None and self.text_filter.text:
2248
2505
  filter_regex = self.text_filter.generate_expression(posix=True)
2249
- query = query.filter(Utterance.text.op("~")(filter_regex))
2506
+ query = query.join(Speaker.utterances)
2507
+ query = query.filter(Utterance.text.op("~")(filter_regex)).distinct()
2250
2508
  if count_only:
2251
2509
  return query.count()
2252
2510
  if self.text_filter is None or not self.text_filter.text:
@@ -2256,25 +2514,12 @@ class SpeakerUtterancesWorker(Worker):
2256
2514
 
2257
2515
  if self.threshold is None:
2258
2516
  query = query.limit(self.limit).offset(self.kwargs.get("current_offset", 0))
2259
- # else:
2260
- # query = query.limit(limit*100)
2261
- for (
2262
- utt_id,
2263
- file_id,
2264
- file_name,
2265
- begin,
2266
- end,
2267
- ivector,
2268
- speaker_name,
2269
- speaker_id,
2270
- speaker_num_utterances,
2271
- ) in query:
2517
+ for speaker_id, ivector, speaker_name, num_utterances in query:
2272
2518
  if self.stopped is not None and self.stopped.is_set():
2273
2519
  break
2274
2520
  if self.metric is DistanceMetric.plda:
2275
2521
  kaldi_ivector = DoubleVector()
2276
2522
  kaldi_ivector.from_numpy(ivector)
2277
- ivector_normalize_length(kaldi_ivector)
2278
2523
  kaldi_ivector = self.plda.transform_ivector(kaldi_ivector, 1)
2279
2524
  index, distance = self.plda.classify_utterance(
2280
2525
  kaldi_ivector, speaker_plda.test_ivectors, speaker_plda.counts
@@ -2310,18 +2555,26 @@ class SpeakerUtterancesWorker(Worker):
2310
2555
  continue
2311
2556
  if self.progress_callback is not None:
2312
2557
  self.progress_callback.increment_progress(1)
2313
-
2314
- utterance_ids.append(utt_id)
2558
+ if suggested_count < num_utterances:
2559
+ speaker_id, suggested_id = suggested_id, speaker_id
2560
+ speaker_name, suggested_name = suggested_name, speaker_name
2561
+ num_utterances, suggested_count = suggested_count, num_utterances
2562
+ if (speaker_id, suggested_id) in found_set:
2563
+ continue
2564
+ if (suggested_id, speaker_id) in found_set:
2565
+ continue
2566
+ found_set.add((speaker_id, suggested_id))
2567
+ utterance_ids.append(None)
2315
2568
  suggested_indices.append(suggested_id)
2316
2569
  speaker_indices.append(speaker_id)
2317
- utterance_name = f"{file_name} ({begin:.3f}-{end:.3f})"
2570
+ utterance_name = ""
2318
2571
  data.append(
2319
2572
  [
2320
2573
  utterance_name,
2321
2574
  suggested_name,
2322
2575
  suggested_count,
2323
2576
  speaker_name,
2324
- speaker_num_utterances,
2577
+ num_utterances,
2325
2578
  distance,
2326
2579
  ]
2327
2580
  )
@@ -2846,11 +3099,16 @@ class FileUtterancesWorker(Worker):
2846
3099
  .options(
2847
3100
  selectinload(Utterance.phone_intervals).options(
2848
3101
  joinedload(PhoneInterval.phone, innerjoin=True),
2849
- joinedload(PhoneInterval.workflow, innerjoin=True),
3102
+ ),
3103
+ selectinload(Utterance.reference_phone_intervals).options(
3104
+ joinedload(ReferencePhoneInterval.phone, innerjoin=True),
2850
3105
  ),
2851
3106
  selectinload(Utterance.word_intervals).options(
2852
3107
  joinedload(WordInterval.word, innerjoin=True),
2853
- joinedload(WordInterval.workflow, innerjoin=True),
3108
+ joinedload(WordInterval.pronunciation, innerjoin=True),
3109
+ ),
3110
+ selectinload(Utterance.reference_word_intervals).options(
3111
+ joinedload(ReferenceWordInterval.word, innerjoin=True),
2854
3112
  ),
2855
3113
  joinedload(Utterance.speaker, innerjoin=True),
2856
3114
  )
@@ -3146,45 +3404,82 @@ class WaveformWorker(Worker): # pragma: no cover
3146
3404
 
3147
3405
 
3148
3406
  class SpeakerTierWorker(Worker): # pragma: no cover
3149
- def __init__(self, session, file_id, *args, query_alignment=False):
3407
+ def __init__(
3408
+ self,
3409
+ session,
3410
+ file_id,
3411
+ *args,
3412
+ query_alignment=False,
3413
+ utterance_id=None,
3414
+ begin=None,
3415
+ end=None,
3416
+ ):
3150
3417
  super().__init__("Generating speaker tier", *args)
3151
3418
  self.query_alignment = query_alignment
3152
3419
  self.session = session
3153
3420
  self.file_id = file_id
3421
+ self.utterance_id = utterance_id
3422
+ self.begin = begin
3423
+ self.end = end
3154
3424
  self.settings = AnchorSettings()
3155
3425
 
3156
3426
  def run(self):
3157
3427
  if self.session is None:
3158
3428
  return
3159
3429
  with self.session() as session:
3160
- show_phones = (
3161
- self.settings.value(self.settings.TIER_ALIGNED_PHONES_VISIBLE)
3162
- or self.settings.value(self.settings.TIER_TRANSCRIBED_PHONES_VISIBLE)
3163
- or self.settings.value(self.settings.TIER_REFERENCE_PHONES_VISIBLE)
3164
- )
3165
- show_words = self.settings.value(
3166
- self.settings.TIER_ALIGNED_WORDS_VISIBLE
3167
- ) or self.settings.value(self.settings.TIER_TRANSCRIBED_WORDS_VISIBLE)
3430
+ file = session.get(File, self.file_id)
3431
+ show_phones = self.settings.value(
3432
+ self.settings.TIER_ALIGNED_PHONES_VISIBLE
3433
+ ) or self.settings.value(self.settings.TIER_REFERENCE_PHONES_VISIBLE)
3434
+ show_words = self.settings.value(self.settings.TIER_ALIGNED_WORDS_VISIBLE)
3168
3435
  utterances = session.query(Utterance)
3169
3436
  if self.query_alignment:
3170
3437
  if show_phones:
3171
3438
  utterances = utterances.options(
3172
3439
  selectinload(Utterance.phone_intervals).options(
3173
3440
  joinedload(PhoneInterval.phone, innerjoin=True),
3174
- joinedload(PhoneInterval.workflow, innerjoin=True),
3175
- )
3441
+ ),
3442
+ selectinload(Utterance.reference_phone_intervals).options(
3443
+ joinedload(ReferencePhoneInterval.phone, innerjoin=True),
3444
+ ),
3176
3445
  )
3177
3446
  if show_words:
3178
3447
  utterances = utterances.options(
3179
3448
  selectinload(Utterance.word_intervals).options(
3180
3449
  joinedload(WordInterval.word, innerjoin=True),
3181
- joinedload(WordInterval.workflow, innerjoin=True),
3182
3450
  ),
3183
3451
  )
3184
- utterances = utterances.filter(Utterance.file_id == self.file_id).order_by(
3185
- Utterance.begin
3186
- )
3187
- self.signals.result.emit((utterances.all(), self.file_id))
3452
+ utterances = utterances.options(
3453
+ selectinload(Utterance.reference_word_intervals).options(
3454
+ joinedload(ReferenceWordInterval.word, innerjoin=True),
3455
+ ),
3456
+ )
3457
+ utterances = utterances.filter(
3458
+ Utterance.file_id == self.file_id,
3459
+ ).order_by(Utterance.begin)
3460
+
3461
+ if self.utterance_id is not None:
3462
+ utterances = utterances.filter(Utterance.id == self.utterance_id)
3463
+ if file.duration > 500 and self.begin is not None and self.end is not None:
3464
+ cached_begin = self.begin - 30
3465
+ cached_end = self.end + 30
3466
+ utterances = utterances.filter(
3467
+ Utterance.end >= cached_begin,
3468
+ Utterance.begin <= cached_end,
3469
+ )
3470
+ else:
3471
+ cached_begin = None
3472
+ cached_end = None
3473
+ utterances = utterances.all()
3474
+ if (
3475
+ file.duration > 500
3476
+ and self.begin is not None
3477
+ and self.end is not None
3478
+ and utterances
3479
+ ):
3480
+ cached_begin = min(cached_begin, utterances[0].begin)
3481
+ cached_end = max(cached_end, utterances[-1].end)
3482
+ self.signals.result.emit((utterances, self.file_id, cached_begin, cached_end))
3188
3483
 
3189
3484
 
3190
3485
  class SpectrogramWorker(Worker): # pragma: no cover
@@ -3444,7 +3739,9 @@ class ImportCorpusWorker(FunctionWorker): # pragma: no cover
3444
3739
  ).execution_options(logging_token="inspect_dataset_engine")
3445
3740
  with sqlalchemy.orm.Session(engine) as session:
3446
3741
  dictionary = (
3447
- session.query(Dictionary.path).filter(Dictionary.path != "").first()
3742
+ session.query(Dictionary.path)
3743
+ .filter(Dictionary.path != "", Dictionary.path != ".")
3744
+ .first()
3448
3745
  )
3449
3746
  if dictionary is not None:
3450
3747
  self.dictionary_path = dictionary[0]
@@ -3452,7 +3749,11 @@ class ImportCorpusWorker(FunctionWorker): # pragma: no cover
3452
3749
  pass
3453
3750
  try:
3454
3751
  if dataset_type is DatasetType.NONE:
3455
- if self.dictionary_path and os.path.exists(self.dictionary_path):
3752
+ if (
3753
+ self.dictionary_path
3754
+ and os.path.exists(self.dictionary_path)
3755
+ and str(self.dictionary_path) != "."
3756
+ ):
3456
3757
  self.corpus = AcousticCorpusWithPronunciations(
3457
3758
  corpus_directory=self.corpus_path, dictionary_path=self.dictionary_path
3458
3759
  )
@@ -3460,13 +3761,16 @@ class ImportCorpusWorker(FunctionWorker): # pragma: no cover
3460
3761
  self.corpus.dictionary_setup()
3461
3762
  self.corpus.write_lexicon_information(write_disambiguation=False)
3462
3763
  else:
3463
- self.corpus = AcousticCorpus(corpus_directory=self.corpus_path)
3764
+ self.corpus = AcousticCorpusWithPronunciations(
3765
+ corpus_directory=self.corpus_path
3766
+ )
3464
3767
  self.corpus.initialize_database()
3465
- self.corpus._load_corpus()
3768
+ self.corpus.create_default_dictionary()
3466
3769
 
3467
3770
  elif (
3468
3771
  dataset_type is DatasetType.ACOUSTIC_CORPUS_WITH_DICTIONARY
3469
3772
  and self.dictionary_path
3773
+ and str(self.dictionary_path) != "."
3470
3774
  and os.path.exists(self.dictionary_path)
3471
3775
  ):
3472
3776
  self.corpus = AcousticCorpusWithPronunciations(
@@ -3474,8 +3778,9 @@ class ImportCorpusWorker(FunctionWorker): # pragma: no cover
3474
3778
  )
3475
3779
  self.corpus.inspect_database()
3476
3780
  else:
3477
- self.corpus = AcousticCorpus(corpus_directory=self.corpus_path)
3781
+ self.corpus = AcousticCorpusWithPronunciations(corpus_directory=self.corpus_path)
3478
3782
  self.corpus.inspect_database()
3783
+ self.corpus.create_default_dictionary()
3479
3784
  self.corpus._load_corpus()
3480
3785
  if self.dictionary_path and os.path.exists(self.dictionary_path):
3481
3786
  self.corpus.initialize_jobs()
@@ -3592,10 +3897,7 @@ class LoadReferenceWorker(FunctionWorker): # pragma: no cover
3592
3897
  self.settings.sync()
3593
3898
  try:
3594
3899
  with self.corpus.session() as session:
3595
- session.query(PhoneInterval).filter(
3596
- PhoneInterval.workflow_id == CorpusWorkflow.id,
3597
- CorpusWorkflow.workflow_type == WorkflowType.reference,
3598
- ).delete(synchronize_session=False)
3900
+ session.query(ReferencePhoneInterval).delete(synchronize_session=False)
3599
3901
  session.query(CorpusWorkflow).filter(
3600
3902
  CorpusWorkflow.workflow_type == WorkflowType.reference
3601
3903
  ).delete(synchronize_session=False)
@@ -3794,8 +4096,8 @@ class ImportIvectorExtractorWorker(FunctionWorker): # pragma: no cover
3794
4096
  if not self.model_path:
3795
4097
  return
3796
4098
  try:
3797
- if str(self.model_path) == "speechbrain":
3798
- model = "speechbrain"
4099
+ if str(self.model_path) in {"speechbrain", "pyannote"}:
4100
+ model = str(self.model_path)
3799
4101
  else:
3800
4102
  model = IvectorExtractorModel(self.model_path)
3801
4103
  except Exception:
@@ -3832,36 +4134,20 @@ class AlignUtteranceWorker(FunctionWorker): # pragma: no cover
3832
4134
  )
3833
4135
  .get(self.utterance_id)
3834
4136
  )
3835
- workflow = self.corpus_model.corpus.get_latest_workflow_run(
3836
- WorkflowType.online_alignment, session
3837
- )
3838
-
3839
- alignment_workflows = [
3840
- x
3841
- for x, in session.query(CorpusWorkflow.id).filter(
3842
- CorpusWorkflow.workflow_type.in_(
3843
- [WorkflowType.online_alignment, WorkflowType.alignment]
3844
- )
3845
- )
3846
- ]
3847
4137
  session.query(PhoneInterval).filter(
3848
4138
  PhoneInterval.utterance_id == utterance.id
3849
- ).filter(PhoneInterval.workflow_id.in_(alignment_workflows)).delete(
3850
- synchronize_session=False
3851
- )
4139
+ ).delete(synchronize_session=False)
3852
4140
  session.flush()
3853
4141
  session.query(WordInterval).filter(
3854
4142
  WordInterval.utterance_id == utterance.id
3855
- ).filter(WordInterval.workflow_id.in_(alignment_workflows)).delete(
3856
- synchronize_session=False
3857
- )
4143
+ ).delete(synchronize_session=False)
3858
4144
  session.flush()
3859
4145
  ctm = align_utterance_online(
3860
4146
  self.corpus_model.acoustic_model,
3861
4147
  utterance.to_kalpy(),
3862
4148
  self.corpus_model.align_lexicon_compiler,
3863
4149
  )
3864
- update_utterance_intervals(session, utterance, workflow.id, ctm)
4150
+ update_utterance_intervals(session, utterance, ctm)
3865
4151
  except Exception:
3866
4152
  exctype, value = sys.exc_info()[:2]
3867
4153
  self.signals.error.emit((exctype, value, traceback.format_exc()))
@@ -4081,12 +4367,8 @@ class AlignmentWorker(FunctionWorker): # pragma: no cover
4081
4367
  )
4082
4368
  ]
4083
4369
 
4084
- session.query(PhoneInterval).filter(
4085
- PhoneInterval.workflow_id.in_(alignment_workflows),
4086
- ).delete(synchronize_session=False)
4087
- session.query(WordInterval).filter(
4088
- WordInterval.workflow_id.in_(alignment_workflows)
4089
- ).delete(synchronize_session=False)
4370
+ session.query(PhoneInterval).delete(synchronize_session=False)
4371
+ session.query(WordInterval).delete(synchronize_session=False)
4090
4372
  session.query(CorpusWorkflow).filter(
4091
4373
  CorpusWorkflow.id.in_(alignment_workflows)
4092
4374
  ).delete(synchronize_session=False)
@@ -4163,7 +4445,7 @@ class ComputeIvectorWorker(FunctionWorker): # pragma: no cover
4163
4445
  self.settings.sync()
4164
4446
  diarizer = SpeakerDiarizer(
4165
4447
  ivector_extractor_path=self.corpus_model.ivector_extractor.source
4166
- if self.corpus_model.ivector_extractor != "speechbrain"
4448
+ if self.corpus_model.ivector_extractor not in {"speechbrain", "pyannote"}
4167
4449
  else self.corpus_model.ivector_extractor,
4168
4450
  corpus_directory=self.corpus_model.corpus.corpus_directory,
4169
4451
  cuda=self.settings.value(self.settings.CUDA),
@@ -4174,6 +4456,25 @@ class ComputeIvectorWorker(FunctionWorker): # pragma: no cover
4174
4456
  if self.reset:
4175
4457
  logger.info("Resetting ivectors...")
4176
4458
  self.corpus_model.corpus.reset_features()
4459
+ else:
4460
+ time.sleep(1.0)
4461
+ with self.corpus_model.corpus.session() as session:
4462
+ logger.debug("Dropping indexes...")
4463
+ session.execute(
4464
+ sqlalchemy.text("DROP INDEX IF EXISTS utterance_xvector_index;")
4465
+ )
4466
+ session.execute(sqlalchemy.text("DROP INDEX IF EXISTS speaker_xvector_index;"))
4467
+ session.execute(
4468
+ sqlalchemy.text("DROP INDEX IF EXISTS utterance_ivector_index;")
4469
+ )
4470
+ session.execute(sqlalchemy.text("DROP INDEX IF EXISTS speaker_ivector_index;"))
4471
+ session.execute(
4472
+ sqlalchemy.text("DROP INDEX IF EXISTS utterance_plda_vector_index;")
4473
+ )
4474
+ session.execute(
4475
+ sqlalchemy.text("DROP INDEX IF EXISTS speaker_plda_vector_index;")
4476
+ )
4477
+ session.commit()
4177
4478
  diarizer.inspect_database()
4178
4479
  diarizer.initialize_jobs()
4179
4480
  diarizer.corpus_output_directory = self.corpus_model.corpus.corpus_output_directory
@@ -4217,7 +4518,7 @@ class ComputePldaWorker(FunctionWorker): # pragma: no cover
4217
4518
  self.settings.sync()
4218
4519
  diarizer = SpeakerDiarizer(
4219
4520
  ivector_extractor_path=self.ivector_extractor.source
4220
- if self.ivector_extractor != "speechbrain"
4521
+ if self.ivector_extractor not in {"speechbrain", "pyannote"}
4221
4522
  else self.ivector_extractor,
4222
4523
  corpus_directory=self.corpus.corpus_directory,
4223
4524
  cuda=self.settings.value(self.settings.CUDA),
@@ -4266,7 +4567,7 @@ class ClusterUtterancesWorker(FunctionWorker): # pragma: no cover
4266
4567
  self.parameters["expected_num_speakers"] = self.corpus.num_speakers
4267
4568
  diarizer = SpeakerDiarizer(
4268
4569
  ivector_extractor_path=self.ivector_extractor.source
4269
- if self.ivector_extractor != "speechbrain"
4570
+ if self.ivector_extractor not in {"speechbrain", "pyannote"}
4270
4571
  else self.ivector_extractor,
4271
4572
  corpus_directory=self.corpus.corpus_directory,
4272
4573
  cuda=self.settings.value(self.settings.CUDA),
@@ -4319,7 +4620,7 @@ class ClassifySpeakersWorker(FunctionWorker): # pragma: no cover
4319
4620
  self.settings.sync()
4320
4621
  diarizer = SpeakerDiarizer(
4321
4622
  ivector_extractor_path=self.ivector_extractor.source
4322
- if self.ivector_extractor != "speechbrain"
4623
+ if self.ivector_extractor not in {"speechbrain", "pyannote"}
4323
4624
  else self.ivector_extractor,
4324
4625
  corpus_directory=self.corpus.corpus_directory, # score_threshold = 0.5,
4325
4626
  cluster=False,
@@ -4424,16 +4725,8 @@ class TranscriptionWorker(FunctionWorker): # pragma: no cover
4424
4725
  )
4425
4726
  try:
4426
4727
  with self.corpus.session() as session:
4427
- session.query(PhoneInterval).filter(
4428
- PhoneInterval.workflow_id == CorpusWorkflow.id
4429
- ).filter(CorpusWorkflow.workflow_type == WorkflowType.transcription).delete(
4430
- synchronize_session="fetch"
4431
- )
4432
- session.query(WordInterval).filter(
4433
- WordInterval.workflow_id == CorpusWorkflow.id
4434
- ).filter(CorpusWorkflow.workflow_type == WorkflowType.transcription).delete(
4435
- synchronize_session="fetch"
4436
- )
4728
+ session.query(PhoneInterval).delete(synchronize_session="fetch")
4729
+ session.query(WordInterval).delete(synchronize_session="fetch")
4437
4730
  session.query(CorpusWorkflow).filter(
4438
4731
  CorpusWorkflow.workflow_type == WorkflowType.transcription
4439
4732
  ).delete()
@@ -4487,20 +4780,8 @@ class ValidationWorker(FunctionWorker): # pragma: no cover
4487
4780
  )
4488
4781
  try:
4489
4782
  with self.corpus.session() as session:
4490
- session.query(PhoneInterval).filter(
4491
- PhoneInterval.workflow_id == CorpusWorkflow.id
4492
- ).filter(
4493
- CorpusWorkflow.workflow_type == WorkflowType.per_speaker_transcription
4494
- ).delete(
4495
- synchronize_session="fetch"
4496
- )
4497
- session.query(WordInterval).filter(
4498
- WordInterval.workflow_id == CorpusWorkflow.id
4499
- ).filter(
4500
- CorpusWorkflow.workflow_type == WorkflowType.per_speaker_transcription
4501
- ).delete(
4502
- synchronize_session="fetch"
4503
- )
4783
+ session.query(PhoneInterval).delete(synchronize_session="fetch")
4784
+ session.query(WordInterval).delete(synchronize_session="fetch")
4504
4785
  session.query(CorpusWorkflow).filter(
4505
4786
  CorpusWorkflow.workflow_type == WorkflowType.per_speaker_transcription
4506
4787
  ).delete()