lingualabpy 0.0.5__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lingualabpy/__init__.py +25 -20
- lingualabpy/_version.py +34 -0
- lingualabpy/audio/metrics.py +85 -85
- lingualabpy/audio/triming.py +11 -11
- lingualabpy/cli/audio_metrics.py +59 -59
- lingualabpy/cli/audio_triming.py +48 -48
- lingualabpy/cli/docx2json.py +21 -21
- lingualabpy/cli/jsons2csv.py +23 -23
- lingualabpy/cli/plot_sound.py +55 -0
- lingualabpy/io.py +49 -49
- lingualabpy/neuroimaging/__init__.py +0 -0
- lingualabpy/neuroimaging/hcp_connectome.py +143 -0
- lingualabpy/plot.py +23 -0
- lingualabpy/text/parser.py +35 -35
- lingualabpy/text/textgrid.py +41 -41
- lingualabpy/tools/data.py +41 -41
- lingualabpy/tools/interval.py +59 -59
- lingualabpy-0.1.0.dist-info/METADATA +66 -0
- lingualabpy-0.1.0.dist-info/RECORD +26 -0
- {lingualabpy-0.0.5.dist-info → lingualabpy-0.1.0.dist-info}/WHEEL +1 -1
- lingualabpy-0.1.0.dist-info/entry_points.txt +7 -0
- {lingualabpy-0.0.5.dist-info → lingualabpy-0.1.0.dist-info/licenses}/LICENSE +21 -21
- lingualabpy/resources/FilledPauses.praat +0 -536
- lingualabpy/resources/syllablenucleiv3.praat +0 -0
- lingualabpy-0.0.5.dist-info/METADATA +0 -44
- lingualabpy-0.0.5.dist-info/RECORD +0 -23
- lingualabpy-0.0.5.dist-info/entry_points.txt +0 -6
lingualabpy/io.py
CHANGED
|
@@ -1,49 +1,49 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Module contains tools for processing files
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import json
|
|
6
|
-
from docx import Document
|
|
7
|
-
from textgrids import TextGrid
|
|
8
|
-
from pydub import AudioSegment
|
|
9
|
-
|
|
10
|
-
from typing import Union
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
# audio files
|
|
14
|
-
def read_audio(sound_path: str) -> AudioSegment:
|
|
15
|
-
""""""
|
|
16
|
-
return AudioSegment.from_file(sound_path)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
# .docx files
|
|
20
|
-
def read_docx(docx_path: str) -> Document:
|
|
21
|
-
""""""
|
|
22
|
-
return Document(docx_path)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
# .json files
|
|
26
|
-
def read_json(json_path: str) -> Union[list, dict]:
|
|
27
|
-
""""""
|
|
28
|
-
with open(json_path, "r") as file:
|
|
29
|
-
content = json.load(file)
|
|
30
|
-
return content
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def write_json(data: Union[list, dict], json_path: str) -> None:
|
|
34
|
-
""""""
|
|
35
|
-
with open(json_path, "w") as file:
|
|
36
|
-
json.dump(data, file, indent=4)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
# .TextGrid files
|
|
40
|
-
def read_textgrid(textgrid_path: str) -> TextGrid:
|
|
41
|
-
""""""
|
|
42
|
-
textgrid = TextGrid(textgrid_path)
|
|
43
|
-
# Cleaning of the interval text
|
|
44
|
-
for intervals in textgrid.values():
|
|
45
|
-
for interval in intervals:
|
|
46
|
-
interval.text = (
|
|
47
|
-
interval.text.encode().decode("unicode_escape").strip(" \n\r\t")
|
|
48
|
-
)
|
|
49
|
-
return textgrid
|
|
1
|
+
"""
|
|
2
|
+
Module contains tools for processing files
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from docx import Document
|
|
7
|
+
from textgrids import TextGrid
|
|
8
|
+
from pydub import AudioSegment
|
|
9
|
+
|
|
10
|
+
from typing import Union
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# audio files
def read_audio(sound_path: str) -> AudioSegment:
    """Load an audio file from *sound_path* and return it as a pydub AudioSegment."""
    audio = AudioSegment.from_file(sound_path)
    return audio
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# .docx files
def read_docx(docx_path: str) -> Document:
    """Open the Word document at *docx_path* and return the python-docx Document."""
    document = Document(docx_path)
    return document
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# .json files
def read_json(json_path: str) -> Union[list, dict]:
    """Read a JSON file and return its deserialized content.

    Args:
        json_path: path to the JSON file to read.

    Returns:
        The decoded JSON document (a list or a dict).

    Raises:
        json.JSONDecodeError: if the file is not valid JSON.
        OSError: if the file cannot be opened.
    """
    # Explicit UTF-8: without it, the platform locale encoding is used,
    # which breaks on non-ASCII content (e.g. on some Windows setups).
    with open(json_path, "r", encoding="utf-8") as file:
        content = json.load(file)
    return content
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def write_json(data: Union[list, dict], json_path: str) -> None:
    """Serialize *data* to *json_path* as pretty-printed (indent=4) JSON.

    Args:
        data: the JSON-serializable object to write.
        json_path: destination file path (overwritten if it exists).

    Raises:
        TypeError: if *data* is not JSON serializable.
        OSError: if the file cannot be written.
    """
    # Explicit UTF-8 so the output does not depend on the platform locale.
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# .TextGrid files
def read_textgrid(textgrid_path: str) -> TextGrid:
    """Read a Praat TextGrid file and normalise every interval's text.

    Each interval label is stripped of surrounding whitespace and has
    backslash escape sequences decoded.

    Args:
        textgrid_path: path to the .TextGrid file.

    Returns:
        The parsed TextGrid with cleaned interval texts.
    """
    textgrid = TextGrid(textgrid_path)
    # Cleaning of the interval text
    for intervals in textgrid.values():
        for interval in intervals:
            # NOTE(review): unicode_escape assumes the label contains literal
            # backslash escapes; the encode()/decode round-trip can mangle
            # non-ASCII characters — confirm against real transcripts.
            interval.text = (
                interval.text.encode().decode("unicode_escape").strip(" \n\r\t")
            )
    return textgrid
|
|
File without changes
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module processes resting-state fMRI data from the HCP-Young-Adult-2025 release.
|
|
3
|
+
It extracts timeseries from brain regions using an atlas, computes the connectome,
|
|
4
|
+
and saves the results along with the masker report.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import click
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from nilearn.maskers import NiftiLabelsMasker
|
|
14
|
+
from nilearn.connectome import ConnectivityMeasure
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Connectome:
    """Base container for connectome-related file paths and TSV writers.

    Subclasses are expected to populate the path attributes below; the
    ``save_*`` helpers then write their inputs to those locations.
    """

    # Input fMRI image and its brain mask (populated by subclasses).
    path = None
    brainmask = None
    # Output locations (populated by subclasses).
    output_folder = None
    timeseries = None
    report = None
    relmat = None
    relmat_z = None

    def make_output_folder(self):
        """Create ``self.output_folder`` (and parents) if it does not exist."""
        self.output_folder.mkdir(parents=True, exist_ok=True)

    def save_timeseries(self, timeseries):
        """Write the 2-D timeseries array to ``self.timeseries`` as TSV."""
        pd.DataFrame(timeseries).to_csv(self.timeseries, sep="\t", index=False)

    def save_report(self, masker):
        """Render the masker's report to ``self.report`` as HTML."""
        masker.generate_report().save_as_html(self.report)

    def save_connectome(self, connectome):
        """Write the correlation matrix to ``self.relmat`` as TSV."""
        pd.DataFrame(connectome).to_csv(self.relmat, sep="\t", index=False)

    def save_connectome_fisher_z(self, connectome):
        """Write the Fisher z-transformed matrix to ``self.relmat_z`` as TSV."""
        pd.DataFrame(connectome).to_csv(self.relmat_z, sep="\t", index=False)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ConnectomeHcp2025(Connectome):
    """Path validator and filename builder for one HCP-Young-Adult-2025 run."""

    # Matches e.g. .../123456/MNINonLinear/Results/rfMRI_REST1_LR/
    # rfMRI_REST1_LR_hp2000_clean_rclean_tclean.nii.gz, capturing the
    # 6-digit participant id, the run number (1/2), and the phase-encoding
    # direction (LR/RL); backrefs enforce run/pe consistency in the filename.
    HCP_2025_PATTERN = re.compile(
        r"^.*/(?P<participant_id>[0-9]{6})/MNINonLinear/Results/"
        r"rfMRI_REST(?P<run>[12])_(?P<pe>LR|RL)/"
        r"rfMRI_REST(?P=run)_(?P=pe)_hp2000_clean_rclean_tclean\.nii\.gz$"
    )

    def __init__(self, path: Path, output: Path):
        """Validate *path* against the HCP-2025 layout and derive output names.

        Args:
            path: path to a cleaned rs-fMRI NIfTI inside the HCP tree.
            output: root directory under which results are written.

        Raises:
            ValueError: if *path* does not match the expected HCP layout.
        """
        # Check if the path is from the HCP-Young-Adult-2025 release
        hcp_match = self.HCP_2025_PATTERN.match(path.as_posix())
        if not hcp_match:
            raise ValueError(
                f"Invalid HCP-Young-Adult-2025 rs-fMRI path:\n {path}\n"
                "Expected: <participant_id>/MNINonLinear/Results/rfMRI_REST{1,2}_{LR,RL}/rfMRI_REST{1,2}_{LR,RL}_hp2000_clean_rclean_tclean.nii.gz"
            )

        # Helper variables to build filenames
        output = Path(output)
        pid = f"sub-{hcp_match.group('participant_id')}"
        run = f"run-{hcp_match.group('pe')}{hcp_match.group('run')}"
        basename = f"{pid}_task-rest_{run}_seg-SENSAAS"

        # HCP-Young-Adult-2025 input
        self.path = path
        # The brain mask is assumed to sit next to the functional image.
        self.brainmask = path.parent / "brainmask_fs.2.nii.gz"

        # Define output filenames
        self.output_folder = output / pid / "func"
        self.timeseries = self.output_folder / f"{basename}_timeseries.tsv"
        self.report = self.output_folder / f"{basename}_report.html"
        self.relmat = (
            self.output_folder / f"{basename}_meas-PearsonCorrelation_relmat.tsv"
        )
        self.relmat_z = self.output_folder / f"{basename}_meas-FisherZ_relmat.tsv"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@click.command()
@click.option(
    "--output", type=click.Path(), default="results", help="Directory to save outputs"
)
@click.option(
    "--smoothing_fwhm",
    type=float,
    default=5.0,
    help="full-width at half maximum in millimeters of the spatial smoothing to apply to the signal",
)
@click.argument("atlas_path", nargs=1, type=click.Path(exists=True))
@click.argument("lut_path", nargs=1, type=click.Path(exists=True))
@click.argument("rs_path", nargs=1, type=click.Path(exists=True))
def main(atlas_path, lut_path, rs_path, output, smoothing_fwhm):
    """Process resting-state fMRI from the HCP-Young-Adult-2025 release to extract connectome.

    1. Validates input resting-state fMRI data structure

    2. Extracts timeseries using an atlas

    3. Computes Pearson correlations and fisher_z connectomes

    4. Saves timeseries, connectomes, and visualization report

    Args:

        atlas_path (str): Path to atlas NIfTI file defining brain regions

        lut_path (str): Path to lookup table file for atlas labels

        rs_path (str): Path to resting-state fMRI NIfTI file

        output (str): Path to save results

        smoothing_fwhm (float): full-width at half maximum in millimeters of the spatial smoothing to apply to the signal
    """
    # Validates the HCP path layout and derives all output filenames.
    resting_state = ConnectomeHcp2025(path=Path(rs_path), output=Path(output))

    atlas_masker = NiftiLabelsMasker(
        labels_img=atlas_path,
        lut=lut_path,
        mask_img=resting_state.brainmask,
        smoothing_fwhm=smoothing_fwhm,
        standardize="zscore_sample",
        # NOTE(review): t_r=0.72 — presumably the HCP acquisition repetition
        # time in seconds; confirm against the release documentation.
        t_r=0.72,
    )

    correlation_measure = ConnectivityMeasure(
        kind="correlation",
        standardize=False,
        vectorize=False,
    )

    # Extract timeseries and connectomes
    timeseries = atlas_masker.fit_transform(resting_state.path)
    connectome = correlation_measure.fit_transform([timeseries])[0]
    # Clip so arctanh never receives +/-1 (diagonal of a correlation matrix),
    # which would produce infinities in the Fisher z transform.
    connectome_fisher_z = np.arctanh(np.clip(connectome, -0.999999, 0.999999))

    # Save results
    resting_state.make_output_folder()
    resting_state.save_timeseries(timeseries)
    resting_state.save_report(atlas_masker)
    resting_state.save_connectome(connectome)
    resting_state.save_connectome_fisher_z(connectome_fisher_z)
|
lingualabpy/plot.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import matplotlib.pyplot as plt
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def draw_spectrogram(spectrogram, dynamic_range=70):
    """Draw a spectrogram on the current matplotlib axes.

    Args:
        spectrogram: object exposing ``x_grid()``, ``y_grid()``, ``values``,
            ``ymin`` and ``ymax`` — presumably a parselmouth/Praat
            Spectrogram; confirm against callers.
        dynamic_range: dB range below the maximum that remains visible.
    """
    X, Y = spectrogram.x_grid(), spectrogram.y_grid()
    # Convert power values to decibels for display.
    sg_db = 10 * np.log10(spectrogram.values)
    # vmin clips everything more than `dynamic_range` dB below the peak.
    plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap="afmhot")
    plt.ylim([spectrogram.ymin, spectrogram.ymax])
    plt.xlabel("time [s]")
    plt.ylabel("frequency [Hz]")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def draw_pitch(pitch):
    """Overlay a pitch (F0) contour on the current matplotlib axes.

    Args:
        pitch: object exposing ``selected_array``, ``xs()`` and ``ceiling``
            — presumably a parselmouth Pitch; confirm against callers.
    """
    # Extract selected pitch contour, and
    # replace unvoiced samples by NaN to not plot
    # NOTE(review): this mutates the array returned by selected_array —
    # presumably a copy; verify it does not alter the Pitch object itself.
    pitch_values = pitch.selected_array["frequency"]
    pitch_values[pitch_values == 0] = np.nan
    # Larger white marker drawn first as an outline, smaller colored marker
    # on top, so the contour stays visible over a dark spectrogram.
    plt.plot(pitch.xs(), pitch_values, "o", markersize=5, color="w")
    plt.plot(pitch.xs(), pitch_values, "o", markersize=2)
    plt.grid(False)
    plt.ylim(0, pitch.ceiling)
    plt.ylabel("fundamental frequency [Hz]")
|
lingualabpy/text/parser.py
CHANGED
|
@@ -1,35 +1,35 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from docx import Document
|
|
3
|
-
from collections import defaultdict
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def parse_waywithwords(document: Document) -> dict:
|
|
7
|
-
""""""
|
|
8
|
-
waywithwords = {
|
|
9
|
-
"IV": "interviewer",
|
|
10
|
-
"IE": "interviewee",
|
|
11
|
-
}
|
|
12
|
-
|
|
13
|
-
results = defaultdict(list)
|
|
14
|
-
|
|
15
|
-
for para in document.paragraphs:
|
|
16
|
-
try:
|
|
17
|
-
content = para.text.split()
|
|
18
|
-
speaker = content[0]
|
|
19
|
-
transcription = " ".join(content[1:])
|
|
20
|
-
except:
|
|
21
|
-
speaker = None
|
|
22
|
-
|
|
23
|
-
if (
|
|
24
|
-
speaker in waywithwords.keys()
|
|
25
|
-
and not transcription.lower() in waywithwords.values()
|
|
26
|
-
):
|
|
27
|
-
results[waywithwords[speaker]].append(transcription)
|
|
28
|
-
|
|
29
|
-
elif re.findall(r"[0-9][0-9]:[0-5][0-9]:[0-5][0-9]", para.text):
|
|
30
|
-
results["time"].append(para.text)
|
|
31
|
-
|
|
32
|
-
else:
|
|
33
|
-
results["remainder"].append(para.text)
|
|
34
|
-
|
|
35
|
-
return results
|
|
1
|
+
import re
|
|
2
|
+
from docx import Document
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_waywithwords(document: Document) -> dict:
    """Parse a Way With Words transcript into speaker turns.

    Each paragraph is expected to start with a speaker code ("IV" for the
    interviewer, "IE" for the interviewee) followed by the transcription.
    Paragraphs containing a HH:MM:SS timestamp are collected under "time";
    anything else goes under "remainder".

    Args:
        document: the python-docx Document holding the transcript.

    Returns:
        A defaultdict(list) mapping "interviewer" / "interviewee" /
        "time" / "remainder" to the matching paragraph texts.
    """
    waywithwords = {
        "IV": "interviewer",
        "IE": "interviewee",
    }

    results = defaultdict(list)

    for para in document.paragraphs:
        try:
            content = para.text.split()
            speaker = content[0]
            transcription = " ".join(content[1:])
        except IndexError:
            # Empty paragraph: no speaker token to read.  Bind transcription
            # too — the previous bare `except:` left it unbound (or stale
            # from the previous iteration).
            speaker = None
            transcription = ""

        if (
            speaker in waywithwords.keys()
            and transcription.lower() not in waywithwords.values()
        ):
            results[waywithwords[speaker]].append(transcription)

        elif re.findall(r"[0-9][0-9]:[0-5][0-9]:[0-5][0-9]", para.text):
            results["time"].append(para.text)

        else:
            results["remainder"].append(para.text)

    return results
|
lingualabpy/text/textgrid.py
CHANGED
|
@@ -1,41 +1,41 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from textgrids import TextGrid, Interval
|
|
3
|
-
import warnings
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def extract_intervals(textgrid: TextGrid, speakers: list[str]) -> list[list[Interval]]:
|
|
7
|
-
""""""
|
|
8
|
-
# Check if speakers are in the textgrid tiers
|
|
9
|
-
tiers = set(textgrid.keys())
|
|
10
|
-
if not set(speakers).issubset(tiers):
|
|
11
|
-
raise ValueError(
|
|
12
|
-
f"Some speaker(s) '{speakers}' are not a tier in the TextGrid '{tiers}'"
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
# Check if there is other speaker in the textgrid
|
|
16
|
-
if not set(speakers) == tiers:
|
|
17
|
-
warnings.warn(
|
|
18
|
-
f"TextGrid '{tiers}' have more speakers than specify '{speakers}'"
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
# Extraction of intervals with text value
|
|
22
|
-
speakers_intervals = []
|
|
23
|
-
for speaker in speakers:
|
|
24
|
-
speaker_intervals = []
|
|
25
|
-
for interval in textgrid[speaker]:
|
|
26
|
-
if interval.text:
|
|
27
|
-
speaker_intervals.append(interval)
|
|
28
|
-
speakers_intervals.append(speaker_intervals)
|
|
29
|
-
|
|
30
|
-
# Checking if all intervals are correctly labeled
|
|
31
|
-
def interval_qc(intervals, label):
|
|
32
|
-
labels = set([_.text for _ in intervals])
|
|
33
|
-
if not (len(labels) == 1 and labels.pop() == label):
|
|
34
|
-
raise Exception(
|
|
35
|
-
f"TextGrid was not labeled correctly, current label(s) '{labels}', should be '{label}'."
|
|
36
|
-
)
|
|
37
|
-
|
|
38
|
-
for intervals, speaker in zip(speakers_intervals, speakers):
|
|
39
|
-
interval_qc(intervals, speaker)
|
|
40
|
-
|
|
41
|
-
return speakers_intervals
|
|
1
|
+
import re
|
|
2
|
+
from textgrids import TextGrid, Interval
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def extract_intervals(textgrid: TextGrid, speakers: list[str]) -> list[list[Interval]]:
    """Extract the non-empty intervals of each requested speaker tier.

    Args:
        textgrid: the TextGrid to read (one tier per speaker).
        speakers: tier names to extract, in the desired output order.

    Returns:
        One list of labelled intervals per speaker, in *speakers* order.

    Raises:
        ValueError: if a requested speaker is not a tier of the TextGrid.
        Exception: if a speaker's intervals carry a label other than the
            speaker's own name.

    Warns:
        UserWarning: if the TextGrid has tiers beyond the requested speakers.
    """
    # Check if speakers are in the textgrid tiers
    tiers = set(textgrid.keys())
    if not set(speakers).issubset(tiers):
        raise ValueError(
            f"Some speaker(s) '{speakers}' are not a tier in the TextGrid '{tiers}'"
        )

    # Check if there is other speaker in the textgrid
    if not set(speakers) == tiers:
        warnings.warn(
            f"TextGrid '{tiers}' have more speakers than specify '{speakers}'"
        )

    # Extraction of intervals with text value
    speakers_intervals = []
    for speaker in speakers:
        speakers_intervals.append(
            [interval for interval in textgrid[speaker] if interval.text]
        )

    # Checking if all intervals are correctly labeled
    def interval_qc(intervals, label):
        labels = {interval.text for interval in intervals}
        # Compare without mutating: the previous `labels.pop()` emptied the
        # set before it was interpolated into the error message, so the
        # message always showed the wrong (shrunken) label set.
        if labels != {label}:
            raise Exception(
                f"TextGrid was not labeled correctly, current label(s) '{labels}', should be '{label}'."
            )

    for intervals, speaker in zip(speakers_intervals, speakers):
        interval_qc(intervals, speaker)

    return speakers_intervals
|
lingualabpy/tools/data.py
CHANGED
|
@@ -1,41 +1,41 @@
|
|
|
1
|
-
from collections import UserDict
|
|
2
|
-
from pandas import DataFrame
|
|
3
|
-
|
|
4
|
-
from typing import Any, Dict, List
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class UnchangeableDict(UserDict):
|
|
8
|
-
"""A dictionary in which you can add new keys but not modify them in the future."""
|
|
9
|
-
|
|
10
|
-
def __setitem__(self, key: Any, item: Any) -> None:
|
|
11
|
-
try:
|
|
12
|
-
self.__getitem__(key)
|
|
13
|
-
raise ValueError("duplicate key '{}' found".format(key))
|
|
14
|
-
except KeyError:
|
|
15
|
-
return super().__setitem__(key, item)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def merge_participants_to_df(
|
|
19
|
-
data_participants: List[Dict[Any, Any]],
|
|
20
|
-
participant_col: str,
|
|
21
|
-
) -> DataFrame:
|
|
22
|
-
# Check if all data have a `participant_col` key
|
|
23
|
-
participant_col_checks = [_.get(participant_col) for _ in data_participants]
|
|
24
|
-
if not all(participant_col_checks):
|
|
25
|
-
raise Exception(
|
|
26
|
-
f"One of the samples does not contain the '{participant_col}' information."
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
# Check if there are no duplicates in the data
|
|
30
|
-
df_raw = DataFrame.from_dict(data_participants)
|
|
31
|
-
df_melt = df_raw.melt(id_vars=[participant_col]).dropna()
|
|
32
|
-
df_for_test = df_melt.drop(columns="value")
|
|
33
|
-
duplicates = df_for_test[df_for_test.duplicated()]
|
|
34
|
-
|
|
35
|
-
if duplicates.empty:
|
|
36
|
-
return df_melt.pivot(index=participant_col, columns="variable")["value"]
|
|
37
|
-
else:
|
|
38
|
-
error_msg = "There are duplicates in your data "
|
|
39
|
-
for participant_id, variable in duplicates.values:
|
|
40
|
-
error_msg += f"\n{participant_id}: {variable}"
|
|
41
|
-
raise Exception(error_msg)
|
|
1
|
+
from collections import UserDict
|
|
2
|
+
from pandas import DataFrame
|
|
3
|
+
|
|
4
|
+
from typing import Any, Dict, List
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class UnchangeableDict(UserDict):
    """A dictionary in which you can add new keys but not modify them in the future."""

    def __setitem__(self, key: Any, item: Any) -> None:
        """Store *item* under *key*; reject keys that already exist."""
        # Write-once semantics: a key may only ever be assigned once.
        if key in self.data:
            raise ValueError("duplicate key '{}' found".format(key))
        return super().__setitem__(key, item)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def merge_participants_to_df(
    data_participants: List[Dict[Any, Any]],
    participant_col: str,
) -> DataFrame:
    """Merge per-participant records into one wide DataFrame.

    Args:
        data_participants: one dict of measurements per participant; every
            dict must contain *participant_col*.
        participant_col: key holding the participant identifier; becomes
            the index of the returned frame.

    Returns:
        A DataFrame indexed by participant with one column per variable.

    Raises:
        Exception: if a sample lacks *participant_col*, or if the same
            (participant, variable) pair appears more than once.
    """
    # Check if all data have a `participant_col` key.  Test key presence
    # explicitly: the previous truthiness check wrongly rejected legitimate
    # falsy identifiers such as 0 or "".
    participant_col_checks = [
        participant_col in sample and sample[participant_col] is not None
        for sample in data_participants
    ]
    if not all(participant_col_checks):
        raise Exception(
            f"One of the samples does not contain the '{participant_col}' information."
        )

    # Check if there are no duplicates in the data
    df_raw = DataFrame.from_dict(data_participants)
    df_melt = df_raw.melt(id_vars=[participant_col]).dropna()
    df_for_test = df_melt.drop(columns="value")
    duplicates = df_for_test[df_for_test.duplicated()]

    if duplicates.empty:
        return df_melt.pivot(index=participant_col, columns="variable")["value"]
    else:
        error_msg = "There are duplicates in your data "
        for participant_id, variable in duplicates.values:
            error_msg += f"\n{participant_id}: {variable}"
        raise Exception(error_msg)
|
lingualabpy/tools/interval.py
CHANGED
|
@@ -1,59 +1,59 @@
|
|
|
1
|
-
from textgrids import Interval
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def interval_to_list(interval: Interval) -> list[float]:
|
|
5
|
-
""""""
|
|
6
|
-
return [interval.xmin, interval.xmax]
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def is_overlap(interval0: Interval, interval1: Interval) -> bool:
|
|
10
|
-
"""Check if two intervals overlap"""
|
|
11
|
-
return interval0.xmin <= interval1.xmax and interval1.xmin <= interval0.xmax
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def remove_overlap(interval: Interval, interval_to_remove: Interval) -> list[Interval]:
|
|
15
|
-
""""""
|
|
16
|
-
# Return interval as a list if there is no overlap
|
|
17
|
-
if not is_overlap(interval, interval_to_remove):
|
|
18
|
-
return [interval]
|
|
19
|
-
|
|
20
|
-
else:
|
|
21
|
-
updated_intervals = []
|
|
22
|
-
|
|
23
|
-
# If the start of the interval is before the start of the interval to be removed,
|
|
24
|
-
# add the non-overlapping part to the result.
|
|
25
|
-
if interval.xmin < interval_to_remove.xmin:
|
|
26
|
-
updated_intervals.append(
|
|
27
|
-
Interval(xmin=interval.xmin, xmax=interval_to_remove.xmin)
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
# If the end of the interval is after the end of the interval to be removed,
|
|
31
|
-
# add the non-overlapping part to the result.
|
|
32
|
-
if interval.xmax > interval_to_remove.xmax:
|
|
33
|
-
updated_intervals.append(
|
|
34
|
-
Interval(xmin=interval_to_remove.xmax, xmax=interval.xmax)
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
return updated_intervals
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def intervals_masking(
|
|
41
|
-
intervals: list[Interval], intervals_mask: list[Interval]
|
|
42
|
-
) -> list[list[float]]:
|
|
43
|
-
""""""
|
|
44
|
-
# Each intervals mask will be remove from all the intervals
|
|
45
|
-
for interval_to_remove in intervals_mask:
|
|
46
|
-
new_intervals = []
|
|
47
|
-
for interval in intervals:
|
|
48
|
-
|
|
49
|
-
# if the start of the interval is after the end of the mask
|
|
50
|
-
# we can just add the interval they are sorted
|
|
51
|
-
if interval.xmin > interval_to_remove.xmax:
|
|
52
|
-
new_intervals.append(interval)
|
|
53
|
-
|
|
54
|
-
else:
|
|
55
|
-
new_intervals += remove_overlap(interval, interval_to_remove)
|
|
56
|
-
|
|
57
|
-
intervals = new_intervals
|
|
58
|
-
|
|
59
|
-
return [interval_to_list(_) for _ in intervals]
|
|
1
|
+
from textgrids import Interval
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def interval_to_list(interval: Interval) -> list[float]:
    """Return the interval's boundaries as a ``[xmin, xmax]`` pair."""
    bounds = [interval.xmin, interval.xmax]
    return bounds
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def is_overlap(interval0: Interval, interval1: Interval) -> bool:
    """Check if two intervals overlap (touching endpoints count as overlap)."""
    # De Morgan form: the intervals are disjoint iff one ends strictly
    # before the other begins.
    disjoint = interval0.xmin > interval1.xmax or interval1.xmin > interval0.xmax
    return not disjoint
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def remove_overlap(interval: Interval, interval_to_remove: Interval) -> list[Interval]:
    """Subtract *interval_to_remove* from *interval*.

    Returns 0, 1, or 2 intervals: the parts of *interval* that do not
    overlap *interval_to_remove* (an empty list if it is fully covered,
    two intervals if the removed span is strictly inside it).
    """
    # Return interval as a list if there is no overlap
    if not is_overlap(interval, interval_to_remove):
        return [interval]

    else:
        updated_intervals = []

        # If the start of the interval is before the start of the interval to be removed,
        # add the non-overlapping part to the result.
        # NOTE(review): the new Interval is built from xmin/xmax only, so the
        # original interval's text is not carried over — confirm intended.
        if interval.xmin < interval_to_remove.xmin:
            updated_intervals.append(
                Interval(xmin=interval.xmin, xmax=interval_to_remove.xmin)
            )

        # If the end of the interval is after the end of the interval to be removed,
        # add the non-overlapping part to the result.
        if interval.xmax > interval_to_remove.xmax:
            updated_intervals.append(
                Interval(xmin=interval_to_remove.xmax, xmax=interval.xmax)
            )

        return updated_intervals
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def intervals_masking(
    intervals: list[Interval], intervals_mask: list[Interval]
) -> list[list[float]]:
    """Remove every masked span from *intervals*.

    Args:
        intervals: source intervals to be trimmed.
        intervals_mask: spans to cut out of the source intervals.

    Returns:
        The surviving (possibly split) intervals as ``[xmin, xmax]`` pairs.
    """
    # Each mask interval is subtracted from all the source intervals in turn.
    for interval_to_remove in intervals_mask:
        new_intervals = []
        for interval in intervals:

            # If the start of the interval is after the end of the mask the
            # interval is kept unchanged — the original comment suggests the
            # inputs are assumed sorted; remove_overlap() is a no-op for
            # non-overlapping pairs anyway, so this is only a fast path.
            if interval.xmin > interval_to_remove.xmax:
                new_intervals.append(interval)

            else:
                new_intervals += remove_overlap(interval, interval_to_remove)

        intervals = new_intervals

    return [interval_to_list(_) for _ in intervals]
|