pelican-nlp 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/Nils_backup/__init__.py +0 -0
- pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
- pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
- pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
- pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
- pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
- pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
- pelican_nlp/Nils_backup/fluency/config.py +231 -0
- pelican_nlp/Nils_backup/fluency/main.py +182 -0
- pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
- pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
- pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
- pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
- pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
- pelican_nlp/Nils_backup/fluency/utils.py +41 -0
- pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
- pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
- pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
- pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
- pelican_nlp/Nils_backup/transcription/test.json +1 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
- pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
- pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
- pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
- pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
- pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
- pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
- pelican_nlp/__init__.py +1 -1
- pelican_nlp/_version.py +1 -0
- pelican_nlp/configuration_files/config_audio.yml +150 -0
- pelican_nlp/configuration_files/config_discourse.yml +104 -0
- pelican_nlp/configuration_files/config_fluency.yml +108 -0
- pelican_nlp/configuration_files/config_general.yml +131 -0
- pelican_nlp/configuration_files/config_morteza.yml +103 -0
- pelican_nlp/praat/__init__.py +29 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/METADATA +15 -14
- pelican_nlp-0.1.3.dist-info/RECORD +75 -0
- pelican_nlp-0.1.1.dist-info/RECORD +0 -39
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/transcription/word_boundaries.py
ADDED
@@ -0,0 +1,190 @@
+import sys
+import numpy as np
+
+from PyQt5.QtWidgets import (
+    QApplication,
+    QMainWindow,
+    QSplitter,
+    QWidget,
+    QVBoxLayout,
+    QTableWidget,
+    QTableWidgetItem,
+)
+from PyQt5.QtCore import Qt, pyqtSignal
+
+from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
+from matplotlib.figure import Figure
+
+
+class WaveformCanvas(FigureCanvas):
+    boundary_changed = pyqtSignal(int, str, float)  # index of word, 'start'/'end', new position
+
+    def __init__(self, parent=None):
+        self.fig = Figure(figsize=(5, 4))
+        super(WaveformCanvas, self).__init__(self.fig)
+        self.setParent(parent)
+
+        self.ax = self.fig.add_subplot(111)
+        self.ax.set_title("Waveform with Word Boundaries")
+
+        # Generate a simple sine wave as the waveform
+        t = np.linspace(0, 10, 1000)
+        self.y = np.sin(2 * np.pi * 1 * t)
+        self.ax.plot(t, self.y)
+
+        # Example words with start and end times
+        self.words = [
+            {'word': 'Word1', 'start': 1.0, 'end': 2.0},
+            {'word': 'Word2', 'start': 2.5, 'end': 4.0},
+            {'word': 'Word3', 'start': 4.5, 'end': 6.0},
+            {'word': 'Word4', 'start': 6.5, 'end': 8.0},
+        ]
+
+        # Draw lines for start and end times
+        self.lines = []  # List of dicts: {'line': line_object, 'word_idx': idx, 'type': 'start'/'end'}
+
+        for idx, word in enumerate(self.words):
+            # Start time line
+            start_line = self.ax.axvline(word['start'], color='green', linestyle='--', picker=5)
+            self.lines.append({'line': start_line, 'word_idx': idx, 'type': 'start'})
+
+            # End time line
+            end_line = self.ax.axvline(word['end'], color='red', linestyle='--', picker=5)
+            self.lines.append({'line': end_line, 'word_idx': idx, 'type': 'end'})
+
+        self.dragging_line = None
+        self.prev_x = None
+
+        self.cid_press = self.fig.canvas.mpl_connect('button_press_event', self.on_press)
+        self.cid_release = self.fig.canvas.mpl_connect('button_release_event', self.on_release)
+        self.cid_motion = self.fig.canvas.mpl_connect('motion_notify_event', self.on_motion)
+
+    def on_press(self, event):
+        if event.inaxes != self.ax:
+            return
+        for line_dict in self.lines:
+            line = line_dict['line']
+            contains, _ = line.contains(event)
+            if contains:
+                self.dragging_line = line_dict
+                self.prev_x = event.xdata
+                break
+
+    def on_motion(self, event):
+        if self.dragging_line is None or event.inaxes != self.ax:
+            return
+        dx = event.xdata - self.prev_x
+        x = self.dragging_line['line'].get_xdata()[0] + dx
+        self.dragging_line['line'].set_xdata([x, x])
+        self.prev_x = event.xdata
+        self.fig.canvas.draw_idle()
+
+    def on_release(self, event):
+        if self.dragging_line is not None:
+            # Update the boundary positions
+            idx = self.dragging_line['word_idx']
+            boundary_type = self.dragging_line['type']
+            new_pos = self.dragging_line['line'].get_xdata()[0]
+            self.words[idx][boundary_type] = new_pos
+            self.dragging_line = None
+            self.prev_x = None
+            # Emit signal to update table
+            self.boundary_changed.emit(idx, boundary_type, new_pos)
+
+
+class MainWindow(QMainWindow):
+    def __init__(self):
+        super().__init__()
+        self.setWindowTitle("Waveform Editor")
+        self.resize(800, 600)
+
+        splitter = QSplitter(Qt.Horizontal)
+
+        # Left panel: Waveform
+        self.waveform_widget = QWidget()
+        waveform_layout = QVBoxLayout(self.waveform_widget)
+        self.canvas = WaveformCanvas(parent=self.waveform_widget)
+        waveform_layout.addWidget(self.canvas)
+        splitter.addWidget(self.waveform_widget)
+
+        # Right panel: Table
+        self.table_widget = QTableWidget()
+        self.table_widget.setColumnCount(3)
+        self.table_widget.setHorizontalHeaderLabels(["Word", "Start Time", "End Time"])
+        splitter.addWidget(self.table_widget)
+
+        self.setCentralWidget(splitter)
+
+        # Connect signals
+        self.canvas.boundary_changed.connect(self.on_boundary_changed)
+        self.table_widget.itemChanged.connect(self.on_table_item_changed)
+
+        self.update_table()
+
+    def update_table(self):
+        self.table_widget.blockSignals(True)  # Prevent signals while updating
+        words = self.canvas.words
+        self.table_widget.setRowCount(len(words))
+        for i, word in enumerate(words):
+            self.table_widget.setItem(i, 0, QTableWidgetItem(word['word']))
+            self.table_widget.setItem(i, 1, QTableWidgetItem(f"{word['start']:.2f}"))
+            self.table_widget.setItem(i, 2, QTableWidgetItem(f"{word['end']:.2f}"))
+        self.table_widget.blockSignals(False)
+
+    def on_boundary_changed(self, idx, boundary_type, new_pos):
+        # Update the table
+        self.table_widget.blockSignals(True)  # Prevent recursive updates
+        if boundary_type == 'start':
+            item = self.table_widget.item(idx, 1)
+            if item is not None:
+                item.setText(f"{new_pos:.2f}")
+        elif boundary_type == 'end':
+            item = self.table_widget.item(idx, 2)
+            if item is not None:
+                item.setText(f"{new_pos:.2f}")
+        self.table_widget.blockSignals(False)
+
+    def on_table_item_changed(self, item):
+        row = item.row()
+        col = item.column()
+        words = self.canvas.words
+
+        if col == 1:
+            # Start time changed
+            try:
+                new_start = float(item.text())
+                words[row]['start'] = new_start
+                # Update the line position in the waveform
+                for line_dict in self.canvas.lines:
+                    if line_dict['word_idx'] == row and line_dict['type'] == 'start':
+                        line = line_dict['line']
+                        line.set_xdata([new_start, new_start])
+                        self.canvas.draw()
+                        break
+            except ValueError:
+                pass  # Invalid input, ignore
+        elif col == 2:
+            # End time changed
+            try:
+                new_end = float(item.text())
+                words[row]['end'] = new_end
+                # Update the line position in the waveform
+                for line_dict in self.canvas.lines:
+                    if line_dict['word_idx'] == row and line_dict['type'] == 'end':
+                        line = line_dict['line']
+                        line.set_xdata([new_end, new_end])
+                        self.canvas.draw()
+                        break
+            except ValueError:
+                pass  # Invalid input, ignore
+
+
+def main():
+    app = QApplication(sys.argv)
+    window = MainWindow()
+    window.show()
+    sys.exit(app.exec_())
+
+
+if __name__ == "__main__":
+    main()
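The two-way sync above hinges on QTableWidget.blockSignals(): update_table() and on_boundary_changed() wrap programmatic writes so that itemChanged only fires for genuine user edits. A minimal standalone sketch of that guard (the table contents are illustrative):

import sys
from PyQt5.QtWidgets import QApplication, QTableWidget, QTableWidgetItem

app = QApplication(sys.argv)
table = QTableWidget(1, 1)
table.itemChanged.connect(lambda item: print("user edit:", item.text()))

table.blockSignals(True)   # programmatic update: itemChanged stays silent
table.setItem(0, 0, QTableWidgetItem("1.00"))
table.blockSignals(False)  # user edits fire the handler again

Without the guard, setItem() would trigger on_table_item_changed(), which moves the canvas line and could loop back into the table.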
pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py
ADDED
@@ -0,0 +1,66 @@
+import csv
+import os
+from typing import List
+
+import audiofile
+import opensmile
+import glob
+
+audio_dir = "C:\\Users\\Acer\\Documents\\Silvia\\UMC_Language and schizophrenia\\Research assistant\\Projects\\Language and sex hormones\\P009"
+files: List[str] = glob.glob('C:\\Users\\Acer\\Documents\\Silvia\\UMC_Language and schizophrenia\\PhD\\Projects\\Interns\\Language and sex hormones\\P009\\*.wav')
+
+
+# Extract features from the whole audio file;
+# to shorten it, fill in the 'duration' argument.
+results = []
+for file in files:
+    print(file)
+    storage = {}
+    signal, sampling_rate = audiofile.read(
+        file,
+        always_2d=True,
+        # duration=
+        # offset=
+    )
+
+    # Extract the eGeMAPSv02 feature set
+    smile = opensmile.Smile(
+        feature_set=opensmile.FeatureSet.eGeMAPSv02,
+        feature_level=opensmile.FeatureLevel.Functionals,
+    )
+
+    ##print(smile.feature_names)
+
+
+    output = smile.process_signal(
+        signal,
+        sampling_rate
+    )
+    print(output)
+
+    # Save output
+    storage['p_nr'] = file[0:4]
+    for feature in smile.feature_names:
+        storage[feature] = output[feature]
+
+    results.append(storage)
+
+csv_columns = results[0].keys()
+csv_file = "C:\\Users\\Acer\\Documents\\Silvia\\UMC_Language and schizophrenia\\Research assistant\\Projects\\Language and sex hormones\\Opensmile_results.csv"
+
+with open(csv_file, 'w', newline="") as csvfile:
+    writer = csv.writer(csvfile)
+    writer.writerow(csv_columns)
+    for i in range(len(results)):  # iterate over participants' results
+        data = results[i]
+        new_array = []
+        for t in data:  # iterate over all features in a participant's results
+            if t == 'p_nr':  # in the first column, insert the participant number
+                new_array.append(files[i].split(os.sep)[-1])  # keep only the file name, not the whole path
+            else:  # for all other columns, insert the feature value
+                new_array.append(float(data[t]))
+        writer.writerow(new_array)
+
+
+
+
pelican_nlp/Silvia_files/prosogram/prosogram.py
ADDED
@@ -0,0 +1,104 @@
+# 1. packages
+import os
+import subprocess
+import shutil
+import glob
+import pandas as pd
+from tqdm import tqdm
+
+# 2. functions
+def extract_profile(wav_path):
+    """
+    Run Prosogram on a single speech file and return its profile.
+
+    Args:
+        wav_path (str): path to the speech file.
+
+    Returns:
+        profile (DataFrame): the result of Prosogram.
+
+    """
+
+    # get directories
+    original_cwd = os.getcwd()
+    script_dir = os.path.dirname(os.path.abspath("__file__"))
+
+    # create a folder to store temporary files
+    tmp_dir = os.path.abspath("./tmp")
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+    os.makedirs(tmp_dir, exist_ok=True)
+
+    try:
+        # copy all Prosogram scripts into the temporary folder
+        for p in glob.glob(os.path.join(script_dir, "*.praat")):
+            shutil.copyfile(p, os.path.join(tmp_dir, os.path.basename(p)))
+
+        shutil.copyfile("Praat.exe", os.path.join(tmp_dir, "Praat.exe"))
+
+        # copy the speech file into the temporary folder
+        wav_copy_path = os.path.join(tmp_dir, os.path.basename(wav_path))
+        shutil.copyfile(wav_path, wav_copy_path)
+
+        # create a Praat job file in the temporary folder with settings
+
+        # the default settings are used here;
+        # if needed, add custom settings to prosogram_variants in job_contents.
+        # For example, if the custom settings are: g=0.32 dg=20 dmin=0.035,
+        # job_contents should be:
+        # """
+        # include prosomain.praat
+        # @prosogram_variants: "file={os.path.abspath(wav_copy_path)} save=yes draw=no
+        # g=0.32 dg=20 dmin=0.035"
+        # exit
+        # """
+
+        job_contents = f"""include prosomain.praat
+@prosogram: "file={os.path.abspath(wav_copy_path)} save=yes draw=no"
+exit"""
+
+        job_path = os.path.join(tmp_dir, "job.praat")
+        with open(job_path, "w") as job_file:
+            job_file.write(job_contents)
+
+        os.chdir(tmp_dir)
+
+        # build the command line to run Praat
+        invocation = f"Praat.exe --run {os.path.abspath(job_path)}"
+        status, output = subprocess.getstatusoutput(invocation)
+        # warn when processing failed (status is not 0)
+        if status != 0:
+            print(output)
+            raise Warning("FAILED: Praat failed! The outputs are printed above.")
+
+        # read the result file
+        profile_path = os.path.join(tmp_dir,
+                                    wav_copy_path.replace(".wav", "") + \
+                                    "_profile_data.txt")
+        profile = pd.read_csv(profile_path, sep="\t")
+
+    finally:
+        # delete all contents of the temporary folder
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        os.chdir(original_cwd)
+
+    return profile
+
+
+# 4. commands
+path = 'Audios/'
+wav_paths = glob.glob(os.path.join(path, "*.wav"))
+
+# collect all data in one data frame
+profile = pd.DataFrame()
+for wav_path in tqdm(wav_paths):
+    profile = pd.concat([profile,
+                         extract_profile(wav_path)])
+
+# add the identifier column
+profile['Audios'] = wav_paths
+profile = profile.reindex(columns=['Audios'] + profile.columns.tolist()[1:])
+
+# save as a CSV file
+profile.to_csv('Prosogram_results.csv', encoding='utf-8-sig', index=False)
+
+
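extract_profile() shells out to Praat with subprocess.getstatusoutput(), which routes through the shell and merges stdout and stderr. A sketch of the same invocation with subprocess.run() and an argument list, which sidesteps shell quoting of paths containing spaces (file names as in the job file written above):

import subprocess

result = subprocess.run(
    ["Praat.exe", "--run", "job.praat"],  # job.praat as written by extract_profile()
    capture_output=True,
    text=True,
)
if result.returncode != 0:
    print(result.stdout, result.stderr)
    raise RuntimeError("FAILED: Praat failed! The outputs are printed above.")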
pelican_nlp/__init__.py
CHANGED
pelican_nlp/_version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.1.3"
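The version is single-sourced in _version.py; it can also be read from the installed distribution's metadata without touching the package internals:

from importlib.metadata import version

print(version("pelican-nlp"))  # "0.1.3" for this wheel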
pelican_nlp/configuration_files/config_audio.yml
ADDED
@@ -0,0 +1,150 @@
+# Master Configuration File
+# ========================
+
+# Basic Settings
+# -------------
+input_file: "audio"  # Options: 'text' or 'audio'
+PATH_TO_PROJECT_FOLDER: "/home/yvespauli/PycharmProjects/Silvia"
+language: "dutch"  # Options: 'german', 'english'
+recompute_everything: true  # If false, reuses previously computed results
+output_document_information: true
+# Task Configuration
+# -----------------
+task_name: "monologue"  # Options: 'fluency', 'interview'
+fluency_task: &fluency_flag false  # Flag for fluency-specific settings
+discourse: &discourse_flag false  # Flag for discourse-specific settings
+corpus_names:  # List of task corpora
+  - "interv-removed"
+
+# Session and Subject Settings
+# --------------------------
+multiple_sessions: true
+number_of_subjects: null  # If null, auto-detected
+number_of_speakers: 1
+subject_speakertag: null  # Speaker tag for subject (e.g., "B")
+
+# Document Structure
+# ----------------
+has_multiple_sections: false
+has_section_titles: false
+section_identification: null  # e.g., "Section:"
+number_of_sections: 1  # If null, auto-detected
+
+# Processing Pipeline
+# -----------------
+pipeline_options:
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+
+# Audio Processing
+# ---------------
+opensmile_feature_extraction: true
+prosogram_extraction: false
+
+opensmile_configurations:
+  always_2d: true
+  duration: null
+  offset: null
+
+  feature_set: opensmile.FeatureSet.eGeMAPSv02
+  feature_level: opensmile.FeatureLevel.Functionals
+
+
+# Metric Extraction
+# ---------------
+metric_to_extract: null  # Options: 'embeddings', 'logits'
+extract_logits: false
+extract_embeddings: false
+
+# Cleaning Options
+# --------------
+cleaning_options:
+  general_cleaning: true
+  remove_punctuation: false
+  lowercase: true
+  remove_brackets_and_bracketcontent: false
+  remove_timestamps: false
+  timestamp_pattern_example: null  # e.g., "#00:00:23-00#"
+  # Fluency-specific options
+  fluency_task: *fluency_flag
+  word_splitter: ';'
+  remove_hyphens: true
+  remove_duplicates: true
+
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+
+# Embedding Options
+# ---------------
+options_embeddings:
+  tokenization_method: "whitespace"  # Options: 'whitespace', 'model'
+  model_name: "fastText"  # Options: 'fastText', 'xlm-roberta-base'
+  pytorch_based_model: false
+  method: "model_instance"
+  max_length: 512
+  clean_embedding_tokens: true
+  remove_punctuation: false
+  lowercase: false
+  keep_speakertags: false
+  semantic-similarity: true
+  window_size: null
+  clean_tokens: true
+  divergence_from_optimality: false
+  output_options:
+    exclude_special_tokens: true
+    remove_'_'_character: true
+    remove_speaker_labels: true
+    remove_punctuation_and_symbols: true
+    remove_brackets_and_content: true
+
+# Logits Options
+# -------------
+options_logits:
+  chunk_size: 128
+  overlap_size: 64
+  tokenization_method: "model"
+  model_name: "DiscoResearch/Llama3-German-8B-32k"
+  remove_punctuation: true
+  lowercase: true
+  keep_speakertags: true
+
+# Analysis Options
+# --------------
+options_semantic-similarity:
+  window_sizes:  # 'all' or window size as integer
+    - 2
+    - 8
+
+options_dis_from_randomness:
+  window_size: 8
+  min_len: null
+  bootstrap: 10000
+  shuffle_mode: 'include0_includeN'
+  parallel_computing: false
+
+# Normalization Options
+# -------------------
+normalization_options:
+  method: "lemmatization"  # Options: 'lemmatization', 'stemming'
+
+# Filename Configuration
+# --------------------
+filename_components:
+  subject: true  # mandatory
+  session: false
+  task: true  # mandatory
+  task_addition: false
+  corpus: true
+  metric: true
+  additional_tags: []
+
+document_information_output:
+  parameters:
+    - subject_ID
+    - recording_length
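The file anchors the fluency flag once (&fluency_flag) and aliases it inside cleaning_options (*fluency_flag), so both keys always load to the same value. A small sketch, assuming the file is read with PyYAML:

import yaml

with open("config_audio.yml") as fh:
    cfg = yaml.safe_load(fh)

assert cfg["fluency_task"] is False
assert cfg["cleaning_options"]["fluency_task"] is False  # alias resolved at load time

# Dotted values load as plain strings, not opensmile objects:
print(cfg["opensmile_configurations"]["feature_set"])  # "opensmile.FeatureSet.eGeMAPSv02"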
pelican_nlp/configuration_files/config_discourse.yml
ADDED
@@ -0,0 +1,104 @@
+# Configuration file for discourse protocols
+#=======================================
+input_file: "text"  # or 'audio'
+discourse: &discourse_flag true
+#=====================================
+
+# General configurations; always adapt
+PATH_TO_PROJECT_FOLDER: "/home/yvespauli/PycharmProjects/Morteza/"
+language: "german"  # Possibly add options for German and English
+
+task_name: "interview"  # Name of the task used to create the input file (e.g., 'fluency', 'interview')
+corpus_names:
+  - "schizophrenia"
+
+metric_to_extract: "embeddings"  # Options: 'logits' or 'embeddings'
+
+number_of_speakers: 3
+subject_speakertag: "B"
+#=========================================================
+
+# Optional configurations; change as preferred, but the defaults are recommended
+fluency_task: &fluency_flag false
+cleaning_options:
+  general_cleaning: true  # General cleaning options used for most text preprocessing; default: true
+  remove_brackets_and_bracketcontent: true
+  remove_timestamps: true
+  timestamp_pattern_example: "#00:00:19-0#"
+  remove_punctuation: false
+  lowercase: false
+  # Options for fluency tasks
+  fluency_task: *fluency_flag
+  word_splitter: null
+  remove_hyphens: null
+  remove_duplicates: null
+
+options_logits:
+  chunk_size: 128
+  overlap_size: 64
+  tokenization_method: "model"
+  #method: "model_instance"  # Options: model, regex, nltk, etc.
+  model_name: "DiscoResearch/Llama3-German-8B-32k"  # Replace with your model instance name
+  remove_punctuation: true
+  lowercase: true
+  keep_speakertags: true
+
+options_embeddings:
+  tokenization_method: "whitespace"  # "model" or "whitespace"
+  max_length: 512  # max sequence length
+  model_name: "fastText"  # e.g. "fastText", "xlm-roberta-base"
+  pytorch_based_model: false
+  method: "model_instance"
+  remove_punctuation: false
+  lowercase: false
+  keep_speakertags: true
+  clean_embedding_tokens: true
+  output_options:
+    exclude_special_tokens: true
+    remove_'_'_character: true
+    remove_speaker_labels: true
+    remove_punctuation_and_symbols: true
+    remove_brackets_and_content: true
+  semantic-similarity: false
+  window_size: null
+  clean_tokens: false
+  divergence_from_optimality: false
+#================================================================================
+
+# Extra configurations:
+pipeline_options:
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+
+has_multiple_sections: false  # evaluated independently
+has_section_titles: false
+section_identification: null  # e.g. "Section:"; 'null' if the file has no sections; use a pattern unlikely to appear elsewhere in the transcript
+number_of_sections: null  # if 'null', the number of sections is detected automatically; specifying it is recommended if known
+
+# Options for extract_embeddings
+window_sizes: [2]
+metric_function: cosine_similarity
+aggregation_functions: mean_of_means
+
+normalization_options:
+  method: "lemmatization"  # Options: lemmatization or stemming
+#================================================================
+
+# Detail configurations; changes optional, mostly used for quality checking / error handling
+number_of_subjects: null  # If 'null', the number of subjects is detected automatically
+multiple_sessions: false  # Set to true if there are multiple sessions per subject
+
+recompute_everything: true  # If 'false', pelican-nlp tries to reuse previously computed results stored on your drive
+
+create_aggregation_of_results: false
+
+
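Here metric_to_extract is "embeddings", and the option blocks are named to match, so a consumer of this config can dispatch on that key. A hypothetical sketch (not pelican-nlp's actual API):

import yaml

with open("config_discourse.yml") as fh:
    cfg = yaml.safe_load(fh)

# metric_to_extract selects the matching options block ("embeddings" here).
opts = cfg["options_" + cfg["metric_to_extract"]]
print(opts["model_name"])  # "fastText"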