pelican-nlp 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (43)
  1. pelican_nlp/Nils_backup/__init__.py +0 -0
  2. pelican_nlp/Nils_backup/extract_acoustic_features.py +274 -0
  3. pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  4. pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +186 -0
  5. pelican_nlp/Nils_backup/fluency/behavioral_data.py +42 -0
  6. pelican_nlp/Nils_backup/fluency/check_duplicates.py +169 -0
  7. pelican_nlp/Nils_backup/fluency/coherence.py +653 -0
  8. pelican_nlp/Nils_backup/fluency/config.py +231 -0
  9. pelican_nlp/Nils_backup/fluency/main.py +182 -0
  10. pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +466 -0
  11. pelican_nlp/Nils_backup/fluency/plot_fluency.py +573 -0
  12. pelican_nlp/Nils_backup/fluency/plotting_utils.py +170 -0
  13. pelican_nlp/Nils_backup/fluency/questionnaires_data.py +43 -0
  14. pelican_nlp/Nils_backup/fluency/stats_fluency.py +930 -0
  15. pelican_nlp/Nils_backup/fluency/utils.py +41 -0
  16. pelican_nlp/Nils_backup/speaker_diarization_Nils.py +328 -0
  17. pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  18. pelican_nlp/Nils_backup/transcription/annotation_tool.py +1001 -0
  19. pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +1122 -0
  20. pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +985 -0
  21. pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +7948 -0
  22. pelican_nlp/Nils_backup/transcription/test.json +1 -0
  23. pelican_nlp/Nils_backup/transcription/transcribe_audio.py +314 -0
  24. pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +695 -0
  25. pelican_nlp/Nils_backup/transcription/transcription.py +801 -0
  26. pelican_nlp/Nils_backup/transcription/transcription_gui.py +955 -0
  27. pelican_nlp/Nils_backup/transcription/word_boundaries.py +190 -0
  28. pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +66 -0
  29. pelican_nlp/Silvia_files/prosogram/prosogram.py +104 -0
  30. pelican_nlp/__init__.py +1 -1
  31. pelican_nlp/_version.py +1 -0
  32. pelican_nlp/configuration_files/config_audio.yml +150 -0
  33. pelican_nlp/configuration_files/config_discourse.yml +104 -0
  34. pelican_nlp/configuration_files/config_fluency.yml +108 -0
  35. pelican_nlp/configuration_files/config_general.yml +131 -0
  36. pelican_nlp/configuration_files/config_morteza.yml +103 -0
  37. pelican_nlp/praat/__init__.py +29 -0
  38. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/METADATA +15 -14
  39. pelican_nlp-0.1.3.dist-info/RECORD +75 -0
  40. pelican_nlp-0.1.1.dist-info/RECORD +0 -39
  41. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/WHEEL +0 -0
  42. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/licenses/LICENSE +0 -0
  43. {pelican_nlp-0.1.1.dist-info → pelican_nlp-0.1.3.dist-info}/top_level.txt +0 -0
pelican_nlp/Nils_backup/transcription/word_boundaries.py ADDED
@@ -0,0 +1,190 @@
+ import sys
+ import numpy as np
+
+ from PyQt5.QtWidgets import (
+     QApplication,
+     QMainWindow,
+     QSplitter,
+     QWidget,
+     QVBoxLayout,
+     QTableWidget,
+     QTableWidgetItem,
+ )
+ from PyQt5.QtCore import Qt, pyqtSignal
+
+ from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
+ from matplotlib.figure import Figure
+
+
+ class WaveformCanvas(FigureCanvas):
+     boundary_changed = pyqtSignal(int, str, float)  # index of word, 'start'/'end', new position
+
+     def __init__(self, parent=None):
+         self.fig = Figure(figsize=(5, 4))
+         super(WaveformCanvas, self).__init__(self.fig)
+         self.setParent(parent)
+
+         self.ax = self.fig.add_subplot(111)
+         self.ax.set_title("Waveform with Word Boundaries")
+
+         # Generate a simple sine wave as the waveform
+         t = np.linspace(0, 10, 1000)
+         self.y = np.sin(2 * np.pi * 1 * t)
+         self.ax.plot(t, self.y)
+
+         # Example words with start and end times
+         self.words = [
+             {'word': 'Word1', 'start': 1.0, 'end': 2.0},
+             {'word': 'Word2', 'start': 2.5, 'end': 4.0},
+             {'word': 'Word3', 'start': 4.5, 'end': 6.0},
+             {'word': 'Word4', 'start': 6.5, 'end': 8.0},
+         ]
+
+         # Draw lines for start and end times
+         self.lines = []  # List of dicts: {'line': line_object, 'word_idx': idx, 'type': 'start'/'end'}
+
+         for idx, word in enumerate(self.words):
+             # Start time line
+             start_line = self.ax.axvline(word['start'], color='green', linestyle='--', picker=5)
+             self.lines.append({'line': start_line, 'word_idx': idx, 'type': 'start'})
+
+             # End time line
+             end_line = self.ax.axvline(word['end'], color='red', linestyle='--', picker=5)
+             self.lines.append({'line': end_line, 'word_idx': idx, 'type': 'end'})
+
+         self.dragging_line = None
+         self.prev_x = None
+
+         self.cid_press = self.fig.canvas.mpl_connect('button_press_event', self.on_press)
+         self.cid_release = self.fig.canvas.mpl_connect('button_release_event', self.on_release)
+         self.cid_motion = self.fig.canvas.mpl_connect('motion_notify_event', self.on_motion)
+
+     def on_press(self, event):
+         if event.inaxes != self.ax:
+             return
+         for line_dict in self.lines:
+             line = line_dict['line']
+             contains, _ = line.contains(event)
+             if contains:
+                 self.dragging_line = line_dict
+                 self.prev_x = event.xdata
+                 break
+
+     def on_motion(self, event):
+         if self.dragging_line is None or event.inaxes != self.ax:
+             return
+         dx = event.xdata - self.prev_x
+         x = self.dragging_line['line'].get_xdata()[0] + dx
+         self.dragging_line['line'].set_xdata([x, x])
+         self.prev_x = event.xdata
+         self.fig.canvas.draw_idle()
+
+     def on_release(self, event):
+         if self.dragging_line is not None:
+             # Update the boundary positions
+             idx = self.dragging_line['word_idx']
+             boundary_type = self.dragging_line['type']
+             new_pos = self.dragging_line['line'].get_xdata()[0]
+             self.words[idx][boundary_type] = new_pos
+             self.dragging_line = None
+             self.prev_x = None
+             # Emit signal to update table
+             self.boundary_changed.emit(idx, boundary_type, new_pos)
+
+
+ class MainWindow(QMainWindow):
+     def __init__(self):
+         super().__init__()
+         self.setWindowTitle("Waveform Editor")
+         self.resize(800, 600)
+
+         splitter = QSplitter(Qt.Horizontal)
+
+         # Left panel: Waveform
+         self.waveform_widget = QWidget()
+         waveform_layout = QVBoxLayout(self.waveform_widget)
+         self.canvas = WaveformCanvas(parent=self.waveform_widget)
+         waveform_layout.addWidget(self.canvas)
+         splitter.addWidget(self.waveform_widget)
+
+         # Right panel: Table
+         self.table_widget = QTableWidget()
+         self.table_widget.setColumnCount(3)
+         self.table_widget.setHorizontalHeaderLabels(["Word", "Start Time", "End Time"])
+         splitter.addWidget(self.table_widget)
+
+         self.setCentralWidget(splitter)
+
+         # Connect signals
+         self.canvas.boundary_changed.connect(self.on_boundary_changed)
+         self.table_widget.itemChanged.connect(self.on_table_item_changed)
+
+         self.update_table()
+
+     def update_table(self):
+         self.table_widget.blockSignals(True)  # Prevent signals while updating
+         words = self.canvas.words
+         self.table_widget.setRowCount(len(words))
+         for i, word in enumerate(words):
+             self.table_widget.setItem(i, 0, QTableWidgetItem(word['word']))
+             self.table_widget.setItem(i, 1, QTableWidgetItem(f"{word['start']:.2f}"))
+             self.table_widget.setItem(i, 2, QTableWidgetItem(f"{word['end']:.2f}"))
+         self.table_widget.blockSignals(False)
+
+     def on_boundary_changed(self, idx, boundary_type, new_pos):
+         # Update the table
+         self.table_widget.blockSignals(True)  # Prevent recursive updates
+         if boundary_type == 'start':
+             item = self.table_widget.item(idx, 1)
+             if item is not None:
+                 item.setText(f"{new_pos:.2f}")
+         elif boundary_type == 'end':
+             item = self.table_widget.item(idx, 2)
+             if item is not None:
+                 item.setText(f"{new_pos:.2f}")
+         self.table_widget.blockSignals(False)
+
+     def on_table_item_changed(self, item):
+         row = item.row()
+         col = item.column()
+         words = self.canvas.words
+
+         if col == 1:
+             # Start time changed
+             try:
+                 new_start = float(item.text())
+                 words[row]['start'] = new_start
+                 # Update the line position in the waveform
+                 for line_dict in self.canvas.lines:
+                     if line_dict['word_idx'] == row and line_dict['type'] == 'start':
+                         line = line_dict['line']
+                         line.set_xdata([new_start, new_start])
+                         self.canvas.draw()
+                         break
+             except ValueError:
+                 pass  # Invalid input, ignore
+         elif col == 2:
+             # End time changed
+             try:
+                 new_end = float(item.text())
+                 words[row]['end'] = new_end
+                 # Update the line position in the waveform
+                 for line_dict in self.canvas.lines:
+                     if line_dict['word_idx'] == row and line_dict['type'] == 'end':
+                         line = line_dict['line']
+                         line.set_xdata([new_end, new_end])
+                         self.canvas.draw()
+                         break
+             except ValueError:
+                 pass  # Invalid input, ignore
+
+
+ def main():
+     app = QApplication(sys.argv)
+     window = MainWindow()
+     window.show()
+     sys.exit(app.exec_())
+
+
+ if __name__ == "__main__":
+     main()
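
For readers unfamiliar with the event handling in WaveformCanvas above: dragging works by registering press/motion/release callbacks with mpl_connect and hit-testing each axvline with Line2D.contains. A minimal sketch of the same pattern in plain matplotlib, outside Qt — the single boundary line here is hypothetical and stands in for the word boundaries above:

import matplotlib.pyplot as plt

# Sketch of the draggable-axvline pattern used in word_boundaries.py
# (assumes an interactive matplotlib backend).
fig, ax = plt.subplots()
boundary = ax.axvline(0.5, color='green', linestyle='--', picker=5)
state = {'dragging': False}

def on_press(event):
    # Start dragging only if the click actually hits the line
    if event.inaxes == ax and boundary.contains(event)[0]:
        state['dragging'] = True

def on_motion(event):
    if state['dragging'] and event.inaxes == ax and event.xdata is not None:
        boundary.set_xdata([event.xdata, event.xdata])
        fig.canvas.draw_idle()

def on_release(event):
    state['dragging'] = False

for name, cb in [('button_press_event', on_press),
                 ('motion_notify_event', on_motion),
                 ('button_release_event', on_release)]:
    fig.canvas.mpl_connect(name, cb)
plt.show()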
pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py ADDED
@@ -0,0 +1,66 @@
+ import csv
+ import os
+ from typing import List
+
+ import audiofile
+ import opensmile
+ import glob
+
+ audio_dir = "C:\\Users\\Acer\\Documents\\Silvia\\UMC_Language and schizophrenia\\Research assistant\\Projects\\Language and sex hormones\\P009"
+ files: List[str] = glob.glob('C:\\Users\\Acer\\Documents\\Silvia\\UMC_Language and schizophrenia\\PhD\\Projects\\Interns\\Language and sex hormones\\P009\\*.wav')
+
+
+ # extract the whole audio file
+ # if it should be shortened, fill in the argument 'duration'
+ results = []
+ for file in files:
+     print(file)
+     storage = {}
+     signal, sampling_rate = audiofile.read(
+         file,
+         always_2d=True,
+         #duration=
+         #offset=
+     )
+
+     # extract eGeMAPSv02 feature set
+     smile = opensmile.Smile(
+         feature_set=opensmile.FeatureSet.eGeMAPSv02,
+         feature_level=opensmile.FeatureLevel.Functionals,
+     )
+
+     ##print(smile.feature_names)
+
+
+     output = smile.process_signal(
+         signal,
+         sampling_rate
+     )
+     print(output)
+
+     # save output
+     storage['p_nr'] = file[0:4]
+     for feature in smile.feature_names:
+         storage[feature] = output[feature]
+
+     results.append(storage)
+
+ csv_columns = results[0].keys()
+ csv_file = "C:\\Users\\Acer\\Documents\\Silvia\\UMC_Language and schizophrenia\\Research assistant\\Projects\\Language and sex hormones\\Opensmile_results.csv"
+
+ with open(csv_file, 'w', newline="") as csvfile:
+     writer = csv.writer(csvfile)
+     writer.writerow(csv_columns)
+     for i in range(len(results)):  # iterate over participants' results
+         data = results[i]
+         new_array = []
+         for t in data:  # iterate over all features in the participant's results
+             if t == 'p_nr':  # in the first column, insert the participant number
+                 new_array.append(files[i].split(os.sep)[-1])  # the split keeps only the filename (participant number), not the whole path
+             else:  # for all other columns insert the feature value
+                 new_array.append(float(data[t]))
+         writer.writerow(new_array)
+
+
+
+
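The script above hard-codes Windows paths and rebuilds the CSV by hand. For comparison, a hedged sketch of a portable variant using opensmile.Smile.process_file, which returns one pandas row of eGeMAPSv02 functionals per file; the "P009" folder and output filename are assumptions carried over from the script, and the filename is kept as the participant identifier (as the CSV loop above does):

import glob
import os

import opensmile
import pandas as pd

# Sketch only: portable variant of the script above; not part of the package.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

files = sorted(glob.glob(os.path.join("P009", "*.wav")))  # hypothetical folder
results = pd.concat([smile.process_file(f) for f in files])
# One row of functionals per file; keep the filename as identifier column
results.insert(0, "p_nr", [os.path.basename(f) for f in files])
results.to_csv("Opensmile_results.csv", index=False)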
pelican_nlp/Silvia_files/prosogram/prosogram.py ADDED
@@ -0,0 +1,104 @@
+ # 1. packages
+ import os
+ import subprocess
+ import shutil
+ import glob
+ import pandas as pd
+ from tqdm import tqdm
+
+ # 2. functions
+ def extract_profile(wav_path):
+     """
+     Run Prosogram on a single speech file.
+
+     Args:
+         wav_path (str): path to the speech file.
+
+     Returns:
+         profile (DataFrame): the result of Prosogram.
+
+     """
+
+     # get directories
+     original_cwd = os.getcwd()
+     script_dir = os.path.dirname(os.path.abspath("__file__"))
+
+     # create a folder to store temporary files
+     tmp_dir = os.path.abspath("./tmp")
+     shutil.rmtree(tmp_dir, ignore_errors=True)
+     os.makedirs(tmp_dir, exist_ok=True)
+
+     try:
+         # copy all prosogram scripts into the temporary folder
+         for p in glob.glob(os.path.join(script_dir, "*.praat")):
+             shutil.copyfile(p, os.path.join(tmp_dir, os.path.basename(p)))
+
+         shutil.copyfile("Praat.exe", os.path.join(tmp_dir, "Praat.exe"))
+
+         # copy the speech file into the temporary folder
+         wav_copy_path = os.path.join(tmp_dir, os.path.basename(wav_path))
+         shutil.copyfile(wav_path, wav_copy_path)
+
+         # create a praat job file in the temporary folder with settings
+
+         # use default settings here
+         # if needed, add custom settings via prosogram_variants in job_contents
+         # for example, if the custom settings are: g=0.32 dg=20 dmin=0.035
+         # the job_contents should be:
+         # """
+         # include prosomain.praat
+         # @prosogram_variants: "file={os.path.abspath(wav_copy_path)} save=yes draw=no
+         # g=0.32 dg=20 dmin=0.035"
+         # exit
+         # """
+
+         job_contents = f"""include prosomain.praat
+ @prosogram: "file={os.path.abspath(wav_copy_path)} save=yes draw=no"
+ exit"""
+
+         job_path = os.path.join(tmp_dir, "job.praat")
+         with open(job_path, "w") as job_file:
+             job_file.write(job_contents)
+
+         os.chdir(tmp_dir)
+
+         # build the command line to run praat
+         invocation = f"Praat.exe --run {os.path.abspath(job_path)}"
+         status, output = subprocess.getstatusoutput(invocation)
+         # Warn when the processing failed (status is not 0)
+         if status != 0:
+             print(output)
+             raise Warning("FAILED: Praat failed! The outputs are printed above.")
+
+         # Read the result file
+         profile_path = os.path.join(tmp_dir,
+                                     wav_copy_path.replace(".wav", "") +
+                                     "_profile_data.txt")
+         profile = pd.read_csv(profile_path, sep="\t")
+
+     finally:
+         # delete all contents of the temporary folder
+         shutil.rmtree(tmp_dir, ignore_errors=True)
+         os.chdir(original_cwd)
+
+     return profile
+
+
+ # 4. commands
+ path = 'Audios/'
+ wav_paths = glob.glob(os.path.join(path, "*.wav"))
+
+ # collect all data within one data frame
+ profile = pd.DataFrame()
+ for wav_path in tqdm(wav_paths):
+     profile = pd.concat([profile,
+                          extract_profile(wav_path)])
+
+ # add the identifier column
+ profile['Audios'] = wav_paths
+ profile = profile.reindex(columns=['Audios'] + profile.columns.tolist()[1:])
+
+ # save as a csv file
+ profile.to_csv('Prosogram_results.csv', encoding='utf-8-sig', index=False)
+
+
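The script drives Praat through a shell string via subprocess.getstatusoutput. A hedged alternative sketch using subprocess.run in argument-list form, which avoids shell quoting problems with paths containing spaces and raises on a non-zero exit status; "Praat.exe" and "job.praat" are the names used in the script above:

import subprocess

# Sketch only: equivalent invocation with subprocess.run.
# check=True raises CalledProcessError when Praat exits with a failure status.
result = subprocess.run(
    ["Praat.exe", "--run", "job.praat"],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)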
pelican_nlp/__init__.py CHANGED
@@ -1,5 +1,5 @@
  # Version and metadata
- __version__ = "0.1.1"
+ from ._version import __version__
  __author__ = "Yves Pauli"
 
  try:
pelican_nlp/_version.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.1.3"
pelican_nlp/configuration_files/config_audio.yml ADDED
@@ -0,0 +1,150 @@
+ # Master Configuration File
+ # ========================
+
+ # Basic Settings
+ # -------------
+ input_file: "audio" # Options: 'text' or 'audio'
+ PATH_TO_PROJECT_FOLDER: "/home/yvespauli/PycharmProjects/Silvia"
+ language: "dutch" # Options: 'german', 'english'
+ recompute_everything: true # If false, reuses previously computed results
+ output_document_information: true
+ # Task Configuration
+ # -----------------
+ task_name: "monologue" # Options: 'fluency', 'interview'
+ fluency_task: &fluency_flag false # Flag for fluency-specific settings
+ discourse: &discourse_flag false # Flag for discourse-specific settings
+ corpus_names: # List of task corpora
+   - "interv-removed"
+
+ # Session and Subject Settings
+ # --------------------------
+ multiple_sessions: true
+ number_of_subjects: null # If null, auto-detected
+ number_of_speakers: 1
+ subject_speakertag: null # Speaker tag for subject (e.g., "B")
+
+ # Document Structure
+ # ----------------
+ has_multiple_sections: false
+ has_section_titles: false
+ section_identification: null # e.g., "Section:"
+ number_of_sections: 1 # If null, auto-detected
+
+ # Processing Pipeline
+ # -----------------
+ pipeline_options:
+   quality_check: false
+   clean_text: true
+   tokenize_text: false
+   normalize_text: false
+
+ # Audio Processing
+ # ---------------
+ opensmile_feature_extraction: true
+ prosogram_extraction: false
+
+ opensmile_configurations:
+   always_2d: true
+   duration: null
+   offset: null
+
+   feature_set: opensmile.FeatureSet.eGeMAPSv02
+   feature_level: opensmile.FeatureLevel.Functionals
+
+
+ # Metric Extraction
+ # ---------------
+ metric_to_extract: null # Options: 'embeddings', 'logits'
+ extract_logits: false
+ extract_embeddings: false
+
+ # Cleaning Options
+ # --------------
+ cleaning_options:
+   general_cleaning: true
+   remove_punctuation: false
+   lowercase: true
+   remove_brackets_and_bracketcontent: false
+   remove_timestamps: false
+   timestamp_pattern_example: null # e.g., "#00:00:23-00#"
+   # Fluency-specific options
+   fluency_task: *fluency_flag
+   word_splitter: ';'
+   remove_hyphens: true
+   remove_duplicates: true
+
+ general_cleaning_options:
+   strip_whitespace: true
+   merge_multiple_whitespaces: true
+   remove_whitespace_before_punctuation: true
+   merge_newline_characters: true
+   remove_backslashes: true
+
+ # Embedding Options
+ # ---------------
+ options_embeddings:
+   tokenization_method: "whitespace" # Options: 'whitespace', 'model'
+   model_name: "fastText" # Options: 'fastText', 'xlm-roberta-base'
+   pytorch_based_model: false
+   method: "model_instance"
+   max_length: 512
+   clean_embedding_tokens: true
+   remove_punctuation: false
+   lowercase: false
+   keep_speakertags: false
+   semantic-similarity: true
+   window_size: null
+   clean_tokens: true
+   divergence_from_optimality: false
+   output_options:
+     exclude_special_tokens: true
+     remove_'_'_character: true
+     remove_speaker_labels: true
+     remove_punctuation_and_symbols: true
+     remove_brackets_and_content: true
+
+ # Logits Options
+ # -------------
+ options_logits:
+   chunk_size: 128
+   overlap_size: 64
+   tokenization_method: "model"
+   model_name: "DiscoResearch/Llama3-German-8B-32k"
+   remove_punctuation: true
+   lowercase: true
+   keep_speakertags: true
+
+ # Analysis Options
+ # --------------
+ options_semantic-similarity:
+   window_sizes: # 'all' or window size as integer
+     - 2
+     - 8
+
+ options_dis_from_randomness:
+   window_size: 8
+   min_len: null
+   bootstrap: 10000
+   shuffle_mode: 'include0_includeN'
+   parallel_computing: false
+
+ # Normalization Options
+ # -------------------
+ normalization_options:
+   method: "lemmatization" # Options: 'lemmatization', 'stemming'
+
+ # Filename Configuration
+ # --------------------
+ filename_components:
+   subject: true # mandatory
+   session: false
+   task: true # mandatory
+   task_addition: false
+   corpus: true
+   metric: true
+   additional_tags: []
+
+ document_information_output:
+   parameters:
+     - subject_ID
+     - recording_length
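
Note the YAML anchor/alias pair in this file: fluency_task: &fluency_flag false defines the anchor, and cleaning_options refers back to it with *fluency_flag, so both keys always carry the same value. A sketch of what a loader sees (PyYAML assumed; the diff does not show how pelican-nlp actually parses its configs):

import yaml

# Sketch only: PyYAML resolves the &fluency_flag anchor and the *fluency_flag
# alias to the same value at load time.
with open("config_audio.yml") as f:
    cfg = yaml.safe_load(f)

assert cfg["fluency_task"] is False
assert cfg["cleaning_options"]["fluency_task"] is False  # alias, same value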
pelican_nlp/configuration_files/config_discourse.yml ADDED
@@ -0,0 +1,104 @@
+ # Configuration file for discourse protocols
+ #=======================================
+ input_file: "text" # or 'audio'
+ discourse: &discourse_flag true
+ #=====================================
+
+ # general configurations; always adapt
+ PATH_TO_PROJECT_FOLDER: "/home/yvespauli/PycharmProjects/Morteza/"
+ language: "german" # Possibly add options for German and English
+
+ task_name: "interview" # Name of the task used to create the input file (e.g., ['fluency', 'interview'])
+ corpus_names:
+   - "schizophrenia"
+
+ metric_to_extract: "embeddings" # Possible options: 'logits' or 'embeddings'
+
+ number_of_speakers: 3
+ subject_speakertag: "B"
+ #=========================================================
+
+ # Optional configurations; change as preferred. However, the default settings are recommended
+ fluency_task: &fluency_flag false
+ cleaning_options:
+   general_cleaning: true # General cleaning options used for most text preprocessing, default: true
+   remove_brackets_and_bracketcontent: true
+   remove_timestamps: true
+   timestamp_pattern_example: "#00:00:19-0#"
+   remove_punctuation: false
+   lowercase: false
+   # Options for fluency tasks
+   fluency_task: *fluency_flag
+   word_splitter: null
+   remove_hyphens: null
+   remove_duplicates: null
+
+ options_logits:
+   chunk_size: 128
+   overlap_size: 64
+   tokenization_method: "model"
+   #method: "model_instance" # Options: model, regex, nltk, etc.
+   model_name: "DiscoResearch/Llama3-German-8B-32k" # Replace with your model instance name
+   remove_punctuation: true
+   lowercase: true
+   keep_speakertags: true
+
+ options_embeddings:
+   tokenization_method: "whitespace" # "model" or "whitespace"
+   max_length: 512 # max sequence length
+   model_name: "fastText" # e.g. "fastText", "xlm-roberta-base"
+   pytorch_based_model: false
+   method: "model_instance"
+   remove_punctuation: false
+   lowercase: false
+   keep_speakertags: true
+   clean_embedding_tokens: true
+   output_options:
+     exclude_special_tokens: true
+     remove_'_'_character: true
+     remove_speaker_labels: true
+     remove_punctuation_and_symbols: true
+     remove_brackets_and_content: true
+   semantic-similarity: false
+   window_size: null
+   clean_tokens: false
+   divergence_from_optimality: false
+ #================================================================================
+
+ # Extra configurations:
+ pipeline_options:
+   quality_check: false
+   clean_text: true
+   tokenize_text: false
+   normalize_text: false
+
+ general_cleaning_options:
+   strip_whitespace: true
+   merge_multiple_whitespaces: true
+   remove_whitespace_before_punctuation: true
+   merge_newline_characters: true
+   remove_backslashes: true
+
+ has_multiple_sections: false # evaluated independently
+ has_section_titles: false
+ section_identification: null # e.g. "Section:"; 'null' if the file does not have multiple sections; use a pattern unlikely to appear in the rest of the transcript
+ number_of_sections: null # if 'null', the number of sections is detected automatically; specifying it is recommended if known
+
+ # Options for extract_embeddings
+ window_sizes: [2]
+ metric_function: cosine_similarity
+ aggregation_functions: mean_of_means
+
+ normalization_options:
+   method: "lemmatization" # Options: lemmatization or stemming
+ #================================================================
+
+ # Detail configurations; changes optional, mostly used for quality checking / error handling
+ number_of_subjects: null # Specify the number of subjects; if 'null', it is detected automatically
+ multiple_sessions: false # Set to true if there are multiple sessions per subject
+
+ recompute_everything: true # If set to 'false', pelican-nlp will try to reuse previously computed results stored on your drive
+
+ create_aggregation_of_results: false
+
+