pelican-nlp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. pelican_nlp/__init__.py +9 -0
  2. pelican_nlp/core/__init__.py +5 -0
  3. pelican_nlp/core/audio_document.py +20 -0
  4. pelican_nlp/core/corpus.py +296 -0
  5. pelican_nlp/core/document.py +226 -0
  6. pelican_nlp/core/subject.py +30 -0
  7. pelican_nlp/extraction/__init__.py +2 -0
  8. pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
  9. pelican_nlp/extraction/distance_from_randomness.py +109 -0
  10. pelican_nlp/extraction/extract_embeddings.py +57 -0
  11. pelican_nlp/extraction/extract_logits.py +102 -0
  12. pelican_nlp/extraction/language_model.py +71 -0
  13. pelican_nlp/extraction/semantic_similarity.py +60 -0
  14. pelican_nlp/extraction/test_documents/test_features.csv +4 -0
  15. pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
  16. pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
  17. pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
  18. pelican_nlp/main.py +211 -0
  19. pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
  20. pelican_nlp/preprocessing/LPDS.py +77 -0
  21. pelican_nlp/preprocessing/__init__.py +7 -0
  22. pelican_nlp/preprocessing/pipeline.py +50 -0
  23. pelican_nlp/preprocessing/speaker_diarization.py +33 -0
  24. pelican_nlp/preprocessing/text_cleaner.py +224 -0
  25. pelican_nlp/preprocessing/text_importer.py +42 -0
  26. pelican_nlp/preprocessing/text_normalizer.py +24 -0
  27. pelican_nlp/preprocessing/text_tokenizer.py +43 -0
  28. pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
  29. pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
  30. pelican_nlp/sample_configuration_files/config_general.yml +131 -0
  31. pelican_nlp/utils/__init__.py +3 -0
  32. pelican_nlp/utils/csv_functions.py +193 -0
  33. pelican_nlp/utils/sample_usage.py +17 -0
  34. pelican_nlp/utils/setup_functions.py +93 -0
  35. pelican_nlp-0.1.0.dist-info/METADATA +146 -0
  36. pelican_nlp-0.1.0.dist-info/RECORD +39 -0
  37. pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
  38. pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
  39. pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,42 @@
1
+ import os
2
+
3
class TextImporter:
    """Load the text content of a transcript file.

    Supported formats are ``.txt``, ``.docx`` and ``.rtf``; the loader is
    selected from the (case-insensitive) file extension.
    """

    def __init__(self, file_path):
        """Store *file_path* as the default file for :meth:`load_text`."""
        self.file_path = file_path

    def load_text(self, file_path=None):
        """Return the text content of a file.

        Args:
            file_path: Path of the file to read. When omitted, the path
                passed to the constructor is used.

        Returns:
            The file content as returned by the format-specific loader.

        Raises:
            ValueError: If the extension is not ``.txt``, ``.docx`` or
                ``.rtf``.
        """
        if file_path is None:
            file_path = self.file_path

        ext = os.path.splitext(file_path)[-1].lower()

        # Supported file formats; extend this table if more are needed.
        loaders = {
            '.txt': self._load_txt,
            '.docx': self._load_docx,
            '.rtf': self._load_rtf,
        }
        loader = loaders.get(ext)
        if loader is None:
            raise ValueError(f"Unsupported file format: {ext}")
        return loader(file_path)

    def _load_txt(self, file_path):
        """Read a plain-text file.

        UTF-8 is tried first so the result does not depend on the platform's
        locale; if the file is not valid UTF-8 the platform default encoding
        is used, which is what this method relied on previously.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            with open(file_path, 'r') as file:
                return file.read()

    def _load_docx(self, file_path):
        """Extract the text of a .docx file using ``docx2txt``."""
        # Imported lazily so the dependency is only needed for .docx input.
        import docx2txt
        return docx2txt.process(file_path)

    def _load_rtf(self, file_path):
        """Read RTF file and convert its content to plain text."""
        # Imported lazily so the dependencies are only needed for .rtf input.
        from striprtf.striprtf import rtf_to_text
        import chardet

        # First pass: read raw bytes so chardet can guess the encoding.
        with open(file_path, "rb") as file:
            raw_data = file.read()
        encoding = chardet.detect(raw_data)["encoding"]

        # Second pass: decode as text. Undecodable bytes are dropped
        # (errors="ignore"); if no encoding was detected, ``encoding`` is
        # None and open() falls back to the platform default.
        with open(file_path, "r", encoding=encoding, errors="ignore") as file:
            rtf_content = file.read()

        return rtf_to_text(rtf_content)
@@ -0,0 +1,24 @@
1
+ import spacy
2
+
3
class TextNormalizer:
    """Normalize tokens by lemmatization or stemming using a spaCy pipeline.

    ``options`` is a mapping whose ``'method'`` entry selects the
    normalization ('lemmatization' or 'stemming').
    """

    def __init__(self, options):
        self.options = options
        # The spaCy pipeline is created lazily on first use.
        self.nlp = None

    def _load_model(self):
        """Load the spaCy model unless one is already set on the instance."""
        if self.nlp is not None:
            return
        self.nlp = spacy.load('de_core_news_sm')

    def normalize(self, tokens):
        """Return the normalized form of each token.

        Args:
            tokens: Iterable of token strings.

        Returns:
            A list of lemmas (one per input token) for 'lemmatization', or a
            list of stems (one per token of the re-joined text) for
            'stemming'.

        Raises:
            ValueError: If ``options['method']`` is neither 'lemmatization'
                nor 'stemming'.
        """
        method = self.options.get('method')

        # Reject unknown methods before paying for the model load.
        if method != 'lemmatization' and method != 'stemming':
            raise ValueError(f"Unsupported normalization method: {method}")

        self._load_model()

        if method == 'lemmatization':
            # Each token is processed on its own; only the first resulting
            # spaCy token contributes a lemma.
            lemmas = []
            for token in tokens:
                lemmas.append(self.nlp(token)[0].lemma_)
            return lemmas

        # Stemming: process the tokens as one space-joined text.
        # NOTE(review): `_.stemmed` is a custom extension attribute that is
        # not registered anywhere in this class - confirm it is set up
        # elsewhere before relying on this branch.
        doc = self.nlp(" ".join(tokens))
        return [token._.stemmed for token in doc]
@@ -0,0 +1,43 @@
1
+ import torch
2
+
3
class TextTokenizer:
    """Tokenize text by whitespace or with a Hugging Face tokenizer.

    ``method`` selects the strategy: 'whitespace', 'model' or
    'model_roberta'. For the two model-based methods a tokenizer is loaded
    for ``model_name`` at construction time.
    """

    def __init__(self, method, model_name=None, max_length=None):
        self.tokenization_method = method
        self.model_name = model_name
        self.max_sequence_length = max_length

        # None for the 'whitespace' method.
        self.tokenizer = self.get_tokenizer()

        # Device that 'model_roberta' encodings are moved to.
        if torch.cuda.is_available():
            self.device_used = 'cuda'
        else:
            self.device_used = 'cpu'

    def tokenize_text(self, text):
        """Tokenize *text* according to the configured method.

        Returns:
            'whitespace': list of substrings from ``str.split()``.
            'model_roberta': the tokenizer's encoding (PyTorch tensors,
                truncated/padded to ``max_sequence_length``), moved to
                ``device_used``.
            'model': the result of ``tokenizer.encode`` as PyTorch tensors
                (not moved to a device).

        Raises:
            ValueError: If *text* is not a ``str`` or the method is unknown.
        """
        method = self.tokenization_method

        if not isinstance(text, str):
            raise ValueError(f"to tokenize a text it must be a in string format, but it is in format {type(text)}")

        if method == 'whitespace':
            return text.split()

        if method == 'model_roberta':
            encoded = self.tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=self.max_sequence_length,
            )
            return encoded.to(self.device_used)

        if method == 'model':
            return self.tokenizer.encode(text, return_tensors='pt')

        raise ValueError(f"Unsupported tokenization method: {method}")

    def convert_ids_to_tokens(self, ids):
        """Map token ids back to token strings via the loaded tokenizer."""
        return self.tokenizer.convert_ids_to_tokens(ids)

    def get_tokenizer(self):
        """Return a tokenizer for model-based methods, otherwise ``None``."""
        if self.tokenization_method not in ('model', 'model_roberta'):
            return None

        # Imported lazily so transformers is only required for model-based
        # tokenization.
        from transformers import AutoTokenizer
        return AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=False,  # Don't execute arbitrary model code
            use_safetensors=True
        )
@@ -0,0 +1,103 @@
1
+ # Configuration file for discourse protocols
2
+ #=======================================
3
+ input_file: "text" #or 'audio'
4
+ discourse: &discourse_flag true
5
+ #=====================================
6
+
7
+ #general configurations; always adapt
8
+ PATH_TO_PROJECT_FOLDER: "/path/to/your/project"
9
+ language: "german" # Possibly add options for German and English
10
+
11
+ task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
12
+ corpus_names:
13
+ - "placebo"
14
+ - "schizophrenia"
15
+
16
+ metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
17
+
18
+ number_of_speakers: 3
19
+ subject_speakertag: "B"
20
+ #=========================================================
21
+
22
+ #Optional configurations; change as preferred, but the default settings are recommended
23
+ fluency_task: &fluency_flag false
24
+ cleaning_options:
25
+ general_cleaning: true # General cleaning options used for most text preprocessing, default: True.
26
+ remove_brackets_and_bracketcontent: true
27
+ remove_timestamps: true
28
+ timestamp_pattern_example: "#00:00:19-0#"
29
+ remove_punctuation: false
30
+ lowercase: false
31
+ #Options for fluency tasks
32
+ fluency_task: *fluency_flag
33
+ word_splitter: null
34
+ remove_hyphens: null
35
+ remove_duplicates: null
36
+
37
+ options_logits:
38
+ chunk_size: 128
39
+ overlap_size: 64
40
+ tokenization_method: "model"
41
+ #method: "model_instance" # Options: model, regex, nltk, etc.
42
+ model_name: "DiscoResearch/Llama3-German-8B-32k" # Replace with your model instance name
43
+ remove_punctuation: true
44
+ lowercase: true
45
+ keep_speakertags: true
46
+
47
+ options_embeddings:
48
+ tokenization_method: "whitespace" #"model" or "whitespace"
49
+ max_length: 512 #max sequence length
50
+ model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
51
+ pytorch_based_model: false
52
+ method: "model_instance"
53
+ remove_punctuation: false
54
+ lowercase: false
55
+ keep_speakertags: true
56
+ clean_embedding_tokens: true
57
+ output_options:
58
+ exclude_special_tokens: true
59
+ remove_'_'_character: true
60
+ remove_speaker_labels: true
61
+ remove_punctuation_and_symbols: true
62
+ remove_brackets_and_content: true
63
+ semantic-similarity: false
64
+ window_size: null
65
+ clean_tokens: false
66
+ divergence_from_optimality: false
67
+ #================================================================================
68
+
69
+ #Extra configurations:
70
+ pipeline_options:
71
+ quality_check: false
72
+ clean_text: true
73
+ tokenize_text: false
74
+ normalize_text: false
75
+
76
+ general_cleaning_options:
77
+ strip_whitespace: true
78
+ merge_multiple_whitespaces: true
79
+ remove_whitespace_before_punctuation: true
80
+ merge_newline_characters: true
81
+ remove_backslashes: true
82
+
83
+ has_multiple_sections: false #evaluated independently
84
+ has_section_titles: false
85
+ section_identification: null #e.g. "Section:"; use null if the file does not have multiple sections. Choose a pattern that is unlikely to appear in the rest of the transcript
86
+ number_of_sections: null #if 'null' number of sections automatically detected, however, specifying number recommended if known.
87
+
88
+ # Options for extract_embeddings
89
+ window_sizes: [2]
90
+ metric_function: cosine_similarity
91
+ aggregation_functions: mean_of_means
92
+
93
+ normalization_options:
94
+ method: "lemmatization" #Options: lemmatization or stemming
95
+ #================================================================
96
+
97
+ #Detail configurations; Changes optional, mostly used for quality checking / error handling
98
+ number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
99
+ multiple_sessions: false # Set to True if multiple sessions per subject
100
+
101
+ recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive
102
+
103
+
@@ -0,0 +1,108 @@
1
+ # Configuration file for fluency task
2
+ # =======================================
3
+ input_file: "text" #or 'audio'
4
+ fluency_task: &fluency_flag true
5
+ #========================================
6
+
7
+ #general configurations; always adapt
8
+ PATH_TO_PROJECT_FOLDER: "/path/to/your/project"
9
+ language: "german"
10
+ multiple_sessions: &session_flag false
11
+
12
+ corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
13
+ - "animals"
14
+ - "clothes"
15
+ - "food"
16
+
17
+ #Specify linguistic metrics to extract
18
+ metric_to_extract: 'embeddings' #Possible options: 'embeddings', 'logits'
19
+ output_document_information: true
20
+ #====================================================================
21
+
22
+ #Optional configurations; change as preferred, but the default settings are recommended
23
+ cleaning_options:
24
+ general_cleaning: true
25
+ #Options for fluency tasks
26
+ fluency_task: *fluency_flag
27
+ word_splitter: ';' #default split is on ','; set a different word_splitter if necessary
28
+ remove_hyphens: true
29
+ remove_duplicates: false
30
+ lowercase: false
31
+ #Optional cleaning
32
+ remove_brackets_and_bracketcontent: false #default 'false'
33
+ remove_timestamps: false #default 'false'
34
+ timestamp_pattern_example: null #e.g. "#00:00:23-00#"
35
+ remove_punctuation: false #Careful: if set to true, the word_splitter character might be removed
36
+
37
+ options_embeddings:
38
+ tokenization_method: "whitespace" #or "model"
39
+ model_name: "fastText" #e.g. "fastText", "xlm-roberta-base"
40
+ pytorch_based_model: false
41
+ method: "model_instance"
42
+ max_length: null
43
+ clean_embedding_tokens: true
44
+
45
+ semantic-similarity: true
46
+ distance-from-randomness: false
47
+
48
+ options_dis_from_randomness:
49
+ window_size: 8
50
+ min_len: null
51
+ bootstrap: 10000
52
+ shuffle_mode: 'include0_includeN'
53
+ parallel_computing: false #not yet set up
54
+
55
+ options_semantic-similarity:
56
+ window_sizes: #'all' or window size as integer
57
+ - 2
58
+ - 8
59
+ #==================================================================
60
+
61
+ #Extra configurations;
62
+ task_name: "fluency"
63
+ create_aggregation_of_results: true
64
+
65
+ pipeline_options:
66
+ quality_check: false
67
+ clean_text: true
68
+ tokenize_text: false
69
+ normalize_text: false
70
+
71
+ general_cleaning_options:
72
+ strip_whitespace: true
73
+ merge_multiple_whitespaces: true
74
+ remove_whitespace_before_punctuation: true
75
+ merge_newline_characters: true
76
+ remove_backslashes: true
77
+
78
+ has_multiple_sections: false
79
+ has_section_titles: false
80
+ section_identification: null
81
+ number_of_sections: 1
82
+ number_of_speakers: 1
83
+ discourse: false
84
+
85
+ document_information_output:
86
+ parameters:
87
+ - subject_ID
88
+ - fluency_word_count
89
+ - fluency_duplicate_count
90
+
91
+ #================================================================
92
+
93
+ #Detail configurations; Changes optional, mostly used for quality checking / error handling
94
+ recompute_everything: true
95
+ number_of_subjects: null
96
+
97
+ # Filename components configuration
98
+ filename_components:
99
+ subject: true # mandatory
100
+ session: *session_flag
101
+ task: true # mandatory
102
+ task_addition: false
103
+ corpus: true # typically true for fluency tasks (e.g., "animals", "clothes")
104
+ metric: true
105
+ additional_tags: []
106
+
107
+
108
+
@@ -0,0 +1,131 @@
1
+ # Master Configuration File
2
+ # ========================
3
+
4
+ # Basic Settings
5
+ # -------------
6
+ input_file: "text" # Options: 'text' or 'audio'
7
+ PATH_TO_PROJECT_FOLDER: "/path/to/your/project"
8
+ language: "german"
9
+ recompute_everything: true # If false, reuses previously computed results
10
+
11
+ # Task Configuration
12
+ # -----------------
13
+ task_name: # Name of task used for creation of data
14
+ fluency_task: &fluency_flag false # Flag for fluency-specific settings
15
+ discourse: &discourse_flag false # Flag for discourse-specific settings
16
+ corpus_names: # List of task corpora
17
+ - "healthy-control"
18
+
19
+ # Session and Subject Settings
20
+ # --------------------------
21
+ multiple_sessions: false
22
+ number_of_subjects: null # If null, auto-detected
23
+ number_of_speakers: 1
24
+ subject_speakertag: null # Speaker tag for subject (e.g., "B")
25
+
26
+ # Document Structure
27
+ # ----------------
28
+ has_multiple_sections: false
29
+ has_section_titles: false
30
+ section_identification: null # e.g., "Section:"
31
+ number_of_sections: 1 # If null, auto-detected
32
+
33
+ # Processing Pipeline
34
+ # -----------------
35
+ pipeline_options:
36
+ quality_check: false
37
+ clean_text: true
38
+ tokenize_text: false
39
+ normalize_text: false
40
+
41
+ # Metric Extraction
42
+ # ---------------
43
+ metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
44
+ extract_logits: null
45
+ extract_embeddings: true
46
+
47
+ # Cleaning Options
48
+ # --------------
49
+ cleaning_options:
50
+ general_cleaning: true
51
+ remove_punctuation: false
52
+ lowercase: true
53
+ remove_brackets_and_bracketcontent: false
54
+ remove_timestamps: false
55
+ timestamp_pattern_example: null # e.g., "#00:00:23-00#"
56
+ # Fluency-specific options
57
+ fluency_task: *fluency_flag
58
+ word_splitter: ';'
59
+ remove_hyphens: true
60
+ remove_duplicates: true
61
+
62
+ general_cleaning_options:
63
+ strip_whitespace: true
64
+ merge_multiple_whitespaces: true
65
+ remove_whitespace_before_punctuation: true
66
+ merge_newline_characters: true
67
+ remove_backslashes: true
68
+
69
+ # Embedding Options
70
+ # ---------------
71
+ options_embeddings:
72
+ tokenization_method: "whitespace" # Options: 'whitespace', 'model'
73
+ model_name: "fastText" # Options: 'fastText', 'xlm-roberta-base'
74
+ pytorch_based_model: false
75
+ method: "model_instance"
76
+ max_length: 512
77
+ clean_embedding_tokens: true
78
+ remove_punctuation: false
79
+ lowercase: false
80
+ keep_speakertags: false
81
+ semantic-similarity: true
82
+ window_size: null
83
+ clean_tokens: true
84
+ divergence_from_optimality: false
85
+ output_options:
86
+ exclude_special_tokens: true
87
+ remove_'_'_character: true
88
+ remove_speaker_labels: true
89
+ remove_punctuation_and_symbols: true
90
+ remove_brackets_and_content: true
91
+
92
+ # Logits Options
93
+ # -------------
94
+ options_logits:
95
+ chunk_size: 128
96
+ overlap_size: 64
97
+ tokenization_method: "model"
98
+ model_name: "DiscoResearch/Llama3-German-8B-32k"
99
+ remove_punctuation: true
100
+ lowercase: true
101
+ keep_speakertags: true
102
+
103
+ # Analysis Options
104
+ # --------------
105
+ options_semantic-similarity:
106
+ window_sizes: # 'all' or window size as integer
107
+ - 2
108
+ - 8
109
+
110
+ options_dis_from_randomness:
111
+ window_size: 8
112
+ min_len: null
113
+ bootstrap: 10000
114
+ shuffle_mode: 'include0_includeN'
115
+ parallel_computing: false
116
+
117
+ # Normalization Options
118
+ # -------------------
119
+ normalization_options:
120
+ method: "lemmatization" # Options: 'lemmatization', 'stemming'
121
+
122
+ # Filename Configuration
123
+ # --------------------
124
+ filename_components:
125
+ subject: true # mandatory
126
+ session: false
127
+ task: true # mandatory
128
+ task_addition: false
129
+ corpus: true #mandatory
130
+ metric: true
131
+ additional_tags: []
@@ -0,0 +1,3 @@
1
+ # Import utility functions for easier access
2
+ from .setup_functions import subject_instantiator, load_config, remove_previous_derivative_dir
3
+ from .csv_functions import store_features_to_csv
@@ -0,0 +1,193 @@
1
+ import os
2
+ import csv
3
+ import numpy as np
4
+
5
def store_features_to_csv(input_data, derivatives_dir, doc_class, metric):
    """Append features to a per-document CSV file with consistent naming.

    The file is written to
    ``<derivatives_dir>/<metric>/<subject_ID>[/<session>]/<task>/`` and named
    ``<subject_ID>[_<session>]_<task>_<corpus_name>[_<task_addition>]_<metric>.csv``.
    Rows are appended to an existing file. Input is validated before the
    file is opened, so empty or invalid input never leaves behind an empty
    file that would confuse the header logic of a later call.

    Args:
        input_data: The data to store; the expected shape depends on *metric*:
            'embeddings': non-empty list of ``(token, embedding)`` pairs,
                each embedding a list or an object with ``tolist()``.
            'cosine-similarity-matrix': iterable of rows.
            'semantic-similarity-window-*': mapping of metric name to score.
            'distance-from-randomness': mapping whose ``'section'`` entry is
                a list of per-window result dicts.
            'logits': list of dicts sharing the keys of the first entry.
            'opensmile-features': a dict, or a list of dicts.
        derivatives_dir: Base directory for all derivatives.
        doc_class: Object providing ``subject_ID``, ``task``, ``corpus_name``
            and optionally ``session`` and ``task_addition``.
        metric: Type of metric being stored; also used as folder name and
            filename suffix. Unrecognized metrics write no rows.

    Raises:
        ValueError: For 'embeddings' when *input_data* is not a non-empty
            list, or when an embedding is neither a list nor has ``tolist()``.
    """
    # Optional document attributes; missing or falsy values are skipped.
    session = getattr(doc_class, 'session', None)
    task_addition = getattr(doc_class, 'task_addition', None)

    # Filename: subject[_session]_task_corpus[_task_addition]_metric.csv
    filename_parts = [doc_class.subject_ID]
    if session:
        filename_parts.append(session)
    filename_parts.extend([doc_class.task, doc_class.corpus_name])
    filename = "_".join(filename_parts)
    if task_addition:
        filename += f"_{task_addition}"
    filename += f"_{metric}.csv"

    # Directory: derivatives/metric/subject[/session]/task
    path_components = [derivatives_dir, metric, doc_class.subject_ID]
    if session:
        path_components.append(session)
    path_components.append(doc_class.task)

    final_results_path = os.path.join(*path_components)
    os.makedirs(final_results_path, exist_ok=True)
    output_filepath = os.path.join(final_results_path, filename)

    # Validate before opening: opening in append mode creates the file, and
    # an empty file left behind by a rejected call would make the next call
    # treat it as an existing file and skip or mislabel the header.
    if metric == 'embeddings':
        if not isinstance(input_data, list) or not input_data:
            raise ValueError("Input data must be a non-empty list of tuples.")
    elif metric in ('logits', 'opensmile-features') and not input_data:
        return

    # A zero-byte file has no header yet, so treat it like a new file.
    file_exists = (
        os.path.exists(output_filepath)
        and os.path.getsize(output_filepath) > 0
    )

    # Write data based on metric type
    with open(output_filepath, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        if metric == 'embeddings':
            # Get the dimensionality from the first embedding
            embedding_dim = len(input_data[0][1])
            header = ['Token'] + [f"Dim_{i}" for i in range(embedding_dim)]
            _write_csv_header(writer, header, file_exists)

            for token, embedding in input_data:
                # Handle both list and tensor/array types
                if hasattr(embedding, 'tolist'):
                    embedding_list = embedding.tolist()
                elif isinstance(embedding, list):
                    embedding_list = embedding
                else:
                    raise ValueError(f"Embedding must be either a list or have tolist() method, got {type(embedding)}")
                writer.writerow([token] + embedding_list)

        elif metric == 'cosine-similarity-matrix':
            _write_csv_header(writer, ['Matrix'], file_exists)
            for row in input_data:
                writer.writerow(row)

        elif metric.startswith('semantic-similarity-window-'):
            _write_csv_header(writer, ['Metric', 'Similarity_Score'], file_exists)
            for metric_name, score in input_data.items():
                writer.writerow([metric_name, score])

        elif metric == 'distance-from-randomness':
            header = ['window_index', 'all_pairs_average', 'actual_dist', 'average_dist', 'std_dist']
            _write_csv_header(writer, header, file_exists)

            # Input data is a dictionary with 'section' key containing list of window results
            for window_result in input_data['section']:
                writer.writerow([window_result[column] for column in header])

        elif metric == 'logits':
            _write_csv_header(writer, list(input_data[0].keys()), file_exists)
            for entry in input_data:
                writer.writerow(entry.values())

        elif metric == 'opensmile-features':
            # Treat a single dictionary as a one-element list so both input
            # shapes share one code path.
            entries = input_data if isinstance(input_data, list) else [input_data]

            # Column order comes from the first entry.
            csv_columns = list(entries[0].keys())

            # Unlike the other metrics, repeated calls append plain rows
            # without a "New Section" marker; the header is written once.
            if not file_exists:
                writer.writerow(csv_columns)

            for entry in entries:
                row_data = []
                for column in csv_columns:
                    value = entry[column]
                    # Convert numerical values to float
                    if isinstance(value, (int, float)):
                        value = float(value)
                    row_data.append(value)
                writer.writerow(row_data)
148
+
149
+
150
+ def _build_filename_parts(path_parts, corpus, metric, config=None):
151
+ """Helper function to build filename components."""
152
+ filename_config = config.get('filename_components', {}) if config else {}
153
+
154
+ # Extract mandatory components
155
+ if len(path_parts) < 3:
156
+ raise ValueError("Invalid path format. Expected at least 'project/subject/task'.")
157
+
158
+ subject = path_parts[-3]
159
+ task = path_parts[-1]
160
+
161
+ # Build filename components
162
+ parts = [subject]
163
+
164
+ # Add optional session
165
+ if filename_config.get('session', False) and len(path_parts) >= 4:
166
+ parts.append(path_parts[-3])
167
+
168
+ parts.append(task)
169
+
170
+ # Add optional components
171
+ if filename_config.get('corpus', True):
172
+ parts.append(corpus)
173
+ parts.extend(filename_config.get('additional_tags', []))
174
+ parts.append(metric)
175
+
176
+ return parts
177
+
178
+
179
+ def _get_metric_folder(metric):
180
+ """Determine the appropriate metric folder."""
181
+ if metric.startswith('semantic-similarity') or metric in ['consecutive-similarities', 'cosine-similarity-matrix']:
182
+ return 'semantic-similarity'
183
+ return 'embeddings'
184
+
185
+
186
+ def _write_csv_header(writer, header, file_exists):
187
+ """Write CSV header with section separation if file exists."""
188
+ if not file_exists:
189
+ writer.writerow(header)
190
+ else:
191
+ writer.writerow([]) # Separate sections
192
+ writer.writerow(['New Section'])
193
+ writer.writerow(header)
@@ -0,0 +1,17 @@
1
+ import pelican
2
+
3
# Sample usage: preprocess a transcript and extract embeddings from it.
# NOTE(review): this script calls into a module imported as `pelican`, while
# the distribution's package directory is `pelican_nlp` - confirm the import
# name and that `preprocess` / `extract_embeddings` are exposed there.
file_path = 'your/file/path'

# Return the preprocessed transcript.
preprocessed_files = pelican.preprocess(
    file_path=file_path,
    task="image_descriptions",  # task name is a string, not a variable
    general_cleaning=True,  # Python booleans are capitalized
    lowercase=True,
)

# Return the embeddings extracted from the transcript.
file_embeddings = pelican.extract_embeddings(
    file_path=file_path,
    mode="example_mode",
)