pelican-nlp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. pelican_nlp/__init__.py +9 -0
  2. pelican_nlp/core/__init__.py +5 -0
  3. pelican_nlp/core/audio_document.py +20 -0
  4. pelican_nlp/core/corpus.py +296 -0
  5. pelican_nlp/core/document.py +226 -0
  6. pelican_nlp/core/subject.py +30 -0
  7. pelican_nlp/extraction/__init__.py +2 -0
  8. pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
  9. pelican_nlp/extraction/distance_from_randomness.py +109 -0
  10. pelican_nlp/extraction/extract_embeddings.py +57 -0
  11. pelican_nlp/extraction/extract_logits.py +102 -0
  12. pelican_nlp/extraction/language_model.py +71 -0
  13. pelican_nlp/extraction/semantic_similarity.py +60 -0
  14. pelican_nlp/extraction/test_documents/test_features.csv +4 -0
  15. pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
  16. pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
  17. pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
  18. pelican_nlp/main.py +211 -0
  19. pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
  20. pelican_nlp/preprocessing/LPDS.py +77 -0
  21. pelican_nlp/preprocessing/__init__.py +7 -0
  22. pelican_nlp/preprocessing/pipeline.py +50 -0
  23. pelican_nlp/preprocessing/speaker_diarization.py +33 -0
  24. pelican_nlp/preprocessing/text_cleaner.py +224 -0
  25. pelican_nlp/preprocessing/text_importer.py +42 -0
  26. pelican_nlp/preprocessing/text_normalizer.py +24 -0
  27. pelican_nlp/preprocessing/text_tokenizer.py +43 -0
  28. pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
  29. pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
  30. pelican_nlp/sample_configuration_files/config_general.yml +131 -0
  31. pelican_nlp/utils/__init__.py +3 -0
  32. pelican_nlp/utils/csv_functions.py +193 -0
  33. pelican_nlp/utils/sample_usage.py +17 -0
  34. pelican_nlp/utils/setup_functions.py +93 -0
  35. pelican_nlp-0.1.0.dist-info/METADATA +146 -0
  36. pelican_nlp-0.1.0.dist-info/RECORD +39 -0
  37. pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
  38. pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
  39. pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,109 @@
+ import numpy as np
+ import scipy.spatial.distance
+ from typing import Dict, List, Any
+
+ # Type aliases
+ DistanceMatrix = np.ndarray
+ EmbeddingDict = Dict[str, Dict[str, List[Any]]]
+
+
+ def get_distance_from_randomness(embeddings, config, parallel=False):
+
+     if parallel:
+         print('parallel computing not yet set up... '
+               'continuing without calculating divergence from optimality')
+         return
+
+     else:
+         results_dict = {}
+         result = optimality(
+             embeddings, config['window_size'], config['bootstrap'], config['shuffle_mode']
+         )
+         results_dict['section'] = result
+         return results_dict
+
+
+ def optimality(embeddings_dict, min_len, bootstrap, shuffle_mode):
+
+     words = list(embeddings_dict.keys())
+     embeddings = list(embeddings_dict.values())
+
+     answer_res = []
+     answer_len = len(words)
+
+     for i in range((answer_len - min_len) + 1):
+
+         window = embeddings[i:i + min_len]
+         dist_matrix = create_semantic_distance_matrix(window)
+
+         # Calculate costs for the actual sequence (j == 0) and for shuffled permutations
+         perm_costs = []
+         for j in range(bootstrap):
+             order = (np.arange(len(window)) if j == 0
+                      else get_shuffled_order(len(window), shuffle_mode, j))
+             cost = calculate_total_distance_covered(dist_matrix, order)
+             perm_costs.append(cost)
+
+             if j == 0:
+                 all_pairs_avg = average_similarity(dist_matrix)
+
+         # Normalize costs by number of edges
+         costs_per_edge = np.array(perm_costs) / (min_len - 1)
+         true_cost = costs_per_edge[0]
+
+         # Store results for this window
+         window_results = {
+             "window_index": i,
+             "all_pairs_average": all_pairs_avg,
+             "actual_dist": true_cost,
+             "average_dist": np.mean(costs_per_edge[1:]),
+             "std_dist": np.std(costs_per_edge[1:])
+         }
+         answer_res.append(window_results)
+
+     return answer_res
+
+
+ def create_semantic_distance_matrix(embedding_list: List[np.ndarray]) -> DistanceMatrix:
+
+     # Pairwise cosine distances between all embeddings in the window
+     distances = scipy.spatial.distance.cdist(
+         np.array(embedding_list),
+         np.array(embedding_list),
+         'cosine'
+     )
+     np.fill_diagonal(distances, 0)
+     return distances
+
+
+ def get_shuffled_order(n: int, shuffle_mode: str, seed: int) -> np.ndarray:
+
+     np.random.seed(seed)
+
+     if shuffle_mode == "include0_includeN":
+         order = np.arange(n)
+         np.random.shuffle(order)
+     elif shuffle_mode == "exclude0_includeN":
+         rest = np.arange(1, n)
+         np.random.shuffle(rest)
+         order = np.concatenate(([0], rest))
+     elif shuffle_mode == "exclude0_excludeN":
+         middle = np.arange(1, n - 1)
+         np.random.shuffle(middle)
+         order = np.concatenate(([0], middle, [n - 1]))
+     else:
+         raise ValueError(f"Invalid shuffle mode: {shuffle_mode}")
+
+     return order
+
+
+ def calculate_total_distance_covered(dist_matrix: DistanceMatrix, order: np.ndarray) -> float:
+     # Sum the distances between consecutive elements in the given visiting order
+     distances = dist_matrix[order[:-1], order[1:]]
+     return float(np.sum(distances))
+
+
+ def average_similarity(matrix: DistanceMatrix) -> float:
+
+     n = matrix.shape[0]
+
+     # Only count upper triangle to avoid double counting
+     upper_tri = np.triu(matrix, k=1)
+     total = np.sum(upper_tri)
+     count = (n * (n - 1)) // 2  # Number of pairs
+
+     return float(total / count) if count > 0 else 0.0
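For orientation, a minimal usage sketch of distance_from_randomness.py, assuming a word-to-vector dict and the three config keys read above (all values here are hypothetical placeholders):

    import numpy as np
    from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness

    # Hypothetical ordered mapping of words to embedding vectors
    embeddings = {w: np.random.rand(300) for w in ['der', 'Hund', 'läuft', 'schnell', 'weg']}
    config = {'window_size': 3, 'bootstrap': 100, 'shuffle_mode': 'include0_includeN'}

    results = get_distance_from_randomness(embeddings, config)
    # results['section'] is a list of per-window dicts with keys 'window_index',
    # 'all_pairs_average', 'actual_dist', 'average_dist' and 'std_dist'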
@@ -0,0 +1,57 @@
+ from pelican_nlp.extraction.language_model import Model
+ from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
+
+
+ class EmbeddingsExtractor:
+     def __init__(self, embeddings_configurations, project_path):
+         self.embeddings_configurations = embeddings_configurations
+         self.model_name = embeddings_configurations['model_name']  # Name of the embedding model (e.g., fastText, RoBERTa)
+         self.model = Model(self.model_name, project_path)
+         self.Tokenizer = TextTokenizer(self.embeddings_configurations['tokenization_method'], self.model_name,
+                                        self.embeddings_configurations['max_length'])
+
+         self.model.load_model()
+         self.model_instance = self.model.model_instance
+
+     def extract_embeddings_from_text(self, text_list):
+
+         doc_entry_list = []
+
+         for text in text_list:
+
+             embeddings = {}
+
+             # Tokenize the input text
+             inputs = self.Tokenizer.tokenize_text(text)
+             print(f'inputs are: {inputs}')
+
+             if self.embeddings_configurations['pytorch_based_model']:
+                 # e.g. RoBERTa or Llama model
+                 import torch
+                 with torch.no_grad():
+                     if 'llama' in self.model_name.lower():
+                         # Handle Llama models, which expect input_ids directly
+                         outputs = self.model_instance(input_ids=inputs['input_ids'])
+                     else:
+                         # Handle RoBERTa and other models that accept **inputs
+                         outputs = self.model_instance(**inputs)
+
+                 # Get word embeddings (last hidden state)
+                 word_embeddings = outputs.last_hidden_state
+
+                 # Extract input_ids and convert them back to tokens
+                 input_ids = inputs['input_ids'][0].tolist()
+                 tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)
+
+                 # Align the tokens with their embeddings
+                 for token, embedding in zip(tokens, word_embeddings[0]):
+                     embeddings[token] = embedding.tolist()
+
+             else:
+                 if self.model_name == 'fastText':
+                     embeddings = []
+                     for token in inputs:
+                         embeddings.append((token, self.model_instance.get_word_vector(token)))
+
+             doc_entry_list.append(embeddings)
+
+         return doc_entry_list, len(inputs)
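A hedged usage sketch of EmbeddingsExtractor, assuming the configuration keys consumed in __init__ above; the 'tokenization_method' and 'max_length' values and the project path are placeholders, and running this downloads the xlm-roberta-base weights:

    from pelican_nlp.extraction.extract_embeddings import EmbeddingsExtractor

    embeddings_config = {
        'model_name': 'xlm-roberta-base',
        'tokenization_method': 'model',   # assumed value; depends on TextTokenizer
        'max_length': 512,                # assumed value
        'pytorch_based_model': True,
    }

    extractor = EmbeddingsExtractor(embeddings_config, project_path='/path/to/project')
    doc_embeddings, n_tokens = extractor.extract_embeddings_from_text(['Der Hund läuft schnell.'])
    # doc_embeddings[0] maps each sub-word token to its embedding vector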
@@ -0,0 +1,102 @@
+ import torch
+ import torch.nn.functional as F
+ from tqdm import tqdm
+
+
+ class LogitsExtractor:
+     def __init__(self, options, pipeline, project_path):
+
+         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         self.options = options
+         self.model_name = self.options['model_name']
+         self.pipeline = pipeline
+         self.PROJECT_PATH = project_path
+
+     def extract_features(self, section, tokenizer, model):
+
+         print(f'section to tokenize: {section}')
+         tokens = tokenizer.tokenize_text(section)
+         print(tokens)
+
+         chunk_size = self.options['chunk_size']
+         overlap_size = self.options['overlap_size']
+
+         input_ids = tokens.to(self.device)
+         chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)
+
+         per_token_data = []
+
+         total_processed_tokens = 0  # Keep track of total tokens processed to avoid duplicates
+
+         for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
+
+             with torch.no_grad():
+                 outputs = model(input_ids=chunk)
+                 logits = outputs.logits  # Shape: [1, seq_length, vocab_size]
+
+             tokens = tokenizer.convert_ids_to_tokens(chunk.squeeze())
+             num_tokens = len(tokens)
+
+             chunk_data = []
+
+             # Determine the starting index for predictions
+             if i == 0:
+                 # For the first chunk, start from index 1 (the first token has no previous context)
+                 start_idx = 1
+             else:
+                 # For subsequent chunks, skip tokens that were already processed in the overlap
+                 start_idx = overlap_size
+
+             # Loop over the tokens to predict
+             for j in range(start_idx, num_tokens):
+                 # Compute per-token metrics
+                 per_token_metrics = self._compute_per_token_metrics(logits, chunk, tokens, j, tokenizer)
+                 chunk_data.append(per_token_metrics)
+
+             # Append the chunk data to the per_token_data list
+             per_token_data.extend(chunk_data)
+             total_processed_tokens += len(chunk_data)
+
+         return per_token_data
+
+     def _compute_per_token_metrics(self, logits, chunk, tokens, j, tokenizer):
+
+         # The model predicts the token at position j from the tokens up to position j-1,
+         # so the logits at position j-1 correspond to predictions for the token at position j
+         token_logits = logits[:, j - 1, :]  # Shape: [1, vocab_size]
+         token_probs = F.softmax(token_logits, dim=-1)
+         token_logprobs = F.log_softmax(token_logits, dim=-1)
+
+         actual_token_id = chunk[:, j]  # The actual token at position j
+         logprob_actual = token_logprobs[0, actual_token_id].item()
+         max_logprob, max_token_id = torch.max(token_logprobs, dim=-1)
+         max_logprob = max_logprob.item()
+         max_token_id = max_token_id.item()
+         entropy = -(token_probs * token_logprobs).sum().item()
+
+         most_likely_token = tokenizer.convert_ids_to_tokens([max_token_id])[0]
+         token = tokens[j]  # The token at position j
+
+         return {
+             'token': token,
+             'logprob_actual': logprob_actual,
+             'logprob_max': max_logprob,
+             'entropy': entropy,
+             'most_likely_token': most_likely_token
+         }
+
+     def _split_into_chunks(self, input_ids, chunk_size, overlap_size):
+
+         input_ids = input_ids.squeeze()
+         input_length = input_ids.size(0)
+         stride = chunk_size - overlap_size
+         chunks = []
+
+         for i in range(0, input_length, stride):
+             end_index = min(i + chunk_size, input_length)
+             chunk = input_ids[i:end_index]
+             chunks.append(chunk.unsqueeze(0).to(self.device))
+             if end_index == input_length:
+                 break
+
+         return chunks
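To make the chunking arithmetic in _split_into_chunks concrete: with stride = chunk_size - overlap_size, consecutive chunks share overlap_size tokens, which is why later chunks start scoring at start_idx = overlap_size. A small self-contained illustration with made-up sizes:

    import torch
    from pelican_nlp.extraction.extract_logits import LogitsExtractor

    # Hypothetical options, chosen only to illustrate the stride arithmetic
    extractor = LogitsExtractor({'model_name': 'dummy', 'chunk_size': 5, 'overlap_size': 2},
                                pipeline=None, project_path='.')
    ids = torch.arange(10).unsqueeze(0)   # pretend token ids 0..9
    chunks = extractor._split_into_chunks(ids, 5, 2)
    print([c.squeeze().tolist() for c in chunks])
    # [[0, 1, 2, 3, 4], [3, 4, 5, 6, 7], [6, 7, 8, 9]] -- positions 3-4 and 6-7 are the overlaps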
@@ -0,0 +1,71 @@
+ import torch
+ import psutil
+
+ from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
+ from transformers import AutoModelForCausalLM
+
+
+ class Model:
+     def __init__(self, model_name, project_path):
+         self.model_name = model_name
+         self.model_instance = None
+         self.device_map = None
+         self.PROJECT_PATH = project_path
+
+     def load_model(self, empty_weights=False, trust_remote_code=False):
+         """Loads and configures the model."""
+
+         if self.model_name == 'fastText':
+             import fasttext
+             import fasttext.util
+             fasttext.util.download_model('de', if_exists='ignore')
+             self.model_instance = fasttext.load_model('cc.de.300.bin')
+             print('FastText model loaded.')
+         elif self.model_name == 'xlm-roberta-base':
+             from transformers import AutoModel
+             self.model_instance = AutoModel.from_pretrained(
+                 self.model_name,
+                 trust_remote_code=trust_remote_code,
+                 use_safetensors=True
+             )
+             print('RoBERTa model loaded.')
+         elif self.model_name == 'DiscoResearch/Llama3-German-8B-32k':
+             if empty_weights:
+                 # Instantiate the model without allocating memory for its weights
+                 with init_empty_weights():
+                     self.model_instance = AutoModelForCausalLM.from_pretrained(
+                         self.model_name,
+                         trust_remote_code=trust_remote_code,
+                         use_safetensors=True
+                     )
+             else:
+                 self.model_instance = AutoModelForCausalLM.from_pretrained(
+                     self.model_name,
+                     trust_remote_code=trust_remote_code,
+                     use_safetensors=True
+                 )
+             print('Llama3-German-8B-32k loaded.')
+         else:
+             raise ValueError("Invalid model name.")
+
+         if self.model_name in ('xlm-roberta-base', 'DiscoResearch/Llama3-German-8B-32k'):
+             # Additional model setup: build a device map and dispatch the model across devices
+             self.device_map_creation()
+
+             self.model_instance = dispatch_model(self.model_instance, device_map=self.device_map)
+             print('Model dispatched to appropriate devices.')
+
+     def device_map_creation(self):
+         # Build a memory budget, leaving ~3 GB of headroom on RAM and VRAM
+         available_RAM = str(int(psutil.virtual_memory().total / (1024 ** 3)) - 3) + 'GB'
+         max_memory = {'cpu': available_RAM, 'disk': '200GB'}
+
+         # Check if CUDA is available; GPU memory is only queried when it is
+         if not torch.cuda.is_available():
+             print('Careful: CUDA not available, using CPU. This will be very slow.')
+         else:
+             print(f'{torch.cuda.get_device_name(0)} available.')
+             available_VRAM = str(int(torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)) - 3) + 'GB'
+             max_memory[0] = available_VRAM
+
+         # Create the device map from the memory budget
+         self.device_map = infer_auto_device_map(self.model_instance, max_memory=max_memory)
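A minimal usage sketch of the Model wrapper above (the project path is a placeholder; load_model downloads the weights on first use and then dispatches the model across the available devices):

    from pelican_nlp.extraction.language_model import Model

    model = Model('xlm-roberta-base', project_path='/path/to/project')
    model.load_model()
    encoder = model.model_instance   # a transformers AutoModel instance, already dispatched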
@@ -0,0 +1,60 @@
+ import numpy as np
+ import scipy
+ from scipy.spatial.distance import cdist
+ import pandas as pd
+
+
+ def calculate_semantic_similarity(embedding_vectors):
+     # Extract just the vectors from the list of (token, vector) tuples
+     vectors = [vector for _, vector in embedding_vectors]
+     consecutive_similarities = get_consecutive_vector_similarities(vectors)
+     mean_similarity = np.nanmean(consecutive_similarities)
+     return consecutive_similarities, mean_similarity
+
+
+ def get_consecutive_vector_similarities(vectors):
+     # Cosine similarity between each vector and its predecessor
+     return [1 - scipy.spatial.distance.cosine(vectors[i - 1], vectors[i]) for i in range(1, len(vectors))]
+
+
+ def get_cosine_similarity_matrix(embedding_vectors):
+     # Extract just the vectors from the list of (token, vector) tuples
+     vectors = [vector for _, vector in embedding_vectors]
+     similarity_matrix = 1 - cdist(vectors, vectors, 'cosine')
+     np.fill_diagonal(similarity_matrix, np.nan)
+     return similarity_matrix
+
+
+ def get_semantic_similarity_windows(embedding_vectors, window_size):
+     # Extract tokens and vectors from the list of tuples
+     tokens, vectors = zip(*embedding_vectors)
+
+     # Early return if not enough tokens
+     if len(tokens) < 2:
+         return np.nan, np.nan, np.nan, np.nan, np.nan
+
+     # Use the full similarity matrix when no fixed window size is requested
+     # (checked before the numeric comparison below, since window_size may be the string 'all')
+     if window_size == 'all':
+         cosine_similarity_matrix = get_cosine_similarity_matrix(embedding_vectors)
+         return calculate_window_statistics(cosine_similarity_matrix)
+
+     # Early return if window size is larger than the sequence
+     if window_size > len(tokens):
+         return np.nan, np.nan, np.nan, np.nan, np.nan
+
+     # Collect window statistics
+     window_statistics = []
+     for i in range(len(tokens) - window_size + 1):
+         window_vectors = list(zip(tokens[i:i + window_size], vectors[i:i + window_size]))
+         if window_vectors:  # Make sure the window is not empty
+             sim_matrix = get_cosine_similarity_matrix(window_vectors)
+             window_statistics.append(calculate_window_statistics(sim_matrix))
+
+     # Handle the case where no valid windows were found
+     if not window_statistics:
+         return np.nan, np.nan, np.nan, np.nan, np.nan
+
+     # Unzip the statistics
+     window_means, window_stds, window_medians = zip(*window_statistics)
+
+     return (np.mean(window_means), np.std(window_means),
+             np.mean(window_stds), np.std(window_stds),
+             np.mean(window_medians))
+
+
+ def calculate_window_statistics(cosine_similarity_matrix):
+     # Mean, std and median over the off-diagonal entries (the NaN diagonal is dropped by stack())
+     matrix_values = pd.DataFrame(cosine_similarity_matrix).stack()
+     return matrix_values.mean(), matrix_values.std(), matrix_values.median()
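Finally, a short sketch of how the similarity helpers above might be called on (token, vector) pairs, e.g. as produced by the fastText branch of EmbeddingsExtractor; the vectors here are random placeholders:

    import numpy as np
    from pelican_nlp.extraction.semantic_similarity import (
        calculate_semantic_similarity, get_semantic_similarity_windows)

    pairs = [(w, np.random.rand(300)) for w in ['der', 'Hund', 'läuft', 'schnell', 'weg']]

    sims, mean_sim = calculate_semantic_similarity(pairs)    # consecutive-token cosine similarities
    stats = get_semantic_similarity_windows(pairs, window_size=3)
    # (mean of window means, std of window means, mean of window stds,
    #  std of window stds, mean of window medians)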