pelican-nlp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/__init__.py +9 -0
- pelican_nlp/core/__init__.py +5 -0
- pelican_nlp/core/audio_document.py +20 -0
- pelican_nlp/core/corpus.py +296 -0
- pelican_nlp/core/document.py +226 -0
- pelican_nlp/core/subject.py +30 -0
- pelican_nlp/extraction/__init__.py +2 -0
- pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
- pelican_nlp/extraction/distance_from_randomness.py +109 -0
- pelican_nlp/extraction/extract_embeddings.py +57 -0
- pelican_nlp/extraction/extract_logits.py +102 -0
- pelican_nlp/extraction/language_model.py +71 -0
- pelican_nlp/extraction/semantic_similarity.py +60 -0
- pelican_nlp/extraction/test_documents/test_features.csv +4 -0
- pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
- pelican_nlp/main.py +211 -0
- pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
- pelican_nlp/preprocessing/LPDS.py +77 -0
- pelican_nlp/preprocessing/__init__.py +7 -0
- pelican_nlp/preprocessing/pipeline.py +50 -0
- pelican_nlp/preprocessing/speaker_diarization.py +33 -0
- pelican_nlp/preprocessing/text_cleaner.py +224 -0
- pelican_nlp/preprocessing/text_importer.py +42 -0
- pelican_nlp/preprocessing/text_normalizer.py +24 -0
- pelican_nlp/preprocessing/text_tokenizer.py +43 -0
- pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
- pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
- pelican_nlp/sample_configuration_files/config_general.yml +131 -0
- pelican_nlp/utils/__init__.py +3 -0
- pelican_nlp/utils/csv_functions.py +193 -0
- pelican_nlp/utils/sample_usage.py +17 -0
- pelican_nlp/utils/setup_functions.py +93 -0
- pelican_nlp-0.1.0.dist-info/METADATA +146 -0
- pelican_nlp-0.1.0.dist-info/RECORD +39 -0
- pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
- pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
- pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
pelican_nlp/extraction/distance_from_randomness.py
@@ -0,0 +1,109 @@
import numpy as np
import scipy.spatial.distance
from typing import Dict, List, Any

# Type aliases
DistanceMatrix = np.ndarray
EmbeddingDict = Dict[str, Dict[str, List[Any]]]


def get_distance_from_randomness(embeddings, config, parallel=False):

    if parallel:
        print('parallel computing not yet set up... '
              'continuing without calculating divergence from optimality')
        return

    else:
        results_dict = {}
        result = optimality(
            embeddings, config['window_size'], config['bootstrap'], config['shuffle_mode']
        )
        results_dict['section'] = result
        return results_dict


def optimality(embeddings_dict, min_len, bootstrap, shuffle_mode):

    words = list(embeddings_dict.keys())
    embeddings = list(embeddings_dict.values())

    answer_res = []
    answer_len = len(words)

    for i in range((answer_len - min_len) + 1):

        window = embeddings[i:i + min_len]
        dist_matrix = create_semantic_distance_matrix(window)

        # Calculate costs for the actual sequence (j == 0) and for shuffled permutations
        perm_costs = []
        for j in range(bootstrap):
            order = (np.arange(len(window)) if j == 0
                     else get_shuffled_order(len(window), shuffle_mode, j))
            cost = calculate_total_distance_covered(dist_matrix, order)
            perm_costs.append(cost)

            if j == 0:
                all_pairs_avg = average_similarity(dist_matrix)

        # Normalize costs by number of edges
        costs_per_edge = np.array(perm_costs) / (min_len - 1)
        true_cost = costs_per_edge[0]

        # Store results for this window
        window_results = {
            "window_index": i,
            "all_pairs_average": all_pairs_avg,
            "actual_dist": true_cost,
            "average_dist": np.mean(costs_per_edge[1:]),
            "std_dist": np.std(costs_per_edge[1:])
        }
        answer_res.append(window_results)

    return answer_res


def create_semantic_distance_matrix(embedding_list: List[np.ndarray]) -> DistanceMatrix:

    distances = scipy.spatial.distance.cdist(
        np.array(embedding_list),
        np.array(embedding_list),
        'cosine'
    )
    np.fill_diagonal(distances, 0)
    return distances


def get_shuffled_order(n: int, shuffle_mode: str, seed: int) -> np.ndarray:

    np.random.seed(seed)

    if shuffle_mode == "include0_includeN":
        order = np.arange(n)
        np.random.shuffle(order)
    elif shuffle_mode == "exclude0_includeN":
        rest = np.arange(1, n)
        np.random.shuffle(rest)
        order = np.concatenate(([0], rest))
    elif shuffle_mode == "exclude0_excludeN":
        middle = np.arange(1, n - 1)
        np.random.shuffle(middle)
        order = np.concatenate(([0], middle, [n - 1]))
    else:
        raise ValueError(f"Invalid shuffle mode: {shuffle_mode}")

    return order


def calculate_total_distance_covered(dist_matrix: DistanceMatrix, order: np.ndarray) -> float:
    distances = dist_matrix[order[:-1], order[1:]]
    return float(np.sum(distances))


def average_similarity(matrix: DistanceMatrix) -> float:

    n = matrix.shape[0]

    # Only count upper triangle to avoid double counting
    upper_tri = np.triu(matrix, k=1)
    total = np.sum(upper_tri)
    count = (n * (n - 1)) // 2  # Number of pairs

    return float(total / count) if count > 0 else 0.0
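For orientation, here is a minimal usage sketch of the divergence-from-optimality computation above. The toy embeddings and the config values (window_size, bootstrap, shuffle_mode) are illustrative assumptions, not package defaults.

import numpy as np
from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness

# Toy example: five words with random 300-dimensional embeddings (illustrative only)
rng = np.random.default_rng(0)
embeddings = {word: rng.normal(size=300) for word in ["der", "hund", "laeuft", "im", "park"]}

config = {
    "window_size": 3,                    # window length (min_len) of each sliding window
    "bootstrap": 10,                     # 1 actual order + 9 shuffled permutations
    "shuffle_mode": "include0_includeN",
}

results = get_distance_from_randomness(embeddings, config)
for window in results["section"]:
    print(window["window_index"], window["actual_dist"], window["average_dist"])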
pelican_nlp/extraction/extract_embeddings.py
@@ -0,0 +1,57 @@
from pelican_nlp.extraction.language_model import Model
from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer


class EmbeddingsExtractor:
    def __init__(self, embeddings_configurations, project_path):
        self.embeddings_configurations = embeddings_configurations
        self.model_name = embeddings_configurations['model_name']  # Name of the embedding model (e.g., fastText, RoBERTa)
        self.model = Model(self.model_name, project_path)
        self.Tokenizer = TextTokenizer(self.embeddings_configurations['tokenization_method'], self.model_name,
                                       self.embeddings_configurations['max_length'])

        self.model.load_model()
        self.model_instance = self.model.model_instance

    def extract_embeddings_from_text(self, text_list):

        doc_entry_list = []

        for text in text_list:

            embeddings = {}

            # Tokenize the input text
            inputs = self.Tokenizer.tokenize_text(text)
            print(f'inputs are: {inputs}')

            if self.embeddings_configurations['pytorch_based_model']:
                # e.g. RoBERTa or Llama models
                import torch
                with torch.no_grad():
                    if 'llama' in self.model_name.lower():
                        # Llama models expect input_ids directly
                        outputs = self.model_instance(input_ids=inputs['input_ids'])
                    else:
                        # RoBERTa and other models accept **inputs
                        outputs = self.model_instance(**inputs)

                    # Get word embeddings (last hidden state)
                    word_embeddings = outputs.last_hidden_state

                # Extract input_ids and convert them back to tokens
                input_ids = inputs['input_ids'][0].tolist()
                tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)

                # Align the tokens with their embeddings
                for token, embedding in zip(tokens, word_embeddings[0]):
                    embeddings[token] = embedding.tolist()

            else:
                if self.model_name == 'fastText':
                    # fastText returns one static vector per token
                    embeddings = []
                    for token in inputs:
                        embeddings.append((token, self.model_instance.get_word_vector(token)))

            doc_entry_list.append(embeddings)

        return doc_entry_list, len(inputs)
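A rough usage sketch for EmbeddingsExtractor follows. The configuration keys mirror the ones read in __init__ and extract_embeddings_from_text, but the values shown (tokenization method, max length, project path) are assumptions; the shipped sample_configuration_files are the authoritative reference.

from pelican_nlp.extraction.extract_embeddings import EmbeddingsExtractor

# Illustrative configuration; keys follow the attributes accessed above,
# the values are assumptions rather than package defaults.
embeddings_config = {
    "model_name": "xlm-roberta-base",
    "tokenization_method": "model",   # hypothetical value, check the sample configs
    "max_length": 512,
    "pytorch_based_model": True,
}

extractor = EmbeddingsExtractor(embeddings_config, project_path="/path/to/project")
doc_embeddings, n_tokens = extractor.extract_embeddings_from_text(["Der Hund laeuft im Park."])

# For transformer models, each document entry maps tokens to their embedding vectors
first_doc = doc_embeddings[0]
print(list(first_doc.keys())[:5])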
pelican_nlp/extraction/extract_logits.py
@@ -0,0 +1,102 @@
import torch
import torch.nn.functional as F
from tqdm import tqdm


class LogitsExtractor:
    def __init__(self, options, pipeline, project_path):

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.options = options
        self.model_name = self.options['model_name']
        self.pipeline = pipeline
        self.PROJECT_PATH = project_path

    def extract_features(self, section, tokenizer, model):

        print(f'section to tokenize: {section}')
        tokens = tokenizer.tokenize_text(section)
        print(tokens)

        chunk_size = self.options['chunk_size']
        overlap_size = self.options['overlap_size']

        input_ids = tokens.to(self.device)
        chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)

        per_token_data = []

        total_processed_tokens = 0  # Keep track of total tokens processed to avoid duplicates

        for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):

            with torch.no_grad():
                outputs = model(input_ids=chunk)
                logits = outputs.logits  # Shape: [1, seq_length, vocab_size]

            tokens = tokenizer.convert_ids_to_tokens(chunk.squeeze())
            num_tokens = len(tokens)

            chunk_data = []

            # Determine the starting index for predictions
            if i == 0:
                # For the first chunk, start from index 1 (the first token has no previous context)
                start_idx = 1
            else:
                # For subsequent chunks, skip tokens that were already processed in the overlap
                start_idx = overlap_size

            # Loop over the tokens to predict
            for j in range(start_idx, num_tokens):
                # Compute per-token metrics
                per_token_metrics = self._compute_per_token_metrics(logits, chunk, tokens, j, tokenizer)
                chunk_data.append(per_token_metrics)

            # Append the chunk data to the per_token_data list
            per_token_data.extend(chunk_data)
            total_processed_tokens += len(chunk_data)

        return per_token_data

    def _compute_per_token_metrics(self, logits, chunk, tokens, j, tokenizer):

        # The model predicts the token at position j from the tokens up to position j-1,
        # so the logits at position j-1 correspond to predictions for the token at position j.
        token_logits = logits[:, j - 1, :]  # Shape: [1, vocab_size]
        token_probs = F.softmax(token_logits, dim=-1)
        token_logprobs = F.log_softmax(token_logits, dim=-1)

        actual_token_id = chunk[:, j]  # The actual token at position j
        logprob_actual = token_logprobs[0, actual_token_id].item()
        max_logprob, max_token_id = torch.max(token_logprobs, dim=-1)
        max_logprob = max_logprob.item()
        max_token_id = max_token_id.item()
        entropy = -(token_probs * token_logprobs).sum().item()

        most_likely_token = tokenizer.convert_ids_to_tokens([max_token_id])[0]
        token = tokens[j]  # The token at position j

        return {
            'token': token,
            'logprob_actual': logprob_actual,
            'logprob_max': max_logprob,
            'entropy': entropy,
            'most_likely_token': most_likely_token
        }

    def _split_into_chunks(self, input_ids, chunk_size, overlap_size):

        input_ids = input_ids.squeeze()
        input_length = input_ids.size(0)
        stride = chunk_size - overlap_size
        chunks = []

        for i in range(0, input_length, stride):
            end_index = min(i + chunk_size, input_length)
            chunk = input_ids[i:end_index]
            chunks.append(chunk.unsqueeze(0).to(self.device))
            if end_index == input_length:
                break

        return chunks
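To illustrate the chunking arithmetic, a small sketch that exercises _split_into_chunks directly. The option values are hypothetical; in the pipeline, the tokenizer and model passed to extract_features come from TextTokenizer and Model.

import torch
from pelican_nlp.extraction.extract_logits import LogitsExtractor

# Hypothetical options; 'chunk_size' and 'overlap_size' are the keys read in extract_features.
options = {"model_name": "DiscoResearch/Llama3-German-8B-32k", "chunk_size": 8, "overlap_size": 2}
extractor = LogitsExtractor(options, pipeline=None, project_path="/path/to/project")

# 20 fake token ids, chunked with stride = chunk_size - overlap_size = 6
input_ids = torch.arange(20).unsqueeze(0)
chunks = extractor._split_into_chunks(input_ids, options["chunk_size"], options["overlap_size"])
print([c.shape[-1] for c in chunks])   # [8, 8, 8]
# The first chunk scores tokens 1..7; later chunks start at index overlap_size,
# so tokens already scored in the overlap are not counted twice.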
pelican_nlp/extraction/language_model.py
@@ -0,0 +1,71 @@
import torch
import psutil

from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
from transformers import AutoModelForCausalLM


class Model:
    def __init__(self, model_name, project_path):
        self.model_name = model_name
        self.model_instance = None
        self.device_map = None
        self.PROJECT_PATH = project_path

    def load_model(self, empty_weights=False, trust_remote_code=False):
        """Loads and configures the model."""

        if self.model_name == 'fastText':
            import fasttext
            import fasttext.util
            fasttext.util.download_model('de', if_exists='ignore')
            self.model_instance = fasttext.load_model('cc.de.300.bin')
            print('FastText model loaded.')
        elif self.model_name == 'xlm-roberta-base':
            from transformers import AutoModel
            self.model_instance = AutoModel.from_pretrained(
                self.model_name,
                trust_remote_code=trust_remote_code,
                use_safetensors=True
            )
            print('RoBERTa model loaded.')
        elif self.model_name == 'DiscoResearch/Llama3-German-8B-32k':
            if empty_weights:
                with init_empty_weights():
                    self.model_instance = AutoModelForCausalLM.from_pretrained(
                        self.model_name,
                        trust_remote_code=trust_remote_code,
                        use_safetensors=True
                    )
            else:
                self.model_instance = AutoModelForCausalLM.from_pretrained(
                    self.model_name,
                    trust_remote_code=trust_remote_code,
                    use_safetensors=True
                )
            print('Llama3-German-8B-32k loaded.')
        else:
            raise ValueError("Invalid model name.")

        if self.model_name in ('xlm-roberta-base', 'DiscoResearch/Llama3-German-8B-32k'):
            # Additional model setup: create a device map and dispatch the model
            self.device_map_creation()

            self.model_instance = dispatch_model(self.model_instance, device_map=self.device_map)
            print('Model dispatched to appropriate devices.')

    def device_map_creation(self):
        # Reserve a few GB of RAM headroom for the rest of the pipeline
        available_RAM = str(int(psutil.virtual_memory().total / (1024 ** 3)) - 3) + 'GB'
        max_memory = {'cpu': available_RAM, 'disk': '200GB'}

        # Check if CUDA is available
        if not torch.cuda.is_available():
            print('Careful: CUDA not available, using CPU. This will be very slow.')
        else:
            print(f'{torch.cuda.get_device_name(0)} available.')
            # Leave ~3GB of VRAM headroom for activations
            available_VRAM = str(int(torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)) - 3) + 'GB'
            max_memory[0] = available_VRAM

        # Create the device map
        self.device_map = infer_auto_device_map(self.model_instance, max_memory=max_memory)
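A minimal sketch of loading a model through the Model wrapper; 'xlm-roberta-base' is one of the three names load_model accepts, and downloading and dispatching the weights happens as a side effect of the call. The project path is a placeholder.

from pelican_nlp.extraction.language_model import Model

# 'xlm-roberta-base' is one of the supported model names in load_model()
model = Model('xlm-roberta-base', project_path='/path/to/project')
model.load_model()                  # downloads weights on first use, then dispatches to GPU/CPU
encoder = model.model_instance      # the underlying transformers AutoModel instance
print(type(encoder).__name__)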
pelican_nlp/extraction/semantic_similarity.py
@@ -0,0 +1,60 @@
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, cosine


def calculate_semantic_similarity(embedding_vectors):
    # Extract just the vectors from the list of (token, vector) tuples
    vectors = [vector for _, vector in embedding_vectors]
    consecutive_similarities = get_consecutive_vector_similarities(vectors)
    mean_similarity = np.nanmean(consecutive_similarities)
    return consecutive_similarities, mean_similarity


def get_consecutive_vector_similarities(vectors):
    return [1 - cosine(vectors[i - 1], vectors[i]) for i in range(1, len(vectors))]


def get_cosine_similarity_matrix(embedding_vectors):
    # Extract just the vectors from the list of (token, vector) tuples
    vectors = [vector for _, vector in embedding_vectors]
    similarity_matrix = 1 - cdist(vectors, vectors, 'cosine')
    np.fill_diagonal(similarity_matrix, np.nan)
    return similarity_matrix


def get_semantic_similarity_windows(embedding_vectors, window_size):
    # Extract tokens and vectors from the list of tuples
    tokens, vectors = zip(*embedding_vectors)

    # Early return if not enough tokens
    if len(tokens) < 2:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    # Whole-sequence statistics; check this before the numeric comparison below,
    # since window_size may be the string 'all'
    if window_size == 'all':
        cosine_similarity_matrix = get_cosine_similarity_matrix(embedding_vectors)
        return calculate_window_statistics(cosine_similarity_matrix)

    # Early return if window size is larger than the sequence
    if window_size > len(tokens):
        return np.nan, np.nan, np.nan, np.nan, np.nan

    # Collect window statistics
    window_statistics = []
    for i in range(len(tokens) - window_size + 1):
        window_vectors = list(zip(tokens[i:i + window_size], vectors[i:i + window_size]))
        if window_vectors:  # Make sure window is not empty
            sim_matrix = get_cosine_similarity_matrix(window_vectors)
            window_statistics.append(calculate_window_statistics(sim_matrix))

    # Handle the case where no valid windows were found
    if not window_statistics:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    # Unzip the statistics
    window_means, window_stds, window_medians = zip(*window_statistics)

    return (np.mean(window_means), np.std(window_means),
            np.mean(window_stds), np.std(window_stds),
            np.mean(window_medians))


def calculate_window_statistics(cosine_similarity_matrix):
    # stack() drops the NaN diagonal, so statistics cover off-diagonal pairs only
    matrix_values = pd.DataFrame(cosine_similarity_matrix).stack()
    return matrix_values.mean(), matrix_values.std(), matrix_values.median()
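Finally, a toy sketch of the similarity metrics above; the (token, vector) tuple format matches what the fastText branch of EmbeddingsExtractor produces, and the random 300-dimensional vectors are purely illustrative.

import numpy as np
from pelican_nlp.extraction.semantic_similarity import (
    calculate_semantic_similarity,
    get_semantic_similarity_windows,
)

# (token, vector) pairs, as produced e.g. by the fastText branch of the embeddings extractor
rng = np.random.default_rng(1)
embedding_vectors = [(tok, rng.normal(size=300)) for tok in ["der", "hund", "laeuft", "im", "park"]]

# Mean cosine similarity between consecutive tokens
consecutive, mean_sim = calculate_semantic_similarity(embedding_vectors)
print(mean_sim)

# Mean/std of window means and stds, plus mean of window medians, over windows of 3 tokens
stats = get_semantic_similarity_windows(embedding_vectors, window_size=3)
print(stats)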