semantic_compressor-1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressor/__init__.py +0 -0
- compressor/minbpe/__init__.py +3 -0
- compressor/minbpe/base.py +165 -0
- compressor/minbpe/basic.py +74 -0
- compressor/minbpe/regex.py +128 -0
- compressor/resources/embedding_model.onnx +0 -0
- compressor/resources/en_stopwords.pkl +0 -0
- compressor/resources/lid.176.ftz +0 -0
- compressor/resources/pt_stopwords.pkl +0 -0
- compressor/semantic.py +171 -0
- semantic_compressor-1.0.dist-info/LICENSE +21 -0
- semantic_compressor-1.0.dist-info/METADATA +18 -0
- semantic_compressor-1.0.dist-info/RECORD +15 -0
- semantic_compressor-1.0.dist-info/WHEEL +5 -0
- semantic_compressor-1.0.dist-info/top_level.txt +1 -0
compressor/__init__.py
ADDED
File without changes
compressor/minbpe/base.py
ADDED
@@ -0,0 +1,165 @@
"""
Contains the base Tokenizer class and a few common helper functions.
The base class also contains the (common) save/load functionality.
It would be possible to be a lot more strict about the interface and
e.g. isolating all regex/pattern parts to the RegexTokenizer, but
some concessions are made for simplicity.
"""
import unicodedata

# -----------------------------------------------------------------------------
# a few helper functions useful for both BasicTokenizer and RegexTokenizer

def get_stats(ids, counts=None):
    """
    Given a list of integers, return a dictionary of counts of consecutive pairs
    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    Optionally allows to update an existing dictionary of counts
    """
    counts = {} if counts is None else counts
    for pair in zip(ids, ids[1:]): # iterate consecutive elements
        counts[pair] = counts.get(pair, 0) + 1
    return counts


def merge(ids, pair, idx):
    """
    In the list of integers (ids), replace all consecutive occurrences
    of pair with the new integer token idx
    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    newids = []
    i = 0
    while i < len(ids):
        # if not at the very last position AND the pair matches, replace it
        if ids[i] == pair[0] and i < len(ids) - 1 and ids[i+1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

# first two helper functions...
def replace_control_characters(s: str) -> str:
    # we don't want to print control characters
    # which distort the output (e.g. \n or much worse)
    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
    # http://www.unicode.org/reports/tr44/#GC_Values_Table
    chars = []
    for ch in s:
        if unicodedata.category(ch)[0] != "C":
            chars.append(ch) # this character is ok
        else:
            chars.append(f"\\u{ord(ch):04x}") # escape
    return "".join(chars)

def render_token(t: bytes) -> str:
    # pretty print a token, escaping control characters
    s = t.decode('utf-8', errors='replace')
    s = replace_control_characters(s)
    return s

# -----------------------------------------------------------------------------
# the base Tokenizer class

class Tokenizer:
    """Base class for Tokenizers"""

    def __init__(self):
        # default: vocab size of 256 (all bytes), no merges, no patterns
        self.merges = {} # (int, int) -> int
        self.pattern = "" # str
        self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
        self.vocab = self._build_vocab() # int -> bytes

    def train(self, text, vocab_size, verbose=False):
        # Tokenizer can train a vocabulary of size vocab_size from text
        raise NotImplementedError

    def encode(self, text):
        # Tokenizer can encode a string into a list of integers
        raise NotImplementedError

    def decode(self, ids):
        # Tokenizer can decode a list of integers into a string
        raise NotImplementedError

    def _build_vocab(self):
        # vocab is simply and deterministically derived from merges
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        for special, idx in self.special_tokens.items():
            vocab[idx] = special.encode("utf-8")
        return vocab

    def save(self, file_prefix):
        """
        Saves two files: file_prefix.vocab and file_prefix.model
        This is inspired by (but not equivalent to!) sentencepiece's model saving:
        - model file is the critical one, intended for load()
        - vocab file is just a pretty printed version for human inspection only
        """
        # write the model: to be used in load() later
        model_file = file_prefix + ".model"
        with open(model_file, 'w') as f:
            # write the version, pattern and merges, that's all that's needed
            f.write("minbpe v1\n")
            f.write(f"{self.pattern}\n")
            # write the special tokens, first the number of them, then each one
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            # the merges dict
            for idx1, idx2 in self.merges:
                f.write(f"{idx1} {idx2}\n")
        # write the vocab: for the human to look at
        vocab_file = file_prefix + ".vocab"
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in self.vocab.items():
                # note: many tokens may be partial utf-8 sequences
                # and cannot be decoded into valid strings. Here we're using
                # errors='replace' to replace them with the replacement char �.
                # this also means that we couldn't possibly use .vocab in load()
                # because decoding in this way is a lossy operation!
                s = render_token(token)
                # find the children of this token, if any
                if idx in inverted_merges:
                    # if this token has children, render it nicely as a merge
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    # otherwise this is leaf token, just print it
                    # (this should just be the first 256 tokens, the bytes)
                    f.write(f"[{s}] {idx}\n")

    def load(self, model_file):
        """Inverse of save() but only for the model file"""
        assert model_file.endswith(".model")
        # read the model file
        merges = {}
        special_tokens = {}
        idx = 256
        with open(model_file, 'r', encoding="utf-8") as f:
            # read the version
            version = f.readline().strip()
            assert version == "minbpe v1"
            # read the pattern
            self.pattern = f.readline().strip()
            # read the special tokens
            num_special = int(f.readline().strip())
            for _ in range(num_special):
                special, special_idx = f.readline().strip().split()
                special_tokens[special] = int(special_idx)
            # read the merges
            for line in f:
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()
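Editor's sketch (not shipped with the package): the two helpers above compose into a single BPE merge step, assuming compressor.minbpe.base is importable from the installed wheel.

from compressor.minbpe.base import get_stats, merge

ids = list("aaabdaaabac".encode("utf-8"))   # text as raw byte values (ints 0..255)
stats = get_stats(ids)                      # counts of consecutive byte pairs
top_pair = max(stats, key=stats.get)        # most frequent pair, here (97, 97) i.e. "aa"
ids = merge(ids, top_pair, 256)             # replace every occurrence with new token id 256
print(top_pair, ids)                        # (97, 97) [256, 97, 98, 100, 256, 97, 98, 97, 99]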
compressor/minbpe/basic.py
ADDED
@@ -0,0 +1,74 @@
"""
Minimal (byte-level) Byte Pair Encoding tokenizer.

Algorithmically follows along the GPT tokenizer:
https://github.com/openai/gpt-2/blob/master/src/encoder.py

But:
- Does not handle the regular expression splitting pattern.
- Does not handle any special tokens.
"""

from .base import Tokenizer, get_stats, merge


class BasicTokenizer(Tokenizer):

    def __init__(self):
        super().__init__()

    def train(self, text, vocab_size, verbose=False):
        assert vocab_size >= 256
        num_merges = vocab_size - 256

        # input text preprocessing
        text_bytes = text.encode("utf-8") # raw bytes
        ids = list(text_bytes) # list of integers in range 0..255

        # iteratively merge the most common pairs to create new tokens
        merges = {} # (int, int) -> int
        vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
        for i in range(num_merges):
            # count up the number of times every consecutive pair appears
            stats = get_stats(ids)
            # find the pair with the highest count
            pair = max(stats, key=stats.get)
            # mint a new token: assign it the next available id
            idx = 256 + i
            # replace all occurrences of pair in ids with idx
            ids = merge(ids, pair, idx)
            # save the merge
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
            # prints
            if verbose:
                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")

        # save class variables
        self.merges = merges # used in encode()
        self.vocab = vocab   # used in decode()

    def decode(self, ids):
        # given ids (list of integers), return Python string
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def encode(self, text):
        # given a string text, return the token ids
        text_bytes = text.encode("utf-8") # raw bytes
        ids = list(text_bytes) # list of integers in range 0..255
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            # subtle: if there are no more merges available, the key will
            # result in an inf for every single pair, and the min will be
            # just the first pair in the list, arbitrarily
            # we can detect this terminating case by a membership check
            if pair not in self.merges:
                break # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids
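Editor's sketch (my example, not from the package): BasicTokenizer trains on any string and round-trips text through encode/decode, since byte-level BPE is lossless.

from compressor.minbpe.basic import BasicTokenizer

tok = BasicTokenizer()
tok.train("the quick brown fox jumps over the lazy dog " * 50, vocab_size=300)  # 44 merges
ids = tok.encode("the lazy dog")
assert tok.decode(ids) == "the lazy dog"   # lossless round-trip
print(len(ids), ids[:10])                  # fewer ids than raw bytes, thanks to the merges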
compressor/minbpe/regex.py
ADDED
@@ -0,0 +1,128 @@
"""
Minimal (byte-level) Byte Pair Encoding tokenizer.

Algorithmically follows along the GPT tokenizer:
https://github.com/openai/gpt-2/blob/master/src/encoder.py

Unlike BasicTokenizer:
- RegexTokenizer handles an optional regex splitting pattern.
- RegexTokenizer handles optional special tokens.
"""

import regex as re
from .base import Tokenizer, get_stats, merge


# the main GPT text split patterns, see
# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""


class RegexTokenizer(Tokenizer):

    def __init__(self, pattern=None):
        """
        - pattern: optional string to override the default (GPT-4 split pattern)
        - special_tokens: str -> int dictionary of special tokens
          example: {'<|endoftext|>': 100257}
        """
        super().__init__()
        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
        self.compiled_pattern = re.compile(self.pattern)
        self.special_tokens = {}
        self.inverse_special_tokens = {}

    def register_special_tokens(self, special_tokens):
        # special_tokens is a dictionary of str -> int
        # example: {"<|endoftext|>": 100257}
        self.special_tokens = special_tokens
        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}

    def decode(self, ids):
        # given ids (list of integers), return Python string
        part_bytes = []
        for idx in ids:
            if idx in self.vocab:
                part_bytes.append(self.vocab[idx])
            elif idx in self.inverse_special_tokens:
                part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
            else:
                raise ValueError(f"invalid token id: {idx}")
        text_bytes = b"".join(part_bytes)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def _encode_chunk(self, text_bytes):
        # return the token ids
        # let's begin. first, convert all bytes to integers in range 0..255
        ids = list(text_bytes)
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            # subtle: if there are no more merges available, the key will
            # result in an inf for every single pair, and the min will be
            # just the first pair in the list, arbitrarily
            # we can detect this terminating case by a membership check
            if pair not in self.merges:
                break # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids

    def encode_ordinary(self, text):
        """Encoding that ignores any special tokens."""
        # split text into chunks of text by categories defined in regex pattern
        text_chunks = re.findall(self.compiled_pattern, text)
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for chunk in text_chunks:
            chunk_bytes = chunk.encode("utf-8") # raw bytes
            chunk_ids = self._encode_chunk(chunk_bytes)
            ids.extend(chunk_ids)
        return ids

    def encode(self, text, allowed_special="none_raise"):
        """
        Unlike encode_ordinary, this function handles special tokens.
        allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
        if none_raise, then an error is raised if any special token is encountered in text
        this is the default tiktoken behavior right now as well
        any other behavior is either annoying, or a major footgun
        """
        # decode the user desire w.r.t. handling of special tokens
        special = None
        if allowed_special == "all":
            special = self.special_tokens
        elif allowed_special == "none":
            special = {}
        elif allowed_special == "none_raise":
            special = {}
            assert all(token not in text for token in self.special_tokens)
        elif isinstance(allowed_special, set):
            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
        else:
            raise ValueError(f"allowed_special={allowed_special} not understood")
        if not special:
            # shortcut: if no special tokens, just use the ordinary encoding
            return self.encode_ordinary(text)
        # otherwise, we have to be careful with potential special tokens in text
        # we handle special tokens by splitting the text
        # based on the occurrence of any exact match with any of the special tokens
        # we can use re.split for this. note that surrounding the pattern with ()
        # makes it into a capturing group, so the special tokens will be included
        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
        special_chunks = re.split(special_pattern, text)
        # now all the special characters are separated from the rest of the text
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for part in special_chunks:
            if part in special:
                # this is a special token, encode it separately as a special case
                ids.append(special[part])
            else:
                # this is an ordinary sequence, encode it normally
                ids.extend(self.encode_ordinary(part))
        return ids
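Editor's sketch (my example, not from the package): the vendored RegexTokenizer above defines no train() of its own, so a fresh instance has no learned merges and simply emits raw byte-level ids plus any registered special-token ids; this is how compressor/semantic.py uses it for chunking and token counting.

from compressor.minbpe.regex import RegexTokenizer

tok = RegexTokenizer()                                  # defaults to the GPT-4 split pattern
tok.register_special_tokens({"<|endoftext|>": 100257})
ids = tok.encode("hello world<|endoftext|>", allowed_special="all")
print(ids)                                              # byte ids for "hello world", then 100257
print(tok.decode(ids))                                  # 'hello world<|endoftext|>'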
compressor/resources/embedding_model.onnx
ADDED
Binary file
compressor/resources/en_stopwords.pkl
ADDED
Binary file
compressor/resources/lid.176.ftz
ADDED
Binary file
compressor/resources/pt_stopwords.pkl
ADDED
Binary file
compressor/semantic.py
ADDED
@@ -0,0 +1,171 @@
import numpy as np, pickle, fasttext, os, traceback, importlib.resources
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from onnxruntime_extensions import get_library_path
from compressor.minbpe.regex import RegexTokenizer
from nltk.tokenize import sent_tokenize
from multiprocessing import cpu_count
import onnxruntime as ort

tokenizer = RegexTokenizer()
nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))

os.environ['NLTK_DATA'] = nltk_data_path

english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
langdetect_model = fasttext.load_model(fasttext_model_path)

# env var values are strings, so cast to int before handing to onnxruntime
embedding_model_cpu_count = int(os.environ.get('EMBEDDING_MODEL_CPU_COUNT', cpu_count() - 1))

_options = ort.SessionOptions()
_options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
_options.register_custom_ops_library(get_library_path())
_providers = ["CPUExecutionProvider"]

embedding_model = ort.InferenceSession(
    path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
    sess_options=_options,
    providers=_providers
)

def extract_embeddings(text):
    return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]

def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
    chunks = []
    current_chunk = []
    current_chunk_length = 0
    tokens = tokenizer.encode(full_text)
    for i, token in enumerate(tokens):
        if current_chunk_length + 1 > tokens_per_chunk:
            chunks.append(current_chunk)
            current_chunk = tokens[i-chunk_overlap:i] if i > chunk_overlap else []
            current_chunk_length = len(current_chunk)
        current_chunk.append(token)
        current_chunk_length += 1
    chunks.append(current_chunk)
    chunks = [tokenizer.decode(chunk) for chunk in chunks]
    return chunks

def count_tokens(text):
    return len(tokenizer.encode(text))

def detect_language(text):
    detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
    return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'

def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
    def calculate_similarity(embed1, embed2):
        return cosine_similarity([embed1], [embed2])[0][0]

    def create_lda_model(texts, stopwords):
        vectorizer = CountVectorizer(stop_words=stopwords)
        doc_term_matrix = vectorizer.fit_transform(texts)
        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        lda.fit(doc_term_matrix)
        return lda, vectorizer

    def get_topic_distribution(text, lda, vectorizer):
        vec = vectorizer.transform([text])
        return lda.transform(vec)[0]

    def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
        sentence_embedding = extract_embeddings(sentence)
        semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)

        topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
        topic_importance = np.max(topic_dist)

        # Calculate lexical diversity
        words = sentence.split()
        unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
        lexical_diversity = len(unique_words) / len(words) if words else 0

        # Combine factors
        importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
        return importance

    try:
        # Split the text into sentences
        sentences = sent_tokenize(full_text)

        final_sentences = []
        for s in sentences:
            broken_sentences = s.split('\n')
            final_sentences.extend(broken_sentences)
        sentences = final_sentences

        text_lang = detect_language(full_text)

        # Create LDA model
        lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)

        # Get document-level embedding
        doc_embedding = extract_embeddings(full_text)

        # Calculate importance for each sentence
        sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
                           for sentence in sentences]

        # Sort sentences by importance
        sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)

        # Determine how many words to keep
        total_words = sum(len(sentence.split()) for sentence in sentences)
        target_words = int(total_words * compression_rate)

        # Reconstruct the compressed text
        compressed_text = []
        current_words = 0
        for sentence, _ in sorted_sentences:
            sentence_words = len(sentence.split())
            if current_words + sentence_words <= target_words:
                compressed_text.append(sentence)
                current_words += sentence_words
            else:
                break

        if len(compressed_text) == 0:
            # Pick the first sentence if no compression is possible
            compressed_text = [sentences[0]]

        # Reorder sentences to maintain original flow
        compressed_text.sort(key=lambda x: sentences.index(x))

        return ' '.join(compressed_text)
    except Exception:
        traceback.print_exc()

    return full_text

def compress_text(text, *, target_token_count=None, compression_rate=0.7):
    """
    Compress text using either a compression rate or a target token count.
    If both are provided, the target token count takes precedence.

    Args:
        text (str): The text to be compressed.
        target_token_count (int, optional): The target token count for compression. Defaults to None.
        compression_rate (float, optional): Fraction of tokens to remove. Defaults to 0.7 (i.e. roughly a 70% reduction).

    Returns:
        str: The compressed text.
    """

    if target_token_count is None:
        compression_rate = 1 - compression_rate
        original_token_count = count_tokens(text)
        target_token_count = int(original_token_count * compression_rate)
    else:
        original_token_count = count_tokens(text)
        if original_token_count <= target_token_count:
            return text
        # Get the compression rate
        compression_rate = target_token_count / original_token_count

    return semantic_compress_text(text, compression_rate)
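Editor's sketch of the module's public entry points (my example, not from the package): it assumes the wheel is installed, the bundled resources and NLTK punkt data resolve at import time, and article.txt is a placeholder path; the exact output depends on the packaged embedding model.

from compressor.semantic import compress_text, count_tokens, detect_language

text = open("article.txt", encoding="utf-8").read()     # any English or Portuguese text
print(detect_language(text))                            # 'en' or 'pt'
print(count_tokens(text))                               # token count via the module-level RegexTokenizer

shorter = compress_text(text, compression_rate=0.7)     # drop roughly 70% of the tokens
bounded = compress_text(text, target_token_count=512)   # or aim for an approximate token budget
print(count_tokens(shorter), count_tokens(bounded))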
semantic_compressor-1.0.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Carlo Moro

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
semantic_compressor-1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,18 @@
Metadata-Version: 2.1
Name: semantic_compressor
Version: 1.0
Author: Carlo Moro
Author-email: Carlo Moro <cnmoro@gmail.com>
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy <2
Requires-Dist: nltk
Requires-Dist: scikit-learn
Requires-Dist: fasttext
Requires-Dist: onnxruntime
Requires-Dist: onnxruntime-extensions

semantic_compressor-1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
compressor/semantic.py,sha256=wXKmerx4j7cdYqjyHDJVkRX_k-2tbnBx5gNa6Qnpgtg,7148
compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5lDSEIBdXKcPc,71794489
compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
semantic_compressor-1.0.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
semantic_compressor-1.0.dist-info/METADATA,sha256=XNW1eUfEDeulFSHhs2dd7znll8RLiBRwqwzXZgI5gRA,517
semantic_compressor-1.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
semantic_compressor-1.0.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
semantic_compressor-1.0.dist-info/RECORD,,
semantic_compressor-1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
compressor