semantic_compressor-1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
compressor/__init__.py ADDED
(empty file)
compressor/minbpe/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .base import Tokenizer
+ from .basic import BasicTokenizer
+ from .regex import RegexTokenizer
compressor/minbpe/base.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Contains the base Tokenizer class and a few common helper functions.
+ The base class also contains the (common) save/load functionality.
+ It would be possible to be a lot more strict about the interface and
+ e.g. isolate all regex/pattern parts to the RegexTokenizer, but
+ some concessions are made for simplicity.
+ """
+ import unicodedata
+
+ # -----------------------------------------------------------------------------
+ # a few helper functions useful for both BasicTokenizer and RegexTokenizer
+
+ def get_stats(ids, counts=None):
+     """
+     Given a list of integers, return a dictionary of counts of consecutive pairs
+     Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
+     Optionally allows updating an existing dictionary of counts
+     """
+     counts = {} if counts is None else counts
+     for pair in zip(ids, ids[1:]): # iterate consecutive elements
+         counts[pair] = counts.get(pair, 0) + 1
+     return counts
+
+
+ def merge(ids, pair, idx):
+     """
+     In the list of integers (ids), replace all consecutive occurrences
+     of pair with the new integer token idx
+     Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
+     """
+     newids = []
+     i = 0
+     while i < len(ids):
+         # if not at the very last position AND the pair matches, replace it
+         if ids[i] == pair[0] and i < len(ids) - 1 and ids[i+1] == pair[1]:
+             newids.append(idx)
+             i += 2
+         else:
+             newids.append(ids[i])
+             i += 1
+     return newids
+
+ # two more helper functions, used when pretty printing tokens
+ def replace_control_characters(s: str) -> str:
+     # we don't want to print control characters
+     # which distort the output (e.g. \n or much worse)
+     # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
+     # http://www.unicode.org/reports/tr44/#GC_Values_Table
+     chars = []
+     for ch in s:
+         if unicodedata.category(ch)[0] != "C":
+             chars.append(ch) # this character is ok
+         else:
+             chars.append(f"\\u{ord(ch):04x}") # escape
+     return "".join(chars)
+
+ def render_token(t: bytes) -> str:
+     # pretty print a token, escaping control characters
+     s = t.decode('utf-8', errors='replace')
+     s = replace_control_characters(s)
+     return s
+
+ # -----------------------------------------------------------------------------
+ # the base Tokenizer class
+
+ class Tokenizer:
+     """Base class for Tokenizers"""
+
+     def __init__(self):
+         # default: vocab size of 256 (all bytes), no merges, no patterns
+         self.merges = {} # (int, int) -> int
+         self.pattern = "" # str
+         self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
+         self.vocab = self._build_vocab() # int -> bytes
+
+     def train(self, text, vocab_size, verbose=False):
+         # Tokenizer can train a vocabulary of size vocab_size from text
+         raise NotImplementedError
+
+     def encode(self, text):
+         # Tokenizer can encode a string into a list of integers
+         raise NotImplementedError
+
+     def decode(self, ids):
+         # Tokenizer can decode a list of integers into a string
+         raise NotImplementedError
+
+     def _build_vocab(self):
+         # vocab is simply and deterministically derived from merges
+         vocab = {idx: bytes([idx]) for idx in range(256)}
+         for (p0, p1), idx in self.merges.items():
+             vocab[idx] = vocab[p0] + vocab[p1]
+         for special, idx in self.special_tokens.items():
+             vocab[idx] = special.encode("utf-8")
+         return vocab
+
+     def save(self, file_prefix):
+         """
+         Saves two files: file_prefix.vocab and file_prefix.model
+         This is inspired by (but not equivalent to!) sentencepiece's model saving:
+         - model file is the critical one, intended for load()
+         - vocab file is just a pretty printed version for human inspection only
+         """
+         # write the model: to be used in load() later
+         model_file = file_prefix + ".model"
+         with open(model_file, 'w') as f:
+             # write the version, pattern and merges, that's all that's needed
+             f.write("minbpe v1\n")
+             f.write(f"{self.pattern}\n")
+             # write the special tokens, first the number of them, then each one
+             f.write(f"{len(self.special_tokens)}\n")
+             for special, idx in self.special_tokens.items():
+                 f.write(f"{special} {idx}\n")
+             # the merges dict
+             for idx1, idx2 in self.merges:
+                 f.write(f"{idx1} {idx2}\n")
+         # write the vocab: for the human to look at
+         vocab_file = file_prefix + ".vocab"
+         inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             for idx, token in self.vocab.items():
+                 # note: many tokens may be partial utf-8 sequences
+                 # and cannot be decoded into valid strings. Here we're using
+                 # errors='replace' to replace them with the replacement char �.
+                 # this also means that we couldn't possibly use .vocab in load()
+                 # because decoding in this way is a lossy operation!
+                 s = render_token(token)
+                 # find the children of this token, if any
+                 if idx in inverted_merges:
+                     # if this token has children, render it nicely as a merge
+                     idx0, idx1 = inverted_merges[idx]
+                     s0 = render_token(self.vocab[idx0])
+                     s1 = render_token(self.vocab[idx1])
+                     f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
+                 else:
+                     # otherwise this is a leaf token, just print it
+                     # (this should just be the first 256 tokens, the bytes)
+                     f.write(f"[{s}] {idx}\n")
+
+     def load(self, model_file):
+         """Inverse of save() but only for the model file"""
+         assert model_file.endswith(".model")
+         # read the model file
+         merges = {}
+         special_tokens = {}
+         idx = 256
+         with open(model_file, 'r', encoding="utf-8") as f:
+             # read the version
+             version = f.readline().strip()
+             assert version == "minbpe v1"
+             # read the pattern
+             self.pattern = f.readline().strip()
+             # read the special tokens
+             num_special = int(f.readline().strip())
+             for _ in range(num_special):
+                 special, special_idx = f.readline().strip().split()
+                 special_tokens[special] = int(special_idx)
+             # read the merges
+             for line in f:
+                 idx1, idx2 = map(int, line.split())
+                 merges[(idx1, idx2)] = idx
+                 idx += 1
+         self.merges = merges
+         self.special_tokens = special_tokens
+         self.vocab = self._build_vocab()
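The two helpers at the top of this file are the whole BPE engine: get_stats counts adjacent token pairs and merge replaces every occurrence of the chosen pair with a newly minted id, while Tokenizer.save()/load() round-trip only the pattern, special tokens and merges (the .vocab file is for human inspection only). A minimal sketch of one merge step, assuming the installed wheel exposes compressor.minbpe.base:

    from compressor.minbpe.base import get_stats, merge

    ids = [1, 2, 3, 1, 2]
    stats = get_stats(ids)                # {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    top_pair = max(stats, key=stats.get)  # (1, 2), the most frequent adjacent pair
    ids = merge(ids, top_pair, 256)       # [256, 3, 256]: the pair becomes one new token id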
compressor/minbpe/basic.py ADDED
@@ -0,0 +1,74 @@
+ """
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+ Algorithmically follows along with the GPT tokenizer:
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+ But:
+ - Does not handle the regular expression splitting pattern.
+ - Does not handle any special tokens.
+ """
+
+ from .base import Tokenizer, get_stats, merge
+
+
+ class BasicTokenizer(Tokenizer):
+
+     def __init__(self):
+         super().__init__()
+
+     def train(self, text, vocab_size, verbose=False):
+         assert vocab_size >= 256
+         num_merges = vocab_size - 256
+
+         # input text preprocessing
+         text_bytes = text.encode("utf-8") # raw bytes
+         ids = list(text_bytes) # list of integers in range 0..255
+
+         # iteratively merge the most common pairs to create new tokens
+         merges = {} # (int, int) -> int
+         vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
+         for i in range(num_merges):
+             # count up the number of times every consecutive pair appears
+             stats = get_stats(ids)
+             # find the pair with the highest count
+             pair = max(stats, key=stats.get)
+             # mint a new token: assign it the next available id
+             idx = 256 + i
+             # replace all occurrences of pair in ids with idx
+             ids = merge(ids, pair, idx)
+             # save the merge
+             merges[pair] = idx
+             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+             # prints
+             if verbose:
+                 print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+
+         # save class variables
+         self.merges = merges # used in encode()
+         self.vocab = vocab # used in decode()
+
+     def decode(self, ids):
+         # given ids (list of integers), return Python string
+         text_bytes = b"".join(self.vocab[idx] for idx in ids)
+         text = text_bytes.decode("utf-8", errors="replace")
+         return text
+
+     def encode(self, text):
+         # given a string text, return the token ids
+         text_bytes = text.encode("utf-8") # raw bytes
+         ids = list(text_bytes) # list of integers in range 0..255
+         while len(ids) >= 2:
+             # find the pair with the lowest merge index
+             stats = get_stats(ids)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             # subtle: if there are no more merges available, the key will
+             # result in an inf for every single pair, and the min will be
+             # just the first pair in the list, arbitrarily
+             # we can detect this terminating case by a membership check
+             if pair not in self.merges:
+                 break # nothing else can be merged anymore
+             # otherwise let's merge the best pair (lowest merge index)
+             idx = self.merges[pair]
+             ids = merge(ids, pair, idx)
+         return ids
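A short usage sketch for BasicTokenizer; the toy corpus and vocab size below are hypothetical, and vocab_size=260 simply asks for 4 merges on top of the 256 byte tokens:

    from compressor.minbpe.basic import BasicTokenizer

    tok = BasicTokenizer()
    tok.train("aaabdaaabac" * 10, vocab_size=260, verbose=True)  # learn 4 merges
    ids = tok.encode("aaabdaaabac")
    assert tok.decode(ids) == "aaabdaaabac"  # byte-level BPE round-trips exactly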
compressor/minbpe/regex.py ADDED
@@ -0,0 +1,128 @@
+ """
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+ Algorithmically follows along with the GPT tokenizer:
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+ Unlike BasicTokenizer:
+ - RegexTokenizer handles an optional regex splitting pattern.
+ - RegexTokenizer handles optional special tokens.
+ """
+
+ import regex as re
+ from .base import Tokenizer, get_stats, merge
+
+
+ # the main GPT text split patterns, see
+ # https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
+ GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+ GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+
+
+ class RegexTokenizer(Tokenizer):
+
+     def __init__(self, pattern=None):
+         """
+         - pattern: optional string to override the default (GPT-4 split pattern)
+         - special_tokens: str -> int dictionary of special tokens
+           example: {'<|endoftext|>': 100257}
+         """
+         super().__init__()
+         self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
+         self.compiled_pattern = re.compile(self.pattern)
+         self.special_tokens = {}
+         self.inverse_special_tokens = {}
+
+     def register_special_tokens(self, special_tokens):
+         # special_tokens is a dictionary of str -> int
+         # example: {"<|endoftext|>": 100257}
+         self.special_tokens = special_tokens
+         self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
+
+     def decode(self, ids):
+         # given ids (list of integers), return Python string
+         part_bytes = []
+         for idx in ids:
+             if idx in self.vocab:
+                 part_bytes.append(self.vocab[idx])
+             elif idx in self.inverse_special_tokens:
+                 part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
+             else:
+                 raise ValueError(f"invalid token id: {idx}")
+         text_bytes = b"".join(part_bytes)
+         text = text_bytes.decode("utf-8", errors="replace")
+         return text
+
+     def _encode_chunk(self, text_bytes):
+         # return the token ids
+         # let's begin. first, convert all bytes to integers in range 0..255
+         ids = list(text_bytes)
+         while len(ids) >= 2:
+             # find the pair with the lowest merge index
+             stats = get_stats(ids)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             # subtle: if there are no more merges available, the key will
+             # result in an inf for every single pair, and the min will be
+             # just the first pair in the list, arbitrarily
+             # we can detect this terminating case by a membership check
+             if pair not in self.merges:
+                 break # nothing else can be merged anymore
+             # otherwise let's merge the best pair (lowest merge index)
+             idx = self.merges[pair]
+             ids = merge(ids, pair, idx)
+         return ids
+
+     def encode_ordinary(self, text):
+         """Encoding that ignores any special tokens."""
+         # split text into chunks of text by categories defined in regex pattern
+         text_chunks = re.findall(self.compiled_pattern, text)
+         # all chunks of text are encoded separately, then results are joined
+         ids = []
+         for chunk in text_chunks:
+             chunk_bytes = chunk.encode("utf-8") # raw bytes
+             chunk_ids = self._encode_chunk(chunk_bytes)
+             ids.extend(chunk_ids)
+         return ids
+
+     def encode(self, text, allowed_special="none_raise"):
+         """
+         Unlike encode_ordinary, this function handles special tokens.
+         allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
+         if none_raise, then an error is raised if any special token is encountered in text
+         this is the default tiktoken behavior right now as well
+         any other behavior is either annoying, or a major footgun
+         """
+         # decode the user desire w.r.t. handling of special tokens
+         special = None
+         if allowed_special == "all":
+             special = self.special_tokens
+         elif allowed_special == "none":
+             special = {}
+         elif allowed_special == "none_raise":
+             special = {}
+             assert all(token not in text for token in self.special_tokens)
+         elif isinstance(allowed_special, set):
+             special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
+         else:
+             raise ValueError(f"allowed_special={allowed_special} not understood")
+         if not special:
+             # shortcut: if no special tokens, just use the ordinary encoding
+             return self.encode_ordinary(text)
+         # otherwise, we have to be careful with potential special tokens in text
+         # we handle special tokens by splitting the text
+         # based on the occurrence of any exact match with any of the special tokens
+         # we can use re.split for this. note that surrounding the pattern with ()
+         # makes it into a capturing group, so the special tokens will be included
+         special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
+         special_chunks = re.split(special_pattern, text)
+         # now all the special characters are separated from the rest of the text
+         # all chunks of text are encoded separately, then results are joined
+         ids = []
+         for part in special_chunks:
+             if part in special:
+                 # this is a special token, encode it separately as a special case
+                 ids.append(special[part])
+             else:
+                 # this is an ordinary sequence, encode it normally
+                 ids.extend(self.encode_ordinary(part))
+         return ids
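This file ships without a train() override, so out of the box the tokenizer falls back to raw byte ids plus any registered special tokens. A sketch of the special-token path (the id 100257 is just the example value used in the comments above):

    from compressor.minbpe.regex import RegexTokenizer

    tok = RegexTokenizer()                                  # GPT-4 split pattern by default
    tok.register_special_tokens({"<|endoftext|>": 100257})

    ids = tok.encode("hello world<|endoftext|>", allowed_special="all")
    print(tok.decode(ids))                                  # hello world<|endoftext|>

    # the default allowed_special="none_raise" would instead hit the assert,
    # because the input text contains a registered special token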
Binary file
Binary file
Binary file
compressor/semantic.py ADDED
@@ -0,0 +1,171 @@
+ import numpy as np, pickle, fasttext, os, traceback, importlib.resources
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation
+ from sklearn.metrics.pairwise import cosine_similarity
+ from onnxruntime_extensions import get_library_path
+ from compressor.minbpe.regex import RegexTokenizer
+ from nltk.tokenize import sent_tokenize
+ from multiprocessing import cpu_count
+ import onnxruntime as ort
+
+ tokenizer = RegexTokenizer()
+ nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
+
+ os.environ['NLTK_DATA'] = nltk_data_path
+
+ english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
+ portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
+ fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
+ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
+ portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
+ langdetect_model = fasttext.load_model(fasttext_model_path)
+
+ embedding_model_cpu_count = int(os.environ.get('EMBEDDING_MODEL_CPU_COUNT', cpu_count() - 1)) # env var values are strings, so cast to int
+
+ _options = ort.SessionOptions()
+ _options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
+ _options.register_custom_ops_library(get_library_path())
+ _providers = ["CPUExecutionProvider"]
+
+ embedding_model = ort.InferenceSession(
+     path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
+     sess_options=_options,
+     providers=_providers
+ )
+
+ def extract_embeddings(text):
+     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
+
+ def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
+     chunks = []
+     current_chunk = []
+     current_chunk_length = 0
+     tokens = tokenizer.encode(full_text)
+     for i, token in enumerate(tokens):
+         if current_chunk_length + 1 > tokens_per_chunk:
+             chunks.append(current_chunk)
+             current_chunk = tokens[i-chunk_overlap:i] if i > chunk_overlap else []
+             current_chunk_length = len(current_chunk)
+         current_chunk.append(token)
+         current_chunk_length += 1
+     chunks.append(current_chunk)
+     chunks = [tokenizer.decode(chunk) for chunk in chunks]
+     return chunks
+
+ def count_tokens(text):
+     return len(tokenizer.encode(text))
+
+ def detect_language(text):
+     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
+     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
+
+ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
+     def calculate_similarity(embed1, embed2):
+         return cosine_similarity([embed1], [embed2])[0][0]
+
+     def create_lda_model(texts, stopwords):
+         vectorizer = CountVectorizer(stop_words=stopwords)
+         doc_term_matrix = vectorizer.fit_transform(texts)
+         lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
+         lda.fit(doc_term_matrix)
+         return lda, vectorizer
+
+     def get_topic_distribution(text, lda, vectorizer):
+         vec = vectorizer.transform([text])
+         return lda.transform(vec)[0]
+
+     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
+         sentence_embedding = extract_embeddings(sentence)
+         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
+
+         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
+         topic_importance = np.max(topic_dist)
+
+         # Calculate lexical diversity
+         words = sentence.split()
+         unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
+         lexical_diversity = len(unique_words) / len(words) if words else 0
+
+         # Combine factors
+         importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
+         return importance
+
+     try:
+         # Split the text into sentences
+         sentences = sent_tokenize(full_text)
+
+         final_sentences = []
+         for s in sentences:
+             broken_sentences = s.split('\n')
+             final_sentences.extend(broken_sentences)
+         sentences = final_sentences
+
+         text_lang = detect_language(full_text)
+
+         # Create LDA model
+         lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
+
+         # Get document-level embedding
+         doc_embedding = extract_embeddings(full_text)
+
+         # Calculate importance for each sentence
+         sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
+                            for sentence in sentences]
+
+         # Sort sentences by importance
+         sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
+
+         # Determine how many words to keep
+         total_words = sum(len(sentence.split()) for sentence in sentences)
+         target_words = int(total_words * compression_rate)
+
+         # Reconstruct the compressed text
+         compressed_text = []
+         current_words = 0
+         for sentence, _ in sorted_sentences:
+             sentence_words = len(sentence.split())
+             if current_words + sentence_words <= target_words:
+                 compressed_text.append(sentence)
+                 current_words += sentence_words
+             else:
+                 break
+
+         if len(compressed_text) == 0:
+             # Pick the first sentence if no compression is possible
+             compressed_text = [sentences[0]]
+
+         # Reorder sentences to maintain original flow
+         compressed_text.sort(key=lambda x: sentences.index(x))
+
+         return ' '.join(compressed_text)
+     except Exception:
+         traceback.print_exc()
+
+     return full_text
+
+ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+     """
+     Compress text using either a compression rate or a target token count.
+     If both are provided, the target token count takes precedence.
+
+     Args:
+         text (str): The text to be compressed.
+         target_token_count (int, optional): The target token count for compression. Defaults to None.
+         compression_rate (float, optional): The fraction of the text to remove. Defaults to 0.7, i.e. a 70% reduction.
+
+     Returns:
+         str: The compressed text.
+     """
+
+     if target_token_count is None:
+         compression_rate = 1 - compression_rate
+         original_token_count = count_tokens(text)
+         target_token_count = int(original_token_count * compression_rate)
+     else:
+         original_token_count = count_tokens(text)
+         if original_token_count <= target_token_count:
+             return text
+         # Get the compression rate
+         compression_rate = target_token_count / original_token_count
+
+     return semantic_compress_text(text, compression_rate)
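A usage sketch for the public entry points; report.txt is a hypothetical input file, and the two calls only illustrate the calling conventions documented in compress_text:

    from compressor.semantic import compress_text, count_tokens

    document = open("report.txt", encoding="utf-8").read()  # hypothetical input

    # remove roughly 70% of the text (compression_rate is the fraction dropped)
    shorter = compress_text(document, compression_rate=0.7)

    # or aim for an explicit token budget instead; text already within
    # budget is returned unchanged
    budgeted = compress_text(document, target_token_count=512)

    print(count_tokens(shorter), count_tokens(budgeted))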
semantic_compressor-1.0.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Carlo Moro
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
semantic_compressor-1.0.dist-info/METADATA ADDED
@@ -0,0 +1,18 @@
+ Metadata-Version: 2.1
+ Name: semantic_compressor
+ Version: 1.0
+ Author: Carlo Moro
+ Author-email: Carlo Moro <cnmoro@gmail.com>
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.7
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy <2
+ Requires-Dist: nltk
+ Requires-Dist: scikit-learn
+ Requires-Dist: fasttext
+ Requires-Dist: onnxruntime
+ Requires-Dist: onnxruntime-extensions
+
1
+ compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ compressor/semantic.py,sha256=wXKmerx4j7cdYqjyHDJVkRX_k-2tbnBx5gNa6Qnpgtg,7148
3
+ compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
4
+ compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
5
+ compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
6
+ compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
7
+ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5lDSEIBdXKcPc,71794489
8
+ compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
9
+ compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
10
+ compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
11
+ semantic_compressor-1.0.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
12
+ semantic_compressor-1.0.dist-info/METADATA,sha256=XNW1eUfEDeulFSHhs2dd7znll8RLiBRwqwzXZgI5gRA,517
13
+ semantic_compressor-1.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
14
+ semantic_compressor-1.0.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
15
+ semantic_compressor-1.0.dist-info/RECORD,,
semantic_compressor-1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (72.1.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
semantic_compressor-1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ compressor