semantic-compressor 1.0__py3-none-any.whl

compressor/__init__.py ADDED
File without changes
compressor/minbpe/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .base import Tokenizer
+ from .basic import BasicTokenizer
+ from .regex import RegexTokenizer
compressor/minbpe/base.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Contains the base Tokenizer class and a few common helper functions.
+ The base class also contains the (common) save/load functionality.
+ It would be possible to be a lot more strict about the interface and
+ e.g. isolating all regex/pattern parts to the RegexTokenizer, but
+ some concessions are made for simplicity.
+ """
+ import unicodedata
+
+ # -----------------------------------------------------------------------------
+ # a few helper functions useful for both BasicTokenizer and RegexTokenizer
+
+ def get_stats(ids, counts=None):
+     """
+     Given a list of integers, return a dictionary of counts of consecutive pairs
+     Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
+     Optionally allows to update an existing dictionary of counts
+     """
+     counts = {} if counts is None else counts
+     for pair in zip(ids, ids[1:]): # iterate consecutive elements
+         counts[pair] = counts.get(pair, 0) + 1
+     return counts
+
+
+ def merge(ids, pair, idx):
+     """
+     In the list of integers (ids), replace all consecutive occurrences
+     of pair with the new integer token idx
+     Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
+     """
+     newids = []
+     i = 0
+     while i < len(ids):
+         # if not at the very last position AND the pair matches, replace it
+         if ids[i] == pair[0] and i < len(ids) - 1 and ids[i+1] == pair[1]:
+             newids.append(idx)
+             i += 2
+         else:
+             newids.append(ids[i])
+             i += 1
+     return newids
+
+ # helper functions for pretty-printing tokens
+ def replace_control_characters(s: str) -> str:
+     # we don't want to print control characters
+     # which distort the output (e.g. \n or much worse)
+     # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
+     # http://www.unicode.org/reports/tr44/#GC_Values_Table
+     chars = []
+     for ch in s:
+         if unicodedata.category(ch)[0] != "C":
+             chars.append(ch) # this character is ok
+         else:
+             chars.append(f"\\u{ord(ch):04x}") # escape
+     return "".join(chars)
+
+ def render_token(t: bytes) -> str:
+     # pretty print a token, escaping control characters
+     s = t.decode('utf-8', errors='replace')
+     s = replace_control_characters(s)
+     return s
+
+ # -----------------------------------------------------------------------------
+ # the base Tokenizer class
+
+ class Tokenizer:
+     """Base class for Tokenizers"""
+
+     def __init__(self):
+         # default: vocab size of 256 (all bytes), no merges, no patterns
+         self.merges = {} # (int, int) -> int
+         self.pattern = "" # str
+         self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
+         self.vocab = self._build_vocab() # int -> bytes
+
+     def train(self, text, vocab_size, verbose=False):
+         # Tokenizer can train a vocabulary of size vocab_size from text
+         raise NotImplementedError
+
+     def encode(self, text):
+         # Tokenizer can encode a string into a list of integers
+         raise NotImplementedError
+
+     def decode(self, ids):
+         # Tokenizer can decode a list of integers into a string
+         raise NotImplementedError
+
+     def _build_vocab(self):
+         # vocab is simply and deterministically derived from merges
+         vocab = {idx: bytes([idx]) for idx in range(256)}
+         for (p0, p1), idx in self.merges.items():
+             vocab[idx] = vocab[p0] + vocab[p1]
+         for special, idx in self.special_tokens.items():
+             vocab[idx] = special.encode("utf-8")
+         return vocab
+
+     def save(self, file_prefix):
+         """
+         Saves two files: file_prefix.vocab and file_prefix.model
+         This is inspired by (but not equivalent to!) sentencepiece's model saving:
+         - the model file is the critical one, intended for load()
+         - the vocab file is just a pretty-printed version for human inspection only
+         """
+         # write the model: to be used in load() later
+         model_file = file_prefix + ".model"
+         with open(model_file, 'w') as f:
+             # write the version, pattern and merges, that's all that's needed
+             f.write("minbpe v1\n")
+             f.write(f"{self.pattern}\n")
+             # write the special tokens, first the number of them, then each one
+             f.write(f"{len(self.special_tokens)}\n")
+             for special, idx in self.special_tokens.items():
+                 f.write(f"{special} {idx}\n")
+             # the merges dict
+             for idx1, idx2 in self.merges:
+                 f.write(f"{idx1} {idx2}\n")
+         # write the vocab: for the human to look at
+         vocab_file = file_prefix + ".vocab"
+         inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             for idx, token in self.vocab.items():
+                 # note: many tokens may be partial utf-8 sequences
+                 # and cannot be decoded into valid strings. Here we're using
+                 # errors='replace' to replace them with the replacement char �.
+                 # this also means that we couldn't possibly use .vocab in load()
+                 # because decoding in this way is a lossy operation!
+                 s = render_token(token)
+                 # find the children of this token, if any
+                 if idx in inverted_merges:
+                     # if this token has children, render it nicely as a merge
+                     idx0, idx1 = inverted_merges[idx]
+                     s0 = render_token(self.vocab[idx0])
+                     s1 = render_token(self.vocab[idx1])
+                     f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
+                 else:
+                     # otherwise this is a leaf token, just print it
+                     # (this should just be the first 256 tokens, the bytes)
+                     f.write(f"[{s}] {idx}\n")
+
+     def load(self, model_file):
+         """Inverse of save() but only for the model file"""
+         assert model_file.endswith(".model")
+         # read the model file
+         merges = {}
+         special_tokens = {}
+         idx = 256
+         with open(model_file, 'r', encoding="utf-8") as f:
+             # read the version
+             version = f.readline().strip()
+             assert version == "minbpe v1"
+             # read the pattern
+             self.pattern = f.readline().strip()
+             # read the special tokens
+             num_special = int(f.readline().strip())
+             for _ in range(num_special):
+                 special, special_idx = f.readline().strip().split()
+                 special_tokens[special] = int(special_idx)
+             # read the merges
+             for line in f:
+                 idx1, idx2 = map(int, line.split())
+                 merges[(idx1, idx2)] = idx
+                 idx += 1
+         self.merges = merges
+         self.special_tokens = special_tokens
+         self.vocab = self._build_vocab()
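The get_stats and merge helpers above are the two BPE primitives the rest of the package builds on: count adjacent pairs, then collapse one pair into a fresh token id. A minimal sketch of a single merge step (not part of the wheel; it assumes the compressor.minbpe layout recorded in the wheel's RECORD file):

    from compressor.minbpe.base import get_stats, merge

    ids = [1, 2, 3, 1, 2]
    stats = get_stats(ids)            # {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    best = max(stats, key=stats.get)  # (1, 2), the most frequent pair
    ids = merge(ids, best, 4)         # [4, 3, 4]: every (1, 2) becomes token 4
    print(stats, best, ids)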
compressor/minbpe/basic.py ADDED
@@ -0,0 +1,74 @@
+ """
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+ Algorithmically follows along the GPT tokenizer:
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+ But:
+ - Does not handle the regular expression splitting pattern.
+ - Does not handle any special tokens.
+ """
+
+ from .base import Tokenizer, get_stats, merge
+
+
+ class BasicTokenizer(Tokenizer):
+
+     def __init__(self):
+         super().__init__()
+
+     def train(self, text, vocab_size, verbose=False):
+         assert vocab_size >= 256
+         num_merges = vocab_size - 256
+
+         # input text preprocessing
+         text_bytes = text.encode("utf-8") # raw bytes
+         ids = list(text_bytes) # list of integers in range 0..255
+
+         # iteratively merge the most common pairs to create new tokens
+         merges = {} # (int, int) -> int
+         vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
+         for i in range(num_merges):
+             # count up the number of times every consecutive pair appears
+             stats = get_stats(ids)
+             # find the pair with the highest count
+             pair = max(stats, key=stats.get)
+             # mint a new token: assign it the next available id
+             idx = 256 + i
+             # replace all occurrences of pair in ids with idx
+             ids = merge(ids, pair, idx)
+             # save the merge
+             merges[pair] = idx
+             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+             # prints
+             if verbose:
+                 print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+
+         # save class variables
+         self.merges = merges # used in encode()
+         self.vocab = vocab # used in decode()
+
+     def decode(self, ids):
+         # given ids (list of integers), return Python string
+         text_bytes = b"".join(self.vocab[idx] for idx in ids)
+         text = text_bytes.decode("utf-8", errors="replace")
+         return text
+
+     def encode(self, text):
+         # given a string text, return the token ids
+         text_bytes = text.encode("utf-8") # raw bytes
+         ids = list(text_bytes) # list of integers in range 0..255
+         while len(ids) >= 2:
+             # find the pair with the lowest merge index
+             stats = get_stats(ids)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             # subtle: if there are no more merges available, the key will
+             # result in an inf for every single pair, and the min will be
+             # just the first pair in the list, arbitrarily
+             # we can detect this terminating case by a membership check
+             if pair not in self.merges:
+                 break # nothing else can be merged anymore
+             # otherwise let's merge the best pair (lowest merge index)
+             idx = self.merges[pair]
+             ids = merge(ids, pair, idx)
+         return ids
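BasicTokenizer ties those primitives into train/encode/decode, with save/load inherited from the base class. An illustrative round trip (not shipped in the package; the training string and the "toy" file prefix are arbitrary):

    from compressor.minbpe.basic import BasicTokenizer

    tok = BasicTokenizer()
    tok.train("aaabdaaabac" * 100, vocab_size=256 + 3)  # learn 3 merges on top of the 256 byte tokens
    ids = tok.encode("aaabdaaabac")
    assert tok.decode(ids) == "aaabdaaabac"             # byte-level BPE round-trips losslessly
    tok.save("toy")                                     # writes toy.model (for load) and toy.vocab (human-readable)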
compressor/minbpe/regex.py ADDED
@@ -0,0 +1,128 @@
+ """
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+ Algorithmically follows along the GPT tokenizer:
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+ Unlike BasicTokenizer:
+ - RegexTokenizer handles an optional regex splitting pattern.
+ - RegexTokenizer handles optional special tokens.
+ """
+
+ import regex as re
+ from .base import Tokenizer, get_stats, merge
+
+
+ # the main GPT text split patterns, see
+ # https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
+ GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+ GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+
+
+ class RegexTokenizer(Tokenizer):
+
+     def __init__(self, pattern=None):
+         """
+         - pattern: optional string to override the default (GPT-4 split pattern)
+         - special_tokens: str -> int dictionary of special tokens
+           example: {'<|endoftext|>': 100257}
+         """
+         super().__init__()
+         self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
+         self.compiled_pattern = re.compile(self.pattern)
+         self.special_tokens = {}
+         self.inverse_special_tokens = {}
+
+     def register_special_tokens(self, special_tokens):
+         # special_tokens is a dictionary of str -> int
+         # example: {"<|endoftext|>": 100257}
+         self.special_tokens = special_tokens
+         self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
+
+     def decode(self, ids):
+         # given ids (list of integers), return Python string
+         part_bytes = []
+         for idx in ids:
+             if idx in self.vocab:
+                 part_bytes.append(self.vocab[idx])
+             elif idx in self.inverse_special_tokens:
+                 part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
+             else:
+                 raise ValueError(f"invalid token id: {idx}")
+         text_bytes = b"".join(part_bytes)
+         text = text_bytes.decode("utf-8", errors="replace")
+         return text
+
+     def _encode_chunk(self, text_bytes):
+         # return the token ids
+         # let's begin. first, convert all bytes to integers in range 0..255
+         ids = list(text_bytes)
+         while len(ids) >= 2:
+             # find the pair with the lowest merge index
+             stats = get_stats(ids)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             # subtle: if there are no more merges available, the key will
+             # result in an inf for every single pair, and the min will be
+             # just the first pair in the list, arbitrarily
+             # we can detect this terminating case by a membership check
+             if pair not in self.merges:
+                 break # nothing else can be merged anymore
+             # otherwise let's merge the best pair (lowest merge index)
+             idx = self.merges[pair]
+             ids = merge(ids, pair, idx)
+         return ids
+
+     def encode_ordinary(self, text):
+         """Encoding that ignores any special tokens."""
+         # split text into chunks of text by categories defined in regex pattern
+         text_chunks = re.findall(self.compiled_pattern, text)
+         # all chunks of text are encoded separately, then results are joined
+         ids = []
+         for chunk in text_chunks:
+             chunk_bytes = chunk.encode("utf-8") # raw bytes
+             chunk_ids = self._encode_chunk(chunk_bytes)
+             ids.extend(chunk_ids)
+         return ids
+
+     def encode(self, text, allowed_special="none_raise"):
+         """
+         Unlike encode_ordinary, this function handles special tokens.
+         allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
+         if none_raise, then an error is raised if any special token is encountered in text
+         this is the default tiktoken behavior right now as well
+         any other behavior is either annoying, or a major footgun
+         """
+         # decode the user desire w.r.t. handling of special tokens
+         special = None
+         if allowed_special == "all":
+             special = self.special_tokens
+         elif allowed_special == "none":
+             special = {}
+         elif allowed_special == "none_raise":
+             special = {}
+             assert all(token not in text for token in self.special_tokens)
+         elif isinstance(allowed_special, set):
+             special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
+         else:
+             raise ValueError(f"allowed_special={allowed_special} not understood")
+         if not special:
+             # shortcut: if no special tokens, just use the ordinary encoding
+             return self.encode_ordinary(text)
+         # otherwise, we have to be careful with potential special tokens in text
+         # we handle special tokens by splitting the text
+         # based on the occurrence of any exact match with any of the special tokens
+         # we can use re.split for this. note that surrounding the pattern with ()
+         # makes it into a capturing group, so the special tokens will be included
+         special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
+         special_chunks = re.split(special_pattern, text)
+         # now all the special characters are separated from the rest of the text
+         # all chunks of text are encoded separately, then results are joined
+         ids = []
+         for part in special_chunks:
+             if part in special:
+                 # this is a special token, encode it separately as a special case
+                 ids.append(special[part])
+             else:
+                 # this is an ordinary sequence, encode it normally
+                 ids.extend(self.encode_ordinary(part))
+         return ids
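Note that this trimmed-down copy of minbpe's RegexTokenizer ships no train() method, so its merges table stays empty and encode() falls back to raw UTF-8 bytes plus any registered special tokens; compressor/semantic.py relies on exactly that behavior for token counting and chunking. A small sketch of the special-token handling (not part of the wheel; the id 100257 is just the example used in the docstrings above):

    from compressor.minbpe.regex import RegexTokenizer

    tok = RegexTokenizer()                      # GPT-4 split pattern, no merges trained
    tok.register_special_tokens({"<|endoftext|>": 100257})
    ids = tok.encode("hello world<|endoftext|>", allowed_special="all")
    print(ids[-1])                              # 100257, the special token id
    print(tok.decode(ids))                      # "hello world<|endoftext|>"
    # with the default allowed_special="none_raise", the same call raises an
    # AssertionError because a registered special token occurs in the text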
Binary file
Binary file
Binary file
compressor/semantic.py ADDED
@@ -0,0 +1,171 @@
+ import numpy as np, pickle, fasttext, os, traceback, importlib.resources
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation
+ from sklearn.metrics.pairwise import cosine_similarity
+ from onnxruntime_extensions import get_library_path
+ from compressor.minbpe.regex import RegexTokenizer
+ from nltk.tokenize import sent_tokenize
+ from multiprocessing import cpu_count
+ import onnxruntime as ort
+
+ tokenizer = RegexTokenizer()
+ nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
+
+ os.environ['NLTK_DATA'] = nltk_data_path
+
+ english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
+ portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
+ fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
+ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
+ portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
+ langdetect_model = fasttext.load_model(fasttext_model_path)
+
+ embedding_model_cpu_count = int(os.environ.get('EMBEDDING_MODEL_CPU_COUNT', cpu_count() - 1))  # env values are strings, so cast to int
+
+ _options = ort.SessionOptions()
+ _options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
+ _options.register_custom_ops_library(get_library_path())
+ _providers = ["CPUExecutionProvider"]
+
+ embedding_model = ort.InferenceSession(
+     path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
+     sess_options=_options,
+     providers=_providers
+ )
+
+ def extract_embeddings(text):
+     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
+
+ def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
+     chunks = []
+     current_chunk = []
+     current_chunk_length = 0
+     tokens = tokenizer.encode(full_text)
+     for i, token in enumerate(tokens):
+         if current_chunk_length + 1 > tokens_per_chunk:
+             chunks.append(current_chunk)
+             current_chunk = tokens[i-chunk_overlap:i] if i > chunk_overlap else []
+             current_chunk_length = len(current_chunk)
+         current_chunk.append(token)
+         current_chunk_length += 1
+     chunks.append(current_chunk)
+     chunks = [tokenizer.decode(chunk) for chunk in chunks]
+     return chunks
+
+ def count_tokens(text):
+     return len(tokenizer.encode(text))
+
+ def detect_language(text):
+     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
+     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
+
+ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
+     def calculate_similarity(embed1, embed2):
+         return cosine_similarity([embed1], [embed2])[0][0]
+
+     def create_lda_model(texts, stopwords):
+         vectorizer = CountVectorizer(stop_words=stopwords)
+         doc_term_matrix = vectorizer.fit_transform(texts)
+         lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
+         lda.fit(doc_term_matrix)
+         return lda, vectorizer
+
+     def get_topic_distribution(text, lda, vectorizer):
+         vec = vectorizer.transform([text])
+         return lda.transform(vec)[0]
+
+     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
+         sentence_embedding = extract_embeddings(sentence)
+         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
+
+         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
+         topic_importance = np.max(topic_dist)
+
+         # Calculate lexical diversity
+         words = sentence.split()
+         unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
+         lexical_diversity = len(unique_words) / len(words) if words else 0
+
+         # Combine factors (weighted sum of similarity, topic strength and lexical diversity)
+         importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
+         return importance
+
+     try:
+         # Split the text into sentences
+         sentences = sent_tokenize(full_text)
+
+         final_sentences = []
+         for s in sentences:
+             broken_sentences = s.split('\n')
+             final_sentences.extend(broken_sentences)
+         sentences = final_sentences
+
+         text_lang = detect_language(full_text)
+
+         # Create LDA model
+         lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
+
+         # Get document-level embedding
+         doc_embedding = extract_embeddings(full_text)
+
+         # Calculate importance for each sentence
+         sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
+                            for sentence in sentences]
+
+         # Sort sentences by importance
+         sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
+
+         # Determine how many words to keep
+         total_words = sum(len(sentence.split()) for sentence in sentences)
+         target_words = int(total_words * compression_rate)
+
+         # Reconstruct the compressed text
+         compressed_text = []
+         current_words = 0
+         for sentence, _ in sorted_sentences:
+             sentence_words = len(sentence.split())
+             if current_words + sentence_words <= target_words:
+                 compressed_text.append(sentence)
+                 current_words += sentence_words
+             else:
+                 break
+
+         if len(compressed_text) == 0:
+             # Pick the first sentence if no compression is possible
+             compressed_text = [sentences[0]]
+
+         # Reorder sentences to maintain original flow
+         compressed_text.sort(key=lambda x: sentences.index(x))
+
+         return ' '.join(compressed_text)
+     except Exception:
+         traceback.print_exc()
+
+     return full_text
+
+ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+     """
+     Compress text using either a compression rate or a target token count.
+     If both are provided, the target token count takes precedence.
+
+     Args:
+         text (str): The text to be compressed.
+         target_token_count (int, optional): The target token count for compression. Defaults to None.
+         compression_rate (float, optional): The fraction of the text to remove, between 0 and 1. Defaults to 0.7 (roughly a 70% reduction).
+
+     Returns:
+         str: The compressed text.
+     """
+
+     if target_token_count is None:
+         compression_rate = 1 - compression_rate  # convert "fraction to remove" into "fraction to keep"
+         original_token_count = count_tokens(text)
+         target_token_count = int(original_token_count * compression_rate)
+     else:
+         original_token_count = count_tokens(text)
+         if original_token_count <= target_token_count:
+             return text
+         # Derive the keep fraction from the requested token budget
+         compression_rate = target_token_count / original_token_count
+
+     return semantic_compress_text(text, compression_rate)
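A usage sketch of the public entry point (not shipped in the package; "article.txt" is a hypothetical input file). compress_text accepts either a fraction of the text to remove or an explicit token budget; the budget is approximate because selection happens sentence by sentence:

    from compressor.semantic import compress_text, count_tokens

    article = open("article.txt", encoding="utf-8").read()   # hypothetical input file

    # keep roughly 30% of the text (compression_rate is the fraction to remove)
    shorter = compress_text(article, compression_rate=0.7)

    # or aim for an explicit token budget instead
    budgeted = compress_text(article, target_token_count=1000)

    print(count_tokens(article), count_tokens(shorter), count_tokens(budgeted))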
semantic_compressor-1.0.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Carlo Moro
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
semantic_compressor-1.0.dist-info/METADATA ADDED
@@ -0,0 +1,18 @@
+ Metadata-Version: 2.1
+ Name: semantic_compressor
+ Version: 1.0
+ Author: Carlo Moro
+ Author-email: Carlo Moro <cnmoro@gmail.com>
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.7
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy <2
+ Requires-Dist: nltk
+ Requires-Dist: scikit-learn
+ Requires-Dist: fasttext
+ Requires-Dist: onnxruntime
+ Requires-Dist: onnxruntime-extensions
+
semantic_compressor-1.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+ compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ compressor/semantic.py,sha256=wXKmerx4j7cdYqjyHDJVkRX_k-2tbnBx5gNa6Qnpgtg,7148
+ compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
+ compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
+ compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
+ compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
+ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5lDSEIBdXKcPc,71794489
+ compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
+ compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
+ compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
+ semantic_compressor-1.0.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+ semantic_compressor-1.0.dist-info/METADATA,sha256=XNW1eUfEDeulFSHhs2dd7znll8RLiBRwqwzXZgI5gRA,517
+ semantic_compressor-1.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ semantic_compressor-1.0.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+ semantic_compressor-1.0.dist-info/RECORD,,
semantic_compressor-1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (72.1.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
semantic_compressor-1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ compressor