semantic_compressor-1.2-py3-none-any.whl → semantic_compressor-1.4-py3-none-any.whl

compressor/semantic.py CHANGED
@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
  from compressor.minbpe.regex import RegexTokenizer
  from nltk.tokenize import sent_tokenize
  from multiprocessing import cpu_count
+ from spellchecker import SpellChecker
+ from nltk.stem import PorterStemmer
+ from nltk.stem import RSLPStemmer
  from collections import Counter
  import onnxruntime as ort
+ import nltk
+
+ # Initializing the stemmers
+ stemmer_english = PorterStemmer()
+ stemmer_portuguese = RSLPStemmer()

  tokenizer = RegexTokenizer()
  nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))

  os.environ['NLTK_DATA'] = nltk_data_path

+ nltk.download('rslp')
+
  english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
  portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
  fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
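
Version 1.4 initializes a Porter stemmer for English and an RSLP stemmer for Portuguese at import time, and fetches the RSLP rule set via `nltk.download`. A minimal sketch of how these NLTK stemmers behave on their own (outputs are illustrative; the exact RSLP stem depends on NLTK's rule set):

```python
# Sketch: the two NLTK stemmers that semantic.py 1.4 creates at import time.
# Requires nltk; the 'rslp' resource must be downloaded once for Portuguese.
import nltk
from nltk.stem import PorterStemmer, RSLPStemmer

nltk.download('rslp')  # Portuguese RSLP stemming rules

stemmer_english = PorterStemmer()
stemmer_portuguese = RSLPStemmer()

print(stemmer_english.stem("compression"))     # English stem, e.g. 'compress'
print(stemmer_portuguese.stem("arquipelago"))  # Portuguese stem (rule-dependent)
```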
@@ -213,8 +223,6 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
      try:
          if target_token_count is None:
              compression_rate = 1 - compression_rate
-             original_token_count = count_tokens(text)
-             target_token_count = int(original_token_count * compression_rate)
          else:
              original_token_count = count_tokens(text)
              if original_token_count <= target_token_count:
@@ -232,9 +240,28 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere

      return text

+ def stem_text(text, lang='en'):
+     if lang == 'en':
+         stems = [stemmer_english.stem(word) for word in text.split()]
+         stemmed_text = " ".join(stems)
+     else:
+         stems = [stemmer_portuguese.stem(word) for word in text.split()]
+         stemmed_text = " ".join(stems)
+
+     return stemmed_text
+
+ def correct_spelling(frase, detected_lang="pt"):
+     spell = SpellChecker(language=detected_lang)
+     words = frase.split()
+     fixed = [spell.correction(word) for word in words]
+     return " ".join(fixed)
+
  def find_needle_in_haystack(
-     *, haystack: str, needle: str, block_size = 350,
-     semantic_embeddings_weight: float = 0.3, textual_embeddings_weight: float = 0.7
+     *, haystack: str, needle: str, block_size = 300,
+     semantic_embeddings_weight: float = 0.3,
+     textual_embeddings_weight: float = 0.7,
+     use_stemming: bool = False,
+     correct_spelling_needle: bool = False
  ):
      """
      Finds the string block in the haystack that contains the needle.
@@ -245,7 +272,9 @@ def find_needle_in_haystack(
          block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
          semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
          textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+         use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
+         correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
+
      Returns:
          str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
      """
@@ -253,14 +282,19 @@ def find_needle_in_haystack(
      try:
          # Split the haystack into blocks
          blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+         lang = detect_language(f"{needle}\n\n{haystack}")
+
+         if correct_spelling_needle:
+             needle = correct_spelling(needle, lang)

          # Compute the embeddings of the needle
          needle_semantic_embedding = extract_semantic_embeddings(needle)
-         needle_textual_embedding = extract_textual_embeddings(needle.lower())
+         needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))

          # Compute the embeddings of the haystack (each block)
          haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-         haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+         haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]

          # Compute the similarity between the needle and each block
          semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
@@ -275,7 +309,9 @@ def find_needle_in_haystack(
          # Find the index of the needle in all the blocks
          most_similar_block_index = blocks.index(most_similar_block)

-         needle_region = blocks[most_similar_block_index-1:most_similar_block_index+2]
+         start_index = most_similar_block_index-1 if most_similar_block_index > 0 else 0
+
+         needle_region = blocks[start_index:most_similar_block_index+2]

          return ''.join(needle_region).strip()
      except Exception:
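
The new `correct_spelling` helper above leans on pyspellchecker's `SpellChecker`. A hedged sketch of the same idea, written with an extra fallback (an assumption added here, since `SpellChecker.correction` can return `None` for words it cannot correct; the packaged helper joins the corrections directly):

```python
# Sketch: needle spell-correction with pyspellchecker, as used by find_needle_in_haystack
# when correct_spelling_needle=True. The `or word` fallback is an addition for robustness.
from spellchecker import SpellChecker

def correct_spelling_sketch(phrase: str, detected_lang: str = "pt") -> str:
    spell = SpellChecker(language=detected_lang)  # language dictionaries such as 'en' and 'pt' are bundled
    # Keep the original word whenever no correction is found.
    fixed = [spell.correction(word) or word for word in phrase.split()]
    return " ".join(fixed)

print(correct_spelling_sketch("arcipelago of ilands", "en"))
```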

semantic_compressor-1.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: semantic_compressor
- Version: 1.2
+ Version: 1.4
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -15,9 +15,10 @@ Requires-Dist: scikit-learn
  Requires-Dist: fasttext
  Requires-Dist: onnxruntime
  Requires-Dist: onnxruntime-extensions
+ Requires-Dist: pyspellchecker

  ```python
- from compressor.semantic import compress_text
+ from compressor.semantic import compress_text, find_needle_in_haystack

  text = """
  Akin to France's heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote Azores archipelago. The Azores, an archipelago of nine islands belonging to Portugal and located roughly between Europe and the US, are cow country. They're said to be home to more cattle than people, and despite being home to less than 3% of Portugal's population, the islands produce 30% of Portugal's dairy products and 13% of its beef. Beef is part of everyday life in the Azores, and come spring on one particular island, the ingredient even crosses paths with religion. In the days following Easter, Azorean people kick off a series of religious celebrations called Festas do Espírito Santo (Festivals of the Holy Spirit). During the 13th Century, a Catholic sect called the Cult of the Holy Spirit predicted a utopian era on Earth. This fringe faith was discouraged in mainland Europe but lived on in these remote islands in the middle of the Atlantic Ocean. The sect was also promoted by Portuguese queen Elizabeth of Aragon (also known as Elizabeth of Portugal), who was known for her charity. Over the subsequent centuries, a series of festivals emerged on the Azores that blended these utopian aspirations with the queen's alleged generosity. Between Easter and the week following Whitsunday, a total of eight weeks, the islands host a series of parades and other cultural and religious festivals that revolve around brightly coloured community houses called impérios. During this time, the community houses also collect donations from locals, which is then redistributed to people in the form of bread, beef and wine. These three elements generally come together in the form of a soup, called sopa do Espírito Santo, that's served at the impérios during the festivals. But on the island of Terceira, locals combine these ingredients in a different and delicious way, one that's become synonymous with the island's culinary identity. Austin Bush The Festas do Espírito Santo revolve around community houses called impérios (Credit: Austin Bush)Austin Bush The Festas do Espírito Santo revolve around community houses called impérios (Credit: Austin Bush) "People eat alcatra year round, but especially during the celebrations in spring and summer," explains Duarte Fournier. He is the Grand Master of the Brotherhood of Alcatra, a culinary fraternity on Terceira, and is telling me about the island's signature dish: cuts of beef braised in local wine, smoked pork fat and dried spices, resulting in something of a heartier, spicier, richer version of France's famed boeuf bourguignon. We're sitting at a cafe in Angra do Heroísmo, Terceira's largest city, and as we chat, children race to and from a nearby império delivering massive trays of raw beef to neighbours. Fournier tells me that alcatra likely has its origins in northern Portugal, where there's a tradition of baking goat in wine. "We don't know why it's called alcatra," he says. "We suppose it's from Arabic. Al catar means 'small pieces of meat'." According to Fournier, alcatra differs from mainland Portugal's baked meat dishes in that it includes dried spices, generally allspice and black peppercorns, but also sometimes clove or cinnamon.
@@ -30,4 +31,15 @@ print(compressed_text_90_percent)
  compressed_text_to_100_tokens = compress_text(text, target_token_count=100)
  print(compressed_text_to_100_tokens)
  # 'Akin to France\'s heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote Azores archipelago.'
+
+ text_reference = "Archipelago Islands"
+ compressed_text_with_steering = compress_text(text, compression_rate=0.7, reference_text_steering=text_reference)
+ # 'Akin to france\'s heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote azores archipelago. The azores, an archipelago of nine islands belonging to portugal and located roughly between europe and the us, are cow country. Beef is part of everyday life in the azores, and come spring on one particular island, the ingredient even crosses paths with religion. But on the island of terceira, locals combine these ingredients in a different and delicious way, one that\'s become synonymous with the island\'s culinary identity. He is the grand master of the brotherhood of alcatra, a culinary fraternity on terceira, and is telling me about the island\'s signature dish: cuts of beef braised in local wine, smoked pork fat and dried spices, resulting in something of a heartier, spicier, richer version of france\'s famed boeuf bourguignon.'
+
+ needle_in_haystack = find_needle_in_haystack(
+     haystack = text,
+     needle = "Archipelago Islands",
+     block_size = 200
+ )
+ # 'Akin to France\'s heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote Azores archipelago. The Azores, an archipelago of nine islands belonging to Portugal and located roughly between Europe and the US, are cow country. They\'re said to be home to more cattle than people, and despite being home to less than 3% of Portugal\'s population, the islands'
  ```
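
For reference, the 1.4 signature shown earlier also adds `use_stemming` and `correct_spelling_needle` to `find_needle_in_haystack`. A usage sketch along the lines of the README example above (the parameter values here are illustrative, and the output is not reproduced):

```python
# Sketch: enabling the new 1.4 options of find_needle_in_haystack.
# use_stemming stems the needle and each block before the textual embeddings are computed;
# correct_spelling_needle first runs the needle through pyspellchecker.
from compressor.semantic import find_needle_in_haystack

needle_region = find_needle_in_haystack(
    haystack=text,                # `text` as defined in the example above
    needle="Arquipelago Ilands",  # deliberately misspelled needle (illustrative)
    block_size=300,               # matches the new 1.4 default
    use_stemming=True,
    correct_spelling_needle=True,
)
print(needle_region)
```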

semantic_compressor-1.4.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- compressor/semantic.py,sha256=UUskzKy3Uj90UMDC_zRbPgCr30IxANCUuO1h0nWkKHU,12429
+ compressor/semantic.py,sha256=OxqzVCAnICKD3W_P3SAe4JbJt-PyOs5VVR-go8taZVI,13701
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
  compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
  compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
  compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
- semantic_compressor-1.2.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
- semantic_compressor-1.2.dist-info/METADATA,sha256=s9pltj6AtpXW6OEcZE1h3W8OPYks_PbhdCbJDR9e5b0,4545
- semantic_compressor-1.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- semantic_compressor-1.2.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
- semantic_compressor-1.2.dist-info/RECORD,,
+ semantic_compressor-1.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+ semantic_compressor-1.4.dist-info/METADATA,sha256=BEKlYCs7nYakGXQzbC_8_Gz-MKSAXzSp01pAD0HjIS0,6178
+ semantic_compressor-1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ semantic_compressor-1.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+ semantic_compressor-1.4.dist-info/RECORD,,

semantic_compressor-1.4.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (72.1.0)
+ Generator: bdist_wheel (0.43.0)
  Root-Is-Purelib: true
  Tag: py3-none-any