semantic-compressor 2.4__py3-none-any.whl → 2.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
compressor/semantic.py CHANGED
@@ -6,7 +6,7 @@ from sklearn.decomposition import LatentDirichletAllocation
6
6
  from sklearn.metrics.pairwise import cosine_similarity
7
7
  from compressor.minbpe.regex import RegexTokenizer
8
8
  from concurrent.futures import ProcessPoolExecutor
9
- import numpy as np, pickle, fasttext, traceback
9
+ import numpy as np, pickle, traceback
10
10
  from nltk.tokenize import sent_tokenize
11
11
  from multiprocessing import cpu_count
12
12
  from spellchecker import SpellChecker
@@ -16,6 +16,10 @@ from collections import Counter
16
16
  from model2vec import StaticModel
17
17
  import re
18
18
 
19
+ from lingua import Language, LanguageDetectorBuilder
20
+ languages = [Language.ENGLISH, Language.PORTUGUESE]
21
+ lang_detector = LanguageDetectorBuilder.from_languages(*languages).build()
22
+
19
23
  tokenizer = RegexTokenizer()
20
24
 
21
25
  # Inicializando os stemmers
@@ -24,12 +28,10 @@ stemmer_portuguese = RSLPStemmer()
24
28
 
25
29
  english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
26
30
  portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
27
- fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
28
31
  english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
29
32
  portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
30
- langdetect_model = fasttext.load_model(fasttext_model_path)
31
33
 
32
- embedding_model = StaticModel.from_pretrained("cnmoro/Linq-Embed-Mistral-Distilled")
34
+ embedding_model = StaticModel.from_pretrained("cnmoro/static-nomic-eng-ptbr-tiny")
33
35
 
34
36
  hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
35
37
 
@@ -91,8 +93,8 @@ def count_tokens(text):
91
93
  return len(tokenizer.encode(text))
92
94
 
93
95
  def detect_language(text):
94
- detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
95
- return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
96
+ detected_lang = lang_detector.detect_language_of(text)
97
+ return 'pt' if detected_lang == Language.PORTUGUESE else 'en'
96
98
 
97
99
  def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
98
100
  words = text.split()
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: semantic_compressor
3
- Version: 2.4
3
+ Version: 2.42
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -12,10 +12,11 @@ License-File: LICENSE
12
12
  Requires-Dist: numpy<2
13
13
  Requires-Dist: nltk
14
14
  Requires-Dist: scikit-learn
15
- Requires-Dist: fasttext
15
+ Requires-Dist: lingua-language-detector
16
16
  Requires-Dist: model2vec
17
17
  Requires-Dist: pyspellchecker
18
18
  Dynamic: author
19
+ Dynamic: license-file
19
20
  Dynamic: requires-python
20
21
 
21
22
  ```python
@@ -1,12 +1,11 @@
1
1
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- compressor/semantic.py,sha256=sJXbap3_oZpd-XMGkecrqQ7RSon-OE98u8iYcNRIskA,17076
2
+ compressor/semantic.py,sha256=YAjQeipnE3Npo9vr-2uKnjdvo1nZrNEM5iP_n-hO4gY,17023
3
3
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
4
4
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
5
5
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
6
6
  compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
7
7
  compressor/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
9
- compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
10
9
  compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
11
10
  compressor/resources/nltk_data/corpora/stopwords.zip,sha256=Qww7buJgh4OBbo46gkso9BXLKPypB-KlyMmBaBkgC6M,36779
12
11
  compressor/resources/nltk_data/corpora/wordnet.zip,sha256=y9pepu7382qXpD1KdfheB_zLtPI2V9J7TMvJPiZGq1k,10775600
@@ -175,8 +174,8 @@ compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt,sha
175
174
  compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab,sha256=BhzimBd2qPh12k8kvr1-E4-NodkFe0PQf1gBSOwQajM,273
176
175
  compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab,sha256=_CFCJ_mdXqPucNII3xaxmE6rN10ZRu03kGHGz1wXGL4,642682
177
176
  compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt,sha256=kyOftVtdKubZRahKlOEYuoqBYyaxfNwRuoERvqDJeCg,613
178
- semantic_compressor-2.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
179
- semantic_compressor-2.4.dist-info/METADATA,sha256=-QsGOKQoDo4YBW88p-KWPrr0GlHrgvygoRnSN2C09-Y,6178
180
- semantic_compressor-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
181
- semantic_compressor-2.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
182
- semantic_compressor-2.4.dist-info/RECORD,,
177
+ semantic_compressor-2.42.dist-info/licenses/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
178
+ semantic_compressor-2.42.dist-info/METADATA,sha256=1ltMJaaIP6YKZQdREJAo1I-AogbX65WaJbeO_Pbls-E,6217
179
+ semantic_compressor-2.42.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
180
+ semantic_compressor-2.42.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
181
+ semantic_compressor-2.42.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
Binary file