infinity-sdk 0.7.0.dev5__tar.gz → 0.7.0.dev7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {infinity_sdk-0.7.0.dev5/python/infinity_sdk/infinity_sdk.egg-info → infinity_sdk-0.7.0.dev7}/PKG-INFO +2 -2
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/README.md +1 -1
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/pyproject.toml +1 -1
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/README.md +1 -1
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/rag_tokenizer.py +76 -2
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/client.py +3 -2
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7/python/infinity_sdk/infinity_sdk.egg-info}/PKG-INFO +2 -2
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/LICENSE +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/__init__.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/common.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/connection_pool.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/db.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/errors.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/http_utils.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/huqie.txt +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/huqie.txt.trie +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/index.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/infinity.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/infinity_http.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/__init__.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/db.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/infinity.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/InfinityService.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/__init__.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/constants.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/ttypes.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/query_builder.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/table.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/types.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/utils.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/table.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/utils.py +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity_sdk.egg-info/SOURCES.txt +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity_sdk.egg-info/dependency_links.txt +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity_sdk.egg-info/requires.txt +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity_sdk.egg-info/top_level.txt +0 -0
- {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: infinity-sdk
|
|
3
|
-
Version: 0.7.0.dev5
|
|
3
|
+
Version: 0.7.0.dev7
|
|
4
4
|
Summary: infinity
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Requires-Python: <3.14,>=3.11
|
|
@@ -97,7 +97,7 @@ Infinity supports two working modes, embedded mode and client-server mode. The f
|
|
|
97
97
|
|
|
98
98
|
2. Install the `infinity-sdk` package:
|
|
99
99
|
```bash
|
|
100
|
-
pip install infinity-sdk==0.7.0.dev5
|
|
100
|
+
pip install infinity-sdk==0.7.0.dev7
|
|
101
101
|
```
|
|
102
102
|
|
|
103
103
|
3. Use Infinity to conduct a dense vector search:
|
|
@@ -63,7 +63,7 @@ Infinity supports two working modes, embedded mode and client-server mode. The f
|
|
|
63
63
|
|
|
64
64
|
2. Install the `infinity-sdk` package:
|
|
65
65
|
```bash
|
|
66
|
-
pip install infinity-sdk==0.7.0.dev5
|
|
66
|
+
pip install infinity-sdk==0.7.0.dev7
|
|
67
67
|
```
|
|
68
68
|
|
|
69
69
|
3. Use Infinity to conduct a dense vector search:
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/rag_tokenizer.py
RENAMED
|
@@ -43,6 +43,29 @@ from nltk import word_tokenize
|
|
|
43
43
|
from nltk.stem import SnowballStemmer, WordNetLemmatizer
|
|
44
44
|
|
|
45
45
|
|
|
46
|
+
# Map language names (lowercase) to NLTK SnowballStemmer language names.
|
|
47
|
+
# Used by set_language() to configure language-specific stemming.
|
|
48
|
+
_SNOWBALL_LANGUAGE_MAP = {
|
|
49
|
+
"english": "english",
|
|
50
|
+
"dutch": "dutch",
|
|
51
|
+
"german": "german",
|
|
52
|
+
"french": "french",
|
|
53
|
+
"spanish": "spanish",
|
|
54
|
+
"italian": "italian",
|
|
55
|
+
"portuguese": "portuguese",
|
|
56
|
+
"portuguese br": "portuguese",
|
|
57
|
+
"russian": "russian",
|
|
58
|
+
"arabic": "arabic",
|
|
59
|
+
"danish": "danish",
|
|
60
|
+
"finnish": "finnish",
|
|
61
|
+
"hungarian": "hungarian",
|
|
62
|
+
"norwegian": "norwegian",
|
|
63
|
+
"romanian": "romanian",
|
|
64
|
+
"swedish": "swedish",
|
|
65
|
+
"turkish": "turkish",
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
46
69
|
class RagTokenizer:
|
|
47
70
|
def key_(self, line):
|
|
48
71
|
return str(line.lower().encode("utf-8"))[2:-1]
|
|
@@ -98,6 +121,7 @@ class RagTokenizer:
|
|
|
98
121
|
|
|
99
122
|
self.stemmer = SnowballStemmer("english")
|
|
100
123
|
self.lemmatizer = WordNetLemmatizer()
|
|
124
|
+
self._use_lemmatizer = True # WordNet only supports English
|
|
101
125
|
|
|
102
126
|
self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-zA-Z0-9,\.-]+)"
|
|
103
127
|
|
|
@@ -131,6 +155,38 @@ class RagTokenizer:
|
|
|
131
155
|
def add_user_dict(self, fnm):
|
|
132
156
|
self._load_dict(fnm)
|
|
133
157
|
|
|
158
|
+
def set_language(self, language: str):
|
|
159
|
+
"""Configure stemmer/lemmatizer for the given language.
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
language: Language name (e.g. "English", "Dutch", "Chinese").
|
|
163
|
+
Case-insensitive.
|
|
164
|
+
"""
|
|
165
|
+
lang_key = language.strip().lower()
|
|
166
|
+
snowball_lang = _SNOWBALL_LANGUAGE_MAP.get(lang_key)
|
|
167
|
+
|
|
168
|
+
if snowball_lang is not None:
|
|
169
|
+
self.stemmer = SnowballStemmer(snowball_lang)
|
|
170
|
+
if snowball_lang == "english":
|
|
171
|
+
self.lemmatizer = WordNetLemmatizer()
|
|
172
|
+
self._use_lemmatizer = True
|
|
173
|
+
else:
|
|
174
|
+
# WordNet only supports English; disable lemmatizer for
|
|
175
|
+
# other languages and rely on Snowball stemming alone.
|
|
176
|
+
self._use_lemmatizer = False
|
|
177
|
+
logging.debug(
|
|
178
|
+
"Tokenizer language set to '%s' (Snowball: %s, lemmatizer: %s)",
|
|
179
|
+
language, snowball_lang, self._use_lemmatizer,
|
|
180
|
+
)
|
|
181
|
+
else:
|
|
182
|
+
# Unsupported language (Chinese, Japanese, Korean, etc.) –
|
|
183
|
+
# keep defaults. CJK text uses dictionary segmentation,
|
|
184
|
+
# not stemming.
|
|
185
|
+
logging.debug(
|
|
186
|
+
"Language '%s' has no Snowball stemmer; keeping defaults",
|
|
187
|
+
language,
|
|
188
|
+
)
|
|
189
|
+
|
|
134
190
|
def _strQ2B(self, ustring):
|
|
135
191
|
"""Convert full-width characters to half-width characters"""
|
|
136
192
|
rstring = ""
|
|
@@ -326,7 +382,20 @@ class RagTokenizer:
|
|
|
326
382
|
return self.score_(res[::-1])
|
|
327
383
|
|
|
328
384
|
def english_normalize_(self, tks):
|
|
329
|
-
return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
|
|
385
|
+
return [self._normalize_token(t) for t in tks]
|
|
386
|
+
|
|
387
|
+
def _normalize_token(self, t: str) -> str:
|
|
388
|
+
"""Stem (and optionally lemmatize) a single alphabetic token.
|
|
389
|
+
|
|
390
|
+
When the lemmatizer is enabled (English), applies lemmatization
|
|
391
|
+
before stemming. For other Snowball-supported languages, only
|
|
392
|
+
stemming is applied. Non-alphabetic tokens are returned as-is.
|
|
393
|
+
"""
|
|
394
|
+
if re.match(r"[a-zA-Z_-]+$", t):
|
|
395
|
+
if self._use_lemmatizer:
|
|
396
|
+
return self.stemmer.stem(self.lemmatizer.lemmatize(t))
|
|
397
|
+
return self.stemmer.stem(t)
|
|
398
|
+
return t
|
|
330
399
|
|
|
331
400
|
def _split_by_lang(self, line):
|
|
332
401
|
txt_lang_pairs = []
|
|
@@ -360,7 +429,7 @@ class RagTokenizer:
|
|
|
360
429
|
res = []
|
|
361
430
|
for L, lang in arr:
|
|
362
431
|
if not lang:
|
|
363
|
-
res.extend([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(L)])
|
|
432
|
+
res.extend([self._normalize_token(t) for t in word_tokenize(L)])
|
|
364
433
|
continue
|
|
365
434
|
if len(L) < 2 or re.match(r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
|
|
366
435
|
res.append(L)
|
|
@@ -501,11 +570,16 @@ if __name__ == '__main__':
|
|
|
501
570
|
parser.add_argument('--fine-grained', action='store_true',
|
|
502
571
|
help='Use fine-grained tokenization')
|
|
503
572
|
parser.add_argument('--user-dict', help='User dictionary file')
|
|
573
|
+
parser.add_argument('-l', '--language', help='Language for stemming (e.g., english, dutch)')
|
|
504
574
|
|
|
505
575
|
args = parser.parse_args()
|
|
506
576
|
|
|
507
577
|
tokenizer = RagTokenizer(debug=True, user_dict=args.user_dict)
|
|
508
578
|
|
|
579
|
+
# Set language if specified
|
|
580
|
+
if args.language:
|
|
581
|
+
tokenizer.set_language(args.language)
|
|
582
|
+
|
|
509
583
|
# Process input
|
|
510
584
|
if args.file:
|
|
511
585
|
# File mode
|
|
@@ -123,8 +123,9 @@ class ThriftInfinityClient:
|
|
|
123
123
|
# version: 0.6.8 and 0.6.9 and 0.6.10, client_version: 33
|
|
124
124
|
# version: 0.6.13, client_version: 34
|
|
125
125
|
# version: 0.6.15, client_version: 35
|
|
126
|
-
# version: 0.7.0, 0.7.0.dev1, 0.7.0.dev2, 0.7.0.dev3
|
|
127
|
-
|
|
126
|
+
# version: 0.7.0, 0.7.0.dev1, 0.7.0.dev2, 0.7.0.dev3, 0.7.0.dev4, 0.7.0.dev5,
|
|
127
|
+
# 0.7.0.dev6 and 0.7.0.dev7, client_version: 36
|
|
128
|
+
res = self.client.Connect(ConnectRequest(client_version=36)) # 0.7.0.dev7
|
|
128
129
|
if res.error_code != 0:
|
|
129
130
|
raise InfinityException(res.error_code, res.error_msg)
|
|
130
131
|
self.session_id = res.session_id
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: infinity-sdk
|
|
3
|
-
Version: 0.7.0.dev5
|
|
3
|
+
Version: 0.7.0.dev7
|
|
4
4
|
Summary: infinity
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Requires-Python: <3.14,>=3.11
|
|
@@ -97,7 +97,7 @@ Infinity supports two working modes, embedded mode and client-server mode. The f
|
|
|
97
97
|
|
|
98
98
|
2. Install the `infinity-sdk` package:
|
|
99
99
|
```bash
|
|
100
|
-
pip install infinity-sdk==0.7.0.dev5
|
|
100
|
+
pip install infinity-sdk==0.7.0.dev7
|
|
101
101
|
```
|
|
102
102
|
|
|
103
103
|
3. Use Infinity to conduct a dense vector search:
|
|
File without changes
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/connection_pool.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/http_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/huqie.txt.trie
RENAMED
|
File without changes
|
|
File without changes
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/infinity.py
RENAMED
|
File without changes
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/infinity_http.py
RENAMED
|
File without changes
|
|
File without changes
|
{infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev7}/python/infinity_sdk/infinity/remote_thrift/db.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|