infinity-sdk 0.7.0.dev5__tar.gz → 0.7.0.dev6__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {infinity_sdk-0.7.0.dev5/python/infinity_sdk/infinity_sdk.egg-info → infinity_sdk-0.7.0.dev6}/PKG-INFO +2 -2
  2. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/README.md +1 -1
  3. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/pyproject.toml +1 -1
  4. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/README.md +1 -1
  5. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/rag_tokenizer.py +76 -2
  6. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/client.py +2 -2
  7. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6/python/infinity_sdk/infinity_sdk.egg-info}/PKG-INFO +2 -2
  8. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/LICENSE +0 -0
  9. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/__init__.py +0 -0
  10. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/common.py +0 -0
  11. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/connection_pool.py +0 -0
  12. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/db.py +0 -0
  13. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/errors.py +0 -0
  14. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/http_utils.py +0 -0
  15. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/huqie.txt +0 -0
  16. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/huqie.txt.trie +0 -0
  17. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/index.py +0 -0
  18. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/infinity.py +0 -0
  19. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/infinity_http.py +0 -0
  20. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/__init__.py +0 -0
  21. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/db.py +0 -0
  22. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/infinity.py +0 -0
  23. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/InfinityService.py +0 -0
  24. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/__init__.py +0 -0
  25. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/constants.py +0 -0
  26. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/ttypes.py +0 -0
  27. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/query_builder.py +0 -0
  28. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/table.py +0 -0
  29. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/types.py +0 -0
  30. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/remote_thrift/utils.py +0 -0
  31. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/table.py +0 -0
  32. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity/utils.py +0 -0
  33. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity_sdk.egg-info/SOURCES.txt +0 -0
  34. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity_sdk.egg-info/dependency_links.txt +0 -0
  35. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity_sdk.egg-info/requires.txt +0 -0
  36. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/python/infinity_sdk/infinity_sdk.egg-info/top_level.txt +0 -0
  37. {infinity_sdk-0.7.0.dev5 → infinity_sdk-0.7.0.dev6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: infinity-sdk
3
- Version: 0.7.0.dev5
3
+ Version: 0.7.0.dev6
4
4
  Summary: infinity
5
5
  License-Expression: Apache-2.0
6
6
  Requires-Python: <3.14,>=3.11
@@ -97,7 +97,7 @@ Infinity supports two working modes, embedded mode and client-server mode. The f
97
97
 
98
98
  2. Install the `infinity-sdk` package:
99
99
  ```bash
100
- pip install infinity-sdk==0.7.0.dev5
100
+ pip install infinity-sdk==0.7.0.dev6
101
101
  ```
102
102
 
103
103
  3. Use Infinity to conduct a dense vector search:
@@ -96,7 +96,7 @@ If you are on Windows 10+, you must enable WSL or WSL2 to deploy Infinity using
96
96
  ### Install Infinity client
97
97
 
98
98
  ```
99
- pip install infinity-sdk==0.7.0.dev5
99
+ pip install infinity-sdk==0.7.0.dev6
100
100
  ```
101
101
 
102
102
  ### Run a vector search
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "infinity-sdk"
3
- version = "0.7.0.dev5"
3
+ version = "0.7.0.dev6"
4
4
  description = "infinity"
5
5
  readme = "python/infinity_sdk/README.md"
6
6
  license = "Apache-2.0"
@@ -63,7 +63,7 @@ Infinity supports two working modes, embedded mode and client-server mode. The f
63
63
 
64
64
  2. Install the `infinity-sdk` package:
65
65
  ```bash
66
- pip install infinity-sdk==0.7.0.dev5
66
+ pip install infinity-sdk==0.7.0.dev6
67
67
  ```
68
68
 
69
69
  3. Use Infinity to conduct a dense vector search:
@@ -43,6 +43,29 @@ from nltk import word_tokenize
43
43
  from nltk.stem import SnowballStemmer, WordNetLemmatizer
44
44
 
45
45
 
46
+ # Map language names (lowercase) to NLTK SnowballStemmer language names.
47
+ # Used by set_language() to configure language-specific stemming.
48
+ _SNOWBALL_LANGUAGE_MAP = {
49
+ "english": "english",
50
+ "dutch": "dutch",
51
+ "german": "german",
52
+ "french": "french",
53
+ "spanish": "spanish",
54
+ "italian": "italian",
55
+ "portuguese": "portuguese",
56
+ "portuguese br": "portuguese",
57
+ "russian": "russian",
58
+ "arabic": "arabic",
59
+ "danish": "danish",
60
+ "finnish": "finnish",
61
+ "hungarian": "hungarian",
62
+ "norwegian": "norwegian",
63
+ "romanian": "romanian",
64
+ "swedish": "swedish",
65
+ "turkish": "turkish",
66
+ }
67
+
68
+
46
69
  class RagTokenizer:
47
70
  def key_(self, line):
48
71
  return str(line.lower().encode("utf-8"))[2:-1]
@@ -98,6 +121,7 @@ class RagTokenizer:
98
121
 
99
122
  self.stemmer = SnowballStemmer("english")
100
123
  self.lemmatizer = WordNetLemmatizer()
124
+ self._use_lemmatizer = True # WordNet only supports English
101
125
 
102
126
  self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-zA-Z0-9,\.-]+)"
103
127
 
@@ -131,6 +155,38 @@ class RagTokenizer:
131
155
  def add_user_dict(self, fnm):
132
156
  self._load_dict(fnm)
133
157
 
158
+ def set_language(self, language: str):
159
+ """Configure stemmer/lemmatizer for the given language.
160
+
161
+ Args:
162
+ language: Language name (e.g. "English", "Dutch", "Chinese").
163
+ Case-insensitive.
164
+ """
165
+ lang_key = language.strip().lower()
166
+ snowball_lang = _SNOWBALL_LANGUAGE_MAP.get(lang_key)
167
+
168
+ if snowball_lang is not None:
169
+ self.stemmer = SnowballStemmer(snowball_lang)
170
+ if snowball_lang == "english":
171
+ self.lemmatizer = WordNetLemmatizer()
172
+ self._use_lemmatizer = True
173
+ else:
174
+ # WordNet only supports English; disable lemmatizer for
175
+ # other languages and rely on Snowball stemming alone.
176
+ self._use_lemmatizer = False
177
+ logging.debug(
178
+ "Tokenizer language set to '%s' (Snowball: %s, lemmatizer: %s)",
179
+ language, snowball_lang, self._use_lemmatizer,
180
+ )
181
+ else:
182
+ # Unsupported language (Chinese, Japanese, Korean, etc.) –
183
+ # keep defaults. CJK text uses dictionary segmentation,
184
+ # not stemming.
185
+ logging.debug(
186
+ "Language '%s' has no Snowball stemmer; keeping defaults",
187
+ language,
188
+ )
189
+
134
190
  def _strQ2B(self, ustring):
135
191
  """Convert full-width characters to half-width characters"""
136
192
  rstring = ""
@@ -326,7 +382,20 @@ class RagTokenizer:
326
382
  return self.score_(res[::-1])
327
383
 
328
384
  def english_normalize_(self, tks):
329
- return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
385
+ return [self._normalize_token(t) for t in tks]
386
+
387
+ def _normalize_token(self, t: str) -> str:
388
+ """Stem (and optionally lemmatize) a single alphabetic token.
389
+
390
+ When the lemmatizer is enabled (English), applies lemmatization
391
+ before stemming. For other Snowball-supported languages, only
392
+ stemming is applied. Non-alphabetic tokens are returned as-is.
393
+ """
394
+ if re.match(r"[a-zA-Z_-]+$", t):
395
+ if self._use_lemmatizer:
396
+ return self.stemmer.stem(self.lemmatizer.lemmatize(t))
397
+ return self.stemmer.stem(t)
398
+ return t
330
399
 
331
400
  def _split_by_lang(self, line):
332
401
  txt_lang_pairs = []
@@ -360,7 +429,7 @@ class RagTokenizer:
360
429
  res = []
361
430
  for L, lang in arr:
362
431
  if not lang:
363
- res.extend([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(L)])
432
+ res.extend([self._normalize_token(t) for t in word_tokenize(L)])
364
433
  continue
365
434
  if len(L) < 2 or re.match(r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
366
435
  res.append(L)
@@ -501,11 +570,16 @@ if __name__ == '__main__':
501
570
  parser.add_argument('--fine-grained', action='store_true',
502
571
  help='Use fine-grained tokenization')
503
572
  parser.add_argument('--user-dict', help='User dictionary file')
573
+ parser.add_argument('-l', '--language', help='Language for stemming (e.g., english, dutch)')
504
574
 
505
575
  args = parser.parse_args()
506
576
 
507
577
  tokenizer = RagTokenizer(debug=True, user_dict=args.user_dict)
508
578
 
579
+ # Set language if specified
580
+ if args.language:
581
+ tokenizer.set_language(args.language)
582
+
509
583
  # Process input
510
584
  if args.file:
511
585
  # File mode
@@ -123,8 +123,8 @@ class ThriftInfinityClient:
123
123
  # version: 0.6.8 and 0.6.9 and 0.6.10, client_version: 33
124
124
  # version: 0.6.13, client_version: 34
125
125
  # version: 0.6.15, client_version: 35
126
- # version: 0.7.0, 0.7.0.dev1, 0.7.0.dev2, 0.7.0.dev3 and 0.7.0.dev4, 0.7.0.dev5, client_version: 36
127
- res = self.client.Connect(ConnectRequest(client_version=36)) # 0.7.0.dev5
126
+ # version: 0.7.0, 0.7.0.dev1, 0.7.0.dev2, 0.7.0.dev3, 0.7.0.dev4, 0.7.0.dev5 and 0.7.0.dev6, client_version: 36
127
+ res = self.client.Connect(ConnectRequest(client_version=36)) # 0.7.0.dev6
128
128
  if res.error_code != 0:
129
129
  raise InfinityException(res.error_code, res.error_msg)
130
130
  self.session_id = res.session_id
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: infinity-sdk
3
- Version: 0.7.0.dev5
3
+ Version: 0.7.0.dev6
4
4
  Summary: infinity
5
5
  License-Expression: Apache-2.0
6
6
  Requires-Python: <3.14,>=3.11
@@ -97,7 +97,7 @@ Infinity supports two working modes, embedded mode and client-server mode. The f
97
97
 
98
98
  2. Install the `infinity-sdk` package:
99
99
  ```bash
100
- pip install infinity-sdk==0.7.0.dev5
100
+ pip install infinity-sdk==0.7.0.dev6
101
101
  ```
102
102
 
103
103
  3. Use Infinity to conduct a dense vector search: