UniTok 4.3.0__tar.gz → 4.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {UniTok-4.3.0 → UniTok-4.3.1}/PKG-INFO +4 -5
  2. {UniTok-4.3.0 → UniTok-4.3.1}/README.md +3 -4
  3. {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/PKG-INFO +4 -5
  4. {UniTok-4.3.0 → UniTok-4.3.1}/setup.py +1 -1
  5. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/unitok.py +2 -5
  6. {UniTok-4.3.0 → UniTok-4.3.1}/LICENSE +0 -0
  7. {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/SOURCES.txt +0 -0
  8. {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/dependency_links.txt +0 -0
  9. {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/entry_points.txt +0 -0
  10. {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/requires.txt +0 -0
  11. {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/top_level.txt +0 -0
  12. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/__init__.py +0 -0
  13. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/__main__.py +0 -0
  14. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/analysis/__init__.py +0 -0
  15. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/analysis/lengths.py +0 -0
  16. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/analysis/plot.py +0 -0
  17. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/cols.py +0 -0
  18. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/column.py +0 -0
  19. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/fut.py +0 -0
  20. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/global_setting.py +0 -0
  21. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/meta.py +0 -0
  22. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/__init__.py +0 -0
  23. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/bert_tok.py +0 -0
  24. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/ent_tok.py +0 -0
  25. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/id_tok.py +0 -0
  26. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/number_tok.py +0 -0
  27. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/seq_tok.py +0 -0
  28. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/split_tok.py +0 -0
  29. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/tok.py +0 -0
  30. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/unidep.py +0 -0
  31. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/unitok.py +0 -0
  32. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/vocab.py +0 -0
  33. {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/vocabs.py +0 -0
  34. {UniTok-4.3.0 → UniTok-4.3.1}/setup.cfg +0 -0
  35. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/__init__.py +0 -0
  36. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/__main__.py +0 -0
  37. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/job.py +0 -0
  38. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/meta.py +0 -0
  39. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/selector.py +0 -0
  40. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/status.py +0 -0
  41. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/__init__.py +0 -0
  42. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/base_tokenizer.py +0 -0
  43. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/digit_tokenizer.py +0 -0
  44. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/entity_tokenizer.py +0 -0
  45. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/glove_tokenizer.py +0 -0
  46. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/split_tokenizer.py +0 -0
  47. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/transformers_tokenizer.py +0 -0
  48. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/union_tokenizer.py +0 -0
  49. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/unknown_tokenizer.py +0 -0
  50. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/__init__.py +0 -0
  51. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/class_pool.py +0 -0
  52. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/data.py +0 -0
  53. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/function.py +0 -0
  54. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/handler/__init__.py +0 -0
  55. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/handler/json_handler.py +0 -0
  56. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/handler/pkl_handler.py +0 -0
  57. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/hub/__init__.py +0 -0
  58. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/hub/hub.py +0 -0
  59. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/hub/param_hub.py +0 -0
  60. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/__init__.py +0 -0
  61. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/index_set.py +0 -0
  62. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/job_set.py +0 -0
  63. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/tokenizer_set.py +0 -0
  64. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/vocabulary_set.py +0 -0
  65. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/instance.py +0 -0
  66. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/map.py +0 -0
  67. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/space.py +0 -0
  68. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/symbol.py +0 -0
  69. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/verbose.py +0 -0
  70. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/vocabulary/__init__.py +0 -0
  71. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/vocabulary/counter.py +0 -0
  72. {UniTok-4.3.0 → UniTok-4.3.1}/unitok/vocabulary/vocabulary.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 4.3.0
3
+ Version: 4.3.1
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -13,17 +13,16 @@ License-File: LICENSE
13
13
 
14
14
  # UniTok V4
15
15
 
16
- If you want to use the old version, please refer to [v3](README_v3.md) in Chinese.
16
+ The documentation for v3, old version, can be found [here](README_v3.md) in Chinese.
17
17
 
18
18
  ## Overview
19
19
 
20
20
  [![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
21
21
 
22
- Welcome to the UniTok documentation!
22
+ Welcome to the UniTok v4!
23
23
  This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
24
- It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
25
24
 
26
- UniTok is designed to simplify preprocessing by offering reusable components such as tokenizers and vocabularies, making it flexible for various datasets and scenarios.
25
+ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed information.
27
26
 
28
27
  ## Road from V3 to V4
29
28
 
@@ -1,16 +1,15 @@
1
1
  # UniTok V4
2
2
 
3
- If you want to use the old version, please refer to [v3](README_v3.md) in Chinese.
3
+ The documentation for v3, old version, can be found [here](README_v3.md) in Chinese.
4
4
 
5
5
  ## Overview
6
6
 
7
7
  [![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
8
8
 
9
- Welcome to the UniTok documentation!
9
+ Welcome to the UniTok v4!
10
10
  This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
11
- It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
12
11
 
13
- UniTok is designed to simplify preprocessing by offering reusable components such as tokenizers and vocabularies, making it flexible for various datasets and scenarios.
12
+ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed information.
14
13
 
15
14
  ## Road from V3 to V4
16
15
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 4.3.0
3
+ Version: 4.3.1
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -13,17 +13,16 @@ License-File: LICENSE
13
13
 
14
14
  # UniTok V4
15
15
 
16
- If you want to use the old version, please refer to [v3](README_v3.md) in Chinese.
16
+ The documentation for v3, old version, can be found [here](README_v3.md) in Chinese.
17
17
 
18
18
  ## Overview
19
19
 
20
20
  [![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
21
21
 
22
- Welcome to the UniTok documentation!
22
+ Welcome to the UniTok v4!
23
23
  This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
24
- It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
25
24
 
26
- UniTok is designed to simplify preprocessing by offering reusable components such as tokenizers and vocabularies, making it flexible for various datasets and scenarios.
25
+ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed information.
27
26
 
28
27
  ## Road from V3 to V4
29
28
 
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')
6
6
 
7
7
  setup(
8
8
  name='UniTok',
9
- version='4.3.0',
9
+ version='4.3.1',
10
10
  keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
11
11
  description='Unified Tokenizer',
12
12
  long_description=long_description,
@@ -51,7 +51,7 @@ class UniTok(Status):
51
51
  if self._union_type is None:
52
52
  self._union_type = union_type
53
53
  elif self._union_type != union_type:
54
- raise ValueError(f'union type is already set: {self._union_type}')
54
+ raise ValueError(f'Union type is already set: {self._union_type}')
55
55
 
56
56
  @Status.require_not_initialized
57
57
  def init_indices(self):
@@ -340,6 +340,7 @@ class UniTok(Status):
340
340
  sample[job.name] = self.data[job.name][index]
341
341
  return sample
342
342
 
343
+ @Status.require_not_initialized
343
344
  def pack(self, index):
344
345
  if self.is_soft_union:
345
346
  return self._pack_soft_union(index)
@@ -393,10 +394,6 @@ class UniTok(Status):
393
394
  selector = Selector(self.meta, *selector)
394
395
  return selector(sample)
395
396
 
396
- def get_sample_by_id(self, key_id):
397
- index = self.key_job.tokenizer.vocab[key_id]
398
- return self[index]
399
-
400
397
  def __len__(self):
401
398
  return len(self._legal_indices)
402
399
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes