UniTok 4.3.0__tar.gz → 4.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-4.3.0 → UniTok-4.3.1}/PKG-INFO +4 -5
- {UniTok-4.3.0 → UniTok-4.3.1}/README.md +3 -4
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/PKG-INFO +4 -5
- {UniTok-4.3.0 → UniTok-4.3.1}/setup.py +1 -1
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/unitok.py +2 -5
- {UniTok-4.3.0 → UniTok-4.3.1}/LICENSE +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/SOURCES.txt +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/entry_points.txt +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/__main__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/analysis/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/analysis/lengths.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/analysis/plot.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/cols.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/column.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/fut.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/global_setting.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/meta.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/bert_tok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/ent_tok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/id_tok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/number_tok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/seq_tok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/split_tok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/tok/tok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/unidep.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/unitok.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/vocab.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/UniTokv3/vocabs.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/setup.cfg +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/__main__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/job.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/meta.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/selector.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/status.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/base_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/digit_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/entity_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/glove_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/split_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/transformers_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/union_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/tokenizer/unknown_tokenizer.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/class_pool.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/data.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/function.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/handler/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/handler/json_handler.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/handler/pkl_handler.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/hub/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/hub/hub.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/hub/param_hub.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/index_set.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/job_set.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/tokenizer_set.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/index_set/vocabulary_set.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/instance.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/map.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/space.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/symbol.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/utils/verbose.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/vocabulary/__init__.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/vocabulary/counter.py +0 -0
- {UniTok-4.3.0 → UniTok-4.3.1}/unitok/vocabulary/vocabulary.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: UniTok
|
3
|
-
Version: 4.3.0
|
3
|
+
Version: 4.3.1
|
4
4
|
Summary: Unified Tokenizer
|
5
5
|
Home-page: https://github.com/Jyonn/UnifiedTokenizer
|
6
6
|
Author: Jyonn Liu
|
@@ -13,17 +13,16 @@ License-File: LICENSE
|
|
13
13
|
|
14
14
|
# UniTok V4
|
15
15
|
|
16
|
-
|
16
|
+
The documentation for v3, old version, can be found [here](README_v3.md) in Chinese.
|
17
17
|
|
18
18
|
## Overview
|
19
19
|
|
20
20
|
[![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
|
21
21
|
|
22
|
-
Welcome to the UniTok
|
22
|
+
Welcome to the UniTok v4!
|
23
23
|
This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
|
24
|
-
It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
|
25
24
|
|
26
|
-
|
25
|
+
Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed information.
|
27
26
|
|
28
27
|
## Road from V3 to V4
|
29
28
|
|
@@ -1,16 +1,15 @@
|
|
1
1
|
# UniTok V4
|
2
2
|
|
3
|
-
|
3
|
+
The documentation for v3, old version, can be found [here](README_v3.md) in Chinese.
|
4
4
|
|
5
5
|
## Overview
|
6
6
|
|
7
7
|
[![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
|
8
8
|
|
9
|
-
Welcome to the UniTok
|
9
|
+
Welcome to the UniTok v4!
|
10
10
|
This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
|
11
|
-
It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
|
12
11
|
|
13
|
-
|
12
|
+
Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed information.
|
14
13
|
|
15
14
|
## Road from V3 to V4
|
16
15
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: UniTok
|
3
|
-
Version: 4.3.0
|
3
|
+
Version: 4.3.1
|
4
4
|
Summary: Unified Tokenizer
|
5
5
|
Home-page: https://github.com/Jyonn/UnifiedTokenizer
|
6
6
|
Author: Jyonn Liu
|
@@ -13,17 +13,16 @@ License-File: LICENSE
|
|
13
13
|
|
14
14
|
# UniTok V4
|
15
15
|
|
16
|
-
|
16
|
+
The documentation for v3, old version, can be found [here](README_v3.md) in Chinese.
|
17
17
|
|
18
18
|
## Overview
|
19
19
|
|
20
20
|
[![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
|
21
21
|
|
22
|
-
Welcome to the UniTok
|
22
|
+
Welcome to the UniTok v4!
|
23
23
|
This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
|
24
|
-
It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
|
25
24
|
|
26
|
-
|
25
|
+
Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed information.
|
27
26
|
|
28
27
|
## Road from V3 to V4
|
29
28
|
|
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')
|
|
6
6
|
|
7
7
|
setup(
|
8
8
|
name='UniTok',
|
9
|
-
version='4.3.0',
|
9
|
+
version='4.3.1',
|
10
10
|
keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
|
11
11
|
description='Unified Tokenizer',
|
12
12
|
long_description=long_description,
|
@@ -51,7 +51,7 @@ class UniTok(Status):
|
|
51
51
|
if self._union_type is None:
|
52
52
|
self._union_type = union_type
|
53
53
|
elif self._union_type != union_type:
|
54
|
-
raise ValueError(f'
|
54
|
+
raise ValueError(f'Union type is already set: {self._union_type}')
|
55
55
|
|
56
56
|
@Status.require_not_initialized
|
57
57
|
def init_indices(self):
|
@@ -340,6 +340,7 @@ class UniTok(Status):
|
|
340
340
|
sample[job.name] = self.data[job.name][index]
|
341
341
|
return sample
|
342
342
|
|
343
|
+
@Status.require_not_initialized
|
343
344
|
def pack(self, index):
|
344
345
|
if self.is_soft_union:
|
345
346
|
return self._pack_soft_union(index)
|
@@ -393,10 +394,6 @@ class UniTok(Status):
|
|
393
394
|
selector = Selector(self.meta, *selector)
|
394
395
|
return selector(sample)
|
395
396
|
|
396
|
-
def get_sample_by_id(self, key_id):
|
397
|
-
index = self.key_job.tokenizer.vocab[key_id]
|
398
|
-
return self[index]
|
399
|
-
|
400
397
|
def __len__(self):
|
401
398
|
return len(self._legal_indices)
|
402
399
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|