UniTok 4.3.1__tar.gz → 4.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-4.3.1 → UniTok-4.3.3}/PKG-INFO +1 -1
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTok.egg-info/PKG-INFO +1 -1
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTok.egg-info/SOURCES.txt +6 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/setup.py +1 -1
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/unitok.py +2 -2
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/hub/hub.py +5 -1
- UniTok-4.3.3/unitok/utils/space.py +35 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/vocabulary/vocabulary.py +4 -0
- UniTok-4.3.1/unitok/utils/space.py +0 -29
- {UniTok-4.3.1 → UniTok-4.3.3}/LICENSE +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/README.md +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTok.egg-info/entry_points.txt +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/__main__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/analysis/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/analysis/lengths.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/analysis/plot.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/cols.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/column.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/fut.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/global_setting.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/meta.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/bert_tok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/ent_tok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/id_tok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/number_tok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/seq_tok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/split_tok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/tok/tok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/unidep.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/unitok.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/vocab.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/UniTokv3/vocabs.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/setup.cfg +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/__main__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/job.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/meta.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/selector.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/status.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/base_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/digit_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/entity_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/glove_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/split_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/transformers_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/union_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/tokenizer/unknown_tokenizer.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/class_pool.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/data.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/function.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/handler/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/handler/json_handler.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/handler/pkl_handler.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/hub/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/hub/param_hub.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/index_set/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/index_set/index_set.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/index_set/job_set.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/index_set/tokenizer_set.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/index_set/vocabulary_set.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/instance.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/map.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/symbol.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/utils/verbose.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/vocabulary/__init__.py +0 -0
- {UniTok-4.3.1 → UniTok-4.3.3}/unitok/vocabulary/counter.py +0 -0
@@ -36,6 +36,12 @@ unitok/meta.py
|
|
36
36
|
unitok/selector.py
|
37
37
|
unitok/status.py
|
38
38
|
unitok/unitok.py
|
39
|
+
unitok.egg-info/PKG-INFO
|
40
|
+
unitok.egg-info/SOURCES.txt
|
41
|
+
unitok.egg-info/dependency_links.txt
|
42
|
+
unitok.egg-info/entry_points.txt
|
43
|
+
unitok.egg-info/requires.txt
|
44
|
+
unitok.egg-info/top_level.txt
|
39
45
|
unitok/tokenizer/__init__.py
|
40
46
|
unitok/tokenizer/base_tokenizer.py
|
41
47
|
unitok/tokenizer/digit_tokenizer.py
|
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')
|
|
6
6
|
|
7
7
|
setup(
|
8
8
|
name='UniTok',
|
9
|
-
version='4.3.1',
|
9
|
+
version='4.3.3',
|
10
10
|
keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
|
11
11
|
description='Unified Tokenizer',
|
12
12
|
long_description=long_description,
|
@@ -98,12 +98,12 @@ class UniTok(Status):
|
|
98
98
|
|
99
99
|
def __enter__(self):
|
100
100
|
from unitok.utils import Space
|
101
|
-
Space.set(self)
|
101
|
+
Space.push(self)
|
102
102
|
return self
|
103
103
|
|
104
104
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
105
105
|
from unitok.utils import Space
|
106
|
-
Space.unset()
|
106
|
+
Space.pop(self)
|
107
107
|
|
108
108
|
@Status.require_initialized
|
109
109
|
def add_index_job(self, name: str = 'index', tokenizer: DigitTokenizer = None):
|
@@ -13,10 +13,14 @@ class Hub(abc.ABC, Generic[T]):
|
|
13
13
|
@classmethod
|
14
14
|
def add(cls, key, obj: T = None):
|
15
15
|
instance = cls._instance.current()
|
16
|
-
if key in instance and instance[key] is not obj:
|
16
|
+
if key in instance and cls.notequal(instance[key], obj):
|
17
17
|
raise ValueError(f'Conflict object declaration: {obj} and {instance[key]}')
|
18
18
|
instance[key] = obj
|
19
19
|
|
20
|
+
@classmethod
|
21
|
+
def notequal(cls, a: T, b: T) -> bool:
|
22
|
+
return a is not b
|
23
|
+
|
20
24
|
@classmethod
|
21
25
|
def get(cls, name: str, **kwargs) -> T:
|
22
26
|
"""
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class Space:
|
2
|
+
"""
|
3
|
+
UniTok allows multiple instances to be created, but the "with" statement can only be used with one instance.
|
4
|
+
"""
|
5
|
+
|
6
|
+
_stack = []
|
7
|
+
|
8
|
+
@classmethod
|
9
|
+
def push(cls, obj):
|
10
|
+
"""
|
11
|
+
Lock the unitok instance as the current active instance
|
12
|
+
"""
|
13
|
+
cls._stack.append(obj)
|
14
|
+
# if cls._active_instance is not None:
|
15
|
+
# raise ValueError(f'Space is already locked to {cls._active_instance}')
|
16
|
+
|
17
|
+
@classmethod
|
18
|
+
def pop(cls, obj):
|
19
|
+
"""
|
20
|
+
Unlock the current active instance
|
21
|
+
"""
|
22
|
+
# cls._active_instance = None
|
23
|
+
if not cls._stack:
|
24
|
+
raise ValueError('Space stack is empty')
|
25
|
+
if cls._stack[-1] != obj:
|
26
|
+
raise ValueError('Space stack is not in order')
|
27
|
+
cls._stack.pop()
|
28
|
+
|
29
|
+
@classmethod
|
30
|
+
def get_space(cls):
|
31
|
+
"""
|
32
|
+
Get the current active instance
|
33
|
+
"""
|
34
|
+
# return cls._active_space
|
35
|
+
return cls._stack[-1] if cls._stack else None
|
@@ -164,3 +164,7 @@ class VocabularyHub(Hub[Vocabulary]):
|
|
164
164
|
def add(cls, key, obj: Vocabulary = None):
|
165
165
|
key, obj = key.name, key
|
166
166
|
return super().add(key, obj)
|
167
|
+
|
168
|
+
@classmethod
|
169
|
+
def notequal(cls, a: Vocabulary, b: Vocabulary) -> bool:
|
170
|
+
return a.name != b.name or a.size != b.size
|
@@ -1,29 +0,0 @@
|
|
1
|
-
class Space:
|
2
|
-
"""
|
3
|
-
UniTok allows multiple instances to be created, but the "with" statement can only be used with one instance.
|
4
|
-
"""
|
5
|
-
|
6
|
-
_active_instance = None
|
7
|
-
|
8
|
-
@classmethod
|
9
|
-
def set(cls, obj):
|
10
|
-
"""
|
11
|
-
Lock the unitok instance as the current active instance
|
12
|
-
"""
|
13
|
-
if cls._active_instance is not None:
|
14
|
-
raise ValueError(f'Space is already locked to {cls._active_instance}')
|
15
|
-
cls._active_instance = obj
|
16
|
-
|
17
|
-
@classmethod
|
18
|
-
def unset(cls):
|
19
|
-
"""
|
20
|
-
Unlock the current active instance
|
21
|
-
"""
|
22
|
-
cls._active_instance = None
|
23
|
-
|
24
|
-
@classmethod
|
25
|
-
def get_space(cls):
|
26
|
-
"""
|
27
|
-
Get the current active instance
|
28
|
-
"""
|
29
|
-
return cls._active_instance
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|