UniTok 4.0.3__tar.gz → 4.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-4.0.3 → UniTok-4.3.0}/PKG-INFO +14 -13
- {UniTok-4.0.3 → UniTok-4.3.0}/README.md +12 -11
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/PKG-INFO +14 -13
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/SOURCES.txt +1 -1
- {UniTok-4.0.3 → UniTok-4.3.0}/setup.py +2 -2
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/__init__.py +2 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/__main__.py +36 -6
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/job.py +17 -2
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/meta.py +2 -1
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/__init__.py +3 -3
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/base_tokenizer.py +7 -1
- UniTok-4.3.0/unitok/tokenizer/glove_tokenizer.py +21 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/transformers_tokenizer.py +18 -8
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/unitok.py +30 -2
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/handler/pkl_handler.py +1 -1
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/hub/hub.py +1 -1
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/vocabulary/vocabulary.py +25 -9
- UniTok-4.0.3/unitok/tokenizer/cachable_tokenizer.py +0 -25
- {UniTok-4.0.3 → UniTok-4.3.0}/LICENSE +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/entry_points.txt +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/__main__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/analysis/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/analysis/lengths.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/analysis/plot.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/cols.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/column.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/fut.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/global_setting.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/meta.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/bert_tok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/ent_tok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/id_tok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/number_tok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/seq_tok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/split_tok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/tok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/unidep.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/unitok.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/vocab.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/vocabs.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/setup.cfg +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/selector.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/status.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/digit_tokenizer.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/entity_tokenizer.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/split_tokenizer.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/union_tokenizer.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/unknown_tokenizer.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/class_pool.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/data.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/function.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/handler/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/handler/json_handler.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/hub/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/hub/param_hub.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/index_set.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/job_set.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/tokenizer_set.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/vocabulary_set.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/instance.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/map.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/space.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/symbol.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/verbose.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/vocabulary/__init__.py +0 -0
- {UniTok-4.0.3 → UniTok-4.3.0}/unitok/vocabulary/counter.py +0 -0
{UniTok-4.0.3 → UniTok-4.3.0}/PKG-INFO

```diff
@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 4.0.3
+Version: 4.3.0
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
 Author-email: liu@qijiong.work
 License: MIT Licence
-Keywords: token,tokenizer
+Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
 Platform: any
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -29,17 +29,18 @@ UniTok is designed to simplify preprocessing by offering reusable components suc

 ### Changes and Comparisons

-| Feature
-
-| `UniTok` class
-| `UniDep` class
-| `Column` class
-| `Job` class
-| `Tokenizer` class
-| `Tokenizer` class
-| `analyse` method
-| `Meta` class
-| `unitok` command
+| Feature | UniTok v3 | UniTok v4 | Comments |
+|---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
+| `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
+| `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
+| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
+| `Job` class | N/A | Defines how a specific column should be tokenized | |
+| `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
+| `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
+| `analyse` method | Supported | Not supported Currently | |
+| `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+| `unitok` command | Visualization in the terminal | More colorful and detailed output | |
+| `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |

 ### How to Migrate the Processed Data

```
{UniTok-4.0.3 → UniTok-4.3.0}/README.md: the same "Changes and Comparisons" table update shown in the PKG-INFO diff above, here as hunk `@@ -16,17 +16,18 @@` of the README.
{UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/PKG-INFO: identical to the PKG-INFO diff above (same Version, Keywords, and comparison-table changes).
{UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/SOURCES.txt

```diff
@@ -38,9 +38,9 @@ unitok/status.py
 unitok/unitok.py
 unitok/tokenizer/__init__.py
 unitok/tokenizer/base_tokenizer.py
-unitok/tokenizer/cachable_tokenizer.py
 unitok/tokenizer/digit_tokenizer.py
 unitok/tokenizer/entity_tokenizer.py
+unitok/tokenizer/glove_tokenizer.py
 unitok/tokenizer/split_tokenizer.py
 unitok/tokenizer/transformers_tokenizer.py
 unitok/tokenizer/union_tokenizer.py
```
{UniTok-4.0.3 → UniTok-4.3.0}/setup.py

```diff
@@ -6,8 +6,8 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')

 setup(
     name='UniTok',
-    version='4.0.3',
-    keywords=['token', 'tokenizer'],
+    version='4.3.0',
+    keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
     description='Unified Tokenizer',
     long_description=long_description,
     long_description_content_type='text/markdown',
```
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/__init__.py

```diff
@@ -9,6 +9,7 @@ from unitok.tokenizer import BaseTokenizer, TokenizerHub
 from unitok.tokenizer import EntityTokenizer, EntitiesTokenizer
 from unitok.tokenizer import TransformersTokenizer, BertTokenizer
 from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
+from unitok.tokenizer import GloVeTokenizer
 from unitok.job import Job, JobHub

 from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
@@ -29,6 +30,7 @@ __all__ = [
     'EntityTokenizer', 'EntitiesTokenizer',
     'TransformersTokenizer', 'BertTokenizer',
     'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
+    'GloVeTokenizer',
     'Job', 'JobHub',
     'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet',
     'Meta',
```
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/__main__.py

```diff
@@ -1,7 +1,9 @@
 import argparse

 import pandas as pd
+from pigmento import pnt

+from unitok import Vocab
 from unitok.tokenizer import BaseTokenizer
 from unitok.unitok import UniTok
 from unitok.utils.class_pool import ClassPool
@@ -29,7 +31,7 @@ def integrate():
         if arg.startswith('--t.'):
             current_param = arg[4:]
         elif arg.startswith('--tokenizer.'):
-            current_param = arg[
+            current_param = arg[12:]

     if args.file.endswith('.csv') or args.file.endswith('.tsv'):
         df = pd.read_csv(args.file, sep='\t')
@@ -39,17 +41,32 @@ def integrate():
         raise ValueError(f'Unsupported file format: {args.file}')

     with UniTok.load(args.path, tokenizer_lib=args.lib) as ut:
+        tokenizer = None
+
         if args.tokenizer_id:
             for t in ut.meta.tokenizers:  # type: BaseTokenizer
                 if t.get_tokenizer_id() == args.tokenizer_id:
                     tokenizer = t
                     break
             else:
-
-
-
+                pnt(f'Unknown tokenizer id: {args.tokenizer_id}, will create a new tokenizer')
+                tokenizer_params['tokenizer_id'] = args.tokenizer_id
+
+        if not tokenizer:
+            if args.tokenizer is None and args.vocab is None:
+                raise ValueError('Tokenizer classname and vocabulary must be specified')
+
+            if args.vocab.endswith('.vocab'):
+                if '/' in args.vocab:
+                    vocab_path, vocab_name = args.vocab.rsplit('/', 1)
+                else:
+                    vocab_path, vocab_name = '.', args.vocab
+                vocab_name = vocab_name[:-6]
+                args.vocab = Vocab(vocab_name).load(vocab_path)
+
             tokenizers = ClassPool.tokenizers(args.lib)
-
+            if args.tokenizer not in tokenizers:
+                raise ValueError(f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}')
             tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)

         ut.add_job(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
@@ -65,14 +82,27 @@ def summarize():
         ut.summarize()


+def remove():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
+    parser.add_argument('--name', type=str, help='job name to remove')
+    args, _ = parser.parse_known_args()
+
+    with UniTok.load(args.path) as ut:
+        ut.remove_job(args.name)
+        ut.save(args.path)
+
+
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--action', '-a', type=str, default='summarize', choices=['summarize', 'integrate'])
+    parser.add_argument('--action', '-a', type=str, default='summarize', choices=['summarize', 'integrate', 'remove'])

     args, _ = parser.parse_known_args()
     action = args.action

     if action == 'integrate':
         integrate()
+    elif action == 'remove':
+        remove()
     else:
         summarize()
```
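The new `remove` action deletes a job from an existing UniTok data directory. A rough programmatic equivalent of what the command does, with the directory path and job name as placeholders:

```python
from unitok.unitok import UniTok

# Equivalent of `--action remove --name <job>` on a saved UniTok directory;
# 'path/to/unitok_data' and 'title' are placeholders for an existing dataset and job.
with UniTok.load('path/to/unitok_data') as ut:
    ut.remove_job('title')            # cascades to now-unused tokenizers/vocabularies (see unitok.py below)
    ut.save('path/to/unitok_data')
```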
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/job.py

```diff
@@ -1,3 +1,6 @@
+from typing import Union
+
+from unitok import TokenizerHub, VocabHub
 from unitok.tokenizer.union_tokenizer import UnionTokenizer

 from unitok.tokenizer import BaseTokenizer
@@ -8,7 +11,7 @@ from unitok.utils.hub import Hub
 class Job:
     def __init__(
             self,
-            tokenizer: BaseTokenizer,
+            tokenizer: Union[BaseTokenizer, str],
             column: str,
             name: str = None,
             truncate: int = None,
@@ -16,7 +19,13 @@ class Job:
             key: bool = False,
             max_len: int = 0,
     ):
+        if isinstance(tokenizer, str):
+            if TokenizerHub.has(tokenizer):
+                tokenizer = TokenizerHub.get(tokenizer)
+            else:
+                raise ValueError(f"Tokenizer {tokenizer} not found in the tokenizer hub.")
         self.tokenizer: BaseTokenizer = tokenizer
+
         self.column: str = column
         self.name: str = name
         self.truncate: int = truncate
@@ -26,7 +35,8 @@ class Job:
         self.max_len = max_len
         self.from_union = isinstance(self.tokenizer, UnionTokenizer)

-        JobHub.add(self
+        JobHub.add(self)
+        VocabHub.add(self.tokenizer.vocab)

     @property
     def return_list(self):
@@ -77,3 +87,8 @@ class Job:

 class JobHub(Hub[Job]):
     _instance = Instance(compulsory_space=True)
+
+    @classmethod
+    def add(cls, key, obj: Job = None):
+        key, obj = key.name, key
+        return super().add(key, obj)
```
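Since tokenizers register themselves in the `TokenizerHub` (and their vocab in the `VocabHub`) on construction, a job can now refer to an existing tokenizer by its id string rather than by the object itself. A minimal sketch, assuming a dataset whose hub already contains a tokenizer registered under that id and that `add_job` forwards the string to `Job` (path, id, and column names are placeholders):

```python
from unitok.unitok import UniTok

with UniTok.load('path/to/unitok_data') as ut:
    # 'english_bert' is a placeholder tokenizer id; Job.__init__ resolves it via
    # TokenizerHub.get and raises a ValueError if the id is unknown.
    ut.add_job(tokenizer='english_bert', column='abstract', name='abstract@bert', truncate=64)
```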
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/meta.py

```diff
@@ -15,7 +15,7 @@ from unitok.vocabulary import Vocab, VocabHub


 class Meta:
-    version = 'unidep-
+    version = 'unidep-v4'

     def __init__(self):
         self.note = ('Not compatible with unitok-v3 or lower version, '
@@ -116,6 +116,7 @@ class Meta:
         meta.vocabularies = VocabSet({cls.parse_vocabulary(**v).load(save_dir) for v in kwargs.get('vocabularies')})
         meta.tokenizers = TokenizerSet({cls.parse_tokenizer(**t) for t in kwargs.get('tokenizers')})
         meta.jobs = JobSet({cls.parse_job(**j) for j in kwargs.get('jobs')})
+        meta.version = kwargs.get('version')

         return meta
```
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/__init__.py

```diff
@@ -1,6 +1,6 @@
 from unitok.tokenizer.base_tokenizer import BaseTokenizer, TokenizerHub
-from unitok.tokenizer.cachable_tokenizer import CachableTokenizer
 from unitok.tokenizer.entity_tokenizer import EntityTokenizer, EntitiesTokenizer
+from unitok.tokenizer.glove_tokenizer import GloVeTokenizer
 from unitok.tokenizer.transformers_tokenizer import TransformersTokenizer, BertTokenizer
 from unitok.tokenizer.split_tokenizer import SplitTokenizer
 from unitok.tokenizer.digit_tokenizer import DigitTokenizer, DigitsTokenizer
@@ -8,7 +8,6 @@ from unitok.tokenizer.digit_tokenizer import DigitTokenizer, DigitsTokenizer

 __all__ = [
     BaseTokenizer,
-    CachableTokenizer,
     EntityTokenizer,
     EntitiesTokenizer,
     TransformersTokenizer,
@@ -16,5 +15,6 @@ __all__ = [
     SplitTokenizer,
     DigitTokenizer,
     DigitsTokenizer,
-
+    GloVeTokenizer,
+    TokenizerHub,
 ]
```
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/base_tokenizer.py

```diff
@@ -28,7 +28,8 @@ class BaseTokenizer(abc.ABC):

         self._tokenizer_id = tokenizer_id

-        TokenizerHub.add(self
+        TokenizerHub.add(self)
+        VocabHub.add(self.vocab)

     def get_tokenizer_id(self):
         if self._tokenizer_id is None:
@@ -81,3 +82,8 @@ class BaseTokenizer(abc.ABC):

 class TokenizerHub(Hub[BaseTokenizer]):
     _instance = Instance()
+
+    @classmethod
+    def add(cls, key, obj: BaseTokenizer = None):
+        key, obj = key.get_tokenizer_id(), key
+        return super().add(key, obj)
```
UniTok-4.3.0/unitok/tokenizer/glove_tokenizer.py (new file)

```diff
@@ -0,0 +1,21 @@
+import nltk
+
+from unitok.vocabulary import VocabHub
+from unitok.tokenizer import BaseTokenizer
+
+
+class GloVeTokenizer(BaseTokenizer):
+    return_list = True
+    param_list = ['language']
+
+    def __init__(self, vocab, language='english', **kwargs):
+        if isinstance(vocab, str) and not VocabHub.has(vocab):
+            raise ValueError('GloVeTokenizer requires a pre-filled Vocab object that stores valid tokens')
+
+        super().__init__(vocab=vocab, **kwargs)
+
+        self.language = language
+
+    def __call__(self, obj):
+        objs = nltk.tokenize.word_tokenize(obj.lower(), language=self.language)
+        return [self.vocab[o] for o in objs if o in self.vocab]
```
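The new `GloVeTokenizer` lowercases and word-tokenizes text with NLTK and keeps only tokens already present in the supplied vocabulary. A usage sketch, assuming a vocabulary that was previously filled and saved with UniTok and that NLTK's `punkt` data is installed (paths and names are placeholders):

```python
import nltk
from unitok.vocabulary import Vocab
from unitok.tokenizer import GloVeTokenizer

nltk.download('punkt')                                   # word_tokenize needs the punkt data

glove_vocab = Vocab('glove').load('path/to/vocab_dir')   # pre-filled GloVe token list (placeholder path)
tokenizer = GloVeTokenizer(vocab=glove_vocab, language='english')
print(tokenizer('UniTok unifies data preprocessing'))    # ids of the in-vocabulary words only
```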
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/transformers_tokenizer.py

```diff
@@ -1,31 +1,37 @@
 from typing import Union

+from pigmento import pnt
 from transformers import AutoTokenizer

-from
-from unitok.tokenizer import
+from unitok.vocabulary import Vocab
+from unitok.tokenizer import BaseTokenizer


-class TransformersTokenizer(
+class TransformersTokenizer(BaseTokenizer):
     return_list = True
-    param_list = ['key']

-    def __init__(self, vocab: Union[str, Vocab], tokenizer_id: str = None,
-        super().__init__(vocab=vocab, tokenizer_id=tokenizer_id
+    def __init__(self, vocab: Union[str, Vocab], tokenizer_id: str = None, key: str = None, **kwargs):
+        super().__init__(vocab=vocab, tokenizer_id=tokenizer_id)
         self.key = key

         self.kwargs = kwargs
+        self.param_list = ['key']
         self.param_list.extend(list(kwargs.keys()))

         self.tokenizer = AutoTokenizer.from_pretrained(self.key, **self.kwargs)
         self.vocab.extend(self._generate_token_list())

     def _generate_token_list(self):
+        if not hasattr(self.tokenizer, 'vocab'):
+            pnt(f'transformer({self.key}): does not provide vocabulary, generating placeholders instead')
+            return list(range(self.tokenizer.vocab_size))
+
         tokens = self.tokenizer.vocab
         if isinstance(tokens, list):
             return tokens
         if not isinstance(tokens, dict):
-
+            pnt(f'transformer({self.key}): unsupported type of vocabulary, generating placeholders instead')
+            return list(range(self.tokenizer.vocab_size))

         num_tokens = len(tokens)
         token_ids = list(tokens.values())
@@ -45,11 +51,15 @@ class TransformersTokenizer(CachableTokenizer):

     def __call__(self, obj):
         tokens = self.tokenizer.tokenize(obj)
-
+        tokens = self.tokenizer.convert_tokens_to_ids(tokens)
+        for token in tokens:
+            self.vocab.counter(token)
+        return tokens


 class BertTokenizer(TransformersTokenizer):
     param_list = []

     def __init__(self, **kwargs):
+        kwargs.pop('key', None)
         super().__init__(key='bert-base-uncased', **kwargs)
```
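With `TransformersTokenizer` deriving directly from `BaseTokenizer`, any Hugging Face tokenizer can be plugged in through `key`, while `BertTokenizer` pins the key to `bert-base-uncased` and now discards a stray `key` kwarg. A short sketch; the model ids are only examples and extra keyword arguments are forwarded to `AutoTokenizer.from_pretrained`:

```python
from unitok.tokenizer import TransformersTokenizer, BertTokenizer

# Any model id accepted by AutoTokenizer can serve as `key` (example id, downloaded on first use).
llama_tok = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
bert_tok = BertTokenizer(vocab='bert')        # key is fixed to 'bert-base-uncased'

print(bert_tok('unified tokenizer'))          # list of token ids; each id is also counted in the vocab
```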
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/unitok.py

```diff
@@ -173,7 +173,7 @@ class UniTok(Status):
         if tokenizer.return_list:
             raise AttributeError('Column content of the key job should be tokenized into atomic value')
         if self.key_job:
-            raise ValueError(f'
+            raise ValueError(f'Key column already exists: {self.key_job.name}')
         self.key_job = job

     @Status.require_not_organized
@@ -282,7 +282,10 @@ class UniTok(Status):

         # Prepare introduction header
         introduction_header = Text.assemble(
-            (
+            (
+                f"UniTok (v{self.meta.parse_version(Meta.version)}), "
+                f"Data (v{self.meta.parse_version(self.meta.version)})\n",
+                "bold cyan"),
             (f"Sample Size: {self._sample_size}\n", "green"),
             (f"ID Column: {self.key_job.name}\n", "magenta"),
             style="dim"
@@ -462,3 +465,28 @@ class UniTok(Status):

         job.max_len = max_len
         self.data[job.name] = series
+
+    def remove_job(self, job: Union[Job, str]):
+        if isinstance(job, str):
+            job = self.meta.jobs[job]
+
+        if job.key:
+            raise ValueError('key job cannot be removed')
+
+        self.meta.jobs.remove(job)
+
+        tokenizer = job.tokenizer
+        for j in self.meta.jobs:
+            if j.tokenizer == tokenizer:
+                break
+        else:
+            self.meta.tokenizers.remove(tokenizer)
+            vocab = tokenizer.vocab
+            for t in self.meta.tokenizers:
+                if t.vocab == vocab:
+                    break
+            else:
+                self.meta.vocabularies.remove(vocab)
+
+        if job.is_processed:
+            self.data.pop(job.name)
```
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/hub/hub.py

```diff
@@ -11,7 +11,7 @@ class Hub(abc.ABC, Generic[T]):
     _instance: Instance

     @classmethod
-    def add(cls, key, obj: T):
+    def add(cls, key, obj: T = None):
         instance = cls._instance.current()
         if key in instance and instance[key] is not obj:
             raise ValueError(f'Conflict object declaration: {obj} and {instance[key]}')
```
{UniTok-4.0.3 → UniTok-4.3.0}/unitok/vocabulary/vocabulary.py

```diff
@@ -1,5 +1,7 @@
 import os
+from typing import Optional, Union

+from unitok import PickleHandler
 from unitok.utils import Map, Instance
 from unitok.utils.hub import Hub
 from unitok.vocabulary.counter import Counter
@@ -17,7 +19,7 @@ class Vocabulary:
         self._editable = True  # whether vocab is editable
         self.counter = Counter()

-        VocabularyHub.add(self
+        VocabularyHub.add(self)

     def equals(self, other: 'Vocabulary'):
         return self.name == other.name and len(self) == len(other)
@@ -42,7 +44,7 @@
         """
         return [self.append(obj) for obj in objs]

-    def append(self, obj, oov_token=None):
+    def append(self, obj, oov_token: Optional[Union[int, str]] = None):
         obj = str(obj)
         if obj not in self.o2i:
             if '\n' in obj:
@@ -51,7 +53,11 @@
             if not self._editable:
                 if oov_token is None:
                     raise ValueError(f'the fixed vocab {self.name} is not allowed to add new token ({obj})')
-
+                if isinstance(oov_token, str):
+                    return self[oov_token]
+                if len(self) > oov_token >= 0:
+                    return oov_token
+                raise ValueError(f'oov_token ({oov_token}) is not in the vocab')

             index = len(self)
             self.o2i[obj] = index
@@ -80,6 +86,9 @@
             return self.i2o[item]
         return self.o2i[item]

+    def __contains__(self, item: str):
+        return item in self.o2i
+
     def __str__(self):
         return f'Vocabulary({self.name}, vocab_size={len(self)})'

@@ -87,6 +96,10 @@
     Editable Methods
     """

+    @property
+    def editable(self):
+        return self._editable
+
     def allow_edit(self):
         self._editable = True
         return self
@@ -112,8 +125,8 @@
     Save & Load Methods
     """

-    def filepath(self,
-        return os.path.join(
+    def filepath(self, save_dir):
+        return os.path.join(save_dir, self.filename)

     @property
     def filename(self):
@@ -124,8 +137,7 @@
         save_dir = self.filepath(save_dir)

         self.o2i, self.i2o = {}, {}
-
-            objs = f.read().strip().split('\n')
+        objs = PickleHandler.load(save_dir)
         for index, obj in enumerate(objs):
             self.o2i[obj] = index
             self.i2o[index] = obj
@@ -134,8 +146,7 @@

     def save(self, save_dir):
         store_path = self.filepath(save_dir)
-
-            f.write('\n'.join(self))
+        PickleHandler.save(list(self), store_path)

         return self

@@ -148,3 +159,8 @@

 class VocabularyHub(Hub[Vocabulary]):
     _instance = Instance()
+
+    @classmethod
+    def add(cls, key, obj: Vocabulary = None):
+        key, obj = key.name, key
+        return super().add(key, obj)
```
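Beyond the switch to pickle-based save and load, `Vocabulary` gains a `__contains__` test and a read-only `editable` property, and `append` now accepts an int or str `oov_token` fallback for frozen vocabularies. A small sketch of the new surface (the save directory is a placeholder):

```python
from unitok.vocabulary import Vocab

v = Vocab('demo')
v.extend(['[UNK]', 'hello', 'world'])

print('hello' in v)   # True, via the new __contains__
print(v.editable)     # True, via the new read-only property

v.save('.')           # now serialized with PickleHandler instead of a newline-joined text file
v.load('.')           # reloads the pickled token list back into the same object
print(len(v))         # 3
```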
UniTok-4.0.3/unitok/tokenizer/cachable_tokenizer.py (removed)

```diff
@@ -1,25 +0,0 @@
-from typing import Hashable
-
-from unitok import warning
-from unitok.tokenizer import BaseTokenizer
-
-
-class CachableTokenizer(BaseTokenizer):
-    def __init__(self, use_cache=False, **kwargs):
-        super().__init__(**kwargs)
-
-        if not self.return_list and use_cache:
-            warning(f'Only the tokenizer that return_list=True may need cache, use_cache of {self.get_classname()} will be set to False')
-            use_cache = False
-        self._use_cache = use_cache
-        self._cache = dict()
-
-    def __call__(self, objs):
-        if self._use_cache and isinstance(objs, Hashable):
-            if objs in self._cache:
-                return self._cache[objs]
-            value = super().__call__(objs)
-            self._cache[objs] = value
-            return value
-
-        return super().__call__(objs)
```