UniTok-3.5.2.tar.gz → UniTok-4.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-3.5.2 → UniTok-4.0.0}/PKG-INFO +15 -15
- {UniTok-3.5.2 → UniTok-4.0.0}/README.md +13 -13
- UniTok-4.0.0/UniTok/__init__.py +37 -0
- UniTok-4.0.0/UniTok/__main__.py +78 -0
- UniTok-4.0.0/UniTok/job.py +76 -0
- UniTok-4.0.0/UniTok/meta.py +136 -0
- UniTok-4.0.0/UniTok/status.py +44 -0
- UniTok-4.0.0/UniTok/tokenizer/__init__.py +18 -0
- UniTok-4.0.0/UniTok/tokenizer/base_tokenizer.py +78 -0
- UniTok-4.0.0/UniTok/tokenizer/digit_tokenizer.py +33 -0
- UniTok-4.0.0/UniTok/tokenizer/entity_tokenizer.py +13 -0
- UniTok-4.0.0/UniTok/tokenizer/split_tokenizer.py +14 -0
- UniTok-4.0.0/UniTok/tokenizer/transformers_tokenizer.py +50 -0
- UniTok-4.0.0/UniTok/tokenizer/union_tokenizer.py +17 -0
- UniTok-4.0.0/UniTok/tokenizer/unknown_tokenizer.py +35 -0
- UniTok-4.0.0/UniTok/unitok.py +411 -0
- UniTok-4.0.0/UniTok/utils/__init__.py +21 -0
- UniTok-4.0.0/UniTok/utils/class_pool.py +107 -0
- UniTok-4.0.0/UniTok/utils/data.py +15 -0
- UniTok-4.0.0/UniTok/utils/function.py +6 -0
- UniTok-4.0.0/UniTok/utils/handler/__init__.py +7 -0
- UniTok-4.0.0/UniTok/utils/handler/json_handler.py +28 -0
- UniTok-4.0.0/UniTok/utils/handler/pkl_handler.py +19 -0
- UniTok-4.0.0/UniTok/utils/hub/__init__.py +4 -0
- UniTok-4.0.0/UniTok/utils/hub/hub.py +44 -0
- UniTok-4.0.0/UniTok/utils/hub/param_hub.py +6 -0
- UniTok-4.0.0/UniTok/utils/index_set/__init__.py +15 -0
- UniTok-4.0.0/UniTok/utils/index_set/index_set.py +71 -0
- UniTok-4.0.0/UniTok/utils/index_set/job_set.py +25 -0
- UniTok-4.0.0/UniTok/utils/index_set/tokenizer_set.py +19 -0
- UniTok-4.0.0/UniTok/utils/index_set/vocabulary_set.py +19 -0
- UniTok-4.0.0/UniTok/utils/instance.py +18 -0
- UniTok-4.0.0/UniTok/utils/map.py +3 -0
- UniTok-4.0.0/UniTok/utils/space.py +29 -0
- UniTok-4.0.0/UniTok/utils/symbol.py +23 -0
- UniTok-4.0.0/UniTok/utils/verbose.py +48 -0
- UniTok-4.0.0/UniTok/vocabulary/__init__.py +11 -0
- UniTok-4.0.0/UniTok/vocabulary/counter.py +85 -0
- UniTok-4.0.0/UniTok/vocabulary/vocabulary.py +150 -0
- {UniTok-3.5.2 → UniTok-4.0.0}/UniTok.egg-info/PKG-INFO +15 -15
- UniTok-4.0.0/UniTok.egg-info/SOURCES.txt +71 -0
- UniTok-4.0.0/UniTok.egg-info/entry_points.txt +5 -0
- UniTok-4.0.0/UniTok.egg-info/top_level.txt +2 -0
- UniTok-4.0.0/UniTokv3/__main__.py +169 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/bert_tok.py +1 -1
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/ent_tok.py +1 -1
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/id_tok.py +1 -1
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/number_tok.py +1 -1
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/seq_tok.py +1 -1
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/split_tok.py +1 -1
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/tok.py +1 -1
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/unidep.py +9 -2
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/unitok.py +10 -3
- {UniTok-3.5.2 → UniTok-4.0.0}/setup.py +5 -3
- UniTok-3.5.2/UniTok/__main__.py +0 -42
- UniTok-3.5.2/UniTok.egg-info/SOURCES.txt +0 -30
- UniTok-3.5.2/UniTok.egg-info/entry_points.txt +0 -3
- UniTok-3.5.2/UniTok.egg-info/top_level.txt +0 -1
- {UniTok-3.5.2 → UniTok-4.0.0}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-3.5.2 → UniTok-4.0.0}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/__init__.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/analysis/__init__.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/analysis/lengths.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/analysis/plot.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/cols.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/column.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/fut.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/global_setting.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/meta.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/__init__.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/vocab.py +0 -0
- {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/vocabs.py +0 -0
- {UniTok-3.5.2 → UniTok-4.0.0}/setup.cfg +0 -0
{UniTok-3.5.2 → UniTok-4.0.0}/PKG-INFO

@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version:
+Version: 4.0.0
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
-Author-email:
+Author-email: liu@qijiong.work
 License: MIT Licence
 Keywords: token,tokenizer
 Platform: any
@@ -73,8 +73,8 @@ UniTok提供了一整套的数据预处理工具,包括不同类型的分词
 通过以下代码,我们可以针对每个文件构建一个UniTok对象:

 ```python
-from
-from
+from UniTokv3 import UniTok, Column, Vocab
+from UniTokv3.tok import IdTok, BertTok, EntTok, SplitTok, NumberTok

 # Create a news id vocab, commonly used in news data, history data, and interaction data.
 nid_vocab = Vocab('nid')
@@ -110,7 +110,7 @@ news_ut.add_col(Column(
 news_ut.read('news.tsv', sep='\t')

 # Tokenize the data.
-news_ut.tokenize()
+news_ut.tokenize()

 # Store the tokenized data.
 news_ut.store('data/news')
@@ -130,10 +130,10 @@ user_ut.add_col(Column(
 ))

 # Read the data file.
-user_ut.read('user.tsv', sep='\t')
+user_ut.read('user.tsv', sep='\t')

 # Tokenize the data.
-user_ut.tokenize()
+user_ut.tokenize()

 # Store the tokenized data.
 user_ut.store('data/user')
@@ -142,16 +142,16 @@ user_ut.store('data/user')
 def inter_tokenize(mode):
     # Create an interaction UniTok object.
     inter_ut = UniTok()
-
+
     # Add columns to the interaction UniTok object.
     inter_ut.add_index_col(
         # The index column in the interaction data is automatically generated, and the tokenizer does not need to be specified.
     ).add_col(Column(
         # Align with the uid column in user_ut.
-        tok=EntTok(vocab=uid_vocab),
+        tok=EntTok(vocab=uid_vocab),
     )).add_col(Column(
         # Align with the nid column in news_ut.
-        tok=EntTok(vocab=nid_vocab),
+        tok=EntTok(vocab=nid_vocab),
     )).add_col(Column(
         name='label',
         # The label column in the interaction data only has two values, 0 and 1.
@@ -160,14 +160,14 @@ def inter_tokenize(mode):

     # Read the data file.
     inter_ut.read(f'{mode}.tsv', sep='\t')
-
+
     # Tokenize the data.
-    inter_ut.tokenize()
-
+    inter_ut.tokenize()
+
     # Store the tokenized data.
     inter_ut.store(mode)

-
+
 inter_tokenize('data/train')
 inter_tokenize('data/dev')
 inter_tokenize('data/test')
@@ -184,7 +184,7 @@ UniDep 是一个数据依赖处理类,可以用于加载和访问 UniTok 预
 以下是一个简单的使用示例:

 ```python
-from
+from UniTokv3 import UniDep

 # Load the data.
 dep = UniDep('data/news')
{UniTok-3.5.2 → UniTok-4.0.0}/README.md

@@ -61,8 +61,8 @@ UniTok提供了一整套的数据预处理工具,包括不同类型的分词
 通过以下代码,我们可以针对每个文件构建一个UniTok对象:

 ```python
-from
-from
+from UniTokv3 import UniTok, Column, Vocab
+from UniTokv3.tok import IdTok, BertTok, EntTok, SplitTok, NumberTok

 # Create a news id vocab, commonly used in news data, history data, and interaction data.
 nid_vocab = Vocab('nid')
@@ -98,7 +98,7 @@ news_ut.add_col(Column(
 news_ut.read('news.tsv', sep='\t')

 # Tokenize the data.
-news_ut.tokenize()
+news_ut.tokenize()

 # Store the tokenized data.
 news_ut.store('data/news')
@@ -118,10 +118,10 @@ user_ut.add_col(Column(
 ))

 # Read the data file.
-user_ut.read('user.tsv', sep='\t')
+user_ut.read('user.tsv', sep='\t')

 # Tokenize the data.
-user_ut.tokenize()
+user_ut.tokenize()

 # Store the tokenized data.
 user_ut.store('data/user')
@@ -130,16 +130,16 @@ user_ut.store('data/user')
 def inter_tokenize(mode):
     # Create an interaction UniTok object.
     inter_ut = UniTok()
-
+
     # Add columns to the interaction UniTok object.
     inter_ut.add_index_col(
         # The index column in the interaction data is automatically generated, and the tokenizer does not need to be specified.
     ).add_col(Column(
         # Align with the uid column in user_ut.
-        tok=EntTok(vocab=uid_vocab),
+        tok=EntTok(vocab=uid_vocab),
     )).add_col(Column(
         # Align with the nid column in news_ut.
-        tok=EntTok(vocab=nid_vocab),
+        tok=EntTok(vocab=nid_vocab),
     )).add_col(Column(
         name='label',
         # The label column in the interaction data only has two values, 0 and 1.
@@ -148,14 +148,14 @@ def inter_tokenize(mode):

     # Read the data file.
     inter_ut.read(f'{mode}.tsv', sep='\t')
-
+
     # Tokenize the data.
-    inter_ut.tokenize()
-
+    inter_ut.tokenize()
+
     # Store the tokenized data.
     inter_ut.store(mode)

-
+
 inter_tokenize('data/train')
 inter_tokenize('data/dev')
 inter_tokenize('data/test')
@@ -172,7 +172,7 @@ UniDep 是一个数据依赖处理类,可以用于加载和访问 UniTok 预
 以下是一个简单的使用示例:

 ```python
-from
+from UniTokv3 import UniDep

 # Load the data.
 dep = UniDep('data/news')
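The README hunks above make the migration path explicit: the legacy 3.x API is kept intact but now lives in the `UniTokv3` package (see the renamed `UniTok/… → UniTokv3/…` entries in the file list), so existing pipelines only need their imports switched. A minimal before/after sketch, with the column setup elided:

```python
# Before (UniTok <= 3.5.x):
#   from UniTok import UniTok, Column, Vocab
#   from UniTok.tok import IdTok, BertTok, EntTok

# After (UniTok 4.0.0): same classes, re-homed under UniTokv3
from UniTokv3 import UniTok, Column, Vocab
from UniTokv3.tok import IdTok, BertTok, EntTok

nid_vocab = Vocab('nid')
news_ut = UniTok()
# ... add_index_col / add_col calls stay exactly as in the 3.x README ...
```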
UniTok-4.0.0/UniTok/__init__.py (new file)

@@ -0,0 +1,37 @@
+from unitok.utils import Verbose, warning, error, info, debug
+from unitok.utils import Symbol, Symbols
+from unitok.utils import JsonHandler, PickleHandler
+from unitok.utils import Instance, Space, Map
+
+from unitok.utils.hub import Hub, ParamHub
+from unitok.vocabulary import Vocab, Vocabulary, VocabHub, VocabularyHub
+from unitok.tokenizer import BaseTokenizer, TokenizerHub
+from unitok.tokenizer import EntityTokenizer, EntitiesTokenizer
+from unitok.tokenizer import TransformersTokenizer, BertTokenizer
+from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
+from unitok.job import Job, JobHub
+
+from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
+
+from unitok.meta import Meta
+from unitok.status import Status
+from unitok.unitok import UniTok
+
+
+__all__ = [
+    'Verbose', 'warning', 'error', 'info', 'debug',
+    'Symbol', 'Symbols',
+    'JsonHandler', 'PickleHandler',
+    'Instance', 'Space', 'Map',
+    'Hub', 'ParamHub',
+    'Vocab', 'Vocabulary', 'VocabHub', 'VocabularyHub',
+    'BaseTokenizer', 'TokenizerHub',
+    'EntityTokenizer', 'EntitiesTokenizer',
+    'TransformersTokenizer', 'BertTokenizer',
+    'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
+    'Job', 'JobHub',
+    'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet',
+    'Meta',
+    'Status',
+    'UniTok',
+]
UniTok-4.0.0/UniTok/__main__.py (new file)

@@ -0,0 +1,78 @@
+import argparse
+
+import pandas as pd
+
+from unitok.tokenizer import BaseTokenizer
+from unitok.unitok import UniTok
+from unitok.utils.class_pool import ClassPool
+
+
+def integrate():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
+    parser.add_argument('--file', '-f', type=str, help='csv, tsv, parquet format data')
+    parser.add_argument('--lib', type=str, default=None, help='custom tokenizer library')
+    parser.add_argument('--column', '-c', type=str, help='column name to tokenize')
+    parser.add_argument('--name', '-n', type=str, help='job name and export column name')
+    parser.add_argument('--vocab', '-v', type=str, default=None, help='vocabulary name')
+    parser.add_argument('--tokenizer', '-t', type=str, default=None, help='tokenizer classname')
+    parser.add_argument('--tokenizer_id', type=str, default=None, help='tokenizer id')
+    parser.add_argument('--truncate', type=int, help='truncate length', default=None)
+    args, unknown_args = parser.parse_known_args()
+
+    tokenizer_params = dict()
+    current_param = None
+    for arg in unknown_args:
+        if current_param:
+            tokenizer_params[current_param] = arg
+            current_param = None
+        if arg.startswith('--t.'):
+            current_param = arg[4:]
+        elif arg.startswith('--tokenizer.'):
+            current_param = arg[11:]
+
+    if args.file.endswith('.csv') or args.file.endswith('.tsv'):
+        df = pd.read_csv(args.file, sep='\t')
+    elif args.file.endswith('.parquet'):
+        df = pd.read_parquet(args.file)
+    else:
+        raise ValueError(f'Unsupported file format: {args.file}')
+
+    with UniTok.load(args.path, tokenizer_lib=args.lib) as ut:
+        if args.tokenizer_id:
+            for t in ut.meta.tokenizers:  # type: BaseTokenizer
+                if t.get_tokenizer_id() == args.tokenizer_id:
+                    tokenizer = t
+                    break
+            else:
+                raise ValueError(f'Unknown tokenizer id: {args.tokenizer_id}')
+        else:
+            assert args.tokenizer is not None and args.vocab is not None, 'Tokenizer classname and vocabulary must be specified'
+            tokenizers = ClassPool.tokenizers(args.lib)
+            assert args.tokenizer in tokenizers, f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}'
+            tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)
+
+        ut.add_job(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
+        ut.tokenize(df).save(args.path)
+
+
+def summarize():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
+    args, _ = parser.parse_known_args()
+
+    with UniTok.load(args.path) as ut:
+        ut.summarize()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--action', '-a', type=str, default='summarize', choices=['summarize', 'integrate'])
+
+    args, _ = parser.parse_known_args()
+    action = args.action
+
+    if action == 'integrate':
+        integrate()
+    else:
+        summarize()
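The `integrate` entry point above is a thin CLI wrapper; the same flow can be driven from Python. Below is a rough sketch under stated assumptions: it only uses calls visible in `__main__.py` (`UniTok.load`, `add_job`, `tokenize`, `save`), it assumes the package is importable as `unitok` (as the new modules themselves import it), and the file path, column, and vocabulary names are illustrative placeholders, not taken from the diff.

```python
import pandas as pd

from unitok import UniTok, EntityTokenizer  # assumed top-level re-exports (see __init__.py above)

df = pd.read_csv('news.tsv', sep='\t')  # hypothetical input table

# Append one more tokenization job to an existing UniTok 4.x directory.
with UniTok.load('data/news') as ut:
    # EntityTokenizer is assumed to accept only the base `vocab` argument here.
    tokenizer = EntityTokenizer(vocab='category')  # vocab resolved by name via VocabHub
    ut.add_job(tokenizer=tokenizer, column='category', name='category', truncate=None)
    ut.tokenize(df).save('data/news')
```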
UniTok-4.0.0/UniTok/job.py (new file)

@@ -0,0 +1,76 @@
+from unitok.tokenizer import BaseTokenizer
+from unitok.utils import Symbols, Instance
+from unitok.utils.hub import Hub
+
+
+class Job:
+    def __init__(
+            self,
+            tokenizer: BaseTokenizer,
+            column: str,
+            name: str = None,
+            truncate: int = None,
+            order: int = -1,
+            key: bool = False,
+            max_len: int = 0,
+    ):
+        self.tokenizer: BaseTokenizer = tokenizer
+        self.column: str = column
+        self.name: str = name
+        self.truncate: int = truncate
+        self.order: int = order
+        self.slice: slice = self.get_slice(truncate)
+        self.key: bool = key
+        self.max_len = max_len
+
+        JobHub.add(self.name, self)
+
+    @property
+    def return_list(self):
+        return self.truncate is not None
+
+    def clone(self, **kwargs):
+        attributes = {'tokenizer', 'column', 'name', 'truncate', 'order', 'key', 'max_len'}
+        params = dict()
+        for attr in attributes:
+            params[attr] = kwargs[attr] if attr in kwargs else getattr(self, attr)
+
+        return Job(**params)
+
+    def __str__(self):
+        if self.key:
+            return f'Job({self.column} => {self.name}) [PK]'
+        return f'Job({self.column} => {self.name})'
+
+    def __repr__(self):
+        return str(self)
+
+    @property
+    def is_processed(self):
+        return self.order >= 0
+
+    def json(self):
+        column = str(Symbols.idx) if self.column is Symbols.idx else self.column
+        return {
+            'name': self.name,
+            'column': column,
+            'tokenizer': self.tokenizer.get_tokenizer_id(),
+            'truncate': self.truncate,
+            'order': self.order,
+            'key': self.key,
+            'max_len': self.max_len,
+        }
+
+    @staticmethod
+    def get_slice(truncate):
+        if truncate is None:
+            truncate = 0
+        if truncate > 0:
+            return slice(0, truncate)
+        if truncate < 0:
+            return slice(truncate, None)
+        return slice(None)
+
+
+class JobHub(Hub[Job]):
+    _instance = Instance(compulsory_space=True)
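The `get_slice` helper encodes the truncation rule every job applies to a token list: a positive `truncate` keeps the head, a negative one keeps the tail, and `None` (or `0`) keeps everything. A tiny self-contained illustration (the token ids below are made up):

```python
tokens = [3, 1, 4, 1, 5, 9, 2, 6]

assert tokens[slice(0, 3)] == [3, 1, 4]       # truncate = 3  -> keep the first 3 ids
assert tokens[slice(-3, None)] == [9, 2, 6]   # truncate = -3 -> keep the last 3 ids
assert tokens[slice(None)] == tokens          # truncate = None or 0 -> keep all ids
```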
UniTok-4.0.0/UniTok/meta.py (new file)

@@ -0,0 +1,136 @@
+import json
+import os
+from datetime import datetime
+
+from unitok.utils.verbose import warning
+from unitok.job import Job
+from unitok.tokenizer import TokenizerHub
+from unitok.tokenizer.union_tokenizer import UnionTokenizer
+from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
+from unitok.utils import Symbols
+from unitok.utils.handler import JsonHandler
+from unitok.utils.class_pool import ClassPool
+from unitok.utils.index_set import VocabSet, TokenizerSet, JobSet
+from unitok.vocabulary import Vocab, VocabHub
+
+
+class Meta:
+    version = 'unidep-v4beta'
+
+    def __init__(self):
+        self.note = ('Not compatible with unitok-v3 or lower version, '
+                     'please upgrade by `pip install unitok>4.0.0` to load the data.')
+        self.website = 'https://unitok.github.io'
+        self.modified_at = self.created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        self.vocabularies = VocabSet()
+        self.tokenizers = TokenizerSet()
+        self.jobs = JobSet()
+
+    @staticmethod
+    def parse_vocabulary(name: str, **kwargs):
+        return Vocab(name)
+
+    @staticmethod
+    def parse_tokenizer(tokenizer_id: str, classname: str, vocab: str, params: dict):
+        tokenizer_classes = ClassPool.tokenizers()
+
+        if not VocabHub.has(vocab):
+            raise ValueError(f"(unitok.meta) Vocabulary {vocab} not found in the vocabulary hub.")
+        vocab = VocabHub.get(vocab)
+
+        if (classname not in tokenizer_classes or
+                classname in [UnknownTokenizer.get_classname(), UnionTokenizer.get_classname()]):
+            warning(f"(unitok.meta) Tokenizer class {classname} not found in the class hub.")
+            return UnknownTokenizer(tokenizer_id=tokenizer_id, classname=classname, vocab=vocab, **params)
+        return tokenizer_classes[classname](tokenizer_id=tokenizer_id, vocab=vocab, **params)
+
+    @staticmethod
+    def parse_job(name: str, column: str, tokenizer: str, truncate: int, order: int, key: bool, max_len: int):
+        if not TokenizerHub.has(tokenizer):
+            raise ValueError(f"(unitok.meta) Tokenizer {tokenizer} not found in the tokenizer hub.")
+        tokenizer = TokenizerHub.get(tokenizer)
+
+        if column == str(Symbols.idx):
+            column = Symbols.idx
+
+        return Job(
+            name=name,
+            column=column,
+            tokenizer=tokenizer,
+            truncate=truncate,
+            order=order,
+            key=key,
+            max_len=max_len,
+        )
+
+    @staticmethod
+    def parse_version(version):
+        if version.startswith('unidep-v'):
+            return version[8:]
+
+        if version.startswith('UniDep-'):
+            raise ValueError(f'UniDep version ({version}) is not supported. '
+                             f'Please downgrade the unitok version by `pip install unitok==3.5.3`, '
+                             f'or use `unidep-upgrade-v4` to upgrade the version.')
+
+        raise ValueError(f'UniDep version ({version}) is not supported. '
+                         f'Please downgrade the unitok version by `pip install unitok==3.5.3` for compatible upgrade, '
+                         f'and then install the latest unitok version, '
+                         f'following the use of `unidep-upgrade-v4` to upgrade the version.')
+
+    @classmethod
+    def filename(cls, save_dir):
+        return os.path.join(save_dir, 'meta.json')
+
+    @classmethod
+    def _deprecated_filename(cls, save_dir):
+        return os.path.join(save_dir, 'meta.data.json')
+
+    @classmethod
+    def _compatible_readfile(cls, save_dir):
+        filename = cls.filename(save_dir)
+        if not os.path.exists(filename):
+            filename = cls._deprecated_filename(save_dir)
+            if not os.path.exists(filename):
+                raise FileNotFoundError(f"Meta file not found in {save_dir}")
+
+        meta_data = json.load(open(filename))
+
+        if 'version' not in meta_data:
+            raise ValueError(f"Version not found in the meta file {filename}")
+
+        current_version = cls.parse_version(cls.version)
+        depot_version = cls.parse_version(meta_data.get('version'))
+
+        if current_version != depot_version:
+            warning('Version mismatch, unexpected error may occur.')
+
+        return meta_data
+
+    @classmethod
+    def load(cls, save_dir):
+        kwargs = cls._compatible_readfile(save_dir)
+
+        meta = cls()
+        meta.created_at = kwargs.get('created_at')
+        meta.vocabularies = VocabSet({cls.parse_vocabulary(**v).load(save_dir) for v in kwargs.get('vocabularies')})
+        meta.tokenizers = TokenizerSet({cls.parse_tokenizer(**t) for t in kwargs.get('tokenizers')})
+        meta.jobs = JobSet({cls.parse_job(**j) for j in kwargs.get('jobs')})
+
+        return meta
+
+    def json(self):
+        return {
+            "version": self.version,
+            "note": self.note,
+            "website": self.website,
+            "created_at": self.created_at,
+            "modified_at": self.modified_at,
+            "vocabularies": [v.json() for v in self.vocabularies],
+            "tokenizers": [t.json() for t in self.tokenizers],
+            "jobs": [j.json() for j in self.jobs],
+        }
+
+    def save(self, save_dir):
+        filename = self.filename(save_dir)
+        JsonHandler.save(self.json(), filename)
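Reading `Meta.json()` together with `Job.json()` and `BaseTokenizer.json()` (shown later in `base_tokenizer.py`) gives the on-disk layout of `meta.json`. The sketch below only illustrates that shape; every concrete value is made up rather than taken from a real export.

```python
# Illustrative shape of <save_dir>/meta.json after Meta.save(); values are invented.
meta_json = {
    "version": "unidep-v4beta",  # Meta.version
    "note": "Not compatible with unitok-v3 or lower version, ...",
    "website": "https://unitok.github.io",
    "created_at": "2024-01-01 12:00:00",
    "modified_at": "2024-01-01 12:00:00",
    "vocabularies": [],  # Vocab.json() entries, re-hydrated through Meta.parse_vocabulary
    "tokenizers": [],    # BaseTokenizer.json() entries: tokenizer_id, vocab, classname, params
    "jobs": [],          # Job.json() entries: name, column, tokenizer, truncate, order, key, max_len
}
```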
UniTok-4.0.0/UniTok/status.py (new file)

@@ -0,0 +1,44 @@
+from unitok.utils import Symbols, Symbol
+
+
+class Status:
+    def __init__(self):
+        self.status = Symbols.initialized
+        # initialized
+        # tokenized
+        # organized
+
+    @staticmethod
+    def require_status(*status: Symbol):
+        status_string = '/'.join([s.name for s in status])
+
+        def decorator(func):
+            def wrapper(self, *args, **kwargs):
+                if self.status in status:
+                    return func(self, *args, **kwargs)
+                raise ValueError(f'UniTok should be in {status_string} status')
+
+            return wrapper
+
+        return decorator
+
+    require_initialized = require_status(Symbols.initialized)
+    require_tokenized = require_status(Symbols.tokenized)
+    require_organized = require_status(Symbols.organized)
+
+    require_not_initialized = require_status(Symbols.tokenized, Symbols.organized)
+    require_not_tokenized = require_status(Symbols.initialized, Symbols.organized)
+    require_not_organized = require_status(Symbols.initialized, Symbols.tokenized)
+
+    @staticmethod
+    def change_status(status: Symbol):
+        def decorator(func):
+            def wrapper(self, *args, **kwargs):
+                result = func(self, *args, **kwargs)
+                self.status = status
+                return result
+            return wrapper
+        return decorator
+
+    to_tokenized = change_status(Symbols.tokenized)
+    to_organized = change_status(Symbols.organized)
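The `require_*` and `to_*` attributes are ready-made decorators for any class that keeps a `status` field, which is how `unitok.py` is presumably guarded between its pipeline stages. A minimal sketch, assuming the top-level re-exports from `__init__.py`; `Pipeline` is a made-up class used only for illustration:

```python
from unitok import Status  # assumed re-export (see __init__.py above)


class Pipeline(Status):  # hypothetical subclass, not part of the package
    @Status.require_initialized  # only runs while status == Symbols.initialized
    @Status.to_tokenized         # flips status to Symbols.tokenized after a successful call
    def run(self):
        return 'tokenized'


p = Pipeline()
print(p.run())  # ok: initialized -> tokenized
# Calling p.run() again now raises ValueError('UniTok should be in initialized status')
```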
UniTok-4.0.0/UniTok/tokenizer/__init__.py (new file)

@@ -0,0 +1,18 @@
+from unitok.tokenizer.base_tokenizer import BaseTokenizer, TokenizerHub
+from unitok.tokenizer.entity_tokenizer import EntityTokenizer, EntitiesTokenizer
+from unitok.tokenizer.transformers_tokenizer import TransformersTokenizer, BertTokenizer
+from unitok.tokenizer.split_tokenizer import SplitTokenizer
+from unitok.tokenizer.digit_tokenizer import DigitTokenizer, DigitsTokenizer
+
+
+__all__ = [
+    BaseTokenizer,
+    EntityTokenizer,
+    EntitiesTokenizer,
+    TransformersTokenizer,
+    BertTokenizer,
+    SplitTokenizer,
+    DigitTokenizer,
+    DigitsTokenizer,
+    TokenizerHub
+]
UniTok-4.0.0/UniTok/tokenizer/base_tokenizer.py (new file)

@@ -0,0 +1,78 @@
+import abc
+from typing import Union
+
+from unitok.utils import Instance, function
+from unitok.utils.hub import Hub
+from unitok.vocabulary import Vocab, VocabHub
+
+
+class BaseTokenizer(abc.ABC):
+    return_list: bool
+    param_list: list
+
+    prefix = 'auto_'
+
+    def __init__(self, vocab: Union[str, Vocab], tokenizer_id: str = None, **kwargs):
+        if isinstance(vocab, str):
+            if VocabHub.has(vocab):
+                self.vocab = VocabHub.get(vocab)
+            else:
+                self.vocab = Vocab(name=vocab)
+        else:
+            self.vocab = vocab
+
+        self._tokenizer_id = tokenizer_id
+
+        TokenizerHub.add(self.get_tokenizer_id(), self)
+
+    def get_tokenizer_id(self):
+        if self._tokenizer_id is None:
+            self._tokenizer_id = self.prefix + function.get_random_string(length=6)
+        return self._tokenizer_id
+
+    @classmethod
+    def get_classname(cls):
+        # return cls.classname.lower().replace('tokenizer', '')
+        classname = cls.__name__.lower()
+        if not classname.endswith('tokenizer'):
+            raise ValueError(f'({classname}) Unexpected classname, expecting classname to end with "Tokenizer"')
+        return classname.replace('tokenizer', '')
+
+    def _convert_tokens_to_ids(self, tokens):
+        return_list = isinstance(tokens, list)
+        if return_list != self.return_list:
+            raise ValueError(f'(tokenizer.{self.get_classname()}) Unexpected input, requiring return_list={self.return_list}')
+
+        if not return_list:
+            tokens = [tokens]
+
+        ids = [self.vocab.append(token) for token in tokens]
+
+        if not return_list:
+            ids = ids[0]
+        return ids
+
+    def __call__(self, objs):
+        return self._convert_tokens_to_ids(objs)
+
+    def __str__(self):
+        return f'{self._detailed_classname}({self.get_tokenizer_id()}, vocab={self.vocab.name})'
+
+    def __repr__(self):
+        return str(self)
+
+    def json(self):
+        return {
+            'tokenizer_id': self.get_tokenizer_id(),
+            'vocab': self.vocab.name,
+            'classname': self.get_classname(),
+            'params': {param: getattr(self, param) for param in self.param_list},
+        }
+
+    @property
+    def _detailed_classname(self):
+        return self.__class__.__name__
+
+
+class TokenizerHub(Hub[BaseTokenizer]):
+    _instance = Instance()
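`BaseTokenizer` spells out the contract a custom tokenizer has to satisfy: a `return_list` flag, a `param_list` for serialization, a vocab resolved by name through `VocabHub`, and a `__call__` that funnels values through `_convert_tokens_to_ids`. A minimal sketch of a custom subclass follows; `LowerTokenizer` and the `title` vocabulary are invented here for illustration, and it assumes `Vocab.append` returns the id of the token, inserting it if new (as its use in `_convert_tokens_to_ids` suggests).

```python
from unitok import BaseTokenizer  # assumed top-level re-export (see __init__.py above)


class LowerTokenizer(BaseTokenizer):  # class name must end with "Tokenizer" for get_classname()
    return_list = False  # accepts a single value and returns a single id
    param_list = []      # no extra constructor parameters to record in json()

    def __call__(self, obj):
        return self._convert_tokens_to_ids(str(obj).lower())


tok = LowerTokenizer(vocab='title')  # vocab given by name; created if absent from VocabHub
print(tok('Hello') == tok('HELLO'))  # True: both normalize to 'hello', i.e. the same id
```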
UniTok-4.0.0/UniTok/tokenizer/digit_tokenizer.py (new file)

@@ -0,0 +1,33 @@
+from unitok.tokenizer import BaseTokenizer
+
+
+class DigitTokenizer(BaseTokenizer):
+    return_list = False
+    name = 'digit'
+    param_list = ['vocab_size']
+
+    def __init__(self, vocab_size: int = None, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        if self.vocab_size is not None:
+            self.vocab.extend([str(i) for i in range(vocab_size)])
+            self.vocab.deny_edit()
+
+    def __call__(self, obj):
+        obj = int(obj)
+        if obj >= len(self.vocab):
+            if self.vocab_size is not None:
+                raise ValueError(f'Vocabulary size is limited to {self.vocab_size}, but {obj} is given')
+            self.vocab.extend([str(i) for i in range(len(self.vocab), obj + 1)])
+        return obj
+
+
+class DigitsTokenizer(DigitTokenizer):
+    return_list = True
+    name = 'digits'
+
+    def __call__(self, obj):
+        obj = [int(o) for o in obj]
+        for o in obj:
+            super().__call__(o)