UniTok 3.4.8__tar.gz → 3.4.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-3.4.8 → UniTok-3.4.9}/PKG-INFO +11 -4
- {UniTok-3.4.8 → UniTok-3.4.9}/README.md +10 -3
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/meta.py +11 -4
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/unidep.py +38 -21
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok.egg-info/PKG-INFO +11 -4
- {UniTok-3.4.8 → UniTok-3.4.9}/setup.py +1 -1
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/__init__.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/analysis/__init__.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/analysis/lengths.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/analysis/plot.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/cols.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/column.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/fut.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/global_setting.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/__init__.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/bert_tok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/ent_tok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/id_tok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/number_tok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/seq_tok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/split_tok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/tok/tok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/unitok.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/vocab.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok/vocabs.py +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok.egg-info/SOURCES.txt +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-3.4.8 → UniTok-3.4.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: UniTok
|
3
|
-
Version: 3.4.8
|
3
|
+
Version: 3.4.9
|
4
4
|
Summary: Unified Tokenizer
|
5
5
|
Home-page: https://github.com/Jyonn/UnifiedTokenizer
|
6
6
|
Author: Jyonn Liu
|
@@ -10,18 +10,25 @@ Keywords: token,tokenizer
|
|
10
10
|
Platform: any
|
11
11
|
Description-Content-Type: text/markdown
|
12
12
|
|
13
|
-
# UniTok V3
|
13
|
+
# UniTok V3: 类SQL数据预处理工具包
|
14
|
+
|
15
|
+
Updated on 2023.11.04
|
14
16
|
|
15
17
|
## 1. 简介
|
16
18
|
|
17
|
-
UniTok
|
19
|
+
UniTok 是史上第一个类SQL的数据预处理工具包,提供了一整套的数据封装和编辑工具。
|
20
|
+
|
21
|
+
UniTok 主要包括两大组件:负责统一数据处理的`UniTok` 和 负责数据读取和二次编辑的`UniDep`:
|
22
|
+
- `UniTok` 通过分词器(Tokenizers)和数据列(Columns)等组件将生数据(Raw Data)进行分词与ID化操作,并最终以numpy数组格式存储为一张数据表。
|
23
|
+
- `UniDep` 读取由`UniTok`生成的数据表以及元数据(如词表信息),可以直接与Pytorch的Dataset结合使用,也可以完成二次编辑、和其他数据表合并、导出等操作。
|
24
|
+
- 在3.1.9版本后,我们推出`Fut` 组件,它是`UniTok`的替代品,可以更快速地完成数据预处理。
|
18
25
|
|
19
26
|
## 2. 安装
|
20
27
|
|
21
28
|
使用pip安装:
|
22
29
|
|
23
30
|
```bash
|
24
|
-
pip install unitok>=3.
|
31
|
+
pip install unitok>=3.4.8
|
25
32
|
```
|
26
33
|
|
27
34
|
## 3. 主要功能
|
@@ -1,15 +1,22 @@
|
|
1
|
-
# UniTok V3
|
1
|
+
# UniTok V3: 类SQL数据预处理工具包
|
2
|
+
|
3
|
+
Updated on 2023.11.04
|
2
4
|
|
3
5
|
## 1. 简介
|
4
6
|
|
5
|
-
UniTok
|
7
|
+
UniTok 是史上第一个类SQL的数据预处理工具包,提供了一整套的数据封装和编辑工具。
|
8
|
+
|
9
|
+
UniTok 主要包括两大组件:负责统一数据处理的`UniTok` 和 负责数据读取和二次编辑的`UniDep`:
|
10
|
+
- `UniTok` 通过分词器(Tokenizers)和数据列(Columns)等组件将生数据(Raw Data)进行分词与ID化操作,并最终以numpy数组格式存储为一张数据表。
|
11
|
+
- `UniDep` 读取由`UniTok`生成的数据表以及元数据(如词表信息),可以直接与Pytorch的Dataset结合使用,也可以完成二次编辑、和其他数据表合并、导出等操作。
|
12
|
+
- 在3.1.9版本后,我们推出`Fut` 组件,它是`UniTok`的替代品,可以更快速地完成数据预处理。
|
6
13
|
|
7
14
|
## 2. 安装
|
8
15
|
|
9
16
|
使用pip安装:
|
10
17
|
|
11
18
|
```bash
|
12
|
-
pip install unitok>=3.
|
19
|
+
pip install unitok>=3.4.8
|
13
20
|
```
|
14
21
|
|
15
22
|
## 3. 主要功能
|
@@ -4,9 +4,15 @@ import warnings
|
|
4
4
|
from typing import List, Union
|
5
5
|
|
6
6
|
|
7
|
-
class Col:
|
7
|
+
class Ent:
|
8
|
+
def __init__(self, name, **kwargs):
|
9
|
+
self.name = name
|
10
|
+
|
11
|
+
|
12
|
+
class Col(Ent):
|
8
13
|
def __init__(self, name, voc=None, max_length=None, padding=None, vocab=None):
|
9
|
-
|
14
|
+
super().__init__(name=name)
|
15
|
+
|
10
16
|
self.voc: Union[Voc, str] = voc or vocab
|
11
17
|
self.max_length = max_length
|
12
18
|
self.padding = padding
|
@@ -25,9 +31,10 @@ class Col:
|
|
25
31
|
return info
|
26
32
|
|
27
33
|
|
28
|
-
class Voc:
|
34
|
+
class Voc(Ent):
|
29
35
|
def __init__(self, name, size, cols, store_dir, vocab=None):
|
30
|
-
|
36
|
+
super().__init__(name=name)
|
37
|
+
|
31
38
|
self.size: int = size
|
32
39
|
self.cols: List[Union[Col, str]] = cols
|
33
40
|
self.store_dir = store_dir
|
@@ -3,7 +3,7 @@ import os
|
|
3
3
|
import random
|
4
4
|
import warnings
|
5
5
|
from collections import OrderedDict
|
6
|
-
from typing import Dict, List, Callable, Union, Optional
|
6
|
+
from typing import Dict, List, Callable, Union, Optional, cast
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
import tqdm
|
@@ -16,11 +16,20 @@ from .vocabs import Vocabs
|
|
16
16
|
class UniDep:
|
17
17
|
VER = Meta.VER
|
18
18
|
|
19
|
-
def __init__(self, store_dir, silent=
|
19
|
+
def __init__(self, store_dir, verbose=True, silent=None):
|
20
|
+
"""
|
21
|
+
Unified Data Depot Initialization
|
22
|
+
:param store_dir: Store directory of the data processed by our UniTok or Fut
|
23
|
+
:param verbose:
|
24
|
+
"""
|
20
25
|
self.store_dir = os.path.expanduser(store_dir)
|
21
26
|
self.meta = Meta(self.store_dir)
|
22
27
|
|
23
|
-
|
28
|
+
if silent is not None:
|
29
|
+
warnings.warn('unidep.silent is deprecated, '
|
30
|
+
'use verbose instead (will be removed in 4.x version)', DeprecationWarning)
|
31
|
+
verbose = not silent
|
32
|
+
self.verbose = verbose
|
24
33
|
|
25
34
|
self.cached = False
|
26
35
|
self.cached_samples = []
|
@@ -28,8 +37,8 @@ class UniDep:
|
|
28
37
|
self.data_path = os.path.join(self.store_dir, 'data.npy')
|
29
38
|
self.data = np.load(self.data_path, allow_pickle=True)
|
30
39
|
try:
|
31
|
-
|
32
|
-
self.data
|
40
|
+
self.data = self.data.item()
|
41
|
+
self.data = cast(dict, self.data)
|
33
42
|
except Exception as err:
|
34
43
|
print(err)
|
35
44
|
return
|
@@ -53,7 +62,7 @@ class UniDep:
|
|
53
62
|
self.vocabs.append(Vocab(name=vocab_name).load(self.store_dir))
|
54
63
|
for voc in self.vocs:
|
55
64
|
self.vocs[voc].vocab = self.vocabs[voc]
|
56
|
-
self.id2index = self.
|
65
|
+
self.id2index = self.id_voc.vocab.o2i
|
57
66
|
|
58
67
|
self.unions = OrderedDict() # type: Dict[str, List[UniDep]]
|
59
68
|
self._deep_union = False
|
@@ -74,7 +83,7 @@ class UniDep:
|
|
74
83
|
silent-aware printer
|
75
84
|
"""
|
76
85
|
|
77
|
-
if self.silent:
|
86
|
+
if self.verbose:
|
78
87
|
return
|
79
88
|
print(*args, **kwargs)
|
80
89
|
|
@@ -111,7 +120,7 @@ class UniDep:
|
|
111
120
|
|
112
121
|
self.cached = False
|
113
122
|
self.cached_samples = [None] * self._sample_size
|
114
|
-
for sample in tqdm.tqdm(self, disable=self.silent):
|
123
|
+
for sample in tqdm.tqdm(self, disable=self.verbose):
|
115
124
|
self.cached_samples[sample[self.id_col]] = sample
|
116
125
|
self.cached = True
|
117
126
|
|
@@ -139,7 +148,7 @@ class UniDep:
|
|
139
148
|
"""
|
140
149
|
introduction = f"""
|
141
150
|
UniDep ({self.meta.parse_version(self.meta.version)}): {self.store_dir}
|
142
|
-
|
151
|
+
|
143
152
|
Sample Size: {self.sample_size}
|
144
153
|
Id Column: {self.id_col}
|
145
154
|
Columns:\n"""
|
@@ -187,18 +196,26 @@ class UniDep:
|
|
187
196
|
raise ValueError('deep_union can not be changed after union-ed')
|
188
197
|
self._deep_union = value
|
189
198
|
|
190
|
-
def union(self, *depots: 'UniDep'):
|
199
|
+
def union(self, *depots: 'UniDep', union_col: str = None):
|
191
200
|
"""
|
192
201
|
union depots, where id columns in other depots must exist in current main depot
|
193
202
|
"""
|
203
|
+
if union_col and union_col not in self.cols:
|
204
|
+
raise ValueError(f'current depot has no column named {union_col}')
|
205
|
+
|
194
206
|
for depot in depots:
|
195
207
|
# check if id col exists in current depot
|
196
|
-
if
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
self.
|
201
|
-
|
208
|
+
if not union_col:
|
209
|
+
assert depot.id_col in self.cols, (
|
210
|
+
ValueError(f'current depot has no column named {union_col}'))
|
211
|
+
else:
|
212
|
+
assert self.cols[union_col].voc == depot.cols[depot.id_col].voc, (
|
213
|
+
ValueError(f'the vocabs of union col {union_col} and target id col {depot.id_col} are not matched'))
|
214
|
+
current_union_col = union_col or depot.id_col
|
215
|
+
|
216
|
+
if current_union_col not in self.unions:
|
217
|
+
self.unions[current_union_col] = []
|
218
|
+
self.unions[current_union_col].append(depot)
|
202
219
|
|
203
220
|
self.cols = self._merge_cols(self.cols, depot.cols)
|
204
221
|
self.vocs = self._merge_vocs(self.vocs, depot.vocs)
|
@@ -210,7 +227,7 @@ class UniDep:
|
|
210
227
|
|
211
228
|
columns = {col_name: [] for col_name in depot.cols}
|
212
229
|
|
213
|
-
for index in self.data[
|
230
|
+
for index in self.data[current_union_col]:
|
214
231
|
for col_name in columns:
|
215
232
|
columns[col_name].append(depot.data[col_name][index])
|
216
233
|
|
@@ -295,7 +312,7 @@ class UniDep:
|
|
295
312
|
"""
|
296
313
|
visible_indexes = []
|
297
314
|
|
298
|
-
for sample in tqdm.tqdm(self, disable=self.silent):
|
315
|
+
for sample in tqdm.tqdm(self, disable=self.verbose):
|
299
316
|
target = sample if col is None else sample[col]
|
300
317
|
if filter_func(target):
|
301
318
|
visible_indexes.append(sample[self.id_col])
|
@@ -333,7 +350,7 @@ class UniDep:
|
|
333
350
|
for voc in self.vocabs:
|
334
351
|
self.vocabs[voc].save(store_dir)
|
335
352
|
|
336
|
-
for sample in tqdm.tqdm(self, disable=self.silent):
|
353
|
+
for sample in tqdm.tqdm(self, disable=self.verbose):
|
337
354
|
for col_name in sample:
|
338
355
|
if col_name not in data:
|
339
356
|
data[col_name] = []
|
@@ -341,7 +358,7 @@ class UniDep:
|
|
341
358
|
|
342
359
|
for col_name in data:
|
343
360
|
data[col_name] = np.array(data[col_name])
|
344
|
-
np.save(os.path.join(store_dir, 'data.npy'), data, allow_pickle=True)
|
361
|
+
np.save(os.path.join(store_dir, 'data.npy'), cast(data, np.ndarray), allow_pickle=True)
|
345
362
|
|
346
363
|
meta_data = self.meta.get_info()
|
347
364
|
json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
|
@@ -373,7 +390,7 @@ class UniDep:
|
|
373
390
|
|
374
391
|
@staticmethod
|
375
392
|
def _get_max_length(values):
|
376
|
-
if isinstance(values[0], list):
|
393
|
+
if isinstance(values[0], list) or isinstance(values[0], np.ndarray):
|
377
394
|
return max([len(value) for value in values])
|
378
395
|
return None
|
379
396
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: UniTok
|
3
|
-
Version: 3.4.8
|
3
|
+
Version: 3.4.9
|
4
4
|
Summary: Unified Tokenizer
|
5
5
|
Home-page: https://github.com/Jyonn/UnifiedTokenizer
|
6
6
|
Author: Jyonn Liu
|
@@ -10,18 +10,25 @@ Keywords: token,tokenizer
|
|
10
10
|
Platform: any
|
11
11
|
Description-Content-Type: text/markdown
|
12
12
|
|
13
|
-
# UniTok V3
|
13
|
+
# UniTok V3: 类SQL数据预处理工具包
|
14
|
+
|
15
|
+
Updated on 2023.11.04
|
14
16
|
|
15
17
|
## 1. 简介
|
16
18
|
|
17
|
-
UniTok
|
19
|
+
UniTok 是史上第一个类SQL的数据预处理工具包,提供了一整套的数据封装和编辑工具。
|
20
|
+
|
21
|
+
UniTok 主要包括两大组件:负责统一数据处理的`UniTok` 和 负责数据读取和二次编辑的`UniDep`:
|
22
|
+
- `UniTok` 通过分词器(Tokenizers)和数据列(Columns)等组件将生数据(Raw Data)进行分词与ID化操作,并最终以numpy数组格式存储为一张数据表。
|
23
|
+
- `UniDep` 读取由`UniTok`生成的数据表以及元数据(如词表信息),可以直接与Pytorch的Dataset结合使用,也可以完成二次编辑、和其他数据表合并、导出等操作。
|
24
|
+
- 在3.1.9版本后,我们推出`Fut` 组件,它是`UniTok`的替代品,可以更快速地完成数据预处理。
|
18
25
|
|
19
26
|
## 2. 安装
|
20
27
|
|
21
28
|
使用pip安装:
|
22
29
|
|
23
30
|
```bash
|
24
|
-
pip install unitok>=3.
|
31
|
+
pip install unitok>=3.4.8
|
25
32
|
```
|
26
33
|
|
27
34
|
## 3. 主要功能
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|