UniTok 3.0.3a0__tar.gz → 3.0.3b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/PKG-INFO +1 -1
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/unidep.py +12 -11
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/unitok.py +7 -3
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/vocab.py +24 -13
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/vocabs.py +6 -3
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/PKG-INFO +1 -1
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/SOURCES.txt +0 -2
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/setup.py +1 -1
- UniTok-3.0.3a0/UniTok/compatible/__init__.py +0 -21
- UniTok-3.0.3a0/UniTok/compatible/uni_warnings.py +0 -70
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/README.md +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/__init__.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/analysis/__init__.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/analysis/lengths.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/analysis/plot.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/cols.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/column.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/global_setting.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/meta.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/__init__.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/bert_tok.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/entity_tok.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/id_tok.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/number_tok.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/seq_tok.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/split_tok.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/tok.py +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-3.0.3a0 → UniTok-3.0.3b0}/setup.cfg +0 -0
@@ -1,14 +1,12 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
3
|
import random
|
4
|
+
import warnings
|
4
5
|
from typing import Dict, List, Callable
|
5
6
|
|
6
7
|
import numpy as np
|
7
8
|
import tqdm
|
8
9
|
|
9
|
-
from .compatible.uni_warnings import MetaDataDeprecationWarning, VocabInfoDeprecationWarning, \
|
10
|
-
ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, GetVocabDeprecationWarning, \
|
11
|
-
GetVocabSizeDeprecationWarning, IsListColDeprecationWarning, ShuffleDeprecationWarning
|
12
10
|
from .meta import Meta, Col, Voc
|
13
11
|
from .vocab import Vocab
|
14
12
|
from .vocabs import Vocabs
|
@@ -178,38 +176,41 @@ class UniDep:
|
|
178
176
|
|
179
177
|
@property
|
180
178
|
def meta_data(self):
|
181
|
-
|
179
|
+
warnings.warn('meta_data is deprecated, '
|
180
|
+
'use meta instead (will be removed in 4.x version)', DeprecationWarning)
|
182
181
|
return self.meta
|
183
182
|
|
184
183
|
@property
|
185
184
|
def vocab_info(self):
|
186
|
-
|
185
|
+
warnings.warn('vocab_info is deprecated, '
|
186
|
+
'use vocs instead (will be removed in 4.x version)', DeprecationWarning)
|
187
187
|
return self.vocs
|
188
188
|
|
189
189
|
@property
|
190
190
|
def col_info(self):
|
191
|
-
|
191
|
+
warnings.warn('col_info is deprecated, '
|
192
|
+
'use cols instead (will be removed in 4.x version)', DeprecationWarning)
|
192
193
|
return self.cols
|
193
194
|
|
194
195
|
def get_vocab_size(self, col_name, as_vocab=False):
|
195
|
-
|
196
|
+
warnings.warn('unidep.get_vocab_size is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
196
197
|
vocab_id = col_name if as_vocab else self.get_vocab(col_name)
|
197
198
|
return self.vocs[vocab_id].size
|
198
199
|
|
199
200
|
def get_vocab(self, col_name):
|
200
|
-
|
201
|
+
warnings.warn('unidep.get_vocab is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
201
202
|
return self.cols[col_name].voc.name
|
202
203
|
|
203
204
|
def get_max_length(self, col_name):
|
204
|
-
|
205
|
+
warnings.warn('unidep.get_max_length is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
205
206
|
return self.cols[col_name].max_length
|
206
207
|
|
207
208
|
def is_list_col(self, col_name):
|
208
|
-
|
209
|
+
warnings.warn('unidep.is_list_col is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
209
210
|
return self.cols[col_name].list
|
210
211
|
|
211
212
|
def shuffle(self, shuffle=True):
|
212
|
-
|
213
|
+
warnings.warn('unidep.shuffle is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
213
214
|
if shuffle:
|
214
215
|
random.shuffle(self._indexes)
|
215
216
|
else:
|
@@ -1,12 +1,12 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
|
+
import warnings
|
3
4
|
from typing import Optional
|
4
5
|
|
5
6
|
import numpy as np
|
6
7
|
import pandas as pd
|
7
8
|
|
8
9
|
from .cols import Cols
|
9
|
-
from .compatible.uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning
|
10
10
|
from .column import Column, IndexColumn
|
11
11
|
from .tok.bert_tok import BertTok
|
12
12
|
from .tok.entity_tok import EntTok
|
@@ -29,7 +29,8 @@ class UniTok:
|
|
29
29
|
|
30
30
|
@property
|
31
31
|
def vocab_depots(self):
|
32
|
-
|
32
|
+
warnings.warn('vocab_depot is deprecated, '
|
33
|
+
'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
|
33
34
|
return self.vocabs
|
34
35
|
|
35
36
|
def add_col(self, col: Column):
|
@@ -110,6 +111,9 @@ class UniTok:
|
|
110
111
|
if not self.id_col:
|
111
112
|
raise ValueError('id column is not set')
|
112
113
|
|
114
|
+
for vocab in self.vocabs.values():
|
115
|
+
vocab.set_count_mode(False)
|
116
|
+
|
113
117
|
for col_name in self.cols:
|
114
118
|
print('[ COL:', col_name, ']')
|
115
119
|
col = self.cols[col_name] # type: Column
|
@@ -121,7 +125,7 @@ class UniTok:
|
|
121
125
|
"""
|
122
126
|
Get the store path of the tokenizer of a column
|
123
127
|
"""
|
124
|
-
|
128
|
+
warnings.warn('unitok.get_tok_path is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
125
129
|
return self.cols[col_name].tok.vocab.get_store_path(store_dir)
|
126
130
|
|
127
131
|
def store_data(self, store_dir):
|
@@ -1,12 +1,10 @@
|
|
1
1
|
import math
|
2
2
|
import os
|
3
|
+
import warnings
|
3
4
|
from typing import Union, List
|
4
5
|
|
5
6
|
import numpy as np
|
6
7
|
|
7
|
-
from UniTok.compatible.uni_warnings import VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, \
|
8
|
-
MinFrequencyDeprecationWarning
|
9
|
-
|
10
8
|
|
11
9
|
class VocabMap(dict):
|
12
10
|
def __call__(self, *args, **kwargs):
|
@@ -29,6 +27,7 @@ class Vocab:
|
|
29
27
|
|
30
28
|
self._editable = True # whether vocab is editable
|
31
29
|
self._oov_token = None # out of vocabulary token
|
30
|
+
self._stable_mode = False
|
32
31
|
|
33
32
|
self._count_mode = False # whether count mode is on
|
34
33
|
self._counter = {} # counter for counting occurrence of each token
|
@@ -48,7 +47,8 @@ class Vocab:
|
|
48
47
|
"""
|
49
48
|
Deprecated, use o2i instead
|
50
49
|
"""
|
51
|
-
|
50
|
+
warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
|
51
|
+
'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
|
52
52
|
return self.o2i
|
53
53
|
|
54
54
|
@property
|
@@ -56,7 +56,8 @@ class Vocab:
|
|
56
56
|
"""
|
57
57
|
Deprecated, use i2o instead
|
58
58
|
"""
|
59
|
-
|
59
|
+
warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
|
60
|
+
'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
|
60
61
|
return self.i2o
|
61
62
|
|
62
63
|
def extend(self, objs):
|
@@ -80,8 +81,10 @@ class Vocab:
|
|
80
81
|
if obj in self.o2i:
|
81
82
|
return self.o2i[obj]
|
82
83
|
|
83
|
-
if self.
|
84
|
-
|
84
|
+
if self._stable_mode:
|
85
|
+
if self._oov_token is not None:
|
86
|
+
return self._oov_token
|
87
|
+
return -1
|
85
88
|
|
86
89
|
if not self._editable:
|
87
90
|
if self._oov_token is not None:
|
@@ -127,7 +130,8 @@ class Vocab:
|
|
127
130
|
|
128
131
|
@property
|
129
132
|
def oov_default(self):
|
130
|
-
|
133
|
+
warnings.warn('vocab.oov_default is deprecated, '
|
134
|
+
'use vocab.oov_token instead (will be removed in 4.x version)', DeprecationWarning)
|
131
135
|
return self._oov_token
|
132
136
|
|
133
137
|
def allow_edit(self):
|
@@ -189,7 +193,8 @@ class Vocab:
|
|
189
193
|
:return:
|
190
194
|
"""
|
191
195
|
if min_count is None:
|
192
|
-
|
196
|
+
warnings.warn('vocab.min_frequency is deprecated, '
|
197
|
+
'use vocab.min_count instead (will be removed in 4.x version)', DeprecationWarning)
|
193
198
|
min_count = min_frequency
|
194
199
|
|
195
200
|
vocabs = []
|
@@ -205,7 +210,7 @@ class Vocab:
|
|
205
210
|
self.reserve(self.reserved_tokens)
|
206
211
|
self.extend(vocabs)
|
207
212
|
|
208
|
-
|
213
|
+
self._stable_mode = True
|
209
214
|
return self
|
210
215
|
|
211
216
|
def summarize(self, base=10):
|
@@ -258,6 +263,12 @@ class Vocab:
|
|
258
263
|
|
259
264
|
def __getattr__(self, item):
|
260
265
|
if item in ['frequency_mode', 'frequency', 'max_frequency', 'frequent_vocab']:
|
261
|
-
|
262
|
-
|
263
|
-
|
266
|
+
raise AttributeError(f'{item} is deprecated after UniTok 3.0, '
|
267
|
+
f'degrade to 2.4.3.2 or lower to use it, '
|
268
|
+
f'or check new features of Vocab class')
|
269
|
+
|
270
|
+
@property
|
271
|
+
def trim_vocab(self):
|
272
|
+
warnings.warn('vocab.trim_vocab is deprecated, '
|
273
|
+
'use vocab.trim instead (will be removed in 4.x version)', DeprecationWarning)
|
274
|
+
return self.trim
|
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
import warnings
|
2
|
+
|
2
3
|
from .vocab import Vocab
|
3
4
|
|
4
5
|
|
@@ -9,12 +10,14 @@ class Vocabs(dict):
|
|
9
10
|
|
10
11
|
@property
|
11
12
|
def col_map(self):
|
12
|
-
|
13
|
+
warnings.warn('vocab_depot.col_map is deprecated, '
|
14
|
+
'use vocabs.cols instead (will be removed in 4.x version)', DeprecationWarning)
|
13
15
|
return self.cols
|
14
16
|
|
15
17
|
@property
|
16
18
|
def depots(self):
|
17
|
-
|
19
|
+
warnings.warn('vocab_depot.depots is deprecated, '
|
20
|
+
'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
|
18
21
|
return self
|
19
22
|
|
20
23
|
def append(self, col_or_vocab):
|
@@ -17,8 +17,6 @@ UniTok.egg-info/top_level.txt
|
|
17
17
|
UniTok/analysis/__init__.py
|
18
18
|
UniTok/analysis/lengths.py
|
19
19
|
UniTok/analysis/plot.py
|
20
|
-
UniTok/compatible/__init__.py
|
21
|
-
UniTok/compatible/uni_warnings.py
|
22
20
|
UniTok/tok/__init__.py
|
23
21
|
UniTok/tok/bert_tok.py
|
24
22
|
UniTok/tok/entity_tok.py
|
@@ -1,21 +0,0 @@
|
|
1
|
-
from .uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning, \
|
2
|
-
VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, MinFrequencyDeprecationWarning, \
|
3
|
-
ColMapDeprecationWarning, DepotsDeprecationWarning, MetaDataDeprecationWarning, \
|
4
|
-
VocabInfoDeprecationWarning, ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, \
|
5
|
-
GetVocabDeprecationWarning
|
6
|
-
|
7
|
-
|
8
|
-
__all__ = [
|
9
|
-
'VocabDepotDeprecationWarning',
|
10
|
-
'GetTokPathDeprecationWarning',
|
11
|
-
'VocabMapDeprecationWarning',
|
12
|
-
'OOVDefaultDeprecationWarning',
|
13
|
-
'MinFrequencyDeprecationWarning',
|
14
|
-
'ColMapDeprecationWarning',
|
15
|
-
'DepotsDeprecationWarning',
|
16
|
-
'MetaDataDeprecationWarning',
|
17
|
-
'VocabInfoDeprecationWarning',
|
18
|
-
'ColInfoDeprecationWarning',
|
19
|
-
'GetMaxLengthDeprecationWarning',
|
20
|
-
'GetVocabDeprecationWarning',
|
21
|
-
]
|
@@ -1,70 +0,0 @@
|
|
1
|
-
import warnings
|
2
|
-
from typing import Callable
|
3
|
-
|
4
|
-
warned_flags = set()
|
5
|
-
|
6
|
-
|
7
|
-
class UniWarning:
|
8
|
-
def __init__(self, msg, type_: Callable = warnings.warn):
|
9
|
-
self.msg = msg
|
10
|
-
self.type = type_
|
11
|
-
|
12
|
-
def __call__(self, *args, **kwargs):
|
13
|
-
if self not in warned_flags:
|
14
|
-
warned_flags.add(self)
|
15
|
-
self.type(self.msg.format(*args, **kwargs))
|
16
|
-
|
17
|
-
|
18
|
-
VocabMapDeprecationWarning = UniWarning(
|
19
|
-
'vocab.index2obj and vocab.obj2index are deprecated, '
|
20
|
-
'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
21
|
-
|
22
|
-
OOVDefaultDeprecationWarning = UniWarning(
|
23
|
-
'vocab.oov_default is deprecated, '
|
24
|
-
'use vocab.oov_token instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
25
|
-
|
26
|
-
MinFrequencyDeprecationWarning = UniWarning(
|
27
|
-
'vocab.min_frequency is deprecated, '
|
28
|
-
'use vocab.min_count instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
29
|
-
|
30
|
-
VocabDepotDeprecationWarning = UniWarning(
|
31
|
-
'vocab_depot is deprecated, '
|
32
|
-
'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
33
|
-
|
34
|
-
ColMapDeprecationWarning = UniWarning(
|
35
|
-
'vocab_depot.col_map is deprecated, '
|
36
|
-
'use vocabs.cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
37
|
-
|
38
|
-
GetTokPathDeprecationWarning = UniWarning(
|
39
|
-
'unitok.get_tok_path is deprecated (will be removed in 4.x version)', type_=DeprecationWarning)
|
40
|
-
|
41
|
-
DepotsDeprecationWarning = UniWarning(
|
42
|
-
'vocab_depot.depots is deprecated, '
|
43
|
-
'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
44
|
-
|
45
|
-
MetaDataDeprecationWarning = UniWarning(
|
46
|
-
'meta_data is deprecated, '
|
47
|
-
'use meta instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
48
|
-
|
49
|
-
VocabInfoDeprecationWarning = UniWarning(
|
50
|
-
'vocab_info is deprecated, '
|
51
|
-
'use vocs instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
52
|
-
|
53
|
-
ColInfoDeprecationWarning = UniWarning(
|
54
|
-
'col_info is deprecated, '
|
55
|
-
'use cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
56
|
-
|
57
|
-
GetMaxLengthDeprecationWarning = UniWarning(
|
58
|
-
'unidep.get_max_length is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
59
|
-
|
60
|
-
GetVocabDeprecationWarning = UniWarning(
|
61
|
-
'unidep.get_vocab is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
62
|
-
|
63
|
-
GetVocabSizeDeprecationWarning = UniWarning(
|
64
|
-
'unidep.get_vocab_size is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
65
|
-
|
66
|
-
IsListColDeprecationWarning = UniWarning(
|
67
|
-
'unidep.is_list_col is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
68
|
-
|
69
|
-
ShuffleDeprecationWarning = UniWarning(
|
70
|
-
'unidep.shuffle is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|