UniTok 3.0.4a0__tar.gz → 3.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-3.0.4a0 → UniTok-3.0.5}/PKG-INFO +1 -1
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/column.py +13 -1
- UniTok-3.0.5/UniTok/meta.py +108 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/number_tok.py +13 -5
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/unidep.py +13 -12
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/unitok.py +8 -4
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/vocab.py +18 -16
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/vocabs.py +11 -5
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok.egg-info/PKG-INFO +1 -1
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok.egg-info/SOURCES.txt +0 -2
- {UniTok-3.0.4a0 → UniTok-3.0.5}/setup.py +1 -1
- UniTok-3.0.4a0/UniTok/compatible/__init__.py +0 -21
- UniTok-3.0.4a0/UniTok/compatible/uni_warnings.py +0 -70
- UniTok-3.0.4a0/UniTok/meta.py +0 -57
- {UniTok-3.0.4a0 → UniTok-3.0.5}/README.md +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/__init__.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/analysis/__init__.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/analysis/lengths.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/analysis/plot.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/cols.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/global_setting.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/__init__.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/bert_tok.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/entity_tok.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/id_tok.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/seq_tok.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/split_tok.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok/tok/tok.py +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-3.0.4a0 → UniTok-3.0.5}/setup.cfg +0 -0
@@ -42,10 +42,22 @@ class Column:
|
|
42
42
|
tok (BaseTok): The tokenizer of the column.
|
43
43
|
operator (SeqOperator): The operator of the column.
|
44
44
|
"""
|
45
|
-
def __init__(self, name, tok: BaseTok, operator: SeqOperator = None):
|
45
|
+
def __init__(self, name, tok: BaseTok, operator: SeqOperator = None, **kwargs):
|
46
46
|
self.name = name
|
47
47
|
self.tok = tok
|
48
48
|
self.operator = operator
|
49
|
+
|
50
|
+
if kwargs:
|
51
|
+
if operator:
|
52
|
+
raise ValueError('operator and kwargs cannot be set at the same time')
|
53
|
+
self.operator = SeqOperator(
|
54
|
+
max_length=kwargs.get('max_length', 0),
|
55
|
+
padding=kwargs.get('padding', False),
|
56
|
+
slice_post=kwargs.get('slice_post', False),
|
57
|
+
pad_pre=kwargs.get('pad_pre', False),
|
58
|
+
pad=kwargs.get('pad', 0),
|
59
|
+
)
|
60
|
+
|
49
61
|
self.list = bool(operator) or tok.return_list is True # whether the column is a list-element column
|
50
62
|
|
51
63
|
self.data = []
|
@@ -0,0 +1,108 @@
|
|
1
|
+
import json
|
2
|
+
import os
|
3
|
+
import warnings
|
4
|
+
from typing import List
|
5
|
+
|
6
|
+
|
7
|
+
class Col:
    """Metadata describing a single column of a depot.

    Args:
        name: Column name.
        voc: Vocabulary of the column. ``Meta`` first passes the vocabulary
            name read from the meta file and afterwards replaces the
            attribute with the matching ``Voc`` object.
        max_length: Maximum sequence length; ``None`` marks a non-list column.
        padding: Padding setting recorded for list columns.
        vocab: Alternative keyword for ``voc``; only used when ``voc`` is falsy.
    """

    def __init__(self, name, voc=None, max_length=None, padding=None, vocab=None):
        self.name: str = name
        self.voc: 'Voc' = voc or vocab
        self.max_length = max_length
        self.padding = padding
        # a column is a list column exactly when a max length was recorded
        self.list = max_length is not None

    def __eq__(self, other):
        """Compare by column name, vocabulary name and max length.

        Returns ``NotImplemented`` for objects that are not ``Col`` instances
        so that ``col == something_else`` evaluates to ``False`` instead of
        raising ``AttributeError``.
        """
        if not isinstance(other, Col):
            return NotImplemented
        return (
            self.name == other.name
            and self.voc.name == other.voc.name
            and self.max_length == other.max_length
        )

    def get_info(self):
        """Return the serialisable description of this column.

        ``max_length`` and ``padding`` are only included for list columns.
        """
        info = {
            'voc': self.voc.name,
        }
        if self.list:
            info['max_length'] = self.max_length
            info['padding'] = self.padding
        return info
|
26
|
+
|
27
|
+
|
28
|
+
class Voc:
    """Metadata describing a single vocabulary of a depot.

    Args:
        name: Vocabulary name.
        size: Number of tokens in the vocabulary.
        cols: Columns using this vocabulary. ``Meta`` first passes the column
            names read from the meta file and afterwards replaces them with
            the matching ``Col`` objects.
    """

    def __init__(self, name, size, cols):
        self.name: str = name
        self.size: int = size
        self.cols: List['Col'] = cols

    def __eq__(self, other):
        """Compare by vocabulary name and size.

        Returns ``NotImplemented`` for objects that are not ``Voc`` instances
        so that ``voc == something_else`` evaluates to ``False`` instead of
        raising ``AttributeError``.
        """
        if not isinstance(other, Voc):
            return NotImplemented
        return self.name == other.name and self.size == other.size

    def get_info(self):
        """Return the serialisable description: size plus the column names."""
        return {
            'size': self.size,
            'cols': [col.name for col in self.cols]
        }
|
42
|
+
|
43
|
+
|
44
|
+
class Meta:
    """Depot metadata loaded from ``meta.data.json`` inside ``store_dir``.

    On construction the file is read, the column/vocabulary descriptions are
    turned into ``Col`` / ``Voc`` objects that reference each other, and the
    stored meta version is checked against ``Meta.VER``.

    Args:
        store_dir: Directory that contains ``meta.data.json``.
    """

    VER = 'UniDep-2.0'

    def __init__(self, store_dir):
        self.store_dir = store_dir
        self.path = os.path.join(self.store_dir, 'meta.data.json')

        data = self.load()
        self.version = data['version']
        # 'col_info' / 'vocab_info' are the fallback keys when 'cols' / 'vocs' are absent
        self.cols = data.get('cols') or data['col_info']
        self.vocs = data.get('vocs') or data['vocab_info']
        self.id_col = data['id_col']

        # build col-voc graph
        self.cols = {col: Col(**self.cols[col], name=col) for col in self.cols}
        self.vocs = {voc: Voc(**self.vocs[voc], name=voc) for voc in self.vocs}

        # connect class objects: replace the names by the objects they refer to
        for col in self.cols.values():
            col.voc = self.vocs[col.voc]
        for voc in self.vocs.values():
            voc.cols = [self.cols[col] for col in voc.cols]

        self.version_check()

    @staticmethod
    def parse_version(version):
        """Strip the ``UniDep-`` prefix; prefix-less versions become ``0.<version>``."""
        if version.startswith('UniDep-'):
            return version[7:]
        return f'0.{version}'

    @staticmethod
    def _version_key(version):
        """Turn a parsed version string into a tuple that orders numerically.

        Plain string comparison would rank ``'10.0'`` below ``'2.0'``. Numeric
        parts are compared as integers; non-numeric parts sort after numeric
        ones and are compared as text, so mixed versions never raise.
        """
        return tuple(
            (0, int(part)) if part.isdecimal() else (1, part)
            for part in version.split('.')
        )

    def get_info(self):
        """Return the serialisable meta description, stamped with ``Meta.VER``."""
        return {
            'version': Meta.VER,
            'id_col': self.id_col,
            'cols': {col.name: col.get_info() for col in self.cols.values()},
            'vocs': {voc.name: voc.get_info() for voc in self.vocs.values()}
        }

    def load(self) -> dict:
        """Read and return the JSON content of ``self.path``."""
        with open(self.path) as f:
            return json.load(f)

    def save(self):
        """Write ``get_info()`` to ``meta.data.json`` inside ``store_dir``."""
        with open(os.path.join(self.store_dir, 'meta.data.json'), 'w') as f:
            json.dump(self.get_info(), f, indent=2)

    def version_check(self):
        """Warn on a meta version mismatch and offer to upgrade an older depot.

        A warning is emitted whenever the depot version differs from
        ``Meta.VER``. Only when the depot is older is the user asked whether
        the meta file should be rewritten; the previous file is kept next to
        it with a ``.bak`` suffix.
        """
        current_version = self.parse_version(Meta.VER)
        depot_version = self.parse_version(self.version)

        if current_version != depot_version:
            warnings.warn(
                f'meta version of depot ({self.store_dir}) mismatch, '
                f'current version: {current_version}, '
                f'depot version: {depot_version}. '
                f'It may cause unexpected error.')

        # compare numerically; a depot that is as new or newer is never rewritten
        if self._version_key(current_version) <= self._version_key(depot_version):
            return

        try:
            command = input('Press Y to upgrade meta data for future use (Y/n): ')
        except EOFError:
            # stdin is closed, nobody can answer: leave the depot untouched
            return

        if command.lower() == 'y':
            os.rename(self.path, self.path + '.bak')
            print('Old meta data backed up to {}.'.format(self.path + '.bak'))
            self.save()
            print('Meta data upgraded.')
|
@@ -1,3 +1,5 @@
|
|
1
|
+
from typing import Iterable
|
2
|
+
|
1
3
|
from .tok import BaseTok
|
2
4
|
|
3
5
|
|
@@ -28,9 +30,15 @@ class NumberTok(BaseTok):
|
|
28
30
|
self.vocab.extend([str(i) for i in range(vocab_size)])
|
29
31
|
|
30
32
|
def t(self, obj):
|
31
|
-
|
32
|
-
if obj
|
33
|
-
|
34
|
-
|
35
|
-
|
33
|
+
# check is iterable
|
34
|
+
if isinstance(obj, Iterable) and not isinstance(obj, str):
|
35
|
+
obj = [int(o) for o in obj]
|
36
|
+
else:
|
37
|
+
obj = int(obj)
|
38
|
+
objs = [obj] if isinstance(obj, int) else obj
|
39
|
+
for o in objs:
|
40
|
+
if o >= len(self.vocab):
|
41
|
+
if self.vocab_size is not None:
|
42
|
+
raise ValueError('vocab_size is {}, but {} is given'.format(self.vocab_size, o))
|
43
|
+
self.vocab.extend([str(i) for i in range(len(self.vocab), o + 1)])
|
36
44
|
return obj
|
@@ -1,14 +1,12 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
3
|
import random
|
4
|
+
import warnings
|
4
5
|
from typing import Dict, List, Callable
|
5
6
|
|
6
7
|
import numpy as np
|
7
8
|
import tqdm
|
8
9
|
|
9
|
-
from .compatible.uni_warnings import MetaDataDeprecationWarning, VocabInfoDeprecationWarning, \
|
10
|
-
ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, GetVocabDeprecationWarning, \
|
11
|
-
GetVocabSizeDeprecationWarning, IsListColDeprecationWarning, ShuffleDeprecationWarning
|
12
10
|
from .meta import Meta, Col, Voc
|
13
11
|
from .vocab import Vocab
|
14
12
|
from .vocabs import Vocabs
|
@@ -25,7 +23,7 @@ class UniDep:
|
|
25
23
|
self.cached_samples = []
|
26
24
|
|
27
25
|
self.meta_path = os.path.join(self.store_dir, 'meta.data.json')
|
28
|
-
self.meta = Meta(**json.load(open(self.meta_path)))
|
26
|
+
self.meta = Meta(**json.load(open(self.meta_path)), store_dir=self.store_dir)
|
29
27
|
|
30
28
|
self.data_path = os.path.join(self.store_dir, 'data.npy')
|
31
29
|
self.data = np.load(self.data_path, allow_pickle=True)
|
@@ -178,38 +176,41 @@ class UniDep:
|
|
178
176
|
|
179
177
|
@property
|
180
178
|
def meta_data(self):
|
181
|
-
|
179
|
+
warnings.warn('meta_data is deprecated, '
|
180
|
+
'use meta instead (will be removed in 4.x version)', DeprecationWarning)
|
182
181
|
return self.meta
|
183
182
|
|
184
183
|
@property
|
185
184
|
def vocab_info(self):
|
186
|
-
|
185
|
+
warnings.warn('vocab_info is deprecated, '
|
186
|
+
'use vocs instead (will be removed in 4.x version)', DeprecationWarning)
|
187
187
|
return self.vocs
|
188
188
|
|
189
189
|
@property
|
190
190
|
def col_info(self):
|
191
|
-
|
191
|
+
warnings.warn('col_info is deprecated, '
|
192
|
+
'use cols instead (will be removed in 4.x version)', DeprecationWarning)
|
192
193
|
return self.cols
|
193
194
|
|
194
195
|
def get_vocab_size(self, col_name, as_vocab=False):
|
195
|
-
|
196
|
+
warnings.warn('unidep.get_vocab_size is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
196
197
|
vocab_id = col_name if as_vocab else self.get_vocab(col_name)
|
197
198
|
return self.vocs[vocab_id].size
|
198
199
|
|
199
200
|
def get_vocab(self, col_name):
|
200
|
-
|
201
|
+
warnings.warn('unidep.get_vocab is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
201
202
|
return self.cols[col_name].voc.name
|
202
203
|
|
203
204
|
def get_max_length(self, col_name):
|
204
|
-
|
205
|
+
warnings.warn('unidep.get_max_length is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
205
206
|
return self.cols[col_name].max_length
|
206
207
|
|
207
208
|
def is_list_col(self, col_name):
|
208
|
-
|
209
|
+
warnings.warn('unidep.is_list_col is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
209
210
|
return self.cols[col_name].list
|
210
211
|
|
211
212
|
def shuffle(self, shuffle=True):
|
212
|
-
|
213
|
+
warnings.warn('unidep.shuffle is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
213
214
|
if shuffle:
|
214
215
|
random.shuffle(self._indexes)
|
215
216
|
else:
|
@@ -1,12 +1,12 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
|
+
import warnings
|
3
4
|
from typing import Optional
|
4
5
|
|
5
6
|
import numpy as np
|
6
7
|
import pandas as pd
|
7
8
|
|
8
9
|
from .cols import Cols
|
9
|
-
from .compatible.uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning
|
10
10
|
from .column import Column, IndexColumn
|
11
11
|
from .tok.bert_tok import BertTok
|
12
12
|
from .tok.entity_tok import EntTok
|
@@ -29,7 +29,8 @@ class UniTok:
|
|
29
29
|
|
30
30
|
@property
|
31
31
|
def vocab_depots(self):
|
32
|
-
|
32
|
+
warnings.warn('vocab_depot is deprecated, '
|
33
|
+
'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
|
33
34
|
return self.vocabs
|
34
35
|
|
35
36
|
def add_col(self, col: Column):
|
@@ -110,6 +111,9 @@ class UniTok:
|
|
110
111
|
if not self.id_col:
|
111
112
|
raise ValueError('id column is not set')
|
112
113
|
|
114
|
+
for vocab in self.vocabs.values():
|
115
|
+
vocab.set_count_mode(False)
|
116
|
+
|
113
117
|
for col_name in self.cols:
|
114
118
|
print('[ COL:', col_name, ']')
|
115
119
|
col = self.cols[col_name] # type: Column
|
@@ -121,7 +125,7 @@ class UniTok:
|
|
121
125
|
"""
|
122
126
|
Get the store path of the tokenizer of a column
|
123
127
|
"""
|
124
|
-
|
128
|
+
warnings.warn('unitok.get_tok_path is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
125
129
|
return self.cols[col_name].tok.vocab.get_store_path(store_dir)
|
126
130
|
|
127
131
|
def store_data(self, store_dir):
|
@@ -145,7 +149,7 @@ class UniTok:
|
|
145
149
|
cols=self.cols.get_info(),
|
146
150
|
id_col=self.id_col.name,
|
147
151
|
)
|
148
|
-
json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'))
|
152
|
+
json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
|
149
153
|
return self
|
150
154
|
|
151
155
|
|
@@ -1,12 +1,10 @@
|
|
1
1
|
import math
|
2
2
|
import os
|
3
|
+
import warnings
|
3
4
|
from typing import Union, List
|
4
5
|
|
5
6
|
import numpy as np
|
6
7
|
|
7
|
-
from UniTok.compatible.uni_warnings import VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, \
|
8
|
-
MinFrequencyDeprecationWarning
|
9
|
-
|
10
8
|
|
11
9
|
class VocabMap(dict):
|
12
10
|
def __call__(self, *args, **kwargs):
|
@@ -34,12 +32,6 @@ class Vocab:
|
|
34
32
|
self._count_mode = False # whether count mode is on
|
35
33
|
self._counter = {} # counter for counting occurrence of each token
|
36
34
|
|
37
|
-
# self.frequency_mode = False
|
38
|
-
# self.frequency = {}
|
39
|
-
# self.max_frequency = 0
|
40
|
-
|
41
|
-
# self.frequent_vocab = []
|
42
|
-
|
43
35
|
"""
|
44
36
|
Basic Methods
|
45
37
|
"""
|
@@ -49,7 +41,8 @@ class Vocab:
|
|
49
41
|
"""
|
50
42
|
Deprecated, use o2i instead
|
51
43
|
"""
|
52
|
-
|
44
|
+
warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
|
45
|
+
'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
|
53
46
|
return self.o2i
|
54
47
|
|
55
48
|
@property
|
@@ -57,7 +50,8 @@ class Vocab:
|
|
57
50
|
"""
|
58
51
|
Deprecated, use i2o instead
|
59
52
|
"""
|
60
|
-
|
53
|
+
warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
|
54
|
+
'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
|
61
55
|
return self.i2o
|
62
56
|
|
63
57
|
def extend(self, objs):
|
@@ -130,7 +124,8 @@ class Vocab:
|
|
130
124
|
|
131
125
|
@property
|
132
126
|
def oov_default(self):
|
133
|
-
|
127
|
+
warnings.warn('vocab.oov_default is deprecated, '
|
128
|
+
'use vocab.oov_token instead (will be removed in 4.x version)', DeprecationWarning)
|
134
129
|
return self._oov_token
|
135
130
|
|
136
131
|
def allow_edit(self):
|
@@ -192,7 +187,8 @@ class Vocab:
|
|
192
187
|
:return:
|
193
188
|
"""
|
194
189
|
if min_count is None:
|
195
|
-
|
190
|
+
warnings.warn('vocab.min_frequency is deprecated, '
|
191
|
+
'use vocab.min_count instead (will be removed in 4.x version)', DeprecationWarning)
|
196
192
|
min_count = min_frequency
|
197
193
|
|
198
194
|
vocabs = []
|
@@ -261,6 +257,12 @@ class Vocab:
|
|
261
257
|
|
262
258
|
def __getattr__(self, item):
|
263
259
|
if item in ['frequency_mode', 'frequency', 'max_frequency', 'frequent_vocab']:
|
264
|
-
|
265
|
-
|
266
|
-
|
260
|
+
raise AttributeError(f'{item} is deprecated after UniTok 3.0, '
|
261
|
+
f'degrade to 2.4.3.2 or lower to use it, '
|
262
|
+
f'or check new features of Vocab class')
|
263
|
+
|
264
|
+
@property
|
265
|
+
def trim_vocab(self):
|
266
|
+
warnings.warn('vocab.trim_vocab is deprecated, '
|
267
|
+
'use vocab.trim instead (will be removed in 4.x version)', DeprecationWarning)
|
268
|
+
return self.trim
|
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
import warnings
|
2
|
+
|
2
3
|
from .vocab import Vocab
|
3
4
|
|
4
5
|
|
@@ -9,12 +10,14 @@ class Vocabs(dict):
|
|
9
10
|
|
10
11
|
@property
|
11
12
|
def col_map(self):
|
12
|
-
|
13
|
+
warnings.warn('vocab_depot.col_map is deprecated, '
|
14
|
+
'use vocabs.cols instead (will be removed in 4.x version)', DeprecationWarning)
|
13
15
|
return self.cols
|
14
16
|
|
15
17
|
@property
|
16
18
|
def depots(self):
|
17
|
-
|
19
|
+
warnings.warn('vocab_depot.depots is deprecated, '
|
20
|
+
'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
|
18
21
|
return self
|
19
22
|
|
20
23
|
def append(self, col_or_vocab):
|
@@ -33,11 +36,14 @@ class Vocabs(dict):
|
|
33
36
|
raise ValueError(f'vocab {vocab.name} already exists')
|
34
37
|
self[vocab.name] = vocab
|
35
38
|
|
36
|
-
def get_info(self):
|
39
|
+
def get_info(self) -> dict:
|
40
|
+
"""
|
41
|
+
Get the information of all vocabs
|
42
|
+
"""
|
37
43
|
return {vocab.name: dict(
|
38
44
|
size=vocab.get_size(),
|
39
45
|
cols=self.cols[vocab.name],
|
40
|
-
) for vocab in self}
|
46
|
+
) for vocab in self.values()}
|
41
47
|
|
42
48
|
def __call__(self, name) -> Vocab:
|
43
49
|
return self[name]
|
@@ -17,8 +17,6 @@ UniTok.egg-info/top_level.txt
|
|
17
17
|
UniTok/analysis/__init__.py
|
18
18
|
UniTok/analysis/lengths.py
|
19
19
|
UniTok/analysis/plot.py
|
20
|
-
UniTok/compatible/__init__.py
|
21
|
-
UniTok/compatible/uni_warnings.py
|
22
20
|
UniTok/tok/__init__.py
|
23
21
|
UniTok/tok/bert_tok.py
|
24
22
|
UniTok/tok/entity_tok.py
|
@@ -1,21 +0,0 @@
|
|
1
|
-
from .uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning, \
|
2
|
-
VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, MinFrequencyDeprecationWarning, \
|
3
|
-
ColMapDeprecationWarning, DepotsDeprecationWarning, MetaDataDeprecationWarning, \
|
4
|
-
VocabInfoDeprecationWarning, ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, \
|
5
|
-
GetVocabDeprecationWarning
|
6
|
-
|
7
|
-
|
8
|
-
__all__ = [
|
9
|
-
'VocabDepotDeprecationWarning',
|
10
|
-
'GetTokPathDeprecationWarning',
|
11
|
-
'VocabMapDeprecationWarning',
|
12
|
-
'OOVDefaultDeprecationWarning',
|
13
|
-
'MinFrequencyDeprecationWarning',
|
14
|
-
'ColMapDeprecationWarning',
|
15
|
-
'DepotsDeprecationWarning',
|
16
|
-
'MetaDataDeprecationWarning',
|
17
|
-
'VocabInfoDeprecationWarning',
|
18
|
-
'ColInfoDeprecationWarning',
|
19
|
-
'GetMaxLengthDeprecationWarning',
|
20
|
-
'GetVocabDeprecationWarning',
|
21
|
-
]
|
@@ -1,70 +0,0 @@
|
|
1
|
-
import warnings
|
2
|
-
from typing import Callable
|
3
|
-
|
4
|
-
warned_flags = set()
|
5
|
-
|
6
|
-
|
7
|
-
class UniWarning:
|
8
|
-
def __init__(self, msg, type_: Callable = warnings.warn):
|
9
|
-
self.msg = msg
|
10
|
-
self.type = type_
|
11
|
-
|
12
|
-
def __call__(self, *args, **kwargs):
|
13
|
-
if self not in warned_flags:
|
14
|
-
warned_flags.add(self)
|
15
|
-
self.type(self.msg.format(*args, **kwargs))
|
16
|
-
|
17
|
-
|
18
|
-
VocabMapDeprecationWarning = UniWarning(
|
19
|
-
'vocab.index2obj and vocab.obj2index are deprecated, '
|
20
|
-
'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
21
|
-
|
22
|
-
OOVDefaultDeprecationWarning = UniWarning(
|
23
|
-
'vocab.oov_default is deprecated, '
|
24
|
-
'use vocab.oov_token instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
25
|
-
|
26
|
-
MinFrequencyDeprecationWarning = UniWarning(
|
27
|
-
'vocab.min_frequency is deprecated, '
|
28
|
-
'use vocab.min_count instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
29
|
-
|
30
|
-
VocabDepotDeprecationWarning = UniWarning(
|
31
|
-
'vocab_depot is deprecated, '
|
32
|
-
'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
33
|
-
|
34
|
-
ColMapDeprecationWarning = UniWarning(
|
35
|
-
'vocab_depot.col_map is deprecated, '
|
36
|
-
'use vocabs.cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
37
|
-
|
38
|
-
GetTokPathDeprecationWarning = UniWarning(
|
39
|
-
'unitok.get_tok_path is deprecated (will be removed in 4.x version)', type_=DeprecationWarning)
|
40
|
-
|
41
|
-
DepotsDeprecationWarning = UniWarning(
|
42
|
-
'vocab_depot.depots is deprecated, '
|
43
|
-
'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
44
|
-
|
45
|
-
MetaDataDeprecationWarning = UniWarning(
|
46
|
-
'meta_data is deprecated, '
|
47
|
-
'use meta instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
48
|
-
|
49
|
-
VocabInfoDeprecationWarning = UniWarning(
|
50
|
-
'vocab_info is deprecated, '
|
51
|
-
'use vocs instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
52
|
-
|
53
|
-
ColInfoDeprecationWarning = UniWarning(
|
54
|
-
'col_info is deprecated, '
|
55
|
-
'use cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
|
56
|
-
|
57
|
-
GetMaxLengthDeprecationWarning = UniWarning(
|
58
|
-
'unidep.get_max_length is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
59
|
-
|
60
|
-
GetVocabDeprecationWarning = UniWarning(
|
61
|
-
'unidep.get_vocab is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
62
|
-
|
63
|
-
GetVocabSizeDeprecationWarning = UniWarning(
|
64
|
-
'unidep.get_vocab_size is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
65
|
-
|
66
|
-
IsListColDeprecationWarning = UniWarning(
|
67
|
-
'unidep.is_list_col is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
68
|
-
|
69
|
-
ShuffleDeprecationWarning = UniWarning(
|
70
|
-
'unidep.shuffle is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
|
UniTok-3.0.4a0/UniTok/meta.py
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
import warnings
|
2
|
-
from typing import List
|
3
|
-
|
4
|
-
|
5
|
-
class Col:
|
6
|
-
def __init__(self, name, voc, max_length=None, padding=None):
|
7
|
-
self.name: str = name
|
8
|
-
self.voc: Voc = voc
|
9
|
-
self.max_length = max_length
|
10
|
-
self.padding = padding
|
11
|
-
self.list = max_length is not None
|
12
|
-
|
13
|
-
def __eq__(self, other):
|
14
|
-
return self.name == other.name and self.voc.name == other.voc.name and self.max_length == other.max_length
|
15
|
-
|
16
|
-
|
17
|
-
class Voc:
|
18
|
-
def __init__(self, name, size, cols):
|
19
|
-
self.name: str = name
|
20
|
-
self.size: int = size
|
21
|
-
self.cols: List[Col] = cols
|
22
|
-
|
23
|
-
def __eq__(self, other):
|
24
|
-
return self.name == other.name and self.size == other.size
|
25
|
-
|
26
|
-
|
27
|
-
class Meta:
|
28
|
-
VER = 'UniDep-2.0'
|
29
|
-
|
30
|
-
def __init__(self, version, id_col, col_info=None, vocab_info=None, cols=None, vocs=None):
|
31
|
-
self.version = version
|
32
|
-
|
33
|
-
self.cols = cols or col_info
|
34
|
-
self.vocs = vocs or vocab_info
|
35
|
-
self.id_col = id_col
|
36
|
-
|
37
|
-
# build col-voc graph
|
38
|
-
self.cols = {col: Col(**self.cols[col], name=col) for col in self.cols}
|
39
|
-
self.vocs = {voc: Voc(**self.vocs[voc], name=voc) for voc in self.vocs}
|
40
|
-
|
41
|
-
# connect class objects
|
42
|
-
for col in self.cols.values():
|
43
|
-
col.voc = self.vocs[col.voc]
|
44
|
-
for voc in self.vocs.values():
|
45
|
-
voc.cols = [self.cols[col] for col in voc.cols]
|
46
|
-
|
47
|
-
self.version_check()
|
48
|
-
|
49
|
-
def version_check(self):
|
50
|
-
if self.version != Meta.VER:
|
51
|
-
warnings.warn(
|
52
|
-
'Meta version mismatch, '
|
53
|
-
'current version: {}, '
|
54
|
-
'depot version: {}. '
|
55
|
-
'It may cause unexpected error.'.format(
|
56
|
-
Meta.VER, self.version
|
57
|
-
))
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|