UniTok 3.0.4a0__tar.gz → 3.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {UniTok-3.0.4a0 → UniTok-3.0.6}/PKG-INFO +1 -1
  2. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/column.py +13 -1
  3. UniTok-3.0.6/UniTok/meta.py +108 -0
  4. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/number_tok.py +13 -5
  5. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/unidep.py +14 -15
  6. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/unitok.py +8 -4
  7. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/vocab.py +18 -16
  8. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/vocabs.py +11 -5
  9. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok.egg-info/PKG-INFO +1 -1
  10. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok.egg-info/SOURCES.txt +0 -2
  11. {UniTok-3.0.4a0 → UniTok-3.0.6}/setup.py +1 -1
  12. UniTok-3.0.4a0/UniTok/compatible/__init__.py +0 -21
  13. UniTok-3.0.4a0/UniTok/compatible/uni_warnings.py +0 -70
  14. UniTok-3.0.4a0/UniTok/meta.py +0 -57
  15. {UniTok-3.0.4a0 → UniTok-3.0.6}/README.md +0 -0
  16. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/__init__.py +0 -0
  17. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/analysis/__init__.py +0 -0
  18. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/analysis/lengths.py +0 -0
  19. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/analysis/plot.py +0 -0
  20. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/cols.py +0 -0
  21. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/global_setting.py +0 -0
  22. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/__init__.py +0 -0
  23. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/bert_tok.py +0 -0
  24. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/entity_tok.py +0 -0
  25. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/id_tok.py +0 -0
  26. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/seq_tok.py +0 -0
  27. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/split_tok.py +0 -0
  28. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok/tok/tok.py +0 -0
  29. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok.egg-info/dependency_links.txt +0 -0
  30. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok.egg-info/requires.txt +0 -0
  31. {UniTok-3.0.4a0 → UniTok-3.0.6}/UniTok.egg-info/top_level.txt +0 -0
  32. {UniTok-3.0.4a0 → UniTok-3.0.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.4a0
3
+ Version: 3.0.6
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -42,10 +42,22 @@ class Column:
42
42
  tok (BaseTok): The tokenizer of the column.
43
43
  operator (SeqOperator): The operator of the column.
44
44
  """
45
- def __init__(self, name, tok: BaseTok, operator: SeqOperator = None):
45
+ def __init__(self, name, tok: BaseTok, operator: SeqOperator = None, **kwargs):
46
46
  self.name = name
47
47
  self.tok = tok
48
48
  self.operator = operator
49
+
50
+ if kwargs:
51
+ if operator:
52
+ raise ValueError('operator and kwargs cannot be set at the same time')
53
+ self.operator = SeqOperator(
54
+ max_length=kwargs.get('max_length', 0),
55
+ padding=kwargs.get('padding', False),
56
+ slice_post=kwargs.get('slice_post', False),
57
+ pad_pre=kwargs.get('pad_pre', False),
58
+ pad=kwargs.get('pad', 0),
59
+ )
60
+
49
61
  self.list = bool(operator) or tok.return_list is True # whether the column is a list-element column
50
62
 
51
63
  self.data = []
@@ -0,0 +1,108 @@
1
+ import json
2
+ import os
3
+ import warnings
4
+ from typing import List
5
+
6
+
7
+ class Col:
8
+ def __init__(self, name, voc=None, max_length=None, padding=None, vocab=None):
9
+ self.name: str = name
10
+ self.voc: Voc = voc or vocab
11
+ self.max_length = max_length
12
+ self.padding = padding
13
+ self.list = max_length is not None
14
+
15
+ def __eq__(self, other):
16
+ return self.name == other.name and self.voc.name == other.voc.name and self.max_length == other.max_length
17
+
18
+ def get_info(self):
19
+ info = {
20
+ 'voc': self.voc.name,
21
+ }
22
+ if self.list:
23
+ info['max_length'] = self.max_length
24
+ info['padding'] = self.padding
25
+ return info
26
+
27
+
28
+ class Voc:
29
+ def __init__(self, name, size, cols):
30
+ self.name: str = name
31
+ self.size: int = size
32
+ self.cols: List[Col] = cols
33
+
34
+ def __eq__(self, other):
35
+ return self.name == other.name and self.size == other.size
36
+
37
+ def get_info(self):
38
+ return {
39
+ 'size': self.size,
40
+ 'cols': [col.name for col in self.cols]
41
+ }
42
+
43
+
44
+ class Meta:
45
+ VER = 'UniDep-2.0'
46
+
47
+ def __init__(self, store_dir):
48
+ self.store_dir = store_dir
49
+ self.path = os.path.join(self.store_dir, 'meta.data.json')
50
+
51
+ data = self.load()
52
+ self.version = data['version']
53
+ self.cols = data.get('cols') or data['col_info']
54
+ self.vocs = data.get('vocs') or data['vocab_info']
55
+ self.id_col = data['id_col']
56
+
57
+ # build col-voc graph
58
+ self.cols = {col: Col(**self.cols[col], name=col) for col in self.cols}
59
+ self.vocs = {voc: Voc(**self.vocs[voc], name=voc) for voc in self.vocs}
60
+
61
+ # connect class objects
62
+ for col in self.cols.values():
63
+ col.voc = self.vocs[col.voc]
64
+ for voc in self.vocs.values():
65
+ voc.cols = [self.cols[col] for col in voc.cols]
66
+
67
+ self.version_check()
68
+
69
+ @staticmethod
70
+ def parse_version(version):
71
+ if version.startswith('UniDep-'):
72
+ return version[7:]
73
+ return f'0.{version}'
74
+
75
+ def get_info(self):
76
+ return {
77
+ 'version': Meta.VER,
78
+ 'id_col': self.id_col,
79
+ 'cols': {col.name: col.get_info() for col in self.cols.values()},
80
+ 'vocs': {voc.name: voc.get_info() for voc in self.vocs.values()}
81
+ }
82
+
83
+ def load(self) -> dict:
84
+ return json.load(open(self.path))
85
+
86
+ def save(self):
87
+ json.dump(self.get_info(), open(os.path.join(self.store_dir, 'meta.data.json'), 'w'), indent=2)
88
+
89
+ def version_check(self):
90
+ current_version = self.parse_version(Meta.VER)
91
+ depot_version = self.parse_version(self.version)
92
+
93
+ if current_version != depot_version:
94
+ warnings.warn(
95
+ f'meta version of depot ({self.store_dir}) mismatch, '
96
+ f'current version: {current_version}, '
97
+ f'depot version: {depot_version}. '
98
+ f'It may cause unexpected error.')
99
+
100
+ if current_version <= depot_version:
101
+ return
102
+
103
+ command = input('Press Y to upgrade meta data for future use (Y/n): ')
104
+ if command.lower() == 'y':
105
+ os.rename(self.path, self.path + '.bak')
106
+ print('Old meta data backed up to {}.'.format(self.path + '.bak'))
107
+ self.save()
108
+ print('Meta data upgraded.')
@@ -1,3 +1,5 @@
1
+ from typing import Iterable
2
+
1
3
  from .tok import BaseTok
2
4
 
3
5
 
@@ -28,9 +30,15 @@ class NumberTok(BaseTok):
28
30
  self.vocab.extend([str(i) for i in range(vocab_size)])
29
31
 
30
32
  def t(self, obj):
31
- obj = int(obj)
32
- if obj >= len(self.vocab):
33
- if self.vocab_size is not None:
34
- raise ValueError('vocab_size is {}, but {} is given'.format(self.vocab_size, obj))
35
- self.vocab.extend([str(i) for i in range(len(self.vocab), obj + 1)])
33
+ # check is iterable
34
+ if isinstance(obj, Iterable) and not isinstance(obj, str):
35
+ obj = [int(o) for o in obj]
36
+ else:
37
+ obj = int(obj)
38
+ objs = [obj] if isinstance(obj, int) else obj
39
+ for o in objs:
40
+ if o >= len(self.vocab):
41
+ if self.vocab_size is not None:
42
+ raise ValueError('vocab_size is {}, but {} is given'.format(self.vocab_size, o))
43
+ self.vocab.extend([str(i) for i in range(len(self.vocab), o + 1)])
36
44
  return obj
@@ -1,14 +1,11 @@
1
- import json
2
1
  import os
3
2
  import random
3
+ import warnings
4
4
  from typing import Dict, List, Callable
5
5
 
6
6
  import numpy as np
7
7
  import tqdm
8
8
 
9
- from .compatible.uni_warnings import MetaDataDeprecationWarning, VocabInfoDeprecationWarning, \
10
- ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, GetVocabDeprecationWarning, \
11
- GetVocabSizeDeprecationWarning, IsListColDeprecationWarning, ShuffleDeprecationWarning
12
9
  from .meta import Meta, Col, Voc
13
10
  from .vocab import Vocab
14
11
  from .vocabs import Vocabs
@@ -19,14 +16,13 @@ class UniDep:
19
16
 
20
17
  def __init__(self, store_dir, silent=False):
21
18
  self.store_dir = os.path.expanduser(store_dir)
19
+ self.meta = Meta(self.store_dir)
20
+
22
21
  self.silent = silent
23
22
 
24
23
  self.cached = False
25
24
  self.cached_samples = []
26
25
 
27
- self.meta_path = os.path.join(self.store_dir, 'meta.data.json')
28
- self.meta = Meta(**json.load(open(self.meta_path)))
29
-
30
26
  self.data_path = os.path.join(self.store_dir, 'data.npy')
31
27
  self.data = np.load(self.data_path, allow_pickle=True)
32
28
  try:
@@ -178,38 +174,41 @@ class UniDep:
178
174
 
179
175
  @property
180
176
  def meta_data(self):
181
- MetaDataDeprecationWarning()
177
+ warnings.warn('meta_data is deprecated, '
178
+ 'use meta instead (will be removed in 4.x version)', DeprecationWarning)
182
179
  return self.meta
183
180
 
184
181
  @property
185
182
  def vocab_info(self):
186
- VocabInfoDeprecationWarning()
183
+ warnings.warn('vocab_info is deprecated, '
184
+ 'use vocs instead (will be removed in 4.x version)', DeprecationWarning)
187
185
  return self.vocs
188
186
 
189
187
  @property
190
188
  def col_info(self):
191
- ColInfoDeprecationWarning()
189
+ warnings.warn('col_info is deprecated, '
190
+ 'use cols instead (will be removed in 4.x version)', DeprecationWarning)
192
191
  return self.cols
193
192
 
194
193
  def get_vocab_size(self, col_name, as_vocab=False):
195
- GetVocabSizeDeprecationWarning()
194
+ warnings.warn('unidep.get_vocab_size is deprecated (will be removed in 4.x version)', DeprecationWarning)
196
195
  vocab_id = col_name if as_vocab else self.get_vocab(col_name)
197
196
  return self.vocs[vocab_id].size
198
197
 
199
198
  def get_vocab(self, col_name):
200
- GetVocabDeprecationWarning()
199
+ warnings.warn('unidep.get_vocab is deprecated (will be removed in 4.x version)', DeprecationWarning)
201
200
  return self.cols[col_name].voc.name
202
201
 
203
202
  def get_max_length(self, col_name):
204
- GetMaxLengthDeprecationWarning()
203
+ warnings.warn('unidep.get_max_length is deprecated (will be removed in 4.x version)', DeprecationWarning)
205
204
  return self.cols[col_name].max_length
206
205
 
207
206
  def is_list_col(self, col_name):
208
- IsListColDeprecationWarning()
207
+ warnings.warn('unidep.is_list_col is deprecated (will be removed in 4.x version)', DeprecationWarning)
209
208
  return self.cols[col_name].list
210
209
 
211
210
  def shuffle(self, shuffle=True):
212
- ShuffleDeprecationWarning()
211
+ warnings.warn('unidep.shuffle is deprecated (will be removed in 4.x version)', DeprecationWarning)
213
212
  if shuffle:
214
213
  random.shuffle(self._indexes)
215
214
  else:
@@ -1,12 +1,12 @@
1
1
  import json
2
2
  import os
3
+ import warnings
3
4
  from typing import Optional
4
5
 
5
6
  import numpy as np
6
7
  import pandas as pd
7
8
 
8
9
  from .cols import Cols
9
- from .compatible.uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning
10
10
  from .column import Column, IndexColumn
11
11
  from .tok.bert_tok import BertTok
12
12
  from .tok.entity_tok import EntTok
@@ -29,7 +29,8 @@ class UniTok:
29
29
 
30
30
  @property
31
31
  def vocab_depots(self):
32
- VocabDepotDeprecationWarning()
32
+ warnings.warn('vocab_depot is deprecated, '
33
+ 'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
33
34
  return self.vocabs
34
35
 
35
36
  def add_col(self, col: Column):
@@ -110,6 +111,9 @@ class UniTok:
110
111
  if not self.id_col:
111
112
  raise ValueError('id column is not set')
112
113
 
114
+ for vocab in self.vocabs.values():
115
+ vocab.set_count_mode(False)
116
+
113
117
  for col_name in self.cols:
114
118
  print('[ COL:', col_name, ']')
115
119
  col = self.cols[col_name] # type: Column
@@ -121,7 +125,7 @@ class UniTok:
121
125
  """
122
126
  Get the store path of the tokenizer of a column
123
127
  """
124
- GetTokPathDeprecationWarning()
128
+ warnings.warn('unitok.get_tok_path is deprecated (will be removed in 4.x version)', DeprecationWarning)
125
129
  return self.cols[col_name].tok.vocab.get_store_path(store_dir)
126
130
 
127
131
  def store_data(self, store_dir):
@@ -145,7 +149,7 @@ class UniTok:
145
149
  cols=self.cols.get_info(),
146
150
  id_col=self.id_col.name,
147
151
  )
148
- json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'))
152
+ json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
149
153
  return self
150
154
 
151
155
 
@@ -1,12 +1,10 @@
1
1
  import math
2
2
  import os
3
+ import warnings
3
4
  from typing import Union, List
4
5
 
5
6
  import numpy as np
6
7
 
7
- from UniTok.compatible.uni_warnings import VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, \
8
- MinFrequencyDeprecationWarning
9
-
10
8
 
11
9
  class VocabMap(dict):
12
10
  def __call__(self, *args, **kwargs):
@@ -34,12 +32,6 @@ class Vocab:
34
32
  self._count_mode = False # whether count mode is on
35
33
  self._counter = {} # counter for counting occurrence of each token
36
34
 
37
- # self.frequency_mode = False
38
- # self.frequency = {}
39
- # self.max_frequency = 0
40
-
41
- # self.frequent_vocab = []
42
-
43
35
  """
44
36
  Basic Methods
45
37
  """
@@ -49,7 +41,8 @@ class Vocab:
49
41
  """
50
42
  Deprecated, use o2i instead
51
43
  """
52
- VocabMapDeprecationWarning()
44
+ warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
45
+ 'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
53
46
  return self.o2i
54
47
 
55
48
  @property
@@ -57,7 +50,8 @@ class Vocab:
57
50
  """
58
51
  Deprecated, use i2o instead
59
52
  """
60
- VocabMapDeprecationWarning()
53
+ warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
54
+ 'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
61
55
  return self.i2o
62
56
 
63
57
  def extend(self, objs):
@@ -130,7 +124,8 @@ class Vocab:
130
124
 
131
125
  @property
132
126
  def oov_default(self):
133
- OOVDefaultDeprecationWarning()
127
+ warnings.warn('vocab.oov_default is deprecated, '
128
+ 'use vocab.oov_token instead (will be removed in 4.x version)', DeprecationWarning)
134
129
  return self._oov_token
135
130
 
136
131
  def allow_edit(self):
@@ -192,7 +187,8 @@ class Vocab:
192
187
  :return:
193
188
  """
194
189
  if min_count is None:
195
- MinFrequencyDeprecationWarning()
190
+ warnings.warn('vocab.min_frequency is deprecated, '
191
+ 'use vocab.min_count instead (will be removed in 4.x version)', DeprecationWarning)
196
192
  min_count = min_frequency
197
193
 
198
194
  vocabs = []
@@ -261,6 +257,12 @@ class Vocab:
261
257
 
262
258
  def __getattr__(self, item):
263
259
  if item in ['frequency_mode', 'frequency', 'max_frequency', 'frequent_vocab']:
264
- return AttributeError(f'{item} is deprecated after UniTok 3.0, '
265
- f'degrade to 2.4.3.2 or lower to use it, '
266
- f'or check new features of Vocab class')
260
+ raise AttributeError(f'{item} is deprecated after UniTok 3.0, '
261
+ f'degrade to 2.4.3.2 or lower to use it, '
262
+ f'or check new features of Vocab class')
263
+
264
+ @property
265
+ def trim_vocab(self):
266
+ warnings.warn('vocab.trim_vocab is deprecated, '
267
+ 'use vocab.trim instead (will be removed in 4.x version)', DeprecationWarning)
268
+ return self.trim
@@ -1,4 +1,5 @@
1
- from .compatible.uni_warnings import ColMapDeprecationWarning, DepotsDeprecationWarning
1
+ import warnings
2
+
2
3
  from .vocab import Vocab
3
4
 
4
5
 
@@ -9,12 +10,14 @@ class Vocabs(dict):
9
10
 
10
11
  @property
11
12
  def col_map(self):
12
- ColMapDeprecationWarning()
13
+ warnings.warn('vocab_depot.col_map is deprecated, '
14
+ 'use vocabs.cols instead (will be removed in 4.x version)', DeprecationWarning)
13
15
  return self.cols
14
16
 
15
17
  @property
16
18
  def depots(self):
17
- DepotsDeprecationWarning()
19
+ warnings.warn('vocab_depot.depots is deprecated, '
20
+ 'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
18
21
  return self
19
22
 
20
23
  def append(self, col_or_vocab):
@@ -33,11 +36,14 @@ class Vocabs(dict):
33
36
  raise ValueError(f'vocab {vocab.name} already exists')
34
37
  self[vocab.name] = vocab
35
38
 
36
- def get_info(self):
39
+ def get_info(self) -> dict:
40
+ """
41
+ Get the information of all vocabs
42
+ """
37
43
  return {vocab.name: dict(
38
44
  size=vocab.get_size(),
39
45
  cols=self.cols[vocab.name],
40
- ) for vocab in self}
46
+ ) for vocab in self.values()}
41
47
 
42
48
  def __call__(self, name) -> Vocab:
43
49
  return self[name]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.4a0
3
+ Version: 3.0.6
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -17,8 +17,6 @@ UniTok.egg-info/top_level.txt
17
17
  UniTok/analysis/__init__.py
18
18
  UniTok/analysis/lengths.py
19
19
  UniTok/analysis/plot.py
20
- UniTok/compatible/__init__.py
21
- UniTok/compatible/uni_warnings.py
22
20
  UniTok/tok/__init__.py
23
21
  UniTok/tok/bert_tok.py
24
22
  UniTok/tok/entity_tok.py
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text()
6
6
 
7
7
  setup(
8
8
  name='UniTok',
9
- version='3.0.4.alpha',
9
+ version='3.0.6',
10
10
  keywords=['token', 'tokenizer', 'bert'],
11
11
  description='Unified Tokenizer',
12
12
  long_description=long_description,
@@ -1,21 +0,0 @@
1
- from .uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning, \
2
- VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, MinFrequencyDeprecationWarning, \
3
- ColMapDeprecationWarning, DepotsDeprecationWarning, MetaDataDeprecationWarning, \
4
- VocabInfoDeprecationWarning, ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, \
5
- GetVocabDeprecationWarning
6
-
7
-
8
- __all__ = [
9
- 'VocabDepotDeprecationWarning',
10
- 'GetTokPathDeprecationWarning',
11
- 'VocabMapDeprecationWarning',
12
- 'OOVDefaultDeprecationWarning',
13
- 'MinFrequencyDeprecationWarning',
14
- 'ColMapDeprecationWarning',
15
- 'DepotsDeprecationWarning',
16
- 'MetaDataDeprecationWarning',
17
- 'VocabInfoDeprecationWarning',
18
- 'ColInfoDeprecationWarning',
19
- 'GetMaxLengthDeprecationWarning',
20
- 'GetVocabDeprecationWarning',
21
- ]
@@ -1,70 +0,0 @@
1
- import warnings
2
- from typing import Callable
3
-
4
- warned_flags = set()
5
-
6
-
7
- class UniWarning:
8
- def __init__(self, msg, type_: Callable = warnings.warn):
9
- self.msg = msg
10
- self.type = type_
11
-
12
- def __call__(self, *args, **kwargs):
13
- if self not in warned_flags:
14
- warned_flags.add(self)
15
- self.type(self.msg.format(*args, **kwargs))
16
-
17
-
18
- VocabMapDeprecationWarning = UniWarning(
19
- 'vocab.index2obj and vocab.obj2index are deprecated, '
20
- 'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', type_=DeprecationWarning)
21
-
22
- OOVDefaultDeprecationWarning = UniWarning(
23
- 'vocab.oov_default is deprecated, '
24
- 'use vocab.oov_token instead (will be removed in 4.x version)', type_=DeprecationWarning)
25
-
26
- MinFrequencyDeprecationWarning = UniWarning(
27
- 'vocab.min_frequency is deprecated, '
28
- 'use vocab.min_count instead (will be removed in 4.x version)', type_=DeprecationWarning)
29
-
30
- VocabDepotDeprecationWarning = UniWarning(
31
- 'vocab_depot is deprecated, '
32
- 'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
33
-
34
- ColMapDeprecationWarning = UniWarning(
35
- 'vocab_depot.col_map is deprecated, '
36
- 'use vocabs.cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
37
-
38
- GetTokPathDeprecationWarning = UniWarning(
39
- 'unitok.get_tok_path is deprecated (will be removed in 4.x version)', type_=DeprecationWarning)
40
-
41
- DepotsDeprecationWarning = UniWarning(
42
- 'vocab_depot.depots is deprecated, '
43
- 'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
44
-
45
- MetaDataDeprecationWarning = UniWarning(
46
- 'meta_data is deprecated, '
47
- 'use meta instead (will be removed in 4.x version)', type_=DeprecationWarning)
48
-
49
- VocabInfoDeprecationWarning = UniWarning(
50
- 'vocab_info is deprecated, '
51
- 'use vocs instead (will be removed in 4.x version)', type_=DeprecationWarning)
52
-
53
- ColInfoDeprecationWarning = UniWarning(
54
- 'col_info is deprecated, '
55
- 'use cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
56
-
57
- GetMaxLengthDeprecationWarning = UniWarning(
58
- 'unidep.get_max_length is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
59
-
60
- GetVocabDeprecationWarning = UniWarning(
61
- 'unidep.get_vocab is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
62
-
63
- GetVocabSizeDeprecationWarning = UniWarning(
64
- 'unidep.get_vocab_size is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
65
-
66
- IsListColDeprecationWarning = UniWarning(
67
- 'unidep.is_list_col is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
68
-
69
- ShuffleDeprecationWarning = UniWarning(
70
- 'unidep.shuffle is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
@@ -1,57 +0,0 @@
1
- import warnings
2
- from typing import List
3
-
4
-
5
- class Col:
6
- def __init__(self, name, voc, max_length=None, padding=None):
7
- self.name: str = name
8
- self.voc: Voc = voc
9
- self.max_length = max_length
10
- self.padding = padding
11
- self.list = max_length is not None
12
-
13
- def __eq__(self, other):
14
- return self.name == other.name and self.voc.name == other.voc.name and self.max_length == other.max_length
15
-
16
-
17
- class Voc:
18
- def __init__(self, name, size, cols):
19
- self.name: str = name
20
- self.size: int = size
21
- self.cols: List[Col] = cols
22
-
23
- def __eq__(self, other):
24
- return self.name == other.name and self.size == other.size
25
-
26
-
27
- class Meta:
28
- VER = 'UniDep-2.0'
29
-
30
- def __init__(self, version, id_col, col_info=None, vocab_info=None, cols=None, vocs=None):
31
- self.version = version
32
-
33
- self.cols = cols or col_info
34
- self.vocs = vocs or vocab_info
35
- self.id_col = id_col
36
-
37
- # build col-voc graph
38
- self.cols = {col: Col(**self.cols[col], name=col) for col in self.cols}
39
- self.vocs = {voc: Voc(**self.vocs[voc], name=voc) for voc in self.vocs}
40
-
41
- # connect class objects
42
- for col in self.cols.values():
43
- col.voc = self.vocs[col.voc]
44
- for voc in self.vocs.values():
45
- voc.cols = [self.cols[col] for col in voc.cols]
46
-
47
- self.version_check()
48
-
49
- def version_check(self):
50
- if self.version != Meta.VER:
51
- warnings.warn(
52
- 'Meta version mismatch, '
53
- 'current version: {}, '
54
- 'depot version: {}. '
55
- 'It may cause unexpected error.'.format(
56
- Meta.VER, self.version
57
- ))
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes