UniTok 3.0.3a0__tar.gz → 3.0.3b0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/PKG-INFO +1 -1
  2. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/unidep.py +12 -11
  3. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/unitok.py +7 -3
  4. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/vocab.py +24 -13
  5. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/vocabs.py +6 -3
  6. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/PKG-INFO +1 -1
  7. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/SOURCES.txt +0 -2
  8. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/setup.py +1 -1
  9. UniTok-3.0.3a0/UniTok/compatible/__init__.py +0 -21
  10. UniTok-3.0.3a0/UniTok/compatible/uni_warnings.py +0 -70
  11. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/README.md +0 -0
  12. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/__init__.py +0 -0
  13. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/analysis/__init__.py +0 -0
  14. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/analysis/lengths.py +0 -0
  15. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/analysis/plot.py +0 -0
  16. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/cols.py +0 -0
  17. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/column.py +0 -0
  18. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/global_setting.py +0 -0
  19. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/meta.py +0 -0
  20. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/__init__.py +0 -0
  21. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/bert_tok.py +0 -0
  22. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/entity_tok.py +0 -0
  23. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/id_tok.py +0 -0
  24. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/number_tok.py +0 -0
  25. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/seq_tok.py +0 -0
  26. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/split_tok.py +0 -0
  27. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok/tok/tok.py +0 -0
  28. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/dependency_links.txt +0 -0
  29. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/requires.txt +0 -0
  30. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/UniTok.egg-info/top_level.txt +0 -0
  31. {UniTok-3.0.3a0 → UniTok-3.0.3b0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.3a0
3
+ Version: 3.0.3b0
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -1,14 +1,12 @@
1
1
  import json
2
2
  import os
3
3
  import random
4
+ import warnings
4
5
  from typing import Dict, List, Callable
5
6
 
6
7
  import numpy as np
7
8
  import tqdm
8
9
 
9
- from .compatible.uni_warnings import MetaDataDeprecationWarning, VocabInfoDeprecationWarning, \
10
- ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, GetVocabDeprecationWarning, \
11
- GetVocabSizeDeprecationWarning, IsListColDeprecationWarning, ShuffleDeprecationWarning
12
10
  from .meta import Meta, Col, Voc
13
11
  from .vocab import Vocab
14
12
  from .vocabs import Vocabs
@@ -178,38 +176,41 @@ class UniDep:
178
176
 
179
177
  @property
180
178
  def meta_data(self):
181
- MetaDataDeprecationWarning()
179
+ warnings.warn('meta_data is deprecated, '
180
+ 'use meta instead (will be removed in 4.x version)', DeprecationWarning)
182
181
  return self.meta
183
182
 
184
183
  @property
185
184
  def vocab_info(self):
186
- VocabInfoDeprecationWarning()
185
+ warnings.warn('vocab_info is deprecated, '
186
+ 'use vocs instead (will be removed in 4.x version)', DeprecationWarning)
187
187
  return self.vocs
188
188
 
189
189
  @property
190
190
  def col_info(self):
191
- ColInfoDeprecationWarning()
191
+ warnings.warn('col_info is deprecated, '
192
+ 'use cols instead (will be removed in 4.x version)', DeprecationWarning)
192
193
  return self.cols
193
194
 
194
195
  def get_vocab_size(self, col_name, as_vocab=False):
195
- GetVocabSizeDeprecationWarning()
196
+ warnings.warn('unidep.get_vocab_size is deprecated (will be removed in 4.x version)', DeprecationWarning)
196
197
  vocab_id = col_name if as_vocab else self.get_vocab(col_name)
197
198
  return self.vocs[vocab_id].size
198
199
 
199
200
  def get_vocab(self, col_name):
200
- GetVocabDeprecationWarning()
201
+ warnings.warn('unidep.get_vocab is deprecated (will be removed in 4.x version)', DeprecationWarning)
201
202
  return self.cols[col_name].voc.name
202
203
 
203
204
  def get_max_length(self, col_name):
204
- GetMaxLengthDeprecationWarning()
205
+ warnings.warn('unidep.get_max_length is deprecated (will be removed in 4.x version)', DeprecationWarning)
205
206
  return self.cols[col_name].max_length
206
207
 
207
208
  def is_list_col(self, col_name):
208
- IsListColDeprecationWarning()
209
+ warnings.warn('unidep.is_list_col is deprecated (will be removed in 4.x version)', DeprecationWarning)
209
210
  return self.cols[col_name].list
210
211
 
211
212
  def shuffle(self, shuffle=True):
212
- ShuffleDeprecationWarning()
213
+ warnings.warn('unidep.shuffle is deprecated (will be removed in 4.x version)', DeprecationWarning)
213
214
  if shuffle:
214
215
  random.shuffle(self._indexes)
215
216
  else:
@@ -1,12 +1,12 @@
1
1
  import json
2
2
  import os
3
+ import warnings
3
4
  from typing import Optional
4
5
 
5
6
  import numpy as np
6
7
  import pandas as pd
7
8
 
8
9
  from .cols import Cols
9
- from .compatible.uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning
10
10
  from .column import Column, IndexColumn
11
11
  from .tok.bert_tok import BertTok
12
12
  from .tok.entity_tok import EntTok
@@ -29,7 +29,8 @@ class UniTok:
29
29
 
30
30
  @property
31
31
  def vocab_depots(self):
32
- VocabDepotDeprecationWarning()
32
+ warnings.warn('vocab_depot is deprecated, '
33
+ 'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
33
34
  return self.vocabs
34
35
 
35
36
  def add_col(self, col: Column):
@@ -110,6 +111,9 @@ class UniTok:
110
111
  if not self.id_col:
111
112
  raise ValueError('id column is not set')
112
113
 
114
+ for vocab in self.vocabs.values():
115
+ vocab.set_count_mode(False)
116
+
113
117
  for col_name in self.cols:
114
118
  print('[ COL:', col_name, ']')
115
119
  col = self.cols[col_name] # type: Column
@@ -121,7 +125,7 @@ class UniTok:
121
125
  """
122
126
  Get the store path of the tokenizer of a column
123
127
  """
124
- GetTokPathDeprecationWarning()
128
+ warnings.warn('unitok.get_tok_path is deprecated (will be removed in 4.x version)', DeprecationWarning)
125
129
  return self.cols[col_name].tok.vocab.get_store_path(store_dir)
126
130
 
127
131
  def store_data(self, store_dir):
@@ -1,12 +1,10 @@
1
1
  import math
2
2
  import os
3
+ import warnings
3
4
  from typing import Union, List
4
5
 
5
6
  import numpy as np
6
7
 
7
- from UniTok.compatible.uni_warnings import VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, \
8
- MinFrequencyDeprecationWarning
9
-
10
8
 
11
9
  class VocabMap(dict):
12
10
  def __call__(self, *args, **kwargs):
@@ -29,6 +27,7 @@ class Vocab:
29
27
 
30
28
  self._editable = True # whether vocab is editable
31
29
  self._oov_token = None # out of vocabulary token
30
+ self._stable_mode = False
32
31
 
33
32
  self._count_mode = False # whether count mode is on
34
33
  self._counter = {} # counter for counting occurrence of each token
@@ -48,7 +47,8 @@ class Vocab:
48
47
  """
49
48
  Deprecated, use o2i instead
50
49
  """
51
- VocabMapDeprecationWarning()
50
+ warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
51
+ 'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
52
52
  return self.o2i
53
53
 
54
54
  @property
@@ -56,7 +56,8 @@ class Vocab:
56
56
  """
57
57
  Deprecated, use i2o instead
58
58
  """
59
- VocabMapDeprecationWarning()
59
+ warnings.warn('vocab.index2obj and vocab.obj2index are deprecated, '
60
+ 'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', DeprecationWarning)
60
61
  return self.i2o
61
62
 
62
63
  def extend(self, objs):
@@ -80,8 +81,10 @@ class Vocab:
80
81
  if obj in self.o2i:
81
82
  return self.o2i[obj]
82
83
 
83
- if self._count_mode:
84
- return self._oov_token or -1
84
+ if self._stable_mode:
85
+ if self._oov_token is not None:
86
+ return self._oov_token
87
+ return -1
85
88
 
86
89
  if not self._editable:
87
90
  if self._oov_token is not None:
@@ -127,7 +130,8 @@ class Vocab:
127
130
 
128
131
  @property
129
132
  def oov_default(self):
130
- OOVDefaultDeprecationWarning()
133
+ warnings.warn('vocab.oov_default is deprecated, '
134
+ 'use vocab.oov_token instead (will be removed in 4.x version)', DeprecationWarning)
131
135
  return self._oov_token
132
136
 
133
137
  def allow_edit(self):
@@ -189,7 +193,8 @@ class Vocab:
189
193
  :return:
190
194
  """
191
195
  if min_count is None:
192
- MinFrequencyDeprecationWarning()
196
+ warnings.warn('vocab.min_frequency is deprecated, '
197
+ 'use vocab.min_count instead (will be removed in 4.x version)', DeprecationWarning)
193
198
  min_count = min_frequency
194
199
 
195
200
  vocabs = []
@@ -205,7 +210,7 @@ class Vocab:
205
210
  self.reserve(self.reserved_tokens)
206
211
  self.extend(vocabs)
207
212
 
208
- # self.frequency_mode = True
213
+ self._stable_mode = True
209
214
  return self
210
215
 
211
216
  def summarize(self, base=10):
@@ -258,6 +263,12 @@ class Vocab:
258
263
 
259
264
  def __getattr__(self, item):
260
265
  if item in ['frequency_mode', 'frequency', 'max_frequency', 'frequent_vocab']:
261
- return AttributeError(f'{item} is deprecated after UniTok 3.0, '
262
- f'degrade to 2.4.3.2 or lower to use it, '
263
- f'or check new features of Vocab class')
266
+ raise AttributeError(f'{item} is deprecated after UniTok 3.0, '
267
+ f'degrade to 2.4.3.2 or lower to use it, '
268
+ f'or check new features of Vocab class')
269
+
270
+ @property
271
+ def trim_vocab(self):
272
+ warnings.warn('vocab.trim_vocab is deprecated, '
273
+ 'use vocab.trim instead (will be removed in 4.x version)', DeprecationWarning)
274
+ return self.trim
@@ -1,4 +1,5 @@
1
- from .compatible.uni_warnings import ColMapDeprecationWarning, DepotsDeprecationWarning
1
+ import warnings
2
+
2
3
  from .vocab import Vocab
3
4
 
4
5
 
@@ -9,12 +10,14 @@ class Vocabs(dict):
9
10
 
10
11
  @property
11
12
  def col_map(self):
12
- ColMapDeprecationWarning()
13
+ warnings.warn('vocab_depot.col_map is deprecated, '
14
+ 'use vocabs.cols instead (will be removed in 4.x version)', DeprecationWarning)
13
15
  return self.cols
14
16
 
15
17
  @property
16
18
  def depots(self):
17
- DepotsDeprecationWarning()
19
+ warnings.warn('vocab_depot.depots is deprecated, '
20
+ 'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
18
21
  return self
19
22
 
20
23
  def append(self, col_or_vocab):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.3a0
3
+ Version: 3.0.3b0
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -17,8 +17,6 @@ UniTok.egg-info/top_level.txt
17
17
  UniTok/analysis/__init__.py
18
18
  UniTok/analysis/lengths.py
19
19
  UniTok/analysis/plot.py
20
- UniTok/compatible/__init__.py
21
- UniTok/compatible/uni_warnings.py
22
20
  UniTok/tok/__init__.py
23
21
  UniTok/tok/bert_tok.py
24
22
  UniTok/tok/entity_tok.py
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text()
6
6
 
7
7
  setup(
8
8
  name='UniTok',
9
- version='3.0.3.alpha',
9
+ version='3.0.3.beta',
10
10
  keywords=['token', 'tokenizer', 'bert'],
11
11
  description='Unified Tokenizer',
12
12
  long_description=long_description,
@@ -1,21 +0,0 @@
1
- from .uni_warnings import VocabDepotDeprecationWarning, GetTokPathDeprecationWarning, \
2
- VocabMapDeprecationWarning, OOVDefaultDeprecationWarning, MinFrequencyDeprecationWarning, \
3
- ColMapDeprecationWarning, DepotsDeprecationWarning, MetaDataDeprecationWarning, \
4
- VocabInfoDeprecationWarning, ColInfoDeprecationWarning, GetMaxLengthDeprecationWarning, \
5
- GetVocabDeprecationWarning
6
-
7
-
8
- __all__ = [
9
- 'VocabDepotDeprecationWarning',
10
- 'GetTokPathDeprecationWarning',
11
- 'VocabMapDeprecationWarning',
12
- 'OOVDefaultDeprecationWarning',
13
- 'MinFrequencyDeprecationWarning',
14
- 'ColMapDeprecationWarning',
15
- 'DepotsDeprecationWarning',
16
- 'MetaDataDeprecationWarning',
17
- 'VocabInfoDeprecationWarning',
18
- 'ColInfoDeprecationWarning',
19
- 'GetMaxLengthDeprecationWarning',
20
- 'GetVocabDeprecationWarning',
21
- ]
@@ -1,70 +0,0 @@
1
- import warnings
2
- from typing import Callable
3
-
4
- warned_flags = set()
5
-
6
-
7
- class UniWarning:
8
- def __init__(self, msg, type_: Callable = warnings.warn):
9
- self.msg = msg
10
- self.type = type_
11
-
12
- def __call__(self, *args, **kwargs):
13
- if self not in warned_flags:
14
- warned_flags.add(self)
15
- self.type(self.msg.format(*args, **kwargs))
16
-
17
-
18
- VocabMapDeprecationWarning = UniWarning(
19
- 'vocab.index2obj and vocab.obj2index are deprecated, '
20
- 'use vocab.i2o and vocab.o2i instead (will be removed in 4.x version)', type_=DeprecationWarning)
21
-
22
- OOVDefaultDeprecationWarning = UniWarning(
23
- 'vocab.oov_default is deprecated, '
24
- 'use vocab.oov_token instead (will be removed in 4.x version)', type_=DeprecationWarning)
25
-
26
- MinFrequencyDeprecationWarning = UniWarning(
27
- 'vocab.min_frequency is deprecated, '
28
- 'use vocab.min_count instead (will be removed in 4.x version)', type_=DeprecationWarning)
29
-
30
- VocabDepotDeprecationWarning = UniWarning(
31
- 'vocab_depot is deprecated, '
32
- 'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
33
-
34
- ColMapDeprecationWarning = UniWarning(
35
- 'vocab_depot.col_map is deprecated, '
36
- 'use vocabs.cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
37
-
38
- GetTokPathDeprecationWarning = UniWarning(
39
- 'unitok.get_tok_path is deprecated (will be removed in 4.x version)', type_=DeprecationWarning)
40
-
41
- DepotsDeprecationWarning = UniWarning(
42
- 'vocab_depot.depots is deprecated, '
43
- 'use vocabs instead (will be removed in 4.x version)', type_=DeprecationWarning)
44
-
45
- MetaDataDeprecationWarning = UniWarning(
46
- 'meta_data is deprecated, '
47
- 'use meta instead (will be removed in 4.x version)', type_=DeprecationWarning)
48
-
49
- VocabInfoDeprecationWarning = UniWarning(
50
- 'vocab_info is deprecated, '
51
- 'use vocs instead (will be removed in 4.x version)', type_=DeprecationWarning)
52
-
53
- ColInfoDeprecationWarning = UniWarning(
54
- 'col_info is deprecated, '
55
- 'use cols instead (will be removed in 4.x version)', type_=DeprecationWarning)
56
-
57
- GetMaxLengthDeprecationWarning = UniWarning(
58
- 'unidep.get_max_length is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
59
-
60
- GetVocabDeprecationWarning = UniWarning(
61
- 'unidep.get_vocab is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
62
-
63
- GetVocabSizeDeprecationWarning = UniWarning(
64
- 'unidep.get_vocab_size is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
65
-
66
- IsListColDeprecationWarning = UniWarning(
67
- 'unidep.is_list_col is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
68
-
69
- ShuffleDeprecationWarning = UniWarning(
70
- 'unidep.shuffle is deprecated, (will be removed in 4.x version)', type_=DeprecationWarning)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes