UniTok 3.1.3__tar.gz → 3.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {UniTok-3.1.3 → UniTok-3.1.5}/PKG-INFO +3 -1
  2. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/column.py +1 -1
  3. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/bert_tok.py +0 -4
  4. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/number_tok.py +1 -1
  5. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/tok.py +2 -0
  6. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/unidep.py +1 -1
  7. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/vocab.py +9 -4
  8. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok.egg-info/PKG-INFO +3 -1
  9. {UniTok-3.1.3 → UniTok-3.1.5}/setup.py +1 -1
  10. {UniTok-3.1.3 → UniTok-3.1.5}/README.md +0 -0
  11. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/__init__.py +0 -0
  12. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/analysis/__init__.py +0 -0
  13. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/analysis/lengths.py +0 -0
  14. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/analysis/plot.py +0 -0
  15. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/cols.py +0 -0
  16. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/global_setting.py +0 -0
  17. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/meta.py +0 -0
  18. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/__init__.py +0 -0
  19. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/ent_tok.py +0 -0
  20. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/id_tok.py +0 -0
  21. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/seq_tok.py +0 -0
  22. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/tok/split_tok.py +0 -0
  23. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/unitok.py +0 -0
  24. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok/vocabs.py +0 -0
  25. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok.egg-info/SOURCES.txt +0 -0
  26. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok.egg-info/dependency_links.txt +0 -0
  27. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok.egg-info/requires.txt +0 -0
  28. {UniTok-3.1.3 → UniTok-3.1.5}/UniTok.egg-info/top_level.txt +0 -0
  29. {UniTok-3.1.3 → UniTok-3.1.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.1.3
3
+ Version: 3.1.5
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -138,3 +138,5 @@ news_dep = UniDep('data/news') # 读取分词结果
138
138
  print(len(news_dep))
139
139
  print(news_dep[0])
140
140
  ```
141
+
142
+
@@ -65,7 +65,7 @@ class Column:
65
65
  pad=kwargs.get('pad', 0),
66
66
  )
67
67
 
68
- self.list = bool(operator) or tok.return_list is True # whether the column is a list-element column
68
+ self.list = bool(self.operator) or tok.return_list is True # whether the column is a list-element column
69
69
 
70
70
  self.data = []
71
71
  self.lengths = Lengths()
@@ -30,8 +30,4 @@ class BertTok(BaseTok):
30
30
  ids = self.tokenizer.convert_tokens_to_ids(ts)
31
31
  else:
32
32
  ids = []
33
-
34
- for index in ids:
35
- self.vocab.count(index)
36
-
37
33
  return ids
@@ -40,5 +40,5 @@ class NumberTok(BaseTok):
40
40
  if o >= len(self.vocab):
41
41
  if self.vocab_size is not None:
42
42
  raise ValueError('vocab_size is {}, but {} is given'.format(self.vocab_size, o))
43
- self.vocab.extend([str(i) for i in range(len(self.vocab), o + 1)])
43
+ self.vocab.extend([str(i) for i in range(len(self.vocab), o + 1)], count=False)
44
44
  return obj
@@ -44,6 +44,8 @@ class BaseTok:
44
44
  wrapped tokenize method, filter out unknown token
45
45
  """
46
46
  ids = self.t(obj)
47
+ self.vocab.counts(ids)
48
+
47
49
  if isinstance(ids, list):
48
50
  return list(filter(lambda index: index > -1, ids))
49
51
  if ids == -1:
@@ -207,7 +207,7 @@ class UniDep:
207
207
  data = dict()
208
208
 
209
209
  for voc in self.vocabs:
210
- self.vocabs[voc.name].save(store_dir)
210
+ self.vocabs[voc].save(store_dir)
211
211
 
212
212
  for sample in tqdm.tqdm(self, disable=self.silent):
213
213
  for col_name in sample:
@@ -63,12 +63,17 @@ class Vocab:
63
63
 
64
64
  def append(self, obj):
65
65
  index = self._append(obj)
66
- self.count(index)
67
66
  return index
68
67
 
69
- def count(self, index):
70
- if self._count_mode and index > -1:
71
- self._counter[index] = self._counter.get(index, 0) + 1
68
+ def counts(self, indexes):
69
+ if self._count_mode:
70
+ for index in indexes:
71
+ if index > -1:
72
+ self._counter[index] = self._counter.get(index, 0) + 1
73
+
74
+ # def count(self, index):
75
+ # if self._count_mode and index > -1:
76
+ # self._counter[index] = self._counter.get(index, 0) + 1
72
77
 
73
78
  def _append(self, obj):
74
79
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.1.3
3
+ Version: 3.1.5
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -138,3 +138,5 @@ news_dep = UniDep('data/news') # 读取分词结果
138
138
  print(len(news_dep))
139
139
  print(news_dep[0])
140
140
  ```
141
+
142
+
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text()
6
6
 
7
7
  setup(
8
8
  name='UniTok',
9
- version='3.1.3',
9
+ version='3.1.5',
10
10
  keywords=['token', 'tokenizer', 'bert'],
11
11
  description='Unified Tokenizer',
12
12
  long_description=long_description,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes