UniTok 3.0.13__tar.gz → 3.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {UniTok-3.0.13 → UniTok-3.1.1}/PKG-INFO +1 -1
  2. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/column.py +10 -3
  3. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/meta.py +8 -8
  4. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/unidep.py +6 -1
  5. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/unitok.py +9 -6
  6. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/vocab.py +6 -0
  7. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/PKG-INFO +1 -1
  8. {UniTok-3.0.13 → UniTok-3.1.1}/setup.py +1 -1
  9. {UniTok-3.0.13 → UniTok-3.1.1}/README.md +0 -0
  10. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/__init__.py +0 -0
  11. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/analysis/__init__.py +0 -0
  12. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/analysis/lengths.py +0 -0
  13. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/analysis/plot.py +0 -0
  14. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/cols.py +0 -0
  15. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/global_setting.py +0 -0
  16. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/__init__.py +0 -0
  17. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/bert_tok.py +0 -0
  18. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/ent_tok.py +0 -0
  19. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/id_tok.py +0 -0
  20. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/number_tok.py +0 -0
  21. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/seq_tok.py +0 -0
  22. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/split_tok.py +0 -0
  23. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/tok.py +0 -0
  24. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/vocabs.py +0 -0
  25. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/SOURCES.txt +0 -0
  26. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/dependency_links.txt +0 -0
  27. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/requires.txt +0 -0
  28. {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/top_level.txt +0 -0
  29. {UniTok-3.0.13 → UniTok-3.1.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.13
3
+ Version: 3.1.1
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -1,9 +1,11 @@
1
- from UniTok.tok import BaseTok
1
+ from typing import Type, Union
2
+
2
3
  from tqdm import tqdm
3
4
 
4
5
  from .global_setting import Global
5
6
  from .analysis.lengths import Lengths
6
- from .tok import IdTok
7
+ from .tok import IdTok, BaseTok
8
+ from .vocab import Vocab
7
9
 
8
10
 
9
11
  class SeqOperator:
@@ -42,11 +44,16 @@ class Column:
42
44
  tok (BaseTok): The tokenizer of the column.
43
45
  operator (SeqOperator): The operator of the column.
44
46
  """
45
- def __init__(self, tok: BaseTok, name=None, operator: SeqOperator = None, **kwargs):
47
+ def __init__(self, tok: Union[BaseTok, Type[BaseTok]], name=None, operator: SeqOperator = None, **kwargs):
46
48
  self.tok = tok
47
49
  self.name = name or tok.vocab.name
48
50
  self.operator = operator
49
51
 
52
+ if isinstance(tok, type):
53
+ assert issubclass(tok, BaseTok)
54
+ assert name is not None, 'name must be set when tok is a class'
55
+ self.tok = tok(vocab=Vocab(name=name))
56
+
50
57
  if kwargs:
51
58
  if operator:
52
59
  raise ValueError('operator and kwargs cannot be set at the same time')
@@ -1,13 +1,13 @@
1
1
  import json
2
2
  import os
3
3
  import warnings
4
- from typing import List
4
+ from typing import List, Union
5
5
 
6
6
 
7
7
  class Col:
8
8
  def __init__(self, name, voc=None, max_length=None, padding=None, vocab=None):
9
9
  self.name: str = name
10
- self.voc: Voc = voc or vocab
10
+ self.voc: Union[Voc, str] = voc or vocab
11
11
  self.max_length = max_length
12
12
  self.padding = padding
13
13
  self.list = max_length is not None
@@ -29,7 +29,7 @@ class Voc:
29
29
  def __init__(self, name, size, cols, store_dir):
30
30
  self.name: str = name
31
31
  self.size: int = size
32
- self.cols: List[Col] = cols
32
+ self.cols: List[Union[Col, str]] = cols
33
33
  self.store_dir = store_dir
34
34
 
35
35
  def __eq__(self, other):
@@ -46,7 +46,7 @@ class Voc:
46
46
  vocab = Vocab(name=self.name).load(self.store_dir)
47
47
  vocab.save(store_dir)
48
48
 
49
- def merge(self, other):
49
+ def merge(self, other: 'Voc'):
50
50
  cols = self.cols.copy()
51
51
  for col in other.cols:
52
52
  for _col in cols:
@@ -71,13 +71,13 @@ class Meta:
71
71
 
72
72
  data = self.load()
73
73
  self.version = data['version']
74
- self.cols = data.get('cols') or data['col_info']
75
- self.vocs = data.get('vocs') or data['vocab_info']
74
+ cols = data.get('cols') or data['col_info']
75
+ vocs = data.get('vocs') or data['vocab_info']
76
76
  self.id_col = data['id_col']
77
77
 
78
78
  # build col-voc graph
79
- self.cols = {col: Col(**self.cols[col], name=col) for col in self.cols}
80
- self.vocs = {voc: Voc(**self.vocs[voc], name=voc, store_dir=self.store_dir) for voc in self.vocs}
79
+ self.cols = {col: Col(**cols[col], name=col) for col in cols} # type: dict[str, Col]
80
+ self.vocs = {voc: Voc(**vocs[voc], name=voc, store_dir=self.store_dir) for voc in vocs} # type: dict[str, Voc]
81
81
 
82
82
  # connect class objects
83
83
  for col in self.cols.values():
@@ -46,7 +46,7 @@ class UniDep:
46
46
  self.print('resize sample_size to', self._sample_size)
47
47
  self.sample_size = self._sample_size
48
48
 
49
- self.vocabs = Vocabs()
49
+ self.vocabs = Vocabs() # type: Union[Dict[str, Vocab], Vocabs]
50
50
  for vocab_name in self.vocs:
51
51
  self.vocabs.append(Vocab(name=vocab_name).load(self.store_dir))
52
52
  self.id2index = self.vocabs[self.id_voc.name].o2i
@@ -100,6 +100,11 @@ class UniDep:
100
100
  index = self._indexes[index]
101
101
  return self.pack_sample(index)
102
102
 
103
+ def __iter__(self):
104
+ """vocab obj list iterator"""
105
+ for i in range(len(self)):
106
+ yield self[i]
107
+
103
108
  def __len__(self):
104
109
  return self.sample_size
105
110
 
@@ -1,16 +1,14 @@
1
1
  import json
2
2
  import os
3
3
  import warnings
4
- from typing import Optional
4
+ from typing import Optional, Type, Dict, Union
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
 
9
9
  from .cols import Cols
10
10
  from .column import Column, IndexColumn
11
- from .tok.bert_tok import BertTok
12
- from .tok.ent_tok import EntTok
13
- from .tok.id_tok import IdTok
11
+ from .tok import BaseTok, BertTok, EntTok, IdTok
14
12
  from .vocab import Vocab
15
13
  from .vocabs import Vocabs
16
14
 
@@ -60,7 +58,7 @@ class UniTok:
60
58
 
61
59
  def __init__(self):
62
60
  self.cols = Cols()
63
- self.vocabs = Vocabs()
61
+ self.vocabs = Vocabs() # type: Union[Dict[str, Vocab], Vocabs]
64
62
  self.id_col = None # type: Optional[Column]
65
63
  self.data = None
66
64
 
@@ -70,10 +68,15 @@ class UniTok:
70
68
  'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
71
69
  return self.vocabs
72
70
 
73
- def add_col(self, col: Column):
71
+ def add_col(self, col: Union[Column, str], tok: Union[BaseTok, Type[BaseTok]] = None):
74
72
  """
75
73
  Declare a column in the DataFrame to be tokenized.
76
74
  """
75
+
76
+ if isinstance(col, str):
77
+ assert tok is not None, 'tok must be specified when col is a string'
78
+ col = Column(tok, name=col)
79
+
77
80
  if isinstance(col.tok, IdTok):
78
81
  if self.id_col:
79
82
  raise ValueError(f'already exists id column {self.id_col.name} before adding {col.name}')
@@ -123,9 +123,15 @@ class Vocab:
123
123
  return True
124
124
 
125
125
  def __iter__(self):
126
+ """vocab obj list iterator"""
126
127
  for i in range(len(self)):
127
128
  yield self.i2o[i]
128
129
 
130
+ def __getitem__(self, item):
131
+ if isinstance(item, int):
132
+ return self.i2o[item]
133
+ return self.o2i[item]
134
+
129
135
  """
130
136
  Editable Methods
131
137
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.13
3
+ Version: 3.1.1
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text()
6
6
 
7
7
  setup(
8
8
  name='UniTok',
9
- version='3.0.13',
9
+ version='3.1.1',
10
10
  keywords=['token', 'tokenizer', 'bert'],
11
11
  description='Unified Tokenizer',
12
12
  long_description=long_description,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes