UniTok 3.0.13__tar.gz → 3.1.1__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {UniTok-3.0.13 → UniTok-3.1.1}/PKG-INFO +1 -1
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/column.py +10 -3
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/meta.py +8 -8
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/unidep.py +6 -1
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/unitok.py +9 -6
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/vocab.py +6 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/PKG-INFO +1 -1
- {UniTok-3.0.13 → UniTok-3.1.1}/setup.py +1 -1
- {UniTok-3.0.13 → UniTok-3.1.1}/README.md +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/__init__.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/analysis/__init__.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/analysis/lengths.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/analysis/plot.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/cols.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/global_setting.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/__init__.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/bert_tok.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/ent_tok.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/id_tok.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/number_tok.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/seq_tok.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/split_tok.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/tok/tok.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok/vocabs.py +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/SOURCES.txt +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-3.0.13 → UniTok-3.1.1}/setup.cfg +0 -0
@@ -1,9 +1,11 @@
|
|
1
|
-
from
|
1
|
+
from typing import Type, Union
|
2
|
+
|
2
3
|
from tqdm import tqdm
|
3
4
|
|
4
5
|
from .global_setting import Global
|
5
6
|
from .analysis.lengths import Lengths
|
6
|
-
from .tok import IdTok
|
7
|
+
from .tok import IdTok, BaseTok
|
8
|
+
from .vocab import Vocab
|
7
9
|
|
8
10
|
|
9
11
|
class SeqOperator:
|
@@ -42,11 +44,16 @@ class Column:
|
|
42
44
|
tok (BaseTok): The tokenizer of the column.
|
43
45
|
operator (SeqOperator): The operator of the column.
|
44
46
|
"""
|
45
|
-
def __init__(self, tok: BaseTok, name=None, operator: SeqOperator = None, **kwargs):
|
47
|
+
def __init__(self, tok: Union[BaseTok, Type[BaseTok]], name=None, operator: SeqOperator = None, **kwargs):
|
46
48
|
self.tok = tok
|
47
49
|
self.name = name or tok.vocab.name
|
48
50
|
self.operator = operator
|
49
51
|
|
52
|
+
if isinstance(tok, type):
|
53
|
+
assert issubclass(tok, BaseTok)
|
54
|
+
assert name is not None, 'name must be set when tok is a class'
|
55
|
+
self.tok = tok(vocab=Vocab(name=name))
|
56
|
+
|
50
57
|
if kwargs:
|
51
58
|
if operator:
|
52
59
|
raise ValueError('operator and kwargs cannot be set at the same time')
|
@@ -1,13 +1,13 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
3
|
import warnings
|
4
|
-
from typing import List
|
4
|
+
from typing import List, Union
|
5
5
|
|
6
6
|
|
7
7
|
class Col:
|
8
8
|
def __init__(self, name, voc=None, max_length=None, padding=None, vocab=None):
|
9
9
|
self.name: str = name
|
10
|
-
self.voc: Voc = voc or vocab
|
10
|
+
self.voc: Union[Voc, str] = voc or vocab
|
11
11
|
self.max_length = max_length
|
12
12
|
self.padding = padding
|
13
13
|
self.list = max_length is not None
|
@@ -29,7 +29,7 @@ class Voc:
|
|
29
29
|
def __init__(self, name, size, cols, store_dir):
|
30
30
|
self.name: str = name
|
31
31
|
self.size: int = size
|
32
|
-
self.cols: List[Col] = cols
|
32
|
+
self.cols: List[Union[Col, str]] = cols
|
33
33
|
self.store_dir = store_dir
|
34
34
|
|
35
35
|
def __eq__(self, other):
|
@@ -46,7 +46,7 @@ class Voc:
|
|
46
46
|
vocab = Vocab(name=self.name).load(self.store_dir)
|
47
47
|
vocab.save(store_dir)
|
48
48
|
|
49
|
-
def merge(self, other):
|
49
|
+
def merge(self, other: 'Voc'):
|
50
50
|
cols = self.cols.copy()
|
51
51
|
for col in other.cols:
|
52
52
|
for _col in cols:
|
@@ -71,13 +71,13 @@ class Meta:
|
|
71
71
|
|
72
72
|
data = self.load()
|
73
73
|
self.version = data['version']
|
74
|
-
|
75
|
-
|
74
|
+
cols = data.get('cols') or data['col_info']
|
75
|
+
vocs = data.get('vocs') or data['vocab_info']
|
76
76
|
self.id_col = data['id_col']
|
77
77
|
|
78
78
|
# build col-voc graph
|
79
|
-
self.cols = {col: Col(**
|
80
|
-
self.vocs = {voc: Voc(**
|
79
|
+
self.cols = {col: Col(**cols[col], name=col) for col in cols} # type: dict[str, Col]
|
80
|
+
self.vocs = {voc: Voc(**vocs[voc], name=voc, store_dir=self.store_dir) for voc in vocs} # type: dict[str, Voc]
|
81
81
|
|
82
82
|
# connect class objects
|
83
83
|
for col in self.cols.values():
|
@@ -46,7 +46,7 @@ class UniDep:
|
|
46
46
|
self.print('resize sample_size to', self._sample_size)
|
47
47
|
self.sample_size = self._sample_size
|
48
48
|
|
49
|
-
self.vocabs = Vocabs()
|
49
|
+
self.vocabs = Vocabs() # type: Union[Dict[str, Vocab], Vocabs]
|
50
50
|
for vocab_name in self.vocs:
|
51
51
|
self.vocabs.append(Vocab(name=vocab_name).load(self.store_dir))
|
52
52
|
self.id2index = self.vocabs[self.id_voc.name].o2i
|
@@ -100,6 +100,11 @@ class UniDep:
|
|
100
100
|
index = self._indexes[index]
|
101
101
|
return self.pack_sample(index)
|
102
102
|
|
103
|
+
def __iter__(self):
|
104
|
+
"""vocab obj list iterator"""
|
105
|
+
for i in range(len(self)):
|
106
|
+
yield self[i]
|
107
|
+
|
103
108
|
def __len__(self):
|
104
109
|
return self.sample_size
|
105
110
|
|
@@ -1,16 +1,14 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
3
|
import warnings
|
4
|
-
from typing import Optional
|
4
|
+
from typing import Optional, Type, Dict, Union
|
5
5
|
|
6
6
|
import numpy as np
|
7
7
|
import pandas as pd
|
8
8
|
|
9
9
|
from .cols import Cols
|
10
10
|
from .column import Column, IndexColumn
|
11
|
-
from .tok
|
12
|
-
from .tok.ent_tok import EntTok
|
13
|
-
from .tok.id_tok import IdTok
|
11
|
+
from .tok import BaseTok, BertTok, EntTok, IdTok
|
14
12
|
from .vocab import Vocab
|
15
13
|
from .vocabs import Vocabs
|
16
14
|
|
@@ -60,7 +58,7 @@ class UniTok:
|
|
60
58
|
|
61
59
|
def __init__(self):
|
62
60
|
self.cols = Cols()
|
63
|
-
self.vocabs = Vocabs()
|
61
|
+
self.vocabs = Vocabs() # type: Union[Dict[str, Vocab], Vocabs]
|
64
62
|
self.id_col = None # type: Optional[Column]
|
65
63
|
self.data = None
|
66
64
|
|
@@ -70,10 +68,15 @@ class UniTok:
|
|
70
68
|
'use vocabs instead (will be removed in 4.x version)', DeprecationWarning)
|
71
69
|
return self.vocabs
|
72
70
|
|
73
|
-
def add_col(self, col: Column):
|
71
|
+
def add_col(self, col: Union[Column, str], tok: Union[BaseTok, Type[BaseTok]] = None):
|
74
72
|
"""
|
75
73
|
Declare a column in the DataFrame to be tokenized.
|
76
74
|
"""
|
75
|
+
|
76
|
+
if isinstance(col, str):
|
77
|
+
assert tok is not None, 'tok must be specified when col is a string'
|
78
|
+
col = Column(tok, name=col)
|
79
|
+
|
77
80
|
if isinstance(col.tok, IdTok):
|
78
81
|
if self.id_col:
|
79
82
|
raise ValueError(f'already exists id column {self.id_col.name} before adding {col.name}')
|
@@ -123,9 +123,15 @@ class Vocab:
|
|
123
123
|
return True
|
124
124
|
|
125
125
|
def __iter__(self):
|
126
|
+
"""vocab obj list iterator"""
|
126
127
|
for i in range(len(self)):
|
127
128
|
yield self.i2o[i]
|
128
129
|
|
130
|
+
def __getitem__(self, item):
|
131
|
+
if isinstance(item, int):
|
132
|
+
return self.i2o[item]
|
133
|
+
return self.o2i[item]
|
134
|
+
|
129
135
|
"""
|
130
136
|
Editable Methods
|
131
137
|
"""
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|