UniTok 3.0.6__tar.gz → 3.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-3.0.6 → UniTok-3.0.8}/PKG-INFO +1 -1
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/meta.py +7 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unidep.py +34 -9
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unitok.py +37 -33
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/PKG-INFO +1 -1
- {UniTok-3.0.6 → UniTok-3.0.8}/setup.py +1 -1
- {UniTok-3.0.6 → UniTok-3.0.8}/README.md +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/__init__.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/__init__.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/lengths.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/plot.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/cols.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/column.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/global_setting.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/__init__.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/bert_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/entity_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/id_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/number_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/seq_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/split_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/vocab.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/vocabs.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/SOURCES.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/setup.cfg +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
import json
|
1
2
|
import os
|
2
3
|
import random
|
3
4
|
import warnings
|
@@ -120,18 +121,21 @@ class UniDep:
|
|
120
121
|
|
121
122
|
@classmethod
|
122
123
|
def _merge_cols(cls, c1: Dict[str, Col], c2: Dict[str, Col]) -> Dict[str, Col]:
|
123
|
-
for
|
124
|
-
if
|
125
|
-
raise ValueError(f'col {
|
124
|
+
for name, col in c2.items():
|
125
|
+
if name in c1 and c1[name] != col:
|
126
|
+
raise ValueError(f'col {name} config conflict')
|
126
127
|
return cls._merge(c1, c2)
|
127
128
|
|
128
129
|
@classmethod
|
129
130
|
def _merge_vocs(cls, v1: Dict[str, Voc], v2: Dict[str, Voc]) -> Dict[str, Voc]:
|
130
|
-
|
131
|
-
|
132
|
-
if
|
133
|
-
|
134
|
-
|
131
|
+
merged = v1.copy()
|
132
|
+
for name, vocab in v2.items():
|
133
|
+
if name in v1:
|
134
|
+
if v1[name] != vocab:
|
135
|
+
raise ValueError(f'vocab {name} config conflict')
|
136
|
+
vocab = v1[name].merge(vocab)
|
137
|
+
merged[name] = vocab
|
138
|
+
return merged
|
135
139
|
|
136
140
|
def union(self, *depots: 'UniDep'):
|
137
141
|
"""
|
@@ -168,6 +172,27 @@ class UniDep:
|
|
168
172
|
self.sample_size = len(self._indexes)
|
169
173
|
return self
|
170
174
|
|
175
|
+
def export(self, store_dir):
|
176
|
+
"""
|
177
|
+
export unioned or filtered depot
|
178
|
+
"""
|
179
|
+
|
180
|
+
os.makedirs(store_dir, exist_ok=True)
|
181
|
+
data = dict()
|
182
|
+
|
183
|
+
for sample in tqdm.tqdm(self, disable=self.silent):
|
184
|
+
for col_name in sample:
|
185
|
+
if col_name not in data:
|
186
|
+
data[col_name] = []
|
187
|
+
data[col_name].append(sample[col_name])
|
188
|
+
|
189
|
+
for col_name in data:
|
190
|
+
data[col_name] = np.array(data[col_name])
|
191
|
+
np.save(os.path.join(store_dir, 'data.npy'), data, allow_pickle=True)
|
192
|
+
|
193
|
+
meta_data = self.meta.get_info()
|
194
|
+
json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
|
195
|
+
|
171
196
|
"""
|
172
197
|
Deprecated properties and methods
|
173
198
|
"""
|
@@ -192,7 +217,7 @@ class UniDep:
|
|
192
217
|
|
193
218
|
def get_vocab_size(self, col_name, as_vocab=False):
|
194
219
|
warnings.warn('unidep.get_vocab_size is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
195
|
-
vocab_id = col_name if as_vocab else self.
|
220
|
+
vocab_id = col_name if as_vocab else self.cols[col_name].voc.name
|
196
221
|
return self.vocs[vocab_id].size
|
197
222
|
|
198
223
|
def get_vocab(self, col_name):
|
@@ -18,6 +18,43 @@ from .vocabs import Vocabs
|
|
18
18
|
class UniTok:
|
19
19
|
"""
|
20
20
|
Unified Tokenizer, which can be used to tokenize different types of data in a DataFrame.
|
21
|
+
|
22
|
+
Example:
|
23
|
+
>>> import pandas as pd
|
24
|
+
>>> from UniTok import UniTok, Column, Vocab
|
25
|
+
>>>
|
26
|
+
>>> # load data
|
27
|
+
>>> df = pd.read_csv(
|
28
|
+
... filepath_or_buffer='news-sample.tsv',
|
29
|
+
... sep='\t',
|
30
|
+
... names=['nid', 'cat', 'subCat', 'title', 'abs', 'url', 'titEnt', 'absEnt'],
|
31
|
+
... usecols=['nid', 'cat', 'subCat', 'title', 'abs'],
|
32
|
+
... )
|
33
|
+
>>>
|
34
|
+
>>> # define tokenizers
|
35
|
+
>>> id_tok = IdTok(name='nid')
|
36
|
+
>>> cat_tok = EntTok(name='cat')
|
37
|
+
>>> text_tok = BertTok(name='eng', vocab_dir='bert-base-uncased')
|
38
|
+
>>>
|
39
|
+
>>> # define UniTok
|
40
|
+
>>> tok = UniTok().add_index_col(name='nid').add_col(Column(
|
41
|
+
... name='cat',
|
42
|
+
... tok=cat_tok,
|
43
|
+
... )).add_col(Column(
|
44
|
+
... name='subCat',
|
45
|
+
... tok=cat_tok,
|
46
|
+
... )).add_col(Column(
|
47
|
+
... name='title',
|
48
|
+
... tok=text_tok,
|
49
|
+
... max_length=20,
|
50
|
+
... )).add_col(Column(
|
51
|
+
... name='abs',
|
52
|
+
... tok=text_tok,
|
53
|
+
... max_length=30,
|
54
|
+
... ))
|
55
|
+
>>>
|
56
|
+
>>> # tokenize
|
57
|
+
>>> tok.read_file(df).tokenize().store_data('news-sample')
|
21
58
|
"""
|
22
59
|
VER = 'v3.0'
|
23
60
|
|
@@ -151,36 +188,3 @@ class UniTok:
|
|
151
188
|
)
|
152
189
|
json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
|
153
190
|
return self
|
154
|
-
|
155
|
-
|
156
|
-
if __name__ == '__main__':
|
157
|
-
df = pd.read_csv(
|
158
|
-
filepath_or_buffer='news-sample.tsv',
|
159
|
-
sep='\t',
|
160
|
-
names=['nid', 'cat', 'subCat', 'title', 'abs', 'url', 'titEnt', 'absEnt'],
|
161
|
-
usecols=['nid', 'cat', 'subCat', 'title', 'abs'],
|
162
|
-
)
|
163
|
-
|
164
|
-
ut = UniTok()
|
165
|
-
id_tok = IdTok(name='news')
|
166
|
-
cat_tok = EntTok(name='cat')
|
167
|
-
txt_tok = BertTok(name='english', vocab_dir='bert-base-uncased')
|
168
|
-
cat_tok.vocab.reserve(100)
|
169
|
-
|
170
|
-
ut.add_col(Column(
|
171
|
-
name='nid',
|
172
|
-
tok=id_tok,
|
173
|
-
)).add_col(Column(
|
174
|
-
name='cat',
|
175
|
-
tok=cat_tok,
|
176
|
-
)).add_col(Column(
|
177
|
-
name='subCat',
|
178
|
-
tok=cat_tok,
|
179
|
-
)).add_col(Column(
|
180
|
-
name='title',
|
181
|
-
tok=txt_tok,
|
182
|
-
)).add_col(Column(
|
183
|
-
name='abs',
|
184
|
-
tok=txt_tok,
|
185
|
-
)).read_file(df).tokenize()
|
186
|
-
ut.store_data('news-sample')
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|