UniTok 3.0.6.tar.gz → 3.0.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {UniTok-3.0.6 → UniTok-3.0.8}/PKG-INFO +1 -1
  2. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/meta.py +7 -0
  3. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unidep.py +34 -9
  4. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unitok.py +38 -33
  5. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/PKG-INFO +1 -1
  6. {UniTok-3.0.6 → UniTok-3.0.8}/setup.py +1 -1
  7. {UniTok-3.0.6 → UniTok-3.0.8}/README.md +0 -0
  8. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/__init__.py +0 -0
  9. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/__init__.py +0 -0
  10. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/lengths.py +0 -0
  11. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/plot.py +0 -0
  12. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/cols.py +0 -0
  13. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/column.py +0 -0
  14. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/global_setting.py +0 -0
  15. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/__init__.py +0 -0
  16. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/bert_tok.py +0 -0
  17. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/entity_tok.py +0 -0
  18. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/id_tok.py +0 -0
  19. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/number_tok.py +0 -0
  20. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/seq_tok.py +0 -0
  21. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/split_tok.py +0 -0
  22. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/tok.py +0 -0
  23. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/vocab.py +0 -0
  24. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/vocabs.py +0 -0
  25. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/SOURCES.txt +0 -0
  26. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/dependency_links.txt +0 -0
  27. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/requires.txt +0 -0
  28. {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/top_level.txt +0 -0
  29. {UniTok-3.0.6 → UniTok-3.0.8}/setup.cfg +0 -0
{UniTok-3.0.6 → UniTok-3.0.8}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 3.0.6
+Version: 3.0.8
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
{UniTok-3.0.6 → UniTok-3.0.8}/UniTok/meta.py
@@ -40,6 +40,13 @@ class Voc:
             'cols': [col.name for col in self.cols]
         }
 
+    def merge(self, other):
+        return Voc(
+            name=self.name,
+            size=self.size,
+            cols=list(set(self.cols + other.cols)),
+        )
+
 
 class Meta:
     VER = 'UniDep-2.0'
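The practical effect of the new `Voc.merge`: the receiver keeps its name and size, and the column references from both sides are deduplicated into a single list. A minimal stand-in sketch (not the package's real classes; the `SimpleNamespace` columns are illustrative and deduplicate by object identity):

from types import SimpleNamespace

# Stand-in columns; the real ones are UniTok Col objects. set() requires
# that a column shared by both vocs be the same (or hashable-equal) object.
title = SimpleNamespace(name='title')
abs_col = SimpleNamespace(name='abs')

class Voc:
    def __init__(self, name, size, cols):
        self.name, self.size, self.cols = name, size, cols

    def merge(self, other):  # mirrors the 3.0.8 implementation above
        return Voc(name=self.name, size=self.size,
                   cols=list(set(self.cols + other.cols)))

a = Voc('eng', 30522, [title])
b = Voc('eng', 30522, [title, abs_col])
merged = a.merge(b)
print(sorted(c.name for c in merged.cols))  # ['abs', 'title']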
{UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unidep.py
@@ -1,3 +1,4 @@
+import json
 import os
 import random
 import warnings
@@ -120,18 +121,21 @@ class UniDep:
 
     @classmethod
     def _merge_cols(cls, c1: Dict[str, Col], c2: Dict[str, Col]) -> Dict[str, Col]:
-        for col_name, col in c2.items():
-            if col_name in c1 and c1[col_name] != col:
-                raise ValueError(f'col {col_name} config conflict')
+        for name, col in c2.items():
+            if name in c1 and c1[name] != col:
+                raise ValueError(f'col {name} config conflict')
         return cls._merge(c1, c2)
 
     @classmethod
     def _merge_vocs(cls, v1: Dict[str, Voc], v2: Dict[str, Voc]) -> Dict[str, Voc]:
-        for vocab_name in v2:
-            vocab_data = v2[vocab_name]
-            if vocab_name in v1 and v1[vocab_name] != vocab_data:
-                raise ValueError(f'vocab {vocab_name} config conflict')
-        return cls._merge(v1, v2)
+        merged = v1.copy()
+        for name, vocab in v2.items():
+            if name in v1:
+                if v1[name] != vocab:
+                    raise ValueError(f'vocab {name} config conflict')
+                vocab = v1[name].merge(vocab)
+            merged[name] = vocab
+        return merged
 
     def union(self, *depots: 'UniDep'):
         """
@@ -168,6 +172,27 @@ class UniDep:
         self.sample_size = len(self._indexes)
         return self
 
+    def export(self, store_dir):
+        """
+        export unioned or filtered depot
+        """
+
+        os.makedirs(store_dir, exist_ok=True)
+        data = dict()
+
+        for sample in tqdm.tqdm(self, disable=self.silent):
+            for col_name in sample:
+                if col_name not in data:
+                    data[col_name] = []
+                data[col_name].append(sample[col_name])
+
+        for col_name in data:
+            data[col_name] = np.array(data[col_name])
+        np.save(os.path.join(store_dir, 'data.npy'), data, allow_pickle=True)
+
+        meta_data = self.meta.get_info()
+        json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
+
     """
     Deprecated properties and methods
     """
@@ -192,7 +217,7 @@ class UniDep:
 
     def get_vocab_size(self, col_name, as_vocab=False):
         warnings.warn('unidep.get_vocab_size is deprecated (will be removed in 4.x version)', DeprecationWarning)
-        vocab_id = col_name if as_vocab else self.get_vocab(col_name)
+        vocab_id = col_name if as_vocab else self.cols[col_name].voc.name
         return self.vocs[vocab_id].size
 
     def get_vocab(self, col_name):
{UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unitok.py
@@ -18,6 +18,44 @@ from .vocabs import Vocabs
 class UniTok:
     """
     Unified Tokenizer, which can be used to tokenize different types of data in a DataFrame.
+
+    Example:
+        >>> import pandas as pd
+        >>> from UniTok import UniTok, Column, Vocab
+        >>> from UniTok.tok import IdTok, EntTok, BertTok
+        >>>
+        >>> # load data
+        >>> df = pd.read_csv(
+        ...     filepath_or_buffer='news-sample.tsv',
+        ...     sep='\t',
+        ...     names=['nid', 'cat', 'subCat', 'title', 'abs', 'url', 'titEnt', 'absEnt'],
+        ...     usecols=['nid', 'cat', 'subCat', 'title', 'abs'],
+        ... )
+        >>>
+        >>> # define tokenizers
+        >>> id_tok = IdTok(name='nid')
+        >>> cat_tok = EntTok(name='cat')
+        >>> text_tok = BertTok(name='eng', vocab_dir='bert-base-uncased')
+        >>>
+        >>> # define UniTok
+        >>> tok = UniTok().add_index_col(name='nid').add_col(Column(
+        ...     name='cat',
+        ...     tok=cat_tok,
+        ... )).add_col(Column(
+        ...     name='subCat',
+        ...     tok=cat_tok,
+        ... )).add_col(Column(
+        ...     name='title',
+        ...     tok=text_tok,
+        ...     max_length=20,
+        ... )).add_col(Column(
+        ...     name='abs',
+        ...     tok=text_tok,
+        ...     max_length=30,
+        ... ))
+        >>>
+        >>> # tokenize
+        >>> tok.read_file(df).tokenize().store_data('news-sample')
     """
     VER = 'v3.0'
 
@@ -151,36 +189,3 @@ class UniTok:
         )
         json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
         return self
-
-
-if __name__ == '__main__':
-    df = pd.read_csv(
-        filepath_or_buffer='news-sample.tsv',
-        sep='\t',
-        names=['nid', 'cat', 'subCat', 'title', 'abs', 'url', 'titEnt', 'absEnt'],
-        usecols=['nid', 'cat', 'subCat', 'title', 'abs'],
-    )
-
-    ut = UniTok()
-    id_tok = IdTok(name='news')
-    cat_tok = EntTok(name='cat')
-    txt_tok = BertTok(name='english', vocab_dir='bert-base-uncased')
-    cat_tok.vocab.reserve(100)
-
-    ut.add_col(Column(
-        name='nid',
-        tok=id_tok,
-    )).add_col(Column(
-        name='cat',
-        tok=cat_tok,
-    )).add_col(Column(
-        name='subCat',
-        tok=cat_tok,
-    )).add_col(Column(
-        name='title',
-        tok=txt_tok,
-    )).add_col(Column(
-        name='abs',
-        tok=txt_tok,
-    )).read_file(df).tokenize()
-    ut.store_data('news-sample')
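The removed `__main__` demo survives in condensed form as the class docstring above. As a follow-up, a short sketch of the round trip this release enables, tokenize and store with `UniTok`, then reload with `UniDep` and re-save via the new `export`; the paths and top-level import locations are assumptions:

import pandas as pd
from UniTok import UniTok, UniDep, Column
from UniTok.tok import IdTok, EntTok, BertTok

df = pd.read_csv(
    'news-sample.tsv', sep='\t',
    names=['nid', 'cat', 'subCat', 'title', 'abs', 'url', 'titEnt', 'absEnt'],
    usecols=['nid', 'cat', 'subCat', 'title', 'abs'],
)

ut = UniTok().add_index_col(name='nid').add_col(
    Column(name='cat', tok=EntTok(name='cat'))
).add_col(
    Column(name='title', tok=BertTok(name='eng', vocab_dir='bert-base-uncased'), max_length=20)
)
ut.read_file(df).tokenize().store_data('news-sample')

# Round trip: load the stored depot and copy it with the new export().
depot = UniDep('news-sample')
depot.export('news-sample-copy')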
{UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 3.0.6
+Version: 3.0.8
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
{UniTok-3.0.6 → UniTok-3.0.8}/setup.py
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name='UniTok',
-    version='3.0.6',
+    version='3.0.8',
     keywords=['token', 'tokenizer', 'bert'],
     description='Unified Tokenizer',
     long_description=long_description,