UniTok 3.0.7__tar.gz → 3.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {UniTok-3.0.7 → UniTok-3.0.9}/PKG-INFO +1 -1
  2. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/meta.py +14 -0
  3. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/unidep.py +12 -8
  4. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok.egg-info/PKG-INFO +1 -1
  5. {UniTok-3.0.7 → UniTok-3.0.9}/setup.py +1 -1
  6. {UniTok-3.0.7 → UniTok-3.0.9}/README.md +0 -0
  7. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/__init__.py +0 -0
  8. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/analysis/__init__.py +0 -0
  9. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/analysis/lengths.py +0 -0
  10. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/analysis/plot.py +0 -0
  11. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/cols.py +0 -0
  12. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/column.py +0 -0
  13. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/global_setting.py +0 -0
  14. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/__init__.py +0 -0
  15. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/bert_tok.py +0 -0
  16. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/entity_tok.py +0 -0
  17. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/id_tok.py +0 -0
  18. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/number_tok.py +0 -0
  19. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/seq_tok.py +0 -0
  20. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/split_tok.py +0 -0
  21. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/tok/tok.py +0 -0
  22. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/unitok.py +0 -0
  23. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/vocab.py +0 -0
  24. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok/vocabs.py +0 -0
  25. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok.egg-info/SOURCES.txt +0 -0
  26. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok.egg-info/dependency_links.txt +0 -0
  27. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok.egg-info/requires.txt +0 -0
  28. {UniTok-3.0.7 → UniTok-3.0.9}/UniTok.egg-info/top_level.txt +0 -0
  29. {UniTok-3.0.7 → UniTok-3.0.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.7
3
+ Version: 3.0.9
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -40,6 +40,20 @@ class Voc:
40
40
  'cols': [col.name for col in self.cols]
41
41
  }
42
42
 
43
+ def merge(self, other):
44
+ cols = self.cols.copy()
45
+ for col in other.cols:
46
+ for _col in cols:
47
+ if col.name == _col.name:
48
+ break
49
+ else:
50
+ cols.append(col)
51
+ return Voc(
52
+ name=self.name,
53
+ size=self.size,
54
+ cols=cols,
55
+ )
56
+
43
57
 
44
58
  class Meta:
45
59
  VER = 'UniDep-2.0'
@@ -121,18 +121,21 @@ class UniDep:
121
121
 
122
122
  @classmethod
123
123
  def _merge_cols(cls, c1: Dict[str, Col], c2: Dict[str, Col]) -> Dict[str, Col]:
124
- for col_name, col in c2.items():
125
- if col_name in c1 and c1[col_name] != col:
126
- raise ValueError(f'col {col_name} config conflict')
124
+ for name, col in c2.items():
125
+ if name in c1 and c1[name] != col:
126
+ raise ValueError(f'col {name} config conflict')
127
127
  return cls._merge(c1, c2)
128
128
 
129
129
  @classmethod
130
130
  def _merge_vocs(cls, v1: Dict[str, Voc], v2: Dict[str, Voc]) -> Dict[str, Voc]:
131
- for vocab_name in v2:
132
- vocab_data = v2[vocab_name]
133
- if vocab_name in v1 and v1[vocab_name] != vocab_data:
134
- raise ValueError(f'vocab {vocab_name} config conflict')
135
- return cls._merge(v1, v2)
131
+ merged = v1.copy()
132
+ for name, vocab in v2.items():
133
+ if name in v1:
134
+ if v1[name] != vocab:
135
+ raise ValueError(f'vocab {name} config conflict')
136
+ vocab = v1[name].merge(vocab)
137
+ merged[name] = vocab
138
+ return merged
136
139
 
137
140
  def union(self, *depots: 'UniDep'):
138
141
  """
@@ -174,6 +177,7 @@ class UniDep:
174
177
  export unioned or filtered depot
175
178
  """
176
179
 
180
+ os.makedirs(store_dir, exist_ok=True)
177
181
  data = dict()
178
182
 
179
183
  for sample in tqdm.tqdm(self, disable=self.silent):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: UniTok
3
- Version: 3.0.7
3
+ Version: 3.0.9
4
4
  Summary: Unified Tokenizer
5
5
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
6
6
  Author: Jyonn Liu
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text()
6
6
 
7
7
  setup(
8
8
  name='UniTok',
9
- version='3.0.7',
9
+ version='3.0.9',
10
10
  keywords=['token', 'tokenizer', 'bert'],
11
11
  description='Unified Tokenizer',
12
12
  long_description=long_description,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes