UniTok 3.5.2.tar.gz → 4.0.0.tar.gz

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (73)
  1. {UniTok-3.5.2 → UniTok-4.0.0}/PKG-INFO +15 -15
  2. {UniTok-3.5.2 → UniTok-4.0.0}/README.md +13 -13
  3. UniTok-4.0.0/UniTok/__init__.py +37 -0
  4. UniTok-4.0.0/UniTok/__main__.py +78 -0
  5. UniTok-4.0.0/UniTok/job.py +76 -0
  6. UniTok-4.0.0/UniTok/meta.py +136 -0
  7. UniTok-4.0.0/UniTok/status.py +44 -0
  8. UniTok-4.0.0/UniTok/tokenizer/__init__.py +18 -0
  9. UniTok-4.0.0/UniTok/tokenizer/base_tokenizer.py +78 -0
  10. UniTok-4.0.0/UniTok/tokenizer/digit_tokenizer.py +33 -0
  11. UniTok-4.0.0/UniTok/tokenizer/entity_tokenizer.py +13 -0
  12. UniTok-4.0.0/UniTok/tokenizer/split_tokenizer.py +14 -0
  13. UniTok-4.0.0/UniTok/tokenizer/transformers_tokenizer.py +50 -0
  14. UniTok-4.0.0/UniTok/tokenizer/union_tokenizer.py +17 -0
  15. UniTok-4.0.0/UniTok/tokenizer/unknown_tokenizer.py +35 -0
  16. UniTok-4.0.0/UniTok/unitok.py +411 -0
  17. UniTok-4.0.0/UniTok/utils/__init__.py +21 -0
  18. UniTok-4.0.0/UniTok/utils/class_pool.py +107 -0
  19. UniTok-4.0.0/UniTok/utils/data.py +15 -0
  20. UniTok-4.0.0/UniTok/utils/function.py +6 -0
  21. UniTok-4.0.0/UniTok/utils/handler/__init__.py +7 -0
  22. UniTok-4.0.0/UniTok/utils/handler/json_handler.py +28 -0
  23. UniTok-4.0.0/UniTok/utils/handler/pkl_handler.py +19 -0
  24. UniTok-4.0.0/UniTok/utils/hub/__init__.py +4 -0
  25. UniTok-4.0.0/UniTok/utils/hub/hub.py +44 -0
  26. UniTok-4.0.0/UniTok/utils/hub/param_hub.py +6 -0
  27. UniTok-4.0.0/UniTok/utils/index_set/__init__.py +15 -0
  28. UniTok-4.0.0/UniTok/utils/index_set/index_set.py +71 -0
  29. UniTok-4.0.0/UniTok/utils/index_set/job_set.py +25 -0
  30. UniTok-4.0.0/UniTok/utils/index_set/tokenizer_set.py +19 -0
  31. UniTok-4.0.0/UniTok/utils/index_set/vocabulary_set.py +19 -0
  32. UniTok-4.0.0/UniTok/utils/instance.py +18 -0
  33. UniTok-4.0.0/UniTok/utils/map.py +3 -0
  34. UniTok-4.0.0/UniTok/utils/space.py +29 -0
  35. UniTok-4.0.0/UniTok/utils/symbol.py +23 -0
  36. UniTok-4.0.0/UniTok/utils/verbose.py +48 -0
  37. UniTok-4.0.0/UniTok/vocabulary/__init__.py +11 -0
  38. UniTok-4.0.0/UniTok/vocabulary/counter.py +85 -0
  39. UniTok-4.0.0/UniTok/vocabulary/vocabulary.py +150 -0
  40. {UniTok-3.5.2 → UniTok-4.0.0}/UniTok.egg-info/PKG-INFO +15 -15
  41. UniTok-4.0.0/UniTok.egg-info/SOURCES.txt +71 -0
  42. UniTok-4.0.0/UniTok.egg-info/entry_points.txt +5 -0
  43. UniTok-4.0.0/UniTok.egg-info/top_level.txt +2 -0
  44. UniTok-4.0.0/UniTokv3/__main__.py +169 -0
  45. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/bert_tok.py +1 -1
  46. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/ent_tok.py +1 -1
  47. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/id_tok.py +1 -1
  48. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/number_tok.py +1 -1
  49. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/seq_tok.py +1 -1
  50. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/split_tok.py +1 -1
  51. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/tok.py +1 -1
  52. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/unidep.py +9 -2
  53. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/unitok.py +10 -3
  54. {UniTok-3.5.2 → UniTok-4.0.0}/setup.py +5 -3
  55. UniTok-3.5.2/UniTok/__main__.py +0 -42
  56. UniTok-3.5.2/UniTok.egg-info/SOURCES.txt +0 -30
  57. UniTok-3.5.2/UniTok.egg-info/entry_points.txt +0 -3
  58. UniTok-3.5.2/UniTok.egg-info/top_level.txt +0 -1
  59. {UniTok-3.5.2 → UniTok-4.0.0}/UniTok.egg-info/dependency_links.txt +0 -0
  60. {UniTok-3.5.2 → UniTok-4.0.0}/UniTok.egg-info/requires.txt +0 -0
  61. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/__init__.py +0 -0
  62. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/analysis/__init__.py +0 -0
  63. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/analysis/lengths.py +0 -0
  64. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/analysis/plot.py +0 -0
  65. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/cols.py +0 -0
  66. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/column.py +0 -0
  67. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/fut.py +0 -0
  68. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/global_setting.py +0 -0
  69. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/meta.py +0 -0
  70. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/tok/__init__.py +0 -0
  71. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/vocab.py +0 -0
  72. {UniTok-3.5.2/UniTok → UniTok-4.0.0/UniTokv3}/vocabs.py +0 -0
  73. {UniTok-3.5.2 → UniTok-4.0.0}/setup.cfg +0 -0
@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 3.5.2
+Version: 4.0.0
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
-Author-email: i@6-79.cn
+Author-email: liu@qijiong.work
 License: MIT Licence
 Keywords: token,tokenizer
 Platform: any
@@ -73,8 +73,8 @@ UniTok提供了一整套的数据预处理工具,包括不同类型的分词
 通过以下代码,我们可以针对每个文件构建一个UniTok对象:
 
 ```python
-from UniTok import UniTok, Column, Vocab
-from UniTok.tok import IdTok, BertTok, EntTok, SplitTok, NumberTok
+from UniTokv3 import UniTok, Column, Vocab
+from UniTokv3.tok import IdTok, BertTok, EntTok, SplitTok, NumberTok
 
 # Create a news id vocab, commonly used in news data, history data, and interaction data.
 nid_vocab = Vocab('nid')
@@ -110,7 +110,7 @@ news_ut.add_col(Column(
 news_ut.read('news.tsv', sep='\t')
 
 # Tokenize the data.
-news_ut.tokenize()
+news_ut.tokenize()
 
 # Store the tokenized data.
 news_ut.store('data/news')
@@ -130,10 +130,10 @@ user_ut.add_col(Column(
 ))
 
 # Read the data file.
-user_ut.read('user.tsv', sep='\t')
+user_ut.read('user.tsv', sep='\t')
 
 # Tokenize the data.
-user_ut.tokenize()
+user_ut.tokenize()
 
 # Store the tokenized data.
 user_ut.store('data/user')
@@ -142,16 +142,16 @@ user_ut.store('data/user')
 def inter_tokenize(mode):
     # Create an interaction UniTok object.
     inter_ut = UniTok()
-
+
     # Add columns to the interaction UniTok object.
     inter_ut.add_index_col(
         # The index column in the interaction data is automatically generated, and the tokenizer does not need to be specified.
     ).add_col(Column(
         # Align with the uid column in user_ut.
-        tok=EntTok(vocab=uid_vocab),
+        tok=EntTok(vocab=uid_vocab),
     )).add_col(Column(
         # Align with the nid column in news_ut.
-        tok=EntTok(vocab=nid_vocab),
+        tok=EntTok(vocab=nid_vocab),
     )).add_col(Column(
         name='label',
         # The label column in the interaction data only has two values, 0 and 1.
@@ -160,14 +160,14 @@ def inter_tokenize(mode):
 
     # Read the data file.
     inter_ut.read(f'{mode}.tsv', sep='\t')
-
+
     # Tokenize the data.
-    inter_ut.tokenize()
-
+    inter_ut.tokenize()
+
     # Store the tokenized data.
     inter_ut.store(mode)
 
-
+
 inter_tokenize('data/train')
 inter_tokenize('data/dev')
 inter_tokenize('data/test')
@@ -184,7 +184,7 @@ UniDep 是一个数据依赖处理类,可以用于加载和访问 UniTok 预
 以下是一个简单的使用示例:
 
 ```python
-from UniTok import UniDep
+from UniTokv3 import UniDep
 
 # Load the data.
 dep = UniDep('data/news')
@@ -61,8 +61,8 @@ UniTok提供了一整套的数据预处理工具,包括不同类型的分词
 通过以下代码,我们可以针对每个文件构建一个UniTok对象:
 
 ```python
-from UniTok import UniTok, Column, Vocab
-from UniTok.tok import IdTok, BertTok, EntTok, SplitTok, NumberTok
+from UniTokv3 import UniTok, Column, Vocab
+from UniTokv3.tok import IdTok, BertTok, EntTok, SplitTok, NumberTok
 
 # Create a news id vocab, commonly used in news data, history data, and interaction data.
 nid_vocab = Vocab('nid')
@@ -98,7 +98,7 @@ news_ut.add_col(Column(
 news_ut.read('news.tsv', sep='\t')
 
 # Tokenize the data.
-news_ut.tokenize()
+news_ut.tokenize()
 
 # Store the tokenized data.
 news_ut.store('data/news')
@@ -118,10 +118,10 @@ user_ut.add_col(Column(
 ))
 
 # Read the data file.
-user_ut.read('user.tsv', sep='\t')
+user_ut.read('user.tsv', sep='\t')
 
 # Tokenize the data.
-user_ut.tokenize()
+user_ut.tokenize()
 
 # Store the tokenized data.
 user_ut.store('data/user')
@@ -130,16 +130,16 @@ user_ut.store('data/user')
 def inter_tokenize(mode):
     # Create an interaction UniTok object.
     inter_ut = UniTok()
-
+
     # Add columns to the interaction UniTok object.
     inter_ut.add_index_col(
         # The index column in the interaction data is automatically generated, and the tokenizer does not need to be specified.
     ).add_col(Column(
         # Align with the uid column in user_ut.
-        tok=EntTok(vocab=uid_vocab),
+        tok=EntTok(vocab=uid_vocab),
     )).add_col(Column(
         # Align with the nid column in news_ut.
-        tok=EntTok(vocab=nid_vocab),
+        tok=EntTok(vocab=nid_vocab),
     )).add_col(Column(
         name='label',
         # The label column in the interaction data only has two values, 0 and 1.
@@ -148,14 +148,14 @@ def inter_tokenize(mode):
 
     # Read the data file.
     inter_ut.read(f'{mode}.tsv', sep='\t')
-
+
     # Tokenize the data.
-    inter_ut.tokenize()
-
+    inter_ut.tokenize()
+
     # Store the tokenized data.
     inter_ut.store(mode)
 
-
+
 inter_tokenize('data/train')
 inter_tokenize('data/dev')
 inter_tokenize('data/test')
@@ -172,7 +172,7 @@ UniDep 是一个数据依赖处理类,可以用于加载和访问 UniTok 预
 以下是一个简单的使用示例:
 
 ```python
-from UniTok import UniDep
+from UniTokv3 import UniDep
 
 # Load the data.
 dep = UniDep('data/news')
@@ -0,0 +1,37 @@
+from unitok.utils import Verbose, warning, error, info, debug
+from unitok.utils import Symbol, Symbols
+from unitok.utils import JsonHandler, PickleHandler
+from unitok.utils import Instance, Space, Map
+
+from unitok.utils.hub import Hub, ParamHub
+from unitok.vocabulary import Vocab, Vocabulary, VocabHub, VocabularyHub
+from unitok.tokenizer import BaseTokenizer, TokenizerHub
+from unitok.tokenizer import EntityTokenizer, EntitiesTokenizer
+from unitok.tokenizer import TransformersTokenizer, BertTokenizer
+from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
+from unitok.job import Job, JobHub
+
+from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
+
+from unitok.meta import Meta
+from unitok.status import Status
+from unitok.unitok import UniTok
+
+
+__all__ = [
+    'Verbose', 'warning', 'error', 'info', 'debug',
+    'Symbol', 'Symbols',
+    'JsonHandler', 'PickleHandler',
+    'Instance', 'Space', 'Map',
+    'Hub', 'ParamHub',
+    'Vocab', 'Vocabulary', 'VocabHub', 'VocabularyHub',
+    'BaseTokenizer', 'TokenizerHub',
+    'EntityTokenizer', 'EntitiesTokenizer',
+    'TransformersTokenizer', 'BertTokenizer',
+    'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
+    'Job', 'JobHub',
+    'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet',
+    'Meta',
+    'Status',
+    'UniTok',
+]
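
The new `unitok` package exposes the whole v4 surface from a single namespace, while the legacy v3 API stays importable as `UniTokv3`. Below is a minimal sketch of code built on the flat v4 imports; the vocabulary and tokenizer names are illustrative, and the `EntityTokenizer`/`DigitTokenizer` constructors are assumed to follow the `BaseTokenizer(vocab=..., **kwargs)` signature shown later in this diff.

```python
# Hedged sketch of the v4 import surface declared above; names are illustrative.
from unitok import Vocab, EntityTokenizer, DigitTokenizer

item_vocab = Vocab(name='item')                          # explicit vocabulary object
item_tok = EntityTokenizer(vocab=item_vocab)             # one value -> one id
label_tok = DigitTokenizer(vocab='label', vocab_size=2)  # fixed 0/1 label vocabulary

print(item_tok)   # e.g. EntityTokenizer(auto_a1b2c3, vocab=item)
print(label_tok)
```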
@@ -0,0 +1,78 @@
+import argparse
+
+import pandas as pd
+
+from unitok.tokenizer import BaseTokenizer
+from unitok.unitok import UniTok
+from unitok.utils.class_pool import ClassPool
+
+
+def integrate():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
+    parser.add_argument('--file', '-f', type=str, help='csv, tsv, parquet format data')
+    parser.add_argument('--lib', type=str, default=None, help='custom tokenizer library')
+    parser.add_argument('--column', '-c', type=str, help='column name to tokenize')
+    parser.add_argument('--name', '-n', type=str, help='job name and export column name')
+    parser.add_argument('--vocab', '-v', type=str, default=None, help='vocabulary name')
+    parser.add_argument('--tokenizer', '-t', type=str, default=None, help='tokenizer classname')
+    parser.add_argument('--tokenizer_id', type=str, default=None, help='tokenizer id')
+    parser.add_argument('--truncate', type=int, help='truncate length', default=None)
+    args, unknown_args = parser.parse_known_args()
+
+    tokenizer_params = dict()
+    current_param = None
+    for arg in unknown_args:
+        if current_param:
+            tokenizer_params[current_param] = arg
+            current_param = None
+        if arg.startswith('--t.'):
+            current_param = arg[4:]
+        elif arg.startswith('--tokenizer.'):
+            current_param = arg[11:]
+
+    if args.file.endswith('.csv') or args.file.endswith('.tsv'):
+        df = pd.read_csv(args.file, sep='\t')
+    elif args.file.endswith('.parquet'):
+        df = pd.read_parquet(args.file)
+    else:
+        raise ValueError(f'Unsupported file format: {args.file}')
+
+    with UniTok.load(args.path, tokenizer_lib=args.lib) as ut:
+        if args.tokenizer_id:
+            for t in ut.meta.tokenizers:  # type: BaseTokenizer
+                if t.get_tokenizer_id() == args.tokenizer_id:
+                    tokenizer = t
+                    break
+            else:
+                raise ValueError(f'Unknown tokenizer id: {args.tokenizer_id}')
+        else:
+            assert args.tokenizer is not None and args.vocab is not None, 'Tokenizer classname and vocabulary must be specified'
+            tokenizers = ClassPool.tokenizers(args.lib)
+            assert args.tokenizer in tokenizers, f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}'
+            tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)
+
+        ut.add_job(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
+        ut.tokenize(df).save(args.path)
+
+
+def summarize():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
+    args, _ = parser.parse_known_args()
+
+    with UniTok.load(args.path) as ut:
+        ut.summarize()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--action', '-a', type=str, default='summarize', choices=['summarize', 'integrate'])
+
+    args, _ = parser.parse_known_args()
+    action = args.action
+
+    if action == 'integrate':
+        integrate()
+    else:
+        summarize()
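
The console entry points declared in `entry_points.txt` (not shown here) wrap these functions: `integrate` appends one tokenization job to an existing UniTok directory, and `summarize` prints its contents. As a hedged sketch, the same flow can be driven from Python directly; the file, directory, and column names below are illustrative.

```python
# Hedged sketch of the flow integrate() implements, without argparse; paths are illustrative.
import pandas as pd
from unitok import UniTok
from unitok.tokenizer import EntityTokenizer

df = pd.read_csv('news.tsv', sep='\t')                    # same reader used above for csv/tsv input

with UniTok.load('data/news') as ut:                      # open an existing tokenized directory
    tokenizer = EntityTokenizer(vocab='category')
    ut.add_job(tokenizer=tokenizer, column='category', name='category', truncate=None)
    ut.tokenize(df).save('data/news')                     # mirrors ut.tokenize(df).save(args.path)
```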
@@ -0,0 +1,76 @@
+from unitok.tokenizer import BaseTokenizer
+from unitok.utils import Symbols, Instance
+from unitok.utils.hub import Hub
+
+
+class Job:
+    def __init__(
+            self,
+            tokenizer: BaseTokenizer,
+            column: str,
+            name: str = None,
+            truncate: int = None,
+            order: int = -1,
+            key: bool = False,
+            max_len: int = 0,
+    ):
+        self.tokenizer: BaseTokenizer = tokenizer
+        self.column: str = column
+        self.name: str = name
+        self.truncate: int = truncate
+        self.order: int = order
+        self.slice: slice = self.get_slice(truncate)
+        self.key: bool = key
+        self.max_len = max_len
+
+        JobHub.add(self.name, self)
+
+    @property
+    def return_list(self):
+        return self.truncate is not None
+
+    def clone(self, **kwargs):
+        attributes = {'tokenizer', 'column', 'name', 'truncate', 'order', 'key', 'max_len'}
+        params = dict()
+        for attr in attributes:
+            params[attr] = kwargs[attr] if attr in kwargs else getattr(self, attr)
+
+        return Job(**params)
+
+    def __str__(self):
+        if self.key:
+            return f'Job({self.column} => {self.name}) [PK]'
+        return f'Job({self.column} => {self.name})'
+
+    def __repr__(self):
+        return str(self)
+
+    @property
+    def is_processed(self):
+        return self.order >= 0
+
+    def json(self):
+        column = str(Symbols.idx) if self.column is Symbols.idx else self.column
+        return {
+            'name': self.name,
+            'column': column,
+            'tokenizer': self.tokenizer.get_tokenizer_id(),
+            'truncate': self.truncate,
+            'order': self.order,
+            'key': self.key,
+            'max_len': self.max_len,
+        }
+
+    @staticmethod
+    def get_slice(truncate):
+        if truncate is None:
+            truncate = 0
+        if truncate > 0:
+            return slice(0, truncate)
+        if truncate < 0:
+            return slice(truncate, None)
+        return slice(None)
+
+
+class JobHub(Hub[Job]):
+    _instance = Instance(compulsory_space=True)
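
A `Job` binds one tokenizer to one source column; `truncate` both switches the job into list mode (`return_list`) and selects which part of the token sequence is kept via `get_slice`. Below is a small sketch of that slicing rule, using the static method directly so no hub registration is involved; the token list is illustrative.

```python
# Hedged sketch of Job.get_slice semantics shown above.
from unitok.job import Job

tokens = list(range(10))

assert tokens[Job.get_slice(None)] == tokens       # no truncation: slice(None)
assert tokens[Job.get_slice(3)] == [0, 1, 2]       # positive: keep the first N tokens
assert tokens[Job.get_slice(-3)] == [7, 8, 9]      # negative: keep the last N tokens
```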
@@ -0,0 +1,136 @@
+import json
+import os
+from datetime import datetime
+
+from unitok.utils.verbose import warning
+from unitok.job import Job
+from unitok.tokenizer import TokenizerHub
+from unitok.tokenizer.union_tokenizer import UnionTokenizer
+from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
+from unitok.utils import Symbols
+from unitok.utils.handler import JsonHandler
+from unitok.utils.class_pool import ClassPool
+from unitok.utils.index_set import VocabSet, TokenizerSet, JobSet
+from unitok.vocabulary import Vocab, VocabHub
+
+
+class Meta:
+    version = 'unidep-v4beta'
+
+    def __init__(self):
+        self.note = ('Not compatible with unitok-v3 or lower version, '
+                     'please upgrade by `pip install unitok>4.0.0` to load the data.')
+        self.website = 'https://unitok.github.io'
+        self.modified_at = self.created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        self.vocabularies = VocabSet()
+        self.tokenizers = TokenizerSet()
+        self.jobs = JobSet()
+
+    @staticmethod
+    def parse_vocabulary(name: str, **kwargs):
+        return Vocab(name)
+
+    @staticmethod
+    def parse_tokenizer(tokenizer_id: str, classname: str, vocab: str, params: dict):
+        tokenizer_classes = ClassPool.tokenizers()
+
+        if not VocabHub.has(vocab):
+            raise ValueError(f"(unitok.meta) Vocabulary {vocab} not found in the vocabulary hub.")
+        vocab = VocabHub.get(vocab)
+
+        if (classname not in tokenizer_classes or
+                classname in [UnknownTokenizer.get_classname(), UnionTokenizer.get_classname()]):
+            warning(f"(unitok.meta) Tokenizer class {classname} not found in the class hub.")
+            return UnknownTokenizer(tokenizer_id=tokenizer_id, classname=classname, vocab=vocab, **params)
+        return tokenizer_classes[classname](tokenizer_id=tokenizer_id, vocab=vocab, **params)
+
+    @staticmethod
+    def parse_job(name: str, column: str, tokenizer: str, truncate: int, order: int, key: bool, max_len: int):
+        if not TokenizerHub.has(tokenizer):
+            raise ValueError(f"(unitok.meta) Tokenizer {tokenizer} not found in the tokenizer hub.")
+        tokenizer = TokenizerHub.get(tokenizer)
+
+        if column == str(Symbols.idx):
+            column = Symbols.idx
+
+        return Job(
+            name=name,
+            column=column,
+            tokenizer=tokenizer,
+            truncate=truncate,
+            order=order,
+            key=key,
+            max_len=max_len,
+        )
+
+    @staticmethod
+    def parse_version(version):
+        if version.startswith('unidep-v'):
+            return version[8:]
+
+        if version.startswith('UniDep-'):
+            raise ValueError(f'UniDep version ({version}) is not supported. '
+                             f'Please downgrade the unitok version by `pip install unitok==3.5.3`, '
+                             f'or use `unidep-upgrade-v4` to upgrade the version.')
+
+        raise ValueError(f'UniDep version ({version}) is not supported. '
+                         f'Please downgrade the unitok version by `pip install unitok==3.5.3` for compatible upgrade, '
+                         f'and then install the latest unitok version, '
+                         f'following the use of `unidep-upgrade-v4` to upgrade the version.')
+
+    @classmethod
+    def filename(cls, save_dir):
+        return os.path.join(save_dir, 'meta.json')
+
+    @classmethod
+    def _deprecated_filename(cls, save_dir):
+        return os.path.join(save_dir, 'meta.data.json')
+
+    @classmethod
+    def _compatible_readfile(cls, save_dir):
+        filename = cls.filename(save_dir)
+        if not os.path.exists(filename):
+            filename = cls._deprecated_filename(save_dir)
+            if not os.path.exists(filename):
+                raise FileNotFoundError(f"Meta file not found in {save_dir}")
+
+        meta_data = json.load(open(filename))
+
+        if 'version' not in meta_data:
+            raise ValueError(f"Version not found in the meta file {filename}")
+
+        current_version = cls.parse_version(cls.version)
+        depot_version = cls.parse_version(meta_data.get('version'))
+
+        if current_version != depot_version:
+            warning('Version mismatch, unexpected error may occur.')
+
+        return meta_data
+
+    @classmethod
+    def load(cls, save_dir):
+        kwargs = cls._compatible_readfile(save_dir)
+
+        meta = cls()
+        meta.created_at = kwargs.get('created_at')
+        meta.vocabularies = VocabSet({cls.parse_vocabulary(**v).load(save_dir) for v in kwargs.get('vocabularies')})
+        meta.tokenizers = TokenizerSet({cls.parse_tokenizer(**t) for t in kwargs.get('tokenizers')})
+        meta.jobs = JobSet({cls.parse_job(**j) for j in kwargs.get('jobs')})
+
+        return meta
+
+    def json(self):
+        return {
+            "version": self.version,
+            "note": self.note,
+            "website": self.website,
+            "created_at": self.created_at,
+            "modified_at": self.modified_at,
+            "vocabularies": [v.json() for v in self.vocabularies],
+            "tokenizers": [t.json() for t in self.tokenizers],
+            "jobs": [j.json() for j in self.jobs],
+        }
+
+    def save(self, save_dir):
+        filename = self.filename(save_dir)
+        JsonHandler.save(self.json(), filename)
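
`Meta` describes a whole depot (vocabularies, tokenizers, jobs) and persists it as `meta.json`, falling back to the deprecated `meta.data.json` on read. Below is a hedged sketch of the load/save round trip; the directory path is illustrative and assumes a valid v4 depot already exists there.

```python
# Hedged sketch of the Meta load/save round trip; 'data/news' is an illustrative path.
from unitok.meta import Meta

meta = Meta.load('data/news')                 # reads meta.json (or legacy meta.data.json)
print(meta.version)                           # 'unidep-v4beta'
print([job.name for job in meta.jobs])        # jobs reconstructed via parse_job

meta.save('data/news')                        # rewrites meta.json through JsonHandler.save
```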
@@ -0,0 +1,44 @@
+from unitok.utils import Symbols, Symbol
+
+
+class Status:
+    def __init__(self):
+        self.status = Symbols.initialized
+        # initialized
+        # tokenized
+        # organized
+
+    @staticmethod
+    def require_status(*status: Symbol):
+        status_string = '/'.join([s.name for s in status])
+
+        def decorator(func):
+            def wrapper(self, *args, **kwargs):
+                if self.status in status:
+                    return func(self, *args, **kwargs)
+                raise ValueError(f'UniTok should be in {status_string} status')
+
+            return wrapper
+
+        return decorator
+
+    require_initialized = require_status(Symbols.initialized)
+    require_tokenized = require_status(Symbols.tokenized)
+    require_organized = require_status(Symbols.organized)
+
+    require_not_initialized = require_status(Symbols.tokenized, Symbols.organized)
+    require_not_tokenized = require_status(Symbols.initialized, Symbols.organized)
+    require_not_organized = require_status(Symbols.initialized, Symbols.tokenized)
+
+    @staticmethod
+    def change_status(status: Symbol):
+        def decorator(func):
+            def wrapper(self, *args, **kwargs):
+                result = func(self, *args, **kwargs)
+                self.status = status
+                return result
+            return wrapper
+        return decorator
+
+    to_tokenized = change_status(Symbols.tokenized)
+    to_organized = change_status(Symbols.organized)
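
`Status` is a small state-machine mixin: subclasses gate methods with the `require_*` decorators and advance the state with the `to_*` decorators. Below is a hedged sketch of how they compose; the `Pipeline` class is invented for illustration, and in the package it is `UniTok` itself that derives from `Status`.

```python
# Hedged sketch: Pipeline is an illustrative Status subclass, not part of the package.
from unitok.status import Status


class Pipeline(Status):
    @Status.require_initialized      # only callable while status is 'initialized'
    @Status.to_tokenized             # on success, move status to 'tokenized'
    def tokenize(self):
        return 'tokenized'


p = Pipeline()
p.tokenize()      # ok: initialized -> tokenized
# p.tokenize()    # would now raise ValueError('UniTok should be in initialized status')
```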
@@ -0,0 +1,18 @@
+from unitok.tokenizer.base_tokenizer import BaseTokenizer, TokenizerHub
+from unitok.tokenizer.entity_tokenizer import EntityTokenizer, EntitiesTokenizer
+from unitok.tokenizer.transformers_tokenizer import TransformersTokenizer, BertTokenizer
+from unitok.tokenizer.split_tokenizer import SplitTokenizer
+from unitok.tokenizer.digit_tokenizer import DigitTokenizer, DigitsTokenizer
+
+
+__all__ = [
+    BaseTokenizer,
+    EntityTokenizer,
+    EntitiesTokenizer,
+    TransformersTokenizer,
+    BertTokenizer,
+    SplitTokenizer,
+    DigitTokenizer,
+    DigitsTokenizer,
+    TokenizerHub
+]
@@ -0,0 +1,78 @@
+import abc
+from typing import Union
+
+from unitok.utils import Instance, function
+from unitok.utils.hub import Hub
+from unitok.vocabulary import Vocab, VocabHub
+
+
+class BaseTokenizer(abc.ABC):
+    return_list: bool
+    param_list: list
+
+    prefix = 'auto_'
+
+    def __init__(self, vocab: Union[str, Vocab], tokenizer_id: str = None, **kwargs):
+        if isinstance(vocab, str):
+            if VocabHub.has(vocab):
+                self.vocab = VocabHub.get(vocab)
+            else:
+                self.vocab = Vocab(name=vocab)
+        else:
+            self.vocab = vocab
+
+        self._tokenizer_id = tokenizer_id
+
+        TokenizerHub.add(self.get_tokenizer_id(), self)
+
+    def get_tokenizer_id(self):
+        if self._tokenizer_id is None:
+            self._tokenizer_id = self.prefix + function.get_random_string(length=6)
+        return self._tokenizer_id
+
+    @classmethod
+    def get_classname(cls):
+        # return cls.classname.lower().replace('tokenizer', '')
+        classname = cls.__name__.lower()
+        if not classname.endswith('tokenizer'):
+            raise ValueError(f'({classname}) Unexpected classname, expecting classname to end with "Tokenizer"')
+        return classname.replace('tokenizer', '')
+
+    def _convert_tokens_to_ids(self, tokens):
+        return_list = isinstance(tokens, list)
+        if return_list != self.return_list:
+            raise ValueError(f'(tokenizer.{self.get_classname()}) Unexpected input, requiring return_list={self.return_list}')
+
+        if not return_list:
+            tokens = [tokens]
+
+        ids = [self.vocab.append(token) for token in tokens]
+
+        if not return_list:
+            ids = ids[0]
+        return ids
+
+    def __call__(self, objs):
+        return self._convert_tokens_to_ids(objs)
+
+    def __str__(self):
+        return f'{self._detailed_classname}({self.get_tokenizer_id()}, vocab={self.vocab.name})'
+
+    def __repr__(self):
+        return str(self)
+
+    def json(self):
+        return {
+            'tokenizer_id': self.get_tokenizer_id(),
+            'vocab': self.vocab.name,
+            'classname': self.get_classname(),
+            'params': {param: getattr(self, param) for param in self.param_list},
+        }
+
+    @property
+    def _detailed_classname(self):
+        return self.__class__.__name__
+
+
+class TokenizerHub(Hub[BaseTokenizer]):
+    _instance = Instance()
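
Subclasses only need to declare `return_list` and `param_list` and feed tokens to the base class, which appends them into the shared vocabulary; this is also the shape `ClassPool` expects when loading custom tokenizers from a `--lib` directory. Below is a hedged sketch of such a subclass; `LowercaseTokenizer` is invented for illustration.

```python
# Hedged sketch of a custom tokenizer; LowercaseTokenizer is illustrative, not part of the package.
from unitok.tokenizer import BaseTokenizer


class LowercaseTokenizer(BaseTokenizer):
    return_list = False      # one value in, one id out
    param_list = []          # nothing extra to persist into meta.json

    def __call__(self, obj):
        # normalize before the vocab append done by BaseTokenizer
        return super().__call__(str(obj).lower())


tok = LowercaseTokenizer(vocab='category')
print(tok.get_classname())           # 'lowercase'
print(tok('Sports'), tok('sports'))  # both normalized to 'sports' before hitting the vocab
```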
@@ -0,0 +1,33 @@
+from unitok.tokenizer import BaseTokenizer
+
+
+class DigitTokenizer(BaseTokenizer):
+    return_list = False
+    name = 'digit'
+    param_list = ['vocab_size']
+
+    def __init__(self, vocab_size: int = None, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        if self.vocab_size is not None:
+            self.vocab.extend([str(i) for i in range(vocab_size)])
+            self.vocab.deny_edit()
+
+    def __call__(self, obj):
+        obj = int(obj)
+        if obj >= len(self.vocab):
+            if self.vocab_size is not None:
+                raise ValueError(f'Vocabulary size is limited to {self.vocab_size}, but {obj} is given')
+            self.vocab.extend([str(i) for i in range(len(self.vocab), obj + 1)])
+        return obj
+
+
+class DigitsTokenizer(DigitTokenizer):
+    return_list = True
+    name = 'digits'
+
+    def __call__(self, obj):
+        obj = [int(o) for o in obj]
+        for o in obj:
+            super().__call__(o)
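
`DigitTokenizer` keeps integer values as their own ids: without `vocab_size` the vocabulary grows on demand to cover the largest value seen, while with `vocab_size` it is pre-filled and locked. Below is a hedged sketch of both behaviors; the vocabulary names are illustrative.

```python
# Hedged sketch of DigitTokenizer behavior as defined above; vocab names are illustrative.
from unitok.tokenizer import DigitTokenizer

rating = DigitTokenizer(vocab='rating')                 # unbounded: vocab grows as needed
print(rating('4'))        # 4; the vocab now holds '0' through '4'

label = DigitTokenizer(vocab='label', vocab_size=2)     # pre-filled with '0' and '1', then locked
print(label(1))           # 1
# label(2)                # would raise ValueError: vocabulary size is limited to 2
```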