UniTok 4.3.9.tar.gz → 4.4.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-4.3.9 → UniTok-4.4.0}/PKG-INFO +33 -25
- {UniTok-4.3.9 → UniTok-4.4.0}/README.md +24 -22
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/PKG-INFO +33 -25
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/SOURCES.txt +2 -6
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/entry_points.txt +0 -1
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/__main__.py +4 -4
- {UniTok-4.3.9 → UniTok-4.4.0}/setup.py +1 -1
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/__init__.py +4 -2
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/__main__.py +4 -4
- UniTok-4.3.9/unitok/job.py → UniTok-4.4.0/unitok/feature.py +7 -7
- UniTok-4.4.0/unitok/job.py +11 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/meta.py +14 -8
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/selector.py +4 -4
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/unitok.py +161 -125
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/__init__.py +2 -0
- UniTok-4.4.0/unitok/utils/index_set/feature_set.py +25 -0
- UniTok-4.4.0/unitok/utils/index_set/job_set.py +4 -0
- UniTok-4.3.9/unitok/utils/index_set/job_set.py +0 -25
- {UniTok-4.3.9 → UniTok-4.4.0}/LICENSE +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/analysis/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/analysis/lengths.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/analysis/plot.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/cols.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/column.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/fut.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/global_setting.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/meta.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/bert_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/ent_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/id_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/number_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/seq_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/split_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/unidep.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/unitok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/vocab.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/vocabs.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/setup.cfg +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/status.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/base_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/digit_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/entity_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/glove_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/split_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/transformers_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/union_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/unknown_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/class_pool.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/data.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/function.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/handler/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/handler/json_handler.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/handler/pkl_handler.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/hub/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/hub/hub.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/hub/param_hub.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/index_set.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/tokenizer_set.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/vocabulary_set.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/instance.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/map.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/space.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/symbol.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/verbose.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/vocabulary/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/vocabulary/counter.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.0}/unitok/vocabulary/vocabulary.py +0 -0
--- UniTok-4.3.9/PKG-INFO
+++ UniTok-4.4.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 4.3.9
+Version: 4.4.0
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
@@ -10,6 +10,14 @@ Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
 Platform: any
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: termplot==0.0.2
+Requires-Dist: tqdm
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: transformers
+Requires-Dist: oba
+Requires-Dist: prettytable
+Requires-Dist: rich
 
 # UniTok V4
 
@@ -28,16 +36,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed
 
 ### Changes and Comparisons
 
+> After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
 | Feature | UniTok v3 | UniTok v4 | Comments |
 |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
 | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
 | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
-| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `
-| `
+| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+| `Feature` class | N/A | Defines how a specific column should be tokenized | |
 | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
 | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
 | `analyse` method | Supported | Not supported Currently | |
-| `Meta` class | Only for human-friendly displaying | Manager for `
+| `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
 | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
 | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |
 
@@ -75,13 +85,13 @@ pip install unitok
 **Components**
 
 - UniTok: Manages the dataset preprocessing lifecycle.
-- 
+- Feature: Defines how a specific column should be tokenized.
 - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
 - Vocabulary: Stores and manages unique tokens across datasets.
 
-**Primary Key (
+**Primary Key (key_feature)**
 
-The `
+The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
 
 ## Usage Guide
 
@@ -113,9 +123,9 @@ interaction = pd.read_csv(
 )
 ```
 
-### Defining and Adding 
+### Defining and Adding Features
 
-Define tokenization 
+Define tokenization features for different columns:
 
 ```python
 from unitok import UniTok, Vocab
@@ -128,23 +138,23 @@ with UniTok() as item_ut:
     bert_tokenizer = BertTokenizer(vocab='bert')
     llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
 
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
+    item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+    item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+    item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+    item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+    item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+    item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+    item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
 
 with UniTok() as user_ut:
-    user_ut.
-    user_ut.
+    user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+    user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
 
 with UniTok() as inter_ut:
-    inter_ut.
-    inter_ut.
-    inter_ut.
-    inter_ut.
+    inter_ut.add_index_feature(name='index')
+    inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+    inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+    inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
 ```
 
 ### Tokenizing Data
@@ -190,7 +200,7 @@ UniTok (4beta)
 Sample Size: 10
 ID Column: nid
 
-
+Features
 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
 ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
 ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
@@ -224,5 +234,3 @@ Our TODO list includes:
 ## License
 
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-
--- UniTok-4.3.9/README.md
+++ UniTok-4.4.0/README.md
@@ -15,16 +15,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed
 
 ### Changes and Comparisons
 
+> After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
 | Feature | UniTok v3 | UniTok v4 | Comments |
 |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
 | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
 | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
-| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `
-| `
+| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+| `Feature` class | N/A | Defines how a specific column should be tokenized | |
 | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
 | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
 | `analyse` method | Supported | Not supported Currently | |
-| `Meta` class | Only for human-friendly displaying | Manager for `
+| `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
 | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
 | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |
 
@@ -62,13 +64,13 @@ pip install unitok
 **Components**
 
 - UniTok: Manages the dataset preprocessing lifecycle.
-- 
+- Feature: Defines how a specific column should be tokenized.
 - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
 - Vocabulary: Stores and manages unique tokens across datasets.
 
-**Primary Key (
+**Primary Key (key_feature)**
 
-The `
+The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
 
 ## Usage Guide
 
@@ -100,9 +102,9 @@ interaction = pd.read_csv(
 )
 ```
 
-### Defining and Adding 
+### Defining and Adding Features
 
-Define tokenization 
+Define tokenization features for different columns:
 
 ```python
 from unitok import UniTok, Vocab
@@ -115,23 +117,23 @@ with UniTok() as item_ut:
     bert_tokenizer = BertTokenizer(vocab='bert')
     llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
 
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
-    item_ut.
+    item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+    item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+    item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+    item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+    item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+    item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+    item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
 
 with UniTok() as user_ut:
-    user_ut.
-    user_ut.
+    user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+    user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
 
 with UniTok() as inter_ut:
-    inter_ut.
-    inter_ut.
-    inter_ut.
-    inter_ut.
+    inter_ut.add_index_feature(name='index')
+    inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+    inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+    inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
 ```
 
 ### Tokenizing Data
@@ -177,7 +179,7 @@ UniTok (4beta)
 Sample Size: 10
 ID Column: nid
 
-
+Features
 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
 ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
 ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
--- UniTok-4.3.9/UniTok.egg-info/PKG-INFO
+++ UniTok-4.4.0/UniTok.egg-info/PKG-INFO
(identical to the PKG-INFO diff above)
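The metadata and README diffs above document the central change of this release: `Job` becomes `Feature`, and the job-adding calls in user code become `add_feature`. A minimal before/after sketch of that migration; the 4.3.9 call names are truncated in this rendering, so the "before" lines are assumed rather than copied:

```python
# Hypothetical migration sketch for the 4.3.9 -> 4.4.0 rename.
# Assumption: the removed 4.3.9 lines were `add_job(...)` calls with the same
# arguments as the new `add_feature(...)` calls shown in the README diff.
from unitok import UniTok
from unitok.tokenizer import BertTokenizer, EntityTokenizer  # import path assumed

with UniTok() as item_ut:
    # UniTok 4.3.9 (assumed):
    # item_ut.add_job(tokenizer=EntityTokenizer(vocab='nid'), column='nid', key=True)
    # item_ut.add_job(tokenizer=BertTokenizer(vocab='bert'), column='title', name='title@bert', truncate=20)

    # UniTok 4.4.0 (taken from the README diff above):
    item_ut.add_feature(tokenizer=EntityTokenizer(vocab='nid'), column='nid', key=True)
    item_ut.add_feature(tokenizer=BertTokenizer(vocab='bert'), column='title', name='title@bert', truncate=20)
```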
--- UniTok-4.3.9/UniTok.egg-info/SOURCES.txt
+++ UniTok-4.4.0/UniTok.egg-info/SOURCES.txt
@@ -31,17 +31,12 @@ UniTokv3/tok/split_tok.py
 UniTokv3/tok/tok.py
 unitok/__init__.py
 unitok/__main__.py
+unitok/feature.py
 unitok/job.py
 unitok/meta.py
 unitok/selector.py
 unitok/status.py
 unitok/unitok.py
-unitok.egg-info/PKG-INFO
-unitok.egg-info/SOURCES.txt
-unitok.egg-info/dependency_links.txt
-unitok.egg-info/entry_points.txt
-unitok.egg-info/requires.txt
-unitok.egg-info/top_level.txt
 unitok/tokenizer/__init__.py
 unitok/tokenizer/base_tokenizer.py
 unitok/tokenizer/digit_tokenizer.py
@@ -67,6 +62,7 @@ unitok/utils/hub/__init__.py
 unitok/utils/hub/hub.py
 unitok/utils/hub/param_hub.py
 unitok/utils/index_set/__init__.py
+unitok/utils/index_set/feature_set.py
 unitok/utils/index_set/index_set.py
 unitok/utils/index_set/job_set.py
 unitok/utils/index_set/tokenizer_set.py
--- UniTok-4.3.9/UniTokv3/__main__.py
+++ UniTok-4.4.0/UniTokv3/__main__.py
@@ -9,7 +9,7 @@ from rich.table import Table
 
 from UniTokv3 import UniDep, Meta, Vocab
 from unitok.vocabulary import Vocab as Vocabv4
-from unitok.
+from unitok.feature import Feature as Featurev4
 from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
 from unitok.unitok import UniTok as UniTokv4
 from unitok.meta import Meta as Metav4
@@ -127,7 +127,7 @@ def upgrade():
     ut.meta.vocabularies.add(vocab_beta)
 
     for col in voc.cols:
-        print(f'\tUpgrade 
+        print(f'\tUpgrade feature {col.name}')
         col_data = data[col.name]
         if not len(col_data):
             print(f'\t\tWarning: empty column {col.name}, defaulting to an atom column')
@@ -149,7 +149,7 @@ def upgrade():
             tokenizer_id='upgrade_' + col.name,
             vocab=vocab_beta,
         )
-
+        feature = Featurev4(
             name=col.name,
             column=col.name,
             tokenizer=tokenizer,
@@ -159,7 +159,7 @@ def upgrade():
             max_len=max_len,
         )
         ut.meta.tokenizers.add(tokenizer)
-        ut.meta.
+        ut.meta.features.add(feature)
 
     ut.meta.save(path)
 
--- UniTok-4.3.9/setup.py
+++ UniTok-4.4.0/setup.py
@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')
 
 setup(
     name='UniTok',
-    version='4.3.9',
+    version='4.4.0',
     keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
     description='Unified Tokenizer',
     long_description=long_description,
--- UniTok-4.3.9/unitok/__init__.py
+++ UniTok-4.4.0/unitok/__init__.py
@@ -11,8 +11,9 @@ from unitok.tokenizer import TransformersTokenizer, BertTokenizer
 from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
 from unitok.tokenizer import GloVeTokenizer
 from unitok.job import Job, JobHub
+from unitok.feature import Feature, FeatureHub
 
-from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
+from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet, FeatureSet
 
 from unitok.meta import Meta
 from unitok.status import Status
@@ -32,7 +33,8 @@ __all__ = [
     'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
     'GloVeTokenizer',
     'Job', 'JobHub',
-    '
+    'Feature', 'FeatureHub',
+    'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet', 'FeatureSet',
     'Meta',
     'Status',
     'UniTok',
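With this change the package root exports the legacy and the new names side by side. A quick sketch of what downstream code can import in 4.4.0, using only names listed in the `__all__` entries above:

```python
from unitok import Job, JobHub          # legacy names, still exported
from unitok import Feature, FeatureHub  # new names introduced in 4.4.0
from unitok import JobSet, FeatureSet   # index-set variants of both
```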
--- UniTok-4.3.9/unitok/__main__.py
+++ UniTok-4.4.0/unitok/__main__.py
@@ -15,7 +15,7 @@ def integrate():
     parser.add_argument('--file', '-f', type=str, help='csv, tsv, parquet format data')
     parser.add_argument('--lib', type=str, default=None, help='custom tokenizer library')
     parser.add_argument('--column', '-c', type=str, help='column name to tokenize')
-    parser.add_argument('--name', '-n', type=str, help='
+    parser.add_argument('--name', '-n', type=str, help='export feature name name')
     parser.add_argument('--vocab', '-v', type=str, default=None, help='vocabulary name')
     parser.add_argument('--tokenizer', '-t', type=str, default=None, help='tokenizer classname')
     parser.add_argument('--tokenizer_id', type=str, default=None, help='tokenizer id')
@@ -69,7 +69,7 @@ def integrate():
         raise ValueError(f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}')
     tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)
 
-    ut.
+    ut.add_feature(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
     ut.tokenize(df).save(args.path)
 
 
@@ -85,11 +85,11 @@ def summarize():
 def remove():
     parser = argparse.ArgumentParser()
     parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
-    parser.add_argument('--name', type=str, help='
+    parser.add_argument('--name', type=str, help='feature name to remove')
     args, _ = parser.parse_known_args()
 
     with UniTok.load(args.path) as ut:
-        ut.
+        ut.remove_feature(args.name)
         ut.save(args.path)
 
 
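For reference, the updated `remove` entry point reduces to the following programmatic calls. A sketch assuming a data directory previously produced by `UniTok.save`; the path and feature name are placeholders:

```python
from unitok import UniTok

# Mirrors remove() above: load a tokenized dataset directory, drop one feature
# by name, and write the directory back. Path and feature name are placeholders.
with UniTok.load('./mind-items') as ut:
    ut.remove_feature('title@bert')
    ut.save('./mind-items')
```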
--- UniTok-4.3.9/unitok/job.py
+++ UniTok-4.4.0/unitok/feature.py
@@ -8,7 +8,7 @@ from unitok.utils import Symbols, Instance
 from unitok.utils.hub import Hub
 
 
-class 
+class Feature:
     def __init__(
         self,
         tokenizer: Union[BaseTokenizer, str],
@@ -35,7 +35,7 @@ class Job:
         self.max_len = max_len
         self.from_union = isinstance(self.tokenizer, UnionTokenizer)
 
-
+        FeatureHub.add(self)
         VocabHub.add(self.tokenizer.vocab)
 
     @property
@@ -48,12 +48,12 @@ class Job:
         for attr in attributes:
             params[attr] = kwargs[attr] if attr in kwargs else getattr(self, attr)
 
-        return 
+        return Feature(**params)
 
     def __str__(self):
         if self.key:
-            return f'
-        return f'
+            return f'Feature({self.column} => {self.name}) [PK]'
+        return f'Feature({self.column} => {self.name})'
 
     def __repr__(self):
         return str(self)
@@ -85,10 +85,10 @@ class Job:
         return slice(None)
 
 
-class 
+class FeatureHub(Hub[Feature]):
     _instance = Instance(compulsory_space=True)
 
     @classmethod
-    def add(cls, key, obj: 
+    def add(cls, key, obj: Feature = None):
         key, obj = key.name, key
         return super().add(key, obj)
--- UniTok-4.3.9/unitok/meta.py
+++ UniTok-4.4.0/unitok/meta.py
@@ -1,21 +1,22 @@
 import json
 import os
+import warnings
 from datetime import datetime
 
+from unitok.feature import Feature
 from unitok.utils.verbose import warning
-from unitok.job import Job
 from unitok.tokenizer import TokenizerHub
 from unitok.tokenizer.union_tokenizer import UnionTokenizer
 from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
 from unitok.utils import Symbols
 from unitok.utils.handler import JsonHandler
 from unitok.utils.class_pool import ClassPool
-from unitok.utils.index_set import VocabSet, TokenizerSet, 
+from unitok.utils.index_set import VocabSet, TokenizerSet, FeatureSet
 from unitok.vocabulary import Vocab, VocabHub
 
 
 class Meta:
-    version = 'unidep-v4'
+    version = 'unidep-v4.1'
 
     def __init__(self):
         self.note = ('Not compatible with unitok-v3 or lower version, '
@@ -24,7 +25,7 @@ class Meta:
         self.modified_at = self.created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         self.vocabularies = VocabSet()
         self.tokenizers = TokenizerSet()
-        self.
+        self.features = FeatureSet()
 
     @staticmethod
     def parse_vocabulary(name: str, **kwargs):
@@ -45,7 +46,7 @@ class Meta:
         return tokenizer_classes[classname](tokenizer_id=tokenizer_id, vocab=vocab, **params)
 
     @staticmethod
-    def 
+    def parse_feature(name: str, column: str, tokenizer: str, truncate: int, order: int, key: bool, max_len: int):
         if not TokenizerHub.has(tokenizer):
             raise ValueError(f"(unitok.meta) Tokenizer {tokenizer} not found in the tokenizer hub.")
         tokenizer = TokenizerHub.get(tokenizer)
@@ -53,7 +54,7 @@ class Meta:
         if column == str(Symbols.idx):
             column = Symbols.idx
 
-        return 
+        return Feature(
             name=name,
             column=column,
             tokenizer=tokenizer,
@@ -63,6 +64,11 @@ class Meta:
             max_len=max_len,
         )
 
+    @staticmethod
+    def parse_job(name: str, column: str, tokenizer: str, truncate: int, order: int, key: bool, max_len: int):
+        warnings.deprecated('`parse_job` is deprecated, use `parse_feature` instead.', stacklevel=2)
+        return Meta.parse_feature(name, column, tokenizer, truncate, order, key, max_len)
+
     @staticmethod
     def parse_version(version):
         if version.startswith('unidep-v'):
@@ -115,7 +121,7 @@ class Meta:
         meta.created_at = kwargs.get('created_at')
         meta.vocabularies = VocabSet({cls.parse_vocabulary(**v).load(save_dir) for v in kwargs.get('vocabularies')})
         meta.tokenizers = TokenizerSet({cls.parse_tokenizer(**t) for t in kwargs.get('tokenizers')})
-        meta.
+        meta.features = FeatureSet({cls.parse_feature(**f) for f in kwargs.get('features') or kwargs.get('jobs')})
         meta.version = kwargs.get('version')
 
         return meta
@@ -129,7 +135,7 @@ class Meta:
             "modified_at": self.modified_at,
             "vocabularies": [v.json() for v in self.vocabularies],
             "tokenizers": [t.json() for t in self.tokenizers],
-            "
+            "features": [f.json() for f in self.features],
         }
 
     def save(self, save_dir):