UniTok 4.3.9__tar.gz → 4.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-4.3.9 → UniTok-4.4.1}/PKG-INFO +34 -25
- {UniTok-4.3.9 → UniTok-4.4.1}/README.md +24 -22
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/PKG-INFO +34 -25
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/SOURCES.txt +2 -6
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/entry_points.txt +0 -1
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/requires.txt +1 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/__main__.py +4 -4
- {UniTok-4.3.9 → UniTok-4.4.1}/setup.py +3 -2
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/__init__.py +4 -2
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/__main__.py +4 -4
- UniTok-4.3.9/unitok/job.py → UniTok-4.4.1/unitok/feature.py +7 -7
- UniTok-4.4.1/unitok/job.py +11 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/meta.py +14 -8
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/selector.py +4 -4
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/unitok.py +161 -125
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/__init__.py +2 -0
- UniTok-4.4.1/unitok/utils/index_set/feature_set.py +25 -0
- UniTok-4.4.1/unitok/utils/index_set/job_set.py +4 -0
- UniTok-4.3.9/unitok/utils/index_set/job_set.py +0 -25
- {UniTok-4.3.9 → UniTok-4.4.1}/LICENSE +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/analysis/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/analysis/lengths.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/analysis/plot.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/cols.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/column.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/fut.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/global_setting.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/meta.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/bert_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/ent_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/id_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/number_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/seq_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/split_tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/tok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/unidep.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/unitok.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/vocab.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/vocabs.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/setup.cfg +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/status.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/base_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/digit_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/entity_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/glove_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/split_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/transformers_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/union_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/unknown_tokenizer.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/class_pool.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/data.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/function.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/handler/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/handler/json_handler.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/handler/pkl_handler.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/hub/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/hub/hub.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/hub/param_hub.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/index_set.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/tokenizer_set.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/vocabulary_set.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/instance.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/map.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/space.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/symbol.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/verbose.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/vocabulary/__init__.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/vocabulary/counter.py +0 -0
- {UniTok-4.3.9 → UniTok-4.4.1}/unitok/vocabulary/vocabulary.py +0 -0
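The headline change in this release is the rename of `Job` to `Feature`: `unitok/job.py` becomes `unitok/feature.py`, a small `unitok/job.py` compatibility shim is added, and the README, CLI, and metadata are updated to match (see the per-file diffs below). For orientation, here is a minimal sketch of the 4.4.x-style API pieced together from the README snippets in this diff; the toy DataFrame and output path are invented for illustration, and `BertTokenizer` assumes a working `transformers` setup:

```python
import pandas as pd
from unitok import UniTok
from unitok.tokenizer import BertTokenizer, EntityTokenizer

# Illustrative data; the README example in the diff uses news items with nid/title/abstract columns.
items = pd.DataFrame({'nid': ['N1', 'N2'], 'title': ['hello world', 'unified tokenizer']})

with UniTok() as item_ut:
    # 4.4.x naming: add_feature(...) declares how a single column is tokenized.
    item_ut.add_feature(tokenizer=EntityTokenizer(vocab='nid'), column='nid', key=True)
    item_ut.add_feature(tokenizer=BertTokenizer(vocab='bert'), column='title', name='title@bert', truncate=20)
    item_ut.tokenize(items).save('items-tokenized')  # output directory name is illustrative
```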
{UniTok-4.3.9 → UniTok-4.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 4.3.9
+Version: 4.4.1
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
@@ -10,6 +10,15 @@ Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
 Platform: any
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: termplot==0.0.2
+Requires-Dist: tqdm
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: transformers
+Requires-Dist: oba
+Requires-Dist: prettytable
+Requires-Dist: rich
+Requires-Dist: fastparquet
 
 # UniTok V4
 
@@ -28,16 +37,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed
 
 ### Changes and Comparisons
 
+> After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
 | Feature | UniTok v3 | UniTok v4 | Comments |
 |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
 | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
 | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
-| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `
-| `
+| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+| `Feature` class | N/A | Defines how a specific column should be tokenized | |
 | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
 | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
 | `analyse` method | Supported | Not supported Currently | |
-| `Meta` class | Only for human-friendly displaying | Manager for `
+| `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
 | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
 | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |
 
@@ -75,13 +86,13 @@ pip install unitok
 **Components**
 
 - UniTok: Manages the dataset preprocessing lifecycle.
--
+- Feature: Defines how a specific column should be tokenized.
 - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
 - Vocabulary: Stores and manages unique tokens across datasets.
 
-**Primary Key (
+**Primary Key (key_feature)**
 
-The `
+The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
 
 ## Usage Guide
 
@@ -113,9 +124,9 @@ interaction = pd.read_csv(
 )
 ```
 
-### Defining and Adding
+### Defining and Adding Features
 
-Define tokenization
+Define tokenization features for different columns:
 
 ```python
 from unitok import UniTok, Vocab
@@ -128,23 +139,23 @@ with UniTok() as item_ut:
 bert_tokenizer = BertTokenizer(vocab='bert')
 llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
 
-item_ut.
-item_ut.
-item_ut.
-item_ut.
-item_ut.
-item_ut.
-item_ut.
+item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
 
 with UniTok() as user_ut:
-user_ut.
-user_ut.
+user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
 
 with UniTok() as inter_ut:
-inter_ut.
-inter_ut.
-inter_ut.
-inter_ut.
+inter_ut.add_index_feature(name='index')
+inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
 ```
 
 ### Tokenizing Data
@@ -190,7 +201,7 @@ UniTok (4beta)
 Sample Size: 10
 ID Column: nid
 
-
+Features
 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
 ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
 ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
@@ -224,5 +235,3 @@ Our TODO list includes:
 ## License
 
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-
{UniTok-4.3.9 → UniTok-4.4.1}/README.md

@@ -15,16 +15,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed
 
 ### Changes and Comparisons
 
+> After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
 | Feature | UniTok v3 | UniTok v4 | Comments |
 |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
 | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
 | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
-| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `
-| `
+| `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+| `Feature` class | N/A | Defines how a specific column should be tokenized | |
 | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
 | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
 | `analyse` method | Supported | Not supported Currently | |
-| `Meta` class | Only for human-friendly displaying | Manager for `
+| `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
 | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
 | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |
 
@@ -62,13 +64,13 @@ pip install unitok
 **Components**
 
 - UniTok: Manages the dataset preprocessing lifecycle.
--
+- Feature: Defines how a specific column should be tokenized.
 - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
 - Vocabulary: Stores and manages unique tokens across datasets.
 
-**Primary Key (
+**Primary Key (key_feature)**
 
-The `
+The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
 
 ## Usage Guide
 
@@ -100,9 +102,9 @@ interaction = pd.read_csv(
 )
 ```
 
-### Defining and Adding
+### Defining and Adding Features
 
-Define tokenization
+Define tokenization features for different columns:
 
 ```python
 from unitok import UniTok, Vocab
@@ -115,23 +117,23 @@ with UniTok() as item_ut:
 bert_tokenizer = BertTokenizer(vocab='bert')
 llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
 
-item_ut.
-item_ut.
-item_ut.
-item_ut.
-item_ut.
-item_ut.
-item_ut.
+item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
 
 with UniTok() as user_ut:
-user_ut.
-user_ut.
+user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
 
 with UniTok() as inter_ut:
-inter_ut.
-inter_ut.
-inter_ut.
-inter_ut.
+inter_ut.add_index_feature(name='index')
+inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
 ```
 
 ### Tokenizing Data
@@ -177,7 +179,7 @@ UniTok (4beta)
 Sample Size: 10
 ID Column: nid
 
-
+Features
 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
 ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
 ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
{UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/PKG-INFO

Identical to the {UniTok-4.3.9 → UniTok-4.4.1}/PKG-INFO diff above; the egg-info metadata is a generated copy of PKG-INFO.
{UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/SOURCES.txt

@@ -31,17 +31,12 @@ UniTokv3/tok/split_tok.py
 UniTokv3/tok/tok.py
 unitok/__init__.py
 unitok/__main__.py
+unitok/feature.py
 unitok/job.py
 unitok/meta.py
 unitok/selector.py
 unitok/status.py
 unitok/unitok.py
-unitok.egg-info/PKG-INFO
-unitok.egg-info/SOURCES.txt
-unitok.egg-info/dependency_links.txt
-unitok.egg-info/entry_points.txt
-unitok.egg-info/requires.txt
-unitok.egg-info/top_level.txt
 unitok/tokenizer/__init__.py
 unitok/tokenizer/base_tokenizer.py
 unitok/tokenizer/digit_tokenizer.py
@@ -67,6 +62,7 @@ unitok/utils/hub/__init__.py
 unitok/utils/hub/hub.py
 unitok/utils/hub/param_hub.py
 unitok/utils/index_set/__init__.py
+unitok/utils/index_set/feature_set.py
 unitok/utils/index_set/index_set.py
 unitok/utils/index_set/job_set.py
 unitok/utils/index_set/tokenizer_set.py
{UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/__main__.py

@@ -9,7 +9,7 @@ from rich.table import Table
 
 from UniTokv3 import UniDep, Meta, Vocab
 from unitok.vocabulary import Vocab as Vocabv4
-from unitok.
+from unitok.feature import Feature as Featurev4
 from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
 from unitok.unitok import UniTok as UniTokv4
 from unitok.meta import Meta as Metav4
@@ -127,7 +127,7 @@ def upgrade():
 ut.meta.vocabularies.add(vocab_beta)
 
 for col in voc.cols:
-print(f'\tUpgrade
+print(f'\tUpgrade feature {col.name}')
 col_data = data[col.name]
 if not len(col_data):
 print(f'\t\tWarning: empty column {col.name}, defaulting to an atom column')
@@ -149,7 +149,7 @@ def upgrade():
 tokenizer_id='upgrade_' + col.name,
 vocab=vocab_beta,
 )
-
+feature = Featurev4(
 name=col.name,
 column=col.name,
 tokenizer=tokenizer,
@@ -159,7 +159,7 @@ def upgrade():
 max_len=max_len,
 )
 ut.meta.tokenizers.add(tokenizer)
-ut.meta.
+ut.meta.features.add(feature)
 
 ut.meta.save(path)
 
{UniTok-4.3.9 → UniTok-4.4.1}/setup.py

@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')
 
 setup(
 name='UniTok',
-version='4.3.9',
+version='4.4.1',
 keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
 description='Unified Tokenizer',
 long_description=long_description,
@@ -25,7 +25,8 @@ setup(
 'transformers',
 'oba',
 'prettytable',
-'rich'
+'rich',
+'fastparquet'
 ],
 entry_points={
 'console_scripts': [
{UniTok-4.3.9 → UniTok-4.4.1}/unitok/__init__.py

@@ -11,8 +11,9 @@ from unitok.tokenizer import TransformersTokenizer, BertTokenizer
 from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
 from unitok.tokenizer import GloVeTokenizer
 from unitok.job import Job, JobHub
+from unitok.feature import Feature, FeatureHub
 
-from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
+from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet, FeatureSet
 
 from unitok.meta import Meta
 from unitok.status import Status
@@ -32,7 +33,8 @@ __all__ = [
 'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
 'GloVeTokenizer',
 'Job', 'JobHub',
-'
+'Feature', 'FeatureHub',
+'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet', 'FeatureSet',
 'Meta',
 'Status',
 'UniTok',
{UniTok-4.3.9 → UniTok-4.4.1}/unitok/__main__.py

@@ -15,7 +15,7 @@ def integrate():
 parser.add_argument('--file', '-f', type=str, help='csv, tsv, parquet format data')
 parser.add_argument('--lib', type=str, default=None, help='custom tokenizer library')
 parser.add_argument('--column', '-c', type=str, help='column name to tokenize')
-parser.add_argument('--name', '-n', type=str, help='
+parser.add_argument('--name', '-n', type=str, help='export feature name name')
 parser.add_argument('--vocab', '-v', type=str, default=None, help='vocabulary name')
 parser.add_argument('--tokenizer', '-t', type=str, default=None, help='tokenizer classname')
 parser.add_argument('--tokenizer_id', type=str, default=None, help='tokenizer id')
@@ -69,7 +69,7 @@ def integrate():
 raise ValueError(f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}')
 tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)
 
-ut.
+ut.add_feature(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
 ut.tokenize(df).save(args.path)
 
 
@@ -85,11 +85,11 @@ def summarize():
 def remove():
 parser = argparse.ArgumentParser()
 parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
-parser.add_argument('--name', type=str, help='
+parser.add_argument('--name', type=str, help='feature name to remove')
 args, _ = parser.parse_known_args()
 
 with UniTok.load(args.path) as ut:
-ut.
+ut.remove_feature(args.name)
 ut.save(args.path)
 
 
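The CLI handlers now call the renamed methods (`add_feature` when integrating a new column, `remove_feature` when dropping one). The same operations are available programmatically; a short sketch mirroring the `remove()` handler above, with a hypothetical data directory and feature name:

```python
from unitok import UniTok

# Load a tokenized dataset directory, drop one feature, and save it back in place,
# just as the remove() handler above does via argparse.
with UniTok.load('items-tokenized') as ut:   # directory name is hypothetical
    ut.remove_feature('title@bert')          # feature name is hypothetical
    ut.save('items-tokenized')
```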
UniTok-4.3.9/unitok/job.py → UniTok-4.4.1/unitok/feature.py

@@ -8,7 +8,7 @@ from unitok.utils import Symbols, Instance
 from unitok.utils.hub import Hub
 
 
-class
+class Feature:
 def __init__(
 self,
 tokenizer: Union[BaseTokenizer, str],
@@ -35,7 +35,7 @@ class Job:
 self.max_len = max_len
 self.from_union = isinstance(self.tokenizer, UnionTokenizer)
 
-
+FeatureHub.add(self)
 VocabHub.add(self.tokenizer.vocab)
 
 @property
@@ -48,12 +48,12 @@ class Job:
 for attr in attributes:
 params[attr] = kwargs[attr] if attr in kwargs else getattr(self, attr)
 
-return
+return Feature(**params)
 
 def __str__(self):
 if self.key:
-return f'
-return f'
+return f'Feature({self.column} => {self.name}) [PK]'
+return f'Feature({self.column} => {self.name})'
 
 def __repr__(self):
 return str(self)
@@ -85,10 +85,10 @@ class Job:
 return slice(None)
 
 
-class
+class FeatureHub(Hub[Feature]):
 _instance = Instance(compulsory_space=True)
 
 @classmethod
-def add(cls, key, obj:
+def add(cls, key, obj: Feature = None):
 key, obj = key.name, key
 return super().add(key, obj)
UniTok-4.4.1/unitok/job.py (new file)

@@ -0,0 +1,11 @@
+import warnings
+from unitok.feature import Feature, FeatureHub
+
+
+class Job(Feature):
+    def __init__(self, **kwargs):
+        warnings.warn(f'`Job` class is deprecated, use `Feature`.', DeprecationWarning, stacklevel=2)
+        super().__init__(**kwargs)
+
+
+JobHub = FeatureHub