UniTok 4.0.0__tar.gz → 4.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. UniTok-4.2.5/LICENSE +21 -0
  2. UniTok-4.2.5/PKG-INFO +229 -0
  3. UniTok-4.2.5/README.md +214 -0
  4. UniTok-4.2.5/UniTok.egg-info/PKG-INFO +229 -0
  5. {UniTok-4.0.0 → UniTok-4.2.5}/UniTok.egg-info/SOURCES.txt +3 -4
  6. {UniTok-4.0.0 → UniTok-4.2.5}/UniTok.egg-info/requires.txt +1 -0
  7. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/__main__.py +8 -8
  8. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/unidep.py +3 -1
  9. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/unitok.py +3 -1
  10. {UniTok-4.0.0 → UniTok-4.2.5}/setup.py +3 -2
  11. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/__main__.py +36 -6
  12. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/job.py +3 -0
  13. UniTok-4.2.5/unitok/selector.py +29 -0
  14. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/base_tokenizer.py +6 -1
  15. UniTok-4.2.5/unitok/tokenizer/glove_tokenizer.py +21 -0
  16. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/transformers_tokenizer.py +19 -4
  17. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/unitok.py +85 -7
  18. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/handler/pkl_handler.py +1 -1
  19. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/vocabulary/vocabulary.py +3 -4
  20. UniTok-4.0.0/PKG-INFO +0 -199
  21. UniTok-4.0.0/README.md +0 -185
  22. UniTok-4.0.0/UniTok.egg-info/PKG-INFO +0 -199
  23. {UniTok-4.0.0 → UniTok-4.2.5}/UniTok.egg-info/dependency_links.txt +0 -0
  24. {UniTok-4.0.0 → UniTok-4.2.5}/UniTok.egg-info/entry_points.txt +0 -0
  25. {UniTok-4.0.0 → UniTok-4.2.5}/UniTok.egg-info/top_level.txt +0 -0
  26. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/__init__.py +0 -0
  27. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/analysis/__init__.py +0 -0
  28. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/analysis/lengths.py +0 -0
  29. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/analysis/plot.py +0 -0
  30. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/cols.py +0 -0
  31. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/column.py +0 -0
  32. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/fut.py +0 -0
  33. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/global_setting.py +0 -0
  34. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/meta.py +0 -0
  35. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/__init__.py +0 -0
  36. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/bert_tok.py +0 -0
  37. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/ent_tok.py +0 -0
  38. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/id_tok.py +0 -0
  39. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/number_tok.py +0 -0
  40. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/seq_tok.py +0 -0
  41. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/split_tok.py +0 -0
  42. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/tok/tok.py +0 -0
  43. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/vocab.py +0 -0
  44. {UniTok-4.0.0 → UniTok-4.2.5}/UniTokv3/vocabs.py +0 -0
  45. {UniTok-4.0.0 → UniTok-4.2.5}/setup.cfg +0 -0
  46. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/__init__.py +0 -0
  47. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/meta.py +0 -0
  48. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/status.py +0 -0
  49. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/__init__.py +0 -0
  50. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/digit_tokenizer.py +0 -0
  51. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/entity_tokenizer.py +0 -0
  52. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/split_tokenizer.py +0 -0
  53. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/union_tokenizer.py +0 -0
  54. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/tokenizer/unknown_tokenizer.py +0 -0
  55. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/__init__.py +0 -0
  56. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/class_pool.py +0 -0
  57. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/data.py +0 -0
  58. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/function.py +0 -0
  59. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/handler/__init__.py +0 -0
  60. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/handler/json_handler.py +0 -0
  61. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/hub/__init__.py +0 -0
  62. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/hub/hub.py +0 -0
  63. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/hub/param_hub.py +0 -0
  64. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/index_set/__init__.py +0 -0
  65. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/index_set/index_set.py +0 -0
  66. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/index_set/job_set.py +0 -0
  67. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/index_set/tokenizer_set.py +0 -0
  68. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/index_set/vocabulary_set.py +0 -0
  69. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/instance.py +0 -0
  70. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/map.py +0 -0
  71. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/space.py +0 -0
  72. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/symbol.py +0 -0
  73. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/utils/verbose.py +0 -0
  74. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/vocabulary/__init__.py +0 -0
  75. {UniTok-4.0.0/UniTok → UniTok-4.2.5/unitok}/vocabulary/counter.py +0 -0
UniTok-4.2.5/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Jyonn
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
UniTok-4.2.5/PKG-INFO ADDED
@@ -0,0 +1,229 @@
+ Metadata-Version: 2.1
+ Name: UniTok
+ Version: 4.2.5
+ Summary: Unified Tokenizer
+ Home-page: https://github.com/Jyonn/UnifiedTokenizer
+ Author: Jyonn Liu
+ Author-email: liu@qijiong.work
+ License: MIT Licence
+ Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
+ Platform: any
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+
+ # UniTok V4
+
+ If you want to use the old version, please refer to the [v3 README](README_v3.md) (in Chinese).
+
+ ## Overview
+
+ [![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
+
+ Welcome to the UniTok documentation!
+ This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
+ It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
+
+ UniTok is designed to simplify preprocessing by offering reusable components such as tokenizers and vocabularies, making it flexible for various datasets and scenarios.
+
+ ## Road from V3 to V4
+
+ ### Changes and Comparisons
+
+ | Feature | UniTok v3 | UniTok v4 | Comments |
+ |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
+ | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
+ | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
+ | `Column` class | The same column name is used for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
+ | `Job` class | N/A | Defines how a specific column should be tokenized | |
+ | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
+ | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
+ | `analyse` method | Supported | Not currently supported | |
+ | `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+ | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
+ | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |
+
+ ### How to Migrate the Processed Data
+
+ ```bash
+ unidep-upgrade-v4 <path>
+ ```
+
+ ## Installation
+
+ **Requirements**
+
+ - Python 3.7 or later
+ - Dependencies:
+   - pandas
+   - transformers
+   - tqdm
+   - rich
+
+ **Install UniTok via pip**
+
+ ```bash
+ pip install unitok
+ ```
+
+ ## Core Concepts
+
+ **States**
+
+ - `initialized`: The initial state after creating a UniTok instance.
+ - `tokenized`: Achieved after applying tokenization to the dataset.
+ - `organized`: Reached after combining multiple datasets via operations like union.
+
+ **Components**
+
+ - UniTok: Manages the dataset preprocessing lifecycle.
+ - Job: Defines how a specific column should be tokenized.
+ - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
+ - Vocabulary: Stores and manages unique tokens across datasets.
+
+ **Primary Key (key_job)**
+
+ The `key_job` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
+
+ ## Usage Guide
+
+ ### Loading Data
+
+ Load datasets using pandas:
+
+ ```python
+ import pandas as pd
+
+ item = pd.read_csv(
+     filepath_or_buffer='news-sample.tsv',
+     sep='\t',
+     names=['nid', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'],
+     usecols=['nid', 'category', 'subcategory', 'title', 'abstract'],
+ )
+ item['abstract'] = item['abstract'].fillna('')  # Handle missing values
+
+ user = pd.read_csv(
+     filepath_or_buffer='user-sample.tsv',
+     sep='\t',
+     names=['uid', 'history'],
+ )
+
+ interaction = pd.read_csv(
+     filepath_or_buffer='interaction-sample.tsv',
+     sep='\t',
+     names=['uid', 'nid', 'click'],
+ )
+ ```
+
+ ### Defining and Adding Jobs
+
+ Define tokenization jobs for different columns:
+
+ ```python
+ from unitok import UniTok, Vocab
+ from unitok.tokenizer import BertTokenizer, TransformersTokenizer, EntityTokenizer, SplitTokenizer, DigitTokenizer
+
+ item_vocab = Vocab(name='nid')  # will be used across datasets
+ user_vocab = Vocab(name='uid')  # will be used across datasets
+
+ with UniTok() as item_ut:
+     bert_tokenizer = BertTokenizer(vocab='bert')
+     llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
+
+     item_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+     item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+     item_ut.add_job(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+     item_ut.add_job(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+     item_ut.add_job(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+     item_ut.add_job(tokenizer=EntityTokenizer(vocab='category'), column='category')
+     item_ut.add_job(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+
+ with UniTok() as user_ut:
+     user_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+     user_ut.add_job(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+
+ with UniTok() as inter_ut:
+     inter_ut.add_index_job(name='index')
+     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+     inter_ut.add_job(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+ ```
+
+ ### Tokenizing Data
+
+ Tokenize and save the processed data:
+
+ ```python
+ item_ut.tokenize(item).save('sample-ut/item')
+ item_vocab.deny_edit()  # will raise an error if new items are detected in the user or interaction datasets
+ user_ut.tokenize(user).save('sample-ut/user')
+ inter_ut.tokenize(interaction).save('sample-ut/interaction')
+ ```
+
+ ### Combining Datasets
+
+ Combine datasets using union:
+
+ ```python
+ # => {'category': 0, 'nid': 0, 'title@bert': [1996, 9639, 3035, 3870, ...], 'title@llama': [450, 1771, 4167, 10470, ...], 'abstract@bert': [4497, 1996, 14960, 2015, ...], 'abstract@llama': [1383, 459, 278, 451, ...], 'subcategory': 0}
+ print(item_ut[0])
+
+ # => {'uid': 0, 'history': [0, 1, 2]}
+ print(user_ut[0])
+
+ # => {'uid': 0, 'nid': 7, 'index': 0, 'click': 1}
+ print(inter_ut[0])
+
+ with inter_ut:
+     inter_ut.union(user_ut)
+
+ # => {'index': 0, 'click': 1, 'uid': 0, 'nid': 7, 'history': [0, 1, 2]}
+ print(inter_ut[0])
+ ```
+
+ ### Glance at the Terminal
+
+ ```bash
+ unitok sample-ut/item
+ ```
+
+ ```text
+ UniTok (4beta)
+ Sample Size: 10
+ ID Column: nid
+
+ Jobs
+ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
+ ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
+ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
+ │ TransformersTokenizer │ auto_2VN5Ko │ abstract -> abstract@llama │ llama (size=32024) │ 50 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ EntityTokenizer │ auto_C0b9Du │ subcategory -> subcategory │ subcategory (size=8) │ N/A │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ TransformersTokenizer │ auto_2VN5Ko │ title -> title@llama │ llama (size=32024) │ 20 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ EntityTokenizer │ auto_4WQYxo │ category -> category │ category (size=4) │ N/A │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ BertTokenizer │ auto_Y9tADT │ abstract -> abstract@bert │ bert (size=30522) │ 46 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ BertTokenizer │ auto_Y9tADT │ title -> title@bert │ bert (size=30522) │ 16 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ EntityTokenizer │ auto_qwQALc │ nid -> nid │ nid (size=10) │ N/A │
+ └──────────────────────────────────────┴───────────────────────┴──────────────────────────────────────────────┴───────────────────────────────────┴───────────────────┘
+ ```
+
+ ## Contributing
+
+ We welcome contributions to UniTok! We appreciate your feedback, bug reports, and pull requests.
+
+ Our TODO list includes:
+
+ - [ ] More detailed documentation
+ - [ ] More examples and tutorials
+ - [ ] More SQL-like operations
+ - [ ] Analysis and visualization tools
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+
UniTok-4.2.5/README.md ADDED
@@ -0,0 +1,214 @@
+ # UniTok V4
+
+ If you want to use the old version, please refer to the [v3 README](README_v3.md) (in Chinese).
+
+ ## Overview
+
+ [![PyPI version](https://badge.fury.io/py/unitok.svg)](https://badge.fury.io/py/unitok)
+
+ Welcome to the UniTok documentation!
+ This library provides a unified preprocessing solution for machine learning datasets, handling diverse data types like text, categorical features, and numerical values.
+ It introduces **SQL-like** data table combinations and a modular workflow that transitions datasets through three states: `initialized`, `tokenized`, and `organized`.
+
+ UniTok is designed to simplify preprocessing by offering reusable components such as tokenizers and vocabularies, making it flexible for various datasets and scenarios.
+
+ ## Road from V3 to V4
+
+ ### Changes and Comparisons
+
+ | Feature | UniTok v3 | UniTok v4 | Comments |
+ |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
+ | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
+ | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
+ | `Column` class | The same column name is used for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
+ | `Job` class | N/A | Defines how a specific column should be tokenized | |
+ | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
+ | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
+ | `analyse` method | Supported | Not currently supported | |
+ | `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+ | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
+ | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |
+
+ ### How to Migrate the Processed Data
+
+ ```bash
+ unidep-upgrade-v4 <path>
+ ```
+
+ ## Installation
+
+ **Requirements**
+
+ - Python 3.7 or later
+ - Dependencies:
+   - pandas
+   - transformers
+   - tqdm
+   - rich
+
+ **Install UniTok via pip**
+
+ ```bash
+ pip install unitok
+ ```
+
+ ## Core Concepts
+
+ **States**
+
+ - `initialized`: The initial state after creating a UniTok instance.
+ - `tokenized`: Achieved after applying tokenization to the dataset.
+ - `organized`: Reached after combining multiple datasets via operations like union.
+
+ **Components**
+
+ - UniTok: Manages the dataset preprocessing lifecycle.
+ - Job: Defines how a specific column should be tokenized.
+ - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
+ - Vocabulary: Stores and manages unique tokens across datasets.
+
+ **Primary Key (key_job)**
+
+ The `key_job` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
+
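+ For orientation, here is a minimal, illustrative sketch of how the three states line up with the calls used in the usage guide below (the data frame and save path are toy placeholders, not part of the sample dataset):
+
+ ```python
+ import pandas as pd
+ from unitok import UniTok, Vocab
+ from unitok.tokenizer import EntityTokenizer
+
+ users = pd.DataFrame({'uid': ['u1', 'u2']})  # toy data for illustration
+ uid_vocab = Vocab(name='uid')
+
+ with UniTok() as ut:  # state: initialized
+     ut.add_job(tokenizer=EntityTokenizer(vocab=uid_vocab), column='uid', key=True)
+
+ ut.tokenize(users).save('sample-ut/toy-user')  # state: tokenized
+ # Combining this instance with another tokenized UniTok via union() would move it
+ # to the organized state, as shown in "Combining Datasets" below.
+ ```
+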
+ ## Usage Guide
+
+ ### Loading Data
+
+ Load datasets using pandas:
+
+ ```python
+ import pandas as pd
+
+ item = pd.read_csv(
+     filepath_or_buffer='news-sample.tsv',
+     sep='\t',
+     names=['nid', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'],
+     usecols=['nid', 'category', 'subcategory', 'title', 'abstract'],
+ )
+ item['abstract'] = item['abstract'].fillna('')  # Handle missing values
+
+ user = pd.read_csv(
+     filepath_or_buffer='user-sample.tsv',
+     sep='\t',
+     names=['uid', 'history'],
+ )
+
+ interaction = pd.read_csv(
+     filepath_or_buffer='interaction-sample.tsv',
+     sep='\t',
+     names=['uid', 'nid', 'click'],
+ )
+ ```
+
+ ### Defining and Adding Jobs
+
+ Define tokenization jobs for different columns:
+
+ ```python
+ from unitok import UniTok, Vocab
+ from unitok.tokenizer import BertTokenizer, TransformersTokenizer, EntityTokenizer, SplitTokenizer, DigitTokenizer
+
+ item_vocab = Vocab(name='nid')  # will be used across datasets
+ user_vocab = Vocab(name='uid')  # will be used across datasets
+
+ with UniTok() as item_ut:
+     bert_tokenizer = BertTokenizer(vocab='bert')
+     llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
+
+     item_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+     item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+     item_ut.add_job(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+     item_ut.add_job(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+     item_ut.add_job(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+     item_ut.add_job(tokenizer=EntityTokenizer(vocab='category'), column='category')
+     item_ut.add_job(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+
+ with UniTok() as user_ut:
+     user_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+     user_ut.add_job(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+
+ with UniTok() as inter_ut:
+     inter_ut.add_index_job(name='index')
+     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+     inter_ut.add_job(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+ ```
+
+ ### Tokenizing Data
+
+ Tokenize and save the processed data:
+
+ ```python
+ item_ut.tokenize(item).save('sample-ut/item')
+ item_vocab.deny_edit()  # will raise an error if new items are detected in the user or interaction datasets
+ user_ut.tokenize(user).save('sample-ut/user')
+ inter_ut.tokenize(interaction).save('sample-ut/interaction')
+ ```
+
+ ### Combining Datasets
+
+ Combine datasets using union:
+
+ ```python
+ # => {'category': 0, 'nid': 0, 'title@bert': [1996, 9639, 3035, 3870, ...], 'title@llama': [450, 1771, 4167, 10470, ...], 'abstract@bert': [4497, 1996, 14960, 2015, ...], 'abstract@llama': [1383, 459, 278, 451, ...], 'subcategory': 0}
+ print(item_ut[0])
+
+ # => {'uid': 0, 'history': [0, 1, 2]}
+ print(user_ut[0])
+
+ # => {'uid': 0, 'nid': 7, 'index': 0, 'click': 1}
+ print(inter_ut[0])
+
+ with inter_ut:
+     inter_ut.union(user_ut)
+
+ # => {'index': 0, 'click': 1, 'uid': 0, 'nid': 7, 'history': [0, 1, 2]}
+ print(inter_ut[0])
+ ```
+
+ ### Glance at the Terminal
+
+ ```bash
+ unitok sample-ut/item
+ ```
+
+ ```text
+ UniTok (4beta)
+ Sample Size: 10
+ ID Column: nid
+
+ Jobs
+ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
+ ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
+ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
+ │ TransformersTokenizer │ auto_2VN5Ko │ abstract -> abstract@llama │ llama (size=32024) │ 50 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ EntityTokenizer │ auto_C0b9Du │ subcategory -> subcategory │ subcategory (size=8) │ N/A │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ TransformersTokenizer │ auto_2VN5Ko │ title -> title@llama │ llama (size=32024) │ 20 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ EntityTokenizer │ auto_4WQYxo │ category -> category │ category (size=4) │ N/A │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ BertTokenizer │ auto_Y9tADT │ abstract -> abstract@bert │ bert (size=30522) │ 46 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ BertTokenizer │ auto_Y9tADT │ title -> title@bert │ bert (size=30522) │ 16 │
+ ├──────────────────────────────────────┼───────────────────────┼──────────────────────────────────────────────┼───────────────────────────────────┼───────────────────┤
+ │ EntityTokenizer │ auto_qwQALc │ nid -> nid │ nid (size=10) │ N/A │
+ └──────────────────────────────────────┴───────────────────────┴──────────────────────────────────────────────┴───────────────────────────────────┴───────────────────┘
+ ```
+
+ ## Contributing
+
+ We welcome contributions to UniTok! We appreciate your feedback, bug reports, and pull requests.
+
+ Our TODO list includes:
+
+ - [ ] More detailed documentation
+ - [ ] More examples and tutorials
+ - [ ] More SQL-like operations
+ - [ ] Analysis and visualization tools
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.