UniTok 4.3.9.tar.gz → 4.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {UniTok-4.3.9 → UniTok-4.4.0}/PKG-INFO +33 -25
  2. {UniTok-4.3.9 → UniTok-4.4.0}/README.md +24 -22
  3. {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/PKG-INFO +33 -25
  4. {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/SOURCES.txt +2 -6
  5. {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/entry_points.txt +0 -1
  6. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/__main__.py +4 -4
  7. {UniTok-4.3.9 → UniTok-4.4.0}/setup.py +1 -1
  8. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/__init__.py +4 -2
  9. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/__main__.py +4 -4
  10. UniTok-4.3.9/unitok/job.py → UniTok-4.4.0/unitok/feature.py +7 -7
  11. UniTok-4.4.0/unitok/job.py +11 -0
  12. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/meta.py +14 -8
  13. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/selector.py +4 -4
  14. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/unitok.py +161 -125
  15. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/__init__.py +2 -0
  16. UniTok-4.4.0/unitok/utils/index_set/feature_set.py +25 -0
  17. UniTok-4.4.0/unitok/utils/index_set/job_set.py +4 -0
  18. UniTok-4.3.9/unitok/utils/index_set/job_set.py +0 -25
  19. {UniTok-4.3.9 → UniTok-4.4.0}/LICENSE +0 -0
  20. {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/dependency_links.txt +0 -0
  21. {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/requires.txt +0 -0
  22. {UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/top_level.txt +0 -0
  23. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/__init__.py +0 -0
  24. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/analysis/__init__.py +0 -0
  25. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/analysis/lengths.py +0 -0
  26. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/analysis/plot.py +0 -0
  27. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/cols.py +0 -0
  28. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/column.py +0 -0
  29. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/fut.py +0 -0
  30. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/global_setting.py +0 -0
  31. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/meta.py +0 -0
  32. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/__init__.py +0 -0
  33. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/bert_tok.py +0 -0
  34. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/ent_tok.py +0 -0
  35. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/id_tok.py +0 -0
  36. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/number_tok.py +0 -0
  37. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/seq_tok.py +0 -0
  38. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/split_tok.py +0 -0
  39. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/tok/tok.py +0 -0
  40. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/unidep.py +0 -0
  41. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/unitok.py +0 -0
  42. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/vocab.py +0 -0
  43. {UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/vocabs.py +0 -0
  44. {UniTok-4.3.9 → UniTok-4.4.0}/setup.cfg +0 -0
  45. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/status.py +0 -0
  46. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/__init__.py +0 -0
  47. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/base_tokenizer.py +0 -0
  48. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/digit_tokenizer.py +0 -0
  49. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/entity_tokenizer.py +0 -0
  50. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/glove_tokenizer.py +0 -0
  51. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/split_tokenizer.py +0 -0
  52. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/transformers_tokenizer.py +0 -0
  53. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/union_tokenizer.py +0 -0
  54. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/tokenizer/unknown_tokenizer.py +0 -0
  55. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/__init__.py +0 -0
  56. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/class_pool.py +0 -0
  57. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/data.py +0 -0
  58. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/function.py +0 -0
  59. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/handler/__init__.py +0 -0
  60. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/handler/json_handler.py +0 -0
  61. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/handler/pkl_handler.py +0 -0
  62. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/hub/__init__.py +0 -0
  63. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/hub/hub.py +0 -0
  64. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/hub/param_hub.py +0 -0
  65. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/index_set.py +0 -0
  66. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/tokenizer_set.py +0 -0
  67. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/index_set/vocabulary_set.py +0 -0
  68. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/instance.py +0 -0
  69. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/map.py +0 -0
  70. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/space.py +0 -0
  71. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/symbol.py +0 -0
  72. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/utils/verbose.py +0 -0
  73. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/vocabulary/__init__.py +0 -0
  74. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/vocabulary/counter.py +0 -0
  75. {UniTok-4.3.9 → UniTok-4.4.0}/unitok/vocabulary/vocabulary.py +0 -0
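
Taken together, the changes below amount to renaming `Job` to `Feature` across the public API (`add_job` → `add_feature`, `add_index_job` → `add_index_feature`, `remove_job` → `remove_feature`), with compatibility shims kept for the old `Job`, `JobHub`, and `Meta.parse_job` names. A minimal before/after sketch, reusing the tokenizer and column names from the README example shown further down:

```python
from unitok import UniTok
from unitok.tokenizer import BertTokenizer

with UniTok() as item_ut:
    bert_tokenizer = BertTokenizer(vocab='bert')

    # UniTok 4.3.9 and earlier registered a tokenization job:
    # item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)

    # UniTok 4.4.0 registers the same thing as a feature:
    item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
```
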

{UniTok-4.3.9 → UniTok-4.4.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: UniTok
- Version: 4.3.9
+ Version: 4.4.0
  Summary: Unified Tokenizer
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
  Author: Jyonn Liu
@@ -10,6 +10,14 @@ Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
  Platform: any
  Description-Content-Type: text/markdown
  License-File: LICENSE
+ Requires-Dist: termplot==0.0.2
+ Requires-Dist: tqdm
+ Requires-Dist: numpy
+ Requires-Dist: pandas
+ Requires-Dist: transformers
+ Requires-Dist: oba
+ Requires-Dist: prettytable
+ Requires-Dist: rich

  # UniTok V4

@@ -28,16 +36,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed

  ### Changes and Comparisons

+ > After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
  | Feature | UniTok v3 | UniTok v4 | Comments |
  |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
  | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
  | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
- | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
- | `Job` class | N/A | Defines how a specific column should be tokenized | |
+ | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+ | `Feature` class | N/A | Defines how a specific column should be tokenized | |
  | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
  | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
  | `analyse` method | Supported | Not supported Currently | |
- | `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+ | `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
  | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
  | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |

@@ -75,13 +85,13 @@ pip install unitok
  **Components**

  - UniTok: Manages the dataset preprocessing lifecycle.
- - Job: Defines how a specific column should be tokenized.
+ - Feature: Defines how a specific column should be tokenized.
  - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
  - Vocabulary: Stores and manages unique tokens across datasets.

- **Primary Key (key_job)**
+ **Primary Key (key_feature)**

- The `key_job` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
+ The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.

  ## Usage Guide

@@ -113,9 +123,9 @@ interaction = pd.read_csv(
  )
  ```

- ### Defining and Adding Jobs
+ ### Defining and Adding Features

- Define tokenization jobs for different columns:
+ Define tokenization features for different columns:

  ```python
  from unitok import UniTok, Vocab
@@ -128,23 +138,23 @@ with UniTok() as item_ut:
      bert_tokenizer = BertTokenizer(vocab='bert')
      llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')

-     item_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
-     item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
-     item_ut.add_job(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
-     item_ut.add_job(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
-     item_ut.add_job(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
-     item_ut.add_job(tokenizer=EntityTokenizer(vocab='category'), column='category')
-     item_ut.add_job(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+     item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+     item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+     item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+     item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')

  with UniTok() as user_ut:
-     user_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
-     user_ut.add_job(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+     user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+     user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)

  with UniTok() as inter_ut:
-     inter_ut.add_index_job(name='index')
-     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
-     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
-     inter_ut.add_job(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+     inter_ut.add_index_feature(name='index')
+     inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+     inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+     inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
  ```

  ### Tokenizing Data
@@ -190,7 +200,7 @@ UniTok (4beta)
  Sample Size: 10
  ID Column: nid

- Jobs
+ Features
  ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
  ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
  ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
@@ -224,5 +234,3 @@ Our TODO list includes:
  ## License

  This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-

{UniTok-4.3.9 → UniTok-4.4.0}/README.md

@@ -15,16 +15,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed

  ### Changes and Comparisons

+ > After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
  | Feature | UniTok v3 | UniTok v4 | Comments |
  |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
  | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
  | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
- | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
- | `Job` class | N/A | Defines how a specific column should be tokenized | |
+ | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+ | `Feature` class | N/A | Defines how a specific column should be tokenized | |
  | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
  | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
  | `analyse` method | Supported | Not supported Currently | |
- | `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+ | `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
  | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
  | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |

@@ -62,13 +64,13 @@ pip install unitok
  **Components**

  - UniTok: Manages the dataset preprocessing lifecycle.
- - Job: Defines how a specific column should be tokenized.
+ - Feature: Defines how a specific column should be tokenized.
  - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
  - Vocabulary: Stores and manages unique tokens across datasets.

- **Primary Key (key_job)**
+ **Primary Key (key_feature)**

- The `key_job` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
+ The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.

  ## Usage Guide

@@ -100,9 +102,9 @@ interaction = pd.read_csv(
  )
  ```

- ### Defining and Adding Jobs
+ ### Defining and Adding Features

- Define tokenization jobs for different columns:
+ Define tokenization features for different columns:

  ```python
  from unitok import UniTok, Vocab
@@ -115,23 +117,23 @@ with UniTok() as item_ut:
      bert_tokenizer = BertTokenizer(vocab='bert')
      llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')

-     item_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
-     item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
-     item_ut.add_job(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
-     item_ut.add_job(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
-     item_ut.add_job(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
-     item_ut.add_job(tokenizer=EntityTokenizer(vocab='category'), column='category')
-     item_ut.add_job(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+     item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+     item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+     item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+     item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')

  with UniTok() as user_ut:
-     user_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
-     user_ut.add_job(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+     user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+     user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)

  with UniTok() as inter_ut:
-     inter_ut.add_index_job(name='index')
-     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
-     inter_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
-     inter_ut.add_job(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+     inter_ut.add_index_feature(name='index')
+     inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+     inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+     inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
  ```

  ### Tokenizing Data
@@ -177,7 +179,7 @@ UniTok (4beta)
  Sample Size: 10
  ID Column: nid

- Jobs
+ Features
  ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
  ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
  ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩

{UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/PKG-INFO

(The generated egg-info metadata changes in exactly the same way as the top-level PKG-INFO; the diff is identical to the {UniTok-4.3.9 → UniTok-4.4.0}/PKG-INFO diff above.)

{UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/SOURCES.txt

@@ -31,17 +31,12 @@ UniTokv3/tok/split_tok.py
  UniTokv3/tok/tok.py
  unitok/__init__.py
  unitok/__main__.py
+ unitok/feature.py
  unitok/job.py
  unitok/meta.py
  unitok/selector.py
  unitok/status.py
  unitok/unitok.py
- unitok.egg-info/PKG-INFO
- unitok.egg-info/SOURCES.txt
- unitok.egg-info/dependency_links.txt
- unitok.egg-info/entry_points.txt
- unitok.egg-info/requires.txt
- unitok.egg-info/top_level.txt
  unitok/tokenizer/__init__.py
  unitok/tokenizer/base_tokenizer.py
  unitok/tokenizer/digit_tokenizer.py
@@ -67,6 +62,7 @@ unitok/utils/hub/__init__.py
  unitok/utils/hub/hub.py
  unitok/utils/hub/param_hub.py
  unitok/utils/index_set/__init__.py
+ unitok/utils/index_set/feature_set.py
  unitok/utils/index_set/index_set.py
  unitok/utils/index_set/job_set.py
  unitok/utils/index_set/tokenizer_set.py

{UniTok-4.3.9 → UniTok-4.4.0}/UniTok.egg-info/entry_points.txt

@@ -2,4 +2,3 @@
  unidep-upgrade-v4 = UniTokv3.__main__:upgrade
  unitok = unitok.__main__:main
  unitokv3 = UniTokv3.__main__:main
-

{UniTok-4.3.9 → UniTok-4.4.0}/UniTokv3/__main__.py

@@ -9,7 +9,7 @@ from rich.table import Table

  from UniTokv3 import UniDep, Meta, Vocab
  from unitok.vocabulary import Vocab as Vocabv4
- from unitok.job import Job as Jobv4
+ from unitok.feature import Feature as Featurev4
  from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
  from unitok.unitok import UniTok as UniTokv4
  from unitok.meta import Meta as Metav4
@@ -127,7 +127,7 @@ def upgrade():
  ut.meta.vocabularies.add(vocab_beta)

  for col in voc.cols:
- print(f'\tUpgrade job {col.name}')
+ print(f'\tUpgrade feature {col.name}')
  col_data = data[col.name]
  if not len(col_data):
  print(f'\t\tWarning: empty column {col.name}, defaulting to an atom column')
@@ -149,7 +149,7 @@ def upgrade():
  tokenizer_id='upgrade_' + col.name,
  vocab=vocab_beta,
  )
- job = Jobv4(
+ feature = Featurev4(
  name=col.name,
  column=col.name,
  tokenizer=tokenizer,
@@ -159,7 +159,7 @@ def upgrade():
  max_len=max_len,
  )
  ut.meta.tokenizers.add(tokenizer)
- ut.meta.jobs.add(job)
+ ut.meta.features.add(feature)

  ut.meta.save(path)

{UniTok-4.3.9 → UniTok-4.4.0}/setup.py

@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')

  setup(
  name='UniTok',
- version='4.3.9',
+ version='4.4.0',
  keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
  description='Unified Tokenizer',
  long_description=long_description,

{UniTok-4.3.9 → UniTok-4.4.0}/unitok/__init__.py

@@ -11,8 +11,9 @@ from unitok.tokenizer import TransformersTokenizer, BertTokenizer
  from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
  from unitok.tokenizer import GloVeTokenizer
  from unitok.job import Job, JobHub
+ from unitok.feature import Feature, FeatureHub

- from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
+ from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet, FeatureSet

  from unitok.meta import Meta
  from unitok.status import Status
@@ -32,7 +33,8 @@ __all__ = [
  'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
  'GloVeTokenizer',
  'Job', 'JobHub',
- 'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet',
+ 'Feature', 'FeatureHub',
+ 'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet', 'FeatureSet',
  'Meta',
  'Status',
  'UniTok',

{UniTok-4.3.9 → UniTok-4.4.0}/unitok/__main__.py

@@ -15,7 +15,7 @@ def integrate():
  parser.add_argument('--file', '-f', type=str, help='csv, tsv, parquet format data')
  parser.add_argument('--lib', type=str, default=None, help='custom tokenizer library')
  parser.add_argument('--column', '-c', type=str, help='column name to tokenize')
- parser.add_argument('--name', '-n', type=str, help='job name and export column name')
+ parser.add_argument('--name', '-n', type=str, help='export feature name name')
  parser.add_argument('--vocab', '-v', type=str, default=None, help='vocabulary name')
  parser.add_argument('--tokenizer', '-t', type=str, default=None, help='tokenizer classname')
  parser.add_argument('--tokenizer_id', type=str, default=None, help='tokenizer id')
@@ -69,7 +69,7 @@ def integrate():
  raise ValueError(f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}')
  tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)

- ut.add_job(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
+ ut.add_feature(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
  ut.tokenize(df).save(args.path)


@@ -85,11 +85,11 @@ def summarize():
  def remove():
  parser = argparse.ArgumentParser()
  parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
- parser.add_argument('--name', type=str, help='job name to remove')
+ parser.add_argument('--name', type=str, help='feature name to remove')
  args, _ = parser.parse_known_args()

  with UniTok.load(args.path) as ut:
- ut.remove_job(args.name)
+ ut.remove_feature(args.name)
  ut.save(args.path)

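
The renamed `remove` entry point above calls the public API directly; a programmatic equivalent is sketched below (the data directory and feature name are hypothetical):

```python
from unitok import UniTok

# Drop one tokenized feature from an existing UniTok data directory and save
# the result, mirroring the CLI remove() function shown above.
with UniTok.load('data/items') as ut:
    ut.remove_feature('title@llama')  # was ut.remove_job(...) in 4.3.9 and earlier
    ut.save('data/items')
```
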

UniTok-4.3.9/unitok/job.py → UniTok-4.4.0/unitok/feature.py

@@ -8,7 +8,7 @@ from unitok.utils import Symbols, Instance
  from unitok.utils.hub import Hub


- class Job:
+ class Feature:
  def __init__(
  self,
  tokenizer: Union[BaseTokenizer, str],
@@ -35,7 +35,7 @@ class Job:
  self.max_len = max_len
  self.from_union = isinstance(self.tokenizer, UnionTokenizer)

- JobHub.add(self)
+ FeatureHub.add(self)
  VocabHub.add(self.tokenizer.vocab)

  @property
@@ -48,12 +48,12 @@ class Job:
  for attr in attributes:
  params[attr] = kwargs[attr] if attr in kwargs else getattr(self, attr)

- return Job(**params)
+ return Feature(**params)

  def __str__(self):
  if self.key:
- return f'Job({self.column} => {self.name}) [PK]'
- return f'Job({self.column} => {self.name})'
+ return f'Feature({self.column} => {self.name}) [PK]'
+ return f'Feature({self.column} => {self.name})'

  def __repr__(self):
  return str(self)
@@ -85,10 +85,10 @@ class Job:
  return slice(None)


- class JobHub(Hub[Job]):
+ class FeatureHub(Hub[Feature]):
  _instance = Instance(compulsory_space=True)

  @classmethod
- def add(cls, key, obj: Job = None):
+ def add(cls, key, obj: Feature = None):
  key, obj = key.name, key
  return super().add(key, obj)

UniTok-4.4.0/unitok/job.py (new file)

@@ -0,0 +1,11 @@
+ import warnings
+ from unitok.feature import Feature, FeatureHub
+
+
+ class Job(Feature):
+ def __init__(self, **kwargs):
+ warnings.deprecated(f'Job is deprecated, use Feature instead.')
+ super().__init__(**kwargs)
+
+
+ JobHub = FeatureHub
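
Together with the updated `unitok/__init__.py` earlier in this diff, the shim keeps the legacy names importable; a small sketch of what downstream code can still rely on:

```python
# Both spellings remain importable from the package root in 4.4.0: the legacy
# Job class is a thin subclass of Feature, and JobHub is a plain alias.
from unitok import Feature, FeatureHub, Job, JobHub

assert issubclass(Job, Feature)
assert JobHub is FeatureHub
```
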

{UniTok-4.3.9 → UniTok-4.4.0}/unitok/meta.py

@@ -1,21 +1,22 @@
  import json
  import os
+ import warnings
  from datetime import datetime

+ from unitok.feature import Feature
  from unitok.utils.verbose import warning
- from unitok.job import Job
  from unitok.tokenizer import TokenizerHub
  from unitok.tokenizer.union_tokenizer import UnionTokenizer
  from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
  from unitok.utils import Symbols
  from unitok.utils.handler import JsonHandler
  from unitok.utils.class_pool import ClassPool
- from unitok.utils.index_set import VocabSet, TokenizerSet, JobSet
+ from unitok.utils.index_set import VocabSet, TokenizerSet, FeatureSet
  from unitok.vocabulary import Vocab, VocabHub


  class Meta:
- version = 'unidep-v4'
+ version = 'unidep-v4.1'

  def __init__(self):
  self.note = ('Not compatible with unitok-v3 or lower version, '
@@ -24,7 +25,7 @@ class Meta:
  self.modified_at = self.created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  self.vocabularies = VocabSet()
  self.tokenizers = TokenizerSet()
- self.jobs = JobSet()
+ self.features = FeatureSet()

  @staticmethod
  def parse_vocabulary(name: str, **kwargs):
@@ -45,7 +46,7 @@ class Meta:
  return tokenizer_classes[classname](tokenizer_id=tokenizer_id, vocab=vocab, **params)

  @staticmethod
- def parse_job(name: str, column: str, tokenizer: str, truncate: int, order: int, key: bool, max_len: int):
+ def parse_feature(name: str, column: str, tokenizer: str, truncate: int, order: int, key: bool, max_len: int):
  if not TokenizerHub.has(tokenizer):
  raise ValueError(f"(unitok.meta) Tokenizer {tokenizer} not found in the tokenizer hub.")
  tokenizer = TokenizerHub.get(tokenizer)
@@ -53,7 +54,7 @@ class Meta:
  if column == str(Symbols.idx):
  column = Symbols.idx

- return Job(
+ return Feature(
  name=name,
  column=column,
  tokenizer=tokenizer,
@@ -63,6 +64,11 @@ class Meta:
  max_len=max_len,
  )

+ @staticmethod
+ def parse_job(name: str, column: str, tokenizer: str, truncate: int, order: int, key: bool, max_len: int):
+ warnings.deprecated('`parse_job` is deprecated, use `parse_feature` instead.', stacklevel=2)
+ return Meta.parse_feature(name, column, tokenizer, truncate, order, key, max_len)
+
  @staticmethod
  def parse_version(version):
  if version.startswith('unidep-v'):
@@ -115,7 +121,7 @@ class Meta:
  meta.created_at = kwargs.get('created_at')
  meta.vocabularies = VocabSet({cls.parse_vocabulary(**v).load(save_dir) for v in kwargs.get('vocabularies')})
  meta.tokenizers = TokenizerSet({cls.parse_tokenizer(**t) for t in kwargs.get('tokenizers')})
- meta.jobs = JobSet({cls.parse_job(**j) for j in kwargs.get('jobs')})
+ meta.features = FeatureSet({cls.parse_feature(**f) for f in kwargs.get('features') or kwargs.get('jobs')})
  meta.version = kwargs.get('version')

  return meta
@@ -129,7 +135,7 @@ class Meta:
  "modified_at": self.modified_at,
  "vocabularies": [v.json() for v in self.vocabularies],
  "tokenizers": [t.json() for t in self.tokenizers],
- "jobs": [j.json() for j in self.jobs],
+ "features": [f.json() for f in self.features],
  }

  def save(self, save_dir):
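
The `Meta.load` change above keeps pre-4.4.0 datasets readable: an old `meta.json` stores its entries under the `jobs` key, a 4.4.0 file stores them under `features`, and the loader falls back from one key to the other. A minimal sketch of that lookup, with illustrative dictionaries trimmed to the relevant keys:

```python
# Old metadata (unidep-v4) keeps a "jobs" list; new metadata (unidep-v4.1) uses "features".
old_meta = {"version": "unidep-v4", "jobs": [{"name": "title@bert"}]}
new_meta = {"version": "unidep-v4.1", "features": [{"name": "title@bert"}]}

for kwargs in (old_meta, new_meta):
    # Same fallback expression as in Meta.load above.
    entries = kwargs.get("features") or kwargs.get("jobs")
    assert entries[0]["name"] == "title@bert"
```
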