UniTok 4.3.9.tar.gz → 4.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {UniTok-4.3.9 → UniTok-4.4.1}/PKG-INFO +34 -25
  2. {UniTok-4.3.9 → UniTok-4.4.1}/README.md +24 -22
  3. {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/PKG-INFO +34 -25
  4. {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/SOURCES.txt +2 -6
  5. {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/entry_points.txt +0 -1
  6. {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/requires.txt +1 -0
  7. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/__main__.py +4 -4
  8. {UniTok-4.3.9 → UniTok-4.4.1}/setup.py +3 -2
  9. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/__init__.py +4 -2
  10. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/__main__.py +4 -4
  11. UniTok-4.3.9/unitok/job.py → UniTok-4.4.1/unitok/feature.py +7 -7
  12. UniTok-4.4.1/unitok/job.py +11 -0
  13. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/meta.py +14 -8
  14. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/selector.py +4 -4
  15. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/unitok.py +161 -125
  16. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/__init__.py +2 -0
  17. UniTok-4.4.1/unitok/utils/index_set/feature_set.py +25 -0
  18. UniTok-4.4.1/unitok/utils/index_set/job_set.py +4 -0
  19. UniTok-4.3.9/unitok/utils/index_set/job_set.py +0 -25
  20. {UniTok-4.3.9 → UniTok-4.4.1}/LICENSE +0 -0
  21. {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/dependency_links.txt +0 -0
  22. {UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/top_level.txt +0 -0
  23. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/__init__.py +0 -0
  24. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/analysis/__init__.py +0 -0
  25. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/analysis/lengths.py +0 -0
  26. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/analysis/plot.py +0 -0
  27. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/cols.py +0 -0
  28. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/column.py +0 -0
  29. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/fut.py +0 -0
  30. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/global_setting.py +0 -0
  31. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/meta.py +0 -0
  32. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/__init__.py +0 -0
  33. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/bert_tok.py +0 -0
  34. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/ent_tok.py +0 -0
  35. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/id_tok.py +0 -0
  36. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/number_tok.py +0 -0
  37. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/seq_tok.py +0 -0
  38. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/split_tok.py +0 -0
  39. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/tok/tok.py +0 -0
  40. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/unidep.py +0 -0
  41. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/unitok.py +0 -0
  42. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/vocab.py +0 -0
  43. {UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/vocabs.py +0 -0
  44. {UniTok-4.3.9 → UniTok-4.4.1}/setup.cfg +0 -0
  45. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/status.py +0 -0
  46. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/__init__.py +0 -0
  47. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/base_tokenizer.py +0 -0
  48. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/digit_tokenizer.py +0 -0
  49. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/entity_tokenizer.py +0 -0
  50. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/glove_tokenizer.py +0 -0
  51. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/split_tokenizer.py +0 -0
  52. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/transformers_tokenizer.py +0 -0
  53. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/union_tokenizer.py +0 -0
  54. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/tokenizer/unknown_tokenizer.py +0 -0
  55. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/__init__.py +0 -0
  56. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/class_pool.py +0 -0
  57. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/data.py +0 -0
  58. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/function.py +0 -0
  59. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/handler/__init__.py +0 -0
  60. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/handler/json_handler.py +0 -0
  61. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/handler/pkl_handler.py +0 -0
  62. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/hub/__init__.py +0 -0
  63. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/hub/hub.py +0 -0
  64. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/hub/param_hub.py +0 -0
  65. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/index_set.py +0 -0
  66. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/tokenizer_set.py +0 -0
  67. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/index_set/vocabulary_set.py +0 -0
  68. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/instance.py +0 -0
  69. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/map.py +0 -0
  70. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/space.py +0 -0
  71. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/symbol.py +0 -0
  72. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/utils/verbose.py +0 -0
  73. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/vocabulary/__init__.py +0 -0
  74. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/vocabulary/counter.py +0 -0
  75. {UniTok-4.3.9 → UniTok-4.4.1}/unitok/vocabulary/vocabulary.py +0 -0

{UniTok-4.3.9 → UniTok-4.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: UniTok
- Version: 4.3.9
+ Version: 4.4.1
  Summary: Unified Tokenizer
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
  Author: Jyonn Liu
@@ -10,6 +10,15 @@ Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
  Platform: any
  Description-Content-Type: text/markdown
  License-File: LICENSE
+ Requires-Dist: termplot==0.0.2
+ Requires-Dist: tqdm
+ Requires-Dist: numpy
+ Requires-Dist: pandas
+ Requires-Dist: transformers
+ Requires-Dist: oba
+ Requires-Dist: prettytable
+ Requires-Dist: rich
+ Requires-Dist: fastparquet

  # UniTok V4

@@ -28,16 +37,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed

  ### Changes and Comparisons

+ > After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
  | Feature | UniTok v3 | UniTok v4 | Comments |
  |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
  | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
  | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
- | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
- | `Job` class | N/A | Defines how a specific column should be tokenized | |
+ | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+ | `Feature` class | N/A | Defines how a specific column should be tokenized | |
  | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
  | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
  | `analyse` method | Supported | Not supported Currently | |
- | `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+ | `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
  | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
  | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |

@@ -75,13 +86,13 @@ pip install unitok
  **Components**

  - UniTok: Manages the dataset preprocessing lifecycle.
- - Job: Defines how a specific column should be tokenized.
+ - Feature: Defines how a specific column should be tokenized.
  - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
  - Vocabulary: Stores and manages unique tokens across datasets.

- **Primary Key (key_job)**
+ **Primary Key (key_feature)**

- The `key_job` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
+ The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.

  ## Usage Guide

@@ -113,9 +124,9 @@ interaction = pd.read_csv(
  )
  ```

- ### Defining and Adding Jobs
+ ### Defining and Adding Features

- Define tokenization jobs for different columns:
+ Define tokenization features for different columns:

  ```python
  from unitok import UniTok, Vocab
@@ -128,23 +139,23 @@ with UniTok() as item_ut:
  bert_tokenizer = BertTokenizer(vocab='bert')
  llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')

- item_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
- item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
- item_ut.add_job(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
- item_ut.add_job(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
- item_ut.add_job(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
- item_ut.add_job(tokenizer=EntityTokenizer(vocab='category'), column='category')
- item_ut.add_job(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+ item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+ item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+ item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+ item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')

  with UniTok() as user_ut:
- user_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
- user_ut.add_job(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+ user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+ user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)

  with UniTok() as inter_ut:
- inter_ut.add_index_job(name='index')
- inter_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
- inter_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
- inter_ut.add_job(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+ inter_ut.add_index_feature(name='index')
+ inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+ inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+ inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
  ```

  ### Tokenizing Data
@@ -190,7 +201,7 @@ UniTok (4beta)
  Sample Size: 10
  ID Column: nid

- Jobs
+ Features
  ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
  ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
  ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
@@ -224,5 +235,3 @@ Our TODO list includes:
  ## License

  This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-

{UniTok-4.3.9 → UniTok-4.4.1}/README.md

@@ -15,16 +15,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed

  ### Changes and Comparisons

+ > After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
  | Feature | UniTok v3 | UniTok v4 | Comments |
  |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
  | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
  | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
- | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
- | `Job` class | N/A | Defines how a specific column should be tokenized | |
+ | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+ | `Feature` class | N/A | Defines how a specific column should be tokenized | |
  | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
  | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
  | `analyse` method | Supported | Not supported Currently | |
- | `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+ | `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
  | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
  | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |

@@ -62,13 +64,13 @@ pip install unitok
  **Components**

  - UniTok: Manages the dataset preprocessing lifecycle.
- - Job: Defines how a specific column should be tokenized.
+ - Feature: Defines how a specific column should be tokenized.
  - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
  - Vocabulary: Stores and manages unique tokens across datasets.

- **Primary Key (key_job)**
+ **Primary Key (key_feature)**

- The `key_job` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
+ The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.

  ## Usage Guide

@@ -100,9 +102,9 @@ interaction = pd.read_csv(
  )
  ```

- ### Defining and Adding Jobs
+ ### Defining and Adding Features

- Define tokenization jobs for different columns:
+ Define tokenization features for different columns:

  ```python
  from unitok import UniTok, Vocab
@@ -115,23 +117,23 @@ with UniTok() as item_ut:
  bert_tokenizer = BertTokenizer(vocab='bert')
  llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')

- item_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
- item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
- item_ut.add_job(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
- item_ut.add_job(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
- item_ut.add_job(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
- item_ut.add_job(tokenizer=EntityTokenizer(vocab='category'), column='category')
- item_ut.add_job(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+ item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+ item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+ item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+ item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')

  with UniTok() as user_ut:
- user_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
- user_ut.add_job(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+ user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+ user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)

  with UniTok() as inter_ut:
- inter_ut.add_index_job(name='index')
- inter_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
- inter_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
- inter_ut.add_job(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+ inter_ut.add_index_feature(name='index')
+ inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+ inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+ inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
  ```

  ### Tokenizing Data
@@ -177,7 +179,7 @@ UniTok (4beta)
  Sample Size: 10
  ID Column: nid

- Jobs
+ Features
  ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
  ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
  ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩

{UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: UniTok
- Version: 4.3.9
+ Version: 4.4.1
  Summary: Unified Tokenizer
  Home-page: https://github.com/Jyonn/UnifiedTokenizer
  Author: Jyonn Liu
@@ -10,6 +10,15 @@ Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
  Platform: any
  Description-Content-Type: text/markdown
  License-File: LICENSE
+ Requires-Dist: termplot==0.0.2
+ Requires-Dist: tqdm
+ Requires-Dist: numpy
+ Requires-Dist: pandas
+ Requires-Dist: transformers
+ Requires-Dist: oba
+ Requires-Dist: prettytable
+ Requires-Dist: rich
+ Requires-Dist: fastparquet

  # UniTok V4

@@ -28,16 +37,18 @@ Please refer to [UniTok Handbook](https://unitok.qijiong.work) for more detailed

  ### Changes and Comparisons

+ > After UniTok 4.4.0, `Job` is renamed to `Feature`.
+
  | Feature | UniTok v3 | UniTok v4 | Comments |
  |---------------------------------|-------------------------------------------------------------|-----------------------------------------------------|-------------------------------------------------------------------------------|
  | `UniTok` class | Solely for tokenization | Manages the entire preprocessing lifecycle | |
  | `UniDep` class | Data loading and combining | Removed | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
- | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Job` class. |
- | `Job` class | N/A | Defines how a specific column should be tokenized | |
+ | `Column` class | Column name is for both the original and tokenized datasets | N/A | V4 introduces a `Feature` class. |
+ | `Feature` class | N/A | Defines how a specific column should be tokenized | |
  | `Tokenizer` class | Ambiguous return type definition | `return_list` parameter must be of type `bool` | |
  | `Tokenizer` class | Only supports `BertTokenizer` for text processing | Supports all Tokenizers in the transformers library | New `TransformersTokenizer` class |
  | `analyse` method | Supported | Not supported Currently | |
- | `Meta` class | Only for human-friendly displaying | Manager for `Job`, `Tokenizer`, and `Vocab` | |
+ | `Meta` class | Only for human-friendly displaying | Manager for `Feature`, `Tokenizer`, and `Vocab` | |
  | `unitok` command | Visualization in the terminal | More colorful and detailed output | |
  | `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files | Save and load vocabulary using pickle files | Avoids issues with special characters in text files |

@@ -75,13 +86,13 @@ pip install unitok
  **Components**

  - UniTok: Manages the dataset preprocessing lifecycle.
- - Job: Defines how a specific column should be tokenized.
+ - Feature: Defines how a specific column should be tokenized.
  - Tokenizer: Encodes data using various methods (e.g., BERT, splitting by delimiters).
  - Vocabulary: Stores and manages unique tokens across datasets.

- **Primary Key (key_job)**
+ **Primary Key (key_feature)**

- The `key_job` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.
+ The `key_feature` acts as the primary key for operations like `getitem` and `union`, ensuring consistency across datasets.

  ## Usage Guide

@@ -113,9 +124,9 @@ interaction = pd.read_csv(
  )
  ```

- ### Defining and Adding Jobs
+ ### Defining and Adding Features

- Define tokenization jobs for different columns:
+ Define tokenization features for different columns:

  ```python
  from unitok import UniTok, Vocab
@@ -128,23 +139,23 @@ with UniTok() as item_ut:
  bert_tokenizer = BertTokenizer(vocab='bert')
  llama_tokenizer = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')

- item_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
- item_ut.add_job(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
- item_ut.add_job(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
- item_ut.add_job(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
- item_ut.add_job(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
- item_ut.add_job(tokenizer=EntityTokenizer(vocab='category'), column='category')
- item_ut.add_job(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+ item_ut.add_feature(tokenizer=bert_tokenizer, column='title', name='title@bert', truncate=20)
+ item_ut.add_feature(tokenizer=llama_tokenizer, column='title', name='title@llama', truncate=20)
+ item_ut.add_feature(tokenizer=bert_tokenizer, column='abstract', name='abstract@bert', truncate=50)
+ item_ut.add_feature(tokenizer=llama_tokenizer, column='abstract', name='abstract@llama', truncate=50)
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')

  with UniTok() as user_ut:
- user_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
- user_ut.add_job(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+ user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+ user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)

  with UniTok() as inter_ut:
- inter_ut.add_index_job(name='index')
- inter_ut.add_job(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
- inter_ut.add_job(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
- inter_ut.add_job(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+ inter_ut.add_index_feature(name='index')
+ inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+ inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+ inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
  ```

  ### Tokenizing Data
@@ -190,7 +201,7 @@ UniTok (4beta)
  Sample Size: 10
  ID Column: nid

- Jobs
+ Features
  ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
  ┃ Tokenizer ┃ Tokenizer ID ┃ Column Mapping ┃ Vocab ┃ Max Length ┃
  ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
@@ -224,5 +235,3 @@ Our TODO list includes:
  ## License

  This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-

{UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/SOURCES.txt

@@ -31,17 +31,12 @@ UniTokv3/tok/split_tok.py
  UniTokv3/tok/tok.py
  unitok/__init__.py
  unitok/__main__.py
+ unitok/feature.py
  unitok/job.py
  unitok/meta.py
  unitok/selector.py
  unitok/status.py
  unitok/unitok.py
- unitok.egg-info/PKG-INFO
- unitok.egg-info/SOURCES.txt
- unitok.egg-info/dependency_links.txt
- unitok.egg-info/entry_points.txt
- unitok.egg-info/requires.txt
- unitok.egg-info/top_level.txt
  unitok/tokenizer/__init__.py
  unitok/tokenizer/base_tokenizer.py
  unitok/tokenizer/digit_tokenizer.py
@@ -67,6 +62,7 @@ unitok/utils/hub/__init__.py
  unitok/utils/hub/hub.py
  unitok/utils/hub/param_hub.py
  unitok/utils/index_set/__init__.py
+ unitok/utils/index_set/feature_set.py
  unitok/utils/index_set/index_set.py
  unitok/utils/index_set/job_set.py
  unitok/utils/index_set/tokenizer_set.py

{UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/entry_points.txt

@@ -2,4 +2,3 @@
  unidep-upgrade-v4 = UniTokv3.__main__:upgrade
  unitok = unitok.__main__:main
  unitokv3 = UniTokv3.__main__:main
-

{UniTok-4.3.9 → UniTok-4.4.1}/UniTok.egg-info/requires.txt

@@ -6,3 +6,4 @@ transformers
  oba
  prettytable
  rich
+ fastparquet

{UniTok-4.3.9 → UniTok-4.4.1}/UniTokv3/__main__.py

@@ -9,7 +9,7 @@ from rich.table import Table

  from UniTokv3 import UniDep, Meta, Vocab
  from unitok.vocabulary import Vocab as Vocabv4
- from unitok.job import Job as Jobv4
+ from unitok.feature import Feature as Featurev4
  from unitok.tokenizer.unknown_tokenizer import UnknownTokenizer
  from unitok.unitok import UniTok as UniTokv4
  from unitok.meta import Meta as Metav4
@@ -127,7 +127,7 @@ def upgrade():
  ut.meta.vocabularies.add(vocab_beta)

  for col in voc.cols:
- print(f'\tUpgrade job {col.name}')
+ print(f'\tUpgrade feature {col.name}')
  col_data = data[col.name]
  if not len(col_data):
  print(f'\t\tWarning: empty column {col.name}, defaulting to an atom column')
@@ -149,7 +149,7 @@ def upgrade():
  tokenizer_id='upgrade_' + col.name,
  vocab=vocab_beta,
  )
- job = Jobv4(
+ feature = Featurev4(
  name=col.name,
  column=col.name,
  tokenizer=tokenizer,
@@ -159,7 +159,7 @@ def upgrade():
  max_len=max_len,
  )
  ut.meta.tokenizers.add(tokenizer)
- ut.meta.jobs.add(job)
+ ut.meta.features.add(feature)

  ut.meta.save(path)


{UniTok-4.3.9 → UniTok-4.4.1}/setup.py

@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')

  setup(
  name='UniTok',
- version='4.3.9',
+ version='4.4.1',
  keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
  description='Unified Tokenizer',
  long_description=long_description,
@@ -25,7 +25,8 @@ setup(
  'transformers',
  'oba',
  'prettytable',
- 'rich'
+ 'rich',
+ 'fastparquet'
  ],
  entry_points={
  'console_scripts': [

{UniTok-4.3.9 → UniTok-4.4.1}/unitok/__init__.py

@@ -11,8 +11,9 @@ from unitok.tokenizer import TransformersTokenizer, BertTokenizer
  from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
  from unitok.tokenizer import GloVeTokenizer
  from unitok.job import Job, JobHub
+ from unitok.feature import Feature, FeatureHub

- from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
+ from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet, FeatureSet

  from unitok.meta import Meta
  from unitok.status import Status
@@ -32,7 +33,8 @@ __all__ = [
  'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
  'GloVeTokenizer',
  'Job', 'JobHub',
- 'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet',
+ 'Feature', 'FeatureHub',
+ 'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet', 'FeatureSet',
  'Meta',
  'Status',
  'UniTok',

{UniTok-4.3.9 → UniTok-4.4.1}/unitok/__main__.py

@@ -15,7 +15,7 @@ def integrate():
  parser.add_argument('--file', '-f', type=str, help='csv, tsv, parquet format data')
  parser.add_argument('--lib', type=str, default=None, help='custom tokenizer library')
  parser.add_argument('--column', '-c', type=str, help='column name to tokenize')
- parser.add_argument('--name', '-n', type=str, help='job name and export column name')
+ parser.add_argument('--name', '-n', type=str, help='export feature name name')
  parser.add_argument('--vocab', '-v', type=str, default=None, help='vocabulary name')
  parser.add_argument('--tokenizer', '-t', type=str, default=None, help='tokenizer classname')
  parser.add_argument('--tokenizer_id', type=str, default=None, help='tokenizer id')
@@ -69,7 +69,7 @@ def integrate():
  raise ValueError(f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}')
  tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)

- ut.add_job(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
+ ut.add_feature(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
  ut.tokenize(df).save(args.path)


@@ -85,11 +85,11 @@
  def remove():
  parser = argparse.ArgumentParser()
  parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
- parser.add_argument('--name', type=str, help='job name to remove')
+ parser.add_argument('--name', type=str, help='feature name to remove')
  args, _ = parser.parse_known_args()

  with UniTok.load(args.path) as ut:
- ut.remove_job(args.name)
+ ut.remove_feature(args.name)
  ut.save(args.path)


UniTok-4.3.9/unitok/job.py → UniTok-4.4.1/unitok/feature.py

@@ -8,7 +8,7 @@ from unitok.utils import Symbols, Instance
  from unitok.utils.hub import Hub


- class Job:
+ class Feature:
  def __init__(
  self,
  tokenizer: Union[BaseTokenizer, str],
@@ -35,7 +35,7 @@ class Job:
  self.max_len = max_len
  self.from_union = isinstance(self.tokenizer, UnionTokenizer)

- JobHub.add(self)
+ FeatureHub.add(self)
  VocabHub.add(self.tokenizer.vocab)

  @property
@@ -48,12 +48,12 @@ class Job:
  for attr in attributes:
  params[attr] = kwargs[attr] if attr in kwargs else getattr(self, attr)

- return Job(**params)
+ return Feature(**params)

  def __str__(self):
  if self.key:
- return f'Job({self.column} => {self.name}) [PK]'
- return f'Job({self.column} => {self.name})'
+ return f'Feature({self.column} => {self.name}) [PK]'
+ return f'Feature({self.column} => {self.name})'

  def __repr__(self):
  return str(self)
@@ -85,10 +85,10 @@ class Job:
  return slice(None)


- class JobHub(Hub[Job]):
+ class FeatureHub(Hub[Feature]):
  _instance = Instance(compulsory_space=True)

  @classmethod
- def add(cls, key, obj: Job = None):
+ def add(cls, key, obj: Feature = None):
  key, obj = key.name, key
  return super().add(key, obj)

UniTok-4.4.1/unitok/job.py

@@ -0,0 +1,11 @@
+ import warnings
+ from unitok.feature import Feature, FeatureHub
+
+
+ class Job(Feature):
+ def __init__(self, **kwargs):
+ warnings.warn(f'`Job` class is deprecated, use `Feature`.', DeprecationWarning, stacklevel=2)
+ super().__init__(**kwargs)
+
+
+ JobHub = FeatureHub
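
The new `unitok/job.py` above keeps the old names importable while steering callers toward `Feature`. A minimal downstream sketch of what that compatibility surface means in practice (assuming UniTok >= 4.4.1 is installed; the assertions simply restate what the shim and the `unitok/__init__.py` exports in this diff define):

```python
from unitok import Feature, FeatureHub, Job, JobHub

# Old names remain importable, but they are now thin aliases over the new ones,
# mirroring the shim in unitok/job.py shown above.
assert issubclass(Job, Feature)   # Job is a subclass of Feature
assert JobHub is FeatureHub       # JobHub is the same object as FeatureHub

# Constructing a Job (e.g. inside a `with UniTok():` block, as in the README
# examples) now emits a DeprecationWarning pointing at Feature.
```

Existing imports of `Job` and `JobHub` therefore keep resolving on 4.4.x, while the rename surfaces everywhere else in this diff (`add_feature`, `FeatureSet`, `key_feature`, and the `Features` section of the terminal summary).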