unitok-4.4.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. unitok-4.4.4/LICENSE +21 -0
  2. unitok-4.4.4/PKG-INFO +179 -0
  3. unitok-4.4.4/README.md +164 -0
  4. unitok-4.4.4/UniTok.egg-info/PKG-INFO +179 -0
  5. unitok-4.4.4/UniTok.egg-info/SOURCES.txt +78 -0
  6. unitok-4.4.4/UniTok.egg-info/dependency_links.txt +1 -0
  7. unitok-4.4.4/UniTok.egg-info/entry_points.txt +5 -0
  8. unitok-4.4.4/UniTok.egg-info/requires.txt +9 -0
  9. unitok-4.4.4/UniTok.egg-info/top_level.txt +2 -0
  10. unitok-4.4.4/UniTokv3/__init__.py +26 -0
  11. unitok-4.4.4/UniTokv3/__main__.py +169 -0
  12. unitok-4.4.4/UniTokv3/analysis/__init__.py +6 -0
  13. unitok-4.4.4/UniTokv3/analysis/lengths.py +35 -0
  14. unitok-4.4.4/UniTokv3/analysis/plot.py +48 -0
  15. unitok-4.4.4/UniTokv3/cols.py +3 -0
  16. unitok-4.4.4/UniTokv3/column.py +125 -0
  17. unitok-4.4.4/UniTokv3/fut.py +128 -0
  18. unitok-4.4.4/UniTokv3/global_setting.py +10 -0
  19. unitok-4.4.4/UniTokv3/meta.py +146 -0
  20. unitok-4.4.4/UniTokv3/tok/__init__.py +11 -0
  21. unitok-4.4.4/UniTokv3/tok/bert_tok.py +33 -0
  22. unitok-4.4.4/UniTokv3/tok/ent_tok.py +24 -0
  23. unitok-4.4.4/UniTokv3/tok/id_tok.py +8 -0
  24. unitok-4.4.4/UniTokv3/tok/number_tok.py +44 -0
  25. unitok-4.4.4/UniTokv3/tok/seq_tok.py +17 -0
  26. unitok-4.4.4/UniTokv3/tok/split_tok.py +32 -0
  27. unitok-4.4.4/UniTokv3/tok/tok.py +61 -0
  28. unitok-4.4.4/UniTokv3/unidep.py +523 -0
  29. unitok-4.4.4/UniTokv3/unitok.py +223 -0
  30. unitok-4.4.4/UniTokv3/vocab.py +294 -0
  31. unitok-4.4.4/UniTokv3/vocabs.py +58 -0
  32. unitok-4.4.4/setup.cfg +4 -0
  33. unitok-4.4.4/setup.py +38 -0
  34. unitok-4.4.4/unitok/__init__.py +41 -0
  35. unitok-4.4.4/unitok/__main__.py +180 -0
  36. unitok-4.4.4/unitok/feature.py +94 -0
  37. unitok-4.4.4/unitok/job.py +11 -0
  38. unitok-4.4.4/unitok/meta.py +148 -0
  39. unitok-4.4.4/unitok/selector.py +29 -0
  40. unitok-4.4.4/unitok/status.py +44 -0
  41. unitok-4.4.4/unitok/tokenizer/__init__.py +20 -0
  42. unitok-4.4.4/unitok/tokenizer/base_tokenizer.py +89 -0
  43. unitok-4.4.4/unitok/tokenizer/digit_tokenizer.py +34 -0
  44. unitok-4.4.4/unitok/tokenizer/entity_tokenizer.py +13 -0
  45. unitok-4.4.4/unitok/tokenizer/glove_tokenizer.py +21 -0
  46. unitok-4.4.4/unitok/tokenizer/split_tokenizer.py +14 -0
  47. unitok-4.4.4/unitok/tokenizer/transformers_tokenizer.py +74 -0
  48. unitok-4.4.4/unitok/tokenizer/union_tokenizer.py +20 -0
  49. unitok-4.4.4/unitok/tokenizer/unknown_tokenizer.py +35 -0
  50. unitok-4.4.4/unitok/unitok.py +548 -0
  51. unitok-4.4.4/unitok/utils/__init__.py +21 -0
  52. unitok-4.4.4/unitok/utils/class_pool.py +107 -0
  53. unitok-4.4.4/unitok/utils/data.py +15 -0
  54. unitok-4.4.4/unitok/utils/function.py +6 -0
  55. unitok-4.4.4/unitok/utils/handler/__init__.py +7 -0
  56. unitok-4.4.4/unitok/utils/handler/json_handler.py +28 -0
  57. unitok-4.4.4/unitok/utils/handler/pkl_handler.py +19 -0
  58. unitok-4.4.4/unitok/utils/hub/__init__.py +4 -0
  59. unitok-4.4.4/unitok/utils/hub/hub.py +48 -0
  60. unitok-4.4.4/unitok/utils/hub/param_hub.py +6 -0
  61. unitok-4.4.4/unitok/utils/index_set/__init__.py +17 -0
  62. unitok-4.4.4/unitok/utils/index_set/feature_set.py +26 -0
  63. unitok-4.4.4/unitok/utils/index_set/index_set.py +71 -0
  64. unitok-4.4.4/unitok/utils/index_set/job_set.py +4 -0
  65. unitok-4.4.4/unitok/utils/index_set/tokenizer_set.py +19 -0
  66. unitok-4.4.4/unitok/utils/index_set/vocabulary_set.py +19 -0
  67. unitok-4.4.4/unitok/utils/instance.py +18 -0
  68. unitok-4.4.4/unitok/utils/map.py +3 -0
  69. unitok-4.4.4/unitok/utils/space.py +35 -0
  70. unitok-4.4.4/unitok/utils/symbol.py +23 -0
  71. unitok-4.4.4/unitok/utils/verbose.py +48 -0
  72. unitok-4.4.4/unitok/vocabulary/__init__.py +11 -0
  73. unitok-4.4.4/unitok/vocabulary/counter.py +85 -0
  74. unitok-4.4.4/unitok/vocabulary/vocabulary.py +170 -0
unitok-4.4.4/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+ 
+ Copyright (c) 2024 Jyonn
+ 
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ 
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
unitok-4.4.4/PKG-INFO ADDED
@@ -0,0 +1,179 @@
+ Metadata-Version: 2.1
+ Name: unitok
+ Version: 4.4.4
+ Summary: Unified Tokenizer
+ Home-page: https://github.com/Jyonn/UnifiedTokenizer
+ Author: Jyonn Liu
+ Author-email: liu@qijiong.work
+ License: MIT License
+ Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
+ Platform: any
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ 
+ # UniTok v4
+ 
+ Unified preprocessing for heterogeneous ML tables: text, categorical, and numerical columns in one pipeline.
+ 
+ - Python package: `unitok`
+ - Current package version: 4.4.4 (from `setup.py`)
+ - Legacy v3 docs: `README_v3.md`
+ 
+ ## Why UniTok
+ 
+ UniTok turns raw tabular data into model-ready numeric tables while preserving:
+ 
+ - Consistent vocabularies across multiple datasets
+ - Clear feature definitions (column -> tokenizer -> output feature)
+ - Reproducible metadata and saved artifacts
+ - Simple unions across datasets via shared keys
+ 
+ ## Core Ideas
+ 
+ - **UniTok**: orchestrates the preprocessing lifecycle and holds the processed data.
+ - **Feature**: binds a column to a tokenizer and an output name.
+ - **Tokenizer**: encodes values to ids (entity, split, digit, transformers).
+ - **Vocab**: a global index for tokens, shareable across datasets.
+ - **Meta**: stores the schema, tokenizers, vocabularies, and feature definitions.
+ - **State**: `initialized` -> `tokenized` -> `organized` (see the sketch after this list).
+ 
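+ A rough sketch of how those states map onto calls, inferred from the Quickstart below (the step that moves a table to `organized`, presumably `union`, is not spelled out in this README):
+ 
+ ```python
+ from unitok import UniTok
+ 
+ ut = UniTok()          # state: initialized
+ # ut.add_feature(...)  # features are declared while initialized
+ # ut.tokenize(df)      # moves the state to tokenized
+ # ut.save('out')       # persists data.pkl, meta.json, *.vocab
+ ```
+ 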
+ ## Install
+ 
+ ```bash
+ pip install unitok
+ ```
+ 
+ Requirements: Python 3.7+, pandas, transformers, tqdm, rich.
+ 
+ ## Quickstart
+ 
+ ```python
+ import pandas as pd
+ from unitok import UniTok, Vocab
+ from unitok.tokenizer import BertTokenizer, TransformersTokenizer, EntityTokenizer, SplitTokenizer, DigitTokenizer
+ 
+ item = pd.read_csv(
+     'news-sample.tsv', sep='\t',
+     names=['nid', 'category', 'subcategory', 'title', 'abstract'],
+     usecols=['nid', 'category', 'subcategory', 'title', 'abstract'],
+ )
+ item['abstract'] = item['abstract'].fillna('')
+ 
+ user = pd.read_csv(
+     'user-sample.tsv', sep='\t',
+     names=['uid', 'history'],
+ )
+ 
+ interaction = pd.read_csv(
+     'interaction-sample.tsv', sep='\t',
+     names=['uid', 'nid', 'click'],
+ )
+ 
+ item_vocab = Vocab(name='nid')
+ user_vocab = Vocab(name='uid')
+ 
+ with UniTok() as item_ut:
+     bert = BertTokenizer(vocab='bert')
+     llama = TransformersTokenizer(vocab='llama', key='huggyllama/llama-7b')
+ 
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+     item_ut.add_feature(tokenizer=bert, column='title', name='title@bert', truncate=20)
+     item_ut.add_feature(tokenizer=llama, column='title', name='title@llama', truncate=20)
+     item_ut.add_feature(tokenizer=bert, column='abstract', name='abstract@bert', truncate=50)
+     item_ut.add_feature(tokenizer=llama, column='abstract', name='abstract@llama', truncate=50)
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab='category'), column='category')
+     item_ut.add_feature(tokenizer=EntityTokenizer(vocab='subcategory'), column='subcategory')
+ 
+ with UniTok() as user_ut:
+     user_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid', key=True)
+     user_ut.add_feature(tokenizer=SplitTokenizer(vocab=item_vocab, sep=','), column='history', truncate=30)
+ 
+ with UniTok() as inter_ut:
+     inter_ut.add_index_feature(name='index')
+     inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=user_vocab), column='uid')
+     inter_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid')
+     inter_ut.add_feature(tokenizer=DigitTokenizer(vocab='click', vocab_size=2), column='click')
+ 
+ item_ut.tokenize(item).save('sample-ut/item')
+ item_vocab.deny_edit()
+ user_ut.tokenize(user).save('sample-ut/user')
+ inter_ut.tokenize(interaction).save('sample-ut/interaction')
+ ```
+ 
+ ## Loading Saved Data
+ 
+ ```python
+ from unitok import UniTok
+ 
+ ut = UniTok.load('sample-ut/item')
+ print(len(ut))
+ print(ut[0])
+ ```
+ 
+ ## Combining Datasets (Union)
+ 
+ ```python
+ with inter_ut:
+     inter_ut.union(user_ut)
+     print(inter_ut[0])
+ ```
+ 
+ - **Soft union** (default): links the tables and resolves values on access
+ - **Hard union**: materializes the merged columns (see the sketch below)
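+ 
+ A sketch of the two modes (the `soft_union` keyword below is a hypothetical name for illustration; this diff does not show the actual signature of `UniTok.union`, so check `unitok/unitok.py`):
+ 
+ ```python
+ with inter_ut:
+     # Soft union (default): user_ut stays a linked table; values resolve on access.
+     inter_ut.union(user_ut)
+ 
+     # Hard union: materialize the merged columns up front.
+     # inter_ut.union(user_ut, soft_union=False)  # hypothetical keyword
+ ```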
+
+ ## CLI
+ 
+ Summarize a saved table:
+ 
+ ```bash
+ unitok path/to/data
+ ```
+ 
+ Add a feature to an existing table (integrate):
+ 
+ ```bash
+ unitok integrate path/to/data --file data.tsv --column title --name title@bert \
+     --vocab bert --tokenizer transformers --t.key bert-base-uncased
+ ```
+ 
+ Remove a feature from a saved table:
+ 
+ ```bash
+ unitok remove path/to/data --name title@bert
+ ```
+ 
+ ## Data Artifacts
+ 
+ Saved directories include:
+ 
+ - `meta.json` with the schema, tokenizer, and vocabulary definitions
+ - `data.pkl` with the tokenized columns
+ - `*.vocab` pickled vocabularies (see the inspection sketch below)
+ 
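+ A minimal inspection sketch using only the standard library (this assumes the layout above; the exact keys inside `meta.json` are not documented in this diff):
+ 
+ ```python
+ import json
+ from pathlib import Path
+ 
+ root = Path('sample-ut/item')
+ 
+ # meta.json is plain JSON holding the schema, tokenizer, and vocabulary records.
+ meta = json.loads((root / 'meta.json').read_text())
+ print(list(meta))
+ 
+ # Pickled vocabularies sit alongside it as *.vocab files.
+ print(sorted(p.name for p in root.glob('*.vocab')))
+ ```
+ 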
+ ## Migration From v3
+ 
+ If you have v3 artifacts, upgrade them in place:
+ 
+ ```bash
+ unidep-upgrade-v4 <path>
+ ```
+ 
+ ## Notes and Constraints
+ 
+ - The key feature must be **atomic**: its tokenizer returns a single id, not a list (see the sketch after this list).
+ - Vocabularies shared between tables must match for unions.
+ - `truncate=None` marks an atomic feature; list features must set a truncate length.
+ - `Feature` supersedes the deprecated `Job` class.
+ 
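+ A sketch of the atomicity rule, reusing the Quickstart objects (the exact failure mode is not shown in this diff; assume UniTok rejects a list-valued key feature):
+ 
+ ```python
+ # Valid key: EntityTokenizer maps each cell to exactly one id.
+ item_ut.add_feature(tokenizer=EntityTokenizer(vocab=item_vocab), column='nid', key=True)
+ 
+ # Not a valid key: BertTokenizer returns a list of token ids per cell,
+ # so it cannot uniquely identify a row.
+ # item_ut.add_feature(tokenizer=bert, column='title', key=True)  # expected to fail
+ ```
+ 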
+ ## Repository Layout (High-Level)
+ 
+ - `unitok/` core library
+ - `UniTokv3/` legacy v3 code
+ - `dist/` built distributions
+ - `setup.py`, `requirements.txt`
+ 
+ ## License
+ 
+ MIT License. See `LICENSE`.
+ 
unitok-4.4.4/README.md ADDED
@@ -0,0 +1,164 @@
(164 added lines omitted — the README.md body duplicates the README portion of unitok-4.4.4/PKG-INFO shown above.)
unitok-4.4.4/UniTok.egg-info/PKG-INFO ADDED
@@ -0,0 +1,179 @@
(179 added lines omitted — identical to unitok-4.4.4/PKG-INFO shown above.)
unitok-4.4.4/UniTok.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,78 @@
+ LICENSE
+ README.md
+ setup.py
+ UniTok.egg-info/PKG-INFO
+ UniTok.egg-info/SOURCES.txt
+ UniTok.egg-info/dependency_links.txt
+ UniTok.egg-info/entry_points.txt
+ UniTok.egg-info/requires.txt
+ UniTok.egg-info/top_level.txt
+ UniTokv3/__init__.py
+ UniTokv3/__main__.py
+ UniTokv3/cols.py
+ UniTokv3/column.py
+ UniTokv3/fut.py
+ UniTokv3/global_setting.py
+ UniTokv3/meta.py
+ UniTokv3/unidep.py
+ UniTokv3/unitok.py
+ UniTokv3/vocab.py
+ UniTokv3/vocabs.py
+ UniTokv3/analysis/__init__.py
+ UniTokv3/analysis/lengths.py
+ UniTokv3/analysis/plot.py
+ UniTokv3/tok/__init__.py
+ UniTokv3/tok/bert_tok.py
+ UniTokv3/tok/ent_tok.py
+ UniTokv3/tok/id_tok.py
+ UniTokv3/tok/number_tok.py
+ UniTokv3/tok/seq_tok.py
+ UniTokv3/tok/split_tok.py
+ UniTokv3/tok/tok.py
+ unitok/__init__.py
+ unitok/__main__.py
+ unitok/feature.py
+ unitok/job.py
+ unitok/meta.py
+ unitok/selector.py
+ unitok/status.py
+ unitok/unitok.py
+ unitok.egg-info/PKG-INFO
+ unitok.egg-info/SOURCES.txt
+ unitok.egg-info/dependency_links.txt
+ unitok.egg-info/entry_points.txt
+ unitok.egg-info/requires.txt
+ unitok.egg-info/top_level.txt
+ unitok/tokenizer/__init__.py
+ unitok/tokenizer/base_tokenizer.py
+ unitok/tokenizer/digit_tokenizer.py
+ unitok/tokenizer/entity_tokenizer.py
+ unitok/tokenizer/glove_tokenizer.py
+ unitok/tokenizer/split_tokenizer.py
+ unitok/tokenizer/transformers_tokenizer.py
+ unitok/tokenizer/union_tokenizer.py
+ unitok/tokenizer/unknown_tokenizer.py
+ unitok/utils/__init__.py
+ unitok/utils/class_pool.py
+ unitok/utils/data.py
+ unitok/utils/function.py
+ unitok/utils/instance.py
+ unitok/utils/map.py
+ unitok/utils/space.py
+ unitok/utils/symbol.py
+ unitok/utils/verbose.py
+ unitok/utils/handler/__init__.py
+ unitok/utils/handler/json_handler.py
+ unitok/utils/handler/pkl_handler.py
+ unitok/utils/hub/__init__.py
+ unitok/utils/hub/hub.py
+ unitok/utils/hub/param_hub.py
+ unitok/utils/index_set/__init__.py
+ unitok/utils/index_set/feature_set.py
+ unitok/utils/index_set/index_set.py
+ unitok/utils/index_set/job_set.py
+ unitok/utils/index_set/tokenizer_set.py
+ unitok/utils/index_set/vocabulary_set.py
+ unitok/vocabulary/__init__.py
+ unitok/vocabulary/counter.py
+ unitok/vocabulary/vocabulary.py
unitok-4.4.4/UniTok.egg-info/entry_points.txt ADDED
@@ -0,0 +1,5 @@
+ [console_scripts]
+ unidep-upgrade-v4 = UniTokv3.__main__:upgrade
+ unitok = unitok.__main__:main
+ unitokv3 = UniTokv3.__main__:main
+ 
unitok-4.4.4/UniTok.egg-info/requires.txt ADDED
@@ -0,0 +1,9 @@
+ termplot==0.0.2
+ tqdm
+ numpy
+ pandas
+ transformers
+ oba
+ prettytable
+ rich
+ fastparquet
unitok-4.4.4/UniTok.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ UniTokv3
+ unitok
unitok-4.4.4/UniTokv3/__init__.py ADDED
@@ -0,0 +1,26 @@
+ from .unitok import UniTok
+ from .vocab import Vocab
+ from .vocabs import Vocabs
+ from .cols import Cols
+ from .column import Column
+ from .analysis import Lengths, Plot
+ from .fut import Fut
+ 
+ from .unidep import UniDep
+ from .meta import Meta, Col, Voc
+ 
+ from .global_setting import Global
+ 
+ __all__ = [
+     'UniTok',
+     'UniDep',
+     'Lengths',
+     'Plot',
+     'Column',
+     'column',
+     'analysis',
+     'Vocab',
+     'Vocabs',
+     'Global',
+     'Fut',
+ ]