chartoken-vp 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chartoken_vp-1.0.0/LICENSE +21 -0
- chartoken_vp-1.0.0/PKG-INFO +14 -0
- chartoken_vp-1.0.0/README.md +37 -0
- chartoken_vp-1.0.0/chartoken/__init__.py +4 -0
- chartoken_vp-1.0.0/chartoken/feature_vocab.py +58 -0
- chartoken_vp-1.0.0/chartoken/vocab.py +74 -0
- chartoken_vp-1.0.0/chartoken_vp.egg-info/PKG-INFO +14 -0
- chartoken_vp-1.0.0/chartoken_vp.egg-info/SOURCES.txt +11 -0
- chartoken_vp-1.0.0/chartoken_vp.egg-info/dependency_links.txt +1 -0
- chartoken_vp-1.0.0/chartoken_vp.egg-info/requires.txt +1 -0
- chartoken_vp-1.0.0/chartoken_vp.egg-info/top_level.txt +1 -0
- chartoken_vp-1.0.0/pyproject.toml +28 -0
- chartoken_vp-1.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025-2026 F000NK / Voluntas Progressus
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chartoken-vp
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Character-level vocabulary and morphological feature encoding
|
|
5
|
+
Author: F000NK, Voluntas Progressus
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
11
|
+
Requires-Python: >=3.14
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: torch>=2.0.0
|
|
14
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# chartoken
|
|
2
|
+
|
|
3
|
+
Character-level vocabulary and morphological feature encoding for NLP pipelines.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install chartoken-vp
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Requires Python >= 3.14 and PyTorch >= 2.0.
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- **CharVocab** — character-level tokenizer with NFKC normalization, SOS/EOS/PAD tokens
|
|
16
|
+
- **FeatureVocab** — morphological feature encoder (UniMorph tag set) with padding masks
|
|
17
|
+
- Encode text to padded tensors or plain ID lists
|
|
18
|
+
- Serialize/deserialize vocabularies for checkpoint compatibility
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from chartoken import CharVocab, FeatureVocab
|
|
24
|
+
|
|
25
|
+
# Build vocab from texts
|
|
26
|
+
vocab = CharVocab.from_texts(["hello", "world"])
|
|
27
|
+
ids = vocab.encode("hello", max_len=32)
|
|
28
|
+
print(vocab.decode(ids.tolist())) # "hello"
|
|
29
|
+
|
|
30
|
+
# Feature vocab
|
|
31
|
+
feat_vocab = FeatureVocab.from_tags([["V", "IND", "PRS"], ["N", "SG"]])
|
|
32
|
+
feat_ids, feat_mask = feat_vocab.encode(["V", "IND"], max_features=12)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## License
|
|
36
|
+
|
|
37
|
+
See LICENSE file in the repository root.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Morphological feature vocabulary (UniMorph tag set)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
# Id reserved for padding; it is the index of "<pad>" in FEATURE_SPECIAL.
FEATURE_PAD = 0
FEATURE_SPECIAL = ["<pad>"]


@dataclass
class FeatureVocab:
    """Two-way mapping between morphological feature tags and integer ids.

    ``itos[i]`` is the tag with id ``i`` and ``stoi[tag]`` is its id.
    Vocabularies built with :meth:`from_tags` always keep ``"<pad>"`` at
    index ``FEATURE_PAD``.
    """

    # tag -> id
    stoi: dict[str, int]
    # id -> tag (the list position is the id)
    itos: list[str]

    @property
    def size(self) -> int:
        """Return the number of entries, including the padding entry."""
        return len(self.itos)

    @classmethod
    def from_tags(cls, all_tags: list[list[str]]) -> FeatureVocab:
        """Build a vocabulary from a collection of tag lists.

        The special tokens come first, followed by every distinct tag in
        sorted order. Tags that collide with a special token are skipped so
        that ``"<pad>"`` is never listed twice and always keeps id
        ``FEATURE_PAD``.
        """
        reserved = set(FEATURE_SPECIAL)
        unique_tags = sorted(
            {tag for tags in all_tags for tag in tags if tag not in reserved}
        )
        itos = FEATURE_SPECIAL + unique_tags
        stoi = {tag: idx for idx, tag in enumerate(itos)}
        return cls(stoi=stoi, itos=itos)

    def encode(
        self, tags: list[str], *, max_features: int,
    ) -> tuple[list[int], list[float]]:
        """Encode *tags* into padded ``(feature_ids, feature_mask)`` lists.

        Tags missing from the vocabulary are silently dropped. The result is
        truncated or right-padded with ``FEATURE_PAD`` to exactly
        *max_features* entries; the mask holds ``1.0`` for real features and
        ``0.0`` for padding.

        Raises:
            ValueError: if *max_features* is negative (a negative slice would
                otherwise drop tags from the tail and return lists of the
                wrong length).
        """
        if max_features < 0:
            raise ValueError(f"max_features must be non-negative, got {max_features}")
        ids = [self.stoi[tag] for tag in tags if tag in self.stoi][:max_features]
        padding_length = max_features - len(ids)
        mask = [1.0] * len(ids) + [0.0] * padding_length
        ids.extend([FEATURE_PAD] * padding_length)
        return ids, mask

    def encode_tensor(
        self, tags: list[str], *, max_features: int, device: torch.device | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Tensor variant of :meth:`encode`.

        Returns a ``torch.long`` id tensor and a ``torch.float`` mask tensor,
        each with *max_features* elements, placed on *device* when given.
        """
        ids, mask = self.encode(tags, max_features=max_features)
        ids_tensor = torch.tensor(ids, dtype=torch.long, device=device)
        mask_tensor = torch.tensor(mask, dtype=torch.float, device=device)
        return ids_tensor, mask_tensor

    def to_dict(self) -> dict:
        """Return the vocabulary as a plain dict with ``"stoi"`` and ``"itos"`` keys.

        The returned values are the instance's own containers, not copies.
        """
        return {"stoi": self.stoi, "itos": self.itos}

    @classmethod
    def from_dict(cls, data: dict) -> FeatureVocab:
        """Rebuild a vocabulary from the mapping produced by :meth:`to_dict`."""
        return cls(stoi=data["stoi"], itos=data["itos"])
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Character-level vocabulary with NFKC normalization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import unicodedata
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
|
|
10
|
+
# Reserved ids; each is the index of its token in SPECIAL_TOKENS.
PAD = 0
SOS = 1
EOS = 2
SPECIAL_TOKENS = ["<pad>", "<sos>", "<eos>"]


def normalize_text(text: str) -> str:
    """Return *text* in Unicode NFKC normal form."""
    return unicodedata.normalize("NFKC", text)


@dataclass
class CharVocab:
    """Two-way mapping between characters and integer ids.

    ``itos[i]`` is the symbol with id ``i`` and ``stoi[symbol]`` is its id.
    Vocabularies built with :meth:`from_texts` start with ``SPECIAL_TOKENS``,
    so ids ``PAD``, ``SOS`` and ``EOS`` are reserved.
    """

    # symbol -> id
    stoi: dict[str, int]
    # id -> symbol (the list position is the id)
    itos: list[str]

    @property
    def size(self) -> int:
        """Return the number of entries, including the special tokens."""
        return len(self.itos)

    @classmethod
    def from_texts(cls, texts: list[str]) -> CharVocab:
        """Build a vocabulary from the NFKC-normalized characters of *texts*.

        The special tokens come first, followed by every distinct character
        in sorted order.
        """
        chars = sorted({char for text in texts for char in normalize_text(text)})
        itos = SPECIAL_TOKENS + [char for char in chars if char not in SPECIAL_TOKENS]
        stoi = {char: idx for idx, char in enumerate(itos)}
        return cls(stoi=stoi, itos=itos)

    def encode(
        self, text: str, *, max_len: int, device: torch.device | None = None,
    ) -> torch.Tensor:
        """Encode *text* into a padded long tensor ``[SOS, ...chars, EOS, PAD...]``.

        Tensor variant of :meth:`encode_ids`; see that method for the
        truncation and unknown-character rules. The tensor is placed on
        *device* when given.
        """
        return torch.tensor(self.encode_ids(text, max_len), dtype=torch.long, device=device)

    def encode_ids(self, text: str, max_len: int) -> list[int]:
        """Encode *text* into a list of exactly *max_len* token ids.

        The text is NFKC-normalized, characters missing from the vocabulary
        are silently dropped, and the sequence is wrapped in ``SOS``/``EOS``
        and right-padded with ``PAD``.

        NOTE: truncation is applied after ``EOS`` is appended, so a text
        longer than ``max_len - 2`` known characters loses its ``EOS``.

        Raises:
            ValueError: if *max_len* is negative (a negative slice would
                otherwise drop tokens from the tail and return a list of the
                wrong length).
        """
        if max_len < 0:
            raise ValueError(f"max_len must be non-negative, got {max_len}")
        known = [self.stoi[char] for char in normalize_text(text) if char in self.stoi]
        token_ids = ([SOS] + known + [EOS])[:max_len]
        token_ids.extend([PAD] * (max_len - len(token_ids)))
        return token_ids

    def decode(self, token_ids: list[int]) -> str:
        """Turn *token_ids* back into text.

        ``PAD`` and ``SOS`` are skipped, decoding stops at the first ``EOS``,
        and ids outside the vocabulary range are ignored.
        """
        chars: list[str] = []
        for token_id in token_ids:
            if token_id in (PAD, SOS):
                continue
            if token_id == EOS:
                break
            if 0 <= token_id < len(self.itos):
                chars.append(self.itos[token_id])
        return "".join(chars)

    def to_dict(self) -> dict:
        """Return the vocabulary as a plain dict with ``"stoi"`` and ``"itos"`` keys.

        The returned values are the instance's own containers, not copies.
        """
        return {"stoi": self.stoi, "itos": self.itos}

    @classmethod
    def from_dict(cls, data: dict) -> CharVocab:
        """Rebuild a vocabulary from the mapping produced by :meth:`to_dict`."""
        return cls(stoi=data["stoi"], itos=data["itos"])
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chartoken-vp
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Character-level vocabulary and morphological feature encoding
|
|
5
|
+
Author: F000NK, Voluntas Progressus
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
11
|
+
Requires-Python: >=3.14
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: torch>=2.0.0
|
|
14
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
chartoken/__init__.py
|
|
5
|
+
chartoken/feature_vocab.py
|
|
6
|
+
chartoken/vocab.py
|
|
7
|
+
chartoken_vp.egg-info/PKG-INFO
|
|
8
|
+
chartoken_vp.egg-info/SOURCES.txt
|
|
9
|
+
chartoken_vp.egg-info/dependency_links.txt
|
|
10
|
+
chartoken_vp.egg-info/requires.txt
|
|
11
|
+
chartoken_vp.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
torch>=2.0.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chartoken
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chartoken-vp"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "F000NK"},
|
|
10
|
+
{name = "Voluntas Progressus"},
|
|
11
|
+
]
|
|
12
|
+
description = "Character-level vocabulary and morphological feature encoding"
|
|
13
|
+
license = "MIT"
|
|
14
|
+
requires-python = ">=3.14"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.14",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Topic :: Text Processing :: Linguistic",
|
|
20
|
+
]
|
|
21
|
+
dependencies = ["torch>=2.0.0"]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.packages.find]
|
|
24
|
+
include = ["chartoken*"]
|
|
25
|
+
|
|
26
|
+
[tool.ruff]
|
|
27
|
+
line-length = 120
|
|
28
|
+
target-version = "py314"
|