chartoken-vp 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025-2026 F000NK / Voluntas Progressus
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: chartoken-vp
3
+ Version: 1.0.0
4
+ Summary: Character-level vocabulary and morphological feature encoding
5
+ Author: F000NK, Voluntas Progressus
6
+ License-Expression: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.14
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Topic :: Text Processing :: Linguistic
11
+ Requires-Python: >=3.14
12
+ License-File: LICENSE
13
+ Requires-Dist: torch>=2.0.0
14
+ Dynamic: license-file
@@ -0,0 +1,37 @@
1
+ # chartoken
2
+
3
+ Character-level vocabulary and morphological feature encoding for NLP pipelines.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install chartoken-vp
9
+ ```
10
+
11
+ Requires Python >= 3.14 and PyTorch >= 2.0.
12
+
13
+ ## Features
14
+
15
+ - **CharVocab** — character-level tokenizer with NFKC normalization, SOS/EOS/PAD tokens
16
+ - **FeatureVocab** — morphological feature encoder (UniMorph tag set) with padding masks
17
+ - Encode text to padded tensors or plain ID lists
18
+ - Serialize/deserialize vocabularies for checkpoint compatibility
19
+
20
+ ## Quick Start
21
+
22
+ ```python
23
+ from chartoken import CharVocab, FeatureVocab
24
+
25
+ # Build vocab from texts
26
+ vocab = CharVocab.from_texts(["hello", "world"])
27
+ ids = vocab.encode("hello", max_len=32)
28
+ print(vocab.decode(ids.tolist())) # "hello"
29
+
30
+ # Feature vocab
31
+ feat_vocab = FeatureVocab.from_tags([["V", "IND", "PRS"], ["N", "SG"]])
32
+ feat_ids, feat_mask = feat_vocab.encode(["V", "IND"], max_features=12)
33
+ ```
34
+
35
+ ## License
36
+
37
+ See LICENSE file in the repository root.
@@ -0,0 +1,4 @@
1
+ from chartoken.vocab import CharVocab, PAD, SOS, EOS, normalize_text, SPECIAL_TOKENS
2
+ from chartoken.feature_vocab import FeatureVocab, FEATURE_PAD
3
+
4
+ __version__ = "1.0.0"
@@ -0,0 +1,58 @@
1
+ """Morphological feature vocabulary (UniMorph tag set)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import torch
8
+
9
FEATURE_PAD = 0  # id reserved for padding
FEATURE_SPECIAL = ["<pad>"]  # special entries occupying the low ids


@dataclass
class FeatureVocab:
    """Vocabulary over morphological feature tags (UniMorph tag set).

    ``itos[0]`` is the ``<pad>`` entry; real tags follow in sorted order.
    """

    stoi: dict[str, int]  # tag -> id
    itos: list[str]  # id -> tag

    @property
    def size(self) -> int:
        """Total number of entries, padding included."""
        return len(self.itos)

    @classmethod
    def from_tags(cls, all_tags: list[list[str]]) -> FeatureVocab:
        """Build a vocabulary from nested tag lists; tags are sorted for determinism."""
        seen: set[str] = set()
        for tag_list in all_tags:
            seen.update(tag_list)
        itos = [*FEATURE_SPECIAL, *sorted(seen)]
        return cls(stoi={tag: idx for idx, tag in enumerate(itos)}, itos=itos)

    def encode(
        self, tags: list[str], *, max_features: int,
    ) -> tuple[list[int], list[float]]:
        """Encode *tags* into padded ``(feature_ids, feature_mask)`` lists of length *max_features*."""
        # Unknown tags are silently dropped, then the result is truncated.
        known = [self.stoi[tag] for tag in tags if tag in self.stoi][:max_features]
        pad_count = max_features - len(known)
        ids = known + [FEATURE_PAD] * pad_count
        mask = [1.0] * len(known) + [0.0] * pad_count
        return ids, mask

    def encode_tensor(
        self, tags: list[str], *, max_features: int, device: torch.device | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Tensor variant of :meth:`encode`; optionally moves both tensors to *device*."""
        ids, mask = self.encode(tags, max_features=max_features)
        ids_tensor = torch.tensor(ids, dtype=torch.long)
        mask_tensor = torch.tensor(mask, dtype=torch.float)
        if device is not None:
            ids_tensor, mask_tensor = ids_tensor.to(device), mask_tensor.to(device)
        return ids_tensor, mask_tensor

    def to_dict(self) -> dict:
        """Serialize the vocabulary for checkpoint storage."""
        return {"stoi": self.stoi, "itos": self.itos}

    @classmethod
    def from_dict(cls, data: dict) -> FeatureVocab:
        """Restore a vocabulary serialized by :meth:`to_dict`."""
        return cls(stoi=data["stoi"], itos=data["itos"])
@@ -0,0 +1,74 @@
1
+ """Character-level vocabulary with NFKC normalization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import unicodedata
6
+ from dataclasses import dataclass
7
+
8
+ import torch
9
+
10
PAD = 0  # padding token id
SOS = 1  # start-of-sequence token id
EOS = 2  # end-of-sequence token id
SPECIAL_TOKENS = ["<pad>", "<sos>", "<eos>"]  # list indices match PAD/SOS/EOS


def normalize_text(text: str) -> str:
    """Return *text* normalized with Unicode NFKC (compatibility composition)."""
    return unicodedata.normalize("NFKC", text)


@dataclass
class CharVocab:
    """Character-level vocabulary with NFKC normalization.

    ``itos[0:3]`` are the special tokens ``<pad>``, ``<sos>``, ``<eos>``;
    regular characters follow in sorted order.
    """

    stoi: dict[str, int]  # character -> id
    itos: list[str]  # id -> character

    @property
    def size(self) -> int:
        """Total vocabulary size, special tokens included."""
        return len(self.itos)

    @classmethod
    def from_texts(cls, texts: list[str]) -> CharVocab:
        """Build a vocabulary from the NFKC-normalized characters of *texts*."""
        chars = sorted({char for text in texts for char in normalize_text(text)})
        # Filter guards against a corpus that literally contains "<pad>" etc.
        itos = SPECIAL_TOKENS + [char for char in chars if char not in SPECIAL_TOKENS]
        stoi = {char: idx for idx, char in enumerate(itos)}
        return cls(stoi=stoi, itos=itos)

    def encode(
        self, text: str, *, max_len: int, device: torch.device | None = None,
    ) -> torch.Tensor:
        """Encode *text* into a padded long tensor ``[SOS, ...chars, EOS, PAD...]``.

        Delegates to :meth:`encode_ids` so the lookup/truncate/pad logic lives
        in exactly one place (it was previously duplicated in both methods).
        """
        tensor = torch.tensor(self.encode_ids(text, max_len), dtype=torch.long)
        return tensor if device is None else tensor.to(device)

    def encode_ids(self, text: str, max_len: int) -> list[int]:
        """Encode *text* into a padded id list ``[SOS, ...chars, EOS, PAD...]``.

        Characters missing from the vocabulary are silently dropped.
        NOTE(review): when the text exceeds ``max_len - 2`` characters the
        trailing EOS is truncated away, so :meth:`decode` of such a sequence
        stops only at the end of the list — confirm this truncation contract
        is intended.
        """
        text = normalize_text(text)
        token_ids = [SOS] + [self.stoi[char] for char in text if char in self.stoi] + [EOS]
        token_ids = token_ids[:max_len]
        padding_length = max_len - len(token_ids)
        if padding_length > 0:
            token_ids.extend([PAD] * padding_length)
        return token_ids

    def decode(self, token_ids: list[int]) -> str:
        """Decode ids to text: skip PAD/SOS, stop at EOS, ignore out-of-range ids."""
        chars: list[str] = []
        for token_id in token_ids:
            if token_id in (PAD, SOS):
                continue
            if token_id == EOS:
                break
            if 0 <= token_id < len(self.itos):
                chars.append(self.itos[token_id])
        return "".join(chars)

    def to_dict(self) -> dict:
        """Serialize the vocabulary for checkpoint storage."""
        return {"stoi": self.stoi, "itos": self.itos}

    @classmethod
    def from_dict(cls, data: dict) -> CharVocab:
        """Restore a vocabulary serialized by :meth:`to_dict`."""
        return cls(stoi=data["stoi"], itos=data["itos"])
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: chartoken-vp
3
+ Version: 1.0.0
4
+ Summary: Character-level vocabulary and morphological feature encoding
5
+ Author: F000NK, Voluntas Progressus
6
+ License-Expression: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.14
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Topic :: Text Processing :: Linguistic
11
+ Requires-Python: >=3.14
12
+ License-File: LICENSE
13
+ Requires-Dist: torch>=2.0.0
14
+ Dynamic: license-file
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ chartoken/__init__.py
5
+ chartoken/feature_vocab.py
6
+ chartoken/vocab.py
7
+ chartoken_vp.egg-info/PKG-INFO
8
+ chartoken_vp.egg-info/SOURCES.txt
9
+ chartoken_vp.egg-info/dependency_links.txt
10
+ chartoken_vp.egg-info/requires.txt
11
+ chartoken_vp.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ torch>=2.0.0
@@ -0,0 +1 @@
1
+ chartoken
@@ -0,0 +1,28 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "chartoken-vp"
7
+ version = "1.0.0"
8
+ authors = [
9
+ {name = "F000NK"},
10
+ {name = "Voluntas Progressus"},
11
+ ]
12
+ description = "Character-level vocabulary and morphological feature encoding"
13
+ license = "MIT"
14
+ requires-python = ">=3.14"
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.14",
18
+ "Operating System :: OS Independent",
19
+ "Topic :: Text Processing :: Linguistic",
20
+ ]
21
+ dependencies = ["torch>=2.0.0"]
22
+
23
+ [tool.setuptools.packages.find]
24
+ include = ["chartoken*"]
25
+
26
+ [tool.ruff]
27
+ line-length = 120
28
+ target-version = "py314"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+