jtoken 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jtoken/__init__.py +28 -0
- jtoken/__main__.py +4 -0
- jtoken/_codec.py +160 -0
- jtoken/cli.py +141 -0
- jtoken/exceptions.py +10 -0
- jtoken/tokens.py +137 -0
- jtoken-0.1.0.dist-info/METADATA +264 -0
- jtoken-0.1.0.dist-info/RECORD +11 -0
- jtoken-0.1.0.dist-info/WHEEL +4 -0
- jtoken-0.1.0.dist-info/entry_points.txt +2 -0
- jtoken-0.1.0.dist-info/licenses/LICENSE +21 -0
jtoken/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""jtoken — Compress JSON for LLM prompts with ~30% fewer tokens."""
|
|
2
|
+
|
|
3
|
+
from ._codec import decode, encode
|
|
4
|
+
from .exceptions import JPackDecodeError, JPackEncodeError, JPackError
|
|
5
|
+
from .tokens import TokenCountError, TokenSavings, count_tokens, token_savings
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__author__ = "Hermann Samimi"
|
|
9
|
+
|
|
10
|
+
# json-style aliases
|
|
11
|
+
dumps = encode
|
|
12
|
+
loads = decode
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"encode",
|
|
16
|
+
"decode",
|
|
17
|
+
"dumps",
|
|
18
|
+
"loads",
|
|
19
|
+
"count_tokens",
|
|
20
|
+
"token_savings",
|
|
21
|
+
"TokenSavings",
|
|
22
|
+
"JPackError",
|
|
23
|
+
"JPackEncodeError",
|
|
24
|
+
"JPackDecodeError",
|
|
25
|
+
"TokenCountError",
|
|
26
|
+
"__version__",
|
|
27
|
+
"__author__",
|
|
28
|
+
]
|
jtoken/__main__.py
ADDED
jtoken/_codec.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
from __future__ import annotations

from typing import Any

from .exceptions import JPackDecodeError, JPackEncodeError

# Separator between a key and its value on every encoded line.
_SEP = ": "
# Names of the summary lines that collect all None/True/False keys.
_NULLS_KEY = "nulls"
_TRUES_KEY = "trues"
_FALSES_KEY = "falses"
# Top-level user keys may not shadow the summary-line names.
_RESERVED = {_NULLS_KEY, _TRUES_KEY, _FALSES_KEY}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def encode(data: dict[str, Any]) -> str:
    """Compress a JSON-like dict into jtoken format.

    Strips JSON syntax and collapses all null, true, and false fields each into
    a single summary line. Nested dicts are flattened with dot notation.
    The result is lossless: decode(encode(data)) == data.

    Args:
        data: Mapping of str keys to str/int/float/bool/None values or
            nested dicts of the same shape.

    Returns:
        jtoken text: one "key: value" line per scalar, followed by up to
        three summary lines (trues/falses/nulls).

    Raises:
        JPackEncodeError: If ``data`` is not a dict, a value has an
            unsupported type, or a string value contains a line break —
            the format is line-oriented, so an embedded newline would be
            re-split by decode() and silently corrupt the round-trip.
    """
    if not isinstance(data, dict):
        raise JPackEncodeError(f"Expected dict, got {type(data).__name__}")

    flat = _flatten(data)

    null_keys: list[str] = []
    true_keys: list[str] = []
    false_keys: list[str] = []
    lines: list[str] = []

    for k, v in flat.items():
        if v is None:
            null_keys.append(k)
        elif v is True:
            true_keys.append(k)
        elif v is False:
            false_keys.append(k)
        elif isinstance(v, (int, float)):
            lines.append(f"{k}{_SEP}{v}")
        elif isinstance(v, str):
            # A line break inside a value would be parsed as extra records
            # by decode(); reject it rather than emit corrupt output.
            if "\n" in v or "\r" in v:
                raise JPackEncodeError(
                    f"String value for key {k!r} contains a line break, "
                    "which cannot be represented in the line-oriented format"
                )
            val = f'"{v}"' if _is_ambiguous(v) else v
            lines.append(f"{k}{_SEP}{val}")
        else:
            raise JPackEncodeError(
                f"Unsupported value type for key {k!r}: {type(v).__name__}. "
                "Supported types: str, int, float, bool, None."
            )

    if true_keys:
        lines.append(f"{_TRUES_KEY}{_SEP}{','.join(true_keys)}")
    if false_keys:
        lines.append(f"{_FALSES_KEY}{_SEP}{','.join(false_keys)}")
    if null_keys:
        lines.append(f"{_NULLS_KEY}{_SEP}{','.join(null_keys)}")

    return "\n".join(lines)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def decode(text: str) -> dict[str, Any]:
    """Reconstruct a dict from a jtoken-compressed string."""
    if not isinstance(text, str):
        raise JPackDecodeError(f"Expected str, got {type(text).__name__}")

    # Summary-line names map to the constant value they assign to each key.
    summary_fill = {_NULLS_KEY: None, _TRUES_KEY: True, _FALSES_KEY: False}
    flat: dict[str, Any] = {}

    for lineno, raw in enumerate(text.strip().splitlines(), 1):
        if not raw.strip():
            continue
        if _SEP not in raw:
            raise JPackDecodeError(
                f"Invalid format on line {lineno}: missing {_SEP!r} separator"
            )

        key, _, value = raw.partition(_SEP)

        if key in summary_fill:
            fill = summary_fill[key]
            for name in value.split(","):
                flat[name.strip()] = fill
            continue

        if _is_quoted(value):
            # Quoted values are always strings, quotes stripped.
            flat[key] = value[1:-1]
            continue

        lowered = value.lower()
        if lowered == "true":
            flat[key] = True  # backward-compat with inline key: true
            continue
        if lowered == "false":
            flat[key] = False  # backward-compat with inline key: false
            continue

        # Numeric inference: int first, then float, else plain string.
        for caster in (int, float):
            try:
                flat[key] = caster(value)
                break
            except ValueError:
                continue
        else:
            flat[key] = value

    return _unflatten(flat)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ── helpers ───────────────────────────────────────────────────────────────────
|
|
104
|
+
|
|
105
|
+
def _flatten(data: dict[str, Any], prefix: str = "") -> dict[str, Any]:
    """Recursively flatten a nested dict using dot-notation keys.

    Args:
        data: Possibly-nested mapping to flatten.
        prefix: Dot-joined path of the enclosing keys ("" at the top level).

    Returns:
        A single-level dict whose keys are dot-joined paths.

    Raises:
        JPackEncodeError: If a key contains "." (reserved for nesting),
            the key/value separator, a line break, or shadows a reserved
            summary-line name at the top level.
    """
    result: dict[str, Any] = {}
    for k, v in data.items():
        k_str = str(k)
        if "." in k_str:
            raise JPackEncodeError(
                f"Key {k_str!r} contains '.' which is reserved for nested paths"
            )
        if _SEP in k_str:
            raise JPackEncodeError(f"Key cannot contain {_SEP!r}: {k_str!r}")
        # A line break in a key would be re-split into separate records by
        # decode(); reject it up front instead of emitting corrupt output.
        if "\n" in k_str or "\r" in k_str:
            raise JPackEncodeError(
                f"Key cannot contain a line break: {k_str!r}"
            )
        if not prefix and k_str in _RESERVED:
            raise JPackEncodeError(f"Key '{k_str}' is reserved")
        key = f"{prefix}.{k_str}" if prefix else k_str
        if isinstance(v, dict):
            result.update(_flatten(v, key))
        else:
            result[key] = v
    return result
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _unflatten(flat: dict[str, Any]) -> dict[str, Any]:
|
|
127
|
+
"""Reconstruct a nested dict from dot-notation keys."""
|
|
128
|
+
result: dict[str, Any] = {}
|
|
129
|
+
for dotted_key, value in flat.items():
|
|
130
|
+
parts = dotted_key.split(".")
|
|
131
|
+
d = result
|
|
132
|
+
for part in parts[:-1]:
|
|
133
|
+
if part not in d or not isinstance(d[part], dict):
|
|
134
|
+
d[part] = {}
|
|
135
|
+
d = d[part]
|
|
136
|
+
d[parts[-1]] = value
|
|
137
|
+
return result
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _is_ambiguous(v: str) -> bool:
|
|
141
|
+
"""True if this string would be mistyped as a number or bool on decode."""
|
|
142
|
+
if not v:
|
|
143
|
+
return True
|
|
144
|
+
if v.lower() in ("true", "false"):
|
|
145
|
+
return True
|
|
146
|
+
try:
|
|
147
|
+
int(v)
|
|
148
|
+
return True
|
|
149
|
+
except ValueError:
|
|
150
|
+
pass
|
|
151
|
+
try:
|
|
152
|
+
float(v)
|
|
153
|
+
return True
|
|
154
|
+
except ValueError:
|
|
155
|
+
pass
|
|
156
|
+
return False
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _is_quoted(v: str) -> bool:
|
|
160
|
+
return len(v) >= 2 and v[0] == '"' and v[-1] == '"'
|
jtoken/cli.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from . import decode, encode, count_tokens, token_savings
|
|
9
|
+
from .exceptions import JPackDecodeError, JPackEncodeError, JPackError
|
|
10
|
+
from .tokens import TokenCountError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _read_input(path: str | None) -> str:
|
|
14
|
+
if path:
|
|
15
|
+
with open(path, encoding="utf-8") as handle:
|
|
16
|
+
return handle.read()
|
|
17
|
+
return sys.stdin.read()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _load_json_object(text: str) -> dict[str, Any]:
|
|
21
|
+
try:
|
|
22
|
+
data = json.loads(text)
|
|
23
|
+
except json.JSONDecodeError as exc:
|
|
24
|
+
raise SystemExit(f"Invalid JSON input: {exc}") from exc
|
|
25
|
+
if not isinstance(data, dict):
|
|
26
|
+
raise SystemExit("JSON input must be an object")
|
|
27
|
+
return data
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _load_json_or_jtoken(text: str) -> dict[str, Any] | str:
|
|
31
|
+
stripped = text.strip()
|
|
32
|
+
if not stripped:
|
|
33
|
+
raise SystemExit("Input is empty")
|
|
34
|
+
if stripped[0] in "{[":
|
|
35
|
+
return _load_json_object(text)
|
|
36
|
+
return stripped
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _handle_errors(func):
|
|
40
|
+
def wrapper(*args, **kwargs):
|
|
41
|
+
try:
|
|
42
|
+
return func(*args, **kwargs)
|
|
43
|
+
except (JPackError, TokenCountError) as exc:
|
|
44
|
+
print(exc, file=sys.stderr)
|
|
45
|
+
raise SystemExit(1) from exc
|
|
46
|
+
|
|
47
|
+
return wrapper
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@_handle_errors
def _cmd_encode(args: argparse.Namespace) -> None:
    """CLI 'encode': read JSON input and write the jtoken text to stdout."""
    raw = _read_input(args.file)
    sys.stdout.write(encode(_load_json_object(raw)))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@_handle_errors
def _cmd_decode(args: argparse.Namespace) -> None:
    """CLI 'decode': read jtoken input and pretty-print the JSON to stdout."""
    decoded = decode(_read_input(args.file))
    json.dump(decoded, sys.stdout, indent=2, sort_keys=True)
    sys.stdout.write("\n")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@_handle_errors
def _cmd_stats(args: argparse.Namespace) -> None:
    """CLI 'stats': print the jtoken-vs-JSON token comparison."""
    source = _load_json_or_jtoken(_read_input(args.file))
    print(token_savings(source, model=args.model, backend=args.backend))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@_handle_errors
def _cmd_count(args: argparse.Namespace) -> None:
    """CLI 'count': print the token count of the jtoken representation."""
    source = _load_json_or_jtoken(_read_input(args.file))
    total = count_tokens(source, model=args.model, backend=args.backend)
    print(total)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _add_token_flags(parser: argparse.ArgumentParser) -> None:
|
|
78
|
+
parser.add_argument(
|
|
79
|
+
"--model",
|
|
80
|
+
default="cl100k_base",
|
|
81
|
+
help="tiktoken model or encoding name (default: cl100k_base)",
|
|
82
|
+
)
|
|
83
|
+
parser.add_argument(
|
|
84
|
+
"--backend",
|
|
85
|
+
choices=("auto", "tiktoken", "estimate"),
|
|
86
|
+
default="auto",
|
|
87
|
+
help="token counting backend (default: auto)",
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _add_input_file(parser: argparse.ArgumentParser) -> None:
|
|
92
|
+
parser.add_argument(
|
|
93
|
+
"-f",
|
|
94
|
+
"--file",
|
|
95
|
+
help="read input from a file instead of stdin",
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the jtoken argument parser with its four subcommands."""
    parser = argparse.ArgumentParser(prog="jtoken", description="jtoken CLI")
    # Shared -f/--file option, inherited by every subcommand via parents=.
    shared_input = argparse.ArgumentParser(add_help=False)
    _add_input_file(shared_input)
    commands = parser.add_subparsers(dest="command", required=True)

    encode_parser = commands.add_parser(
        "encode", parents=[shared_input], help="encode JSON to jtoken"
    )
    encode_parser.set_defaults(func=_cmd_encode)

    decode_parser = commands.add_parser(
        "decode", parents=[shared_input], help="decode jtoken to JSON"
    )
    decode_parser.set_defaults(func=_cmd_decode)

    # stats and count additionally take the token-counting flags.
    for name, help_text, handler in (
        ("stats", "compare jtoken vs JSON token usage", _cmd_stats),
        ("count", "count jtoken tokens", _cmd_count),
    ):
        sub = commands.add_parser(name, parents=[shared_input], help=help_text)
        _add_token_flags(sub)
        sub.set_defaults(func=handler)

    return parser
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse *argv* (default: sys.argv[1:]) and dispatch."""
    namespace = _build_parser().parse_args(argv)
    namespace.func(namespace)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
if __name__ == "__main__":
|
|
141
|
+
main()
|
jtoken/exceptions.py
ADDED
jtoken/tokens.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Any, Union

from ._codec import decode, encode
from .exceptions import JPackError

# tiktoken is an optional dependency: when it is absent, token counting
# falls back to a character-count heuristic (see _count / _estimate).
try:
    import tiktoken as _tiktoken

    _TIKTOKEN_AVAILABLE = True
except ImportError:
    _TIKTOKEN_AVAILABLE = False
|
17
|
+
|
|
18
|
+
class TokenCountError(JPackError):
    """Raised when token counting cannot be completed.

    Raised by _count when the "tiktoken" backend is requested but tiktoken
    is not installed, or when an unknown model/encoding name is given.
    """
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class TokenSavings:
    """Token comparison between jtoken and JSON representations."""

    # Token count of the jtoken-encoded text.
    jtoken_tokens: int
    # Token count of the equivalent json.dumps text.
    json_tokens: int

    @property
    def saved(self) -> int:
        """Tokens avoided by using jtoken instead of JSON."""
        return self.json_tokens - self.jtoken_tokens

    @property
    def percent(self) -> float:
        """Savings as a percentage of the JSON token count (0.0 when JSON is 0)."""
        if not self.json_tokens:
            return 0.0
        return self.saved / self.json_tokens * 100

    def __str__(self) -> str:
        summary = (
            f"jtoken: {self.jtoken_tokens} tokens | "
            f"json: {self.json_tokens} tokens | "
            f"saved: {self.saved} ({self.percent:.1f}%)"
        )
        return summary
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def count_tokens(
    data: Union[dict[str, Any], str],
    *,
    model: str = "cl100k_base",
    backend: str = "auto",
) -> int:
    """Count the LLM tokens in jtoken-encoded data.

    Args:
        data: A dict (auto-encoded to jtoken) or an already-encoded jtoken
            string.
        model: tiktoken encoding or model name (default: cl100k_base, used
            by GPT-4 and a close approximation for Claude). Accepts encoding
            names ("cl100k_base", "o200k_base") or OpenAI model names
            ("gpt-4", "gpt-4o").
        backend: "auto" — tiktoken if installed, otherwise estimates.
            "tiktoken" — tiktoken required; raises TokenCountError if absent.
            "estimate" — always uses the ~4 chars/token heuristic.

    Returns:
        Integer token count for the jtoken representation.
    """
    if isinstance(data, dict):
        text = encode(data)
    else:
        text = data
    return _count(text, model=model, backend=backend)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def token_savings(
    data: Union[dict[str, Any], str],
    *,
    model: str = "cl100k_base",
    backend: str = "auto",
) -> TokenSavings:
    """Compare token usage between jtoken and JSON for the same data.

    Args:
        data: A dict or an already-encoded jtoken string.
        model: tiktoken encoding or model name (see count_tokens).
        backend: counting backend (see count_tokens).

    Returns:
        TokenSavings with jtoken_tokens, json_tokens, saved, and percent.

    Example::

        stats = jtoken.token_savings({"name": "Alice", "age": 30, "active": True})
        print(stats)
        # jtoken: 8 tokens | json: 12 tokens | saved: 4 (33.3%)
    """
    # Normalize to both representations: the jtoken text and the source dict.
    if isinstance(data, str):
        jtoken_text = data
        source_dict = decode(data)
    else:
        jtoken_text = encode(data)
        source_dict = data

    json_text = json.dumps(source_dict)

    return TokenSavings(
        jtoken_tokens=_count(jtoken_text, model=model, backend=backend),
        json_tokens=_count(json_text, model=model, backend=backend),
    )
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _count(text: str, *, model: str, backend: str) -> int:
    """Resolve the counting backend and return the token count for *text*."""
    if backend == "estimate":
        return _estimate(text)

    if backend == "tiktoken" and not _TIKTOKEN_AVAILABLE:
        raise TokenCountError(
            "tiktoken is not installed. Run: pip install jtoken[tiktoken]"
        )

    use_tiktoken = _TIKTOKEN_AVAILABLE and backend in ("auto", "tiktoken")
    if not use_tiktoken:
        return _estimate(text)

    try:
        enc = _tiktoken.encoding_for_model(model)
    except KeyError:
        # Not a known model name; maybe it is a raw encoding name.
        try:
            enc = _tiktoken.get_encoding(model)
        except Exception as exc:
            if backend == "tiktoken":
                raise TokenCountError(
                    f"Unknown tiktoken model/encoding: {model!r}"
                ) from exc
            return _estimate(text)
    return len(enc.encode(text))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _estimate(text: str) -> int:
|
|
136
|
+
"""~4 characters per token heuristic."""
|
|
137
|
+
return max(1, (len(text) + 3) // 4) if text else 0
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jtoken
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight, human-readable key-value serialization format
|
|
5
|
+
Project-URL: Homepage, https://github.com/hermannsamimi/jtoken
|
|
6
|
+
Project-URL: Repository, https://github.com/hermannsamimi/jtoken
|
|
7
|
+
Project-URL: Issues, https://github.com/hermannsamimi/jtoken/issues
|
|
8
|
+
Author-email: Hermann Samimi <hermannsamimi@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: encoding,format,key-value,llm,serialization,text,tokens
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Classifier: Topic :: Text Processing :: General
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: tiktoken>=0.5; extra == 'dev'
|
|
29
|
+
Provides-Extra: tiktoken
|
|
30
|
+
Requires-Dist: tiktoken>=0.5; extra == 'tiktoken'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# jtoken
|
|
34
|
+
|
|
35
|
+
Compress JSON for LLM prompts — same data, fewer tokens.
|
|
36
|
+
|
|
37
|
+
## What it does
|
|
38
|
+
|
|
39
|
+
jtoken strips the syntactic noise from JSON (`"`, `{}`, `,`) and collapses all
|
|
40
|
+
`null`, `true`, and `false` fields each into a single summary line. Nested dicts
|
|
41
|
+
are flattened with dot notation so the same collapse applies at every level.
|
|
42
|
+
The result is a compact format an LLM reads just as well as JSON.
|
|
43
|
+
|
|
44
|
+
**JSON (30 tokens):**
|
|
45
|
+
```json
|
|
46
|
+
{"name": "Alice", "age": 30, "active": true, "verified": false, "ref": null}
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**jtoken (21 tokens):**
|
|
50
|
+
```
|
|
51
|
+
name: Alice
|
|
52
|
+
age: 30
|
|
53
|
+
trues: active
|
|
54
|
+
falses: verified
|
|
55
|
+
nulls: ref
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The round-trip is lossless: `decode(encode(data)) == data` for all supported types.
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Core — no external dependencies
|
|
64
|
+
pip install jtoken
|
|
65
|
+
|
|
66
|
+
# With accurate LLM token counting
|
|
67
|
+
pip install jtoken[tiktoken]
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Quick start
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import jtoken
|
|
74
|
+
|
|
75
|
+
data = {
|
|
76
|
+
"user": "alice",
|
|
77
|
+
"age": 30,
|
|
78
|
+
"premium": True,
|
|
79
|
+
"verified": True,
|
|
80
|
+
"is_remote": False,
|
|
81
|
+
"trial": False,
|
|
82
|
+
"score": 9.5,
|
|
83
|
+
"referral": None,
|
|
84
|
+
"last_login": None,
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
text = jtoken.encode(data)
|
|
88
|
+
# user: alice
|
|
89
|
+
# age: 30
|
|
90
|
+
# score: 9.5
|
|
91
|
+
# trues: premium,verified
|
|
92
|
+
# falses: is_remote,trial
|
|
93
|
+
# nulls: referral,last_login
|
|
94
|
+
|
|
95
|
+
original = jtoken.decode(text)
|
|
96
|
+
assert original == data
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
`dumps` / `loads` are available as `json`-style aliases.
|
|
100
|
+
|
|
101
|
+
## CLI
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
echo '{"name": "Alice", "active": true}' | jtoken encode
|
|
105
|
+
printf 'name: Alice\ntrues: active\n' | jtoken decode
|
|
106
|
+
echo '{"name": "Alice", "active": true}' | jtoken stats
|
|
107
|
+
echo '{"name": "Alice", "active": true}' | jtoken count
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Use `-f/--file` to read from a file instead of stdin. `stats` and `count` accept
|
|
111
|
+
`--model` and `--backend` (`auto`, `tiktoken`, `estimate`).
|
|
112
|
+
|
|
113
|
+
## Nested documents
|
|
114
|
+
|
|
115
|
+
Nested dicts are flattened with dot notation. Booleans and nulls at any depth
|
|
116
|
+
are collapsed into the same summary lines.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
data = {
|
|
120
|
+
"title": "Engineer",
|
|
121
|
+
"metadata": {
|
|
122
|
+
"verified": True,
|
|
123
|
+
"sponsored": False,
|
|
124
|
+
"score": None,
|
|
125
|
+
"source": {
|
|
126
|
+
"crawled": True,
|
|
127
|
+
"enriched": None,
|
|
128
|
+
},
|
|
129
|
+
},
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
print(jtoken.encode(data))
|
|
133
|
+
# title: Engineer
|
|
134
|
+
# trues: metadata.verified,metadata.source.crawled
|
|
135
|
+
# falses: metadata.sponsored
|
|
136
|
+
# nulls: metadata.score,metadata.source.enriched
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Decode reconstructs the full nested structure:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
assert jtoken.decode(jtoken.encode(data)) == data # ✓
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**Limitation:** keys cannot contain `.` (reserved for nesting) or `": "`.
|
|
146
|
+
Arrays are not supported.
|
|
147
|
+
|
|
148
|
+
## Token savings
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
import jtoken
|
|
152
|
+
|
|
153
|
+
stats = jtoken.token_savings(data)
|
|
154
|
+
print(stats)
|
|
155
|
+
# jtoken: 22 tokens | json: 36 tokens | saved: 14 (38.9%)
|
|
156
|
+
|
|
157
|
+
n = jtoken.count_tokens(data) # count jtoken tokens only
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Savings are compared against `json.dumps(data)` — the standard representation
|
|
161
|
+
you'd paste into a prompt. Savings are highest when a document has many `null`
|
|
162
|
+
or boolean fields.
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
# Specify model or encoding
|
|
166
|
+
stats = jtoken.token_savings(data, model="gpt-4o")
|
|
167
|
+
stats = jtoken.token_savings(data, model="o200k_base")
|
|
168
|
+
|
|
169
|
+
# No tiktoken dependency
|
|
170
|
+
stats = jtoken.token_savings(data, backend="estimate")
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## API
|
|
174
|
+
|
|
175
|
+
### `encode(data: dict) -> str`
|
|
176
|
+
|
|
177
|
+
Compresses a dict into jtoken. Supported value types: `str`, `int`, `float`,
|
|
178
|
+
`bool`, `None`, nested `dict`.
|
|
179
|
+
|
|
180
|
+
**Summary lines (always at the end):**
|
|
181
|
+
|
|
182
|
+
| line | contains |
|
|
183
|
+
|---|---|
|
|
184
|
+
| `trues: k1,k2,...` | all keys whose value is `True` |
|
|
185
|
+
| `falses: k1,k2,...` | all keys whose value is `False` |
|
|
186
|
+
| `nulls: k1,k2,...` | all keys whose value is `None` |
|
|
187
|
+
|
|
188
|
+
String values that would decode ambiguously (look like a number or boolean)
|
|
189
|
+
keep their quotes:
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
jtoken.encode({"zip": "90210"}) # → 'zip: "90210"' (string, quotes kept)
|
|
193
|
+
jtoken.encode({"zip": 90210}) # → 'zip: 90210' (int, no quotes)
|
|
194
|
+
jtoken.encode({"ok": "true"}) # → 'ok: "true"' (string, quotes kept)
|
|
195
|
+
jtoken.encode({"ok": True}) # → 'trues: ok' (bool, collapsed)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Raises `JPackEncodeError` for unsupported types, dots or `": "` in keys, or
|
|
199
|
+
reserved key names (`nulls`, `trues`, `falses`).
|
|
200
|
+
|
|
201
|
+
### `decode(text: str) -> dict`
|
|
202
|
+
|
|
203
|
+
Reconstructs the original dict, including nested structure from dot-notation
|
|
204
|
+
keys. Type inference for scalar values:
|
|
205
|
+
|
|
206
|
+
| value | decoded as |
|
|
207
|
+
|---|---|
|
|
208
|
+
| `"quoted"` | `str` (always) |
|
|
209
|
+
| key in `trues:` line | `True` |
|
|
210
|
+
| key in `falses:` line | `False` |
|
|
211
|
+
| key in `nulls:` line | `None` |
|
|
212
|
+
| integer literal, e.g. `42` | `int` |
|
|
213
|
+
| float literal, e.g. `3.14` | `float` |
|
|
214
|
+
| anything else | `str` |
|
|
215
|
+
|
|
216
|
+
Raises `JPackDecodeError` for invalid input.
|
|
217
|
+
|
|
218
|
+
### `token_savings(data, *, model, backend) -> TokenSavings`
|
|
219
|
+
|
|
220
|
+
Compares jtoken vs `json.dumps` token usage.
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
stats.jtoken_tokens # int
|
|
224
|
+
stats.json_tokens # int
|
|
225
|
+
stats.saved # int
|
|
226
|
+
stats.percent # float
|
|
227
|
+
str(stats) # "jtoken: 22 tokens | json: 36 tokens | saved: 14 (38.9%)"
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### `count_tokens(data, *, model, backend) -> int`
|
|
231
|
+
|
|
232
|
+
Counts LLM tokens in the jtoken representation. Accepts a dict or an
|
|
233
|
+
already-encoded jtoken string.
|
|
234
|
+
|
|
235
|
+
**`backend` options:**
|
|
236
|
+
|
|
237
|
+
| value | behaviour |
|
|
238
|
+
|---|---|
|
|
239
|
+
| `"auto"` (default) | tiktoken if installed, otherwise estimates |
|
|
240
|
+
| `"tiktoken"` | requires tiktoken; raises `TokenCountError` if absent |
|
|
241
|
+
| `"estimate"` | ~4 chars/token heuristic, no extra dependency |
|
|
242
|
+
|
|
243
|
+
## Exceptions
|
|
244
|
+
|
|
245
|
+
```
|
|
246
|
+
JPackError
|
|
247
|
+
├── JPackEncodeError
|
|
248
|
+
├── JPackDecodeError
|
|
249
|
+
└── TokenCountError
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## Development
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
git clone https://github.com/hermannsamimi/jtoken
|
|
256
|
+
cd jtoken
|
|
257
|
+
pip install -e ".[dev]"
|
|
258
|
+
pytest
|
|
259
|
+
pytest --cov=jtoken --cov-report=term-missing
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## License
|
|
263
|
+
|
|
264
|
+
MIT — © 2026 Hermann Samimi
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
jtoken/__init__.py,sha256=KeGDSIUyUn51_cjdOLyQmpy1o4RiQswrYE1-L-WaZUA,618
|
|
2
|
+
jtoken/__main__.py,sha256=MSmt_5Xg84uHqzTN38JwgseJK8rsJn_11A8WD99VtEo,61
|
|
3
|
+
jtoken/_codec.py,sha256=2j9i-pjjEUgEdu_VkddUMZBX_0mnrizhJJU5bzjo09w,5107
|
|
4
|
+
jtoken/cli.py,sha256=1fMEvO_a5iM-Fp9IxF8PHhHAKj_95zM_f1UfPVMcIDI,3859
|
|
5
|
+
jtoken/exceptions.py,sha256=-B11CECC_BEHVMm39Ub4CPBfF_usR-Bs4EDq9w6MHWw,252
|
|
6
|
+
jtoken/tokens.py,sha256=HGr4R1jeGSm9D_3UFjRys_EE1axwCbHKlEkwHOcRERE,4077
|
|
7
|
+
jtoken-0.1.0.dist-info/METADATA,sha256=hHxFwTqdLEl6EtYrnkrwMmU_FNn2Oj6rqY2-5qouQZ4,6965
|
|
8
|
+
jtoken-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
9
|
+
jtoken-0.1.0.dist-info/entry_points.txt,sha256=hUIc42NM-4aqDuGlYffU3cmJaqs7GG_UzggmTneDwfY,43
|
|
10
|
+
jtoken-0.1.0.dist-info/licenses/LICENSE,sha256=06gleizuefVN7pyIwD4F1-bHnBkKPkQTuYABA6fHWH4,1071
|
|
11
|
+
jtoken-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Hermann Samimi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|