jtoken 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: jtoken
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Compress JSON-shaped documents for LLM prompts with normalization, CLI, and token measurement
5
5
  Project-URL: Homepage, https://github.com/hermannsamimi/jtoken
6
6
  Project-URL: Repository, https://github.com/hermannsamimi/jtoken
@@ -254,6 +254,19 @@ print(stats.jtoken_tokens, stats.json_tokens, stats.saved, stats.percent)
254
254
 
255
255
  `json_indent=2` compares against prompt-style pretty JSON. Use `json_indent=None` for compact JSON.
256
256
 
257
+ ### Representative token counts
258
+
259
+ Token counts for representative sample documents, encoded as pretty JSON versus jtoken:
260
+
261
+ | Document type | JSON | jtoken |
262
+ |---|---:|---:|
263
+ | ELK hit | 1537 | 583 |
264
+ | Mongo shell | 770 | 508 |
265
+ | PostgreSQL structured document | 831 | 685 |
266
+ | Standard JSON | 617 | 503 |
267
+
268
+ ![Token count by representation](https://raw.githubusercontent.com/hermannsamimi/jtoken/main/docs/token-savings-bar-chart.svg)
269
+
257
270
  ## CLI
258
271
 
259
272
  ```bash
@@ -150,6 +150,19 @@ print(stats.jtoken_tokens, stats.json_tokens, stats.saved, stats.percent)
150
150
 
151
151
  `count_tokens` and `count_text_tokens` are also available. Savings compare the jtoken representation against pretty JSON by default (`json_indent=2`).
152
152
 
153
+ ### Representative token counts
154
+
155
+ Token counts for representative sample documents, encoded as pretty JSON versus jtoken:
156
+
157
+ | Document type | JSON | jtoken |
158
+ |---|---:|---:|
159
+ | ELK hit | 1537 | 583 |
160
+ | Mongo shell | 770 | 508 |
161
+ | PostgreSQL structured document | 831 | 685 |
162
+ | Standard JSON | 617 | 503 |
163
+
164
+ ![Token count by representation](docs/token-savings-bar-chart.svg)
165
+
153
166
  ## API reference
154
167
 
155
168
  ### Package metadata
@@ -222,6 +222,19 @@ print(stats.jtoken_tokens, stats.json_tokens, stats.saved, stats.percent)
222
222
 
223
223
  `json_indent=2` compares against prompt-style pretty JSON. Use `json_indent=None` for compact JSON.
224
224
 
225
+ ### Representative token counts
226
+
227
+ Token counts for representative sample documents, encoded as pretty JSON versus jtoken:
228
+
229
+ | Document type | JSON | jtoken |
230
+ |---|---:|---:|
231
+ | ELK hit | 1537 | 583 |
232
+ | Mongo shell | 770 | 508 |
233
+ | PostgreSQL structured document | 831 | 685 |
234
+ | Standard JSON | 617 | 503 |
235
+
236
+ ![Token count by representation](https://raw.githubusercontent.com/hermannsamimi/jtoken/main/docs/token-savings-bar-chart.svg)
237
+
225
238
  ## CLI
226
239
 
227
240
  ```bash
@@ -0,0 +1,40 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="960" height="520" viewBox="0 0 960 520" role="img" aria-label="Token count by representation">
2
+ <rect width="100%" height="100%" fill="#ffffff"/>
3
+ <text x="24" y="34" font-family="Inter, Arial, sans-serif" font-size="20" font-weight="600" fill="#1f2937">Token count by representation</text>
4
+ <text x="72" y="492" font-family="Inter, Arial, sans-serif" font-size="13" fill="#4b5563">Document type</text>
5
+ <text x="18" y="248.0" font-family="Inter, Arial, sans-serif" font-size="13" fill="#4b5563" transform="rotate(-90 18 248.0)">Tokens</text>
6
+ <line x1="72" y1="424.0" x2="928" y2="424.0" stroke="#e5e7eb" stroke-width="1"/>
7
+ <text x="62" y="428.0" text-anchor="end" font-family="Inter, Arial, sans-serif" font-size="11" fill="#6b7280">0</text>
8
+ <line x1="72" y1="344.3" x2="928" y2="344.3" stroke="#e5e7eb" stroke-width="1"/>
9
+ <text x="62" y="348.3" text-anchor="end" font-family="Inter, Arial, sans-serif" font-size="11" fill="#6b7280">400</text>
10
+ <line x1="72" y1="264.7" x2="928" y2="264.7" stroke="#e5e7eb" stroke-width="1"/>
11
+ <text x="62" y="268.7" text-anchor="end" font-family="Inter, Arial, sans-serif" font-size="11" fill="#6b7280">800</text>
12
+ <line x1="72" y1="185.0" x2="928" y2="185.0" stroke="#e5e7eb" stroke-width="1"/>
13
+ <text x="62" y="189.0" text-anchor="end" font-family="Inter, Arial, sans-serif" font-size="11" fill="#6b7280">1200</text>
14
+ <line x1="72" y1="105.4" x2="928" y2="105.4" stroke="#e5e7eb" stroke-width="1"/>
15
+ <text x="62" y="109.4" text-anchor="end" font-family="Inter, Arial, sans-serif" font-size="11" fill="#6b7280">1600</text>
16
+ <rect x="140.0" y="117.9" width="34" height="306.1" rx="4" fill="#4C78A8"/>
17
+ <text x="157.0" y="109.9" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">1537</text>
18
+ <rect x="184.0" y="307.9" width="34" height="116.1" rx="4" fill="#F58518"/>
19
+ <text x="201.0" y="299.9" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">583</text>
20
+ <text x="179.0" y="462" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="12" fill="#374151">ELK hit</text>
21
+ <rect x="354.0" y="270.7" width="34" height="153.3" rx="4" fill="#4C78A8"/>
22
+ <text x="371.0" y="262.7" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">770</text>
23
+ <rect x="398.0" y="322.8" width="34" height="101.2" rx="4" fill="#F58518"/>
24
+ <text x="415.0" y="314.8" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">508</text>
25
+ <text x="393.0" y="462" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="12" fill="#374151">Mongo shell</text>
26
+ <rect x="568.0" y="258.5" width="34" height="165.5" rx="4" fill="#4C78A8"/>
27
+ <text x="585.0" y="250.5" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">831</text>
28
+ <rect x="612.0" y="287.6" width="34" height="136.4" rx="4" fill="#F58518"/>
29
+ <text x="629.0" y="279.6" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">685</text>
30
+ <text x="607.0" y="462" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="12" fill="#374151">PostgreSQL document</text>
31
+ <rect x="782.0" y="301.1" width="34" height="122.9" rx="4" fill="#4C78A8"/>
32
+ <text x="799.0" y="293.1" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">617</text>
33
+ <rect x="826.0" y="323.8" width="34" height="100.2" rx="4" fill="#F58518"/>
34
+ <text x="843.0" y="315.8" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="11" fill="#374151">503</text>
35
+ <text x="821.0" y="462" text-anchor="middle" font-family="Inter, Arial, sans-serif" font-size="12" fill="#374151">Standard JSON</text>
36
+ <rect x="778" y="28" width="14" height="14" rx="3" fill="#4C78A8"/>
37
+ <text x="798" y="39" font-family="Inter, Arial, sans-serif" font-size="12" fill="#374151">JSON</text>
38
+ <rect x="856" y="28" width="14" height="14" rx="3" fill="#F58518"/>
39
+ <text x="876" y="39" font-family="Inter, Arial, sans-serif" font-size="12" fill="#374151">jtoken</text>
40
+ </svg>
@@ -19,7 +19,7 @@ from .tokens import (
19
19
  token_savings,
20
20
  )
21
21
 
22
- __version__ = "0.2.2"
22
+ __version__ = "0.2.4"
23
23
  __author__ = "Hermann Samimi"
24
24
 
25
25
  # json-style aliases
@@ -0,0 +1,169 @@
1
+ data = {
2
+ "organization": {
3
+ "id": 7001,
4
+ "name": "Nexus Corp",
5
+ "founded": "2001-04-12",
6
+ "dissolved": None,
7
+ "active": True,
8
+ "public": False,
9
+ "revenue": 4_500_000.75,
10
+ "employeeCount": 312,
11
+ "website": "https://nexuscorp.example.com",
12
+ "tagline": None,
13
+
14
+ "headquarters": {
15
+ "street": "88 Innovation Drive",
16
+ "city": "Austin",
17
+ "state": "TX",
18
+ "country": "US",
19
+ "postalCode": "78701",
20
+ "geo": {
21
+ "lat": 30.2672,
22
+ "lng": -97.7431,
23
+ "altitude_m": 149,
24
+ "verified": True,
25
+ "lastChecked": "2026-03-01T00:00:00Z",
26
+ },
27
+ },
28
+
29
+ "departments": [
30
+ {
31
+ "deptId": "DEPT-ENG",
32
+ "name": "Engineering",
33
+ "headCount": 120,
34
+ "remote": True,
35
+ "budget": 1_200_000.00,
36
+ "managerId": "USR-0042",
37
+ "parentDept": None,
38
+ "tags": ["backend", "infra", "ml"],
39
+ "kpis": {
40
+ "deployFrequency": "daily",
41
+ "avgLeadTimeDays": 3.5,
42
+ "changeFailureRate": 0.02,
43
+ "onCallActive": True,
44
+ "incidentCount": 7,
45
+ "lastIncident": "2026-04-29T03:12:00Z",
46
+ "slaBreached": False,
47
+ },
48
+ "teams": [
49
+ {
50
+ "teamId": "TEAM-CORE",
51
+ "name": "Core Platform",
52
+ "size": 8,
53
+ "lead": "Alice Nakamura",
54
+ "stack": ["Python", "Go", "PostgreSQL"],
55
+ "activeSprintId": "SPR-2026-19",
56
+ "velocity": 42,
57
+ "archived": False,
58
+ },
59
+ {
60
+ "teamId": "TEAM-ML",
61
+ "name": "Machine Learning",
62
+ "size": 5,
63
+ "lead": None,
64
+ "stack": ["Python", "PyTorch", "CUDA"],
65
+ "activeSprintId": None,
66
+ "velocity": 0,
67
+ "archived": False,
68
+ },
69
+ ],
70
+ },
71
+ {
72
+ "deptId": "DEPT-OPS",
73
+ "name": "Operations",
74
+ "headCount": 45,
75
+ "remote": False,
76
+ "budget": 620_000.00,
77
+ "managerId": "USR-0017",
78
+ "parentDept": None,
79
+ "tags": ["logistics", "support"],
80
+ "kpis": {
81
+ "ticketsClosedMonthly": 980,
82
+ "avgResolutionHrs": 4.2,
83
+ "customerSatisfaction": 4.7,
84
+ "escalationRate": 0.05,
85
+ "slaBreached": False,
86
+ "lastIncident": None,
87
+ },
88
+ "teams": [],
89
+ },
90
+ ],
91
+
92
+ "auditLog": [
93
+ {
94
+ "eventId": "EVT-001",
95
+ "action": "ORG_CREATED",
96
+ "performedBy": "USR-0001",
97
+ "timestamp": "2001-04-12T09:00:00Z",
98
+ "ipAddress": "192.168.1.1",
99
+ "success": True,
100
+ "errorCode": None,
101
+ "meta": {},
102
+ },
103
+ {
104
+ "eventId": "EVT-419",
105
+ "action": "DEPT_BUDGET_UPDATE",
106
+ "performedBy": "USR-0042",
107
+ "timestamp": "2026-05-01T11:30:00+02:00",
108
+ "ipAddress": "10.0.0.55",
109
+ "success": False,
110
+ "errorCode": "PERMISSION_DENIED",
111
+ "meta": {
112
+ "attemptedValue": 2_000_000,
113
+ "currentValue": 1_200_000,
114
+ "delta": 800_000,
115
+ "requiresApproval": True,
116
+ "approvedBy": None,
117
+ "approvedAt": None,
118
+ },
119
+ },
120
+ ],
121
+
122
+ "settings": {
123
+ "locale": "en-US",
124
+ "timezone": "America/Chicago",
125
+ "currency": "USD",
126
+ "fiscalYearStart": "01-01",
127
+ "mfaRequired": True,
128
+ "ssoEnabled": False,
129
+ "ssoProvider": None,
130
+ "allowedDomains": ["nexuscorp.example.com", "nexus.internal"],
131
+ "blockedIPs": [],
132
+ "retentionDays": 365,
133
+ "notifications": {
134
+ "email": True,
135
+ "slack": True,
136
+ "sms": False,
137
+ "webhookUrl": "https://hooks.example.com/nexus",
138
+ "webhookSecret": None,
139
+ "digest": {
140
+ "enabled": True,
141
+ "frequency": "weekly",
142
+ "day": "Monday",
143
+ "time": "08:00",
144
+ "lastSentAt": "2026-05-04T08:00:00Z",
145
+ "nextScheduledAt": "2026-05-11T08:00:00Z",
146
+ "failedAttempts": 0,
147
+ "paused": False,
148
+ },
149
+ },
150
+ },
151
+
152
+ "metrics": {
153
+ "uptime": 99.97,
154
+ "requestsPerDay": 1_482_903,
155
+ "errorRate": 0.003,
156
+ "p50LatencyMs": 12,
157
+ "p95LatencyMs": 88,
158
+ "p99LatencyMs": 210,
159
+ "degraded": False,
160
+ "lastDowntime": "2025-09-14T02:11:00Z",
161
+ "maintenanceWindow": None,
162
+ "regions": {
163
+ "us-east-1": {"healthy": True, "load": 0.61, "instances": 4},
164
+ "eu-west-1": {"healthy": True, "load": 0.44, "instances": 2},
165
+ "ap-southeast-1": {"healthy": False, "load": None, "instances": 0},
166
+ },
167
+ },
168
+ }
169
+ }
@@ -80,8 +80,7 @@ def normalize(
80
80
  ctx = context or NormalizationContext()
81
81
  if isinstance(data, str):
82
82
  data = parse_input(data, source=source)
83
- if not isinstance(data, dict):
84
- raise NormalizationError(f"Expected dict, got {type(data).__name__}")
83
+ data = _coerce_root_document(data, ctx)
85
84
 
86
85
  if source != InputFormat.AUTO.value:
87
86
  fmt = InputFormat(source)
@@ -116,15 +115,41 @@ def _resolve_input_format(text: str, source: str) -> InputFormat:
116
115
  stripped = text.lstrip()
117
116
  if _MONGO_SHELL_OBJECT_ID.search(text) or _MONGO_SHELL_ISO_DATE.search(text):
118
117
  return InputFormat.MONGO_SHELL
119
- if stripped.startswith("{"):
118
+ if stripped.startswith("{") or stripped.startswith("["):
120
119
  try:
121
120
  data = json.loads(text)
122
121
  except json.JSONDecodeError as exc:
123
122
  raise NormalizationError(f"Invalid JSON input: {exc}") from exc
124
- return _detect_dict_format(data)
123
+ return _detect_parsed_format(data)
125
124
  raise NormalizationError("Could not detect input format")
126
125
 
127
126
 
127
+ def _coerce_root_document(
128
+ data: Any,
129
+ ctx: NormalizationContext,
130
+ ) -> dict[str, Any]:
131
+ if isinstance(data, dict):
132
+ return data
133
+ if isinstance(data, list):
134
+ if len(data) == 1 and isinstance(data[0], dict):
135
+ return data[0]
136
+ ctx.lists.add("")
137
+ if not data:
138
+ return {}
139
+ return {str(index): item for index, item in enumerate(data)}
140
+ raise NormalizationError(f"Expected dict or list, got {type(data).__name__}")
141
+
142
+
143
+ def _detect_parsed_format(data: Any) -> InputFormat:
144
+ if isinstance(data, dict):
145
+ return _detect_dict_format(data)
146
+ if isinstance(data, list):
147
+ if len(data) == 1 and isinstance(data[0], dict):
148
+ return _detect_dict_format(data[0])
149
+ return InputFormat.JSON
150
+ raise NormalizationError("Expected a JSON object or array")
151
+
152
+
128
153
  def _detect_dict_format(data: dict[str, Any]) -> InputFormat:
129
154
  if "_source" in data and isinstance(data.get("_source"), dict):
130
155
  return InputFormat.ELASTIC_HIT
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "jtoken"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  description = "Compress JSON-shaped documents for LLM prompts with normalization, CLI, and token measurement"
9
9
  readme = "README.pypi.md"
10
10
  requires-python = ">=3.8"
@@ -60,6 +60,12 @@ class TestParseInput:
60
60
  def test_parse_json(self):
61
61
  assert parse_input('{"a": 1}', source="json") == {"a": 1}
62
62
 
63
+ def test_parse_json_array(self):
64
+ assert parse_input('[{"a": 1}]', source="json") == [{"a": 1}]
65
+
66
+ def test_parse_json_array_auto(self):
67
+ assert parse_input('[{"a": 1}]', source="auto") == [{"a": 1}]
68
+
63
69
  def test_parse_mongo_shell(self):
64
70
  parsed = parse_input(MONGO_SHELL_DOC, source="mongo_shell")
65
71
  assert parsed["_id"]["$oid"] == "69ca983fbf8c8953c43c2407"
@@ -126,3 +132,27 @@ class TestNormalizeErrors:
126
132
  def test_unsupported_type_raises(self):
127
133
  with pytest.raises(NormalizationError):
128
134
  normalize({"bad": object()}, source="json")
135
+
136
+
137
+ class TestNormalizeJsonArrays:
138
+ def test_single_object_array_is_unwrapped(self):
139
+ normalized, context = normalize('[{"QUERY_ID": "q-1", "ROWS_DELETED": 0}]', source="json")
140
+ assert normalized["QUERY_ID"] == "q-1"
141
+ assert normalized["ROWS_DELETED"] == 0
142
+ assert "" not in context.lists
143
+
144
+ def test_single_object_array_auto(self):
145
+ normalized, _ = normalize('[{"a": 1}]', source="auto")
146
+ assert normalized == {"a": 1}
147
+
148
+ def test_multi_object_array_is_indexed(self):
149
+ normalized, context = normalize('[{"a": 1}, {"b": 2}]', source="json")
150
+ assert normalized["0"]["a"] == 1
151
+ assert normalized["1"]["b"] == 2
152
+ assert "" in context.lists
153
+
154
+ def test_primitive_array_is_indexed(self):
155
+ normalized, context = normalize('["a", "b"]', source="json")
156
+ assert normalized["0"] == "a"
157
+ assert normalized["1"] == "b"
158
+ assert "" in context.lists
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes