jtoken 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jtoken-0.2.2 → jtoken-0.2.3}/PKG-INFO +1 -1
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/__init__.py +1 -1
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/normalize.py +29 -4
- {jtoken-0.2.2 → jtoken-0.2.3}/pyproject.toml +1 -1
- {jtoken-0.2.2 → jtoken-0.2.3}/tests/test_normalize.py +30 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/.gitignore +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/LICENSE +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/README.md +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/README.pypi.md +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/__main__.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/_codec.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/cli.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/denormalize.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/exceptions.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/formats.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/jtoken/tokens.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/tests/__init__.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/tests/test_cli.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/tests/test_codec.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/tests/test_denormalize.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/tests/test_roundtrip.py +0 -0
- {jtoken-0.2.2 → jtoken-0.2.3}/tests/test_tokens.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: jtoken
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Compress JSON-shaped documents for LLM prompts with normalization, CLI, and token measurement
|
|
5
5
|
Project-URL: Homepage, https://github.com/hermannsamimi/jtoken
|
|
6
6
|
Project-URL: Repository, https://github.com/hermannsamimi/jtoken
|
|
@@ -80,8 +80,7 @@ def normalize(
|
|
|
80
80
|
ctx = context or NormalizationContext()
|
|
81
81
|
if isinstance(data, str):
|
|
82
82
|
data = parse_input(data, source=source)
|
|
83
|
-
|
|
84
|
-
raise NormalizationError(f"Expected dict, got {type(data).__name__}")
|
|
83
|
+
data = _coerce_root_document(data, ctx)
|
|
85
84
|
|
|
86
85
|
if source != InputFormat.AUTO.value:
|
|
87
86
|
fmt = InputFormat(source)
|
|
@@ -116,15 +115,41 @@ def _resolve_input_format(text: str, source: str) -> InputFormat:
|
|
|
116
115
|
stripped = text.lstrip()
|
|
117
116
|
if _MONGO_SHELL_OBJECT_ID.search(text) or _MONGO_SHELL_ISO_DATE.search(text):
|
|
118
117
|
return InputFormat.MONGO_SHELL
|
|
119
|
-
if stripped.startswith("{"):
|
|
118
|
+
if stripped.startswith("{") or stripped.startswith("["):
|
|
120
119
|
try:
|
|
121
120
|
data = json.loads(text)
|
|
122
121
|
except json.JSONDecodeError as exc:
|
|
123
122
|
raise NormalizationError(f"Invalid JSON input: {exc}") from exc
|
|
124
|
-
return _detect_dict_format(data)
|
|
123
|
+
return _detect_parsed_format(data)
|
|
125
124
|
raise NormalizationError("Could not detect input format")
|
|
126
125
|
|
|
127
126
|
|
|
127
|
+
def _coerce_root_document(
|
|
128
|
+
data: Any,
|
|
129
|
+
ctx: NormalizationContext,
|
|
130
|
+
) -> dict[str, Any]:
|
|
131
|
+
if isinstance(data, dict):
|
|
132
|
+
return data
|
|
133
|
+
if isinstance(data, list):
|
|
134
|
+
if len(data) == 1 and isinstance(data[0], dict):
|
|
135
|
+
return data[0]
|
|
136
|
+
ctx.lists.add("")
|
|
137
|
+
if not data:
|
|
138
|
+
return {}
|
|
139
|
+
return {str(index): item for index, item in enumerate(data)}
|
|
140
|
+
raise NormalizationError(f"Expected dict or list, got {type(data).__name__}")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _detect_parsed_format(data: Any) -> InputFormat:
|
|
144
|
+
if isinstance(data, dict):
|
|
145
|
+
return _detect_dict_format(data)
|
|
146
|
+
if isinstance(data, list):
|
|
147
|
+
if len(data) == 1 and isinstance(data[0], dict):
|
|
148
|
+
return _detect_dict_format(data[0])
|
|
149
|
+
return InputFormat.JSON
|
|
150
|
+
raise NormalizationError("Expected a JSON object or array")
|
|
151
|
+
|
|
152
|
+
|
|
128
153
|
def _detect_dict_format(data: dict[str, Any]) -> InputFormat:
|
|
129
154
|
if "_source" in data and isinstance(data.get("_source"), dict):
|
|
130
155
|
return InputFormat.ELASTIC_HIT
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "jtoken"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "Compress JSON-shaped documents for LLM prompts with normalization, CLI, and token measurement"
|
|
9
9
|
readme = "README.pypi.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -60,6 +60,12 @@ class TestParseInput:
|
|
|
60
60
|
def test_parse_json(self):
|
|
61
61
|
assert parse_input('{"a": 1}', source="json") == {"a": 1}
|
|
62
62
|
|
|
63
|
+
def test_parse_json_array(self):
|
|
64
|
+
assert parse_input('[{"a": 1}]', source="json") == [{"a": 1}]
|
|
65
|
+
|
|
66
|
+
def test_parse_json_array_auto(self):
|
|
67
|
+
assert parse_input('[{"a": 1}]', source="auto") == [{"a": 1}]
|
|
68
|
+
|
|
63
69
|
def test_parse_mongo_shell(self):
|
|
64
70
|
parsed = parse_input(MONGO_SHELL_DOC, source="mongo_shell")
|
|
65
71
|
assert parsed["_id"]["$oid"] == "69ca983fbf8c8953c43c2407"
|
|
@@ -126,3 +132,27 @@ class TestNormalizeErrors:
|
|
|
126
132
|
def test_unsupported_type_raises(self):
|
|
127
133
|
with pytest.raises(NormalizationError):
|
|
128
134
|
normalize({"bad": object()}, source="json")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class TestNormalizeJsonArrays:
|
|
138
|
+
def test_single_object_array_is_unwrapped(self):
|
|
139
|
+
normalized, context = normalize('[{"QUERY_ID": "q-1", "ROWS_DELETED": 0}]', source="json")
|
|
140
|
+
assert normalized["QUERY_ID"] == "q-1"
|
|
141
|
+
assert normalized["ROWS_DELETED"] == 0
|
|
142
|
+
assert "" not in context.lists
|
|
143
|
+
|
|
144
|
+
def test_single_object_array_auto(self):
|
|
145
|
+
normalized, _ = normalize('[{"a": 1}]', source="auto")
|
|
146
|
+
assert normalized == {"a": 1}
|
|
147
|
+
|
|
148
|
+
def test_multi_object_array_is_indexed(self):
|
|
149
|
+
normalized, context = normalize('[{"a": 1}, {"b": 2}]', source="json")
|
|
150
|
+
assert normalized["0"]["a"] == 1
|
|
151
|
+
assert normalized["1"]["b"] == 2
|
|
152
|
+
assert "" in context.lists
|
|
153
|
+
|
|
154
|
+
def test_primitive_array_is_indexed(self):
|
|
155
|
+
normalized, context = normalize('["a", "b"]', source="json")
|
|
156
|
+
assert normalized["0"] == "a"
|
|
157
|
+
assert normalized["1"] == "b"
|
|
158
|
+
assert "" in context.lists
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|