jtoken 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: jtoken
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Compress JSON-shaped documents for LLM prompts with normalization, CLI, and token measurement
5
5
  Project-URL: Homepage, https://github.com/hermannsamimi/jtoken
6
6
  Project-URL: Repository, https://github.com/hermannsamimi/jtoken
@@ -19,7 +19,7 @@ from .tokens import (
19
19
  token_savings,
20
20
  )
21
21
 
22
- __version__ = "0.2.2"
22
+ __version__ = "0.2.3"
23
23
  __author__ = "Hermann Samimi"
24
24
 
25
25
  # json-style aliases
@@ -80,8 +80,7 @@ def normalize(
80
80
  ctx = context or NormalizationContext()
81
81
  if isinstance(data, str):
82
82
  data = parse_input(data, source=source)
83
- if not isinstance(data, dict):
84
- raise NormalizationError(f"Expected dict, got {type(data).__name__}")
83
+ data = _coerce_root_document(data, ctx)
85
84
 
86
85
  if source != InputFormat.AUTO.value:
87
86
  fmt = InputFormat(source)
@@ -116,15 +115,41 @@ def _resolve_input_format(text: str, source: str) -> InputFormat:
116
115
  stripped = text.lstrip()
117
116
  if _MONGO_SHELL_OBJECT_ID.search(text) or _MONGO_SHELL_ISO_DATE.search(text):
118
117
  return InputFormat.MONGO_SHELL
119
- if stripped.startswith("{"):
118
+ if stripped.startswith("{") or stripped.startswith("["):
120
119
  try:
121
120
  data = json.loads(text)
122
121
  except json.JSONDecodeError as exc:
123
122
  raise NormalizationError(f"Invalid JSON input: {exc}") from exc
124
- return _detect_dict_format(data)
123
+ return _detect_parsed_format(data)
125
124
  raise NormalizationError("Could not detect input format")
126
125
 
127
126
 
127
+ def _coerce_root_document(
128
+ data: Any,
129
+ ctx: NormalizationContext,
130
+ ) -> dict[str, Any]:
131
+ if isinstance(data, dict):
132
+ return data
133
+ if isinstance(data, list):
134
+ if len(data) == 1 and isinstance(data[0], dict):
135
+ return data[0]
136
+ ctx.lists.add("")
137
+ if not data:
138
+ return {}
139
+ return {str(index): item for index, item in enumerate(data)}
140
+ raise NormalizationError(f"Expected dict or list, got {type(data).__name__}")
141
+
142
+
143
+ def _detect_parsed_format(data: Any) -> InputFormat:
144
+ if isinstance(data, dict):
145
+ return _detect_dict_format(data)
146
+ if isinstance(data, list):
147
+ if len(data) == 1 and isinstance(data[0], dict):
148
+ return _detect_dict_format(data[0])
149
+ return InputFormat.JSON
150
+ raise NormalizationError("Expected a JSON object or array")
151
+
152
+
128
153
  def _detect_dict_format(data: dict[str, Any]) -> InputFormat:
129
154
  if "_source" in data and isinstance(data.get("_source"), dict):
130
155
  return InputFormat.ELASTIC_HIT
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "jtoken"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "Compress JSON-shaped documents for LLM prompts with normalization, CLI, and token measurement"
9
9
  readme = "README.pypi.md"
10
10
  requires-python = ">=3.8"
@@ -60,6 +60,12 @@ class TestParseInput:
60
60
  def test_parse_json(self):
61
61
  assert parse_input('{"a": 1}', source="json") == {"a": 1}
62
62
 
63
+ def test_parse_json_array(self):
64
+ assert parse_input('[{"a": 1}]', source="json") == [{"a": 1}]
65
+
66
+ def test_parse_json_array_auto(self):
67
+ assert parse_input('[{"a": 1}]', source="auto") == [{"a": 1}]
68
+
63
69
  def test_parse_mongo_shell(self):
64
70
  parsed = parse_input(MONGO_SHELL_DOC, source="mongo_shell")
65
71
  assert parsed["_id"]["$oid"] == "69ca983fbf8c8953c43c2407"
@@ -126,3 +132,27 @@ class TestNormalizeErrors:
126
132
  def test_unsupported_type_raises(self):
127
133
  with pytest.raises(NormalizationError):
128
134
  normalize({"bad": object()}, source="json")
135
+
136
+
137
+ class TestNormalizeJsonArrays:
138
+ def test_single_object_array_is_unwrapped(self):
139
+ normalized, context = normalize('[{"QUERY_ID": "q-1", "ROWS_DELETED": 0}]', source="json")
140
+ assert normalized["QUERY_ID"] == "q-1"
141
+ assert normalized["ROWS_DELETED"] == 0
142
+ assert "" not in context.lists
143
+
144
+ def test_single_object_array_auto(self):
145
+ normalized, _ = normalize('[{"a": 1}]', source="auto")
146
+ assert normalized == {"a": 1}
147
+
148
+ def test_multi_object_array_is_indexed(self):
149
+ normalized, context = normalize('[{"a": 1}, {"b": 2}]', source="json")
150
+ assert normalized["0"]["a"] == 1
151
+ assert normalized["1"]["b"] == 2
152
+ assert "" in context.lists
153
+
154
+ def test_primitive_array_is_indexed(self):
155
+ normalized, context = normalize('["a", "b"]', source="json")
156
+ assert normalized["0"] == "a"
157
+ assert normalized["1"] == "b"
158
+ assert "" in context.lists
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes