headson 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of headson might be problematic. Click here for more details.

Files changed (53) hide show
  1. {headson-0.3.0 → headson-0.5.0}/Cargo.lock +1 -1
  2. {headson-0.3.0 → headson-0.5.0}/Cargo.toml +1 -1
  3. {headson-0.3.0 → headson-0.5.0}/PKG-INFO +56 -8
  4. {headson-0.3.0 → headson-0.5.0}/README.md +54 -6
  5. {headson-0.3.0 → headson-0.5.0}/pyproject.toml +3 -3
  6. {headson-0.3.0 → headson-0.5.0}/python/Cargo.lock +2 -2
  7. {headson-0.3.0 → headson-0.5.0}/python/Cargo.toml +2 -2
  8. {headson-0.3.0 → headson-0.5.0}/python/README.md +13 -2
  9. headson-0.5.0/python/headson/__init__.py +6 -0
  10. {headson-0.3.0 → headson-0.5.0}/python/src/lib.rs +31 -14
  11. {headson-0.3.0 → headson-0.5.0}/src/json_ingest/builder.rs +43 -25
  12. {headson-0.3.0 → headson-0.5.0}/src/json_ingest/mod.rs +10 -3
  13. headson-0.5.0/src/json_ingest/samplers/default.rs +219 -0
  14. headson-0.5.0/src/json_ingest/samplers/head.rs +79 -0
  15. headson-0.5.0/src/json_ingest/samplers/mod.rs +53 -0
  16. headson-0.5.0/src/json_ingest/samplers/tail.rs +107 -0
  17. {headson-0.3.0 → headson-0.5.0}/src/lib.rs +13 -11
  18. {headson-0.3.0 → headson-0.5.0}/src/main.rs +15 -0
  19. {headson-0.3.0 → headson-0.5.0}/src/order/build.rs +114 -66
  20. {headson-0.3.0 → headson-0.5.0}/src/order/types.rs +23 -1
  21. headson-0.5.0/src/serialization/fileset.rs +164 -0
  22. {headson-0.3.0 → headson-0.5.0}/src/serialization/mod.rs +166 -197
  23. {headson-0.3.0 → headson-0.5.0}/src/serialization/templates/core.rs +16 -3
  24. {headson-0.3.0 → headson-0.5.0}/src/serialization/templates/js.rs +11 -0
  25. {headson-0.3.0 → headson-0.5.0}/src/serialization/templates/pseudo.rs +9 -0
  26. headson-0.5.0/src/utils/graph.rs +61 -0
  27. {headson-0.3.0 → headson-0.5.0}/src/utils/tree_arena.rs +9 -0
  28. headson-0.3.0/python/headson/__init__.py +0 -4
  29. headson-0.3.0/src/utils/graph.rs +0 -54
  30. {headson-0.3.0 → headson-0.5.0}/JSONTestSuite/LICENSE +0 -0
  31. {headson-0.3.0 → headson-0.5.0}/JSONTestSuite/README.md +0 -0
  32. {headson-0.3.0 → headson-0.5.0}/LICENSE +0 -0
  33. {headson-0.3.0 → headson-0.5.0}/src/order/mod.rs +0 -0
  34. {headson-0.3.0 → headson-0.5.0}/src/order/scoring.rs +0 -0
  35. {headson-0.3.0 → headson-0.5.0}/src/order/snapshots/headson__order__build__tests__order_empty_array_order.snap +0 -0
  36. {headson-0.3.0 → headson-0.5.0}/src/order/snapshots/headson__order__build__tests__order_single_string_array_order.snap +0 -0
  37. {headson-0.3.0 → headson-0.5.0}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty.snap +0 -0
  38. {headson-0.3.0 → headson-0.5.0}/src/serialization/snapshots/headson__serialization__tests__arena_render_single.snap +0 -0
  39. {headson-0.3.0 → headson-0.5.0}/src/serialization/templates/json.rs +0 -0
  40. {headson-0.3.0 → headson-0.5.0}/src/serialization/templates/mod.rs +0 -0
  41. {headson-0.3.0 → headson-0.5.0}/src/serialization/types.rs +0 -0
  42. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__order__tests__order_empty_array_order.snap +0 -0
  43. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__order__tests__order_single_string_array_order.snap +0 -0
  44. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__order__tests__pq_empty_array_queue.snap +0 -0
  45. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__order__tests__pq_single_string_array_queue.snap +0 -0
  46. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__queue__tests__pq_empty_array_queue.snap +0 -0
  47. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__queue__tests__pq_single_string_array_queue.snap +0 -0
  48. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__tree__tests__build_tree_empty.snap +0 -0
  49. {headson-0.3.0 → headson-0.5.0}/src/snapshots/headson__tree__tests__build_tree_single.snap +0 -0
  50. {headson-0.3.0 → headson-0.5.0}/src/utils/json.rs +0 -0
  51. {headson-0.3.0 → headson-0.5.0}/src/utils/mod.rs +0 -0
  52. {headson-0.3.0 → headson-0.5.0}/src/utils/search.rs +0 -0
  53. {headson-0.3.0 → headson-0.5.0}/src/utils/text.rs +0 -0
@@ -266,7 +266,7 @@ dependencies = [
266
266
 
267
267
  [[package]]
268
268
  name = "headson"
269
- version = "0.3.0"
269
+ version = "0.5.0"
270
270
  dependencies = [
271
271
  "anyhow",
272
272
  "assert_cmd",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "headson"
3
- version = "0.3.0"
3
+ version = "0.5.0"
4
4
  edition = "2024"
5
5
  description = "Budget‑constrained JSON preview renderer"
6
6
  readme = "README.md"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: headson
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Classifier: Programming Language :: Python
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Rust
@@ -10,7 +10,7 @@ Provides-Extra: test
10
10
  License-File: LICENSE
11
11
  Summary: Budget‑constrained JSON preview renderer (Python bindings)
12
12
  Keywords: json,preview,summarize,cli,bindings
13
- Requires-Python: >=3.8
13
+ Requires-Python: >=3.10
14
14
  Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
15
15
 
16
16
  # headson
@@ -66,16 +66,19 @@ Common flags:
66
66
  - `--no-space`: no space after `:` in objects
67
67
  - `--indent <STR>`: indentation unit (default: two spaces)
68
68
  - `--string-cap <N>`: max graphemes to consider per string (default: 500)
69
- - `--tail`: prefer the end of arrays when truncating. Strings are unaffected. In `pseudo`/`js` templates the omission marker appears at the start; `json` remains strict JSON with no annotations.
69
+ - `--head`: prefer the beginning of arrays when truncating (keep first N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears near the end; `json` remains strict. Mutually exclusive with `--tail`.
70
+ - `--tail`: prefer the end of arrays when truncating (keep last N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears at the start; `json` remains strict. Mutually exclusive with `--head`.
70
71
 
71
72
  Notes:
72
73
 
73
74
  - With multiple input files:
74
75
  - JSON template outputs a single JSON object keyed by the input file paths.
75
- - Pseudo and JS templates render file sections with human-readable headers.
76
+ - Pseudo and JS templates render file sections with human-readable headers when newlines are enabled.
77
+ - If you use `--compact` or `--no-newline` (both disable newlines), fileset output falls back to standard inline rendering (no per-file headers) to remain compact.
76
78
  - Using `--global-budget` may truncate or omit entire files to respect the total budget.
77
79
  - The tool finds the largest preview that fits the budget; if even the tiniest preview exceeds it, you still get a minimal, valid preview.
78
80
  - When passing file paths, directories and binary files are ignored; a notice is printed to stderr for each (e.g., `Ignored binary file: ./path/to/file`). Stdin mode reads the stream as-is.
81
+ - Head vs Tail sampling: these options bias which part of each array is kept before rendering. They guarantee the kept segment is contiguous at the chosen side (prefix for `--head`, suffix for `--tail`). Display templates may still insert additional internal gap markers inside that kept segment to honor very small budgets; `json` remains strict and unannotated.
79
82
 
80
83
  Quick one‑liners:
81
84
 
@@ -134,12 +137,12 @@ headson -n 120 -f json users.json
134
137
 
135
138
  A thin Python extension module is available on PyPI as `headson`.
136
139
 
137
- - Install: `pip install headson` (prebuilt wheels for CPython 3.10–3.12 on Linux/macOS/Windows). Older/newer Python versions may build from source if Rust is installed.
140
+ - Install: `pip install headson` (ABI3 wheels for Python 3.10+ on Linux/macOS/Windows).
138
141
  - API:
139
- - `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, tail: bool = False) -> str`
142
+ - `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, skew: str = "balanced") -> str`
140
143
  - `template`: one of `"json" | "pseudo" | "js"`
141
144
  - `character_budget`: maximum output size in characters (default: 500)
142
- - `tail`: prefer the end of arrays when truncating; strings unaffected. Affects only display templates (`pseudo`/`js`); `json` remains strict.
145
+ - `skew`: one of `"balanced" | "head" | "tail"` (focus arrays on start vs end; only affects display templates; `json` remains strict).
143
146
 
144
147
  Example:
145
148
 
@@ -157,11 +160,56 @@ print(
157
160
  json.dumps(list(range(100))),
158
161
  template="pseudo",
159
162
  character_budget=80,
160
- tail=True,
163
+ skew="tail",
161
164
  )
162
165
  )
163
166
  ```
164
167
 
168
+ # Algorithm
169
+
170
+ ```mermaid
171
+ %%{init: {"themeCSS": ".cluster > rect { fill: transparent; stroke: transparent; } .clusterLabel > text { font-size: 16px; font-weight: 600; } .clusterLabel span { padding: 6px 10px; font-size: 16px; font-weight: 600; }"}}%%
172
+ flowchart TD
173
+ subgraph Deserialization
174
+ direction TB
175
+ A["Input file(s)"]
176
+ A -- Single --> C["Parse into optimized tree (with array pre‑sampling) ¹"]
177
+ A -- Multiple --> D["Parse each file and wrap into a fileset object"]
178
+ D --> C
179
+ end
180
+ subgraph Prioritization
181
+ direction TB
182
+ E["Build priority order ²"]
183
+ F["Choose top N nodes ³"]
184
+ end
185
+ subgraph Serialization
186
+ direction TB
187
+ G["Render attempt ⁴"]
188
+ H["Output preview string"]
189
+ end
190
+ C --> E
191
+ E --> F
192
+ F --> G
193
+ G --> F
194
+ F --> H
195
+ %% Color classes for categories
196
+ classDef des fill:#eaf2ff,stroke:#3b82f6,stroke-width:1px,color:#0f172a;
197
+ classDef prio fill:#ecfdf5,stroke:#10b981,stroke-width:1px,color:#064e3b;
198
+ classDef ser fill:#fff1f2,stroke:#f43f5e,stroke-width:1px,color:#7f1d1d;
199
+ class A,C,D des;
200
+ class E,F prio;
201
+ class G,H ser;
202
+ style Deserialization fill:transparent,stroke:transparent
203
+ style Prioritization fill:transparent,stroke:transparent
204
+ style Serialization fill:transparent,stroke:transparent
205
+ ```
206
+
207
+ ## Footnotes
208
+ - <sup><b>[1]</b></sup> <b>Optimized tree representation</b>: An arena‑style tree stored in flat, contiguous buffers. Each node records its kind and value plus index ranges into shared child and key arrays. Arrays are ingested in a single pass and may be deterministically pre‑sampled: the first element is always kept; additional elements are selected via a fixed per‑index inclusion test; for kept elements, original indices are stored and full lengths are counted. This enables accurate omission info and internal gap markers later, while minimizing pointer chasing.
209
+ - <sup><b>[2]</b></sup> <b>Priority order</b>: Nodes are scored so previews surface representative structure and values first. Arrays can favor head/mid/tail coverage (default) or strictly the head; tail preference flips head/tail when configured. Object properties are ordered by key, and strings expand by grapheme with early characters prioritized over very deep expansions.
210
+ - <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the character budget, looping between “choose N” and a render attempt to converge quickly.
211
+ - <sup><b>[4]</b></sup> <b>Render attempt</b>: Serializes the currently included nodes using the selected template. Omission summaries and per-file section headers appear in display templates (pseudo/js); json remains strict. For arrays, display templates may insert internal gap markers between non‑contiguous kept items using original indices.
212
+
165
213
  ## License
166
214
 
167
215
  MIT
@@ -51,16 +51,19 @@ Common flags:
51
51
  - `--no-space`: no space after `:` in objects
52
52
  - `--indent <STR>`: indentation unit (default: two spaces)
53
53
  - `--string-cap <N>`: max graphemes to consider per string (default: 500)
54
- - `--tail`: prefer the end of arrays when truncating. Strings are unaffected. In `pseudo`/`js` templates the omission marker appears at the start; `json` remains strict JSON with no annotations.
54
+ - `--head`: prefer the beginning of arrays when truncating (keep first N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears near the end; `json` remains strict. Mutually exclusive with `--tail`.
55
+ - `--tail`: prefer the end of arrays when truncating (keep last N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears at the start; `json` remains strict. Mutually exclusive with `--head`.
55
56
 
56
57
  Notes:
57
58
 
58
59
  - With multiple input files:
59
60
  - JSON template outputs a single JSON object keyed by the input file paths.
60
- - Pseudo and JS templates render file sections with human-readable headers.
61
+ - Pseudo and JS templates render file sections with human-readable headers when newlines are enabled.
62
+ - If you use `--compact` or `--no-newline` (both disable newlines), fileset output falls back to standard inline rendering (no per-file headers) to remain compact.
61
63
  - Using `--global-budget` may truncate or omit entire files to respect the total budget.
62
64
  - The tool finds the largest preview that fits the budget; if even the tiniest preview exceeds it, you still get a minimal, valid preview.
63
65
  - When passing file paths, directories and binary files are ignored; a notice is printed to stderr for each (e.g., `Ignored binary file: ./path/to/file`). Stdin mode reads the stream as-is.
66
+ - Head vs Tail sampling: these options bias which part of each array is kept before rendering. They guarantee the kept segment is contiguous at the chosen side (prefix for `--head`, suffix for `--tail`). Display templates may still insert additional internal gap markers inside that kept segment to honor very small budgets; `json` remains strict and unannotated.
64
67
 
65
68
  Quick one‑liners:
66
69
 
@@ -119,12 +122,12 @@ headson -n 120 -f json users.json
119
122
 
120
123
  A thin Python extension module is available on PyPI as `headson`.
121
124
 
122
- - Install: `pip install headson` (prebuilt wheels for CPython 3.10–3.12 on Linux/macOS/Windows). Older/newer Python versions may build from source if Rust is installed.
125
+ - Install: `pip install headson` (ABI3 wheels for Python 3.10+ on Linux/macOS/Windows).
123
126
  - API:
124
- - `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, tail: bool = False) -> str`
127
+ - `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, skew: str = "balanced") -> str`
125
128
  - `template`: one of `"json" | "pseudo" | "js"`
126
129
  - `character_budget`: maximum output size in characters (default: 500)
127
- - `tail`: prefer the end of arrays when truncating; strings unaffected. Affects only display templates (`pseudo`/`js`); `json` remains strict.
130
+ - `skew`: one of `"balanced" | "head" | "tail"` (focus arrays on start vs end; only affects display templates; `json` remains strict).
128
131
 
129
132
  Example:
130
133
 
@@ -142,11 +145,56 @@ print(
142
145
  json.dumps(list(range(100))),
143
146
  template="pseudo",
144
147
  character_budget=80,
145
- tail=True,
148
+ skew="tail",
146
149
  )
147
150
  )
148
151
  ```
149
152
 
153
+ # Algorithm
154
+
155
+ ```mermaid
156
+ %%{init: {"themeCSS": ".cluster > rect { fill: transparent; stroke: transparent; } .clusterLabel > text { font-size: 16px; font-weight: 600; } .clusterLabel span { padding: 6px 10px; font-size: 16px; font-weight: 600; }"}}%%
157
+ flowchart TD
158
+ subgraph Deserialization
159
+ direction TB
160
+ A["Input file(s)"]
161
+ A -- Single --> C["Parse into optimized tree (with array pre‑sampling) ¹"]
162
+ A -- Multiple --> D["Parse each file and wrap into a fileset object"]
163
+ D --> C
164
+ end
165
+ subgraph Prioritization
166
+ direction TB
167
+ E["Build priority order ²"]
168
+ F["Choose top N nodes ³"]
169
+ end
170
+ subgraph Serialization
171
+ direction TB
172
+ G["Render attempt ⁴"]
173
+ H["Output preview string"]
174
+ end
175
+ C --> E
176
+ E --> F
177
+ F --> G
178
+ G --> F
179
+ F --> H
180
+ %% Color classes for categories
181
+ classDef des fill:#eaf2ff,stroke:#3b82f6,stroke-width:1px,color:#0f172a;
182
+ classDef prio fill:#ecfdf5,stroke:#10b981,stroke-width:1px,color:#064e3b;
183
+ classDef ser fill:#fff1f2,stroke:#f43f5e,stroke-width:1px,color:#7f1d1d;
184
+ class A,C,D des;
185
+ class E,F prio;
186
+ class G,H ser;
187
+ style Deserialization fill:transparent,stroke:transparent
188
+ style Prioritization fill:transparent,stroke:transparent
189
+ style Serialization fill:transparent,stroke:transparent
190
+ ```
191
+
192
+ ## Footnotes
193
+ - <sup><b>[1]</b></sup> <b>Optimized tree representation</b>: An arena‑style tree stored in flat, contiguous buffers. Each node records its kind and value plus index ranges into shared child and key arrays. Arrays are ingested in a single pass and may be deterministically pre‑sampled: the first element is always kept; additional elements are selected via a fixed per‑index inclusion test; for kept elements, original indices are stored and full lengths are counted. This enables accurate omission info and internal gap markers later, while minimizing pointer chasing.
194
+ - <sup><b>[2]</b></sup> <b>Priority order</b>: Nodes are scored so previews surface representative structure and values first. Arrays can favor head/mid/tail coverage (default) or strictly the head; tail preference flips head/tail when configured. Object properties are ordered by key, and strings expand by grapheme with early characters prioritized over very deep expansions.
195
+ - <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the character budget, looping between “choose N” and a render attempt to converge quickly.
196
+ - <sup><b>[4]</b></sup> <b>Render attempt</b>: Serializes the currently included nodes using the selected template. Omission summaries and per-file section headers appear in display templates (pseudo/js); json remains strict. For arrays, display templates may insert internal gap markers between non‑contiguous kept items using original indices.
197
+
150
198
  ## License
151
199
 
152
200
  MIT
@@ -4,10 +4,10 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "headson"
7
- version = "0.3.0"
7
+ version = "0.5.0"
8
8
  description = "Budget‑constrained JSON preview renderer (Python bindings)"
9
9
  readme = "README.md"
10
- requires-python = ">=3.8"
10
+ requires-python = ">=3.10"
11
11
  classifiers = [
12
12
  "Programming Language :: Python",
13
13
  "Programming Language :: Python :: 3",
@@ -32,7 +32,7 @@ python-source = "python"
32
32
  dev = [
33
33
  "pytest>=8",
34
34
  "maturin>=1.7,<2",
35
- "ruff==0.6.9",
35
+ "ruff==0.14.2",
36
36
  ]
37
37
 
38
38
  [tool.ruff]
@@ -169,7 +169,7 @@ dependencies = [
169
169
 
170
170
  [[package]]
171
171
  name = "headson"
172
- version = "0.3.0"
172
+ version = "0.5.0"
173
173
  dependencies = [
174
174
  "anyhow",
175
175
  "clap",
@@ -182,7 +182,7 @@ dependencies = [
182
182
 
183
183
  [[package]]
184
184
  name = "headson-python"
185
- version = "0.3.0"
185
+ version = "0.5.0"
186
186
  dependencies = [
187
187
  "anyhow",
188
188
  "headson",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "headson-python"
3
- version = "0.3.0"
3
+ version = "0.5.0"
4
4
  edition = "2021"
5
5
  publish = false
6
6
  readme = "README.md"
@@ -11,5 +11,5 @@ crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
13
  anyhow = "1"
14
- pyo3 = { version = "0.27", features = ["extension-module"] }
14
+ pyo3 = { version = "0.27", features = ["extension-module", "abi3-py310"] }
15
15
  headson_core = { package = "headson", path = ".." }
@@ -4,19 +4,30 @@ Minimal Python API for the `headson` JSON preview renderer.
4
4
 
5
5
  Currently exported function:
6
6
 
7
- - `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None) -> str`
7
+ - `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, skew: str = "balanced") -> str`
8
8
  - `template`: one of `"json" | "pseudo" | "js"`.
9
9
  - `character_budget`: maximum output size in characters (defaults to 500 if not set).
10
+ - `skew`: one of `"balanced" | "head" | "tail"`.
11
+ - `balanced`: default behavior.
12
+ - `head`: focus arrays on the beginning (keep first N).
13
+ - `tail`: focus arrays on the end (keep last N); pseudo/js place omission markers at the start.
10
14
 
11
15
  Examples:
12
16
 
13
17
  ```python
14
18
  import headson
15
19
 
20
+ # Pseudo template with a small budget (structure-aware preview)
16
21
  print(headson.summarize('{"a": 1, "b": [1,2,3]}', template="pseudo", character_budget=80))
22
+
23
+ # Strict JSON template preserves valid JSON output
17
24
  print(headson.summarize('{"a": 1, "b": {"c": 2}}', template="json", character_budget=10_000))
25
+
26
+ # JS template with tail skew: prefer the end of arrays when truncating
18
27
  arr = ','.join(str(i) for i in range(100))
19
- print(headson.summarize('{"arr": [' + arr + ']}', template="js", character_budget=60))
28
+ print(headson.summarize('{"arr": [' + arr + ']}', template="js", character_budget=60, skew="tail"))
29
+
30
+ # Note: tail mode affects only pseudo/js display templates; the json template stays strict.
20
31
  ```
21
32
 
22
33
  Install for development:
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ # Directly re-export the compiled extension function with the final signature.
4
+ from .headson import summarize # type: ignore
5
+
6
+ __all__ = ["summarize"]
@@ -2,7 +2,7 @@ use anyhow::{bail, Result};
2
2
  use pyo3::exceptions::PyRuntimeError;
3
3
  use pyo3::prelude::*;
4
4
  use pyo3::types::PyModule;
5
- use headson_core::{OutputTemplate, RenderConfig, PriorityConfig};
5
+ use headson_core::{ArraySamplerStrategy, OutputTemplate, PriorityConfig, RenderConfig};
6
6
 
7
7
  fn to_template(s: &str) -> Result<OutputTemplate> {
8
8
  match s.to_ascii_lowercase().as_str() {
@@ -13,25 +13,41 @@ fn to_template(s: &str) -> Result<OutputTemplate> {
13
13
  }
14
14
  }
15
15
 
16
- fn render_config(template: &str, prefer_tail_arrays: bool) -> Result<RenderConfig> {
16
+ fn render_config_with_sampler(
17
+ template: &str,
18
+ sampler: ArraySamplerStrategy,
19
+ ) -> Result<RenderConfig> {
17
20
  let t = to_template(template)?;
18
21
  let space = " ".to_string();
19
22
  let newline = "\n".to_string();
20
23
  let indent_unit = " ".to_string();
21
- Ok(RenderConfig {
22
- template: t,
23
- indent_unit,
24
- space,
25
- newline,
26
- prefer_tail_arrays,
27
- })
24
+ let prefer_tail_arrays = matches!(sampler, ArraySamplerStrategy::Tail);
25
+ Ok(RenderConfig { template: t, indent_unit, space, newline, prefer_tail_arrays })
26
+ }
27
+
28
+ fn parse_skew(skew: &str) -> Result<ArraySamplerStrategy> {
29
+ match skew.to_ascii_lowercase().as_str() {
30
+ "balanced" => Ok(ArraySamplerStrategy::Default),
31
+ "head" => Ok(ArraySamplerStrategy::Head),
32
+ "tail" => Ok(ArraySamplerStrategy::Tail),
33
+ other => bail!(
34
+ "unknown skew: {} (expected 'balanced' | 'head' | 'tail')",
35
+ other
36
+ ),
37
+ }
28
38
  }
29
39
 
30
- fn priority_config(per_file_budget: usize, prefer_tail_arrays: bool) -> PriorityConfig {
40
+ fn priority_config(
41
+ per_file_budget: usize,
42
+ sampler: ArraySamplerStrategy,
43
+ ) -> PriorityConfig {
44
+ let prefer_tail_arrays = matches!(sampler, ArraySamplerStrategy::Tail);
31
45
  PriorityConfig {
32
46
  max_string_graphemes: 500,
33
47
  array_max_items: (per_file_budget / 2).max(1),
34
48
  prefer_tail_arrays,
49
+ array_bias: headson_core::ArrayBias::HeadMidTail,
50
+ array_sampler: sampler,
35
51
  }
36
52
  }
37
53
 
@@ -40,18 +56,19 @@ fn to_pyerr(e: anyhow::Error) -> PyErr {
40
56
  }
41
57
 
42
58
  #[pyfunction]
43
- #[pyo3(signature = (text, *, template="pseudo", character_budget=None, tail=false))]
59
+ #[pyo3(signature = (text, *, template="pseudo", character_budget=None, skew="balanced"))]
44
60
  fn summarize(
45
61
  py: Python<'_>,
46
62
  text: &str,
47
63
  template: &str,
48
64
  character_budget: Option<usize>,
49
- tail: bool,
65
+ skew: &str,
50
66
  ) -> PyResult<String> {
51
- let cfg = render_config(template, tail).map_err(to_pyerr)?;
67
+ let sampler = parse_skew(skew).map_err(to_pyerr)?;
68
+ let cfg = render_config_with_sampler(template, sampler).map_err(to_pyerr)?;
52
69
  let budget = character_budget.unwrap_or(500);
53
70
  let per_file_for_priority = budget.max(1);
54
- let prio = priority_config(per_file_for_priority, tail);
71
+ let prio = priority_config(per_file_for_priority, sampler);
55
72
  let input = text.as_bytes().to_vec();
56
73
  py.detach(|| headson_core::headson(input, &cfg, &prio, budget).map_err(to_pyerr))
57
74
  }
@@ -1,21 +1,28 @@
1
1
  use serde::Deserializer;
2
- use serde::de::{DeserializeSeed, IgnoredAny, MapAccess, SeqAccess, Visitor};
2
+ use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
3
3
  use std::cell::RefCell;
4
4
 
5
5
  use crate::order::NodeKind;
6
6
  use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
7
7
 
8
+ use super::samplers::ArraySamplerKind;
9
+
8
10
  #[derive(Default)]
9
11
  pub(crate) struct JsonTreeBuilder {
10
12
  arena: RefCell<JsonTreeArena>,
11
13
  pub(crate) array_cap: usize,
14
+ sampler: ArraySamplerKind,
12
15
  }
13
16
 
14
17
  impl JsonTreeBuilder {
15
- pub(crate) fn new(array_cap: usize) -> Self {
18
+ pub(crate) fn new(
19
+ array_cap: usize,
20
+ sampler: super::samplers::ArraySamplerKind,
21
+ ) -> Self {
16
22
  Self {
17
23
  arena: RefCell::new(JsonTreeArena::default()),
18
24
  array_cap,
25
+ sampler,
19
26
  }
20
27
  }
21
28
 
@@ -88,15 +95,34 @@ impl JsonTreeBuilder {
88
95
  kept: usize,
89
96
  total: usize,
90
97
  local_children: Vec<usize>,
98
+ local_indices: Vec<usize>,
91
99
  ) {
92
100
  let mut a = self.arena.borrow_mut();
93
101
  let children_start = a.children.len();
94
102
  a.children.extend(local_children);
103
+
104
+ // Detect contiguous indices 0..kept-1 to skip storing arr_indices data
105
+ let contiguous = local_indices.len() == kept
106
+ && local_indices.iter().enumerate().all(|(i, &idx)| idx == i);
107
+
108
+ let (arr_indices_start, pushed_len) =
109
+ if kept == 0 || contiguous || local_indices.is_empty() {
110
+ (0usize, 0usize)
111
+ } else {
112
+ let start = a.arr_indices.len();
113
+ a.arr_indices.extend(local_indices);
114
+ let pushed = a.arr_indices.len().saturating_sub(start);
115
+ (start, pushed)
116
+ };
117
+
95
118
  let n = &mut a.nodes[id];
96
119
  n.kind = NodeKind::Array;
97
120
  n.children_start = children_start;
98
121
  n.children_len = kept;
99
122
  n.array_len = Some(total);
123
+ n.arr_indices_start = arr_indices_start;
124
+ // When no indices were pushed, mark len=0 to indicate contiguous 0..kept
125
+ n.arr_indices_len = pushed_len.min(kept);
100
126
  }
101
127
 
102
128
  fn finish_object(
@@ -205,29 +231,19 @@ impl<'de> Visitor<'de> for NodeVisitor<'_> {
205
231
  A: SeqAccess<'de>,
206
232
  {
207
233
  let id = self.b.push_default();
208
- let mut local_children: Vec<usize> = Vec::new();
209
- let low = seq.size_hint().unwrap_or(0);
210
- local_children.reserve(low.min(self.b.array_cap));
211
- let mut kept = 0usize;
212
- let mut total = 0usize;
213
- while kept < self.b.array_cap {
214
- let next = {
215
- let seed = self.b.seed();
216
- seq.next_element_seed(seed)?
217
- };
218
- match next {
219
- Some(cid) => {
220
- local_children.push(cid);
221
- kept += 1;
222
- total += 1;
223
- }
224
- None => break,
225
- }
226
- }
227
- while (seq.next_element::<IgnoredAny>()?).is_some() {
228
- total += 1;
229
- }
230
- self.b.finish_array(id, kept, total, local_children);
234
+ let sampled = self.b.sampler.sample_stream(
235
+ &mut seq,
236
+ self.b,
237
+ self.b.array_cap,
238
+ )?;
239
+ let kept = sampled.children.len();
240
+ self.b.finish_array(
241
+ id,
242
+ kept,
243
+ sampled.total_len,
244
+ sampled.children,
245
+ sampled.indices,
246
+ );
231
247
  Ok(id)
232
248
  }
233
249
 
@@ -255,3 +271,5 @@ impl<'de> Visitor<'de> for NodeVisitor<'_> {
255
271
  Ok(id)
256
272
  }
257
273
  }
274
+
275
+ impl JsonTreeBuilder {}
@@ -1,7 +1,8 @@
1
1
  mod builder;
2
+ mod samplers;
2
3
  use serde::de::DeserializeSeed;
3
4
 
4
- use crate::order::PriorityConfig;
5
+ use crate::PriorityConfig;
5
6
  use crate::utils::tree_arena::JsonTreeArena;
6
7
  use anyhow::Result;
7
8
  use builder::JsonTreeBuilder;
@@ -19,7 +20,10 @@ pub fn build_json_tree_arena_from_bytes(
19
20
  config: &PriorityConfig,
20
21
  ) -> Result<JsonTreeArena> {
21
22
  let mut de = simd_json::Deserializer::from_slice(&mut bytes)?;
22
- let builder = JsonTreeBuilder::new(config.array_max_items);
23
+ let builder = JsonTreeBuilder::new(
24
+ config.array_max_items,
25
+ config.array_sampler.into(),
26
+ );
23
27
  let root_id: usize = {
24
28
  let seed = builder.seed();
25
29
  seed.deserialize(&mut de)?
@@ -33,7 +37,10 @@ pub fn build_json_tree_arena_from_many(
33
37
  mut inputs: Vec<(String, Vec<u8>)>,
34
38
  config: &PriorityConfig,
35
39
  ) -> Result<JsonTreeArena> {
36
- let builder = JsonTreeBuilder::new(config.array_max_items);
40
+ let builder = JsonTreeBuilder::new(
41
+ config.array_max_items,
42
+ config.array_sampler.into(),
43
+ );
37
44
  let mut child_ids: Vec<usize> = Vec::with_capacity(inputs.len());
38
45
  let mut keys: Vec<String> = Vec::with_capacity(inputs.len());
39
46
  for (key, mut bytes) in inputs.drain(..) {