headson 0.6.3__tar.gz → 0.6.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {headson-0.6.3 → headson-0.6.5}/Cargo.lock +1 -1
- {headson-0.6.3 → headson-0.6.5}/Cargo.toml +1 -1
- {headson-0.6.3 → headson-0.6.5}/PKG-INFO +25 -20
- {headson-0.6.3 → headson-0.6.5}/README.md +24 -19
- headson-0.6.5/docs/assets/tapes/demo.gif +0 -0
- {headson-0.6.3 → headson-0.6.5}/pyproject.toml +1 -1
- {headson-0.6.3 → headson-0.6.5}/python/Cargo.lock +2 -2
- {headson-0.6.3 → headson-0.6.5}/python/Cargo.toml +1 -1
- {headson-0.6.3 → headson-0.6.5}/python/README.md +9 -9
- {headson-0.6.3 → headson-0.6.5}/python/src/lib.rs +5 -3
- {headson-0.6.3/src/json_ingest → headson-0.6.5/src/ingest/formats/json}/builder.rs +2 -5
- {headson-0.6.3/src/json_ingest → headson-0.6.5/src/ingest/formats/json}/mod.rs +41 -6
- {headson-0.6.3/src/json_ingest → headson-0.6.5/src/ingest/formats/json}/samplers/default.rs +4 -6
- {headson-0.6.3/src/json_ingest → headson-0.6.5/src/ingest/formats/json}/samplers/head.rs +2 -1
- {headson-0.6.3/src/json_ingest → headson-0.6.5/src/ingest/formats/json}/samplers/mod.rs +2 -20
- {headson-0.6.3/src/json_ingest → headson-0.6.5/src/ingest/formats/json}/samplers/tail.rs +6 -3
- headson-0.6.5/src/ingest/formats/mod.rs +9 -0
- {headson-0.6.3/src/text_ingest → headson-0.6.5/src/ingest/formats/text}/mod.rs +97 -14
- {headson-0.6.3/src/yaml_ingest → headson-0.6.5/src/ingest/formats/yaml}/mod.rs +124 -78
- {headson-0.6.3 → headson-0.6.5}/src/ingest/mod.rs +14 -8
- headson-0.6.5/src/ingest/sampling/mod.rs +118 -0
- {headson-0.6.3 → headson-0.6.5}/src/lib.rs +162 -15
- {headson-0.6.3 → headson-0.6.5}/src/main.rs +154 -70
- {headson-0.6.3 → headson-0.6.5}/src/order/build.rs +2 -2
- {headson-0.6.3 → headson-0.6.5}/src/order/types.rs +5 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/mod.rs +79 -27
- {headson-0.6.3 → headson-0.6.5}/src/serialization/types.rs +3 -0
- headson-0.6.5/src/utils/measure.rs +42 -0
- {headson-0.6.3 → headson-0.6.5}/src/utils/mod.rs +1 -0
- headson-0.6.3/docs/assets/tapes/demo.gif +0 -0
- headson-0.6.3/src/ingest/json.rs +0 -37
- headson-0.6.3/src/ingest/text.rs +0 -45
- headson-0.6.3/src/ingest/yaml.rs +0 -39
- {headson-0.6.3 → headson-0.6.5}/docs/assets/algorithm.svg +0 -0
- {headson-0.6.3 → headson-0.6.5}/docs/assets/logo.png +0 -0
- {headson-0.6.3 → headson-0.6.5}/docs/assets/logo.svg +0 -0
- {headson-0.6.3 → headson-0.6.5}/python/headson/__init__.py +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/format.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/order/mod.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/order/scoring.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/order/snapshots/headson__order__build__tests__order_empty_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/order/snapshots/headson__order__build__tests__order_single_string_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/color.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/fileset.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/output.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__arena_render_single.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__arena_render_single_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__array_internal_gaps_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__array_omitted_js_head.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__array_omitted_js_tail.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__array_omitted_pseudo_head.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__array_omitted_pseudo_tail.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__array_omitted_yaml_head.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__array_omitted_yaml_tail.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__inline_open_array_in_object_json.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/snapshots/headson__serialization__tests__inline_open_array_in_object_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/templates/core.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/templates/js.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/templates/json.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/templates/mod.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/templates/pseudo.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/templates/text.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/serialization/templates/yaml.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/snapshots/headson__order__tests__order_empty_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/snapshots/headson__order__tests__order_single_string_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/utils/graph.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/utils/json.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/utils/search.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/utils/text.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/src/utils/tree_arena.rs +0 -0
- {headson-0.6.3 → headson-0.6.5}/tests/fixtures/json/JSONTestSuite/LICENSE +0 -0
- {headson-0.6.3 → headson-0.6.5}/tests/fixtures/json/JSONTestSuite/README.md +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: headson
|
|
3
|
-
Version: 0.6.
|
|
3
|
+
Version: 0.6.5
|
|
4
4
|
Classifier: Programming Language :: Python
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Rust
|
|
@@ -20,12 +20,15 @@ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
|
20
20
|
<br/>
|
|
21
21
|
</p>
|
|
22
22
|
|
|
23
|
-
`head`/`tail` for JSON, YAML - but structure‑aware. Get a compact preview that shows both the shape and representative values of your data, all within a strict
|
|
23
|
+
`head`/`tail` for JSON, YAML - but structure‑aware. Get a compact preview that shows both the shape and representative values of your data, all within a strict byte budget. (Just like `head`/`tail`, `headson` can also work with unstructured text files.)
|
|
24
24
|
|
|
25
25
|
Available as:
|
|
26
26
|
- CLI (see [Usage](#usage))
|
|
27
27
|
- Python library (see [Python Bindings](#python-bindings))
|
|
28
28
|
|
|
29
|
+
  
|
|
30
|
+
|
|
31
|
+
|
|
29
32
|
## Install
|
|
30
33
|
|
|
31
34
|
Using Cargo:
|
|
@@ -67,8 +70,8 @@ If you’re comfortable with tools like `head` and `tail`, use `headson` when yo
|
|
|
67
70
|
|
|
68
71
|
Common flags:
|
|
69
72
|
|
|
70
|
-
- `-
|
|
71
|
-
- `-
|
|
73
|
+
- `-c, --bytes <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
|
|
74
|
+
- `-C, --global-bytes <BYTES>`: total output budget across all inputs. With `--bytes`, the effective total is the smaller of the two.
|
|
72
75
|
- `-f, --format <auto|json|yaml|text>`: output format (default: `auto`).
|
|
73
76
|
- Auto: stdin → JSON family; filesets → per‑file based on extension (`.json` → JSON family, `.yaml`/`.yml` → YAML, unknown → Text).
|
|
74
77
|
- `-t, --template <strict|default|detailed>`: output style (default: `default`).
|
|
@@ -89,7 +92,7 @@ Notes:
|
|
|
89
92
|
- With newlines enabled, file sections are rendered with human‑readable headers. In compact/single‑line modes, headers are omitted.
|
|
90
93
|
- In `--format auto`, each file uses its own best format: JSON family for `.json`, YAML for `.yaml`/`.yml`.
|
|
91
94
|
- Unknown extensions are treated as Text (raw lines) — safe for logs and `.txt` files.
|
|
92
|
-
- `--global-
|
|
95
|
+
- `--global-bytes` may truncate or omit entire files to respect the total budget.
|
|
93
96
|
- The tool finds the largest preview that fits the budget; even if extremely tight, you still get a minimal, valid preview.
|
|
94
97
|
- Directories and binary files are ignored; a notice is printed to stderr for each. Stdin reads the stream as‑is.
|
|
95
98
|
- Head vs Tail sampling: these options bias which part of arrays are kept before rendering. Display styles may still insert internal gap markers to honor very small budgets; strict JSON stays unannotated.
|
|
@@ -98,33 +101,33 @@ Quick one‑liners:
|
|
|
98
101
|
|
|
99
102
|
- Peek a big JSON stream (keeps structure):
|
|
100
103
|
|
|
101
|
-
zstdcat huge.json.zst | headson -
|
|
104
|
+
zstdcat huge.json.zst | headson -c 800 -f json -t default
|
|
102
105
|
|
|
103
106
|
- Many files with a fixed overall size:
|
|
104
107
|
|
|
105
|
-
headson -
|
|
108
|
+
headson -C 1200 -f json -t strict logs/*.json
|
|
106
109
|
|
|
107
110
|
- Glance at a file, JavaScript‑style comments for omissions:
|
|
108
111
|
|
|
109
|
-
headson -
|
|
112
|
+
headson -c 400 -f json -t detailed data.json
|
|
110
113
|
|
|
111
114
|
- YAML with detailed comments:
|
|
112
115
|
|
|
113
|
-
headson -
|
|
116
|
+
headson -c 400 -f yaml -t detailed config.yaml
|
|
114
117
|
|
|
115
118
|
### Text mode
|
|
116
119
|
|
|
117
120
|
- Single file (auto):
|
|
118
121
|
|
|
119
|
-
headson -
|
|
122
|
+
headson -c 200 notes.txt
|
|
120
123
|
|
|
121
124
|
- Force Text ingest/output (useful when mixing with other extensions):
|
|
122
125
|
|
|
123
|
-
headson -
|
|
126
|
+
headson -c 200 -i text -f text notes.txt
|
|
124
127
|
|
|
125
128
|
- Many text files (fileset):
|
|
126
129
|
|
|
127
|
-
headson -
|
|
130
|
+
headson -c 800 -i text -f text logs/*.txt
|
|
128
131
|
|
|
129
132
|
- Styles on Text:
|
|
130
133
|
- default: omission as a standalone `…` line.
|
|
@@ -135,6 +138,8 @@ Show help:
|
|
|
135
138
|
|
|
136
139
|
headson --help
|
|
137
140
|
|
|
141
|
+
Note: flags align with head/tail conventions (`-c/--bytes`, `-C/--global-bytes`).
|
|
142
|
+
|
|
138
143
|
## Examples: head vs headson
|
|
139
144
|
|
|
140
145
|
Input:
|
|
@@ -153,7 +158,7 @@ jq -c . users.json | head -c 80
|
|
|
153
158
|
Structured preview with headson (JSON family, default style → Pseudo):
|
|
154
159
|
|
|
155
160
|
```bash
|
|
156
|
-
headson -
|
|
161
|
+
headson -c 120 -f json -t default users.json
|
|
157
162
|
# {
|
|
158
163
|
# users: [
|
|
159
164
|
# { id: 1, name: "Ana", roles: [ "admin", … ] },
|
|
@@ -166,7 +171,7 @@ headson -n 120 -f json -t default users.json
|
|
|
166
171
|
Machine‑readable preview (JSON family, strict style → strict JSON):
|
|
167
172
|
|
|
168
173
|
```bash
|
|
169
|
-
headson -
|
|
174
|
+
headson -c 120 -f json -t strict users.json
|
|
170
175
|
# {"users":[{"id":1,"name":"Ana","roles":["admin"]}],"meta":{"count":2}}
|
|
171
176
|
```
|
|
172
177
|
|
|
@@ -185,11 +190,11 @@ A thin Python extension module is available on PyPI as `headson`.
|
|
|
185
190
|
|
|
186
191
|
- Install: `pip install headson` (ABI3 wheels for Python 3.10+ on Linux/macOS/Windows).
|
|
187
192
|
- API:
|
|
188
|
-
- `headson.summarize(text: str, *, format: str = "auto", style: str = "default", input_format: str = "json",
|
|
193
|
+
- `headson.summarize(text: str, *, format: str = "auto", style: str = "default", input_format: str = "json", byte_budget: int | None = None, skew: str = "balanced") -> str`
|
|
189
194
|
- `format`: `"auto" | "json" | "yaml"` (auto maps to JSON family for single inputs)
|
|
190
195
|
- `style`: `"strict" | "default" | "detailed"`
|
|
191
196
|
- `input_format`: `"json" | "yaml"` (ingestion)
|
|
192
|
-
- `
|
|
197
|
+
- `byte_budget`: maximum output size in bytes (default: 500)
|
|
193
198
|
- `skew`: `"balanced" | "head" | "tail"` (affects display styles; strict JSON remains unannotated)
|
|
194
199
|
|
|
195
200
|
Examples:
|
|
@@ -199,7 +204,7 @@ import json
|
|
|
199
204
|
import headson
|
|
200
205
|
|
|
201
206
|
data = {"foo": [1, 2, 3], "bar": {"x": "y"}}
|
|
202
|
-
preview = headson.summarize(json.dumps(data), format="json", style="strict",
|
|
207
|
+
preview = headson.summarize(json.dumps(data), format="json", style="strict", byte_budget=200)
|
|
203
208
|
print(preview)
|
|
204
209
|
|
|
205
210
|
# Prefer the tail of arrays (annotations show with style="default"/"detailed")
|
|
@@ -208,14 +213,14 @@ print(
|
|
|
208
213
|
json.dumps(list(range(100))),
|
|
209
214
|
format="json",
|
|
210
215
|
style="detailed",
|
|
211
|
-
|
|
216
|
+
byte_budget=80,
|
|
212
217
|
skew="tail",
|
|
213
218
|
)
|
|
214
219
|
)
|
|
215
220
|
|
|
216
221
|
# YAML support
|
|
217
222
|
doc = "root:\n items: [1,2,3,4,5,6,7,8,9,10]\n"
|
|
218
|
-
print(headson.summarize(doc, format="yaml", style="default", input_format="yaml",
|
|
223
|
+
print(headson.summarize(doc, format="yaml", style="default", input_format="yaml", byte_budget=60))
|
|
219
224
|
```
|
|
220
225
|
|
|
221
226
|
# Algorithm
|
|
@@ -225,7 +230,7 @@ print(headson.summarize(doc, format="yaml", style="default", input_format="yaml"
|
|
|
225
230
|
## Footnotes
|
|
226
231
|
- <sup><b>[1]</b></sup> <b>Optimized tree representation</b>: An arena‑style tree stored in flat, contiguous buffers. Each node records its kind and value plus index ranges into shared child and key arrays. Arrays are ingested in a single pass and may be deterministically pre‑sampled: the first element is always kept; additional elements are selected via a fixed per‑index inclusion test; for kept elements, original indices are stored and full lengths are counted. This enables accurate omission info and internal gap markers later, while minimizing pointer chasing.
|
|
227
232
|
- <sup><b>[2]</b></sup> <b>Priority order</b>: Nodes are scored so previews surface representative structure and values first. Arrays can favor head/mid/tail coverage (default) or strictly the head; tail preference flips head/tail when configured. Object properties are ordered by key, and strings expand by grapheme with early characters prioritized over very deep expansions.
|
|
228
|
-
- <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the
|
|
233
|
+
- <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the byte budget, looping between “choose N” and a render attempt to converge quickly.
|
|
229
234
|
- <sup><b>[4]</b></sup> <b>Render attempt</b>: Serializes the currently included nodes using the selected template. Omission summaries and per-file section headers appear in display templates (pseudo/js); json remains strict. For arrays, display templates may insert internal gap markers between non‑contiguous kept items using original indices.
|
|
230
235
|
- <sup><b>[5]</b></sup> <b>Diagram source</b>: The Algorithm diagram is generated from `docs/diagrams/algorithm.mmd`. Regenerate the SVG with `cargo make diagrams` before releasing.
|
|
231
236
|
|
|
@@ -6,12 +6,15 @@
|
|
|
6
6
|
<br/>
|
|
7
7
|
</p>
|
|
8
8
|
|
|
9
|
-
`head`/`tail` for JSON, YAML - but structure‑aware. Get a compact preview that shows both the shape and representative values of your data, all within a strict
|
|
9
|
+
`head`/`tail` for JSON, YAML - but structure‑aware. Get a compact preview that shows both the shape and representative values of your data, all within a strict byte budget. (Just like `head`/`tail`, `headson` can also work with unstructured text files.)
|
|
10
10
|
|
|
11
11
|
Available as:
|
|
12
12
|
- CLI (see [Usage](#usage))
|
|
13
13
|
- Python library (see [Python Bindings](#python-bindings))
|
|
14
14
|
|
|
15
|
+
  
|
|
16
|
+
|
|
17
|
+
|
|
15
18
|
## Install
|
|
16
19
|
|
|
17
20
|
Using Cargo:
|
|
@@ -53,8 +56,8 @@ If you’re comfortable with tools like `head` and `tail`, use `headson` when yo
|
|
|
53
56
|
|
|
54
57
|
Common flags:
|
|
55
58
|
|
|
56
|
-
- `-
|
|
57
|
-
- `-
|
|
59
|
+
- `-c, --bytes <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
|
|
60
|
+
- `-C, --global-bytes <BYTES>`: total output budget across all inputs. With `--bytes`, the effective total is the smaller of the two.
|
|
58
61
|
- `-f, --format <auto|json|yaml|text>`: output format (default: `auto`).
|
|
59
62
|
- Auto: stdin → JSON family; filesets → per‑file based on extension (`.json` → JSON family, `.yaml`/`.yml` → YAML, unknown → Text).
|
|
60
63
|
- `-t, --template <strict|default|detailed>`: output style (default: `default`).
|
|
@@ -75,7 +78,7 @@ Notes:
|
|
|
75
78
|
- With newlines enabled, file sections are rendered with human‑readable headers. In compact/single‑line modes, headers are omitted.
|
|
76
79
|
- In `--format auto`, each file uses its own best format: JSON family for `.json`, YAML for `.yaml`/`.yml`.
|
|
77
80
|
- Unknown extensions are treated as Text (raw lines) — safe for logs and `.txt` files.
|
|
78
|
-
- `--global-
|
|
81
|
+
- `--global-bytes` may truncate or omit entire files to respect the total budget.
|
|
79
82
|
- The tool finds the largest preview that fits the budget; even if extremely tight, you still get a minimal, valid preview.
|
|
80
83
|
- Directories and binary files are ignored; a notice is printed to stderr for each. Stdin reads the stream as‑is.
|
|
81
84
|
- Head vs Tail sampling: these options bias which part of arrays are kept before rendering. Display styles may still insert internal gap markers to honor very small budgets; strict JSON stays unannotated.
|
|
@@ -84,33 +87,33 @@ Quick one‑liners:
|
|
|
84
87
|
|
|
85
88
|
- Peek a big JSON stream (keeps structure):
|
|
86
89
|
|
|
87
|
-
zstdcat huge.json.zst | headson -
|
|
90
|
+
zstdcat huge.json.zst | headson -c 800 -f json -t default
|
|
88
91
|
|
|
89
92
|
- Many files with a fixed overall size:
|
|
90
93
|
|
|
91
|
-
headson -
|
|
94
|
+
headson -C 1200 -f json -t strict logs/*.json
|
|
92
95
|
|
|
93
96
|
- Glance at a file, JavaScript‑style comments for omissions:
|
|
94
97
|
|
|
95
|
-
headson -
|
|
98
|
+
headson -c 400 -f json -t detailed data.json
|
|
96
99
|
|
|
97
100
|
- YAML with detailed comments:
|
|
98
101
|
|
|
99
|
-
headson -
|
|
102
|
+
headson -c 400 -f yaml -t detailed config.yaml
|
|
100
103
|
|
|
101
104
|
### Text mode
|
|
102
105
|
|
|
103
106
|
- Single file (auto):
|
|
104
107
|
|
|
105
|
-
headson -
|
|
108
|
+
headson -c 200 notes.txt
|
|
106
109
|
|
|
107
110
|
- Force Text ingest/output (useful when mixing with other extensions):
|
|
108
111
|
|
|
109
|
-
headson -
|
|
112
|
+
headson -c 200 -i text -f text notes.txt
|
|
110
113
|
|
|
111
114
|
- Many text files (fileset):
|
|
112
115
|
|
|
113
|
-
headson -
|
|
116
|
+
headson -c 800 -i text -f text logs/*.txt
|
|
114
117
|
|
|
115
118
|
- Styles on Text:
|
|
116
119
|
- default: omission as a standalone `…` line.
|
|
@@ -121,6 +124,8 @@ Show help:
|
|
|
121
124
|
|
|
122
125
|
headson --help
|
|
123
126
|
|
|
127
|
+
Note: flags align with head/tail conventions (`-c/--bytes`, `-C/--global-bytes`).
|
|
128
|
+
|
|
124
129
|
## Examples: head vs headson
|
|
125
130
|
|
|
126
131
|
Input:
|
|
@@ -139,7 +144,7 @@ jq -c . users.json | head -c 80
|
|
|
139
144
|
Structured preview with headson (JSON family, default style → Pseudo):
|
|
140
145
|
|
|
141
146
|
```bash
|
|
142
|
-
headson -
|
|
147
|
+
headson -c 120 -f json -t default users.json
|
|
143
148
|
# {
|
|
144
149
|
# users: [
|
|
145
150
|
# { id: 1, name: "Ana", roles: [ "admin", … ] },
|
|
@@ -152,7 +157,7 @@ headson -n 120 -f json -t default users.json
|
|
|
152
157
|
Machine‑readable preview (JSON family, strict style → strict JSON):
|
|
153
158
|
|
|
154
159
|
```bash
|
|
155
|
-
headson -
|
|
160
|
+
headson -c 120 -f json -t strict users.json
|
|
156
161
|
# {"users":[{"id":1,"name":"Ana","roles":["admin"]}],"meta":{"count":2}}
|
|
157
162
|
```
|
|
158
163
|
|
|
@@ -171,11 +176,11 @@ A thin Python extension module is available on PyPI as `headson`.
|
|
|
171
176
|
|
|
172
177
|
- Install: `pip install headson` (ABI3 wheels for Python 3.10+ on Linux/macOS/Windows).
|
|
173
178
|
- API:
|
|
174
|
-
- `headson.summarize(text: str, *, format: str = "auto", style: str = "default", input_format: str = "json",
|
|
179
|
+
- `headson.summarize(text: str, *, format: str = "auto", style: str = "default", input_format: str = "json", byte_budget: int | None = None, skew: str = "balanced") -> str`
|
|
175
180
|
- `format`: `"auto" | "json" | "yaml"` (auto maps to JSON family for single inputs)
|
|
176
181
|
- `style`: `"strict" | "default" | "detailed"`
|
|
177
182
|
- `input_format`: `"json" | "yaml"` (ingestion)
|
|
178
|
-
- `
|
|
183
|
+
- `byte_budget`: maximum output size in bytes (default: 500)
|
|
179
184
|
- `skew`: `"balanced" | "head" | "tail"` (affects display styles; strict JSON remains unannotated)
|
|
180
185
|
|
|
181
186
|
Examples:
|
|
@@ -185,7 +190,7 @@ import json
|
|
|
185
190
|
import headson
|
|
186
191
|
|
|
187
192
|
data = {"foo": [1, 2, 3], "bar": {"x": "y"}}
|
|
188
|
-
preview = headson.summarize(json.dumps(data), format="json", style="strict",
|
|
193
|
+
preview = headson.summarize(json.dumps(data), format="json", style="strict", byte_budget=200)
|
|
189
194
|
print(preview)
|
|
190
195
|
|
|
191
196
|
# Prefer the tail of arrays (annotations show with style="default"/"detailed")
|
|
@@ -194,14 +199,14 @@ print(
|
|
|
194
199
|
json.dumps(list(range(100))),
|
|
195
200
|
format="json",
|
|
196
201
|
style="detailed",
|
|
197
|
-
|
|
202
|
+
byte_budget=80,
|
|
198
203
|
skew="tail",
|
|
199
204
|
)
|
|
200
205
|
)
|
|
201
206
|
|
|
202
207
|
# YAML support
|
|
203
208
|
doc = "root:\n items: [1,2,3,4,5,6,7,8,9,10]\n"
|
|
204
|
-
print(headson.summarize(doc, format="yaml", style="default", input_format="yaml",
|
|
209
|
+
print(headson.summarize(doc, format="yaml", style="default", input_format="yaml", byte_budget=60))
|
|
205
210
|
```
|
|
206
211
|
|
|
207
212
|
# Algorithm
|
|
@@ -211,7 +216,7 @@ print(headson.summarize(doc, format="yaml", style="default", input_format="yaml"
|
|
|
211
216
|
## Footnotes
|
|
212
217
|
- <sup><b>[1]</b></sup> <b>Optimized tree representation</b>: An arena‑style tree stored in flat, contiguous buffers. Each node records its kind and value plus index ranges into shared child and key arrays. Arrays are ingested in a single pass and may be deterministically pre‑sampled: the first element is always kept; additional elements are selected via a fixed per‑index inclusion test; for kept elements, original indices are stored and full lengths are counted. This enables accurate omission info and internal gap markers later, while minimizing pointer chasing.
|
|
213
218
|
- <sup><b>[2]</b></sup> <b>Priority order</b>: Nodes are scored so previews surface representative structure and values first. Arrays can favor head/mid/tail coverage (default) or strictly the head; tail preference flips head/tail when configured. Object properties are ordered by key, and strings expand by grapheme with early characters prioritized over very deep expansions.
|
|
214
|
-
- <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the
|
|
219
|
+
- <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the byte budget, looping between “choose N” and a render attempt to converge quickly.
|
|
215
220
|
- <sup><b>[4]</b></sup> <b>Render attempt</b>: Serializes the currently included nodes using the selected template. Omission summaries and per-file section headers appear in display templates (pseudo/js); json remains strict. For arrays, display templates may insert internal gap markers between non‑contiguous kept items using original indices.
|
|
216
221
|
- <sup><b>[5]</b></sup> <b>Diagram source</b>: The Algorithm diagram is generated from `docs/diagrams/algorithm.mmd`. Regenerate the SVG with `cargo make diagrams` before releasing.
|
|
217
222
|
|
|
Binary file
|
|
@@ -214,7 +214,7 @@ dependencies = [
|
|
|
214
214
|
|
|
215
215
|
[[package]]
|
|
216
216
|
name = "headson"
|
|
217
|
-
version = "0.6.
|
|
217
|
+
version = "0.6.5"
|
|
218
218
|
dependencies = [
|
|
219
219
|
"anyhow",
|
|
220
220
|
"clap",
|
|
@@ -228,7 +228,7 @@ dependencies = [
|
|
|
228
228
|
|
|
229
229
|
[[package]]
|
|
230
230
|
name = "headson-python"
|
|
231
|
-
version = "0.6.
|
|
231
|
+
version = "0.6.5"
|
|
232
232
|
dependencies = [
|
|
233
233
|
"anyhow",
|
|
234
234
|
"headson",
|
|
@@ -4,11 +4,11 @@ Minimal Python API for the `headson` preview renderer.
|
|
|
4
4
|
|
|
5
5
|
API
|
|
6
6
|
|
|
7
|
-
- `headson.summarize(text: str, *, format: str = "auto", style: str = "default", input_format: str = "json",
|
|
7
|
+
- `headson.summarize(text: str, *, format: str = "auto", style: str = "default", input_format: str = "json", byte_budget: int | None = None, skew: str = "balanced") -> str`
|
|
8
8
|
- `format`: output format — `"auto" | "json" | "yaml" | "text"`.
|
|
9
9
|
- `style`: output style — `"strict" | "default" | "detailed"`.
|
|
10
10
|
- `input_format`: ingestion format — `"json" | "yaml" | "text"`.
|
|
11
|
-
- `
|
|
11
|
+
- `byte_budget`: maximum output size in bytes (defaults to 500 if not set).
|
|
12
12
|
- `skew`: one of `"balanced" | "head" | "tail"`.
|
|
13
13
|
- `balanced` (default), `head` keeps first N, `tail` keeps last N. Display styles place omission markers accordingly; strict JSON remains unannotated.
|
|
14
14
|
- Notes:
|
|
@@ -20,26 +20,26 @@ Examples:
|
|
|
20
20
|
import headson
|
|
21
21
|
|
|
22
22
|
# Human-friendly JSON (Pseudo) with a small budget
|
|
23
|
-
print(headson.summarize('{"a": 1, "b": [1,2,3]}', format="json", style="default",
|
|
23
|
+
print(headson.summarize('{"a": 1, "b": [1,2,3]}', format="json", style="default", byte_budget=80))
|
|
24
24
|
|
|
25
25
|
# Strict JSON stays valid JSON
|
|
26
|
-
print(headson.summarize('{"a": 1, "b": {"c": 2}}', format="json", style="strict",
|
|
26
|
+
print(headson.summarize('{"a": 1, "b": {"c": 2}}', format="json", style="strict", byte_budget=10_000))
|
|
27
27
|
|
|
28
28
|
# Annotated JSON (JS) with tail skew: prefer the end of arrays when truncating
|
|
29
29
|
arr = ','.join(str(i) for i in range(100))
|
|
30
|
-
print(headson.summarize('{"arr": [' + arr + ']}', format="json", style="detailed",
|
|
30
|
+
print(headson.summarize('{"arr": [' + arr + ']}', format="json", style="detailed", byte_budget=60, skew="tail"))
|
|
31
31
|
|
|
32
32
|
# YAML styles: strict (no comments), default (… comments), detailed (counts)
|
|
33
33
|
doc = 'root:\n items: [1,2,3,4,5,6,7,8,9,10]\n'
|
|
34
|
-
print(headson.summarize(doc, format="yaml", style="strict", input_format="yaml",
|
|
35
|
-
print(headson.summarize(doc, format="yaml", style="default", input_format="yaml",
|
|
36
|
-
print(headson.summarize(doc, format="yaml", style="detailed", input_format="yaml",
|
|
34
|
+
print(headson.summarize(doc, format="yaml", style="strict", input_format="yaml", byte_budget=60))
|
|
35
|
+
print(headson.summarize(doc, format="yaml", style="default", input_format="yaml", byte_budget=60))
|
|
36
|
+
print(headson.summarize(doc, format="yaml", style="detailed", input_format="yaml", byte_budget=60))
|
|
37
37
|
|
|
38
38
|
# Note: tail mode affects only display styles; strict JSON stays strict.
|
|
39
39
|
|
|
40
40
|
# Text: render raw lines with omission markers depending on style
|
|
41
41
|
text = "one\ntwo\nthree\n"
|
|
42
|
-
print(headson.summarize(text, format="text", style="default", input_format="text",
|
|
42
|
+
print(headson.summarize(text, format="text", style="default", input_format="text", byte_budget=10))
|
|
43
43
|
```
|
|
44
44
|
|
|
45
45
|
Install for development:
|
|
@@ -59,6 +59,7 @@ fn render_config_with_sampler(
|
|
|
59
59
|
color_mode: ColorMode::Auto,
|
|
60
60
|
color_enabled: false,
|
|
61
61
|
style: s,
|
|
62
|
+
string_free_prefix_graphemes: None,
|
|
62
63
|
})
|
|
63
64
|
}
|
|
64
65
|
|
|
@@ -85,6 +86,7 @@ fn priority_config(
|
|
|
85
86
|
prefer_tail_arrays,
|
|
86
87
|
array_bias: headson_core::ArrayBias::HeadMidTail,
|
|
87
88
|
array_sampler: sampler,
|
|
89
|
+
line_budget_only: false,
|
|
88
90
|
}
|
|
89
91
|
}
|
|
90
92
|
|
|
@@ -93,19 +95,19 @@ fn to_pyerr(e: anyhow::Error) -> PyErr {
|
|
|
93
95
|
}
|
|
94
96
|
|
|
95
97
|
#[pyfunction]
|
|
96
|
-
#[pyo3(signature = (text, *, format="auto", style="default",
|
|
98
|
+
#[pyo3(signature = (text, *, format="auto", style="default", byte_budget=None, skew="balanced", input_format="json"))]
|
|
97
99
|
fn summarize(
|
|
98
100
|
py: Python<'_>,
|
|
99
101
|
text: &str,
|
|
100
102
|
format: &str,
|
|
101
103
|
style: &str,
|
|
102
|
-
|
|
104
|
+
byte_budget: Option<usize>,
|
|
103
105
|
skew: &str,
|
|
104
106
|
input_format: &str,
|
|
105
107
|
) -> PyResult<String> {
|
|
106
108
|
let sampler = parse_skew(skew).map_err(to_pyerr)?;
|
|
107
109
|
let cfg = render_config_with_sampler(format, style, sampler).map_err(to_pyerr)?;
|
|
108
|
-
let budget =
|
|
110
|
+
let budget = byte_budget.unwrap_or(500);
|
|
109
111
|
let per_file_for_priority = budget.max(1);
|
|
110
112
|
let prio = priority_config(per_file_for_priority, sampler);
|
|
111
113
|
let input = text.as_bytes().to_vec();
|
|
@@ -5,7 +5,7 @@ use std::cell::RefCell;
|
|
|
5
5
|
use crate::order::NodeKind;
|
|
6
6
|
use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
|
|
7
7
|
|
|
8
|
-
use
|
|
8
|
+
use crate::ingest::sampling::ArraySamplerKind;
|
|
9
9
|
|
|
10
10
|
#[derive(Default)]
|
|
11
11
|
pub(crate) struct JsonTreeBuilder {
|
|
@@ -15,10 +15,7 @@ pub(crate) struct JsonTreeBuilder {
|
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
impl JsonTreeBuilder {
|
|
18
|
-
pub(crate) fn new(
|
|
19
|
-
array_cap: usize,
|
|
20
|
-
sampler: super::samplers::ArraySamplerKind,
|
|
21
|
-
) -> Self {
|
|
18
|
+
pub(crate) fn new(array_cap: usize, sampler: ArraySamplerKind) -> Self {
|
|
22
19
|
Self {
|
|
23
20
|
arena: RefCell::new(JsonTreeArena::default()),
|
|
24
21
|
array_cap,
|
|
@@ -1,24 +1,27 @@
|
|
|
1
1
|
mod builder;
|
|
2
2
|
mod samplers;
|
|
3
|
-
use serde::de::DeserializeSeed;
|
|
4
3
|
|
|
5
|
-
use crate::PriorityConfig;
|
|
6
|
-
use crate::utils::tree_arena::JsonTreeArena;
|
|
7
4
|
use anyhow::Result;
|
|
8
5
|
use builder::JsonTreeBuilder;
|
|
6
|
+
use serde::de::DeserializeSeed;
|
|
7
|
+
|
|
8
|
+
use crate::PriorityConfig;
|
|
9
|
+
use crate::utils::tree_arena::JsonTreeArena as TreeArena;
|
|
10
|
+
|
|
11
|
+
use crate::ingest::Ingest;
|
|
9
12
|
|
|
10
13
|
#[cfg(test)]
|
|
11
14
|
pub fn build_json_tree_arena(
|
|
12
15
|
input: &str,
|
|
13
16
|
config: &PriorityConfig,
|
|
14
|
-
) -> Result<
|
|
17
|
+
) -> Result<TreeArena> {
|
|
15
18
|
build_json_tree_arena_from_bytes(input.as_bytes().to_vec(), config)
|
|
16
19
|
}
|
|
17
20
|
|
|
18
21
|
pub fn build_json_tree_arena_from_bytes(
|
|
19
22
|
mut bytes: Vec<u8>,
|
|
20
23
|
config: &PriorityConfig,
|
|
21
|
-
) -> Result<
|
|
24
|
+
) -> Result<TreeArena> {
|
|
22
25
|
let mut de = simd_json::Deserializer::from_slice(&mut bytes)?;
|
|
23
26
|
let builder = JsonTreeBuilder::new(
|
|
24
27
|
config.array_max_items,
|
|
@@ -36,7 +39,7 @@ pub fn build_json_tree_arena_from_bytes(
|
|
|
36
39
|
pub fn build_json_tree_arena_from_many(
|
|
37
40
|
mut inputs: Vec<(String, Vec<u8>)>,
|
|
38
41
|
config: &PriorityConfig,
|
|
39
|
-
) -> Result<
|
|
42
|
+
) -> Result<TreeArena> {
|
|
40
43
|
let builder = JsonTreeBuilder::new(
|
|
41
44
|
config.array_max_items,
|
|
42
45
|
config.array_sampler.into(),
|
|
@@ -57,6 +60,38 @@ pub fn build_json_tree_arena_from_many(
|
|
|
57
60
|
Ok(arena)
|
|
58
61
|
}
|
|
59
62
|
|
|
63
|
+
/// JSON adapter for the ingest boundary. Delegates to the JSON builder to
|
|
64
|
+
/// produce the neutral `TreeArena`.
|
|
65
|
+
pub struct JsonIngest;
|
|
66
|
+
|
|
67
|
+
impl Ingest for JsonIngest {
|
|
68
|
+
fn parse_one(bytes: Vec<u8>, cfg: &PriorityConfig) -> Result<TreeArena> {
|
|
69
|
+
build_json_tree_arena_from_bytes(bytes, cfg)
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
fn parse_many(
|
|
73
|
+
inputs: Vec<(String, Vec<u8>)>,
|
|
74
|
+
cfg: &PriorityConfig,
|
|
75
|
+
) -> Result<TreeArena> {
|
|
76
|
+
build_json_tree_arena_from_many(inputs, cfg)
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/// Convenience functions for the JSON ingest path.
|
|
81
|
+
pub fn parse_json_one(
|
|
82
|
+
bytes: Vec<u8>,
|
|
83
|
+
cfg: &PriorityConfig,
|
|
84
|
+
) -> Result<TreeArena> {
|
|
85
|
+
JsonIngest::parse_one(bytes, cfg)
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
pub fn parse_json_many(
|
|
89
|
+
inputs: Vec<(String, Vec<u8>)>,
|
|
90
|
+
cfg: &PriorityConfig,
|
|
91
|
+
) -> Result<TreeArena> {
|
|
92
|
+
JsonIngest::parse_many(inputs, cfg)
|
|
93
|
+
}
|
|
94
|
+
|
|
60
95
|
#[cfg(test)]
|
|
61
96
|
mod tests {
|
|
62
97
|
use super::*;
|
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
use serde::de::{IgnoredAny, SeqAccess};
|
|
2
2
|
|
|
3
|
-
use super::
|
|
3
|
+
use super::JsonTreeBuilder;
|
|
4
|
+
use super::SampledArray;
|
|
4
5
|
|
|
5
|
-
//
|
|
6
|
+
// Default strategy phases: keep-first, greedy, then index-hash acceptance (~50%).
|
|
6
7
|
const RANDOM_ACCEPT_SEED: u64 = 0x9e37_79b9_7f4a_7c15;
|
|
7
|
-
|
|
8
|
-
const RANDOM_ACCEPT_THRESHOLD: u32 = 0x8000_0000;
|
|
9
|
-
// Keep a small, fixed number of items from the head before greedy/random phases.
|
|
8
|
+
const RANDOM_ACCEPT_THRESHOLD: u32 = 0x8000_0000; // ~50%
|
|
10
9
|
const KEEP_FIRST_COUNT: usize = 3;
|
|
11
|
-
// Take roughly half of the remaining capacity greedily after the first items.
|
|
12
10
|
const GREEDY_PORTION_DIVISOR: usize = 2;
|
|
13
11
|
|
|
14
12
|
struct PhaseState {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
use serde::de::SeqAccess;
|
|
2
2
|
|
|
3
|
-
use crate::
|
|
4
|
-
use crate::
|
|
3
|
+
use crate::ingest::formats::json::builder::JsonTreeBuilder;
|
|
4
|
+
use crate::ingest::sampling::ArraySamplerKind;
|
|
5
5
|
|
|
6
6
|
#[derive(Debug)]
|
|
7
7
|
pub(crate) struct SampledArray {
|
|
@@ -10,14 +10,6 @@ pub(crate) struct SampledArray {
|
|
|
10
10
|
pub total_len: usize,
|
|
11
11
|
}
|
|
12
12
|
|
|
13
|
-
#[derive(Copy, Clone, Debug, Default)]
|
|
14
|
-
pub(crate) enum ArraySamplerKind {
|
|
15
|
-
#[default]
|
|
16
|
-
Default,
|
|
17
|
-
Head,
|
|
18
|
-
Tail,
|
|
19
|
-
}
|
|
20
|
-
|
|
21
13
|
impl ArraySamplerKind {
|
|
22
14
|
pub(crate) fn sample_stream<'de, A>(
|
|
23
15
|
self,
|
|
@@ -38,16 +30,6 @@ impl ArraySamplerKind {
|
|
|
38
30
|
}
|
|
39
31
|
}
|
|
40
32
|
|
|
41
|
-
impl From<ArraySamplerStrategy> for ArraySamplerKind {
|
|
42
|
-
fn from(strategy: ArraySamplerStrategy) -> Self {
|
|
43
|
-
match strategy {
|
|
44
|
-
ArraySamplerStrategy::Default => ArraySamplerKind::Default,
|
|
45
|
-
ArraySamplerStrategy::Head => ArraySamplerKind::Head,
|
|
46
|
-
ArraySamplerStrategy::Tail => ArraySamplerKind::Tail,
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
|
|
51
33
|
mod default;
|
|
52
34
|
mod head;
|
|
53
35
|
mod tail;
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
use serde::de::{IgnoredAny, SeqAccess};
|
|
2
2
|
|
|
3
|
-
use super::
|
|
3
|
+
use super::JsonTreeBuilder;
|
|
4
|
+
use super::SampledArray;
|
|
4
5
|
|
|
5
6
|
pub(crate) fn sample_stream<'de, A>(
|
|
6
7
|
seq: &mut A,
|
|
@@ -89,8 +90,10 @@ mod tests {
|
|
|
89
90
|
let mut cfg = PriorityConfig::new(usize::MAX, 5);
|
|
90
91
|
cfg.array_sampler = crate::ArraySamplerStrategy::Tail;
|
|
91
92
|
let arena =
|
|
92
|
-
crate::
|
|
93
|
-
|
|
93
|
+
crate::ingest::formats::json::build_json_tree_arena_from_bytes(
|
|
94
|
+
input, &cfg,
|
|
95
|
+
)
|
|
96
|
+
.expect("arena");
|
|
94
97
|
let root = &arena.nodes[arena.root_id];
|
|
95
98
|
assert_eq!(root.children_len, 5, "kept 5");
|
|
96
99
|
let mut orig_indices = Vec::new();
|