headson 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of headson might be problematic. Click here for more details.
- {headson-0.4.0 → headson-0.5.0}/Cargo.lock +1 -1
- {headson-0.4.0 → headson-0.5.0}/Cargo.toml +1 -1
- {headson-0.4.0 → headson-0.5.0}/PKG-INFO +52 -5
- {headson-0.4.0 → headson-0.5.0}/README.md +51 -4
- {headson-0.4.0 → headson-0.5.0}/pyproject.toml +1 -1
- {headson-0.4.0 → headson-0.5.0}/python/Cargo.lock +2 -2
- {headson-0.4.0 → headson-0.5.0}/python/Cargo.toml +1 -1
- {headson-0.4.0 → headson-0.5.0}/python/README.md +7 -4
- headson-0.5.0/python/headson/__init__.py +6 -0
- {headson-0.4.0 → headson-0.5.0}/python/src/lib.rs +31 -14
- {headson-0.4.0 → headson-0.5.0}/src/json_ingest/builder.rs +43 -25
- {headson-0.4.0 → headson-0.5.0}/src/json_ingest/mod.rs +10 -3
- headson-0.5.0/src/json_ingest/samplers/default.rs +219 -0
- headson-0.5.0/src/json_ingest/samplers/head.rs +79 -0
- headson-0.5.0/src/json_ingest/samplers/mod.rs +53 -0
- headson-0.5.0/src/json_ingest/samplers/tail.rs +107 -0
- {headson-0.4.0 → headson-0.5.0}/src/lib.rs +1 -1
- {headson-0.4.0 → headson-0.5.0}/src/main.rs +15 -0
- {headson-0.4.0 → headson-0.5.0}/src/order/build.rs +83 -37
- {headson-0.4.0 → headson-0.5.0}/src/order/types.rs +22 -0
- headson-0.5.0/src/serialization/fileset.rs +164 -0
- {headson-0.4.0 → headson-0.5.0}/src/serialization/mod.rs +51 -156
- {headson-0.4.0 → headson-0.5.0}/src/serialization/templates/core.rs +16 -3
- {headson-0.4.0 → headson-0.5.0}/src/serialization/templates/js.rs +11 -0
- {headson-0.4.0 → headson-0.5.0}/src/serialization/templates/pseudo.rs +9 -0
- {headson-0.4.0 → headson-0.5.0}/src/utils/tree_arena.rs +9 -0
- headson-0.4.0/python/headson/__init__.py +0 -4
- {headson-0.4.0 → headson-0.5.0}/JSONTestSuite/LICENSE +0 -0
- {headson-0.4.0 → headson-0.5.0}/JSONTestSuite/README.md +0 -0
- {headson-0.4.0 → headson-0.5.0}/LICENSE +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/order/mod.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/order/scoring.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/order/snapshots/headson__order__build__tests__order_empty_array_order.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/order/snapshots/headson__order__build__tests__order_single_string_array_order.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/serialization/snapshots/headson__serialization__tests__arena_render_single.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/serialization/templates/json.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/serialization/templates/mod.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/serialization/types.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__order__tests__order_empty_array_order.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__order__tests__order_single_string_array_order.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__order__tests__pq_empty_array_queue.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__order__tests__pq_single_string_array_queue.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__queue__tests__pq_empty_array_queue.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__queue__tests__pq_single_string_array_queue.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__tree__tests__build_tree_empty.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/snapshots/headson__tree__tests__build_tree_single.snap +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/utils/graph.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/utils/json.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/utils/mod.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/utils/search.rs +0 -0
- {headson-0.4.0 → headson-0.5.0}/src/utils/text.rs +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: headson
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Classifier: Programming Language :: Python
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Rust
|
|
@@ -66,7 +66,8 @@ Common flags:
|
|
|
66
66
|
- `--no-space`: no space after `:` in objects
|
|
67
67
|
- `--indent <STR>`: indentation unit (default: two spaces)
|
|
68
68
|
- `--string-cap <N>`: max graphemes to consider per string (default: 500)
|
|
69
|
-
- `--
|
|
69
|
+
- `--head`: prefer the beginning of arrays when truncating (keep first N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears near the end; `json` remains strict. Mutually exclusive with `--tail`.
|
|
70
|
+
- `--tail`: prefer the end of arrays when truncating (keep last N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears at the start; `json` remains strict. Mutually exclusive with `--head`.
|
|
70
71
|
|
|
71
72
|
Notes:
|
|
72
73
|
|
|
@@ -77,6 +78,7 @@ Notes:
|
|
|
77
78
|
- Using `--global-budget` may truncate or omit entire files to respect the total budget.
|
|
78
79
|
- The tool finds the largest preview that fits the budget; if even the tiniest preview exceeds it, you still get a minimal, valid preview.
|
|
79
80
|
- When passing file paths, directories and binary files are ignored; a notice is printed to stderr for each (e.g., `Ignored binary file: ./path/to/file`). Stdin mode reads the stream as-is.
|
|
81
|
+
- Head vs Tail sampling: these options bias which part of arrays are kept before rendering. They guarantee the kept segment is contiguous at the chosen side (prefix for `--head`, suffix for `--tail`). Display templates may still insert additional internal gap markers inside that kept segment to honor very small budgets; `json` remains strict and unannotated.
|
|
80
82
|
|
|
81
83
|
Quick one‑liners:
|
|
82
84
|
|
|
@@ -137,10 +139,10 @@ A thin Python extension module is available on PyPI as `headson`.
|
|
|
137
139
|
|
|
138
140
|
- Install: `pip install headson` (ABI3 wheels for Python 3.10+ on Linux/macOS/Windows).
|
|
139
141
|
- API:
|
|
140
|
-
- `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None,
|
|
142
|
+
- `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, skew: str = "balanced") -> str`
|
|
141
143
|
- `template`: one of `"json" | "pseudo" | "js"`
|
|
142
144
|
- `character_budget`: maximum output size in characters (default: 500)
|
|
143
|
-
|
|
145
|
+
- `skew`: one of `"balanced" | "head" | "tail"` (focus arrays on start vs end; only affects display templates; `json` remains strict).
|
|
144
146
|
|
|
145
147
|
Example:
|
|
146
148
|
|
|
@@ -158,11 +160,56 @@ print(
|
|
|
158
160
|
json.dumps(list(range(100))),
|
|
159
161
|
template="pseudo",
|
|
160
162
|
character_budget=80,
|
|
161
|
-
tail
|
|
163
|
+
skew="tail",
|
|
162
164
|
)
|
|
163
165
|
)
|
|
164
166
|
```
|
|
165
167
|
|
|
168
|
+
# Algorithm
|
|
169
|
+
|
|
170
|
+
```mermaid
|
|
171
|
+
%%{init: {"themeCSS": ".cluster > rect { fill: transparent; stroke: transparent; } .clusterLabel > text { font-size: 16px; font-weight: 600; } .clusterLabel span { padding: 6px 10px; font-size: 16px; font-weight: 600; }"}}%%
|
|
172
|
+
flowchart TD
|
|
173
|
+
subgraph Deserialization
|
|
174
|
+
direction TB
|
|
175
|
+
A["Input file(s)"]
|
|
176
|
+
A -- Single --> C["Parse into optimized tree (with array pre‑sampling) ¹"]
|
|
177
|
+
A -- Multiple --> D["Parse each file and wrap into a fileset object"]
|
|
178
|
+
D --> C
|
|
179
|
+
end
|
|
180
|
+
subgraph Prioritization
|
|
181
|
+
direction TB
|
|
182
|
+
E["Build priority order ²"]
|
|
183
|
+
F["Choose top N nodes ³"]
|
|
184
|
+
end
|
|
185
|
+
subgraph Serialization
|
|
186
|
+
direction TB
|
|
187
|
+
G["Render attempt ⁴"]
|
|
188
|
+
H["Output preview string"]
|
|
189
|
+
end
|
|
190
|
+
C --> E
|
|
191
|
+
E --> F
|
|
192
|
+
F --> G
|
|
193
|
+
G --> F
|
|
194
|
+
F --> H
|
|
195
|
+
%% Color classes for categories
|
|
196
|
+
classDef des fill:#eaf2ff,stroke:#3b82f6,stroke-width:1px,color:#0f172a;
|
|
197
|
+
classDef prio fill:#ecfdf5,stroke:#10b981,stroke-width:1px,color:#064e3b;
|
|
198
|
+
classDef ser fill:#fff1f2,stroke:#f43f5e,stroke-width:1px,color:#7f1d1d;
|
|
199
|
+
class A,C,D des;
|
|
200
|
+
class E,F prio;
|
|
201
|
+
class G,H ser;
|
|
202
|
+
style Deserialization fill:transparent,stroke:transparent
|
|
203
|
+
style Prioritization fill:transparent,stroke:transparent
|
|
204
|
+
style Serialization fill:transparent,stroke:transparent
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Footnotes
|
|
208
|
+
- <sup><b>[1]</b></sup> <b>Optimized tree representation</b>: An arena‑style tree stored in flat, contiguous buffers. Each node records its kind and value plus index ranges into shared child and key arrays. Arrays are ingested in a single pass and may be deterministically pre‑sampled: the first element is always kept; additional elements are selected via a fixed per‑index inclusion test; for kept elements, original indices are stored and full lengths are counted. This enables accurate omission info and internal gap markers later, while minimizing pointer chasing.
|
|
209
|
+
- <sup><b>[2]</b></sup> <b>Priority order</b>: Nodes are scored so previews surface representative structure and values first. Arrays can favor head/mid/tail coverage (default) or strictly the head; tail preference flips head/tail when configured. Object properties are ordered by key, and strings expand by grapheme with early characters prioritized over very deep expansions.
|
|
210
|
+
- <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the character budget, looping between “choose N” and a render attempt to converge quickly.
|
|
211
|
+
- <sup><b>[4]</b></sup> <b>Render attempt</b>: Serializes the currently included nodes using the selected template. Omission summaries and per-file section headers appear in display templates (pseudo/js); json remains strict. For arrays, display templates may insert internal gap markers between non‑contiguous kept items using original indices.
|
|
212
|
+
|
|
166
213
|
## License
|
|
167
214
|
|
|
168
215
|
MIT
|
|
@@ -51,7 +51,8 @@ Common flags:
|
|
|
51
51
|
- `--no-space`: no space after `:` in objects
|
|
52
52
|
- `--indent <STR>`: indentation unit (default: two spaces)
|
|
53
53
|
- `--string-cap <N>`: max graphemes to consider per string (default: 500)
|
|
54
|
-
- `--
|
|
54
|
+
- `--head`: prefer the beginning of arrays when truncating (keep first N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears near the end; `json` remains strict. Mutually exclusive with `--tail`.
|
|
55
|
+
- `--tail`: prefer the end of arrays when truncating (keep last N). Strings are unaffected. In `pseudo`/`js` templates the omission marker appears at the start; `json` remains strict. Mutually exclusive with `--head`.
|
|
55
56
|
|
|
56
57
|
Notes:
|
|
57
58
|
|
|
@@ -62,6 +63,7 @@ Notes:
|
|
|
62
63
|
- Using `--global-budget` may truncate or omit entire files to respect the total budget.
|
|
63
64
|
- The tool finds the largest preview that fits the budget; if even the tiniest preview exceeds it, you still get a minimal, valid preview.
|
|
64
65
|
- When passing file paths, directories and binary files are ignored; a notice is printed to stderr for each (e.g., `Ignored binary file: ./path/to/file`). Stdin mode reads the stream as-is.
|
|
66
|
+
- Head vs Tail sampling: these options bias which part of arrays are kept before rendering. They guarantee the kept segment is contiguous at the chosen side (prefix for `--head`, suffix for `--tail`). Display templates may still insert additional internal gap markers inside that kept segment to honor very small budgets; `json` remains strict and unannotated.
|
|
65
67
|
|
|
66
68
|
Quick one‑liners:
|
|
67
69
|
|
|
@@ -122,10 +124,10 @@ A thin Python extension module is available on PyPI as `headson`.
|
|
|
122
124
|
|
|
123
125
|
- Install: `pip install headson` (ABI3 wheels for Python 3.10+ on Linux/macOS/Windows).
|
|
124
126
|
- API:
|
|
125
|
-
- `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None,
|
|
127
|
+
- `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, skew: str = "balanced") -> str`
|
|
126
128
|
- `template`: one of `"json" | "pseudo" | "js"`
|
|
127
129
|
- `character_budget`: maximum output size in characters (default: 500)
|
|
128
|
-
|
|
130
|
+
- `skew`: one of `"balanced" | "head" | "tail"` (focus arrays on start vs end; only affects display templates; `json` remains strict).
|
|
129
131
|
|
|
130
132
|
Example:
|
|
131
133
|
|
|
@@ -143,11 +145,56 @@ print(
|
|
|
143
145
|
json.dumps(list(range(100))),
|
|
144
146
|
template="pseudo",
|
|
145
147
|
character_budget=80,
|
|
146
|
-
tail
|
|
148
|
+
skew="tail",
|
|
147
149
|
)
|
|
148
150
|
)
|
|
149
151
|
```
|
|
150
152
|
|
|
153
|
+
# Algorithm
|
|
154
|
+
|
|
155
|
+
```mermaid
|
|
156
|
+
%%{init: {"themeCSS": ".cluster > rect { fill: transparent; stroke: transparent; } .clusterLabel > text { font-size: 16px; font-weight: 600; } .clusterLabel span { padding: 6px 10px; font-size: 16px; font-weight: 600; }"}}%%
|
|
157
|
+
flowchart TD
|
|
158
|
+
subgraph Deserialization
|
|
159
|
+
direction TB
|
|
160
|
+
A["Input file(s)"]
|
|
161
|
+
A -- Single --> C["Parse into optimized tree (with array pre‑sampling) ¹"]
|
|
162
|
+
A -- Multiple --> D["Parse each file and wrap into a fileset object"]
|
|
163
|
+
D --> C
|
|
164
|
+
end
|
|
165
|
+
subgraph Prioritization
|
|
166
|
+
direction TB
|
|
167
|
+
E["Build priority order ²"]
|
|
168
|
+
F["Choose top N nodes ³"]
|
|
169
|
+
end
|
|
170
|
+
subgraph Serialization
|
|
171
|
+
direction TB
|
|
172
|
+
G["Render attempt ⁴"]
|
|
173
|
+
H["Output preview string"]
|
|
174
|
+
end
|
|
175
|
+
C --> E
|
|
176
|
+
E --> F
|
|
177
|
+
F --> G
|
|
178
|
+
G --> F
|
|
179
|
+
F --> H
|
|
180
|
+
%% Color classes for categories
|
|
181
|
+
classDef des fill:#eaf2ff,stroke:#3b82f6,stroke-width:1px,color:#0f172a;
|
|
182
|
+
classDef prio fill:#ecfdf5,stroke:#10b981,stroke-width:1px,color:#064e3b;
|
|
183
|
+
classDef ser fill:#fff1f2,stroke:#f43f5e,stroke-width:1px,color:#7f1d1d;
|
|
184
|
+
class A,C,D des;
|
|
185
|
+
class E,F prio;
|
|
186
|
+
class G,H ser;
|
|
187
|
+
style Deserialization fill:transparent,stroke:transparent
|
|
188
|
+
style Prioritization fill:transparent,stroke:transparent
|
|
189
|
+
style Serialization fill:transparent,stroke:transparent
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Footnotes
|
|
193
|
+
- <sup><b>[1]</b></sup> <b>Optimized tree representation</b>: An arena‑style tree stored in flat, contiguous buffers. Each node records its kind and value plus index ranges into shared child and key arrays. Arrays are ingested in a single pass and may be deterministically pre‑sampled: the first element is always kept; additional elements are selected via a fixed per‑index inclusion test; for kept elements, original indices are stored and full lengths are counted. This enables accurate omission info and internal gap markers later, while minimizing pointer chasing.
|
|
194
|
+
- <sup><b>[2]</b></sup> <b>Priority order</b>: Nodes are scored so previews surface representative structure and values first. Arrays can favor head/mid/tail coverage (default) or strictly the head; tail preference flips head/tail when configured. Object properties are ordered by key, and strings expand by grapheme with early characters prioritized over very deep expansions.
|
|
195
|
+
- <sup><b>[3]</b></sup> <b>Choose top N nodes (binary search)</b>: Iteratively picks N so that the rendered preview fits within the character budget, looping between “choose N” and a render attempt to converge quickly.
|
|
196
|
+
- <sup><b>[4]</b></sup> <b>Render attempt</b>: Serializes the currently included nodes using the selected template. Omission summaries and per-file section headers appear in display templates (pseudo/js); json remains strict. For arrays, display templates may insert internal gap markers between non‑contiguous kept items using original indices.
|
|
197
|
+
|
|
151
198
|
## License
|
|
152
199
|
|
|
153
200
|
MIT
|
|
@@ -169,7 +169,7 @@ dependencies = [
|
|
|
169
169
|
|
|
170
170
|
[[package]]
|
|
171
171
|
name = "headson"
|
|
172
|
-
version = "0.4.0"
|
|
172
|
+
version = "0.5.0"
|
|
173
173
|
dependencies = [
|
|
174
174
|
"anyhow",
|
|
175
175
|
"clap",
|
|
@@ -182,7 +182,7 @@ dependencies = [
|
|
|
182
182
|
|
|
183
183
|
[[package]]
|
|
184
184
|
name = "headson-python"
|
|
185
|
-
version = "0.4.0"
|
|
185
|
+
version = "0.5.0"
|
|
186
186
|
dependencies = [
|
|
187
187
|
"anyhow",
|
|
188
188
|
"headson",
|
|
@@ -4,10 +4,13 @@ Minimal Python API for the `headson` JSON preview renderer.
|
|
|
4
4
|
|
|
5
5
|
Currently exported function:
|
|
6
6
|
|
|
7
|
-
- `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None,
|
|
7
|
+
- `headson.summarize(text: str, *, template: str = "pseudo", character_budget: int | None = None, skew: str = "balanced") -> str`
|
|
8
8
|
- `template`: one of `"json" | "pseudo" | "js"`.
|
|
9
9
|
- `character_budget`: maximum output size in characters (defaults to 500 if not set).
|
|
10
|
-
- `
|
|
10
|
+
- `skew`: one of `"balanced" | "head" | "tail"`.
|
|
11
|
+
- `balanced`: default behavior.
|
|
12
|
+
- `head`: focus arrays on the beginning (keep first N).
|
|
13
|
+
- `tail`: focus arrays on the end (keep last N); pseudo/js place omission markers at the start.
|
|
11
14
|
|
|
12
15
|
Examples:
|
|
13
16
|
|
|
@@ -20,9 +23,9 @@ print(headson.summarize('{"a": 1, "b": [1,2,3]}', template="pseudo", character_b
|
|
|
20
23
|
# Strict JSON template preserves valid JSON output
|
|
21
24
|
print(headson.summarize('{"a": 1, "b": {"c": 2}}', template="json", character_budget=10_000))
|
|
22
25
|
|
|
23
|
-
# JS template with tail
|
|
26
|
+
# JS template with tail skew: prefer the end of arrays when truncating
|
|
24
27
|
arr = ','.join(str(i) for i in range(100))
|
|
25
|
-
print(headson.summarize('{"arr": [' + arr + ']}', template="js", character_budget=60, tail
|
|
28
|
+
print(headson.summarize('{"arr": [' + arr + ']}', template="js", character_budget=60, skew="tail"))
|
|
26
29
|
|
|
27
30
|
# Note: tail mode affects only pseudo/js display templates; the json template stays strict.
|
|
28
31
|
```
|
|
@@ -2,7 +2,7 @@ use anyhow::{bail, Result};
|
|
|
2
2
|
use pyo3::exceptions::PyRuntimeError;
|
|
3
3
|
use pyo3::prelude::*;
|
|
4
4
|
use pyo3::types::PyModule;
|
|
5
|
-
use headson_core::{OutputTemplate,
|
|
5
|
+
use headson_core::{ArraySamplerStrategy, OutputTemplate, PriorityConfig, RenderConfig};
|
|
6
6
|
|
|
7
7
|
fn to_template(s: &str) -> Result<OutputTemplate> {
|
|
8
8
|
match s.to_ascii_lowercase().as_str() {
|
|
@@ -13,25 +13,41 @@ fn to_template(s: &str) -> Result<OutputTemplate> {
|
|
|
13
13
|
}
|
|
14
14
|
}
|
|
15
15
|
|
|
16
|
-
fn
|
|
16
|
+
fn render_config_with_sampler(
|
|
17
|
+
template: &str,
|
|
18
|
+
sampler: ArraySamplerStrategy,
|
|
19
|
+
) -> Result<RenderConfig> {
|
|
17
20
|
let t = to_template(template)?;
|
|
18
21
|
let space = " ".to_string();
|
|
19
22
|
let newline = "\n".to_string();
|
|
20
23
|
let indent_unit = " ".to_string();
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
24
|
+
let prefer_tail_arrays = matches!(sampler, ArraySamplerStrategy::Tail);
|
|
25
|
+
Ok(RenderConfig { template: t, indent_unit, space, newline, prefer_tail_arrays })
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
fn parse_skew(skew: &str) -> Result<ArraySamplerStrategy> {
|
|
29
|
+
match skew.to_ascii_lowercase().as_str() {
|
|
30
|
+
"balanced" => Ok(ArraySamplerStrategy::Default),
|
|
31
|
+
"head" => Ok(ArraySamplerStrategy::Head),
|
|
32
|
+
"tail" => Ok(ArraySamplerStrategy::Tail),
|
|
33
|
+
other => bail!(
|
|
34
|
+
"unknown skew: {} (expected 'balanced' | 'head' | 'tail')",
|
|
35
|
+
other
|
|
36
|
+
),
|
|
37
|
+
}
|
|
28
38
|
}
|
|
29
39
|
|
|
30
|
-
fn priority_config(
|
|
40
|
+
fn priority_config(
|
|
41
|
+
per_file_budget: usize,
|
|
42
|
+
sampler: ArraySamplerStrategy,
|
|
43
|
+
) -> PriorityConfig {
|
|
44
|
+
let prefer_tail_arrays = matches!(sampler, ArraySamplerStrategy::Tail);
|
|
31
45
|
PriorityConfig {
|
|
32
46
|
max_string_graphemes: 500,
|
|
33
47
|
array_max_items: (per_file_budget / 2).max(1),
|
|
34
48
|
prefer_tail_arrays,
|
|
49
|
+
array_bias: headson_core::ArrayBias::HeadMidTail,
|
|
50
|
+
array_sampler: sampler,
|
|
35
51
|
}
|
|
36
52
|
}
|
|
37
53
|
|
|
@@ -40,18 +56,19 @@ fn to_pyerr(e: anyhow::Error) -> PyErr {
|
|
|
40
56
|
}
|
|
41
57
|
|
|
42
58
|
#[pyfunction]
|
|
43
|
-
#[pyo3(signature = (text, *, template="pseudo", character_budget=None,
|
|
59
|
+
#[pyo3(signature = (text, *, template="pseudo", character_budget=None, skew="balanced"))]
|
|
44
60
|
fn summarize(
|
|
45
61
|
py: Python<'_>,
|
|
46
62
|
text: &str,
|
|
47
63
|
template: &str,
|
|
48
64
|
character_budget: Option<usize>,
|
|
49
|
-
|
|
65
|
+
skew: &str,
|
|
50
66
|
) -> PyResult<String> {
|
|
51
|
-
let
|
|
67
|
+
let sampler = parse_skew(skew).map_err(to_pyerr)?;
|
|
68
|
+
let cfg = render_config_with_sampler(template, sampler).map_err(to_pyerr)?;
|
|
52
69
|
let budget = character_budget.unwrap_or(500);
|
|
53
70
|
let per_file_for_priority = budget.max(1);
|
|
54
|
-
let prio = priority_config(per_file_for_priority,
|
|
71
|
+
let prio = priority_config(per_file_for_priority, sampler);
|
|
55
72
|
let input = text.as_bytes().to_vec();
|
|
56
73
|
py.detach(|| headson_core::headson(input, &cfg, &prio, budget).map_err(to_pyerr))
|
|
57
74
|
}
|
|
@@ -1,21 +1,28 @@
|
|
|
1
1
|
use serde::Deserializer;
|
|
2
|
-
use serde::de::{DeserializeSeed,
|
|
2
|
+
use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
|
|
3
3
|
use std::cell::RefCell;
|
|
4
4
|
|
|
5
5
|
use crate::order::NodeKind;
|
|
6
6
|
use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
|
|
7
7
|
|
|
8
|
+
use super::samplers::ArraySamplerKind;
|
|
9
|
+
|
|
8
10
|
#[derive(Default)]
|
|
9
11
|
pub(crate) struct JsonTreeBuilder {
|
|
10
12
|
arena: RefCell<JsonTreeArena>,
|
|
11
13
|
pub(crate) array_cap: usize,
|
|
14
|
+
sampler: ArraySamplerKind,
|
|
12
15
|
}
|
|
13
16
|
|
|
14
17
|
impl JsonTreeBuilder {
|
|
15
|
-
pub(crate) fn new(
|
|
18
|
+
pub(crate) fn new(
|
|
19
|
+
array_cap: usize,
|
|
20
|
+
sampler: super::samplers::ArraySamplerKind,
|
|
21
|
+
) -> Self {
|
|
16
22
|
Self {
|
|
17
23
|
arena: RefCell::new(JsonTreeArena::default()),
|
|
18
24
|
array_cap,
|
|
25
|
+
sampler,
|
|
19
26
|
}
|
|
20
27
|
}
|
|
21
28
|
|
|
@@ -88,15 +95,34 @@ impl JsonTreeBuilder {
|
|
|
88
95
|
kept: usize,
|
|
89
96
|
total: usize,
|
|
90
97
|
local_children: Vec<usize>,
|
|
98
|
+
local_indices: Vec<usize>,
|
|
91
99
|
) {
|
|
92
100
|
let mut a = self.arena.borrow_mut();
|
|
93
101
|
let children_start = a.children.len();
|
|
94
102
|
a.children.extend(local_children);
|
|
103
|
+
|
|
104
|
+
// Detect contiguous indices 0..kept-1 to skip storing arr_indices data
|
|
105
|
+
let contiguous = local_indices.len() == kept
|
|
106
|
+
&& local_indices.iter().enumerate().all(|(i, &idx)| idx == i);
|
|
107
|
+
|
|
108
|
+
let (arr_indices_start, pushed_len) =
|
|
109
|
+
if kept == 0 || contiguous || local_indices.is_empty() {
|
|
110
|
+
(0usize, 0usize)
|
|
111
|
+
} else {
|
|
112
|
+
let start = a.arr_indices.len();
|
|
113
|
+
a.arr_indices.extend(local_indices);
|
|
114
|
+
let pushed = a.arr_indices.len().saturating_sub(start);
|
|
115
|
+
(start, pushed)
|
|
116
|
+
};
|
|
117
|
+
|
|
95
118
|
let n = &mut a.nodes[id];
|
|
96
119
|
n.kind = NodeKind::Array;
|
|
97
120
|
n.children_start = children_start;
|
|
98
121
|
n.children_len = kept;
|
|
99
122
|
n.array_len = Some(total);
|
|
123
|
+
n.arr_indices_start = arr_indices_start;
|
|
124
|
+
// When no indices were pushed, mark len=0 to indicate contiguous 0..kept
|
|
125
|
+
n.arr_indices_len = pushed_len.min(kept);
|
|
100
126
|
}
|
|
101
127
|
|
|
102
128
|
fn finish_object(
|
|
@@ -205,29 +231,19 @@ impl<'de> Visitor<'de> for NodeVisitor<'_> {
|
|
|
205
231
|
A: SeqAccess<'de>,
|
|
206
232
|
{
|
|
207
233
|
let id = self.b.push_default();
|
|
208
|
-
let
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
kept += 1;
|
|
222
|
-
total += 1;
|
|
223
|
-
}
|
|
224
|
-
None => break,
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
while (seq.next_element::<IgnoredAny>()?).is_some() {
|
|
228
|
-
total += 1;
|
|
229
|
-
}
|
|
230
|
-
self.b.finish_array(id, kept, total, local_children);
|
|
234
|
+
let sampled = self.b.sampler.sample_stream(
|
|
235
|
+
&mut seq,
|
|
236
|
+
self.b,
|
|
237
|
+
self.b.array_cap,
|
|
238
|
+
)?;
|
|
239
|
+
let kept = sampled.children.len();
|
|
240
|
+
self.b.finish_array(
|
|
241
|
+
id,
|
|
242
|
+
kept,
|
|
243
|
+
sampled.total_len,
|
|
244
|
+
sampled.children,
|
|
245
|
+
sampled.indices,
|
|
246
|
+
);
|
|
231
247
|
Ok(id)
|
|
232
248
|
}
|
|
233
249
|
|
|
@@ -255,3 +271,5 @@ impl<'de> Visitor<'de> for NodeVisitor<'_> {
|
|
|
255
271
|
Ok(id)
|
|
256
272
|
}
|
|
257
273
|
}
|
|
274
|
+
|
|
275
|
+
impl JsonTreeBuilder {}
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
mod builder;
|
|
2
|
+
mod samplers;
|
|
2
3
|
use serde::de::DeserializeSeed;
|
|
3
4
|
|
|
4
|
-
use crate::
|
|
5
|
+
use crate::PriorityConfig;
|
|
5
6
|
use crate::utils::tree_arena::JsonTreeArena;
|
|
6
7
|
use anyhow::Result;
|
|
7
8
|
use builder::JsonTreeBuilder;
|
|
@@ -19,7 +20,10 @@ pub fn build_json_tree_arena_from_bytes(
|
|
|
19
20
|
config: &PriorityConfig,
|
|
20
21
|
) -> Result<JsonTreeArena> {
|
|
21
22
|
let mut de = simd_json::Deserializer::from_slice(&mut bytes)?;
|
|
22
|
-
let builder = JsonTreeBuilder::new(
|
|
23
|
+
let builder = JsonTreeBuilder::new(
|
|
24
|
+
config.array_max_items,
|
|
25
|
+
config.array_sampler.into(),
|
|
26
|
+
);
|
|
23
27
|
let root_id: usize = {
|
|
24
28
|
let seed = builder.seed();
|
|
25
29
|
seed.deserialize(&mut de)?
|
|
@@ -33,7 +37,10 @@ pub fn build_json_tree_arena_from_many(
|
|
|
33
37
|
mut inputs: Vec<(String, Vec<u8>)>,
|
|
34
38
|
config: &PriorityConfig,
|
|
35
39
|
) -> Result<JsonTreeArena> {
|
|
36
|
-
let builder = JsonTreeBuilder::new(
|
|
40
|
+
let builder = JsonTreeBuilder::new(
|
|
41
|
+
config.array_max_items,
|
|
42
|
+
config.array_sampler.into(),
|
|
43
|
+
);
|
|
37
44
|
let mut child_ids: Vec<usize> = Vec::with_capacity(inputs.len());
|
|
38
45
|
let mut keys: Vec<String> = Vec::with_capacity(inputs.len());
|
|
39
46
|
for (key, mut bytes) in inputs.drain(..) {
|