headson 0.6.3__tar.gz → 0.6.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {headson-0.6.3 → headson-0.6.4}/Cargo.lock +1 -1
- {headson-0.6.3 → headson-0.6.4}/Cargo.toml +1 -1
- {headson-0.6.3 → headson-0.6.4}/PKG-INFO +18 -13
- {headson-0.6.3 → headson-0.6.4}/README.md +17 -12
- headson-0.6.4/docs/assets/tapes/demo.gif +0 -0
- {headson-0.6.3 → headson-0.6.4}/pyproject.toml +1 -1
- {headson-0.6.3 → headson-0.6.4}/python/Cargo.lock +2 -2
- {headson-0.6.3 → headson-0.6.4}/python/Cargo.toml +1 -1
- {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/builder.rs +2 -5
- {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/mod.rs +41 -6
- {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/default.rs +4 -6
- {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/head.rs +2 -1
- {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/mod.rs +2 -20
- {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/tail.rs +6 -3
- headson-0.6.4/src/ingest/formats/mod.rs +9 -0
- {headson-0.6.3/src/text_ingest → headson-0.6.4/src/ingest/formats/text}/mod.rs +96 -14
- {headson-0.6.3/src/yaml_ingest → headson-0.6.4/src/ingest/formats/yaml}/mod.rs +124 -78
- {headson-0.6.3 → headson-0.6.4}/src/ingest/mod.rs +14 -8
- headson-0.6.4/src/ingest/sampling/mod.rs +118 -0
- {headson-0.6.3 → headson-0.6.4}/src/lib.rs +0 -3
- {headson-0.6.3 → headson-0.6.4}/src/main.rs +4 -4
- {headson-0.6.3 → headson-0.6.4}/src/order/build.rs +2 -2
- {headson-0.6.3 → headson-0.6.4}/src/serialization/mod.rs +24 -21
- headson-0.6.3/docs/assets/tapes/demo.gif +0 -0
- headson-0.6.3/src/ingest/json.rs +0 -37
- headson-0.6.3/src/ingest/text.rs +0 -45
- headson-0.6.3/src/ingest/yaml.rs +0 -39
- {headson-0.6.3 → headson-0.6.4}/docs/assets/algorithm.svg +0 -0
- {headson-0.6.3 → headson-0.6.4}/docs/assets/logo.png +0 -0
- {headson-0.6.3 → headson-0.6.4}/docs/assets/logo.svg +0 -0
- {headson-0.6.3 → headson-0.6.4}/python/README.md +0 -0
- {headson-0.6.3 → headson-0.6.4}/python/headson/__init__.py +0 -0
- {headson-0.6.3 → headson-0.6.4}/python/src/lib.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/format.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/order/mod.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/order/scoring.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/order/snapshots/headson__order__build__tests__order_empty_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/order/snapshots/headson__order__build__tests__order_single_string_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/order/types.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/color.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/fileset.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/output.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_single.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_single_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_internal_gaps_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_js_head.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_js_tail.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_pseudo_head.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_pseudo_tail.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_yaml_head.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_yaml_tail.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__inline_open_array_in_object_json.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__inline_open_array_in_object_yaml.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/core.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/js.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/json.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/mod.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/pseudo.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/text.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/yaml.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/serialization/types.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/snapshots/headson__order__tests__order_empty_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/snapshots/headson__order__tests__order_single_string_array_order.snap +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/utils/graph.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/utils/json.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/utils/mod.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/utils/search.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/utils/text.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/src/utils/tree_arena.rs +0 -0
- {headson-0.6.3 → headson-0.6.4}/tests/fixtures/json/JSONTestSuite/LICENSE +0 -0
- {headson-0.6.3 → headson-0.6.4}/tests/fixtures/json/JSONTestSuite/README.md +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: headson
|
|
3
|
-
Version: 0.6.3
|
|
3
|
+
Version: 0.6.4
|
|
4
4
|
Classifier: Programming Language :: Python
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Rust
|
|
@@ -26,6 +26,9 @@ Available as:
|
|
|
26
26
|
- CLI (see [Usage](#usage))
|
|
27
27
|
- Python library (see [Python Bindings](#python-bindings))
|
|
28
28
|
|
|
29
|
+
  
|
|
30
|
+
|
|
31
|
+
|
|
29
32
|
## Install
|
|
30
33
|
|
|
31
34
|
Using Cargo:
|
|
@@ -67,8 +70,8 @@ If you’re comfortable with tools like `head` and `tail`, use `headson` when yo
|
|
|
67
70
|
|
|
68
71
|
Common flags:
|
|
69
72
|
|
|
70
|
-
- `-
|
|
71
|
-
- `-
|
|
73
|
+
- `-c, --bytes <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
|
|
74
|
+
- `-C, --global-bytes <BYTES>`: total output budget across all inputs. With `--bytes`, the effective total is the smaller of the two.
|
|
72
75
|
- `-f, --format <auto|json|yaml|text>`: output format (default: `auto`).
|
|
73
76
|
- Auto: stdin → JSON family; filesets → per‑file based on extension (`.json` → JSON family, `.yaml`/`.yml` → YAML, unknown → Text).
|
|
74
77
|
- `-t, --template <strict|default|detailed>`: output style (default: `default`).
|
|
@@ -89,7 +92,7 @@ Notes:
|
|
|
89
92
|
- With newlines enabled, file sections are rendered with human‑readable headers. In compact/single‑line modes, headers are omitted.
|
|
90
93
|
- In `--format auto`, each file uses its own best format: JSON family for `.json`, YAML for `.yaml`/`.yml`.
|
|
91
94
|
- Unknown extensions are treated as Text (raw lines) — safe for logs and `.txt` files.
|
|
92
|
-
- `--global-
|
|
95
|
+
- `--global-bytes` may truncate or omit entire files to respect the total budget.
|
|
93
96
|
- The tool finds the largest preview that fits the budget; even if extremely tight, you still get a minimal, valid preview.
|
|
94
97
|
- Directories and binary files are ignored; a notice is printed to stderr for each. Stdin reads the stream as‑is.
|
|
95
98
|
- Head vs Tail sampling: these options bias which part of arrays are kept before rendering. Display styles may still insert internal gap markers to honor very small budgets; strict JSON stays unannotated.
|
|
@@ -98,33 +101,33 @@ Quick one‑liners:
|
|
|
98
101
|
|
|
99
102
|
- Peek a big JSON stream (keeps structure):
|
|
100
103
|
|
|
101
|
-
zstdcat huge.json.zst | headson -
|
|
104
|
+
zstdcat huge.json.zst | headson -c 800 -f json -t default
|
|
102
105
|
|
|
103
106
|
- Many files with a fixed overall size:
|
|
104
107
|
|
|
105
|
-
headson -
|
|
108
|
+
headson -C 1200 -f json -t strict logs/*.json
|
|
106
109
|
|
|
107
110
|
- Glance at a file, JavaScript‑style comments for omissions:
|
|
108
111
|
|
|
109
|
-
headson -
|
|
112
|
+
headson -c 400 -f json -t detailed data.json
|
|
110
113
|
|
|
111
114
|
- YAML with detailed comments:
|
|
112
115
|
|
|
113
|
-
headson -
|
|
116
|
+
headson -c 400 -f yaml -t detailed config.yaml
|
|
114
117
|
|
|
115
118
|
### Text mode
|
|
116
119
|
|
|
117
120
|
- Single file (auto):
|
|
118
121
|
|
|
119
|
-
headson -
|
|
122
|
+
headson -c 200 notes.txt
|
|
120
123
|
|
|
121
124
|
- Force Text ingest/output (useful when mixing with other extensions):
|
|
122
125
|
|
|
123
|
-
headson -
|
|
126
|
+
headson -c 200 -i text -f text notes.txt
|
|
124
127
|
|
|
125
128
|
- Many text files (fileset):
|
|
126
129
|
|
|
127
|
-
headson -
|
|
130
|
+
headson -c 800 -i text -f text logs/*.txt
|
|
128
131
|
|
|
129
132
|
- Styles on Text:
|
|
130
133
|
- default: omission as a standalone `…` line.
|
|
@@ -135,6 +138,8 @@ Show help:
|
|
|
135
138
|
|
|
136
139
|
headson --help
|
|
137
140
|
|
|
141
|
+
Note: flags align with head/tail conventions (`-c/--bytes`, `-C/--global-bytes`).
|
|
142
|
+
|
|
138
143
|
## Examples: head vs headson
|
|
139
144
|
|
|
140
145
|
Input:
|
|
@@ -153,7 +158,7 @@ jq -c . users.json | head -c 80
|
|
|
153
158
|
Structured preview with headson (JSON family, default style → Pseudo):
|
|
154
159
|
|
|
155
160
|
```bash
|
|
156
|
-
headson -n 120 -f json -t default users.json
|
|
161
|
+
headson -c 120 -f json -t default users.json
|
|
157
162
|
# {
|
|
158
163
|
# users: [
|
|
159
164
|
# { id: 1, name: "Ana", roles: [ "admin", … ] },
|
|
@@ -166,7 +171,7 @@ headson -n 120 -f json -t default users.json
|
|
|
166
171
|
Machine‑readable preview (JSON family, strict style → strict JSON):
|
|
167
172
|
|
|
168
173
|
```bash
|
|
169
|
-
headson -
|
|
174
|
+
headson -c 120 -f json -t strict users.json
|
|
170
175
|
# {"users":[{"id":1,"name":"Ana","roles":["admin"]}],"meta":{"count":2}}
|
|
171
176
|
```
|
|
172
177
|
|
|
@@ -12,6 +12,9 @@ Available as:
|
|
|
12
12
|
- CLI (see [Usage](#usage))
|
|
13
13
|
- Python library (see [Python Bindings](#python-bindings))
|
|
14
14
|
|
|
15
|
+
  
|
|
16
|
+
|
|
17
|
+
|
|
15
18
|
## Install
|
|
16
19
|
|
|
17
20
|
Using Cargo:
|
|
@@ -53,8 +56,8 @@ If you’re comfortable with tools like `head` and `tail`, use `headson` when yo
|
|
|
53
56
|
|
|
54
57
|
Common flags:
|
|
55
58
|
|
|
56
|
-
- `-
|
|
57
|
-
- `-
|
|
59
|
+
- `-c, --bytes <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
|
|
60
|
+
- `-C, --global-bytes <BYTES>`: total output budget across all inputs. With `--bytes`, the effective total is the smaller of the two.
|
|
58
61
|
- `-f, --format <auto|json|yaml|text>`: output format (default: `auto`).
|
|
59
62
|
- Auto: stdin → JSON family; filesets → per‑file based on extension (`.json` → JSON family, `.yaml`/`.yml` → YAML, unknown → Text).
|
|
60
63
|
- `-t, --template <strict|default|detailed>`: output style (default: `default`).
|
|
@@ -75,7 +78,7 @@ Notes:
|
|
|
75
78
|
- With newlines enabled, file sections are rendered with human‑readable headers. In compact/single‑line modes, headers are omitted.
|
|
76
79
|
- In `--format auto`, each file uses its own best format: JSON family for `.json`, YAML for `.yaml`/`.yml`.
|
|
77
80
|
- Unknown extensions are treated as Text (raw lines) — safe for logs and `.txt` files.
|
|
78
|
-
- `--global-
|
|
81
|
+
- `--global-bytes` may truncate or omit entire files to respect the total budget.
|
|
79
82
|
- The tool finds the largest preview that fits the budget; even if extremely tight, you still get a minimal, valid preview.
|
|
80
83
|
- Directories and binary files are ignored; a notice is printed to stderr for each. Stdin reads the stream as‑is.
|
|
81
84
|
- Head vs Tail sampling: these options bias which part of arrays are kept before rendering. Display styles may still insert internal gap markers to honor very small budgets; strict JSON stays unannotated.
|
|
@@ -84,33 +87,33 @@ Quick one‑liners:
|
|
|
84
87
|
|
|
85
88
|
- Peek a big JSON stream (keeps structure):
|
|
86
89
|
|
|
87
|
-
zstdcat huge.json.zst | headson -
|
|
90
|
+
zstdcat huge.json.zst | headson -c 800 -f json -t default
|
|
88
91
|
|
|
89
92
|
- Many files with a fixed overall size:
|
|
90
93
|
|
|
91
|
-
headson -
|
|
94
|
+
headson -C 1200 -f json -t strict logs/*.json
|
|
92
95
|
|
|
93
96
|
- Glance at a file, JavaScript‑style comments for omissions:
|
|
94
97
|
|
|
95
|
-
headson -
|
|
98
|
+
headson -c 400 -f json -t detailed data.json
|
|
96
99
|
|
|
97
100
|
- YAML with detailed comments:
|
|
98
101
|
|
|
99
|
-
headson -
|
|
102
|
+
headson -c 400 -f yaml -t detailed config.yaml
|
|
100
103
|
|
|
101
104
|
### Text mode
|
|
102
105
|
|
|
103
106
|
- Single file (auto):
|
|
104
107
|
|
|
105
|
-
headson -
|
|
108
|
+
headson -c 200 notes.txt
|
|
106
109
|
|
|
107
110
|
- Force Text ingest/output (useful when mixing with other extensions):
|
|
108
111
|
|
|
109
|
-
headson -
|
|
112
|
+
headson -c 200 -i text -f text notes.txt
|
|
110
113
|
|
|
111
114
|
- Many text files (fileset):
|
|
112
115
|
|
|
113
|
-
headson -
|
|
116
|
+
headson -c 800 -i text -f text logs/*.txt
|
|
114
117
|
|
|
115
118
|
- Styles on Text:
|
|
116
119
|
- default: omission as a standalone `…` line.
|
|
@@ -121,6 +124,8 @@ Show help:
|
|
|
121
124
|
|
|
122
125
|
headson --help
|
|
123
126
|
|
|
127
|
+
Note: flags align with head/tail conventions (`-c/--bytes`, `-C/--global-bytes`).
|
|
128
|
+
|
|
124
129
|
## Examples: head vs headson
|
|
125
130
|
|
|
126
131
|
Input:
|
|
@@ -139,7 +144,7 @@ jq -c . users.json | head -c 80
|
|
|
139
144
|
Structured preview with headson (JSON family, default style → Pseudo):
|
|
140
145
|
|
|
141
146
|
```bash
|
|
142
|
-
headson -n 120 -f json -t default users.json
|
|
147
|
+
headson -c 120 -f json -t default users.json
|
|
143
148
|
# {
|
|
144
149
|
# users: [
|
|
145
150
|
# { id: 1, name: "Ana", roles: [ "admin", … ] },
|
|
@@ -152,7 +157,7 @@ headson -n 120 -f json -t default users.json
|
|
|
152
157
|
Machine‑readable preview (JSON family, strict style → strict JSON):
|
|
153
158
|
|
|
154
159
|
```bash
|
|
155
|
-
headson -
|
|
160
|
+
headson -c 120 -f json -t strict users.json
|
|
156
161
|
# {"users":[{"id":1,"name":"Ana","roles":["admin"]}],"meta":{"count":2}}
|
|
157
162
|
```
|
|
158
163
|
|
|
Binary file
|
|
@@ -214,7 +214,7 @@ dependencies = [
|
|
|
214
214
|
|
|
215
215
|
[[package]]
|
|
216
216
|
name = "headson"
|
|
217
|
-
version = "0.6.3"
|
|
217
|
+
version = "0.6.4"
|
|
218
218
|
dependencies = [
|
|
219
219
|
"anyhow",
|
|
220
220
|
"clap",
|
|
@@ -228,7 +228,7 @@ dependencies = [
|
|
|
228
228
|
|
|
229
229
|
[[package]]
|
|
230
230
|
name = "headson-python"
|
|
231
|
-
version = "0.6.3"
|
|
231
|
+
version = "0.6.4"
|
|
232
232
|
dependencies = [
|
|
233
233
|
"anyhow",
|
|
234
234
|
"headson",
|
|
@@ -5,7 +5,7 @@ use std::cell::RefCell;
|
|
|
5
5
|
use crate::order::NodeKind;
|
|
6
6
|
use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
|
|
7
7
|
|
|
8
|
-
use
|
|
8
|
+
use crate::ingest::sampling::ArraySamplerKind;
|
|
9
9
|
|
|
10
10
|
#[derive(Default)]
|
|
11
11
|
pub(crate) struct JsonTreeBuilder {
|
|
@@ -15,10 +15,7 @@ pub(crate) struct JsonTreeBuilder {
|
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
impl JsonTreeBuilder {
|
|
18
|
-
pub(crate) fn new(
|
|
19
|
-
array_cap: usize,
|
|
20
|
-
sampler: super::samplers::ArraySamplerKind,
|
|
21
|
-
) -> Self {
|
|
18
|
+
pub(crate) fn new(array_cap: usize, sampler: ArraySamplerKind) -> Self {
|
|
22
19
|
Self {
|
|
23
20
|
arena: RefCell::new(JsonTreeArena::default()),
|
|
24
21
|
array_cap,
|
|
@@ -1,24 +1,27 @@
|
|
|
1
1
|
mod builder;
|
|
2
2
|
mod samplers;
|
|
3
|
-
use serde::de::DeserializeSeed;
|
|
4
3
|
|
|
5
|
-
use crate::PriorityConfig;
|
|
6
|
-
use crate::utils::tree_arena::JsonTreeArena;
|
|
7
4
|
use anyhow::Result;
|
|
8
5
|
use builder::JsonTreeBuilder;
|
|
6
|
+
use serde::de::DeserializeSeed;
|
|
7
|
+
|
|
8
|
+
use crate::PriorityConfig;
|
|
9
|
+
use crate::utils::tree_arena::JsonTreeArena as TreeArena;
|
|
10
|
+
|
|
11
|
+
use crate::ingest::Ingest;
|
|
9
12
|
|
|
10
13
|
#[cfg(test)]
|
|
11
14
|
pub fn build_json_tree_arena(
|
|
12
15
|
input: &str,
|
|
13
16
|
config: &PriorityConfig,
|
|
14
|
-
) -> Result<JsonTreeArena> {
|
|
17
|
+
) -> Result<TreeArena> {
|
|
15
18
|
build_json_tree_arena_from_bytes(input.as_bytes().to_vec(), config)
|
|
16
19
|
}
|
|
17
20
|
|
|
18
21
|
pub fn build_json_tree_arena_from_bytes(
|
|
19
22
|
mut bytes: Vec<u8>,
|
|
20
23
|
config: &PriorityConfig,
|
|
21
|
-
) -> Result<JsonTreeArena> {
|
|
24
|
+
) -> Result<TreeArena> {
|
|
22
25
|
let mut de = simd_json::Deserializer::from_slice(&mut bytes)?;
|
|
23
26
|
let builder = JsonTreeBuilder::new(
|
|
24
27
|
config.array_max_items,
|
|
@@ -36,7 +39,7 @@ pub fn build_json_tree_arena_from_bytes(
|
|
|
36
39
|
pub fn build_json_tree_arena_from_many(
|
|
37
40
|
mut inputs: Vec<(String, Vec<u8>)>,
|
|
38
41
|
config: &PriorityConfig,
|
|
39
|
-
) -> Result<JsonTreeArena> {
|
|
42
|
+
) -> Result<TreeArena> {
|
|
40
43
|
let builder = JsonTreeBuilder::new(
|
|
41
44
|
config.array_max_items,
|
|
42
45
|
config.array_sampler.into(),
|
|
@@ -57,6 +60,38 @@ pub fn build_json_tree_arena_from_many(
|
|
|
57
60
|
Ok(arena)
|
|
58
61
|
}
|
|
59
62
|
|
|
63
|
+
/// JSON adapter for the ingest boundary. Delegates to the JSON builder to
|
|
64
|
+
/// produce the neutral `TreeArena`.
|
|
65
|
+
pub struct JsonIngest;
|
|
66
|
+
|
|
67
|
+
impl Ingest for JsonIngest {
|
|
68
|
+
fn parse_one(bytes: Vec<u8>, cfg: &PriorityConfig) -> Result<TreeArena> {
|
|
69
|
+
build_json_tree_arena_from_bytes(bytes, cfg)
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
fn parse_many(
|
|
73
|
+
inputs: Vec<(String, Vec<u8>)>,
|
|
74
|
+
cfg: &PriorityConfig,
|
|
75
|
+
) -> Result<TreeArena> {
|
|
76
|
+
build_json_tree_arena_from_many(inputs, cfg)
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/// Convenience functions for the JSON ingest path.
|
|
81
|
+
pub fn parse_json_one(
|
|
82
|
+
bytes: Vec<u8>,
|
|
83
|
+
cfg: &PriorityConfig,
|
|
84
|
+
) -> Result<TreeArena> {
|
|
85
|
+
JsonIngest::parse_one(bytes, cfg)
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
pub fn parse_json_many(
|
|
89
|
+
inputs: Vec<(String, Vec<u8>)>,
|
|
90
|
+
cfg: &PriorityConfig,
|
|
91
|
+
) -> Result<TreeArena> {
|
|
92
|
+
JsonIngest::parse_many(inputs, cfg)
|
|
93
|
+
}
|
|
94
|
+
|
|
60
95
|
#[cfg(test)]
|
|
61
96
|
mod tests {
|
|
62
97
|
use super::*;
|
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
use serde::de::{IgnoredAny, SeqAccess};
|
|
2
2
|
|
|
3
|
-
use super::
|
|
3
|
+
use super::JsonTreeBuilder;
|
|
4
|
+
use super::SampledArray;
|
|
4
5
|
|
|
5
|
-
//
|
|
6
|
+
// Default strategy phases: keep-first, greedy, then index-hash acceptance (~50%).
|
|
6
7
|
const RANDOM_ACCEPT_SEED: u64 = 0x9e37_79b9_7f4a_7c15;
|
|
7
|
-
|
|
8
|
-
const RANDOM_ACCEPT_THRESHOLD: u32 = 0x8000_0000;
|
|
9
|
-
// Keep a small, fixed number of items from the head before greedy/random phases.
|
|
8
|
+
const RANDOM_ACCEPT_THRESHOLD: u32 = 0x8000_0000; // ~50%
|
|
10
9
|
const KEEP_FIRST_COUNT: usize = 3;
|
|
11
|
-
// Take roughly half of the remaining capacity greedily after the first items.
|
|
12
10
|
const GREEDY_PORTION_DIVISOR: usize = 2;
|
|
13
11
|
|
|
14
12
|
struct PhaseState {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
use serde::de::SeqAccess;
|
|
2
2
|
|
|
3
|
-
use crate::
|
|
4
|
-
use crate::
|
|
3
|
+
use crate::ingest::formats::json::builder::JsonTreeBuilder;
|
|
4
|
+
use crate::ingest::sampling::ArraySamplerKind;
|
|
5
5
|
|
|
6
6
|
#[derive(Debug)]
|
|
7
7
|
pub(crate) struct SampledArray {
|
|
@@ -10,14 +10,6 @@ pub(crate) struct SampledArray {
|
|
|
10
10
|
pub total_len: usize,
|
|
11
11
|
}
|
|
12
12
|
|
|
13
|
-
#[derive(Copy, Clone, Debug, Default)]
|
|
14
|
-
pub(crate) enum ArraySamplerKind {
|
|
15
|
-
#[default]
|
|
16
|
-
Default,
|
|
17
|
-
Head,
|
|
18
|
-
Tail,
|
|
19
|
-
}
|
|
20
|
-
|
|
21
13
|
impl ArraySamplerKind {
|
|
22
14
|
pub(crate) fn sample_stream<'de, A>(
|
|
23
15
|
self,
|
|
@@ -38,16 +30,6 @@ impl ArraySamplerKind {
|
|
|
38
30
|
}
|
|
39
31
|
}
|
|
40
32
|
|
|
41
|
-
impl From<ArraySamplerStrategy> for ArraySamplerKind {
|
|
42
|
-
fn from(strategy: ArraySamplerStrategy) -> Self {
|
|
43
|
-
match strategy {
|
|
44
|
-
ArraySamplerStrategy::Default => ArraySamplerKind::Default,
|
|
45
|
-
ArraySamplerStrategy::Head => ArraySamplerKind::Head,
|
|
46
|
-
ArraySamplerStrategy::Tail => ArraySamplerKind::Tail,
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
|
|
51
33
|
mod default;
|
|
52
34
|
mod head;
|
|
53
35
|
mod tail;
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
use serde::de::{IgnoredAny, SeqAccess};
|
|
2
2
|
|
|
3
|
-
use super::
|
|
3
|
+
use super::JsonTreeBuilder;
|
|
4
|
+
use super::SampledArray;
|
|
4
5
|
|
|
5
6
|
pub(crate) fn sample_stream<'de, A>(
|
|
6
7
|
seq: &mut A,
|
|
@@ -89,8 +90,10 @@ mod tests {
|
|
|
89
90
|
let mut cfg = PriorityConfig::new(usize::MAX, 5);
|
|
90
91
|
cfg.array_sampler = crate::ArraySamplerStrategy::Tail;
|
|
91
92
|
let arena =
|
|
92
|
-
crate::
|
|
93
|
-
|
|
93
|
+
crate::ingest::formats::json::build_json_tree_arena_from_bytes(
|
|
94
|
+
input, &cfg,
|
|
95
|
+
)
|
|
96
|
+
.expect("arena");
|
|
94
97
|
let root = &arena.nodes[arena.root_id];
|
|
95
98
|
assert_eq!(root.children_len, 5, "kept 5");
|
|
96
99
|
let mut orig_indices = Vec::new();
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// File-format specific ingest adapters live under this module.
|
|
2
|
+
pub mod json;
|
|
3
|
+
pub mod text;
|
|
4
|
+
pub mod yaml;
|
|
5
|
+
|
|
6
|
+
// Re-export commonly used helpers for convenience
|
|
7
|
+
pub use json::{parse_json_many, parse_json_one};
|
|
8
|
+
pub use text::{parse_text_many, parse_text_one};
|
|
9
|
+
pub use yaml::{parse_yaml_many, parse_yaml_one};
|
|
@@ -5,6 +5,9 @@ use crate::PriorityConfig;
|
|
|
5
5
|
use crate::order::NodeKind;
|
|
6
6
|
use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
|
|
7
7
|
|
|
8
|
+
use crate::ingest::Ingest;
|
|
9
|
+
use crate::ingest::sampling::{ArraySamplerKind, choose_indices};
|
|
10
|
+
|
|
8
11
|
fn normalize_newlines(s: &str) -> Cow<'_, str> {
|
|
9
12
|
// Normalize CRLF and CR to LF in a single allocation when needed.
|
|
10
13
|
if s.as_bytes().contains(&b'\r') {
|
|
@@ -18,13 +21,15 @@ fn normalize_newlines(s: &str) -> Cow<'_, str> {
|
|
|
18
21
|
struct TextArenaBuilder {
|
|
19
22
|
arena: JsonTreeArena,
|
|
20
23
|
array_cap: usize,
|
|
24
|
+
sampler: ArraySamplerKind,
|
|
21
25
|
}
|
|
22
26
|
|
|
23
27
|
impl TextArenaBuilder {
|
|
24
|
-
fn new(array_cap: usize) -> Self {
|
|
28
|
+
fn new(array_cap: usize, sampler: ArraySamplerKind) -> Self {
|
|
25
29
|
Self {
|
|
26
30
|
arena: JsonTreeArena::default(),
|
|
27
31
|
array_cap,
|
|
32
|
+
sampler,
|
|
28
33
|
}
|
|
29
34
|
}
|
|
30
35
|
|
|
@@ -48,26 +53,38 @@ impl TextArenaBuilder {
|
|
|
48
53
|
|
|
49
54
|
fn push_array_of_lines(
|
|
50
55
|
&mut self,
|
|
51
|
-
lines:
|
|
56
|
+
lines: &[String],
|
|
52
57
|
total: usize,
|
|
53
58
|
) -> usize {
|
|
54
59
|
let id = self.push_default();
|
|
55
|
-
let
|
|
60
|
+
let idxs = choose_indices(self.sampler, total, self.array_cap);
|
|
61
|
+
let kept = idxs.len().min(self.array_cap);
|
|
56
62
|
let mut pushed = 0usize;
|
|
57
|
-
for (i,
|
|
58
|
-
if
|
|
59
|
-
|
|
63
|
+
for (i, &orig_index) in idxs.iter().take(kept).enumerate() {
|
|
64
|
+
if let Some(line) = lines.get(orig_index) {
|
|
65
|
+
let child = self.push_string(line.clone());
|
|
66
|
+
self.arena.children.push(child);
|
|
67
|
+
pushed = i + 1;
|
|
60
68
|
}
|
|
61
|
-
let child = self.push_string(line);
|
|
62
|
-
self.arena.children.push(child);
|
|
63
|
-
pushed += 1;
|
|
64
69
|
}
|
|
65
70
|
let n = &mut self.arena.nodes[id];
|
|
66
71
|
n.kind = NodeKind::Array;
|
|
67
|
-
// children for this array were appended after previous nodes; compute start = len(children) - pushed
|
|
68
72
|
n.children_start = self.arena.children.len().saturating_sub(pushed);
|
|
69
73
|
n.children_len = pushed;
|
|
70
74
|
n.array_len = Some(total);
|
|
75
|
+
// Store arr_indices when not contiguous head prefix
|
|
76
|
+
let contiguous =
|
|
77
|
+
idxs.iter().take(kept).enumerate().all(|(i, &idx)| i == idx);
|
|
78
|
+
if pushed == 0 || contiguous {
|
|
79
|
+
n.arr_indices_start = 0;
|
|
80
|
+
n.arr_indices_len = 0;
|
|
81
|
+
} else {
|
|
82
|
+
let start = self.arena.arr_indices.len();
|
|
83
|
+
self.arena.arr_indices.extend(idxs.into_iter().take(kept));
|
|
84
|
+
let len = self.arena.arr_indices.len().saturating_sub(start);
|
|
85
|
+
n.arr_indices_start = start;
|
|
86
|
+
n.arr_indices_len = len.min(pushed);
|
|
87
|
+
}
|
|
71
88
|
id
|
|
72
89
|
}
|
|
73
90
|
|
|
@@ -110,8 +127,11 @@ pub fn build_text_tree_arena_from_bytes(
|
|
|
110
127
|
.map(std::string::ToString::to_string)
|
|
111
128
|
.collect();
|
|
112
129
|
let total = lines_vec.len();
|
|
113
|
-
let mut b = TextArenaBuilder::new(
|
|
114
|
-
|
|
130
|
+
let mut b = TextArenaBuilder::new(
|
|
131
|
+
config.array_max_items,
|
|
132
|
+
config.array_sampler.into(),
|
|
133
|
+
);
|
|
134
|
+
let root_id = b.push_array_of_lines(&lines_vec, total);
|
|
115
135
|
let mut a = b.finish();
|
|
116
136
|
a.root_id = root_id;
|
|
117
137
|
Ok(a)
|
|
@@ -125,7 +145,10 @@ pub fn build_text_tree_arena_from_many(
|
|
|
125
145
|
mut inputs: Vec<(String, Vec<u8>)>,
|
|
126
146
|
config: &PriorityConfig,
|
|
127
147
|
) -> Result<JsonTreeArena> {
|
|
128
|
-
let mut b = TextArenaBuilder::new(
|
|
148
|
+
let mut b = TextArenaBuilder::new(
|
|
149
|
+
config.array_max_items,
|
|
150
|
+
config.array_sampler.into(),
|
|
151
|
+
);
|
|
129
152
|
let mut keys: Vec<String> = Vec::with_capacity(inputs.len());
|
|
130
153
|
let mut children_ids: Vec<usize> = Vec::with_capacity(inputs.len());
|
|
131
154
|
for (key, bytes) in inputs.drain(..) {
|
|
@@ -136,7 +159,7 @@ pub fn build_text_tree_arena_from_many(
|
|
|
136
159
|
.map(std::string::ToString::to_string)
|
|
137
160
|
.collect();
|
|
138
161
|
let total = lines_vec.len();
|
|
139
|
-
let child_id = b.push_array_of_lines(lines_vec, total);
|
|
162
|
+
let child_id = b.push_array_of_lines(&lines_vec, total);
|
|
140
163
|
keys.push(key);
|
|
141
164
|
children_ids.push(child_id);
|
|
142
165
|
}
|
|
@@ -147,6 +170,39 @@ pub fn build_text_tree_arena_from_many(
|
|
|
147
170
|
Ok(a)
|
|
148
171
|
}
|
|
149
172
|
|
|
173
|
+
pub struct TextIngest;
|
|
174
|
+
|
|
175
|
+
impl Ingest for TextIngest {
|
|
176
|
+
fn parse_one(
|
|
177
|
+
bytes: Vec<u8>,
|
|
178
|
+
cfg: &PriorityConfig,
|
|
179
|
+
) -> Result<JsonTreeArena> {
|
|
180
|
+
build_text_tree_arena_from_bytes(bytes, cfg)
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
fn parse_many(
|
|
184
|
+
inputs: Vec<(String, Vec<u8>)>,
|
|
185
|
+
cfg: &PriorityConfig,
|
|
186
|
+
) -> Result<JsonTreeArena> {
|
|
187
|
+
build_text_tree_arena_from_many(inputs, cfg)
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
/// Convenience functions for the Text ingest path.
|
|
192
|
+
pub fn parse_text_one(
|
|
193
|
+
bytes: Vec<u8>,
|
|
194
|
+
cfg: &PriorityConfig,
|
|
195
|
+
) -> Result<JsonTreeArena> {
|
|
196
|
+
TextIngest::parse_one(bytes, cfg)
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
pub fn parse_text_many(
|
|
200
|
+
inputs: Vec<(String, Vec<u8>)>,
|
|
201
|
+
cfg: &PriorityConfig,
|
|
202
|
+
) -> Result<JsonTreeArena> {
|
|
203
|
+
TextIngest::parse_many(inputs, cfg)
|
|
204
|
+
}
|
|
205
|
+
|
|
150
206
|
#[cfg(test)]
|
|
151
207
|
mod tests {
|
|
152
208
|
use crate::{
|
|
@@ -189,4 +245,30 @@ mod tests {
|
|
|
189
245
|
let out = headson_text(input.into_bytes(), &cfg, &prio, 20).unwrap();
|
|
190
246
|
assert!(out.contains("…\n"));
|
|
191
247
|
}
|
|
248
|
+
|
|
249
|
+
#[test]
|
|
250
|
+
fn tail_sampler_keeps_last_n_indices_text() {
|
|
251
|
+
// Build 10 lines; with array_max_items=5 and tail sampler we should keep last 5
|
|
252
|
+
let lines = (0..10)
|
|
253
|
+
.map(|i| i.to_string())
|
|
254
|
+
.collect::<Vec<_>>()
|
|
255
|
+
.join("\n");
|
|
256
|
+
let mut cfg = PriorityConfig::new(usize::MAX, 5);
|
|
257
|
+
cfg.array_sampler = crate::ArraySamplerStrategy::Tail;
|
|
258
|
+
let arena =
|
|
259
|
+
super::build_text_tree_arena_from_bytes(lines.into_bytes(), &cfg)
|
|
260
|
+
.expect("arena");
|
|
261
|
+
let root = &arena.nodes[arena.root_id];
|
|
262
|
+
assert_eq!(root.children_len, 5, "kept 5");
|
|
263
|
+
let mut orig_indices = Vec::new();
|
|
264
|
+
for i in 0..root.children_len {
|
|
265
|
+
let oi = if root.arr_indices_len > 0 {
|
|
266
|
+
arena.arr_indices[root.arr_indices_start + i]
|
|
267
|
+
} else {
|
|
268
|
+
i
|
|
269
|
+
};
|
|
270
|
+
orig_indices.push(oi);
|
|
271
|
+
}
|
|
272
|
+
assert_eq!(orig_indices, vec![5, 6, 7, 8, 9]);
|
|
273
|
+
}
|
|
192
274
|
}
|