headson 0.6.3__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {headson-0.6.3 → headson-0.6.4}/Cargo.lock +1 -1
  2. {headson-0.6.3 → headson-0.6.4}/Cargo.toml +1 -1
  3. {headson-0.6.3 → headson-0.6.4}/PKG-INFO +18 -13
  4. {headson-0.6.3 → headson-0.6.4}/README.md +17 -12
  5. headson-0.6.4/docs/assets/tapes/demo.gif +0 -0
  6. {headson-0.6.3 → headson-0.6.4}/pyproject.toml +1 -1
  7. {headson-0.6.3 → headson-0.6.4}/python/Cargo.lock +2 -2
  8. {headson-0.6.3 → headson-0.6.4}/python/Cargo.toml +1 -1
  9. {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/builder.rs +2 -5
  10. {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/mod.rs +41 -6
  11. {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/default.rs +4 -6
  12. {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/head.rs +2 -1
  13. {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/mod.rs +2 -20
  14. {headson-0.6.3/src/json_ingest → headson-0.6.4/src/ingest/formats/json}/samplers/tail.rs +6 -3
  15. headson-0.6.4/src/ingest/formats/mod.rs +9 -0
  16. {headson-0.6.3/src/text_ingest → headson-0.6.4/src/ingest/formats/text}/mod.rs +96 -14
  17. {headson-0.6.3/src/yaml_ingest → headson-0.6.4/src/ingest/formats/yaml}/mod.rs +124 -78
  18. {headson-0.6.3 → headson-0.6.4}/src/ingest/mod.rs +14 -8
  19. headson-0.6.4/src/ingest/sampling/mod.rs +118 -0
  20. {headson-0.6.3 → headson-0.6.4}/src/lib.rs +0 -3
  21. {headson-0.6.3 → headson-0.6.4}/src/main.rs +4 -4
  22. {headson-0.6.3 → headson-0.6.4}/src/order/build.rs +2 -2
  23. {headson-0.6.3 → headson-0.6.4}/src/serialization/mod.rs +24 -21
  24. headson-0.6.3/docs/assets/tapes/demo.gif +0 -0
  25. headson-0.6.3/src/ingest/json.rs +0 -37
  26. headson-0.6.3/src/ingest/text.rs +0 -45
  27. headson-0.6.3/src/ingest/yaml.rs +0 -39
  28. {headson-0.6.3 → headson-0.6.4}/docs/assets/algorithm.svg +0 -0
  29. {headson-0.6.3 → headson-0.6.4}/docs/assets/logo.png +0 -0
  30. {headson-0.6.3 → headson-0.6.4}/docs/assets/logo.svg +0 -0
  31. {headson-0.6.3 → headson-0.6.4}/python/README.md +0 -0
  32. {headson-0.6.3 → headson-0.6.4}/python/headson/__init__.py +0 -0
  33. {headson-0.6.3 → headson-0.6.4}/python/src/lib.rs +0 -0
  34. {headson-0.6.3 → headson-0.6.4}/src/format.rs +0 -0
  35. {headson-0.6.3 → headson-0.6.4}/src/order/mod.rs +0 -0
  36. {headson-0.6.3 → headson-0.6.4}/src/order/scoring.rs +0 -0
  37. {headson-0.6.3 → headson-0.6.4}/src/order/snapshots/headson__order__build__tests__order_empty_array_order.snap +0 -0
  38. {headson-0.6.3 → headson-0.6.4}/src/order/snapshots/headson__order__build__tests__order_single_string_array_order.snap +0 -0
  39. {headson-0.6.3 → headson-0.6.4}/src/order/types.rs +0 -0
  40. {headson-0.6.3 → headson-0.6.4}/src/serialization/color.rs +0 -0
  41. {headson-0.6.3 → headson-0.6.4}/src/serialization/fileset.rs +0 -0
  42. {headson-0.6.3 → headson-0.6.4}/src/serialization/output.rs +0 -0
  43. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty.snap +0 -0
  44. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_empty_yaml.snap +0 -0
  45. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_single.snap +0 -0
  46. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__arena_render_single_yaml.snap +0 -0
  47. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_internal_gaps_yaml.snap +0 -0
  48. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_js_head.snap +0 -0
  49. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_js_tail.snap +0 -0
  50. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_pseudo_head.snap +0 -0
  51. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_pseudo_tail.snap +0 -0
  52. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_yaml_head.snap +0 -0
  53. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__array_omitted_yaml_tail.snap +0 -0
  54. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__inline_open_array_in_object_json.snap +0 -0
  55. {headson-0.6.3 → headson-0.6.4}/src/serialization/snapshots/headson__serialization__tests__inline_open_array_in_object_yaml.snap +0 -0
  56. {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/core.rs +0 -0
  57. {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/js.rs +0 -0
  58. {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/json.rs +0 -0
  59. {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/mod.rs +0 -0
  60. {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/pseudo.rs +0 -0
  61. {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/text.rs +0 -0
  62. {headson-0.6.3 → headson-0.6.4}/src/serialization/templates/yaml.rs +0 -0
  63. {headson-0.6.3 → headson-0.6.4}/src/serialization/types.rs +0 -0
  64. {headson-0.6.3 → headson-0.6.4}/src/snapshots/headson__order__tests__order_empty_array_order.snap +0 -0
  65. {headson-0.6.3 → headson-0.6.4}/src/snapshots/headson__order__tests__order_single_string_array_order.snap +0 -0
  66. {headson-0.6.3 → headson-0.6.4}/src/utils/graph.rs +0 -0
  67. {headson-0.6.3 → headson-0.6.4}/src/utils/json.rs +0 -0
  68. {headson-0.6.3 → headson-0.6.4}/src/utils/mod.rs +0 -0
  69. {headson-0.6.3 → headson-0.6.4}/src/utils/search.rs +0 -0
  70. {headson-0.6.3 → headson-0.6.4}/src/utils/text.rs +0 -0
  71. {headson-0.6.3 → headson-0.6.4}/src/utils/tree_arena.rs +0 -0
  72. {headson-0.6.3 → headson-0.6.4}/tests/fixtures/json/JSONTestSuite/LICENSE +0 -0
  73. {headson-0.6.3 → headson-0.6.4}/tests/fixtures/json/JSONTestSuite/README.md +0 -0
@@ -298,7 +298,7 @@ dependencies = [
298
298
 
299
299
  [[package]]
300
300
  name = "headson"
301
- version = "0.6.3"
301
+ version = "0.6.4"
302
302
  dependencies = [
303
303
  "anyhow",
304
304
  "assert_cmd",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "headson"
3
- version = "0.6.3"
3
+ version = "0.6.4"
4
4
  edition = "2024"
5
5
  description = "Budget‑constrained JSON preview renderer"
6
6
  readme = "README.md"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: headson
3
- Version: 0.6.3
3
+ Version: 0.6.4
4
4
  Classifier: Programming Language :: Python
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Rust
@@ -26,6 +26,9 @@ Available as:
26
26
  - CLI (see [Usage](#usage))
27
27
  - Python library (see [Python Bindings](#python-bindings))
28
28
 
29
+ ![Codecov](https://img.shields.io/codecov/c/github/kantord/headson?style=flat-square) ![Crates.io Version](https://img.shields.io/crates/v/headson?style=flat-square) ![PyPI - Version](https://img.shields.io/pypi/v/headson?style=flat-square)
30
+
31
+
29
32
  ## Install
30
33
 
31
34
  Using Cargo:
@@ -67,8 +70,8 @@ If you’re comfortable with tools like `head` and `tail`, use `headson` when yo
67
70
 
68
71
  Common flags:
69
72
 
70
- - `-n, --budget <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
71
- - `-N, --global-budget <BYTES>`: total output budget across all inputs. With `--budget`, the effective total is the smaller of the two.
73
+ - `-c, --bytes <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
74
+ - `-C, --global-bytes <BYTES>`: total output budget across all inputs. With `--bytes`, the effective total is the smaller of the two.
72
75
  - `-f, --format <auto|json|yaml|text>`: output format (default: `auto`).
73
76
  - Auto: stdin → JSON family; filesets → per‑file based on extension (`.json` → JSON family, `.yaml`/`.yml` → YAML, unknown → Text).
74
77
  - `-t, --template <strict|default|detailed>`: output style (default: `default`).
@@ -89,7 +92,7 @@ Notes:
89
92
  - With newlines enabled, file sections are rendered with human‑readable headers. In compact/single‑line modes, headers are omitted.
90
93
  - In `--format auto`, each file uses its own best format: JSON family for `.json`, YAML for `.yaml`/`.yml`.
91
94
  - Unknown extensions are treated as Text (raw lines) — safe for logs and `.txt` files.
92
- - `--global-budget` may truncate or omit entire files to respect the total budget.
95
+ - `--global-bytes` may truncate or omit entire files to respect the total budget.
93
96
  - The tool finds the largest preview that fits the budget; even if extremely tight, you still get a minimal, valid preview.
94
97
  - Directories and binary files are ignored; a notice is printed to stderr for each. Stdin reads the stream as‑is.
95
98
  - Head vs Tail sampling: these options bias which part of each array is kept before rendering. Display styles may still insert internal gap markers to honor very small budgets; strict JSON stays unannotated.
@@ -98,33 +101,33 @@ Quick one‑liners:
98
101
 
99
102
  - Peek a big JSON stream (keeps structure):
100
103
 
101
- zstdcat huge.json.zst | headson -n 800 -f json -t default
104
+ zstdcat huge.json.zst | headson -c 800 -f json -t default
102
105
 
103
106
  - Many files with a fixed overall size:
104
107
 
105
- headson -N 1200 -f json -t strict logs/*.json
108
+ headson -C 1200 -f json -t strict logs/*.json
106
109
 
107
110
  - Glance at a file, JavaScript‑style comments for omissions:
108
111
 
109
- headson -n 400 -f json -t detailed data.json
112
+ headson -c 400 -f json -t detailed data.json
110
113
 
111
114
  - YAML with detailed comments:
112
115
 
113
- headson -n 400 -f yaml -t detailed config.yaml
116
+ headson -c 400 -f yaml -t detailed config.yaml
114
117
 
115
118
  ### Text mode
116
119
 
117
120
  - Single file (auto):
118
121
 
119
- headson -n 200 notes.txt
122
+ headson -c 200 notes.txt
120
123
 
121
124
  - Force Text ingest/output (useful when mixing with other extensions):
122
125
 
123
- headson -n 200 -i text -f text notes.txt
126
+ headson -c 200 -i text -f text notes.txt
124
127
 
125
128
  - Many text files (fileset):
126
129
 
127
- headson -n 800 -i text -f text logs/*.txt
130
+ headson -c 800 -i text -f text logs/*.txt
128
131
 
129
132
  - Styles on Text:
130
133
  - default: omission as a standalone `…` line.
@@ -135,6 +138,8 @@ Show help:
135
138
 
136
139
  headson --help
137
140
 
141
+ Note: flags align with head/tail conventions (`-c/--bytes`, `-C/--global-bytes`).
142
+
138
143
  ## Examples: head vs headson
139
144
 
140
145
  Input:
@@ -153,7 +158,7 @@ jq -c . users.json | head -c 80
153
158
  Structured preview with headson (JSON family, default style → Pseudo):
154
159
 
155
160
  ```bash
156
- headson -n 120 -f json -t default users.json
161
+ headson -c 120 -f json -t default users.json
157
162
  # {
158
163
  # users: [
159
164
  # { id: 1, name: "Ana", roles: [ "admin", … ] },
@@ -166,7 +171,7 @@ headson -n 120 -f json -t default users.json
166
171
  Machine‑readable preview (JSON family, strict style → strict JSON):
167
172
 
168
173
  ```bash
169
- headson -n 120 -f json -t strict users.json
174
+ headson -c 120 -f json -t strict users.json
170
175
  # {"users":[{"id":1,"name":"Ana","roles":["admin"]}],"meta":{"count":2}}
171
176
  ```
172
177
 
@@ -12,6 +12,9 @@ Available as:
12
12
  - CLI (see [Usage](#usage))
13
13
  - Python library (see [Python Bindings](#python-bindings))
14
14
 
15
+ ![Codecov](https://img.shields.io/codecov/c/github/kantord/headson?style=flat-square) ![Crates.io Version](https://img.shields.io/crates/v/headson?style=flat-square) ![PyPI - Version](https://img.shields.io/pypi/v/headson?style=flat-square)
16
+
17
+
15
18
  ## Install
16
19
 
17
20
  Using Cargo:
@@ -53,8 +56,8 @@ If you’re comfortable with tools like `head` and `tail`, use `headson` when yo
53
56
 
54
57
  Common flags:
55
58
 
56
- - `-n, --budget <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
57
- - `-N, --global-budget <BYTES>`: total output budget across all inputs. With `--budget`, the effective total is the smaller of the two.
59
+ - `-c, --bytes <BYTES>`: per‑file output budget. For multiple inputs, default total budget is `<BYTES> * number_of_inputs`.
60
+ - `-C, --global-bytes <BYTES>`: total output budget across all inputs. With `--bytes`, the effective total is the smaller of the two.
58
61
  - `-f, --format <auto|json|yaml|text>`: output format (default: `auto`).
59
62
  - Auto: stdin → JSON family; filesets → per‑file based on extension (`.json` → JSON family, `.yaml`/`.yml` → YAML, unknown → Text).
60
63
  - `-t, --template <strict|default|detailed>`: output style (default: `default`).
@@ -75,7 +78,7 @@ Notes:
75
78
  - With newlines enabled, file sections are rendered with human‑readable headers. In compact/single‑line modes, headers are omitted.
76
79
  - In `--format auto`, each file uses its own best format: JSON family for `.json`, YAML for `.yaml`/`.yml`.
77
80
  - Unknown extensions are treated as Text (raw lines) — safe for logs and `.txt` files.
78
- - `--global-budget` may truncate or omit entire files to respect the total budget.
81
+ - `--global-bytes` may truncate or omit entire files to respect the total budget.
79
82
  - The tool finds the largest preview that fits the budget; even if extremely tight, you still get a minimal, valid preview.
80
83
  - Directories and binary files are ignored; a notice is printed to stderr for each. Stdin reads the stream as‑is.
81
84
  - Head vs Tail sampling: these options bias which part of each array is kept before rendering. Display styles may still insert internal gap markers to honor very small budgets; strict JSON stays unannotated.
@@ -84,33 +87,33 @@ Quick one‑liners:
84
87
 
85
88
  - Peek a big JSON stream (keeps structure):
86
89
 
87
- zstdcat huge.json.zst | headson -n 800 -f json -t default
90
+ zstdcat huge.json.zst | headson -c 800 -f json -t default
88
91
 
89
92
  - Many files with a fixed overall size:
90
93
 
91
- headson -N 1200 -f json -t strict logs/*.json
94
+ headson -C 1200 -f json -t strict logs/*.json
92
95
 
93
96
  - Glance at a file, JavaScript‑style comments for omissions:
94
97
 
95
- headson -n 400 -f json -t detailed data.json
98
+ headson -c 400 -f json -t detailed data.json
96
99
 
97
100
  - YAML with detailed comments:
98
101
 
99
- headson -n 400 -f yaml -t detailed config.yaml
102
+ headson -c 400 -f yaml -t detailed config.yaml
100
103
 
101
104
  ### Text mode
102
105
 
103
106
  - Single file (auto):
104
107
 
105
- headson -n 200 notes.txt
108
+ headson -c 200 notes.txt
106
109
 
107
110
  - Force Text ingest/output (useful when mixing with other extensions):
108
111
 
109
- headson -n 200 -i text -f text notes.txt
112
+ headson -c 200 -i text -f text notes.txt
110
113
 
111
114
  - Many text files (fileset):
112
115
 
113
- headson -n 800 -i text -f text logs/*.txt
116
+ headson -c 800 -i text -f text logs/*.txt
114
117
 
115
118
  - Styles on Text:
116
119
  - default: omission as a standalone `…` line.
@@ -121,6 +124,8 @@ Show help:
121
124
 
122
125
  headson --help
123
126
 
127
+ Note: flags align with head/tail conventions (`-c/--bytes`, `-C/--global-bytes`).
128
+
124
129
  ## Examples: head vs headson
125
130
 
126
131
  Input:
@@ -139,7 +144,7 @@ jq -c . users.json | head -c 80
139
144
  Structured preview with headson (JSON family, default style → Pseudo):
140
145
 
141
146
  ```bash
142
- headson -n 120 -f json -t default users.json
147
+ headson -c 120 -f json -t default users.json
143
148
  # {
144
149
  # users: [
145
150
  # { id: 1, name: "Ana", roles: [ "admin", … ] },
@@ -152,7 +157,7 @@ headson -n 120 -f json -t default users.json
152
157
  Machine‑readable preview (JSON family, strict style → strict JSON):
153
158
 
154
159
  ```bash
155
- headson -n 120 -f json -t strict users.json
160
+ headson -c 120 -f json -t strict users.json
156
161
  # {"users":[{"id":1,"name":"Ana","roles":["admin"]}],"meta":{"count":2}}
157
162
  ```
158
163
 
Binary file
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "headson"
7
- version = "0.6.3"
7
+ version = "0.6.4"
8
8
  description = "Budget‑constrained JSON preview renderer (Python bindings)"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -214,7 +214,7 @@ dependencies = [
214
214
 
215
215
  [[package]]
216
216
  name = "headson"
217
- version = "0.6.3"
217
+ version = "0.6.4"
218
218
  dependencies = [
219
219
  "anyhow",
220
220
  "clap",
@@ -228,7 +228,7 @@ dependencies = [
228
228
 
229
229
  [[package]]
230
230
  name = "headson-python"
231
- version = "0.6.3"
231
+ version = "0.6.4"
232
232
  dependencies = [
233
233
  "anyhow",
234
234
  "headson",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "headson-python"
3
- version = "0.6.3"
3
+ version = "0.6.4"
4
4
  edition = "2021"
5
5
  publish = false
6
6
  readme = "README.md"
@@ -5,7 +5,7 @@ use std::cell::RefCell;
5
5
  use crate::order::NodeKind;
6
6
  use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
7
7
 
8
- use super::samplers::ArraySamplerKind;
8
+ use crate::ingest::sampling::ArraySamplerKind;
9
9
 
10
10
  #[derive(Default)]
11
11
  pub(crate) struct JsonTreeBuilder {
@@ -15,10 +15,7 @@ pub(crate) struct JsonTreeBuilder {
15
15
  }
16
16
 
17
17
  impl JsonTreeBuilder {
18
- pub(crate) fn new(
19
- array_cap: usize,
20
- sampler: super::samplers::ArraySamplerKind,
21
- ) -> Self {
18
+ pub(crate) fn new(array_cap: usize, sampler: ArraySamplerKind) -> Self {
22
19
  Self {
23
20
  arena: RefCell::new(JsonTreeArena::default()),
24
21
  array_cap,
@@ -1,24 +1,27 @@
1
1
  mod builder;
2
2
  mod samplers;
3
- use serde::de::DeserializeSeed;
4
3
 
5
- use crate::PriorityConfig;
6
- use crate::utils::tree_arena::JsonTreeArena;
7
4
  use anyhow::Result;
8
5
  use builder::JsonTreeBuilder;
6
+ use serde::de::DeserializeSeed;
7
+
8
+ use crate::PriorityConfig;
9
+ use crate::utils::tree_arena::JsonTreeArena as TreeArena;
10
+
11
+ use crate::ingest::Ingest;
9
12
 
10
13
  #[cfg(test)]
11
14
  pub fn build_json_tree_arena(
12
15
  input: &str,
13
16
  config: &PriorityConfig,
14
- ) -> Result<JsonTreeArena> {
17
+ ) -> Result<TreeArena> {
15
18
  build_json_tree_arena_from_bytes(input.as_bytes().to_vec(), config)
16
19
  }
17
20
 
18
21
  pub fn build_json_tree_arena_from_bytes(
19
22
  mut bytes: Vec<u8>,
20
23
  config: &PriorityConfig,
21
- ) -> Result<JsonTreeArena> {
24
+ ) -> Result<TreeArena> {
22
25
  let mut de = simd_json::Deserializer::from_slice(&mut bytes)?;
23
26
  let builder = JsonTreeBuilder::new(
24
27
  config.array_max_items,
@@ -36,7 +39,7 @@ pub fn build_json_tree_arena_from_bytes(
36
39
  pub fn build_json_tree_arena_from_many(
37
40
  mut inputs: Vec<(String, Vec<u8>)>,
38
41
  config: &PriorityConfig,
39
- ) -> Result<JsonTreeArena> {
42
+ ) -> Result<TreeArena> {
40
43
  let builder = JsonTreeBuilder::new(
41
44
  config.array_max_items,
42
45
  config.array_sampler.into(),
@@ -57,6 +60,38 @@ pub fn build_json_tree_arena_from_many(
57
60
  Ok(arena)
58
61
  }
59
62
 
63
+ /// JSON adapter for the ingest boundary. Delegates to the JSON builder to
64
+ /// produce the neutral `TreeArena`.
65
+ pub struct JsonIngest;
66
+
67
+ impl Ingest for JsonIngest {
68
+ fn parse_one(bytes: Vec<u8>, cfg: &PriorityConfig) -> Result<TreeArena> {
69
+ build_json_tree_arena_from_bytes(bytes, cfg)
70
+ }
71
+
72
+ fn parse_many(
73
+ inputs: Vec<(String, Vec<u8>)>,
74
+ cfg: &PriorityConfig,
75
+ ) -> Result<TreeArena> {
76
+ build_json_tree_arena_from_many(inputs, cfg)
77
+ }
78
+ }
79
+
80
+ /// Convenience functions for the JSON ingest path.
81
+ pub fn parse_json_one(
82
+ bytes: Vec<u8>,
83
+ cfg: &PriorityConfig,
84
+ ) -> Result<TreeArena> {
85
+ JsonIngest::parse_one(bytes, cfg)
86
+ }
87
+
88
+ pub fn parse_json_many(
89
+ inputs: Vec<(String, Vec<u8>)>,
90
+ cfg: &PriorityConfig,
91
+ ) -> Result<TreeArena> {
92
+ JsonIngest::parse_many(inputs, cfg)
93
+ }
94
+
60
95
  #[cfg(test)]
61
96
  mod tests {
62
97
  use super::*;
@@ -1,14 +1,12 @@
1
1
  use serde::de::{IgnoredAny, SeqAccess};
2
2
 
3
- use super::{JsonTreeBuilder, SampledArray};
3
+ use super::JsonTreeBuilder;
4
+ use super::SampledArray;
4
5
 
5
- // Tunable sampling constants for the default strategy.
6
+ // Default strategy phases: keep-first, greedy, then index-hash acceptance (~50%).
6
7
  const RANDOM_ACCEPT_SEED: u64 = 0x9e37_79b9_7f4a_7c15;
7
- // ~50% acceptance to thin remaining elements in the random phase.
8
- const RANDOM_ACCEPT_THRESHOLD: u32 = 0x8000_0000;
9
- // Keep a small, fixed number of items from the head before greedy/random phases.
8
+ const RANDOM_ACCEPT_THRESHOLD: u32 = 0x8000_0000; // ~50%
10
9
  const KEEP_FIRST_COUNT: usize = 3;
11
- // Take roughly half of the remaining capacity greedily after the first items.
12
10
  const GREEDY_PORTION_DIVISOR: usize = 2;
13
11
 
14
12
  struct PhaseState {
@@ -1,6 +1,7 @@
1
1
  use serde::de::{IgnoredAny, SeqAccess};
2
2
 
3
- use super::{JsonTreeBuilder, SampledArray};
3
+ use super::JsonTreeBuilder;
4
+ use super::SampledArray;
4
5
 
5
6
  fn parse_keep<'de, A>(
6
7
  seq: &mut A,
@@ -1,7 +1,7 @@
1
1
  use serde::de::SeqAccess;
2
2
 
3
- use crate::ArraySamplerStrategy;
4
- use crate::json_ingest::builder::JsonTreeBuilder;
3
+ use crate::ingest::formats::json::builder::JsonTreeBuilder;
4
+ use crate::ingest::sampling::ArraySamplerKind;
5
5
 
6
6
  #[derive(Debug)]
7
7
  pub(crate) struct SampledArray {
@@ -10,14 +10,6 @@ pub(crate) struct SampledArray {
10
10
  pub total_len: usize,
11
11
  }
12
12
 
13
- #[derive(Copy, Clone, Debug, Default)]
14
- pub(crate) enum ArraySamplerKind {
15
- #[default]
16
- Default,
17
- Head,
18
- Tail,
19
- }
20
-
21
13
  impl ArraySamplerKind {
22
14
  pub(crate) fn sample_stream<'de, A>(
23
15
  self,
@@ -38,16 +30,6 @@ impl ArraySamplerKind {
38
30
  }
39
31
  }
40
32
 
41
- impl From<ArraySamplerStrategy> for ArraySamplerKind {
42
- fn from(strategy: ArraySamplerStrategy) -> Self {
43
- match strategy {
44
- ArraySamplerStrategy::Default => ArraySamplerKind::Default,
45
- ArraySamplerStrategy::Head => ArraySamplerKind::Head,
46
- ArraySamplerStrategy::Tail => ArraySamplerKind::Tail,
47
- }
48
- }
49
- }
50
-
51
33
  mod default;
52
34
  mod head;
53
35
  mod tail;
@@ -1,6 +1,7 @@
1
1
  use serde::de::{IgnoredAny, SeqAccess};
2
2
 
3
- use super::{JsonTreeBuilder, SampledArray};
3
+ use super::JsonTreeBuilder;
4
+ use super::SampledArray;
4
5
 
5
6
  pub(crate) fn sample_stream<'de, A>(
6
7
  seq: &mut A,
@@ -89,8 +90,10 @@ mod tests {
89
90
  let mut cfg = PriorityConfig::new(usize::MAX, 5);
90
91
  cfg.array_sampler = crate::ArraySamplerStrategy::Tail;
91
92
  let arena =
92
- crate::json_ingest::build_json_tree_arena_from_bytes(input, &cfg)
93
- .expect("arena");
93
+ crate::ingest::formats::json::build_json_tree_arena_from_bytes(
94
+ input, &cfg,
95
+ )
96
+ .expect("arena");
94
97
  let root = &arena.nodes[arena.root_id];
95
98
  assert_eq!(root.children_len, 5, "kept 5");
96
99
  let mut orig_indices = Vec::new();
@@ -0,0 +1,9 @@
1
+ // File-format specific ingest adapters live under this module.
2
+ pub mod json;
3
+ pub mod text;
4
+ pub mod yaml;
5
+
6
+ // Re-export commonly used helpers for convenience
7
+ pub use json::{parse_json_many, parse_json_one};
8
+ pub use text::{parse_text_many, parse_text_one};
9
+ pub use yaml::{parse_yaml_many, parse_yaml_one};
@@ -5,6 +5,9 @@ use crate::PriorityConfig;
5
5
  use crate::order::NodeKind;
6
6
  use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
7
7
 
8
+ use crate::ingest::Ingest;
9
+ use crate::ingest::sampling::{ArraySamplerKind, choose_indices};
10
+
8
11
  fn normalize_newlines(s: &str) -> Cow<'_, str> {
9
12
  // Normalize CRLF and CR to LF in a single allocation when needed.
10
13
  if s.as_bytes().contains(&b'\r') {
@@ -18,13 +21,15 @@ fn normalize_newlines(s: &str) -> Cow<'_, str> {
18
21
  struct TextArenaBuilder {
19
22
  arena: JsonTreeArena,
20
23
  array_cap: usize,
24
+ sampler: ArraySamplerKind,
21
25
  }
22
26
 
23
27
  impl TextArenaBuilder {
24
- fn new(array_cap: usize) -> Self {
28
+ fn new(array_cap: usize, sampler: ArraySamplerKind) -> Self {
25
29
  Self {
26
30
  arena: JsonTreeArena::default(),
27
31
  array_cap,
32
+ sampler,
28
33
  }
29
34
  }
30
35
 
@@ -48,26 +53,38 @@ impl TextArenaBuilder {
48
53
 
49
54
  fn push_array_of_lines(
50
55
  &mut self,
51
- lines: impl IntoIterator<Item = String>,
56
+ lines: &[String],
52
57
  total: usize,
53
58
  ) -> usize {
54
59
  let id = self.push_default();
55
- let kept = total.min(self.array_cap);
60
+ let idxs = choose_indices(self.sampler, total, self.array_cap);
61
+ let kept = idxs.len().min(self.array_cap);
56
62
  let mut pushed = 0usize;
57
- for (i, line) in lines.into_iter().enumerate() {
58
- if i >= kept {
59
- break;
63
+ for (i, &orig_index) in idxs.iter().take(kept).enumerate() {
64
+ if let Some(line) = lines.get(orig_index) {
65
+ let child = self.push_string(line.clone());
66
+ self.arena.children.push(child);
67
+ pushed = i + 1;
60
68
  }
61
- let child = self.push_string(line);
62
- self.arena.children.push(child);
63
- pushed += 1;
64
69
  }
65
70
  let n = &mut self.arena.nodes[id];
66
71
  n.kind = NodeKind::Array;
67
- // children for this array were appended after previous nodes; compute start = len(children) - pushed
68
72
  n.children_start = self.arena.children.len().saturating_sub(pushed);
69
73
  n.children_len = pushed;
70
74
  n.array_len = Some(total);
75
+ // Store arr_indices when not contiguous head prefix
76
+ let contiguous =
77
+ idxs.iter().take(kept).enumerate().all(|(i, &idx)| i == idx);
78
+ if pushed == 0 || contiguous {
79
+ n.arr_indices_start = 0;
80
+ n.arr_indices_len = 0;
81
+ } else {
82
+ let start = self.arena.arr_indices.len();
83
+ self.arena.arr_indices.extend(idxs.into_iter().take(kept));
84
+ let len = self.arena.arr_indices.len().saturating_sub(start);
85
+ n.arr_indices_start = start;
86
+ n.arr_indices_len = len.min(pushed);
87
+ }
71
88
  id
72
89
  }
73
90
 
@@ -110,8 +127,11 @@ pub fn build_text_tree_arena_from_bytes(
110
127
  .map(std::string::ToString::to_string)
111
128
  .collect();
112
129
  let total = lines_vec.len();
113
- let mut b = TextArenaBuilder::new(config.array_max_items);
114
- let root_id = b.push_array_of_lines(lines_vec, total);
130
+ let mut b = TextArenaBuilder::new(
131
+ config.array_max_items,
132
+ config.array_sampler.into(),
133
+ );
134
+ let root_id = b.push_array_of_lines(&lines_vec, total);
115
135
  let mut a = b.finish();
116
136
  a.root_id = root_id;
117
137
  Ok(a)
@@ -125,7 +145,10 @@ pub fn build_text_tree_arena_from_many(
125
145
  mut inputs: Vec<(String, Vec<u8>)>,
126
146
  config: &PriorityConfig,
127
147
  ) -> Result<JsonTreeArena> {
128
- let mut b = TextArenaBuilder::new(config.array_max_items);
148
+ let mut b = TextArenaBuilder::new(
149
+ config.array_max_items,
150
+ config.array_sampler.into(),
151
+ );
129
152
  let mut keys: Vec<String> = Vec::with_capacity(inputs.len());
130
153
  let mut children_ids: Vec<usize> = Vec::with_capacity(inputs.len());
131
154
  for (key, bytes) in inputs.drain(..) {
@@ -136,7 +159,7 @@ pub fn build_text_tree_arena_from_many(
136
159
  .map(std::string::ToString::to_string)
137
160
  .collect();
138
161
  let total = lines_vec.len();
139
- let child_id = b.push_array_of_lines(lines_vec, total);
162
+ let child_id = b.push_array_of_lines(&lines_vec, total);
140
163
  keys.push(key);
141
164
  children_ids.push(child_id);
142
165
  }
@@ -147,6 +170,39 @@ pub fn build_text_tree_arena_from_many(
147
170
  Ok(a)
148
171
  }
149
172
 
173
+ pub struct TextIngest;
174
+
175
+ impl Ingest for TextIngest {
176
+ fn parse_one(
177
+ bytes: Vec<u8>,
178
+ cfg: &PriorityConfig,
179
+ ) -> Result<JsonTreeArena> {
180
+ build_text_tree_arena_from_bytes(bytes, cfg)
181
+ }
182
+
183
+ fn parse_many(
184
+ inputs: Vec<(String, Vec<u8>)>,
185
+ cfg: &PriorityConfig,
186
+ ) -> Result<JsonTreeArena> {
187
+ build_text_tree_arena_from_many(inputs, cfg)
188
+ }
189
+ }
190
+
191
+ /// Convenience functions for the Text ingest path.
192
+ pub fn parse_text_one(
193
+ bytes: Vec<u8>,
194
+ cfg: &PriorityConfig,
195
+ ) -> Result<JsonTreeArena> {
196
+ TextIngest::parse_one(bytes, cfg)
197
+ }
198
+
199
+ pub fn parse_text_many(
200
+ inputs: Vec<(String, Vec<u8>)>,
201
+ cfg: &PriorityConfig,
202
+ ) -> Result<JsonTreeArena> {
203
+ TextIngest::parse_many(inputs, cfg)
204
+ }
205
+
150
206
  #[cfg(test)]
151
207
  mod tests {
152
208
  use crate::{
@@ -189,4 +245,30 @@ mod tests {
189
245
  let out = headson_text(input.into_bytes(), &cfg, &prio, 20).unwrap();
190
246
  assert!(out.contains("…\n"));
191
247
  }
248
+
249
+ #[test]
250
+ fn tail_sampler_keeps_last_n_indices_text() {
251
+ // Build 10 lines; with array_max_items=5 and tail sampler we should keep last 5
252
+ let lines = (0..10)
253
+ .map(|i| i.to_string())
254
+ .collect::<Vec<_>>()
255
+ .join("\n");
256
+ let mut cfg = PriorityConfig::new(usize::MAX, 5);
257
+ cfg.array_sampler = crate::ArraySamplerStrategy::Tail;
258
+ let arena =
259
+ super::build_text_tree_arena_from_bytes(lines.into_bytes(), &cfg)
260
+ .expect("arena");
261
+ let root = &arena.nodes[arena.root_id];
262
+ assert_eq!(root.children_len, 5, "kept 5");
263
+ let mut orig_indices = Vec::new();
264
+ for i in 0..root.children_len {
265
+ let oi = if root.arr_indices_len > 0 {
266
+ arena.arr_indices[root.arr_indices_start + i]
267
+ } else {
268
+ i
269
+ };
270
+ orig_indices.push(oi);
271
+ }
272
+ assert_eq!(orig_indices, vec![5, 6, 7, 8, 9]);
273
+ }
192
274
  }