gcf-python 0.4.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gcf_python-0.4.0 → gcf_python-0.5.1}/CHANGELOG.md +6 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/PKG-INFO +21 -48
- {gcf_python-0.4.0 → gcf_python-0.5.1}/README.md +20 -47
- {gcf_python-0.4.0 → gcf_python-0.5.1}/pyproject.toml +1 -1
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/__init__.py +4 -0
- gcf_python-0.5.1/src/gcf/decode_generic.py +255 -0
- gcf_python-0.5.1/src/gcf/stream_generic.py +111 -0
- gcf_python-0.5.1/tests/test_stream_generic.py +126 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/.github/workflows/ci.yml +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/.github/workflows/publish.yml +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/.gitignore +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/LICENSE +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/cli.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/constants.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/decode.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/delta.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/encode.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/generic.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/session.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/stream.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/src/gcf/types.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/__init__.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/test_decode.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/test_delta.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/test_encode.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/test_generic.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/test_roundtrip.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/test_session.py +0 -0
- {gcf_python-0.4.0 → gcf_python-0.5.1}/tests/test_stream.py +0 -0
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## v0.5.0 (2026-06-06)
|
|
4
|
+
|
|
5
|
+
- `GenericStreamEncoder`: zero-buffering tabular streaming encode (begin_array/write_row/end_array/write_kv/write_section/write_inline_array)
|
|
6
|
+
- `decode_generic`: decode any GCF text (tabular or graph) back to Python objects
|
|
7
|
+
- `StreamEncoder`: zero-buffering streaming encode (added in v0.4.0)
|
|
8
|
+
|
|
3
9
|
## v0.3.0 (2026-06-05)
|
|
4
10
|
|
|
5
11
|
- `encode_generic`: primitive arrays inlined as `name[N]: val1,val2,val3`
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gcf-python
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Python implementation of GCF (Graph Compact Format): token-optimized wire format for LLM tool responses
|
|
5
5
|
Project-URL: Homepage, https://github.com/blackwell-systems/gcf-python
|
|
6
6
|
Project-URL: Documentation, https://blackwell-systems.github.io/gcf/
|
|
@@ -30,9 +30,9 @@ Description-Content-Type: text/markdown
|
|
|
30
30
|
|
|
31
31
|
# gcf-python
|
|
32
32
|
|
|
33
|
-
Python implementation of [GCF
|
|
33
|
+
Python implementation of [GCF](https://gcformat.com/) — the most token-efficient wire format for LLMs. A drop-in alternative to JSON and TOON for any structured data.
|
|
34
34
|
|
|
35
|
-
**79% fewer input tokens than JSON.
|
|
35
|
+
**79% fewer input tokens than JSON. 63% fewer output tokens. 90.5% average comprehension accuracy across 10 models and 3 providers (four models hit 100%). 1,300+ LLM evaluations. Zero training.**
|
|
36
36
|
|
|
37
37
|
Docs: [gcformat.com](https://gcformat.com/) · [Playground](https://gcformat.com/playground.html) · [GCF vs TOON](https://gcformat.com/guide/vs-toon.html)
|
|
38
38
|
|
|
@@ -66,33 +66,21 @@ Payload: 50 symbols, 20 edges
|
|
|
66
66
|
### Quick Start
|
|
67
67
|
|
|
68
68
|
```python
|
|
69
|
-
from gcf import
|
|
69
|
+
from gcf import encode_generic
|
|
70
70
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
symbols=[
|
|
76
|
-
Symbol(qualified_name="pkg.AuthMiddleware", kind="function", score=0.78, provenance="lsp_resolved", distance=0),
|
|
77
|
-
Symbol(qualified_name="pkg.NewServer", kind="function", score=0.54, provenance="lsp_resolved", distance=1),
|
|
78
|
-
],
|
|
79
|
-
edges=[
|
|
80
|
-
Edge(source="pkg.NewServer", target="pkg.AuthMiddleware", edge_type="calls"),
|
|
71
|
+
output = encode_generic({
|
|
72
|
+
"employees": [
|
|
73
|
+
{"id": 1, "name": "Alice", "department": "Engineering", "salary": 95000},
|
|
74
|
+
{"id": 2, "name": "Bob", "department": "Sales", "salary": 72000},
|
|
81
75
|
],
|
|
82
|
-
)
|
|
83
|
-
|
|
84
|
-
output = encode(p)
|
|
76
|
+
})
|
|
85
77
|
```
|
|
86
78
|
|
|
87
79
|
Output:
|
|
88
80
|
```
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
## related
|
|
93
|
-
@1 fn pkg.NewServer 0.54 lsp_resolved
|
|
94
|
-
## edges [1]
|
|
95
|
-
@0<@1 calls
|
|
81
|
+
## employees [2]{id,name,department,salary}
|
|
82
|
+
1|Alice|Engineering|95000
|
|
83
|
+
2|Bob|Sales|72000
|
|
96
84
|
```
|
|
97
85
|
|
|
98
86
|
## Decode
|
|
@@ -216,33 +204,18 @@ Works on dicts, lists, and primitives. Lists of uniform dicts get tabular rows.
|
|
|
216
204
|
| `Session` | Thread-safe tracker for multi-call deduplication |
|
|
217
205
|
| `KIND_ABBREV` / `KIND_EXPAND` | Bidirectional kind abbreviation dicts |
|
|
218
206
|
|
|
219
|
-
##
|
|
220
|
-
|
|
221
|
-
Rigorous 3-way benchmark (GCF vs TOON vs JSON) at 500 symbols, 200 edges. 13 structured extraction questions sent to an LLM with zero format instructions:
|
|
222
|
-
|
|
223
|
-
| Format | Accuracy | Tokens | vs JSON |
|
|
224
|
-
|--------|----------|--------|---------|
|
|
225
|
-
| **GCF** | **100%** (13/13) | **11,090** | **79% fewer** |
|
|
226
|
-
| TOON | 92.3% (12/13) | 16,378 | 69% fewer |
|
|
227
|
-
| JSON | 76.9% (10/13) | 53,341 | baseline |
|
|
228
|
-
|
|
229
|
-
GCF is the only format with perfect accuracy at scale, at 32% fewer tokens than TOON.
|
|
230
|
-
|
|
231
|
-
Reproduce: `git clone https://github.com/blackwell-systems/gcf-go && cd gcf-go/eval && GOWORK=off go test -run TestComprehension -v -timeout 0`
|
|
232
|
-
|
|
233
|
-
## Token Efficiency (TOON's Own Benchmark)
|
|
234
|
-
|
|
235
|
-
Running [TOON's benchmark harness](https://github.com/blackwell-systems/toon/tree/gcf-comparison) with GCF inserted (their datasets, their tokenizer):
|
|
207
|
+
## Benchmarks
|
|
236
208
|
|
|
237
|
-
|
|
238
|
-
|-------|-----|------|--------|
|
|
239
|
-
| Mixed-structure (nested, semi-uniform) | 170,367 | 227,896 | **GCF 34% smaller** |
|
|
240
|
-
| Flat-only (tabular) | 66,029 | 67,837 | **GCF 3% smaller** |
|
|
241
|
-
| Semi-uniform event logs | 108,158 | 154,032 | **GCF 42% smaller** |
|
|
209
|
+
1,300+ LLM evaluations across 10 models, 3 providers, and 51 independent test runs.
|
|
242
210
|
|
|
243
|
-
|
|
211
|
+
| | GCF | TOON | JSON |
|
|
212
|
+
|---|---|---|---|
|
|
213
|
+
| **Comprehension** (23 runs, 10 models) | **90.5%** | 68.5% | 53.6% |
|
|
214
|
+
| **Generation** (28 runs, 9 models) | **5/5** | 1.0/5 | 5.0/5 |
|
|
215
|
+
| **Input tokens** (500 symbols) | **11,090** | 16,378 | 53,341 |
|
|
216
|
+
| **Output tokens** (100 symbols) | **5,976** | 8,937 | 16,121 |
|
|
244
217
|
|
|
245
|
-
|
|
218
|
+
GCF wins all 6 datasets on [TOON's own benchmark](https://github.com/blackwell-systems/toon/tree/gcf-comparison). Full results: [gcformat.com/guide/benchmarks](https://gcformat.com/guide/benchmarks.html)
|
|
246
219
|
|
|
247
220
|
## Links
|
|
248
221
|
|
|
@@ -5,9 +5,9 @@
|
|
|
5
5
|
|
|
6
6
|
# gcf-python
|
|
7
7
|
|
|
8
|
-
Python implementation of [GCF
|
|
8
|
+
Python implementation of [GCF](https://gcformat.com/) — the most token-efficient wire format for LLMs. A drop-in alternative to JSON and TOON for any structured data.
|
|
9
9
|
|
|
10
|
-
**79% fewer input tokens than JSON.
|
|
10
|
+
**79% fewer input tokens than JSON. 63% fewer output tokens. 90.5% average comprehension accuracy across 10 models and 3 providers (four models hit 100%). 1,300+ LLM evaluations. Zero training.**
|
|
11
11
|
|
|
12
12
|
Docs: [gcformat.com](https://gcformat.com/) · [Playground](https://gcformat.com/playground.html) · [GCF vs TOON](https://gcformat.com/guide/vs-toon.html)
|
|
13
13
|
|
|
@@ -41,33 +41,21 @@ Payload: 50 symbols, 20 edges
|
|
|
41
41
|
### Quick Start
|
|
42
42
|
|
|
43
43
|
```python
|
|
44
|
-
from gcf import
|
|
44
|
+
from gcf import encode_generic
|
|
45
45
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
symbols=[
|
|
51
|
-
Symbol(qualified_name="pkg.AuthMiddleware", kind="function", score=0.78, provenance="lsp_resolved", distance=0),
|
|
52
|
-
Symbol(qualified_name="pkg.NewServer", kind="function", score=0.54, provenance="lsp_resolved", distance=1),
|
|
53
|
-
],
|
|
54
|
-
edges=[
|
|
55
|
-
Edge(source="pkg.NewServer", target="pkg.AuthMiddleware", edge_type="calls"),
|
|
46
|
+
output = encode_generic({
|
|
47
|
+
"employees": [
|
|
48
|
+
{"id": 1, "name": "Alice", "department": "Engineering", "salary": 95000},
|
|
49
|
+
{"id": 2, "name": "Bob", "department": "Sales", "salary": 72000},
|
|
56
50
|
],
|
|
57
|
-
)
|
|
58
|
-
|
|
59
|
-
output = encode(p)
|
|
51
|
+
})
|
|
60
52
|
```
|
|
61
53
|
|
|
62
54
|
Output:
|
|
63
55
|
```
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
## related
|
|
68
|
-
@1 fn pkg.NewServer 0.54 lsp_resolved
|
|
69
|
-
## edges [1]
|
|
70
|
-
@0<@1 calls
|
|
56
|
+
## employees [2]{id,name,department,salary}
|
|
57
|
+
1|Alice|Engineering|95000
|
|
58
|
+
2|Bob|Sales|72000
|
|
71
59
|
```
|
|
72
60
|
|
|
73
61
|
## Decode
|
|
@@ -191,33 +179,18 @@ Works on dicts, lists, and primitives. Lists of uniform dicts get tabular rows.
|
|
|
191
179
|
| `Session` | Thread-safe tracker for multi-call deduplication |
|
|
192
180
|
| `KIND_ABBREV` / `KIND_EXPAND` | Bidirectional kind abbreviation dicts |
|
|
193
181
|
|
|
194
|
-
##
|
|
195
|
-
|
|
196
|
-
Rigorous 3-way benchmark (GCF vs TOON vs JSON) at 500 symbols, 200 edges. 13 structured extraction questions sent to an LLM with zero format instructions:
|
|
197
|
-
|
|
198
|
-
| Format | Accuracy | Tokens | vs JSON |
|
|
199
|
-
|--------|----------|--------|---------|
|
|
200
|
-
| **GCF** | **100%** (13/13) | **11,090** | **79% fewer** |
|
|
201
|
-
| TOON | 92.3% (12/13) | 16,378 | 69% fewer |
|
|
202
|
-
| JSON | 76.9% (10/13) | 53,341 | baseline |
|
|
203
|
-
|
|
204
|
-
GCF is the only format with perfect accuracy at scale, at 32% fewer tokens than TOON.
|
|
205
|
-
|
|
206
|
-
Reproduce: `git clone https://github.com/blackwell-systems/gcf-go && cd gcf-go/eval && GOWORK=off go test -run TestComprehension -v -timeout 0`
|
|
207
|
-
|
|
208
|
-
## Token Efficiency (TOON's Own Benchmark)
|
|
209
|
-
|
|
210
|
-
Running [TOON's benchmark harness](https://github.com/blackwell-systems/toon/tree/gcf-comparison) with GCF inserted (their datasets, their tokenizer):
|
|
182
|
+
## Benchmarks
|
|
211
183
|
|
|
212
|
-
|
|
213
|
-
|-------|-----|------|--------|
|
|
214
|
-
| Mixed-structure (nested, semi-uniform) | 170,367 | 227,896 | **GCF 34% smaller** |
|
|
215
|
-
| Flat-only (tabular) | 66,029 | 67,837 | **GCF 3% smaller** |
|
|
216
|
-
| Semi-uniform event logs | 108,158 | 154,032 | **GCF 42% smaller** |
|
|
184
|
+
1,300+ LLM evaluations across 10 models, 3 providers, and 51 independent test runs.
|
|
217
185
|
|
|
218
|
-
|
|
186
|
+
| | GCF | TOON | JSON |
|
|
187
|
+
|---|---|---|---|
|
|
188
|
+
| **Comprehension** (23 runs, 10 models) | **90.5%** | 68.5% | 53.6% |
|
|
189
|
+
| **Generation** (28 runs, 9 models) | **5/5** | 1.0/5 | 5.0/5 |
|
|
190
|
+
| **Input tokens** (500 symbols) | **11,090** | 16,378 | 53,341 |
|
|
191
|
+
| **Output tokens** (100 symbols) | **5,976** | 8,937 | 16,121 |
|
|
219
192
|
|
|
220
|
-
|
|
193
|
+
GCF wins all 6 datasets on [TOON's own benchmark](https://github.com/blackwell-systems/toon/tree/gcf-comparison). Full results: [gcformat.com/guide/benchmarks](https://gcformat.com/guide/benchmarks.html)
|
|
221
194
|
|
|
222
195
|
## Links
|
|
223
196
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gcf-python"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.5.1"
|
|
8
8
|
description = "Python implementation of GCF (Graph Compact Format): token-optimized wire format for LLM tool responses"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -40,13 +40,16 @@ from .delta import encode_delta
|
|
|
40
40
|
from .encode import encode
|
|
41
41
|
from .generic import encode_generic
|
|
42
42
|
from .session import Session, encode_with_session
|
|
43
|
+
from .decode_generic import decode_generic
|
|
43
44
|
from .stream import StreamEncoder
|
|
45
|
+
from .stream_generic import GenericStreamEncoder
|
|
44
46
|
from .types import Components, DeltaPayload, Edge, Payload, Symbol
|
|
45
47
|
|
|
46
48
|
__all__ = [
|
|
47
49
|
"Components",
|
|
48
50
|
"DecodeError",
|
|
49
51
|
"DeltaPayload",
|
|
52
|
+
"GenericStreamEncoder",
|
|
50
53
|
"Edge",
|
|
51
54
|
"KIND_ABBREV",
|
|
52
55
|
"KIND_EXPAND",
|
|
@@ -55,6 +58,7 @@ __all__ = [
|
|
|
55
58
|
"StreamEncoder",
|
|
56
59
|
"Symbol",
|
|
57
60
|
"decode",
|
|
61
|
+
"decode_generic",
|
|
58
62
|
"encode",
|
|
59
63
|
"encode_delta",
|
|
60
64
|
"encode_generic",
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""GCF generic decoder: parses any GCF text (tabular or graph) back to Python objects."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .decode import decode
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def decode_generic(input_text: str) -> Any:
    """Turn GCF text into plain Python dicts, lists and primitives.

    Trailing newlines and carriage returns are stripped first; if nothing
    is left the result is ``None``.  Text whose first line starts with
    ``"GCF "`` is handed to the graph-profile ``decode`` and the resulting
    payload is re-expressed as a dict with camelCase keys.  Anything else
    is parsed as a generic object (tabular arrays, key=value pairs, nested
    sections, inline primitive arrays) and returned as a dict.
    """
    text = input_text.rstrip("\n\r")
    if text == "":
        return None

    all_lines = text.split("\n")

    if not all_lines[0].startswith("GCF "):
        parsed: dict[str, Any] = {}
        _parse_object(all_lines, 0, 0, parsed)
        return parsed

    # Graph profile: delegate to the dedicated decoder, then flatten the
    # payload object into ordinary containers.
    payload = decode(text)
    graph: dict[str, Any] = {
        "tool": payload.tool,
        "tokenBudget": payload.token_budget,
        "tokensUsed": payload.tokens_used,
        "packRoot": payload.pack_root,
    }

    symbol_dicts = []
    for symbol in payload.symbols:
        symbol_dicts.append(
            {
                "qualifiedName": symbol.qualified_name,
                "kind": symbol.kind,
                "score": symbol.score,
                "provenance": symbol.provenance,
                "distance": symbol.distance,
            }
        )
    graph["symbols"] = symbol_dicts

    edge_dicts = []
    for edge in payload.edges:
        entry = {
            "source": edge.source,
            "target": edge.target,
            "edgeType": edge.edge_type,
        }
        # "status" is only present when the edge carries a truthy status.
        if edge.status:
            entry["status"] = edge.status
        edge_dicts.append(entry)
    graph["edges"] = edge_dicts

    return graph
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _parse_object(lines: list[str], start: int, depth: int, out: dict[str, Any]) -> int:
|
|
59
|
+
indent = " " * depth
|
|
60
|
+
i = start
|
|
61
|
+
|
|
62
|
+
while i < len(lines):
|
|
63
|
+
raw = lines[i].rstrip("\r")
|
|
64
|
+
if raw == "" or raw.startswith("# "):
|
|
65
|
+
i += 1
|
|
66
|
+
continue
|
|
67
|
+
|
|
68
|
+
if depth > 0 and not raw.startswith(indent):
|
|
69
|
+
break
|
|
70
|
+
|
|
71
|
+
content = raw[len(indent):] if depth > 0 else raw
|
|
72
|
+
|
|
73
|
+
if content.startswith("## _summary"):
|
|
74
|
+
i += 1
|
|
75
|
+
continue
|
|
76
|
+
|
|
77
|
+
if content.startswith("## "):
|
|
78
|
+
header = content[3:]
|
|
79
|
+
bracket_idx = header.find(" [")
|
|
80
|
+
|
|
81
|
+
if bracket_idx >= 0:
|
|
82
|
+
name = header[:bracket_idx]
|
|
83
|
+
rest = header[bracket_idx + 2:]
|
|
84
|
+
close_bracket = rest.find("]")
|
|
85
|
+
|
|
86
|
+
if close_bracket >= 0:
|
|
87
|
+
after_bracket = rest[close_bracket + 1:]
|
|
88
|
+
|
|
89
|
+
if after_bracket.startswith("{"):
|
|
90
|
+
field_end = after_bracket.find("}")
|
|
91
|
+
if field_end >= 0:
|
|
92
|
+
fields = after_bracket[1:field_end].split(",")
|
|
93
|
+
i += 1
|
|
94
|
+
rows, consumed = _parse_tabular_rows(lines, i, depth, fields)
|
|
95
|
+
out[name] = rows
|
|
96
|
+
i += consumed
|
|
97
|
+
continue
|
|
98
|
+
else:
|
|
99
|
+
count_str = rest[:close_bracket]
|
|
100
|
+
if count_str == "0":
|
|
101
|
+
out[name] = []
|
|
102
|
+
i += 1
|
|
103
|
+
continue
|
|
104
|
+
i += 1
|
|
105
|
+
items, consumed = _parse_non_uniform_array(lines, i, depth)
|
|
106
|
+
out[name] = items
|
|
107
|
+
i += consumed
|
|
108
|
+
continue
|
|
109
|
+
|
|
110
|
+
name = header
|
|
111
|
+
bi = name.find(" [")
|
|
112
|
+
if bi >= 0:
|
|
113
|
+
name = name[:bi]
|
|
114
|
+
i += 1
|
|
115
|
+
nested: dict[str, Any] = {}
|
|
116
|
+
consumed = _parse_object(lines, i, depth + 1, nested)
|
|
117
|
+
out[name] = nested
|
|
118
|
+
i += consumed
|
|
119
|
+
continue
|
|
120
|
+
|
|
121
|
+
# Inline primitive array.
|
|
122
|
+
bracket_idx = content.find("[")
|
|
123
|
+
if bracket_idx > 0:
|
|
124
|
+
colon_idx = content.find("]: ")
|
|
125
|
+
if colon_idx > bracket_idx:
|
|
126
|
+
name = content[:bracket_idx]
|
|
127
|
+
vals_str = content[colon_idx + 3:]
|
|
128
|
+
out[name] = [_parse_value(v.strip()) for v in vals_str.split(",")]
|
|
129
|
+
i += 1
|
|
130
|
+
continue
|
|
131
|
+
|
|
132
|
+
# Key=value.
|
|
133
|
+
eq_idx = content.find("=")
|
|
134
|
+
if eq_idx > 0:
|
|
135
|
+
key = content[:eq_idx]
|
|
136
|
+
val = content[eq_idx + 1:]
|
|
137
|
+
out[key] = _parse_value(val)
|
|
138
|
+
i += 1
|
|
139
|
+
continue
|
|
140
|
+
|
|
141
|
+
i += 1
|
|
142
|
+
|
|
143
|
+
return i - start
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _parse_tabular_rows(
|
|
147
|
+
lines: list[str], start: int, depth: int, fields: list[str]
|
|
148
|
+
) -> tuple[list[Any], int]:
|
|
149
|
+
indent = " " * depth
|
|
150
|
+
rows: list[Any] = []
|
|
151
|
+
i = start
|
|
152
|
+
|
|
153
|
+
while i < len(lines):
|
|
154
|
+
raw = lines[i].rstrip("\r")
|
|
155
|
+
if raw == "":
|
|
156
|
+
i += 1
|
|
157
|
+
continue
|
|
158
|
+
|
|
159
|
+
if depth > 0 and not raw.startswith(indent):
|
|
160
|
+
break
|
|
161
|
+
content = raw[len(indent):] if depth > 0 else raw
|
|
162
|
+
|
|
163
|
+
if content.startswith("## "):
|
|
164
|
+
break
|
|
165
|
+
if content.startswith("# "):
|
|
166
|
+
i += 1
|
|
167
|
+
continue
|
|
168
|
+
|
|
169
|
+
row_data = content
|
|
170
|
+
has_nested = False
|
|
171
|
+
if row_data.startswith("@"):
|
|
172
|
+
sp = row_data.find(" ")
|
|
173
|
+
if sp > 0:
|
|
174
|
+
row_data = row_data[sp + 1:]
|
|
175
|
+
has_nested = True
|
|
176
|
+
|
|
177
|
+
vals = row_data.split("|")
|
|
178
|
+
row: dict[str, Any] = {}
|
|
179
|
+
for j, f in enumerate(fields):
|
|
180
|
+
row[f] = _parse_value(vals[j]) if j < len(vals) else None
|
|
181
|
+
|
|
182
|
+
i += 1
|
|
183
|
+
|
|
184
|
+
if has_nested:
|
|
185
|
+
nested_indent = indent + " "
|
|
186
|
+
while i < len(lines):
|
|
187
|
+
nl = lines[i].rstrip("\r")
|
|
188
|
+
if not nl.startswith(nested_indent):
|
|
189
|
+
break
|
|
190
|
+
nc = nl[len(nested_indent):]
|
|
191
|
+
|
|
192
|
+
if nc.startswith("."):
|
|
193
|
+
field_name = nc[1:]
|
|
194
|
+
i += 1
|
|
195
|
+
nested: dict[str, Any] = {}
|
|
196
|
+
consumed = _parse_object(lines, i, depth + 2, nested)
|
|
197
|
+
row[field_name] = nested
|
|
198
|
+
i += consumed
|
|
199
|
+
else:
|
|
200
|
+
break
|
|
201
|
+
|
|
202
|
+
rows.append(row)
|
|
203
|
+
|
|
204
|
+
return rows, i - start
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _parse_non_uniform_array(
|
|
208
|
+
lines: list[str], start: int, depth: int
|
|
209
|
+
) -> tuple[list[Any], int]:
|
|
210
|
+
indent = " " * depth
|
|
211
|
+
items: list[Any] = []
|
|
212
|
+
i = start
|
|
213
|
+
|
|
214
|
+
while i < len(lines):
|
|
215
|
+
raw = lines[i].rstrip("\r")
|
|
216
|
+
if raw == "":
|
|
217
|
+
i += 1
|
|
218
|
+
continue
|
|
219
|
+
if depth > 0 and not raw.startswith(indent):
|
|
220
|
+
break
|
|
221
|
+
content = raw[len(indent):] if depth > 0 else raw
|
|
222
|
+
if content.startswith("## "):
|
|
223
|
+
break
|
|
224
|
+
|
|
225
|
+
if content.startswith("@"):
|
|
226
|
+
sp = content.find(" ")
|
|
227
|
+
if sp > 0:
|
|
228
|
+
items.append(_parse_value(content[sp + 1:]))
|
|
229
|
+
i += 1
|
|
230
|
+
else:
|
|
231
|
+
break
|
|
232
|
+
|
|
233
|
+
return items, i - start
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _parse_value(s: str) -> Any:
|
|
237
|
+
if s == "-":
|
|
238
|
+
return None
|
|
239
|
+
if s == "true":
|
|
240
|
+
return True
|
|
241
|
+
if s == "false":
|
|
242
|
+
return False
|
|
243
|
+
if s == '""':
|
|
244
|
+
return ""
|
|
245
|
+
if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
|
|
246
|
+
return s[1:-1].replace('\\"', '"').replace("\\\\", "\\")
|
|
247
|
+
try:
|
|
248
|
+
return int(s)
|
|
249
|
+
except ValueError:
|
|
250
|
+
pass
|
|
251
|
+
try:
|
|
252
|
+
return float(s)
|
|
253
|
+
except ValueError:
|
|
254
|
+
pass
|
|
255
|
+
return s
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""GCF generic streaming encoder: zero-buffering tabular encode to any writable."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
from typing import Any, Sequence
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class GenericStreamEncoder:
    """Incremental writer for GCF tabular output.

    Nothing is buffered: headers, rows, key=value lines and inline arrays
    go to the underlying writer the moment the corresponding method is
    called.  Every public method holds an internal lock while it works.
    ``close()`` finishes with a ``## _summary`` trailer listing the row
    count of each array section.

    Example::

        enc = GenericStreamEncoder(sys.stdout)
        enc.begin_array("employees", ["id", "name", "department", "salary"])
        enc.write_row([1, "Alice", "Engineering", 95000])
        enc.write_row([2, "Bob", "Sales", 72000])
        enc.end_array()
        enc.close()
    """

    def __init__(self, writer: Any) -> None:
        self._w = writer
        self._lock = threading.Lock()
        # (name, row count) of every array section finished so far.
        self._sections: list[tuple[str, int]] = []
        # Bookkeeping for the array that is currently open, if any.
        self._current: dict[str, Any] | None = None

    def begin_array(self, name: str, fields: Sequence[str]) -> None:
        """Open a tabular section; the header carries a deferred count ``[?]``.

        An array that is still open is finished first.
        """
        with self._lock:
            self._end_array_locked()
            columns = ",".join(fields)
            self._w.write(f"## {name} [?]{{{columns}}}\n")
            self._current = {"name": name, "fields": list(fields), "count": 0}

    def write_row(self, values: Sequence[Any]) -> None:
        """Write one pipe-separated row right away.

        Does nothing when no array is open.
        """
        with self._lock:
            state = self._current
            if state is None:
                return
            line = "|".join(map(_format_value, values))
            self._w.write(line + "\n")
            state["count"] += 1

    def end_array(self) -> None:
        """Finish the open array section and remember its row count."""
        with self._lock:
            self._end_array_locked()

    def write_kv(self, key: str, value: Any) -> None:
        """Write a ``key=value`` line right away."""
        with self._lock:
            rendered = _format_value(value)
            self._w.write(f"{key}={rendered}\n")

    def write_section(self, name: str) -> None:
        """Write a ``## name`` nested-object header, finishing any open array."""
        with self._lock:
            self._end_array_locked()
            self._w.write(f"## {name}\n")

    def write_inline_array(self, name: str, values: Sequence[Any]) -> None:
        """Write a primitive array on one line: ``name[N]: val1,val2,val3``."""
        with self._lock:
            rendered = [_format_value(item) for item in values]
            joined = ",".join(rendered)
            self._w.write(f"{name}[{len(values)}]: {joined}\n")

    def close(self) -> None:
        """Write the ``## _summary`` trailer with the final counts.

        Any open array is finished first.  No trailer is written when no
        array section was ever recorded.
        """
        with self._lock:
            self._end_array_locked()
            recorded = self._sections
            if not recorded:
                return
            row_total = sum(count for _, count in recorded)
            listing = ",".join(f"{name}:{count}" for name, count in recorded)
            self._w.write(f"## _summary rows={row_total} sections={listing}\n")

    def _end_array_locked(self) -> None:
        # Caller must already hold self._lock.
        state = self._current
        if state is not None:
            self._sections.append((state["name"], state["count"]))
            self._current = None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _format_value(v: Any) -> str:
|
|
95
|
+
if v is None:
|
|
96
|
+
return "-"
|
|
97
|
+
if isinstance(v, bool):
|
|
98
|
+
return "true" if v else "false"
|
|
99
|
+
if isinstance(v, int):
|
|
100
|
+
return str(v)
|
|
101
|
+
if isinstance(v, float):
|
|
102
|
+
# Match Go's %g formatting
|
|
103
|
+
s = f"{v:g}"
|
|
104
|
+
return s
|
|
105
|
+
if isinstance(v, str):
|
|
106
|
+
if v == "":
|
|
107
|
+
return '""'
|
|
108
|
+
if "|" in v or "\n" in v:
|
|
109
|
+
return '"' + v.replace('"', '\\"') + '"'
|
|
110
|
+
return v
|
|
111
|
+
return str(v)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Tests for the GenericStreamEncoder."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
|
|
5
|
+
from gcf import GenericStreamEncoder
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_tabular():
    """A tabular section yields its header, its rows and a summary trailer."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    encoder.begin_array("employees", ["id", "name", "department", "salary"])
    records = (
        [1, "Alice", "Engineering", 95000],
        [2, "Bob", "Sales", 72000],
        [3, "Carol", "Marketing", 85000],
    )
    for record in records:
        encoder.write_row(record)
    encoder.end_array()
    encoder.close()

    text = sink.getvalue()
    assert "## employees [?]{id,name,department,salary}" in text
    assert "1|Alice|Engineering|95000" in text
    assert "## _summary rows=3 sections=employees:3" in text
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_kv_and_inline_array():
    """Key=value lines and inline primitive arrays are written verbatim."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    for key, value in (("name", "my-service"), ("version", "2.1.0")):
        encoder.write_kv(key, value)
    encoder.write_inline_array("tags", ["production", "us-east-1", "critical"])
    encoder.close()

    text = sink.getvalue()
    assert "name=my-service" in text
    assert "tags[3]: production,us-east-1,critical" in text
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_incremental():
    """The header and each row reach the writer as soon as they are emitted."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    encoder.begin_array("data", ["id", "val"])
    size_after_header = len(sink.getvalue())
    assert size_after_header > 0, "header should be written immediately"

    encoder.write_row([1, "a"])
    size_after_row = len(sink.getvalue())
    assert size_after_row > size_after_header, "row should be written immediately"

    encoder.end_array()
    encoder.close()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_multiple_arrays():
    """The summary lists every array section, in order, with its row count."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    encoder.begin_array("users", ["id", "name"])
    for user in ([1, "Alice"], [2, "Bob"]):
        encoder.write_row(user)
    encoder.end_array()

    encoder.begin_array("roles", ["name", "level"])
    encoder.write_row(["admin", 10])
    encoder.end_array()

    encoder.close()

    assert "sections=users:2,roles:1" in sink.getvalue()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_null_and_bool():
    """None is written as "-" and booleans as lowercase true/false."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    encoder.begin_array("data", ["a", "b", "c"])
    encoder.write_row([None, True, False])
    encoder.end_array()
    encoder.close()

    assert "-|true|false" in sink.getvalue()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_empty_string_and_pipe():
    """Empty strings become "" and strings containing a pipe are quoted."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    encoder.begin_array("data", ["a", "b"])
    encoder.write_row(["", "has|pipe"])
    encoder.end_array()
    encoder.close()

    assert '""|"has|pipe"' in sink.getvalue()
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_auto_close_on_begin_array():
    """Starting a new array finishes the one that is still open."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    encoder.begin_array("first", ["a"])
    encoder.write_row([1])
    # No end_array() here: the next begin_array() has to close "first".
    encoder.begin_array("second", ["b"])
    encoder.write_row([2])
    encoder.end_array()
    encoder.close()

    assert "sections=first:1,second:1" in sink.getvalue()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def test_write_section():
    """write_section() closes the open array and emits a "## name" header."""
    sink = io.StringIO()
    encoder = GenericStreamEncoder(sink)

    encoder.begin_array("items", ["id"])
    encoder.write_row([1])
    encoder.write_section("metadata")
    encoder.write_kv("count", 1)
    encoder.close()

    text = sink.getvalue()
    assert "## metadata" in text
    assert "## _summary rows=1 sections=items:1" in text
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|