gcf-python 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gcf_python-0.3.0 → gcf_python-0.4.0}/PKG-INFO +43 -12
- {gcf_python-0.3.0 → gcf_python-0.4.0}/README.md +42 -11
- {gcf_python-0.3.0 → gcf_python-0.4.0}/pyproject.toml +1 -1
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/__init__.py +2 -0
- gcf_python-0.4.0/src/gcf/stream.py +151 -0
- gcf_python-0.4.0/tests/test_stream.py +116 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/.github/workflows/ci.yml +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/.github/workflows/publish.yml +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/.gitignore +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/CHANGELOG.md +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/LICENSE +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/cli.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/constants.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/decode.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/delta.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/encode.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/generic.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/session.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/src/gcf/types.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/tests/__init__.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/tests/test_decode.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/tests/test_delta.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/tests/test_encode.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/tests/test_generic.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/tests/test_roundtrip.py +0 -0
- {gcf_python-0.3.0 → gcf_python-0.4.0}/tests/test_session.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gcf-python
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Python implementation of GCF (Graph Compact Format): token-optimized wire format for LLM tool responses
|
|
5
5
|
Project-URL: Homepage, https://github.com/blackwell-systems/gcf-python
|
|
6
6
|
Project-URL: Documentation, https://blackwell-systems.github.io/gcf/
|
|
@@ -32,7 +32,7 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
|
|
33
33
|
Python implementation of [GCF (Graph Compact Format)](https://gcformat.com/) — the most token-efficient wire format for LLMs. A drop-in alternative to JSON and TOON for any structured data.
|
|
34
34
|
|
|
35
|
-
**79% fewer input tokens than JSON. 75% fewer output tokens. 52% smaller than TOON. 100% LLM comprehension at 500 symbols, where JSON
|
|
35
|
+
**79% fewer input tokens than JSON. 75% fewer output tokens. 52% smaller than TOON. 100% LLM comprehension at 500 symbols, where JSON scores 76.9% and TOON scores 92.3%.**
|
|
36
36
|
|
|
37
37
|
Docs: [gcformat.com](https://gcformat.com/) · [Playground](https://gcformat.com/playground.html) · [GCF vs TOON](https://gcformat.com/guide/vs-toon.html)
|
|
38
38
|
|
|
@@ -119,6 +119,35 @@ out2 = encode_with_session(payload2, sess) # reused symbols as "@N # previousl
|
|
|
119
119
|
|
|
120
120
|
By the 5th call in a session: 92.7% token savings vs JSON.
|
|
121
121
|
|
|
122
|
+
## Streaming Encode
|
|
123
|
+
|
|
124
|
+
Write GCF output incrementally as symbols and edges arrive. Zero buffering, O(1) memory per row:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import sys

from gcf import StreamEncoder, Symbol, Edge
|
|
128
|
+
|
|
129
|
+
enc = StreamEncoder(sys.stdout, "context_for_task", token_budget=5000)
|
|
130
|
+
|
|
131
|
+
enc.write_symbol(Symbol(qualified_name="pkg.Auth", kind="function", score=0.95, provenance="lsp", distance=0))
|
|
132
|
+
enc.write_symbol(Symbol(qualified_name="pkg.Server", kind="function", score=0.60, provenance="lsp", distance=1))
|
|
133
|
+
enc.write_edge(Edge(source="pkg.Server", target="pkg.Auth", edge_type="calls"))
|
|
134
|
+
enc.close() # emits ## _summary trailer
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Output:
|
|
138
|
+
```
|
|
139
|
+
GCF tool=context_for_task budget=5000
|
|
140
|
+
## targets
|
|
141
|
+
@0 fn pkg.Auth 0.95 lsp
|
|
142
|
+
## related
|
|
143
|
+
@1 fn pkg.Server 0.60 lsp
|
|
144
|
+
## edges [?]
|
|
145
|
+
@0<@1 calls
|
|
146
|
+
## _summary symbols=2 edges=1 sections=targets:1,related:1,edges:1
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
The writer is any object with a `write(s: str)` method. Thread-safe. Standard `decode()` handles streaming output with no changes.
|
|
150
|
+
|
|
122
151
|
## Delta Encoding
|
|
123
152
|
|
|
124
153
|
When the consumer already has a prior context pack, send only what changed:
|
|
@@ -189,15 +218,17 @@ Works on dicts, lists, and primitives. Lists of uniform dicts get tabular rows.
|
|
|
189
218
|
|
|
190
219
|
## Comprehension Eval
|
|
191
220
|
|
|
192
|
-
Rigorous 3-way benchmark (GCF vs TOON vs JSON) at 500 symbols, 200 edges.
|
|
221
|
+
Rigorous 3-way benchmark (GCF vs TOON vs JSON) at 500 symbols, 200 edges. 13 structured extraction questions sent to an LLM with zero format instructions:
|
|
193
222
|
|
|
194
223
|
| Format | Accuracy | Tokens | vs JSON |
|
|
195
224
|
|--------|----------|--------|---------|
|
|
196
|
-
| **GCF** | **100%** (
|
|
197
|
-
| TOON |
|
|
198
|
-
| JSON |
|
|
225
|
+
| **GCF** | **100%** (13/13) | **11,090** | **79% fewer** |
|
|
226
|
+
| TOON | 92.3% (12/13) | 16,378 | 69% fewer |
|
|
227
|
+
| JSON | 76.9% (10/13) | 53,341 | baseline |
|
|
228
|
+
|
|
229
|
+
GCF is the only format with perfect accuracy at scale, at 32% fewer tokens than TOON.
|
|
199
230
|
|
|
200
|
-
|
|
231
|
+
Reproduce: `git clone https://github.com/blackwell-systems/gcf-go && cd gcf-go/eval && GOWORK=off go test -run TestComprehension -v -timeout 0`
|
|
201
232
|
|
|
202
233
|
## Token Efficiency (TOON's Own Benchmark)
|
|
203
234
|
|
|
@@ -205,13 +236,13 @@ Running [TOON's benchmark harness](https://github.com/blackwell-systems/toon/tre
|
|
|
205
236
|
|
|
206
237
|
| Track | GCF | TOON | Result |
|
|
207
238
|
|-------|-----|------|--------|
|
|
208
|
-
| Mixed-structure (nested, semi-uniform) |
|
|
209
|
-
| Flat-only (tabular) | 66,
|
|
210
|
-
| Semi-uniform event logs |
|
|
239
|
+
| Mixed-structure (nested, semi-uniform) | 170,367 | 227,896 | **GCF 34% smaller** |
|
|
240
|
+
| Flat-only (tabular) | 66,029 | 67,837 | **GCF 3% smaller** |
|
|
241
|
+
| Semi-uniform event logs | 108,158 | 154,032 | **GCF 42% smaller** |
|
|
211
242
|
|
|
212
|
-
GCF wins
|
|
243
|
+
GCF wins all 6 datasets. On semi-uniform data (the most common real-world pattern), GCF uses 42% fewer tokens than TOON.
|
|
213
244
|
|
|
214
|
-
|
|
245
|
+
Reproduce: `git clone https://github.com/blackwell-systems/toon && cd toon && git checkout gcf-comparison && cd benchmarks && pnpm install && pnpm benchmark:tokens`
|
|
215
246
|
|
|
216
247
|
## Links
|
|
217
248
|
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
Python implementation of [GCF (Graph Compact Format)](https://gcformat.com/) — the most token-efficient wire format for LLMs. A drop-in alternative to JSON and TOON for any structured data.
|
|
9
9
|
|
|
10
|
-
**79% fewer input tokens than JSON. 75% fewer output tokens. 52% smaller than TOON. 100% LLM comprehension at 500 symbols, where JSON
|
|
10
|
+
**79% fewer input tokens than JSON. 75% fewer output tokens. 52% smaller than TOON. 100% LLM comprehension at 500 symbols, where JSON scores 76.9% and TOON scores 92.3%.**
|
|
11
11
|
|
|
12
12
|
Docs: [gcformat.com](https://gcformat.com/) · [Playground](https://gcformat.com/playground.html) · [GCF vs TOON](https://gcformat.com/guide/vs-toon.html)
|
|
13
13
|
|
|
@@ -94,6 +94,35 @@ out2 = encode_with_session(payload2, sess) # reused symbols as "@N # previousl
|
|
|
94
94
|
|
|
95
95
|
By the 5th call in a session: 92.7% token savings vs JSON.
|
|
96
96
|
|
|
97
|
+
## Streaming Encode
|
|
98
|
+
|
|
99
|
+
Write GCF output incrementally as symbols and edges arrive. Zero buffering, O(1) memory per row:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
import sys

from gcf import StreamEncoder, Symbol, Edge
|
|
103
|
+
|
|
104
|
+
enc = StreamEncoder(sys.stdout, "context_for_task", token_budget=5000)
|
|
105
|
+
|
|
106
|
+
enc.write_symbol(Symbol(qualified_name="pkg.Auth", kind="function", score=0.95, provenance="lsp", distance=0))
|
|
107
|
+
enc.write_symbol(Symbol(qualified_name="pkg.Server", kind="function", score=0.60, provenance="lsp", distance=1))
|
|
108
|
+
enc.write_edge(Edge(source="pkg.Server", target="pkg.Auth", edge_type="calls"))
|
|
109
|
+
enc.close() # emits ## _summary trailer
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Output:
|
|
113
|
+
```
|
|
114
|
+
GCF tool=context_for_task budget=5000
|
|
115
|
+
## targets
|
|
116
|
+
@0 fn pkg.Auth 0.95 lsp
|
|
117
|
+
## related
|
|
118
|
+
@1 fn pkg.Server 0.60 lsp
|
|
119
|
+
## edges [?]
|
|
120
|
+
@0<@1 calls
|
|
121
|
+
## _summary symbols=2 edges=1 sections=targets:1,related:1,edges:1
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
The writer is any object with a `write(s: str)` method. Thread-safe. Standard `decode()` handles streaming output with no changes.
|
|
125
|
+
|
|
97
126
|
## Delta Encoding
|
|
98
127
|
|
|
99
128
|
When the consumer already has a prior context pack, send only what changed:
|
|
@@ -164,15 +193,17 @@ Works on dicts, lists, and primitives. Lists of uniform dicts get tabular rows.
|
|
|
164
193
|
|
|
165
194
|
## Comprehension Eval
|
|
166
195
|
|
|
167
|
-
Rigorous 3-way benchmark (GCF vs TOON vs JSON) at 500 symbols, 200 edges.
|
|
196
|
+
Rigorous 3-way benchmark (GCF vs TOON vs JSON) at 500 symbols, 200 edges. 13 structured extraction questions sent to an LLM with zero format instructions:
|
|
168
197
|
|
|
169
198
|
| Format | Accuracy | Tokens | vs JSON |
|
|
170
199
|
|--------|----------|--------|---------|
|
|
171
|
-
| **GCF** | **100%** (
|
|
172
|
-
| TOON |
|
|
173
|
-
| JSON |
|
|
200
|
+
| **GCF** | **100%** (13/13) | **11,090** | **79% fewer** |
|
|
201
|
+
| TOON | 92.3% (12/13) | 16,378 | 69% fewer |
|
|
202
|
+
| JSON | 76.9% (10/13) | 53,341 | baseline |
|
|
203
|
+
|
|
204
|
+
GCF is the only format with perfect accuracy at scale, at 32% fewer tokens than TOON.
|
|
174
205
|
|
|
175
|
-
|
|
206
|
+
Reproduce: `git clone https://github.com/blackwell-systems/gcf-go && cd gcf-go/eval && GOWORK=off go test -run TestComprehension -v -timeout 0`
|
|
176
207
|
|
|
177
208
|
## Token Efficiency (TOON's Own Benchmark)
|
|
178
209
|
|
|
@@ -180,13 +211,13 @@ Running [TOON's benchmark harness](https://github.com/blackwell-systems/toon/tre
|
|
|
180
211
|
|
|
181
212
|
| Track | GCF | TOON | Result |
|
|
182
213
|
|-------|-----|------|--------|
|
|
183
|
-
| Mixed-structure (nested, semi-uniform) |
|
|
184
|
-
| Flat-only (tabular) | 66,
|
|
185
|
-
| Semi-uniform event logs |
|
|
214
|
+
| Mixed-structure (nested, semi-uniform) | 170,367 | 227,896 | **GCF 34% smaller** |
|
|
215
|
+
| Flat-only (tabular) | 66,029 | 67,837 | **GCF 3% smaller** |
|
|
216
|
+
| Semi-uniform event logs | 108,158 | 154,032 | **GCF 42% smaller** |
|
|
186
217
|
|
|
187
|
-
GCF wins
|
|
218
|
+
GCF wins all 6 datasets. On semi-uniform data (the most common real-world pattern), GCF uses 42% fewer tokens than TOON.
|
|
188
219
|
|
|
189
|
-
|
|
220
|
+
Reproduce: `git clone https://github.com/blackwell-systems/toon && cd toon && git checkout gcf-comparison && cd benchmarks && pnpm install && pnpm benchmark:tokens`
|
|
190
221
|
|
|
191
222
|
## Links
|
|
192
223
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gcf-python"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "Python implementation of GCF (Graph Compact Format): token-optimized wire format for LLM tool responses"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -40,6 +40,7 @@ from .delta import encode_delta
|
|
|
40
40
|
from .encode import encode
|
|
41
41
|
from .generic import encode_generic
|
|
42
42
|
from .session import Session, encode_with_session
|
|
43
|
+
from .stream import StreamEncoder
|
|
43
44
|
from .types import Components, DeltaPayload, Edge, Payload, Symbol
|
|
44
45
|
|
|
45
46
|
__all__ = [
|
|
@@ -51,6 +52,7 @@ __all__ = [
|
|
|
51
52
|
"KIND_EXPAND",
|
|
52
53
|
"Payload",
|
|
53
54
|
"Session",
|
|
55
|
+
"StreamEncoder",
|
|
54
56
|
"Symbol",
|
|
55
57
|
"decode",
|
|
56
58
|
"encode",
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""GCF streaming encoder: zero-buffering encode to any writable."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
from typing import Any, Protocol
|
|
7
|
+
|
|
8
|
+
from .constants import KIND_ABBREV
|
|
9
|
+
from .types import Edge, Symbol
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class StreamWriter(Protocol):
    """Structural type for an output sink: anything exposing ``write(s: str)``.

    No inheritance is required; an object satisfies this protocol simply by
    having a compatible ``write`` method.
    """

    def write(self, s: str) -> Any:
        """Accept one chunk of text; the return type is unconstrained."""
        ...
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class StreamEncoder:
    """Write GCF output incrementally as symbols and edges arrive.

    Nothing is buffered: the constructor writes the header line, and every
    symbol, bare reference and edge is written to the underlying writer as
    soon as it is handed over. ``close()`` appends a ``## _summary`` trailer
    carrying the final counts.

    Section headers are managed automatically. Whenever the record being
    written belongs to a different section than the one currently open (a
    different distance group, or a switch between symbols and edges), the
    matching header is emitted first so that no record ends up under the
    wrong section.

    Every write method and ``close()`` hold an internal lock while they touch
    encoder state and the writer. The encoder may also be used as a context
    manager, in which case ``close()`` is called on exit.

    Example::

        enc = StreamEncoder(sys.stdout, "context_for_task", token_budget=5000)
        enc.write_symbol(sym1)   # emitted immediately
        enc.write_edge(edge1)    # emitted immediately
        enc.close()              # emits ## _summary trailer
    """

    # Section names for distances 0, 1 and 2; larger distances become
    # "distance_<n>".
    _GROUP_NAMES = ("targets", "related", "extended")
    # Internal section key for edges. It cannot collide with a symbol group,
    # which is always one of _GROUP_NAMES or starts with "distance_".
    _EDGES_SECTION = "edges"

    def __init__(
        self,
        writer: StreamWriter,
        tool: str,
        *,
        token_budget: int = 0,
        tokens_used: int = 0,
        pack_root: str = "",
        session: bool = False,
    ) -> None:
        """Create the encoder and write the header line to *writer*.

        Args:
            writer: Sink receiving the output; only ``write(str)`` is used.
            tool: Value of the ``tool=`` header field.
            token_budget: Emitted as ``budget=`` when truthy.
            tokens_used: Emitted as ``tokens=`` when truthy.
            pack_root: Emitted as ``pack_root=`` when non-empty.
            session: Emits ``session=true`` when set.
        """
        self._w = writer
        self._lock = threading.Lock()
        self._sym_index: dict[str, int] = {}
        self._next_id = 0
        self._current_group = ""
        self._group_counts: dict[str, int] = {}
        self._edge_count = 0
        self._closed = False

        # The header goes out immediately; optional fields are omitted when
        # they hold their falsy default.
        parts = [f"GCF tool={tool}"]
        if token_budget:
            parts.append(f"budget={token_budget}")
        if tokens_used:
            parts.append(f"tokens={tokens_used}")
        if pack_root:
            parts.append(f"pack_root={pack_root}")
        if session:
            parts.append("session=true")
        self._w.write(" ".join(parts) + "\n")

    def __enter__(self) -> StreamEncoder:
        """Return the encoder itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        """Emit the trailer on leaving the ``with`` block."""
        self.close()

    # -- private helpers (callers must already hold self._lock) -------------

    def _ensure_open(self) -> None:
        """Reject writes that would land after the summary trailer."""
        if self._closed:
            raise ValueError("StreamEncoder is closed")

    def _group_for(self, distance: int) -> str:
        """Map a distance to its section name, rejecting negative values."""
        if distance < 0:
            # A negative index would silently pick a group from the end of
            # _GROUP_NAMES (e.g. -1 -> "extended").
            raise ValueError(f"distance must be >= 0, got {distance}")
        if distance < len(self._GROUP_NAMES):
            return self._GROUP_NAMES[distance]
        return f"distance_{distance}"

    def _open_section(self, name: str, header: str) -> None:
        """Write *header* if *name* is not the section currently open."""
        if name != self._current_group:
            self._w.write(header)
            self._current_group = name

    def _register(self, qname: str) -> int:
        """Assign the next ``@N`` id to *qname* and remember it for edges."""
        idx = self._next_id
        self._sym_index[qname] = idx
        self._next_id += 1
        return idx

    # -- public API ----------------------------------------------------------

    def write_symbol(self, s: Symbol) -> None:
        """Emit a symbol line immediately, opening its group section if needed.

        Raises:
            ValueError: If ``s.distance`` is negative or the encoder is closed.
        """
        with self._lock:
            self._ensure_open()
            group = self._group_for(s.distance)
            self._open_section(group, f"## {group}\n")

            idx = self._register(s.qualified_name)
            # Kinds without a known abbreviation are written unchanged.
            kind = KIND_ABBREV.get(s.kind, s.kind)
            self._w.write(f"@{idx} {kind} {s.qualified_name} {s.score:.2f} {s.provenance}\n")

            self._group_counts[group] = self._group_counts.get(group, 0) + 1

    def write_edge(self, e: Edge) -> None:
        """Emit an edge line immediately, opening the edges section if needed.

        An edge whose source or target has not been written through this
        encoder is skipped without output and is not counted.

        Raises:
            ValueError: If the encoder is closed.
        """
        with self._lock:
            self._ensure_open()
            src_idx = self._sym_index.get(e.source)
            tgt_idx = self._sym_index.get(e.target)
            if src_idx is None or tgt_idx is None:
                return

            # Re-open the edges section whenever a symbol group was written in
            # between, so edge lines never appear under a symbol header.
            # NOTE(review): assumes decode() accepts a repeated edges header,
            # as it must already for repeated symbol group headers - confirm.
            self._open_section(self._EDGES_SECTION, "## edges [?]\n")

            line = f"@{tgt_idx}<@{src_idx} {e.edge_type}"
            if e.status and e.status != "unchanged":
                line += f" {e.status}"
            self._w.write(line + "\n")
            self._edge_count += 1

    def write_bare_ref(self, qname: str, distance: int) -> None:
        """Emit a bare reference for a previously-transmitted symbol (session mode).

        The reference takes the next ``@N`` id and counts towards its group
        exactly like a full symbol line.

        Raises:
            ValueError: If *distance* is negative or the encoder is closed.
        """
        with self._lock:
            self._ensure_open()
            group = self._group_for(distance)
            self._open_section(group, f"## {group}\n")

            idx = self._register(qname)
            self._w.write(f"@{idx} # previously transmitted\n")
            self._group_counts[group] = self._group_counts.get(group, 0) + 1

    def close(self) -> None:
        """Emit the ``## _summary`` trailer with final counts.

        Calling ``close()`` again is a no-op, so the trailer appears once.
        """
        with self._lock:
            if self._closed:
                return

            # Well-known groups first in canonical order, then any
            # distance_<n> groups in first-seen order, then edges.
            sections = [
                f"{g}:{self._group_counts[g]}"
                for g in self._GROUP_NAMES
                if self._group_counts.get(g, 0) > 0
            ]
            sections.extend(
                f"{g}:{c}"
                for g, c in self._group_counts.items()
                if g not in self._GROUP_NAMES and c > 0
            )
            if self._edge_count > 0:
                sections.append(f"edges:{self._edge_count}")

            self._w.write(
                f"## _summary symbols={self._next_id} edges={self._edge_count}"
                f" sections={','.join(sections)}\n"
            )
            self._closed = True

    @property
    def symbol_count(self) -> int:
        """Number of symbols (including bare references) written so far."""
        return self._next_id

    @property
    def edge_count(self) -> int:
        """Number of edges written so far."""
        return self._edge_count
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Tests for the StreamEncoder."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
|
|
5
|
+
from gcf import StreamEncoder, Symbol, Edge, decode
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_stream_basic():
    """Two symbols plus one edge yield header, sections, rows and trailer."""
    sink = io.StringIO()
    encoder = StreamEncoder(sink, "context_for_task", token_budget=5000)

    encoder.write_symbol(
        Symbol(qualified_name="pkg.Auth", kind="function", score=0.78, provenance="lsp_resolved", distance=0)
    )
    encoder.write_symbol(
        Symbol(qualified_name="pkg.Server", kind="function", score=0.54, provenance="lsp_resolved", distance=1)
    )
    encoder.write_edge(Edge(source="pkg.Server", target="pkg.Auth", edge_type="calls"))
    encoder.close()

    text = sink.getvalue()
    expected_fragments = (
        "GCF tool=context_for_task budget=5000\n",
        "## targets\n",
        "@0 fn pkg.Auth 0.78 lsp_resolved\n",
        "## related\n",
        "@1 fn pkg.Server 0.54 lsp_resolved\n",
        "## edges [?]\n",
        "@0<@1 calls\n",
        "## _summary symbols=2 edges=1",
    )
    for fragment in expected_fragments:
        assert fragment in text

    # Counts belong in the trailer only; the header must not carry them.
    first_line = text.split("\n")[0]
    for count_field in ("symbols=", "edges="):
        assert count_field not in first_line
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_stream_round_trip():
    """Streamed output decodes with the standard decode()."""
    sink = io.StringIO()
    encoder = StreamEncoder(sink, "blast_radius", token_budget=10000)

    symbols = [
        Symbol(qualified_name="pkg.Auth", kind="function", score=0.95, provenance="lsp", distance=0),
        Symbol(qualified_name="pkg.Config", kind="type", score=0.80, provenance="ast", distance=0),
        Symbol(qualified_name="pkg.Server", kind="function", score=0.60, provenance="lsp", distance=1),
    ]
    edges = [
        Edge(source="pkg.Server", target="pkg.Auth", edge_type="calls"),
        Edge(source="pkg.Auth", target="pkg.Config", edge_type="references"),
    ]
    for symbol in symbols:
        encoder.write_symbol(symbol)
    for edge in edges:
        encoder.write_edge(edge)
    encoder.close()

    payload = decode(sink.getvalue())
    assert payload.tool == "blast_radius"
    assert len(payload.symbols) == 3
    assert len(payload.edges) == 2
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_stream_no_edges():
    """Without edges there is no edges header and the trailer reports edges=0."""
    sink = io.StringIO()
    encoder = StreamEncoder(sink, "test")

    only_symbol = Symbol(qualified_name="a.A", kind="function", score=0.9, provenance="x", distance=0)
    encoder.write_symbol(only_symbol)
    encoder.close()

    text = sink.getvalue()
    assert "## edges" not in text
    assert "edges=0" in text
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_stream_multiple_groups():
    """Distances 0, 1, 2 and 5 each get their own section, listed in the trailer."""
    sink = io.StringIO()
    encoder = StreamEncoder(sink, "test")

    # (qualified name, score, distance) for each symbol, in write order.
    rows = [("a", 1.0, 0), ("b", 0.8, 1), ("c", 0.6, 2), ("d", 0.4, 5)]
    for name, score, distance in rows:
        encoder.write_symbol(
            Symbol(qualified_name=name, kind="function", score=score, provenance="x", distance=distance)
        )
    encoder.close()

    text = sink.getvalue()
    for header in ("## targets\n", "## related\n", "## extended\n", "## distance_5\n"):
        assert header in text
    assert "sections=targets:1,related:1,extended:1,distance_5:1" in text
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_stream_skips_unknown_refs():
    """An edge whose source was never written is dropped and not counted."""
    sink = io.StringIO()
    encoder = StreamEncoder(sink, "test")

    known = Symbol(qualified_name="a.A", kind="function", score=0.9, provenance="x", distance=0)
    dangling = Edge(source="unknown.B", target="a.A", edge_type="calls")
    encoder.write_symbol(known)
    encoder.write_edge(dangling)
    encoder.close()

    text = sink.getvalue()
    assert "calls" not in text
    assert "edges=0" in text
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def test_stream_incremental():
    """Output grows as soon as each piece is written, before close()."""
    sink = io.StringIO()
    encoder = StreamEncoder(sink, "test")

    # Constructing the encoder alone must already have produced the header.
    header_end = sink.tell()
    assert header_end > 0

    encoder.write_symbol(
        Symbol(qualified_name="a.A", kind="function", score=0.9, provenance="x", distance=0)
    )
    assert sink.tell() > header_end
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_stream_bare_ref():
    """A bare reference takes id 0 and the next full symbol takes id 1."""
    sink = io.StringIO()
    encoder = StreamEncoder(sink, "test", session=True)

    encoder.write_bare_ref("pkg.Auth", 0)
    fresh = Symbol(qualified_name="pkg.New", kind="function", score=0.85, provenance="lsp", distance=0)
    encoder.write_symbol(fresh)
    encoder.close()

    text = sink.getvalue()
    for fragment in ("session=true", "@0 # previously transmitted", "@1 fn pkg.New 0.85 lsp"):
        assert fragment in text
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|