causaliq-knowledge 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (37)
  1. {causaliq_knowledge-0.1.0/src/causaliq_knowledge.egg-info → causaliq_knowledge-0.3.0}/PKG-INFO +9 -10
  2. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/README.md +8 -9
  3. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge/__init__.py +3 -3
  4. causaliq_knowledge-0.3.0/src/causaliq_knowledge/cache/__init__.py +18 -0
  5. causaliq_knowledge-0.3.0/src/causaliq_knowledge/cache/encoders/__init__.py +13 -0
  6. causaliq_knowledge-0.3.0/src/causaliq_knowledge/cache/encoders/base.py +90 -0
  7. causaliq_knowledge-0.3.0/src/causaliq_knowledge/cache/encoders/json_encoder.py +418 -0
  8. causaliq_knowledge-0.3.0/src/causaliq_knowledge/cache/token_cache.py +632 -0
  9. causaliq_knowledge-0.3.0/src/causaliq_knowledge/cli.py +757 -0
  10. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/__init__.py +63 -0
  11. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/anthropic_client.py +256 -0
  12. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/base_client.py +360 -0
  13. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/cache.py +380 -0
  14. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/deepseek_client.py +108 -0
  15. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge/llm/gemini_client.py +117 -39
  16. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/groq_client.py +223 -0
  17. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/mistral_client.py +122 -0
  18. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/ollama_client.py +240 -0
  19. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/openai_client.py +115 -0
  20. causaliq_knowledge-0.3.0/src/causaliq_knowledge/llm/openai_compat_client.py +287 -0
  21. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge/llm/provider.py +99 -46
  22. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0/src/causaliq_knowledge.egg-info}/PKG-INFO +9 -10
  23. causaliq_knowledge-0.3.0/src/causaliq_knowledge.egg-info/SOURCES.txt +31 -0
  24. causaliq_knowledge-0.1.0/src/causaliq_knowledge/cli.py +0 -207
  25. causaliq_knowledge-0.1.0/src/causaliq_knowledge/llm/__init__.py +0 -34
  26. causaliq_knowledge-0.1.0/src/causaliq_knowledge/llm/groq_client.py +0 -148
  27. causaliq_knowledge-0.1.0/src/causaliq_knowledge.egg-info/SOURCES.txt +0 -18
  28. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/LICENSE +0 -0
  29. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/pyproject.toml +0 -0
  30. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/setup.cfg +0 -0
  31. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge/base.py +0 -0
  32. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge/llm/prompts.py +0 -0
  33. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge/models.py +0 -0
  34. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge.egg-info/dependency_links.txt +0 -0
  35. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge.egg-info/entry_points.txt +0 -0
  36. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge.egg-info/requires.txt +0 -0
  37. {causaliq_knowledge-0.1.0 → causaliq_knowledge-0.3.0}/src/causaliq_knowledge.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: causaliq-knowledge
-Version: 0.1.0
+Version: 0.3.0
 Summary: Incorporating LLM and human knowledge into causal discovery
 Author-email: CausalIQ <info@causaliq.com>
 Maintainer-email: CausalIQ <info@causaliq.com>
@@ -85,18 +85,14 @@ print(f"Reasoning: {result.reasoning}")
 
 ## Features
 
-Under development:
-
-- **Release v0.1.0 - Foundation LLM**: Simple LLM queries to 1 or 2 LLMs about edge existence and orientation to support graph averaging
-
 Currently implemented releases:
 
-- None
+- **Release v0.1.0 - Foundation LLM**: Simple LLM queries to 1 or 2 LLMs about edge existence and orientation to support graph averaging
+- **Release v0.2.0 - Additional LLMs**: Support for 7 LLM providers (Groq, Gemini, OpenAI, Anthropic, DeepSeek, Mistral, Ollama)
+- **Release v0.3.0 - LLM Caching** *(in development)*: SQLite-based response caching with CLI tools for cache management
 
 Planned:
 
-- **Release v0.2.0 - Additional LLMs**: Support for more LLM providers (OpenAI, Anthropic)
-- **Release v0.3.0 - LLM Caching**: Caching of LLM queries and responses
 - **Release v0.4.0 - LLM Context**: Variable/role/literature etc context
 - **Release v0.5.0 - Algorithm integration**: Integration into structure learning algorithms
 - **Release v0.6.0 - Legacy Reference**: Support for legacy approaches of deriving knowledge from reference networks
@@ -128,8 +124,11 @@ This approach keeps the package lightweight, reliable, and easy to debug.
 |----------|--------|--------|-----------|
 | **Groq** | `GroqClient` | llama-3.1-8b-instant | ✅ Generous |
 | **Google Gemini** | `GeminiClient` | gemini-2.5-flash | ✅ Generous |
-
-Additional providers (OpenAI, Anthropic) can be added in future releases.
+| **OpenAI** | `OpenAIClient` | gpt-4o-mini | ❌ Paid |
+| **Anthropic** | `AnthropicClient` | claude-sonnet-4-20250514 | ❌ Paid |
+| **DeepSeek** | `DeepSeekClient` | deepseek-chat | ✅ Low cost |
+| **Mistral** | `MistralClient` | mistral-small-latest | ❌ Paid |
+| **Ollama** | `OllamaClient` | llama3 | ✅ Free (local) |
 
 ## Upcoming Key Innovations
 
README.md
@@ -31,18 +31,14 @@ print(f"Reasoning: {result.reasoning}")
 
 ## Features
 
-Under development:
-
-- **Release v0.1.0 - Foundation LLM**: Simple LLM queries to 1 or 2 LLMs about edge existence and orientation to support graph averaging
-
 Currently implemented releases:
 
-- None
+- **Release v0.1.0 - Foundation LLM**: Simple LLM queries to 1 or 2 LLMs about edge existence and orientation to support graph averaging
+- **Release v0.2.0 - Additional LLMs**: Support for 7 LLM providers (Groq, Gemini, OpenAI, Anthropic, DeepSeek, Mistral, Ollama)
+- **Release v0.3.0 - LLM Caching** *(in development)*: SQLite-based response caching with CLI tools for cache management
 
 Planned:
 
-- **Release v0.2.0 - Additional LLMs**: Support for more LLM providers (OpenAI, Anthropic)
-- **Release v0.3.0 - LLM Caching**: Caching of LLM queries and responses
 - **Release v0.4.0 - LLM Context**: Variable/role/literature etc context
 - **Release v0.5.0 - Algorithm integration**: Integration into structure learning algorithms
 - **Release v0.6.0 - Legacy Reference**: Support for legacy approaches of deriving knowledge from reference networks
@@ -74,8 +70,11 @@ This approach keeps the package lightweight, reliable, and easy to debug.
 |----------|--------|--------|-----------|
 | **Groq** | `GroqClient` | llama-3.1-8b-instant | ✅ Generous |
 | **Google Gemini** | `GeminiClient` | gemini-2.5-flash | ✅ Generous |
-
-Additional providers (OpenAI, Anthropic) can be added in future releases.
+| **OpenAI** | `OpenAIClient` | gpt-4o-mini | ❌ Paid |
+| **Anthropic** | `AnthropicClient` | claude-sonnet-4-20250514 | ❌ Paid |
+| **DeepSeek** | `DeepSeekClient` | deepseek-chat | ✅ Low cost |
+| **Mistral** | `MistralClient` | mistral-small-latest | ❌ Paid |
+| **Ollama** | `OllamaClient` | llama3 | ✅ Free (local) |
 
 ## Upcoming Key Innovations
 
src/causaliq_knowledge/__init__.py
@@ -5,7 +5,7 @@ causaliq-knowledge: LLM and human knowledge for causal discovery.
 from causaliq_knowledge.base import KnowledgeProvider
 from causaliq_knowledge.models import EdgeDirection, EdgeKnowledge
 
-__version__ = "0.1.0"
+__version__ = "0.3.0"
 __author__ = "CausalIQ"
 __email__ = "info@causaliq.com"
 
@@ -16,8 +16,8 @@ __description__ = "LLM and human knowledge for causal discovery"
 __url__ = "https://github.com/causaliq/causaliq-knowledge"
 __license__ = "MIT"
 
-# Version tuple for programmatic access
-VERSION = tuple(map(int, __version__.split(".")))
+# Version tuple for programmatic access (major, minor, patch)
+VERSION = (0, 3, 0)
 
 __all__ = [
     "__version__",
src/causaliq_knowledge/cache/__init__.py (new file)
@@ -0,0 +1,18 @@
+"""
+Core caching infrastructure for causaliq.
+
+This module provides a generic caching system with:
+- SQLite-backed storage with concurrency support
+- Pluggable encoders for type-specific compression
+- Shared token dictionary for cross-entry compression
+- Import/export for human-readable formats
+
+Note: This module is designed for future migration to causaliq-core.
+LLM-specific caching code remains in causaliq_knowledge.llm.cache.
+"""
+
+from causaliq_knowledge.cache.token_cache import TokenCache
+
+__all__ = [
+    "TokenCache",
+]
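A minimal round-trip sketch of how these pieces fit together, based on the doctest shown later in this diff under json_encoder.py (the `":memory:"` path and the context-manager usage come from that doctest; the payload here is illustrative):

from causaliq_knowledge.cache import TokenCache
from causaliq_knowledge.cache.encoders import JsonEncoder

# In-memory SQLite cache; a file path would persist the token dictionary.
with TokenCache(":memory:") as cache:
    encoder = JsonEncoder()
    data = {"model": "llama-3.1-8b-instant", "tokens": 42}
    blob = encoder.encode(data, cache)       # compact binary form
    assert encoder.decode(blob, cache) == data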
src/causaliq_knowledge/cache/encoders/__init__.py (new file)
@@ -0,0 +1,13 @@
+"""
+Pluggable encoders for type-specific cache entry compression.
+
+Encoders transform data to/from compact binary representations,
+using a shared token dictionary for cross-entry compression.
+
+Note: This submodule is designed for future migration to causaliq-core.
+"""
+
+from causaliq_knowledge.cache.encoders.base import EntryEncoder
+from causaliq_knowledge.cache.encoders.json_encoder import JsonEncoder
+
+__all__ = ["EntryEncoder", "JsonEncoder"]
src/causaliq_knowledge/cache/encoders/base.py (new file)
@@ -0,0 +1,90 @@
+"""
+Abstract base class for cache entry encoders.
+
+Encoders transform data to/from compact binary representations,
+optionally using a shared token dictionary for compression.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:  # pragma: no cover
+    from causaliq_knowledge.cache.token_cache import TokenCache
+
+
+class EntryEncoder(ABC):
+    """Abstract base class for type-specific cache entry encoders.
+
+    Encoders handle:
+    - Encoding data to compact binary format for storage
+    - Decoding binary data back to original structure
+    - Exporting to human-readable formats (JSON, GraphML, etc.)
+    - Importing from human-readable formats
+
+    Encoders may use the shared token dictionary in TokenCache
+    for cross-entry compression of repeated strings.
+
+    Example:
+        >>> class MyEncoder(EntryEncoder):
+        ...     def encode(self, data, token_cache):
+        ...         return json.dumps(data).encode()
+        ...     def decode(self, blob, token_cache):
+        ...         return json.loads(blob.decode())
+        ...     # ... export/import methods
+    """
+
+    @property
+    def default_export_format(self) -> str:
+        """Default file extension for exports (e.g. 'json', 'graphml')."""
+        return "json"
+
+    @abstractmethod
+    def encode(self, data: Any, token_cache: TokenCache) -> bytes:
+        """Encode data to binary format.
+
+        Args:
+            data: The data to encode (type depends on encoder).
+            token_cache: Cache instance for shared token dictionary.
+
+        Returns:
+            Compact binary representation.
+        """
+        ...
+
+    @abstractmethod
+    def decode(self, blob: bytes, token_cache: TokenCache) -> Any:
+        """Decode binary data back to original structure.
+
+        Args:
+            blob: Binary data from cache.
+            token_cache: Cache instance for shared token dictionary.
+
+        Returns:
+            Decoded data in original format.
+        """
+        ...
+
+    @abstractmethod
+    def export(self, data: Any, path: Path) -> None:
+        """Export data to human-readable file format.
+
+        Args:
+            data: The data to export (decoded format).
+            path: Destination file path.
+        """
+        ...
+
+    @abstractmethod
+    def import_(self, path: Path) -> Any:
+        """Import data from human-readable file format.
+
+        Args:
+            path: Source file path.
+
+        Returns:
+            Imported data ready for encoding.
+        """
+        ...
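The class docstring's `MyEncoder` example elides the export/import half of the interface. A complete minimal subclass might look like the following sketch (hypothetical, not part of the package; it satisfies all four abstract methods but ignores the shared token dictionary):

import json
from pathlib import Path
from typing import Any

from causaliq_knowledge.cache.encoders import EntryEncoder


class PlainJsonEncoder(EntryEncoder):
    """Hypothetical encoder storing entries as raw UTF-8 JSON."""

    def encode(self, data: Any, token_cache: Any) -> bytes:
        return json.dumps(data).encode("utf-8")   # token_cache unused

    def decode(self, blob: bytes, token_cache: Any) -> Any:
        return json.loads(blob.decode("utf-8"))

    def export(self, data: Any, path: Path) -> None:
        path.write_text(json.dumps(data, indent=2))

    def import_(self, path: Path) -> Any:
        return json.loads(path.read_text())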
src/causaliq_knowledge/cache/encoders/json_encoder.py (new file)
@@ -0,0 +1,418 @@
+"""
+Generic JSON encoder with tokenisation and literal handling.
+
+Tokenises JSON structure (keys, structural chars, string values) while
+storing numbers as compact binary literals. Achieves 50-70% compression
+on typical JSON data.
+
+Note: This module is designed for future migration to causaliq-core.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import struct
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from causaliq_knowledge.cache.encoders.base import EntryEncoder
+
+if TYPE_CHECKING:  # pragma: no cover
+    from causaliq_knowledge.cache.token_cache import TokenCache
+
+
+# Type markers for encoded values
+TOKEN_REF = 0x00
+LITERAL_INT = 0x01
+LITERAL_FLOAT = 0x02
+
+
+class JsonEncoder(EntryEncoder):
+    """Tokenised encoding for JSON-serialisable data.
+
+    Uses shared token dictionary for JSON structure and text content.
+    Numbers are stored as binary literals. Typical compression is 50-70%.
+
+    Encoding format:
+    - Token reference: 0x00 + uint16 (token ID)
+    - Integer literal: 0x01 + int64 (8 bytes, signed)
+    - Float literal: 0x02 + float64 (8 bytes, double)
+
+    Example:
+        >>> from causaliq_knowledge.cache import TokenCache
+        >>> with TokenCache(":memory:") as cache:
+        ...     encoder = JsonEncoder()
+        ...     data = {"key": "value", "count": 42}
+        ...     blob = encoder.encode(data, cache)
+        ...     decoded = encoder.decode(blob, cache)
+        ...     assert decoded == data
+    """
+
+    def _get_token(self, token_id: int, token_cache: TokenCache) -> str:
+        """Get token by ID, raising error if not found.
+
+        Args:
+            token_id: The token ID to look up.
+            token_cache: Cache instance for token dictionary.
+
+        Returns:
+            The token string.
+
+        Raises:
+            ValueError: If token ID not found (corrupted cache).
+        """
+        token = token_cache.get_token(token_id)
+        if token is None:
+            raise ValueError(f"Unknown token ID: {token_id}")
+        return token
+
+    @property
+    def default_export_format(self) -> str:
+        """Default file extension for exports."""
+        return "json"
+
+    def encode(self, data: Any, token_cache: TokenCache) -> bytes:
+        """Encode JSON-serialisable data to tokenised binary format.
+
+        Args:
+            data: Any JSON-serialisable data (dict, list, str, int, etc.).
+            token_cache: Cache instance for shared token dictionary.
+
+        Returns:
+            Compact binary representation using token IDs and literals.
+        """
+        result = bytearray()
+        self._encode_value(data, token_cache, result)
+        return bytes(result)
+
+    def decode(self, blob: bytes, token_cache: TokenCache) -> Any:
+        """Decode tokenised binary data back to JSON structure.
+
+        Args:
+            blob: Binary data from cache.
+            token_cache: Cache instance for shared token dictionary.
+
+        Returns:
+            Decoded JSON-compatible data structure.
+        """
+        offset = 0
+        value, _ = self._decode_value(blob, offset, token_cache)
+        return value
+
+    def export(self, data: Any, path: Path) -> None:
+        """Export data to JSON file.
+
+        Args:
+            data: The decoded data to export.
+            path: Destination file path.
+        """
+        path.write_text(json.dumps(data, indent=2))
+
+    def import_(self, path: Path) -> Any:
+        """Import data from JSON file.
+
+        Args:
+            path: Source file path.
+
+        Returns:
+            Imported JSON data ready for encoding.
+        """
+        return json.loads(path.read_text())
+
+    def _encode_value(
+        self, value: Any, token_cache: TokenCache, result: bytearray
+    ) -> None:
+        """Recursively encode a JSON value.
+
+        Args:
+            value: Value to encode.
+            token_cache: Cache for token dictionary.
+            result: Bytearray to append encoded data to.
+        """
+        if value is None:
+            self._encode_token("null", token_cache, result)
+        elif isinstance(value, bool):
+            # Must check bool before int (bool is subclass of int)
+            self._encode_token(
+                "true" if value else "false", token_cache, result
+            )
+        elif isinstance(value, int):
+            result.append(LITERAL_INT)
+            result.extend(struct.pack("<q", value))
+        elif isinstance(value, float):
+            result.append(LITERAL_FLOAT)
+            result.extend(struct.pack("<d", value))
+        elif isinstance(value, str):
+            self._encode_string(value, token_cache, result)
+        elif isinstance(value, list):
+            self._encode_list(value, token_cache, result)
+        elif isinstance(value, dict):
+            self._encode_dict(value, token_cache, result)
+        else:
+            # Fallback: convert to string
+            self._encode_string(str(value), token_cache, result)
+
+    def _encode_token(
+        self, token: str, token_cache: TokenCache, result: bytearray
+    ) -> None:
+        """Encode a single token reference.
+
+        Args:
+            token: Token string to encode.
+            token_cache: Cache for token dictionary.
+            result: Bytearray to append encoded data to.
+        """
+        token_id = token_cache.get_or_create_token(token)
+        result.append(TOKEN_REF)
+        result.extend(struct.pack("<H", token_id))
+
+    def _encode_string(
+        self, value: str, token_cache: TokenCache, result: bytearray
+    ) -> None:
+        """Encode a string value with tokenisation.
+
+        Strings are split into tokens (words/punctuation) with special
+        markers for string start/end.
+
+        Args:
+            value: String to encode.
+            token_cache: Cache for token dictionary.
+            result: Bytearray to append encoded data to.
+        """
+        self._encode_token('"', token_cache, result)
+        # Split on whitespace and punctuation, keeping delimiters
+        tokens = self._tokenise_string(value)
+        for token in tokens:
+            self._encode_token(token, token_cache, result)
+        self._encode_token('"', token_cache, result)
+
+    def _encode_list(
+        self, value: list, token_cache: TokenCache, result: bytearray
+    ) -> None:
+        """Encode a list value.
+
+        Args:
+            value: List to encode.
+            token_cache: Cache for token dictionary.
+            result: Bytearray to append encoded data to.
+        """
+        self._encode_token("[", token_cache, result)
+        for i, item in enumerate(value):
+            if i > 0:
+                self._encode_token(",", token_cache, result)
+            self._encode_value(item, token_cache, result)
+        self._encode_token("]", token_cache, result)
+
+    def _encode_dict(
+        self, value: dict, token_cache: TokenCache, result: bytearray
+    ) -> None:
+        """Encode a dict value.
+
+        Args:
+            value: Dict to encode.
+            token_cache: Cache for token dictionary.
+            result: Bytearray to append encoded data to.
+        """
+        self._encode_token("{", token_cache, result)
+        for i, (key, val) in enumerate(value.items()):
+            if i > 0:
+                self._encode_token(",", token_cache, result)
+            self._encode_string(str(key), token_cache, result)
+            self._encode_token(":", token_cache, result)
+            self._encode_value(val, token_cache, result)
+        self._encode_token("}", token_cache, result)
+
+    def _tokenise_string(self, value: str) -> list[str]:
+        """Split string into tokens for encoding.
+
+        Splits on whitespace and punctuation boundaries, preserving
+        all characters. Empty string returns empty list.
+
+        Args:
+            value: String to tokenise.
+
+        Returns:
+            List of token strings.
+        """
+        if not value:
+            return []
+        # Split on word boundaries, keeping all parts
+        # Matches: word chars, whitespace runs, or single punctuation
+        tokens = re.findall(r"\w+|\s+|[^\w\s]", value)
+        return tokens
+
+    def _decode_value(
+        self, blob: bytes, offset: int, token_cache: TokenCache
+    ) -> tuple[Any, int]:
+        """Decode a single value from blob at offset.
+
+        Args:
+            blob: Binary data to decode.
+            offset: Current position in blob.
+            token_cache: Cache for token dictionary.
+
+        Returns:
+            Tuple of (decoded value, new offset).
+        """
+        if offset >= len(blob):
+            raise ValueError("Unexpected end of data")
+
+        type_marker = blob[offset]
+        offset += 1
+
+        if type_marker == LITERAL_INT:
+            value = struct.unpack("<q", blob[offset : offset + 8])[0]
+            return value, offset + 8
+        elif type_marker == LITERAL_FLOAT:
+            value = struct.unpack("<d", blob[offset : offset + 8])[0]
+            return value, offset + 8
+        elif type_marker == TOKEN_REF:
+            token_id = struct.unpack("<H", blob[offset : offset + 2])[0]
+            offset += 2
+            token = self._get_token(token_id, token_cache)
+
+            if token == "null":
+                return None, offset
+            elif token == "true":
+                return True, offset
+            elif token == "false":
+                return False, offset
+            elif token == '"':
+                return self._decode_string(blob, offset, token_cache)
+            elif token == "[":
+                return self._decode_list(blob, offset, token_cache)
+            elif token == "{":
+                return self._decode_dict(blob, offset, token_cache)
+            else:
+                raise ValueError(
+                    f"Unexpected token at value position: {token}"
+                )
+        else:
+            raise ValueError(f"Unknown type marker: {type_marker}")
+
+    def _decode_string(
+        self, blob: bytes, offset: int, token_cache: TokenCache
+    ) -> tuple[str, int]:
+        """Decode a string value (after opening quote consumed).
+
+        Args:
+            blob: Binary data to decode.
+            offset: Current position (after opening quote).
+            token_cache: Cache for token dictionary.
+
+        Returns:
+            Tuple of (decoded string, new offset).
+        """
+        parts: list[str] = []
+        while offset < len(blob):
+            type_marker = blob[offset]
+            if type_marker != TOKEN_REF:
+                raise ValueError(
+                    f"Expected token in string, got {type_marker}"
+                )
+            token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+            offset += 3
+            token = self._get_token(token_id, token_cache)
+            if token == '"':
+                # End of string
+                return "".join(parts), offset
+            parts.append(token)
+        raise ValueError("Unterminated string")
+
+    def _decode_list(
+        self, blob: bytes, offset: int, token_cache: TokenCache
+    ) -> tuple[list, int]:
+        """Decode a list value (after opening bracket consumed).
+
+        Args:
+            blob: Binary data to decode.
+            offset: Current position (after opening bracket).
+            token_cache: Cache for token dictionary.
+
+        Returns:
+            Tuple of (decoded list, new offset).
+        """
+        items = []
+        # Check for empty list
+        if offset < len(blob) and blob[offset] == TOKEN_REF:
+            token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+            token = self._get_token(token_id, token_cache)
+            if token == "]":
+                return [], offset + 3
+
+        while offset < len(blob):
+            value, offset = self._decode_value(blob, offset, token_cache)
+            items.append(value)
+
+            # Check for comma or closing bracket
+            if offset >= len(blob):
+                raise ValueError("Unterminated list")
+            if blob[offset] != TOKEN_REF:
+                raise ValueError("Expected token after list item")
+            token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+            offset += 3
+            token = self._get_token(token_id, token_cache)
+            if token == "]":
+                return items, offset
+            elif token != ",":
+                raise ValueError(f"Expected ',' or ']' in list, got '{token}'")
+
+        raise ValueError("Unterminated list")  # pragma: no cover
+
+    def _decode_dict(
+        self, blob: bytes, offset: int, token_cache: TokenCache
+    ) -> tuple[dict, int]:
+        """Decode a dict value (after opening brace consumed).
+
+        Args:
+            blob: Binary data to decode.
+            offset: Current position (after opening brace).
+            token_cache: Cache for token dictionary.
+
+        Returns:
+            Tuple of (decoded dict, new offset).
+        """
+        result = {}
+        # Check for empty dict
+        if offset < len(blob) and blob[offset] == TOKEN_REF:
+            token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+            token = self._get_token(token_id, token_cache)
+            if token == "}":
+                return {}, offset + 3
+
+        while offset < len(blob):
+            # Decode key (must be string)
+            key, offset = self._decode_value(blob, offset, token_cache)
+            if not isinstance(key, str):
+                raise ValueError(f"Dict key must be string, got {type(key)}")
+
+            # Expect colon
+            if offset >= len(blob) or blob[offset] != TOKEN_REF:
+                raise ValueError("Expected ':' after dict key")
+            token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+            offset += 3
+            token = self._get_token(token_id, token_cache)
+            if token != ":":
+                raise ValueError(f"Expected ':', got '{token}'")
+
+            # Decode value
+            value, offset = self._decode_value(blob, offset, token_cache)
+            result[key] = value
+
+            # Check for comma or closing brace
+            if offset >= len(blob):
+                raise ValueError("Unterminated dict")
+            if blob[offset] != TOKEN_REF:
+                raise ValueError("Expected token after dict value")
+            token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+            offset += 3
+            token = self._get_token(token_id, token_cache)
+            if token == "}":
+                return result, offset
+            elif token != ",":
+                raise ValueError(
+                    f"Expected ',' or '}}' in dict, got '{token}'"
+                )
+
+        raise ValueError("Unterminated dict")  # pragma: no cover
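To make the encoding format concrete, here is a hand-worked byte layout for the dict {"n": 7}, following _encode_dict and _encode_value above. The token IDs are hypothetical; in practice get_or_create_token assigns them:

import struct

TOKEN_REF, LITERAL_INT = 0x00, 0x01

def ref(token_id: int) -> bytes:
    # Token reference: marker 0x00 plus little-endian uint16 ID.
    return bytes([TOKEN_REF]) + struct.pack("<H", token_id)

# Assume the dictionary assigned: '{' -> 1, '"' -> 2, 'n' -> 3, ':' -> 4, '}' -> 5.
blob = (
    ref(1)                                          # {
    + ref(2) + ref(3) + ref(2)                      # "n" (open quote, token, close quote)
    + ref(4)                                        # :
    + bytes([LITERAL_INT]) + struct.pack("<q", 7)   # 7 as a signed 8-byte literal
    + ref(5)                                        # }
)
assert len(blob) == 5 * 3 + 9   # five 3-byte token refs plus one 9-byte int literal

Note how the whole structure compresses to 24 bytes, with every repeated key or punctuation character collapsing to a 3-byte reference into the shared dictionary; this is the source of the 50-70% compression the module docstring claims.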