causaliq-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Files changed (31)
  1. causaliq_knowledge/__init__.py +6 -3
  2. causaliq_knowledge/action.py +480 -0
  3. causaliq_knowledge/cache/__init__.py +18 -0
  4. causaliq_knowledge/cache/encoders/__init__.py +13 -0
  5. causaliq_knowledge/cache/encoders/base.py +90 -0
  6. causaliq_knowledge/cache/encoders/json_encoder.py +430 -0
  7. causaliq_knowledge/cache/token_cache.py +666 -0
  8. causaliq_knowledge/cli/__init__.py +15 -0
  9. causaliq_knowledge/cli/cache.py +478 -0
  10. causaliq_knowledge/cli/generate.py +410 -0
  11. causaliq_knowledge/cli/main.py +172 -0
  12. causaliq_knowledge/cli/models.py +309 -0
  13. causaliq_knowledge/graph/__init__.py +78 -0
  14. causaliq_knowledge/graph/generator.py +457 -0
  15. causaliq_knowledge/graph/loader.py +222 -0
  16. causaliq_knowledge/graph/models.py +426 -0
  17. causaliq_knowledge/graph/params.py +175 -0
  18. causaliq_knowledge/graph/prompts.py +445 -0
  19. causaliq_knowledge/graph/response.py +392 -0
  20. causaliq_knowledge/graph/view_filter.py +154 -0
  21. causaliq_knowledge/llm/base_client.py +147 -1
  22. causaliq_knowledge/llm/cache.py +443 -0
  23. causaliq_knowledge/py.typed +0 -0
  24. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/METADATA +10 -6
  25. causaliq_knowledge-0.4.0.dist-info/RECORD +42 -0
  26. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/WHEEL +1 -1
  27. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/entry_points.txt +3 -0
  28. causaliq_knowledge/cli.py +0 -414
  29. causaliq_knowledge-0.2.0.dist-info/RECORD +0 -22
  30. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/licenses/LICENSE +0 -0
  31. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/top_level.txt +0 -0
causaliq_knowledge/cache/encoders/json_encoder.py
@@ -0,0 +1,430 @@
+ """
+ Generic JSON encoder with tokenisation and literal handling.
+
+ Tokenises JSON structure (keys, structural chars, string values) while
+ storing numbers as compact binary literals. Achieves 50-70% compression
+ on typical JSON data.
+
+ Note: This module is designed for future migration to causaliq-core.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import re
+ import struct
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ from causaliq_knowledge.cache.encoders.base import EntryEncoder
+
+ if TYPE_CHECKING:  # pragma: no cover
+     from causaliq_knowledge.cache.token_cache import TokenCache
+
+
+ # Type markers for encoded values
+ TOKEN_REF = 0x00
+ LITERAL_INT = 0x01
+ LITERAL_FLOAT = 0x02
+
+
+ class JsonEncoder(EntryEncoder):
+     """Tokenised encoding for JSON-serialisable data.
+
+     Uses shared token dictionary for JSON structure and text content.
+     Numbers are stored as binary literals. Typical compression is 50-70%.
+
+     Encoding format:
+         - Token reference: 0x00 + uint16 (token ID)
+         - Integer literal: 0x01 + int64 (8 bytes, signed)
+         - Float literal: 0x02 + float64 (8 bytes, double)
+
+     Example:
+         >>> from causaliq_knowledge.cache import TokenCache
+         >>> with TokenCache(":memory:") as cache:
+         ...     encoder = JsonEncoder()
+         ...     data = {"key": "value", "count": 42}
+         ...     blob = encoder.encode(data, cache)
+         ...     decoded = encoder.decode(blob, cache)
+         ...     assert decoded == data
+     """
+
+     def _get_token(self, token_id: int, token_cache: TokenCache) -> str:
+         """Get token by ID, raising error if not found.
+
+         Args:
+             token_id: The token ID to look up.
+             token_cache: Cache instance for token dictionary.
+
+         Returns:
+             The token string.
+
+         Raises:
+             ValueError: If token ID not found (corrupted cache).
+         """
+         token = token_cache.get_token(token_id)
+         if token is None:
+             raise ValueError(f"Unknown token ID: {token_id}")
+         return token
+
+     @property
+     def default_export_format(self) -> str:
+         """Default file extension for exports."""
+         return "json"
+
+     def encode(self, data: Any, token_cache: TokenCache) -> bytes:
+         """Encode JSON-serialisable data to tokenised binary format.
+
+         Args:
+             data: Any JSON-serialisable data (dict, list, str, int, etc.).
+             token_cache: Cache instance for shared token dictionary.
+
+         Returns:
+             Compact binary representation using token IDs and literals.
+         """
+         result = bytearray()
+         self._encode_value(data, token_cache, result)
+         return bytes(result)
+
+     def decode(self, blob: bytes, token_cache: TokenCache) -> Any:
+         """Decode tokenised binary data back to JSON structure.
+
+         Args:
+             blob: Binary data from cache.
+             token_cache: Cache instance for shared token dictionary.
+
+         Returns:
+             Decoded JSON-compatible data structure.
+         """
+         offset = 0
+         value, _ = self._decode_value(blob, offset, token_cache)
+         return value
+
+     def export(self, data: Any, path: Path) -> None:
+         """Export data to JSON file.
+
+         Args:
+             data: The decoded data to export.
+             path: Destination file path.
+         """
+         path.write_text(json.dumps(data, indent=2))
+
+     def import_(self, path: Path) -> Any:
+         """Import data from JSON file.
+
+         Args:
+             path: Source file path.
+
+         Returns:
+             Imported JSON data ready for encoding.
+         """
+         return json.loads(path.read_text())
+
+     def _encode_value(
+         self, value: Any, token_cache: TokenCache, result: bytearray
+     ) -> None:
+         """Recursively encode a JSON value.
+
+         Args:
+             value: Value to encode.
+             token_cache: Cache for token dictionary.
+             result: Bytearray to append encoded data to.
+         """
+         if value is None:
+             self._encode_token("null", token_cache, result)
+         elif isinstance(value, bool):
+             # Must check bool before int (bool is subclass of int)
+             self._encode_token(
+                 "true" if value else "false", token_cache, result
+             )
+         elif isinstance(value, int):
+             result.append(LITERAL_INT)
+             result.extend(struct.pack("<q", value))
+         elif isinstance(value, float):
+             result.append(LITERAL_FLOAT)
+             result.extend(struct.pack("<d", value))
+         elif isinstance(value, str):
+             self._encode_string(value, token_cache, result)
+         elif isinstance(value, list):
+             self._encode_list(value, token_cache, result)
+         elif isinstance(value, dict):
+             self._encode_dict(value, token_cache, result)
+         else:
+             # Fallback: convert to string
+             self._encode_string(str(value), token_cache, result)
+
+     def _encode_token(
+         self, token: str, token_cache: TokenCache, result: bytearray
+     ) -> None:
+         """Encode a single token reference.
+
+         Args:
+             token: Token string to encode.
+             token_cache: Cache for token dictionary.
+             result: Bytearray to append encoded data to.
+         """
+         token_id = token_cache.get_or_create_token(token)
+         result.append(TOKEN_REF)
+         result.extend(struct.pack("<H", token_id))
+
+     def _encode_string(
+         self, value: str, token_cache: TokenCache, result: bytearray
+     ) -> None:
+         """Encode a string value with tokenisation.
+
+         Strings are split into tokens (words/punctuation) with special
+         markers for string start/end. Double quotes within the string
+         are encoded as a '\\"' token to distinguish from string delimiters.
+
+         Args:
+             value: String to encode.
+             token_cache: Cache for token dictionary.
+             result: Bytearray to append encoded data to.
+         """
+         self._encode_token('"', token_cache, result)
+         # Split on whitespace and punctuation, keeping delimiters
+         tokens = self._tokenise_string(value)
+         for token in tokens:
+             # Escape embedded quotes to distinguish from string delimiter
+             if token == '"':
+                 self._encode_token('\\"', token_cache, result)
+             else:
+                 self._encode_token(token, token_cache, result)
+         self._encode_token('"', token_cache, result)
+
+     def _encode_list(
+         self, value: list, token_cache: TokenCache, result: bytearray
+     ) -> None:
+         """Encode a list value.
+
+         Args:
+             value: List to encode.
+             token_cache: Cache for token dictionary.
+             result: Bytearray to append encoded data to.
+         """
+         self._encode_token("[", token_cache, result)
+         for i, item in enumerate(value):
+             if i > 0:
+                 self._encode_token(",", token_cache, result)
+             self._encode_value(item, token_cache, result)
+         self._encode_token("]", token_cache, result)
+
+     def _encode_dict(
+         self, value: dict, token_cache: TokenCache, result: bytearray
+     ) -> None:
+         """Encode a dict value.
+
+         Args:
+             value: Dict to encode.
+             token_cache: Cache for token dictionary.
+             result: Bytearray to append encoded data to.
+         """
+         self._encode_token("{", token_cache, result)
+         for i, (key, val) in enumerate(value.items()):
+             if i > 0:
+                 self._encode_token(",", token_cache, result)
+             self._encode_string(str(key), token_cache, result)
+             self._encode_token(":", token_cache, result)
+             self._encode_value(val, token_cache, result)
+         self._encode_token("}", token_cache, result)
+
+     def _tokenise_string(self, value: str) -> list[str]:
+         """Split string into tokens for encoding.
+
+         Splits on whitespace and punctuation boundaries, preserving
+         all characters. An empty string returns an empty list.
+
+         Args:
+             value: String to tokenise.
+
+         Returns:
+             List of token strings.
+         """
+         if not value:
+             return []
+         # Split on word boundaries, keeping all parts
+         # Matches: word chars, whitespace runs, or single punctuation
+         tokens = re.findall(r"\w+|\s+|[^\w\s]", value)
+         return tokens
+
+     def _decode_value(
+         self, blob: bytes, offset: int, token_cache: TokenCache
+     ) -> tuple[Any, int]:
+         """Decode a single value from blob at offset.
+
+         Args:
+             blob: Binary data to decode.
+             offset: Current position in blob.
+             token_cache: Cache for token dictionary.
+
+         Returns:
+             Tuple of (decoded value, new offset).
+         """
+         if offset >= len(blob):
+             raise ValueError("Unexpected end of data")
+
+         type_marker = blob[offset]
+         offset += 1
+
+         if type_marker == LITERAL_INT:
+             value = struct.unpack("<q", blob[offset : offset + 8])[0]
+             return value, offset + 8
+         elif type_marker == LITERAL_FLOAT:
+             value = struct.unpack("<d", blob[offset : offset + 8])[0]
+             return value, offset + 8
+         elif type_marker == TOKEN_REF:
+             token_id = struct.unpack("<H", blob[offset : offset + 2])[0]
+             offset += 2
+             token = self._get_token(token_id, token_cache)
+
+             if token == "null":
+                 return None, offset
+             elif token == "true":
+                 return True, offset
+             elif token == "false":
+                 return False, offset
+             elif token == '"':
+                 return self._decode_string(blob, offset, token_cache)
+             elif token == "[":
+                 return self._decode_list(blob, offset, token_cache)
+             elif token == "{":
+                 return self._decode_dict(blob, offset, token_cache)
+             else:
+                 raise ValueError(
+                     f"Unexpected token at value position: {token}"
+                 )
+         else:
+             raise ValueError(f"Unknown type marker: {type_marker}")
+
+     def _decode_string(
+         self, blob: bytes, offset: int, token_cache: TokenCache
+     ) -> tuple[str, int]:
+         """Decode a string value (after opening quote consumed).
+
+         Handles escaped quotes (the '\\"' token), which represent
+         literal double quotes within the string content.
+
+         Args:
+             blob: Binary data to decode.
+             offset: Current position (after opening quote).
+             token_cache: Cache for token dictionary.
+
+         Returns:
+             Tuple of (decoded string, new offset).
+         """
+         parts: list[str] = []
+         while offset < len(blob):
+             type_marker = blob[offset]
+             if type_marker != TOKEN_REF:
+                 raise ValueError(
+                     f"Expected token in string, got {type_marker}"
+                 )
+             token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+             offset += 3
+             token = self._get_token(token_id, token_cache)
+             if token == '"':
+                 # End of string
+                 return "".join(parts), offset
+             elif token == '\\"':
+                 # Escaped quote - append literal quote character
+                 parts.append('"')
+             else:
+                 parts.append(token)
+         raise ValueError("Unterminated string")
+
+     def _decode_list(
+         self, blob: bytes, offset: int, token_cache: TokenCache
+     ) -> tuple[list, int]:
+         """Decode a list value (after opening bracket consumed).
+
+         Args:
+             blob: Binary data to decode.
+             offset: Current position (after opening bracket).
+             token_cache: Cache for token dictionary.
+
+         Returns:
+             Tuple of (decoded list, new offset).
+         """
+         items = []
+         # Check for empty list
+         if offset < len(blob) and blob[offset] == TOKEN_REF:
+             token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+             token = self._get_token(token_id, token_cache)
+             if token == "]":
+                 return [], offset + 3
+
+         while offset < len(blob):
+             value, offset = self._decode_value(blob, offset, token_cache)
+             items.append(value)
+
+             # Check for comma or closing bracket
+             if offset >= len(blob):
+                 raise ValueError("Unterminated list")
+             if blob[offset] != TOKEN_REF:
+                 raise ValueError("Expected token after list item")
+             token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+             offset += 3
+             token = self._get_token(token_id, token_cache)
+             if token == "]":
+                 return items, offset
+             elif token != ",":
+                 raise ValueError(f"Expected ',' or ']' in list, got '{token}'")
+
+         raise ValueError("Unterminated list")  # pragma: no cover
+
+     def _decode_dict(
+         self, blob: bytes, offset: int, token_cache: TokenCache
+     ) -> tuple[dict, int]:
+         """Decode a dict value (after opening brace consumed).
+
+         Args:
+             blob: Binary data to decode.
+             offset: Current position (after opening brace).
+             token_cache: Cache for token dictionary.
+
+         Returns:
+             Tuple of (decoded dict, new offset).
+         """
+         result = {}
+         # Check for empty dict
+         if offset < len(blob) and blob[offset] == TOKEN_REF:
+             token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+             token = self._get_token(token_id, token_cache)
+             if token == "}":
+                 return {}, offset + 3
+
+         while offset < len(blob):
+             # Decode key (must be string)
+             key, offset = self._decode_value(blob, offset, token_cache)
+             if not isinstance(key, str):
+                 raise ValueError(f"Dict key must be string, got {type(key)}")
+
+             # Expect colon
+             if offset >= len(blob) or blob[offset] != TOKEN_REF:
+                 raise ValueError("Expected ':' after dict key")
+             token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+             offset += 3
+             token = self._get_token(token_id, token_cache)
+             if token != ":":
+                 raise ValueError(f"Expected ':', got '{token}'")
+
+             # Decode value
+             value, offset = self._decode_value(blob, offset, token_cache)
+             result[key] = value
+
+             # Check for comma or closing brace
+             if offset >= len(blob):
+                 raise ValueError("Unterminated dict")
+             if blob[offset] != TOKEN_REF:
+                 raise ValueError("Expected token after dict value")
+             token_id = struct.unpack("<H", blob[offset + 1 : offset + 3])[0]
+             offset += 3
+             token = self._get_token(token_id, token_cache)
+             if token == "}":
+                 return result, offset
+             elif token != ",":
+                 raise ValueError(
+                     f"Expected ',' or '}}' in dict, got '{token}'"
+                 )
+
+         raise ValueError("Unterminated dict")  # pragma: no cover