thoughtflow 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thoughtflow/__init__.py +57 -14
- thoughtflow/_util.py +713 -69
- thoughtflow/action.py +357 -0
- thoughtflow/agent.py +49 -130
- thoughtflow/llm.py +250 -0
- thoughtflow/memory/__init__.py +20 -15
- thoughtflow/memory/base.py +1615 -99
- thoughtflow/thought.py +1102 -0
- thoughtflow/thoughtflow6.py +4180 -0
- thoughtflow-0.0.4.dist-info/METADATA +1686 -0
- thoughtflow-0.0.4.dist-info/RECORD +25 -0
- thoughtflow/adapters/__init__.py +0 -43
- thoughtflow/adapters/anthropic.py +0 -119
- thoughtflow/adapters/base.py +0 -140
- thoughtflow/adapters/local.py +0 -133
- thoughtflow/adapters/openai.py +0 -118
- thoughtflow-0.0.2.dist-info/METADATA +0 -215
- thoughtflow-0.0.2.dist-info/RECORD +0 -26
- {thoughtflow-0.0.2.dist-info → thoughtflow-0.0.4.dist-info}/WHEEL +0 -0
- {thoughtflow-0.0.2.dist-info → thoughtflow-0.0.4.dist-info}/licenses/LICENSE +0 -0
thoughtflow/_util.py
CHANGED
|
@@ -1,108 +1,752 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Internal utilities for ThoughtFlow.
|
|
3
3
|
|
|
4
|
-
This module contains helper functions used
|
|
5
|
-
These are NOT part of the public API and may change without notice.
|
|
6
|
-
|
|
7
|
-
Note: The underscore prefix indicates this is internal/private.
|
|
4
|
+
This module contains helper functions and classes used by ThoughtFlow.
|
|
8
5
|
"""
|
|
9
6
|
|
|
10
7
|
from __future__ import annotations
|
|
11
8
|
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
#############################################################################
|
|
10
|
+
#############################################################################
|
|
14
11
|
|
|
15
|
-
|
|
12
|
+
### IMPORTS AND SETTINGS
|
|
16
13
|
|
|
14
|
+
import os, sys, time, pickle, json, uuid
|
|
15
|
+
import http, urllib, socket, ssl, gzip, copy
|
|
16
|
+
import urllib.request
|
|
17
|
+
import pprint
|
|
18
|
+
import random
|
|
19
|
+
import re, ast
|
|
20
|
+
from typing import Mapping, Any, Iterable, Optional, Tuple, Union
|
|
17
21
|
|
|
18
|
-
|
|
19
|
-
|
|
22
|
+
import time,hashlib,pickle
|
|
23
|
+
from random import randint
|
|
24
|
+
from functools import reduce
|
|
20
25
|
|
|
21
|
-
|
|
22
|
-
|
|
26
|
+
import datetime as dtt
|
|
27
|
+
from zoneinfo import ZoneInfo
|
|
28
|
+
|
|
29
|
+
# Timezone handles used by this module's time formatting.
# NOTE(review): tz_bog suggests the project's local timezone is Bogota —
# usage is not visible in this chunk; confirm against callers.
tz_bog = ZoneInfo("America/Bogota")
tz_utc = ZoneInfo("UTC")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
#############################################################################
|
|
34
|
+
#############################################################################
|
|
35
|
+
|
|
36
|
+
### EVENT STAMP LOGIC
|
|
37
|
+
|
|
38
|
+
class EventStamp:
    """
    Generate and decode deterministic event stamps using Base62 encoding.

    Event stamps combine an encoded timestamp (8 chars), a document hash
    (5 chars), and a random component (3 chars) into a compact
    16-character identifier.

    Usage:
        EventStamp.stamp()            # Generate a new stamp
        EventStamp.decode_time(s)     # Decode timestamp from stamp
        EventStamp.hashify("text")    # Generate deterministic hash
    """

    # Base62 alphabet: digits, then uppercase, then lowercase.
    CHARSET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

    @staticmethod
    def sha256_hash(input_string):
        """Return the SHA-256 hash of *input_string* as a big-endian integer."""
        hash_bytes = hashlib.sha256(input_string.encode("utf-8")).digest()
        return int.from_bytes(hash_bytes, byteorder="big")

    @staticmethod
    def base62_encode(number, length):
        """Encode an integer into a fixed-length Base62 string."""
        base = len(EventStamp.CHARSET)
        encoded = []
        for _ in range(length):
            number, remainder = divmod(number, base)
            encoded.append(EventStamp.CHARSET[remainder])
        return ''.join(encoded[::-1])  # Reverse so most-significant digit is first

    @staticmethod
    def hashify(input_string, length=32):
        """Generate a deterministic Base62 hash of *input_string* (default 32 chars)."""
        hashed_int = EventStamp.sha256_hash(input_string)
        return EventStamp.base62_encode(hashed_int, length)

    @staticmethod
    def encode_num(num, charset=None):
        """Encode a non-negative integer in the given base/charset (variable length)."""
        if charset is None:
            charset = EventStamp.CHARSET
        base = len(charset)
        if num < base:
            return charset[num]
        return EventStamp.encode_num(num // base, charset) + charset[num % base]

    @staticmethod
    def decode_num(encoded_str, charset=None):
        """Decode a base-encoded string back to an integer."""
        if charset is None:
            charset = EventStamp.CHARSET
        base = len(charset)
        char_to_value = {c: i for i, c in enumerate(charset)}
        return reduce(lambda num, c: num * base + char_to_value[c], encoded_str, 0)

    @staticmethod
    def encode_time(unix_time=0):
        """Encode the current (or given) unix time at 0.1 ms resolution."""
        if unix_time == 0:
            t = int(time.time() * 10000)
        else:
            t = int(unix_time * 10000)
        return EventStamp.encode_num(t)

    @staticmethod
    def encode_doc(doc=None):
        """Encode a document/value to a 5-character hash."""
        # None sentinel replaces the old mutable {} default (same behavior).
        if doc is None:
            doc = {}
        return EventStamp.hashify(str(doc), 5)

    @staticmethod
    def encode_rando(length=3):
        """Generate a random Base62 code of the requested length (zero-padded)."""
        n = randint(300000, 900000)
        c = '000' + EventStamp.encode_num(n)
        return c[-length:]

    @staticmethod
    def stamp(doc=None):
        """
        Generate a 16-character event stamp.

        Combines encoded time, document hash, and a random component.
        When *doc* is empty, the doc-hash is derived from the time+random
        codes so the stamp is still well-formed.
        """
        if doc is None:
            doc = {}
        time_code = EventStamp.encode_time()
        rando_code = EventStamp.encode_rando()
        if len(str(doc)) > 2:
            doc_code = EventStamp.encode_doc(doc)
        else:
            arb = time_code + rando_code
            doc_code = EventStamp.encode_doc(arb)
        return (time_code + doc_code + rando_code)[:16]

    @staticmethod
    def decode_time(stamp, charset=None):
        """Decode the unix-time (seconds, float) component from an event stamp."""
        if charset is None:
            charset = EventStamp.CHARSET
        stamp_prefix = stamp[:8]  # time component occupies the first 8 chars
        scaled_time = EventStamp.decode_num(stamp_prefix, charset)
        unix_time_seconds = scaled_time / 10000
        return unix_time_seconds
|
25
132
|
|
|
26
133
|
|
|
27
|
-
|
|
28
|
-
|
|
134
|
+
# Backwards compatibility aliases: older code imported these names as
# module-level functions, so re-export EventStamp's core operations.
event_stamp = EventStamp.stamp
hashify = EventStamp.hashify
encode_num = EventStamp.encode_num
decode_num = EventStamp.decode_num
|
|
29
139
|
|
|
30
|
-
|
|
31
|
-
|
|
140
|
+
#############################################################################
|
|
141
|
+
#############################################################################
|
|
142
|
+
|
|
143
|
+
### HELPER FUNCTIONS
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
default_header = '''
Markers like <start … l4zk> and </end … l4zk>
indicate where a text section begins and ends.
Never mix boundaries. Each block is separate.
This is to improve your ease-of-reading.
'''

def construct_prompt(
    prompt_obj=None,
    order=None,
    header='',
):
    """
    Assemble a delimited prompt string from named sections.

    Args:
        prompt_obj: Mapping of section name -> section text.
        order: Optional explicit ordering of section names; defaults to
            the mapping's own key order.
        header: Optional preamble text; the literal string 'default'
            selects default_header.

    Returns:
        str: The assembled prompt. Every ' stamp>' placeholder in the
        section markers is replaced by a short random stamp unique to
        this call.
    """
    # Bug-avoidance: previously used mutable defaults {} and [].
    if prompt_obj is None:
        prompt_obj = {}
    sections = list(order) if order else list(prompt_obj)

    # 5-char lowercase stamp with a random digit spliced into the middle.
    rnum = str(randint(1, 9))
    stamp = event_stamp()[-4:].lower()
    stamp = stamp[:2] + rnum + stamp[2:]

    L = []
    if header:
        if header == 'default':
            L.append(default_header + '\n')
        else:
            L.append(header + '\n\n')
    L.append('<start prompt stamp>\n\n')
    for s in sections:
        text = prompt_obj[s]
        s2 = s.strip().replace(' ', '_')
        label1 = "<start " + s2 + " stamp>\n"
        label2 = "\n</end " + s2 + " stamp>\n\n"
        L.append(label1 + text + label2)
    L.append('</end prompt stamp>')
    return ''.join(L).replace(' stamp>', ' ' + stamp + '>')
|
|
180
|
+
|
|
181
|
+
def construct_msgs(
    usr_prompt='',
    vars=None,
    sys_prompt='',
    msgs=None,
):
    """
    Build a chat-message list, optionally substituting variables into content.

    Args:
        usr_prompt: User message text, or a dict handed to construct_prompt.
        vars: Mapping of placeholder -> replacement applied to every
            message's string content.
        sys_prompt: System message text, or a dict handed to construct_prompt.
        msgs: Existing message list to extend; when supplied it is mutated
            in place (system message inserted at the front, user message
            appended), matching prior behavior.

    Returns:
        list: Shallow copies of the messages with substitution applied.
    """
    # Bug fix: 'msgs' and 'vars' previously defaulted to mutable [] / {};
    # the shared default list silently accumulated messages across calls.
    if vars is None:
        vars = {}
    if msgs is None:
        msgs = []

    if sys_prompt:
        if isinstance(sys_prompt, dict):
            sys_prompt = construct_prompt(sys_prompt)
        msgs.insert(0, {'role': 'system', 'content': sys_prompt})
    if usr_prompt:
        if isinstance(usr_prompt, dict):
            usr_prompt = construct_prompt(usr_prompt)
        msgs.append({'role': 'user', 'content': usr_prompt})

    # Substitute on copies so the caller's message dicts are never mutated.
    msgs2 = []
    for m in msgs:
        m_copy = dict(m)
        if isinstance(m_copy.get("content"), str):
            for k, v in vars.items():
                m_copy["content"] = m_copy["content"].replace(k, str(v))
        msgs2.append(m_copy)
    return msgs2
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
#############################################################################
|
|
217
|
+
|
|
218
|
+
class ValidExtractError(ValueError):
    """Raised by valid_extract (and helpers) when extraction or validation fails."""
|
|
220
|
+
|
|
221
|
+
def valid_extract(raw_text: str, parsing_rules: Mapping[str, Any]) -> Any:
    """
    Pull a target Python structure out of noisy LLM output and validate it.

    Parameters
    ----------
    raw_text : str
        Raw model output; may contain prose, code fences, and other noise.
    parsing_rules : dict
        - 'kind': 'python' (default) or 'json'.
        - 'format': schema exemplar, e.g. [], {}, {'name': ''},
          {'num_list': [], 'info': {}}. A trailing '?' on a dict key
          marks it optional; exemplars map to types ('' -> str, 0 -> int,
          0.0 -> float, True -> bool, None -> NoneType).

    Returns
    -------
    Any
        The first parsed candidate that satisfies the schema.

    Raises
    ------
    ValidExtractError
        When nothing parses, or nothing that parses validates.

    Examples
    --------
    >>> valid_extract("Here:\\n```python\\n[1, 2, 3]\\n```", {'kind': 'python', 'format': []})
    [1, 2, 3]
    """
    if not isinstance(parsing_rules, Mapping):
        raise ValidExtractError("parsing_rules must be a mapping.")

    kind = parsing_rules.get("kind", "python")
    schema = parsing_rules.get("format", None)
    if schema is None:
        raise ValidExtractError("parsing_rules['format'] is required.")

    # Try each candidate segment in turn; remember why the latest one failed.
    failure: Optional[Exception] = None
    for candidate in _candidate_segments(raw_text, schema, prefer_fences_first=True):
        try:
            parsed = _parse_segment(candidate, kind=kind)
        except Exception as exc:
            failure = exc
            continue
        is_valid, detail = _validate_schema(parsed, schema)
        if is_valid:
            return parsed
        failure = ValidExtractError("Validation failed for candidate: {}".format(detail))

    # Nothing parsed and validated — surface the most recent failure.
    if failure:
        raise ValidExtractError(str(failure))
    raise ValidExtractError("No parseable candidates found.")
|
|
296
|
+
|
|
297
|
+
# ----------------------------
|
|
298
|
+
# Parsing helpers
|
|
299
|
+
# ----------------------------
|
|
300
|
+
|
|
301
|
+
# --- Replace the fence regex with this (accepts inline fences) ---
|
|
302
|
+
# Fenced-code-block matcher; tolerates inline fences with no newline.
_FENCE_RE = re.compile(
    r"```(?P<lang>[a-zA-Z0-9_\-\.]*)\s*\n?(?P<body>.*?)```",
    re.DOTALL
)

def _candidate_segments(raw_text: str, schema: Any, prefer_fences_first: bool = True) -> Iterable[str]:
    """
    Yield substrings of *raw_text* likely to contain the target structure,
    from most to least promising:

    1. Bodies of ``` code fences, in order (when prefer_fences_first).
    2. The first balanced slice for the schema's top-level delimiter.
    3. The whole text, as a last resort.
    """
    if prefer_fences_first:
        for fence in _FENCE_RE.finditer(raw_text):
            # Every fence body is worth trying, whatever language it declares.
            yield fence.group("body")

    opener, closer = _delims_for_schema(schema)
    if opener and closer:
        balanced = _balanced_slice(raw_text, opener, closer)
        if balanced is not None:
            yield balanced

    yield raw_text
|
333
|
+
|
|
334
|
+
def _parse_segment(segment: str, kind: str = "python") -> Any:
|
|
47
335
|
"""
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
336
|
+
Parse a segment into a Python object according to 'kind'.
|
|
337
|
+
- python: ast.literal_eval
|
|
338
|
+
- json: json.loads (with fallback: try literal_eval if JSON fails, for LLM single-quote dicts)
|
|
339
|
+
"""
|
|
340
|
+
text = segment.strip()
|
|
341
|
+
|
|
342
|
+
if kind == "python":
|
|
343
|
+
# Remove leading language hints often kept when copying from fences
|
|
344
|
+
if text.startswith("python\n"):
|
|
345
|
+
text = text[len("python\n") :].lstrip()
|
|
346
|
+
return ast.literal_eval(text)
|
|
347
|
+
|
|
348
|
+
if kind == "json":
|
|
349
|
+
try:
|
|
350
|
+
return json.loads(text)
|
|
351
|
+
except json.JSONDecodeError:
|
|
352
|
+
# LLMs often return Python-style dicts (single quotes). Try literal_eval as a fallback.
|
|
353
|
+
return ast.literal_eval(text)
|
|
354
|
+
|
|
355
|
+
raise ValidExtractError("Unsupported kind: {!r}".format(kind))
|
|
356
|
+
|
|
357
|
+
def _delims_for_schema(schema: Any) -> Tuple[Optional[str], Optional[str]]:
|
|
358
|
+
"""
|
|
359
|
+
Infer top-level delimiters from the schema.
|
|
360
|
+
- list-like → [ ]
|
|
361
|
+
- dict-like → { }
|
|
362
|
+
- tuple-like (if used) → ( )
|
|
363
|
+
- string/number/bool/None → no delimiters (None, None)
|
|
364
|
+
"""
|
|
365
|
+
# list
|
|
366
|
+
if isinstance(schema, list):
|
|
367
|
+
return "[", "]"
|
|
368
|
+
# dict
|
|
369
|
+
if isinstance(schema, dict):
|
|
370
|
+
return "{", "}"
|
|
371
|
+
# tuple schema (rare, but supported)
|
|
372
|
+
if isinstance(schema, tuple):
|
|
373
|
+
return "(", ")"
|
|
374
|
+
# primitives: cannot infer a unique delimiter—return None
|
|
375
|
+
return None, None
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _balanced_slice(text: str, open_ch: str, close_ch: str) -> Optional[str]:
|
|
379
|
+
"""
|
|
380
|
+
Return the first balanced substring between open_ch and close_ch,
|
|
381
|
+
scanning from the *first occurrence of open_ch* (so prose apostrophes
|
|
382
|
+
before the opener don't confuse quote tracking).
|
|
383
|
+
"""
|
|
384
|
+
start = text.find(open_ch)
|
|
385
|
+
if start == -1:
|
|
386
|
+
return None
|
|
387
|
+
|
|
388
|
+
depth = 0
|
|
389
|
+
in_str: Optional[str] = None # quote char if inside ' or "
|
|
390
|
+
escape = False
|
|
391
|
+
i = start
|
|
392
|
+
|
|
393
|
+
while i < len(text):
|
|
394
|
+
ch = text[i]
|
|
395
|
+
if in_str:
|
|
396
|
+
if escape:
|
|
397
|
+
escape = False
|
|
398
|
+
elif ch == "\\":
|
|
399
|
+
escape = True
|
|
400
|
+
elif ch == in_str:
|
|
401
|
+
in_str = None
|
|
52
402
|
else:
|
|
53
|
-
|
|
54
|
-
|
|
403
|
+
if ch in ("'", '"'):
|
|
404
|
+
in_str = ch
|
|
405
|
+
elif ch == open_ch:
|
|
406
|
+
depth += 1
|
|
407
|
+
elif ch == close_ch and depth > 0:
|
|
408
|
+
depth -= 1
|
|
409
|
+
if depth == 0:
|
|
410
|
+
return text[start : i + 1]
|
|
411
|
+
i += 1
|
|
412
|
+
return None
|
|
55
413
|
|
|
56
414
|
|
|
57
|
-
|
|
58
|
-
|
|
415
|
+
# ----------------------------
|
|
416
|
+
# Schema validation
|
|
417
|
+
# ----------------------------
|
|
59
418
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
419
|
+
def _is_optional_key(k: str) -> Tuple[str, bool]:
|
|
420
|
+
"""Return (base_key, optional_flag) for keys with a trailing '?'."""
|
|
421
|
+
if isinstance(k, str) and k.endswith("?"):
|
|
422
|
+
return k[:-1], True
|
|
423
|
+
return k, False
|
|
64
424
|
|
|
65
|
-
|
|
66
|
-
The truncated string.
|
|
425
|
+
def _schema_type(schema: Any) -> Union[type, Tuple[type, ...], None]:
|
|
67
426
|
"""
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
427
|
+
Map schema exemplars to Python types.
|
|
428
|
+
Accepts either exemplar values ('' -> str, 0 -> int, 0.0 -> float, True -> bool, None -> NoneType)
|
|
429
|
+
OR actual types (str, int, float, bool).
|
|
430
|
+
"""
|
|
431
|
+
if schema is None:
|
|
432
|
+
return type(None)
|
|
433
|
+
if schema is str or isinstance(schema, str):
|
|
434
|
+
return str
|
|
435
|
+
if schema is int or (isinstance(schema, int) and not isinstance(schema, bool)):
|
|
436
|
+
return int
|
|
437
|
+
if schema is float or isinstance(schema, float):
|
|
438
|
+
return float
|
|
439
|
+
if schema is bool or isinstance(schema, bool):
|
|
440
|
+
return bool
|
|
441
|
+
if schema is list:
|
|
442
|
+
return list
|
|
443
|
+
if schema is dict:
|
|
444
|
+
return dict
|
|
445
|
+
if schema is tuple:
|
|
446
|
+
return tuple
|
|
447
|
+
return None # composite or unknown marker
|
|
71
448
|
|
|
449
|
+
def _validate_schema(obj: Any, schema: Any, path: str = "$") -> Tuple[bool, str]:
    """
    Recursively validate 'obj' against 'schema'. Returns (ok, message).

    'path' tracks the location inside the structure (e.g. "$.key[0]") so
    failure messages pinpoint the offending element. On success the
    message is always "ok".
    """
    # 1) Primitive types via exemplar or type ([]/{}/() handled below)
    t = _schema_type(schema)
    if t is not None and t not in (list, dict, tuple):
        if isinstance(obj, t):
            return True, "ok"
        return False, "{}: expected {}, got {}".format(path, t.__name__, type(obj).__name__)

    # 2) List schemas
    if isinstance(schema, list):
        if not isinstance(obj, list):
            return False, "{}: expected list, got {}".format(path, type(obj).__name__)
        # If schema is [], any list passes
        if len(schema) == 0:
            return True, "ok"
        # If schema is [subschema], every element must match subschema
        if len(schema) == 1:
            subschema = schema[0]
            for i, el in enumerate(obj):
                ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
                if not ok:
                    return ok, msg
            return True, "ok"
        # Otherwise treat as "structure-by-position" (rare): lengths must
        # match and element i must satisfy schema[i].
        if len(obj) != len(schema):
            return False, "{}: expected list length {}, got {}".format(path, len(schema), len(obj))
        for i, (el, subschema) in enumerate(zip(obj, schema)):
            ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
            if not ok:
                return ok, msg
        return True, "ok"

    # 3) Dict schemas
    if isinstance(schema, dict):
        if not isinstance(obj, dict):
            return False, "{}: expected dict, got {}".format(path, type(obj).__name__)

        # Check required/optional keys in schema. Keys present in obj but
        # absent from schema are deliberately ignored (open schema).
        for skey, subschema in schema.items():
            base_key, optional = _is_optional_key(skey)
            if base_key not in obj:
                if optional:
                    continue
                return False, "{}: missing required key '{}'".format(path, base_key)
            ok, msg = _validate_schema(obj[base_key], subschema, "{}.{}".format(path, base_key))
            if not ok:
                return ok, msg
        return True, "ok"

    # 4) Tuple schemas (optional) — mirrors the list rules above
    if isinstance(schema, tuple):
        if not isinstance(obj, tuple):
            return False, "{}: expected tuple, got {}".format(path, type(obj).__name__)
        if len(schema) == 0:
            return True, "ok"
        if len(schema) == 1:
            subschema = schema[0]
            for i, el in enumerate(obj):
                ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
                if not ok:
                    return ok, msg
            return True, "ok"
        if len(obj) != len(schema):
            return False, "{}: expected tuple length {}, got {}".format(path, len(schema), len(obj))
        for i, (el, subschema) in enumerate(zip(obj, schema)):
            ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
            if not ok:
                return ok, msg
        return True, "ok"

    # 5) If schema is a type object (e.g., list, dict) we handled above; unknown markers:
    st = type(schema).__name__
    return False, "{}: unsupported schema marker of type {!r}".format(path, st)
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
ParsingExamples = """
|
|
528
|
+
|
|
529
|
+
# Examples showing how to use the valid_extract function
|
|
530
|
+
#------------------------------------------------------------------
|
|
531
|
+
|
|
532
|
+
# Basic list
|
|
533
|
+
txt = "Noise before ```python\n[1, 2, 3]\n``` noise after"
|
|
534
|
+
rules = {"kind": "python", "format": []}
|
|
535
|
+
assert valid_extract(txt, rules) == [1, 2, 3]
|
|
536
|
+
|
|
537
|
+
# Basic dict
|
|
538
|
+
txt2 = "Header\n{ 'a': 1, 'b': 2 }\nFooter"
|
|
539
|
+
rules2 = {"kind": "python", "format": {}}
|
|
540
|
+
assert valid_extract(txt2, rules2) == {"a": 1, "b": 2}
|
|
541
|
+
|
|
542
|
+
# Nested dict with types
|
|
543
|
+
txt3 = "reply: { 'num_list':[1,2,3], 'my_info':{'x':1}, 'name':'Ada' } ok."
|
|
544
|
+
rules3 = {"kind": "python",
|
|
545
|
+
"format": {'num_list': [int], 'my_info': {}, 'name': ''}}
|
|
546
|
+
assert valid_extract(txt3, rules3)["name"] == "Ada"
|
|
547
|
+
|
|
548
|
+
# Optional key example
|
|
549
|
+
txt4 = ''' I think this is how I'd answer: ``` {'a': 1}``` is this good enough?'''
|
|
550
|
+
rules4 = {"kind": "python", "format": {'a': int, 'b?': ''}}
|
|
551
|
+
assert valid_extract(txt4, rules4) == {'a': 1}
|
|
552
|
+
|
|
553
|
+
txt = " I think this is how I'd answer: ``` {'a': 1}``` is this good enough?"
|
|
554
|
+
rules = {"kind": "python", "format": {"a": int, "b?": ""}}
|
|
555
|
+
assert valid_extract(txt, rules) == {"a": 1}
|
|
556
|
+
|
|
557
|
+
txt2 = "noise before {'a': 1} and after"
|
|
558
|
+
assert valid_extract(txt2, rules) == {"a": 1}
|
|
559
|
+
|
|
560
|
+
txt3 = "ok ```python\n[1,2,3]\n``` end"
|
|
561
|
+
assert valid_extract(txt3, {"kind": "python", "format": []}) == [1,2,3]
|
|
562
|
+
|
|
563
|
+
txt4 = "inline ```[{'k': 'v'}]```"
|
|
564
|
+
assert valid_extract(txt4, {"kind": "python", "format": [{"k": ""}]}) == [{"k": "v"}]
|
|
565
|
+
|
|
566
|
+
"""
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
#############################################################################
|
|
570
|
+
#############################################################################
|
|
571
|
+
|
|
572
|
+
### VAR_DELETED SENTINEL
|
|
573
|
+
|
|
574
|
+
# Sentinel class to mark deleted variables
|
|
575
|
+
class _VarDeleted:
|
|
576
|
+
"""Sentinel value indicating a variable has been deleted."""
|
|
577
|
+
_instance = None
|
|
578
|
+
|
|
579
|
+
def __new__(cls):
|
|
580
|
+
if cls._instance is None:
|
|
581
|
+
cls._instance = super().__new__(cls)
|
|
582
|
+
return cls._instance
|
|
583
|
+
|
|
584
|
+
def __repr__(self):
|
|
585
|
+
return '<DELETED>'
|
|
586
|
+
|
|
587
|
+
def __str__(self):
|
|
588
|
+
return '<DELETED>'
|
|
589
|
+
|
|
590
|
+
# Singleton instance for deleted marker
|
|
591
|
+
VAR_DELETED = _VarDeleted()
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
#############################################################################
|
|
595
|
+
#############################################################################
|
|
89
596
|
|
|
597
|
+
### OBJECT COMPRESSION UTILITIES
|
|
598
|
+
|
|
599
|
+
import zlib
|
|
600
|
+
import base64
|
|
601
|
+
|
|
602
|
+
def compress_to_json(data, content_type='auto'):
    """
    Compress data into a JSON-serializable dict (zlib level 9 + base64).

    Args:
        data: bytes, str, or a JSON-serializable / picklable object.
        content_type: 'bytes', 'text', 'json', 'pickle', or 'auto'.
            With 'auto', bytes/str are detected directly; other objects
            try JSON first and fall back to pickle.

    Returns:
        dict with 'data' (base64 string), 'size_original',
        'size_compressed', and the resolved 'content_type'.
    """
    # How each declared content type serializes to bytes.
    to_bytes = {
        'bytes': lambda d: d,
        'text': lambda d: d.encode('utf-8'),
        'json': lambda d: json.dumps(d).encode('utf-8'),
        'pickle': pickle.dumps,
    }

    if content_type == 'auto':
        if isinstance(data, bytes):
            content_type = 'bytes'
        elif isinstance(data, str):
            content_type = 'text'
        else:
            try:
                raw_bytes = to_bytes['json'](data)
                content_type = 'json'
            except (TypeError, ValueError):
                raw_bytes = to_bytes['pickle'](data)
                content_type = 'pickle'
        if content_type in ('bytes', 'text'):
            raw_bytes = to_bytes[content_type](data)
    elif content_type in to_bytes:
        raw_bytes = to_bytes[content_type](data)
    else:
        raise ValueError("Unknown content_type: {}".format(content_type))

    packed = zlib.compress(raw_bytes, level=9)
    return {
        'data': base64.b64encode(packed).decode('ascii'),
        'size_original': len(raw_bytes),
        'size_compressed': len(packed),
        'content_type': content_type,
    }
|
|
650
|
+
|
|
92
651
|
|
|
93
|
-
|
|
94
|
-
Exception: The last exception if all retries fail.
|
|
652
|
+
def decompress_from_json(obj_dict):
    """
    Inverse of compress_to_json: restore the original value.

    Args:
        obj_dict: dict produced by compress_to_json ('data' is a base64
            string of zlib-compressed bytes; 'content_type' says how to
            rebuild the value).

    Returns:
        The original data in its original type.
    """
    encoded = obj_dict['data']
    kind = obj_dict['content_type']

    raw_bytes = zlib.decompress(base64.b64decode(encoded))

    # How each content type turns bytes back into the original value.
    from_bytes = {
        'bytes': lambda b: b,
        'text': lambda b: b.decode('utf-8'),
        'json': lambda b: json.loads(b.decode('utf-8')),
        'pickle': pickle.loads,
    }
    if kind not in from_bytes:
        raise ValueError("Unknown content_type: {}".format(kind))
    return from_bytes[kind](raw_bytes)
|
|
680
|
+
|
|
98
681
|
|
|
99
|
-
|
|
682
|
+
def estimate_size(value):
    """
    Estimate the serialized size of *value* in bytes.

    bytes are measured directly, strings as UTF-8; anything else is
    measured as JSON, falling back to pickle when not JSON-serializable.

    Returns:
        int: Estimated size in bytes.
    """
    if isinstance(value, bytes):
        return len(value)
    if isinstance(value, str):
        return len(value.encode('utf-8'))
    try:
        return len(json.dumps(value).encode('utf-8'))
    except (TypeError, ValueError):
        return len(pickle.dumps(value))
|
|
107
701
|
|
|
108
|
-
|
|
702
|
+
|
|
703
|
+
def is_obj_ref(value):
    """
    Report whether *value* is an object-reference dict.

    Returns:
        bool: True when value is a dict containing the '_obj_ref' key.
    """
    if not isinstance(value, dict):
        return False
    return '_obj_ref' in value
|
|
714
|
+
|
|
715
|
+
|
|
716
|
+
def truncate_content(content, stamp, threshold=500, header_len=200, footer_len=200):
    """
    Truncate long content by keeping header and footer with an expandable marker.

    If content is no longer than threshold, it is returned unchanged.
    Otherwise the first header_len chars and last footer_len chars are
    kept, separated by a marker stating how much was omitted and which
    stamp to request for the full content.

    Args:
        content: The text content to potentially truncate.
        stamp: The event stamp (ID) for the content, used in the marker.
        threshold: Minimum length before truncation applies (default 500).
        header_len: Characters to keep from the start (default 200).
        footer_len: Characters to keep from the end (default 200).

    Returns:
        str: Original content if short enough, or truncated content with marker.

    Example:
        truncate_content(long_text, 'ABC123', threshold=500)
        # "First 200 chars...\\n\\n[...TRUNCATED: 1,847 chars omitted. To expand, request stamp: ABC123...]\\n\\n...last 200 chars"
    """
    if len(content) <= threshold:
        return content

    chars_omitted = len(content) - header_len - footer_len

    # Bug fix: with a small threshold, header+footer can cover (or overlap)
    # the whole content — "truncating" would lengthen it and report a
    # negative omitted count. Return the content unchanged in that case.
    if chars_omitted <= 0:
        return content

    marker = "\n\n[...TRUNCATED: {:,} chars omitted. To expand, request stamp: {}...]\n\n".format(chars_omitted, stamp)

    return content[:header_len] + marker + content[-footer_len:]