thoughtflow 0.0.1-py3-none-any.whl → 0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thoughtflow/_util.py ADDED
@@ -0,0 +1,752 @@
+ """
+ Internal utilities for ThoughtFlow.
+
+ This module contains helper functions and classes used by ThoughtFlow.
+ """
+
+ from __future__ import annotations
+
+ #############################################################################
+ #############################################################################
+
+ ### IMPORTS AND SETTINGS
+
+ import os, sys, time, pickle, json, uuid
+ import http, urllib, socket, ssl, gzip, copy
+ import urllib.request
+ import pprint
+ import random
+ import re, ast
+ from typing import Mapping, Any, Iterable, Optional, Tuple, Union
+
+ import time, hashlib, pickle
+ from random import randint
+ from functools import reduce
+
+ import datetime as dtt
+ from zoneinfo import ZoneInfo
+
+ tz_bog = ZoneInfo("America/Bogota")
+ tz_utc = ZoneInfo("UTC")
+
+
+ #############################################################################
+ #############################################################################
+
+ ### EVENT STAMP LOGIC
+
+ class EventStamp:
+     """
+     Generates and decodes deterministic event stamps using Base62 encoding.
+
+     Event stamps combine encoded time, document hash, and random components
+     into a compact 16-character identifier.
+
+     Usage:
+         EventStamp.stamp()           # Generate a new stamp
+         EventStamp.decode_time(s)    # Decode timestamp from stamp
+         EventStamp.hashify("text")   # Generate deterministic hash
+     """
+
+     CHARSET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+
+     def sha256_hash(input_string):
+         """Generate a SHA-256 hash and return it as an integer."""
+         hash_bytes = hashlib.sha256(input_string.encode("utf-8")).digest()
+         return int.from_bytes(hash_bytes, byteorder="big")
+
+     def base62_encode(number, length):
+         """Encode an integer into a fixed-length Base62 string."""
+         base = len(EventStamp.CHARSET)
+         encoded = []
+         for _ in range(length):
+             number, remainder = divmod(number, base)
+             encoded.append(EventStamp.CHARSET[remainder])
+         return ''.join(encoded[::-1])  # Reverse to get correct order
+
+     def hashify(input_string, length=32):
+         """Generate a deterministic hash using all uppercase/lowercase letters and digits."""
+         hashed_int = EventStamp.sha256_hash(input_string)
+         return EventStamp.base62_encode(hashed_int, length)
+
+     def encode_num(num, charset=None):
+         """Encode a number in the given base/charset."""
+         if charset is None:
+             charset = EventStamp.CHARSET
+         base = len(charset)
+         if num < base:
+             return charset[num]
+         else:
+             return EventStamp.encode_num(num // base, charset) + charset[num % base]
+
+     def decode_num(encoded_str, charset=None):
+         """Decode a base-encoded string back to an integer."""
+         if charset is None:
+             charset = EventStamp.CHARSET
+         base = len(charset)
+         char_to_value = {c: i for i, c in enumerate(charset)}
+         return reduce(lambda num, c: num * base + char_to_value[c], encoded_str, 0)
+
+     def encode_time(unix_time=0):
+         """Encode current or given unix time."""
+         if unix_time == 0:
+             t = int(time.time() * 10000)
+         else:
+             t = int(unix_time * 10000)
+         return EventStamp.encode_num(t)
+
+     def encode_doc(doc={}):
+         """Encode a document/value to a 5-character hash."""
+         return EventStamp.hashify(str(doc), 5)
+
+     def encode_rando(length=3):
+         """Generate a random code of specified length."""
+         n = randint(300000, 900000)
+         c = '000' + EventStamp.encode_num(n)
+         return c[-length:]
+
+     def stamp(doc={}):
+         """
+         Generate an event stamp.
+
+         Combines encoded time, document hash, and random component
+         into a 16-character identifier.
+         """
+         time_code = EventStamp.encode_time()
+         rando_code = EventStamp.encode_rando()
+         if len(str(doc)) > 2:
+             doc_code = EventStamp.encode_doc(doc)
+         else:
+             arb = time_code + rando_code
+             doc_code = EventStamp.encode_doc(arb)
+         return (time_code + doc_code + rando_code)[:16]
+
+     def decode_time(stamp, charset=None):
+         """Decode the time component from an event stamp."""
+         if charset is None:
+             charset = EventStamp.CHARSET
+         stamp_prefix = stamp[:8]
+         scaled_time = EventStamp.decode_num(stamp_prefix, charset)
+         unix_time_seconds = scaled_time / 10000
+         return unix_time_seconds
+
+
+ # Backwards compatibility aliases
+ event_stamp = EventStamp.stamp
+ hashify = EventStamp.hashify
+ encode_num = EventStamp.encode_num
+ decode_num = EventStamp.decode_num
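A minimal usage sketch of the stamp helpers above (illustrative only; the stamp value in the comment is made up, since real stamps depend on the current clock and a random component):

    from thoughtflow._util import EventStamp

    s = EventStamp.stamp({'user': 'ada'})   # 16-char id, e.g. 'iIjk29QpLXbfR7mC' (varies per call)
    t = EventStamp.decode_time(s)           # unix seconds recovered from the first 8 characters
    h = EventStamp.hashify('hello', 12)     # deterministic 12-char Base62 digest, same every run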
+
+ #############################################################################
+ #############################################################################
+
+ ### HELPER FUNCTIONS
+
+
+ default_header = '''
+ Markers like <start … l4zk> and </end … l4zk>
+ indicate where a text section begins and ends.
+ Never mix boundaries. Each block is separate.
+ This is to improve your ease-of-reading.
+ '''
+
+ def construct_prompt(
+     prompt_obj = {},
+     order = [],
+     header = '',
+ ):
+     if order: sections = list(order)
+     else: sections = [a for a in prompt_obj]
+     rnum = str(randint(1,9))
+     stamp = event_stamp()[-4:].lower()
+     stamp = stamp[:2]+rnum+stamp[2:]
+     L = []
+     if header:
+         if header=='default':
+             L.append(default_header+'\n')
+         else:
+             L.append(header+'\n\n')
+     L.append('<start prompt stamp>\n\n')
+     for s in sections:
+         text = prompt_obj[s]
+         s2 = s.strip().replace(' ','_')
+         label1 = "<start "+s2+" stamp>\n"
+         label2 = "\n</end "+s2+" stamp>\n\n"
+         block = label1 + text + label2
+         L.append(block)
+     L.append('</end prompt stamp>')
+     prompt = ''.join(L).replace(' stamp>',' '+stamp+'>')
+     return prompt
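As a sketch of the output shape, construct_prompt wraps each section in matching boundary tags; the 5-character token below ('kz4mp') is illustrative, since it is rebuilt from a fresh event stamp plus a random digit on every call:

    prompt_obj = {'task': 'Summarize the text.', 'text': 'Once upon a time...'}
    print(construct_prompt(prompt_obj, order=['task', 'text']))
    # <start prompt kz4mp>
    #
    # <start task kz4mp>
    # Summarize the text.
    # </end task kz4mp>
    #
    # <start text kz4mp>
    # Once upon a time...
    # </end text kz4mp>
    #
    # </end prompt kz4mp>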
+
+ def construct_msgs(
+     usr_prompt = '',
+     vars = {},
+     sys_prompt = '',
+     msgs = None,
+ ):
+     # Copy the incoming message list (or start a new one) so repeated calls
+     # neither share a mutable default nor mutate the caller's list.
+     msgs = list(msgs) if msgs else []
+     if sys_prompt:
+         if type(sys_prompt)==dict:
+             sys_prompt = construct_prompt(sys_prompt)
+         m = {'role':'system','content':sys_prompt}
+         msgs.insert(0,m)
+     if usr_prompt:
+         if type(usr_prompt)==dict:
+             usr_prompt = construct_prompt(usr_prompt)
+         m = {'role':'user','content':usr_prompt}
+         msgs.append(m)
+     msgs2 = []
+     for m in msgs:
+         m_copy = dict(m)
+         if isinstance(m_copy.get("content"), str):
+             for k, v in vars.items():
+                 m_copy["content"] = m_copy["content"].replace(k, str(v))
+         msgs2.append(m_copy)
+     return msgs2
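For example, a sketch of a typical call (the '{{NAME}}' placeholder style is only a convention here; the function substitutes whatever literal keys appear in vars):

    msgs = construct_msgs(
        usr_prompt='Hello {{NAME}}, please summarize {{TOPIC}}.',
        vars={'{{NAME}}': 'Ada', '{{TOPIC}}': 'event stamps'},
        sys_prompt='You are a concise assistant.',
    )
    # [{'role': 'system', 'content': 'You are a concise assistant.'},
    #  {'role': 'user', 'content': 'Hello Ada, please summarize event stamps.'}]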
+
+
+
+ #############################################################################
+
+ class ValidExtractError(ValueError):
+     """Raised when extraction or validation fails."""
+
+ def valid_extract(raw_text: str, parsing_rules: Mapping[str, Any]) -> Any:
+     """
+     Extract and validate a target Python structure from noisy LLM text.
+
+     Parameters
+     ----------
+     raw_text : str
+         The original model output (may include extra prose, code fences, etc.).
+     parsing_rules : dict
+         Rules controlling extraction/validation. Required keys:
+         - 'kind': 'python' (default) or 'json'.
+         - 'format': schema describing the expected structure, e.g. [], {}, {'name': ''}, {'num_list': [], 'info': {}}
+
+         Schema language:
+             * []                 : list of anything
+             * [schema]           : list of items matching 'schema'
+             * {}                 : dict of anything
+             * {'k': sch}         : dict with required key 'k' matching 'sch'
+             * {'k?': sch}        : OPTIONAL key 'k' (if present, must match 'sch')
+             * '' or str          : str
+             * 0 or int           : int
+             * 0.0 or float       : float
+             * True/False or bool : bool
+             * None               : NoneType
+
+     Returns
+     -------
+     Any
+         The parsed Python object that satisfies the schema.
+
+     Raises
+     ------
+     ValidExtractError
+         If extraction fails or the parsed object does not validate against the schema.
+
+     Examples
+     --------
+     >>> rules = {'kind': 'python', 'format': []}
+     >>> txt = "Here you go:\\n```python\\n[1, 2, 3]\\n```\\nLet me know!"
+     >>> valid_extract(txt, rules)
+     [1, 2, 3]
+
+     >>> rules = {'kind': 'python', 'format': {'num_list': [], 'my_info': {}, 'name': ''}}
+     >>> txt = "noise { 'num_list':[1,2], 'my_info':{'x':1}, 'name':'Ada' } trailing"
+     >>> valid_extract(txt, rules)
+     {'num_list': [1, 2], 'my_info': {'x': 1}, 'name': 'Ada'}
+     """
+     if not isinstance(parsing_rules, Mapping):
+         raise ValidExtractError("parsing_rules must be a mapping.")
+
+     kind = parsing_rules.get("kind", "python")
+     schema = parsing_rules.get("format", None)
+     if schema is None:
+         raise ValidExtractError("parsing_rules['format'] is required.")
+
+     # 1) Collect candidate text segments in a robust order.
+     candidates: Iterable[str] = _candidate_segments(raw_text, schema, prefer_fences_first=True)
+
+     last_err: Optional[Exception] = None
+     for segment in candidates:
+         try:
+             obj = _parse_segment(segment, kind=kind)
+         except Exception as e:
+             last_err = e
+             continue
+
+         ok, msg = _validate_schema(obj, schema)
+         if ok:
+             return obj
+         last_err = ValidExtractError("Validation failed for candidate: {}".format(msg))
+
+     # If we got here, nothing parsed+validated.
+     if last_err:
+         raise ValidExtractError(str(last_err))
+     raise ValidExtractError("No parseable candidates found.")
+
+ # ----------------------------
+ # Parsing helpers
+ # ----------------------------
+
+ # Fence regex; tolerates inline fences with no newline after the language tag.
+ _FENCE_RE = re.compile(
+     r"```(?P<lang>[a-zA-Z0-9_\-\.]*)\s*\n?(?P<body>.*?)```",
+     re.DOTALL
+ )
+
+ def _candidate_segments(raw_text: str, schema: Any, prefer_fences_first: bool = True) -> Iterable[str]:
+     """
+     Yield candidate substrings likely to contain the target structure.
+
+     Strategy:
+       1) Fenced code blocks (```) first, in order, if requested.
+       2) Balanced slice for the top-level delimiter suggested by the schema.
+       3) As a fallback, return raw_text itself (last resort).
+     """
+     # 1) From code fences
+     if prefer_fences_first:
+         for m in _FENCE_RE.finditer(raw_text):
+             lang = (m.group("lang") or "").strip().lower()
+             body = m.group("body")
+             # The language hint is captured but not used to filter; every fenced body is tried in order.
+             yield body
+
+     # 2) From balanced slice based on schema's top-level delimiter
+     opener, closer = _delims_for_schema(schema)
+     if opener and closer:
+         slice_ = _balanced_slice(raw_text, opener, closer)
+         if slice_ is not None:
+             yield slice_
+
+     # 3) Whole text (very last resort)
+     yield raw_text
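A sketch of the candidate order for a typical reply (calling the private helper directly, assuming module scope):

    reply = "Sure! ```python\n{'a': 1}\n``` Hope that helps: {'b': 2}"
    list(_candidate_segments(reply, schema={}))
    # ["{'a': 1}\n",   <- fenced body first
    #  "{'a': 1}",     <- first balanced {...} slice
    #  reply]          <- the whole text, as a last resort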
+
+ def _parse_segment(segment: str, kind: str = "python") -> Any:
+     """
+     Parse a segment into a Python object according to 'kind'.
+     - python: ast.literal_eval
+     - json: json.loads (with fallback: try literal_eval if JSON fails, for LLM single-quote dicts)
+     """
+     text = segment.strip()
+
+     if kind == "python":
+         # Remove leading language hints often kept when copying from fences
+         if text.startswith("python\n"):
+             text = text[len("python\n") :].lstrip()
+         return ast.literal_eval(text)
+
+     if kind == "json":
+         try:
+             return json.loads(text)
+         except json.JSONDecodeError:
+             # LLMs often return Python-style dicts (single quotes). Try literal_eval as a fallback.
+             return ast.literal_eval(text)
+
+     raise ValidExtractError("Unsupported kind: {!r}".format(kind))
+
+ def _delims_for_schema(schema: Any) -> Tuple[Optional[str], Optional[str]]:
+     """
+     Infer top-level delimiters from the schema.
+     - list-like → [ ]
+     - dict-like → { }
+     - tuple-like (if used) → ( )
+     - string/number/bool/None → no delimiters (None, None)
+     """
+     # list
+     if isinstance(schema, list):
+         return "[", "]"
+     # dict
+     if isinstance(schema, dict):
+         return "{", "}"
+     # tuple schema (rare, but supported)
+     if isinstance(schema, tuple):
+         return "(", ")"
+     # primitives: cannot infer a unique delimiter; return None
+     return None, None
+
+
+ def _balanced_slice(text: str, open_ch: str, close_ch: str) -> Optional[str]:
+     """
+     Return the first balanced substring between open_ch and close_ch,
+     scanning from the *first occurrence of open_ch* (so prose apostrophes
+     before the opener don't confuse quote tracking).
+     """
+     start = text.find(open_ch)
+     if start == -1:
+         return None
+
+     depth = 0
+     in_str: Optional[str] = None  # quote char if inside ' or "
+     escape = False
+     i = start
+
+     while i < len(text):
+         ch = text[i]
+         if in_str:
+             if escape:
+                 escape = False
+             elif ch == "\\":
+                 escape = True
+             elif ch == in_str:
+                 in_str = None
+         else:
+             if ch in ("'", '"'):
+                 in_str = ch
+             elif ch == open_ch:
+                 depth += 1
+             elif ch == close_ch and depth > 0:
+                 depth -= 1
+                 if depth == 0:
+                     return text[start : i + 1]
+         i += 1
+     return None
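A sketch of why the scan starts at the first opener: apostrophes in the surrounding prose never toggle quote tracking, and quotes inside the structure are honored.

    text = "Here's what I'd return: {'name': \"O'Brien\", 'ok': True} and that's all."
    _balanced_slice(text, '{', '}')
    # "{'name': \"O'Brien\", 'ok': True}"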
+
+
+ # ----------------------------
+ # Schema validation
+ # ----------------------------
+
+ def _is_optional_key(k: str) -> Tuple[str, bool]:
+     """Return (base_key, optional_flag) for keys with a trailing '?'."""
+     if isinstance(k, str) and k.endswith("?"):
+         return k[:-1], True
+     return k, False
+
+ def _schema_type(schema: Any) -> Union[type, Tuple[type, ...], None]:
+     """
+     Map schema exemplars to Python types.
+     Accepts either exemplar values ('' -> str, 0 -> int, 0.0 -> float, True -> bool, None -> NoneType)
+     OR actual types (str, int, float, bool).
+     """
+     if schema is None:
+         return type(None)
+     if schema is str or isinstance(schema, str):
+         return str
+     if schema is int or (isinstance(schema, int) and not isinstance(schema, bool)):
+         return int
+     if schema is float or isinstance(schema, float):
+         return float
+     if schema is bool or isinstance(schema, bool):
+         return bool
+     if schema is list:
+         return list
+     if schema is dict:
+         return dict
+     if schema is tuple:
+         return tuple
+     return None  # composite or unknown marker
+
+ def _validate_schema(obj: Any, schema: Any, path: str = "$") -> Tuple[bool, str]:
+     """
+     Recursively validate 'obj' against 'schema'. Returns (ok, message).
+     """
+     # 1) Primitive types via exemplar or type
+     t = _schema_type(schema)
+     if t is not None and t not in (list, dict, tuple):
+         if isinstance(obj, t):
+             return True, "ok"
+         return False, "{}: expected {}, got {}".format(path, t.__name__, type(obj).__name__)
+
+     # Bare container types given as type objects (list, dict, tuple) accept any instance of that type
+     if t in (list, dict, tuple):
+         if isinstance(obj, t):
+             return True, "ok"
+         return False, "{}: expected {}, got {}".format(path, t.__name__, type(obj).__name__)
+
+     # 2) List schemas
+     if isinstance(schema, list):
+         if not isinstance(obj, list):
+             return False, "{}: expected list, got {}".format(path, type(obj).__name__)
+         # If schema is [], any list passes
+         if len(schema) == 0:
+             return True, "ok"
+         # If schema is [subschema], every element must match subschema
+         if len(schema) == 1:
+             subschema = schema[0]
+             for i, el in enumerate(obj):
+                 ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
+                 if not ok:
+                     return ok, msg
+             return True, "ok"
+         # Otherwise treat as "structure-by-position" (rare)
+         if len(obj) != len(schema):
+             return False, "{}: expected list length {}, got {}".format(path, len(schema), len(obj))
+         for i, (el, subschema) in enumerate(zip(obj, schema)):
+             ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
+             if not ok:
+                 return ok, msg
+         return True, "ok"
+
+     # 3) Dict schemas
+     if isinstance(schema, dict):
+         if not isinstance(obj, dict):
+             return False, "{}: expected dict, got {}".format(path, type(obj).__name__)
+
+         # Check required/optional keys in schema
+         for skey, subschema in schema.items():
+             base_key, optional = _is_optional_key(skey)
+             if base_key not in obj:
+                 if optional:
+                     continue
+                 return False, "{}: missing required key '{}'".format(path, base_key)
+             ok, msg = _validate_schema(obj[base_key], subschema, "{}.{}".format(path, base_key))
+             if not ok:
+                 return ok, msg
+         return True, "ok"
+
+     # 4) Tuple schemas (optional)
+     if isinstance(schema, tuple):
+         if not isinstance(obj, tuple):
+             return False, "{}: expected tuple, got {}".format(path, type(obj).__name__)
+         if len(schema) == 0:
+             return True, "ok"
+         if len(schema) == 1:
+             subschema = schema[0]
+             for i, el in enumerate(obj):
+                 ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
+                 if not ok:
+                     return ok, msg
+             return True, "ok"
+         if len(obj) != len(schema):
+             return False, "{}: expected tuple length {}, got {}".format(path, len(schema), len(obj))
+         for i, (el, subschema) in enumerate(zip(obj, schema)):
+             ok, msg = _validate_schema(el, subschema, "{}[{}]".format(path, i))
+             if not ok:
+                 return ok, msg
+         return True, "ok"
+
+     # 5) If schema is a type object (e.g., list, dict) we handled above; unknown markers:
+     st = type(schema).__name__
+     return False, "{}: unsupported schema marker of type {!r}".format(path, st)
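A few direct checks of the validator (a sketch; each call returns an (ok, message) pair):

    _validate_schema([1, 2, 3], [int])              # (True, 'ok')
    _validate_schema({'a': 1}, {'a': 0, 'b?': ''})  # (True, 'ok'); optional 'b' may be absent
    _validate_schema({'a': 'x'}, {'a': 0})          # (False, '$.a: expected int, got str')
    _validate_schema('oops', [])                    # (False, '$: expected list, got str')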
+
+
+ ParsingExamples = """
+
+ # Examples showing how to use the valid_extract function
+ #------------------------------------------------------------------
+
+ # Basic list
+ txt = "Noise before ```python\n[1, 2, 3]\n``` noise after"
+ rules = {"kind": "python", "format": []}
+ assert valid_extract(txt, rules) == [1, 2, 3]
+
+ # Basic dict
+ txt2 = "Header\n{ 'a': 1, 'b': 2 }\nFooter"
+ rules2 = {"kind": "python", "format": {}}
+ assert valid_extract(txt2, rules2) == {"a": 1, "b": 2}
+
+ # Nested dict with types
+ txt3 = "reply: { 'num_list':[1,2,3], 'my_info':{'x':1}, 'name':'Ada' } ok."
+ rules3 = {"kind": "python",
+           "format": {'num_list': [int], 'my_info': {}, 'name': ''}}
+ assert valid_extract(txt3, rules3)["name"] == "Ada"
+
+ # Optional key example
+ txt4 = ''' I think this is how I'd answer: ``` {'a': 1}``` is this good enough?'''
+ rules4 = {"kind": "python", "format": {'a': int, 'b?': ''}}
+ assert valid_extract(txt4, rules4) == {'a': 1}
+
+ txt = " I think this is how I'd answer: ``` {'a': 1}``` is this good enough?"
+ rules = {"kind": "python", "format": {"a": int, "b?": ""}}
+ assert valid_extract(txt, rules) == {"a": 1}
+
+ txt2 = "noise before {'a': 1} and after"
+ assert valid_extract(txt2, rules) == {"a": 1}
+
+ txt3 = "ok ```python\n[1,2,3]\n``` end"
+ assert valid_extract(txt3, {"kind": "python", "format": []}) == [1,2,3]
+
+ txt4 = "inline ```[{'k': 'v'}]```"
+ assert valid_extract(txt4, {"kind": "python", "format": [{"k": ""}]}) == [{"k": "v"}]
+
+ """
+
+
+ #############################################################################
+ #############################################################################
+
+ ### VAR_DELETED SENTINEL
+
+ # Sentinel class to mark deleted variables
+ class _VarDeleted:
+     """Sentinel value indicating a variable has been deleted."""
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance
+
+     def __repr__(self):
+         return '<DELETED>'
+
+     def __str__(self):
+         return '<DELETED>'
+
+ # Singleton instance for deleted marker
+ VAR_DELETED = _VarDeleted()
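A sketch of the intended use, assuming a variable store elsewhere in the package keeps deleted keys around: the singleton makes a plain identity check reliable and is distinguishable from an explicit None.

    state = {'draft': 'old text'}
    state['draft'] = VAR_DELETED          # mark as deleted without dropping the key

    if state['draft'] is VAR_DELETED:     # identity check; _VarDeleted always returns the same instance
        print('draft was deleted')        # repr()/str() of the sentinel show as '<DELETED>'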
+
+
+ #############################################################################
+ #############################################################################
+
+ ### OBJECT COMPRESSION UTILITIES
+
+ import zlib
+ import base64
+
+ def compress_to_json(data, content_type='auto'):
+     """
+     Compress data to a JSON-serializable dict.
+
+     Args:
+         data: bytes, str, or JSON-serializable object
+         content_type: 'bytes', 'text', 'json', 'pickle', or 'auto'
+
+     Returns:
+         dict with 'data' (base64 string), sizes, and content_type
+     """
+     # Convert to bytes based on type
+     if content_type == 'auto':
+         if isinstance(data, bytes):
+             content_type = 'bytes'
+             raw_bytes = data
+         elif isinstance(data, str):
+             content_type = 'text'
+             raw_bytes = data.encode('utf-8')
+         else:
+             # Try JSON first, fall back to pickle
+             try:
+                 content_type = 'json'
+                 raw_bytes = json.dumps(data).encode('utf-8')
+             except (TypeError, ValueError):
+                 content_type = 'pickle'
+                 raw_bytes = pickle.dumps(data)
+     elif content_type == 'bytes':
+         raw_bytes = data
+     elif content_type == 'text':
+         raw_bytes = data.encode('utf-8')
+     elif content_type == 'json':
+         raw_bytes = json.dumps(data).encode('utf-8')
+     elif content_type == 'pickle':
+         raw_bytes = pickle.dumps(data)
+     else:
+         raise ValueError("Unknown content_type: {}".format(content_type))
+
+     # Compress and base64 encode
+     compressed = zlib.compress(raw_bytes, level=9)
+     encoded = base64.b64encode(compressed).decode('ascii')
+
+     return {
+         'data': encoded,
+         'size_original': len(raw_bytes),
+         'size_compressed': len(compressed),
+         'content_type': content_type,
+     }
+
+
+ def decompress_from_json(obj_dict):
+     """
+     Decompress data from JSON-serializable dict.
+
+     Args:
+         obj_dict: dict from compress_to_json
+
+     Returns:
+         Original data in its original type
+     """
+     encoded = obj_dict['data']
+     content_type = obj_dict['content_type']
+
+     # Decode and decompress
+     compressed = base64.b64decode(encoded)
+     raw_bytes = zlib.decompress(compressed)
+
+     # Convert back to original type
+     if content_type == 'bytes':
+         return raw_bytes
+     elif content_type == 'text':
+         return raw_bytes.decode('utf-8')
+     elif content_type == 'json':
+         return json.loads(raw_bytes.decode('utf-8'))
+     elif content_type == 'pickle':
+         return pickle.loads(raw_bytes)
+     else:
+         raise ValueError("Unknown content_type: {}".format(content_type))
+
+
+ def estimate_size(value):
+     """
+     Estimate the serialized size of a value in bytes.
+
+     Args:
+         value: Any value
+
+     Returns:
+         int: Estimated size in bytes
+     """
+     if isinstance(value, bytes):
+         return len(value)
+     elif isinstance(value, str):
+         return len(value.encode('utf-8'))
+     else:
+         try:
+             return len(json.dumps(value).encode('utf-8'))
+         except (TypeError, ValueError):
+             return len(pickle.dumps(value))
+
+
+ def is_obj_ref(value):
+     """
+     Check if a value is an object reference.
+
+     Args:
+         value: Any value
+
+     Returns:
+         bool: True if value is an object reference dict
+     """
+     return isinstance(value, dict) and '_obj_ref' in value
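Quick checks (a sketch; the exact '_obj_ref' payload shape is whatever the calling code stores, and the bare string here is just an assumption):

    estimate_size('héllo')                    # 6, UTF-8 bytes rather than characters
    estimate_size({'a': [1, 2, 3]})           # 16, the length of its JSON encoding
    is_obj_ref({'_obj_ref': 'AbC123xYz456'})  # True
    is_obj_ref({'data': 123})                 # False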
+
+
+ def truncate_content(content, stamp, threshold=500, header_len=200, footer_len=200):
+     """
+     Truncate long content by keeping header and footer with an expandable marker.
+
+     If content is no longer than threshold, returns content unchanged.
+     Otherwise, keeps the first header_len chars and last footer_len chars,
+     with a marker in between indicating truncation and providing the stamp
+     for expansion.
+
+     Args:
+         content: The text content to potentially truncate
+         stamp: The event stamp (ID) for the content, used in expansion marker
+         threshold: Minimum length before truncation applies (default 500)
+         header_len: Characters to keep from start (default 200)
+         footer_len: Characters to keep from end (default 200)
+
+     Returns:
+         str: Original content if short enough, or truncated content with marker
+
+     Example:
+         truncated = truncate_content(long_text, 'ABC123', threshold=500)
+         # Returns: "First 200 chars...\n\n[...TRUNCATED: 1,847 chars omitted. To expand, request stamp: ABC123...]\n\n...last 200 chars"
+     """
+     if len(content) <= threshold:
+         return content
+
+     # Calculate how much we're removing
+     chars_omitted = len(content) - header_len - footer_len
+
+     # Build the truncation marker
+     marker = "\n\n[...TRUNCATED: {:,} chars omitted. To expand, request stamp: {}...]\n\n".format(chars_omitted, stamp)
+
+     # Extract header and footer
+     header = content[:header_len]
+     footer = content[-footer_len:]
+
+     return header + marker + footer