codeastra 1.0.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeastra
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Summary: Blind Agent SDK — drop-in middleware for LangChain, CrewAI, AutoGPT. Two lines makes any agent blind to real data.
5
- License: MIT
5
+ License-Expression: MIT
6
6
  Project-URL: Homepage, https://codeastra.dev
7
7
  Project-URL: Documentation, https://docs.codeastra.dev
8
8
  Project-URL: Repository, https://github.com/codeastra/codeastra-python
9
9
  Keywords: ai,agents,langchain,crewai,privacy,hipaa,security,tokenization
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
- Classifier: License :: OSI Approved :: MIT License
13
12
  Classifier: Programming Language :: Python :: 3
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
@@ -410,3 +410,291 @@ class BlindAgentMiddleware:
410
410
  # Synchronous context-manager exit: the exception triple is discarded and the call is delegated to self.close().
  def __exit__(self, *_): self.close()
411
411
  # Async context-manager entry: returns the instance itself, no setup is performed here.
  async def __aenter__(self): return self
412
412
  # Async context-manager exit: the exception triple is discarded and self.aclose() is awaited.
  async def __aexit__(self, *_): await self.aclose()
413
+
414
+
415
+ # ══════════════════════════════════════════════════════════════════════════════
416
+ # INPUT SCANNER — scans prompt text for raw PII/PHI/PCI before agent sees it
417
+ # OUTPUT SCANNER — scans agent response for any leaked real values
418
+ # ══════════════════════════════════════════════════════════════════════════════
419
+
420
+ import re as _re
421
+
422
# Regex patterns for detecting raw sensitive data in free text.
# Keys are the field names used for the synthetic keys built by the scanner.
# None of the patterns contains a capturing group, so a match is always the
# whole matched text.
_PATTERNS = {
    # SSN: 123-45-6789, 123 45 6789 or 123456789.
    # Area 000/666/9xx, group 00 and serial 0000 are rejected as invalid.
    "ssn": _re.compile(
        r'\b(?!000|666|9\d{2})\d{3}[-\s]?(?!00)\d{2}[-\s]?(?!0000)\d{4}\b'
    ),
    # Credit card: contiguous 13-16 digit numbers with a known brand prefix.
    # The Luhn checksum is NOT enforced here; the text scanner applies it
    # to every candidate afterwards.
    "credit_card": _re.compile(
        r'\b(?:4[0-9]{12}(?:[0-9]{3})?'          # Visa (13 or 16 digits)
        r'|5[1-5][0-9]{14}'                      # Mastercard (16 digits)
        r'|3[47][0-9]{13}'                       # Amex (15 digits)
        r'|6(?:011|5[0-9]{2})[0-9]{12}'          # Discover (16 digits)
        r'|(?:2131|1800|35\d{3})\d{11})\b'       # JCB (15 or 16 digits)
    ),
    # Email
    "email": _re.compile(
        r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'
    ),
    # Phone: optional +1 / 1 prefix, optional parentheses around the area
    # code, and '-', '.' or whitespace as separators.
    "phone": _re.compile(
        r'\b(?:\+?1[-.\s]?)?'
        r'(?:\(?\d{3}\)?[-.\s]?)'
        r'\d{3}[-.\s]?\d{4}\b'
    ),
    # DOB: MM/DD/YYYY or YYYY-MM-DD ('/' and '-' are both accepted as the
    # separator in either layout); years 1900-2099 only.
    "dob": _re.compile(
        r'\b(?:0[1-9]|1[0-2])[\/\-](?:0[1-9]|[12]\d|3[01])[\/\-](?:19|20)\d{2}\b'
        r'|\b(?:19|20)\d{2}[\/\-](?:0[1-9]|1[0-2])[\/\-](?:0[1-9]|[12]\d|3[01])\b'
    ),
    # MRN: "MRN", optional '-', ':' or whitespace, then a 4-12 character
    # alphanumeric identifier. The lookahead requires at least one digit in
    # the identifier; without it the case-insensitive pattern also matched
    # ordinary phrases such as "MRN number" or "mrn status".
    "mrn": _re.compile(
        r'\bMRN[-:\s]*(?=[A-Z0-9]*\d)[A-Z0-9]{4,12}\b', _re.IGNORECASE
    ),
    # IPv4 address (each octet 0-255)
    "ip_address": _re.compile(
        r'\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b'
    ),
}
460
+
461
+
462
def _luhn_check(number: str) -> bool:
    """Return True if *number* is a plausible card number per the Luhn checksum.

    Only ASCII digits are considered; separators such as spaces or dashes
    are ignored. The digit count must be between 13 and 19 inclusive, so
    arbitrarily long digit runs (for example twenty zeros, which trivially
    satisfy the checksum) are rejected.

    Args:
        number: Candidate card number, possibly containing separators.

    Returns:
        True when the digit count is in range and the checksum is valid.
    """
    # Restrict to ASCII 0-9: str.isdigit() also accepts characters such as
    # superscript digits, which int() cannot convert.
    digits = [int(ch) for ch in number if "0" <= ch <= "9"]
    if not 13 <= len(digits) <= 19:
        return False

    total = 0
    # Walk right-to-left, doubling every second digit.
    for position, digit in enumerate(reversed(digits)):
        if position % 2 == 1:
            digit *= 2
            if digit > 9:
                # Equivalent to summing the two digits of the product.
                digit -= 9
        total += digit
    return total % 10 == 0
475
+
476
+
477
def _scan_text_for_pii(text: str) -> dict:
    """
    Scan free text for raw PII/PHI/PCI patterns.

    Every pattern in ``_PATTERNS`` is applied to *text*. Matches that are
    empty, that already look like a token (``TOKEN_RE``), or, for credit
    cards, that fail the Luhn checksum are discarded. A value that occurs
    several times for the same field is reported once.

    Returns {synthetic_field_key: matched_value} for tokenization. The first
    accepted value of a field uses the bare field name as key; further
    values get ``_1``, ``_2``, ... suffixes.

    Example:
        "Patient John Smith SSN 123-45-6789 email john@hospital.org"
        → {"ssn": "123-45-6789", "email": "john@hospital.org"}

    Non-string input yields an empty dict.
    """
    found = {}
    if not isinstance(text, str):
        return found

    for field, pattern in _PATTERNS.items():
        accepted = 0          # number of values kept for this field so far
        seen_values = set()   # avoids minting several tokens for one value
        # finditer + group(0) always yields the full match, even if a
        # pattern later gains a capturing group.
        for match in pattern.finditer(text):
            val = match.group(0).strip()
            if not val or val in seen_values or TOKEN_RE.search(val):
                continue
            # Extra validation for credit cards
            if field == "credit_card":
                digits_only = _re.sub(r'\D', '', val)
                if not _luhn_check(digits_only):
                    continue
            seen_values.add(val)
            key = f"{field}_{accepted}" if accepted > 0 else field
            found[key] = val
            accepted += 1

    return found
505
+
506
+
507
def _scan_obj_for_pii(obj: Any) -> dict:
    """Scan any object (str, dict, list, tuple) for raw PII in free text.

    Strings are scanned directly. For dicts (values only), lists and tuples
    the findings of all children are merged into one mapping. Other types
    yield an empty dict.

    Each child scan names its first hit of a field with the bare field name,
    so two children can produce the same key for different values. Such
    collisions are re-keyed with the next free ``<field>_<n>`` suffix rather
    than overwritten; otherwise the earlier value would be dropped and never
    tokenized.

    Returns:
        {synthetic_field_key: matched_value}
    """
    if isinstance(obj, str):
        return _scan_text_for_pii(obj)
    if isinstance(obj, dict):
        children = obj.values()
    elif isinstance(obj, (list, tuple)):
        children = obj
    else:
        return {}

    combined: dict = {}
    for child in children:
        for key, value in _scan_obj_for_pii(child).items():
            if key not in combined or combined[key] == value:
                combined[key] = value
                continue
            # Key already taken by a different value: strip a numeric suffix
            # (if any) and look for the next unused index.
            head, _, tail = key.rpartition("_")
            base = head if head and tail.isdigit() else key
            index = 1
            while f"{base}_{index}" in combined:
                index += 1
            combined[f"{base}_{index}"] = value
    return combined
522
+
523
+
524
def _blind_text(text: str, token_map: dict) -> str:
    """Replace all known real values in text with their tokens.

    Args:
        text: Text to sanitize. Non-string input is returned unchanged.
        token_map: Mapping of {real_value: token}. Empty or non-string keys
            are ignored.

    Returns:
        The text with every occurrence of a known real value replaced.

    Values are substituted longest-first. When one real value is a substring
    of another (e.g. "555-123-4567" inside "1-555-123-4567"), replacing the
    shorter one first would split the longer value and leave part of it in
    clear text.
    """
    if not isinstance(text, str):
        return text
    candidates = sorted(
        (real for real in token_map if isinstance(real, str) and real),
        key=len,
        reverse=True,
    )
    for real in candidates:
        if real in text:
            text = text.replace(real, token_map[real])
    return text
532
+
533
+
534
def _blind_any(obj: Any, token_map: dict) -> Any:
    """Replace real values anywhere in obj with tokens.

    Strings are rewritten through ``_blind_text``; dict values, list items
    and tuple items are processed recursively and returned in a new
    container. Dict keys are left untouched. Any other type is returned
    unchanged.

    Tuples are handled explicitly: previously they were returned as-is, so
    real values nested in a tuple argument were never replaced.
    """
    if isinstance(obj, str):
        return _blind_text(obj, token_map)
    if isinstance(obj, dict):
        return {k: _blind_any(v, token_map) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_blind_any(item, token_map) for item in obj]
    if isinstance(obj, tuple):
        blinded = [_blind_any(item, token_map) for item in obj]
        # namedtuple classes take positional fields, not a single iterable.
        if hasattr(obj, "_fields"):
            return type(obj)(*blinded)
        return tuple(blinded)
    return obj
543
+
544
+
545
+ # ── Patch BlindAgentMiddleware with input + output scanning ──────────────────
546
+
547
# Capture the pre-patch implementations of the proxy methods before they are
# replaced on the class, so the originals remain reachable from this module.
(
    _orig_run,
    _orig_invoke,
    _orig_chat,
    _orig_arun,
    _orig_ainvoke,
) = (
    BlindAgentMiddleware.run,
    BlindAgentMiddleware.invoke,
    BlindAgentMiddleware.chat,
    BlindAgentMiddleware.arun,
    BlindAgentMiddleware.ainvoke,
)
552
+
553
+
554
def _collect_text_fragments(obj, fragments):
    """Append every string/number leaf (and dict key) inside *obj* to *fragments*."""
    if isinstance(obj, str):
        fragments.append(obj)
    elif isinstance(obj, bool):
        # bool is an int subclass but never carries sensitive text.
        return
    elif isinstance(obj, (int, float)):
        fragments.append(str(obj))
    elif isinstance(obj, dict):
        for key, value in obj.items():
            _collect_text_fragments(key, fragments)
            _collect_text_fragments(value, fragments)
    elif isinstance(obj, (list, tuple, set, frozenset)):
        for item in obj:
            _collect_text_fragments(item, fragments)


def _scan_and_blind_input(self, *args, **kwargs):
    """
    Scan all input args/kwargs for raw PII/PHI/PCI.
    Tokenize any found values before passing to the agent.
    Returns (new_args, new_kwargs).

    The text to scan is assembled from the string and number leaves of the
    arguments rather than from ``json.dumps``:
      * ``json.dumps`` raises on arguments that are not JSON-serializable,
        and that call used to sit outside the ``try`` block, so such inputs
        crashed the wrapped call;
      * JSON escaping turns a newline into the two characters ``\\n``, whose
        trailing letter removes the word boundary the patterns rely on, so a
        value directly after a line break was not detected.

    Scanning and tokenization are best-effort: on any error the original
    arguments are returned unchanged (a warning is printed when
    ``self._verbose`` is set).
    """
    def _field_name(key):
        # "ssn_1" -> "ssn", while "credit_card" / "ip_address" stay intact.
        head, _, tail = key.rpartition("_")
        return head if head and tail.isdigit() else key

    try:
        # Collect all text from args and kwargs. The " ; " separator cannot
        # be bridged by any pattern, so a match never spans two fragments.
        fragments = []
        _collect_text_fragments((args, kwargs), fragments)
        raw_pii = _scan_obj_for_pii(" ; ".join(fragments))

        if not raw_pii:
            return args, kwargs

        # Tokenize detected values
        classification = _classify({_field_name(k) for k in raw_pii})
        minted = self._client.tokenize(raw_pii, classification=classification)
        # Build replacement map: {real_value: token}
        for field, token in minted.items():
            real_val = raw_pii.get(field)
            if real_val:
                self._value_to_token[real_val] = token
                self._session_tokens[field] = token

        if self._verbose:
            print(f"[CodeAstra] Input scan: tokenized {len(minted)} value(s) in prompt: {list(minted.keys())}")

        # Replace real values in args and kwargs
        new_args = tuple(_blind_any(a, self._value_to_token) for a in args)
        new_kwargs = {k: _blind_any(v, self._value_to_token) for k, v in kwargs.items()}
        return new_args, new_kwargs

    except Exception as e:
        if self._verbose:
            print(f"[CodeAstra] Input scan warning: {e}")
        return args, kwargs
590
+
591
+
592
def _scan_output(self, result: Any) -> Any:
    """
    Scan agent output for any real values that leaked through.

    Step 1 replaces values already present in ``self._value_to_token``.
    Step 2 scans the (already blinded) output for NEW raw PII, tokenizes it
    through ``self._client.tokenize``, records the new mappings on the
    session and blinds the output again.

    Tokenization is best-effort: if it fails, the output from step 1 is
    returned and a warning is printed when ``self._verbose`` is set.
    """
    def _field_name(key):
        # Strip only a numeric suffix: "ssn_1" -> "ssn". A plain
        # split("_")[0] would turn "credit_card" into "credit" and
        # "ip_address" into "ip".
        head, _, tail = key.rpartition("_")
        return head if head and tail.isdigit() else key

    # Step 1: replace known real values with existing tokens
    if self._value_to_token:
        result = _blind_any(result, self._value_to_token)

    # Step 2: scan output for any new raw PII that leaked
    new_pii = _scan_obj_for_pii(result)
    if not new_pii:
        return result

    try:
        classification = _classify({_field_name(k) for k in new_pii})
        minted = self._client.tokenize(new_pii, classification=classification)
        for field, token in minted.items():
            real_val = new_pii.get(field)
            if real_val:
                self._value_to_token[real_val] = token
                self._session_tokens[field] = token

        result = _blind_any(result, self._value_to_token)

        if self._verbose:
            print(f"[CodeAstra] Output gate: caught {len(minted)} leaked value(s): {list(minted.keys())}")
    except Exception as e:
        if self._verbose:
            print(f"[CodeAstra] Output gate warning: {e}")

    return result
623
+
624
+
625
async def _ascan_output(self, result: Any) -> Any:
    """Async version of _scan_output.

    Replaces values already known to the session, then scans the output for
    new raw PII and tokenizes it with ``await self._client.atokenize``.
    Tokenization is best-effort: on failure the output blinded with the
    existing map is returned and a warning is printed when
    ``self._verbose`` is set.
    """
    def _field_name(key):
        # Strip only a numeric suffix: "ssn_1" -> "ssn". A plain
        # split("_")[0] would turn "credit_card" into "credit" and
        # "ip_address" into "ip".
        head, _, tail = key.rpartition("_")
        return head if head and tail.isdigit() else key

    if self._value_to_token:
        result = _blind_any(result, self._value_to_token)

    new_pii = _scan_obj_for_pii(result)
    if not new_pii:
        return result

    try:
        classification = _classify({_field_name(k) for k in new_pii})
        minted = await self._client.atokenize(new_pii, classification=classification)
        for field, token in minted.items():
            real_val = new_pii.get(field)
            if real_val:
                self._value_to_token[real_val] = token
                self._session_tokens[field] = token
        result = _blind_any(result, self._value_to_token)
        if self._verbose:
            print(f"[CodeAstra] Output gate (async): caught {len(minted)} leaked value(s)")
    except Exception as e:
        if self._verbose:
            print(f"[CodeAstra] Output gate warning: {e}")
    return result
647
+
648
+
649
+ # ── Monkey-patch all proxy methods with input + output scanning ───────────────
650
+
651
def _patched_run(self, *args, **kwargs):
    """Replacement for ``run``: blind the input, run the agent, gate the output.

    The arguments pass through the input scanner, the wrapped agent's
    ``run`` is called with the sanitized arguments, and the result goes
    through ``self._blind_output`` followed by the output gate.
    """
    safe_args, safe_kwargs = _scan_and_blind_input(self, *args, **kwargs)
    raw_result = self._agent.run(*safe_args, **safe_kwargs)
    blinded = self._blind_output(raw_result)  # existing tool-output scan
    return _scan_output(self, blinded)        # new output gate
656
+
657
def _patched_invoke(self, *args, **kwargs):
    """Replacement for ``invoke``: blind the input, invoke the agent, gate the output.

    When the agent returns a dict containing an ``"output"`` key, only that
    entry is sanitized (in place) and the same dict is returned. Any other
    result is sanitized as a whole.
    """
    safe_args, safe_kwargs = _scan_and_blind_input(self, *args, **kwargs)
    response = self._agent.invoke(*safe_args, **safe_kwargs)

    has_output_field = isinstance(response, dict) and "output" in response
    if not has_output_field:
        return _scan_output(self, self._blind_output(response))

    # Two passes over the "output" entry: existing blinding, then the gate.
    response["output"] = self._blind_output(response["output"])
    response["output"] = _scan_output(self, response["output"])
    return response
666
+
667
def _patched_chat(self, *args, **kwargs):
    """Replacement for ``chat``: blind the input, call the agent's ``chat``, gate the output."""
    safe_args, safe_kwargs = _scan_and_blind_input(self, *args, **kwargs)
    reply = self._agent.chat(*safe_args, **safe_kwargs)
    blinded = self._blind_output(reply)
    return _scan_output(self, blinded)
672
+
673
async def _patched_arun(self, *args, **kwargs):
    """Async replacement for ``arun``: blind the input, await the agent, gate the output.

    NOTE(review): the input scan is the synchronous helper, which calls
    ``self._client.tokenize``; presumably that blocks the event loop while
    tokenizing. Confirm whether an async input scan is wanted.
    """
    safe_args, safe_kwargs = _scan_and_blind_input(self, *args, **kwargs)
    raw_result = await self._agent.arun(*safe_args, **safe_kwargs)
    blinded = await self._ablind_output(raw_result)
    return await _ascan_output(self, blinded)
678
+
679
async def _patched_ainvoke(self, *args, **kwargs):
    """Async replacement for ``ainvoke``: blind the input, await the agent, gate the output.

    When the agent returns a dict containing an ``"output"`` key, only that
    entry is sanitized (in place) and the same dict is returned. Any other
    result is sanitized as a whole.

    NOTE(review): the input scan is the synchronous helper, which calls
    ``self._client.tokenize``; presumably that blocks the event loop while
    tokenizing. Confirm whether an async input scan is wanted.
    """
    safe_args, safe_kwargs = _scan_and_blind_input(self, *args, **kwargs)
    response = await self._agent.ainvoke(*safe_args, **safe_kwargs)

    has_output_field = isinstance(response, dict) and "output" in response
    if not has_output_field:
        blinded = await self._ablind_output(response)
        return await _ascan_output(self, blinded)

    # Two passes over the "output" entry: existing blinding, then the gate.
    response["output"] = await self._ablind_output(response["output"])
    response["output"] = await _ascan_output(self, response["output"])
    return response
688
+
689
+
690
# Install the scanning wrappers on the class in place of the proxy methods.
for _method_name, _replacement in (
    ("run", _patched_run),
    ("invoke", _patched_invoke),
    ("chat", _patched_chat),
    ("arun", _patched_arun),
    ("ainvoke", _patched_ainvoke),
):
    setattr(BlindAgentMiddleware, _method_name, _replacement)
# Do not leave the loop variables behind as module attributes.
del _method_name, _replacement
696
+
697
# Make the scanner helpers reachable from the class: the input/output
# scanners become regular methods, the text scanner a static method.
for _attr_name, _attr_value in (
    ("_scan_input", _scan_and_blind_input),
    ("_scan_output", _scan_output),
    ("scan_text", staticmethod(_scan_text_for_pii)),
):
    setattr(BlindAgentMiddleware, _attr_name, _attr_value)
# Do not leave the loop variables behind as module attributes.
del _attr_name, _attr_value
@@ -1,15 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeastra
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Summary: Blind Agent SDK — drop-in middleware for LangChain, CrewAI, AutoGPT. Two lines makes any agent blind to real data.
5
- License: MIT
5
+ License-Expression: MIT
6
6
  Project-URL: Homepage, https://codeastra.dev
7
7
  Project-URL: Documentation, https://docs.codeastra.dev
8
8
  Project-URL: Repository, https://github.com/codeastra/codeastra-python
9
9
  Keywords: ai,agents,langchain,crewai,privacy,hipaa,security,tokenization
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
- Classifier: License :: OSI Approved :: MIT License
13
12
  Classifier: Programming Language :: Python :: 3
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
@@ -4,16 +4,15 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codeastra"
7
- version = "1.0.0"
7
+ version = "1.0.1"
8
8
  description = "Blind Agent SDK — drop-in middleware for LangChain, CrewAI, AutoGPT. Two lines makes any agent blind to real data."
9
9
  readme = "README.md"
10
- license = {text = "MIT"}
10
+ license = "MIT"
11
11
  requires-python = ">=3.10"
12
12
  keywords = ["ai", "agents", "langchain", "crewai", "privacy", "hipaa", "security", "tokenization"]
13
13
  classifiers = [
14
14
  "Development Status :: 5 - Production/Stable",
15
15
  "Intended Audience :: Developers",
16
- "License :: OSI Approved :: MIT License",
17
16
  "Programming Language :: Python :: 3",
18
17
  "Programming Language :: Python :: 3.10",
19
18
  "Programming Language :: Python :: 3.11",
File without changes
File without changes