kailash 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (151)
  1. kailash/__init__.py +33 -1
  2. kailash/access_control/__init__.py +129 -0
  3. kailash/access_control/managers.py +461 -0
  4. kailash/access_control/rule_evaluators.py +467 -0
  5. kailash/access_control_abac.py +825 -0
  6. kailash/config/__init__.py +27 -0
  7. kailash/config/database_config.py +359 -0
  8. kailash/database/__init__.py +28 -0
  9. kailash/database/execution_pipeline.py +499 -0
  10. kailash/middleware/__init__.py +306 -0
  11. kailash/middleware/auth/__init__.py +33 -0
  12. kailash/middleware/auth/access_control.py +436 -0
  13. kailash/middleware/auth/auth_manager.py +422 -0
  14. kailash/middleware/auth/jwt_auth.py +477 -0
  15. kailash/middleware/auth/kailash_jwt_auth.py +616 -0
  16. kailash/middleware/communication/__init__.py +37 -0
  17. kailash/middleware/communication/ai_chat.py +989 -0
  18. kailash/middleware/communication/api_gateway.py +802 -0
  19. kailash/middleware/communication/events.py +470 -0
  20. kailash/middleware/communication/realtime.py +710 -0
  21. kailash/middleware/core/__init__.py +21 -0
  22. kailash/middleware/core/agent_ui.py +890 -0
  23. kailash/middleware/core/schema.py +643 -0
  24. kailash/middleware/core/workflows.py +396 -0
  25. kailash/middleware/database/__init__.py +63 -0
  26. kailash/middleware/database/base.py +113 -0
  27. kailash/middleware/database/base_models.py +525 -0
  28. kailash/middleware/database/enums.py +106 -0
  29. kailash/middleware/database/migrations.py +12 -0
  30. kailash/{api/database.py → middleware/database/models.py} +183 -291
  31. kailash/middleware/database/repositories.py +685 -0
  32. kailash/middleware/database/session_manager.py +19 -0
  33. kailash/middleware/mcp/__init__.py +38 -0
  34. kailash/middleware/mcp/client_integration.py +585 -0
  35. kailash/middleware/mcp/enhanced_server.py +576 -0
  36. kailash/nodes/__init__.py +27 -3
  37. kailash/nodes/admin/__init__.py +42 -0
  38. kailash/nodes/admin/audit_log.py +794 -0
  39. kailash/nodes/admin/permission_check.py +864 -0
  40. kailash/nodes/admin/role_management.py +823 -0
  41. kailash/nodes/admin/security_event.py +1523 -0
  42. kailash/nodes/admin/user_management.py +944 -0
  43. kailash/nodes/ai/a2a.py +24 -7
  44. kailash/nodes/ai/ai_providers.py +248 -40
  45. kailash/nodes/ai/embedding_generator.py +11 -11
  46. kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
  47. kailash/nodes/ai/llm_agent.py +436 -5
  48. kailash/nodes/ai/self_organizing.py +85 -10
  49. kailash/nodes/ai/vision_utils.py +148 -0
  50. kailash/nodes/alerts/__init__.py +26 -0
  51. kailash/nodes/alerts/base.py +234 -0
  52. kailash/nodes/alerts/discord.py +499 -0
  53. kailash/nodes/api/auth.py +287 -6
  54. kailash/nodes/api/rest.py +151 -0
  55. kailash/nodes/auth/__init__.py +17 -0
  56. kailash/nodes/auth/directory_integration.py +1228 -0
  57. kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
  58. kailash/nodes/auth/mfa.py +2338 -0
  59. kailash/nodes/auth/risk_assessment.py +872 -0
  60. kailash/nodes/auth/session_management.py +1093 -0
  61. kailash/nodes/auth/sso.py +1040 -0
  62. kailash/nodes/base.py +344 -13
  63. kailash/nodes/base_cycle_aware.py +4 -2
  64. kailash/nodes/base_with_acl.py +1 -1
  65. kailash/nodes/code/python.py +283 -10
  66. kailash/nodes/compliance/__init__.py +9 -0
  67. kailash/nodes/compliance/data_retention.py +1888 -0
  68. kailash/nodes/compliance/gdpr.py +2004 -0
  69. kailash/nodes/data/__init__.py +22 -2
  70. kailash/nodes/data/async_connection.py +469 -0
  71. kailash/nodes/data/async_sql.py +757 -0
  72. kailash/nodes/data/async_vector.py +598 -0
  73. kailash/nodes/data/readers.py +767 -0
  74. kailash/nodes/data/retrieval.py +360 -1
  75. kailash/nodes/data/sharepoint_graph.py +397 -21
  76. kailash/nodes/data/sql.py +94 -5
  77. kailash/nodes/data/streaming.py +68 -8
  78. kailash/nodes/data/vector_db.py +54 -4
  79. kailash/nodes/enterprise/__init__.py +13 -0
  80. kailash/nodes/enterprise/batch_processor.py +741 -0
  81. kailash/nodes/enterprise/data_lineage.py +497 -0
  82. kailash/nodes/logic/convergence.py +31 -9
  83. kailash/nodes/logic/operations.py +14 -3
  84. kailash/nodes/mixins/__init__.py +8 -0
  85. kailash/nodes/mixins/event_emitter.py +201 -0
  86. kailash/nodes/mixins/mcp.py +9 -4
  87. kailash/nodes/mixins/security.py +165 -0
  88. kailash/nodes/monitoring/__init__.py +7 -0
  89. kailash/nodes/monitoring/performance_benchmark.py +2497 -0
  90. kailash/nodes/rag/__init__.py +284 -0
  91. kailash/nodes/rag/advanced.py +1615 -0
  92. kailash/nodes/rag/agentic.py +773 -0
  93. kailash/nodes/rag/conversational.py +999 -0
  94. kailash/nodes/rag/evaluation.py +875 -0
  95. kailash/nodes/rag/federated.py +1188 -0
  96. kailash/nodes/rag/graph.py +721 -0
  97. kailash/nodes/rag/multimodal.py +671 -0
  98. kailash/nodes/rag/optimized.py +933 -0
  99. kailash/nodes/rag/privacy.py +1059 -0
  100. kailash/nodes/rag/query_processing.py +1335 -0
  101. kailash/nodes/rag/realtime.py +764 -0
  102. kailash/nodes/rag/registry.py +547 -0
  103. kailash/nodes/rag/router.py +837 -0
  104. kailash/nodes/rag/similarity.py +1854 -0
  105. kailash/nodes/rag/strategies.py +566 -0
  106. kailash/nodes/rag/workflows.py +575 -0
  107. kailash/nodes/security/__init__.py +19 -0
  108. kailash/nodes/security/abac_evaluator.py +1411 -0
  109. kailash/nodes/security/audit_log.py +103 -0
  110. kailash/nodes/security/behavior_analysis.py +1893 -0
  111. kailash/nodes/security/credential_manager.py +401 -0
  112. kailash/nodes/security/rotating_credentials.py +760 -0
  113. kailash/nodes/security/security_event.py +133 -0
  114. kailash/nodes/security/threat_detection.py +1103 -0
  115. kailash/nodes/testing/__init__.py +9 -0
  116. kailash/nodes/testing/credential_testing.py +499 -0
  117. kailash/nodes/transform/__init__.py +10 -2
  118. kailash/nodes/transform/chunkers.py +592 -1
  119. kailash/nodes/transform/processors.py +484 -14
  120. kailash/nodes/validation.py +321 -0
  121. kailash/runtime/access_controlled.py +1 -1
  122. kailash/runtime/async_local.py +41 -7
  123. kailash/runtime/docker.py +1 -1
  124. kailash/runtime/local.py +474 -55
  125. kailash/runtime/parallel.py +1 -1
  126. kailash/runtime/parallel_cyclic.py +1 -1
  127. kailash/runtime/testing.py +210 -2
  128. kailash/security.py +1 -1
  129. kailash/utils/migrations/__init__.py +25 -0
  130. kailash/utils/migrations/generator.py +433 -0
  131. kailash/utils/migrations/models.py +231 -0
  132. kailash/utils/migrations/runner.py +489 -0
  133. kailash/utils/secure_logging.py +342 -0
  134. kailash/workflow/__init__.py +16 -0
  135. kailash/workflow/cyclic_runner.py +3 -4
  136. kailash/workflow/graph.py +70 -2
  137. kailash/workflow/resilience.py +249 -0
  138. kailash/workflow/templates.py +726 -0
  139. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/METADATA +256 -20
  140. kailash-0.4.1.dist-info/RECORD +227 -0
  141. kailash/api/__init__.py +0 -17
  142. kailash/api/__main__.py +0 -6
  143. kailash/api/studio_secure.py +0 -893
  144. kailash/mcp/__main__.py +0 -13
  145. kailash/mcp/server_new.py +0 -336
  146. kailash/mcp/servers/__init__.py +0 -12
  147. kailash-0.3.2.dist-info/RECORD +0 -136
  148. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/WHEEL +0 -0
  149. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/entry_points.txt +0 -0
  150. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/licenses/LICENSE +0 -0
  151. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/top_level.txt +0 -0
@@ -352,15 +352,13 @@ class DataTransformer(Node):
         return validated
 
     def run(self, **kwargs) -> dict[str, Any]:
-        # Extract the transformation functions
-        transformations = kwargs.get("transformations", [])
+        # Extract the transformation functions from config first, then kwargs
+        transformations = self.config.get("transformations", []) or kwargs.get(
+            "transformations", []
+        )
         if not transformations:
             return {"result": kwargs.get("data", [])}
 
-        # Debug: Check what kwargs we received
-        print(f"DATATRANSFORMER RUN DEBUG: kwargs keys = {list(kwargs.keys())}")
-        print(f"DATATRANSFORMER RUN DEBUG: kwargs = {kwargs}")
-
         # Get all input data
         input_data = {}
         for key, value in kwargs.items():
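The change above makes node configuration the primary source of transformations, falling back to call-time kwargs only when the config entry is absent or empty. A standalone sketch of that fallback expression (the config and kwargs values here are hypothetical, not taken from the package):

config = {"transformations": ["result = [x * 2 for x in data]"]}
call_kwargs = {"data": [1, 2, 3]}

# Config wins; kwargs are consulted only when the config list is missing or empty.
transformations = config.get("transformations", []) or call_kwargs.get(
    "transformations", []
)
print(transformations)  # ['result = [x * 2 for x in data]']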
@@ -368,7 +366,13 @@ class DataTransformer(Node):
                 input_data[key] = value
 
         # Execute the transformations
-        result = input_data.get("data", [])
+        # Initialize result - default to empty dict if no data key and we have other inputs
+        if "data" in input_data:
+            result = input_data["data"]
+        elif input_data:  # If we have other inputs but no 'data' key
+            result = {}  # Default to empty dict instead of list
+        else:
+            result = []  # Only use empty list if no inputs at all
 
         for transform_str in transformations:
             try:
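For clarity, a minimal standalone sketch (with hypothetical inputs) of how the new initialization branches behave:

def init_result(input_data: dict):
    # Mirrors the branching above: prefer the "data" key, fall back to an empty
    # dict when other named inputs exist, and an empty list only when there are none.
    if "data" in input_data:
        return input_data["data"]
    elif input_data:
        return {}
    else:
        return []

assert init_result({"data": [1, 2, 3]}) == [1, 2, 3]
assert init_result({"customers": ["alice"]}) == {}
assert init_result({}) == []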
@@ -386,6 +390,10 @@ class DataTransformer(Node):
                     "float": float,
                     "bool": bool,
                     "sorted": sorted,
+                    "print": print,  # Allow print for debugging
+                    "isinstance": isinstance,
+                    "type": type,
+                    "__builtins__": {"__import__": __import__},  # Allow imports
                 }
 
                 # For multi-line code blocks
@@ -394,13 +402,8 @@ class DataTransformer(Node):
                     local_vars = input_data.copy()
                     local_vars["result"] = result
 
-                    # Debug: Print available variables
-                    print(
-                        f"DataTransformer DEBUG - Available variables: {list(local_vars.keys())}"
-                    )
-                    print(
-                        f"DataTransformer DEBUG - Input data keys: {list(input_data.keys())}"
-                    )
+                    # Add a locals function that returns the current local_vars
+                    safe_globals["locals"] = lambda: local_vars
 
                     # Execute the code block
                     exec(transform_str, safe_globals, local_vars)  # noqa: S102
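Taken together, the two hunks above widen the name whitelist passed to exec() and expose the execution locals through a locals() shim. A minimal, self-contained sketch of that pattern (the transform string and input values are hypothetical, not from the package):

safe_globals = {
    "len": len, "str": str, "int": int, "float": float, "bool": bool,
    "sorted": sorted, "print": print, "isinstance": isinstance, "type": type,
    "__builtins__": {"__import__": __import__},
}

input_data = {"data": [3, 1, 2]}
local_vars = input_data.copy()
local_vars["result"] = input_data["data"]
safe_globals["locals"] = lambda: local_vars  # lets transforms inspect their inputs

transform_str = (
    "print(sorted(locals()))\n"
    "result = sorted(data) if isinstance(data, list) else data"
)
exec(transform_str, safe_globals, local_vars)  # noqa: S102
print(local_vars["result"])  # [1, 2, 3]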
@@ -473,6 +476,9 @@ class DataTransformer(Node):
            except Exception as e:
                tb = traceback.format_exc()
                self.logger.error(f"Error executing transformation: {e}")
+               self.logger.error(f"Transformation: {transform_str}")
+               self.logger.error(f"Input data: {input_data}")
+               self.logger.error(f"Result before error: {result}")
                raise RuntimeError(
                    f"Error executing transformation '{transform_str}': {str(e)}\n{tb}"
                )
@@ -523,5 +529,469 @@ class Sort(Node):
         return {"sorted_data": sorted_data}
 
 
+@register_node()
+class ContextualCompressorNode(Node):
+    """
+    Contextual compression node that filters and compresses retrieved content
+    to maximize relevant information density for optimal context utilization.
+
+    This node is essential for managing LLM context windows by intelligently
+    compressing retrieved documents while preserving query-relevant information.
+    It uses multiple compression strategies and relevance scoring to ensure
+    optimal information density.
+
+    Design Philosophy:
+        The ContextualCompressorNode embodies "information density optimization."
+        Rather than naive truncation, it uses semantic understanding to preserve
+        the most relevant information for the given query while respecting token
+        budget constraints.
+
+    Upstream Dependencies:
+        - Retrieval nodes providing candidate documents
+        - Embedding nodes for semantic analysis
+        - LLM nodes for relevance scoring
+        - Query transformation nodes
+
+    Downstream Consumers:
+        - LLM Agent nodes consuming compressed context
+        - Response generation nodes
+        - Context-aware processing nodes
+        - Token-budgeted operations
+
+    Configuration:
+        - max_tokens: Maximum token budget for compressed output
+        - compression_ratio: Target compression ratio (0.0-1.0)
+        - relevance_threshold: Minimum relevance score for inclusion
+        - compression_strategy: Method for content compression
+
+    Examples:
+        >>> compressor = ContextualCompressorNode(
+        ...     max_tokens=2000,
+        ...     compression_ratio=0.6,
+        ...     relevance_threshold=0.7
+        ... )
+        >>> result = compressor.run(
+        ...     query="machine learning algorithms",
+        ...     retrieved_docs=[{"content": "...", "metadata": {}}],
+        ...     compression_target=1500
+        ... )
+        >>> compressed_context = result["compressed_context"]
+    """
+
+    def __init__(self, name: str = "contextual_compressor", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.max_tokens = kwargs.get("max_tokens", 4000)
+        self.compression_ratio = kwargs.get("compression_ratio", 0.6)
+        self.relevance_threshold = kwargs.get("relevance_threshold", 0.7)
+        self.compression_strategy = kwargs.get(
+            "compression_strategy", "extractive_summarization"
+        )
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        """Get node parameters for Kailash framework."""
+        return {
+            "query": NodeParameter(
+                name="query",
+                type=str,
+                required=True,
+                description="Query for relevance-based compression",
+            ),
+            "retrieved_docs": NodeParameter(
+                name="retrieved_docs",
+                type=list,
+                required=True,
+                description="List of retrieved documents to compress",
+            ),
+            "compression_target": NodeParameter(
+                name="compression_target",
+                type=int,
+                required=False,
+                default=self.max_tokens,
+                description="Target token count for compressed content",
+            ),
+            "max_tokens": NodeParameter(
+                name="max_tokens",
+                type=int,
+                required=False,
+                default=self.max_tokens,
+                description="Maximum tokens for contextual compression",
+            ),
+            "compression_ratio": NodeParameter(
+                name="compression_ratio",
+                type=float,
+                required=False,
+                default=self.compression_ratio,
+                description="Target compression ratio (0.0-1.0)",
+            ),
+            "relevance_threshold": NodeParameter(
+                name="relevance_threshold",
+                type=float,
+                required=False,
+                default=self.relevance_threshold,
+                description="Relevance threshold for passage selection",
+            ),
+            "compression_strategy": NodeParameter(
+                name="compression_strategy",
+                type=str,
+                required=False,
+                default=self.compression_strategy,
+                description="Compression strategy (extractive_summarization, abstractive_synthesis, hierarchical_organization)",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        """Run contextual compression on retrieved documents."""
+        query = kwargs.get("query", "")
+        retrieved_docs = kwargs.get("retrieved_docs", [])
+        compression_target = kwargs.get("compression_target", self.max_tokens)
+
+        if not query:
+            return {
+                "error": "Query is required for contextual compression",
+                "compressed_context": "",
+                "compression_metadata": {},
+            }
+
+        if not retrieved_docs:
+            return {
+                "compressed_context": "",
+                "compression_metadata": {
+                    "original_document_count": 0,
+                    "selected_passage_count": 0,
+                    "compression_ratio": 0.0,
+                },
+                "num_input_docs": 0,
+                "compression_success": False,
+            }
+
+        try:
+            # Stage 1: Score passages for relevance
+            scored_passages = self._score_passage_relevance(query, retrieved_docs)
+
+            # Stage 2: Select optimal passages within budget
+            selected_passages = self._select_optimal_passages(
+                scored_passages, compression_target
+            )
+
+            # Stage 3: Compress selected content
+            compressed_context = self._compress_selected_content(
+                query, selected_passages
+            )
+
+            # Stage 4: Generate metadata
+            compression_metadata = self._generate_compression_metadata(
+                retrieved_docs, selected_passages, compressed_context
+            )
+
+            return {
+                "compressed_context": compressed_context,
+                "compression_metadata": compression_metadata,
+                "selected_passages": selected_passages,
+                "num_input_docs": len(retrieved_docs),
+                "compression_success": len(compressed_context) > 0,
+            }
+
+        except Exception as e:
+            return {
+                "error": f"Compression failed: {str(e)}",
+                "compressed_context": "",
+                "compression_metadata": {},
+                "num_input_docs": len(retrieved_docs),
+                "compression_success": False,
+            }
+
+    def _score_passage_relevance(self, query: str, documents: list) -> list:
+        """Score each passage for relevance to the query using heuristic methods."""
+        scored_passages = []
+        query_words = set(query.lower().split())
+
+        for i, doc in enumerate(documents):
+            content = doc.get("content", "") if isinstance(doc, dict) else str(doc)
+
+            if not content.strip():
+                continue
+
+            # Calculate relevance score using multiple factors
+            content_words = set(content.lower().split())
+
+            # 1. Keyword overlap score
+            keyword_overlap = (
+                len(query_words & content_words) / len(query_words)
+                if query_words
+                else 0
+            )
+
+            # 2. Content density score (information per word)
+            word_count = len(content_words)
+            density_score = min(1.0, word_count / 100)  # Normalize to reasonable length
+
+            # 3. Position bonus (earlier documents often more relevant)
+            position_bonus = max(0.1, 1.0 - (i * 0.1))
+
+            # 4. Original similarity score if available
+            original_score = (
+                doc.get("similarity_score", 0.5) if isinstance(doc, dict) else 0.5
+            )
+
+            # Combine scores
+            relevance_score = (
+                0.4 * keyword_overlap
+                + 0.2 * density_score
+                + 0.1 * position_bonus
+                + 0.3 * original_score
+            )
+
+            # Apply relevance threshold
+            if relevance_score >= self.relevance_threshold:
+                scored_passages.append(
+                    {
+                        "document": doc,
+                        "content": content,
+                        "relevance_score": relevance_score,
+                        "keyword_overlap": keyword_overlap,
+                        "original_index": i,
+                        "token_count": len(content.split())
+                        * 1.3,  # Rough token estimate
+                    }
+                )
+
+        # Sort by relevance score
+        scored_passages.sort(key=lambda x: x["relevance_score"], reverse=True)
+        return scored_passages
+
+    def _select_optimal_passages(
+        self, scored_passages: list, target_tokens: int
+    ) -> list:
+        """Select optimal passages within token budget."""
+        if not scored_passages:
+            return []
+
+        selected = []
+        total_tokens = 0
+        diversity_threshold = 0.8
+
+        for passage in scored_passages:
+            passage_tokens = passage["token_count"]
+
+            # Check token budget
+            if total_tokens + passage_tokens > target_tokens:
+                # Try to fit partial content if it's high value
+                if passage["relevance_score"] > 0.9 and len(selected) < 3:
+                    remaining_tokens = target_tokens - total_tokens
+                    if remaining_tokens > 50:  # Minimum useful content
+                        # Truncate passage to fit
+                        truncated_content = self._truncate_passage(
+                            passage["content"], remaining_tokens
+                        )
+                        passage_copy = passage.copy()
+                        passage_copy["content"] = truncated_content
+                        passage_copy["token_count"] = remaining_tokens
+                        passage_copy["is_truncated"] = True
+                        selected.append(passage_copy)
+                        total_tokens = target_tokens
+                break
+
+            # Check diversity (avoid near-duplicate content)
+            is_diverse = True
+            for selected_passage in selected:
+                similarity = self._calculate_content_similarity(
+                    passage["content"], selected_passage["content"]
+                )
+                if similarity > diversity_threshold:
+                    is_diverse = False
+                    break
+
+            if is_diverse:
+                selected.append(passage)
+                total_tokens += passage_tokens
+
+        return selected
+
+    def _compress_selected_content(self, query: str, selected_passages: list) -> str:
+        """Compress selected passages into coherent context."""
+        if not selected_passages:
+            return ""
+
+        # For now, use extractive summarization (concatenate most relevant parts)
+        if self.compression_strategy == "extractive_summarization":
+            return self._extractive_compression(query, selected_passages)
+        elif self.compression_strategy == "abstractive_synthesis":
+            return self._abstractive_compression(query, selected_passages)
+        elif self.compression_strategy == "hierarchical_organization":
+            return self._hierarchical_compression(query, selected_passages)
+        else:
+            # Default to extractive
+            return self._extractive_compression(query, selected_passages)
+
+    def _extractive_compression(self, query: str, passages: list) -> str:
+        """Extract and concatenate the most relevant sentences."""
+        compressed_parts = []
+        query_words = set(query.lower().split())
+
+        for passage in passages:
+            content = passage["content"]
+
+            # Split into sentences
+            sentences = self._split_into_sentences(content)
+
+            # Score each sentence for relevance
+            sentence_scores = []
+            for sentence in sentences:
+                sentence_words = set(sentence.lower().split())
+                overlap = (
+                    len(query_words & sentence_words) / len(query_words)
+                    if query_words
+                    else 0
+                )
+                sentence_scores.append((sentence, overlap))
+
+            # Sort by relevance and take top sentences
+            sentence_scores.sort(key=lambda x: x[1], reverse=True)
+            top_sentences = [
+                s[0] for s in sentence_scores[:3]
+            ]  # Top 3 sentences per passage
+
+            if top_sentences:
+                compressed_parts.append(" ".join(top_sentences))
+
+        return "\n\n".join(compressed_parts)
+
+    def _abstractive_compression(self, query: str, passages: list) -> str:
+        """Create abstractive summary (simplified version)."""
+        # In a real implementation, this would use an LLM
+        # For now, create a structured summary
+        key_points = []
+
+        for passage in passages:
+            content = passage["content"]
+            # Extract key phrases (simplified)
+            sentences = self._split_into_sentences(content)
+            if sentences:
+                # Take first and last sentence as key points
+                key_points.append(sentences[0])
+                if len(sentences) > 1:
+                    key_points.append(sentences[-1])
+
+        return f"Summary for query '{query}':\n" + "\n".join(
+            f"• {point}" for point in key_points[:10]
+        )
+
+    def _hierarchical_compression(self, query: str, passages: list) -> str:
+        """Organize information hierarchically."""
+        organized_content = {
+            "primary_information": [],
+            "supporting_details": [],
+            "additional_context": [],
+        }
+
+        for i, passage in enumerate(passages):
+            content = passage["content"]
+            relevance = passage["relevance_score"]
+
+            if relevance > 0.8:
+                organized_content["primary_information"].append(content)
+            elif relevance > 0.6:
+                organized_content["supporting_details"].append(content)
+            else:
+                organized_content["additional_context"].append(content)
+
+        result_parts = []
+
+        if organized_content["primary_information"]:
+            result_parts.append("PRIMARY INFORMATION:")
+            result_parts.extend(organized_content["primary_information"])
+
+        if organized_content["supporting_details"]:
+            result_parts.append("\nSUPPORTING DETAILS:")
+            result_parts.extend(organized_content["supporting_details"])
+
+        if organized_content["additional_context"]:
+            result_parts.append("\nADDITIONAL CONTEXT:")
+            result_parts.extend(
+                organized_content["additional_context"][:2]
+            )  # Limit additional context
+
+        return "\n".join(result_parts)
+
+    def _split_into_sentences(self, text: str) -> list:
+        """Split text into sentences (simplified)."""
+        import re
+
+        sentences = re.split(r"[.!?]+", text)
+        return [s.strip() for s in sentences if s.strip()]
+
+    def _calculate_content_similarity(self, content1: str, content2: str) -> float:
+        """Calculate Jaccard similarity between two content pieces."""
+        words1 = set(content1.lower().split())
+        words2 = set(content2.lower().split())
+
+        if not words1 or not words2:
+            return 0.0
+
+        intersection = len(words1 & words2)
+        union = len(words1 | words2)
+
+        return intersection / union if union > 0 else 0.0
+
+    def _truncate_passage(self, content: str, max_tokens: int) -> str:
+        """Intelligently truncate passage to fit token budget."""
+        words = content.split()
+        target_words = int(max_tokens / 1.3)  # Rough token-to-word ratio
+
+        if len(words) <= target_words:
+            return content
+
+        # Try to end at sentence boundary
+        truncated_words = words[:target_words]
+        truncated_text = " ".join(truncated_words)
+
+        # Find last sentence boundary
+        last_sentence_end = max(
+            truncated_text.rfind("."),
+            truncated_text.rfind("!"),
+            truncated_text.rfind("?"),
+        )
+
+        if (
+            last_sentence_end > len(truncated_text) * 0.7
+        ):  # If we can preserve most content
+            return truncated_text[: last_sentence_end + 1]
+        else:
+            return truncated_text + "..."
+
+    def _generate_compression_metadata(
+        self, original_docs: list, selected_passages: list, compressed_context: str
+    ) -> dict:
+        """Generate metadata about the compression process."""
+        original_length = sum(
+            len(doc.get("content", "") if isinstance(doc, dict) else str(doc))
+            for doc in original_docs
+        )
+        compressed_length = len(compressed_context)
+
+        return {
+            "original_document_count": len(original_docs),
+            "selected_passage_count": len(selected_passages),
+            "original_char_count": original_length,
+            "compressed_char_count": compressed_length,
+            "compression_ratio": (
+                compressed_length / original_length if original_length > 0 else 0
+            ),
+            "avg_relevance_score": (
+                sum(p["relevance_score"] for p in selected_passages)
+                / len(selected_passages)
+                if selected_passages
+                else 0
+            ),
+            "compression_strategy": self.compression_strategy,
+            "token_budget": self.max_tokens,
+            "passages_truncated": sum(
+                1 for p in selected_passages if p.get("is_truncated", False)
+            ),
+        }
+
+
 # Backward compatibility aliases
 Filter = FilterNode
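For reference, a worked example of the weighted relevance score computed in _score_passage_relevance above, using hypothetical inputs (two of four query words matched, 80 distinct content words, first position in the list, retriever similarity 0.72):

keyword_overlap = 2 / 4                      # query-word overlap
density_score = min(1.0, 80 / 100)           # distinct-word count, capped at 1.0
position_bonus = max(0.1, 1.0 - (0 * 0.1))   # first document in the list
original_score = 0.72                        # similarity reported by the retriever

relevance_score = (
    0.4 * keyword_overlap
    + 0.2 * density_score
    + 0.1 * position_bonus
    + 0.3 * original_score
)
print(round(relevance_score, 3))  # 0.676, just under the default 0.7 threshold, so this passage is dropped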