kailash 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +283 -10
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.2.dist-info/RECORD +0 -136
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
@@ -352,15 +352,13 @@ class DataTransformer(Node):
         return validated

     def run(self, **kwargs) -> dict[str, Any]:
-        # Extract the transformation functions
-        transformations =
+        # Extract the transformation functions from config first, then kwargs
+        transformations = self.config.get("transformations", []) or kwargs.get(
+            "transformations", []
+        )
         if not transformations:
             return {"result": kwargs.get("data", [])}

-        # Debug: Check what kwargs we received
-        print(f"DATATRANSFORMER RUN DEBUG: kwargs keys = {list(kwargs.keys())}")
-        print(f"DATATRANSFORMER RUN DEBUG: kwargs = {kwargs}")
-
         # Get all input data
         input_data = {}
         for key, value in kwargs.items():
@@ -368,7 +366,13 @@ class DataTransformer(Node):
                 input_data[key] = value

         # Execute the transformations
-        result
+        # Initialize result - default to empty dict if no data key and we have other inputs
+        if "data" in input_data:
+            result = input_data["data"]
+        elif input_data:  # If we have other inputs but no 'data' key
+            result = {}  # Default to empty dict instead of list
+        else:
+            result = []  # Only use empty list if no inputs at all

         for transform_str in transformations:
             try:
@@ -386,6 +390,10 @@ class DataTransformer(Node):
                     "float": float,
                     "bool": bool,
                     "sorted": sorted,
+                    "print": print,  # Allow print for debugging
+                    "isinstance": isinstance,
+                    "type": type,
+                    "__builtins__": {"__import__": __import__},  # Allow imports
                 }

                 # For multi-line code blocks
@@ -394,13 +402,8 @@ class DataTransformer(Node):
                     local_vars = input_data.copy()
                     local_vars["result"] = result

-                    #
-                    print(
-                        f"DataTransformer DEBUG - Available variables: {list(local_vars.keys())}"
-                    )
-                    print(
-                        f"DataTransformer DEBUG - Input data keys: {list(input_data.keys())}"
-                    )
+                    # Add a locals function that returns the current local_vars
+                    safe_globals["locals"] = lambda: local_vars

                     # Execute the code block
                     exec(transform_str, safe_globals, local_vars)  # noqa: S102
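Note: the two hunks above widen the allowlist that DataTransformer hands to exec() (print, isinstance, type, plus a minimal __builtins__ exposing __import__) and add a locals() shim over the per-transformation variables. The following is a minimal standalone sketch of that sandbox pattern; the len/str/int entries and the sample transform string are illustrative assumptions, only the float/bool/sorted entries and the four new names appear in the diff context.

# Standalone sketch of the allowlist-based exec sandbox extended above.
safe_globals = {
    "len": len, "str": str, "int": int, "float": float, "bool": bool,
    "sorted": sorted,
    "print": print,                                  # allowed for debugging
    "isinstance": isinstance,
    "type": type,
    "__builtins__": {"__import__": __import__},      # lets `import` statements work
}
local_vars = {"data": [3, 1, 2], "result": None}
safe_globals["locals"] = lambda: local_vars          # mirrors the added locals() shim
exec("import math\nresult = sorted(data) + [math.pi]", safe_globals, local_vars)
print(local_vars["result"])                          # [1, 2, 3, 3.141592653589793]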
@@ -473,6 +476,9 @@ class DataTransformer(Node):
             except Exception as e:
                 tb = traceback.format_exc()
                 self.logger.error(f"Error executing transformation: {e}")
+                self.logger.error(f"Transformation: {transform_str}")
+                self.logger.error(f"Input data: {input_data}")
+                self.logger.error(f"Result before error: {result}")
                 raise RuntimeError(
                     f"Error executing transformation '{transform_str}': {str(e)}\n{tb}"
                 )
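Taken together, the DataTransformer changes mean transformation code strings can now come from node config as well as call-time kwargs, each string runs with the node's inputs bound as local variables, and `result` carries the running value through the chain. A hedged usage sketch under those assumptions; the constructor forwarding keyword arguments into self.config and run() returning a "result" key are assumed from the diff context, not shown in full.

# Hypothetical usage sketch of the patched DataTransformer.run() behaviour.
from kailash.nodes.transform.processors import DataTransformer

node = DataTransformer(
    transformations=[
        # a multi-line code block exec'd with the inputs as locals;
        # `result` holds the value returned to the caller
        "filtered = [row for row in data if row['active']]\n"
        "result = sorted(filtered, key=lambda row: row['name'])\n"
    ]
)
output = node.run(
    data=[
        {"name": "b", "active": True},
        {"name": "a", "active": True},
        {"name": "c", "active": False},
    ]
)
print(output["result"])  # expected: the two active rows, sorted by name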
@@ -523,5 +529,469 @@ class Sort(Node):
         return {"sorted_data": sorted_data}


+@register_node()
+class ContextualCompressorNode(Node):
+    """
+    Contextual compression node that filters and compresses retrieved content
+    to maximize relevant information density for optimal context utilization.
+
+    This node is essential for managing LLM context windows by intelligently
+    compressing retrieved documents while preserving query-relevant information.
+    It uses multiple compression strategies and relevance scoring to ensure
+    optimal information density.
+
+    Design Philosophy:
+        The ContextualCompressorNode embodies "information density optimization."
+        Rather than naive truncation, it uses semantic understanding to preserve
+        the most relevant information for the given query while respecting token
+        budget constraints.
+
+    Upstream Dependencies:
+        - Retrieval nodes providing candidate documents
+        - Embedding nodes for semantic analysis
+        - LLM nodes for relevance scoring
+        - Query transformation nodes
+
+    Downstream Consumers:
+        - LLM Agent nodes consuming compressed context
+        - Response generation nodes
+        - Context-aware processing nodes
+        - Token-budgeted operations
+
+    Configuration:
+        - max_tokens: Maximum token budget for compressed output
+        - compression_ratio: Target compression ratio (0.0-1.0)
+        - relevance_threshold: Minimum relevance score for inclusion
+        - compression_strategy: Method for content compression
+
+    Examples:
+        >>> compressor = ContextualCompressorNode(
+        ...     max_tokens=2000,
+        ...     compression_ratio=0.6,
+        ...     relevance_threshold=0.7
+        ... )
+        >>> result = compressor.run(
+        ...     query="machine learning algorithms",
+        ...     retrieved_docs=[{"content": "...", "metadata": {}}],
+        ...     compression_target=1500
+        ... )
+        >>> compressed_context = result["compressed_context"]
+    """
+
+    def __init__(self, name: str = "contextual_compressor", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.max_tokens = kwargs.get("max_tokens", 4000)
+        self.compression_ratio = kwargs.get("compression_ratio", 0.6)
+        self.relevance_threshold = kwargs.get("relevance_threshold", 0.7)
+        self.compression_strategy = kwargs.get(
+            "compression_strategy", "extractive_summarization"
+        )
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        """Get node parameters for Kailash framework."""
+        return {
+            "query": NodeParameter(
+                name="query",
+                type=str,
+                required=True,
+                description="Query for relevance-based compression",
+            ),
+            "retrieved_docs": NodeParameter(
+                name="retrieved_docs",
+                type=list,
+                required=True,
+                description="List of retrieved documents to compress",
+            ),
+            "compression_target": NodeParameter(
+                name="compression_target",
+                type=int,
+                required=False,
+                default=self.max_tokens,
+                description="Target token count for compressed content",
+            ),
+            "max_tokens": NodeParameter(
+                name="max_tokens",
+                type=int,
+                required=False,
+                default=self.max_tokens,
+                description="Maximum tokens for contextual compression",
+            ),
+            "compression_ratio": NodeParameter(
+                name="compression_ratio",
+                type=float,
+                required=False,
+                default=self.compression_ratio,
+                description="Target compression ratio (0.0-1.0)",
+            ),
+            "relevance_threshold": NodeParameter(
+                name="relevance_threshold",
+                type=float,
+                required=False,
+                default=self.relevance_threshold,
+                description="Relevance threshold for passage selection",
+            ),
+            "compression_strategy": NodeParameter(
+                name="compression_strategy",
+                type=str,
+                required=False,
+                default=self.compression_strategy,
+                description="Compression strategy (extractive_summarization, abstractive_synthesis, hierarchical_organization)",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        """Run contextual compression on retrieved documents."""
+        query = kwargs.get("query", "")
+        retrieved_docs = kwargs.get("retrieved_docs", [])
+        compression_target = kwargs.get("compression_target", self.max_tokens)
+
+        if not query:
+            return {
+                "error": "Query is required for contextual compression",
+                "compressed_context": "",
+                "compression_metadata": {},
+            }
+
+        if not retrieved_docs:
+            return {
+                "compressed_context": "",
+                "compression_metadata": {
+                    "original_document_count": 0,
+                    "selected_passage_count": 0,
+                    "compression_ratio": 0.0,
+                },
+                "num_input_docs": 0,
+                "compression_success": False,
+            }
+
+        try:
+            # Stage 1: Score passages for relevance
+            scored_passages = self._score_passage_relevance(query, retrieved_docs)
+
+            # Stage 2: Select optimal passages within budget
+            selected_passages = self._select_optimal_passages(
+                scored_passages, compression_target
+            )
+
+            # Stage 3: Compress selected content
+            compressed_context = self._compress_selected_content(
+                query, selected_passages
+            )
+
+            # Stage 4: Generate metadata
+            compression_metadata = self._generate_compression_metadata(
+                retrieved_docs, selected_passages, compressed_context
+            )
+
+            return {
+                "compressed_context": compressed_context,
+                "compression_metadata": compression_metadata,
+                "selected_passages": selected_passages,
+                "num_input_docs": len(retrieved_docs),
+                "compression_success": len(compressed_context) > 0,
+            }
+
+        except Exception as e:
+            return {
+                "error": f"Compression failed: {str(e)}",
+                "compressed_context": "",
+                "compression_metadata": {},
+                "num_input_docs": len(retrieved_docs),
+                "compression_success": False,
+            }
+
+    def _score_passage_relevance(self, query: str, documents: list) -> list:
+        """Score each passage for relevance to the query using heuristic methods."""
+        scored_passages = []
+        query_words = set(query.lower().split())
+
+        for i, doc in enumerate(documents):
+            content = doc.get("content", "") if isinstance(doc, dict) else str(doc)
+
+            if not content.strip():
+                continue
+
+            # Calculate relevance score using multiple factors
+            content_words = set(content.lower().split())
+
+            # 1. Keyword overlap score
+            keyword_overlap = (
+                len(query_words & content_words) / len(query_words)
+                if query_words
+                else 0
+            )
+
+            # 2. Content density score (information per word)
+            word_count = len(content_words)
+            density_score = min(1.0, word_count / 100)  # Normalize to reasonable length
+
+            # 3. Position bonus (earlier documents often more relevant)
+            position_bonus = max(0.1, 1.0 - (i * 0.1))
+
+            # 4. Original similarity score if available
+            original_score = (
+                doc.get("similarity_score", 0.5) if isinstance(doc, dict) else 0.5
+            )
+
+            # Combine scores
+            relevance_score = (
+                0.4 * keyword_overlap
+                + 0.2 * density_score
+                + 0.1 * position_bonus
+                + 0.3 * original_score
+            )
+
+            # Apply relevance threshold
+            if relevance_score >= self.relevance_threshold:
+                scored_passages.append(
+                    {
+                        "document": doc,
+                        "content": content,
+                        "relevance_score": relevance_score,
+                        "keyword_overlap": keyword_overlap,
+                        "original_index": i,
+                        "token_count": len(content.split())
+                        * 1.3,  # Rough token estimate
+                    }
+                )
+
+        # Sort by relevance score
+        scored_passages.sort(key=lambda x: x["relevance_score"], reverse=True)
+        return scored_passages
+
+    def _select_optimal_passages(
+        self, scored_passages: list, target_tokens: int
+    ) -> list:
+        """Select optimal passages within token budget."""
+        if not scored_passages:
+            return []
+
+        selected = []
+        total_tokens = 0
+        diversity_threshold = 0.8
+
+        for passage in scored_passages:
+            passage_tokens = passage["token_count"]
+
+            # Check token budget
+            if total_tokens + passage_tokens > target_tokens:
+                # Try to fit partial content if it's high value
+                if passage["relevance_score"] > 0.9 and len(selected) < 3:
+                    remaining_tokens = target_tokens - total_tokens
+                    if remaining_tokens > 50:  # Minimum useful content
+                        # Truncate passage to fit
+                        truncated_content = self._truncate_passage(
+                            passage["content"], remaining_tokens
+                        )
+                        passage_copy = passage.copy()
+                        passage_copy["content"] = truncated_content
+                        passage_copy["token_count"] = remaining_tokens
+                        passage_copy["is_truncated"] = True
+                        selected.append(passage_copy)
+                        total_tokens = target_tokens
+                break
+
+            # Check diversity (avoid near-duplicate content)
+            is_diverse = True
+            for selected_passage in selected:
+                similarity = self._calculate_content_similarity(
+                    passage["content"], selected_passage["content"]
+                )
+                if similarity > diversity_threshold:
+                    is_diverse = False
+                    break
+
+            if is_diverse:
+                selected.append(passage)
+                total_tokens += passage_tokens
+
+        return selected
+
+    def _compress_selected_content(self, query: str, selected_passages: list) -> str:
+        """Compress selected passages into coherent context."""
+        if not selected_passages:
+            return ""
+
+        # For now, use extractive summarization (concatenate most relevant parts)
+        if self.compression_strategy == "extractive_summarization":
+            return self._extractive_compression(query, selected_passages)
+        elif self.compression_strategy == "abstractive_synthesis":
+            return self._abstractive_compression(query, selected_passages)
+        elif self.compression_strategy == "hierarchical_organization":
+            return self._hierarchical_compression(query, selected_passages)
+        else:
+            # Default to extractive
+            return self._extractive_compression(query, selected_passages)
+
+    def _extractive_compression(self, query: str, passages: list) -> str:
+        """Extract and concatenate the most relevant sentences."""
+        compressed_parts = []
+        query_words = set(query.lower().split())
+
+        for passage in passages:
+            content = passage["content"]
+
+            # Split into sentences
+            sentences = self._split_into_sentences(content)
+
+            # Score each sentence for relevance
+            sentence_scores = []
+            for sentence in sentences:
+                sentence_words = set(sentence.lower().split())
+                overlap = (
+                    len(query_words & sentence_words) / len(query_words)
+                    if query_words
+                    else 0
+                )
+                sentence_scores.append((sentence, overlap))
+
+            # Sort by relevance and take top sentences
+            sentence_scores.sort(key=lambda x: x[1], reverse=True)
+            top_sentences = [
+                s[0] for s in sentence_scores[:3]
+            ]  # Top 3 sentences per passage
+
+            if top_sentences:
+                compressed_parts.append(" ".join(top_sentences))
+
+        return "\n\n".join(compressed_parts)
+
+    def _abstractive_compression(self, query: str, passages: list) -> str:
+        """Create abstractive summary (simplified version)."""
+        # In a real implementation, this would use an LLM
+        # For now, create a structured summary
+        key_points = []
+
+        for passage in passages:
+            content = passage["content"]
+            # Extract key phrases (simplified)
+            sentences = self._split_into_sentences(content)
+            if sentences:
+                # Take first and last sentence as key points
+                key_points.append(sentences[0])
+                if len(sentences) > 1:
+                    key_points.append(sentences[-1])
+
+        return f"Summary for query '{query}':\n" + "\n".join(
+            f"• {point}" for point in key_points[:10]
+        )
+
+    def _hierarchical_compression(self, query: str, passages: list) -> str:
+        """Organize information hierarchically."""
+        organized_content = {
+            "primary_information": [],
+            "supporting_details": [],
+            "additional_context": [],
+        }
+
+        for i, passage in enumerate(passages):
+            content = passage["content"]
+            relevance = passage["relevance_score"]
+
+            if relevance > 0.8:
+                organized_content["primary_information"].append(content)
+            elif relevance > 0.6:
+                organized_content["supporting_details"].append(content)
+            else:
+                organized_content["additional_context"].append(content)
+
+        result_parts = []
+
+        if organized_content["primary_information"]:
+            result_parts.append("PRIMARY INFORMATION:")
+            result_parts.extend(organized_content["primary_information"])
+
+        if organized_content["supporting_details"]:
+            result_parts.append("\nSUPPORTING DETAILS:")
+            result_parts.extend(organized_content["supporting_details"])
+
+        if organized_content["additional_context"]:
+            result_parts.append("\nADDITIONAL CONTEXT:")
+            result_parts.extend(
+                organized_content["additional_context"][:2]
+            )  # Limit additional context
+
+        return "\n".join(result_parts)
+
+    def _split_into_sentences(self, text: str) -> list:
+        """Split text into sentences (simplified)."""
+        import re
+
+        sentences = re.split(r"[.!?]+", text)
+        return [s.strip() for s in sentences if s.strip()]
+
+    def _calculate_content_similarity(self, content1: str, content2: str) -> float:
+        """Calculate Jaccard similarity between two content pieces."""
+        words1 = set(content1.lower().split())
+        words2 = set(content2.lower().split())
+
+        if not words1 or not words2:
+            return 0.0
+
+        intersection = len(words1 & words2)
+        union = len(words1 | words2)
+
+        return intersection / union if union > 0 else 0.0
+
+    def _truncate_passage(self, content: str, max_tokens: int) -> str:
+        """Intelligently truncate passage to fit token budget."""
+        words = content.split()
+        target_words = int(max_tokens / 1.3)  # Rough token-to-word ratio
+
+        if len(words) <= target_words:
+            return content
+
+        # Try to end at sentence boundary
+        truncated_words = words[:target_words]
+        truncated_text = " ".join(truncated_words)
+
+        # Find last sentence boundary
+        last_sentence_end = max(
+            truncated_text.rfind("."),
+            truncated_text.rfind("!"),
+            truncated_text.rfind("?"),
+        )
+
+        if (
+            last_sentence_end > len(truncated_text) * 0.7
+        ):  # If we can preserve most content
+            return truncated_text[: last_sentence_end + 1]
+        else:
+            return truncated_text + "..."
+
+    def _generate_compression_metadata(
+        self, original_docs: list, selected_passages: list, compressed_context: str
+    ) -> dict:
+        """Generate metadata about the compression process."""
+        original_length = sum(
+            len(doc.get("content", "") if isinstance(doc, dict) else str(doc))
+            for doc in original_docs
+        )
+        compressed_length = len(compressed_context)
+
+        return {
+            "original_document_count": len(original_docs),
+            "selected_passage_count": len(selected_passages),
+            "original_char_count": original_length,
+            "compressed_char_count": compressed_length,
+            "compression_ratio": (
+                compressed_length / original_length if original_length > 0 else 0
+            ),
+            "avg_relevance_score": (
+                sum(p["relevance_score"] for p in selected_passages)
+                / len(selected_passages)
+                if selected_passages
+                else 0
+            ),
+            "compression_strategy": self.compression_strategy,
+            "token_budget": self.max_tokens,
+            "passages_truncated": sum(
+                1 for p in selected_passages if p.get("is_truncated", False)
+            ),
+        }
+
+
 # Backward compatibility aliases
 Filter = FilterNode