crprotocol 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. crp/__init__.py +126 -0
  2. crp/__main__.py +8 -0
  3. crp/_typing.py +27 -0
  4. crp/_version.py +5 -0
  5. crp/adapters.py +31 -0
  6. crp/advanced/__init__.py +40 -0
  7. crp/advanced/auto_ingest.py +400 -0
  8. crp/advanced/cqs.py +235 -0
  9. crp/advanced/cross_window.py +477 -0
  10. crp/advanced/curator.py +265 -0
  11. crp/advanced/feedback.py +146 -0
  12. crp/advanced/hierarchical.py +211 -0
  13. crp/advanced/meta_learning.py +401 -0
  14. crp/advanced/parallel.py +98 -0
  15. crp/advanced/review_cycle.py +329 -0
  16. crp/advanced/scale_mode.py +129 -0
  17. crp/advanced/source_grounding.py +207 -0
  18. crp/ckf/__init__.py +35 -0
  19. crp/ckf/community.py +377 -0
  20. crp/ckf/fabric.py +445 -0
  21. crp/ckf/gc.py +175 -0
  22. crp/ckf/graph_walk.py +87 -0
  23. crp/ckf/merge.py +133 -0
  24. crp/ckf/pattern_query.py +122 -0
  25. crp/ckf/pubsub.py +128 -0
  26. crp/ckf/semantic.py +207 -0
  27. crp/cli/__init__.py +7 -0
  28. crp/cli/main.py +329 -0
  29. crp/cli/sidecar.py +929 -0
  30. crp/cli/startup.py +272 -0
  31. crp/continuation/__init__.py +103 -0
  32. crp/continuation/completion.py +348 -0
  33. crp/continuation/degradation.py +157 -0
  34. crp/continuation/document_map.py +160 -0
  35. crp/continuation/flow.py +109 -0
  36. crp/continuation/gap.py +419 -0
  37. crp/continuation/manager.py +484 -0
  38. crp/continuation/quality_monitor.py +179 -0
  39. crp/continuation/stitch.py +419 -0
  40. crp/continuation/trigger.py +142 -0
  41. crp/continuation/voice.py +157 -0
  42. crp/core/__init__.py +69 -0
  43. crp/core/batch.py +77 -0
  44. crp/core/circuit_breaker.py +116 -0
  45. crp/core/config.py +377 -0
  46. crp/core/context_tools.py +540 -0
  47. crp/core/dispatch_router.py +3977 -0
  48. crp/core/errors.py +128 -0
  49. crp/core/extraction_facade.py +384 -0
  50. crp/core/facilitator.py +713 -0
  51. crp/core/idempotency.py +215 -0
  52. crp/core/orchestrator.py +1435 -0
  53. crp/core/relay_strategies.py +613 -0
  54. crp/core/security_manager.py +140 -0
  55. crp/core/session.py +134 -0
  56. crp/core/task_intent.py +36 -0
  57. crp/core/window.py +363 -0
  58. crp/envelope/__init__.py +30 -0
  59. crp/envelope/builder.py +288 -0
  60. crp/envelope/decomposer.py +236 -0
  61. crp/envelope/formatter.py +168 -0
  62. crp/envelope/packer.py +211 -0
  63. crp/envelope/reranker.py +209 -0
  64. crp/envelope/scoring.py +310 -0
  65. crp/extraction/__init__.py +45 -0
  66. crp/extraction/complexity.py +96 -0
  67. crp/extraction/contradiction.py +132 -0
  68. crp/extraction/pipeline.py +360 -0
  69. crp/extraction/quality_gate.py +237 -0
  70. crp/extraction/stage1_regex.py +173 -0
  71. crp/extraction/stage2_statistical.py +244 -0
  72. crp/extraction/stage3_gliner.py +210 -0
  73. crp/extraction/stage4_uie.py +183 -0
  74. crp/extraction/stage5_discourse.py +175 -0
  75. crp/extraction/stage6_llm.py +178 -0
  76. crp/extraction/structured_output.py +219 -0
  77. crp/extraction/types.py +299 -0
  78. crp/license_guard.py +722 -0
  79. crp/observability/__init__.py +30 -0
  80. crp/observability/audit.py +118 -0
  81. crp/observability/events.py +233 -0
  82. crp/observability/metrics.py +264 -0
  83. crp/observability/quality.py +135 -0
  84. crp/observability/structured_logging.py +81 -0
  85. crp/observability/telemetry.py +117 -0
  86. crp/provenance/__init__.py +314 -0
  87. crp/provenance/_embeddings.py +97 -0
  88. crp/provenance/_types.py +378 -0
  89. crp/provenance/attribution_scorer.py +252 -0
  90. crp/provenance/claim_detector.py +229 -0
  91. crp/provenance/contradiction_detector.py +243 -0
  92. crp/provenance/distortion_detector.py +397 -0
  93. crp/provenance/entailment_verifier.py +358 -0
  94. crp/provenance/fabrication_detector.py +203 -0
  95. crp/provenance/hallucination_scorer.py +320 -0
  96. crp/provenance/omission_analyzer.py +106 -0
  97. crp/provenance/provenance_chain.py +205 -0
  98. crp/provenance/report_generator.py +440 -0
  99. crp/providers/__init__.py +43 -0
  100. crp/providers/anthropic.py +270 -0
  101. crp/providers/base.py +135 -0
  102. crp/providers/custom.py +63 -0
  103. crp/providers/diagnostic.py +251 -0
  104. crp/providers/llamacpp.py +224 -0
  105. crp/providers/manager.py +139 -0
  106. crp/providers/ollama.py +243 -0
  107. crp/providers/openai.py +628 -0
  108. crp/providers/tokenizers.py +48 -0
  109. crp/py.typed +0 -0
  110. crp/resources/__init__.py +53 -0
  111. crp/resources/adaptive_allocator.py +525 -0
  112. crp/resources/cost_model.py +388 -0
  113. crp/resources/overhead_manager.py +217 -0
  114. crp/resources/resource_manager.py +262 -0
  115. crp/schemas/__init__.py +20 -0
  116. crp/schemas/cost-estimate.json +33 -0
  117. crp/schemas/crp-error.json +43 -0
  118. crp/schemas/envelope-preview.json +40 -0
  119. crp/schemas/persisted-state-header.json +27 -0
  120. crp/schemas/quality-report.json +94 -0
  121. crp/schemas/session-handle.json +33 -0
  122. crp/schemas/session-status.json +57 -0
  123. crp/schemas/stream-event.json +18 -0
  124. crp/schemas/task-intent.json +42 -0
  125. crp/security/__init__.py +93 -0
  126. crp/security/audit_trail.py +392 -0
  127. crp/security/binding.py +192 -0
  128. crp/security/compliance.py +813 -0
  129. crp/security/consent.py +593 -0
  130. crp/security/embedding_defense.py +161 -0
  131. crp/security/encryption.py +202 -0
  132. crp/security/injection.py +335 -0
  133. crp/security/integrity.py +267 -0
  134. crp/security/privacy.py +662 -0
  135. crp/security/quarantine.py +249 -0
  136. crp/security/rbac.py +221 -0
  137. crp/security/validation.py +164 -0
  138. crp/state/__init__.py +31 -0
  139. crp/state/cold_storage.py +258 -0
  140. crp/state/compaction.py +263 -0
  141. crp/state/critical_state.py +104 -0
  142. crp/state/event_log.py +313 -0
  143. crp/state/fact.py +189 -0
  144. crp/state/serialization.py +189 -0
  145. crp/state/session_cleanup.py +77 -0
  146. crp/state/snapshot.py +290 -0
  147. crp/state/warm_store.py +346 -0
  148. crprotocol-2.0.0.dist-info/METADATA +1295 -0
  149. crprotocol-2.0.0.dist-info/RECORD +153 -0
  150. crprotocol-2.0.0.dist-info/WHEEL +4 -0
  151. crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
  152. crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
  153. crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
crp/__init__.py ADDED
@@ -0,0 +1,126 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """
4
+ CRP — Context Relay Protocol.
5
+
6
+ Unbounded context · Unbounded generation · Amplified reasoning.
7
+
8
+ Usage::
9
+
10
+ import crp
11
+
12
+ client = crp.Client(provider=my_provider)
13
+ result = client.dispatch("You are helpful.", "Explain CRP.")
14
+ print(result.output)
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from crp._version import __version__
20
+ from crp.core.config import CRPConfig, ConfigurationResolver
21
+
22
+ # ── License guard — advisory IP protection checks (§ELv2) ──
23
+ from crp.license_guard import _startup_check as _license_startup_check
24
+ _license_startup_check()
25
+
26
+ from crp.core.errors import (
27
+ CRPError,
28
+ BudgetExhaustedError,
29
+ ChainVerificationFailedError,
30
+ ErrorCode,
31
+ ProviderError,
32
+ ProviderTimeoutError,
33
+ RateLimitExceededError,
34
+ SecurityInvariantError,
35
+ SessionClosedError,
36
+ SessionExpiredError,
37
+ SignatureInvalidError,
38
+ StateCorruptedError,
39
+ ValidationError,
40
+ )
41
+ from crp.core.orchestrator import CRPOrchestrator, ExtractionResult, StreamEvent
42
+ from crp.core.session import (
43
+ CostEstimate,
44
+ QualityReport,
45
+ SessionHandle,
46
+ SessionStatus,
47
+ )
48
+ from crp.core.task_intent import TaskIntent
49
+
50
+ # Convenience alias — spec §9.1 says ``import crp; client = crp.Client(...)``
51
+ Client = CRPOrchestrator
52
+
53
+ # Lazy imports for advanced types — avoids pulling heavy subsystems on ``import crp``
54
def __getattr__(name: str):
    """Resolve a lazily exported advanced name on first access (PEP 562).

    The heavy subsystems are only imported when one of the names below is
    actually requested from the ``crp`` package.  Once resolved, the object
    is stored in the module globals so later lookups are ordinary attribute
    hits and never re-enter this hook.

    Args:
        name: Attribute being looked up on the ``crp`` module.

    Returns:
        The object called ``name`` from the module that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    _ADVANCED = {
        "CKFConfig": "crp.ckf.fabric",
        "CKFHealth": "crp.ckf.fabric",
        "ContextualKnowledgeFabric": "crp.ckf.fabric",
        "ContinuationConfig": "crp.continuation.manager",
        "ContinuationManager": "crp.continuation.manager",
        "CriticalState": "crp.state.critical_state",
        "StructuralState": "crp.state.critical_state",
        "EnvelopePreview": "crp.core.session",
        "EnvelopeResult": "crp.envelope.builder",
        "EnvelopeState": "crp.envelope.builder",
        "ExtractionPipeline": "crp.extraction.pipeline",
        "Fact": "crp.extraction.types",
        "FactEdge": "crp.extraction.types",
        "FactGraph": "crp.extraction.types",
        "WarmStateStore": "crp.state.warm_store",
        "WarmStoreConfig": "crp.state.warm_store",
    }
    module_path = _ADVANCED.get(name)
    if module_path is None:
        raise AttributeError(f"module 'crp' has no attribute {name!r}")

    import importlib

    value = getattr(importlib.import_module(module_path), name)
    # Cache the resolved object: module-level __getattr__ is only consulted
    # when normal lookup fails, so this makes every later access free.
    globals()[name] = value
    return value
78
+
79
+
80
+ __all__ = [
81
+ # Core public API
82
+ "__version__",
83
+ "Client",
84
+ "CRPOrchestrator",
85
+ "CRPConfig",
86
+ "ConfigurationResolver",
87
+ "TaskIntent",
88
+ # Error types (§audit L2)
89
+ "CRPError",
90
+ "ErrorCode",
91
+ "BudgetExhaustedError",
92
+ "ChainVerificationFailedError",
93
+ "ProviderError",
94
+ "ProviderTimeoutError",
95
+ "RateLimitExceededError",
96
+ "SecurityInvariantError",
97
+ "SessionClosedError",
98
+ "SessionExpiredError",
99
+ "SignatureInvalidError",
100
+ "StateCorruptedError",
101
+ "ValidationError",
102
+ # Results
103
+ "QualityReport",
104
+ "CostEstimate",
105
+ "SessionHandle",
106
+ "SessionStatus",
107
+ "StreamEvent",
108
+ "ExtractionResult",
109
+ # Advanced (lazy-loaded)
110
+ "CKFConfig",
111
+ "CKFHealth",
112
+ "ContextualKnowledgeFabric",
113
+ "ContinuationConfig",
114
+ "ContinuationManager",
115
+ "CriticalState",
116
+ "EnvelopePreview",
117
+ "EnvelopeResult",
118
+ "EnvelopeState",
119
+ "ExtractionPipeline",
120
+ "Fact",
121
+ "FactEdge",
122
+ "FactGraph",
123
+ "StructuralState",
124
+ "WarmStateStore",
125
+ "WarmStoreConfig",
126
+ ]
crp/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Allow ``python -m crp`` to invoke the CLI."""
4
+
5
+ from crp.cli.main import cli
6
+
7
+ if __name__ == "__main__":
8
+ cli()
crp/_typing.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Shared type aliases and protocols for CRP (§audit L5)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, TypeAlias
8
+ from collections.abc import Callable
9
+
10
+ # ── Type aliases ──────────────────────────────────────────────────────────
11
+ JSON: TypeAlias = dict[str, Any]
12
+ FactID: TypeAlias = str
13
+ WindowID: TypeAlias = str
14
+ SessionID: TypeAlias = str
15
+ TokenCount: TypeAlias = int
16
+ EmbeddingVector: TypeAlias = list[float]
17
+ EmbeddingFn: TypeAlias = Callable[[list[str]], list[EmbeddingVector]]
18
+
19
+ # ── Dispatch return types (§audit4 CQ-H2) ────────────────────────────────
20
+ # The various dispatch strategies intentionally return different types:
21
+ # dispatch() → tuple[str, QualityReport]
22
+ # dispatch_hierarchical() → tuple[list[str], QualityReport]
23
+ # dispatch_stream() → Generator[StreamEvent, None, None]
24
+ # dispatch_stream_augmented → tuple[str, QualityReport]
25
+ # A unified DispatchResult would break tuple unpacking at all call sites.
26
+ # The common pattern is (output, QualityReport) where output varies by
27
+ # strategy. See CRPOrchestrator method docstrings for specifics.
crp/_version.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """CRP — Context Relay Protocol SDK."""
4
+
5
+ __version__ = "2.0.0"
crp/adapters.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Compatibility alias — ``from crp.adapters import ...``
4
+
5
+ This module re-exports everything from :mod:`crp.providers` so that
6
+ documentation examples using ``crp.adapters`` work unchanged.
7
+
8
+ Canonical location is ``crp.providers``.
9
+ """
10
+
11
+ from crp.providers import ( # noqa: F401
12
+ AnthropicAdapter,
13
+ BaseAdapter,
14
+ CallableAdapter,
15
+ CustomProvider,
16
+ LlamaCppAdapter,
17
+ LLMProvider,
18
+ OllamaAdapter,
19
+ OpenAIAdapter,
20
+ )
21
+
22
+ __all__ = [
23
+ "LLMProvider",
24
+ "BaseAdapter",
25
+ "OpenAIAdapter",
26
+ "AnthropicAdapter",
27
+ "OllamaAdapter",
28
+ "LlamaCppAdapter",
29
+ "CustomProvider",
30
+ "CallableAdapter",
31
+ ]
@@ -0,0 +1,40 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Advanced features — hierarchical, parallel, auto-ingest, CQS, meta-learning."""
4
+
5
+ from crp.advanced.auto_ingest import IngestFact, IngestResult, auto_ingest
6
+ from crp.advanced.cqs import ContextHungerSignal, CQSDetector, CQSResponse
7
+ from crp.advanced.cross_window import ConsistencyIssue, CrossWindowValidator, ValidationResult
8
+ from crp.advanced.curator import CurationConfig, LLMContextCurator, LLMSynthesis
9
+ from crp.advanced.feedback import FeedbackEntry, FeedbackLoop
10
+ from crp.advanced.hierarchical import HierarchicalPlan, HierarchicalProcessor
11
+ from crp.advanced.meta_learning import MetaLearningEngine, ORCResult, ReasoningTrace
12
+ from crp.advanced.parallel import FanOutResult, FanOutTask, ParallelFanOut
13
+ from crp.advanced.review_cycle import AssessmentResult, ReviewCycleManager, ReviewGuidance
14
+ from crp.advanced.scale_mode import QualityTier, ScaleModeSelector, SessionConfig
15
+ from crp.advanced.source_grounding import SourceGroundingEngine, SourcePassage
16
+
17
+ __all__ = [
18
+ # auto_ingest
19
+ "auto_ingest", "IngestResult", "IngestFact",
20
+ # cqs
21
+ "CQSDetector", "ContextHungerSignal", "CQSResponse",
22
+ # cross_window
23
+ "CrossWindowValidator", "ConsistencyIssue", "ValidationResult",
24
+ # curator
25
+ "LLMContextCurator", "LLMSynthesis", "CurationConfig",
26
+ # feedback
27
+ "FeedbackLoop", "FeedbackEntry",
28
+ # hierarchical
29
+ "HierarchicalProcessor", "HierarchicalPlan",
30
+ # meta_learning
31
+ "MetaLearningEngine", "ReasoningTrace", "ORCResult",
32
+ # parallel
33
+ "ParallelFanOut", "FanOutTask", "FanOutResult",
34
+ # review_cycle
35
+ "ReviewCycleManager", "ReviewGuidance", "AssessmentResult",
36
+ # scale_mode
37
+ "ScaleModeSelector", "QualityTier", "SessionConfig",
38
+ # source_grounding
39
+ "SourceGroundingEngine", "SourcePassage",
40
+ ]
@@ -0,0 +1,400 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Auto-ingest — oversized input handling with structure-aware chunking (§4.6).
4
+
5
+ Triggers when system_tokens + task_tokens > context_window - gen_reserve.
6
+ Zero LLM cost by default: uses graduated extraction (stages 1-5) per chunk,
7
+ then reconciles boundary duplicates/complements via embedding similarity.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ import re
14
+ import uuid
15
+ from collections.abc import Callable
16
+ from dataclasses import dataclass, field
17
+ from typing import Any
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Constants (§4.6)
21
+ # ---------------------------------------------------------------------------
22
+
23
# Tokens held back from the chunk budget for envelope framing.
ENVELOPE_OVERHEAD_RESERVE = 500

# Reconciliation thresholds: similarity above DUP_SIMILARITY is treated as a
# duplicate; similarity above COMPLEMENT_SIMILARITY together with a word
# overlap above TOKEN_OVERLAP_THRESHOLD is treated as a complement to merge.
DUP_SIMILARITY = 0.95
COMPLEMENT_SIMILARITY = 0.75
TOKEN_OVERLAP_THRESHOLD = 0.3

# Patterns for structures the chunker should not cut through.
_CODE_BLOCK = re.compile(r"```[\s\S]*?```", re.MULTILINE)  # fenced code block
_TABLE_ROW = re.compile(r"^\|.+\|$", re.MULTILINE)  # one pipe-delimited row
_JSON_BLOCK = re.compile(r"\{[^}]{50,}\}", re.DOTALL)  # brace block, >= 50 chars inside
_NUMBERED_LIST = re.compile(r"(?:^|\n)\d+\.\s.+(?:\n\d+\.\s.+){2,}", re.MULTILINE)  # >= 3 items
33
+
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Data types
37
+ # ---------------------------------------------------------------------------
38
+
39
+
40
@dataclass
class ProtectedSpan:
    """Character range of the input that should be kept in one piece."""

    # Offsets into the source text; the range is tested as start <= pos < end.
    start: int
    end: int
    # "code_block" | "table" | "json_block" | "numbered_list"
    span_type: str
47
+
48
+
49
@dataclass
class Chunk:
    """A single slice of the oversized input produced by the splitter."""

    # Zero-based position of this chunk in the chunk list.
    index: int
    # The slice of the input covered by this chunk.
    text: str
    # Character offsets of the slice within the original input.
    offset_start: int
    offset_end: int
    # Token count of ``text``; left at 0 until a caller fills it in.
    token_count: int = 0
58
+
59
+
60
@dataclass
class IngestResult:
    """Summary statistics reported by ``auto_ingest()``."""

    # Number of chunks the input was split into.
    chunks_created: int = 0
    # Facts produced across all chunks, before reconciliation.
    facts_extracted: int = 0
    # Facts remaining once duplicates/complements were reconciled.
    facts_after_reconciliation: int = 0
    # Replacement task text describing the ingested material.
    synthesized_task: str = ""
    # True when the raw input was handed to the storage callback.
    raw_stored: bool = False
69
+
70
+
71
@dataclass
class IngestFact:
    """Lightweight fact record produced by per-chunk extraction."""

    # Random UUID4 string, generated per instance unless supplied.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    text: str = ""
    confidence: float = 0.0
    # Which chunk the fact came from, plus that chunk's character offsets.
    chunk_index: int = 0
    chunk_offset_start: int = 0
    chunk_offset_end: int = 0
    # Free-form origin label.
    source: str = ""
    # Per-instance dict for arbitrary extra data.
    metadata: dict[str, Any] = field(default_factory=dict)
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Protected structure detection
87
+ # ---------------------------------------------------------------------------
88
+
89
+
90
def detect_protected_structures(text: str) -> list[ProtectedSpan]:
    """Locate code blocks, tables, JSON blocks and numbered lists in ``text``.

    Table rows are grouped into runs: a row joins the current run when it
    starts at most 2 characters after the run's end, and a run is only kept
    when it spans more than 20 characters.  All findings are passed through
    ``merge_overlapping_spans`` before being returned.
    """
    found: list[ProtectedSpan] = [
        ProtectedSpan(hit.start(), hit.end(), "code_block")
        for hit in _CODE_BLOCK.finditer(text)
    ]

    # Tables: walk the rows once, carrying the current run as [start, end].
    run: list[int] | None = None
    for row in _TABLE_ROW.finditer(text):
        if run is not None and row.start() - run[1] <= 2:
            run[1] = row.end()
            continue
        if run is not None and run[1] - run[0] > 20:
            found.append(ProtectedSpan(run[0], run[1], "table"))
        run = [row.start(), row.end()]
    if run is not None and run[1] - run[0] > 20:
        found.append(ProtectedSpan(run[0], run[1], "table"))

    found.extend(
        ProtectedSpan(hit.start(), hit.end(), "json_block")
        for hit in _JSON_BLOCK.finditer(text)
    )
    found.extend(
        ProtectedSpan(hit.start(), hit.end(), "numbered_list")
        for hit in _NUMBERED_LIST.finditer(text)
    )
    return merge_overlapping_spans(found)
116
+
117
+
118
def merge_overlapping_spans(spans: list[ProtectedSpan]) -> list[ProtectedSpan]:
    """Merge overlapping or touching protected spans.

    Spans are ordered by ``start``; a span whose start is at or before the
    end of the previously kept span is folded into it (the end is extended
    and the type labels are joined with ``+``).

    The input spans are never modified: the returned list holds shallow
    copies, so callers can keep using their original span objects.

    Args:
        spans: Spans in any order.

    Returns:
        A new list of non-overlapping spans sorted by ``start``.
    """
    if not spans:
        return []

    import copy

    merged: list[ProtectedSpan] = []
    for span in sorted(spans, key=lambda s: s.start):
        last = merged[-1] if merged else None
        if last is not None and span.start <= last.end:
            last.end = max(last.end, span.end)
            last.span_type = f"{last.span_type}+{span.span_type}"
        else:
            # Copy before keeping, because later merges mutate this entry.
            merged.append(copy.copy(span))
    return merged
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Structure-aware splitting
136
+ # ---------------------------------------------------------------------------
137
+
138
+
139
def _in_protected(pos: int, spans: list[ProtectedSpan]) -> bool:
    """Return True when ``pos`` falls inside any span (start <= pos < end)."""
    return any(s.start <= pos < s.end for s in spans)


def _find_split_point(
    text: str, start: int, end: int, spans: list[ProtectedSpan],
) -> int:
    """Find the best split point in ``(start, end]`` following priority order.

    Priorities, highest first: heading break, paragraph break, end of a
    protected structure that is followed by another one, sentence boundary,
    line break, word boundary.  Within a priority level the LATEST candidate
    is chosen so the chunk uses as much of the window as possible; taking
    the earliest one yields tiny chunks.  Candidates equal to ``start`` are
    ignored because a split there would produce an empty chunk.

    Args:
        text: Full input text.
        start: First offset of the search window.
        end: Offset one past the search window.
        spans: Protected spans; no split is placed inside one.

    Returns:
        The chosen split offset, or ``end`` when no candidate qualifies.
    """
    region = text[start:end]

    def _latest(pattern: str, shift: int = 0) -> int | None:
        # Last match of ``pattern`` that is past ``start`` and unprotected.
        best = None
        for m in re.finditer(pattern, region):
            pos = start + m.start() + shift
            if pos > start and not _in_protected(pos, spans):
                best = pos
        return best

    # Priority 1: heading break, priority 2: paragraph break
    for pattern in (r"\n# ", r"\n\n"):
        pos = _latest(pattern)
        if pos is not None:
            return pos

    # Priority 3: between protected structures (the last span has no
    # following structure, so its end is not a candidate).
    gap_starts = [
        s.end for s in spans[:-1]
        if start < s.end < end and not _in_protected(s.end, spans)
    ]
    if gap_starts:
        return max(gap_starts)

    # Priority 4: sentence boundary (split right after the period),
    # priority 5: line break, priority 6: word boundary (last resort)
    for pattern, shift in ((r"\.\s", 1), (r"\n", 0), (r" ", 0)):
        pos = _latest(pattern, shift)
        if pos is not None:
            return pos
    return end
179
+
180
+
181
def split_at_boundaries(
    text: str,
    chunk_size_chars: int,
    overlap_chars: int,
    protected_spans: list[ProtectedSpan],
) -> list[Chunk]:
    """Split text into chunks respecting protected structures.

    Each chunk is at most ``chunk_size_chars`` long and begins up to
    ``overlap_chars`` before the end of the previous chunk.  Split points
    are requested from ``_find_split_point`` and only searched for beyond
    the end of the previous chunk, so every chunk contributes new text and
    the loop always terminates.  The last chunk ends exactly at the end of
    ``text``; no trailing overlap-only chunks are produced.

    Args:
        text: Input to split.
        chunk_size_chars: Maximum chunk length; values below 1 are treated
            as 1.
        overlap_chars: Desired overlap between neighbours; clamped to the
            range ``[0, chunk size - 1]``.
        protected_spans: Regions that should not be cut through.

    Returns:
        Chunks in input order; an empty list for empty ``text``.
    """
    chunks: list[Chunk] = []
    text_len = len(text)
    if text_len == 0:
        return chunks

    size = max(1, chunk_size_chars)
    overlap = min(max(0, overlap_chars), size - 1)

    pos = 0      # start offset of the chunk being built
    covered = 0  # end offset of the previous chunk (text already emitted)
    while True:
        end = min(pos + size, text_len)
        split = end  # Fallback: force split at the window end
        if end < text_len:
            candidate = _find_split_point(text, covered, end, protected_spans)
            if covered < candidate <= end:
                split = candidate

        chunks.append(Chunk(
            index=len(chunks),
            text=text[pos:split],
            offset_start=pos,
            offset_end=split,
        ))
        if split >= text_len:
            break
        covered = split
        # Step back for the overlap, but never before the current start.
        pos = max(pos, split - overlap)

    return chunks
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Boundary reconciliation
217
+ # ---------------------------------------------------------------------------
218
+
219
+
220
def _text_overlap_ratio(a: str, b: str) -> float:
    """Share of the smaller text's distinct lowercase words found in the other.

    Returns 0.0 when either text contains no words.
    """
    vocab_a = {tok for tok in a.lower().split()}
    vocab_b = {tok for tok in b.lower().split()}
    if not (vocab_a and vocab_b):
        return 0.0
    shared = len(vocab_a.intersection(vocab_b))
    smaller = min(len(vocab_a), len(vocab_b))
    return shared / smaller
228
+
229
+
230
def _cosine_sim(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two equal-length vectors.

    Returns 0.0 for empty vectors, for vectors of different length, and
    when either vector has zero magnitude.
    """
    if len(a) != len(b) or not a:
        return 0.0
    dot_product = sum(x * y for x, y in zip(a, b))
    magnitude_a = math.sqrt(sum(x * x for x in a))
    magnitude_b = math.sqrt(sum(x * x for x in b))
    if not (magnitude_a > 0 and magnitude_b > 0):
        return 0.0
    return dot_product / (magnitude_a * magnitude_b)
238
+
239
+
240
def reconcile_chunk_boundaries(
    per_chunk_facts: list[list[IngestFact]],
    embedding_fn: Callable[[str], list[float]] | None = None,
) -> list[IngestFact]:
    """Deduplicate/merge facts at chunk boundaries.

    Facts of the first chunk are kept as-is; every later fact is compared
    against the facts kept so far, stopping at the first decisive match:

    - similarity > DUP_SIMILARITY -> duplicate -> skip
    - similarity > COMPLEMENT_SIMILARITY AND word overlap >
      TOKEN_OVERLAP_THRESHOLD -> complement -> merge into the kept fact
    - Otherwise -> new fact -> keep

    Similarity is the cosine similarity of ``embedding_fn`` outputs when an
    embedding function is given, otherwise the word-overlap ratio.
    Embeddings are memoized per distinct text, so ``embedding_fn`` is called
    at most once per text instead of twice per comparison; this assumes
    ``embedding_fn`` is deterministic.

    Note: a merge updates ``text`` and ``confidence`` of the kept fact in
    place, and kept facts are the same objects that were passed in.

    Args:
        per_chunk_facts: One list of facts per chunk, in chunk order.
        embedding_fn: Optional text -> vector function.

    Returns:
        The reconciled facts (a new list).
    """
    if not per_chunk_facts:
        return []
    result = list(per_chunk_facts[0])

    embeddings: dict[str, list[float]] = {}

    def _embed(text: str) -> list[float]:
        # Keyed by text, so a merged (changed) text is embedded afresh.
        if text not in embeddings:
            embeddings[text] = embedding_fn(text)
        return embeddings[text]

    for chunk_facts in per_chunk_facts[1:]:
        for fact in chunk_facts:
            is_dup = False
            merge_target = None

            for existing in result:
                if embedding_fn:
                    sim = _cosine_sim(_embed(existing.text), _embed(fact.text))
                else:
                    sim = _text_overlap_ratio(existing.text, fact.text)

                if sim > DUP_SIMILARITY:
                    is_dup = True
                    break
                if sim > COMPLEMENT_SIMILARITY:
                    tok_overlap = _text_overlap_ratio(existing.text, fact.text)
                    if tok_overlap > TOKEN_OVERLAP_THRESHOLD:
                        merge_target = existing
                        break

            if is_dup:
                continue
            if merge_target is not None:
                merge_target.text = merge_fact_texts(merge_target.text, fact.text)
                merge_target.confidence = max(merge_target.confidence, fact.confidence)
            else:
                result.append(fact)

    return result
285
+
286
+
287
def merge_fact_texts(a: str, b: str) -> str:
    """Append to ``a`` the words of ``b`` that ``a`` does not already contain.

    Words are compared case-insensitively; ``a`` is returned unchanged when
    ``b`` adds nothing.
    """
    known = {word.lower() for word in a.split()}
    additions = [word for word in b.split() if word.lower() not in known]
    if not additions:
        return a
    return " ".join([a, *additions])
296
+
297
+
298
+ # ---------------------------------------------------------------------------
299
+ # Main auto_ingest function
300
+ # ---------------------------------------------------------------------------
301
+
302
+
303
def auto_ingest(
    system_prompt: str,
    task_input: str,
    task_intent_text: str,
    context_window: int,
    count_tokens: Callable[[str], int],
    extract_fn: Callable[[str, str], list[IngestFact]] | None = None,
    embedding_fn: Callable[[str], list[float]] | None = None,
    store_raw_fn: Callable[[str, str], None] | None = None,
    session_id: str = "",
) -> tuple[list[IngestFact], IngestResult]:
    """Handle oversized inputs with structure-aware chunking.

    Args:
        system_prompt: The system prompt (not modified).
        task_input: Raw oversized input text.
        task_intent_text: Short description of task intent.
        context_window: Total context window in tokens.
        count_tokens: Token counting function.
        extract_fn: Per-chunk fact extractor, called as
            ``extract_fn(chunk_text, task_intent_text)``. If None, each chunk
            yields a single fact holding its first 500 characters with
            confidence 0.5.
        embedding_fn: Optional embedding function for reconciliation.
        store_raw_fn: Optional function called as
            ``store_raw_fn(task_input, session_id)`` to store the raw input.
            Only invoked when ``session_id`` is non-empty.
        session_id: Current session ID.

    Returns:
        (reconciled_facts, ingest_result)
    """
    # Step 1: Compute available space
    sys_tokens = count_tokens(system_prompt)
    gen_reserve = max(context_window // 4, 1024)
    available_tokens = context_window - sys_tokens - gen_reserve
    # The budget must also cover the envelope reserve; otherwise the chunk
    # budget below would be zero or negative and chunking would degenerate.
    if available_tokens <= ENVELOPE_OVERHEAD_RESERVE:
        available_tokens = 1024  # Minimum sane value

    # Step 2: Detect protected structures
    protected_spans = detect_protected_structures(task_input)

    # Step 3: Chunk with structure-aware boundaries
    chunk_budget = available_tokens - ENVELOPE_OVERHEAD_RESERVE
    # Approximate chars-per-token ratio
    total_chars = len(task_input)
    total_tokens = count_tokens(task_input)
    chars_per_token = total_chars / max(total_tokens, 1)
    chunk_size_chars = max(1, int(chunk_budget * chars_per_token))
    overlap_chars = max(0, min(chunk_size_chars // 10, int(500 * chars_per_token)))

    chunks = split_at_boundaries(task_input, chunk_size_chars, overlap_chars, protected_spans)

    # Step 4: Per-chunk extraction (zero LLM)
    per_chunk_facts: list[list[IngestFact]] = []
    total_extracted = 0
    for chunk in chunks:
        chunk.token_count = count_tokens(chunk.text)
        if extract_fn:
            facts = extract_fn(chunk.text, task_intent_text)
        else:
            # Minimal fallback: treat entire chunk as one fact
            facts = [IngestFact(text=chunk.text[:500], confidence=0.5)]
        # Stamp provenance on every fact, whichever path produced it.
        source = f"input_chunk_{chunk.index + 1}_of_{len(chunks)}"
        for f in facts:
            f.chunk_index = chunk.index
            f.chunk_offset_start = chunk.offset_start
            f.chunk_offset_end = chunk.offset_end
            f.source = source
        per_chunk_facts.append(facts)
        total_extracted += len(facts)

    # Step 5: Boundary reconciliation
    reconciled = reconcile_chunk_boundaries(per_chunk_facts, embedding_fn)

    # Step 6: Store raw input in cold storage
    raw_stored = False
    if store_raw_fn and session_id:
        store_raw_fn(task_input, session_id)
        raw_stored = True

    # Step 7: Synthesize task
    synthesized = (
        f"Process the following material ({len(chunks)} sections ingested, "
        f"{len(reconciled)} facts extracted). Original request: "
        f"{task_intent_text[:500]}"
    )

    result = IngestResult(
        chunks_created=len(chunks),
        facts_extracted=total_extracted,
        facts_after_reconciliation=len(reconciled),
        synthesized_task=synthesized,
        raw_stored=raw_stored,
    )

    return reconciled, result