crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
crp/__init__.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""
|
|
4
|
+
CRP — Context Relay Protocol.
|
|
5
|
+
|
|
6
|
+
Unbounded context · Unbounded generation · Amplified reasoning.
|
|
7
|
+
|
|
8
|
+
Usage::
|
|
9
|
+
|
|
10
|
+
import crp
|
|
11
|
+
|
|
12
|
+
client = crp.Client(provider=my_provider)
|
|
13
|
+
result = client.dispatch("You are helpful.", "Explain CRP.")
|
|
14
|
+
print(result.output)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from crp._version import __version__
|
|
20
|
+
from crp.core.config import CRPConfig, ConfigurationResolver
|
|
21
|
+
|
|
22
|
+
# ── License guard — advisory IP protection checks (§ELv2) ──
|
|
23
|
+
from crp.license_guard import _startup_check as _license_startup_check
|
|
24
|
+
_license_startup_check()
|
|
25
|
+
|
|
26
|
+
from crp.core.errors import (
|
|
27
|
+
CRPError,
|
|
28
|
+
BudgetExhaustedError,
|
|
29
|
+
ChainVerificationFailedError,
|
|
30
|
+
ErrorCode,
|
|
31
|
+
ProviderError,
|
|
32
|
+
ProviderTimeoutError,
|
|
33
|
+
RateLimitExceededError,
|
|
34
|
+
SecurityInvariantError,
|
|
35
|
+
SessionClosedError,
|
|
36
|
+
SessionExpiredError,
|
|
37
|
+
SignatureInvalidError,
|
|
38
|
+
StateCorruptedError,
|
|
39
|
+
ValidationError,
|
|
40
|
+
)
|
|
41
|
+
from crp.core.orchestrator import CRPOrchestrator, ExtractionResult, StreamEvent
|
|
42
|
+
from crp.core.session import (
|
|
43
|
+
CostEstimate,
|
|
44
|
+
QualityReport,
|
|
45
|
+
SessionHandle,
|
|
46
|
+
SessionStatus,
|
|
47
|
+
)
|
|
48
|
+
from crp.core.task_intent import TaskIntent
|
|
49
|
+
|
|
50
|
+
# Convenience alias — spec §9.1 says ``import crp; client = crp.Client(...)``
|
|
51
|
+
Client = CRPOrchestrator
|
|
52
|
+
|
|
53
|
+
# Lazy imports for advanced types — avoids pulling heavy subsystems on ``import crp``
|
|
54
|
+
def __getattr__(name: str):
    """Resolve the advanced public names lazily, on first attribute access.

    This is the module-level attribute hook: it is consulted only for names
    that are not already bound in the package namespace.  Each known name
    maps to the submodule that defines it; that submodule is imported on
    demand and the attribute of the same name is returned.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names.
    """
    lazy_locations = {
        "CKFConfig": "crp.ckf.fabric",
        "CKFHealth": "crp.ckf.fabric",
        "ContextualKnowledgeFabric": "crp.ckf.fabric",
        "ContinuationConfig": "crp.continuation.manager",
        "ContinuationManager": "crp.continuation.manager",
        "CriticalState": "crp.state.critical_state",
        "StructuralState": "crp.state.critical_state",
        "EnvelopePreview": "crp.core.session",
        "EnvelopeResult": "crp.envelope.builder",
        "EnvelopeState": "crp.envelope.builder",
        "ExtractionPipeline": "crp.extraction.pipeline",
        "Fact": "crp.extraction.types",
        "FactEdge": "crp.extraction.types",
        "FactGraph": "crp.extraction.types",
        "WarmStateStore": "crp.state.warm_store",
        "WarmStoreConfig": "crp.state.warm_store",
    }
    module_path = lazy_locations.get(name)
    if module_path is None:
        raise AttributeError(f"module 'crp' has no attribute {name!r}")

    # Imported here so that a failed lookup never pays for the import.
    import importlib

    return getattr(importlib.import_module(module_path), name)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
__all__ = [
|
|
81
|
+
# Core public API
|
|
82
|
+
"__version__",
|
|
83
|
+
"Client",
|
|
84
|
+
"CRPOrchestrator",
|
|
85
|
+
"CRPConfig",
|
|
86
|
+
"ConfigurationResolver",
|
|
87
|
+
"TaskIntent",
|
|
88
|
+
# Error types (§audit L2)
|
|
89
|
+
"CRPError",
|
|
90
|
+
"ErrorCode",
|
|
91
|
+
"BudgetExhaustedError",
|
|
92
|
+
"ChainVerificationFailedError",
|
|
93
|
+
"ProviderError",
|
|
94
|
+
"ProviderTimeoutError",
|
|
95
|
+
"RateLimitExceededError",
|
|
96
|
+
"SecurityInvariantError",
|
|
97
|
+
"SessionClosedError",
|
|
98
|
+
"SessionExpiredError",
|
|
99
|
+
"SignatureInvalidError",
|
|
100
|
+
"StateCorruptedError",
|
|
101
|
+
"ValidationError",
|
|
102
|
+
# Results
|
|
103
|
+
"QualityReport",
|
|
104
|
+
"CostEstimate",
|
|
105
|
+
"SessionHandle",
|
|
106
|
+
"SessionStatus",
|
|
107
|
+
"StreamEvent",
|
|
108
|
+
"ExtractionResult",
|
|
109
|
+
# Advanced (lazy-loaded)
|
|
110
|
+
"CKFConfig",
|
|
111
|
+
"CKFHealth",
|
|
112
|
+
"ContextualKnowledgeFabric",
|
|
113
|
+
"ContinuationConfig",
|
|
114
|
+
"ContinuationManager",
|
|
115
|
+
"CriticalState",
|
|
116
|
+
"EnvelopePreview",
|
|
117
|
+
"EnvelopeResult",
|
|
118
|
+
"EnvelopeState",
|
|
119
|
+
"ExtractionPipeline",
|
|
120
|
+
"Fact",
|
|
121
|
+
"FactEdge",
|
|
122
|
+
"FactGraph",
|
|
123
|
+
"StructuralState",
|
|
124
|
+
"WarmStateStore",
|
|
125
|
+
"WarmStoreConfig",
|
|
126
|
+
]
|
crp/__main__.py
ADDED
crp/_typing.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Shared type aliases and protocols for CRP (§audit L5)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, TypeAlias
|
|
8
|
+
from collections.abc import Callable
|
|
9
|
+
|
|
10
|
+
# ── Type aliases ──────────────────────────────────────────────────────────
|
|
11
|
+
JSON: TypeAlias = dict[str, Any]
|
|
12
|
+
FactID: TypeAlias = str
|
|
13
|
+
WindowID: TypeAlias = str
|
|
14
|
+
SessionID: TypeAlias = str
|
|
15
|
+
TokenCount: TypeAlias = int
|
|
16
|
+
EmbeddingVector: TypeAlias = list[float]
|
|
17
|
+
EmbeddingFn: TypeAlias = Callable[[list[str]], list[EmbeddingVector]]
|
|
18
|
+
|
|
19
|
+
# ── Dispatch return types (§audit4 CQ-H2) ────────────────────────────────
|
|
20
|
+
# The various dispatch strategies intentionally return different types:
|
|
21
|
+
# dispatch() → tuple[str, QualityReport]
|
|
22
|
+
# dispatch_hierarchical() → tuple[list[str], QualityReport]
|
|
23
|
+
# dispatch_stream() → Generator[StreamEvent, None, None]
|
|
24
|
+
# dispatch_stream_augmented → tuple[str, QualityReport]
|
|
25
|
+
# A unified DispatchResult would break tuple unpacking at all call sites.
|
|
26
|
+
# The common pattern is (output, QualityReport) where output varies by
|
|
27
|
+
# strategy. See CRPOrchestrator method docstrings for specifics.
|
crp/_version.py
ADDED
crp/adapters.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Compatibility alias — ``from crp.adapters import ...``
|
|
4
|
+
|
|
5
|
+
This module re-exports everything from :mod:`crp.providers` so that
|
|
6
|
+
documentation examples using ``crp.adapters`` work unchanged.
|
|
7
|
+
|
|
8
|
+
Canonical location is ``crp.providers``.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from crp.providers import ( # noqa: F401
|
|
12
|
+
AnthropicAdapter,
|
|
13
|
+
BaseAdapter,
|
|
14
|
+
CallableAdapter,
|
|
15
|
+
CustomProvider,
|
|
16
|
+
LlamaCppAdapter,
|
|
17
|
+
LLMProvider,
|
|
18
|
+
OllamaAdapter,
|
|
19
|
+
OpenAIAdapter,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"LLMProvider",
|
|
24
|
+
"BaseAdapter",
|
|
25
|
+
"OpenAIAdapter",
|
|
26
|
+
"AnthropicAdapter",
|
|
27
|
+
"OllamaAdapter",
|
|
28
|
+
"LlamaCppAdapter",
|
|
29
|
+
"CustomProvider",
|
|
30
|
+
"CallableAdapter",
|
|
31
|
+
]
|
crp/advanced/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Advanced features — hierarchical, parallel, auto-ingest, CQS, meta-learning."""
|
|
4
|
+
|
|
5
|
+
from crp.advanced.auto_ingest import IngestFact, IngestResult, auto_ingest
|
|
6
|
+
from crp.advanced.cqs import ContextHungerSignal, CQSDetector, CQSResponse
|
|
7
|
+
from crp.advanced.cross_window import ConsistencyIssue, CrossWindowValidator, ValidationResult
|
|
8
|
+
from crp.advanced.curator import CurationConfig, LLMContextCurator, LLMSynthesis
|
|
9
|
+
from crp.advanced.feedback import FeedbackEntry, FeedbackLoop
|
|
10
|
+
from crp.advanced.hierarchical import HierarchicalPlan, HierarchicalProcessor
|
|
11
|
+
from crp.advanced.meta_learning import MetaLearningEngine, ORCResult, ReasoningTrace
|
|
12
|
+
from crp.advanced.parallel import FanOutResult, FanOutTask, ParallelFanOut
|
|
13
|
+
from crp.advanced.review_cycle import AssessmentResult, ReviewCycleManager, ReviewGuidance
|
|
14
|
+
from crp.advanced.scale_mode import QualityTier, ScaleModeSelector, SessionConfig
|
|
15
|
+
from crp.advanced.source_grounding import SourceGroundingEngine, SourcePassage
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
# auto_ingest
|
|
19
|
+
"auto_ingest", "IngestResult", "IngestFact",
|
|
20
|
+
# cqs
|
|
21
|
+
"CQSDetector", "ContextHungerSignal", "CQSResponse",
|
|
22
|
+
# cross_window
|
|
23
|
+
"CrossWindowValidator", "ConsistencyIssue", "ValidationResult",
|
|
24
|
+
# curator
|
|
25
|
+
"LLMContextCurator", "LLMSynthesis", "CurationConfig",
|
|
26
|
+
# feedback
|
|
27
|
+
"FeedbackLoop", "FeedbackEntry",
|
|
28
|
+
# hierarchical
|
|
29
|
+
"HierarchicalProcessor", "HierarchicalPlan",
|
|
30
|
+
# meta_learning
|
|
31
|
+
"MetaLearningEngine", "ReasoningTrace", "ORCResult",
|
|
32
|
+
# parallel
|
|
33
|
+
"ParallelFanOut", "FanOutTask", "FanOutResult",
|
|
34
|
+
# review_cycle
|
|
35
|
+
"ReviewCycleManager", "ReviewGuidance", "AssessmentResult",
|
|
36
|
+
# scale_mode
|
|
37
|
+
"ScaleModeSelector", "QualityTier", "SessionConfig",
|
|
38
|
+
# source_grounding
|
|
39
|
+
"SourceGroundingEngine", "SourcePassage",
|
|
40
|
+
]
|
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Auto-ingest — oversized input handling with structure-aware chunking (§4.6).
|
|
4
|
+
|
|
5
|
+
Triggers when system_tokens + task_tokens > context_window - gen_reserve.
|
|
6
|
+
Zero LLM cost by default: uses graduated extraction (stages 1-5) per chunk,
|
|
7
|
+
then reconciles boundary duplicates/complements via embedding similarity.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import math
|
|
13
|
+
import re
|
|
14
|
+
import uuid
|
|
15
|
+
from collections.abc import Callable
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# Constants (§4.6)
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
ENVELOPE_OVERHEAD_RESERVE = 500  # tokens reserved for envelope framing
# Similarity strictly above this marks a later-chunk fact as a duplicate (dropped).
DUP_SIMILARITY = 0.95
# Similarity strictly above this makes a fact a merge candidate ("complement") ...
COMPLEMENT_SIMILARITY = 0.75
# ... provided its word-overlap ratio with the existing fact also exceeds this.
TOKEN_OVERLAP_THRESHOLD = 0.3

# Protected structure patterns
# Fenced code block: opening ``` through the nearest closing ``` (non-greedy).
# NOTE(review): the pattern has no ^/$ anchors, so re.MULTILINE has no effect here.
_CODE_BLOCK = re.compile(r"```[\s\S]*?```", re.MULTILINE)
# One table row: a whole line that starts and ends with "|".
_TABLE_ROW = re.compile(r"^\|.+\|$", re.MULTILINE)
# Brace-delimited block: "{" + at least 50 characters that are not "}" + "}".
# It therefore stops at the first closing brace and never spans nested objects.
# NOTE(review): the pattern contains no ".", so re.DOTALL has no effect here.
_JSON_BLOCK = re.compile(r"\{[^}]{50,}\}", re.DOTALL)
# Numbered list: at least three consecutive lines of the form "<digits>. <text>".
_NUMBERED_LIST = re.compile(r"(?:^|\n)\d+\.\s.+(?:\n\d+\.\s.+){2,}", re.MULTILINE)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Data types
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class ProtectedSpan:
    """Region that must not be split mid-structure.

    Offsets are character positions in the scanned text.  Membership checks
    in this module treat the range as half-open: ``start <= pos < end``.
    """

    start: int  # offset of the first character of the structure
    end: int  # offset one past the last character (exclusive)
    # "code_block" | "table" | "json_block" | "numbered_list"; merged spans
    # carry the contributing types joined with "+".
    span_type: str  # "code_block" | "table" | "json_block" | "numbered_list"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class Chunk:
    """One chunk of the oversized input.

    ``text`` is the slice ``[offset_start:offset_end]`` of the original input.
    """

    index: int  # zero-based position of the chunk in split order
    text: str  # the chunk's characters
    offset_start: int  # start offset of the slice in the original input
    offset_end: int  # end offset of the slice (exclusive)
    token_count: int = 0  # 0 until auto_ingest() fills it in via count_tokens
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class IngestResult:
    """Summary returned by auto_ingest()."""

    chunks_created: int = 0  # number of chunks the input was split into
    facts_extracted: int = 0  # facts summed over all chunks, before reconciliation
    facts_after_reconciliation: int = 0  # facts left after duplicate/complement handling
    synthesized_task: str = ""  # replacement task text built by auto_ingest()
    raw_stored: bool = False  # True when the raw input was passed to store_raw_fn
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class IngestFact:
    """Lightweight fact from per-chunk extraction.

    ``chunk_index``, ``chunk_offset_start``, ``chunk_offset_end`` and
    ``source`` are overwritten by auto_ingest() for every fact it processes,
    so extractors do not need to populate them.
    """

    id: str = field(default_factory=lambda: str(uuid.uuid4()))  # fresh UUID4 string per instance
    text: str = ""  # fact text; may be extended in place when complements are merged
    confidence: float = 0.0  # merges keep the higher confidence of the two facts
    chunk_index: int = 0  # index of the chunk the fact came from
    chunk_offset_start: int = 0  # start offset of that chunk in the original input
    chunk_offset_end: int = 0  # end offset of that chunk (exclusive)
    source: str = ""  # label of the form "input_chunk_<i>_of_<n>" when set by auto_ingest()
    metadata: dict[str, Any] = field(default_factory=dict)  # free-form extras; not read in this module
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Protected structure detection
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def detect_protected_structures(text: str) -> list[ProtectedSpan]:
    """Locate the regions of ``text`` that chunking must keep intact.

    Four kinds of structure are collected, in this order: fenced code
    blocks, tables, brace-delimited blocks and numbered lists.  Table rows
    are grouped into runs: a row joins the current run when it starts at
    most 2 characters after the run's end, and a run is kept only when it
    covers more than 20 characters.  The combined list is passed through
    ``merge_overlapping_spans`` before being returned.
    """
    found: list[ProtectedSpan] = [
        ProtectedSpan(match.start(), match.end(), "code_block")
        for match in _CODE_BLOCK.finditer(text)
    ]

    # Tables: contiguous runs of "|...|" lines.
    rows = [(match.start(), match.end()) for match in _TABLE_ROW.finditer(text)]
    if rows:
        run_first, run_last = rows[0]
        for row_first, row_last in rows[1:]:
            if row_first - run_last <= 2:
                # Still the same table: extend the current run.
                run_last = row_last
                continue
            if run_last - run_first > 20:
                found.append(ProtectedSpan(run_first, run_last, "table"))
            run_first, run_last = row_first, row_last
        # Flush the run that was open when the rows ended.
        if run_last - run_first > 20:
            found.append(ProtectedSpan(run_first, run_last, "table"))

    found.extend(
        ProtectedSpan(match.start(), match.end(), "json_block")
        for match in _JSON_BLOCK.finditer(text)
    )
    found.extend(
        ProtectedSpan(match.start(), match.end(), "numbered_list")
        for match in _NUMBERED_LIST.finditer(text)
    )
    return merge_overlapping_spans(found)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def merge_overlapping_spans(spans: list[ProtectedSpan]) -> list[ProtectedSpan]:
    """Merge overlapping/adjacent protected spans.

    Spans are processed in order of ``start``.  A span whose ``start`` is
    at or before the end of the previously kept span is folded into it: the
    kept span's ``end`` grows to cover both and its ``span_type`` becomes
    ``"<previous>+<absorbed>"``.

    The input list and the span objects in it are left untouched; the
    result is built from shallow copies.  (Previously the first span of
    every merged group was modified in place, silently changing objects
    still held by the caller.)

    Args:
        spans: Spans in any order; may be empty.

    Returns:
        A new list of non-overlapping spans sorted by ``start``.
    """
    # Function-scope import: ``copy`` is not among this module's imports.
    import copy

    merged: list[ProtectedSpan] = []
    for span in sorted(spans, key=lambda s: s.start):
        if merged and span.start <= merged[-1].end:
            kept = merged[-1]  # always a private copy, safe to mutate
            kept.end = max(kept.end, span.end)
            kept.span_type = f"{kept.span_type}+{span.span_type}"
        else:
            merged.append(copy.copy(span))
    return merged
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Structure-aware splitting
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _in_protected(pos: int, spans: list[ProtectedSpan]) -> bool:
    """Return True when ``pos`` falls inside any span's half-open range.

    A position equal to a span's ``end`` counts as outside that span.
    """
    for span in spans:
        if span.start <= pos < span.end:
            return True
    return False
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _find_split_point(
    text: str, start: int, end: int, spans: list[ProtectedSpan],
) -> int:
    """Find the best split point in ``(start, end]`` following priority order.

    Boundary kinds are tried from most to least preferred:

    1. heading break (``"\\n# "``)
    2. paragraph break (blank line)
    3. the end of a protected span that is followed by another span
    4. sentence boundary (just after a ``"."`` that precedes whitespace)
    5. line break
    6. word boundary (single space)

    Within a kind, the *latest* usable position is returned, so the chunk
    is as large as the window allows.  Returning the earliest one, as this
    function used to, produced needlessly small chunks.  A position is
    usable when it lies strictly after ``start`` (a split at ``start``
    would yield an empty chunk) and is not inside a protected span.

    Args:
        text: Full input text.
        start: Offset where the current chunk begins.
        end: Upper bound for the split (exclusive end of the search window).
        spans: Protected spans, as offsets into ``text``.

    Returns:
        The chosen split offset, or ``end`` when no usable boundary exists.
    """
    region = text[start:end]

    def _usable(pos: int) -> bool:
        # Skip the span scan entirely when there is nothing to protect.
        return pos > start and not (spans and _in_protected(pos, spans))

    def _latest_match(pattern: str, shift: int = 0) -> int | None:
        """Return the last usable offset matched by ``pattern``, if any."""
        best = None
        for match in re.finditer(pattern, region):
            pos = start + match.start() + shift
            if _usable(pos):
                best = pos
        return best

    # Priorities 1-2: heading break, then paragraph break.
    for pattern in (r"\n# ", r"\n\n"):
        found = _latest_match(pattern)
        if found is not None:
            return found

    # Priority 3: between protected structures (the last span has no
    # following structure, so its end is not a candidate).
    gap_starts = [span.end for span in spans[:-1] if start < span.end < end]
    if gap_starts:
        return max(gap_starts)

    # Priorities 4-6: sentence boundary (split after the period), line
    # break, and finally a plain space as the last resort.
    for pattern, shift in ((r"\.\s", 1), (r"\n", 0), (r" ", 0)):
        found = _latest_match(pattern, shift)
        if found is not None:
            return found

    return end
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def split_at_boundaries(
    text: str,
    chunk_size_chars: int,
    overlap_chars: int,
    protected_spans: list[ProtectedSpan],
) -> list[Chunk]:
    """Split text into chunks respecting protected structures.

    Each chunk covers at most ``chunk_size_chars`` characters.  Unless the
    chunk reaches the end of the text, its end is chosen by
    ``_find_split_point``; if that yields no position after the chunk
    start, the chunk is cut at the size limit.  The next chunk starts
    ``overlap_chars`` before the previous split.

    Fixes over the previous behaviour:

    * Splitting stops once a chunk reaches the end of the text.  The old
      loop stepped back into the overlap and then advanced one character
      per iteration, emitting roughly ``overlap_chars`` redundant tail
      chunks.
    * When stepping back by the overlap would not move past the current
      chunk start, the next chunk begins at the split itself instead of
      creeping forward one character at a time.
    * ``chunk_size_chars`` below 1 is treated as 1 and a negative
      ``overlap_chars`` as 0, so every iteration consumes input.

    Args:
        text: Input to split; an empty string yields no chunks.
        chunk_size_chars: Maximum chunk length in characters.
        overlap_chars: Characters shared between consecutive chunks.
        protected_spans: Regions that split points should avoid.

    Returns:
        Chunks in input order with consecutive ``index`` values from 0.
    """
    chunk_size_chars = max(chunk_size_chars, 1)
    overlap_chars = max(overlap_chars, 0)

    chunks: list[Chunk] = []
    text_len = len(text)
    pos = 0

    while pos < text_len:
        end = min(pos + chunk_size_chars, text_len)
        split = end
        if end < text_len:
            candidate = _find_split_point(text, pos, end, protected_spans)
            if candidate > pos:
                split = candidate
            # Otherwise keep the forced split at ``end``.

        chunks.append(Chunk(
            index=len(chunks),
            text=text[pos:split],
            offset_start=pos,
            offset_end=split,
        ))

        if split >= text_len:
            break  # tail chunk emitted; nothing left to cover

        # Advance with overlap, but never fail to move forward.
        overlapped_start = split - overlap_chars
        pos = overlapped_start if overlapped_start > pos else split

    return chunks
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# ---------------------------------------------------------------------------
|
|
216
|
+
# Boundary reconciliation
|
|
217
|
+
# ---------------------------------------------------------------------------
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _text_overlap_ratio(a: str, b: str) -> float:
    """Share of the smaller text's distinct words that also occur in the other.

    Words are whitespace-separated and compared case-insensitively.
    Returns 0.0 when either text contains no words.
    """
    vocab_a = frozenset(a.lower().split())
    vocab_b = frozenset(b.lower().split())
    smaller = min(len(vocab_a), len(vocab_b))
    if smaller == 0:
        return 0.0
    return len(vocab_a.intersection(vocab_b)) / smaller
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _cosine_sim(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two equal-length vectors.

    Returns 0.0 when the lengths differ, when the vectors are empty, or
    when either vector has zero magnitude.
    """
    if len(a) != len(b) or not a:
        return 0.0
    norm_a = math.sqrt(sum(v * v for v in a))
    norm_b = math.sqrt(sum(v * v for v in b))
    if not (norm_a > 0 and norm_b > 0):
        return 0.0
    dot_product = sum(left * right for left, right in zip(a, b))
    return dot_product / (norm_a * norm_b)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def reconcile_chunk_boundaries(
    per_chunk_facts: list[list[IngestFact]],
    embedding_fn: Callable[[str], list[float]] | None = None,
) -> list[IngestFact]:
    """Deduplicate/merge facts at chunk boundaries.

    Facts of the first chunk are kept as they are.  Every fact of a later
    chunk is compared with the facts kept so far, in order, and the first
    decisive comparison wins:

    - similarity > 0.95 → duplicate → skip
    - similarity > 0.75 AND token_overlap > 0.3 → complement → merge
    - Otherwise → new fact → keep

    Similarity is the cosine similarity of the embeddings when
    ``embedding_fn`` is given, and the word-overlap ratio otherwise.  A
    merge extends the kept fact's text in place and keeps the higher of
    the two confidences.

    Embeddings are memoised by text for the duration of the call, so each
    distinct text is embedded once instead of twice per comparison.  This
    assumes ``embedding_fn`` returns the same vector for the same text.

    Args:
        per_chunk_facts: One list of facts per chunk, in chunk order.
        embedding_fn: Optional text → vector function.

    Returns:
        The reconciled facts; an empty list when there are no chunks.
    """
    if not per_chunk_facts:
        return []

    embedding_cache: dict[str, list[float]] = {}

    def _embed(fact_text: str) -> list[float]:
        """Embed ``fact_text``, reusing the vector from an earlier lookup."""
        vector = embedding_cache.get(fact_text)
        if vector is None:
            vector = embedding_fn(fact_text)
            embedding_cache[fact_text] = vector
        return vector

    result = list(per_chunk_facts[0])

    for chunk_facts in per_chunk_facts[1:]:
        for fact in chunk_facts:
            is_dup = False
            merge_target = None

            for existing in result:
                if embedding_fn:
                    # A merged fact has new text, hence a new cache key, so
                    # stale vectors are never reused for it.
                    sim = _cosine_sim(_embed(existing.text), _embed(fact.text))
                else:
                    sim = _text_overlap_ratio(existing.text, fact.text)

                if sim > DUP_SIMILARITY:
                    is_dup = True
                    break
                if sim > COMPLEMENT_SIMILARITY:
                    tok_overlap = _text_overlap_ratio(existing.text, fact.text)
                    if tok_overlap > TOKEN_OVERLAP_THRESHOLD:
                        merge_target = existing
                        break

            if is_dup:
                continue
            if merge_target is not None:
                merge_target.text = merge_fact_texts(merge_target.text, fact.text)
                merge_target.confidence = max(merge_target.confidence, fact.confidence)
            else:
                result.append(fact)

    return result
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def merge_fact_texts(a: str, b: str) -> str:
    """Append to ``a`` the words of ``b`` that ``a`` does not already contain.

    Words are whitespace-separated and compared case-insensitively.  The
    appended words keep their original order and casing.  ``a`` is
    returned unchanged when ``b`` contributes nothing new.
    """
    known = {word.lower() for word in a.split()}
    additions = " ".join(word for word in b.split() if word.lower() not in known)
    if not additions:
        return a
    return a + " " + additions
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
# ---------------------------------------------------------------------------
|
|
299
|
+
# Main auto_ingest function
|
|
300
|
+
# ---------------------------------------------------------------------------
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def auto_ingest(
    system_prompt: str,
    task_input: str,
    task_intent_text: str,
    context_window: int,
    count_tokens: Callable[[str], int],
    extract_fn: Callable[[str, str], list[IngestFact]] | None = None,
    embedding_fn: Callable[[str], list[float]] | None = None,
    store_raw_fn: Callable[[str, str], None] | None = None,
    session_id: str = "",
) -> tuple[list[IngestFact], IngestResult]:
    """Handle oversized inputs with structure-aware chunking.

    The input is split into chunks sized to the space left in the context
    window.  Facts are extracted per chunk, facts repeated across chunk
    boundaries are reconciled, and a short replacement task text is
    synthesized.

    Sizing: a generation reserve of ``max(context_window // 4, 1024)``
    tokens and the system prompt are subtracted from the window, then
    ``ENVELOPE_OVERHEAD_RESERVE`` tokens are set aside.  If the window
    cannot even cover that reserve, 1024 available tokens are assumed.
    Previously that fallback applied only when nothing at all was left,
    so a small positive remainder produced a zero or negative chunk size
    and degenerate chunking.  The chunk size in characters is also never
    allowed to drop below 1.

    Args:
        system_prompt: The system prompt (not modified).
        task_input: Raw oversized input text.
        task_intent_text: Short description of task intent.
        context_window: Total context window in tokens.
        count_tokens: Token counting function.
        extract_fn: Per-chunk fact extractor, called with the chunk text
            and ``task_intent_text``.  If None, each chunk yields a single
            placeholder fact holding its first 500 characters at
            confidence 0.5.
        embedding_fn: Optional embedding function for reconciliation.
        store_raw_fn: Optional function to store raw input in cold storage;
            called with ``(task_input, session_id)``.
        session_id: Current session ID.  Raw storage is skipped when empty.

    Returns:
        (reconciled_facts, ingest_result)
    """
    # Step 1: Compute available space
    sys_tokens = count_tokens(system_prompt)
    gen_reserve = max(context_window // 4, 1024)
    available_tokens = context_window - sys_tokens - gen_reserve
    if available_tokens <= ENVELOPE_OVERHEAD_RESERVE:
        available_tokens = 1024  # Minimum sane value

    # Step 2: Detect protected structures
    protected_spans = detect_protected_structures(task_input)

    # Step 3: Chunk with structure-aware boundaries
    chunk_budget = max(available_tokens - ENVELOPE_OVERHEAD_RESERVE, 1)
    # Approximate chars-per-token ratio, measured on the input itself
    total_chars = len(task_input)
    total_tokens = count_tokens(task_input)
    chars_per_token = total_chars / max(total_tokens, 1)
    # At least one character per chunk, even for very token-dense input.
    chunk_size_chars = max(int(chunk_budget * chars_per_token), 1)
    # Overlap: a tenth of a chunk, capped at roughly 500 tokens of text.
    overlap_chars = min(chunk_size_chars // 10, int(500 * chars_per_token))

    chunks = split_at_boundaries(task_input, chunk_size_chars, overlap_chars, protected_spans)

    # Step 4: Per-chunk extraction (zero LLM)
    per_chunk_facts: list[list[IngestFact]] = []
    total_extracted = 0
    for chunk in chunks:
        chunk.token_count = count_tokens(chunk.text)
        source_label = f"input_chunk_{chunk.index + 1}_of_{len(chunks)}"
        if extract_fn:
            facts = extract_fn(chunk.text, task_intent_text)
        else:
            # Minimal fallback: treat entire chunk as one fact
            facts = [IngestFact(text=chunk.text[:500], confidence=0.5)]
        # Stamp provenance on every fact, whatever the extractor set.
        for fact in facts:
            fact.chunk_index = chunk.index
            fact.chunk_offset_start = chunk.offset_start
            fact.chunk_offset_end = chunk.offset_end
            fact.source = source_label
        per_chunk_facts.append(facts)
        total_extracted += len(facts)

    # Step 5: Boundary reconciliation
    reconciled = reconcile_chunk_boundaries(per_chunk_facts, embedding_fn)

    # Step 6: Store raw input in cold storage
    raw_stored = False
    if store_raw_fn and session_id:
        store_raw_fn(task_input, session_id)
        raw_stored = True

    # Step 7: Synthesize task
    synthesized = (
        f"Process the following material ({len(chunks)} sections ingested, "
        f"{len(reconciled)} facts extracted). Original request: "
        f"{task_intent_text[:500]}"
    )

    result = IngestResult(
        chunks_created=len(chunks),
        facts_extracted=total_extracted,
        facts_after_reconciliation=len(reconciled),
        synthesized_task=synthesized,
        raw_stored=raw_stored,
    )

    return reconciled, result
|