agent_os_kernel 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_control_plane/__init__.py +662 -0
- agent_control_plane/a2a_adapter.py +543 -0
- agent_control_plane/adapter.py +417 -0
- agent_control_plane/agent_hibernation.py +394 -0
- agent_control_plane/agent_kernel.py +470 -0
- agent_control_plane/compliance.py +720 -0
- agent_control_plane/constraint_graphs.py +478 -0
- agent_control_plane/control_plane.py +854 -0
- agent_control_plane/example_executors.py +195 -0
- agent_control_plane/execution_engine.py +231 -0
- agent_control_plane/flight_recorder.py +846 -0
- agent_control_plane/governance_layer.py +435 -0
- agent_control_plane/hf_utils.py +563 -0
- agent_control_plane/interfaces/__init__.py +55 -0
- agent_control_plane/interfaces/kernel_interface.py +361 -0
- agent_control_plane/interfaces/plugin_interface.py +497 -0
- agent_control_plane/interfaces/protocol_interfaces.py +387 -0
- agent_control_plane/kernel_space.py +1009 -0
- agent_control_plane/langchain_adapter.py +424 -0
- agent_control_plane/lifecycle.py +3113 -0
- agent_control_plane/mcp_adapter.py +653 -0
- agent_control_plane/ml_safety.py +563 -0
- agent_control_plane/multimodal.py +727 -0
- agent_control_plane/mute_agent.py +422 -0
- agent_control_plane/observability.py +787 -0
- agent_control_plane/orchestrator.py +482 -0
- agent_control_plane/plugin_registry.py +750 -0
- agent_control_plane/policy_engine.py +954 -0
- agent_control_plane/process_isolation.py +777 -0
- agent_control_plane/shadow_mode.py +310 -0
- agent_control_plane/signals.py +493 -0
- agent_control_plane/supervisor_agents.py +430 -0
- agent_control_plane/time_travel_debugger.py +557 -0
- agent_control_plane/tool_registry.py +452 -0
- agent_control_plane/vfs.py +697 -0
- agent_kernel/__init__.py +69 -0
- agent_kernel/analyzer.py +435 -0
- agent_kernel/auditor.py +36 -0
- agent_kernel/completeness_auditor.py +237 -0
- agent_kernel/detector.py +203 -0
- agent_kernel/kernel.py +744 -0
- agent_kernel/memory_manager.py +85 -0
- agent_kernel/models.py +374 -0
- agent_kernel/nudge_mechanism.py +263 -0
- agent_kernel/outcome_analyzer.py +338 -0
- agent_kernel/patcher.py +582 -0
- agent_kernel/semantic_analyzer.py +316 -0
- agent_kernel/semantic_purge.py +349 -0
- agent_kernel/simulator.py +449 -0
- agent_kernel/teacher.py +85 -0
- agent_kernel/triage.py +152 -0
- agent_os/__init__.py +409 -0
- agent_os/_adversarial_impl.py +200 -0
- agent_os/_circuit_breaker_impl.py +232 -0
- agent_os/_mcp_metrics.py +193 -0
- agent_os/adversarial.py +20 -0
- agent_os/agents_compat.py +490 -0
- agent_os/audit_logger.py +135 -0
- agent_os/base_agent.py +651 -0
- agent_os/circuit_breaker.py +34 -0
- agent_os/cli/__init__.py +659 -0
- agent_os/cli/cmd_audit.py +128 -0
- agent_os/cli/cmd_init.py +152 -0
- agent_os/cli/cmd_policy.py +41 -0
- agent_os/cli/cmd_policy_gen.py +180 -0
- agent_os/cli/cmd_validate.py +258 -0
- agent_os/cli/mcp_scan.py +265 -0
- agent_os/cli/output.py +192 -0
- agent_os/cli/policy_checker.py +330 -0
- agent_os/compat.py +74 -0
- agent_os/constraint_graph.py +234 -0
- agent_os/content_governance.py +140 -0
- agent_os/context_budget.py +305 -0
- agent_os/credential_redactor.py +224 -0
- agent_os/diff_policy.py +89 -0
- agent_os/egress_policy.py +159 -0
- agent_os/escalation.py +276 -0
- agent_os/event_bus.py +124 -0
- agent_os/exceptions.py +180 -0
- agent_os/execution_context_policy.py +141 -0
- agent_os/github_enterprise.py +96 -0
- agent_os/health.py +20 -0
- agent_os/integrations/__init__.py +279 -0
- agent_os/integrations/a2a_adapter.py +279 -0
- agent_os/integrations/agent_lightning/__init__.py +30 -0
- agent_os/integrations/anthropic_adapter.py +420 -0
- agent_os/integrations/autogen_adapter.py +620 -0
- agent_os/integrations/base.py +1137 -0
- agent_os/integrations/compat.py +229 -0
- agent_os/integrations/config.py +98 -0
- agent_os/integrations/conversation_guardian.py +957 -0
- agent_os/integrations/crewai_adapter.py +467 -0
- agent_os/integrations/drift_detector.py +425 -0
- agent_os/integrations/dry_run.py +124 -0
- agent_os/integrations/escalation.py +582 -0
- agent_os/integrations/gemini_adapter.py +364 -0
- agent_os/integrations/google_adk_adapter.py +633 -0
- agent_os/integrations/guardrails_adapter.py +394 -0
- agent_os/integrations/health.py +197 -0
- agent_os/integrations/langchain_adapter.py +654 -0
- agent_os/integrations/llamafirewall.py +343 -0
- agent_os/integrations/llamaindex_adapter.py +188 -0
- agent_os/integrations/logging.py +191 -0
- agent_os/integrations/maf_adapter.py +631 -0
- agent_os/integrations/mistral_adapter.py +365 -0
- agent_os/integrations/openai_adapter.py +816 -0
- agent_os/integrations/openai_agents_sdk.py +406 -0
- agent_os/integrations/policy_compose.py +171 -0
- agent_os/integrations/profiling.py +144 -0
- agent_os/integrations/pydantic_ai_adapter.py +420 -0
- agent_os/integrations/rate_limiter.py +130 -0
- agent_os/integrations/rbac.py +143 -0
- agent_os/integrations/registry.py +113 -0
- agent_os/integrations/scope_guard.py +303 -0
- agent_os/integrations/semantic_kernel_adapter.py +769 -0
- agent_os/integrations/smolagents_adapter.py +629 -0
- agent_os/integrations/templates.py +178 -0
- agent_os/integrations/token_budget.py +134 -0
- agent_os/integrations/tool_aliases.py +190 -0
- agent_os/integrations/webhooks.py +177 -0
- agent_os/lite.py +208 -0
- agent_os/mcp_gateway.py +385 -0
- agent_os/mcp_message_signer.py +273 -0
- agent_os/mcp_protocols.py +161 -0
- agent_os/mcp_response_scanner.py +232 -0
- agent_os/mcp_security.py +924 -0
- agent_os/mcp_session_auth.py +231 -0
- agent_os/mcp_sliding_rate_limiter.py +184 -0
- agent_os/memory_guard.py +409 -0
- agent_os/metrics.py +134 -0
- agent_os/mute.py +428 -0
- agent_os/mute_agent.py +209 -0
- agent_os/policies/__init__.py +77 -0
- agent_os/policies/async_evaluator.py +275 -0
- agent_os/policies/backends.py +670 -0
- agent_os/policies/bridge.py +169 -0
- agent_os/policies/budget.py +85 -0
- agent_os/policies/cli.py +294 -0
- agent_os/policies/conflict_resolution.py +270 -0
- agent_os/policies/data_classification.py +252 -0
- agent_os/policies/evaluator.py +239 -0
- agent_os/policies/policy_schema.json +228 -0
- agent_os/policies/rate_limiting.py +145 -0
- agent_os/policies/schema.py +115 -0
- agent_os/policies/shared.py +331 -0
- agent_os/prompt_injection.py +694 -0
- agent_os/providers.py +182 -0
- agent_os/py.typed +0 -0
- agent_os/retry.py +81 -0
- agent_os/reversibility.py +251 -0
- agent_os/sandbox.py +432 -0
- agent_os/sandbox_provider.py +140 -0
- agent_os/secure_codegen.py +525 -0
- agent_os/security_skills.py +538 -0
- agent_os/semantic_policy.py +422 -0
- agent_os/server/__init__.py +15 -0
- agent_os/server/__main__.py +25 -0
- agent_os/server/app.py +277 -0
- agent_os/server/models.py +104 -0
- agent_os/shift_left_metrics.py +130 -0
- agent_os/stateless.py +742 -0
- agent_os/supervisor.py +148 -0
- agent_os/task_outcome.py +148 -0
- agent_os/transparency.py +181 -0
- agent_os/trust_root.py +128 -0
- agent_os_kernel-3.1.0.dist-info/METADATA +1269 -0
- agent_os_kernel-3.1.0.dist-info/RECORD +337 -0
- agent_os_kernel-3.1.0.dist-info/WHEEL +4 -0
- agent_os_kernel-3.1.0.dist-info/entry_points.txt +2 -0
- agent_os_kernel-3.1.0.dist-info/licenses/LICENSE +21 -0
- agent_os_observability/__init__.py +27 -0
- agent_os_observability/dashboards.py +898 -0
- agent_os_observability/metrics.py +398 -0
- agent_os_observability/server.py +223 -0
- agent_os_observability/tracer.py +232 -0
- agent_primitives/__init__.py +24 -0
- agent_primitives/failures.py +84 -0
- agent_primitives/py.typed +0 -0
- amb_core/__init__.py +177 -0
- amb_core/adapters/__init__.py +57 -0
- amb_core/adapters/aws_sqs_broker.py +376 -0
- amb_core/adapters/azure_servicebus_broker.py +340 -0
- amb_core/adapters/kafka_broker.py +260 -0
- amb_core/adapters/nats_broker.py +285 -0
- amb_core/adapters/rabbitmq_broker.py +235 -0
- amb_core/adapters/redis_broker.py +262 -0
- amb_core/broker.py +145 -0
- amb_core/bus.py +481 -0
- amb_core/cloudevents.py +509 -0
- amb_core/dlq.py +345 -0
- amb_core/hf_utils.py +536 -0
- amb_core/memory_broker.py +410 -0
- amb_core/models.py +141 -0
- amb_core/persistence.py +529 -0
- amb_core/schema.py +294 -0
- amb_core/tracing.py +358 -0
- atr/__init__.py +640 -0
- atr/access.py +348 -0
- atr/composition.py +645 -0
- atr/decorator.py +357 -0
- atr/executor.py +384 -0
- atr/health.py +557 -0
- atr/hf_utils.py +449 -0
- atr/injection.py +422 -0
- atr/metrics.py +440 -0
- atr/policies.py +403 -0
- atr/py.typed +2 -0
- atr/registry.py +452 -0
- atr/schema.py +480 -0
- atr/tools/safe/__init__.py +75 -0
- atr/tools/safe/calculator.py +467 -0
- atr/tools/safe/datetime_tool.py +443 -0
- atr/tools/safe/file_reader.py +402 -0
- atr/tools/safe/http_client.py +316 -0
- atr/tools/safe/json_parser.py +374 -0
- atr/tools/safe/text_tool.py +537 -0
- atr/tools/safe/toolkit.py +175 -0
- caas/__init__.py +162 -0
- caas/api/__init__.py +7 -0
- caas/api/server.py +1328 -0
- caas/caching.py +834 -0
- caas/cli.py +210 -0
- caas/conversation.py +223 -0
- caas/decay.py +72 -0
- caas/detection/__init__.py +9 -0
- caas/detection/detector.py +238 -0
- caas/enrichment.py +130 -0
- caas/gateway/__init__.py +27 -0
- caas/gateway/trust_gateway.py +474 -0
- caas/hf_utils.py +479 -0
- caas/ingestion/__init__.py +23 -0
- caas/ingestion/processors.py +253 -0
- caas/ingestion/structure_parser.py +188 -0
- caas/models.py +356 -0
- caas/pragmatic_truth.py +444 -0
- caas/routing/__init__.py +10 -0
- caas/routing/heuristic_router.py +58 -0
- caas/storage/__init__.py +9 -0
- caas/storage/store.py +389 -0
- caas/triad.py +213 -0
- caas/tuning/__init__.py +9 -0
- caas/tuning/tuner.py +329 -0
- caas/vfs/__init__.py +14 -0
- caas/vfs/filesystem.py +452 -0
- cmvk/__init__.py +218 -0
- cmvk/audit.py +402 -0
- cmvk/benchmarks.py +478 -0
- cmvk/constitutional.py +904 -0
- cmvk/hf_utils.py +301 -0
- cmvk/metrics.py +473 -0
- cmvk/profiles.py +300 -0
- cmvk/py.typed +0 -0
- cmvk/types.py +12 -0
- cmvk/verification.py +956 -0
- emk/__init__.py +89 -0
- emk/causal.py +352 -0
- emk/hf_utils.py +421 -0
- emk/indexer.py +83 -0
- emk/py.typed +0 -0
- emk/schema.py +204 -0
- emk/sleep_cycle.py +347 -0
- emk/store.py +281 -0
- iatp/__init__.py +166 -0
- iatp/attestation.py +461 -0
- iatp/cli.py +317 -0
- iatp/hf_utils.py +472 -0
- iatp/ipc_pipes.py +580 -0
- iatp/main.py +412 -0
- iatp/models/__init__.py +447 -0
- iatp/policy_engine.py +337 -0
- iatp/py.typed +2 -0
- iatp/recovery.py +321 -0
- iatp/security/__init__.py +270 -0
- iatp/sidecar/__init__.py +519 -0
- iatp/telemetry/__init__.py +164 -0
- iatp/tests/__init__.py +1 -0
- iatp/tests/test_attestation.py +370 -0
- iatp/tests/test_cli.py +131 -0
- iatp/tests/test_ed25519_attestation.py +211 -0
- iatp/tests/test_models.py +130 -0
- iatp/tests/test_policy_engine.py +347 -0
- iatp/tests/test_recovery.py +281 -0
- iatp/tests/test_security.py +222 -0
- iatp/tests/test_sidecar.py +167 -0
- iatp/tests/test_telemetry.py +175 -0
- mcp_kernel_server/__init__.py +28 -0
- mcp_kernel_server/cli.py +274 -0
- mcp_kernel_server/resources.py +217 -0
- mcp_kernel_server/server.py +564 -0
- mcp_kernel_server/tools.py +1174 -0
- mute_agent/__init__.py +68 -0
- mute_agent/core/__init__.py +1 -0
- mute_agent/core/execution_agent.py +166 -0
- mute_agent/core/handshake_protocol.py +201 -0
- mute_agent/core/reasoning_agent.py +238 -0
- mute_agent/knowledge_graph/__init__.py +1 -0
- mute_agent/knowledge_graph/graph_elements.py +65 -0
- mute_agent/knowledge_graph/multidimensional_graph.py +170 -0
- mute_agent/knowledge_graph/subgraph.py +224 -0
- mute_agent/listener/__init__.py +43 -0
- mute_agent/listener/adapters/__init__.py +31 -0
- mute_agent/listener/adapters/base_adapter.py +189 -0
- mute_agent/listener/adapters/caas_adapter.py +344 -0
- mute_agent/listener/adapters/control_plane_adapter.py +436 -0
- mute_agent/listener/adapters/iatp_adapter.py +332 -0
- mute_agent/listener/adapters/scak_adapter.py +251 -0
- mute_agent/listener/listener.py +610 -0
- mute_agent/listener/state_observer.py +436 -0
- mute_agent/listener/threshold_config.py +313 -0
- mute_agent/super_system/__init__.py +1 -0
- mute_agent/super_system/router.py +204 -0
- mute_agent/visualization/__init__.py +10 -0
- mute_agent/visualization/graph_debugger.py +502 -0
- nexus/README.md +60 -0
- nexus/__init__.py +51 -0
- nexus/arbiter.py +359 -0
- nexus/client.py +466 -0
- nexus/dmz.py +444 -0
- nexus/escrow.py +430 -0
- nexus/exceptions.py +286 -0
- nexus/pyproject.toml +36 -0
- nexus/registry.py +393 -0
- nexus/reputation.py +425 -0
- nexus/schemas/__init__.py +51 -0
- nexus/schemas/compliance.py +276 -0
- nexus/schemas/escrow.py +251 -0
- nexus/schemas/manifest.py +225 -0
- nexus/schemas/receipt.py +208 -0
- nexus/tests/__init__.py +0 -0
- nexus/tests/conftest.py +146 -0
- nexus/tests/test_arbiter.py +192 -0
- nexus/tests/test_dmz.py +194 -0
- nexus/tests/test_escrow.py +276 -0
- nexus/tests/test_exceptions.py +225 -0
- nexus/tests/test_registry.py +232 -0
- nexus/tests/test_reputation.py +328 -0
- nexus/tests/test_schemas.py +295 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
"""
|
|
4
|
+
Data ingestion module for processing different file formats.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Dict, Any, List
|
|
10
|
+
from io import BytesIO
|
|
11
|
+
|
|
12
|
+
from caas.models import Document, ContentFormat, DocumentType, Section
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BaseProcessor(ABC):
    """Base class for document processors.

    Subclasses implement :meth:`process`. ``_extract_sections`` is a shared,
    regex-based helper that splits plain text into sections at header lines.
    """

    # A header starts at the beginning of the text or right after a newline
    # and is one of:
    #   * a markdown heading            ("## Title")
    #   * a capitalised run of 6-81 characters followed by a colon
    #   * a numbered, capitalised line  ("3. Results")
    # The leading "(?:^|\n)" anchors to a line start without re.MULTILINE;
    # group 1 is the header text itself, excluding that newline.
    _HEADER_PATTERN = re.compile(
        r'(?:^|\n)(#{1,6}\s+.+|[A-Z][^\n]{5,80}:|\d+\.\s+[A-Z][^\n]+)'
    )

    @abstractmethod
    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process raw content into a Document.

        Args:
            content: Raw bytes of the source document.
            metadata: Caller-supplied metadata for the document.

        Returns:
            The Document built from ``content``.
        """

    def _extract_sections(self, text: str) -> List[Section]:
        """Extract sections from text based on common header patterns.

        Args:
            text: Plain text to split.

        Returns:
            One Section per detected header, each running up to the next
            header (or the end of the text). When no header is found, a
            single "Main Content" section covering the whole text.

        Note:
            Text that precedes the first detected header is not assigned to
            any section.
        """
        matches = list(self._HEADER_PATTERN.finditer(text))

        if not matches:
            # No clear sections, treat the whole text as a single section.
            return [Section(
                title="Main Content",
                content=text,
                start_pos=0,
                end_pos=len(text),
            )]

        # Use the offset of the header group, not of the whole match: the
        # whole match may begin with the newline that precedes the header,
        # which would leave start_pos one character before the section.
        starts = [match.start(1) for match in matches]
        ends = starts[1:] + [len(text)]

        return [
            Section(
                title=match.group(1).strip('#: '),
                content=text[start:end].strip(),
                start_pos=start,
                end_pos=end,
            )
            for match, start, end in zip(matches, starts, ends)
        ]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class PDFProcessor(BaseProcessor):
    """Processor for PDF documents."""

    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process PDF content.

        Args:
            content: Raw bytes of the PDF file.
            metadata: Caller-supplied metadata; ``id`` and ``title`` are read
                from it and the whole mapping is attached to the result.

        Returns:
            A Document whose content is the text of every page, each page
            followed by a newline, with sections derived from that text.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            # Imported lazily so the module loads without the optional
            # PDF dependency.
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError("pypdf is required for PDF processing") from exc

        reader = PdfReader(BytesIO(content))

        # Build the text in one join instead of repeated "+=".
        # Defensive: a falsy extraction result is treated as empty text so
        # one unreadable page cannot break the concatenation.
        text = "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )

        sections = self._extract_sections(text)

        return Document(
            id=metadata.get("id", ""),
            title=metadata.get("title", "Untitled PDF"),
            content=text,
            format=ContentFormat.PDF,
            detected_type=DocumentType.UNKNOWN,
            sections=sections,
            metadata=metadata,
        )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class HTMLProcessor(BaseProcessor):
    """Processor for HTML documents."""

    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process HTML content.

        Sections are built from ``h1``-``h4`` headings: each heading's section
        is the text of the sibling elements that follow it, up to the next
        heading sibling. Headings with no following text produce no section.
        If no section is produced this way, the regex-based
        ``_extract_sections`` fallback is applied to the page text.

        Args:
            content: Raw bytes (or markup) of the HTML document.
            metadata: Caller-supplied metadata; ``id`` is read from it and the
                whole mapping is attached to the result.

        Returns:
            A Document containing the visible page text and its sections.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
        """
        try:
            # Imported lazily so the module loads without the optional
            # HTML dependency.
            from bs4 import BeautifulSoup
        except ImportError as exc:
            raise ImportError(
                "beautifulsoup4 is required for HTML processing"
            ) from exc

        soup = BeautifulSoup(content, 'lxml')

        # Extract the title. ``Tag.string`` is None for an empty <title> or
        # one that contains nested markup, so read the text instead and fall
        # back to the placeholder when nothing usable is found.
        title_tag = soup.title
        title = title_tag.get_text().strip() if title_tag else ""
        if not title:
            title = "Untitled HTML"

        # Remove script, style, navigation and footer elements so they do
        # not leak into the extracted text.
        for element in soup(['script', 'style', 'nav', 'footer']):
            element.decompose()

        header_tags = ['h1', 'h2', 'h3', 'h4']

        # Extract sections based on headers with hierarchy tracking.
        sections = []
        current_h1 = None  # Title of the most recent H1 (chapter)
        current_h2 = None  # Title of the most recent H2 (parent section)

        for header in soup.find_all(header_tags):
            section_title = header.get_text().strip()
            header_level = header.name  # 'h1', 'h2', etc.

            # Update hierarchy tracking BEFORE the empty-content check so
            # that headings without body text still act as parents.
            if header_level == 'h1':
                current_h1 = section_title
                current_h2 = None  # a new chapter resets the parent section
            elif header_level == 'h2':
                current_h2 = section_title

            # Collect sibling text until the next heading or the end.
            content_parts = []
            for sibling in header.find_next_siblings():
                if sibling.name in header_tags:
                    break
                content_parts.append(sibling.get_text())

            section_content = '\n'.join(content_parts).strip()
            if not section_content:
                continue

            # H1 has neither chapter nor parent; H2 belongs to the current
            # H1; H3/H4 belong to the current H1 and the current H2.
            chapter = current_h1 if header_level != 'h1' else None
            parent_section = (
                current_h2 if header_level in ('h3', 'h4') else None
            )

            sections.append(Section(
                title=section_title,
                content=section_content,
                # Offsets are relative to this section's own content, not to
                # the document text.
                start_pos=0,
                end_pos=len(section_content),
                chapter=chapter,
                parent_section=parent_section,
            ))

        # Get all remaining visible text.
        text = soup.get_text(separator='\n', strip=True)

        if not sections:
            sections = self._extract_sections(text)

        return Document(
            id=metadata.get("id", ""),
            title=title,
            content=text,
            format=ContentFormat.HTML,
            detected_type=DocumentType.UNKNOWN,
            sections=sections,
            metadata=metadata,
        )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class CodeProcessor(BaseProcessor):
    """Processor for source code files."""

    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process source code content.

        Args:
            content: Raw bytes of the source file; decoded as UTF-8 with
                undecodable bytes dropped.
            metadata: Caller-supplied metadata. ``language`` is used when
                present and non-empty; ``id`` and ``title`` are also read.

        Returns:
            A Document of type SOURCE_CODE whose metadata is a copy of
            ``metadata`` with the resolved ``language`` added.
        """
        text = content.decode('utf-8', errors='ignore')

        # Prefer the caller-supplied language; run detection only when it is
        # missing or empty, so the heuristic is not evaluated needlessly and
        # an explicit None does not propagate into the result.
        language = metadata.get("language") or self._detect_language(text)

        # Extract sections (classes, functions, etc.); fall back to a single
        # section spanning the whole file.
        sections = self._extract_code_sections(text, language)
        if not sections:
            sections = [Section(
                title="Source Code",
                content=text,
                start_pos=0,
                end_pos=len(text),
            )]

        return Document(
            id=metadata.get("id", ""),
            title=metadata.get("title", "Source Code"),
            content=text,
            format=ContentFormat.CODE,
            detected_type=DocumentType.SOURCE_CODE,
            sections=sections,
            metadata={**metadata, "language": language},
        )

    def _detect_language(self, text: str) -> str:
        """Guess the programming language from simple substring checks.

        The checks run in order and the first hit wins.

        Args:
            text: Source code text.

        Returns:
            'python', 'javascript', 'java', 'c++' or 'unknown'.
        """
        if 'def ' in text and ':' in text:
            return 'python'
        if 'function' in text and '{' in text:
            return 'javascript'
        if 'public class' in text or 'private class' in text:
            return 'java'
        if '#include' in text:
            return 'c++'
        return 'unknown'

    def _extract_code_sections(self, text: str, language: str) -> List[Section]:
        """Extract code sections (functions, classes, etc.).

        Only Python is supported: each unindented ``class``, ``def`` or
        ``async def`` line opens a section that runs up to the next such line
        or the end of the text.

        Args:
            text: Source code text.
            language: Language name; anything other than 'python' yields no
                sections.

        Returns:
            The extracted sections, or an empty list when nothing matched.

        Note:
            Code that precedes the first definition is not assigned to any
            section.
        """
        if language != 'python':
            return []

        # Anchored with "^" under re.MULTILINE so match.start() is the first
        # character of the definition line, not the newline before it.
        pattern = r'^((?:class|(?:async\s+)?def)\s+\w+[^\n]*:)'
        matches = list(re.finditer(pattern, text, re.MULTILINE))

        starts = [match.start(1) for match in matches]
        ends = starts[1:] + [len(text)]

        return [
            Section(
                title=match.group(1).strip(),
                content=text[start:end].strip(),
                start_pos=start,
                end_pos=end,
            )
            for match, start, end in zip(matches, starts, ends)
        ]
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
class ProcessorFactory:
    """Factory for creating appropriate processors."""

    @staticmethod
    def get_processor(format: ContentFormat) -> BaseProcessor:
        """Get a processor for the given format.

        Args:
            format: Content format of the document to process. (The name
                shadows the ``format`` builtin; it is kept so keyword
                callers continue to work.)

        Returns:
            A new processor instance for ``format``.

        Raises:
            ValueError: If no processor is registered for ``format``.
        """
        # Map formats to classes and instantiate only the one requested,
        # instead of building every processor on each call.
        processor_classes = {
            ContentFormat.PDF: PDFProcessor,
            ContentFormat.HTML: HTMLProcessor,
            ContentFormat.CODE: CodeProcessor,
        }

        processor_cls = processor_classes.get(format)
        if processor_cls is None:
            raise ValueError(f"No processor available for format: {format}")

        return processor_cls()
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Structure-Aware Parser for assigning content tiers.
|
|
6
|
+
|
|
7
|
+
Implements hierarchical structure parsing that assigns weights based on
|
|
8
|
+
content importance tiers as described in the "Flat Chunk Fallacy".
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import List
|
|
13
|
+
from caas.models import Section, ContentTier, DocumentType
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class StructureParser:
    """
    Classifies document sections into content tiers.

    Tier 1 (High Weight): Titles, Headers, Class Definitions, API Contracts
    Tier 2 (Medium Weight): Body text, Function logic
    Tier 3 (Low Weight): Footnotes, Comments, Disclaimers

    All patterns below are searched with re.MULTILINE | re.IGNORECASE against
    the section title and content joined by a newline.
    """

    # High-value (Tier 1) regexes, keyed by document type.
    TIER_1_PATTERNS = {
        DocumentType.SOURCE_CODE: [
            # Type declarations, e.g. "public class MyClass", "enum Status"
            r'^(public|private|protected)?\s*(class|interface|enum)\s+\w+',
            # Java/C-style method headers, e.g. "public void login(...) {"
            r'^(public|private|protected)?\s*\w+\s+\w+\s*\([^)]*\)\s*{',
            # Python function headers, e.g. "def login(self, user):"
            r'^\s*def\s+\w+\s*\([^)]*\)\s*:',
            # JavaScript functions, e.g. "export async function getData"
            r'^\s*(export\s+)?(async\s+)?function\s+\w+',
            # @api / @Api / @API decorators or annotations
            r'@(api|Api|API)',
        ],
        DocumentType.TECHNICAL_DOCUMENTATION: [
            r'^#{1,2}\s+',  # Markdown H1/H2
            r'^(API|Endpoint|Request|Response|Authentication|Authorization)',  # API sections
            r'^\s*(GET|POST|PUT|DELETE|PATCH)\s+/',  # HTTP method + path
        ],
        DocumentType.LEGAL_CONTRACT: [
            r'^#{1,2}\s+',  # Markdown H1/H2
            r'^(DEFINITIONS?|TERMS?|OBLIGATIONS?|LIABILITY|INDEMNITY)',  # Key legal sections
            r'^\d+\.\s+[A-Z][^:]+:',  # Numbered main clauses
        ],
        DocumentType.RESEARCH_PAPER: [
            r'^(ABSTRACT|INTRODUCTION|CONCLUSION|RESULTS)',  # Key sections
            r'^#{1,2}\s+',  # Markdown H1/H2
        ],
        DocumentType.API_DOCUMENTATION: [
            r'^#{1,2}\s+',  # Markdown H1/H2
            r'^\s*(GET|POST|PUT|DELETE|PATCH)\s+/',  # Endpoints
            r'^(Authentication|Authorization|Endpoint)',  # Critical API sections
        ],
    }

    # Low-value (Tier 3) regexes, applied to every document type.
    TIER_3_PATTERNS = [
        r'^\s*#.*$',  # "#" comment lines
        r'^\s*//.*$',  # "//" comment lines
        r'^\s*/\*.*\*/',  # "/* ... */" comments closed on the same line
        r'TODO:|FIXME:|XXX:|HACK:',  # Comment markers
        r'^\s*\*\s+Note:',  # "* Note:" lines
        r'^\s*\*\s+Disclaimer:',  # "* Disclaimer:" lines
        r'^Footnote[s]?:',  # Footnotes
        r'^Disclaimer[s]?:',  # Disclaimers
        r'^Note[s]?:',  # Notes
    ]

    # Substrings of a lower-cased title that mark a section as Tier 1.
    _HIGH_VALUE_TITLE_KEYWORDS = (
        'definition', 'api', 'class', 'interface', 'contract',
        'authentication', 'authorization', 'endpoint', 'abstract',
        'introduction', 'conclusion', 'overview', 'summary',
    )

    # Substrings of a lower-cased title that mark a section as Tier 3.
    _LOW_VALUE_TITLE_KEYWORDS = (
        'footnote', 'disclaimer', 'note', 'comment', 'appendix',
        'acknowledgment', 'copyright', 'license',
    )

    def parse_and_assign_tiers(
        self,
        sections: List[Section],
        doc_type: DocumentType,
        content: str = ""
    ) -> List[Section]:
        """
        Assign a content tier to every section.

        The sections are updated in place (their ``tier`` attribute is set)
        and the same list is returned.

        Args:
            sections: List of document sections
            doc_type: Type of document
            content: Full document content (accepted but not used here)

        Returns:
            The input list, with each section's tier assigned
        """
        for item in sections:
            item.tier = self._determine_tier(item, doc_type)
        return sections

    def _determine_tier(self, section: Section, doc_type: DocumentType) -> ContentTier:
        """
        Classify a single section.

        Tier 3 is tested before Tier 1, so a section that satisfies both
        checks is reported as low value.

        Args:
            section: The section to classify
            doc_type: Type of document

        Returns:
            The assigned content tier
        """
        body, heading = section.content, section.title

        if self._is_tier_3_content(body, heading):
            return ContentTier.TIER_3_LOW
        if self._is_tier_1_content(body, heading, doc_type):
            return ContentTier.TIER_1_HIGH
        # Neither low nor high value: medium.
        return ContentTier.TIER_2_MEDIUM

    def _is_tier_1_content(self, content: str, title: str, doc_type: DocumentType) -> bool:
        """Return True when the section looks like Tier 1 (High Value) content."""
        # 1. A high-value keyword anywhere in the title.
        lowered_title = title.lower()
        if any(word in lowered_title for word in self._HIGH_VALUE_TITLE_KEYWORDS):
            return True

        haystack = f"{title}\n{content}"
        flags = re.MULTILINE | re.IGNORECASE

        # 2. Any pattern registered for this document type.
        type_patterns = self.TIER_1_PATTERNS.get(doc_type, [])
        if any(re.search(regex, haystack, flags) for regex in type_patterns):
            return True

        # 3. Generic API-contract vocabulary, regardless of document type.
        generic_hit = re.search(
            r'(contract|interface|protocol|specification)', haystack, re.IGNORECASE
        )
        return generic_hit is not None

    def _is_tier_3_content(self, content: str, title: str) -> bool:
        """Return True when the section looks like Tier 3 (Low Value) content."""
        # A low-value keyword anywhere in the title is sufficient.
        lowered_title = title.lower()
        if any(word in lowered_title for word in self._LOW_VALUE_TITLE_KEYWORDS):
            return True

        haystack = f"{title}\n{content}"
        flags = re.MULTILINE | re.IGNORECASE
        line_total = haystack.count('\n') + 1  # always at least one line

        # Otherwise a single pattern must account for more than half of the
        # lines, so one stray comment in a large section does not demote it.
        for regex in self.TIER_3_PATTERNS:
            hit_count = len(re.findall(regex, haystack, flags))
            if hit_count / line_total > 0.5:
                return True

        return False

    def get_tier_base_weight(self, tier: ContentTier) -> float:
        """
        Look up the base weight multiplier for a tier.

        Args:
            tier: The content tier

        Returns:
            2.0 for Tier 1, 1.0 for Tier 2, 0.5 for Tier 3, and 1.0 for any
            other value
        """
        return {
            ContentTier.TIER_1_HIGH: 2.0,
            ContentTier.TIER_2_MEDIUM: 1.0,
            ContentTier.TIER_3_LOW: 0.5,
        }.get(tier, 1.0)
|