agent_os_kernel 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. agent_control_plane/__init__.py +662 -0
  2. agent_control_plane/a2a_adapter.py +543 -0
  3. agent_control_plane/adapter.py +417 -0
  4. agent_control_plane/agent_hibernation.py +394 -0
  5. agent_control_plane/agent_kernel.py +470 -0
  6. agent_control_plane/compliance.py +720 -0
  7. agent_control_plane/constraint_graphs.py +478 -0
  8. agent_control_plane/control_plane.py +854 -0
  9. agent_control_plane/example_executors.py +195 -0
  10. agent_control_plane/execution_engine.py +231 -0
  11. agent_control_plane/flight_recorder.py +846 -0
  12. agent_control_plane/governance_layer.py +435 -0
  13. agent_control_plane/hf_utils.py +563 -0
  14. agent_control_plane/interfaces/__init__.py +55 -0
  15. agent_control_plane/interfaces/kernel_interface.py +361 -0
  16. agent_control_plane/interfaces/plugin_interface.py +497 -0
  17. agent_control_plane/interfaces/protocol_interfaces.py +387 -0
  18. agent_control_plane/kernel_space.py +1009 -0
  19. agent_control_plane/langchain_adapter.py +424 -0
  20. agent_control_plane/lifecycle.py +3113 -0
  21. agent_control_plane/mcp_adapter.py +653 -0
  22. agent_control_plane/ml_safety.py +563 -0
  23. agent_control_plane/multimodal.py +727 -0
  24. agent_control_plane/mute_agent.py +422 -0
  25. agent_control_plane/observability.py +787 -0
  26. agent_control_plane/orchestrator.py +482 -0
  27. agent_control_plane/plugin_registry.py +750 -0
  28. agent_control_plane/policy_engine.py +954 -0
  29. agent_control_plane/process_isolation.py +777 -0
  30. agent_control_plane/shadow_mode.py +310 -0
  31. agent_control_plane/signals.py +493 -0
  32. agent_control_plane/supervisor_agents.py +430 -0
  33. agent_control_plane/time_travel_debugger.py +557 -0
  34. agent_control_plane/tool_registry.py +452 -0
  35. agent_control_plane/vfs.py +697 -0
  36. agent_kernel/__init__.py +69 -0
  37. agent_kernel/analyzer.py +435 -0
  38. agent_kernel/auditor.py +36 -0
  39. agent_kernel/completeness_auditor.py +237 -0
  40. agent_kernel/detector.py +203 -0
  41. agent_kernel/kernel.py +744 -0
  42. agent_kernel/memory_manager.py +85 -0
  43. agent_kernel/models.py +374 -0
  44. agent_kernel/nudge_mechanism.py +263 -0
  45. agent_kernel/outcome_analyzer.py +338 -0
  46. agent_kernel/patcher.py +582 -0
  47. agent_kernel/semantic_analyzer.py +316 -0
  48. agent_kernel/semantic_purge.py +349 -0
  49. agent_kernel/simulator.py +449 -0
  50. agent_kernel/teacher.py +85 -0
  51. agent_kernel/triage.py +152 -0
  52. agent_os/__init__.py +409 -0
  53. agent_os/_adversarial_impl.py +200 -0
  54. agent_os/_circuit_breaker_impl.py +232 -0
  55. agent_os/_mcp_metrics.py +193 -0
  56. agent_os/adversarial.py +20 -0
  57. agent_os/agents_compat.py +490 -0
  58. agent_os/audit_logger.py +135 -0
  59. agent_os/base_agent.py +651 -0
  60. agent_os/circuit_breaker.py +34 -0
  61. agent_os/cli/__init__.py +659 -0
  62. agent_os/cli/cmd_audit.py +128 -0
  63. agent_os/cli/cmd_init.py +152 -0
  64. agent_os/cli/cmd_policy.py +41 -0
  65. agent_os/cli/cmd_policy_gen.py +180 -0
  66. agent_os/cli/cmd_validate.py +258 -0
  67. agent_os/cli/mcp_scan.py +265 -0
  68. agent_os/cli/output.py +192 -0
  69. agent_os/cli/policy_checker.py +330 -0
  70. agent_os/compat.py +74 -0
  71. agent_os/constraint_graph.py +234 -0
  72. agent_os/content_governance.py +140 -0
  73. agent_os/context_budget.py +305 -0
  74. agent_os/credential_redactor.py +224 -0
  75. agent_os/diff_policy.py +89 -0
  76. agent_os/egress_policy.py +159 -0
  77. agent_os/escalation.py +276 -0
  78. agent_os/event_bus.py +124 -0
  79. agent_os/exceptions.py +180 -0
  80. agent_os/execution_context_policy.py +141 -0
  81. agent_os/github_enterprise.py +96 -0
  82. agent_os/health.py +20 -0
  83. agent_os/integrations/__init__.py +279 -0
  84. agent_os/integrations/a2a_adapter.py +279 -0
  85. agent_os/integrations/agent_lightning/__init__.py +30 -0
  86. agent_os/integrations/anthropic_adapter.py +420 -0
  87. agent_os/integrations/autogen_adapter.py +620 -0
  88. agent_os/integrations/base.py +1137 -0
  89. agent_os/integrations/compat.py +229 -0
  90. agent_os/integrations/config.py +98 -0
  91. agent_os/integrations/conversation_guardian.py +957 -0
  92. agent_os/integrations/crewai_adapter.py +467 -0
  93. agent_os/integrations/drift_detector.py +425 -0
  94. agent_os/integrations/dry_run.py +124 -0
  95. agent_os/integrations/escalation.py +582 -0
  96. agent_os/integrations/gemini_adapter.py +364 -0
  97. agent_os/integrations/google_adk_adapter.py +633 -0
  98. agent_os/integrations/guardrails_adapter.py +394 -0
  99. agent_os/integrations/health.py +197 -0
  100. agent_os/integrations/langchain_adapter.py +654 -0
  101. agent_os/integrations/llamafirewall.py +343 -0
  102. agent_os/integrations/llamaindex_adapter.py +188 -0
  103. agent_os/integrations/logging.py +191 -0
  104. agent_os/integrations/maf_adapter.py +631 -0
  105. agent_os/integrations/mistral_adapter.py +365 -0
  106. agent_os/integrations/openai_adapter.py +816 -0
  107. agent_os/integrations/openai_agents_sdk.py +406 -0
  108. agent_os/integrations/policy_compose.py +171 -0
  109. agent_os/integrations/profiling.py +144 -0
  110. agent_os/integrations/pydantic_ai_adapter.py +420 -0
  111. agent_os/integrations/rate_limiter.py +130 -0
  112. agent_os/integrations/rbac.py +143 -0
  113. agent_os/integrations/registry.py +113 -0
  114. agent_os/integrations/scope_guard.py +303 -0
  115. agent_os/integrations/semantic_kernel_adapter.py +769 -0
  116. agent_os/integrations/smolagents_adapter.py +629 -0
  117. agent_os/integrations/templates.py +178 -0
  118. agent_os/integrations/token_budget.py +134 -0
  119. agent_os/integrations/tool_aliases.py +190 -0
  120. agent_os/integrations/webhooks.py +177 -0
  121. agent_os/lite.py +208 -0
  122. agent_os/mcp_gateway.py +385 -0
  123. agent_os/mcp_message_signer.py +273 -0
  124. agent_os/mcp_protocols.py +161 -0
  125. agent_os/mcp_response_scanner.py +232 -0
  126. agent_os/mcp_security.py +924 -0
  127. agent_os/mcp_session_auth.py +231 -0
  128. agent_os/mcp_sliding_rate_limiter.py +184 -0
  129. agent_os/memory_guard.py +409 -0
  130. agent_os/metrics.py +134 -0
  131. agent_os/mute.py +428 -0
  132. agent_os/mute_agent.py +209 -0
  133. agent_os/policies/__init__.py +77 -0
  134. agent_os/policies/async_evaluator.py +275 -0
  135. agent_os/policies/backends.py +670 -0
  136. agent_os/policies/bridge.py +169 -0
  137. agent_os/policies/budget.py +85 -0
  138. agent_os/policies/cli.py +294 -0
  139. agent_os/policies/conflict_resolution.py +270 -0
  140. agent_os/policies/data_classification.py +252 -0
  141. agent_os/policies/evaluator.py +239 -0
  142. agent_os/policies/policy_schema.json +228 -0
  143. agent_os/policies/rate_limiting.py +145 -0
  144. agent_os/policies/schema.py +115 -0
  145. agent_os/policies/shared.py +331 -0
  146. agent_os/prompt_injection.py +694 -0
  147. agent_os/providers.py +182 -0
  148. agent_os/py.typed +0 -0
  149. agent_os/retry.py +81 -0
  150. agent_os/reversibility.py +251 -0
  151. agent_os/sandbox.py +432 -0
  152. agent_os/sandbox_provider.py +140 -0
  153. agent_os/secure_codegen.py +525 -0
  154. agent_os/security_skills.py +538 -0
  155. agent_os/semantic_policy.py +422 -0
  156. agent_os/server/__init__.py +15 -0
  157. agent_os/server/__main__.py +25 -0
  158. agent_os/server/app.py +277 -0
  159. agent_os/server/models.py +104 -0
  160. agent_os/shift_left_metrics.py +130 -0
  161. agent_os/stateless.py +742 -0
  162. agent_os/supervisor.py +148 -0
  163. agent_os/task_outcome.py +148 -0
  164. agent_os/transparency.py +181 -0
  165. agent_os/trust_root.py +128 -0
  166. agent_os_kernel-3.1.0.dist-info/METADATA +1269 -0
  167. agent_os_kernel-3.1.0.dist-info/RECORD +337 -0
  168. agent_os_kernel-3.1.0.dist-info/WHEEL +4 -0
  169. agent_os_kernel-3.1.0.dist-info/entry_points.txt +2 -0
  170. agent_os_kernel-3.1.0.dist-info/licenses/LICENSE +21 -0
  171. agent_os_observability/__init__.py +27 -0
  172. agent_os_observability/dashboards.py +898 -0
  173. agent_os_observability/metrics.py +398 -0
  174. agent_os_observability/server.py +223 -0
  175. agent_os_observability/tracer.py +232 -0
  176. agent_primitives/__init__.py +24 -0
  177. agent_primitives/failures.py +84 -0
  178. agent_primitives/py.typed +0 -0
  179. amb_core/__init__.py +177 -0
  180. amb_core/adapters/__init__.py +57 -0
  181. amb_core/adapters/aws_sqs_broker.py +376 -0
  182. amb_core/adapters/azure_servicebus_broker.py +340 -0
  183. amb_core/adapters/kafka_broker.py +260 -0
  184. amb_core/adapters/nats_broker.py +285 -0
  185. amb_core/adapters/rabbitmq_broker.py +235 -0
  186. amb_core/adapters/redis_broker.py +262 -0
  187. amb_core/broker.py +145 -0
  188. amb_core/bus.py +481 -0
  189. amb_core/cloudevents.py +509 -0
  190. amb_core/dlq.py +345 -0
  191. amb_core/hf_utils.py +536 -0
  192. amb_core/memory_broker.py +410 -0
  193. amb_core/models.py +141 -0
  194. amb_core/persistence.py +529 -0
  195. amb_core/schema.py +294 -0
  196. amb_core/tracing.py +358 -0
  197. atr/__init__.py +640 -0
  198. atr/access.py +348 -0
  199. atr/composition.py +645 -0
  200. atr/decorator.py +357 -0
  201. atr/executor.py +384 -0
  202. atr/health.py +557 -0
  203. atr/hf_utils.py +449 -0
  204. atr/injection.py +422 -0
  205. atr/metrics.py +440 -0
  206. atr/policies.py +403 -0
  207. atr/py.typed +2 -0
  208. atr/registry.py +452 -0
  209. atr/schema.py +480 -0
  210. atr/tools/safe/__init__.py +75 -0
  211. atr/tools/safe/calculator.py +467 -0
  212. atr/tools/safe/datetime_tool.py +443 -0
  213. atr/tools/safe/file_reader.py +402 -0
  214. atr/tools/safe/http_client.py +316 -0
  215. atr/tools/safe/json_parser.py +374 -0
  216. atr/tools/safe/text_tool.py +537 -0
  217. atr/tools/safe/toolkit.py +175 -0
  218. caas/__init__.py +162 -0
  219. caas/api/__init__.py +7 -0
  220. caas/api/server.py +1328 -0
  221. caas/caching.py +834 -0
  222. caas/cli.py +210 -0
  223. caas/conversation.py +223 -0
  224. caas/decay.py +72 -0
  225. caas/detection/__init__.py +9 -0
  226. caas/detection/detector.py +238 -0
  227. caas/enrichment.py +130 -0
  228. caas/gateway/__init__.py +27 -0
  229. caas/gateway/trust_gateway.py +474 -0
  230. caas/hf_utils.py +479 -0
  231. caas/ingestion/__init__.py +23 -0
  232. caas/ingestion/processors.py +253 -0
  233. caas/ingestion/structure_parser.py +188 -0
  234. caas/models.py +356 -0
  235. caas/pragmatic_truth.py +444 -0
  236. caas/routing/__init__.py +10 -0
  237. caas/routing/heuristic_router.py +58 -0
  238. caas/storage/__init__.py +9 -0
  239. caas/storage/store.py +389 -0
  240. caas/triad.py +213 -0
  241. caas/tuning/__init__.py +9 -0
  242. caas/tuning/tuner.py +329 -0
  243. caas/vfs/__init__.py +14 -0
  244. caas/vfs/filesystem.py +452 -0
  245. cmvk/__init__.py +218 -0
  246. cmvk/audit.py +402 -0
  247. cmvk/benchmarks.py +478 -0
  248. cmvk/constitutional.py +904 -0
  249. cmvk/hf_utils.py +301 -0
  250. cmvk/metrics.py +473 -0
  251. cmvk/profiles.py +300 -0
  252. cmvk/py.typed +0 -0
  253. cmvk/types.py +12 -0
  254. cmvk/verification.py +956 -0
  255. emk/__init__.py +89 -0
  256. emk/causal.py +352 -0
  257. emk/hf_utils.py +421 -0
  258. emk/indexer.py +83 -0
  259. emk/py.typed +0 -0
  260. emk/schema.py +204 -0
  261. emk/sleep_cycle.py +347 -0
  262. emk/store.py +281 -0
  263. iatp/__init__.py +166 -0
  264. iatp/attestation.py +461 -0
  265. iatp/cli.py +317 -0
  266. iatp/hf_utils.py +472 -0
  267. iatp/ipc_pipes.py +580 -0
  268. iatp/main.py +412 -0
  269. iatp/models/__init__.py +447 -0
  270. iatp/policy_engine.py +337 -0
  271. iatp/py.typed +2 -0
  272. iatp/recovery.py +321 -0
  273. iatp/security/__init__.py +270 -0
  274. iatp/sidecar/__init__.py +519 -0
  275. iatp/telemetry/__init__.py +164 -0
  276. iatp/tests/__init__.py +1 -0
  277. iatp/tests/test_attestation.py +370 -0
  278. iatp/tests/test_cli.py +131 -0
  279. iatp/tests/test_ed25519_attestation.py +211 -0
  280. iatp/tests/test_models.py +130 -0
  281. iatp/tests/test_policy_engine.py +347 -0
  282. iatp/tests/test_recovery.py +281 -0
  283. iatp/tests/test_security.py +222 -0
  284. iatp/tests/test_sidecar.py +167 -0
  285. iatp/tests/test_telemetry.py +175 -0
  286. mcp_kernel_server/__init__.py +28 -0
  287. mcp_kernel_server/cli.py +274 -0
  288. mcp_kernel_server/resources.py +217 -0
  289. mcp_kernel_server/server.py +564 -0
  290. mcp_kernel_server/tools.py +1174 -0
  291. mute_agent/__init__.py +68 -0
  292. mute_agent/core/__init__.py +1 -0
  293. mute_agent/core/execution_agent.py +166 -0
  294. mute_agent/core/handshake_protocol.py +201 -0
  295. mute_agent/core/reasoning_agent.py +238 -0
  296. mute_agent/knowledge_graph/__init__.py +1 -0
  297. mute_agent/knowledge_graph/graph_elements.py +65 -0
  298. mute_agent/knowledge_graph/multidimensional_graph.py +170 -0
  299. mute_agent/knowledge_graph/subgraph.py +224 -0
  300. mute_agent/listener/__init__.py +43 -0
  301. mute_agent/listener/adapters/__init__.py +31 -0
  302. mute_agent/listener/adapters/base_adapter.py +189 -0
  303. mute_agent/listener/adapters/caas_adapter.py +344 -0
  304. mute_agent/listener/adapters/control_plane_adapter.py +436 -0
  305. mute_agent/listener/adapters/iatp_adapter.py +332 -0
  306. mute_agent/listener/adapters/scak_adapter.py +251 -0
  307. mute_agent/listener/listener.py +610 -0
  308. mute_agent/listener/state_observer.py +436 -0
  309. mute_agent/listener/threshold_config.py +313 -0
  310. mute_agent/super_system/__init__.py +1 -0
  311. mute_agent/super_system/router.py +204 -0
  312. mute_agent/visualization/__init__.py +10 -0
  313. mute_agent/visualization/graph_debugger.py +502 -0
  314. nexus/README.md +60 -0
  315. nexus/__init__.py +51 -0
  316. nexus/arbiter.py +359 -0
  317. nexus/client.py +466 -0
  318. nexus/dmz.py +444 -0
  319. nexus/escrow.py +430 -0
  320. nexus/exceptions.py +286 -0
  321. nexus/pyproject.toml +36 -0
  322. nexus/registry.py +393 -0
  323. nexus/reputation.py +425 -0
  324. nexus/schemas/__init__.py +51 -0
  325. nexus/schemas/compliance.py +276 -0
  326. nexus/schemas/escrow.py +251 -0
  327. nexus/schemas/manifest.py +225 -0
  328. nexus/schemas/receipt.py +208 -0
  329. nexus/tests/__init__.py +0 -0
  330. nexus/tests/conftest.py +146 -0
  331. nexus/tests/test_arbiter.py +192 -0
  332. nexus/tests/test_dmz.py +194 -0
  333. nexus/tests/test_escrow.py +276 -0
  334. nexus/tests/test_exceptions.py +225 -0
  335. nexus/tests/test_registry.py +232 -0
  336. nexus/tests/test_reputation.py +328 -0
  337. nexus/tests/test_schemas.py +295 -0
@@ -0,0 +1,253 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+ """
4
+ Data ingestion module for processing different file formats.
5
+ """
6
+
7
+ import re
8
+ from abc import ABC, abstractmethod
9
+ from typing import Dict, Any, List
10
+ from io import BytesIO
11
+
12
+ from caas.models import Document, ContentFormat, DocumentType, Section
13
+
14
+
15
class BaseProcessor(ABC):
    """Base class for document processors.

    Subclasses implement :meth:`process`. ``_extract_sections`` is a shared
    regex-based helper that splits plain text into sections.
    """

    @abstractmethod
    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process raw content into a Document.

        Args:
            content: Raw bytes of the source document.
            metadata: Caller-supplied metadata describing the document.

        Returns:
            The Document built from ``content``.
        """

    def _extract_sections(self, text: str) -> List[Section]:
        """Extract sections from text based on common header patterns.

        A header is a markdown heading (``#`` to ``######``), a line that
        starts with a capital letter and contains a colon after 5-80 further
        characters, or a numbered heading such as ``1. Title``. Each section
        runs from its header up to the next header (or the end of the text).
        Text that precedes the first detected header is not placed in any
        section.

        Args:
            text: Full plain-text content of the document.

        Returns:
            One Section per detected header, or a single "Main Content"
            section spanning the whole text when no header is found.
        """
        # Pattern for headers (markdown-style, "Title:" lines, or numbered).
        header_pattern = r'(?:^|\n)(#{1,6}\s+.+|[A-Z][^\n]{5,80}:|\d+\.\s+[A-Z][^\n]+)'
        matches = list(re.finditer(header_pattern, text))

        if not matches:
            # No clear sections, treat the whole text as a single section.
            return [Section(
                title="Main Content",
                content=text,
                start_pos=0,
                end_pos=len(text),
            )]

        # Anchor on capture group 1 so offsets point at the header itself and
        # not at the newline the pattern consumes in front of it.
        header_starts = [found.start(1) for found in matches]
        section_ends = header_starts[1:] + [len(text)]

        sections = []
        for found, start, end in zip(matches, header_starts, section_ends):
            sections.append(Section(
                title=found.group(1).strip('#: '),
                content=text[start:end].strip(),
                start_pos=start,
                end_pos=end,
            ))

        return sections
54
+
55
+
56
class PDFProcessor(BaseProcessor):
    """Processor for PDF documents."""

    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process PDF content.

        Extracts the text of every page (each followed by a newline), splits
        it into sections with ``_extract_sections`` and wraps the result in a
        Document.

        Args:
            content: Raw bytes of the PDF file.
            metadata: Document metadata; ``id`` and ``title`` are read from it
                and the whole mapping is stored on the Document.

        Returns:
            A Document with ``format=ContentFormat.PDF`` and
            ``detected_type=DocumentType.UNKNOWN``.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            # Imported lazily so pypdf is only needed when PDFs are processed.
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError("pypdf is required for PDF processing") from exc

        reader = PdfReader(BytesIO(content))

        # Join once instead of repeated "+=" on a growing string. The
        # ``or ""`` is defensive: a page that yields no text (or None) must
        # not break the concatenation.
        text = "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

        sections = self._extract_sections(text)

        return Document(
            id=metadata.get("id", ""),
            title=metadata.get("title", "Untitled PDF"),
            content=text,
            format=ContentFormat.PDF,
            detected_type=DocumentType.UNKNOWN,
            sections=sections,
            metadata=metadata,
        )
84
+
85
+
86
class HTMLProcessor(BaseProcessor):
    """Processor for HTML documents."""

    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process HTML content.

        Builds one section per ``h1``-``h4`` header that has non-empty sibling
        content, recording the enclosing H1 as ``chapter`` and the enclosing
        H2 as ``parent_section``. When no such section is found, falls back to
        the regex-based ``_extract_sections`` on the page text.

        Args:
            content: Raw bytes of the HTML document.
            metadata: Document metadata; ``id`` is read from it and the whole
                mapping is stored on the Document.

        Returns:
            A Document with ``format=ContentFormat.HTML`` and
            ``detected_type=DocumentType.UNKNOWN``.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
        """
        try:
            # Imported lazily so bs4 is only needed when HTML is processed.
            from bs4 import BeautifulSoup, FeatureNotFound
        except ImportError as exc:
            raise ImportError("beautifulsoup4 is required for HTML processing") from exc

        try:
            soup = BeautifulSoup(content, 'lxml')
        except FeatureNotFound:
            # lxml is an optional extra; use the parser bundled with Python
            # rather than failing outright.
            soup = BeautifulSoup(content, 'html.parser')

        # Extract title. An empty <title> or one with nested markup must not
        # leak None into the Document, so fall back to the default.
        title = "Untitled HTML"
        if soup.title is not None:
            title = soup.title.get_text(strip=True) or title

        # Remove script, style, nav and footer elements before reading text.
        for element in soup(['script', 'style', 'nav', 'footer']):
            element.decompose()

        header_tags = ['h1', 'h2', 'h3', 'h4']

        # Extract sections based on headers with hierarchy tracking.
        sections = []
        current_h1 = None  # Track current chapter (H1)
        current_h2 = None  # Track current parent section (H2)

        for header in soup.find_all(header_tags):
            section_title = header.get_text().strip()
            header_level = header.name  # 'h1', 'h2', etc.

            # Update hierarchy tracking BEFORE the empty-content check so
            # headers without their own body still set the context for the
            # headers that follow.
            if header_level == 'h1':
                current_h1 = section_title
                current_h2 = None
            elif header_level == 'h2':
                current_h2 = section_title

            # Collect sibling content until the next header or the end.
            content_parts = []
            for sibling in header.find_next_siblings():
                if sibling.name in header_tags:
                    break
                content_parts.append(sibling.get_text())

            section_content = '\n'.join(content_parts).strip()
            if not section_content:
                continue

            # H1 sections have neither chapter nor parent; H2 sections belong
            # to the current H1; H3/H4 sections also record the current H2.
            chapter = None
            parent_section = None
            if header_level == 'h2':
                chapter = current_h1
            elif header_level in ('h3', 'h4'):
                chapter = current_h1
                parent_section = current_h2

            # Offsets here are relative to the section's own content, not to
            # the full document text.
            sections.append(Section(
                title=section_title,
                content=section_content,
                start_pos=0,
                end_pos=len(section_content),
                chapter=chapter,
                parent_section=parent_section,
            ))

        # Get all text
        text = soup.get_text(separator='\n', strip=True)

        if not sections:
            sections = self._extract_sections(text)

        return Document(
            id=metadata.get("id", ""),
            title=title,
            content=text,
            format=ContentFormat.HTML,
            detected_type=DocumentType.UNKNOWN,
            sections=sections,
            metadata=metadata,
        )
167
+
168
+
169
class CodeProcessor(BaseProcessor):
    """Processor for source code files."""

    def process(self, content: bytes, metadata: Dict[str, Any]) -> Document:
        """Process source code content.

        Args:
            content: Raw bytes of the source file; decoded as UTF-8 with
                undecodable bytes dropped.
            metadata: Document metadata; ``id``, ``title`` and ``language``
                are read from it.

        Returns:
            A Document with ``format=ContentFormat.CODE`` and
            ``detected_type=DocumentType.SOURCE_CODE``. Its metadata is a copy
            of ``metadata`` with ``language`` set to the resolved language.
        """
        text = content.decode('utf-8', errors='ignore')

        # Prefer the caller-supplied language; detect from content when it is
        # missing, None or empty (and only then, to skip needless scanning).
        language = metadata.get("language") or self._detect_language(text)

        # Extract sections (classes, functions, etc.)
        sections = self._extract_code_sections(text, language)

        if not sections:
            # Nothing recognisable: keep the whole file as one section.
            sections = [Section(
                title="Source Code",
                content=text,
                start_pos=0,
                end_pos=len(text),
            )]

        return Document(
            id=metadata.get("id", ""),
            title=metadata.get("title", "Source Code"),
            content=text,
            format=ContentFormat.CODE,
            detected_type=DocumentType.SOURCE_CODE,
            sections=sections,
            metadata={**metadata, "language": language},
        )

    def _detect_language(self, text: str) -> str:
        """Guess the language from simple substring checks.

        The checks run in order (python, javascript, java, c++) and the first
        one that matches wins.

        Args:
            text: Decoded source text.

        Returns:
            ``'python'``, ``'javascript'``, ``'java'``, ``'c++'`` or
            ``'unknown'``.
        """
        if 'def ' in text and ':' in text:
            return 'python'
        if 'function' in text and '{' in text:
            return 'javascript'
        if 'public class' in text or 'private class' in text:
            return 'java'
        if '#include' in text:
            return 'c++'
        return 'unknown'

    def _extract_code_sections(self, text: str, language: str) -> List[Section]:
        """Extract code sections (functions, classes, etc.).

        Only Python is handled: every unindented ``class``, ``def`` or
        ``async def`` line whose header contains a colon starts a section that
        runs to the next such line (or the end of the text). Indented
        definitions stay inside their enclosing section, and text before the
        first definition is not placed in any section.

        Args:
            text: Decoded source text.
            language: Language name as resolved by ``process``.

        Returns:
            The detected sections; empty for non-Python input or when no
            definition is found.
        """
        sections = []

        if language != 'python':
            return sections

        # Match class and function definitions at the start of a line. With
        # MULTILINE "^" anchors each line, so the match begins on the keyword
        # itself and start offsets do not include the preceding newline.
        pattern = r'^((?:class|(?:async\s+)?def)\s+\w+[^\n]*:)'
        matches = list(re.finditer(pattern, text, re.MULTILINE))

        starts = [found.start(1) for found in matches]
        ends = starts[1:] + [len(text)]

        for found, start, end in zip(matches, starts, ends):
            sections.append(Section(
                title=found.group(1).strip(),
                content=text[start:end].strip(),
                start_pos=start,
                end_pos=end,
            ))

        return sections
235
+
236
+
237
class ProcessorFactory:
    """Factory for creating appropriate processors."""

    @staticmethod
    def get_processor(format: ContentFormat) -> BaseProcessor:
        """Get a processor for the given format.

        Args:
            format: Content format to look up (PDF, HTML or CODE).

        Returns:
            A new processor instance for ``format``.

        Raises:
            ValueError: If no processor is registered for ``format``.
        """
        # Map to classes, not instances, so that only the requested processor
        # is constructed on each call.
        processor_classes = {
            ContentFormat.PDF: PDFProcessor,
            ContentFormat.HTML: HTMLProcessor,
            ContentFormat.CODE: CodeProcessor,
        }

        processor_cls = processor_classes.get(format)
        if processor_cls is None:
            raise ValueError(f"No processor available for format: {format}")

        return processor_cls()
@@ -0,0 +1,188 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+
4
+ """
5
+ Structure-Aware Parser for assigning content tiers.
6
+
7
+ Implements hierarchical structure parsing that assigns weights based on
8
+ content importance tiers as described in the "Flat Chunk Fallacy".
9
+ """
10
+
11
+ import re
12
+ from typing import List
13
+ from caas.models import Section, ContentTier, DocumentType
14
+
15
+
16
class StructureParser:
    """
    Parses document structure and assigns content tiers.

    Tier 1 (High Weight): Titles, Headers, Class Definitions, API Contracts
    Tier 2 (Medium Weight): Body text, Function logic
    Tier 3 (Low Weight): Footnotes, Comments, Disclaimers
    """

    # Patterns for identifying Tier 1 content (High Value), keyed by document
    # type. They are applied with re.MULTILINE | re.IGNORECASE.
    TIER_1_PATTERNS = {
        DocumentType.SOURCE_CODE: [
            # Matches: public class MyClass, private interface IAuth, protected enum Status
            r'^(public|private|protected)?\s*(class|interface|enum)\s+\w+',
            # Matches: public void login(...) { (Java/C-style API methods)
            r'^(public|private|protected)?\s*\w+\s+\w+\s*\([^)]*\)\s*{',
            # Matches: def login(self, username, password): (Python functions)
            r'^\s*def\s+\w+\s*\([^)]*\)\s*:',
            # Matches: export function authenticate, async function getData
            r'^\s*(export\s+)?(async\s+)?function\s+\w+',
            # Matches: @api, @Api, @API decorators/annotations
            r'@(api|Api|API)',
        ],
        DocumentType.TECHNICAL_DOCUMENTATION: [
            r'^#{1,2}\s+',  # H1, H2 headers (markdown)
            r'^(API|Endpoint|Request|Response|Authentication|Authorization)',  # API sections
            r'^\s*(GET|POST|PUT|DELETE|PATCH)\s+/',  # HTTP methods
        ],
        DocumentType.LEGAL_CONTRACT: [
            r'^#{1,2}\s+',  # Main headers
            r'^(DEFINITIONS?|TERMS?|OBLIGATIONS?|LIABILITY|INDEMNITY)',  # Key legal sections
            r'^\d+\.\s+[A-Z][^:]+:',  # Numbered main clauses
        ],
        DocumentType.RESEARCH_PAPER: [
            r'^(ABSTRACT|INTRODUCTION|CONCLUSION|RESULTS)',  # Key sections
            r'^#{1,2}\s+',  # Main headers
        ],
        DocumentType.API_DOCUMENTATION: [
            r'^#{1,2}\s+',  # H1, H2 headers
            r'^\s*(GET|POST|PUT|DELETE|PATCH)\s+/',  # Endpoints
            r'^(Authentication|Authorization|Endpoint)',  # Critical API sections
        ],
    }

    # Patterns for identifying Tier 3 content (Low Value). Each is tested
    # against individual lines, case-insensitively.
    TIER_3_PATTERNS = [
        r'^\s*#.*$',  # Comments (code)
        r'^\s*//.*$',  # Single-line comments
        r'^\s*/\*.*\*/',  # Block comments written on a single line
        r'TODO:|FIXME:|XXX:|HACK:',  # Comment markers
        r'^\s*\*\s+Note:',  # Footnotes
        r'^\s*\*\s+Disclaimer:',  # Disclaimers
        r'^Footnote[s]?:',  # Footnotes
        r'^Disclaimer[s]?:',  # Disclaimers
        r'^Note[s]?:',  # Notes
    ]

    def parse_and_assign_tiers(
        self,
        sections: List[Section],
        doc_type: DocumentType,
        content: str = ""
    ) -> List[Section]:
        """
        Parse sections and assign content tiers.

        The sections are updated in place (their ``tier`` attribute is set)
        and the same list is returned.

        Args:
            sections: List of document sections
            doc_type: Type of document
            content: Full document content (optional; currently unused)

        Returns:
            List of sections with assigned tiers
        """
        for section in sections:
            section.tier = self._determine_tier(section, doc_type)

        return sections

    def _determine_tier(self, section: Section, doc_type: DocumentType) -> ContentTier:
        """
        Determine the content tier for a section.

        Tier 3 is checked before Tier 1, so a section that qualifies for both
        is classified as low value.

        Args:
            section: The section to classify
            doc_type: Type of document

        Returns:
            The assigned content tier
        """
        body = section.content
        heading = section.title

        # Check for Tier 3 (Low Value) first
        if self._is_tier_3_content(body, heading):
            return ContentTier.TIER_3_LOW

        # Check for Tier 1 (High Value)
        if self._is_tier_1_content(body, heading, doc_type):
            return ContentTier.TIER_1_HIGH

        # Default to Tier 2 (Medium Value)
        return ContentTier.TIER_2_MEDIUM

    def _is_tier_1_content(self, content: str, title: str, doc_type: DocumentType) -> bool:
        """
        Check if content is Tier 1 (High Value).

        Args:
            content: Section body text
            title: Section title
            doc_type: Type of document, used to select the pattern set

        Returns:
            True when the title contains a high-value keyword, a pattern for
            ``doc_type`` matches, or the text mentions a contract, interface,
            protocol or specification
        """
        combined_text = f"{title}\n{content}"

        # Check title for high-value indicators (substring match).
        title_lower = title.lower()
        high_value_title_keywords = [
            'definition', 'api', 'class', 'interface', 'contract',
            'authentication', 'authorization', 'endpoint', 'abstract',
            'introduction', 'conclusion', 'overview', 'summary'
        ]

        if any(keyword in title_lower for keyword in high_value_title_keywords):
            return True

        # Check doc-type specific patterns; unknown types have none.
        flags = re.MULTILINE | re.IGNORECASE
        tier_1_patterns = self.TIER_1_PATTERNS.get(doc_type, [])
        if any(re.search(pattern, combined_text, flags) for pattern in tier_1_patterns):
            return True

        # Check for API contracts (general)
        general_contract = r'(contract|interface|protocol|specification)'
        return re.search(general_contract, combined_text, re.IGNORECASE) is not None

    def _is_tier_3_content(self, content: str, title: str) -> bool:
        """
        Check if content is Tier 3 (Low Value).

        Args:
            content: Section body text
            title: Section title

        Returns:
            True when the title contains a low-value keyword, or when more
            than half of the lines (title line included) match a Tier 3
            pattern
        """
        combined_text = f"{title}\n{content}"

        # Check title for low-value indicators (substring match).
        title_lower = title.lower()
        low_value_keywords = [
            'footnote', 'disclaimer', 'note', 'comment', 'appendix',
            'acknowledgment', 'copyright', 'license'
        ]

        if any(keyword in title_lower for keyword in low_value_keywords):
            return True

        # Make sure low-value text is substantial (not just one comment line
        # in a large section). Count each line at most once across ALL
        # patterns, so mixed comment styles add up and several markers on one
        # line are not double-counted.
        matchers = [re.compile(pattern, re.IGNORECASE) for pattern in self.TIER_3_PATTERNS]
        lines = combined_text.split('\n')  # never empty: split yields >= 1 item
        low_value_lines = sum(
            1 for line in lines
            if any(matcher.search(line) for matcher in matchers)
        )

        return low_value_lines / len(lines) > 0.5

    def get_tier_base_weight(self, tier: ContentTier) -> float:
        """
        Get the base weight multiplier for a tier.

        Args:
            tier: The content tier

        Returns:
            Base weight multiplier (1.0 for an unrecognised tier)
        """
        tier_weights = {
            ContentTier.TIER_1_HIGH: 2.0,    # High value content gets 2x base weight
            ContentTier.TIER_2_MEDIUM: 1.0,  # Medium value gets 1x base weight
            ContentTier.TIER_3_LOW: 0.5,     # Low value gets 0.5x base weight
        }
        return tier_weights.get(tier, 1.0)