cd1-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (412)
  1. cd1_agent-0.1.0.dist-info/METADATA +429 -0
  2. cd1_agent-0.1.0.dist-info/RECORD +412 -0
  3. cd1_agent-0.1.0.dist-info/WHEEL +5 -0
  4. cd1_agent-0.1.0.dist-info/top_level.txt +1 -0
  5. src/__init__.py +11 -0
  6. src/agent_server/__init__.py +1 -0
  7. src/agent_server/adw_dashboard/__init__.py +1 -0
  8. src/agent_server/adw_dashboard/providers/__init__.py +5 -0
  9. src/agent_server/adw_dashboard/providers/bastion_provider.py +87 -0
  10. src/agent_server/adw_dashboard/providers/bigquery_provider.py +464 -0
  11. src/agent_server/adw_dashboard/providers/cloudsql_provider.py +73 -0
  12. src/agent_server/adw_dashboard/providers/composer_provider.py +165 -0
  13. src/agent_server/adw_dashboard/providers/dag_processor_provider.py +71 -0
  14. src/agent_server/adw_dashboard/providers/factory.py +106 -0
  15. src/agent_server/adw_dashboard/providers/gcs_infra_provider.py +69 -0
  16. src/agent_server/adw_dashboard/providers/gcs_provider.py +95 -0
  17. src/agent_server/adw_dashboard/providers/scheduler_provider.py +78 -0
  18. src/agent_server/adw_dashboard/providers/triggerer_provider.py +70 -0
  19. src/agent_server/adw_dashboard/providers/webserver_provider.py +71 -0
  20. src/agent_server/adw_dashboard/providers/worker_provider.py +130 -0
  21. src/agent_server/adw_dashboard/services/__init__.py +1 -0
  22. src/agent_server/adw_dashboard/services/adw_metrics_service.py +144 -0
  23. src/agent_server/adw_dashboard/services/infra_report_generator.py +391 -0
  24. src/agent_server/adw_dashboard/services/report_data.py +51 -0
  25. src/agent_server/adw_dashboard/services/service_report_generator.py +261 -0
  26. src/agent_server/app.py +112 -0
  27. src/agent_server/auth.py +33 -0
  28. src/agent_server/bdp_airflow/__init__.py +10 -0
  29. src/agent_server/bdp_airflow/handler.py +611 -0
  30. src/agent_server/bdp_airflow/server.py +257 -0
  31. src/agent_server/bdp_airflow/services/__init__.py +5 -0
  32. src/agent_server/bdp_airflow/services/airflow_failure_store.py +396 -0
  33. src/agent_server/bdp_airflow/services/cross_dag_analyzer.py +213 -0
  34. src/agent_server/bdp_airflow/services/daily_report_generator.py +225 -0
  35. src/agent_server/bdp_airflow/services/e2e_report_generator.py +924 -0
  36. src/agent_server/bdp_airflow/services/e2e_runner.py +583 -0
  37. src/agent_server/bdp_airflow/services/e2e_ssr_report_generator.py +529 -0
  38. src/agent_server/bdp_airflow/services/event_publisher.py +152 -0
  39. src/agent_server/bdp_airflow/services/failure_detector.py +870 -0
  40. src/agent_server/bdp_airflow/services/log_analyzer.py +413 -0
  41. src/agent_server/bdp_airflow/services/models.py +315 -0
  42. src/agent_server/bdp_airflow/services/op_kwargs_analyzer.py +160 -0
  43. src/agent_server/bdp_airflow/services/precursor_analyzer.py +364 -0
  44. src/agent_server/bdp_airflow/services/remediation_engine.py +356 -0
  45. src/agent_server/bdp_airflow/services/report_store.py +454 -0
  46. src/agent_server/bdp_airflow/services/summary_generator.py +218 -0
  47. src/agent_server/bdp_common/__init__.py +28 -0
  48. src/agent_server/bdp_common/anomaly/__init__.py +31 -0
  49. src/agent_server/bdp_common/anomaly/detector.py +395 -0
  50. src/agent_server/bdp_common/anomaly/ecod.py +122 -0
  51. src/agent_server/bdp_common/anomaly/models.py +66 -0
  52. src/agent_server/bdp_common/anomaly/patterns.py +460 -0
  53. src/agent_server/bdp_common/aws_session.py +12 -0
  54. src/agent_server/bdp_common/chart_utils.py +256 -0
  55. src/agent_server/bdp_common/charts/__init__.py +5 -0
  56. src/agent_server/bdp_common/charts/generator.py +370 -0
  57. src/agent_server/bdp_common/charts/png_chart.py +323 -0
  58. src/agent_server/bdp_common/charts/svg_chart.py +356 -0
  59. src/agent_server/bdp_common/clients/__init__.py +5 -0
  60. src/agent_server/bdp_common/clients/cached_bigquery.py +77 -0
  61. src/agent_server/bdp_common/clients/cached_cloudwatch.py +133 -0
  62. src/agent_server/bdp_common/clients/cached_cloudwatch_logs.py +187 -0
  63. src/agent_server/bdp_common/deep_agent/__init__.py +22 -0
  64. src/agent_server/bdp_common/deep_agent/executor.py +178 -0
  65. src/agent_server/bdp_common/deep_agent/graph.py +95 -0
  66. src/agent_server/bdp_common/deep_agent/nodes.py +623 -0
  67. src/agent_server/bdp_common/deep_agent/schemas.py +80 -0
  68. src/agent_server/bdp_common/deep_agent/state.py +31 -0
  69. src/agent_server/bdp_common/deep_agent/tools.py +581 -0
  70. src/agent_server/bdp_common/eventbridge/__init__.py +21 -0
  71. src/agent_server/bdp_common/eventbridge/publisher.py +446 -0
  72. src/agent_server/bdp_common/kakao/__init__.py +6 -0
  73. src/agent_server/bdp_common/kakao/models.py +18 -0
  74. src/agent_server/bdp_common/kakao/notifier.py +535 -0
  75. src/agent_server/bdp_common/logging_utils.py +46 -0
  76. src/agent_server/bdp_common/mail/__init__.py +5 -0
  77. src/agent_server/bdp_common/mail/scheduler.py +340 -0
  78. src/agent_server/bdp_common/mail/sender.py +130 -0
  79. src/agent_server/bdp_common/reports/__init__.py +6 -0
  80. src/agent_server/bdp_common/reports/base.py +428 -0
  81. src/agent_server/bdp_common/reports/charts.py +1038 -0
  82. src/agent_server/bdp_common/reports/csv_export.py +84 -0
  83. src/agent_server/bdp_common/reports/email_charts.py +974 -0
  84. src/agent_server/bdp_common/reports/email_utils.py +108 -0
  85. src/agent_server/bdp_common/reports/pdf.py +291 -0
  86. src/agent_server/bdp_common/reports/png.py +107 -0
  87. src/agent_server/bdp_common/reports/s3_export.py +132 -0
  88. src/agent_server/bdp_common/reports/styles.py +451 -0
  89. src/agent_server/bdp_common/retry.py +70 -0
  90. src/agent_server/bdp_common/stores/__init__.py +17 -0
  91. src/agent_server/bdp_common/stores/admin_settings_reader.py +70 -0
  92. src/agent_server/bdp_common/stores/alert_config_reader.py +110 -0
  93. src/agent_server/bdp_common/stores/base.py +116 -0
  94. src/agent_server/bdp_common/stores/bigquery_cache.py +195 -0
  95. src/agent_server/bdp_common/stores/log_cache.py +406 -0
  96. src/agent_server/bdp_common/stores/metric_cache.py +305 -0
  97. src/agent_server/bdp_common/stores/metric_config_reader.py +90 -0
  98. src/agent_server/bdp_common/tests/__init__.py +0 -0
  99. src/agent_server/bdp_common/tests/test_stores_base.py +175 -0
  100. src/agent_server/bdp_cost/__init__.py +10 -0
  101. src/agent_server/bdp_cost/bdp_cost/__init__.py +44 -0
  102. src/agent_server/bdp_cost/bdp_cost/handler.py +807 -0
  103. src/agent_server/bdp_cost/bdp_cost/server.py +239 -0
  104. src/agent_server/bdp_cost/bdp_cost/services/__init__.py +83 -0
  105. src/agent_server/bdp_cost/bdp_cost/services/anomaly_detector.py +520 -0
  106. src/agent_server/bdp_cost/bdp_cost/services/athena_cost_analyzer.py +534 -0
  107. src/agent_server/bdp_cost/bdp_cost/services/athena_cost_models.py +59 -0
  108. src/agent_server/bdp_cost/bdp_cost/services/athena_cost_report_generator.py +413 -0
  109. src/agent_server/bdp_cost/bdp_cost/services/athena_pricing_provider.py +188 -0
  110. src/agent_server/bdp_cost/bdp_cost/services/comparison_analyzer.py +134 -0
  111. src/agent_server/bdp_cost/bdp_cost/services/comparison_models.py +58 -0
  112. src/agent_server/bdp_cost/bdp_cost/services/comparison_report_generator.py +471 -0
  113. src/agent_server/bdp_cost/bdp_cost/services/html_report_generator.py +349 -0
  114. src/agent_server/bdp_cost/bdp_cost/services/multi_account_provider.py +988 -0
  115. src/agent_server/bdp_cost/bdp_cost/services/summary_generator.py +307 -0
  116. src/agent_server/bdp_cost/handler.py +623 -0
  117. src/agent_server/bdp_cost/server.py +266 -0
  118. src/agent_server/bdp_cost/services/__init__.py +74 -0
  119. src/agent_server/bdp_cost/services/anomaly_detector.py +719 -0
  120. src/agent_server/bdp_cost/services/chart_generator.py +359 -0
  121. src/agent_server/bdp_cost/services/comparison_analyzer.py +126 -0
  122. src/agent_server/bdp_cost/services/comparison_models.py +111 -0
  123. src/agent_server/bdp_cost/services/comparison_report_generator.py +475 -0
  124. src/agent_server/bdp_cost/services/config_loader.py +256 -0
  125. src/agent_server/bdp_cost/services/cost_explorer_provider.py +729 -0
  126. src/agent_server/bdp_cost/services/cost_status_service.py +332 -0
  127. src/agent_server/bdp_cost/services/e2e_report_generator.py +1111 -0
  128. src/agent_server/bdp_cost/services/e2e_runner.py +234 -0
  129. src/agent_server/bdp_cost/services/event_publisher.py +92 -0
  130. src/agent_server/bdp_cost/services/html_report_generator.py +1321 -0
  131. src/agent_server/bdp_cost/services/kakao_notifier.py +698 -0
  132. src/agent_server/bdp_cost/services/notification_router.py +438 -0
  133. src/agent_server/bdp_cost/services/pattern_recognizers.py +848 -0
  134. src/agent_server/bdp_cost/services/report_generator.py +194 -0
  135. src/agent_server/bdp_cost/services/report_store.py +463 -0
  136. src/agent_server/bdp_cost/services/summary_generator.py +521 -0
  137. src/agent_server/bdp_dashboard/__init__.py +0 -0
  138. src/agent_server/bdp_dashboard/providers/__init__.py +5 -0
  139. src/agent_server/bdp_dashboard/providers/dag_processor_provider.py +49 -0
  140. src/agent_server/bdp_dashboard/providers/database_provider.py +73 -0
  141. src/agent_server/bdp_dashboard/providers/environment_provider.py +59 -0
  142. src/agent_server/bdp_dashboard/providers/factory.py +103 -0
  143. src/agent_server/bdp_dashboard/providers/scheduler_provider.py +57 -0
  144. src/agent_server/bdp_dashboard/providers/storage_provider.py +50 -0
  145. src/agent_server/bdp_dashboard/providers/triggerer_provider.py +50 -0
  146. src/agent_server/bdp_dashboard/providers/webserver_provider.py +56 -0
  147. src/agent_server/bdp_dashboard/providers/worker_provider.py +97 -0
  148. src/agent_server/bdp_dashboard/services/__init__.py +0 -0
  149. src/agent_server/bdp_dashboard/services/airflow_stats_service.py +226 -0
  150. src/agent_server/bdp_dashboard/services/daily_report_generator.py +1419 -0
  151. src/agent_server/bdp_dashboard/services/detection_service.py +214 -0
  152. src/agent_server/bdp_dashboard/services/infra_log_service.py +548 -0
  153. src/agent_server/bdp_dashboard/services/infra_metrics_service.py +4146 -0
  154. src/agent_server/bdp_dashboard/services/infra_report_generator.py +1970 -0
  155. src/agent_server/bdp_dashboard/services/infra_service.py +4499 -0
  156. src/agent_server/bdp_dashboard/services/kpi_mock_data.py +388 -0
  157. src/agent_server/bdp_dashboard/services/kpi_service.py +213 -0
  158. src/agent_server/bdp_dashboard/services/report_store.py +450 -0
  159. src/agent_server/bdp_dashboard/services/universe_pipeline_service.py +266 -0
  160. src/agent_server/bdp_drift/__init__.py +8 -0
  161. src/agent_server/bdp_drift/bdp_drift/__init__.py +19 -0
  162. src/agent_server/bdp_drift/bdp_drift/handler.py +571 -0
  163. src/agent_server/bdp_drift/bdp_drift/services/__init__.py +27 -0
  164. src/agent_server/bdp_drift/bdp_drift/services/baseline_store.py +1126 -0
  165. src/agent_server/bdp_drift/bdp_drift/services/chart_generator.py +341 -0
  166. src/agent_server/bdp_drift/bdp_drift/services/config_fetcher.py +721 -0
  167. src/agent_server/bdp_drift/bdp_drift/services/config_mapper.py +185 -0
  168. src/agent_server/bdp_drift/bdp_drift/services/daily_report_generator.py +511 -0
  169. src/agent_server/bdp_drift/bdp_drift/services/drift_detector.py +424 -0
  170. src/agent_server/bdp_drift/bdp_drift/services/e2e_report_generator.py +747 -0
  171. src/agent_server/bdp_drift/bdp_drift/services/e2e_runner.py +356 -0
  172. src/agent_server/bdp_drift/bdp_drift/services/fetchers/__init__.py +25 -0
  173. src/agent_server/bdp_drift/bdp_drift/services/fetchers/eks_fetcher.py +285 -0
  174. src/agent_server/bdp_drift/bdp_drift/services/fetchers/msk_fetcher.py +218 -0
  175. src/agent_server/bdp_drift/bdp_drift/services/fetchers/sagemaker_fetcher.py +557 -0
  176. src/agent_server/bdp_drift/bdp_drift/services/fetchers/service_quota_fetcher.py +223 -0
  177. src/agent_server/bdp_drift/bdp_drift/services/fetchers/vpc_fetcher.py +275 -0
  178. src/agent_server/bdp_drift/bdp_drift/services/html_report_generator.py +2572 -0
  179. src/agent_server/bdp_drift/bdp_drift/services/models.py +390 -0
  180. src/agent_server/bdp_drift/bdp_drift/services/report_store.py +459 -0
  181. src/agent_server/bdp_drift/bdp_drift/services/summary_generator.py +291 -0
  182. src/agent_server/bdp_drift/handler.py +584 -0
  183. src/agent_server/bdp_drift/server.py +567 -0
  184. src/agent_server/bdp_metrics/__init__.py +1 -0
  185. src/agent_server/bdp_metrics/bdp_metrics/__init__.py +1 -0
  186. src/agent_server/bdp_metrics/bdp_metrics/handler.py +1972 -0
  187. src/agent_server/bdp_metrics/bdp_metrics/services/__init__.py +1 -0
  188. src/agent_server/bdp_metrics/bdp_metrics/services/baseline_store.py +1224 -0
  189. src/agent_server/bdp_metrics/bdp_metrics/services/daily_report_generator.py +511 -0
  190. src/agent_server/bdp_metrics/bdp_metrics/services/e2e_report_generator.py +424 -0
  191. src/agent_server/bdp_metrics/bdp_metrics/services/e2e_runner.py +317 -0
  192. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/__init__.py +25 -0
  193. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/eks_node_metrics_fetcher.py +759 -0
  194. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/eks_pod_metrics_fetcher.py +431 -0
  195. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/emr_cluster_metrics_fetcher.py +365 -0
  196. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/emr_serverless_fetcher.py +432 -0
  197. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/msk_metrics_fetcher.py +362 -0
  198. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/mwaa_metrics_fetcher.py +402 -0
  199. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/quota_fetcher.py +325 -0
  200. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/rds_metrics_fetcher.py +396 -0
  201. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/sagemaker_endpoint_fetcher.py +348 -0
  202. src/agent_server/bdp_metrics/bdp_metrics/services/fetchers/subnet_metrics_fetcher.py +286 -0
  203. src/agent_server/bdp_metrics/bdp_metrics/services/metric_anomaly_detector.py +152 -0
  204. src/agent_server/bdp_metrics/bdp_metrics/services/models.py +435 -0
  205. src/agent_server/bdp_metrics/bdp_metrics/services/report_store.py +561 -0
  206. src/agent_server/bdp_stats/__init__.py +1 -0
  207. src/agent_server/bdp_stats/bdp_stats/__init__.py +1 -0
  208. src/agent_server/bdp_stats/bdp_stats/handler.py +496 -0
  209. src/agent_server/bdp_stats/bdp_stats/services/__init__.py +1 -0
  210. src/agent_server/bdp_stats/bdp_stats/services/fetchers/__init__.py +11 -0
  211. src/agent_server/bdp_stats/bdp_stats/services/fetchers/athena_stats_fetcher.py +346 -0
  212. src/agent_server/bdp_stats/bdp_stats/services/fetchers/emr_stats_fetcher.py +378 -0
  213. src/agent_server/bdp_stats/bdp_stats/services/fetchers/sagemaker_stats_fetcher.py +293 -0
  214. src/agent_server/bdp_stats/bdp_stats/services/models.py +136 -0
  215. src/agent_server/bdp_stats/bdp_stats/services/project_code_provider.py +201 -0
  216. src/agent_server/config.py +41 -0
  217. src/agent_server/hdsp_alertmanager/__init__.py +8 -0
  218. src/agent_server/hdsp_alertmanager/handler.py +311 -0
  219. src/agent_server/hdsp_alertmanager/scripts/test_alerts.py +230 -0
  220. src/agent_server/hdsp_alertmanager/server.py +238 -0
  221. src/agent_server/hdsp_alertmanager/services/__init__.py +39 -0
  222. src/agent_server/hdsp_alertmanager/services/alert_processor.py +379 -0
  223. src/agent_server/hdsp_alertmanager/services/deduplication_store.py +212 -0
  224. src/agent_server/hdsp_alertmanager/services/html_report_generator.py +2313 -0
  225. src/agent_server/hdsp_alertmanager/services/models.py +273 -0
  226. src/agent_server/hdsp_alertmanager/services/notification_router.py +508 -0
  227. src/agent_server/hdsp_alertmanager/services/prometheus_alert_fetcher.py +380 -0
  228. src/agent_server/hdsp_alertmanager/services/severity_mapper.py +222 -0
  229. src/agent_server/hdsp_alertmanager/services/summary_generator.py +425 -0
  230. src/agent_server/hdsp_dashboard/__init__.py +0 -0
  231. src/agent_server/hdsp_dashboard/providers/__init__.py +5 -0
  232. src/agent_server/hdsp_dashboard/providers/api_provider.py +62 -0
  233. src/agent_server/hdsp_dashboard/providers/backup_provider.py +69 -0
  234. src/agent_server/hdsp_dashboard/providers/cost_provider.py +251 -0
  235. src/agent_server/hdsp_dashboard/providers/factory.py +93 -0
  236. src/agent_server/hdsp_dashboard/providers/haproxy_provider.py +61 -0
  237. src/agent_server/hdsp_dashboard/providers/jupyter_provider.py +392 -0
  238. src/agent_server/hdsp_dashboard/providers/k8s_provider.py +515 -0
  239. src/agent_server/hdsp_dashboard/providers/minio_provider.py +240 -0
  240. src/agent_server/hdsp_dashboard/providers/nfs_provider.py +247 -0
  241. src/agent_server/hdsp_dashboard/providers/pipeline_provider.py +57 -0
  242. src/agent_server/hdsp_dashboard/providers/registry_provider.py +49 -0
  243. src/agent_server/hdsp_dashboard/providers/spark_provider.py +499 -0
  244. src/agent_server/hdsp_dashboard/services/__init__.py +0 -0
  245. src/agent_server/hdsp_dashboard/services/detection_service.py +200 -0
  246. src/agent_server/hdsp_dashboard/services/hdsp_metrics_service.py +143 -0
  247. src/agent_server/hdsp_dashboard/services/hdsp_utils.py +35 -0
  248. src/agent_server/hdsp_dashboard/services/infra_report_generator.py +843 -0
  249. src/agent_server/hdsp_dashboard/services/k8s_infra_service.py +1281 -0
  250. src/agent_server/hdsp_dashboard/services/l0_ingestion_service.py +118 -0
  251. src/agent_server/hdsp_dashboard/services/report_store.py +459 -0
  252. src/agent_server/hdsp_dashboard/services/service_report_generator.py +266 -0
  253. src/agent_server/hdsp_metrics/__init__.py +8 -0
  254. src/agent_server/hdsp_metrics/handler.py +344 -0
  255. src/agent_server/hdsp_metrics/server.py +172 -0
  256. src/agent_server/hdsp_metrics/services/__init__.py +31 -0
  257. src/agent_server/hdsp_metrics/services/anomaly_detector.py +546 -0
  258. src/agent_server/hdsp_metrics/services/prometheus_client.py +728 -0
  259. src/agent_server/routers/__init__.py +37 -0
  260. src/agent_server/routers/adw_dashboard.py +615 -0
  261. src/agent_server/routers/bdp_airflow.py +43 -0
  262. src/agent_server/routers/bdp_cost.py +43 -0
  263. src/agent_server/routers/bdp_dashboard.py +1988 -0
  264. src/agent_server/routers/bdp_drift.py +44 -0
  265. src/agent_server/routers/bdp_metrics.py +49 -0
  266. src/agent_server/routers/bdp_stats.py +50 -0
  267. src/agent_server/routers/hdsp_dashboard.py +859 -0
  268. src/agent_server/routers/supervisor.py +54 -0
  269. src/agent_server/supervisor/__init__.py +1 -0
  270. src/agent_server/supervisor/graph.py +116 -0
  271. src/agent_server/supervisor/handler.py +156 -0
  272. src/agent_server/supervisor/nodes.py +309 -0
  273. src/agent_server/supervisor/schemas.py +180 -0
  274. src/agent_server/supervisor/services/__init__.py +1 -0
  275. src/agent_server/supervisor/services/correlator.py +322 -0
  276. src/agent_server/supervisor/services/dedup.py +88 -0
  277. src/agent_server/supervisor/services/event_collector.py +136 -0
  278. src/agent_server/supervisor/services/incident_store.py +103 -0
  279. src/agent_server/supervisor/state.py +36 -0
  280. src/agents/bdp_common/aws_session.py +12 -0
  281. src/common/__init__.py +1 -0
  282. src/common/agent/__init__.py +47 -0
  283. src/common/agent/executor.py +237 -0
  284. src/common/agent/graph.py +151 -0
  285. src/common/agent/nodes.py +371 -0
  286. src/common/agent/rds_tools.py +355 -0
  287. src/common/agent/tools.py +377 -0
  288. src/common/chat/__init__.py +50 -0
  289. src/common/chat/agent.py +479 -0
  290. src/common/chat/config.py +150 -0
  291. src/common/chat/nodes/__init__.py +21 -0
  292. src/common/chat/nodes/act.py +159 -0
  293. src/common/chat/nodes/human_review.py +182 -0
  294. src/common/chat/nodes/observe.py +98 -0
  295. src/common/chat/nodes/plan.py +100 -0
  296. src/common/chat/nodes/reflect.py +133 -0
  297. src/common/chat/nodes/respond.py +154 -0
  298. src/common/chat/state.py +255 -0
  299. src/common/chat/tools/__init__.py +120 -0
  300. src/common/chat/tools/airflow.py +354 -0
  301. src/common/chat/tools/cloudwatch.py +144 -0
  302. src/common/chat/tools/drift.py +362 -0
  303. src/common/chat/tools/prometheus.py +216 -0
  304. src/common/chat/tools/rds.py +293 -0
  305. src/common/chat/tools/service_health.py +332 -0
  306. src/common/handlers/__init__.py +14 -0
  307. src/common/handlers/analysis_handler.py +294 -0
  308. src/common/handlers/base_handler.py +223 -0
  309. src/common/handlers/remediation_handler.py +345 -0
  310. src/common/hitl/__init__.py +24 -0
  311. src/common/hitl/router.py +295 -0
  312. src/common/hitl/schemas.py +115 -0
  313. src/common/hitl/store.py +1012 -0
  314. src/common/models/__init__.py +40 -0
  315. src/common/models/agent_state.py +121 -0
  316. src/common/models/analysis_result.py +101 -0
  317. src/common/models/anomaly.py +100 -0
  318. src/common/prompts/__init__.py +46 -0
  319. src/common/prompts/analysis_prompts.py +250 -0
  320. src/common/prompts/detection_prompts.py +252 -0
  321. src/common/prompts/reflection_prompts.py +245 -0
  322. src/common/prompts/replan_prompts.py +317 -0
  323. src/common/prompts/utils.py +242 -0
  324. src/common/server/__init__.py +18 -0
  325. src/common/server/adapter.py +118 -0
  326. src/common/server/base_app.py +159 -0
  327. src/common/server/config.py +150 -0
  328. src/common/server/middleware.py +98 -0
  329. src/common/server/routers/__init__.py +13 -0
  330. src/common/server/routers/health.py +204 -0
  331. src/common/server/routers/metrics.py +194 -0
  332. src/common/server/schemas/__init__.py +23 -0
  333. src/common/server/schemas/detection.py +265 -0
  334. src/common/services/__init__.py +17 -0
  335. src/common/services/aws_client.py +869 -0
  336. src/common/services/aws_session.py +97 -0
  337. src/common/services/llm_client.py +294 -0
  338. src/common/services/rds_client.py +802 -0
  339. src/common/services/schema_loader.py +414 -0
  340. src/common/timezone.py +13 -0
  341. src/portal/__init__.py +1 -0
  342. src/portal/app.py +346 -0
  343. src/portal/auth.py +97 -0
  344. src/portal/config.py +77 -0
  345. src/portal/dependencies.py +203 -0
  346. src/portal/routers/__init__.py +1 -0
  347. src/portal/routers/airflow_config.py +112 -0
  348. src/portal/routers/airflow_reports.py +27 -0
  349. src/portal/routers/airflow_status.py +27 -0
  350. src/portal/routers/api/__init__.py +0 -0
  351. src/portal/routers/api/adw.py +205 -0
  352. src/portal/routers/api/airflow_config.py +72 -0
  353. src/portal/routers/api/airflow_reports.py +64 -0
  354. src/portal/routers/api/airflow_stats.py +37 -0
  355. src/portal/routers/api/alert_config.py +97 -0
  356. src/portal/routers/api/auth.py +16 -0
  357. src/portal/routers/api/cost.py +66 -0
  358. src/portal/routers/api/cost_status.py +80 -0
  359. src/portal/routers/api/dashboard.py +17 -0
  360. src/portal/routers/api/dashboard_reports.py +64 -0
  361. src/portal/routers/api/drift_config.py +171 -0
  362. src/portal/routers/api/drift_reports.py +72 -0
  363. src/portal/routers/api/hdsp.py +528 -0
  364. src/portal/routers/api/hdsp_dashboard_reports.py +64 -0
  365. src/portal/routers/api/hdsp_metrics_reports.py +132 -0
  366. src/portal/routers/api/hitl.py +313 -0
  367. src/portal/routers/api/holmes_gpt.py +77 -0
  368. src/portal/routers/api/infra.py +749 -0
  369. src/portal/routers/api/mail_config.py +121 -0
  370. src/portal/routers/api/metric_config.py +154 -0
  371. src/portal/routers/api/metric_status.py +16 -0
  372. src/portal/routers/api/metrics.py +68 -0
  373. src/portal/routers/api/settings.py +55 -0
  374. src/portal/routers/auth_routes.py +68 -0
  375. src/portal/routers/baselines.py +260 -0
  376. src/portal/routers/cost_reports.py +83 -0
  377. src/portal/routers/cost_status.py +81 -0
  378. src/portal/routers/dashboard.py +24 -0
  379. src/portal/routers/drift_config.py +231 -0
  380. src/portal/routers/drift_reports.py +27 -0
  381. src/portal/routers/drift_status.py +27 -0
  382. src/portal/routers/hitl.py +210 -0
  383. src/portal/routers/infra.py +53 -0
  384. src/portal/routers/metric_config.py +240 -0
  385. src/portal/routers/metric_status.py +68 -0
  386. src/portal/routers/metrics.py +154 -0
  387. src/portal/routers/settings.py +128 -0
  388. src/portal/services/__init__.py +1 -0
  389. src/portal/services/_import_helper.py +173 -0
  390. src/portal/services/airflow_config_service.py +306 -0
  391. src/portal/services/airflow_config_store.py +150 -0
  392. src/portal/services/airflow_report_service.py +89 -0
  393. src/portal/services/alert_config_service.py +56 -0
  394. src/portal/services/alert_config_store.py +231 -0
  395. src/portal/services/baseline_service.py +170 -0
  396. src/portal/services/cost_report_service.py +166 -0
  397. src/portal/services/dashboard_report_service.py +89 -0
  398. src/portal/services/db.py +371 -0
  399. src/portal/services/drift_config_service.py +699 -0
  400. src/portal/services/drift_config_store.py +304 -0
  401. src/portal/services/drift_report_service.py +375 -0
  402. src/portal/services/hdsp_dashboard_report_service.py +88 -0
  403. src/portal/services/hitl_service.py +159 -0
  404. src/portal/services/holmesgpt_service.py +405 -0
  405. src/portal/services/holmesgpt_session_store.py +290 -0
  406. src/portal/services/mail_config_service.py +111 -0
  407. src/portal/services/metric_catalog_service.py +44 -0
  408. src/portal/services/metric_config_service.py +205 -0
  409. src/portal/services/metric_config_store.py +255 -0
  410. src/portal/services/metric_status_service.py +253 -0
  411. src/portal/services/metrics_report_service.py +102 -0
  412. src/portal/services/settings_service.py +356 -0
@@ -0,0 +1,429 @@
1
+ Metadata-Version: 2.4
2
+ Name: cd1-agent
3
+ Version: 0.1.0
4
+ Summary: AWS Lambda-based intelligent log analysis and auto-remediation system
5
+ Author: CD1 Team
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/lks21c/cd1-agent
8
+ Project-URL: Documentation, https://github.com/lks21c/cd1-agent#readme
9
+ Project-URL: Repository, https://github.com/lks21c/cd1-agent
10
+ Keywords: aws,lambda,langgraph,anomaly-detection,remediation
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: Python :: 3.14
17
+ Requires-Python: >=3.13
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: pydantic>=2.0.0
20
+ Requires-Dist: langchain-core>=0.1.0
21
+ Requires-Dist: langgraph>=0.0.30
22
+ Requires-Dist: boto3>=1.28.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
25
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
26
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
27
+ Requires-Dist: black>=23.0.0; extra == "dev"
28
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
29
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
30
+ Requires-Dist: build>=1.0.0; extra == "dev"
31
+ Requires-Dist: twine>=4.0.0; extra == "dev"
32
+ Provides-Extra: vllm
33
+ Requires-Dist: openai>=1.0.0; extra == "vllm"
34
+ Requires-Dist: httpx>=0.25.0; extra == "vllm"
35
+ Provides-Extra: victoria
36
+ Requires-Dist: requests>=2.31.0; extra == "victoria"
37
+ Provides-Extra: rds
38
+ Requires-Dist: pymysql>=1.1.0; extra == "rds"
39
+ Requires-Dist: cryptography>=41.0.0; extra == "rds"
40
+ Provides-Extra: pyod
41
+ Requires-Dist: pyod>=2.0.0; extra == "pyod"
42
+ Requires-Dist: numpy>=1.24.0; extra == "pyod"
43
+ Requires-Dist: scipy>=1.10.0; extra == "pyod"
44
+ Provides-Extra: bdp-cost
45
+ Requires-Dist: cd1-agent[pyod,server]; extra == "bdp-cost"
46
+ Provides-Extra: gcp
47
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == "gcp"
48
+ Provides-Extra: server
49
+ Requires-Dist: fastapi>=0.115.0; extra == "server"
50
+ Requires-Dist: uvicorn[standard]>=0.32.0; extra == "server"
51
+ Requires-Dist: pydantic-settings>=2.0.0; extra == "server"
52
+ Requires-Dist: prometheus-client>=0.21.0; extra == "server"
53
+ Requires-Dist: httpx>=0.27.0; extra == "server"
54
+ Requires-Dist: jinja2>=3.1.0; extra == "server"
55
+ Requires-Dist: weasyprint>=62.0; extra == "server"
56
+ Requires-Dist: kubernetes>=28.1.0; extra == "server"
57
+ Requires-Dist: apscheduler<4,>=3.10; extra == "server"
58
+ Requires-Dist: cairosvg>=2.7.0; extra == "server"
59
+ Requires-Dist: pymupdf>=1.25.0; extra == "server"
60
+ Provides-Extra: portal
61
+ Requires-Dist: cd1-agent[rds,server]; extra == "portal"
62
+ Requires-Dist: authlib>=1.3.0; extra == "portal"
63
+ Requires-Dist: itsdangerous>=2.1.0; extra == "portal"
64
+ Requires-Dist: python-multipart>=0.0.6; extra == "portal"
65
+ Provides-Extra: lambda-layer
66
+ Requires-Dist: pydantic>=2.0.0; extra == "lambda-layer"
67
+ Requires-Dist: langchain-core>=0.1.0; extra == "lambda-layer"
68
+ Requires-Dist: langgraph>=0.0.30; extra == "lambda-layer"
69
+ Requires-Dist: boto3>=1.28.0; extra == "lambda-layer"
70
+ Provides-Extra: all
71
+ Requires-Dist: cd1-agent[pyod,rds,server,victoria,vllm]; extra == "all"
72
+
73
+ # CD1 Agent
74
+
75
+ AWS Lambda 기반 서버리스 멀티 에이전트 이상 탐지 및 자동 복구 플랫폼
76
+
77
+ ## Overview
78
+
79
+ CD1 Agent는 **4개의 독립적인 서브 에이전트**로 구성된 이상 탐지 플랫폼입니다. 각 에이전트는 MWAA(Airflow)에서 5분 주기로 호출되며, 독립적인 Step Functions 워크플로우를 통해 탐지 → 분석 → 복구 조치를 수행합니다.
80
+
81
+ ### 서브 에이전트 구성
82
+
83
+ | Agent | 대상 | 탐지 방식 | 설명 |
84
+ |-------|------|----------|------|
85
+ | **BDP Agent** | AWS 인프라 | CloudWatch Logs/Metrics | 로그 패턴, 메트릭 이상, 에러 스파이크 감지 |
86
+ | **HDSP Agent** | On-Prem K8s | Prometheus 메트릭 | Pod/Node 상태, CPU/Memory 이상, OOMKill 감지 |
87
+ | **Cost Agent** | AWS 비용 | Cost Explorer + Luminol | 비용 이상, 서비스별 급증, 근본 원인 분석 |
88
+ | **Drift Agent** | AWS 설정 | Git Baseline 비교 | 구성 드리프트, 보안 설정 변경 감지 |
89
+
90
+ ### Provider 구성
91
+
92
+ #### LLM Provider
93
+
94
+ | 환경 | Provider | 모델 | 용도 |
95
+ |------|----------|------|------|
96
+ | **On-Premise** | vLLM | 자체 호스팅 LLM | 프로덕션 분석 |
97
+ | **Public (Mock)** | Google Gemini | Gemini 2.5 Pro/Flash | 개발/테스트 |
98
+ | **로컬 테스트** | Mock LLM | 내장 Mock | AWS/LLM 없이 로직 테스트 |
99
+
100
+ #### AWS Provider
101
+
102
+ | 환경 | Provider | 용도 |
103
+ |------|----------|------|
104
+ | **Production** | AWS | 실제 AWS 서비스 호출 |
105
+ | **Public/로컬** | Mock | AWS 없이 전체 로직 테스트 |
106
+
107
+ ### 주요 기능
108
+
109
+ - **LangGraph Agent**: 동적 ReAct 루프 기반 분석 에이전트
110
+ - **주기적 로그 감지**: 5-10분 간격으로 CloudWatch 및 RDS 통합 로그 분석
111
+ - **AI 기반 근본 원인 분석**: vLLM 또는 Gemini를 활용한 ReAct 패턴 분석
112
+ - **승인 기반 실행** (분석 신뢰도 점수 기준):
112
+   - 0.5 이상: 승인 요청 후 실행
113
+   - 0.5 미만: 담당자 에스컬레이션
115
+ - **AWS 리소스 조정**: Lambda 재시작, RDS 파라미터 변경, Auto Scaling 조정
116
+ - **EventBridge 알림**: 외부 시스템 연동 (Slack, Teams 등)
117
+
118
+ ## Architecture
119
+
120
+ ### Multi-Agent Orchestration
121
+
122
+ ```
123
+ ┌─────────────────────────────────────────────────────────────────────────┐
124
+ │                     MWAA (Airflow DAGs) - 5분 주기                      │
125
+ └────────┬────────────────┬────────────────┬────────────────┬─────────────┘
126
+          │                │                │                │
127
+          ▼                ▼                ▼                ▼
128
+ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐
129
+ │   BDP Agent    │ │   HDSP Agent   │ │   Cost Agent   │ │  Drift Agent   │
130
+ │   (AWS 로그)   │ │  (Prometheus)  │ │ (Cost Explorer)│ │ (Git Baseline) │
131
+ ├────────────────┤ ├────────────────┤ ├────────────────┤ ├────────────────┤
132
+ │   Detection    │ │   Detection    │ │   Detection    │ │   Detection    │
133
+ │       ↓        │ │       ↓        │ │       ↓        │ │       ↓        │
134
+ │ Step Functions │ │ Step Functions │ │ Step Functions │ │ Step Functions │
135
+ │   (개별 WF)    │ │   (개별 WF)    │ │   (개별 WF)    │ │   (개별 WF)    │
136
+ │       ↓        │ │       ↓        │ │       ↓        │ │       ↓        │
137
+ │    Analysis    │ │    Analysis    │ │    Analysis    │ │    Analysis    │
138
+ │       ↓        │ │       ↓        │ │       ↓        │ │       ↓        │
139
+ │     Action     │ │     Action     │ │     Action     │ │     Action     │
140
+ └────────────────┘ └────────────────┘ └────────────────┘ └────────────────┘
141
+          │                │                │                │
142
+          └────────────────┴────────────────┴────────────────┘
143
+
144
+
145
+                      ┌──────────────────────────┐
146
+                      │      공통 컴포넌트       │
147
+                      │  - Analysis Agent (LLM)  │
148
+                      │  - Action Engine         │
149
+                      │  - EventBridge 알림      │
150
+                      └──────────────────────────┘
151
+ ```
152
+
153
+ **핵심 설계 원칙**:
154
+ - **독립적 워크플로우**: 각 에이전트는 개별 Step Functions 워크플로우 실행
155
+ - **공통 분석 엔진**: LangGraph 기반 ReAct 루프를 모든 에이전트가 공유
156
+ - **유연한 스케줄링**: MWAA DAG별 독립적인 실행 주기 설정 가능
157
+
158
+ ### 워크플로우 상세
159
+
160
+ ```mermaid
161
+ stateDiagram-v2
162
+ [*] --> DetectAnomalies
163
+ DetectAnomalies --> CheckAnomalies
164
+
165
+ CheckAnomalies --> NoAnomalies: No Issues
166
+ CheckAnomalies --> AnalyzeRootCause: Issues Found
167
+
168
+ NoAnomalies --> [*]
169
+
170
+ AnalyzeRootCause --> EvaluateConfidence
171
+
172
+ EvaluateConfidence --> RequestApproval: >= 0.5
173
+ EvaluateConfidence --> Escalate: < 0.5
174
+
175
+ RequestApproval --> CheckApproval
176
+ CheckApproval --> ExecuteApproved: Approved
177
+ CheckApproval --> Rejected: Rejected
178
+ ExecuteApproved --> Reflect
179
+
180
+ Reflect --> Success: Resolved
181
+ Reflect --> Replan: Needs Retry
182
+ Replan --> AnalyzeRootCause: Attempt < 3
183
+ Replan --> Escalate: Max Attempts
184
+
185
+ Escalate --> [*]
186
+ Rejected --> [*]
187
+ Success --> [*]
188
+ ```
189
+
190
+ ## Project Structure
191
+
192
+ ```
193
+ cd1-agent/
194
+ ├── docs/
195
+ │   ├── ARCHITECTURE.md  # 상세 아키텍처 문서
196
+ │   ├── PROMPTS.md  # 프롬프트 템플릿 설계
197
+ │   ├── COST_OPTIMIZATION.md  # 비용 최적화 전략
198
+ │   ├── IMPLEMENTATION_GUIDE.md  # 구현 가이드
199
+ │   ├── HDSP_DETECTION.md  # HDSP Agent 문서
200
+ │   ├── COST_ANOMALY_DETECTION.md  # Cost Agent 문서
201
+ │   └── CONFIG_DRIFT_DETECTION.md  # Drift Agent 문서
202
+ ├── src/
203
+ │   ├── common/  # 공통 코드
204
+ │   │   ├── handlers/  # 공통 핸들러 (base, analysis, remediation)
205
+ │   │   ├── services/  # 공통 서비스 (llm_client, aws_client, rds_client)
206
+ │   │   ├── models/  # 데이터 모델
207
+ │   │   ├── prompts/  # 프롬프트 템플릿
208
+ │   │   ├── agent/  # LangGraph Agent
209
+ │   │   └── chat/  # Interactive Chat
210
+ │   └── agents/  # Agent별 코드
211
+ │       ├── bdp/  # BDP Agent (AWS CloudWatch)
212
+ │       │   └── handler.py
213
+ │       ├── hdsp/  # HDSP Agent (Prometheus/K8s)
214
+ │       │   ├── handler.py
215
+ │       │   └── services/  # prometheus_client, anomaly_detector
216
+ │       ├── cost/  # Cost Agent (AWS Cost Explorer)
217
+ │       │   ├── handler.py
218
+ │       │   └── services/  # cost_explorer_client, anomaly_detector
219
+ │       └── drift/  # Drift Agent (GitLab Baseline)
220
+ │           ├── handler.py
221
+ │           └── services/  # config_fetcher, drift_detector, gitlab_client
222
+ ├── tests/
223
+ │   ├── common/  # 공통 코드 테스트
224
+ │   │   ├── agent/  # LangGraph 테스트
225
+ │   │   └── chat/  # Chat 테스트
226
+ │   └── agents/  # Agent별 테스트
227
+ │       ├── bdp/
228
+ │       ├── hdsp/
229
+ │       ├── cost/
230
+ │       └── drift/
231
+ └── dags/  # Airflow DAG 파일
232
+ ```
233
+
234
+ ## Quick Start
235
+
236
+ ### Prerequisites
237
+
238
+ - Python 3.12+
239
+ - AWS CLI configured
240
+
241
+ ### Installation
242
+
243
+ ```bash
244
+ # 1. Clone repository
245
+ git clone https://github.com/lks21c/cd1-agent.git
246
+ cd cd1-agent
247
+
248
+ # 2. Create virtual environment
249
+ python -m venv .venv
250
+ source .venv/bin/activate # Linux/Mac
251
+ # .venv\Scripts\activate # Windows
252
+
253
+ # 3. Install dependencies
254
+ pip install -r requirements.txt
255
+ ```
256
+
257
+ ### Configuration
258
+
259
+ #### Environment Variables
260
+
261
+ **Mock Mode (Public/Local Testing)**
262
+
263
+ | Variable | Description | Default |
264
+ |----------|-------------|---------|
265
+ | `AWS_MOCK` | AWS Mock 모드 활성화 (`true`/`false`) | `false` |
266
+ | `LLM_MOCK` | LLM Mock 모드 활성화 (`true`/`false`) | `false` |
267
+
268
+ **LLM Configuration**
269
+
270
+ | Variable | Description | Default |
271
+ |----------|-------------|---------|
272
+ | `LLM_PROVIDER` | LLM 제공자 (`vllm` 또는 `gemini`) | `vllm` |
273
+ | `VLLM_BASE_URL` | vLLM 서버 엔드포인트 (On-Prem) | `http://localhost:8000/v1` |
274
+ | `VLLM_MODEL_NAME` | vLLM 모델 이름 | Required (vllm 사용 시) |
275
+ | `GEMINI_API_KEY` | Gemini API 키 (Public Mock) | Required (gemini 사용 시) |
276
+ | `GEMINI_MODEL_ID` | Gemini 모델 ID | `gemini-2.5-flash` |
277
+
278
+ **AWS Configuration**
279
+
280
+ | Variable | Description | Default |
281
+ |----------|-------------|---------|
282
+ | `RDS_CLUSTER_ARN` | RDS Aurora Serverless 클러스터 ARN | Required (AWS 모드) |
283
+ | `RDS_SECRET_ARN` | RDS 접속 정보가 담긴 Secrets Manager ARN | Required (AWS 모드) |
284
+ | `RDS_DATABASE` | 데이터베이스 이름 | `unified_logs` |
285
+ | `DEDUP_TABLE` | DynamoDB 중복 제거 테이블 이름 | `bdp-anomaly-tracking` |
286
+
287
+ **Quick Start for Mock Mode**
288
+
289
+ ```bash
290
+ # AWS와 LLM 없이 로컬에서 로직 테스트
291
+ export AWS_MOCK=true
292
+ export LLM_MOCK=true
293
+ python -m examples.services.aws_client # AWS Mock 테스트
294
+ python -m examples.services.llm_client # LLM Mock 테스트
295
+ ```
296
+
297
+ #### DynamoDB Tables
298
+
299
+ | Table | Purpose | Key |
300
+ |-------|---------|-----|
301
+ | `bdp-anomaly-tracking` | 중복 제거 (TTL 7일) | `signature` |
302
+ | `bdp-workflow-state` | 워크플로우 상태 | `workflow_id`, `timestamp` |
303
+ | `bdp-action-history` | 복구 조치 감사 로그 | `action_id` |
304
+
305
+ ## Lambda Functions
306
+
307
+ ### Detection Lambda (에이전트별)
308
+
309
+ | Function | Memory | Timeout | Trigger | Description |
310
+ |----------|--------|---------|---------|-------------|
311
+ | `bdp-detection` | 512MB | 60s | MWAA DAG | AWS CloudWatch 로그/메트릭 이상 감지 |
312
+ | `bdp-hdsp-detection` | 512MB | 60s | MWAA DAG | On-Prem K8s Prometheus 메트릭 감지 |
313
+ | `bdp-cost-detection` | 512MB | 60s | MWAA DAG | AWS Cost Explorer 비용 이상 감지 |
314
+ | `bdp-drift-detection` | 512MB | 120s | MWAA DAG | AWS 설정 Git Baseline 드리프트 감지 |
315
+
316
+ ### 공통 Lambda
317
+
318
+ | Function | Memory | Timeout | Trigger | Description |
319
+ |----------|--------|---------|---------|-------------|
320
+ | `bdp-analysis` | 1024MB | 120s | Step Functions | LLM 기반 근본 원인 분석 |
321
+ | `bdp-action` | 512MB | 60s | Step Functions | 복구 조치 실행 |
322
+ | `bdp-approval` | 256MB | 30s | API Gateway | 승인 요청 처리 |
323
+
324
+ ## MWAA DAG 구성
325
+
326
+ | DAG | Schedule | Target Lambda | Description |
327
+ |-----|----------|---------------|-------------|
328
+ | `bdp_detection_dag` | `*/5 * * * *` | bdp-detection | AWS 로그/메트릭 감지 |
329
+ | `bdp_hdsp_detection_dag` | `*/5 * * * *` | bdp-hdsp-detection | K8s 장애 감지 |
330
+ | `bdp_cost_detection_dag` | `*/5 * * * *` | bdp-cost-detection | 비용 이상 감지 |
331
+ | `bdp_drift_detection_dag` | `*/5 * * * *` | bdp-drift-detection | 설정 드리프트 감지 |
332
+
333
+ ## Cost Estimation
334
+
335
+ ### Monthly Cost (~$11/month for 1M events, excluding LLM)
336
+
337
+ | Component | Cost |
338
+ |-----------|------|
339
+ | Lambda (ARM64) | ~$5 |
340
+ | Step Functions | ~$3 |
341
+ | DynamoDB (On-demand) | ~$2 |
342
+ | EventBridge | ~$1 |
343
+
344
+ ### LLM 비용
345
+
346
+ | Provider | 환경 | 비용 모델 |
347
+ |----------|------|----------|
348
+ | **vLLM (On-Prem)** | 프로덕션 | 자체 인프라 비용 (GPU 서버) |
349
+ | **Gemini 2.5 Pro** | Mock/개발 | ~$0.00125/1K input, ~$0.01/1K output |
350
+ | **Gemini 2.5 Flash** | Mock/개발 | ~$0.00015/1K input, ~$0.0006/1K output |
351
+
352
+ ### Cost Optimization Strategies
353
+
354
+ 1. **CloudWatch Field Indexing**: 67% 스캔 비용 감소
355
+ 2. **Hierarchical Summarization**: 80-90% 토큰 절감
356
+ 3. **ARM64/Graviton2**: 20-34% Lambda 비용 절감
357
+ 4. **Provisioned Concurrency**: Cold start 제거 (MWAA 트리거 사용 시)
358
+
359
+ ## Decision Flow
360
+
361
+ 모든 복구 조치는 **승인 후 실행** 방식으로 동작합니다.
362
+
363
+ | Confidence | Action | Use Case |
364
+ |------------|--------|----------|
365
+ | >= 0.5 | Request Approval | 분석 완료, 승인 요청 |
366
+ | < 0.5 | Escalate | 추가 분석 필요, 담당자 에스컬레이션 |
367
+
368
+ > **Note**: 자동 실행(Auto Execute) 기능은 현재 비활성화되어 있습니다. 모든 조치는 담당자 승인 후 실행됩니다.
369
+
370
+ ## 지원 복구 조치 (Supported Actions)
371
+
372
+ - `lambda_restart`: Lambda 함수 재시작
373
+ - `rds_parameter`: RDS 파라미터 변경
374
+ - `auto_scaling`: Auto Scaling 설정 조정
375
+ - `eventbridge_event`: 이벤트 발행 (알림)
376
+ - `investigate`: 추가 정보 수집 요청
377
+
378
+ ## Documentation
379
+
380
+ ### 시스템 문서
381
+ - [Architecture Guide](docs/ARCHITECTURE.md) - 상세 시스템 아키텍처
382
+ - [Prompt Templates](docs/PROMPTS.md) - AI 프롬프트 설계
383
+ - [Cost Optimization](docs/COST_OPTIMIZATION.md) - 비용 최적화 전략
384
+ - [Implementation Guide](docs/IMPLEMENTATION_GUIDE.md) - 단계별 구현 가이드
385
+
386
+ ### 에이전트별 문서
387
+ - [HDSP Detection](docs/HDSP_DETECTION.md) - On-Prem K8s 장애 감지 (HDSP Agent)
388
+ - [Cost Anomaly Detection](docs/COST_ANOMALY_DETECTION.md) - 비용 이상 탐지 (Cost Agent)
389
+ - [Config Drift Detection](docs/CONFIG_DRIFT_DETECTION.md) - 설정 드리프트 감지 (Drift Agent)
390
+
391
+ ## Development
392
+
393
+ ### Running Tests
394
+
395
+ ```bash
396
+ # Unit tests
397
+ pytest tests/unit/
398
+
399
+ # Integration tests
400
+ pytest tests/integration/
401
+
402
+ # All tests with coverage
403
+ pytest --cov=src tests/
404
+ ```
405
+
406
+ ### Code Quality
407
+
408
+ ```bash
409
+ # Linting
410
+ ruff check src/
411
+
412
+ # Type checking
413
+ mypy src/
414
+
415
+ # Formatting
416
+ black src/
417
+ ```
418
+
419
+ ## Contributing
420
+
421
+ 1. Fork the repository
422
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
423
+ 3. Commit your changes (`git commit -m 'feat: Add amazing feature'`)
424
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
425
+ 5. Open a Pull Request
426
+
427
+ ## License
428
+
429
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.