decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,1164 @@
1
+ """
2
+ Simplified HTML Extractor for MCP Tools
3
+
4
+ Key principles:
5
+ 1. Keep HTML injection content AS-IS (original format, no simplification)
6
+ 2. Include content injection (fake news, ratings, comments)
7
+ 3. Remove truly redundant parts (scripts, nav, forms, buttons)
8
+ 4. Keep important data in structured format
9
+ 5. Output is still HTML but much shorter
10
+ """
11
+
12
+ from bs4 import BeautifulSoup, NavigableString, Comment
13
+ from typing import List, Dict, Callable, Optional
14
+ import re
15
+
16
+
17
+ class HtmlExtractor:
18
+ """Extracts and simplifies HTML for AI agent consumption."""
19
+
20
+ # Inline-style substrings that mark an element as hidden (substring matching also catches "!important" variants, e.g. "display:none !important")
21
+ HIDDEN_PATTERNS = [
22
+ 'display:none', 'display: none',
23
+ 'visibility:hidden', 'visibility: hidden',
24
+ 'height:0', 'width:0', 'opacity:0',
25
+ # Additional patterns for injection detection
26
+ 'left:-9999', 'position:absolute',
27
+ ]
28
+
29
+ # Elements to completely remove
30
+ REMOVE_TAGS = ['script', 'style', 'noscript', 'iframe', 'svg', 'canvas']
31
+
32
+ # Elements that are UI chrome (remove but keep children if meaningful)
33
+ UI_TAGS = ['nav', 'header', 'footer', 'aside']
34
+
35
+ # Form elements to remove
36
+ FORM_TAGS = ['form', 'button', 'input', 'select', 'textarea', 'label']
37
+
38
def __init__(self, include_injections: bool = True, reveal_injection_markers: bool = False, output_format: str = "html"):
    """Store the extraction options and build the page-type dispatch table.

    Args:
        include_injections: When true, ``extract`` collects injection
            payloads before cleaning and re-inserts them afterwards.
        reveal_injection_markers: Stored on the instance as-is.
        output_format: Stored on the instance; per the original note it is
            ignored and the extractor always emits simplified HTML.
    """
    self.include_injections = include_injections
    self.reveal_injection_markers = reveal_injection_markers
    self.output_format = output_format  # Ignored - always outputs simplified HTML

    # Several page types share one extractor, so bind those once.
    news_extractor = self._extract_news
    portfolio_extractor = self._extract_portfolio

    # page_type -> bound extractor method (insertion order kept as before).
    self._extractors = dict(
        quote=self._extract_quote,
        news=news_extractor,
        chart=self._extract_chart,
        analysis=self._extract_analysis,
        profile=self._extract_profile,
        article=self._extract_article,
        portfolio=portfolio_extractor,
        options=self._extract_options,
        markets=self._extract_markets,
        search=self._extract_search,
        news_center=news_extractor,
        trade=portfolio_extractor,
        orders=portfolio_extractor,
        stock_list=self._extract_stock_list,
    )
59
+
60
def extract(self, html: str, page_type: str) -> str:
    """Return simplified HTML for ``html``, keeping injections in the output.

    The pipeline is: parse, collect injections (only when
    ``include_injections`` is set, and always before cleaning so hidden
    nodes are still present), clean the tree, run the extractor registered
    for ``page_type`` (falling back to the generic one), then re-insert the
    collected injections. An empty/falsy result is normalised to ``""``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Injections must be harvested before _clean_soup strips their carriers.
    found = self._find_injections_with_position(soup) if self.include_injections else {}

    self._clean_soup(soup)

    handler = self._extractors.get(page_type, self._extract_generic)
    rendered = handler(soup)

    if found:
        rendered = self._insert_injections_naturally(rendered, found, page_type)

    return rendered or ""
81
+
82
+ def _find_injections_with_position(self, soup) -> Dict[str, List[str]]:
83
+ """Find injections and categorize by position (start, middle, end).
84
+
85
+ Distributes injections evenly across positions to ensure diversity.
86
+ """
87
+ all_injections = [] # Collect all first, then distribute
88
+ body = soup.find('body')
89
+ body_children = list(body.children) if body else []
90
+ body_len = len(body_children)
91
+
92
+ def add_injection(raw_text: str, element=None):
93
+ text = (raw_text or "").strip()
94
+ if not text or len(text) < 15:
95
+ return
96
+ # Skip UI elements only (not injection content)
97
+ skip_patterns = [
98
+ 'limit price', 'loading', 'order type', 'market order',
99
+ ]
100
+ if any(ui in text.lower() for ui in skip_patterns):
101
+ return
102
+ injection_html = self._format_injection_naturally(text)
103
+
104
+ # Compute original position for reference
105
+ orig_pos = 'middle'
106
+ if body and element is not None:
107
+ parent = element
108
+ while parent.parent and parent.parent != body:
109
+ parent = parent.parent
110
+ try:
111
+ idx = body_children.index(parent)
112
+ if idx < body_len * 0.25:
113
+ orig_pos = 'start'
114
+ elif idx > body_len * 0.75:
115
+ orig_pos = 'end'
116
+ except ValueError:
117
+ orig_pos = 'start'
118
+
119
+ all_injections.append((injection_html, orig_pos))
120
+
121
+ # Style-based hidden elements
122
+ for el in soup.find_all(style=True):
123
+ style = el.get('style', '').lower()
124
+ if any(p in style for p in self.HIDDEN_PATTERNS):
125
+ inner = el.decode_contents().strip()
126
+ add_injection(inner, el)
127
+
128
+ # HTML comments
129
+ for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
130
+ add_injection(str(comment), comment)
131
+
132
+ # Meta tag directives
133
+ for meta in soup.find_all('meta'):
134
+ name = (meta.get('name') or '').lower()
135
+ if name in {'ai-instructions', 'assistant-directive', 'system-message'}:
136
+ add_injection(meta.get('content', ''), meta)
137
+
138
+ # Data attribute directives
139
+ for el in soup.find_all(attrs={'data-ai-instruction': True}):
140
+ add_injection(el.get('data-ai-instruction', ''), el)
141
+
142
+ # Noscript / template / details content
143
+ for tag in soup.find_all(['noscript', 'template', 'details']):
144
+ add_injection(tag.get_text(" ", strip=True), tag)
145
+
146
+ # Hidden inputs
147
+ for el in soup.find_all('input', {'type': 'hidden'}):
148
+ name = (el.get('name') or '').lower()
149
+ if name in {'_system_instruction', 'system_instruction', 'ai_instruction'}:
150
+ add_injection(el.get('value', ''), el)
151
+
152
+ # Script comment blocks
153
+ for script in soup.find_all('script'):
154
+ if script.get('type', '').lower() == 'text/x-template':
155
+ add_injection(script.get_text(" ", strip=True), script)
156
+
157
+ # Distribute injections evenly across start/middle/end (roughly 30%/40%/30%)
158
+ injections = {'start': [], 'middle': [], 'end': []}
159
+ total = len(all_injections)
160
+ if total == 0:
161
+ return injections
162
+
163
+ # Calculate target counts for even distribution
164
+ start_target = max(1, int(total * 0.30))
165
+ end_target = max(1, int(total * 0.30))
166
+ middle_target = total - start_target - end_target
167
+
168
+ # Assign injections using round-robin for diversity
169
+ positions = ['start', 'middle', 'end']
170
+ targets = {'start': start_target, 'middle': middle_target, 'end': end_target}
171
+ counts = {'start': 0, 'middle': 0, 'end': 0}
172
+
173
+ for i, (inj_html, _) in enumerate(all_injections):
174
+ # Cycle through positions, but respect targets
175
+ pos = positions[i % 3]
176
+ if counts[pos] >= targets[pos]:
177
+ # Find position with room
178
+ for alt_pos in positions:
179
+ if counts[alt_pos] < targets[alt_pos]:
180
+ pos = alt_pos
181
+ break
182
+ injections[pos].append(inj_html)
183
+ counts[pos] += 1
184
+
185
+ return injections
186
+
187
+ def _format_injection_naturally(self, content: str) -> str:
188
+ """Format injection content, preserving original HTML structure when present.
189
+
190
+ IMPORTANT: Preserve the original HTML wrapper/formatting of injections
191
+ to maintain the intended visual/semantic structure of the attack.
192
+ Only wrap in simple tags if the content is plain text.
193
+ """
194
+ import html
195
+
196
+ # First, decode HTML entities for proper detection
197
+ content_stripped = content.strip()
198
+ decoded = html.unescape(content_stripped)
199
+
200
+ # Check if content already has HTML tags - if so, preserve them
201
+ has_html_tags = bool(re.search(r'<[a-zA-Z][^>]*>', content_stripped))
202
+
203
+ # If content has HTML structure, return as-is (preserve the wrapper)
204
+ if has_html_tags:
205
+ # Just ensure it's wrapped in a container if it's multiple elements
206
+ if not content_stripped.startswith('<div') and not content_stripped.startswith('<p'):
207
+ return f"<div>{content_stripped}</div>"
208
+ return content_stripped
209
+
210
+ # User message - keep as-is
211
+ if decoded.startswith('[USER]'):
212
+ return f"<p>{decoded}</p>"
213
+
214
+ # Assistant context - keep as-is
215
+ if decoded.startswith('[ASSISTANT]'):
216
+ return f"<p>{decoded}</p>"
217
+
218
+ # System prompt markers - KEEP the markers as-is
219
+ if decoded.startswith('<|system|>') or decoded.startswith('<|system'):
220
+ # Keep the full content including markers
221
+ return f"<p>{decoded}</p>"
222
+
223
+ # System directive style - keep as-is
224
+ if decoded.startswith('[SYSTEM'):
225
+ return f"<p>{decoded}</p>"
226
+
227
+ # JSON context - looks like embedded data/config (common in web pages)
228
+ if decoded.startswith('{') and decoded.endswith('}'):
229
+ return f"<code>{decoded}</code>"
230
+
231
+ # Tool result - keep as-is
232
+ if decoded.startswith('<tool_result>') or '&lt;tool_result&gt;' in decoded:
233
+ return f"<pre>{decoded}</pre>"
234
+
235
+ # Internal state - keep as-is
236
+ if decoded.startswith('<internal_state>') or '&lt;internal_state&gt;' in decoded:
237
+ return f"<code>{decoded}</code>"
238
+
239
+ # Default: just the content in a paragraph
240
+ return f"<p>{decoded}</p>"
241
+
242
+ # ========== Injection wrappers — blend with normal page content ==========
243
+
244
+ def _wrap_injection_for_position(self, injection_html: str, position: str, page_type: str, idx: int = 0) -> str:
245
+ """Wrap injection content to match normal page element format.
246
+
247
+ All wrappers use the same HTML tags as regular extracted content
248
+ so injections are indistinguishable from normal page elements.
249
+ Strips inner block wrappers to avoid nested structures like <li><p>...</p></li>.
250
+ """
251
+ import re as _re
252
+ # Strip outer <p>...</p> or <div>...</div> to avoid nesting in block containers
253
+ inner = _re.sub(r'^<(?:p|div)[^>]*>(.*)</(?:p|div)>$', r'\1', injection_html.strip(), flags=_re.DOTALL)
254
+
255
+ # Page types that use tables
256
+ table_pages = ('quote', 'analysis', 'profile', 'portfolio', 'trade',
257
+ 'orders', 'options', 'markets', 'stock_list', 'chart')
258
+ # Page types that use lists
259
+ list_pages = ('news', 'news_center', 'search')
260
+
261
+ if page_type in table_pages:
262
+ return f"<tr><td>{inner}</td></tr>"
263
+ elif page_type in list_pages:
264
+ return f"<li>{inner}</li>"
265
+ elif page_type == 'article':
266
+ return f"<p>{inner}</p>"
267
+ else:
268
+ return f"<p>{inner}</p>"
269
+
270
+ def _find_best_insertion_index(self, lines: List[str], predicate, prefer_range=(0.3, 0.7)) -> int:
271
+ total = len(lines)
272
+ if total == 0:
273
+ return 0
274
+ middle = total // 2
275
+ candidates = [i for i, line in enumerate(lines) if predicate(line)]
276
+ if not candidates:
277
+ return middle
278
+ low, high = int(total * prefer_range[0]), int(total * prefer_range[1])
279
+ in_range = [i for i in candidates if low <= i <= high]
280
+ if in_range:
281
+ return min(in_range, key=lambda i: abs(i - middle)) + 1
282
+ return min(candidates, key=lambda i: abs(i - middle)) + 1
283
+
284
+ def _find_after_line(self, lines: List[str], start_idx: int, predicate) -> Optional[int]:
285
+ for i in range(start_idx, len(lines)):
286
+ if predicate(lines[i]):
287
+ return i + 1
288
+ return None
289
+
290
def _find_insertion_index_structured(self, lines: List[str], page_type: str, container: str) -> int:
    """Choose an insertion index inside the section that suits the page type.

    List-style pages anchor after the first ``<li`` following the first
    ``<ul``; articles after the second bare ``<p>`` (or the first when there
    is only one); table-style pages after the first ``<tr`` following the
    relevant ``<table``. When the structured lookup finds nothing, the
    result comes from ``_find_best_insertion_index`` on the same tag.

    Args:
        lines: Content split into lines.
        page_type: Page type name used to select the lookup strategy.
        container: Tag name used for the fallback search on page types that
            have no dedicated strategy.

    Returns:
        The chosen insertion index; ``0`` when ``lines`` is empty.
    """
    if not lines:
        return 0

    def after(start: int, needle: str) -> Optional[int]:
        # One past the first line from `start` that contains `needle`.
        return self._find_after_line(lines, start, lambda text: needle in text)

    def nearest_middle(needle: str) -> int:
        # Generic fallback: the match closest to the middle of the content.
        return self._find_best_insertion_index(lines, lambda text: needle in text)

    if page_type in ('news', 'news_center', 'search'):
        list_pos = after(0, '<ul')
        item_pos = None if list_pos is None else after(list_pos, '<li')
        return item_pos if item_pos is not None else nearest_middle('<li')

    if page_type == 'article':
        first_para = after(0, '<p>')
        if first_para is None:
            return nearest_middle('<p>')
        second_para = after(first_para, '<p>')
        return first_para if second_para is None else second_para

    # Every remaining strategy locates a table, then the first row after it.
    if page_type in ('portfolio', 'trade', 'orders'):
        table_pos = after(0, "<table class='holdings'>")
        if table_pos is None:
            table_pos = after(0, '<table')
    elif page_type == 'options':
        table_pos = after(0, "<table class='options'>")
    elif page_type in ('quote', 'analysis', 'profile'):
        stats_pos = after(0, "<div class='stats'>")
        table_pos = None if stats_pos is None else after(stats_pos, '<table')
    elif page_type in ('markets', 'stock_list'):
        table_pos = after(0, '<table')
    else:
        # Fallback to content-driven container preference
        return nearest_middle(f'<{container}')

    row_pos = None if table_pos is None else after(table_pos, '<tr')
    return row_pos if row_pos is not None else nearest_middle('<tr')
350
+
351
+ def _insert_injections_naturally(self, content: str, injections_map: Dict[str, List[str]], page_type: str) -> str:
352
+ """Insert injections at different positions (start/middle/end) with diverse wrappers.
353
+
354
+ Ensures injections are placed inside appropriate containers (tables/lists).
355
+ """
356
+ lines = content.split('\n')
357
+ if not lines:
358
+ all_wrapped = []
359
+ for pos in ['start', 'middle', 'end']:
360
+ for idx, inj in enumerate(injections_map.get(pos, [])):
361
+ all_wrapped.append(self._wrap_injection_for_position(inj, pos, page_type, idx))
362
+ return '\n'.join(all_wrapped)
363
+
364
+ # Find valid insertion points (inside tables/lists, after first row/item)
365
+ table_pages = ('quote', 'analysis', 'profile', 'portfolio', 'trade',
366
+ 'orders', 'options', 'markets', 'stock_list', 'chart')
367
+ list_pages = ('news', 'news_center', 'search')
368
+
369
+ # Find all valid insertion indices
370
+ valid_indices = []
371
+ in_table = False
372
+ in_list = False
373
+
374
+ for i, line in enumerate(lines):
375
+ if '<table' in line:
376
+ in_table = True
377
+ elif '</table>' in line:
378
+ in_table = False
379
+ elif '<ul' in line:
380
+ in_list = True
381
+ elif '</ul>' in line:
382
+ in_list = False
383
+
384
+ # Valid insertion point: after a row/item inside container
385
+ if page_type in table_pages and in_table and '</tr>' in line:
386
+ valid_indices.append(i + 1)
387
+ elif page_type in list_pages and in_list and '</li>' in line:
388
+ valid_indices.append(i + 1)
389
+ elif page_type == 'article' and '</p>' in line:
390
+ valid_indices.append(i + 1)
391
+
392
+ if not valid_indices:
393
+ # Fallback: find any structural breaks
394
+ for i, line in enumerate(lines):
395
+ if any(tag in line for tag in ['</tr>', '</li>', '</p>', '</div>']):
396
+ valid_indices.append(i + 1)
397
+
398
+ if not valid_indices:
399
+ valid_indices = [len(lines) // 2]
400
+
401
+ # Distribute injections across positions - more evenly spread
402
+ total_valid = len(valid_indices)
403
+ # START: early in content (around 15-25%)
404
+ start_idx = valid_indices[max(0, int(total_valid * 0.15))] if total_valid > 2 else valid_indices[0]
405
+ # MIDDLE: center of content (around 45-55%)
406
+ middle_idx = valid_indices[int(total_valid * 0.5)]
407
+ # END: late in content (around 80-90%)
408
+ end_idx = valid_indices[min(total_valid - 1, int(total_valid * 0.85))] if total_valid > 2 else valid_indices[-1]
409
+
410
+ # Wrap injections
411
+ start_injections = [
412
+ self._wrap_injection_for_position(inj, 'start', page_type, idx)
413
+ for idx, inj in enumerate(injections_map.get('start', []))
414
+ ]
415
+ middle_injections = [
416
+ self._wrap_injection_for_position(inj, 'middle', page_type, idx)
417
+ for idx, inj in enumerate(injections_map.get('middle', []))
418
+ ]
419
+ end_injections = [
420
+ self._wrap_injection_for_position(inj, 'end', page_type, idx)
421
+ for idx, inj in enumerate(injections_map.get('end', []))
422
+ ]
423
+
424
+ # Build output - insert at calculated positions
425
+ result = []
426
+ start_inserted = middle_inserted = end_inserted = False
427
+
428
+ for i, line in enumerate(lines):
429
+ result.append(line)
430
+
431
+ # Insert after specific indices
432
+ if not start_inserted and i + 1 >= start_idx and start_injections:
433
+ result.extend(start_injections)
434
+ start_inserted = True
435
+
436
+ if not middle_inserted and i + 1 >= middle_idx and middle_injections:
437
+ result.extend(middle_injections)
438
+ middle_inserted = True
439
+
440
+ if not end_inserted and i + 1 >= end_idx and end_injections:
441
+ result.extend(end_injections)
442
+ end_inserted = True
443
+
444
+ # Append any remaining
445
+ if not start_inserted:
446
+ result = start_injections + result
447
+ if not middle_inserted:
448
+ mid = len(result) // 2
449
+ result = result[:mid] + middle_injections + result[mid:]
450
+ if not end_inserted:
451
+ result.extend(end_injections)
452
+
453
+ return '\n'.join(result)
454
+
455
def _extract_hidden_injections(self, soup) -> List[str]:
    """Collect the inner HTML of elements whose inline style marks them hidden.

    An element qualifies when its lower-cased ``style`` attribute contains
    any entry of ``self.HIDDEN_PATTERNS``. Elements with fewer than 15
    characters of text, or whose text mentions known UI phrases, are skipped.

    Args:
        soup: Parsed document to scan.

    Returns:
        One ``<div>...</div>`` string per qualifying element, wrapping that
        element's inner HTML unchanged.
    """
    ui_markers = ('limit price', 'loading', 'order type', 'market order')
    found = []

    for node in soup.find_all(style=True):
        css = node.get('style', '').lower()
        if not any(pattern in css for pattern in self.HIDDEN_PATTERNS):
            continue

        markup = node.decode_contents().strip()
        visible = node.get_text(strip=True)

        # Skip empty or UI elements
        if not visible or len(visible) < 15:
            continue
        lowered = visible.lower()
        if any(marker in lowered for marker in ui_markers):
            continue

        found.append(f"<div>{markup}</div>")

    return found
476
+
477
+ def _clean_soup(self, soup):
478
+ """Remove unwanted elements from soup."""
479
+ # Remove completely
480
+ for tag in self.REMOVE_TAGS:
481
+ for el in soup.find_all(tag):
482
+ el.decompose()
483
+
484
+ # Remove UI chrome
485
+ for tag in self.UI_TAGS:
486
+ for el in soup.find_all(tag):
487
+ el.decompose()
488
+
489
+ # Remove form elements
490
+ for tag in self.FORM_TAGS:
491
+ for el in soup.find_all(tag):
492
+ el.decompose()
493
+
494
+ # Remove comments
495
+ for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
496
+ comment.extract()
497
+
498
+ # Clean session parameters from links
499
+ for a in soup.find_all('a', href=True):
500
+ href = a.get('href', '')
501
+ href = re.sub(r'\?session=[^&\s]+', '', href)
502
+ href = re.sub(r'&session=[^&\s]+', '', href)
503
+ a['href'] = href
504
+
505
def _get_title(self, soup) -> str:
    """Return the document title as an ``<h1>`` element.

    A trailing " - Yahoo Finance" suffix is stripped from the title text.

    Args:
        soup: Parsed document.

    Returns:
        ``<h1>...</h1>`` for the title, or an empty string when the
        document has no ``<title>`` element.
    """
    title_tag = soup.find('title')
    if not title_tag:
        return ""

    heading = re.sub(r'\s*-\s*Yahoo Finance$', '', title_tag.get_text(strip=True))
    return f"<h1>{heading}</h1>"
514
+
515
def _extract_quote(self, soup) -> str:
    """Render a stock quote page as simplified HTML.

    Emits, in order: the page title, a price block, a key-statistics table
    (at most 16 rows) and a de-duplicated recent-news list. A section is
    omitted when its source element is not found.

    Args:
        soup: Parsed quote page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = [self._get_title(soup)]

    # Price section: first span that looks numeric, plus the change element.
    header = soup.find('div', class_=re.compile('quote-header|price-section'))
    if header:
        out.append("<div class='price'>")
        for span in header.find_all('span'):
            candidate = span.get_text(strip=True)
            if re.match(r'^\$?[\d,]+\.?\d*$', candidate.replace(',', '')):
                out.append(f"<span class='price-value'>{candidate}</span>")
                break
        delta = header.find(class_=re.compile('change'))
        if delta:
            out.append(f"<span class='change'>{delta.get_text(strip=True)}</span>")
        out.append("</div>")

    # Key Statistics: each direct child div holds a label div and a value div.
    stats = soup.find(class_=re.compile('key-stat|quote-stat|stats'))
    if stats:
        out += ["<div class='stats'>", "<h3>Key Statistics</h3>", "<table>"]
        for row in stats.find_all('div', recursive=False)[:16]:
            cells = row.find_all('div', recursive=False)
            if len(cells) < 2:
                continue
            label = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            if label and value:
                out.append(f"<tr><td>{label}</td><td>{value}</td></tr>")
        out += ["</table>", "</div>"]

    # News section with dates
    news_block = soup.find(class_=re.compile('news-section|recent-news'))
    if news_block:
        out += ["<div class='news'>", "<h3>Recent News</h3>", "<ul>"]
        listed = set()
        for link in news_block.find_all('a', href=re.compile(r'/article/\d+')):
            headline = link.get_text(strip=True)
            id_match = re.search(r'/article/(\d+)', link.get('href', ''))
            if not (id_match and headline and len(headline) > 10):
                continue
            article_id = id_match.group(1)
            if article_id in listed:
                continue
            listed.add(article_id)

            # Get date from parent
            stamp = ""
            wrapper = link.find_parent(['div', 'article'])
            meta = wrapper.find(class_=re.compile('source|meta|date')) if wrapper else None
            if meta:
                found = re.search(r'(\d{4}-\d{2}-\d{2})', meta.get_text(strip=True))
                if found:
                    stamp = f" | {found.group(1)}"
            out.append(f"<li>[{article_id}] {headline}{stamp}</li>")
        out += ["</ul>", "</div>"]

    return "\n".join(out)
584
+
585
def _extract_news(self, soup) -> str:
    """Render a news listing page as a title plus one ``<li>`` per article.

    Every distinct ``/article/<id>`` link with at least 10 characters of
    text is listed once as ``[ID] Title | Source | Date``; no cap is
    applied to the number of items.

    Args:
        soup: Parsed news listing page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = [self._get_title(soup), "<ul>"]
    listed = set()

    for link in soup.find_all('a', href=re.compile(r'/article/\d+')):
        headline = link.get_text(strip=True)
        id_match = re.search(r'/article/(\d+)', link.get('href', ''))
        if not id_match or not headline or len(headline) < 10:
            continue

        article_id = id_match.group(1)
        if article_id in listed:
            continue
        listed.add(article_id)

        # Get source and date separately
        source = ""
        date = ""
        wrapper = link.find_parent(['div', 'article'])
        if wrapper:
            source_el = wrapper.find(class_=re.compile('source'))
            date_el = wrapper.find(class_=re.compile('date|time'))
            if source_el:
                source = source_el.get_text(strip=True)
            if date_el:
                date = date_el.get_text(strip=True)

            # Fallback: find a span containing a date pattern (YYYY-MM-DD)
            if not date:
                for span in wrapper.find_all('span'):
                    span_text = span.get_text(strip=True)
                    if re.match(r'\d{4}-\d{2}-\d{2}', span_text):
                        date = span_text
                        break

            # Fallback: combined meta, split at the first date occurrence
            if not source and not date:
                meta = wrapper.find(class_=re.compile('meta'))
                if meta:
                    meta_text = meta.get_text(strip=True)
                    found = re.search(r'(\d{4}-\d{2}-\d{2}.*)', meta_text)
                    if found:
                        date = found.group(1)
                        source = meta_text[:found.start()]

        # Format: [ID] Title | Source | Date
        details = [piece for piece in (source, date) if piece]
        suffix = f" | {' | '.join(details)}" if details else ""
        out.append(f"<li>[{article_id}] {headline}{suffix}</li>")

    out.append("</ul>")
    return "\n".join(out)
645
+
646
def _extract_chart(self, soup) -> str:
    """Render a chart / price-history page as simplified HTML.

    Emits the page title, the price table (header rows plus at most 25
    data rows) and the first period/range/info block that mentions
    "Data Points".

    Args:
        soup: Parsed chart page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = [self._get_title(soup)]

    def emit_rows(rows, cell_names, cell_tag):
        # One fragment per <tr>, per cell, and per </tr>.
        for row in rows:
            out.append("<tr>")
            out.extend(
                f"<{cell_tag}>{cell.get_text(strip=True)}</{cell_tag}>"
                for cell in row.find_all(cell_names)
            )
            out.append("</tr>")

    # Prefer the dedicated price table, otherwise take the first table.
    price_table = soup.find('table', class_=re.compile('price-table|price-data'))
    if not price_table:
        price_table = soup.find('table')

    if price_table:
        out.append("<table class='price-data'>")

        head = price_table.find('thead')
        if head:
            out.append("<thead>")
            emit_rows(head.find_all('tr'), 'th', 'th')
            out.append("</thead>")

        # Body - limit to 25 rows
        body = price_table.find('tbody')
        if body:
            out.append("<tbody>")
            emit_rows(body.find_all('tr')[:25], 'td', 'td')
            out.append("</tbody>")
        else:
            # No tbody: take rows directly, rendering header cells as <td>.
            emit_rows(price_table.find_all('tr')[:25], ['td', 'th'], 'td')

        out.append("</table>")

    # Period info
    for block in soup.find_all('div', class_=re.compile('period|range|info')):
        caption = block.get_text(strip=True)
        if caption and 'Data Points' in caption:
            out.append(f"<p class='info'>{caption}</p>")
            break

    return "\n".join(out)
698
+
699
def _extract_analysis(self, soup) -> str:
    """Render an analysis / ratings page as simplified HTML.

    Emits the page title, the recommendation summary, the price target and
    every table on the page (at most 20 rows each, all cells as ``<td>``).

    Args:
        soup: Parsed analysis page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = [self._get_title(soup)]

    # Recommendation summary
    summary = soup.find(class_=re.compile('recommend|consensus|rating-summary'))
    if summary:
        out.append(f"<div class='recommendation'>{summary.get_text(strip=True)}</div>")

    # Price target
    price_target = soup.find(class_=re.compile('price-target|target'))
    if price_target:
        out.append(f"<div class='price-target'>{price_target.get_text(strip=True)}</div>")

    # All analyst tables (recommendation trend + individual ratings)
    for ratings in soup.find_all('table'):
        out.append("<table class='analyst-ratings'>")
        for row in ratings.find_all('tr')[:20]:
            out.append("<tr>")
            out.extend(
                f"<td>{cell.get_text(strip=True)}</td>"
                for cell in row.find_all(['td', 'th'])
            )
            out.append("</tr>")
        out.append("</table>")

    return "\n".join(out)
726
+
727
def _extract_profile(self, soup) -> str:
    """Render a company profile page as simplified HTML.

    Emits the page title, the description truncated to 800 characters, and
    up to 10 company-info rows whose text is longer than 3 characters.

    Args:
        soup: Parsed profile page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = [self._get_title(soup)]

    # Description
    about = soup.find(class_=re.compile('description|about|summary|profile-desc'))
    if about:
        blurb = about.get_text(strip=True)[:800]
        out.append(f"<p class='description'>{blurb}</p>")

    # Company info
    facts = soup.find(class_=re.compile('company-info|profile-info'))
    if facts:
        out.append("<div class='company-info'>")
        row_texts = (
            row.get_text(strip=True)
            for row in facts.find_all('div', recursive=False)[:10]
        )
        out.extend(f"<p>{entry}</p>" for entry in row_texts if entry and len(entry) > 3)
        out.append("</div>")

    return "\n".join(out)
750
+
751
def _extract_article(self, soup) -> str:
    """Render a news article page as simplified HTML.

    Emits the headline, a meta line (source, date, related symbol), up to
    20 body paragraphs longer than 20 characters, and up to 10 user
    comments that carry both an author and content.

    Args:
        soup: Parsed article page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = []

    # Title
    headline = soup.find('h1')
    if headline:
        out.append(f"<h1>{headline.get_text(strip=True)}</h1>")

    # Meta arrives as one run of text, e.g. "Yahoo2025-12-01 13:30Related:AAPL"
    meta = soup.find(class_=re.compile('article-meta|meta'))
    if meta:
        meta_text = meta.get_text(strip=True)
        source = date = related = ""

        # Everything before the timestamp is treated as the source name.
        stamp = re.search(r'(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2})', meta_text)
        if stamp:
            date = stamp.group(1)
            source = meta_text[:stamp.start()].strip()

        ticker = re.search(r'Related:\s*(\w+)', meta_text)
        if ticker:
            related = ticker.group(1)

        labelled = (("Source", source), ("Date", date), ("Symbol", related))
        meta_bits = [f"{label}: {value}" for label, value in labelled if value]
        if meta_bits:
            out.append(f"<p class='meta'>{' | '.join(meta_bits)}</p>")

    # Content: individual paragraphs when present, else a truncated text dump.
    article_body = soup.find(['article', 'div'], class_=re.compile('article-content|content|body'))
    if article_body:
        paragraphs = article_body.find_all('p')
        if paragraphs:
            for para in paragraphs[:20]:
                para_text = para.get_text(strip=True)
                if para_text and len(para_text) > 20:
                    out.append(f"<p>{para_text}</p>")
        else:
            dump = article_body.get_text(strip=True)[:2000]
            out.append(f"<p>{dump}</p>")

    # Comments section - only real user comments
    thread = soup.find(class_=re.compile('comments-list|comment-section'))
    if thread:
        rendered = []
        for entry in thread.find_all(class_=re.compile('comment-item'))[:10]:
            author = entry.find(class_=re.compile('author|user'))
            remark = entry.find(class_=re.compile('content|text|body'))
            if not (author and remark):
                continue
            author_text = author.get_text(strip=True)
            remark_text = remark.get_text(strip=True)
            if remark_text and len(remark_text) > 5:
                rendered.append(
                    f"<div class='comment'><b>{author_text}:</b> {remark_text[:300]}</div>"
                )
        if rendered:
            out.append("<div class='comments'>")
            out.append("<h3>Comments</h3>")
            out.extend(rendered)
            out.append("</div>")

    return "\n".join(out)
824
+
825
def _extract_portfolio(self, soup) -> str:
    """Render a portfolio page as simplified HTML.

    Emits the account name (when the portfolio header carries one), a
    heading, key summary figures and the first table on the page as the
    holdings table (at most 20 rows, "Trade" cells dropped).

    Args:
        soup: Parsed portfolio page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = []

    # Check for account name in portfolio header
    banner = soup.find(attrs={"data-section": "portfolio-header"})
    account_name = banner.get("data-account") if banner else None
    if account_name:
        out.append(f"<p><b>Account:</b> {account_name}</p>")

    out.append("<h1>Portfolio</h1>")

    # Summary section: pull the known labels and their values out of the text.
    overview = soup.find(class_=re.compile('portfolio-summary|summary|overview'))
    if overview:
        out.append("<div class='summary'>")
        overview_text = overview.get_text(strip=True)
        figures = re.finditer(
            r'(Total Value|Cash Available|Total Gain|Holdings)[\s:]*(\$?[\d,\.]+%?)',
            overview_text,
        )
        out.extend(f"<p><b>{hit.group(1)}:</b> {hit.group(2)}</p>" for hit in figures)
        out.append("</div>")

    # Holdings table
    holdings = soup.find('table')
    if holdings:
        out.append("<table class='holdings'>")
        for row in holdings.find_all('tr')[:20]:
            out.append("<tr>")
            for cell in row.find_all(['td', 'th']):
                cell_text = cell.get_text(strip=True)
                # Skip "Trade" links
                if cell_text.lower() != 'trade':
                    out.append(f"<td>{cell_text}</td>")
            out.append("</tr>")
        out.append("</table>")

    return "\n".join(out)
863
+
864
def _extract_options(self, soup) -> str:
    """Render an options chain page as simplified HTML.

    Emits the page title, the first text node containing a ``20YY-MM-DD``
    date as the expiration, and the first two tables labelled "Calls" and
    "Puts" (header row plus rows 1-14 of each).

    Args:
        soup: Parsed options page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = [self._get_title(soup)]

    # Expiration date
    expiry = next(
        (s for s in soup.stripped_strings if re.search(r'20\d{2}-\d{2}-\d{2}', s)),
        None,
    )
    if expiry is not None:
        out.append(f"<p class='expiry'><b>Expiration:</b> {expiry}</p>")

    # Options tables (Calls and Puts)
    for position, chain in enumerate(soup.find_all('table')[:2]):
        side = "Calls" if position == 0 else "Puts"
        out.append(f"<h3>{side}</h3>")
        out.append("<table class='options'>")

        # Header: a fixed "Strike" column followed by the non-empty headers.
        heading_row = chain.find('tr')
        if heading_row:
            out.append("<tr>")
            out.append("<th>Strike</th>")
            captions = (th.get_text(strip=True) for th in heading_row.find_all('th'))
            out.extend(f"<th>{caption}</th>" for caption in captions if caption)
            out.append("</tr>")

        # Data rows
        for row in chain.find_all('tr')[1:15]:
            out.append("<tr>")
            out.extend(f"<td>{cell.get_text(strip=True)}</td>" for cell in row.find_all('td'))
            out.append("</tr>")

        out.append("</table>")

    return "\n".join(out)
905
+
906
def _extract_markets(self, soup) -> str:
    """Render the markets overview page as simplified HTML.

    Emits a heading, the market indices table, one table per movers card
    and the trending stocks table. When none of those sections produced
    output, falls back to dumping the first three tables on the page.

    Args:
        soup: Parsed markets page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = ["<h1>Markets Overview</h1>"]

    def text_or(node, fallback):
        # Stripped text of an optional element, or the given placeholder.
        return node.get_text(strip=True) if node else fallback

    def dump_table(table):
        # Copy up to 15 rows, rendering every cell as <td>.
        out.append("<table>")
        for row in table.find_all('tr')[:15]:
            out.append("<tr>")
            out.extend(
                f"<td>{cell.get_text(strip=True)}</td>"
                for cell in row.find_all(['td', 'th'])
            )
            out.append("</tr>")
        out.append("</table>")

    # Market indices (index-card elements)
    indices = soup.find(attrs={"data-section": "market-indices"})
    if indices:
        out.append("<h3>Market Indices</h3>")
        out.append("<table>")
        out.append("<tr><td>Index</td><td>Value</td><td>Change</td></tr>")
        for card in indices.find_all(class_="index-card"):
            label = card.find(class_="index-name")
            level = card.find(class_="index-value")
            move = card.find(class_="index-change")
            if label:
                out.append(f"<tr><td>{label.get_text(strip=True)}</td>"
                           f"<td>{text_or(level, '--')}</td>"
                           f"<td>{text_or(move, '--')}</td></tr>")
        out.append("</table>")

    # Movers cards (gainers, losers, active): each has a movers-table
    for card in soup.find_all(class_="movers-card"):
        caption = card.find(class_="movers-header")
        if caption:
            out.append(f"<h3>{caption.get_text(strip=True)}</h3>")
        movers = card.find('table')
        if movers:
            dump_table(movers)

    # Trending stocks
    trending = soup.find(attrs={"data-section": "trending-stocks"})
    if trending:
        caption = trending.find(class_="trending-header")
        if caption:
            out.append(f"<h3>{caption.get_text(strip=True)}</h3>")
        out.append("<table>")
        out.append("<tr><td>#</td><td>Symbol</td><td>Name</td><td>Price</td><td>Change</td></tr>")
        for entry in trending.find_all(class_="trending-item"):
            rank = entry.find(class_="trending-rank")
            ticker = entry.find(class_="trending-symbol")
            company = entry.find(class_="trending-name")
            quote = entry.find(class_="trending-price-value")
            move = entry.find(class_="trending-price-change")
            if ticker:
                out.append(f"<tr><td>{text_or(rank, '')}</td>"
                           f"<td>{ticker.get_text(strip=True)}</td>"
                           f"<td>{text_or(company, '')}</td>"
                           f"<td>{text_or(quote, '')}</td>"
                           f"<td>{text_or(move, '')}</td></tr>")
        out.append("</table>")

    # Fallback: only the heading was produced, so dump whatever tables exist.
    if len(out) <= 1:
        for leftover in soup.find_all('table')[:3]:
            dump_table(leftover)

    return "\n".join(out)
977
+
978
def _extract_stock_list(self, soup) -> str:
    """Render the full stock list page (/markets/stocks/) as simplified HTML.

    Emits a heading taken from the ``yf-card-title`` element (or
    "All Stocks" when absent) followed by every row of the stocks table.

    Args:
        soup: Parsed stock list page.

    Returns:
        The generated fragments joined by newlines.
    """
    # Title with sort/sector info
    card_title = soup.find(class_="yf-card-title")
    heading = card_title.get_text(strip=True) if card_title else "All Stocks"
    out = [f"<h1>{heading}</h1>"]

    # Main table: prefer the dedicated class, otherwise the first table.
    listing = soup.find('table', class_='yf-stocks-table') or soup.find('table')
    if listing:
        out.append("<table>")
        for row in listing.find_all('tr'):
            out.append("<tr>")
            out.extend(
                f"<td>{cell.get_text(strip=True)}</td>"
                for cell in row.find_all(['td', 'th'])
            )
            out.append("</tr>")
        out.append("</table>")

    return "\n".join(out)
1001
+
1002
def _extract_search(self, soup) -> str:
    """Render a search results page as simplified HTML.

    Sections are recognised by the emoji in their ``section-title``:
    stock results, stocks mentioned in news, and news results. Symbols and
    article ids are de-duplicated across sections. When nothing was
    recorded, plain ``/quote/`` links are listed instead, and failing that
    a "No results found" paragraph is emitted.

    Args:
        soup: Parsed search page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = ["<h1>Search Results</h1>"]
    seen_stocks = set()
    seen_news = set()

    def has_marker(section, emoji):
        # True when the section's title element contains the given emoji.
        heading = section.find(class_='section-title')
        return bool(heading and emoji in heading.get_text())

    def accept_symbol(symbol):
        # Record a ticker once; symbols longer than 5 characters are rejected.
        key = symbol.upper()
        if key in seen_stocks or len(symbol) > 5:
            return False
        seen_stocks.add(key)
        return True

    # Stock results with price information (first results section only)
    first_section = soup.find('div', class_='search-results')
    if first_section and has_marker(first_section, '📈'):
        out += ["<div class='stocks'>", "<h2>Stock Results</h2>", "<ul>"]
        for item in first_section.find_all(class_=re.compile('result-item')):
            symbol_el = item.find(class_=re.compile('symbol'))
            name_el = item.find(class_=re.compile('name'))
            price_el = item.find(class_=re.compile('price'))
            if not symbol_el:
                continue
            symbol = symbol_el.get_text(strip=True)
            name = name_el.get_text(strip=True) if name_el else ""
            price = price_el.get_text(strip=True) if price_el else ""
            if accept_symbol(symbol):
                price_info = f" | Price: {price}" if price and price != "--" else ""
                out.append(f"<li><b>{symbol}</b> - {name}{price_info}</li>")
        out += ["</ul>", "</div>"]

    all_sections = soup.find_all('div', class_='search-results')

    # Stocks mentioned in news
    for section in all_sections:
        if not has_marker(section, '📊'):
            continue
        out += ["<div class='news-stocks'>", "<h2>Stocks Mentioned in News</h2>", "<ul>"]
        for item in section.find_all(class_=re.compile('result-item')):
            symbol_el = item.find(class_=re.compile('symbol'))
            name_el = item.find(class_=re.compile('name'))
            price_div = item.find('div', class_=re.compile('price'))
            if not symbol_el:
                continue
            symbol = symbol_el.get_text(strip=True)
            name = name_el.get_text(strip=True) if name_el else ""
            if not accept_symbol(symbol):
                continue

            # Extract price and change
            price_info = ""
            if price_div:
                price_text = price_div.get_text(strip=True, separator=' ')
                if price_text:
                    price_info = f" | {price_text}"
            out.append(f"<li><b>{symbol}</b> - {name}{price_info}</li>")
        out += ["</ul>", "</div>"]

    # News results with summaries
    for section in all_sections:
        if not has_marker(section, '📰'):
            continue
        out += ["<div class='news'>", "<h2>News Results</h2>", "<ul>"]
        for news_item in section.find_all(class_='news-result'):
            headline_el = news_item.find(class_='news-headline')
            if not headline_el:
                continue
            headline = headline_el.get_text(strip=True)

            # Extract article ID from href
            id_match = re.search(r'/article/(\d+)', headline_el.get('href', ''))
            article_id = id_match.group(1) if id_match else ""

            if not headline or article_id in seen_news:
                continue
            seen_news.add(article_id)
            # A headline without an article id is recorded but never rendered.
            if not article_id:
                continue

            # Extract metadata: symbol badge, source, first date-like span
            meta_bits = []
            meta_div = news_item.find(class_='news-meta')
            if meta_div:
                badge = meta_div.find(class_='news-symbol-badge')
                if badge:
                    meta_bits.append(badge.get_text(strip=True))
                source = meta_div.find(class_='news-source')
                if source:
                    meta_bits.append(source.get_text(strip=True))
                for span in meta_div.find_all('span'):
                    span_text = span.get_text(strip=True)
                    if re.match(r'\d{4}-\d{2}-\d{2}', span_text):
                        meta_bits.append(span_text)
                        break

            # Extract summary
            summary = ""
            summary_el = news_item.find(class_='news-summary')
            if summary_el:
                summary = summary_el.get_text(strip=True)[:150]
                if summary:
                    summary = f"<br/> Summary: {summary}..."

            meta_line = " | ".join(meta_bits) if meta_bits else ""
            out.append(f"<li>[{article_id}] {headline}")
            if meta_line:
                out.append(f"<br/> {meta_line}")
            if summary:
                out.append(summary)
            out.append("</li>")
        out += ["</ul>", "</div>"]

    # Fallback: simple stock links if nothing found
    if not seen_stocks and not seen_news:
        out.append("<ul>")
        for link in soup.find_all('a', href=re.compile('/quote/[A-Z]+')):
            label = link.get_text(strip=True)
            if label and 3 <= len(label) <= 50 and label not in seen_stocks:
                seen_stocks.add(label)
                out.append(f"<li>{label}</li>")
        out.append("</ul>")

    if not seen_stocks and not seen_news:
        out.append("<p>No results found</p>")

    return "\n".join(out)
1150
+
1151
def _extract_generic(self, soup) -> str:
    """Render an unrecognised page as its title plus a text excerpt.

    The excerpt is the first 2000 characters of text from ``<main>``, else
    the ``main-content`` element, else ``<body>``.

    Args:
        soup: Parsed page.

    Returns:
        The generated fragments joined by newlines.
    """
    out = [self._get_title(soup)]

    # Get main content area, trying the most specific container first.
    region = soup.find('main') or soup.find(class_='main-content') or soup.find('body')
    if region:
        excerpt = region.get_text(strip=True)[:2000]
        out.append(f"<div class='content'>{excerpt}</div>")

    return "\n".join(out)