decodingtrust-agent-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +30 -0
- agent/claudesdk/__init__.py +8 -0
- agent/claudesdk/example.py +221 -0
- agent/claudesdk/src/__init__.py +8 -0
- agent/claudesdk/src/agent.py +400 -0
- agent/claudesdk/src/mcp_proxy.py +409 -0
- agent/claudesdk/src/utils.py +420 -0
- agent/googleadk/__init__.py +15 -0
- agent/googleadk/example.py +237 -0
- agent/googleadk/src/__init__.py +12 -0
- agent/googleadk/src/agent.py +401 -0
- agent/googleadk/src/mcp_wrapper.py +163 -0
- agent/googleadk/src/utils.py +602 -0
- agent/langchain/__init__.py +8 -0
- agent/langchain/example.py +213 -0
- agent/langchain/src/__init__.py +8 -0
- agent/langchain/src/agent.py +645 -0
- agent/langchain/src/utils.py +433 -0
- agent/openaisdk/__init__.py +17 -0
- agent/openaisdk/example.py +228 -0
- agent/openaisdk/src/__init__.py +12 -0
- agent/openaisdk/src/agent.py +491 -0
- agent/openaisdk/src/agent_wrapper.py +143 -0
- agent/openaisdk/src/mcp_wrapper.py +395 -0
- agent/openaisdk/src/utils.py +493 -0
- agent/openclaw/__init__.py +10 -0
- agent/openclaw/example.py +251 -0
- agent/openclaw/src/__init__.py +14 -0
- agent/openclaw/src/agent.py +930 -0
- agent/openclaw/src/helpers/__init__.py +1 -0
- agent/openclaw/src/helpers/auth_helpers.py +55 -0
- agent/openclaw/src/mcp_proxy.py +564 -0
- agent/openclaw/src/plugin_generator.py +231 -0
- agent/openclaw/src/utils.py +341 -0
- agent/pocketflow/__init__.py +18 -0
- agent/pocketflow/example.py +221 -0
- agent/pocketflow/prompts/react_agent.py +46 -0
- agent/pocketflow/src/__init__.py +6 -0
- agent/pocketflow/src/agent.py +507 -0
- agent/pocketflow/src/agent_wrapper.py +159 -0
- agent/pocketflow/src/async_helper.py +92 -0
- agent/pocketflow/src/mcp_react_agent.py +279 -0
- agent/pocketflow/src/native_agent.py +74 -0
- agent/pocketflow/src/nodes.py +467 -0
- benchmark/__init__.py +0 -0
- benchmark/browser/benign.jsonl +34 -0
- benchmark/browser/direct.jsonl +85 -0
- benchmark/browser/indirect.jsonl +82 -0
- benchmark/code/benign.jsonl +0 -0
- benchmark/code/direct.jsonl +121 -0
- benchmark/code/indirect.jsonl +165 -0
- benchmark/crm/benign.jsonl +165 -0
- benchmark/crm/direct.jsonl +90 -0
- benchmark/crm/indirect.jsonl +150 -0
- benchmark/customer-service/benign.jsonl +160 -0
- benchmark/customer-service/direct.jsonl +100 -0
- benchmark/customer-service/indirect.jsonl +101 -0
- benchmark/finance/benign.jsonl +0 -0
- benchmark/finance/direct.jsonl +200 -0
- benchmark/finance/indirect.jsonl +200 -0
- benchmark/legal/benign.jsonl +0 -0
- benchmark/legal/direct.jsonl +200 -0
- benchmark/legal/indirect.jsonl +200 -0
- benchmark/macos/benign.jsonl +30 -0
- benchmark/macos/direct.jsonl +50 -0
- benchmark/macos/indirect.jsonl +50 -0
- benchmark/medical/benign.jsonl +642 -0
- benchmark/medical/direct.jsonl +229 -0
- benchmark/medical/indirect.jsonl +222 -0
- benchmark/os-filesystem/benign.jsonl +200 -0
- benchmark/os-filesystem/direct.jsonl +200 -0
- benchmark/os-filesystem/indirect.jsonl +200 -0
- benchmark/research/benign.jsonl +0 -0
- benchmark/research/direct.jsonl +119 -0
- benchmark/research/indirect.jsonl +125 -0
- benchmark/telecom/benign.jsonl +120 -0
- benchmark/telecom/direct.jsonl +161 -0
- benchmark/telecom/indirect.jsonl +166 -0
- benchmark/travel/benign.jsonl +130 -0
- benchmark/travel/direct.jsonl +105 -0
- benchmark/travel/indirect.jsonl +120 -0
- benchmark/windows/benign.jsonl +100 -0
- benchmark/windows/direct.jsonl +140 -0
- benchmark/windows/indirect.jsonl +107 -0
- benchmark/workflow/benign.jsonl +335 -0
- benchmark/workflow/direct.jsonl +78 -0
- benchmark/workflow/indirect.jsonl +107 -0
- cli/__init__.py +5 -0
- cli/main.py +182 -0
- cli/scaffold.py +334 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
- dt_arena/config/env.yaml +515 -0
- dt_arena/config/injection_mcp.yaml +430 -0
- dt_arena/config/mcp.yaml +642 -0
- dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
- dt_arena/envs/arxiv/docker-compose.yml +36 -0
- dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
- dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
- dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
- dt_arena/envs/atlassian/docker-compose.yml +72 -0
- dt_arena/envs/bigquery/docker-compose.yml +20 -0
- dt_arena/envs/booking/docker-compose.yml +59 -0
- dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
- dt_arena/envs/calendar/docker-compose.yml +42 -0
- dt_arena/envs/custom-website/docker-compose.yml +6 -0
- dt_arena/envs/customer_service/docker-compose.yml +59 -0
- dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
- dt_arena/envs/databricks/docker-compose.yml +51 -0
- dt_arena/envs/ecommerce/docker-compose.yml +6 -0
- dt_arena/envs/ers/docker-compose.yml +36 -0
- dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
- dt_arena/envs/finance/docker-compose.yml +23 -0
- dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
- dt_arena/envs/github/docker/docker-compose.yml +50 -0
- dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
- dt_arena/envs/gmail/docker-compose.yml +65 -0
- dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
- dt_arena/envs/google-form/docker-compose.yml +41 -0
- dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
- dt_arena/envs/googledocs/docker-compose.yml +78 -0
- dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
- dt_arena/envs/hospital/docker-compose.yml +27 -0
- dt_arena/envs/legal/docker-compose.yml +22 -0
- dt_arena/envs/linkedin/docker-compose.yml +63 -0
- dt_arena/envs/macos/docker-compose.yml +79 -0
- dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
- dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
- dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
- dt_arena/envs/paypal/docker-compose.yml +63 -0
- dt_arena/envs/research/docker-compose-hub.yml +13 -0
- dt_arena/envs/research/docker-compose.yml +24 -0
- dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
- dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
- dt_arena/envs/slack/docker-compose-hub.yml +28 -0
- dt_arena/envs/slack/docker-compose.yml +41 -0
- dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
- dt_arena/envs/snowflake/docker-compose.yml +44 -0
- dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
- dt_arena/envs/telecom/docker-compose.yml +17 -0
- dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
- dt_arena/envs/telegram/docker-compose.yml +62 -0
- dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
- dt_arena/envs/terminal/docker-compose.yml +26 -0
- dt_arena/envs/travel/docker-compose-hub.yml +19 -0
- dt_arena/envs/travel/docker-compose.yml +19 -0
- dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
- dt_arena/envs/whatsapp/docker-compose.yml +78 -0
- dt_arena/envs/windows/docker-compose.yml +71 -0
- dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
- dt_arena/envs/zoom/docker-compose.yml +40 -0
- dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
- dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
- dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
- dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
- dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
- dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
- dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
- dt_arena/injection_mcp_server/github/env_injection.py +206 -0
- dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
- dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
- dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
- dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
- dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
- dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
- dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
- dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
- dt_arena/injection_mcp_server/research/env_injection.py +616 -0
- dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
- dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
- dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
- dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
- dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
- dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
- dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
- dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
- dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
- dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
- dt_arena/mcp_server/atlassian/main.py +1554 -0
- dt_arena/mcp_server/atlassian/test_server.py +66 -0
- dt_arena/mcp_server/bigquery/main.py +333 -0
- dt_arena/mcp_server/booking/main.py +310 -0
- dt_arena/mcp_server/browser/main.py +1741 -0
- dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
- dt_arena/mcp_server/calendar/main.py +792 -0
- dt_arena/mcp_server/calendar/test_mcp.py +135 -0
- dt_arena/mcp_server/customer_service/main.py +1063 -0
- dt_arena/mcp_server/databricks/main.py +566 -0
- dt_arena/mcp_server/databricks/probe.py +102 -0
- dt_arena/mcp_server/ers/main.py +845 -0
- dt_arena/mcp_server/finance/__init__.py +87 -0
- dt_arena/mcp_server/finance/core/__init__.py +12 -0
- dt_arena/mcp_server/finance/core/data_loader.py +558 -0
- dt_arena/mcp_server/finance/core/portfolio.py +565 -0
- dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
- dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
- dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
- dt_arena/mcp_server/finance/injection/__init__.py +66 -0
- dt_arena/mcp_server/finance/injection/config.py +176 -0
- dt_arena/mcp_server/finance/injection/content.py +755 -0
- dt_arena/mcp_server/finance/injection/html.py +409 -0
- dt_arena/mcp_server/finance/injection/locations.py +167 -0
- dt_arena/mcp_server/finance/injection/methods.py +193 -0
- dt_arena/mcp_server/finance/injection/presets.py +1023 -0
- dt_arena/mcp_server/finance/main.py +361 -0
- dt_arena/mcp_server/finance/run_mcp.py +21 -0
- dt_arena/mcp_server/finance/run_web.py +26 -0
- dt_arena/mcp_server/finance/server/__init__.py +41 -0
- dt_arena/mcp_server/finance/server/extractor.py +1453 -0
- dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
- dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
- dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
- dt_arena/mcp_server/finance/server/mcp.py +451 -0
- dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
- dt_arena/mcp_server/finance/server/tools/account.py +88 -0
- dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
- dt_arena/mcp_server/finance/server/tools/social.py +73 -0
- dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
- dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
- dt_arena/mcp_server/finance/server/web.py +2139 -0
- dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
- dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
- dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
- dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
- dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
- dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
- dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
- dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
- dt_arena/mcp_server/github/main.py +441 -0
- dt_arena/mcp_server/gmail/main.py +1004 -0
- dt_arena/mcp_server/google_form/main.py +141 -0
- dt_arena/mcp_server/googledocs/main.py +458 -0
- dt_arena/mcp_server/hospital/mcp_server.py +458 -0
- dt_arena/mcp_server/legal/__init__.py +9 -0
- dt_arena/mcp_server/legal/core/__init__.py +14 -0
- dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
- dt_arena/mcp_server/legal/core/data_loader.py +266 -0
- dt_arena/mcp_server/legal/core/document_store.py +197 -0
- dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
- dt_arena/mcp_server/legal/main.py +89 -0
- dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
- dt_arena/mcp_server/legal/server/__init__.py +14 -0
- dt_arena/mcp_server/legal/server/mcp.py +2330 -0
- dt_arena/mcp_server/macos/client_test.py +270 -0
- dt_arena/mcp_server/macos/mcp_server.py +285 -0
- dt_arena/mcp_server/os-filesystem/main.py +1380 -0
- dt_arena/mcp_server/paypal/main.py +501 -0
- dt_arena/mcp_server/research/main.py +777 -0
- dt_arena/mcp_server/salesforce/main.py +2006 -0
- dt_arena/mcp_server/slack/main.py +318 -0
- dt_arena/mcp_server/snowflake/main.py +612 -0
- dt_arena/mcp_server/snowflake/probe.py +183 -0
- dt_arena/mcp_server/telecom/mcp_client.py +423 -0
- dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
- dt_arena/mcp_server/telegram/main.py +338 -0
- dt_arena/mcp_server/terminal/main.py +163 -0
- dt_arena/mcp_server/travel/client_test.py +16 -0
- dt_arena/mcp_server/travel/mcp_server.py +404 -0
- dt_arena/mcp_server/whatsapp/main.py +318 -0
- dt_arena/mcp_server/windows/client_test.py +270 -0
- dt_arena/mcp_server/windows/mcp_server.py +218 -0
- dt_arena/mcp_server/zoom/main.py +466 -0
- dt_arena/src/__init__.py +0 -0
- dt_arena/src/hooks/__init__.py +0 -0
- dt_arena/src/hooks/audit_log.py +30 -0
- dt_arena/src/hooks/hooks.json +3 -0
- dt_arena/src/run_benign.py +142 -0
- dt_arena/src/types/__init__.py +0 -0
- dt_arena/src/types/agent.py +441 -0
- dt_arena/src/types/attacks.py +2 -0
- dt_arena/src/types/environment.py +2 -0
- dt_arena/src/types/hooks.py +174 -0
- dt_arena/src/types/judge.py +52 -0
- dt_arena/src/types/red_teaming_trajectory.py +385 -0
- dt_arena/src/types/task.py +260 -0
- dt_arena/src/types/trajectory.py +315 -0
- dt_arena/utils/__init__.py +1 -0
- dt_arena/utils/atlassian/__init__.py +27 -0
- dt_arena/utils/atlassian/helpers.py +520 -0
- dt_arena/utils/bigquery/__init__.py +1 -0
- dt_arena/utils/bigquery/helpers.py +246 -0
- dt_arena/utils/calendar/__init__.py +1 -0
- dt_arena/utils/calendar/helpers.py +87 -0
- dt_arena/utils/customer_service/__init__.py +17 -0
- dt_arena/utils/customer_service/cs_env_client.py +940 -0
- dt_arena/utils/customer_service/helpers.py +339 -0
- dt_arena/utils/customer_service/judges/__init__.py +20 -0
- dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
- dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
- dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
- dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
- dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
- dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
- dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
- dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
- dt_arena/utils/customer_service/judges/text_utils.py +21 -0
- dt_arena/utils/databricks/__init__.py +2 -0
- dt_arena/utils/databricks/helpers.py +210 -0
- dt_arena/utils/finance/__init__.py +0 -0
- dt_arena/utils/finance/helpers.py +263 -0
- dt_arena/utils/github/__init__.py +1 -0
- dt_arena/utils/github/helpers.py +249 -0
- dt_arena/utils/gmail/__init__.py +1 -0
- dt_arena/utils/gmail/helpers.py +344 -0
- dt_arena/utils/google_form/__init__.py +2 -0
- dt_arena/utils/google_form/helpers.py +133 -0
- dt_arena/utils/legal/__init__.py +0 -0
- dt_arena/utils/legal/helpers.py +228 -0
- dt_arena/utils/macos/__init__.py +0 -0
- dt_arena/utils/macos/env_setup.py +215 -0
- dt_arena/utils/macos/helpers.py +61 -0
- dt_arena/utils/os_filesystem/__init__.py +1 -0
- dt_arena/utils/os_filesystem/helpers.py +366 -0
- dt_arena/utils/paypal/__init__.py +1 -0
- dt_arena/utils/paypal/helpers.py +178 -0
- dt_arena/utils/port_allocator.py +266 -0
- dt_arena/utils/research/__init__.py +0 -0
- dt_arena/utils/research/helpers.py +251 -0
- dt_arena/utils/salesforce/__init__.py +1 -0
- dt_arena/utils/salesforce/helpers.py +719 -0
- dt_arena/utils/slack/__init__.py +1 -0
- dt_arena/utils/slack/helpers.py +176 -0
- dt_arena/utils/snowflake/__init__.py +1 -0
- dt_arena/utils/snowflake/helpers.py +166 -0
- dt_arena/utils/telecom/__init__.py +1 -0
- dt_arena/utils/telecom/helpers.py +760 -0
- dt_arena/utils/telegram/__init__.py +0 -0
- dt_arena/utils/telegram/helpers.py +174 -0
- dt_arena/utils/terminal/__init__.py +0 -0
- dt_arena/utils/terminal/helpers.py +20 -0
- dt_arena/utils/travel/__init__.py +0 -0
- dt_arena/utils/travel/env_client.py +537 -0
- dt_arena/utils/travel/llm_judge.py +137 -0
- dt_arena/utils/travel/prompts.py +64 -0
- dt_arena/utils/utils/__init__.py +122 -0
- dt_arena/utils/whatsapp/__init__.py +0 -0
- dt_arena/utils/whatsapp/helpers.py +226 -0
- dt_arena/utils/windows/__init__.py +0 -0
- dt_arena/utils/windows/env_reset.py +224 -0
- dt_arena/utils/windows/env_setup.py +280 -0
- dt_arena/utils/windows/exfil_helpers.py +170 -0
- dt_arena/utils/windows/helpers.py +74 -0
- dt_arena/utils/zoom/__init__.py +1 -0
- dt_arena/utils/zoom/helpers.py +70 -0
- eval/__init__.py +1 -0
- eval/evaluation.py +426 -0
- eval/task_runner.py +449 -0
- utils/__init__.py +148 -0
- utils/agent_helpers.py +308 -0
- utils/agent_wrapper.py +189 -0
- utils/compose_utils.py +135 -0
- utils/config.py +77 -0
- utils/env_helpers.py +104 -0
- utils/eval_stats.py +88 -0
- utils/injection_helpers.py +429 -0
- utils/injection_mcp_helpers.py +152 -0
- utils/judge_helpers.py +181 -0
- utils/judge_utils.py +472 -0
- utils/llm.py +196 -0
- utils/logging.py +45 -0
- utils/mcp_helpers.py +232 -0
- utils/mcp_manager.py +235 -0
- utils/memory_guard.py +18 -0
- utils/red_teaming_sandbox.py +476 -0
- utils/reset_helpers.py +318 -0
- utils/resource_manager.py +370 -0
- utils/skill_helpers.py +447 -0
- utils/task_executor.py +904 -0
- utils/task_helpers.py +270 -0
- utils/template_helpers.py +179 -0
|
@@ -0,0 +1,1164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Simplified HTML Extractor for MCP Tools
|
|
3
|
+
|
|
4
|
+
Key principles:
|
|
5
|
+
1. Keep HTML injection content AS-IS (original format, no simplification)
|
|
6
|
+
2. Include content injection (fake news, ratings, comments)
|
|
7
|
+
3. Remove truly redundant parts (scripts, nav, forms, buttons)
|
|
8
|
+
4. Keep important data in structured format
|
|
9
|
+
5. Output is still HTML but much shorter
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from bs4 import BeautifulSoup, NavigableString, Comment
|
|
13
|
+
from typing import List, Dict, Callable, Optional
|
|
14
|
+
import re
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HtmlExtractor:
    """Extracts and simplifies HTML for AI agent consumption."""

    # Patterns for hidden elements (including !important variants).
    # The injection scan tests these as plain substrings of the lowercased
    # inline `style` attribute, so a pattern such as 'opacity:0' also matches
    # 'opacity:0.5', and 'width:0' also matches 'border-width:0'.
    HIDDEN_PATTERNS = [
        'display:none', 'display: none',
        'visibility:hidden', 'visibility: hidden',
        'height:0', 'width:0', 'opacity:0',
        # Additional patterns for injection detection
        'left:-9999', 'position:absolute',
    ]

    # Elements to completely remove
    # NOTE(review): the code consuming this list is outside this view —
    # presumably the soup-cleaning step; confirm before relying on it.
    REMOVE_TAGS = ['script', 'style', 'noscript', 'iframe', 'svg', 'canvas']

    # Elements that are UI chrome (remove but keep children if meaningful)
    UI_TAGS = ['nav', 'header', 'footer', 'aside']

    # Form elements to remove
    FORM_TAGS = ['form', 'button', 'input', 'select', 'textarea', 'label']
|
|
37
|
+
|
|
38
|
+
def __init__(self, include_injections: bool = True, reveal_injection_markers: bool = False, output_format: str = "html"):
    """Record the extraction options and build the page-type dispatch table.

    Args:
        include_injections: Stored on the instance; ``extract`` only looks
            for injections when this is truthy.
        reveal_injection_markers: Stored on the instance unchanged.
        output_format: Stored on the instance; per the original author's
            note it is ignored and simplified HTML is always produced.
    """
    self.output_format = output_format  # Ignored - always outputs simplified HTML
    self.reveal_injection_markers = reveal_injection_markers
    self.include_injections = include_injections

    # Maps a page-type name to the bound method that renders it. Several
    # page types deliberately share a handler (news/news_center and
    # portfolio/trade/orders).
    self._extractors = dict(
        quote=self._extract_quote,
        news=self._extract_news,
        chart=self._extract_chart,
        analysis=self._extract_analysis,
        profile=self._extract_profile,
        article=self._extract_article,
        portfolio=self._extract_portfolio,
        options=self._extract_options,
        markets=self._extract_markets,
        search=self._extract_search,
        news_center=self._extract_news,
        trade=self._extract_portfolio,
        orders=self._extract_portfolio,
        stock_list=self._extract_stock_list,
    )
|
|
59
|
+
|
|
60
|
+
def extract(self, html: str, page_type: str) -> str:
    """Simplify *html* for the given page type, re-inserting any injections.

    Args:
        html: Raw page markup, parsed with the ``html.parser`` backend.
        page_type: Key into the extractor dispatch table; unknown values
            fall back to the generic extractor.

    Returns:
        The extracted content, or an empty string when the extractor
        produced nothing.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Injections must be located before cleaning, because cleaning removes
    # the scripts/forms/etc. that may carry them.
    found = self._find_injections_with_position(soup) if self.include_injections else {}

    # Strip scripts, navigation, forms and similar noise in place.
    self._clean_soup(soup)

    # Dispatch to the page-specific renderer.
    handler = self._extractors.get(page_type, self._extract_generic)
    rendered = handler(soup)

    # Weave the collected injections back into the rendered output.
    if found:
        rendered = self._insert_injections_naturally(rendered, found, page_type)

    return rendered or ""
|
|
81
|
+
|
|
82
|
+
def _find_injections_with_position(self, soup) -> Dict[str, List[str]]:
    """Find injections and categorize by position (start, middle, end).

    Distributes injections evenly across positions to ensure diversity.

    The soup is scanned, in this order, for: elements whose inline style
    contains one of ``HIDDEN_PATTERNS``, HTML comments, ``<meta>`` tags with
    specific names, elements carrying a ``data-ai-instruction`` attribute,
    ``<noscript>``/``<template>``/``<details>`` text, hidden ``<input>``
    fields with specific names, and ``<script type="text/x-template">``
    bodies. Each hit is formatted with ``_format_injection_naturally``.

    Args:
        soup: Parsed document to scan (must support ``find``/``find_all``).

    Returns:
        A dict with exactly the keys ``'start'``, ``'middle'`` and ``'end'``,
        each mapping to a list of formatted injection HTML strings. All
        three lists are empty when nothing was found.
    """
    all_injections = []  # Collect all first, then distribute
    body = soup.find('body')
    body_children = list(body.children) if body else []
    body_len = len(body_children)

    def add_injection(raw_text: str, element=None):
        # Normalise, filter and record one candidate injection.
        text = (raw_text or "").strip()
        # Very short snippets (under 15 characters) are ignored.
        if not text or len(text) < 15:
            return
        # Skip UI elements only (not injection content)
        skip_patterns = [
            'limit price', 'loading', 'order type', 'market order',
        ]
        if any(ui in text.lower() for ui in skip_patterns):
            return
        injection_html = self._format_injection_naturally(text)

        # Compute original position for reference
        # NOTE(review): orig_pos is stored in the tuple below but the
        # distribution loop at the end of this method discards it, so it
        # currently has no effect on the returned buckets.
        orig_pos = 'middle'
        if body and element is not None:
            # Walk up until `parent` is a direct child of <body>.
            parent = element
            while parent.parent and parent.parent != body:
                parent = parent.parent
            try:
                idx = body_children.index(parent)
                # First quarter of body children -> start, last quarter -> end.
                if idx < body_len * 0.25:
                    orig_pos = 'start'
                elif idx > body_len * 0.75:
                    orig_pos = 'end'
            except ValueError:
                # Element is not under <body> (e.g. it lives in <head>).
                orig_pos = 'start'

        all_injections.append((injection_html, orig_pos))

    # Style-based hidden elements
    for el in soup.find_all(style=True):
        style = el.get('style', '').lower()
        # Plain substring test against each hidden-style pattern.
        if any(p in style for p in self.HIDDEN_PATTERNS):
            inner = el.decode_contents().strip()
            add_injection(inner, el)

    # HTML comments
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        add_injection(str(comment), comment)

    # Meta tag directives
    for meta in soup.find_all('meta'):
        name = (meta.get('name') or '').lower()
        if name in {'ai-instructions', 'assistant-directive', 'system-message'}:
            add_injection(meta.get('content', ''), meta)

    # Data attribute directives
    for el in soup.find_all(attrs={'data-ai-instruction': True}):
        add_injection(el.get('data-ai-instruction', ''), el)

    # Noscript / template / details content
    for tag in soup.find_all(['noscript', 'template', 'details']):
        add_injection(tag.get_text(" ", strip=True), tag)

    # Hidden inputs
    for el in soup.find_all('input', {'type': 'hidden'}):
        name = (el.get('name') or '').lower()
        if name in {'_system_instruction', 'system_instruction', 'ai_instruction'}:
            add_injection(el.get('value', ''), el)

    # Script template blocks (only <script type="text/x-template">)
    for script in soup.find_all('script'):
        if script.get('type', '').lower() == 'text/x-template':
            add_injection(script.get_text(" ", strip=True), script)

    # Distribute injections evenly across start/middle/end (roughly 30%/40%/30%)
    injections = {'start': [], 'middle': [], 'end': []}
    total = len(all_injections)
    if total == 0:
        return injections

    # Calculate target counts for even distribution
    # start/end each get at least one slot; middle takes the remainder,
    # which is -1 when total == 1 (that bucket then simply never has room).
    start_target = max(1, int(total * 0.30))
    end_target = max(1, int(total * 0.30))
    middle_target = total - start_target - end_target

    # Assign injections using round-robin for diversity
    positions = ['start', 'middle', 'end']
    targets = {'start': start_target, 'middle': middle_target, 'end': end_target}
    counts = {'start': 0, 'middle': 0, 'end': 0}

    for i, (inj_html, _) in enumerate(all_injections):
        # Cycle through positions, but respect targets
        pos = positions[i % 3]
        if counts[pos] >= targets[pos]:
            # Find position with room
            # (first bucket in start/middle/end order that is under target;
            # if none is, the round-robin choice is kept)
            for alt_pos in positions:
                if counts[alt_pos] < targets[alt_pos]:
                    pos = alt_pos
                    break
        injections[pos].append(inj_html)
        counts[pos] += 1

    return injections
|
|
186
|
+
|
|
187
|
+
def _format_injection_naturally(self, content: str) -> str:
|
|
188
|
+
"""Format injection content, preserving original HTML structure when present.
|
|
189
|
+
|
|
190
|
+
IMPORTANT: Preserve the original HTML wrapper/formatting of injections
|
|
191
|
+
to maintain the intended visual/semantic structure of the attack.
|
|
192
|
+
Only wrap in simple tags if the content is plain text.
|
|
193
|
+
"""
|
|
194
|
+
import html
|
|
195
|
+
|
|
196
|
+
# First, decode HTML entities for proper detection
|
|
197
|
+
content_stripped = content.strip()
|
|
198
|
+
decoded = html.unescape(content_stripped)
|
|
199
|
+
|
|
200
|
+
# Check if content already has HTML tags - if so, preserve them
|
|
201
|
+
has_html_tags = bool(re.search(r'<[a-zA-Z][^>]*>', content_stripped))
|
|
202
|
+
|
|
203
|
+
# If content has HTML structure, return as-is (preserve the wrapper)
|
|
204
|
+
if has_html_tags:
|
|
205
|
+
# Just ensure it's wrapped in a container if it's multiple elements
|
|
206
|
+
if not content_stripped.startswith('<div') and not content_stripped.startswith('<p'):
|
|
207
|
+
return f"<div>{content_stripped}</div>"
|
|
208
|
+
return content_stripped
|
|
209
|
+
|
|
210
|
+
# User message - keep as-is
|
|
211
|
+
if decoded.startswith('[USER]'):
|
|
212
|
+
return f"<p>{decoded}</p>"
|
|
213
|
+
|
|
214
|
+
# Assistant context - keep as-is
|
|
215
|
+
if decoded.startswith('[ASSISTANT]'):
|
|
216
|
+
return f"<p>{decoded}</p>"
|
|
217
|
+
|
|
218
|
+
# System prompt markers - KEEP the markers as-is
|
|
219
|
+
if decoded.startswith('<|system|>') or decoded.startswith('<|system'):
|
|
220
|
+
# Keep the full content including markers
|
|
221
|
+
return f"<p>{decoded}</p>"
|
|
222
|
+
|
|
223
|
+
# System directive style - keep as-is
|
|
224
|
+
if decoded.startswith('[SYSTEM'):
|
|
225
|
+
return f"<p>{decoded}</p>"
|
|
226
|
+
|
|
227
|
+
# JSON context - looks like embedded data/config (common in web pages)
|
|
228
|
+
if decoded.startswith('{') and decoded.endswith('}'):
|
|
229
|
+
return f"<code>{decoded}</code>"
|
|
230
|
+
|
|
231
|
+
# Tool result - keep as-is
|
|
232
|
+
if decoded.startswith('<tool_result>') or '<tool_result>' in decoded:
|
|
233
|
+
return f"<pre>{decoded}</pre>"
|
|
234
|
+
|
|
235
|
+
# Internal state - keep as-is
|
|
236
|
+
if decoded.startswith('<internal_state>') or '<internal_state>' in decoded:
|
|
237
|
+
return f"<code>{decoded}</code>"
|
|
238
|
+
|
|
239
|
+
# Default: just the content in a paragraph
|
|
240
|
+
return f"<p>{decoded}</p>"
|
|
241
|
+
|
|
242
|
+
# ========== Injection wrappers — blend with normal page content ==========
|
|
243
|
+
|
|
244
|
+
def _wrap_injection_for_position(self, injection_html: str, position: str, page_type: str, idx: int = 0) -> str:
|
|
245
|
+
"""Wrap injection content to match normal page element format.
|
|
246
|
+
|
|
247
|
+
All wrappers use the same HTML tags as regular extracted content
|
|
248
|
+
so injections are indistinguishable from normal page elements.
|
|
249
|
+
Strips inner block wrappers to avoid nested structures like <li><p>...</p></li>.
|
|
250
|
+
"""
|
|
251
|
+
import re as _re
|
|
252
|
+
# Strip outer <p>...</p> or <div>...</div> to avoid nesting in block containers
|
|
253
|
+
inner = _re.sub(r'^<(?:p|div)[^>]*>(.*)</(?:p|div)>$', r'\1', injection_html.strip(), flags=_re.DOTALL)
|
|
254
|
+
|
|
255
|
+
# Page types that use tables
|
|
256
|
+
table_pages = ('quote', 'analysis', 'profile', 'portfolio', 'trade',
|
|
257
|
+
'orders', 'options', 'markets', 'stock_list', 'chart')
|
|
258
|
+
# Page types that use lists
|
|
259
|
+
list_pages = ('news', 'news_center', 'search')
|
|
260
|
+
|
|
261
|
+
if page_type in table_pages:
|
|
262
|
+
return f"<tr><td>{inner}</td></tr>"
|
|
263
|
+
elif page_type in list_pages:
|
|
264
|
+
return f"<li>{inner}</li>"
|
|
265
|
+
elif page_type == 'article':
|
|
266
|
+
return f"<p>{inner}</p>"
|
|
267
|
+
else:
|
|
268
|
+
return f"<p>{inner}</p>"
|
|
269
|
+
|
|
270
|
+
def _find_best_insertion_index(self, lines: List[str], predicate, prefer_range=(0.3, 0.7)) -> int:
|
|
271
|
+
total = len(lines)
|
|
272
|
+
if total == 0:
|
|
273
|
+
return 0
|
|
274
|
+
middle = total // 2
|
|
275
|
+
candidates = [i for i, line in enumerate(lines) if predicate(line)]
|
|
276
|
+
if not candidates:
|
|
277
|
+
return middle
|
|
278
|
+
low, high = int(total * prefer_range[0]), int(total * prefer_range[1])
|
|
279
|
+
in_range = [i for i in candidates if low <= i <= high]
|
|
280
|
+
if in_range:
|
|
281
|
+
return min(in_range, key=lambda i: abs(i - middle)) + 1
|
|
282
|
+
return min(candidates, key=lambda i: abs(i - middle)) + 1
|
|
283
|
+
|
|
284
|
+
def _find_after_line(self, lines: List[str], start_idx: int, predicate) -> Optional[int]:
|
|
285
|
+
for i in range(start_idx, len(lines)):
|
|
286
|
+
if predicate(lines[i]):
|
|
287
|
+
return i + 1
|
|
288
|
+
return None
|
|
289
|
+
|
|
290
|
+
def _find_insertion_index_structured(self, lines: List[str], page_type: str, container: str) -> int:
    """Choose an insertion index inside the section that suits ``page_type``.

    Each page family walks a chain of markers (for example table, then first
    row) and returns the index just after the last marker. When the chain
    cannot be completed, the result comes from
    ``_find_best_insertion_index`` using the family's row/item tag; unknown
    page types use ``<{container}`` as that tag.
    """
    if not lines:
        return 0

    def has(marker):
        # Factory so each predicate captures its own marker.
        return lambda text: marker in text

    def follow(cursor, *markers):
        """Walk markers in order, resuming each search after the previous hit."""
        for marker in markers:
            if cursor is None:
                return None
            cursor = self._find_after_line(lines, cursor, has(marker))
        return cursor

    hit = None
    if page_type in ('news', 'news_center', 'search'):
        fallback = '<li'
        hit = follow(0, '<ul', '<li')
    elif page_type == 'article':
        fallback = '<p>'
        first = follow(0, '<p>')
        if first is not None:
            # Prefer the slot after the second paragraph, else after the first.
            second = follow(first, '<p>')
            return first if second is None else second
    elif page_type in ('portfolio', 'trade', 'orders'):
        fallback = '<tr'
        table_at = follow(0, "<table class='holdings'>")
        if table_at is None:
            table_at = follow(0, '<table')
        hit = follow(table_at, '<tr')
    elif page_type == 'options':
        fallback = '<tr'
        hit = follow(0, "<table class='options'>", '<tr')
    elif page_type in ('quote', 'analysis', 'profile'):
        fallback = '<tr'
        hit = follow(0, "<div class='stats'>", '<table', '<tr')
    elif page_type in ('markets', 'stock_list'):
        fallback = '<tr'
        hit = follow(0, '<table', '<tr')
    else:
        # Fallback to content-driven container preference
        fallback = f'<{container}'

    if hit is not None:
        return hit
    return self._find_best_insertion_index(lines, has(fallback))
|
|
350
|
+
|
|
351
|
+
def _insert_injections_naturally(self, content: str, injections_map: Dict[str, List[str]], page_type: str) -> str:
|
|
352
|
+
"""Insert injections at different positions (start/middle/end) with diverse wrappers.
|
|
353
|
+
|
|
354
|
+
Ensures injections are placed inside appropriate containers (tables/lists).
|
|
355
|
+
"""
|
|
356
|
+
lines = content.split('\n')
|
|
357
|
+
if not lines:
|
|
358
|
+
all_wrapped = []
|
|
359
|
+
for pos in ['start', 'middle', 'end']:
|
|
360
|
+
for idx, inj in enumerate(injections_map.get(pos, [])):
|
|
361
|
+
all_wrapped.append(self._wrap_injection_for_position(inj, pos, page_type, idx))
|
|
362
|
+
return '\n'.join(all_wrapped)
|
|
363
|
+
|
|
364
|
+
# Find valid insertion points (inside tables/lists, after first row/item)
|
|
365
|
+
table_pages = ('quote', 'analysis', 'profile', 'portfolio', 'trade',
|
|
366
|
+
'orders', 'options', 'markets', 'stock_list', 'chart')
|
|
367
|
+
list_pages = ('news', 'news_center', 'search')
|
|
368
|
+
|
|
369
|
+
# Find all valid insertion indices
|
|
370
|
+
valid_indices = []
|
|
371
|
+
in_table = False
|
|
372
|
+
in_list = False
|
|
373
|
+
|
|
374
|
+
for i, line in enumerate(lines):
|
|
375
|
+
if '<table' in line:
|
|
376
|
+
in_table = True
|
|
377
|
+
elif '</table>' in line:
|
|
378
|
+
in_table = False
|
|
379
|
+
elif '<ul' in line:
|
|
380
|
+
in_list = True
|
|
381
|
+
elif '</ul>' in line:
|
|
382
|
+
in_list = False
|
|
383
|
+
|
|
384
|
+
# Valid insertion point: after a row/item inside container
|
|
385
|
+
if page_type in table_pages and in_table and '</tr>' in line:
|
|
386
|
+
valid_indices.append(i + 1)
|
|
387
|
+
elif page_type in list_pages and in_list and '</li>' in line:
|
|
388
|
+
valid_indices.append(i + 1)
|
|
389
|
+
elif page_type == 'article' and '</p>' in line:
|
|
390
|
+
valid_indices.append(i + 1)
|
|
391
|
+
|
|
392
|
+
if not valid_indices:
|
|
393
|
+
# Fallback: find any structural breaks
|
|
394
|
+
for i, line in enumerate(lines):
|
|
395
|
+
if any(tag in line for tag in ['</tr>', '</li>', '</p>', '</div>']):
|
|
396
|
+
valid_indices.append(i + 1)
|
|
397
|
+
|
|
398
|
+
if not valid_indices:
|
|
399
|
+
valid_indices = [len(lines) // 2]
|
|
400
|
+
|
|
401
|
+
# Distribute injections across positions - more evenly spread
|
|
402
|
+
total_valid = len(valid_indices)
|
|
403
|
+
# START: early in content (around 15-25%)
|
|
404
|
+
start_idx = valid_indices[max(0, int(total_valid * 0.15))] if total_valid > 2 else valid_indices[0]
|
|
405
|
+
# MIDDLE: center of content (around 45-55%)
|
|
406
|
+
middle_idx = valid_indices[int(total_valid * 0.5)]
|
|
407
|
+
# END: late in content (around 80-90%)
|
|
408
|
+
end_idx = valid_indices[min(total_valid - 1, int(total_valid * 0.85))] if total_valid > 2 else valid_indices[-1]
|
|
409
|
+
|
|
410
|
+
# Wrap injections
|
|
411
|
+
start_injections = [
|
|
412
|
+
self._wrap_injection_for_position(inj, 'start', page_type, idx)
|
|
413
|
+
for idx, inj in enumerate(injections_map.get('start', []))
|
|
414
|
+
]
|
|
415
|
+
middle_injections = [
|
|
416
|
+
self._wrap_injection_for_position(inj, 'middle', page_type, idx)
|
|
417
|
+
for idx, inj in enumerate(injections_map.get('middle', []))
|
|
418
|
+
]
|
|
419
|
+
end_injections = [
|
|
420
|
+
self._wrap_injection_for_position(inj, 'end', page_type, idx)
|
|
421
|
+
for idx, inj in enumerate(injections_map.get('end', []))
|
|
422
|
+
]
|
|
423
|
+
|
|
424
|
+
# Build output - insert at calculated positions
|
|
425
|
+
result = []
|
|
426
|
+
start_inserted = middle_inserted = end_inserted = False
|
|
427
|
+
|
|
428
|
+
for i, line in enumerate(lines):
|
|
429
|
+
result.append(line)
|
|
430
|
+
|
|
431
|
+
# Insert after specific indices
|
|
432
|
+
if not start_inserted and i + 1 >= start_idx and start_injections:
|
|
433
|
+
result.extend(start_injections)
|
|
434
|
+
start_inserted = True
|
|
435
|
+
|
|
436
|
+
if not middle_inserted and i + 1 >= middle_idx and middle_injections:
|
|
437
|
+
result.extend(middle_injections)
|
|
438
|
+
middle_inserted = True
|
|
439
|
+
|
|
440
|
+
if not end_inserted and i + 1 >= end_idx and end_injections:
|
|
441
|
+
result.extend(end_injections)
|
|
442
|
+
end_inserted = True
|
|
443
|
+
|
|
444
|
+
# Append any remaining
|
|
445
|
+
if not start_inserted:
|
|
446
|
+
result = start_injections + result
|
|
447
|
+
if not middle_inserted:
|
|
448
|
+
mid = len(result) // 2
|
|
449
|
+
result = result[:mid] + middle_injections + result[mid:]
|
|
450
|
+
if not end_inserted:
|
|
451
|
+
result.extend(end_injections)
|
|
452
|
+
|
|
453
|
+
return '\n'.join(result)
|
|
454
|
+
|
|
455
|
+
def _extract_hidden_injections(self, soup) -> List[str]:
    """Collect the inner HTML of elements whose inline style looks hidden.

    An element qualifies when its lower-cased ``style`` attribute contains
    any entry of ``self.HIDDEN_PATTERNS``. Elements with fewer than 15
    characters of text, or whose text mentions a known UI phrase, are
    skipped. Each result is the element's inner markup wrapped in a ``<div>``.
    """
    ui_phrases = ('limit price', 'loading', 'order type', 'market order')
    collected = []

    for node in soup.find_all(style=True):
        css = node.get('style', '').lower()
        if not any(pattern in css for pattern in self.HIDDEN_PATTERNS):
            continue

        # Inner HTML is kept as-is; the text is only used for filtering.
        markup = node.decode_contents().strip()
        visible = node.get_text(strip=True)

        # Skip empty or very short elements.
        if len(visible) < 15:
            continue
        # Skip UI elements.
        lowered = visible.lower()
        if any(phrase in lowered for phrase in ui_phrases):
            continue

        collected.append(f"<div>{markup}</div>")

    return collected
|
|
476
|
+
|
|
477
|
+
def _clean_soup(self, soup):
    """Strip unwanted elements from ``soup`` in place.

    Removes every tag named in ``self.REMOVE_TAGS``, ``self.UI_TAGS`` and
    ``self.FORM_TAGS``, drops HTML comments, and deletes the ``session``
    query parameter from link targets.
    """
    # Remove unwanted tags, UI chrome and form elements.
    for tag_group in (self.REMOVE_TAGS, self.UI_TAGS, self.FORM_TAGS):
        for tag in tag_group:
            for el in soup.find_all(tag):
                el.decompose()

    # Remove comments
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()

    # Clean session parameters from links
    for a in soup.find_all('a', href=True):
        href = a.get('href', '')
        # When `session` is the first of several parameters, keep the '?'
        # so the next parameter still starts the query string
        # ('/p?session=x&tab=1' -> '/p?tab=1', not '/p&tab=1').
        href = re.sub(r'\?session=[^&\s]+&', '?', href)
        href = re.sub(r'[?&]session=[^&\s]+', '', href)
        a['href'] = href
|
|
504
|
+
|
|
505
|
+
def _get_title(self, soup) -> str:
    """Return the page ``<title>`` text as an ``<h1>``, or ``""`` if absent.

    A trailing " - Yahoo Finance" suffix is removed from the title text.
    """
    node = soup.find('title')
    if not node:
        return ""
    heading = re.sub(r'\s*-\s*Yahoo Finance$', '', node.get_text(strip=True))
    return f"<h1>{heading}</h1>"
|
|
514
|
+
|
|
515
|
+
def _extract_quote(self, soup) -> str:
    """Render a stock quote page as simplified HTML.

    Emits, in order: the title, a price block (first numeric-looking span
    plus the change element), a key-statistics table built from at most 16
    child rows, and a de-duplicated list of linked articles with a date
    suffix when one is found near the link.
    """
    out = [self._get_title(soup)]

    # Price section
    header = soup.find('div', class_=re.compile('quote-header|price-section'))
    if header:
        out.append("<div class='price'>")
        numeric = re.compile(r'^\$?[\d,]+\.?\d*$')
        span_texts = (span.get_text(strip=True) for span in header.find_all('span'))
        price = next(
            (t for t in span_texts if numeric.match(t.replace(',', ''))),
            None,
        )
        if price is not None:
            out.append(f"<span class='price-value'>{price}</span>")
        delta = header.find(class_=re.compile('change'))
        if delta:
            out.append(f"<span class='change'>{delta.get_text(strip=True)}</span>")
        out.append("</div>")

    # Key Statistics
    stats = soup.find(class_=re.compile('key-stat|quote-stat|stats'))
    if stats:
        out += ["<div class='stats'>", "<h3>Key Statistics</h3>", "<table>"]
        for entry in stats.find_all('div', recursive=False)[:16]:
            cells = entry.find_all('div', recursive=False)
            if len(cells) < 2:
                continue
            label = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            if label and value:
                out.append(f"<tr><td>{label}</td><td>{value}</td></tr>")
        out += ["</table>", "</div>"]

    # News section with dates
    news = soup.find(class_=re.compile('news-section|recent-news'))
    if news:
        out += ["<div class='news'>", "<h3>Recent News</h3>", "<ul>"]
        seen = set()
        for link in news.find_all('a', href=re.compile(r'/article/\d+')):
            headline = link.get_text(strip=True)
            found = re.search(r'/article/(\d+)', link.get('href', ''))
            if not found or not headline or len(headline) <= 10:
                continue
            article_id = found.group(1)
            if article_id in seen:
                continue
            seen.add(article_id)

            # Get date from the nearest div/article ancestor.
            suffix = ""
            wrapper = link.find_parent(['div', 'article'])
            meta = wrapper.find(class_=re.compile('source|meta|date')) if wrapper else None
            if meta:
                dated = re.search(r'(\d{4}-\d{2}-\d{2})', meta.get_text(strip=True))
                if dated:
                    suffix = f" | {dated.group(1)}"
            out.append(f"<li>[{article_id}] {headline}{suffix}</li>")
        out += ["</ul>", "</div>"]

    return "\n".join(out)
|
|
584
|
+
|
|
585
|
+
def _extract_news(self, soup) -> str:
    """Render a news listing as a ``<ul>`` of ``[id] title | source | date`` items.

    Every ``/article/<id>`` link on the page is considered; no item limit is
    applied here. Links with text shorter than 10 characters and repeated
    article ids are skipped.
    """
    out = [self._get_title(soup), "<ul>"]
    seen = set()

    for link in soup.find_all('a', href=re.compile(r'/article/\d+')):
        headline = link.get_text(strip=True)
        found = re.search(r'/article/(\d+)', link.get('href', ''))
        if not (found and headline and len(headline) >= 10):
            continue

        article_id = found.group(1)
        if article_id in seen:
            continue
        seen.add(article_id)

        # Get source and date separately
        source = ""
        date = ""
        wrapper = link.find_parent(['div', 'article'])
        if wrapper:
            source_node = wrapper.find(class_=re.compile('source'))
            date_node = wrapper.find(class_=re.compile('date|time'))
            if source_node:
                source = source_node.get_text(strip=True)
            if date_node:
                date = date_node.get_text(strip=True)

            # Fallback: first span whose text starts with YYYY-MM-DD
            if not date:
                for span in wrapper.find_all('span'):
                    candidate = span.get_text(strip=True)
                    if re.match(r'\d{4}-\d{2}-\d{2}', candidate):
                        date = candidate
                        break

            # Fallback: combined meta, split at the date
            if not (source or date):
                combined = wrapper.find(class_=re.compile('meta'))
                if combined:
                    combined_text = combined.get_text(strip=True)
                    dated = re.search(r'(\d{4}-\d{2}-\d{2}.*)', combined_text)
                    if dated:
                        date = dated.group(1)
                        source = combined_text[:dated.start()]

        # Format: [ID] Title | Source | Date
        trailer = "".join(f" | {piece}" for piece in (source, date) if piece)
        out.append(f"<li>[{article_id}] {headline}{trailer}</li>")

    out.append("</ul>")
    return "\n".join(out)
|
|
645
|
+
|
|
646
|
+
def _extract_chart(self, soup) -> str:
    """Render a chart / price-history page as a simplified table.

    Uses the table classed ``price-table``/``price-data`` when present,
    otherwise the first table. Body rows are capped at 25. The first
    period/range/info ``div`` mentioning "Data Points" is appended as a
    paragraph.
    """
    out = [self._get_title(soup)]

    # Find price data table
    table = soup.find('table', class_=re.compile('price-table|price-data')) or soup.find('table')

    if table:
        out.append("<table class='price-data'>")

        # Header
        head = table.find('thead')
        if head:
            out.append("<thead>")
            for row in head.find_all('tr'):
                out.append("<tr>")
                out.extend(f"<th>{cell.get_text(strip=True)}</th>" for cell in row.find_all('th'))
                out.append("</tr>")
            out.append("</thead>")

        # Body - limit to 25 rows
        body = table.find('tbody')
        if body:
            out.append("<tbody>")
            for row in body.find_all('tr')[:25]:
                out.append("<tr>")
                out.extend(f"<td>{cell.get_text(strip=True)}</td>" for cell in row.find_all('td'))
                out.append("</tr>")
            out.append("</tbody>")
        else:
            # No tbody, get rows directly (header cells are emitted as <td>)
            for row in table.find_all('tr')[:25]:
                out.append("<tr>")
                out.extend(
                    f"<td>{cell.get_text(strip=True)}</td>"
                    for cell in row.find_all(['td', 'th'])
                )
                out.append("</tr>")

        out.append("</table>")

    # Period info
    for block in soup.find_all('div', class_=re.compile('period|range|info')):
        summary = block.get_text(strip=True)
        if 'Data Points' in summary:
            out.append(f"<p class='info'>{summary}</p>")
            break

    return "\n".join(out)
|
|
698
|
+
|
|
699
|
+
def _extract_analysis(self, soup) -> str:
    """Render an analysis / ratings page as simplified HTML.

    Emits the title, the recommendation and price-target summaries when
    found, and every table on the page (first 20 rows each, all cells as
    ``<td>``).
    """
    out = [self._get_title(soup)]

    # Recommendation summary
    recommendation = soup.find(class_=re.compile('recommend|consensus|rating-summary'))
    if recommendation:
        out.append(f"<div class='recommendation'>{recommendation.get_text(strip=True)}</div>")

    # Price target
    price_target = soup.find(class_=re.compile('price-target|target'))
    if price_target:
        out.append(f"<div class='price-target'>{price_target.get_text(strip=True)}</div>")

    # All analyst tables (recommendation trend + individual ratings)
    for table in soup.find_all('table'):
        out.append("<table class='analyst-ratings'>")
        for row in table.find_all('tr')[:20]:
            out.append("<tr>")
            out.extend(
                f"<td>{cell.get_text(strip=True)}</td>"
                for cell in row.find_all(['td', 'th'])
            )
            out.append("</tr>")
        out.append("</table>")

    return "\n".join(out)
|
|
726
|
+
|
|
727
|
+
def _extract_profile(self, soup) -> str:
    """Render a company profile page as simplified HTML.

    Emits the title, the description truncated to 800 characters, and up to
    10 company-info rows whose text is longer than 3 characters.
    """
    out = [self._get_title(soup)]

    # Description
    description = soup.find(class_=re.compile('description|about|summary|profile-desc'))
    if description:
        blurb = description.get_text(strip=True)[:800]
        out.append(f"<p class='description'>{blurb}</p>")

    # Company info
    details = soup.find(class_=re.compile('company-info|profile-info'))
    if details:
        out.append("<div class='company-info'>")
        row_texts = (
            row.get_text(strip=True)
            for row in details.find_all('div', recursive=False)[:10]
        )
        out.extend(f"<p>{text}</p>" for text in row_texts if len(text) > 3)
        out.append("</div>")

    return "\n".join(out)
|
|
750
|
+
|
|
751
|
+
def _extract_article(self, soup) -> str:
    """Render a news article page as simplified HTML.

    Emits the ``<h1>`` headline, a parsed meta line (source, date, related
    symbol), up to 20 body paragraphs longer than 20 characters (or the
    first 2000 characters of the body when it has no ``<p>`` tags), and up
    to 10 user comments that have both an author and content.
    """
    out = []

    # Title
    headline = soup.find('h1')
    if headline:
        out.append(f"<h1>{headline.get_text(strip=True)}</h1>")

    # Meta (source, date) - parsed from text like "Yahoo2025-12-01 13:30Related:AAPL"
    meta = soup.find(class_=re.compile('article-meta|meta'))
    if meta:
        meta_text = meta.get_text(strip=True)
        source = date = related = ""

        stamp = re.search(r'(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2})', meta_text)
        if stamp:
            date = stamp.group(1)
            # Whatever precedes the timestamp is treated as the source.
            source = meta_text[:stamp.start()].strip()

        ticker = re.search(r'Related:\s*(\w+)', meta_text)
        if ticker:
            related = ticker.group(1)

        labelled = (("Source", source), ("Date", date), ("Symbol", related))
        fields = [f"{label}: {value}" for label, value in labelled if value]
        if fields:
            out.append(f"<p class='meta'>{' | '.join(fields)}</p>")

    # Content
    body = soup.find(['article', 'div'], class_=re.compile('article-content|content|body'))
    if body:
        paragraphs = body.find_all('p')
        if paragraphs:
            texts = (p.get_text(strip=True) for p in paragraphs[:20])
            out.extend(f"<p>{text}</p>" for text in texts if len(text) > 20)
        else:
            out.append(f"<p>{body.get_text(strip=True)[:2000]}</p>")

    # Comments section - only real user comments
    thread = soup.find(class_=re.compile('comments-list|comment-section'))
    if thread:
        rendered = []
        for item in thread.find_all(class_=re.compile('comment-item'))[:10]:
            author = item.find(class_=re.compile('author|user'))
            message = item.find(class_=re.compile('content|text|body'))
            if not (author and message):
                continue
            author_text = author.get_text(strip=True)
            message_text = message.get_text(strip=True)
            if len(message_text) > 5:
                rendered.append(
                    f"<div class='comment'><b>{author_text}:</b> {message_text[:300]}</div>"
                )
        if rendered:
            out += ["<div class='comments'>", "<h3>Comments</h3>", *rendered, "</div>"]

    return "\n".join(out)
|
|
824
|
+
|
|
825
|
+
def _extract_portfolio(self, soup) -> str:
    """Render a portfolio page as simplified HTML.

    Emits the account name (from the portfolio header's ``data-account``
    attribute) when present, a fixed heading, summary key/value pairs
    parsed from the summary text, and the first table's first 20 rows with
    "Trade" cells dropped.
    """
    out = []

    # Check for account name in portfolio header
    banner = soup.find(attrs={"data-section": "portfolio-header"})
    account = banner.get("data-account") if banner else None
    if account:
        out.append(f"<p><b>Account:</b> {account}</p>")

    out.append("<h1>Portfolio</h1>")

    # Summary section
    summary = soup.find(class_=re.compile('portfolio-summary|summary|overview'))
    if summary:
        out.append("<div class='summary'>")
        summary_text = summary.get_text(strip=True)
        # Parse key values
        pairs = re.finditer(
            r'(Total Value|Cash Available|Total Gain|Holdings)[\s:]*(\$?[\d,\.]+%?)',
            summary_text,
        )
        out.extend(f"<p><b>{pair.group(1)}:</b> {pair.group(2)}</p>" for pair in pairs)
        out.append("</div>")

    # Holdings table
    table = soup.find('table')
    if table:
        out.append("<table class='holdings'>")
        for row in table.find_all('tr')[:20]:
            out.append("<tr>")
            cell_texts = (cell.get_text(strip=True) for cell in row.find_all(['td', 'th']))
            # Skip "Trade" links
            out.extend(f"<td>{text}</td>" for text in cell_texts if text.lower() != 'trade')
            out.append("</tr>")
        out.append("</table>")

    return "\n".join(out)
|
|
863
|
+
|
|
864
|
+
def _extract_options(self, soup) -> str:
    """Render an options chain page as simplified HTML.

    Emits the title, the first page string containing a ``20YY-MM-DD`` date
    as the expiration, and the first two tables labelled "Calls" and
    "Puts". Each table gets a header row that starts with a fixed "Strike"
    column, followed by rows 1-14 of the source table.
    """
    out = [self._get_title(soup)]

    # Expiration date
    expiry = next(
        (text for text in soup.stripped_strings if re.search(r'20\d{2}-\d{2}-\d{2}', text)),
        None,
    )
    if expiry is not None:
        out.append(f"<p class='expiry'><b>Expiration:</b> {expiry}</p>")

    # Options tables (Calls and Puts); zip stops after the two labels.
    for label, table in zip(("Calls", "Puts"), soup.find_all('table')):
        out.append(f"<h3>{label}</h3>")
        out.append("<table class='options'>")

        # Header
        first_row = table.find('tr')
        if first_row:
            out.append("<tr>")
            out.append("<th>Strike</th>")
            headings = (th.get_text(strip=True) for th in first_row.find_all('th'))
            out.extend(f"<th>{heading}</th>" for heading in headings if heading)
            out.append("</tr>")

        # Data rows
        for row in table.find_all('tr')[1:15]:
            out.append("<tr>")
            out.extend(f"<td>{cell.get_text(strip=True)}</td>" for cell in row.find_all('td'))
            out.append("</tr>")

        out.append("</table>")

    return "\n".join(out)
|
|
905
|
+
|
|
906
|
+
def _extract_markets(self, soup) -> str:
    """Render the markets overview page as simplified HTML tables.

    Sections, in order: market indices, one table per movers card, and
    trending stocks. If none of them produced output, the first three
    tables on the page are rendered instead.
    """
    def text_of(node, default=''):
        return node.get_text(strip=True) if node else default

    def render_table(table):
        """Append a plain table (first 15 rows, every cell as <td>)."""
        out.append("<table>")
        for row in table.find_all('tr')[:15]:
            out.append("<tr>")
            out.extend(
                f"<td>{cell.get_text(strip=True)}</td>"
                for cell in row.find_all(['td', 'th'])
            )
            out.append("</tr>")
        out.append("</table>")

    out = ["<h1>Markets Overview</h1>"]

    # Market indices (index-card elements)
    indices = soup.find(attrs={"data-section": "market-indices"})
    if indices:
        out.append("<h3>Market Indices</h3>")
        out.append("<table>")
        out.append("<tr><td>Index</td><td>Value</td><td>Change</td></tr>")
        for card in indices.find_all(class_="index-card"):
            label = card.find(class_="index-name")
            level = card.find(class_="index-value")
            move = card.find(class_="index-change")
            if not label:
                continue
            out.append(
                f"<tr><td>{label.get_text(strip=True)}</td>"
                f"<td>{text_of(level, '--')}</td>"
                f"<td>{text_of(move, '--')}</td></tr>"
            )
        out.append("</table>")

    # Movers cards (gainers, losers, active) — each has a movers-table
    for card in soup.find_all(class_="movers-card"):
        heading = card.find(class_="movers-header")
        if heading:
            out.append(f"<h3>{heading.get_text(strip=True)}</h3>")
        movers = card.find('table')
        if movers:
            render_table(movers)

    # Trending stocks
    trending = soup.find(attrs={"data-section": "trending-stocks"})
    if trending:
        heading = trending.find(class_="trending-header")
        if heading:
            out.append(f"<h3>{heading.get_text(strip=True)}</h3>")
        out.append("<table>")
        out.append("<tr><td>#</td><td>Symbol</td><td>Name</td><td>Price</td><td>Change</td></tr>")
        for entry in trending.find_all(class_="trending-item"):
            rank = entry.find(class_="trending-rank")
            symbol = entry.find(class_="trending-symbol")
            company = entry.find(class_="trending-name")
            price = entry.find(class_="trending-price-value")
            move = entry.find(class_="trending-price-change")
            if not symbol:
                continue
            out.append(
                f"<tr><td>{text_of(rank)}</td>"
                f"<td>{symbol.get_text(strip=True)}</td>"
                f"<td>{text_of(company)}</td>"
                f"<td>{text_of(price)}</td>"
                f"<td>{text_of(move)}</td></tr>"
            )
        out.append("</table>")

    # Fallback: extract any remaining tables
    if len(out) <= 1:
        for table in soup.find_all('table')[:3]:
            render_table(table)

    return "\n".join(out)
|
|
977
|
+
|
|
978
|
+
def _extract_stock_list(self, soup) -> str:
    """Render the full stock list page (/markets/stocks/) as one table.

    The heading comes from the ``yf-card-title`` element, defaulting to
    "All Stocks". Every row of the ``yf-stocks-table`` table (or the first
    table on the page) is emitted, with header cells written as ``<td>``.
    """
    # Title with sort/sector info
    title_node = soup.find(class_="yf-card-title")
    heading = title_node.get_text(strip=True) if title_node else "All Stocks"
    out = [f"<h1>{heading}</h1>"]

    # Main table
    table = soup.find('table', class_='yf-stocks-table') or soup.find('table')
    if table:
        out.append("<table>")
        for row in table.find_all('tr'):
            out.append("<tr>")
            out.extend(
                f"<td>{cell.get_text(strip=True)}</td>"
                for cell in row.find_all(['td', 'th'])
            )
            out.append("</tr>")
        out.append("</table>")

    return "\n".join(out)
|
|
1001
|
+
|
|
1002
|
+
def _extract_search(self, soup) -> str:
|
|
1003
|
+
"""Extract search results page with enhanced information."""
|
|
1004
|
+
parts = []
|
|
1005
|
+
|
|
1006
|
+
parts.append("<h1>Search Results</h1>")
|
|
1007
|
+
|
|
1008
|
+
seen_stocks = set()
|
|
1009
|
+
seen_news = set()
|
|
1010
|
+
|
|
1011
|
+
# Extract stock results with price information
|
|
1012
|
+
stock_section = soup.find('div', class_='search-results')
|
|
1013
|
+
if stock_section:
|
|
1014
|
+
section_title = stock_section.find(class_='section-title')
|
|
1015
|
+
if section_title and '📈' in section_title.get_text():
|
|
1016
|
+
parts.append("<div class='stocks'>")
|
|
1017
|
+
parts.append("<h2>Stock Results</h2>")
|
|
1018
|
+
parts.append("<ul>")
|
|
1019
|
+
|
|
1020
|
+
for item in stock_section.find_all(class_=re.compile('result-item')):
|
|
1021
|
+
symbol_el = item.find(class_=re.compile('symbol'))
|
|
1022
|
+
name_el = item.find(class_=re.compile('name'))
|
|
1023
|
+
price_el = item.find(class_=re.compile('price'))
|
|
1024
|
+
|
|
1025
|
+
if symbol_el:
|
|
1026
|
+
symbol = symbol_el.get_text(strip=True)
|
|
1027
|
+
name = name_el.get_text(strip=True) if name_el else ""
|
|
1028
|
+
price = price_el.get_text(strip=True) if price_el else ""
|
|
1029
|
+
|
|
1030
|
+
key = symbol.upper()
|
|
1031
|
+
if key not in seen_stocks and len(symbol) <= 5:
|
|
1032
|
+
seen_stocks.add(key)
|
|
1033
|
+
price_info = f" | Price: {price}" if price and price != "--" else ""
|
|
1034
|
+
parts.append(f"<li><b>{symbol}</b> - {name}{price_info}</li>")
|
|
1035
|
+
|
|
1036
|
+
parts.append("</ul>")
|
|
1037
|
+
parts.append("</div>")
|
|
1038
|
+
|
|
1039
|
+
# Extract stocks mentioned in news
|
|
1040
|
+
all_sections = soup.find_all('div', class_='search-results')
|
|
1041
|
+
for section in all_sections:
|
|
1042
|
+
section_title = section.find(class_='section-title')
|
|
1043
|
+
if section_title and '📊' in section_title.get_text():
|
|
1044
|
+
parts.append("<div class='news-stocks'>")
|
|
1045
|
+
parts.append("<h2>Stocks Mentioned in News</h2>")
|
|
1046
|
+
parts.append("<ul>")
|
|
1047
|
+
|
|
1048
|
+
for item in section.find_all(class_=re.compile('result-item')):
|
|
1049
|
+
symbol_el = item.find(class_=re.compile('symbol'))
|
|
1050
|
+
name_el = item.find(class_=re.compile('name'))
|
|
1051
|
+
price_div = item.find('div', class_=re.compile('price'))
|
|
1052
|
+
|
|
1053
|
+
if symbol_el:
|
|
1054
|
+
symbol = symbol_el.get_text(strip=True)
|
|
1055
|
+
name = name_el.get_text(strip=True) if name_el else ""
|
|
1056
|
+
|
|
1057
|
+
key = symbol.upper()
|
|
1058
|
+
if key not in seen_stocks and len(symbol) <= 5:
|
|
1059
|
+
seen_stocks.add(key)
|
|
1060
|
+
|
|
1061
|
+
# Extract price and change
|
|
1062
|
+
price_info = ""
|
|
1063
|
+
if price_div:
|
|
1064
|
+
price_text = price_div.get_text(strip=True, separator=' ')
|
|
1065
|
+
if price_text:
|
|
1066
|
+
price_info = f" | {price_text}"
|
|
1067
|
+
|
|
1068
|
+
parts.append(f"<li><b>{symbol}</b> - {name}{price_info}</li>")
|
|
1069
|
+
|
|
1070
|
+
parts.append("</ul>")
|
|
1071
|
+
parts.append("</div>")
|
|
1072
|
+
|
|
1073
|
+
# Extract news results with summaries
|
|
1074
|
+
for section in all_sections:
|
|
1075
|
+
section_title = section.find(class_='section-title')
|
|
1076
|
+
if section_title and '📰' in section_title.get_text():
|
|
1077
|
+
parts.append("<div class='news'>")
|
|
1078
|
+
parts.append("<h2>News Results</h2>")
|
|
1079
|
+
parts.append("<ul>")
|
|
1080
|
+
|
|
1081
|
+
for news_item in section.find_all(class_='news-result'):
|
|
1082
|
+
# Extract headline
|
|
1083
|
+
headline_el = news_item.find(class_='news-headline')
|
|
1084
|
+
if headline_el:
|
|
1085
|
+
headline = headline_el.get_text(strip=True)
|
|
1086
|
+
href = headline_el.get('href', '')
|
|
1087
|
+
|
|
1088
|
+
# Extract article ID from href
|
|
1089
|
+
article_id = ""
|
|
1090
|
+
id_match = re.search(r'/article/(\d+)', href)
|
|
1091
|
+
if id_match:
|
|
1092
|
+
article_id = id_match.group(1)
|
|
1093
|
+
|
|
1094
|
+
if headline and article_id not in seen_news:
|
|
1095
|
+
seen_news.add(article_id)
|
|
1096
|
+
|
|
1097
|
+
# Extract metadata
|
|
1098
|
+
meta_parts = []
|
|
1099
|
+
meta_div = news_item.find(class_='news-meta')
|
|
1100
|
+
if meta_div:
|
|
1101
|
+
symbol_badge = meta_div.find(class_='news-symbol-badge')
|
|
1102
|
+
if symbol_badge:
|
|
1103
|
+
meta_parts.append(symbol_badge.get_text(strip=True))
|
|
1104
|
+
|
|
1105
|
+
source = meta_div.find(class_='news-source')
|
|
1106
|
+
if source:
|
|
1107
|
+
meta_parts.append(source.get_text(strip=True))
|
|
1108
|
+
|
|
1109
|
+
# Find date in meta
|
|
1110
|
+
for span in meta_div.find_all('span'):
|
|
1111
|
+
text = span.get_text(strip=True)
|
|
1112
|
+
if re.match(r'\d{4}-\d{2}-\d{2}', text):
|
|
1113
|
+
meta_parts.append(text)
|
|
1114
|
+
break
|
|
1115
|
+
|
|
1116
|
+
# Extract summary
|
|
1117
|
+
summary_el = news_item.find(class_='news-summary')
|
|
1118
|
+
summary = ""
|
|
1119
|
+
if summary_el:
|
|
1120
|
+
summary = summary_el.get_text(strip=True)[:150]
|
|
1121
|
+
if summary:
|
|
1122
|
+
summary = f"<br/> Summary: {summary}..."
|
|
1123
|
+
|
|
1124
|
+
meta_str = " | ".join(meta_parts) if meta_parts else ""
|
|
1125
|
+
if article_id:
|
|
1126
|
+
parts.append(f"<li>[{article_id}] {headline}")
|
|
1127
|
+
if meta_str:
|
|
1128
|
+
parts.append(f"<br/> {meta_str}")
|
|
1129
|
+
if summary:
|
|
1130
|
+
parts.append(summary)
|
|
1131
|
+
parts.append("</li>")
|
|
1132
|
+
|
|
1133
|
+
parts.append("</ul>")
|
|
1134
|
+
parts.append("</div>")
|
|
1135
|
+
|
|
1136
|
+
# Fallback: simple stock links if nothing found
|
|
1137
|
+
if not seen_stocks and not seen_news:
|
|
1138
|
+
parts.append("<ul>")
|
|
1139
|
+
for a in soup.find_all('a', href=re.compile('/quote/[A-Z]+')):
|
|
1140
|
+
text = a.get_text(strip=True)
|
|
1141
|
+
if text and 3 <= len(text) <= 50 and text not in seen_stocks:
|
|
1142
|
+
seen_stocks.add(text)
|
|
1143
|
+
parts.append(f"<li>{text}</li>")
|
|
1144
|
+
parts.append("</ul>")
|
|
1145
|
+
|
|
1146
|
+
if not seen_stocks and not seen_news:
|
|
1147
|
+
parts.append("<p>No results found</p>")
|
|
1148
|
+
|
|
1149
|
+
return "\n".join(parts)
|
|
1150
|
+
|
|
1151
|
+
def _extract_generic(self, soup) -> str:
|
|
1152
|
+
"""Generic fallback extraction."""
|
|
1153
|
+
parts = []
|
|
1154
|
+
|
|
1155
|
+
parts.append(self._get_title(soup))
|
|
1156
|
+
|
|
1157
|
+
# Get main content area
|
|
1158
|
+
main = soup.find('main') or soup.find(class_='main-content') or soup.find('body')
|
|
1159
|
+
if main:
|
|
1160
|
+
# Get text content, limit length
|
|
1161
|
+
text = main.get_text(strip=True)[:2000]
|
|
1162
|
+
parts.append(f"<div class='content'>{text}</div>")
|
|
1163
|
+
|
|
1164
|
+
return "\n".join(parts)
|