aiecs-1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,715 @@
import os
import json
import time
import logging
import tempfile
import subprocess
from typing import Dict, Any, List, Optional, Tuple
from enum import Enum

import httpx
from bs4 import BeautifulSoup
from urllib import request as urllib_request
from pydantic import BaseModel, ConfigDict, Field

from aiecs.tools.base_tool import BaseTool
from aiecs.tools import register_tool

# Enums for configuration options


class HttpMethod(str, Enum):
    GET = "get"
    POST = "post"
    PUT = "put"
    DELETE = "delete"
    HEAD = "head"
    OPTIONS = "options"
    PATCH = "patch"


class ContentType(str, Enum):
    HTML = "html"
    JSON = "json"
    TEXT = "text"
    BINARY = "binary"


class OutputFormat(str, Enum):
    TEXT = "text"
    JSON = "json"
    HTML = "html"
    MARKDOWN = "markdown"
    CSV = "csv"


class RenderEngine(str, Enum):
    NONE = "none"
    PLAYWRIGHT = "playwright"


# Exceptions
class ScraperToolError(Exception):
    """Base exception for ScraperTool errors."""


class HttpError(ScraperToolError):
    """Raised when HTTP requests fail."""


class TimeoutError(ScraperToolError):
    """Raised when operations time out."""


class RateLimitError(ScraperToolError):
    """Raised when rate limits are exceeded."""


class ParsingError(ScraperToolError):
    """Raised when HTML parsing fails."""


class RenderingError(ScraperToolError):
    """Raised when rendering fails."""


class ExternalToolError(ScraperToolError):
    """Raised when external tools fail."""


class FileOperationError(ScraperToolError):
    """Raised when file operations fail."""


@register_tool("scraper")
class ScraperTool(BaseTool):
    """
    Enhanced web scraping tool with multiple HTTP clients, JavaScript rendering,
    HTML parsing, and security features.

    Features:
    - Multiple HTTP clients: httpx, urllib
    - JavaScript rendering with Playwright or Selenium
    - HTML parsing with BeautifulSoup
    - Scrapy integration for advanced crawling
    - Output in various formats: text, JSON, HTML, Markdown, CSV
    """

    # Configuration schema
    class Config(BaseModel):
        """Configuration for the scraper tool"""

        model_config = ConfigDict(env_prefix="SCRAPER_TOOL_")

        user_agent: str = Field(
            default="PythonMiddlewareScraper/2.0",
            description="User agent for HTTP requests",
        )
        max_content_length: int = Field(
            default=10 * 1024 * 1024,
            description="Maximum content length in bytes",
        )
        output_dir: str = Field(
            default=os.path.join(tempfile.gettempdir(), "scraper_outputs"),
            description="Directory for output files",
        )
        scrapy_command: str = Field(default="scrapy", description="Command to run Scrapy")
        allowed_domains: List[str] = Field(default=[], description="Allowed domains for scraping")
        blocked_domains: List[str] = Field(default=[], description="Blocked domains for scraping")
        playwright_available: bool = Field(
            default=False,
            description="Whether Playwright is available (auto-detected)",
        )

    def __init__(self, config: Optional[Dict] = None):
        """
        Initialize ScraperTool with settings and resources.

        Args:
            config (Dict, optional): Configuration overrides for ScraperTool.

        Raises:
            ValueError: If config contains invalid settings.
        """
        super().__init__(config)

        # Parse configuration
        self.config = self.Config(**(config or {}))

        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)
        os.makedirs(self.config.output_dir, exist_ok=True)
        self._check_external_tools()

    def _check_external_tools(self):
        """Check if external tools are available."""
        try:
            # Optional dependency probe: Playwright is only needed for render().
            import playwright  # noqa: F401

            self.config.playwright_available = True
        except ImportError:
            self.config.playwright_available = False

    async def _save_output(self, content: Any, path: str, format: OutputFormat) -> None:
        """Save content to file in the specified format."""
        try:
            os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
            if format == OutputFormat.TEXT:
                with open(path, "w", encoding="utf-8") as f:
                    if isinstance(content, dict):
                        f.write(json.dumps(content, indent=2))
                    else:
                        f.write(str(content))
            elif format == OutputFormat.JSON:
                with open(path, "w", encoding="utf-8") as f:
                    if isinstance(content, dict):
                        json.dump(content, f, indent=2)
                    else:
                        json.dump({"content": content}, f, indent=2)
            elif format == OutputFormat.HTML:
                with open(path, "w", encoding="utf-8") as f:
                    if isinstance(content, dict) and "html" in content:
                        f.write(content["html"])
                    else:
                        f.write(str(content))
            elif format == OutputFormat.MARKDOWN:
                with open(path, "w", encoding="utf-8") as f:
                    if isinstance(content, dict):
                        f.write("# Scraper Results\n\n")
                        for key, value in content.items():
                            f.write(f"## {key}\n\n")
                            f.write(f"{value}\n\n")
                    else:
                        f.write("# Scraper Results\n\n")
                        f.write(str(content))
            elif format == OutputFormat.CSV:
                import csv

                with open(path, "w", newline="", encoding="utf-8") as f:
                    if isinstance(content, dict):
                        writer = csv.writer(f)
                        writer.writerow(content.keys())
                        writer.writerow(content.values())
                    elif isinstance(content, list) and all(
                        isinstance(item, dict) for item in content
                    ):
                        if content:
                            writer = csv.DictWriter(f, fieldnames=content[0].keys())
                            writer.writeheader()
                            writer.writerows(content)
                    else:
                        writer = csv.writer(f)
                        writer.writerow(["content"])
                        writer.writerow([str(content)])
        except Exception as e:
            raise FileOperationError(f"Error saving output: {str(e)}")

    async def get_httpx(
        self,
        url: str,
        method: HttpMethod = HttpMethod.GET,
        params: Optional[Dict[str, str]] = None,
        data: Optional[Dict[str, Any]] = None,
        json_data: Optional[Dict[str, Any]] = None,
        cookies: Optional[Dict[str, str]] = None,
        auth: Optional[Tuple[str, str]] = None,
        verify_ssl: Optional[bool] = None,
        allow_redirects: bool = True,
        content_type: ContentType = ContentType.TEXT,
        headers: Optional[Dict[str, str]] = None,
        output_format: Optional[OutputFormat] = None,
        output_path: Optional[str] = None,
        async_mode: bool = True,
    ) -> Any:
        """
        Execute HTTP request using httpx library (supports both sync and async).

        Args:
            url (str): URL to scrape.
            method (HttpMethod): HTTP method to use.
            params (Optional[Dict[str, str]]): Query parameters.
            data (Optional[Dict[str, Any]]): Form data.
            json_data (Optional[Dict[str, Any]]): JSON data.
            cookies (Optional[Dict[str, str]]): Cookies.
            auth (Optional[Tuple[str, str]]): Authentication credentials.
            verify_ssl (Optional[bool]): Verify SSL certificates.
            allow_redirects (bool): Allow redirects.
            content_type (ContentType): Expected content type.
            headers (Optional[Dict[str, str]]): Custom headers.
            output_format (Optional[OutputFormat]): Output format.
            output_path (Optional[str]): Path to save output.
            async_mode (bool): Whether to use async client.

        Returns:
            Any: Scraped content (dict, str, or bytes).

        Raises:
            HttpError: If the request fails.
        """
        try:
            headers = headers or {}
            if "User-Agent" not in headers:
                headers["User-Agent"] = self.config.user_agent
            kwargs = {
                "params": params,
                "headers": headers,
                "follow_redirects": allow_redirects,
            }
            if auth:
                kwargs["auth"] = auth
            if cookies:
                kwargs["cookies"] = cookies
            if json_data:
                kwargs["json"] = json_data
            elif data:
                kwargs["data"] = data

            if async_mode:
                async with httpx.AsyncClient(
                    verify=verify_ssl if verify_ssl is not None else True
                ) as client:
                    method_fn = getattr(client, method.value)
                    resp = await method_fn(str(url), **kwargs)
            else:
                with httpx.Client(verify=verify_ssl if verify_ssl is not None else True) as client:
                    method_fn = getattr(client, method.value)
                    resp = method_fn(str(url), **kwargs)

            try:
                resp.raise_for_status()
            except httpx.HTTPStatusError as e:
                raise HttpError(
                    f"HTTP {e.response.status_code}: {e.response.reason_phrase} for {url}"
                )

            if len(resp.content) > self.config.max_content_length:
                raise HttpError(f"Response content too large: {len(resp.content)} bytes")

            if content_type == ContentType.JSON:
                result = resp.json()
            elif content_type == ContentType.HTML:
                result = {
                    "html": resp.text,
                    "url": str(resp.url),
                    "status": resp.status_code,
                }
            elif content_type == ContentType.BINARY:
                result = {
                    "content": resp.content,
                    "url": str(resp.url),
                    "status": resp.status_code,
                }
            else:
                result = resp.text

            if output_format and output_path:
                await self._save_output(result, output_path, output_format)
                if isinstance(result, dict):
                    result["saved_to"] = output_path
                else:
                    result = {"content": result, "saved_to": output_path}
            return result
        except httpx.RequestError as e:
            raise HttpError(f"Request failed: {str(e)}")

    async def get_urllib(
        self,
        url: str,
        method: HttpMethod = HttpMethod.GET,
        data: Optional[Dict[str, Any]] = None,
        content_type: ContentType = ContentType.TEXT,
        headers: Optional[Dict[str, str]] = None,
        output_format: Optional[OutputFormat] = None,
        output_path: Optional[str] = None,
    ) -> Any:
        """
        Execute HTTP request using urllib.

        Args:
            url (str): URL to scrape.
            method (HttpMethod): HTTP method to use.
            data (Optional[Dict[str, Any]]): Form data.
            content_type (ContentType): Expected content type.
            headers (Optional[Dict[str, str]]): Custom headers.
            output_format (Optional[OutputFormat]): Output format.
            output_path (Optional[str]): Path to save output.

        Returns:
            Any: Scraped content (dict, str, or bytes).

        Raises:
            HttpError: If the request fails.
        """
        try:
            import urllib.parse
            import urllib.error

            headers = headers or {}
            if "User-Agent" not in headers:
                headers["User-Agent"] = self.config.user_agent
            data_bytes = None
            if data:
                data_bytes = urllib.parse.urlencode(data).encode()
            req = urllib_request.Request(
                str(url),
                data=data_bytes,
                headers=headers,
                method=method.value.upper(),
            )
            with urllib_request.urlopen(req) as resp:
                content_length = resp.getheader("Content-Length")
                if content_length and int(content_length) > self.config.max_content_length:
                    raise HttpError(f"Response content too large: {content_length} bytes")
                content = resp.read()
                charset = resp.headers.get_content_charset() or "utf-8"
                if content_type == ContentType.JSON:
                    result = json.loads(content.decode(charset, errors="ignore"))
                elif content_type == ContentType.HTML:
                    result = {
                        "html": content.decode(charset, errors="ignore"),
                        "url": resp.url,
                        "status": resp.status,
                    }
                elif content_type == ContentType.BINARY:
                    result = {
                        "content": content,
                        "url": resp.url,
                        "status": resp.status,
                    }
                else:
                    result = content.decode(charset, errors="ignore")
            if output_format and output_path:
                await self._save_output(result, output_path, output_format)
                if isinstance(result, dict):
                    result["saved_to"] = output_path
                else:
                    result = {"content": result, "saved_to": output_path}
            return result
        except urllib.error.URLError as e:
            raise HttpError(f"Request failed: {str(e)}")

    # Legacy method names for backward compatibility
    async def get_requests(
        self,
        url: str,
        method: HttpMethod = HttpMethod.GET,
        params: Optional[Dict[str, str]] = None,
        data: Optional[Dict[str, Any]] = None,
        json_data: Optional[Dict[str, Any]] = None,
        cookies: Optional[Dict[str, str]] = None,
        auth: Optional[Tuple[str, str]] = None,
        verify_ssl: Optional[bool] = None,
        allow_redirects: bool = True,
        content_type: ContentType = ContentType.TEXT,
        headers: Optional[Dict[str, str]] = None,
        output_format: Optional[OutputFormat] = None,
        output_path: Optional[str] = None,
    ) -> Any:
        """Legacy method - now uses httpx in sync mode."""
        return await self.get_httpx(
            url,
            method,
            params,
            data,
            json_data,
            cookies,
            auth,
            verify_ssl,
            allow_redirects,
            content_type,
            headers,
            output_format,
            output_path,
            async_mode=False,
        )

    async def get_aiohttp(
        self,
        url: str,
        method: HttpMethod = HttpMethod.GET,
        params: Optional[Dict[str, str]] = None,
        data: Optional[Dict[str, Any]] = None,
        json_data: Optional[Dict[str, Any]] = None,
        cookies: Optional[Dict[str, str]] = None,
        auth: Optional[Tuple[str, str]] = None,
        verify_ssl: Optional[bool] = None,
        allow_redirects: bool = True,
        content_type: ContentType = ContentType.TEXT,
        headers: Optional[Dict[str, str]] = None,
        output_format: Optional[OutputFormat] = None,
        output_path: Optional[str] = None,
    ) -> Any:
        """Legacy method - now uses httpx in async mode."""
        return await self.get_httpx(
            url,
            method,
            params,
            data,
            json_data,
            cookies,
            auth,
            verify_ssl,
            allow_redirects,
            content_type,
            headers,
            output_format,
            output_path,
            async_mode=True,
        )

    async def render(
        self,
        url: str,
        engine: RenderEngine = RenderEngine.PLAYWRIGHT,
        wait_time: int = 5,
        wait_selector: Optional[str] = None,
        scroll_to_bottom: bool = False,
        screenshot: bool = False,
        screenshot_path: Optional[str] = None,
        headers: Optional[Dict[str, str]] = None,
        output_format: Optional[OutputFormat] = None,
        output_path: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Render a web page using a headless browser (Playwright).

        Args:
            url (str): URL to render.
            engine (RenderEngine): Rendering engine to use (only PLAYWRIGHT supported).
            wait_time (int): Time to wait for JS execution.
            wait_selector (Optional[str]): CSS selector to wait for.
            scroll_to_bottom (bool): Whether to scroll to the bottom of the page.
            screenshot (bool): Whether to take a screenshot.
            screenshot_path (Optional[str]): Path to save the screenshot.
            headers (Optional[Dict[str, str]]): Custom headers.
            output_format (Optional[OutputFormat]): Output format.
            output_path (Optional[str]): Path to save output.

        Returns:
            Dict[str, Any]: Rendered page content {'html': str, 'title': str, 'url': str, 'screenshot': Optional[str]}.

        Raises:
            RenderingError: If rendering fails.
        """
        try:
            if engine == RenderEngine.PLAYWRIGHT:
                if not self.config.playwright_available:
                    raise RenderingError(
                        "Playwright is not available. Install with 'pip install playwright'"
                    )
                result = await self._render_with_playwright(
                    url,
                    wait_time,
                    wait_selector,
                    scroll_to_bottom,
                    screenshot,
                    screenshot_path,
                )
            else:
                raise RenderingError(
                    f"Unsupported rendering engine: {engine}. Only PLAYWRIGHT is supported."
                )
            if output_format and output_path:
                await self._save_output(result, output_path, output_format)
                result["saved_to"] = output_path
            return result
        except Exception as e:
            raise RenderingError(f"Failed to render page: {str(e)}")

    async def _render_with_playwright(
        self,
        url: str,
        wait_time: int,
        wait_selector: Optional[str],
        scroll_to_bottom: bool,
        screenshot: bool,
        screenshot_path: Optional[str],
    ) -> Dict[str, Any]:
        """Render a web page using Playwright with async API."""
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page(
                user_agent=self.config.user_agent,
                viewport={"width": 1280, "height": 800},
            )
            try:
                await page.goto(url)
                if wait_selector:
                    await page.wait_for_selector(wait_selector)
                else:
                    await page.wait_for_load_state("networkidle")
                if scroll_to_bottom:
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await page.wait_for_timeout(1000)
                screenshot_result = None
                if screenshot:
                    screenshot_path = screenshot_path or os.path.join(
                        self.config.output_dir,
                        f"screenshot_{int(time.time())}.png",
                    )
                    os.makedirs(
                        os.path.dirname(os.path.abspath(screenshot_path)),
                        exist_ok=True,
                    )
                    await page.screenshot(path=screenshot_path)
                    screenshot_result = screenshot_path
                html = await page.content()
                title = await page.title()
                result = {
                    "html": html,
                    "title": title,
                    "url": page.url,
                    "screenshot": screenshot_result,
                }
                return result
            finally:
                await browser.close()

    def crawl_scrapy(
        self,
        project_path: str,
        spider_name: str,
        output_path: str,
        spider_args: Optional[Dict[str, str]] = None,
        headers: Optional[Dict[str, str]] = None,
        output_format: Optional[OutputFormat] = None,
    ) -> Dict[str, Any]:
        """
        Execute a Scrapy spider in an existing project and output results to a file.

        Args:
            project_path (str): Path to the Scrapy project.
            spider_name (str): Name of the spider to run.
            output_path (str): Path to save the output.
            spider_args (Optional[Dict[str, str]]): Arguments to pass to the spider.
            headers (Optional[Dict[str, str]]): Custom headers.
            output_format (Optional[OutputFormat]): Output format.

        Returns:
            Dict[str, Any]: Crawl results {'output_path': str, 'execution_time': float, 'file_size': int, 'stdout': str, 'stderr': str}.

        Raises:
            ExternalToolError: If Scrapy fails.
            TimeoutError: If the operation times out.
        """
        try:
            start_time = time.time()
            os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
            cmd = [
                self.config.scrapy_command,
                "crawl",
                spider_name,
                "-o",
                output_path,
                "-s",
                f"USER_AGENT={self.config.user_agent}",
                "-s",
                "LOG_LEVEL=INFO",
            ]
            if spider_args:
                for k, v in spider_args.items():
                    cmd += ["-a", f"{k}={v}"]
            process = subprocess.run(
                cmd,
                cwd=project_path,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            if process.returncode != 0:
                error_msg = process.stderr.strip()
                raise ExternalToolError(f"Scrapy crawl failed: {error_msg}")
            if not os.path.exists(output_path):
                raise ExternalToolError(f"Scrapy crawl did not create output file: {output_path}")
            file_size = os.path.getsize(output_path)
            result = {
                "output_path": output_path,
                "execution_time": time.time() - start_time,
                "file_size": file_size,
                "stdout": process.stdout.strip(),
                "stderr": process.stderr.strip(),
            }
            return result
        except subprocess.TimeoutExpired:
            raise TimeoutError("Scrapy crawl timed out")
        except Exception as e:
            raise ExternalToolError(f"Error running Scrapy: {str(e)}")

    def parse_html(
        self,
        html: str,
        selector: str,
        selector_type: str = "css",
        extract_attr: Optional[str] = None,
        extract_text: bool = True,
    ) -> Dict[str, Any]:
        """
        Parse HTML content using BeautifulSoup.

        Args:
            html (str): HTML content to parse.
            selector (str): CSS or XPath selector.
            selector_type (str): Selector type ('css' or 'xpath').
            extract_attr (Optional[str]): Attribute to extract.
            extract_text (bool): Whether to extract text content.

        Returns:
            Dict[str, Any]: Parsed results {'selector': str, 'selector_type': str, 'count': int, 'results': List[str]}.

        Raises:
            ParsingError: If parsing fails.
        """
        try:
            soup = BeautifulSoup(html, "html.parser")
            if selector_type == "css":
                elements = soup.select(selector)
            else:
                from lxml.html import fromstring
                from lxml.etree import XPath

                root = fromstring(html)
                xpath = XPath(selector)
                elements = xpath(root)
            results = []
            for element in elements:
                if extract_attr:
                    # Both lxml and BeautifulSoup elements expose .get() for attribute lookup.
                    value = element.get(extract_attr)
                    if value is not None:
                        results.append(value)
                elif extract_text:
                    if hasattr(element, "text_content") and callable(
                        getattr(element, "text_content")
                    ):
                        # lxml element
                        text = element.text_content()
                    else:
                        # BeautifulSoup element
                        text = element.get_text()

                    if text and text.strip():
                        results.append(text.strip())
            return {
                "selector": selector,
                "selector_type": selector_type,
                "count": len(results),
                "results": results,
            }
        except Exception as e:
            raise ParsingError(f"Error parsing HTML: {str(e)}")

    # HTTP method shortcuts
    get = get_httpx
    post = get_httpx
    put = get_httpx
    delete = get_httpx
    head = get_httpx
    options = get_httpx
    patch = get_httpx