aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of aiecs might be problematic.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
aiecs/tools/task_tools/scraper_tool.py

@@ -2,23 +2,24 @@ import os
 import json
 import time
 import logging
-import asyncio
 import tempfile
 import subprocess
-from typing import Dict, Any, List, Optional,
+from typing import Dict, Any, List, Optional, Tuple, Union
+import csv
 from enum import Enum
-from urllib.parse import urlparse, urljoin
 
 import httpx
 from bs4 import BeautifulSoup
 from urllib import request as urllib_request
-from pydantic import BaseModel,
-from pydantic_settings import BaseSettings
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 from aiecs.tools.base_tool import BaseTool
 from aiecs.tools import register_tool
 
 # Enums for configuration options
+
+
 class HttpMethod(str, Enum):
     GET = "get"
     POST = "post"
@@ -28,12 +29,14 @@ class HttpMethod(str, Enum):
     OPTIONS = "options"
     PATCH = "patch"
 
+
 class ContentType(str, Enum):
     HTML = "html"
     JSON = "json"
     TEXT = "text"
     BINARY = "binary"
 
+
 class OutputFormat(str, Enum):
     TEXT = "text"
     JSON = "json"
@@ -41,68 +44,44 @@ class OutputFormat(str, Enum):
     MARKDOWN = "markdown"
     CSV = "csv"
 
+
 class RenderEngine(str, Enum):
     NONE = "none"
     PLAYWRIGHT = "playwright"
 
-# Global settings
-class ScraperSettings(BaseSettings):
-    """
-    Configuration for ScraperTool.
-
-    Attributes:
-        user_agent (str): User agent for HTTP requests.
-        max_content_length (int): Maximum content length in bytes.
-        output_dir (str): Directory for output files.
-        scrapy_command (str): Command to run Scrapy.
-        allowed_domains (List[str]): Allowed domains for scraping.
-        blocked_domains (List[str]): Blocked domains for scraping.
-        playwright_available (bool): Whether Playwright is available.
-        env_prefix (str): Environment variable prefix.
-    """
-    user_agent: str = "PythonMiddlewareScraper/2.0"
-    max_content_length: int = 10 * 1024 * 1024  # 10MB
-    output_dir: str = os.path.join(tempfile.gettempdir(), 'scraper_outputs')
-    scrapy_command: str = "scrapy"
-    allowed_domains: List[str] = []
-    blocked_domains: List[str] = []
-    playwright_available: bool = False
-    env_prefix: str = "SCRAPER_TOOL_"
-
-    model_config = ConfigDict(env_prefix="SCRAPER_TOOL_")
 
 # Exceptions
 class ScraperToolError(Exception):
     """Base exception for ScraperTool errors."""
-
+
 
 class HttpError(ScraperToolError):
     """Raised when HTTP requests fail."""
-
+
 
 class TimeoutError(ScraperToolError):
     """Raised when operations time out."""
-
+
 
 class RateLimitError(ScraperToolError):
     """Raised when rate limits are exceeded."""
-
+
 
 class ParsingError(ScraperToolError):
     """Raised when HTML parsing fails."""
-
+
 
 class RenderingError(ScraperToolError):
     """Raised when rendering fails."""
-
+
 
 class ExternalToolError(ScraperToolError):
     """Raised when external tools fail."""
-
+
 
 class FileOperationError(ScraperToolError):
     """Raised when file operations fail."""
-
+
 
 @register_tool("scraper")
 class ScraperTool(BaseTool):
@@ -117,65 +96,128 @@ class ScraperTool(BaseTool):
     - Scrapy integration for advanced crawling
     - Output in various formats: text, JSON, HTML, Markdown, CSV
     """
-
+
+    # Configuration schema
+    class Config(BaseSettings):
+        """Configuration for the scraper tool
+
+        Automatically reads from environment variables with SCRAPER_TOOL_ prefix.
+        Example: SCRAPER_TOOL_USER_AGENT -> user_agent
+        """
+
+        model_config = SettingsConfigDict(env_prefix="SCRAPER_TOOL_")
+
+        user_agent: str = Field(
+            default="PythonMiddlewareScraper/2.0",
+            description="User agent for HTTP requests",
+        )
+        max_content_length: int = Field(
+            default=10 * 1024 * 1024,
+            description="Maximum content length in bytes",
+        )
+        output_dir: str = Field(
+            default=os.path.join(tempfile.gettempdir(), "scraper_outputs"),
+            description="Directory for output files",
+        )
+        scrapy_command: str = Field(default="scrapy", description="Command to run Scrapy")
+        allowed_domains: List[str] = Field(default=[], description="Allowed domains for scraping")
+        blocked_domains: List[str] = Field(default=[], description="Blocked domains for scraping")
+        playwright_available: bool = Field(
+            default=False,
+            description="Whether Playwright is available (auto-detected)",
+        )
+
+    # Schema definitions
+    class Get_httpxSchema(BaseModel):
+        """Schema for get_httpx operation"""
+
+        url: str = Field(description="URL to scrape")
+        method: HttpMethod = Field(default=HttpMethod.GET, description="HTTP method to use: GET, POST, PUT, DELETE, HEAD, OPTIONS, or PATCH")
+        params: Optional[Dict[str, str]] = Field(default=None, description="Optional query parameters as dictionary")
+        data: Optional[Dict[str, Any]] = Field(default=None, description="Optional form data as dictionary. Mutually exclusive with json_data")
+        json_data: Optional[Dict[str, Any]] = Field(default=None, description="Optional JSON data as dictionary. Mutually exclusive with data")
+        cookies: Optional[Dict[str, str]] = Field(default=None, description="Optional cookies as dictionary")
+        auth: Optional[Tuple[str, str]] = Field(default=None, description="Optional authentication credentials as (username, password) tuple")
+        verify_ssl: Optional[bool] = Field(default=None, description="Optional SSL certificate verification. If None, defaults to True")
+        allow_redirects: bool = Field(default=True, description="Whether to allow HTTP redirects")
+        content_type: ContentType = Field(default=ContentType.TEXT, description="Expected content type: TEXT, JSON, HTML, or BINARY")
+        headers: Optional[Dict[str, str]] = Field(default=None, description="Optional custom HTTP headers as dictionary")
+        output_format: Optional[OutputFormat] = Field(default=None, description="Optional output format for saving: TEXT, JSON, HTML, MARKDOWN, or CSV")
+        output_path: Optional[str] = Field(default=None, description="Optional path to save output file. Requires output_format to be specified")
+        async_mode: bool = Field(default=True, description="Whether to use async HTTP client. If False, uses synchronous client")
+
+    class Parse_htmlSchema(BaseModel):
+        """Schema for parse_html operation"""
+
+        html: str = Field(description="HTML content string to parse")
+        selector: str = Field(description="CSS selector or XPath expression to find elements")
+        selector_type: str = Field(default="css", description="Selector type: 'css' for CSS selectors or 'xpath' for XPath expressions")
+        extract_attr: Optional[str] = Field(default=None, description="Optional attribute name to extract from matched elements (e.g., 'href', 'src')")
+        extract_text: bool = Field(default=True, description="Whether to extract text content from matched elements. Ignored if extract_attr is specified")
+
+    def __init__(self, config: Optional[Dict] = None, **kwargs):
         """
         Initialize ScraperTool with settings and resources.
 
         Args:
-            config (Dict, optional): Configuration overrides for
+            config (Dict, optional): Configuration overrides for ScraperTool.
+            **kwargs: Additional arguments passed to BaseTool (e.g., tool_name)
 
         Raises:
             ValueError: If config contains invalid settings.
+
+        Configuration is automatically loaded by BaseTool from:
+        1. Explicit config dict (highest priority)
+        2. YAML config files (config/tools/scraper.yaml)
+        3. Environment variables (via dotenv from .env files)
+        4. Tool defaults (lowest priority)
         """
-        super().__init__(config)
-
-
-
-
-
-            raise ValueError(f"Invalid settings: {e}")
+        super().__init__(config, **kwargs)
+
+        # Configuration is automatically loaded by BaseTool into self._config_obj
+        # Access config via self._config_obj (BaseSettings instance)
+        self.config = self._config_obj if self._config_obj else self.Config()
+
         self.logger = logging.getLogger(__name__)
         if not self.logger.handlers:
             handler = logging.StreamHandler()
-            handler.setFormatter(logging.Formatter(
+            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)
-        os.makedirs(self.
+        os.makedirs(self.config.output_dir, exist_ok=True)
         self._check_external_tools()
 
     def _check_external_tools(self):
         """Check if external tools are available."""
         try:
-
-            self.settings.playwright_available = True
+            self.config.playwright_available = True
         except ImportError:
-            self.
-
+            self.config.playwright_available = False
 
     async def _save_output(self, content: Any, path: str, format: OutputFormat) -> None:
         """Save content to file in the specified format."""
         try:
             os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
             if format == OutputFormat.TEXT:
-                with open(path,
+                with open(path, "w", encoding="utf-8") as f:
                     if isinstance(content, dict):
                         f.write(json.dumps(content, indent=2))
                     else:
                         f.write(str(content))
             elif format == OutputFormat.JSON:
-                with open(path,
+                with open(path, "w", encoding="utf-8") as f:
                     if isinstance(content, dict):
                         json.dump(content, f, indent=2)
                     else:
                         json.dump({"content": content}, f, indent=2)
             elif format == OutputFormat.HTML:
-                with open(path,
-                    if isinstance(content, dict) and
-                        f.write(content[
+                with open(path, "w", encoding="utf-8") as f:
+                    if isinstance(content, dict) and "html" in content:
+                        f.write(content["html"])
                     else:
                         f.write(str(content))
             elif format == OutputFormat.MARKDOWN:
-                with open(path,
+                with open(path, "w", encoding="utf-8") as f:
                     if isinstance(content, dict):
                         f.write("# Scraper Results\n\n")
                         for key, value in content.items():
@@ -186,7 +228,9 @@ class ScraperTool(BaseTool):
                         f.write(str(content))
             elif format == OutputFormat.CSV:
                 import csv
-
+
+                with open(path, "w", newline="", encoding="utf-8") as f:
+                    writer: Union[Any, Any]  # csv.writer or csv.DictWriter instance
                 if isinstance(content, dict):
                     writer = csv.writer(f)
                     writer.writerow(content.keys())
@@ -203,7 +247,23 @@ class ScraperTool(BaseTool):
         except Exception as e:
             raise FileOperationError(f"Error saving output: {str(e)}")
 
-    async def get_httpx(
+    async def get_httpx(
+        self,
+        url: str,
+        method: HttpMethod = HttpMethod.GET,
+        params: Optional[Dict[str, str]] = None,
+        data: Optional[Dict[str, Any]] = None,
+        json_data: Optional[Dict[str, Any]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+        auth: Optional[Tuple[str, str]] = None,
+        verify_ssl: Optional[bool] = None,
+        allow_redirects: bool = True,
+        content_type: ContentType = ContentType.TEXT,
+        headers: Optional[Dict[str, str]] = None,
+        output_format: Optional[OutputFormat] = None,
+        output_path: Optional[str] = None,
+        async_mode: bool = True,
+    ) -> Any:
         """
         Execute HTTP request using httpx library (supports both sync and async).
 
@@ -231,21 +291,21 @@ class ScraperTool(BaseTool):
         """
         try:
             headers = headers or {}
-            if
-                headers[
-            kwargs = {
-
-
-
+            if "User-Agent" not in headers:
+                headers["User-Agent"] = self.config.user_agent
+            kwargs: Dict[str, Any] = {
+                "params": params,
+                "headers": headers,
+                "follow_redirects": allow_redirects,
             }
             if auth:
-                kwargs[
+                kwargs["auth"] = auth  # httpx accepts Tuple[str, str] for auth
             if cookies:
-                kwargs[
+                kwargs["cookies"] = cookies
             if json_data:
-                kwargs[
+                kwargs["json"] = json_data
             elif data:
-                kwargs[
+                kwargs["data"] = data
 
             if async_mode:
                 async with httpx.AsyncClient(verify=verify_ssl if verify_ssl is not None else True) as client:
@@ -260,30 +320,47 @@ class ScraperTool(BaseTool):
                     resp.raise_for_status()
                 except httpx.HTTPStatusError as e:
                     raise HttpError(f"HTTP {e.response.status_code}: {e.response.reason_phrase} for {url}")
-
-            if len(resp.content) > self.
+
+            if len(resp.content) > self.config.max_content_length:
                 raise HttpError(f"Response content too large: {len(resp.content)} bytes")
 
             if content_type == ContentType.JSON:
                 result = resp.json()
             elif content_type == ContentType.HTML:
-                result = {
+                result = {
+                    "html": resp.text,
+                    "url": str(resp.url),
+                    "status": resp.status_code,
+                }
             elif content_type == ContentType.BINARY:
-                result = {
+                result = {
+                    "content": resp.content,
+                    "url": str(resp.url),
+                    "status": resp.status_code,
+                }
             else:
                 result = resp.text
 
             if output_format and output_path:
                 await self._save_output(result, output_path, output_format)
                 if isinstance(result, dict):
-                    result[
+                    result["saved_to"] = output_path
                 else:
-                    result = {
+                    result = {"content": result, "saved_to": output_path}
             return result
         except httpx.RequestError as e:
             raise HttpError(f"Request failed: {str(e)}")
 
-    async def get_urllib(
+    async def get_urllib(
+        self,
+        url: str,
+        method: HttpMethod = HttpMethod.GET,
+        data: Optional[Dict[str, Any]] = None,
+        content_type: ContentType = ContentType.TEXT,
+        headers: Optional[Dict[str, str]] = None,
+        output_format: Optional[OutputFormat] = None,
+        output_path: Optional[str] = None,
+    ) -> Any:
         """
         Execute HTTP request using urllib.
 
@@ -305,10 +382,10 @@ class ScraperTool(BaseTool):
         try:
             import urllib.parse
             import urllib.error
-
+
             headers = headers or {}
-            if
-                headers[
+            if "User-Agent" not in headers:
+                headers["User-Agent"] = self.config.user_agent
             data_bytes = None
             if data:
                 data_bytes = urllib.parse.urlencode(data).encode()
@@ -316,42 +393,122 @@ class ScraperTool(BaseTool):
                 str(url),
                 data=data_bytes,
                 headers=headers,
-                method=method.value.upper()
+                method=method.value.upper(),
             )
             with urllib_request.urlopen(req) as resp:
-                content_length = resp.getheader(
-                if content_length and int(content_length) > self.
+                content_length = resp.getheader("Content-Length")
+                if content_length and int(content_length) > self.config.max_content_length:
                     raise HttpError(f"Response content too large: {content_length} bytes")
                 content = resp.read()
-                charset = resp.headers.get_content_charset() or
+                charset = resp.headers.get_content_charset() or "utf-8"
                 if content_type == ContentType.JSON:
-                    result = json.loads(content.decode(charset, errors=
+                    result = json.loads(content.decode(charset, errors="ignore"))
                 elif content_type == ContentType.HTML:
-                    result = {
+                    result = {
+                        "html": content.decode(charset, errors="ignore"),
+                        "url": resp.url,
+                        "status": resp.status,
+                    }
                 elif content_type == ContentType.BINARY:
-                    result = {
+                    result = {
+                        "content": content,
+                        "url": resp.url,
+                        "status": resp.status,
+                    }
                 else:
-                    result = content.decode(charset, errors=
+                    result = content.decode(charset, errors="ignore")
                 if output_format and output_path:
                     await self._save_output(result, output_path, output_format)
                     if isinstance(result, dict):
-                        result[
+                        result["saved_to"] = output_path
                     else:
-                        result = {
+                        result = {"content": result, "saved_to": output_path}
                 return result
         except urllib.error.URLError as e:
             raise HttpError(f"Request failed: {str(e)}")
 
     # Legacy method names for backward compatibility
-    async def get_requests(
+    async def get_requests(
+        self,
+        url: str,
+        method: HttpMethod = HttpMethod.GET,
+        params: Optional[Dict[str, str]] = None,
+        data: Optional[Dict[str, Any]] = None,
+        json_data: Optional[Dict[str, Any]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+        auth: Optional[Tuple[str, str]] = None,
+        verify_ssl: Optional[bool] = None,
+        allow_redirects: bool = True,
+        content_type: ContentType = ContentType.TEXT,
+        headers: Optional[Dict[str, str]] = None,
+        output_format: Optional[OutputFormat] = None,
+        output_path: Optional[str] = None,
+    ) -> Any:
         """Legacy method - now uses httpx in sync mode."""
-        return await self.get_httpx(
-
-
+        return await self.get_httpx(
+            url,
+            method,
+            params,
+            data,
+            json_data,
+            cookies,
+            auth,
+            verify_ssl,
+            allow_redirects,
+            content_type,
+            headers,
+            output_format,
+            output_path,
+            async_mode=False,
+        )
+
+    async def get_aiohttp(
+        self,
+        url: str,
+        method: HttpMethod = HttpMethod.GET,
+        params: Optional[Dict[str, str]] = None,
+        data: Optional[Dict[str, Any]] = None,
+        json_data: Optional[Dict[str, Any]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+        auth: Optional[Tuple[str, str]] = None,
+        verify_ssl: Optional[bool] = None,
+        allow_redirects: bool = True,
+        content_type: ContentType = ContentType.TEXT,
+        headers: Optional[Dict[str, str]] = None,
+        output_format: Optional[OutputFormat] = None,
+        output_path: Optional[str] = None,
+    ) -> Any:
         """Legacy method - now uses httpx in async mode."""
-        return await self.get_httpx(
-
-
+        return await self.get_httpx(
+            url,
+            method,
+            params,
+            data,
+            json_data,
+            cookies,
+            auth,
+            verify_ssl,
+            allow_redirects,
+            content_type,
+            headers,
+            output_format,
+            output_path,
+            async_mode=True,
+        )
+
+    async def render(
+        self,
+        url: str,
+        engine: RenderEngine = RenderEngine.PLAYWRIGHT,
+        wait_time: int = 5,
+        wait_selector: Optional[str] = None,
+        scroll_to_bottom: bool = False,
+        screenshot: bool = False,
+        screenshot_path: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None,
+        output_format: Optional[OutputFormat] = None,
+        output_path: Optional[str] = None,
+    ) -> Dict[str, Any]:
         """
         Render a web page using a headless browser (Playwright).
 
@@ -375,56 +532,85 @@ class ScraperTool(BaseTool):
         """
         try:
             if engine == RenderEngine.PLAYWRIGHT:
-                if not self.
+                if not self.config.playwright_available:
                     raise RenderingError("Playwright is not available. Install with 'pip install playwright'")
-                result = await self._render_with_playwright(
+                result = await self._render_with_playwright(
+                    url,
+                    wait_time,
+                    wait_selector,
+                    scroll_to_bottom,
+                    screenshot,
+                    screenshot_path,
+                )
             else:
                 raise RenderingError(f"Unsupported rendering engine: {engine}. Only PLAYWRIGHT is supported.")
             if output_format and output_path:
                 await self._save_output(result, output_path, output_format)
-                result[
+                result["saved_to"] = output_path
             return result
         except Exception as e:
             raise RenderingError(f"Failed to render page: {str(e)}")
 
-    async def _render_with_playwright(
+    async def _render_with_playwright(
+        self,
+        url: str,
+        wait_time: int,
+        wait_selector: Optional[str],
+        scroll_to_bottom: bool,
+        screenshot: bool,
+        screenshot_path: Optional[str],
+    ) -> Dict[str, Any]:
         """Render a web page using Playwright with async API."""
         from playwright.async_api import async_playwright
+
         async with async_playwright() as p:
             browser = await p.chromium.launch()
             page = await browser.new_page(
-                user_agent=self.
-                viewport={
+                user_agent=self.config.user_agent,
+                viewport={"width": 1280, "height": 800},
             )
             try:
                 await page.goto(url)
                 if wait_selector:
                     await page.wait_for_selector(wait_selector)
                 else:
-                    await page.wait_for_load_state(
+                    await page.wait_for_load_state("networkidle")
                 if scroll_to_bottom:
-                    await page.evaluate(
+                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                     await page.wait_for_timeout(1000)
                 screenshot_result = None
                 if screenshot:
-                    screenshot_path = screenshot_path or os.path.join(
-
+                    screenshot_path = screenshot_path or os.path.join(
+                        self.config.output_dir,
+                        f"screenshot_{int(time.time())}.png",
+                    )
+                    os.makedirs(
+                        os.path.dirname(os.path.abspath(screenshot_path)),
+                        exist_ok=True,
+                    )
                     await page.screenshot(path=screenshot_path)
                     screenshot_result = screenshot_path
                 html = await page.content()
                 title = await page.title()
                 result = {
-
-
-
-
+                    "html": html,
+                    "title": title,
+                    "url": page.url,
+                    "screenshot": screenshot_result,
                 }
                 return result
             finally:
                 await browser.close()
 
-
-
+    def crawl_scrapy(
+        self,
+        project_path: str,
+        spider_name: str,
+        output_path: str,
+        spider_args: Optional[Dict[str, str]] = None,
+        headers: Optional[Dict[str, str]] = None,
+        output_format: Optional[OutputFormat] = None,
+    ) -> Dict[str, Any]:
         """
         Execute a Scrapy spider in an existing project and output results to a file.
 
@@ -447,21 +633,25 @@ class ScraperTool(BaseTool):
             start_time = time.time()
             os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
             cmd = [
-                self.
-
-
-
-
+                self.config.scrapy_command,
+                "crawl",
+                spider_name,
+                "-o",
+                output_path,
+                "-s",
+                f"USER_AGENT={self.config.user_agent}",
+                "-s",
+                "LOG_LEVEL=INFO",
             ]
             if spider_args:
                 for k, v in spider_args.items():
-                    cmd += [
+                    cmd += ["-a", f"{k}={v}"]
             process = subprocess.run(
                 cmd,
                 cwd=project_path,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
             )
             if process.returncode != 0:
                 error_msg = process.stderr.strip()
@@ -470,19 +660,26 @@ class ScraperTool(BaseTool):
                 raise ExternalToolError(f"Scrapy crawl did not create output file: {output_path}")
             file_size = os.path.getsize(output_path)
             result = {
-
-
-
-
-
+                "output_path": output_path,
+                "execution_time": time.time() - start_time,
+                "file_size": file_size,
+                "stdout": process.stdout.strip(),
+                "stderr": process.stderr.strip(),
             }
             return result
         except subprocess.TimeoutExpired:
-            raise TimeoutError(
+            raise TimeoutError("Scrapy crawl timed out")
        except Exception as e:
             raise ExternalToolError(f"Error running Scrapy: {str(e)}")
 
-    def parse_html(
+    def parse_html(
+        self,
+        html: str,
+        selector: str,
+        selector_type: str = "css",
+        extract_attr: Optional[str] = None,
+        extract_text: bool = True,
+    ) -> Dict[str, Any]:
         """
         Parse HTML content using BeautifulSoup.
 
@@ -500,36 +697,37 @@ class ScraperTool(BaseTool):
             ParsingError: If parsing fails.
         """
         try:
-            soup = BeautifulSoup(html,
-            if selector_type ==
+            soup = BeautifulSoup(html, "html.parser")
+            if selector_type == "css":
                 elements = soup.select(selector)
             else:
                 from lxml.html import fromstring
                 from lxml.etree import XPath
+
                 root = fromstring(html)
                 xpath = XPath(selector)
                 elements = xpath(root)
             results = []
             for element in elements:
                 if extract_attr:
-                    value = element.get(extract_attr) if hasattr(element,
+                    value = element.get(extract_attr) if hasattr(element, "get") else element.get(extract_attr)
                     if value is not None:
                         results.append(value)
                 elif extract_text:
-                    if hasattr(element,
+                    if hasattr(element, "text_content") and callable(getattr(element, "text_content")):
                         # lxml element
-                        text = element.text_content()
+                        text = element.text_content()  # type: ignore[misc]
                     else:
                         # BeautifulSoup element
-                        text = element.get_text()
-
-                    if text and text.strip():
-                        results.append(text.strip())
+                        text = element.get_text()  # type: ignore[misc]
+
+                    if text and text.strip():  # type: ignore[misc]
+                        results.append(text.strip())  # type: ignore[misc]
             return {
-
-
-
-
+                "selector": selector,
+                "selector_type": selector_type,
+                "count": len(results),
+                "results": results,
            }
         except Exception as e:
             raise ParsingError(f"Error parsing HTML: {str(e)}")
@@ -542,7 +740,3 @@ class ScraperTool(BaseTool):
     head = get_httpx
     options = get_httpx
     patch = get_httpx
-
-    # Legacy method aliases
-    get_requests = get_httpx
-    get_aiohttp = get_httpx