aiecs-1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
aiecs/tools/task_tools/scraper_tool.py
@@ -0,0 +1,715 @@
+ import os
+ import json
+ import time
+ import logging
+ import tempfile
+ import subprocess
+ from functools import partialmethod
+ from typing import Dict, Any, List, Optional, Tuple
+ from enum import Enum
+
+ import httpx
+ from bs4 import BeautifulSoup
+ from urllib import request as urllib_request
+ from pydantic import BaseModel, ConfigDict, Field
+
+ from aiecs.tools.base_tool import BaseTool
+ from aiecs.tools import register_tool
+
+ # Enums for configuration options
+
+
+ class HttpMethod(str, Enum):
+     GET = "get"
+     POST = "post"
+     PUT = "put"
+     DELETE = "delete"
+     HEAD = "head"
+     OPTIONS = "options"
+     PATCH = "patch"
+
+
+ class ContentType(str, Enum):
+     HTML = "html"
+     JSON = "json"
+     TEXT = "text"
+     BINARY = "binary"
+
+
+ class OutputFormat(str, Enum):
+     TEXT = "text"
+     JSON = "json"
+     HTML = "html"
+     MARKDOWN = "markdown"
+     CSV = "csv"
+
+
+ class RenderEngine(str, Enum):
+     NONE = "none"
+     PLAYWRIGHT = "playwright"
+
+
+ # Exceptions
+ class ScraperToolError(Exception):
+     """Base exception for ScraperTool errors."""
+
+
+ class HttpError(ScraperToolError):
+     """Raised when HTTP requests fail."""
+
+
+ class TimeoutError(ScraperToolError):
+     """Raised when operations time out."""
+
+
+ class RateLimitError(ScraperToolError):
+     """Raised when rate limits are exceeded."""
+
+
+ class ParsingError(ScraperToolError):
+     """Raised when HTML parsing fails."""
+
+
+ class RenderingError(ScraperToolError):
+     """Raised when rendering fails."""
+
+
+ class ExternalToolError(ScraperToolError):
+     """Raised when external tools fail."""
+
+
+ class FileOperationError(ScraperToolError):
+     """Raised when file operations fail."""
+
+
+ @register_tool("scraper")
+ class ScraperTool(BaseTool):
+     """
+     Enhanced web scraping tool with multiple HTTP clients, JavaScript rendering,
+     HTML parsing, and security features.
+
+     Features:
+     - Multiple HTTP clients: httpx, urllib
+     - JavaScript rendering with Playwright
+     - HTML parsing with BeautifulSoup
+     - Scrapy integration for advanced crawling
+     - Output in various formats: text, JSON, HTML, Markdown, CSV
+     """
+
+     # Configuration schema
+     class Config(BaseModel):
+         """Configuration for the scraper tool"""
+
+         model_config = ConfigDict(env_prefix="SCRAPER_TOOL_")
+
+         user_agent: str = Field(
+             default="PythonMiddlewareScraper/2.0",
+             description="User agent for HTTP requests",
+         )
+         max_content_length: int = Field(
+             default=10 * 1024 * 1024,
+             description="Maximum content length in bytes",
+         )
+         output_dir: str = Field(
+             default=os.path.join(tempfile.gettempdir(), "scraper_outputs"),
+             description="Directory for output files",
+         )
+         scrapy_command: str = Field(default="scrapy", description="Command to run Scrapy")
+         allowed_domains: List[str] = Field(default=[], description="Allowed domains for scraping")
+         blocked_domains: List[str] = Field(default=[], description="Blocked domains for scraping")
+         playwright_available: bool = Field(
+             default=False,
+             description="Whether Playwright is available (auto-detected)",
+         )
+
+     def __init__(self, config: Optional[Dict] = None):
+         """
+         Initialize ScraperTool with settings and resources.
+
+         Args:
+             config (Dict, optional): Configuration overrides for ScraperTool.
+
+         Raises:
+             ValueError: If config contains invalid settings.
+         """
+         super().__init__(config)
+
+         # Parse configuration
+         self.config = self.Config(**(config or {}))
+
+         self.logger = logging.getLogger(__name__)
+         if not self.logger.handlers:
+             handler = logging.StreamHandler()
+             handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+             self.logger.addHandler(handler)
+         self.logger.setLevel(logging.INFO)
+         os.makedirs(self.config.output_dir, exist_ok=True)
+         self._check_external_tools()
+
+     def _check_external_tools(self):
+         """Check if external tools are available."""
+         try:
+             # Attempt the import so the availability flag reflects reality;
+             # without it the ImportError branch below could never trigger.
+             import playwright  # noqa: F401
+
+             self.config.playwright_available = True
+         except ImportError:
+             self.config.playwright_available = False
+
+     async def _save_output(self, content: Any, path: str, format: OutputFormat) -> None:
+         """Save content to file in the specified format."""
+         try:
+             os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+             if format == OutputFormat.TEXT:
+                 with open(path, "w", encoding="utf-8") as f:
+                     if isinstance(content, dict):
+                         f.write(json.dumps(content, indent=2))
+                     else:
+                         f.write(str(content))
+             elif format == OutputFormat.JSON:
+                 with open(path, "w", encoding="utf-8") as f:
+                     if isinstance(content, dict):
+                         json.dump(content, f, indent=2)
+                     else:
+                         json.dump({"content": content}, f, indent=2)
+             elif format == OutputFormat.HTML:
+                 with open(path, "w", encoding="utf-8") as f:
+                     if isinstance(content, dict) and "html" in content:
+                         f.write(content["html"])
+                     else:
+                         f.write(str(content))
+             elif format == OutputFormat.MARKDOWN:
+                 with open(path, "w", encoding="utf-8") as f:
+                     if isinstance(content, dict):
+                         f.write("# Scraper Results\n\n")
+                         for key, value in content.items():
+                             f.write(f"## {key}\n\n")
+                             f.write(f"{value}\n\n")
+                     else:
+                         f.write("# Scraper Results\n\n")
+                         f.write(str(content))
+             elif format == OutputFormat.CSV:
+                 import csv
+
+                 with open(path, "w", newline="", encoding="utf-8") as f:
+                     if isinstance(content, dict):
+                         writer = csv.writer(f)
+                         writer.writerow(content.keys())
+                         writer.writerow(content.values())
+                     elif isinstance(content, list) and all(
+                         isinstance(item, dict) for item in content
+                     ):
+                         if content:
+                             writer = csv.DictWriter(f, fieldnames=content[0].keys())
+                             writer.writeheader()
+                             writer.writerows(content)
+                     else:
+                         writer = csv.writer(f)
+                         writer.writerow(["content"])
+                         writer.writerow([str(content)])
+         except Exception as e:
+             raise FileOperationError(f"Error saving output: {str(e)}")
+
+     async def get_httpx(
+         self,
+         url: str,
+         method: HttpMethod = HttpMethod.GET,
+         params: Optional[Dict[str, str]] = None,
+         data: Optional[Dict[str, Any]] = None,
+         json_data: Optional[Dict[str, Any]] = None,
+         cookies: Optional[Dict[str, str]] = None,
+         auth: Optional[Tuple[str, str]] = None,
+         verify_ssl: Optional[bool] = None,
+         allow_redirects: bool = True,
+         content_type: ContentType = ContentType.TEXT,
+         headers: Optional[Dict[str, str]] = None,
+         output_format: Optional[OutputFormat] = None,
+         output_path: Optional[str] = None,
+         async_mode: bool = True,
+     ) -> Any:
+         """
+         Execute HTTP request using httpx library (supports both sync and async).
+
+         Args:
+             url (str): URL to scrape.
+             method (HttpMethod): HTTP method to use.
+             params (Optional[Dict[str, str]]): Query parameters.
+             data (Optional[Dict[str, Any]]): Form data.
+             json_data (Optional[Dict[str, Any]]): JSON data.
+             cookies (Optional[Dict[str, str]]): Cookies.
+             auth (Optional[Tuple[str, str]]): Authentication credentials.
+             verify_ssl (Optional[bool]): Verify SSL certificates.
+             allow_redirects (bool): Allow redirects.
+             content_type (ContentType): Expected content type.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+             output_path (Optional[str]): Path to save output.
+             async_mode (bool): Whether to use async client.
+
+         Returns:
+             Any: Scraped content (dict, str, or bytes).
+
+         Raises:
+             HttpError: If the request fails.
+         """
+         try:
+             headers = headers or {}
+             if "User-Agent" not in headers:
+                 headers["User-Agent"] = self.config.user_agent
+             kwargs = {
+                 "params": params,
+                 "headers": headers,
+                 "follow_redirects": allow_redirects,
+             }
+             if auth:
+                 kwargs["auth"] = auth
+             if cookies:
+                 kwargs["cookies"] = cookies
+             if json_data:
+                 kwargs["json"] = json_data
+             elif data:
+                 kwargs["data"] = data
+
+             if async_mode:
+                 async with httpx.AsyncClient(
+                     verify=verify_ssl if verify_ssl is not None else True
+                 ) as client:
+                     method_fn = getattr(client, method.value)
+                     resp = await method_fn(str(url), **kwargs)
+             else:
+                 with httpx.Client(verify=verify_ssl if verify_ssl is not None else True) as client:
+                     method_fn = getattr(client, method.value)
+                     resp = method_fn(str(url), **kwargs)
+
+             try:
+                 resp.raise_for_status()
+             except httpx.HTTPStatusError as e:
+                 raise HttpError(
+                     f"HTTP {e.response.status_code}: {e.response.reason_phrase} for {url}"
+                 )
+
+             if len(resp.content) > self.config.max_content_length:
+                 raise HttpError(f"Response content too large: {len(resp.content)} bytes")
+
+             if content_type == ContentType.JSON:
+                 result = resp.json()
+             elif content_type == ContentType.HTML:
+                 result = {
+                     "html": resp.text,
+                     "url": str(resp.url),
+                     "status": resp.status_code,
+                 }
+             elif content_type == ContentType.BINARY:
+                 result = {
+                     "content": resp.content,
+                     "url": str(resp.url),
+                     "status": resp.status_code,
+                 }
+             else:
+                 result = resp.text
+
+             if output_format and output_path:
+                 await self._save_output(result, output_path, output_format)
+                 if isinstance(result, dict):
+                     result["saved_to"] = output_path
+                 else:
+                     result = {"content": result, "saved_to": output_path}
+             return result
+         except httpx.RequestError as e:
+             raise HttpError(f"Request failed: {str(e)}")
+
+     async def get_urllib(
+         self,
+         url: str,
+         method: HttpMethod = HttpMethod.GET,
+         data: Optional[Dict[str, Any]] = None,
+         content_type: ContentType = ContentType.TEXT,
+         headers: Optional[Dict[str, str]] = None,
+         output_format: Optional[OutputFormat] = None,
+         output_path: Optional[str] = None,
+     ) -> Any:
+         """
+         Execute HTTP request using urllib.
+
+         Args:
+             url (str): URL to scrape.
+             method (HttpMethod): HTTP method to use.
+             data (Optional[Dict[str, Any]]): Form data.
+             content_type (ContentType): Expected content type.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+             output_path (Optional[str]): Path to save output.
+
+         Returns:
+             Any: Scraped content (dict, str, or bytes).
+
+         Raises:
+             HttpError: If the request fails.
+         """
+         try:
+             import urllib.parse
+             import urllib.error
+
+             headers = headers or {}
+             if "User-Agent" not in headers:
+                 headers["User-Agent"] = self.config.user_agent
+             data_bytes = None
+             if data:
+                 data_bytes = urllib.parse.urlencode(data).encode()
+             req = urllib_request.Request(
+                 str(url),
+                 data=data_bytes,
+                 headers=headers,
+                 method=method.value.upper(),
+             )
+             with urllib_request.urlopen(req) as resp:
+                 content_length = resp.getheader("Content-Length")
+                 if content_length and int(content_length) > self.config.max_content_length:
+                     raise HttpError(f"Response content too large: {content_length} bytes")
+                 content = resp.read()
+                 charset = resp.headers.get_content_charset() or "utf-8"
+                 if content_type == ContentType.JSON:
+                     result = json.loads(content.decode(charset, errors="ignore"))
+                 elif content_type == ContentType.HTML:
+                     result = {
+                         "html": content.decode(charset, errors="ignore"),
+                         "url": resp.url,
+                         "status": resp.status,
+                     }
+                 elif content_type == ContentType.BINARY:
+                     result = {
+                         "content": content,
+                         "url": resp.url,
+                         "status": resp.status,
+                     }
+                 else:
+                     result = content.decode(charset, errors="ignore")
+             if output_format and output_path:
+                 await self._save_output(result, output_path, output_format)
+                 if isinstance(result, dict):
+                     result["saved_to"] = output_path
+                 else:
+                     result = {"content": result, "saved_to": output_path}
+             return result
+         except urllib.error.URLError as e:
+             raise HttpError(f"Request failed: {str(e)}")
+
+     # Legacy method names for backward compatibility
+     async def get_requests(
+         self,
+         url: str,
+         method: HttpMethod = HttpMethod.GET,
+         params: Optional[Dict[str, str]] = None,
+         data: Optional[Dict[str, Any]] = None,
+         json_data: Optional[Dict[str, Any]] = None,
+         cookies: Optional[Dict[str, str]] = None,
+         auth: Optional[Tuple[str, str]] = None,
+         verify_ssl: Optional[bool] = None,
+         allow_redirects: bool = True,
+         content_type: ContentType = ContentType.TEXT,
+         headers: Optional[Dict[str, str]] = None,
+         output_format: Optional[OutputFormat] = None,
+         output_path: Optional[str] = None,
+     ) -> Any:
+         """Legacy method - now uses httpx in sync mode."""
+         return await self.get_httpx(
+             url,
+             method,
+             params,
+             data,
+             json_data,
+             cookies,
+             auth,
+             verify_ssl,
+             allow_redirects,
+             content_type,
+             headers,
+             output_format,
+             output_path,
+             async_mode=False,
+         )
+
+     async def get_aiohttp(
+         self,
+         url: str,
+         method: HttpMethod = HttpMethod.GET,
+         params: Optional[Dict[str, str]] = None,
+         data: Optional[Dict[str, Any]] = None,
+         json_data: Optional[Dict[str, Any]] = None,
+         cookies: Optional[Dict[str, str]] = None,
+         auth: Optional[Tuple[str, str]] = None,
+         verify_ssl: Optional[bool] = None,
+         allow_redirects: bool = True,
+         content_type: ContentType = ContentType.TEXT,
+         headers: Optional[Dict[str, str]] = None,
+         output_format: Optional[OutputFormat] = None,
+         output_path: Optional[str] = None,
+     ) -> Any:
+         """Legacy method - now uses httpx in async mode."""
+         return await self.get_httpx(
+             url,
+             method,
+             params,
+             data,
+             json_data,
+             cookies,
+             auth,
+             verify_ssl,
+             allow_redirects,
+             content_type,
+             headers,
+             output_format,
+             output_path,
+             async_mode=True,
+         )
+
+     async def render(
+         self,
+         url: str,
+         engine: RenderEngine = RenderEngine.PLAYWRIGHT,
+         wait_time: int = 5,
+         wait_selector: Optional[str] = None,
+         scroll_to_bottom: bool = False,
+         screenshot: bool = False,
+         screenshot_path: Optional[str] = None,
+         headers: Optional[Dict[str, str]] = None,
+         output_format: Optional[OutputFormat] = None,
+         output_path: Optional[str] = None,
+     ) -> Dict[str, Any]:
+         """
+         Render a web page using a headless browser (Playwright).
+
+         Args:
+             url (str): URL to render.
+             engine (RenderEngine): Rendering engine to use (only PLAYWRIGHT is supported).
+             wait_time (int): Maximum time in seconds to wait for JS execution.
+             wait_selector (Optional[str]): CSS selector to wait for.
+             scroll_to_bottom (bool): Whether to scroll to the bottom of the page.
+             screenshot (bool): Whether to take a screenshot.
+             screenshot_path (Optional[str]): Path to save the screenshot.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+             output_path (Optional[str]): Path to save output.
+
+         Returns:
+             Dict[str, Any]: Rendered page content {'html': str, 'title': str, 'url': str, 'screenshot': Optional[str]}.
+
+         Raises:
+             RenderingError: If rendering fails.
+         """
+         try:
+             if engine == RenderEngine.PLAYWRIGHT:
+                 if not self.config.playwright_available:
+                     raise RenderingError(
+                         "Playwright is not available. Install with 'pip install playwright'"
+                     )
+                 result = await self._render_with_playwright(
+                     url,
+                     wait_time,
+                     wait_selector,
+                     scroll_to_bottom,
+                     screenshot,
+                     screenshot_path,
+                 )
+             else:
+                 raise RenderingError(
+                     f"Unsupported rendering engine: {engine}. Only PLAYWRIGHT is supported."
+                 )
+             if output_format and output_path:
+                 await self._save_output(result, output_path, output_format)
+                 result["saved_to"] = output_path
+             return result
+         except RenderingError:
+             # Re-raise as-is to avoid double-wrapping the message below.
+             raise
+         except Exception as e:
+             raise RenderingError(f"Failed to render page: {str(e)}")
+
+     async def _render_with_playwright(
+         self,
+         url: str,
+         wait_time: int,
+         wait_selector: Optional[str],
+         scroll_to_bottom: bool,
+         screenshot: bool,
+         screenshot_path: Optional[str],
+     ) -> Dict[str, Any]:
+         """Render a web page using Playwright with async API."""
+         from playwright.async_api import async_playwright
+
+         async with async_playwright() as p:
+             browser = await p.chromium.launch()
+             page = await browser.new_page(
+                 user_agent=self.config.user_agent,
+                 viewport={"width": 1280, "height": 800},
+             )
+             try:
+                 await page.goto(url)
+                 if wait_selector:
+                     # wait_time was previously unused; here it bounds the selector wait.
+                     await page.wait_for_selector(wait_selector, timeout=wait_time * 1000)
+                 else:
+                     await page.wait_for_load_state("networkidle")
+                 if scroll_to_bottom:
+                     await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                     await page.wait_for_timeout(1000)
+                 screenshot_result = None
+                 if screenshot:
+                     screenshot_path = screenshot_path or os.path.join(
+                         self.config.output_dir,
+                         f"screenshot_{int(time.time())}.png",
+                     )
+                     os.makedirs(
+                         os.path.dirname(os.path.abspath(screenshot_path)),
+                         exist_ok=True,
+                     )
+                     await page.screenshot(path=screenshot_path)
+                     screenshot_result = screenshot_path
+                 html = await page.content()
+                 title = await page.title()
+                 result = {
+                     "html": html,
+                     "title": title,
+                     "url": page.url,
+                     "screenshot": screenshot_result,
+                 }
+                 return result
+             finally:
+                 await browser.close()
+
+     def crawl_scrapy(
+         self,
+         project_path: str,
+         spider_name: str,
+         output_path: str,
+         spider_args: Optional[Dict[str, str]] = None,
+         headers: Optional[Dict[str, str]] = None,
+         output_format: Optional[OutputFormat] = None,
+         timeout: Optional[float] = None,
+     ) -> Dict[str, Any]:
+         """
+         Execute a Scrapy spider in an existing project and write its results to a file.
+
+         Args:
+             project_path (str): Path to the Scrapy project.
+             spider_name (str): Name of the spider to run.
+             output_path (str): Path to save the output.
+             spider_args (Optional[Dict[str, str]]): Arguments to pass to the spider.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+             timeout (Optional[float]): Seconds to wait before aborting the crawl.
+
+         Returns:
+             Dict[str, Any]: Crawl results {'output_path': str, 'execution_time': float, 'file_size': int, 'stdout': str, 'stderr': str}.
+
+         Raises:
+             ExternalToolError: If Scrapy fails.
+             TimeoutError: If the operation times out.
+         """
+         try:
+             start_time = time.time()
+             os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+             cmd = [
+                 self.config.scrapy_command,
+                 "crawl",
+                 spider_name,
+                 "-o",
+                 output_path,
+                 "-s",
+                 f"USER_AGENT={self.config.user_agent}",
+                 "-s",
+                 "LOG_LEVEL=INFO",
+             ]
+             if spider_args:
+                 for k, v in spider_args.items():
+                     cmd += ["-a", f"{k}={v}"]
+             process = subprocess.run(
+                 cmd,
+                 cwd=project_path,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 timeout=timeout,  # without a timeout, the TimeoutExpired handler below was unreachable
+             )
+             if process.returncode != 0:
+                 error_msg = process.stderr.strip()
+                 raise ExternalToolError(f"Scrapy crawl failed: {error_msg}")
+             if not os.path.exists(output_path):
+                 raise ExternalToolError(f"Scrapy crawl did not create output file: {output_path}")
+             file_size = os.path.getsize(output_path)
+             result = {
+                 "output_path": output_path,
+                 "execution_time": time.time() - start_time,
+                 "file_size": file_size,
+                 "stdout": process.stdout.strip(),
+                 "stderr": process.stderr.strip(),
+             }
+             return result
+         except subprocess.TimeoutExpired:
+             raise TimeoutError("Scrapy crawl timed out")
+         except ExternalToolError:
+             # Re-raise as-is to avoid double-wrapping the message below.
+             raise
+         except Exception as e:
+             raise ExternalToolError(f"Error running Scrapy: {str(e)}")
+
+     def parse_html(
+         self,
+         html: str,
+         selector: str,
+         selector_type: str = "css",
+         extract_attr: Optional[str] = None,
+         extract_text: bool = True,
+     ) -> Dict[str, Any]:
+         """
+         Parse HTML content using BeautifulSoup.
+
+         Args:
+             html (str): HTML content to parse.
+             selector (str): CSS or XPath selector.
+             selector_type (str): Selector type ('css' or 'xpath').
+             extract_attr (Optional[str]): Attribute to extract.
+             extract_text (bool): Whether to extract text content.
+
+         Returns:
+             Dict[str, Any]: Parsed results {'selector': str, 'selector_type': str, 'count': int, 'results': List[str]}.
+
+         Raises:
+             ParsingError: If parsing fails.
+         """
+         try:
+             if selector_type == "css":
+                 soup = BeautifulSoup(html, "html.parser")
+                 elements = soup.select(selector)
+             else:
+                 from lxml.html import fromstring
+                 from lxml.etree import XPath
+
+                 root = fromstring(html)
+                 xpath = XPath(selector)
+                 elements = xpath(root)
+             results = []
+             for element in elements:
+                 if extract_attr:
+                     # Both lxml elements and BeautifulSoup tags expose .get()
+                     value = element.get(extract_attr)
+                     if value is not None:
+                         results.append(value)
+                 elif extract_text:
+                     if isinstance(element, str):
+                         # XPath expressions such as text() return plain strings
+                         text = element
+                     elif hasattr(element, "text_content"):
+                         # lxml element
+                         text = element.text_content()
+                     else:
+                         # BeautifulSoup element
+                         text = element.get_text()
+
+                     if text and text.strip():
+                         results.append(text.strip())
+             return {
+                 "selector": selector,
+                 "selector_type": selector_type,
+                 "count": len(results),
+                 "results": results,
+             }
+         except Exception as e:
+             raise ParsingError(f"Error parsing HTML: {str(e)}")
+
+     # HTTP method shortcuts, each bound to its corresponding verb
+     # (plain aliases would have sent GET regardless of the shortcut used)
+     get = partialmethod(get_httpx, method=HttpMethod.GET)
+     post = partialmethod(get_httpx, method=HttpMethod.POST)
+     put = partialmethod(get_httpx, method=HttpMethod.PUT)
+     delete = partialmethod(get_httpx, method=HttpMethod.DELETE)
+     head = partialmethod(get_httpx, method=HttpMethod.HEAD)
+     options = partialmethod(get_httpx, method=HttpMethod.OPTIONS)
+     patch = partialmethod(get_httpx, method=HttpMethod.PATCH)
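
A minimal usage sketch for the HTTP-fetch and HTML-parsing path shown above (editorial illustration, not part of the package; the URL and selector are placeholders, and it assumes ScraperTool can be constructed with no config, per its __init__ default):

import asyncio

from aiecs.tools.task_tools.scraper_tool import ContentType, ScraperTool

async def main():
    tool = ScraperTool()
    # Fetch a page as HTML, then pull out <h1> text with a CSS selector.
    page = await tool.get_httpx("https://example.com", content_type=ContentType.HTML)
    parsed = tool.parse_html(page["html"], selector="h1", selector_type="css")
    print(parsed["count"], parsed["results"])

asyncio.run(main())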
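
A usage sketch for the headless-rendering path (editorial illustration; assumes Playwright and its Chromium build are installed, and the URL is a placeholder):

import asyncio

from aiecs.tools.task_tools.scraper_tool import ScraperTool

async def main():
    tool = ScraperTool()
    # Renders JS-driven pages in headless Chromium; raises RenderingError
    # if Playwright is not importable.
    result = await tool.render(
        "https://example.com",
        wait_selector="body",
        screenshot=True,  # screenshot path defaults into config.output_dir
    )
    print(result["title"], result["screenshot"])

asyncio.run(main())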
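
A usage sketch for the Scrapy integration, which shells out to an existing project (editorial illustration; the project path and spider name are hypothetical):

from aiecs.tools.task_tools.scraper_tool import ScraperTool

tool = ScraperTool()
# Runs `scrapy crawl quotes -o /tmp/quotes.json ...` inside the project directory.
result = tool.crawl_scrapy(
    project_path="/path/to/scrapy_project",  # hypothetical project location
    spider_name="quotes",                    # hypothetical spider name
    output_path="/tmp/quotes.json",
)
print(result["execution_time"], result["file_size"])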