ai-parrot 0.17.2 (ai_parrot-0.17.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
@@ -0,0 +1,1173 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ScrapingAgent for AI-Parrot
|
|
3
|
+
LLM-powered agent that makes intelligent decisions about web scraping
|
|
4
|
+
Updated to better integrate with current WebScrapingTool architecture
|
|
5
|
+
"""
|
|
6
|
+
from typing import Dict, List, Any, Optional, Literal
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
import logging
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
|
+
from bs4 import BeautifulSoup
|
|
13
|
+
from ..abstract import AbstractBot
|
|
14
|
+
from ...tools.scraping import (
|
|
15
|
+
WebScrapingTool,
|
|
16
|
+
ScrapingStep,
|
|
17
|
+
ScrapingSelector,
|
|
18
|
+
ScrapingResult
|
|
19
|
+
)
|
|
20
|
+
from .templates import (
|
|
21
|
+
BESTBUY_TEMPLATE,
|
|
22
|
+
AMAZON_TEMPLATE,
|
|
23
|
+
EBAY_TEMPLATE
|
|
24
|
+
)
|
|
25
|
+
from .models import (
|
|
26
|
+
ScrapingPlanSchema
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ScrapingAgent(AbstractBot):
|
|
31
|
+
"""
|
|
32
|
+
Intelligent web scraping agent that uses LLM to:
|
|
33
|
+
- Analyze web pages and determine optimal scraping strategies
|
|
34
|
+
- Generate navigation steps based on page structure
|
|
35
|
+
- Adapt selectors based on content analysis
|
|
36
|
+
- Handle dynamic content and authentication flows
|
|
37
|
+
- Recommend optimal browser configurations
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
def __init__(
|
|
41
|
+
self,
|
|
42
|
+
name: str = "WebScrapingAgent",
|
|
43
|
+
browser: Literal['chrome', 'firefox', 'edge', 'safari', 'undetected'] = 'chrome',
|
|
44
|
+
driver_type: Literal['selenium', 'playwright'] = 'selenium',
|
|
45
|
+
headless: bool = True,
|
|
46
|
+
mobile: bool = False,
|
|
47
|
+
mobile_device: Optional[str] = None,
|
|
48
|
+
auto_install: bool = True,
|
|
49
|
+
**kwargs
|
|
50
|
+
):
|
|
51
|
+
# Enhanced system prompt for web scraping
|
|
52
|
+
system_prompt = self._build_scraping_system_prompt()
|
|
53
|
+
|
|
54
|
+
super().__init__(
|
|
55
|
+
name=name,
|
|
56
|
+
system_prompt=system_prompt,
|
|
57
|
+
**kwargs
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# Store browser configuration for dynamic adjustments
|
|
61
|
+
self.browser_config = {
|
|
62
|
+
'browser': browser,
|
|
63
|
+
'driver_type': driver_type,
|
|
64
|
+
'headless': headless,
|
|
65
|
+
'mobile': mobile,
|
|
66
|
+
'mobile_device': mobile_device,
|
|
67
|
+
'auto_install': auto_install,
|
|
68
|
+
**kwargs
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
# Initialize scraping tool with configuration
|
|
72
|
+
self.scraping_tool = WebScrapingTool(**self.browser_config)
|
|
73
|
+
self.tool_manager.register_tool(self.scraping_tool)
|
|
74
|
+
self.logger = logging.getLogger(f"AI-Parrot.ScrapingAgent")
|
|
75
|
+
|
|
76
|
+
# Scraping context and memory
|
|
77
|
+
self.scraping_history: List[Dict[str, Any]] = []
|
|
78
|
+
self.site_knowledge: Dict[str, Dict[str, Any]] = {}
|
|
79
|
+
|
|
80
|
+
# Site-specific templates and guidance
|
|
81
|
+
self.scraping_templates = self._initialize_templates()
|
|
82
|
+
|
|
83
|
+
# Browser capability knowledge
|
|
84
|
+
self.browser_capabilities = {
|
|
85
|
+
'chrome': {
|
|
86
|
+
'mobile_emulation': True,
|
|
87
|
+
'undetected_mode': True,
|
|
88
|
+
'performance_options': True,
|
|
89
|
+
'best_for': ['SPA', 'heavy_js', 'mobile_testing']
|
|
90
|
+
},
|
|
91
|
+
'firefox': {
|
|
92
|
+
'mobile_emulation': False,
|
|
93
|
+
'undetected_mode': False,
|
|
94
|
+
'performance_options': True,
|
|
95
|
+
'best_for': ['privacy', 'legacy_sites', 'debugging']
|
|
96
|
+
},
|
|
97
|
+
'edge': {
|
|
98
|
+
'mobile_emulation': True,
|
|
99
|
+
'undetected_mode': False,
|
|
100
|
+
'performance_options': True,
|
|
101
|
+
'best_for': ['enterprise', 'windows_specific']
|
|
102
|
+
},
|
|
103
|
+
'safari': {
|
|
104
|
+
'mobile_emulation': False,
|
|
105
|
+
'undetected_mode': False,
|
|
106
|
+
'performance_options': False,
|
|
107
|
+
'best_for': ['apple_ecosystem', 'webkit_testing']
|
|
108
|
+
},
|
|
109
|
+
'undetected': {
|
|
110
|
+
'mobile_emulation': True,
|
|
111
|
+
'undetected_mode': True,
|
|
112
|
+
'performance_options': True,
|
|
113
|
+
'best_for': ['anti_bot', 'stealth_scraping', 'protected_sites']
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
def _initialize_templates(self) -> Dict[str, Dict[str, Any]]:
|
|
118
|
+
"""Initialize site-specific scraping templates and guidance"""
|
|
119
|
+
return {
|
|
120
|
+
'bestbuy.com': BESTBUY_TEMPLATE,
|
|
121
|
+
'amazon.com': AMAZON_TEMPLATE,
|
|
122
|
+
'ebay.com': EBAY_TEMPLATE,
|
|
123
|
+
'generic_ecommerce': {
|
|
124
|
+
'search_steps': [
|
|
125
|
+
{
|
|
126
|
+
'action': 'navigate',
|
|
127
|
+
'target': '{url}',
|
|
128
|
+
'description': 'Navigate to target site'
|
|
129
|
+
},
|
|
130
|
+
{
|
|
131
|
+
'action': 'fill',
|
|
132
|
+
'target': 'input[type="search"], input[name*="search"], input[placeholder*="search"]',
|
|
133
|
+
'value': '{search_term}',
|
|
134
|
+
'description': 'Fill most common search input patterns'
|
|
135
|
+
},
|
|
136
|
+
{
|
|
137
|
+
'action': 'click',
|
|
138
|
+
'target': 'button[type="submit"], input[type="submit"], .search-button',
|
|
139
|
+
'description': 'Click search button'
|
|
140
|
+
}
|
|
141
|
+
],
|
|
142
|
+
'product_selectors': [
|
|
143
|
+
{
|
|
144
|
+
'name': 'products',
|
|
145
|
+
'selector': '.product, .item, .listing',
|
|
146
|
+
'extract_type': 'html',
|
|
147
|
+
'multiple': True
|
|
148
|
+
}
|
|
149
|
+
],
|
|
150
|
+
'guidance': 'Generic e-commerce patterns. May need site-specific adjustments.'
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
def _build_scraping_system_prompt(self) -> str:
|
|
155
|
+
"""Build specialized system prompt for web scraping tasks"""
|
|
156
|
+
return """You are an expert web scraping agent with advanced capabilities in:
|
|
157
|
+
|
|
158
|
+
1. **Web Page Analysis**: Analyzing HTML structure, identifying key elements, and understanding page layouts
|
|
159
|
+
2. **Navigation Strategy**: Creating step-by-step navigation plans for complex user journeys
|
|
160
|
+
3. **Content Extraction**: Determining optimal selectors for extracting specific data
|
|
161
|
+
4. **Error Handling**: Adapting to dynamic content, handling timeouts, and recovering from failures
|
|
162
|
+
5. **Authentication**: Managing login flows, sessions, and security measures
|
|
163
|
+
6. **Browser Optimization**: Recommending optimal browser configurations based on target sites
|
|
164
|
+
|
|
165
|
+
**Available Browser Options:**
|
|
166
|
+
- chrome: Default, best performance, mobile emulation, wide compatibility
|
|
167
|
+
- firefox: Good privacy, stable, good for debugging
|
|
168
|
+
- edge: Enterprise-friendly, good performance
|
|
169
|
+
- safari: Apple ecosystem, webkit testing
|
|
170
|
+
- undetected: Anti-detection features, stealth scraping
|
|
171
|
+
|
|
172
|
+
**Core Responsibilities:**
|
|
173
|
+
- Analyze user scraping requirements and website structure
|
|
174
|
+
- Generate detailed navigation steps (ScrapingStep objects)
|
|
175
|
+
- Create precise content selectors (ScrapingSelector objects)
|
|
176
|
+
- Recommend optimal browser configuration for target sites
|
|
177
|
+
- Adapt strategies based on scraping results and feedback
|
|
178
|
+
- Provide insights about scraped content and suggest improvements
|
|
179
|
+
|
|
180
|
+
**Available Actions:**
|
|
181
|
+
- navigate: Go to a specific URL
|
|
182
|
+
- click: Click on elements (buttons, links, etc.)
|
|
183
|
+
- fill: Fill form fields with data
|
|
184
|
+
- wait: Wait for specific conditions or elements
|
|
185
|
+
- scroll: Scroll to load dynamic content
|
|
186
|
+
- authenticate: Handle login/authentication flows
|
|
187
|
+
- await_human: Pause automation; a human completes login/SSO/MFA in the browser. Resume when a selector/URL/title condition is met.
|
|
188
|
+
- await_keypress: Pause until the operator presses ENTER in the console.
|
|
189
|
+
- await_browser_event: Wait for a real page event (keyboard/overlay button/custom event/localStorage/predicate)
|
|
190
|
+
|
|
191
|
+
**Selector Types:**
|
|
192
|
+
- CSS selectors: Standard CSS syntax (.class, #id, element[attribute])
|
|
193
|
+
- XPath: For complex element selection
|
|
194
|
+
- Tag-based: Direct HTML tag selection
|
|
195
|
+
|
|
196
|
+
**Browser Configuration Recommendations:**
|
|
197
|
+
- Use 'undetected' browser for sites with anti-bot protection
|
|
198
|
+
- Use 'chrome' with mobile=True for mobile-responsive testing
|
|
199
|
+
- Use 'firefox' for sites that work better with Gecko engine
|
|
200
|
+
- Enable headless=False for debugging complex interactions
|
|
201
|
+
- Use custom user agents and mobile devices for specific testing
|
|
202
|
+
|
|
203
|
+
**Best Practices:**
|
|
204
|
+
- Always provide detailed descriptions for each step
|
|
205
|
+
- Use specific, robust selectors that are less likely to break
|
|
206
|
+
- Include appropriate wait conditions for dynamic content
|
|
207
|
+
- Plan authentication flows carefully with proper error handling
|
|
208
|
+
- Consider mobile responsiveness and different viewport sizes
|
|
209
|
+
- Recommend browser configuration based on site characteristics
|
|
210
|
+
|
|
211
|
+
When given a scraping task, analyze the requirements thoroughly and create a comprehensive plan that maximizes success while being respectful of website resources and terms of service.
|
|
212
|
+
"""
|
|
213
|
+
|
|
214
|
+
async def analyze_scraping_request(
|
|
215
|
+
self,
|
|
216
|
+
request: Dict[str, Any]
|
|
217
|
+
) -> Dict[str, Any]:
|
|
218
|
+
"""
|
|
219
|
+
Analyze a scraping request and generate an execution plan with browser recommendations
|
|
220
|
+
|
|
221
|
+
Args:
|
|
222
|
+
request: Dictionary containing:
|
|
223
|
+
- target_url: URL to scrape
|
|
224
|
+
- objective: What data to extract
|
|
225
|
+
- authentication: Login details if needed
|
|
226
|
+
- constraints: Rate limiting, ethical guidelines
|
|
227
|
+
- preferred_browser: Optional browser preference
|
|
228
|
+
- use_template: Whether to use site-specific templates (default: True)
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
Dictionary with execution plan including steps, selectors, and browser config
|
|
232
|
+
"""
|
|
233
|
+
target_url = request.get('target_url', '')
|
|
234
|
+
objective = request.get('objective', 'General content extraction')
|
|
235
|
+
use_template = request.get('use_template', True)
|
|
236
|
+
steps = request.get('steps', [])
|
|
237
|
+
|
|
238
|
+
# Check for site-specific templates
|
|
239
|
+
template_guidance = ""
|
|
240
|
+
suggested_steps = []
|
|
241
|
+
suggested_selectors = []
|
|
242
|
+
|
|
243
|
+
if use_template and target_url:
|
|
244
|
+
domain = self._extract_domain(target_url)
|
|
245
|
+
if domain:
|
|
246
|
+
# Check for exact domain match
|
|
247
|
+
template = self.scraping_templates.get(domain)
|
|
248
|
+
if not template:
|
|
249
|
+
# Check for partial domain matches
|
|
250
|
+
for template_domain, template_data in self.scraping_templates.items():
|
|
251
|
+
if template_domain in domain or domain in template_domain:
|
|
252
|
+
template = template_data
|
|
253
|
+
break
|
|
254
|
+
|
|
255
|
+
if template:
|
|
256
|
+
template_guidance = f"\n\n**MANDATORY TEMPLATE FOR {domain.upper()}:**"
|
|
257
|
+
template_guidance += "\n**IMPORTANT:** These selectors are VERIFIED and TESTED. You MUST use these exact values.\n"
|
|
258
|
+
# Customize template steps with actual search term
|
|
259
|
+
if 'search_steps' in template and any(term in objective.lower() for term in ['search', 'product', 'find', 'extract']):
|
|
260
|
+
search_term = self._extract_search_term_from_objective(objective)
|
|
261
|
+
suggested_steps = self._customize_template_steps(
|
|
262
|
+
template['search_steps'], {
|
|
263
|
+
'search_term': search_term,
|
|
264
|
+
'url': target_url
|
|
265
|
+
}
|
|
266
|
+
)
|
|
267
|
+
template_guidance += f"\n\n**SUGGESTED STEPS (customized for '{search_term}'):**\n"
|
|
268
|
+
for i, step in enumerate(suggested_steps):
|
|
269
|
+
template_guidance += f"{i+1}. {step['action']}: {step.get('description', step['target'])}\n"
|
|
270
|
+
|
|
271
|
+
if 'product_selectors' in template:
|
|
272
|
+
suggested_selectors = template['product_selectors']
|
|
273
|
+
template_guidance += f"\n\n** SELECTORS:**\n"
|
|
274
|
+
for sel in suggested_selectors:
|
|
275
|
+
template_guidance += f"- {sel['name']}: {sel['selector']}\n"
|
|
276
|
+
template_guidance += "\n⚠️ CRITICAL: Use the exact 'target' values above. Do not substitute with '#gh-search-input' or other guesses.\n"
|
|
277
|
+
elif steps:
|
|
278
|
+
# use suggested steps from user:
|
|
279
|
+
template_guidance += f"\n\n**SUGGESTED STEPS:**\n"
|
|
280
|
+
for step in steps:
|
|
281
|
+
template_guidance += f"- {step}\n"
|
|
282
|
+
|
|
283
|
+
prompt = f"""
|
|
284
|
+
Analyze this web scraping request and create a comprehensive execution plan:
|
|
285
|
+
|
|
286
|
+
**Target URL:** {target_url}
|
|
287
|
+
**Objective:** {objective}
|
|
288
|
+
**Authentication Required:** {request.get('authentication', {}).get('required', False)}
|
|
289
|
+
**Special Requirements:** {request.get('constraints', 'None')}
|
|
290
|
+
**Current Browser Config:** {json.dumps(self.browser_config, indent=2)}
|
|
291
|
+
|
|
292
|
+
{template_guidance}
|
|
293
|
+
|
|
294
|
+
Please provide:
|
|
295
|
+
1. A detailed analysis of the scraping challenge
|
|
296
|
+
2. Recommended browser configuration (browser type, mobile mode, headless, etc.)
|
|
297
|
+
3. Step-by-step navigation plan (as JSON array of ScrapingStep objects)
|
|
298
|
+
4. Content extraction selectors (as JSON array of ScrapingSelector objects)
|
|
299
|
+
5. Risk assessment and mitigation strategies
|
|
300
|
+
6. Expected challenges and fallback options
|
|
301
|
+
|
|
302
|
+
**Browser Capabilities Available:**
|
|
303
|
+
{json.dumps(self.browser_capabilities, indent=2)}
|
|
304
|
+
|
|
305
|
+
**CRITICAL INSTRUCTIONS:**
|
|
306
|
+
1. For 'navigate' actions: target MUST be a complete URL starting with http:// or https://
|
|
307
|
+
2. For 'click', 'fill', 'wait' actions: target MUST be a CSS selector (e.g., '#id', '.class', 'button[type="submit"]')
|
|
308
|
+
3. NEVER use natural language descriptions as targets (e.g., "the search box" is WRONG, "#search-input" is CORRECT)
|
|
309
|
+
4. If template steps are provided above, use those EXACT targets - they are proven to work
|
|
310
|
+
5. Steps must be in logical order: navigate → wait → fill → click → wait for results
|
|
311
|
+
6. Never invent or hallucinate details about the page structure or content.
|
|
312
|
+
|
|
313
|
+
Provide your response as a structured plan following the ScrapingPlanSchema.
|
|
314
|
+
"""
|
|
315
|
+
|
|
316
|
+
async with self._llm as client:
|
|
317
|
+
response = await client.ask(
|
|
318
|
+
prompt=prompt,
|
|
319
|
+
system_prompt=self.system_prompt_template,
|
|
320
|
+
model=self._llm_model,
|
|
321
|
+
max_tokens=self._max_tokens,
|
|
322
|
+
temperature=self._llm_temp,
|
|
323
|
+
use_tools=True,
|
|
324
|
+
structured_output=ScrapingPlanSchema
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
if isinstance(response.output, ScrapingPlanSchema):
|
|
328
|
+
response = response.output
|
|
329
|
+
merged_steps = []
|
|
330
|
+
for i, template_step in enumerate(suggested_steps):
|
|
331
|
+
merged = template_step.copy()
|
|
332
|
+
# If LLM generated a corresponding step, take its metadata
|
|
333
|
+
if i < len(response.steps):
|
|
334
|
+
llm_step = response.steps[i].model_dump()
|
|
335
|
+
# Keep template's target (proven to work)
|
|
336
|
+
# But use LLM's wait_condition and description if present
|
|
337
|
+
if llm_step.get('wait_condition'):
|
|
338
|
+
merged['wait_condition'] = llm_step['wait_condition']
|
|
339
|
+
if llm_step.get('description') and len(llm_step['description']) > len(merged.get('description', '')):
|
|
340
|
+
merged['description'] = llm_step['description']
|
|
341
|
+
# Use higher timeout if LLM suggests it
|
|
342
|
+
if llm_step.get('timeout', 10) > merged.get('timeout', 10):
|
|
343
|
+
merged['timeout'] = llm_step['timeout']
|
|
344
|
+
merged_steps.append(merged)
|
|
345
|
+
plan = {
|
|
346
|
+
'steps': merged_steps,
|
|
347
|
+
'selectors': suggested_selectors or [sel.model_dump() for sel in response.selectors],
|
|
348
|
+
'browser_config': response.browser_config.model_dump(),
|
|
349
|
+
'analysis': response.analysis,
|
|
350
|
+
'risks': response.risks,
|
|
351
|
+
'fallback_strategy': response.fallback_strategy,
|
|
352
|
+
'parsed_successfully': True,
|
|
353
|
+
'used_template': True
|
|
354
|
+
}
|
|
355
|
+
else:
|
|
356
|
+
# Fallback if structured output not available
|
|
357
|
+
content = self._safe_extract_text(response)
|
|
358
|
+
plan = self._parse_scraping_plan(content)
|
|
359
|
+
|
|
360
|
+
# If LLM didn't generate steps but we have template suggestions, use them as fallback
|
|
361
|
+
if not plan.get('steps') and suggested_steps:
|
|
362
|
+
self.logger.info("Using template steps as fallback")
|
|
363
|
+
plan['steps'] = suggested_steps
|
|
364
|
+
|
|
365
|
+
if not plan.get('selectors') and suggested_selectors:
|
|
366
|
+
self.logger.info("Using template selectors as fallback")
|
|
367
|
+
plan['selectors'] = suggested_selectors
|
|
368
|
+
|
|
369
|
+
# Store this request in our knowledge base
|
|
370
|
+
site_domain = self._extract_domain(target_url)
|
|
371
|
+
if site_domain:
|
|
372
|
+
self.site_knowledge[site_domain] = {
|
|
373
|
+
'last_analyzed': datetime.now().isoformat(),
|
|
374
|
+
'request': request,
|
|
375
|
+
'plan': plan,
|
|
376
|
+
'success_rate': 0.0, # Will be updated based on results
|
|
377
|
+
'recommended_config': plan.get('browser_config', {}),
|
|
378
|
+
'used_template': bool(template_guidance)
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
return plan
|
|
382
|
+
|
|
383
|
+
def _extract_search_term_from_objective(self, objective: str) -> str:
|
|
384
|
+
"""Extract search term from objective description"""
|
|
385
|
+
# Look for product names, quotes, or specific terms
|
|
386
|
+
# Try to find quoted terms first
|
|
387
|
+
quoted_match = re.search(r'"([^"]+)"', objective)
|
|
388
|
+
if quoted_match:
|
|
389
|
+
return quoted_match.group(1)
|
|
390
|
+
|
|
391
|
+
# Look for "for X" pattern
|
|
392
|
+
for_match = re.search(r'\bfor\s+([^,\.]+)', objective, re.IGNORECASE)
|
|
393
|
+
if for_match:
|
|
394
|
+
return for_match.group(1).strip()
|
|
395
|
+
|
|
396
|
+
# Look for product-like terms (words with numbers, proper nouns)
|
|
397
|
+
product_match = re.search(r'\b([A-Z][a-z]*(?:\s+[A-Z0-9][a-z0-9]*)*(?:\s+\d+\w*)*)\b', objective)
|
|
398
|
+
if product_match:
|
|
399
|
+
return product_match.group(1)
|
|
400
|
+
|
|
401
|
+
# Fallback: take last few words that might be product name
|
|
402
|
+
words = objective.split()
|
|
403
|
+
if len(words) >= 3:
|
|
404
|
+
return ' '.join(words[-3:])
|
|
405
|
+
elif len(words) >= 2:
|
|
406
|
+
return ' '.join(words[-2:])
|
|
407
|
+
else:
|
|
408
|
+
return words[-1] if words else "product"
|
|
409
|
+
|
|
410
|
+
def _customize_template_steps(self, template_steps: List[Dict], variables: Dict[str, str]) -> List[Dict]:
|
|
411
|
+
"""Customize template steps with actual values"""
|
|
412
|
+
customized_steps = []
|
|
413
|
+
for step in template_steps:
|
|
414
|
+
customized_step = step.copy()
|
|
415
|
+
|
|
416
|
+
# Replace variables in target and value fields
|
|
417
|
+
if 'target' in customized_step:
|
|
418
|
+
for var, value in variables.items():
|
|
419
|
+
customized_step['target'] = customized_step['target'].replace(f'{{{var}}}', value)
|
|
420
|
+
|
|
421
|
+
if 'value' in customized_step and customized_step['value']:
|
|
422
|
+
for var, value in variables.items():
|
|
423
|
+
customized_step['value'] = customized_step['value'].replace(f'{{{var}}}', value)
|
|
424
|
+
|
|
425
|
+
customized_steps.append(customized_step)
|
|
426
|
+
|
|
427
|
+
return customized_steps
|
|
428
|
+
|
|
429
|
+
def add_scraping_template(self, domain: str, template: Dict[str, Any]):
|
|
430
|
+
"""Add or update a scraping template for a specific domain"""
|
|
431
|
+
self.scraping_templates[domain] = template
|
|
432
|
+
self.logger.info(f"Added scraping template for {domain}")
|
|
433
|
+
|
|
434
|
+
async def execute_intelligent_scraping(
|
|
435
|
+
self,
|
|
436
|
+
request: Dict[str, Any],
|
|
437
|
+
adaptive_config: bool = True
|
|
438
|
+
) -> List[ScrapingResult]:
|
|
439
|
+
"""
|
|
440
|
+
Execute intelligent scraping with LLM-driven adaptations and browser optimization
|
|
441
|
+
|
|
442
|
+
Args:
|
|
443
|
+
request: Scraping request dictionary
|
|
444
|
+
adaptive_config: Whether to adapt browser configuration based on LLM recommendations
|
|
445
|
+
|
|
446
|
+
Returns:
|
|
447
|
+
List of ScrapingResult objects
|
|
448
|
+
"""
|
|
449
|
+
self.logger.info(
|
|
450
|
+
f"Starting intelligent scraping for: {request.get('target_url')}"
|
|
451
|
+
)
|
|
452
|
+
|
|
453
|
+
try:
|
|
454
|
+
# Step 1: Analyze and plan
|
|
455
|
+
plan = await self.analyze_scraping_request(request)
|
|
456
|
+
# some sanitization
|
|
457
|
+
plan = self._sanitize_plan(plan, request)
|
|
458
|
+
self.logger.debug(
|
|
459
|
+
"Plan steps: %s", json.dumps(plan["steps"], indent=2)
|
|
460
|
+
)
|
|
461
|
+
self.logger.debug(
|
|
462
|
+
"Sanitized selectors: %s", json.dumps(plan["selectors"], indent=2)
|
|
463
|
+
)
|
|
464
|
+
|
|
465
|
+
if not plan.get('steps'):
|
|
466
|
+
self.logger.error("No scraping plan generated")
|
|
467
|
+
return [ScrapingResult(
|
|
468
|
+
url=request.get('target_url', ''),
|
|
469
|
+
content='',
|
|
470
|
+
bs_soup=BeautifulSoup('', 'html.parser'),
|
|
471
|
+
success=False,
|
|
472
|
+
error_message="No scraping plan could be generated"
|
|
473
|
+
)]
|
|
474
|
+
|
|
475
|
+
# Step 2: Adapt browser configuration if recommended and allowed
|
|
476
|
+
if adaptive_config and plan.get('browser_config'):
|
|
477
|
+
await self._adapt_browser_configuration(plan['browser_config'])
|
|
478
|
+
|
|
479
|
+
# Step 3: Ensure scraping tool is properly initialized
|
|
480
|
+
if not hasattr(self.scraping_tool, 'driver') or self.scraping_tool.driver is None:
|
|
481
|
+
await self.scraping_tool.initialize_driver()
|
|
482
|
+
|
|
483
|
+
# Step 4: Execute initial scraping
|
|
484
|
+
steps = [self._create_scraping_step(step) for step in plan['steps']]
|
|
485
|
+
selectors = [self._create_scraping_selector(sel) for sel in plan.get('selectors', [])]
|
|
486
|
+
|
|
487
|
+
results = await self.scraping_tool.execute_scraping_workflow(
|
|
488
|
+
steps=steps,
|
|
489
|
+
selectors=selectors,
|
|
490
|
+
base_url=request.get('base_url', '')
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
# Step 5: Analyze results and adapt if necessary
|
|
494
|
+
if results and not all(r.success for r in results):
|
|
495
|
+
self.logger.info("Some scraping attempts failed, attempting recovery")
|
|
496
|
+
results = await self._attempt_recovery(request, results, plan)
|
|
497
|
+
|
|
498
|
+
# Step 6: Post-process and enhance results
|
|
499
|
+
enhanced_results = await self._enhance_results(results, request)
|
|
500
|
+
|
|
501
|
+
# Step 7: Update site knowledge
|
|
502
|
+
self._update_site_knowledge(request, enhanced_results)
|
|
503
|
+
|
|
504
|
+
return enhanced_results
|
|
505
|
+
|
|
506
|
+
except Exception as e:
|
|
507
|
+
self.logger.error(f"Intelligent scraping failed: {str(e)}")
|
|
508
|
+
return [ScrapingResult(
|
|
509
|
+
url=request.get('target_url', ''),
|
|
510
|
+
content='',
|
|
511
|
+
bs_soup=BeautifulSoup('', 'html.parser'),
|
|
512
|
+
success=False,
|
|
513
|
+
error_message=f"Scraping failed: {str(e)}"
|
|
514
|
+
)]
|
|
515
|
+
|
|
516
|
+
async def _adapt_browser_configuration(self, recommended_config: Dict[str, Any]):
|
|
517
|
+
"""
|
|
518
|
+
Adapt browser configuration based on LLM recommendations
|
|
519
|
+
"""
|
|
520
|
+
changes_made = []
|
|
521
|
+
|
|
522
|
+
for key, value in recommended_config.items():
|
|
523
|
+
if key in self.browser_config and self.browser_config[key] != value:
|
|
524
|
+
old_value = self.browser_config[key]
|
|
525
|
+
self.browser_config[key] = value
|
|
526
|
+
changes_made.append(f"{key}: {old_value} -> {value}")
|
|
527
|
+
|
|
528
|
+
if changes_made:
|
|
529
|
+
self.logger.info(f"Adapting browser config: {', '.join(changes_made)}")
|
|
530
|
+
|
|
531
|
+
# Reinitialize scraping tool with new configuration
|
|
532
|
+
await self._reinitialize_scraping_tool()
|
|
533
|
+
|
|
534
|
+
async def _reinitialize_scraping_tool(self):
|
|
535
|
+
"""Safely reinitialize the scraping tool with new configuration"""
|
|
536
|
+
try:
|
|
537
|
+
# Clean up existing tool
|
|
538
|
+
if hasattr(self.scraping_tool, 'cleanup'):
|
|
539
|
+
await self.scraping_tool.cleanup()
|
|
540
|
+
|
|
541
|
+
# Create new tool with updated config
|
|
542
|
+
self.scraping_tool = WebScrapingTool(**self.browser_config)
|
|
543
|
+
|
|
544
|
+
# Re-register the tool
|
|
545
|
+
if hasattr(self.tool_manager, 'unregister_tool'):
|
|
546
|
+
self.tool_manager.unregister_tool('WebScrapingTool')
|
|
547
|
+
self.tool_manager.register_tool(self.scraping_tool)
|
|
548
|
+
|
|
549
|
+
except Exception as e:
|
|
550
|
+
self.logger.warning(
|
|
551
|
+
f"Failed to reinitialize scraping tool: {e}"
|
|
552
|
+
)
|
|
553
|
+
|
|
554
|
+
def _normalize_action(self, action: Optional[str]) -> str:
|
|
555
|
+
return (action or 'navigate').strip().lower()
|
|
556
|
+
|
|
557
|
+
def _normalize_target(self, target: Any) -> str:
|
|
558
|
+
# Accept dicts like {"url": "..."} or {"selector": "..."} or lists
|
|
559
|
+
if isinstance(target, dict):
|
|
560
|
+
target = target.get('url') or target.get('selector') or target.get('text') or ''
|
|
561
|
+
elif isinstance(target, (list, tuple)) and target:
|
|
562
|
+
target = target[0]
|
|
563
|
+
target = '' if target is None else str(target).strip()
|
|
564
|
+
# Basic URL rescue: if it looks like a domain, prefix https://
|
|
565
|
+
if target and (' ' not in target) and ('.' in target) and not target.startswith(('http://','https://','#','/')):
|
|
566
|
+
target = f'https://{target}'
|
|
567
|
+
return target
|
|
568
|
+
|
|
569
|
+
def _normalize_value(self, value: Any) -> Optional[str]:
|
|
570
|
+
return None if value is None else str(value)
|
|
571
|
+
|
|
572
|
+
def _create_scraping_step(self, step_data: Dict[str, Any]) -> ScrapingStep:
|
|
573
|
+
return ScrapingStep(
|
|
574
|
+
action=self._normalize_action(step_data.get('action')),
|
|
575
|
+
target=self._normalize_target(step_data.get('target', '')),
|
|
576
|
+
value=self._normalize_value(step_data.get('value')),
|
|
577
|
+
wait_condition=step_data.get('wait_condition'),
|
|
578
|
+
timeout=step_data.get('timeout', 10),
|
|
579
|
+
description=step_data.get('description', '')
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
def _create_scraping_selector(self, selector_data: Dict[str, Any]) -> ScrapingSelector:
|
|
583
|
+
"""Create ScrapingSelector object from dictionary, handling missing/odd fields"""
|
|
584
|
+
name = selector_data.get('name', 'unnamed')
|
|
585
|
+
selector = selector_data.get('selector', 'body')
|
|
586
|
+
selector_type = selector_data.get('selector_type', 'css')
|
|
587
|
+
extract_type = selector_data.get('extract_type', 'text')
|
|
588
|
+
attribute = selector_data.get('attribute')
|
|
589
|
+
multiple = selector_data.get('multiple', False)
|
|
590
|
+
|
|
591
|
+
return ScrapingSelector(
|
|
592
|
+
name=str(name),
|
|
593
|
+
selector=str(selector),
|
|
594
|
+
selector_type=str(selector_type),
|
|
595
|
+
extract_type=str(extract_type),
|
|
596
|
+
attribute=(str(attribute) if attribute is not None else None),
|
|
597
|
+
multiple=bool(multiple)
|
|
598
|
+
)
|
|
599
|
+
|
|
600
|
+
async def recommend_browser_for_site(self, url: str) -> Dict[str, Any]:
|
|
601
|
+
"""
|
|
602
|
+
Analyze a site and recommend optimal browser configuration
|
|
603
|
+
"""
|
|
604
|
+
domain = self._extract_domain(url)
|
|
605
|
+
|
|
606
|
+
# Check if we have prior knowledge
|
|
607
|
+
if domain in self.site_knowledge:
|
|
608
|
+
stored_config = self.site_knowledge[domain].get('recommended_config', {})
|
|
609
|
+
if stored_config:
|
|
610
|
+
return {
|
|
611
|
+
'source': 'historical_data',
|
|
612
|
+
'config': stored_config,
|
|
613
|
+
'confidence': 'high',
|
|
614
|
+
'reason': 'Based on previous successful scraping'
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
# Use LLM to analyze the site
|
|
618
|
+
analysis_prompt = f"""
|
|
619
|
+
Analyze this website and recommend the optimal browser configuration for scraping:
|
|
620
|
+
|
|
621
|
+
**URL:** {url}
|
|
622
|
+
**Available Browsers:** {list(self.browser_capabilities.keys())}
|
|
623
|
+
**Browser Capabilities:** {json.dumps(self.browser_capabilities, indent=2)}
|
|
624
|
+
|
|
625
|
+
Please analyze the site characteristics and recommend:
|
|
626
|
+
1. Best browser choice (chrome, firefox, edge, safari, undetected)
|
|
627
|
+
2. Whether to use headless mode
|
|
628
|
+
3. Whether mobile emulation would be useful
|
|
629
|
+
4. Any special configuration options
|
|
630
|
+
5. Reasoning for your recommendations
|
|
631
|
+
|
|
632
|
+
Consider factors like:
|
|
633
|
+
- Site complexity (SPA, heavy JavaScript, etc.)
|
|
634
|
+
- Anti-bot protection
|
|
635
|
+
- Mobile responsiveness
|
|
636
|
+
- Authentication requirements
|
|
637
|
+
- Known compatibility issues
|
|
638
|
+
|
|
639
|
+
Provide your recommendation as a JSON object with configuration parameters.
|
|
640
|
+
"""
|
|
641
|
+
|
|
642
|
+
try:
|
|
643
|
+
async with self._llm as client:
|
|
644
|
+
response = await client.ask(
|
|
645
|
+
prompt=analysis_prompt,
|
|
646
|
+
system_prompt=self.system_prompt_template,
|
|
647
|
+
model=self._llm_model,
|
|
648
|
+
max_tokens=self._max_tokens,
|
|
649
|
+
temperature=self._llm_temp,
|
|
650
|
+
use_tools=True,
|
|
651
|
+
)
|
|
652
|
+
|
|
653
|
+
# Parse recommendation from response
|
|
654
|
+
content = self._safe_extract_text(response)
|
|
655
|
+
recommendation = self._parse_browser_recommendation(content)
|
|
656
|
+
|
|
657
|
+
return {
|
|
658
|
+
'source': 'llm_analysis',
|
|
659
|
+
'config': recommendation,
|
|
660
|
+
'confidence': 'medium',
|
|
661
|
+
'reason': 'Based on LLM analysis of site characteristics',
|
|
662
|
+
'full_analysis': content
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
except Exception as e:
|
|
666
|
+
self.logger.warning(f"Failed to get browser recommendation: {str(e)}")
|
|
667
|
+
return {
|
|
668
|
+
'source': 'fallback',
|
|
669
|
+
'config': {'browser': 'chrome', 'headless': True},
|
|
670
|
+
'confidence': 'low',
|
|
671
|
+
'reason': 'Default fallback configuration'
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
def _parse_browser_recommendation(self, llm_response: str) -> Dict[str, Any]:
|
|
675
|
+
"""Parse browser configuration recommendation from LLM response"""
|
|
676
|
+
try:
|
|
677
|
+
# Try to extract JSON from response
|
|
678
|
+
json_match = re.search(r'```json\s*(\{.*?\})\s*```', llm_response, re.DOTALL)
|
|
679
|
+
if json_match:
|
|
680
|
+
return json.loads(json_match.group(1))
|
|
681
|
+
|
|
682
|
+
# Fallback: extract configuration from text
|
|
683
|
+
config = {}
|
|
684
|
+
|
|
685
|
+
# Extract browser type
|
|
686
|
+
for browser, _ in self.browser_capabilities.items():
|
|
687
|
+
                if browser.lower() in llm_response.lower():
                    config['browser'] = browser
                    break

            # Extract headless recommendation
            if 'headless' in llm_response.lower():
                config['headless'] = 'false' not in llm_response.lower()

            # Extract mobile recommendation
            if 'mobile' in llm_response.lower():
                config['mobile'] = 'true' in llm_response.lower()

            return config if config else {'browser': 'chrome', 'headless': True}

        except Exception as e:
            self.logger.error(f"Failed to parse browser recommendation: {str(e)}")
            return {'browser': 'chrome', 'headless': True}

    async def _attempt_recovery(
        self,
        request: Dict[str, Any],
        failed_results: List[ScrapingResult],
        original_plan: Dict[str, Any]
    ) -> List[ScrapingResult]:
        """
        Attempt to recover from failed scraping using LLM analysis
        """
        # Analyze failures
        failure_analysis = []
        for result in failed_results:
            if not result.success:
                failure_analysis.append({
                    'url': result.url,
                    'error': result.error_message,
                    'content_available': bool(result.content)
                })

        recovery_prompt = f"""
        The initial scraping attempt had some failures. Please analyze and suggest recovery strategies:

        **Original Request:** {json.dumps(request, indent=2)}
        **Failed Results:** {json.dumps(failure_analysis, indent=2)}
        **Original Plan:** {json.dumps(original_plan, indent=2)}
        **Current Browser Config:** {json.dumps(self.browser_config, indent=2)}

        Please suggest:
        1. Modified navigation steps to address the failures
        2. Alternative selectors that might be more robust
        3. Browser configuration changes that might help
        4. Additional wait conditions or timing adjustments
        5. Any authentication issues to address

        Provide a recovery plan in the same format as before, including any browser config changes.
        """

        async with self._llm as client:
            recovery_response = await client.ask(
                prompt=recovery_prompt,
                system_prompt=self.system_prompt_template,
                model=self._llm_model,
                max_tokens=self._max_tokens,
                temperature=self._llm_temp,
                use_tools=True,
            )

        recovery_plan = self._parse_scraping_plan(self._safe_extract_text(recovery_response))

        if recovery_plan.get('steps'):
            self.logger.info("Executing recovery plan")

            # Apply any browser configuration changes
            if recovery_plan.get('browser_config'):
                await self._adapt_browser_configuration(recovery_plan['browser_config'])

            recovery_steps = [self._create_scraping_step(step) for step in recovery_plan['steps']]
            recovery_selectors = [self._create_scraping_selector(sel) for sel in recovery_plan.get('selectors', [])]

            recovery_results = await self.scraping_tool.execute_scraping_workflow(
                steps=recovery_steps,
                selectors=recovery_selectors,
                base_url=request.get('base_url', '')
            )

            # Combine successful results from both attempts
            combined_results = []
            for original, recovery in zip(failed_results, recovery_results):
                if recovery.success:
                    combined_results.append(recovery)
                elif original.success:
                    combined_results.append(original)
                else:
                    combined_results.append(recovery)  # Keep the latest attempt

            return combined_results

        return failed_results

    async def _enhance_results(
        self,
        results: List[ScrapingResult],
        request: Dict[str, Any]
    ) -> List[ScrapingResult]:
        """
        Enhance scraping results with LLM-powered content analysis
        """
        for result in results:
            if result.success and result.extracted_data:
                # Analyze content relevance and quality
                analysis_prompt = f"""
                Analyze this scraped content for relevance and quality:

                **Original Objective:** {request.get('objective', 'General extraction')}
                **Extracted Data:** {json.dumps(result.extracted_data, indent=2, default=str)}
                **URL:** {result.url}

                Please provide:
                1. Content quality score (1-10)
                2. Relevance to objective (1-10)
                3. Key insights or important information found
                4. Suggestions for improving extraction
                5. Data cleaning or formatting recommendations

                Keep your analysis concise but comprehensive.
                """

                try:
                    async with self._llm as client:
                        analysis_response = await client.ask(
                            prompt=analysis_prompt,
                            system_prompt=self.system_prompt_template,
                            model=self._llm_model,
                            max_tokens=self._max_tokens,
                            temperature=self._llm_temp,
                            use_tools=True,
                        )
                    content = self._safe_extract_text(analysis_response)
                    # Add analysis to metadata
                    result.metadata.update({
                        'llm_analysis': content,
                        'analysis_timestamp': datetime.now().isoformat(),
                        'enhanced': True,
                        'browser_config_used': self.browser_config.copy()
                    })
                except Exception as e:
                    self.logger.warning(f"Content analysis failed: {str(e)}")

        return results

    def _looks_like_url(self, s: str) -> bool:
        try:
            s = (s or "").strip()
            if not s:
                return False
            return s.startswith(("http://", "https://")) or ('.' in s and ' ' not in s)
        except Exception:
            return False

    def _coerce_list_of_dicts(self, maybe_list):
        if maybe_list is None:
            return []
        if isinstance(maybe_list, dict):
            out = []
            for k, v in maybe_list.items():
                if isinstance(v, dict):
                    vv = v.copy()
                    vv.setdefault("name", k)
                    out.append(vv)
                else:
                    out.append({"name": str(k), "selector": str(v)})
            return out
        if isinstance(maybe_list, (list, tuple, set)):
            out = []
            for item in maybe_list:
                out.append(item if isinstance(item, dict) else {"selector": str(item)})
            return out
        return [{"selector": str(maybe_list)}]

    def _sanitize_steps(self, steps_raw, request_url: str) -> list[dict]:
        allowed = {"navigate", "click", "fill", "wait", "scroll", "authenticate", "await_human", "await_keypress", "await_browser_event"}
        steps: list[dict] = []
        for s in self._coerce_list_of_dicts(steps_raw):
            action = self._normalize_action(s.get("action"))
            if action not in allowed:
                continue
            target = self._normalize_target(s.get("target"))
            value = self._normalize_value(s.get("value"))

            # If navigate target isn't a real URL, force it to request_url
            if action == "navigate" and (not target or not self._looks_like_url(target)):
                target = request_url or target

            # For non-navigate actions, ensure target is a plausible CSS selector
            if action in {"click", "fill", "wait"}:
                # pick the first of a comma-separated list if present
                if target and "," in target:
                    target = target.split(",")[0].strip()
                # reject blatant prose targets
                if target and (len(target) > 150 or " the " in target.lower()):
                    target = ""  # will be filtered below

            steps.append({
                "action": action,
                "target": target or "",
                "value": value,
                "wait_condition": s.get("wait_condition"),
                "timeout": s.get("timeout", 10),
                "description": s.get("description", "")
            })

        # Ensure we start with a valid navigate
        has_nav = any(st["action"] == "navigate" for st in steps)
        if not has_nav and request_url:
            steps.insert(0, {
                "action": "navigate",
                "target": request_url,
                "value": None,
                "wait_condition": None,
                "timeout": 15,
                "description": "Navigate to target URL"
            })
        else:
            for st in steps:
                if st["action"] == "navigate":
                    if not self._looks_like_url(st["target"]) and request_url:
                        st["target"] = request_url
                    break
        return steps

    def _sanitize_selectors(self, selectors_raw) -> list[dict]:
        cleaned: list[dict] = []
        bad_prefixes = (".0", "#0")  # guard against things like ".0.0.1"
        ip_like = re.compile(r'^\d{1,3}(?:\.\d{1,3}){3}$')

        for sel in self._coerce_list_of_dicts(selectors_raw):
            selector = sel.get("selector") or sel.get("css") or sel.get("target")
            name = sel.get("name") or selector
            if not selector:
                continue
            selector = str(selector).strip()
            name = str(name)

            # Drop IPs or clearly invalid CSS like ".0.0.1"
            if selector.startswith(bad_prefixes) or ip_like.match(selector):
                continue
            # Very weak CSS plausibility check
            if not any(ch in selector for ch in ('.', '#', '[', '>', ':')) and ' ' not in selector:
                # allow tag-only selectors like 'a', 'h2' by whitelisting when short
                if selector.lower() not in {"a", "h1", "h2", "h3", "p", "span", "div"}:
                    continue

            cleaned.append({
                "name": name,
                "selector": selector,
                "selector_type": str(sel.get("selector_type", "css")),
                "extract_type": str(sel.get("extract_type", "text")),
                "attribute": (str(sel["attribute"]) if sel.get("attribute") is not None else None),
                "multiple": bool(sel.get("multiple", True))
            })
        return cleaned

    def _sanitize_plan(self, plan: dict, request: dict) -> dict:
        url = request.get("target_url") or request.get("base_url") or ""
        plan = dict(plan or {})
        plan["steps"] = self._sanitize_steps(plan.get("steps") or [], url)
        plan["selectors"] = self._sanitize_selectors(plan.get("selectors") or [])
        bcfg = plan.get("browser_config")
        if not isinstance(bcfg, dict):
            bcfg = {}
        plan["browser_config"] = bcfg
        return plan
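
The sanitizers above enforce a few simple invariants on whatever shape the LLM returns: navigation targets must look like URLs, and selectors that look like IP addresses or start with numeric class/id fragments are discarded. The standalone sketch below is illustrative only and not part of the package; it mirrors two of those checks so they are easy to try in isolation.

```python
# Not part of ai-parrot: a simplified, standalone mirror of two checks used by
# the sanitizers above (_looks_like_url and the IP-like selector filter).
import re

def looks_like_url(s: str) -> bool:
    # Same heuristic as _looks_like_url: scheme prefix, or a dotted token without spaces.
    s = (s or "").strip()
    if not s:
        return False
    return s.startswith(("http://", "https://")) or ('.' in s and ' ' not in s)

IP_LIKE = re.compile(r'^\d{1,3}(?:\.\d{1,3}){3}$')

def keep_selector(selector: str) -> bool:
    # Mirrors the "drop IPs or clearly invalid CSS" guard in _sanitize_selectors.
    return not (selector.startswith((".0", "#0")) or IP_LIKE.match(selector))

print(looks_like_url("the product listings page"))     # False -> replaced by the request URL
print(looks_like_url("https://example.com/listings"))  # True
print(keep_selector("h2.title"))                        # True
print(keep_selector("127.0.0.1"))                       # False -> dropped
```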

    def _parse_scraping_plan(self, llm_response: str) -> Dict[str, Any]:
        """
        Parse LLM response to extract structured scraping plan
        """
        try:
            plan = {
                'steps': [],
                'selectors': [],
                'browser_config': {},
                'analysis': llm_response,
                'parsed_successfully': False
            }

            # Extract JSON sections from the response
            json_blocks = re.findall(r'```json\s*(\{.*?\}|\[.*?\])\s*```', llm_response, re.DOTALL)

            for block in json_blocks:
                try:
                    parsed = json.loads(block)
                    if isinstance(parsed, list):
                        # Could be steps or selectors
                        if parsed and 'action' in str(parsed[0]):
                            plan['steps'] = parsed
                        elif parsed and 'selector' in str(parsed[0]):
                            plan['selectors'] = parsed
                    elif isinstance(parsed, dict):
                        # Could be browser config
                        if any(key in parsed for key in ['browser', 'headless', 'mobile']):
                            plan['browser_config'] = parsed
                except json.JSONDecodeError:
                    continue

            # Fallback: try to extract from text
            if not plan['steps']:
                plan['steps'] = self._extract_steps_from_text(llm_response)

            if not plan['selectors']:
                plan['selectors'] = self._extract_selectors_from_text(llm_response)

            plan['parsed_successfully'] = bool(plan['steps'] or plan['selectors'])
            return plan

        except Exception as e:
            self.logger.error(f"Failed to parse scraping plan: {str(e)}")
            return {
                'steps': [],
                'selectors': [],
                'browser_config': {},
                'analysis': llm_response,
                'parsed_successfully': False,
                'parse_error': str(e)
            }
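
`_parse_scraping_plan` keys on fenced JSON code blocks in the model's reply and only falls back to free-text extraction when none parse. The standalone snippet below is illustrative only and not part of the package; it runs the same regular expression over a made-up response to show what gets captured (the fence string is assembled programmatically purely to keep this example readable).

```python
# Hypothetical illustration of the response format the parser above expects.
import json
import re

FENCE = "`" * 3  # builds the ``` fence without nesting literal fences here
llm_response = (
    "Here is the plan.\n"
    f"{FENCE}json\n"
    '[{"action": "navigate", "target": "https://example.com"}]\n'
    f"{FENCE}\n"
    f"{FENCE}json\n"
    '{"browser": "firefox", "headless": true}\n'
    f"{FENCE}\n"
)

# Same pattern as in _parse_scraping_plan.
json_blocks = re.findall(r'```json\s*(\{.*?\}|\[.*?\])\s*```', llm_response, re.DOTALL)
parsed = [json.loads(block) for block in json_blocks]

print(parsed[0])  # [{'action': 'navigate', 'target': 'https://example.com'}] -> treated as steps
print(parsed[1])  # {'browser': 'firefox', 'headless': True} -> treated as browser config
```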

    def _extract_steps_from_text(self, text: str) -> List[Dict[str, Any]]:
        """Fallback method to extract steps from unstructured text"""
        steps = []

        # Look for step patterns in text
        step_patterns = [
            r'navigate to (.*?)[\n\.]',
            r'click on (.*?)[\n\.]',
            r'fill (.*?) with (.*?)[\n\.]',
            r'wait for (.*?)[\n\.]',
            r'scroll to (.*?)[\n\.]'
        ]

        actions = ['navigate', 'click', 'fill', 'wait', 'scroll']

        for i, pattern in enumerate(step_patterns):
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                if isinstance(match, tuple):
                    # The fill pattern captures two groups: (target, value)
                    steps.append({
                        'action': actions[i],
                        'target': match[0].strip(),
                        'value': match[1].strip() if len(match) > 1 else None,
                        'description': f"{actions[i].title()} {match[0].strip()}"
                    })
                else:
                    steps.append({
                        'action': actions[i],
                        'target': match.strip(),
                        'description': f"{actions[i].title()} {match.strip()}"
                    })

        return steps

    def _extract_selectors_from_text(self, text: str) -> List[Dict[str, Any]]:
        """Fallback method to extract selectors from unstructured text"""
        selectors = []

        # Look for selector patterns
        css_selectors = re.findall(r'[\.#][\w-]+(?:\s*[\.#][\w-]+)*', text)

        for i, selector in enumerate(css_selectors):
            selectors.append({
                'name': f'selector_{i+1}',
                'selector': selector.strip(),
                'selector_type': 'css',
                'extract_type': 'text'
            })

        return selectors

    def _extract_domain(self, url: str) -> Optional[str]:
        """Extract domain from URL"""
        try:
            parsed = urlparse(url)
            return parsed.netloc if parsed.netloc else None
        except Exception:
            return None

    def _update_site_knowledge(
        self,
        request: Dict[str, Any],
        results: List[ScrapingResult]
    ):
        """Update our knowledge base about specific sites"""
        domain = self._extract_domain(request.get('target_url', ''))
        if domain and domain in self.site_knowledge:
            successful_results = [r for r in results if r.success]
            success_rate = len(successful_results) / len(results) if results else 0.0

            self.site_knowledge[domain].update({
                'success_rate': success_rate,
                'last_scrape': datetime.now().isoformat(),
                'total_attempts': self.site_knowledge[domain].get('total_attempts', 0) + 1,
                'last_successful_config': self.browser_config.copy() if success_rate > 0.5 else None
            })

    async def get_site_recommendations(self, url: str) -> Dict[str, Any]:
        """Get comprehensive recommendations for scraping a specific site"""
        domain = self._extract_domain(url)
        recommendations = {
            'domain': domain,
            'browser_recommendation': None,
            'scraping_strategy': None,
            'historical_data': None
        }

        # Get browser recommendation
        browser_rec = await self.recommend_browser_for_site(url)
        recommendations['browser_recommendation'] = browser_rec

        # Get historical data if available
        if domain in self.site_knowledge:
            knowledge = self.site_knowledge[domain]
            recommendations['historical_data'] = {
                'success_rate': knowledge.get('success_rate', 0.0),
                'last_successful_scrape': knowledge.get('last_scrape'),
                'total_attempts': knowledge.get('total_attempts', 0),
                'last_successful_config': knowledge.get('last_successful_config')
            }

        # Generate comprehensive strategy recommendations
        strategy_prompt = f"""
        Provide comprehensive scraping strategy recommendations for this site:

        **Domain:** {domain}
        **URL:** {url}
        **Browser Recommendation:** {json.dumps(browser_rec, indent=2)}
        **Historical Data:** {json.dumps(recommendations.get('historical_data', {}), indent=2)}

        Please suggest:
        1. Overall scraping approach and strategy
        2. Timing and rate limiting recommendations
        3. Common challenges and how to handle them
        4. Authentication strategies if needed
        5. Content extraction best practices
        6. Error handling and recovery strategies
        """

        try:
            async with self._llm as client:
                strategy_response = await client.ask(
                    prompt=strategy_prompt,
                    system_prompt=self.system_prompt_template,
                    model=self._llm_model,
                    max_tokens=self._max_tokens,
                    temperature=self._llm_temp,
                    use_tools=True,
                )
            recommendations['scraping_strategy'] = self._safe_extract_text(strategy_response)
        except Exception as e:
            self.logger.warning(f"Failed to generate strategy recommendations: {str(e)}")
            recommendations['scraping_strategy'] = "Unable to generate strategy recommendations"

        return recommendations

    async def cleanup(self):
        """Clean up resources"""
        if hasattr(self.scraping_tool, 'cleanup'):
            await self.scraping_tool.cleanup()

    def get_available_templates(self) -> Dict[str, str]:
        """Get list of available scraping templates"""
        return {domain: template.get('guidance', 'No guidance available')
                for domain, template in self.scraping_templates.items()}

    def get_template_for_url(self, url: str) -> Optional[Dict[str, Any]]:
        """Get the best matching template for a given URL"""
        domain = self._extract_domain(url)
        if not domain:
            return None

        # Check for exact match
        if domain in self.scraping_templates:
            return self.scraping_templates[domain]

        # Check for partial matches
        for template_domain, template_data in self.scraping_templates.items():
            if template_domain in domain or domain in template_domain:
                return template_data

        return None
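
The template lookup at the end resolves a URL's domain and falls back from an exact match to a substring match against the registered template domains. Below is a standalone sketch of that matching order; it is not part of the package, and the two registry entries are hypothetical.

```python
# Hypothetical illustration of the domain matching used by get_template_for_url above.
from urllib.parse import urlparse

scraping_templates = {  # hypothetical template registry
    "linkedin.com": {"guidance": "Requires authentication"},
    "amazon.com": {"guidance": "Paginated product grids"},
}

def template_for(url: str):
    domain = urlparse(url).netloc or None
    if not domain:
        return None
    if domain in scraping_templates:          # exact match first
        return scraping_templates[domain]
    for template_domain, template_data in scraping_templates.items():
        if template_domain in domain or domain in template_domain:  # substring fallback
            return template_data
    return None

print(template_for("https://www.amazon.com/dp/B000")["guidance"])
# "Paginated product grids" -- matched because "amazon.com" is a substring of
# "www.amazon.com", the partial-match fallback in the method above.
```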