ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
|
@@ -0,0 +1,3115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
WebScrapingTool for AI-Parrot
|
|
3
|
+
Combines Selenium/Playwright automation with LLM-directed scraping
|
|
4
|
+
"""
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import random
|
|
7
|
+
import sys
|
|
8
|
+
from typing import Dict, List, Any, Optional, Union, Literal
|
|
9
|
+
import select
|
|
10
|
+
import time
|
|
11
|
+
import asyncio
|
|
12
|
+
import logging
|
|
13
|
+
import base64
|
|
14
|
+
import re
|
|
15
|
+
import json
|
|
16
|
+
import contextlib
|
|
17
|
+
from urllib.parse import urlparse, urljoin
|
|
18
|
+
from lxml import html as lxml_html
|
|
19
|
+
import aiofiles
|
|
20
|
+
from pydantic import BaseModel, Field
|
|
21
|
+
from bs4 import BeautifulSoup
|
|
22
|
+
# Selenium imports
|
|
23
|
+
try:
|
|
24
|
+
from seleniumwire import webdriver
|
|
25
|
+
except ImportError:
|
|
26
|
+
from selenium import webdriver
|
|
27
|
+
from selenium.webdriver.chrome.options import Options
|
|
28
|
+
from selenium.webdriver.common.by import By
|
|
29
|
+
from selenium.webdriver.common.keys import Keys
|
|
30
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
31
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
32
|
+
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
|
33
|
+
# For Playwright alternative
|
|
34
|
+
try:
|
|
35
|
+
from playwright.async_api import async_playwright, Page, Browser
|
|
36
|
+
PLAYWRIGHT_AVAILABLE = True
|
|
37
|
+
except ImportError:
|
|
38
|
+
PLAYWRIGHT_AVAILABLE = False
|
|
39
|
+
from ..abstract import AbstractTool
|
|
40
|
+
from .driver import SeleniumSetup
|
|
41
|
+
from .models import (
|
|
42
|
+
BrowserAction,
|
|
43
|
+
Navigate,
|
|
44
|
+
Click,
|
|
45
|
+
Fill,
|
|
46
|
+
Select,
|
|
47
|
+
Evaluate,
|
|
48
|
+
PressKey,
|
|
49
|
+
Refresh,
|
|
50
|
+
Back,
|
|
51
|
+
Wait,
|
|
52
|
+
Scroll,
|
|
53
|
+
Authenticate,
|
|
54
|
+
GetCookies,
|
|
55
|
+
SetCookies,
|
|
56
|
+
GetText,
|
|
57
|
+
GetHTML,
|
|
58
|
+
Screenshot,
|
|
59
|
+
WaitForDownload,
|
|
60
|
+
UploadFile,
|
|
61
|
+
AwaitHuman,
|
|
62
|
+
AwaitKeyPress,
|
|
63
|
+
AwaitBrowserEvent,
|
|
64
|
+
Loop,
|
|
65
|
+
ScrapingStep,
|
|
66
|
+
ScrapingSelector,
|
|
67
|
+
ScrapingResult,
|
|
68
|
+
Conditional
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class WebScrapingToolArgs(BaseModel):
    """Input schema validated by pydantic for ``WebScrapingTool``.

    Only ``steps`` is mandatory; every other field carries a default so the
    tool can be invoked with nothing but a list of browser actions.
    """

    # Ordered browser actions; each entry needs an 'action' and a 'description'.
    steps: List[Dict[str, Any]] = Field(
        description="List of navigation and interaction steps. Each step should have 'action' and 'description'"
    )
    # Optional extraction rules; when omitted the tool returns full-page HTML.
    selectors: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="Content selectors for extraction. Each selector should have 'name', 'selector', and optional 'extract_type', 'multiple'"
    )
    # Prefix used to resolve relative links found during extraction.
    base_url: Optional[str] = Field(default="", description="Base URL for relative links")
    # Per-invocation overrides merged into the driver configuration.
    browser_config: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Any Selenium configuration overrides (e.g., headless, mobile, browser type)"
    )
    # Capture the whole rendered page instead of selector-scoped content.
    full_page: bool = Field(default=False, description="Whether to capture full page content")
    # Run the browser without a visible window.
    headless: bool = Field(default=True, description="Whether to run the browser in headless mode")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class WebScrapingTool(AbstractTool):
|
|
100
|
+
"""
|
|
101
|
+
Advanced web scraping tool with LLM integration support.
|
|
102
|
+
|
|
103
|
+
Features:
|
|
104
|
+
- Support for both Selenium and Playwright
|
|
105
|
+
- Step-by-step navigation instructions
|
|
106
|
+
- Flexible content extraction
|
|
107
|
+
- Intermediate result storage
|
|
108
|
+
- Error handling and retry logic
|
|
109
|
+
|
|
110
|
+
Supported Actions:
|
|
111
|
+
* Navigation: navigate, back, refresh
|
|
112
|
+
* Interaction: click, fill, press_key, scroll
|
|
113
|
+
* Data Extraction: get_text, get_html, get_cookies
|
|
114
|
+
* Authentication: authenticate
|
|
115
|
+
* File Operations: upload_file, wait_for_download, screenshot
|
|
116
|
+
* State Management: set_cookies
|
|
117
|
+
* Waiting: wait, await_human, await_keypress, await_browser_event
|
|
118
|
+
* Evaluation: evaluate
|
|
119
|
+
* Control Flow: loop
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
name = "WebScrapingTool"
|
|
123
|
+
description = """Execute automated web scraping with JSON-based, step-by-step navigation and content extraction.
|
|
124
|
+
|
|
125
|
+
IMPORTANT: This tool requires a 'steps' parameter (not 'actions'!) containing a list of navigation/interaction steps.
|
|
126
|
+
|
|
127
|
+
Example usage:
|
|
128
|
+
{
|
|
129
|
+
"steps": [
|
|
130
|
+
{"action": "navigate", "url": "https://example.com/login", "description": "Navigate to login page"},
|
|
131
|
+
{"action": "fill", "selector": "#email", "selector_type": "css", "value": "user@example.com", "description": "Fill email field"},
|
|
132
|
+
{"action": "fill", "selector": "#password", "selector_type": "css", "value": "password123", "description": "Fill password field"},
|
|
133
|
+
{"action": "click", "selector": "button[type='submit']", "selector_type": "css", "description": "Click login button"},
|
|
134
|
+
{"action": "navigate", "url": "https://example.com/dashboard", "description": "Navigate to dashboard"}
|
|
135
|
+
],
|
|
136
|
+
"selectors": [ // Optional - if omitted, returns full page HTML
|
|
137
|
+
{"name": "title", "selector": "h1", "selector_type": "css"},
|
|
138
|
+
{"name": "content", "selector": ".main-content", "selector_type": "css"}
|
|
139
|
+
],
|
|
140
|
+
"full_page": true // Optional - set to true to capture full page content when no selectors provided
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
Each step must include:
|
|
144
|
+
- "action": The action type (required)
|
|
145
|
+
- "description": Why this step is needed (required for clarity)
|
|
146
|
+
- Additional fields depending on action type (e.g., "url" for navigate, "selector" for click/fill)
|
|
147
|
+
|
|
148
|
+
Pair every selector with a `selector_type` (`css`, `xpath`, or `text`). Keep waits explicit via `condition_type` (`simple`, `selector`, `url_is`, `url_contains`, `title_contains`, or `custom`).
|
|
149
|
+
|
|
150
|
+
Supported actions:
|
|
151
|
+
- Navigation: navigate, back, refresh
|
|
152
|
+
- Interaction: click, fill, press_key, scroll, select
|
|
153
|
+
- Data Extraction: get_text, get_html, get_cookies
|
|
154
|
+
- Authentication: authenticate (include method, selectors, credentials)
|
|
155
|
+
- File Operations: upload_file, wait_for_download, screenshot
|
|
156
|
+
- State Management: set_cookies
|
|
157
|
+
- Waiting: wait, await_human, await_keypress, await_browser_event
|
|
158
|
+
- Evaluation: evaluate
|
|
159
|
+
- Control Flow: loop
|
|
160
|
+
|
|
161
|
+
If no selectors are provided and full_page is False, the tool will still return the complete HTML body of the final page for your reference."""
|
|
162
|
+
args_schema = WebScrapingToolArgs
|
|
163
|
+
|
|
164
|
+
def __init__(
    self,
    browser: Literal['chrome', 'firefox', 'edge', 'safari', 'undetected'] = 'chrome',
    driver_type: Literal['selenium', 'playwright'] = 'selenium',
    full_page: bool = False,
    headless: bool = True,
    mobile: bool = False,
    mobile_device: Optional[str] = None,
    browser_binary: Optional[str] = None,
    driver_binary: Optional[str] = None,
    auto_install: bool = True,
    **kwargs
):
    """Initialize the scraping tool and its browser configuration.

    Args:
        browser: Browser backend to drive ('chrome', 'firefox', 'edge',
            'safari', or 'undetected').
        driver_type: Automation engine — 'selenium' or 'playwright'.
        full_page: Whether to capture the full page content by default.
        headless: Whether to run the browser without a visible window.
        mobile: Whether to emulate a mobile device.
        mobile_device: Name of the mobile device profile to emulate, if any.
        browser_binary: Explicit path to the browser executable, if any.
        driver_binary: Explicit path to the webdriver executable, if any.
        auto_install: Whether to auto-install the matching webdriver.
        **kwargs: Forwarded to ``AbstractTool`` and also merged into the
            browser configuration; recognized tuning keys are
            ``overlay_housekeeping``, ``default_timeout``,
            ``retry_attempts`` and ``delay_between_actions``.
    """
    super().__init__(**kwargs)
    self.driver_type = driver_type
    # Browser configuration handed to the driver setup (kwargs may override).
    self.browser_config = {
        'browser': browser,
        'headless': headless,
        'mobile': mobile,
        'mobile_device': mobile_device,
        'browser_binary': browser_binary,
        'driver_binary': driver_binary,
        'auto_install': auto_install,
        **kwargs
    }
    self.driver = None
    self.browser = None  # For Playwright
    self.page = None  # For Playwright
    self._full_page: bool = full_page
    self.results: List[ScrapingResult] = []
    # Allow turning overlay housekeeping on/off (default ON)
    self.overlay_housekeeping: bool = kwargs.get('overlay_housekeeping', True)
    # Configuration
    self.default_timeout = kwargs.get('default_timeout', 10)
    self.retry_attempts = kwargs.get('retry_attempts', 3)
    self.delay_between_actions = kwargs.get('delay_between_actions', 1)
    # extracted cookies and headers from Driver
    self.extracted_cookies: Dict[str, str] = {}
    self.extracted_headers: Dict[str, str] = {}
    # Fix: was annotated `str` with a None default — the attribute is optional.
    self.extracted_authorization: Optional[str] = None
    # Silence urllib3's per-request connection-pool noise.
    logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
|
|
206
|
+
|
|
207
|
+
async def _execute(
|
|
208
|
+
self,
|
|
209
|
+
steps: List[Dict[str, Any]],
|
|
210
|
+
selectors: Optional[List[Dict[str, Any]]] = None,
|
|
211
|
+
base_url: str = "",
|
|
212
|
+
browser_config: Optional[Dict[str, Any]] = None,
|
|
213
|
+
**kwargs
|
|
214
|
+
) -> Dict[str, Any]:
|
|
215
|
+
"""
|
|
216
|
+
Execute the web scraping workflow.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
steps: List of navigation/interaction steps
|
|
220
|
+
selectors: List of content selectors to do extraction
|
|
221
|
+
base_url: Base URL for relative links
|
|
222
|
+
|
|
223
|
+
Returns:
|
|
224
|
+
Dictionary with scraping results
|
|
225
|
+
"""
|
|
226
|
+
self.results = []
|
|
227
|
+
|
|
228
|
+
try:
|
|
229
|
+
await self.initialize_driver(
|
|
230
|
+
config_overrides=browser_config
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# Convert dictionaries to dataclasses
|
|
234
|
+
scraping_steps = [ScrapingStep.from_dict(step) for step in steps]
|
|
235
|
+
scraping_selectors = [ScrapingSelector(**sel) for sel in selectors] if selectors else None
|
|
236
|
+
|
|
237
|
+
# Execute scraping workflow
|
|
238
|
+
results = await self.execute_scraping_workflow(
|
|
239
|
+
scraping_steps,
|
|
240
|
+
scraping_selectors,
|
|
241
|
+
base_url
|
|
242
|
+
)
|
|
243
|
+
successful_scrapes = len([r for r in results if r.success])
|
|
244
|
+
return {
|
|
245
|
+
"status": "success" if successful_scrapes > 0 else "failed",
|
|
246
|
+
"result": [
|
|
247
|
+
{
|
|
248
|
+
"url": r.url,
|
|
249
|
+
"extracted_data": r.extracted_data,
|
|
250
|
+
"metadata": r.metadata,
|
|
251
|
+
"success": r.success,
|
|
252
|
+
"error_message": r.error_message,
|
|
253
|
+
"content": r.content
|
|
254
|
+
} for r in results
|
|
255
|
+
],
|
|
256
|
+
"metadata": {
|
|
257
|
+
"total_pages_scraped": len(results),
|
|
258
|
+
"successful_scrapes": successful_scrapes,
|
|
259
|
+
"browser_used": self.selenium_setup.browser,
|
|
260
|
+
"mobile_mode": self.selenium_setup.mobile,
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
except Exception as e:
|
|
265
|
+
self.logger.error(f"Scraping execution failed: {str(e)}")
|
|
266
|
+
return {
|
|
267
|
+
"status": "error",
|
|
268
|
+
"error": str(e),
|
|
269
|
+
"result": [],
|
|
270
|
+
"metadata": {
|
|
271
|
+
"browser_used": self.browser_config.get('browser', 'unknown'),
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
async def initialize_driver(self, config_overrides: Optional[Dict[str, Any]] = None):
|
|
276
|
+
"""Initialize the web driver based on configuration"""
|
|
277
|
+
if self.driver_type == 'selenium':
|
|
278
|
+
await self._setup_selenium(config_overrides)
|
|
279
|
+
elif self.driver_type == 'playwright' and PLAYWRIGHT_AVAILABLE:
|
|
280
|
+
await self._setup_playwright()
|
|
281
|
+
else:
|
|
282
|
+
raise ValueError(
|
|
283
|
+
f"Driver type '{self.driver_type}' not supported or not available"
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
async def _get_selenium_driver(self, config: Dict[str, Any]) -> webdriver.Chrome:
|
|
287
|
+
# Create Selenium setup
|
|
288
|
+
self.selenium_setup = SeleniumSetup(**config)
|
|
289
|
+
# Get the driver
|
|
290
|
+
return await self.selenium_setup.get_driver()
|
|
291
|
+
|
|
292
|
+
async def _setup_selenium(self, config_overrides: Optional[Dict[str, Any]] = None):
|
|
293
|
+
"""Setup Selenium WebDriver"""
|
|
294
|
+
final_config = self.browser_config.copy()
|
|
295
|
+
if config_overrides:
|
|
296
|
+
final_config.update(config_overrides)
|
|
297
|
+
self.driver = await self._get_selenium_driver(final_config)
|
|
298
|
+
# Attempt to capture from performance logs first
|
|
299
|
+
try:
|
|
300
|
+
# turn on CDP Network domain
|
|
301
|
+
self.driver.execute_cdp_cmd("Network.enable", {})
|
|
302
|
+
except Exception: # pragma: no cover - command may not exist
|
|
303
|
+
pass
|
|
304
|
+
return self.driver
|
|
305
|
+
|
|
306
|
+
async def _setup_playwright(self):
|
|
307
|
+
"""Setup Playwright browser"""
|
|
308
|
+
if not PLAYWRIGHT_AVAILABLE:
|
|
309
|
+
raise ImportError("Playwright is not installed. Install with: pip install playwright")
|
|
310
|
+
|
|
311
|
+
playwright = await async_playwright().start()
|
|
312
|
+
self.browser = await playwright.chromium.launch(
|
|
313
|
+
headless=self.browser_config.get('headless', True)
|
|
314
|
+
)
|
|
315
|
+
self.page = await self.browser.new_page()
|
|
316
|
+
await self.page.set_viewport_size({"width": 1920, "height": 1080})
|
|
317
|
+
|
|
318
|
+
async def execute_scraping_workflow(
|
|
319
|
+
self,
|
|
320
|
+
steps: List[ScrapingStep],
|
|
321
|
+
selectors: Optional[List[ScrapingSelector]] = None,
|
|
322
|
+
base_url: str = ""
|
|
323
|
+
) -> List[ScrapingResult]:
|
|
324
|
+
"""
|
|
325
|
+
Execute a complete scraping workflow
|
|
326
|
+
|
|
327
|
+
Args:
|
|
328
|
+
steps: List of navigation/interaction steps
|
|
329
|
+
selectors: List of content selectors to extract
|
|
330
|
+
base_url: Base URL for relative links
|
|
331
|
+
|
|
332
|
+
Returns:
|
|
333
|
+
List of ScrapingResult objects
|
|
334
|
+
"""
|
|
335
|
+
self.results = []
|
|
336
|
+
|
|
337
|
+
try:
|
|
338
|
+
# Execute each step in sequence
|
|
339
|
+
for i, step in enumerate(steps):
|
|
340
|
+
self.logger.info(f"Executing step {i+1}/{len(steps)}: {step.description}")
|
|
341
|
+
print(' DEBUG STEP > ', step, base_url)
|
|
342
|
+
try:
|
|
343
|
+
success = await self._execute_step(step, base_url)
|
|
344
|
+
except TimeoutError:
|
|
345
|
+
self.logger.error(f"Step timed out: {step.description}")
|
|
346
|
+
success = False
|
|
347
|
+
break
|
|
348
|
+
|
|
349
|
+
if not success and step.action in ['navigate', 'authenticate']:
|
|
350
|
+
# Critical steps - abort if they fail
|
|
351
|
+
self.logger.error(
|
|
352
|
+
f"Critical step failed: {step.description}"
|
|
353
|
+
)
|
|
354
|
+
break
|
|
355
|
+
|
|
356
|
+
# Add delay between actions
|
|
357
|
+
await asyncio.sleep(self.delay_between_actions)
|
|
358
|
+
|
|
359
|
+
# Extract content using selectors
|
|
360
|
+
if selectors:
|
|
361
|
+
current_url = await self._get_current_url()
|
|
362
|
+
result = await self._extract_content(current_url, selectors)
|
|
363
|
+
if result:
|
|
364
|
+
self.results.append(result)
|
|
365
|
+
else:
|
|
366
|
+
# When no selectors provided, always extract full page content
|
|
367
|
+
# This ensures the tool returns the HTML body for reference
|
|
368
|
+
current_url = await self._get_current_url()
|
|
369
|
+
result = await self._extract_full_content(current_url)
|
|
370
|
+
if result:
|
|
371
|
+
self.results.append(result)
|
|
372
|
+
# and extract the headers, authorization and cookies
|
|
373
|
+
try:
|
|
374
|
+
self.extracted_headers = self._extract_headers()
|
|
375
|
+
self.extracted_authorization = self._extract_authorization()
|
|
376
|
+
self.extracted_cookies = self._collect_cookies()
|
|
377
|
+
except Exception as e:
|
|
378
|
+
self.logger.error(
|
|
379
|
+
f"Error extracting headers, authorization, or cookies: {str(e)}"
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
except Exception as e:
|
|
383
|
+
self.logger.error(f"Scraping workflow failed: {str(e)}")
|
|
384
|
+
# Create error result
|
|
385
|
+
error_result = ScrapingResult(
|
|
386
|
+
url="",
|
|
387
|
+
content="",
|
|
388
|
+
bs_soup=BeautifulSoup("", 'html.parser'),
|
|
389
|
+
success=False,
|
|
390
|
+
error_message=str(e)
|
|
391
|
+
)
|
|
392
|
+
self.results.append(error_result)
|
|
393
|
+
|
|
394
|
+
finally:
|
|
395
|
+
await self.cleanup()
|
|
396
|
+
|
|
397
|
+
return self.results
|
|
398
|
+
|
|
399
|
+
async def _execute_step(self, step: ScrapingStep, base_url: str = "", args: dict = None) -> bool:
|
|
400
|
+
"""Execute a single scraping step with a hard timeout per step."""
|
|
401
|
+
action = step.action
|
|
402
|
+
action_type = action.get_action_type()
|
|
403
|
+
result = None
|
|
404
|
+
try:
|
|
405
|
+
if action_type == 'navigate':
|
|
406
|
+
result = await self._navigate_to(action, base_url)
|
|
407
|
+
elif action_type == 'click':
|
|
408
|
+
result = await self._click(
|
|
409
|
+
action,
|
|
410
|
+
timeout=action.timeout or self.default_timeout
|
|
411
|
+
)
|
|
412
|
+
elif action_type == 'fill':
|
|
413
|
+
result = await self._fill(action)
|
|
414
|
+
elif action_type == 'select':
|
|
415
|
+
result = await self._select(action)
|
|
416
|
+
elif action_type == 'evaluate':
|
|
417
|
+
result = await self._evaluate_js(action)
|
|
418
|
+
elif action_type == 'await_human':
|
|
419
|
+
result = await self._await_human(action)
|
|
420
|
+
elif action_type == 'press_key':
|
|
421
|
+
result = await self._press_key(action)
|
|
422
|
+
elif action_type == 'refresh':
|
|
423
|
+
result = await self._handle_refresh(action)
|
|
424
|
+
elif action_type == 'back':
|
|
425
|
+
result = await self._handle_back(action)
|
|
426
|
+
elif action_type == 'get_cookies':
|
|
427
|
+
result = await self._get_cookies(action)
|
|
428
|
+
elif action_type == 'set_cookies':
|
|
429
|
+
result = await self._set_cookies(action)
|
|
430
|
+
elif action_type == 'get_text':
|
|
431
|
+
result = await self._get_text(action)
|
|
432
|
+
elif action_type == 'get_html':
|
|
433
|
+
result = await self._get_html(action, args)
|
|
434
|
+
elif action_type == 'screenshot':
|
|
435
|
+
result = await self._take_screenshot(action)
|
|
436
|
+
elif action_type == 'wait_for_download':
|
|
437
|
+
result = await self._wait_for_download(action)
|
|
438
|
+
elif action_type == 'upload_file':
|
|
439
|
+
result = await self._upload_file(action)
|
|
440
|
+
elif action_type == 'await_keypress':
|
|
441
|
+
try:
|
|
442
|
+
result = await self._await_keypress(action)
|
|
443
|
+
except TimeoutError:
|
|
444
|
+
raise
|
|
445
|
+
elif action_type == 'await_browser_event':
|
|
446
|
+
try:
|
|
447
|
+
result = await self._await_browser_event(action)
|
|
448
|
+
except TimeoutError:
|
|
449
|
+
raise
|
|
450
|
+
elif action_type == 'wait':
|
|
451
|
+
result = await self._wait_for_condition(
|
|
452
|
+
action,
|
|
453
|
+
step.action.timeout or self.default_timeout
|
|
454
|
+
)
|
|
455
|
+
elif action_type == 'scroll':
|
|
456
|
+
result = await self._scroll_page(action)
|
|
457
|
+
elif action_type == 'authenticate':
|
|
458
|
+
result = await self._handle_authentication(action)
|
|
459
|
+
elif action_type == 'loop':
|
|
460
|
+
result = await self._exec_loop(action, base_url)
|
|
461
|
+
elif action_type == 'conditional':
|
|
462
|
+
result = await self._exec_conditional(action, base_url, args)
|
|
463
|
+
else:
|
|
464
|
+
self.logger.warning(f"Unknown action: {step.action}")
|
|
465
|
+
return False
|
|
466
|
+
return result
|
|
467
|
+
except asyncio.TimeoutError:
|
|
468
|
+
self.logger.error(f"Step timed out: {step.description or step.action}")
|
|
469
|
+
return False
|
|
470
|
+
except Exception as e:
|
|
471
|
+
self.logger.error(f"Step execution failed: {step.action} - {str(e)}")
|
|
472
|
+
return False
|
|
473
|
+
|
|
474
|
+
    async def _select_option(
        self,
        selector: str,
        value: Optional[str] = None,
        text: Optional[str] = None,
        index: Optional[int] = None,
        by: str = 'value',
        blur_after: bool = True,
        wait_after_select: Optional[str] = None,
        wait_timeout: int = 2
    ) -> bool:
        """Select an option from a dropdown/select element.

        Args:
            selector: CSS selector of the <select> element.
            value/text/index: the option key, matched according to *by*.
            by: one of 'value', 'text', 'index' (anything else is a no-op
                selection — no option is chosen).
            blur_after: dispatch synthetic change + blur events after the
                selection, for pages that react to those events.
            wait_after_select: optional CSS selector to wait for afterwards.
            wait_timeout: seconds to wait for *wait_after_select*.

        Returns:
            True (failures inside the post-select wait are only logged).
        """

        if self.driver_type == 'selenium':
            from selenium.webdriver.support.ui import Select as SeleniumSelect

            loop = asyncio.get_running_loop()

            def select_sync():
                # Wait for select element to be present
                element = WebDriverWait(
                    self.driver,
                    self.default_timeout,
                    poll_frequency=0.25
                ).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )

                # Create Select object
                select = SeleniumSelect(element)

                # Perform selection based on method
                if by == 'value':
                    select.select_by_value(value)
                elif by == 'text':
                    select.select_by_visible_text(text)
                elif by == 'index':
                    select.select_by_index(index)

                # Trigger blur/change events if requested
                if blur_after:
                    # Trigger change event
                    self.driver.execute_script(
                        "arguments[0].dispatchEvent(new Event('change', { bubbles: true }));",
                        element
                    )
                    # Trigger blur event
                    self.driver.execute_script(
                        "arguments[0].blur();",
                        element
                    )

                # Wait for post-select element if specified
                if wait_after_select:
                    try:
                        WebDriverWait(self.driver, wait_timeout).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, wait_after_select))
                        )
                        self.logger.debug(f"Post-select element found: {wait_after_select}")
                    except TimeoutException:
                        self.logger.warning(
                            f"Post-select wait timed out: {wait_after_select}"
                        )

            # Selenium calls are blocking; run the whole sequence off-loop.
            await loop.run_in_executor(None, select_sync)
            return True

        else:  # Playwright
            # Playwright has built-in select support
            if by == 'value':
                await self.page.select_option(selector, value=value)
            elif by == 'text':
                await self.page.select_option(selector, label=text)
            elif by == 'index':
                await self.page.select_option(selector, index=index)

            # Trigger blur/change events if requested
            # NOTE(review): *selector* is interpolated into this JS string —
            # a selector containing a single quote would break the script.
            if blur_after:
                await self.page.evaluate(f"""
                    const select = document.querySelector('{selector}');
                    select.dispatchEvent(new Event('change', {{ bubbles: true }}));
                    select.blur();
                """)

            # Wait for post-select element if specified
            if wait_after_select:
                try:
                    await self.page.wait_for_selector(
                        wait_after_select,
                        timeout=wait_timeout * 1000
                    )
                    self.logger.debug(f"Post-select element found: {wait_after_select}")
                except Exception:
                    self.logger.warning(
                        f"Post-select wait timed out: {wait_after_select}"
                    )

            return True
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
async def _select(self, action: Select):
|
|
575
|
+
"""Handle select action"""
|
|
576
|
+
return await self._select_option(
|
|
577
|
+
selector=action.selector,
|
|
578
|
+
value=action.value,
|
|
579
|
+
text=action.text,
|
|
580
|
+
index=action.index,
|
|
581
|
+
by=action.by,
|
|
582
|
+
blur_after=action.blur_after,
|
|
583
|
+
wait_after_select=action.wait_after_select,
|
|
584
|
+
wait_timeout=action.wait_timeout
|
|
585
|
+
)
|
|
586
|
+
|
|
587
|
+
async def _evaluate_js(self, action: Evaluate) -> Any:
|
|
588
|
+
"""Handle Evaluate action"""
|
|
589
|
+
script = action.script
|
|
590
|
+
|
|
591
|
+
# Load script from file if specified
|
|
592
|
+
if action.script_file:
|
|
593
|
+
with open(action.script_file, 'r') as f:
|
|
594
|
+
script = f.read()
|
|
595
|
+
|
|
596
|
+
if not script:
|
|
597
|
+
self.logger.warning(
|
|
598
|
+
"No script provided for Evaluate action"
|
|
599
|
+
)
|
|
600
|
+
return False
|
|
601
|
+
|
|
602
|
+
if self.driver_type == 'selenium':
|
|
603
|
+
loop = asyncio.get_running_loop()
|
|
604
|
+
result = await loop.run_in_executor(
|
|
605
|
+
None,
|
|
606
|
+
lambda: self.driver.execute_script(script, *action.args)
|
|
607
|
+
)
|
|
608
|
+
else: # Playwright
|
|
609
|
+
result = await self.page.evaluate(script, *action.args)
|
|
610
|
+
|
|
611
|
+
return result if action.return_value else True
|
|
612
|
+
|
|
613
|
+
async def _press_key(self, action: PressKey) -> bool:
|
|
614
|
+
"""Handle PressKey action"""
|
|
615
|
+
# Focus on target element if specified
|
|
616
|
+
if action.target:
|
|
617
|
+
if self.driver_type == 'selenium':
|
|
618
|
+
element = self.driver.find_element(By.CSS_SELECTOR, action.target)
|
|
619
|
+
element.click()
|
|
620
|
+
else:
|
|
621
|
+
await self.page.focus(action.target)
|
|
622
|
+
|
|
623
|
+
# Press keys
|
|
624
|
+
for key in action.keys:
|
|
625
|
+
if self.driver_type == 'selenium':
|
|
626
|
+
key_obj = getattr(Keys, key.upper(), key)
|
|
627
|
+
if action.target:
|
|
628
|
+
element.send_keys(key_obj)
|
|
629
|
+
else:
|
|
630
|
+
self.driver.switch_to.active_element.send_keys(key_obj)
|
|
631
|
+
else: # Playwright
|
|
632
|
+
await self.page.keyboard.press(key)
|
|
633
|
+
|
|
634
|
+
return True
|
|
635
|
+
|
|
636
|
+
async def _handle_refresh(self, action: Refresh) -> bool:
|
|
637
|
+
"""Handle Refresh action"""
|
|
638
|
+
if self.driver_type == 'selenium':
|
|
639
|
+
loop = asyncio.get_running_loop()
|
|
640
|
+
if action.hard:
|
|
641
|
+
await loop.run_in_executor(
|
|
642
|
+
None,
|
|
643
|
+
lambda: self.driver.execute_script("location.reload(true)")
|
|
644
|
+
)
|
|
645
|
+
else:
|
|
646
|
+
await loop.run_in_executor(None, self.driver.refresh)
|
|
647
|
+
else: # Playwright
|
|
648
|
+
await self.page.reload(wait_until='domcontentloaded')
|
|
649
|
+
|
|
650
|
+
return True
|
|
651
|
+
|
|
652
|
+
async def _handle_back(self, action: Back) -> bool:
|
|
653
|
+
"""Handle Back action"""
|
|
654
|
+
for _ in range(action.steps):
|
|
655
|
+
if self.driver_type == 'selenium':
|
|
656
|
+
loop = asyncio.get_running_loop()
|
|
657
|
+
await loop.run_in_executor(None, self.driver.back)
|
|
658
|
+
else: # Playwright
|
|
659
|
+
await self.page.go_back()
|
|
660
|
+
|
|
661
|
+
return True
|
|
662
|
+
|
|
663
|
+
    async def _post_navigate_housekeeping(self):
        """Best-effort, non-blocking overlay dismissal. Never stalls navigation.

        Tries a fixed list of common cookie-banner / promo-overlay close
        selectors. Every failure is swallowed: the worst case is that the
        overlay stays up, never that navigation blocks.
        """
        # Selectors for close buttons of widely-used overlay widgets.
        selectors = [
            ".c-close-icon",
            "button#attn-overlay-close",
            "button[aria-label*='Close']",
            "button[aria-label*='close']",
            "button[aria-label*='Dismiss']",
            "#onetrust-accept-btn-handler",
            ".oci-accept-button",
        ]

        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()

            def quick_dismiss():
                # Returns the number of elements clicked (currently unused).
                clicked = 0
                for sel in selectors:
                    try:
                        # No waits—instant check
                        els = self.driver.find_elements(By.CSS_SELECTOR, sel)
                        if not els:
                            continue
                        # Try first two matches at most
                        for el in els[:2]:
                            try:
                                el.click()
                                clicked += 1
                            except Exception:
                                # Native click failed (e.g. obscured) —
                                # fall back to a JS click.
                                try:
                                    self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
                                    self.driver.execute_script("arguments[0].click();", el)
                                    clicked += 1
                                except Exception:
                                    continue
                    except Exception:
                        continue
                return clicked

            # Run quickly in executor; don't care about result
            try:
                await asyncio.wait_for(
                    loop.run_in_executor(None, quick_dismiss), timeout=1.0
                )
            except Exception:
                pass

        else:
            # Playwright: tiny timeouts; ignore errors
            for sel in selectors:
                try:
                    await self.page.click(sel, timeout=300)  # 0.3s max per selector
                except Exception:
                    continue
|
|
717
|
+
|
|
718
|
+
def _session_alive(self) -> bool:
|
|
719
|
+
"""Cheap ping to confirm the driver session is alive."""
|
|
720
|
+
try:
|
|
721
|
+
# current_url is a lightweight call; will raise if session is gone
|
|
722
|
+
_ = self.driver.current_url if self.driver_type == 'selenium' else self.page.url
|
|
723
|
+
return True
|
|
724
|
+
except Exception:
|
|
725
|
+
return False
|
|
726
|
+
|
|
727
|
+
    async def _navigate_to(self, action: Navigate, base_url: str):
        """Open ``action.url`` (resolved against *base_url* when given), then
        optionally run a short, best-effort overlay-dismissal pass.

        Returns True; navigation errors from the driver propagate to the
        caller (``_execute_step`` converts them to a failed step).
        """
        url = urljoin(base_url, action.url) if base_url else action.url
        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()
            # driver.get is blocking — keep it off the event loop.
            await loop.run_in_executor(None, self.driver.get, url)
            if self.overlay_housekeeping:
                try:
                    current = self.driver.current_url
                    host = (urlparse(current).hostname or "").lower()
                    # TODO create a whitelist of hosts where overlays are common
                    if host and any(x in host for x in ['bestbuy', 'amazon', 'ebay', 'walmart', 'target']):
                        try:
                            # Hard cap so housekeeping can never stall navigation.
                            await asyncio.wait_for(
                                self._post_navigate_housekeeping(), timeout=1.25
                            )
                        except Exception:
                            pass
                except Exception:
                    pass
        else:
            await self.page.goto(url, wait_until='domcontentloaded')
            if self.overlay_housekeeping:
                try:
                    await asyncio.wait_for(self._post_navigate_housekeeping(), timeout=1.25)
                except Exception:
                    pass
        return True
|
|
754
|
+
|
|
755
|
+
def js_click(self, driver, element):
|
|
756
|
+
try:
|
|
757
|
+
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
|
|
758
|
+
driver.execute_script("arguments[0].click();", element)
|
|
759
|
+
return True
|
|
760
|
+
except Exception:
|
|
761
|
+
return False
|
|
762
|
+
|
|
763
|
+
async def _click_element(
|
|
764
|
+
self,
|
|
765
|
+
selector: str,
|
|
766
|
+
timeout: Optional[int] = None
|
|
767
|
+
):
|
|
768
|
+
"""Click an element by selector."""
|
|
769
|
+
wait = WebDriverWait(
|
|
770
|
+
self.driver,
|
|
771
|
+
timeout or self.default_timeout,
|
|
772
|
+
poll_frequency=0.25
|
|
773
|
+
)
|
|
774
|
+
try:
|
|
775
|
+
el = wait.until(
|
|
776
|
+
EC.presence_of_element_located(
|
|
777
|
+
(By.CSS_SELECTOR, selector)
|
|
778
|
+
)
|
|
779
|
+
)
|
|
780
|
+
el.click()
|
|
781
|
+
except Exception:
|
|
782
|
+
# fallback to JS click
|
|
783
|
+
try:
|
|
784
|
+
self.js_click(self.driver, el)
|
|
785
|
+
except Exception:
|
|
786
|
+
return False
|
|
787
|
+
|
|
788
|
+
    async def _click(self, action: Click, timeout: Optional[int] = None) -> bool:
        """
        Enhanced click method supporting CSS, XPath, and text-based selection.

        Text selectors support three prefixes: ``=text`` exact match,
        ``~text`` case-insensitive contains, and bare ``text`` for a
        case-sensitive contains match.

        Args:
            action: Click action with selector and options
            timeout: Optional timeout override

        Returns:
            bool: True if click successful
        """
        selector = action.selector
        selector_type = action.selector_type
        timeout = timeout or action.timeout or self.default_timeout

        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()

            def click_sync():
                # Determine the locator strategy based on selector_type
                if selector_type == 'xpath':
                    by_type = By.XPATH
                    locator = selector
                elif selector_type == 'text':
                    # Convert text search to XPath
                    # Supports exact match, contains, and case-insensitive
                    if selector.startswith('='):
                        # Exact match: =Filters
                        text = selector[1:]
                        by_type = By.XPATH
                        locator = f"//*[normalize-space(text())='{text}']"
                    elif selector.startswith('~'):
                        # Case-insensitive contains: ~filters
                        text = selector[1:].lower()
                        by_type = By.XPATH
                        locator = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text}')]"
                    else:
                        # Default: contains (case-sensitive)
                        by_type = By.XPATH
                        locator = f"//*[contains(text(), '{selector}')]"
                else:  # css (default)
                    by_type = By.CSS_SELECTOR
                    locator = selector

                self.logger.debug(f"Clicking element: {by_type}='{locator}'")

                wait = WebDriverWait(
                    self.driver,
                    timeout,
                    poll_frequency=0.25
                )

                # Wait for element to be present
                try:
                    element = wait.until(
                        EC.presence_of_element_located((by_type, locator))
                    )
                except Exception as e:
                    self.logger.error(f"Element not found: {by_type}='{locator}'")
                    raise

                # Try regular click first
                try:
                    # Wait for element to be clickable
                    element = wait.until(
                        EC.element_to_be_clickable((by_type, locator))
                    )
                    element.click()
                    self.logger.debug(f"Click performed on: {locator}")
                except Exception:
                    # Fallback to JS click
                    try:
                        self.logger.debug("Regular click failed, trying JS click")
                        self.js_click(self.driver, element)
                    except Exception as e:
                        self.logger.error(f"Both click methods failed: {str(e)}")
                        raise

                # Handle post-click waiting
                if action.no_wait:
                    self.logger.debug("no_wait=True, skipping post-click wait")
                    return True
                elif action.wait_after_click:
                    # Wait for specified element to appear
                    try:
                        WebDriverWait(
                            self.driver,
                            action.wait_timeout or self.default_timeout,
                            poll_frequency=0.25
                        ).until(
                            EC.presence_of_element_located(
                                (By.CSS_SELECTOR, action.wait_after_click)
                            )
                        )
                        self.logger.debug(f"Post-click element found: {action.wait_after_click}")
                    except Exception:
                        # Missing follow-up element is only a warning, not a
                        # click failure.
                        self.logger.warning(
                            f"Post-click wait element not found: {action.wait_after_click}"
                        )
                else:
                    # Default: small sleep to allow any navigation/JS to start
                    time.sleep(0.5)

                return True

            await loop.run_in_executor(None, click_sync)
            return True

        else:  # Playwright
            if selector_type == 'xpath':
                # Playwright supports XPath directly
                await self.page.click(f"xpath={selector}", timeout=timeout * 1000)
            elif selector_type == 'text':
                # Playwright has native text selection
                if selector.startswith('='):
                    # Exact text match
                    text = selector[1:]
                    await self.page.click(f"text={text}", timeout=timeout * 1000)
                elif selector.startswith('~'):
                    # Case-insensitive (Playwright doesn't have built-in, use XPath)
                    text = selector[1:].lower()
                    xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text}')]"
                    await self.page.click(f"xpath={xpath}", timeout=timeout * 1000)
                else:
                    # Contains (partial match)
                    await self.page.click(f"text={selector}", timeout=timeout * 1000)
            else:
                # CSS selector
                await self.page.click(selector, timeout=timeout * 1000)

            # Handle post-click waiting for Playwright
            if action.no_wait:
                self.logger.debug("no_wait=True, skipping post-click wait")
            elif action.wait_after_click:
                try:
                    await self.page.wait_for_selector(
                        action.wait_after_click,
                        timeout=(action.wait_timeout or self.default_timeout) * 1000
                    )
                    self.logger.debug(f"Post-click element found: {action.wait_after_click}")
                except Exception:
                    self.logger.warning(
                        f"Post-click wait timed out: {action.wait_after_click}"
                    )

            return True
|
|
934
|
+
|
|
935
|
+
    async def _fill_element(
        self,
        selector: Any,
        value: str,
        selector_type: str = 'css',
        clear_first: bool = False,
        press_enter: bool = False
    ) -> bool:
        """Fill an input element.

        Args:
            selector: element locator; interpreted per *selector_type*
                ('css', 'xpath', or 'text' — text matches a label or
                placeholder, with an optional leading '=' for exact label).
            value: text to type into the element.
            selector_type: locator strategy, default 'css'.
            clear_first: clear the field before typing (selenium path only).
            press_enter: send Enter after typing.

        Returns:
            True (locator failures raise out of the wait).
        """
        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()
            def fill_sync():
                if selector_type == 'xpath':
                    by_type = By.XPATH
                    locator = selector
                elif selector_type == 'text':
                    # Convert text to XPath for form fields
                    by_type = By.XPATH
                    if selector.startswith('='):
                        text = selector[1:]
                        # Find input with label containing text
                        locator = f"//label[contains(text(), '{text}')]/following-sibling::input | //input[@placeholder='{text}']"
                    else:
                        locator = f"//label[contains(text(), '{selector}')]/following-sibling::input | //input[@placeholder='{selector}']"
                else:
                    by_type = By.CSS_SELECTOR
                    locator = selector
                element = WebDriverWait(
                    self.driver,
                    self.default_timeout,
                    poll_frequency=0.25
                ).until(
                    EC.presence_of_element_located((by_type, locator))
                )
                if clear_first:
                    element.clear()
                element.send_keys(value)
                if press_enter:
                    element.send_keys(Keys.ENTER)
            # Selenium calls are blocking; run the whole fill off-loop.
            await loop.run_in_executor(None, fill_sync)
            return True
        else:  # Playwright
            if selector_type == 'xpath':
                await self.page.fill(f"xpath={selector}", value)
            elif selector_type == 'text':
                # Playwright text selector for inputs
                if selector.startswith('='):
                    text = selector[1:]
                    await self.page.fill(f"text={text}", value)
                else:
                    await self.page.fill(f"text={selector}", value)
            else:
                await self.page.fill(selector, value)

            if press_enter:
                await self.page.keyboard.press('Enter')

            return True
|
|
993
|
+
|
|
994
|
+
async def _fill(self, action: Fill):
|
|
995
|
+
"""Fill an input element"""
|
|
996
|
+
selector = action.selector
|
|
997
|
+
value = action.value
|
|
998
|
+
clear_first = action.clear_first
|
|
999
|
+
press_enter = action.press_enter
|
|
1000
|
+
selector_type = getattr(action, 'selector_type', 'css')
|
|
1001
|
+
return await self._fill_element(
|
|
1002
|
+
selector, value,
|
|
1003
|
+
selector_type=selector_type,
|
|
1004
|
+
clear_first=clear_first,
|
|
1005
|
+
press_enter=press_enter
|
|
1006
|
+
)
|
|
1007
|
+
|
|
1008
|
+
async def _wait_for_condition(self, action: Wait, timeout: int = 5):
    """
    Wait for a specific condition to be met.
    Handles multiple selectors separated by commas.

    Resolution order (Selenium path):
      1. ``action.condition_type`` ('simple', 'url_contains', 'url_is',
         'selector') — handled first and returns/raises immediately.
      2. Otherwise, a prefixed condition string such as
         ``presence_of_element_located:<css>``, ``element_to_be_clickable:<css>``,
         ``text_to_be_present:<text>``, ``invisibility_of_element:<css>``.
      3. Otherwise, the condition is treated as comma-separated plain CSS
         selectors, polled via a fast JS ``querySelectorAll`` loop.

    Args:
        action: Wait action carrying ``condition`` and ``condition_type``.
        timeout: seconds to wait before raising TimeoutException.

    Returns:
        True when the condition is satisfied.

    Raises:
        RuntimeError: if the Selenium session is no longer alive.
        TimeoutException: if none of the selectors/conditions matched in time.
    """
    condition = action.condition
    if self.driver_type == 'selenium':
        loop = asyncio.get_running_loop()

        def wait_sync():
            # Fail fast if session died
            try:
                _ = self.driver.current_url
            except Exception as e:
                raise RuntimeError(
                    f"Selenium session not alive: {e}"
                ) from e
            if action.condition_type == 'simple':
                # do a simple wait of N.seconds:
                # NOTE: sleeps for `timeout` (not `condition`) — fixed pause.
                time.sleep(int(timeout))
                return True
            elif action.condition_type == 'url_contains':
                WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
                    EC.url_contains(condition)
                )
                self.logger.debug(f"URL contains: {condition}")
                return True
            elif action.condition_type == 'url_is':
                WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
                    EC.url_to_be(condition)
                )
                self.logger.debug(f"URL is: {condition}")
                return True
            elif action.condition_type == 'selector':
                # Check if selector is present.
                # Each selector gets the full `timeout`, so the worst case
                # is timeout * len(selectors) before the final raise.
                selectors = [s.strip() for s in condition.split(',')]
                for selector in selectors:
                    try:
                        WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                        )
                        self.logger.debug(f"Element found: {selector}")
                        return True
                    except TimeoutException:
                        if selector == selectors[-1]:  # Last selector
                            raise TimeoutException(f"None of the selectors found: {selectors}")
                        continue  # Try next selector

            # Handle prefixed conditions
            # (reached only when condition_type matched none of the above)
            if condition.startswith('presence_of_element_located:'):
                selectors_str = condition.split(':', 1)[1]
                selectors = [s.strip() for s in selectors_str.split(',')]

                # Try each selector until one works
                for selector in selectors:
                    try:
                        WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                        )
                        self.logger.debug(f"Element found: {selector}")
                        return True  # IMPORTANT: Return immediately when found
                    except TimeoutException:
                        if selector == selectors[-1]:  # Last selector
                            raise TimeoutException(f"None of the selectors found: {selectors}")
                        continue  # Try next selector

            elif condition.startswith('element_to_be_clickable:'):
                selectors_str = condition.split(':', 1)[1]
                selectors = [s.strip() for s in selectors_str.split(',')]

                for selector in selectors:
                    try:
                        WebDriverWait(self.driver, timeout).until(
                            EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                        )
                        self.logger.debug(f"Clickable element found: {selector}")
                        return True  # Return immediately
                    except TimeoutException:
                        if selector == selectors[-1]:
                            raise TimeoutException(f"None of the selectors clickable: {selectors}")
                        continue

            elif condition.startswith('text_to_be_present:'):
                # Searches the whole <body> for the literal text.
                text = condition.split(':', 1)[1]
                WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
                    EC.text_to_be_present_in_element((By.TAG_NAME, "body"), text)
                )
                self.logger.debug(f"Text found: {text}")
                return True  # Return immediately

            elif condition.startswith('invisibility_of_element:'):
                selector = condition.split(':', 1)[1]
                WebDriverWait(self.driver, timeout).until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, selector))
                )
                self.logger.debug(f"Element invisible: {selector}")
                return True  # Return immediately

            else:
                # DEFAULT: Plain CSS selector(s) - use fast JS polling
                # Avoids WebDriverWait overhead by asking the page directly
                # how many nodes match each selector.
                selectors = [s.strip() for s in condition.split(',')]
                deadline = time.monotonic() + timeout
                while time.monotonic() < deadline:
                    for selector in selectors:
                        try:
                            count = self.driver.execute_script(
                                "return document.querySelectorAll(arguments[0]).length;",
                                selector
                            )
                            if isinstance(count, int) and count > 0:
                                self.logger.debug(f"Element found via JS: {selector}")
                                return True  # Return immediately when found
                        except Exception:
                            pass
                    time.sleep(0.15)  # Small delay before retry
                # Timeout reached
                raise TimeoutException(f"Timeout waiting for selectors: {selectors}")

        # Execute and return result
        # (the blocking Selenium waits run off the event loop)
        result = await loop.run_in_executor(None, wait_sync)
        return result

    else:  # Playwright
        if condition.startswith('presence_of_element_located:'):
            selectors_str = condition.replace('presence_of_element_located:', '')
            selectors = [s.strip() for s in selectors_str.split(',')]

            # Try each selector
            for selector in selectors:
                try:
                    # Playwright timeouts are milliseconds.
                    await self.page.wait_for_selector(selector, timeout=timeout * 1000)
                    self.logger.debug(f"Playwright found: {selector}")
                    return True
                except Exception:
                    if selector == selectors[-1]:
                        raise
                    continue

        elif condition.startswith('text_to_be_present:'):
            text = condition.replace('text_to_be_present:', '')
            # NOTE(review): `text` is interpolated into a JS string literal;
            # a condition containing a single quote would break the snippet.
            await self.page.wait_for_function(
                f"document.body.textContent.includes('{text}')",
                timeout=timeout * 1000
            )
            return True

        else:
            # Try multiple selectors if comma-separated
            selectors = [s.strip() for s in condition.split(',')]
            for selector in selectors:
                try:
                    await self.page.wait_for_selector(selector, timeout=timeout * 1000)
                    return True
                except Exception:
                    if selector == selectors[-1]:
                        raise
                    continue

        # Fallback: all handled branches above return or raise.
        return True
|
|
1167
|
+
|
|
1168
|
+
async def _get_text(self, action: GetText) -> bool:
    """
    Extract pure text content from elements and save to results.

    Fetches the current page source, parses it with BeautifulSoup, and
    selects elements via CSS selector only (XPath is not supported here;
    see ``_get_html`` for XPath). The extracted text is appended to
    ``self.results`` as a ScrapingResult keyed by ``action.extract_name``.

    Args:
        action: GetText action with selector and options

    Returns:
        bool: True if extraction successful
        Note: returns True even when no elements matched; in that case the
        appended result carries ``success=False`` and ``None`` data.
    """
    try:
        # Get current URL
        current_url = await self._get_current_url()

        # Get page source
        if self.driver_type == 'selenium':
            # page_source is a blocking property; read it off the loop.
            loop = asyncio.get_running_loop()
            page_source = await loop.run_in_executor(None, lambda: self.driver.page_source)
        else:  # Playwright
            page_source = await self.page.content()

        # Parse with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find elements by selector
        elements = soup.select(action.selector)

        if not elements:
            self.logger.warning(f"No elements found for selector: {action.selector}")
            extracted_text = None
        elif action.multiple:
            # Extract text from all matching elements
            extracted_text = [elem.get_text(strip=True) for elem in elements]
        else:
            # Extract text from first element only
            extracted_text = elements[0].get_text(strip=True)

        # Create ScrapingResult and append to results
        result = ScrapingResult(
            url=current_url,
            content=page_source,
            bs_soup=soup,
            extracted_data={action.extract_name: extracted_text},
            metadata={
                "selector": action.selector,
                "multiple": action.multiple,
                "elements_found": len(elements)
            },
            timestamp=str(time.time()),
            success=extracted_text is not None
        )

        self.results.append(result)
        self.logger.info(
            f"Extracted text from {len(elements)} element(s) using selector: {action.selector}"
        )

        return True

    except Exception as e:
        self.logger.error(f"GetText action failed: {str(e)}")
        # Create error result
        # NOTE(review): _get_current_url may itself raise here if the
        # driver died; that would propagate out of this handler.
        error_result = ScrapingResult(
            url=await self._get_current_url() if hasattr(self, 'driver') or hasattr(self, 'page') else "",
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={action.extract_name: None},
            success=False,
            error_message=str(e),
            timestamp=str(time.time())
        )
        self.results.append(error_result)
        return False
|
|
1241
|
+
|
|
1242
|
+
|
|
1243
|
+
async def _get_html(self, action: GetHTML, args: dict) -> bool:
    """
    Extract complete HTML content from elements and save to results.

    Supports CSS selectors (BeautifulSoup) and XPath (via lxml, converted
    back to BeautifulSoup). When ``action.multiple`` is true, one
    ScrapingResult is appended per matched element; otherwise a single
    result holding the first match's HTML is appended.

    Args:
        action: GetHTML action with selector and options
        args: Additional arguments for the action (may carry 'data' and
            'iteration' used to annotate per-element results)

    Returns:
        bool: True if extraction successful
    """
    try:
        # Get current URL
        current_url = await self._get_current_url()

        # Get page source
        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()
            page_source = await loop.run_in_executor(None, lambda: self.driver.page_source)
        else:  # Playwright
            page_source = await self.page.content()

        # Parse with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Handle different selector types
        selector_type = getattr(action, 'selector_type', 'css')

        # Find elements by selector
        if selector_type == 'xpath':
            # Use lxml for XPath support
            tree = lxml_html.fromstring(page_source)
            elements_lxml = tree.xpath(action.selector)

            # Convert lxml elements back to BeautifulSoup for consistency
            elements = []
            for elem in elements_lxml:
                html_str = lxml_html.tostring(elem, encoding='unicode')
                elements.append(BeautifulSoup(html_str, 'html.parser'))
        else:
            # CSS selector (default)
            elements = soup.select(action.selector)

        if not elements:
            self.logger.warning(f"No elements found for selector: {action.selector}")
            extracted_html = None

        # Extract HTML from all matching elements
        elif action.multiple:
            for elem in elements:
                # generate one scrapping result per element:
                # (xpath path already yields BeautifulSoup objects; css
                # path yields Tag objects that need re-parsing)
                elem_bs = elem if isinstance(elem, BeautifulSoup) else BeautifulSoup(str(elem), 'html.parser')
                data = args.get('data', {}) if args else {}
                result = ScrapingResult(
                    url=current_url,
                    content=page_source,
                    bs_soup=elem_bs,
                    extracted_data={action.extract_name: str(elem)},
                    metadata={
                        "selector": action.selector,
                        "selector_type": selector_type,
                        "multiple": action.multiple,
                        "iteration": (args or {}).get("iteration"),
                        "data": data,
                    },
                    timestamp=str(time.time()),
                    success=True
                )
                # print('DEBUG HTML > ', result)
                self.results.append(result)
        else:
            extracted_html = str(elements[0])
            # Create ScrapingResult and append to results
            result = ScrapingResult(
                url=current_url,
                content=page_source,
                bs_soup=soup,
                extracted_data={action.extract_name: extracted_html},
                metadata={
                    "selector": action.selector,
                    "selector_type": selector_type,
                    "multiple": action.multiple,
                    "elements_found": len(elements)
                },
                timestamp=str(time.time()),
                success=extracted_html is not None
            )

            self.results.append(result)
        # NOTE(review): when no elements match, no result is appended at
        # all (unlike _get_text) — confirm this asymmetry is intended.
        self.logger.info(
            f"Extracted HTML from {len(elements)} element(s) using selector: {action.selector}"
        )

        return True

    except Exception as e:
        self.logger.error(f"GetHTML action failed: {str(e)}")
        # Create error result
        error_result = ScrapingResult(
            url=await self._get_current_url() if hasattr(self, 'driver') or hasattr(self, 'page') else "",
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={action.extract_name: None},
            success=False,
            error_message=str(e),
            timestamp=str(time.time())
        )
        self.results.append(error_result)
        return False
|
|
1352
|
+
|
|
1353
|
+
|
|
1354
|
+
async def _take_screenshot(self, action: Screenshot) -> bool:
    """
    Take a screenshot of the page or a specific element.

    Both driver branches now follow the same flow: capture bytes, save to
    disk (when an output path is configured), then record a ScrapingResult
    carrying either the base64 payload or the output location.

    Fixes over the previous version:
    - The Selenium branch logged a literal "(unknown)" instead of the
      saved file path, used a blocking code path for the result, and
      returned early (a base64 *str* from a ``-> bool`` method) without
      ever appending a ScrapingResult.
    - The Playwright branch wrote the file with a blocking ``open`` inside
      an async method; it now uses aiofiles like the Selenium branch.

    Args:
        action: Screenshot action with options (selector, full_page,
            output_path, return_base64).

    Returns:
        bool: True if the screenshot succeeded, False on error.
    """
    try:
        screenshot_data = None
        output_path = action.output_path
        if isinstance(output_path, str):
            output_path = Path(output_path).resolve()
        screenshot_name = action.get_filename()

        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()

            def take_screenshot_sync():
                # Runs in a worker thread: Selenium calls are blocking.
                if action.selector:
                    # Screenshot of a specific element
                    element = self.driver.find_element(By.CSS_SELECTOR, action.selector)
                    return element.screenshot_as_png
                # Viewport capture. NOTE: Selenium has no native full-page
                # capture here; full_page currently behaves like viewport.
                return self.driver.get_screenshot_as_png()

            screenshot_bytes = await loop.run_in_executor(None, take_screenshot_sync)

            # Save to file if a directory was provided
            # (output_path is treated as a directory; the filename comes
            # from action.get_filename()).
            if output_path:
                filename = output_path.joinpath(screenshot_name)
                async with aiofiles.open(filename, 'wb') as f:
                    await f.write(screenshot_bytes)
                # BUGFIX: previously logged the literal text "(unknown)"
                self.logger.info(f"Screenshot saved to: {filename}")

        else:  # Playwright
            screenshot_options = {}
            if action.full_page:
                screenshot_options['full_page'] = True

            if action.selector:
                # Screenshot of a specific element
                element = self.page.locator(action.selector)
                screenshot_bytes = await element.screenshot(**screenshot_options)
            else:
                # Page screenshot
                screenshot_bytes = await self.page.screenshot(**screenshot_options)

            # Save to file if path provided (kept as a direct file path for
            # backward compatibility with existing Playwright configs).
            if output_path:
                async with aiofiles.open(output_path, 'wb') as f:
                    await f.write(screenshot_bytes)
                self.logger.info(f"Screenshot saved to: {output_path}")

        # Encode once, for both drivers
        if action.return_base64:
            screenshot_data = base64.b64encode(screenshot_bytes).decode('utf-8')
        else:
            screenshot_data = True

        # Create ScrapingResult with screenshot data
        current_url = await self._get_current_url()

        result = ScrapingResult(
            url=current_url,
            content="",  # No HTML content for screenshots
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={
                "screenshot": screenshot_data if action.return_base64 else output_path,
                "screenshot_base64": screenshot_data if action.return_base64 else None
            },
            metadata={
                "selector": action.selector,
                "full_page": action.full_page,
                "output_path": output_path,
                "returned_base64": action.return_base64
            },
            timestamp=str(time.time()),
            success=True
        )

        self.results.append(result)
        self.logger.info(
            f"Screenshot taken: {'element ' + action.selector if action.selector else 'full page'}"
        )

        return True

    except Exception as e:
        self.logger.error(f"Screenshot action failed: {str(e)}")
        # Create error result
        error_result = ScrapingResult(
            url=await self._get_current_url() if hasattr(self, 'driver') or hasattr(self, 'page') else "",
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={"screenshot": None},
            success=False,
            error_message=str(e),
            timestamp=str(time.time())
        )
        self.results.append(error_result)
        return False
|
|
1472
|
+
|
|
1473
|
+
async def _scroll_page(self, action: Scroll):
    """
    Scroll the page (or a specific scrollable element).

    Supports directional scrolling ('top', 'bottom', 'up', 'down'),
    scrolling by an explicit pixel amount, or scrolling an element into
    view via CSS selector.

    Fixes over the previous version:
    - BUGFIX (Selenium): the direction branches built the JS snippet and
      *returned the string* instead of executing it, so 'top'/'bottom'/
      'up'/'down' scrolls were silent no-ops. They now call
      ``execute_script``.
    - The Playwright branch now handles 'up'/'down' like Selenium does,
      and its bare ``except:`` was narrowed to ``except Exception``.

    Args:
        action: Scroll action with direction, amount, selector, smooth.
    """
    if self.driver_type == 'selenium':
        # Scroll target: a specific element (via querySelector) or window.
        target = f"document.querySelector('{action.selector}')" if action.selector else "window"
        behavior = "'smooth'" if action.smooth else "'auto'"
        loop = asyncio.get_running_loop()

        def scroll_sync():
            # Runs in a worker thread: Selenium calls are blocking.
            if action.direction == "top":
                self.driver.execute_script(
                    f"{target}.scrollTo({{top: 0, behavior: {behavior}}});"
                )
            elif action.direction == "bottom":
                self.driver.execute_script(
                    f"{target}.scrollTo({{top: {target}.scrollHeight, behavior: {behavior}}});"
                )
            elif action.direction == "up":
                amount = action.amount or 300  # default step of 300px
                self.driver.execute_script(
                    f"{target}.scrollBy({{top: -{amount}, behavior: {behavior}}});"
                )
            elif action.direction == "down":
                amount = action.amount or 300
                self.driver.execute_script(
                    f"{target}.scrollBy({{top: {amount}, behavior: {behavior}}});"
                )
            elif action.amount:
                # No direction given: plain scroll-by on the window.
                self.driver.execute_script(f"window.scrollBy(0, {action.amount});")
            elif action.selector:
                # Scroll to element
                try:
                    element = self.driver.find_element(By.CSS_SELECTOR, action.selector)
                    self.driver.execute_script("arguments[0].scrollIntoView();", element)
                except NoSuchElementException:
                    self.logger.warning(
                        f"Element not found for scrolling: {action.selector}"
                    )

        await loop.run_in_executor(None, scroll_sync)
    else:  # Playwright
        if action.direction == "bottom":
            await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        elif action.direction == "top":
            await self.page.evaluate("window.scrollTo(0, 0)")
        elif action.direction == "up":
            # Mirror the Selenium default step.
            amount = action.amount or 300
            await self.page.evaluate(f"window.scrollBy(0, -{amount})")
        elif action.direction == "down":
            amount = action.amount or 300
            await self.page.evaluate(f"window.scrollBy(0, {amount})")
        elif action.amount:
            await self.page.evaluate(f"window.scrollBy(0, {action.amount})")
        else:
            # Scroll to element
            try:
                await self.page.locator(action.selector).scroll_into_view_if_needed()
            except Exception:
                self.logger.warning(f"Element not found for scrolling: {action.selector}")
|
|
1516
|
+
|
|
1517
|
+
async def _get_cookies(self, action: GetCookies) -> Dict[str, Any]:
    """Collect the browser session's cookies, optionally filtered.

    Args:
        action: GetCookies action; ``names`` restricts the result to
            specific cookie names, ``domain`` to domains containing the
            given fragment.

    Returns:
        dict: ``{"cookies": [...]}`` with the (filtered) cookie dicts.
    """
    if self.driver_type == 'selenium':
        # driver.get_cookies is blocking; run it off the event loop.
        event_loop = asyncio.get_running_loop()
        jar = await event_loop.run_in_executor(None, self.driver.get_cookies)
    else:  # Playwright
        jar = await self.page.context.cookies()

    # Keep only cookies whose name is among the requested ones.
    if action.names:
        jar = [cookie for cookie in jar if cookie.get('name') in action.names]

    # Keep only cookies whose domain contains the requested fragment.
    if action.domain:
        jar = [cookie for cookie in jar if action.domain in cookie.get('domain', '')]

    self.logger.info(f"Retrieved {len(jar)} cookies")
    return {"cookies": jar}
|
|
1535
|
+
|
|
1536
|
+
async def _set_cookies(self, action: SetCookies) -> bool:
    """Install the given cookies into the active browser session.

    Args:
        action: SetCookies action carrying a list of cookie dicts.

    Returns:
        bool: always True once the cookies have been added.
    """
    if self.driver_type == 'selenium':
        event_loop = asyncio.get_running_loop()
        # Selenium's add_cookie is blocking and takes one cookie per call;
        # the default-arg lambda pins each cookie at definition time.
        for item in action.cookies:
            await event_loop.run_in_executor(
                None,
                lambda c=item: self.driver.add_cookie(c)
            )
    else:  # Playwright
        # Playwright installs the whole batch in a single call.
        await self.page.context.add_cookies(action.cookies)

    self.logger.info(f"Set {len(action.cookies)} cookies")
    return True
|
|
1550
|
+
|
|
1551
|
+
async def _handle_authentication(self, action: Authenticate):
    """
    Handle authentication flows.

    Two modes:
      * ``method == 'bearer'``: injects an HTTP header (built from
        ``header_value_format`` + ``token``) into every subsequent request.
        Selenium requires a Chromium-based browser (uses CDP
        ``Network.setExtraHTTPHeaders``); Playwright uses
        ``set_extra_http_headers``. Returns True/False.
      * otherwise: form-based login — fills username/password selectors
        and clicks submit. Returns None on success (and on missing
        credentials), raises on failure.

    NOTE(review): the two modes have inconsistent return values
    (bool vs None) — callers appear to treat the result as truthy-only;
    confirm before normalizing.
    """
    if action.method == 'bearer':
        if not action.token:
            self.logger.error("Bearer token authentication requires a 'token' value.")
            return False
        # Construct the header from the provided format and token
        header_value = action.header_value_format.format(action.token)
        headers = {action.header_name: header_value}
        if self.driver_type == 'selenium':
            # For Selenium, we use the Chrome DevTools Protocol (CDP) to set headers.
            # This requires a Chromium-based browser (Chrome, Edge).
            if not hasattr(self.driver, 'execute_cdp_cmd'):
                self.logger.error(
                    "Bearer token injection for Selenium is only supported on Chromium-based browsers."
                )
                return False
            self.logger.info(f"Setting extra HTTP headers for Selenium session: {list(headers.keys())}")
            loop = asyncio.get_running_loop()
            # execute_cdp_cmd is blocking; run it off the event loop.
            await loop.run_in_executor(
                None,
                lambda: self.driver.execute_cdp_cmd(
                    'Network.setExtraHTTPHeaders', {'headers': headers}
                )
            )

        elif self.driver_type == 'playwright' and PLAYWRIGHT_AVAILABLE:
            # Playwright has a direct and simple method for this.
            self.logger.info(f"Setting extra HTTP headers for Playwright session: {list(headers.keys())}")
            await self.page.set_extra_http_headers(headers)

        else:
            self.logger.error(f"Bearer token authentication is not implemented for driver type: {self.driver_type}")
            return False

        self.logger.info("Bearer token authentication configured. All subsequent requests will include the specified header.")
        return True

    # action form (only programmed until now)
    username = action.username
    password = action.password
    # Common default selectors for login forms.
    username_selector = action.username_selector or '#username'
    password_selector = action.password_selector or '#password'
    submit_selector = action.submit_selector or 'input[type="submit"], button[type="submit"]'

    if not username or not password:
        self.logger.error(
            "Authentication requires username and password"
        )
        return

    try:
        # Fill username
        await self._fill_element(username_selector, username, press_enter=action.enter_on_username)
        await asyncio.sleep(0.5)

        # Fill password
        await self._fill_element(password_selector, password)
        await asyncio.sleep(0.5)

        # Submit form
        await self._click_element(submit_selector)

        # Wait for navigation/login completion
        # (fixed 2s pause; no explicit post-login condition is checked)
        await asyncio.sleep(2)

        self.logger.info("Authentication completed")

    except Exception as e:
        self.logger.error(f"Authentication failed: {str(e)}")
        raise
|
|
1622
|
+
|
|
1623
|
+
async def _await_browser_event(self, action: AwaitBrowserEvent) -> bool:
    """
    Pause automation until a user triggers a browser-side event.

    Config (put in step.wait_condition or step.target as dict):
    - key_combo: one of ["ctrl_enter", "cmd_enter", "alt_shift_s"] (default: "ctrl_enter")
    - show_overlay_button: bool (default False) → injects a floating "Resume" button
    - local_storage_key: str (default "__scrapeResume")
    - predicate_js: str (optional) → JS snippet returning boolean; if true, resume
    - custom_event_name: str (optional) → window.dispatchEvent(new Event(name)) resumes

    Any of these will resume:
    1) Pressing the configured key combo in the page
    2) Clicking the optional overlay "Resume" button
    3) Dispatching the custom event: window.dispatchEvent(new Event('scrape-resume'))
    4) Setting localStorage[local_storage_key] = "1"
    5) predicate_js() evaluates to true

    Returns None on resume; raises TimeoutError after ``action.timeout``
    (default 300s) despite the declared ``-> bool``.
    """
    # Config may arrive as a dict or as a bare key-combo string.
    cfg = action.wait_condition or action.target or {}
    if isinstance(cfg, str):
        cfg = {"key_combo": cfg}

    key_combo = (cfg.get("key_combo") or "ctrl_enter").lower()
    show_overlay = bool(cfg.get("show_overlay_button", False))
    ls_key = cfg.get("local_storage_key", "__scrapeResume")
    predicate_js = cfg.get("predicate_js")  # e.g., "return !!document.querySelector('.dashboard');"
    custom_event = cfg.get("custom_event_name", "scrape-resume")
    timeout = int(action.timeout or 300)

    # Inject listener with green button and auto-removal
    # (idempotent: the _bound guard prevents duplicate listeners across
    # the repeated injections performed by the polling loop below)
    inject_script = f"""
    (function() {{
        if (window.__scrapeSignal && window.__scrapeSignal._bound) return 0;
        window.__scrapeSignal = window.__scrapeSignal || {{ ready:false, _bound:false }};
        function signal() {{
            try {{ localStorage.setItem('{ls_key}', '1'); }} catch(e) {{}}
            window.__scrapeSignal.ready = true;
            // Remove the button when clicked
            var btn = document.getElementById('__scrapeResumeBtn');
            if (btn) {{ btn.remove(); }}
        }}

        // Key combos
        window.addEventListener('keydown', function(e) {{
            try {{
                var k = '{key_combo}';
                if (k === 'ctrl_enter' && (e.ctrlKey || e.metaKey) && e.key === 'Enter') {{ e.preventDefault(); signal(); }}
                else if (k === 'cmd_enter' && e.metaKey && e.key === 'Enter') {{ e.preventDefault(); signal(); }}
                else if (k === 'alt_shift_s' && e.altKey && e.shiftKey && (e.key.toLowerCase() === 's')) {{ e.preventDefault(); signal(); }}
            }} catch(_e) {{}}
        }}, true);

        // Custom DOM event
        try {{
            window.addEventListener('{custom_event}', function() {{ signal(); }}, false);
        }} catch(_e) {{}}

        // Optional overlay button with green background
        if ({'true' if show_overlay else 'false'}) {{
            try {{
                if (!document.getElementById('__scrapeResumeBtn')) {{
                    var btn = document.createElement('button');
                    btn.id = '__scrapeResumeBtn';
                    btn.textContent = 'Resume scraping';
                    Object.assign(btn.style, {{
                        position: 'fixed',
                        right: '16px',
                        bottom: '16px',
                        zIndex: 2147483647,
                        padding: '10px 14px',
                        fontSize: '14px',
                        borderRadius: '8px',
                        border: 'none',
                        cursor: 'pointer',
                        background: '#10b981',
                        color: '#fff',
                        boxShadow: '0 4px 12px rgba(0,0,0,0.2)'
                    }});
                    btn.addEventListener('click', function(e) {{ e.preventDefault(); signal(); }});
                    document.body.appendChild(btn);
                }}
            }} catch(_e) {{}}
        }}

        window.__scrapeSignal._bound = true;
        return 1;
    }})();
    """

    def _inject_and_check_ready():
        # Return True if already signaled
        # (runs in a worker thread; re-injects every poll so the listener
        # survives page navigations)
        try:
            if self.driver_type == 'selenium':
                # inject
                try:
                    self.driver.execute_script(inject_script)
                except Exception:
                    pass
                # check any of the resume signals
                if predicate_js:
                    try:
                        ok = self.driver.execute_script(predicate_js)
                        if bool(ok):
                            return True
                    except Exception:
                        pass
                try:
                    # localStorage flag
                    val = self.driver.execute_script(f"try{{return localStorage.getItem('{ls_key}')}}catch(e){{return null}}")
                    if val == "1":
                        return True
                except Exception:
                    pass
                try:
                    # in-memory flag
                    ready = self.driver.execute_script("return !!(window.__scrapeSignal && window.__scrapeSignal.ready);")
                    if bool(ready):
                        return True
                except Exception:
                    pass
                return False
            else:
                # Playwright branch (optional): basic injection + predicate check
                # NOTE(review): page.evaluate is async in Playwright's async
                # API — these un-awaited calls return coroutine objects, so
                # bool(ok)/bool(ready) would be True immediately. This branch
                # looks broken for async Playwright; verify against the
                # driver setup before relying on it.
                try:
                    self.page.evaluate(inject_script)
                except Exception:
                    pass
                if predicate_js:
                    try:
                        ok = self.page.evaluate(predicate_js)
                        if bool(ok):
                            return True
                    except Exception:
                        pass
                try:
                    val = self.page.evaluate(f"try{{return localStorage.getItem('{ls_key}')}}catch(e){{return null}}")
                    if val == "1":
                        return True
                except Exception:
                    pass
                try:
                    ready = self.page.evaluate("() => !!(window.__scrapeSignal && window.__scrapeSignal.ready)")
                    if bool(ready):
                        return True
                except Exception:
                    pass
                return False
        except Exception:
            return False

    loop = asyncio.get_running_loop()
    self.logger.info(
        "🛑 Awaiting browser event: press the configured key combo in the page, click the floating button, dispatch the custom event, or set the localStorage flag to resume."
    )

    # Poll roughly every 0.3s until the user signals or the deadline hits.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if await loop.run_in_executor(None, _inject_and_check_ready):
            # Clear the LS flag so future waits don't auto-trigger
            try:
                if self.driver_type == 'selenium':
                    self.driver.execute_script(f"try{{localStorage.removeItem('{ls_key}')}}catch(e){{}}")
                    self.driver.execute_script("if(window.__scrapeSignal){window.__scrapeSignal.ready=false}")
                else:
                    self.page.evaluate(f"() => {{ try{{localStorage.removeItem('{ls_key}')}}catch(e){{}}; if(window.__scrapeSignal) window.__scrapeSignal.ready=false; }}")
            except Exception:
                pass
            self.logger.info("✅ Browser event received. Resuming automation.")
            return
        await asyncio.sleep(0.3)

    raise TimeoutError("await_browser_event timed out.")
|
|
1795
|
+
|
|
1796
|
+
async def _await_human(self, action: AwaitHuman):
    """
    Let a human drive the already-open browser, then resume when a condition is met.

    'wait_condition' or 'target' may contain:
    - selector: CSS selector to appear (presence)
    - url_contains: substring expected in current URL
    - title_contains: substring expected in document.title

    Raises:
        TimeoutError: if no condition is satisfied within ``action.timeout`` seconds.
    """
    timeout = int(action.timeout or 300)
    selector = None
    url_contains = None
    title_contains = None

    # Resolve the wait condition from the explicit condition_type, falling
    # back to a dict (or bare selector string) in wait_condition / target.
    if action.condition_type == 'selector':
        selector = action.target
    elif action.condition_type == 'url_contains':
        selector = None
        url_contains = action.target
    elif action.condition_type == 'title_contains':
        selector = None
        title_contains = action.target
    else:
        # Default: expect a dict in target or wait_condition
        cond = action.wait_condition or action.target or {}
        if isinstance(cond, str):
            cond = {"selector": cond}
        selector = cond.get("selector")
        if not selector:
            self.logger.error("await_human requires at least one condition (selector, url_contains, title_contains)")
            return

    loop = asyncio.get_running_loop()

    def _check_sync() -> bool:
        # Polled off the event loop: True only once every configured
        # sub-condition (URL / title / selector presence) holds.
        try:
            if self.driver_type == 'selenium':
                cur_url = self.driver.current_url
                cur_title = self.driver.title
                if url_contains and (url_contains not in cur_url):
                    return False
                if title_contains and (title_contains not in cur_title):
                    return False
                if selector:
                    try:
                        count = self.driver.execute_script(
                            "return document.querySelectorAll(arguments[0]).length;", selector
                        )
                        if int(count) <= 0:
                            return False
                    except Exception:
                        return False
                return True
            else:
                cur_url = self.page.url
                if url_contains and (url_contains not in cur_url):
                    return False
                # FIX: the Playwright branch previously never evaluated
                # title_contains, so that condition was treated as satisfied
                # without being checked. Uses page.title(), consistent with
                # the sync-API page.query_selector call below.
                if title_contains:
                    try:
                        if title_contains not in self.page.title():
                            return False
                    except Exception:
                        return False
                if selector:
                    try:
                        # tiny, non-blocking check
                        el = self.page.query_selector(selector)
                        if not el:
                            return False
                    except Exception:
                        return False
                return True
        except Exception:
            return False

    self.logger.info(
        f"🛑 {action.message} in the browser window..."
    )
    self.logger.info(
        "ℹ️ I’ll resume automatically when the expected page/element is present."
    )

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        ok = await loop.run_in_executor(None, _check_sync)
        if ok:
            self.logger.info(
                "✅ Human step condition satisfied. Resuming automation."
            )
            return
        await asyncio.sleep(0.5)

    raise TimeoutError(
        "await_human timed out waiting for the specified condition."
    )
async def _await_keypress(self, action: AwaitKeyPress):
    """
    Block until the operator presses ENTER (or types a specific key) in the console.

    Handy when the page offers no reliable selector to wait on.

    Raises:
        TimeoutError: if nothing acceptable is typed within the timeout.
    """
    max_wait = int(action.timeout or 300)
    message = action.message or "Press ENTER to continue..."
    wanted = action.key

    self.logger.info(f"🛑 {message}")

    deadline = time.monotonic() + max_wait
    loop = asyncio.get_running_loop()

    def _poll_stdin():
        # 0.5s select() window so the coroutine stays responsive.
        return select.select([sys.stdin], [], [], 0.5)

    while time.monotonic() < deadline:
        readable, _, _ = await loop.run_in_executor(None, _poll_stdin)
        if not readable:
            continue
        try:
            entered = sys.stdin.readline().strip()
        except Exception:
            continue
        # No expected key configured -> any line resumes; otherwise the
        # typed line must match exactly.
        if wanted is None or entered == wanted:
            self.logger.info("✅ Continuing after keypress.")
            return

    raise TimeoutError("await_keypress timed out.")
async def _wait_for_download(self, action: WaitForDownload) -> bool:
    """
    Wait for a file download to complete.

    Polls the download directory for a new, stable (size unchanged over
    0.5s), non-temporary file; optionally moves or deletes it afterwards
    and appends a ScrapingResult describing the download.

    Args:
        action: WaitForDownload action with download monitoring options

    Returns:
        bool: True if download detected successfully
    """
    try:
        # Determine download directory
        if action.download_path:
            download_dir = Path(action.download_path)
        else:
            # Try to get default download directory from browser
            if self.driver_type == 'selenium':
                # Check Chrome prefs for download directory
                # NOTE(review): 'Page.getDownloadInfo' is not a documented
                # CDP command — this likely always falls into the except
                # branch; confirm against the Chrome DevTools Protocol.
                try:
                    prefs = self.driver.execute_cdp_cmd(
                        'Page.getDownloadInfo', {}
                    )
                    download_dir = Path(prefs.get('behavior', {}).get('downloadPath', '.'))
                except Exception:
                    # FIX: bare 'except:' also swallowed KeyboardInterrupt /
                    # SystemExit; narrowed to Exception.
                    # Fallback to common default locations
                    download_dir = Path.home() / 'Downloads'
            else:  # Playwright
                # Playwright typically uses its own download handling
                download_dir = Path.cwd() / 'downloads'

        if not download_dir.exists():
            download_dir.mkdir(parents=True, exist_ok=True)

        self.logger.info(f"Monitoring for downloads in: {download_dir}")

        # Get initial files in directory
        initial_files = set(download_dir.glob('*'))

        # Wait for new file to appear
        timeout = action.timeout
        start_time = time.time()
        downloaded_file = None

        while time.time() - start_time < timeout:
            current_files = set(download_dir.glob('*'))
            new_files = current_files - initial_files

            # Filter by pattern if specified
            if action.filename_pattern:
                matching_files = [
                    f for f in new_files
                    if f.match(action.filename_pattern)
                ]
            else:
                matching_files = list(new_files)

            # Check if any new files are complete (not .tmp, .crdownload, .part, etc.)
            for file_path in matching_files:
                # Skip temporary download files
                if any(ext in file_path.suffix.lower() for ext in ['.tmp', '.crdownload', '.part', '.download']):
                    continue

                # Check if file is still being written (size changing)
                try:
                    size1 = file_path.stat().st_size
                    await asyncio.sleep(0.5)
                    size2 = file_path.stat().st_size

                    if size1 == size2 and size1 > 0:
                        # File size stable and non-zero - download complete
                        downloaded_file = file_path
                        break
                except Exception:
                    # FIX: narrowed from a bare 'except:'; file may have been
                    # renamed/removed between glob and stat — keep scanning.
                    continue

            if downloaded_file:
                break

            await asyncio.sleep(1)

        if not downloaded_file:
            self.logger.error(
                f"Download not detected within {timeout} seconds"
            )
            return False

        self.logger.info(f"Download complete: {downloaded_file.name}")

        # Move file if requested
        if action.move_to:
            move_to_path = Path(action.move_to)
            if move_to_path.is_dir():
                final_path = move_to_path / downloaded_file.name
            else:
                final_path = move_to_path

            final_path.parent.mkdir(parents=True, exist_ok=True)
            downloaded_file.rename(final_path)
            self.logger.info(f"Moved download to: {final_path}")
            downloaded_file = final_path

        # Store download info in results
        current_url = await self._get_current_url()
        result = ScrapingResult(
            url=current_url,
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={
                "downloaded_file": str(downloaded_file),
                "file_name": downloaded_file.name,
                "file_size": downloaded_file.stat().st_size
            },
            metadata={
                "download_path": str(download_dir),
                "filename_pattern": action.filename_pattern,
                "moved_to": action.move_to
            },
            timestamp=str(time.time()),
            success=True
        )
        self.results.append(result)

        # Delete file if requested
        if action.delete_after:
            downloaded_file.unlink()
            self.logger.info(f"Deleted file: {downloaded_file.name}")

        return True

    except Exception as e:
        self.logger.error(f"WaitForDownload action failed: {str(e)}")
        return False
async def _upload_file(self, action: UploadFile) -> bool:
    """
    Upload a file to a file input element.

    Resolves the configured path(s) to absolute paths, verifies they exist,
    hands them to the file input (Selenium ``send_keys`` or Playwright
    ``set_input_files``), optionally waits for a post-upload element, and
    appends a ScrapingResult recording what was uploaded.

    Args:
        action: UploadFile action with file path and selector

    Returns:
        bool: True if upload successful
    """
    try:
        # Determine file paths: multi-file mode only when both the flag
        # and the list are set; otherwise the single file_path is used.
        if action.multiple_files and action.file_paths:
            file_paths = [Path(fp).resolve() for fp in action.file_paths]
        else:
            file_paths = [Path(action.file_path).resolve()]

        # Verify files exist before touching the browser.
        for file_path in file_paths:
            if not file_path.exists():
                self.logger.error(f"File not found: {file_path}")
                return False

        self.logger.info(f"Uploading {len(file_paths)} file(s)")

        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()

            # Runs synchronously in a worker thread; includes the optional
            # post-upload wait so the whole interaction stays on one thread.
            def upload_sync():
                # Find the file input element
                file_input = WebDriverWait(
                    self.driver,
                    action.timeout or self.default_timeout
                ).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, action.selector)
                    )
                )

                # Send file paths to input
                if len(file_paths) == 1:
                    file_input.send_keys(str(file_paths[0]))
                else:
                    # Multiple files - join with newline
                    file_input.send_keys('\n'.join(str(fp) for fp in file_paths))

                self.logger.info("File(s) uploaded successfully")

                # Wait for post-upload element if specified
                if action.wait_after_upload:
                    try:
                        WebDriverWait(
                            self.driver,
                            action.wait_timeout
                        ).until(
                            EC.presence_of_element_located(
                                (By.CSS_SELECTOR, action.wait_after_upload)
                            )
                        )
                        self.logger.info(
                            f"Post-upload element found: {action.wait_after_upload}"
                        )
                    except Exception as e:
                        # Best-effort wait: a missing confirmation element
                        # is only a warning, not a failure.
                        self.logger.warning(
                            f"Post-upload wait timed out: {action.wait_after_upload}"
                        )

            await loop.run_in_executor(None, upload_sync)

        else:  # Playwright
            # For Playwright, set the files directly
            if len(file_paths) == 1:
                await self.page.set_input_files(action.selector, str(file_paths[0]))
            else:
                await self.page.set_input_files(
                    action.selector,
                    [str(fp) for fp in file_paths]
                )

            self.logger.info("File(s) uploaded successfully")

            # Wait for post-upload element if specified
            if action.wait_after_upload:
                try:
                    # wait_timeout is in seconds; Playwright expects ms.
                    await self.page.wait_for_selector(
                        action.wait_after_upload,
                        timeout=action.wait_timeout * 1000
                    )
                    self.logger.info(
                        f"Post-upload element found: {action.wait_after_upload}"
                    )
                except Exception:
                    self.logger.warning(
                        f"Post-upload wait timed out: {action.wait_after_upload}"
                    )

        # Store upload info in results
        current_url = await self._get_current_url()
        result = ScrapingResult(
            url=current_url,
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={
                "uploaded_files": [fp.name for fp in file_paths],
                "file_count": len(file_paths)
            },
            metadata={
                "selector": action.selector,
                "file_paths": [str(fp) for fp in file_paths],
                "multiple_files": action.multiple_files
            },
            timestamp=str(time.time()),
            success=True
        )
        self.results.append(result)

        return True

    except Exception as e:
        self.logger.error(f"UploadFile action failed: {str(e)}")
        return False
async def _exec_conditional(
    self,
    action: Conditional,
    base_url: str = "",
    args: Optional[dict] = None
) -> bool:
    """Handle Conditional action - execute actions based on a condition.

    Locates ``action.target`` (CSS or XPath), evaluates
    ``action.condition_type`` against the found element (or its absence),
    then runs ``actions_if_true`` or ``actions_if_false`` accordingly.

    Returns:
        bool: True when the chosen branch's actions all succeed (or when
        there is nothing to run); False on unknown condition type or any
        failing sub-action.
    """

    # Condition predicates keyed by condition_type. NOTE(review): these use
    # Selenium WebElement APIs (.text, .get_attribute). A Playwright
    # ElementHandle has no .text attribute, so text/attribute conditions on
    # the Playwright path raise and are coerced to False below — confirm
    # whether that is intended.
    CONDITION_TYPES = {
        'exists': lambda element, expected: element is not None,
        'not_exists': lambda element, expected: element is None,
        'text_contains': lambda element, expected: expected in (element.text if element else ''),
        'text_equals': lambda element, expected: (element.text if element else '') == expected,
        'attribute_equals': lambda element, expected: element.get_attribute(expected['attr']) == expected['value'] if element else False,
    }

    target = action.target
    target_type = action.target_type or 'css'
    condition_type = action.condition_type
    expected_value = action.expected_value
    timeout = action.timeout or 5  # seconds to wait for the element

    self.logger.info(
        f"Evaluating conditional: {condition_type} on {target_type}='{target}' with value '*{expected_value}*'"
    )

    # Find the element
    element = None
    if self.driver_type == 'selenium':
        loop = asyncio.get_running_loop()

        def find_element_sync():
            # Runs in a worker thread; returns the element or None.
            try:
                # Determine locator type
                if target_type == 'xpath':
                    by_type = By.XPATH
                else:  # css
                    by_type = By.CSS_SELECTOR

                # Try to find element with timeout
                try:
                    el = WebDriverWait(
                        self.driver,
                        timeout,
                        poll_frequency=0.25
                    ).until(
                        EC.presence_of_element_located((by_type, target))
                    )
                    return el
                except (TimeoutException, NoSuchElementException):
                    return None
            except Exception as e:
                self.logger.debug(f"Error finding element: {str(e)}")
                return None

        element = await loop.run_in_executor(None, find_element_sync)

    else:  # Playwright
        try:
            if target_type == 'xpath':
                selector = f"xpath={target}"
            else:
                selector = target

            # 'attached' state: element in the DOM, visible or not.
            element = await self.page.wait_for_selector(
                selector,
                timeout=timeout * 1000,
                state='attached'
            )
        except Exception:
            element = None

    # Evaluate condition
    condition_func = CONDITION_TYPES.get(condition_type)
    if not condition_func:
        self.logger.error(f"Unknown condition type: {condition_type}")
        return False

    # For attribute_equals, expected_value should be a dict
    if condition_type == 'attribute_equals' and isinstance(expected_value, str):
        # Try to parse as "attr=value"
        if '=' in expected_value:
            attr, val = expected_value.split('=', 1)
            expected_value = {'attr': attr.strip(), 'value': val.strip()}

    try:
        condition_result = condition_func(element, expected_value)
    except Exception as e:
        # Predicate errors (e.g. stale element) count as condition False.
        self.logger.error(f"Error evaluating condition: {str(e)}")
        condition_result = False

    # NOTE(review): .notice is not a stdlib logging level — assumes the
    # project logger provides it; verify.
    self.logger.notice(
        f"Condition result: {condition_result}"
    )

    # Determine which actions to execute
    actions_to_execute = (
        action.actions_if_true if condition_result
        else (action.actions_if_false or [])
    )

    if not actions_to_execute:
        self.logger.info(
            f"No actions to execute for condition result: {condition_result}"
        )
        return True

    self.logger.info(
        f"Executing {len(actions_to_execute)} action(s) based on condition result"
    )

    # Execute the actions
    all_success = True
    for sub_action in actions_to_execute:
        step = ScrapingStep(action=sub_action)
        success = await self._execute_step(step, base_url, args)

        if not success:
            self.logger.warning(
                f"Conditional sub-action failed: {sub_action.description}"
            )
            all_success = False
            # Continue executing remaining actions even if one fails

    return all_success
async def _exec_loop(self, action: Loop, base_url: str) -> bool:
    """Handle Loop action - execute actions repeatedly.

    Supports:
    - Fixed iterations
    - Iterating over a list of values
    - Template variable substitution

    Template Variables:
    - {i}, {index}, {iteration} - Current iteration number
    - {i+1} - 1-based iteration (useful for page numbers)
    - {i-1}, {i*2}, etc. - Arithmetic expressions
    - {value} - Current value from values list

    Example:
        Loop with iterations=3, start_index=1:
        - First iteration: {i} -> 1, {i+1} -> 2
        - Second iteration: {i} -> 2, {i+1} -> 3
        - Third iteration: {i} -> 3, {i+1} -> 4

    Returns:
        bool: False only when a sub-action fails and break_on_error is set;
        True otherwise (including an early break via the loop condition).
    """
    iteration = 0
    start_index = action.start_index
    value_name = action.value_name

    # Iteration count comes from the values list when present, otherwise
    # from iterations / max_iterations.
    # NOTE(review): if values is empty and neither iterations nor
    # max_iterations is set, max_iter is None and the comparison below
    # raises TypeError — confirm the action model guarantees one of them.
    if action.values:
        max_iter = len(action.values)
        self.logger.info(
            f"Starting loop over {max_iter} values, start_index={start_index}"
        )
    else:
        max_iter = action.iterations or action.max_iterations
        self.logger.info(
            f"Starting loop: {max_iter} iterations, start_index={start_index}"
        )

    while iteration < max_iter:
        display_index = start_index + iteration
        # Get current value if iterating over values
        current_value = action.values[iteration] if action.values else None

        # Check condition if provided; evaluated in the page as JS before
        # each iteration, a falsy result ends the loop early.
        if action.condition:
            should_continue = await self._evaluate_condition(action.condition)
            if not should_continue:
                break

        # Execute all actions in the loop
        for loop_action in action.actions:
            # Substitute template variables in the action
            if action.do_replace:
                sub_action = self._substitute_action_vars(
                    loop_action,
                    iteration,
                    start_index,
                    current_value
                )
            else:
                sub_action = loop_action

            step = ScrapingStep(action=sub_action)
            # Per-step context: 0-based iteration plus display index and
            # the current value under the configured value_name key.
            args = {
                "iteration": iteration,
                "data": {
                    "index": display_index,
                    value_name: current_value
                }
            }
            success = await self._execute_step(step, base_url, args)

            if not success and action.break_on_error:
                self.logger.warning(f"Loop stopped at iteration {iteration} due to error")
                return False

        iteration += 1

        # Break if we've reached specified iterations
        if action.iterations and iteration >= action.iterations:
            break
        # do a small delay (random) between iterations
        await asyncio.sleep(random.uniform(0.1, 0.5))

    self.logger.info(f"Loop completed {iteration} iterations")
    return True
async def _evaluate_condition(self, condition: str) -> bool:
|
|
2379
|
+
"""Evaluate a JavaScript condition"""
|
|
2380
|
+
if self.driver_type == 'selenium':
|
|
2381
|
+
loop = asyncio.get_running_loop()
|
|
2382
|
+
result = await loop.run_in_executor(
|
|
2383
|
+
None,
|
|
2384
|
+
lambda: self.driver.execute_script(f"return Boolean({condition})")
|
|
2385
|
+
)
|
|
2386
|
+
else: # Playwright
|
|
2387
|
+
result = await self.page.evaluate(f"() => Boolean({condition})")
|
|
2388
|
+
|
|
2389
|
+
return bool(result)
|
|
2390
|
+
|
|
2391
|
+
async def _extract_content(
    self,
    url: str,
    selectors: List[ScrapingSelector]
) -> ScrapingResult:
    """Extract data from the current page using the given selector configs.

    A selector that fails to extract is logged and recorded as None rather
    than aborting the whole extraction.
    """
    # Fetch the raw HTML from whichever driver is active.
    if self.driver_type == 'selenium':
        loop = asyncio.get_running_loop()
        page_source = await loop.run_in_executor(None, lambda: self.driver.page_source)
    else:  # Playwright
        page_source = await self.page.content()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Run each configured selector independently.
    extracted_data = {}
    for cfg in selectors:
        try:
            extracted_data[cfg.name] = await self._extract_by_selector(soup, cfg)
        except Exception as e:
            self.logger.warning(f"Failed to extract {cfg.name}: {str(e)}")
            extracted_data[cfg.name] = None

    return ScrapingResult(
        url=url,
        content=page_source,
        bs_soup=soup,
        extracted_data=extracted_data,
        timestamp=str(time.time())
    )
async def _extract_full_content(self, url: str) -> ScrapingResult:
    """Capture the whole page when no selectors were provided.

    Extracts the title, the stripped body text, and all link/image URLs.
    """
    # Pull the raw HTML from whichever driver is active.
    if self.driver_type == 'selenium':
        loop = asyncio.get_running_loop()
        page_source = await loop.run_in_executor(None, lambda: self.driver.page_source)
    else:  # Playwright
        page_source = await self.page.content()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Basic page summary: title, flattened text, and resource URLs.
    summary = {
        "title": soup.title.string if soup.title else "",
        "body_text": soup.get_text(strip=True),
        "links": [anchor.get('href') for anchor in soup.find_all('a', href=True)],
        "images": [image.get('src') for image in soup.find_all('img', src=True)],
    }

    return ScrapingResult(
        url=url,
        content=page_source,
        bs_soup=soup,
        extracted_data=summary,
        timestamp=str(time.time())
    )
async def _extract_by_selector(
    self,
    soup: BeautifulSoup,
    selector_config: ScrapingSelector
) -> Union[str, List[str], Dict[str, Any]]:
    """Pull content out of *soup* as described by one selector configuration.

    Returns a list when ``multiple`` is set, otherwise the first match
    (or None/[] when nothing matches).
    """
    sel_type = selector_config.selector_type
    if sel_type in ('css', 'xpath'):
        # BeautifulSoup has no XPath engine, so 'xpath' currently falls
        # back to CSS selection as well (lxml would be needed for real XPath).
        elements = soup.select(selector_config.selector)
    else:  # plain tag name
        elements = soup.find_all(selector_config.selector)

    if not elements:
        return [] if selector_config.multiple else None

    def _pull(node):
        # Map extract_type to the matching BeautifulSoup accessor.
        mode = selector_config.extract_type
        if mode == 'html':
            return str(node)
        if mode == 'attribute':
            return node.get(selector_config.attribute, '')
        # 'text' and any unrecognized mode both yield stripped text.
        return node.get_text(strip=True)

    values = [_pull(node) for node in elements]
    if selector_config.multiple:
        return values
    return values[0] if values else None
async def _get_current_url(self) -> str:
|
|
2488
|
+
"""Get current page URL"""
|
|
2489
|
+
if self.driver_type == 'selenium':
|
|
2490
|
+
loop = asyncio.get_running_loop()
|
|
2491
|
+
return await loop.run_in_executor(None, lambda: self.driver.current_url)
|
|
2492
|
+
else: # Playwright
|
|
2493
|
+
return self.page.url
|
|
2494
|
+
|
|
2495
|
+
async def cleanup(self):
    """Release browser resources for whichever driver is active.

    Errors are logged rather than raised so teardown never breaks callers.
    """
    try:
        if self.driver_type == 'selenium' and self.driver:
            # driver.quit() is blocking; run it off the event loop.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self.driver.quit)
        elif self.browser:
            await self.browser.close()
    except Exception as exc:
        self.logger.error(f"Cleanup failed: {str(exc)}")
def get_tool_schema(self) -> Dict[str, Any]:
    """
    Define the tool schema for LLM interaction.
    Provides comprehensive documentation of all available actions and their parameters.

    Returns:
        An OpenAI-style function-tool schema (``{"type": "function", "function": {...}}``)
        describing the scraping tool's ``steps`` array and per-action parameters.
    """
    # NOTE(review): all per-action parameters share one flat "properties" dict on
    # the step object; each description states which action(s) it applies to.
    return {
        "type": "function",
        "function": {
            "name": "web_scraping_tool",
            "description": """Execute automated web scraping with step-by-step navigation and content extraction.
            Supports navigation, interaction, authentication, content extraction, screenshots, file uploads, and download monitoring.
            Works with both Selenium and Playwright drivers.""",
            "parameters": {
                "type": "object",
                "properties": {
                    "steps": {
                        "type": "array",
                        "description": "List of navigation and interaction steps to execute in sequence",
                        "items": {
                            "type": "object",
                            "required": ["action"],
                            "properties": {
                                "action": {
                                    "type": "string",
                                    "enum": [
                                        "navigate",
                                        "click",
                                        "fill",
                                        "evaluate",
                                        "press_key",
                                        "refresh",
                                        "back",
                                        "scroll",
                                        "get_cookies",
                                        "set_cookies",
                                        "wait",
                                        "authenticate",
                                        "await_human",
                                        "await_keypress",
                                        "await_browser_event",
                                        "loop",
                                        "get_text",
                                        "get_html",
                                        "screenshot",
                                        "wait_for_download",
                                        "upload_file"
                                    ],
                                    "description": "Type of action to perform"
                                },
                                "description": {
                                    "type": "string",
                                    "description": "Human-readable description of what this action does"
                                },
                                "timeout": {
                                    "type": "integer",
                                    "description": "Maximum time to wait for action completion (seconds)"
                                },

                                # Navigate action
                                "url": {
                                    "type": "string",
                                    "description": "URL to navigate to (for 'navigate' action)"
                                },

                                # Click action
                                "selector": {
                                    "type": "string",
                                    "description": "CSS selector for element (for 'click', 'fill', 'get_text', 'get_html', 'screenshot', 'upload_file' actions)"
                                },
                                "click_type": {
                                    "type": "string",
                                    "enum": ["single", "double", "right"],
                                    "description": "Type of click (for 'click' action)"
                                },
                                "wait_after_click": {
                                    "type": "string",
                                    "description": "CSS selector of element to wait for after clicking (for 'click' action)"
                                },
                                "wait_timeout": {
                                    "type": "integer",
                                    "description": "Timeout for post-click wait in seconds (for 'click' action)"
                                },
                                "no_wait": {
                                    "type": "boolean",
                                    "description": "Skip waiting after click (for 'click' action)"
                                },

                                # Fill action
                                "value": {
                                    "type": "string",
                                    "description": "Text value to enter (for 'fill' action)"
                                },
                                "clear_first": {
                                    "type": "boolean",
                                    "description": "Clear existing content before filling (for 'fill' action)"
                                },
                                "press_enter": {
                                    "type": "boolean",
                                    "description": "Press Enter after filling (for 'fill' action)"
                                },

                                # Evaluate action
                                "script": {
                                    "type": "string",
                                    "description": "JavaScript code to execute (for 'evaluate' action)"
                                },
                                "script_file": {
                                    "type": "string",
                                    "description": "Path to JavaScript file to execute (for 'evaluate' action)"
                                },
                                "args": {
                                    "type": "array",
                                    "description": "Arguments to pass to script (for 'evaluate' action)",
                                    "items": {"type": "string"}
                                },
                                "return_value": {
                                    "type": "boolean",
                                    "description": "Whether to return script result (for 'evaluate' action)"
                                },

                                # PressKey action
                                "keys": {
                                    "type": "array",
                                    "description": "Keys to press, e.g., ['Tab', 'Enter'] (for 'press_key' action)",
                                    "items": {"type": "string"}
                                },
                                "sequential": {
                                    "type": "boolean",
                                    "description": "Press keys sequentially vs as combination (for 'press_key' action)"
                                },
                                "target": {
                                    "type": "string",
                                    "description": "CSS selector to focus before pressing keys (for 'press_key' action)"
                                },

                                # Refresh action
                                "hard": {
                                    "type": "boolean",
                                    "description": "Perform hard refresh clearing cache (for 'refresh' action)"
                                },

                                # Back action
                                # NOTE(review): this per-step "steps" (integer) shares its name
                                # with the top-level "steps" array parameter — confirm consumers
                                # disambiguate by nesting level.
                                "steps": {
                                    "type": "integer",
                                    "description": "Number of steps to go back in history (for 'back' action)"
                                },

                                # Scroll action
                                "direction": {
                                    "type": "string",
                                    "enum": ["up", "down", "top", "bottom"],
                                    "description": "Scroll direction (for 'scroll' action)"
                                },
                                "amount": {
                                    "type": "integer",
                                    "description": "Pixels to scroll (for 'scroll' action)"
                                },
                                "smooth": {
                                    "type": "boolean",
                                    "description": "Use smooth scrolling animation (for 'scroll' action)"
                                },

                                # GetCookies action
                                "names": {
                                    "type": "array",
                                    "description": "Specific cookie names to retrieve (for 'get_cookies' action)",
                                    "items": {"type": "string"}
                                },
                                "domain": {
                                    "type": "string",
                                    "description": "Filter cookies by domain (for 'get_cookies' action)"
                                },

                                # SetCookies action
                                "cookies": {
                                    "type": "array",
                                    "description": "List of cookie objects to set (for 'set_cookies' action)",
                                    "items": {
                                        "type": "object",
                                        "properties": {
                                            "name": {"type": "string"},
                                            "value": {"type": "string"},
                                            "domain": {"type": "string"},
                                            "path": {"type": "string"},
                                            "secure": {"type": "boolean"},
                                            "httpOnly": {"type": "boolean"}
                                        }
                                    }
                                },

                                # Wait action
                                "condition": {
                                    "type": "string",
                                    "description": "Condition value - CSS selector, URL substring, etc. (for 'wait' action)"
                                },
                                "condition_type": {
                                    "type": "string",
                                    "enum": ["selector", "url_contains", "title_contains", "custom"],
                                    "description": "Type of condition to wait for (for 'wait' action)"
                                },
                                "custom_script": {
                                    "type": "string",
                                    "description": "JavaScript returning boolean for custom wait (for 'wait' action)"
                                },

                                # Authenticate action
                                "method": {
                                    "type": "string",
                                    "enum": ["form", "basic", "oauth", "custom"],
                                    "description": "Authentication method (for 'authenticate' action)"
                                },
                                "username": {
                                    "type": "string",
                                    "description": "Username or email (for 'authenticate' action)"
                                },
                                "enter_on_username": {
                                    "type": "boolean",
                                    "description": "Press Enter after filling username (for multi-step logins, 'authenticate' action)"
                                },
                                "password": {
                                    "type": "string",
                                    "description": "Password (for 'authenticate' action)"
                                },
                                "username_selector": {
                                    "type": "string",
                                    "description": "CSS selector for username field (for 'authenticate' action)"
                                },
                                "password_selector": {
                                    "type": "string",
                                    "description": "CSS selector for password field (for 'authenticate' action)"
                                },
                                "submit_selector": {
                                    "type": "string",
                                    "description": "CSS selector for submit button (for 'authenticate' action)"
                                },

                                # AwaitHuman action
                                "message": {
                                    "type": "string",
                                    "description": "Message to display while waiting (for 'await_human', 'await_keypress' actions)"
                                },

                                # AwaitKeyPress action
                                "expected_key": {
                                    "type": "string",
                                    "description": "Specific key to wait for (for 'await_keypress' action)"
                                },

                                # AwaitBrowserEvent action
                                "wait_condition": {
                                    "type": "object",
                                    "description": "Condition configuration for browser event (for 'await_browser_event' action)"
                                },

                                # Loop action
                                "actions": {
                                    "type": "array",
                                    "description": "List of actions to repeat (for 'loop' action)",
                                    "items": {"type": "object"}
                                },
                                "iterations": {
                                    "type": "integer",
                                    "description": "Number of times to repeat (for 'loop' action)"
                                },
                                "break_on_error": {
                                    "type": "boolean",
                                    "description": "Stop loop if action fails (for 'loop' action)"
                                },
                                "max_iterations": {
                                    "type": "integer",
                                    "description": "Safety limit for condition-based loops (for 'loop' action)"
                                },

                                # GetText action
                                "multiple": {
                                    "type": "boolean",
                                    "description": "Extract from all matching elements (for 'get_text', 'get_html' actions)"
                                },
                                "extract_name": {
                                    "type": "string",
                                    "description": "Name for extracted data in results (for 'get_text', 'get_html' actions)"
                                },

                                # Screenshot action
                                "full_page": {
                                    "type": "boolean",
                                    "description": "Capture full scrollable page (for 'screenshot' action)"
                                },
                                "output_path": {
                                    "type": "string",
                                    "description": "File path to save screenshot (for 'screenshot' action)"
                                },
                                "return_base64": {
                                    "type": "boolean",
                                    "description": "Return screenshot as base64 (for 'screenshot' action)"
                                },

                                # WaitForDownload action
                                "filename_pattern": {
                                    "type": "string",
                                    "description": "Filename pattern to match, e.g., '*.pdf' (for 'wait_for_download' action)"
                                },
                                "download_path": {
                                    "type": "string",
                                    "description": "Directory to monitor for downloads (for 'wait_for_download' action)"
                                },
                                "move_to": {
                                    "type": "string",
                                    "description": "Path to move downloaded file (for 'wait_for_download' action)"
                                },
                                "delete_after": {
                                    "type": "boolean",
                                    "description": "Delete file after detection (for 'wait_for_download' action)"
                                },

                                # UploadFile action
                                "file_path": {
                                    "type": "string",
                                    "description": "Path to file to upload (for 'upload_file' action)"
                                },
                                "wait_after_upload": {
                                    "type": "string",
                                    "description": "CSS selector to wait for after upload (for 'upload_file' action)"
                                },
                                "multiple_files": {
                                    "type": "boolean",
                                    "description": "Whether uploading multiple files (for 'upload_file' action)"
                                },
                                "file_paths": {
                                    "type": "array",
                                    "description": "List of file paths for multiple uploads (for 'upload_file' action)",
                                    "items": {"type": "string"}
                                }
                            }
                        }
                    },
                    "selectors": {
                        "type": "array",
                        "description": "Content selectors for extraction (legacy - prefer using get_text/get_html actions)",
                        "items": {
                            "type": "object",
                            "required": ["name", "selector"],
                            "properties": {
                                "name": {
                                    "type": "string",
                                    "description": "Friendly name for the extracted content"
                                },
                                "selector": {
                                    "type": "string",
                                    "description": "CSS selector for the content"
                                },
                                "selector_type": {
                                    "type": "string",
                                    "enum": ["css", "xpath", "tag"],
                                    "description": "Type of selector"
                                },
                                "extract_type": {
                                    "type": "string",
                                    "enum": ["text", "html", "attribute"],
                                    "description": "What to extract from matched elements"
                                },
                                "attribute": {
                                    "type": "string",
                                    "description": "Attribute name (when extract_type is 'attribute')"
                                },
                                "multiple": {
                                    "type": "boolean",
                                    "description": "Extract from all matching elements"
                                }
                            }
                        }
                    },
                    "base_url": {
                        "type": "string",
                        "description": "Base URL for resolving relative links"
                    },
                    "browser_config": {
                        "type": "object",
                        "description": "Browser configuration overrides",
                        "properties": {
                            "browser": {
                                "type": "string",
                                "enum": ["chrome", "firefox", "edge", "safari", "undetected"],
                                "description": "Browser to use"
                            },
                            "headless": {
                                "type": "boolean",
                                "description": "Run browser in headless mode"
                            },
                            "mobile": {
                                "type": "boolean",
                                "description": "Emulate mobile device"
                            },
                            "mobile_device": {
                                "type": "string",
                                "description": "Specific mobile device to emulate"
                            }
                        }
                    }
                },
                "required": ["steps"]
            }
        }
    }
def _substitute_template_vars(
|
|
2912
|
+
self,
|
|
2913
|
+
value: Any,
|
|
2914
|
+
iteration: int,
|
|
2915
|
+
start_index: int = 0,
|
|
2916
|
+
current_value: Any = None
|
|
2917
|
+
) -> Any:
|
|
2918
|
+
"""
|
|
2919
|
+
Recursively substitute template variables in strings.
|
|
2920
|
+
|
|
2921
|
+
Supported variables:
|
|
2922
|
+
- {i}, {index}, {iteration} - Current iteration (0-based by default)
|
|
2923
|
+
- {i+1}, {index+1}, {iteration+1} - Current iteration + 1 (1-based)
|
|
2924
|
+
- {i-1}, {index-1} - Current iteration - 1
|
|
2925
|
+
- {value} - Current value from values list (if provided)
|
|
2926
|
+
- Any arithmetic expression: {i*2}, {i+5}, etc.
|
|
2927
|
+
|
|
2928
|
+
Args:
|
|
2929
|
+
value: Value to substitute (can be str, dict, list, or other)
|
|
2930
|
+
iteration: Current iteration number (internal, 0-based counter)
|
|
2931
|
+
start_index: Starting index for display (default 0)
|
|
2932
|
+
current_value: Current value from the values list (if iterating over values)
|
|
2933
|
+
|
|
2934
|
+
Returns:
|
|
2935
|
+
Value with substituted variables
|
|
2936
|
+
"""
|
|
2937
|
+
if isinstance(value, str):
|
|
2938
|
+
# Actual index to expose to user (respects start_index)
|
|
2939
|
+
actual_index = start_index + iteration
|
|
2940
|
+
|
|
2941
|
+
# Replace simple variables first
|
|
2942
|
+
value = value.replace('{i}', str(actual_index))
|
|
2943
|
+
value = value.replace('{index}', str(actual_index))
|
|
2944
|
+
value = value.replace('{iteration}', str(actual_index))
|
|
2945
|
+
|
|
2946
|
+
# Replace {value} with current value from list
|
|
2947
|
+
if current_value is not None:
|
|
2948
|
+
value = value.replace('{value}', str(current_value))
|
|
2949
|
+
|
|
2950
|
+
# Handle arithmetic expressions like {i+1}, {i-1}, {i*2}, etc.
|
|
2951
|
+
def eval_expr(match):
|
|
2952
|
+
expr = match.group(1)
|
|
2953
|
+
# Replace variable names with actual value
|
|
2954
|
+
expr = expr.replace('i', str(actual_index))
|
|
2955
|
+
expr = expr.replace('index', str(actual_index))
|
|
2956
|
+
expr = expr.replace('iteration', str(actual_index))
|
|
2957
|
+
try:
|
|
2958
|
+
# Safe evaluation of arithmetic
|
|
2959
|
+
result = eval(expr, {"__builtins__": {}}, {})
|
|
2960
|
+
return str(result)
|
|
2961
|
+
except:
|
|
2962
|
+
# If evaluation fails, return original
|
|
2963
|
+
return match.group(0)
|
|
2964
|
+
|
|
2965
|
+
# Pattern to match {expression} where expression contains i/index/iteration
|
|
2966
|
+
pattern = r'\{([^}]*(?:i|index|iteration)[^}]*)\}'
|
|
2967
|
+
value = re.sub(pattern, eval_expr, value)
|
|
2968
|
+
|
|
2969
|
+
return value
|
|
2970
|
+
|
|
2971
|
+
elif isinstance(value, dict):
|
|
2972
|
+
return {k: self._substitute_template_vars(v, iteration, start_index, current_value) for k, v in value.items()}
|
|
2973
|
+
|
|
2974
|
+
elif isinstance(value, list):
|
|
2975
|
+
return [self._substitute_template_vars(item, iteration, start_index, current_value) for item in value]
|
|
2976
|
+
else:
|
|
2977
|
+
# Return as-is for other types (int, bool, None, etc.)
|
|
2978
|
+
return value
|
|
2979
|
+
|
|
2980
|
+
def _substitute_action_vars(
    self,
    action: BrowserAction,
    iteration: int,
    start_index: int = 0,
    current_value: Any = None
) -> BrowserAction:
    """
    Return a copy of *action* with template variables resolved.

    Args:
        action: Original action
        iteration: Current iteration number (0-based internally)
        start_index: Starting index for display
        current_value: Current value from values list (if provided)

    Returns:
        New action instance with substituted values
    """
    # Dump the model, resolve placeholders in every string field, then
    # rebuild via the concrete subclass so validation runs on the new values.
    raw_fields = action.model_dump()
    resolved_fields = self._substitute_template_vars(
        raw_fields, iteration, start_index, current_value
    )
    return type(action)(**resolved_fields)
def _collect_cookies(self) -> Dict[str, str]:
|
|
3015
|
+
if not self.driver:
|
|
3016
|
+
raise RuntimeError(
|
|
3017
|
+
"Selenium driver not available after scraping flow"
|
|
3018
|
+
)
|
|
3019
|
+
cookies: Dict[str, str] = {}
|
|
3020
|
+
with contextlib.suppress(Exception):
|
|
3021
|
+
cookies = self.driver.execute_cdp_cmd("Network.getAllCookies", {})["cookies"]
|
|
3022
|
+
if not cookies:
|
|
3023
|
+
for cookie in self.driver.get_cookies():
|
|
3024
|
+
name = cookie.get("name")
|
|
3025
|
+
if name:
|
|
3026
|
+
cookies[name] = cookie.get("value", "")
|
|
3027
|
+
return cookies
|
|
3028
|
+
|
|
3029
|
+
def _extract_headers(self) -> Dict[str, str]:
    """
    Best-effort capture of request headers seen by the Selenium driver.

    Tries selenium-wire's ``driver.requests`` first; if that attribute is
    missing (plain Selenium), falls back to parsing CDP performance logs.
    Returns an empty dict when no driver is available.
    """
    headers: Dict[str, str] = {}
    if not self.driver:
        return headers

    # for Selenium Wire, this path:
    try:
        for req in self.driver.requests:
            for key, value in req.headers.items():
                headers[key] = value
        return headers
    except Exception:
        # Not selenium-wire (no .requests) — fall through to performance logs.
        pass

    try:
        performance_logs = self.driver.get_log("performance")
    except Exception:
        # Performance logging not enabled/supported; nothing to parse.
        performance_logs = []

    # Walk newest-to-oldest; first occurrence of a header key wins.
    for entry in reversed(performance_logs):
        try:
            message = json.loads(entry.get("message", "{}"))
            log = message.get("message", {})
            if log.get("method") != "Network.requestWillBeSent":
                continue
            req_headers = log.get("params", {}).get("request", {}).get("headers", {})
            for key, value in req_headers.items():
                if key not in headers:
                    headers[key] = value
        except (ValueError, TypeError):
            # Malformed log entry — skip it.
            continue

    return headers
def _extract_authorization(self) -> Optional[str]:
    """
    Best-effort recovery of an Authorization header from the browser session.

    Lookup order: previously extracted headers, CDP performance logs, then
    common localStorage/sessionStorage token keys. Returns None if nothing
    is found or no driver is available.
    """
    if not self.driver:
        return None

    # Check first if Authorization is in headers:
    if 'Authorization' in self.extracted_headers:
        return self.extracted_headers['Authorization']
    if 'authorization' in self.extracted_headers:
        return self.extracted_headers['authorization']

    # Attempt to capture from performance logs first
    try:
        self.driver.execute_cdp_cmd("Network.enable", {})
    except Exception:  # pragma: no cover - command may not exist
        pass

    try:
        performance_logs = self.driver.get_log("performance")
    except Exception:
        # Performance logging not enabled/supported; skip this source.
        performance_logs = []

    # Newest entries first — most recent request is most likely relevant.
    for entry in reversed(performance_logs):
        try:
            message = json.loads(entry.get("message", "{}"))
            log = message.get("message", {})
            if log.get("method") != "Network.requestWillBeSent":
                continue
            headers = log.get("params", {}).get("request", {}).get("headers", {})
            authorization = headers.get("Authorization") or headers.get("authorization")
            if authorization:
                return authorization
        except (ValueError, TypeError):
            # Malformed log entry — skip it.
            continue

    # Fallback: check localStorage/sessionStorage for tokens
    script_templates = [
        "return window.sessionStorage.getItem('authorization');",
        "return window.localStorage.getItem('authorization');",
        "return window.sessionStorage.getItem('authToken');",
        "return window.localStorage.getItem('authToken');",
        "return window.localStorage.getItem('token');",
    ]
    for script in script_templates:
        try:
            token = self.driver.execute_script(script)
        except Exception:
            token = None
        if token:
            # NOTE(review): assumes the stored token is a string; a non-str
            # return from execute_script would fail on .lower() — confirm.
            if not token.lower().startswith("bearer"):
                token = f"Bearer {token}".strip()
            return token

    return None
|