ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
|
@@ -0,0 +1,740 @@
|
|
|
1
|
+
# ScrapingAgent for AI-Parrot
|
|
2
|
+
|
|
3
|
+
An intelligent web scraping agent that uses natural language to control web scraping operations with LLM-powered planning and execution.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The ScrapingAgent combines the power of large language models with browser automation to create a natural language interface for web scraping. It analyzes web pages, generates optimal scraping strategies, and executes complex scraping workflows with minimal manual configuration.
|
|
8
|
+
|
|
9
|
+
### Key Features
|
|
10
|
+
|
|
11
|
+
- **Natural Language Control**: Describe what you want to scrape in plain English
|
|
12
|
+
- **Intelligent Analysis**: Automatically analyzes page structure and complexity
|
|
13
|
+
- **Strategic Planning**: Generates step-by-step navigation and extraction plans
|
|
14
|
+
- **Structured Output**: Uses Pydantic models for validation and type safety
|
|
15
|
+
- **Multiple Browser Support**: Selenium and Playwright drivers, in both regular and undetected modes
|
|
16
|
+
- **Mobile Emulation**: Scrape mobile versions of websites
|
|
17
|
+
- **Authentication Handling**: Built-in support for login workflows
|
|
18
|
+
- **Plan Refinement**: Iteratively improve plans based on execution results
|
|
19
|
+
- **RESTful API**: Full HTTP API for integration with other services
|
|
20
|
+
|
|
21
|
+
## Architecture
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
┌─────────────────────────────────────────────────────┐
|
|
25
|
+
│ ScrapingAgent │
|
|
26
|
+
│ (Inherits from BasicAgent → AbstractBot) │
|
|
27
|
+
├─────────────────────────────────────────────────────┤
|
|
28
|
+
│ │
|
|
29
|
+
│ ┌──────────────┐ ┌────────────────┐ │
|
|
30
|
+
│ │ Analysis │ │ Plan Generation│ │
|
|
31
|
+
│ │ Module │ │ & Validation │ │
|
|
32
|
+
│ └──────┬───────┘ └────────┬────────┘ │
|
|
33
|
+
│ │ │ │
|
|
34
|
+
│ └────────┬──────────┘ │
|
|
35
|
+
│ │ │
|
|
36
|
+
│ ┌────────▼────────┐ │
|
|
37
|
+
│ │ Execution │ │
|
|
38
|
+
│ │ Orchestrator │ │
|
|
39
|
+
│ └────────┬────────┘ │
|
|
40
|
+
│ │ │
|
|
41
|
+
│ ┌────────▼────────┐ │
|
|
42
|
+
│ │ WebScrapingTool │ │
|
|
43
|
+
│ └────────┬────────┘ │
|
|
44
|
+
│ │ │
|
|
45
|
+
│ ┌─────────────┴─────────────┐ │
|
|
46
|
+
│ │ │ │
|
|
47
|
+
│ ┌──▼────────┐ ┌───────▼────┐ │
|
|
48
|
+
│ │ Selenium │ │ Playwright │ │
|
|
49
|
+
│ │ Driver │ │ Driver │ │
|
|
50
|
+
│ └───────────┘ └────────────┘ │
|
|
51
|
+
└─────────────────────────────────────────────────────┘
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Install AI-Parrot with scraping dependencies
|
|
58
|
+
pip install ai-parrot[scraping]
|
|
59
|
+
|
|
60
|
+
# Or install individual dependencies
|
|
61
|
+
pip install selenium playwright undetected-chromedriver
|
|
62
|
+
pip install beautifulsoup4 lxml
|
|
63
|
+
|
|
64
|
+
# Install playwright browsers
|
|
65
|
+
playwright install
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Quick Start
|
|
69
|
+
|
|
70
|
+
### Basic Usage
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import asyncio
|
|
74
|
+
from parrot.agents.scraping_agent import ScrapingAgent
|
|
75
|
+
|
|
76
|
+
async def main():
|
|
77
|
+
# Create agent
|
|
78
|
+
agent = ScrapingAgent(
|
|
79
|
+
name="MyScraper",
|
|
80
|
+
llm="openai",
|
|
81
|
+
model="gpt-4"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Configure agent
|
|
85
|
+
await agent.configure()
|
|
86
|
+
|
|
87
|
+
# Scrape with natural language
|
|
88
|
+
result = await agent.scrape(
|
|
89
|
+
"Extract all article titles and authors from https://news.ycombinator.com"
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
print(f"Status: {result['status']}")
|
|
93
|
+
print(f"Pages scraped: {result['metadata']['total_pages_scraped']}")
|
|
94
|
+
|
|
95
|
+
# Access extracted data
|
|
96
|
+
for page_result in result['result']:
|
|
97
|
+
if page_result['success']:
|
|
98
|
+
print(f"\nURL: {page_result['url']}")
|
|
99
|
+
print(f"Data: {page_result['extracted_data']}")
|
|
100
|
+
|
|
101
|
+
asyncio.run(main())
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Advanced Usage with Plan Control
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
async def advanced_scraping():
|
|
108
|
+
agent = ScrapingAgent(
|
|
109
|
+
name="AdvancedScraper",
|
|
110
|
+
llm="anthropic",
|
|
111
|
+
model="claude-sonnet-4"
|
|
112
|
+
)
|
|
113
|
+
await agent.configure()
|
|
114
|
+
|
|
115
|
+
# Step 1: Generate plan
|
|
116
|
+
plan = await agent.generate_scraping_plan(
|
|
117
|
+
objective="Search for Python jobs and extract job titles, companies, and locations",
|
|
118
|
+
url="https://jobs.example.com",
|
|
119
|
+
context={
|
|
120
|
+
"search_query": "Python Developer",
|
|
121
|
+
"location": "Remote"
|
|
122
|
+
}
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# Step 2: Review and modify the plan if needed
|
|
126
|
+
print(f"Generated {len(plan.steps)} steps")
|
|
127
|
+
print(f"Using {len(plan.selectors)} selectors")
|
|
128
|
+
|
|
129
|
+
# Optionally modify the plan
|
|
130
|
+
plan.browser_config.headless = False # Show browser
|
|
131
|
+
|
|
132
|
+
# Step 3: Execute the plan
|
|
133
|
+
result = await agent.execute_plan(plan)
|
|
134
|
+
|
|
135
|
+
# Step 4: Refine if needed
|
|
136
|
+
if not result['status']:
|
|
137
|
+
refined_plan = await agent.refine_plan(
|
|
138
|
+
plan,
|
|
139
|
+
feedback="The search button selector was incorrect. Try '#search-btn' instead."
|
|
140
|
+
)
|
|
141
|
+
result = await agent.execute_plan(refined_plan)
|
|
142
|
+
|
|
143
|
+
return result
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Structured Output Schemas
|
|
147
|
+
|
|
148
|
+
### ScrapingPlanSchema
|
|
149
|
+
|
|
150
|
+
The complete plan for a scraping operation:
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from parrot.agents.scraping_agent import (
|
|
154
|
+
ScrapingPlanSchema,
|
|
155
|
+
BrowserConfigSchema,
|
|
156
|
+
NavigationStepSchema,
|
|
157
|
+
SelectorSchema,
|
|
158
|
+
PageAnalysisSchema
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
# Create a manual plan
|
|
162
|
+
plan = ScrapingPlanSchema(
|
|
163
|
+
objective="Extract product information",
|
|
164
|
+
analysis=PageAnalysisSchema(
|
|
165
|
+
url="https://shop.example.com",
|
|
166
|
+
page_type="product listing",
|
|
167
|
+
complexity="moderate",
|
|
168
|
+
requires_javascript=True,
|
|
169
|
+
has_pagination=True,
|
|
170
|
+
has_authentication=False,
|
|
171
|
+
key_elements=["product cards", "prices"],
|
|
172
|
+
potential_challenges=["lazy loading"],
|
|
173
|
+
recommended_approach="Use browser with scroll"
|
|
174
|
+
),
|
|
175
|
+
browser_config=BrowserConfigSchema(
|
|
176
|
+
browser="chrome",
|
|
177
|
+
headless=True,
|
|
178
|
+
mobile=False
|
|
179
|
+
),
|
|
180
|
+
steps=[
|
|
181
|
+
NavigationStepSchema(
|
|
182
|
+
action="navigate",
|
|
183
|
+
description="Go to products page",
|
|
184
|
+
target="https://shop.example.com/products"
|
|
185
|
+
),
|
|
186
|
+
NavigationStepSchema(
|
|
187
|
+
action="wait",
|
|
188
|
+
description="Wait for products",
|
|
189
|
+
target=".product-card",
|
|
190
|
+
timeout=10
|
|
191
|
+
)
|
|
192
|
+
],
|
|
193
|
+
selectors=[
|
|
194
|
+
SelectorSchema(
|
|
195
|
+
name="titles",
|
|
196
|
+
selector=".product-title",
|
|
197
|
+
extract_type="text",
|
|
198
|
+
multiple=True
|
|
199
|
+
)
|
|
200
|
+
]
|
|
201
|
+
)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### BrowserConfigSchema
|
|
205
|
+
|
|
206
|
+
Browser configuration options:
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
config = BrowserConfigSchema(
|
|
210
|
+
browser="chrome", # or "firefox", "edge", "safari", "undetected"
|
|
211
|
+
headless=True, # Run without UI
|
|
212
|
+
mobile=False, # Emulate mobile device
|
|
213
|
+
mobile_device="iPhone 12", # Specific device to emulate
|
|
214
|
+
driver_type="selenium", # or "playwright"
|
|
215
|
+
auto_install=True # Auto-install drivers
|
|
216
|
+
)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### NavigationStepSchema
|
|
220
|
+
|
|
221
|
+
Individual scraping steps:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
# Navigate to URL
|
|
225
|
+
step1 = NavigationStepSchema(
|
|
226
|
+
action="navigate",
|
|
227
|
+
description="Go to homepage",
|
|
228
|
+
target="https://example.com"
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
# Click element
|
|
232
|
+
step2 = NavigationStepSchema(
|
|
233
|
+
action="click",
|
|
234
|
+
description="Click search button",
|
|
235
|
+
target="#search-btn",
|
|
236
|
+
wait_after=2.0
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
# Fill form
|
|
240
|
+
step3 = NavigationStepSchema(
|
|
241
|
+
action="fill",
|
|
242
|
+
description="Enter search query",
|
|
243
|
+
target="input[name='q']",
|
|
244
|
+
value="web scraping"
|
|
245
|
+
)
|
|
246
|
+
|
|
247
|
+
# Wait for element
|
|
248
|
+
step4 = NavigationStepSchema(
|
|
249
|
+
action="wait",
|
|
250
|
+
description="Wait for results",
|
|
251
|
+
target=".search-result",
|
|
252
|
+
timeout=10
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# Scroll
|
|
256
|
+
step5 = NavigationStepSchema(
|
|
257
|
+
action="scroll",
|
|
258
|
+
description="Scroll to bottom",
|
|
259
|
+
target="bottom"
|
|
260
|
+
)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### SelectorSchema
|
|
264
|
+
|
|
265
|
+
Content extraction selectors:
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
# Extract text
|
|
269
|
+
selector1 = SelectorSchema(
|
|
270
|
+
name="product_titles",
|
|
271
|
+
selector=".product h2",
|
|
272
|
+
selector_type="css",
|
|
273
|
+
extract_type="text",
|
|
274
|
+
multiple=True
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
# Extract attribute
|
|
278
|
+
selector2 = SelectorSchema(
|
|
279
|
+
name="product_images",
|
|
280
|
+
selector=".product img",
|
|
281
|
+
selector_type="css",
|
|
282
|
+
extract_type="attribute",
|
|
283
|
+
attribute="src",
|
|
284
|
+
multiple=True
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
# Extract HTML
|
|
288
|
+
selector3 = SelectorSchema(
|
|
289
|
+
name="product_descriptions",
|
|
290
|
+
selector=".description",
|
|
291
|
+
extract_type="html",
|
|
292
|
+
multiple=False
|
|
293
|
+
)
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## Integration Patterns
|
|
297
|
+
|
|
298
|
+
### With BotManager
|
|
299
|
+
|
|
300
|
+
```python
|
|
301
|
+
from parrot.manager import BotManager
|
|
302
|
+
from parrot.agents.scraping_agent import ScrapingAgent
|
|
303
|
+
|
|
304
|
+
async def with_manager():
|
|
305
|
+
manager = BotManager()
|
|
306
|
+
|
|
307
|
+
# Create through manager
|
|
308
|
+
agent = await manager.create_agent(
|
|
309
|
+
class_name=ScrapingAgent,
|
|
310
|
+
name="ManagedScraper",
|
|
311
|
+
llm={"name": "openai", "model": "gpt-4"}
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
# Use the agent
|
|
315
|
+
result = await agent.scrape(
|
|
316
|
+
"Extract news headlines from BBC"
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
return result
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### With Agent Registry
|
|
323
|
+
|
|
324
|
+
```python
|
|
325
|
+
from parrot.registry import agent_registry
|
|
326
|
+
from parrot.agents.scraping_agent import ScrapingAgent
|
|
327
|
+
|
|
328
|
+
# Register at startup
|
|
329
|
+
@agent_registry.register_agent(
|
|
330
|
+
name="ScrapingAgent",
|
|
331
|
+
singleton=True,
|
|
332
|
+
at_startup=True,
|
|
333
|
+
startup_config={
|
|
334
|
+
"llm": "anthropic",
|
|
335
|
+
"model": "claude-sonnet-4"
|
|
336
|
+
},
|
|
337
|
+
tags={"scraping", "automation"},
|
|
338
|
+
priority=100
|
|
339
|
+
)
|
|
340
|
+
class MyScrapingAgent(ScrapingAgent):
|
|
341
|
+
pass
|
|
342
|
+
|
|
343
|
+
# Later, get the agent
|
|
344
|
+
agent = await agent_registry.get_instance("ScrapingAgent")
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
### RESTful API
|
|
348
|
+
|
|
349
|
+
```python
|
|
350
|
+
from aiohttp import web
|
|
351
|
+
from parrot.handlers.scraping_agent_handler import create_scraping_api
|
|
352
|
+
|
|
353
|
+
async def run_api():
|
|
354
|
+
app = await create_scraping_api(
|
|
355
|
+
llm="openai",
|
|
356
|
+
model="gpt-4"
|
|
357
|
+
)
|
|
358
|
+
web.run_app(app, host="0.0.0.0", port=8080)
|
|
359
|
+
|
|
360
|
+
# API Endpoints:
|
|
361
|
+
# POST /api/v1/scraping/analyze - Analyze page
|
|
362
|
+
# POST /api/v1/scraping/plan - Generate plan
|
|
363
|
+
# POST /api/v1/scraping/execute - Execute plan
|
|
364
|
+
# POST /api/v1/scraping/scrape - Complete workflow
|
|
365
|
+
# GET /api/v1/scraping/plans/{id} - Get plan
|
|
366
|
+
# POST /api/v1/scraping/plans/{id}/refine - Refine plan
|
|
367
|
+
# GET /api/v1/scraping/health - Health check
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
Example API request:
|
|
371
|
+
|
|
372
|
+
```bash
|
|
373
|
+
curl -X POST http://localhost:8080/api/v1/scraping/scrape \
|
|
374
|
+
-H "Content-Type: application/json" \
|
|
375
|
+
-d '{
|
|
376
|
+
"objective": "Extract product names and prices",
|
|
377
|
+
"url": "https://shop.example.com",
|
|
378
|
+
"return_plan": true
|
|
379
|
+
}'
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
## Common Use Cases
|
|
383
|
+
|
|
384
|
+
### 1. E-commerce Scraping
|
|
385
|
+
|
|
386
|
+
```python
|
|
387
|
+
result = await agent.scrape("""
|
|
388
|
+
Go to https://shop.example.com/laptops
|
|
389
|
+
Extract for each product:
|
|
390
|
+
- Product name
|
|
391
|
+
- Price
|
|
392
|
+
- Rating
|
|
393
|
+
- Availability
|
|
394
|
+
- Image URL
|
|
395
|
+
Handle pagination to get all products
|
|
396
|
+
""")
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
### 2. News Aggregation
|
|
400
|
+
|
|
401
|
+
```python
|
|
402
|
+
result = await agent.scrape("""
|
|
403
|
+
From https://news.example.com:
|
|
404
|
+
1. Get all article headlines
|
|
405
|
+
2. For each article, extract:
|
|
406
|
+
- Title
|
|
407
|
+
- Author
|
|
408
|
+
- Publication date
|
|
409
|
+
- Summary
|
|
410
|
+
- Category tags
|
|
411
|
+
3. Handle "Load More" button
|
|
412
|
+
""")
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### 3. Job Board Scraping
|
|
416
|
+
|
|
417
|
+
```python
|
|
418
|
+
result = await agent.scrape(
|
|
419
|
+
objective="""
|
|
420
|
+
Search for 'Python Developer' jobs
|
|
421
|
+
Extract: job title, company, location, salary range
|
|
422
|
+
Apply filters: Remote only, Full-time
|
|
423
|
+
Get results from all pages
|
|
424
|
+
""",
|
|
425
|
+
url="https://jobs.example.com",
|
|
426
|
+
context={
|
|
427
|
+
"requires_search": True,
|
|
428
|
+
"has_filters": True
|
|
429
|
+
}
|
|
430
|
+
)
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
### 4. Social Media Scraping
|
|
434
|
+
|
|
435
|
+
```python
|
|
436
|
+
# Requires authentication (add `import os` so os.getenv below resolves)
|
|
437
|
+
result = await agent.scrape(
|
|
438
|
+
objective="Extract my last 10 posts with engagement metrics",
|
|
439
|
+
url="https://social.example.com/profile",
|
|
440
|
+
context={
|
|
441
|
+
"requires_login": True,
|
|
442
|
+
"credentials": {
|
|
443
|
+
"username": "user@example.com",
|
|
444
|
+
"password": os.getenv("PASSWORD")
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
)
|
|
448
|
+
```
|
|
449
|
+
|
|
450
|
+
### 5. Real Estate Listings
|
|
451
|
+
|
|
452
|
+
```python
|
|
453
|
+
result = await agent.scrape("""
|
|
454
|
+
From https://realestate.example.com:
|
|
455
|
+
Search for: Apartments in San Francisco, $2000-$3000
|
|
456
|
+
Extract:
|
|
457
|
+
- Address
|
|
458
|
+
- Price
|
|
459
|
+
- Bedrooms/Bathrooms
|
|
460
|
+
- Square footage
|
|
461
|
+
- Photos (URLs)
|
|
462
|
+
- Contact information
|
|
463
|
+
Navigate through all result pages
|
|
464
|
+
""")
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
## Advanced Features
|
|
468
|
+
|
|
469
|
+
### Mobile Scraping
|
|
470
|
+
|
|
471
|
+
```python
|
|
472
|
+
# Scrape mobile version
|
|
473
|
+
plan = await agent.generate_scraping_plan(
|
|
474
|
+
objective="Extract mobile app features",
|
|
475
|
+
url="https://app-store.example.com"
|
|
476
|
+
)
|
|
477
|
+
|
|
478
|
+
# Enable mobile mode
|
|
479
|
+
plan.browser_config.mobile = True
|
|
480
|
+
plan.browser_config.mobile_device = "iPhone 12"
|
|
481
|
+
|
|
482
|
+
result = await agent.execute_plan(plan)
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
### Anti-Bot Bypass
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
# Use undetected browser for sites with Cloudflare
|
|
489
|
+
plan.browser_config.browser = "undetected"
|
|
490
|
+
plan.browser_config.headless = False # Often required
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
### Authentication
|
|
494
|
+
|
|
495
|
+
```python
|
|
496
|
+
result = await agent.scrape(
|
|
497
|
+
objective="Extract dashboard data after login",
|
|
498
|
+
url="https://app.example.com",
|
|
499
|
+
context={
|
|
500
|
+
"requires_login": True,
|
|
501
|
+
"login_url": "https://app.example.com/login",
|
|
502
|
+
"credentials": {
|
|
503
|
+
"username": "user@example.com",
|
|
504
|
+
"password": os.getenv("PASSWORD")
|
|
505
|
+
},
|
|
506
|
+
"username_selector": "#email",
|
|
507
|
+
"password_selector": "#password",
|
|
508
|
+
"submit_selector": "button[type='submit']"
|
|
509
|
+
}
|
|
510
|
+
)
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
### Pagination Handling
|
|
514
|
+
|
|
515
|
+
```python
|
|
516
|
+
# Agent automatically detects and handles pagination
|
|
517
|
+
result = await agent.scrape("""
|
|
518
|
+
Extract all products from https://shop.example.com
|
|
519
|
+
Handle pagination - click 'Next' until no more pages
|
|
520
|
+
Extract: name, price, rating for each product
|
|
521
|
+
""")
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
### Error Handling and Retry
|
|
525
|
+
|
|
526
|
+
```python
|
|
527
|
+
plan = await agent.generate_scraping_plan(
|
|
528
|
+
objective="Scrape with retry logic",
|
|
529
|
+
url="https://unstable-site.example.com"
|
|
530
|
+
)
|
|
531
|
+
|
|
532
|
+
# Configure retry behavior
|
|
533
|
+
plan.retry_config = {
|
|
534
|
+
"max_retries": 5,
|
|
535
|
+
"retry_delay": 3,
|
|
536
|
+
"retry_on_failure": True
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
result = await agent.execute_plan(plan)
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
## Best Practices
|
|
543
|
+
|
|
544
|
+
### 1. Be Specific in Objectives
|
|
545
|
+
|
|
546
|
+
❌ **Bad**: "Get data from the website"
|
|
547
|
+
|
|
548
|
+
✅ **Good**: "Extract product names, prices, and ratings from all pages of https://shop.example.com/electronics"
|
|
549
|
+
|
|
550
|
+
### 2. Provide Context
|
|
551
|
+
|
|
552
|
+
```python
|
|
553
|
+
result = await agent.scrape(
|
|
554
|
+
objective="Extract job listings",
|
|
555
|
+
url="https://jobs.example.com",
|
|
556
|
+
context={
|
|
557
|
+
"page_type": "job board",
|
|
558
|
+
"requires_search": True,
|
|
559
|
+
"search_query": "Python Developer",
|
|
560
|
+
"has_filters": True,
|
|
561
|
+
"pagination_type": "infinite scroll"
|
|
562
|
+
}
|
|
563
|
+
)
|
|
564
|
+
```
|
|
565
|
+
|
|
566
|
+
### 3. Review Plans Before Execution
|
|
567
|
+
|
|
568
|
+
```python
|
|
569
|
+
# Generate plan first
|
|
570
|
+
plan = await agent.generate_scraping_plan(objective, url)
|
|
571
|
+
|
|
572
|
+
# Review
|
|
573
|
+
print(f"Steps: {len(plan.steps)}")
|
|
574
|
+
print(f"Selectors: {len(plan.selectors)}")
|
|
575
|
+
for step in plan.steps:
|
|
576
|
+
print(f"- {step.action}: {step.description}")
|
|
577
|
+
|
|
578
|
+
# Modify if needed
|
|
579
|
+
plan.browser_config.headless = False
|
|
580
|
+
|
|
581
|
+
# Then execute
|
|
582
|
+
result = await agent.execute_plan(plan)
|
|
583
|
+
```
|
|
584
|
+
|
|
585
|
+
### 4. Use Appropriate Browser Mode
|
|
586
|
+
|
|
587
|
+
```python
|
|
588
|
+
# For JavaScript-heavy sites
|
|
589
|
+
config.browser = "chrome"
|
|
590
|
+
config.headless = True
|
|
591
|
+
|
|
592
|
+
# For anti-bot sites
|
|
593
|
+
config.browser = "undetected"
|
|
594
|
+
config.headless = False
|
|
595
|
+
|
|
596
|
+
# For simple static sites
|
|
597
|
+
config.browser = "chrome"
|
|
598
|
+
config.headless = True
|
|
599
|
+
```
|
|
600
|
+
|
|
601
|
+
### 5. Handle Rate Limiting
|
|
602
|
+
|
|
603
|
+
```python
|
|
604
|
+
# Add delays between requests
|
|
605
|
+
for step in plan.steps:
|
|
606
|
+
step.wait_after = 2.0 # Wait 2 seconds after each action
|
|
607
|
+
|
|
608
|
+
# Or in retry config
|
|
609
|
+
plan.retry_config["retry_delay"] = 5 # Wait 5 seconds between retries
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
## Troubleshooting
|
|
613
|
+
|
|
614
|
+
### Issue: Selectors not finding elements
|
|
615
|
+
|
|
616
|
+
**Solution**: Refine the plan with the correct selectors
|
|
617
|
+
|
|
618
|
+
```python
|
|
619
|
+
refined_plan = await agent.refine_plan(
|
|
620
|
+
plan,
|
|
621
|
+
feedback="Selector '.title' not found. The correct selector is '.product-name'"
|
|
622
|
+
)
|
|
623
|
+
```
|
|
624
|
+
|
|
625
|
+
### Issue: Page requires JavaScript but not rendering
|
|
626
|
+
|
|
627
|
+
**Solution**: Ensure browser config allows JavaScript
|
|
628
|
+
|
|
629
|
+
```python
|
|
630
|
+
plan.analysis.requires_javascript = True
|
|
631
|
+
plan.browser_config.driver_type = "selenium" # or "playwright"
|
|
632
|
+
```
|
|
633
|
+
|
|
634
|
+
### Issue: Anti-bot detection
|
|
635
|
+
|
|
636
|
+
**Solution**: Use undetected browser mode
|
|
637
|
+
|
|
638
|
+
```python
|
|
639
|
+
plan.browser_config.browser = "undetected"
|
|
640
|
+
plan.browser_config.headless = False
|
|
641
|
+
```
|
|
642
|
+
|
|
643
|
+
### Issue: Slow page loading
|
|
644
|
+
|
|
645
|
+
**Solution**: Increase timeouts
|
|
646
|
+
|
|
647
|
+
```python
|
|
648
|
+
for step in plan.steps:
|
|
649
|
+
if step.action == "wait":
|
|
650
|
+
step.timeout = 30 # Increase to 30 seconds
|
|
651
|
+
```
|
|
652
|
+
|
|
653
|
+
## Performance Considerations
|
|
654
|
+
|
|
655
|
+
### Parallel Scraping
|
|
656
|
+
|
|
657
|
+
```python
|
|
658
|
+
import asyncio
|
|
659
|
+
|
|
660
|
+
async def scrape_multiple_urls(urls):
|
|
661
|
+
agent = ScrapingAgent()
|
|
662
|
+
await agent.configure()
|
|
663
|
+
|
|
664
|
+
tasks = [
|
|
665
|
+
agent.scrape(f"Extract data from {url}")
|
|
666
|
+
for url in urls
|
|
667
|
+
]
|
|
668
|
+
|
|
669
|
+
results = await asyncio.gather(*tasks)
|
|
670
|
+
return results
|
|
671
|
+
```
|
|
672
|
+
|
|
673
|
+
### Resource Management
|
|
674
|
+
|
|
675
|
+
```python
|
|
676
|
+
# Always cleanup
|
|
677
|
+
async def scrape_with_cleanup():
|
|
678
|
+
agent = ScrapingAgent()
|
|
679
|
+
try:
|
|
680
|
+
await agent.configure()
|
|
681
|
+
result = await agent.scrape(objective)
|
|
682
|
+
return result
|
|
683
|
+
finally:
|
|
684
|
+
# No context manager is used here; add explicit cleanup (e.g. closing the browser) if the agent requires it
|
|
685
|
+
pass
|
|
686
|
+
```
|
|
687
|
+
|
|
688
|
+
### Caching Plans
|
|
689
|
+
|
|
690
|
+
```python
|
|
691
|
+
# Store plans for reuse (requires `import json` for the reload step below)
|
|
692
|
+
plan = await agent.generate_scraping_plan(objective, url)
|
|
693
|
+
|
|
694
|
+
# Save plan
|
|
695
|
+
with open('scraping_plan.json', 'w') as f:
|
|
696
|
+
f.write(plan.model_dump_json())
|
|
697
|
+
|
|
698
|
+
# Load and reuse later
|
|
699
|
+
with open('scraping_plan.json', 'r') as f:
|
|
700
|
+
plan_data = json.load(f)
|
|
701
|
+
plan = ScrapingPlanSchema(**plan_data)
|
|
702
|
+
|
|
703
|
+
result = await agent.execute_plan(plan)
|
|
704
|
+
```
|
|
705
|
+
|
|
706
|
+
## Legal and Ethical Considerations
|
|
707
|
+
|
|
708
|
+
⚠️ **Important**: Always respect each website's terms of service and its robots.txt.
|
|
709
|
+
|
|
710
|
+
- Check robots.txt before scraping
|
|
711
|
+
- Respect rate limits
|
|
712
|
+
- Don't overload servers
|
|
713
|
+
- Don't scrape copyrighted content without permission
|
|
714
|
+
- Include delays between requests
|
|
715
|
+
- Use appropriate user agents
|
|
716
|
+
- Cache results to minimize requests
|
|
717
|
+
|
|
718
|
+
## Contributing
|
|
719
|
+
|
|
720
|
+
We welcome contributions! Areas for improvement:
|
|
721
|
+
|
|
722
|
+
- Additional browser support (Safari, Edge)
|
|
723
|
+
- More sophisticated anti-detection techniques
|
|
724
|
+
- Enhanced pagination detection
|
|
725
|
+
- Better error recovery strategies
|
|
726
|
+
- Performance optimizations
|
|
727
|
+
|
|
728
|
+
## License
|
|
729
|
+
|
|
730
|
+
AI-Parrot and ScrapingAgent are open source under the MIT License.
|
|
731
|
+
|
|
732
|
+
## Support
|
|
733
|
+
|
|
734
|
+
- Documentation: https://ai-parrot.readthedocs.io
|
|
735
|
+
- Issues: https://github.com/phenobarbital/ai-parrot/issues
|
|
736
|
+
- Discord: https://discord.gg/ai-parrot
|
|
737
|
+
|
|
738
|
+
---
|
|
739
|
+
|
|
740
|
+
Built with ❤️ by the AI-Parrot team
|