ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ScrapingOrchestrator for AI-Parrot
|
|
3
|
+
Complete integration layer that coordinates LLM-directed web scraping
|
|
4
|
+
"""
|
|
5
|
+
from typing import Dict, List, Any, Optional, Union, Callable
|
|
6
|
+
import asyncio
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from ...bots.scraper import ScrapingAgent
|
|
13
|
+
from .tool import WebScrapingTool
|
|
14
|
+
from .models import ScrapingStep, ScrapingSelector, ScrapingResult
|
|
15
|
+
from ...stores.kb import KnowledgeBaseStore
|
|
16
|
+
from ...loaders.text import TextLoader
|
|
17
|
+
from ...models.responses import AgentResponse
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ScrapingOrchestrator:
|
|
21
|
+
"""
|
|
22
|
+
High-level orchestrator that manages the complete LLM-directed scraping workflow.
|
|
23
|
+
|
|
24
|
+
This class integrates with AI-parrot's existing infrastructure:
|
|
25
|
+
- Uses the knowledge base system for storing scraped content
|
|
26
|
+
- Integrates with the loader system for content processing
|
|
27
|
+
- Supports agent orchestration patterns
|
|
28
|
+
- Provides hooks for custom post-processing
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def __init__(
    self,
    agent_name: str = "WebScrapingAgent",
    driver_type: str = 'selenium',
    knowledge_base: Optional[KnowledgeBaseStore] = None,
    **kwargs
):
    """
    Build the scraping agent and initialise hooks, settings and counters.

    Args:
        agent_name: Name passed to the underlying ScrapingAgent.
        driver_type: Driver identifier passed to the ScrapingAgent.
        knowledge_base: Optional store that receives scraped documents.
        **kwargs: Forwarded unchanged to ScrapingAgent. The keys
            ``auto_store_results``, ``max_concurrent_scrapes``,
            ``retry_failed_scrapes`` and ``respect_robots_txt`` are
            additionally read here as orchestrator settings.
    """
    self.logger = logging.getLogger("AI-Parrot.ScrapingOrchestrator")

    # The agent sees every extra keyword, including the orchestrator's own settings.
    self.scraping_agent = ScrapingAgent(name=agent_name, driver_type=driver_type, **kwargs)

    option = kwargs.get

    # Where results go, and whether they are stored without being asked.
    self.knowledge_base = knowledge_base
    self.auto_store_results = option('auto_store_results', True)

    # Hooks applied to every batch of results (filters first, then post-processors).
    self.post_processors: List[Callable] = []
    self.result_filters: List[Callable] = []

    # Behavioural settings, each overridable through kwargs.
    self.max_concurrent_scrapes = option('max_concurrent_scrapes', 3)
    self.retry_failed_scrapes = option('retry_failed_scrapes', True)
    self.respect_robots_txt = option('respect_robots_txt', True)

    # Session-wide counters start at zero; start_time marks construction.
    self.session_stats = dict.fromkeys(
        ('total_requests', 'successful_scrapes', 'failed_scrapes', 'pages_processed'),
        0,
    )
    self.session_stats['start_time'] = datetime.now()
|
|
68
|
+
|
|
69
|
+
async def execute_scraping_mission(
    self,
    mission_config: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Execute a complete scraping mission with multiple targets and objectives.

    Every target is handed to ``_process_single_target`` together with a
    shared semaphore sized ``max_concurrent_scrapes``. A target that fails
    (or is cancelled) does not abort the mission; it is recorded under
    ``errors`` while the remaining targets are still reported.

    Args:
        mission_config: Configuration dictionary containing:
            - targets: List of URLs or site configurations
            - objectives: What to extract from each target
            - authentication: Login credentials if needed
            - output_config: How to store/process results
            - constraints: Rate limiting, ethics, etc.

    Returns:
        Dictionary with complete mission results and statistics:
        ``mission_id``, ``start_time``, ``end_time``, ``duration`` (seconds),
        ``targets``, ``statistics`` and ``errors``.
    """
    # A missing or explicit None target list means "nothing to do", not a crash.
    targets = list(mission_config.get('targets') or [])
    self.logger.info(f"Starting scraping mission with {len(targets)} targets")

    started_at = datetime.now()
    mission_results = {
        'mission_id': mission_config.get('mission_id', f"mission_{started_at.strftime('%Y%m%d_%H%M%S')}"),
        'start_time': started_at.isoformat(),
        'targets': [],
        'statistics': {},
        'errors': []
    }

    try:
        # Process targets concurrently with semaphore control
        semaphore = asyncio.Semaphore(self.max_concurrent_scrapes)
        tasks = [
            self._process_single_target(semaphore, target, mission_config, index)
            for index, target in enumerate(targets)
        ]

        # gather() preserves order, so results line up with `targets`.
        target_results = await asyncio.gather(*tasks, return_exceptions=True)

        for index, (target, result) in enumerate(zip(targets, target_results)):
            # BaseException rather than Exception: a cancelled child is
            # reported as CancelledError, which is not an Exception subclass
            # and must not be mistaken for a successful target.
            if isinstance(result, BaseException):
                message = str(result) or type(result).__name__
                self.logger.error(f"Target {index} failed: {message}")
                mission_results['errors'].append({
                    'target_index': index,
                    'error': message,
                    'target_config': target
                })
            else:
                mission_results['targets'].append(result)

        # Calculate statistics
        mission_results['statistics'] = self._calculate_mission_statistics(mission_results)

        # Store results if configured
        if self.auto_store_results and self.knowledge_base:
            await self._store_mission_results(mission_results)

    except Exception as e:
        self.logger.error(f"Mission execution failed: {str(e)}")
        mission_results['errors'].append({
            'type': 'mission_failure',
            'error': str(e)
        })

    finally:
        # Timing is filled in even when the mission failed part-way.
        finished_at = datetime.now()
        mission_results['end_time'] = finished_at.isoformat()
        mission_results['duration'] = (finished_at - started_at).total_seconds()

    return mission_results
|
|
145
|
+
|
|
146
|
+
async def _process_single_target(
|
|
147
|
+
self,
|
|
148
|
+
semaphore: asyncio.Semaphore,
|
|
149
|
+
target_config: Dict[str, Any],
|
|
150
|
+
mission_config: Dict[str, Any],
|
|
151
|
+
target_index: int
|
|
152
|
+
) -> Dict[str, Any]:
|
|
153
|
+
"""Process a single scraping target with concurrency control"""
|
|
154
|
+
async with semaphore:
|
|
155
|
+
self.session_stats['total_requests'] += 1
|
|
156
|
+
|
|
157
|
+
# Build complete request for this target
|
|
158
|
+
request = {
|
|
159
|
+
'target_url': target_config.get('url') or target_config.get('target_url'),
|
|
160
|
+
'objective': target_config.get('objective') or mission_config.get('default_objective'),
|
|
161
|
+
'authentication': target_config.get('authentication') or mission_config.get('authentication'),
|
|
162
|
+
'constraints': mission_config.get('constraints', {}),
|
|
163
|
+
'base_url': target_config.get('base_url', ''),
|
|
164
|
+
'custom_selectors': target_config.get('selectors', []),
|
|
165
|
+
'custom_steps': target_config.get('steps', [])
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
# Check if we have prior knowledge about this site
|
|
169
|
+
recommendations = await self.scraping_agent.get_site_recommendations(request['target_url'])
|
|
170
|
+
|
|
171
|
+
# Execute the intelligent scraping
|
|
172
|
+
scraping_results = await self.scraping_agent.execute_intelligent_scraping(request)
|
|
173
|
+
|
|
174
|
+
# Process results through filters and post-processors
|
|
175
|
+
processed_results = await self._process_results(scraping_results, target_config)
|
|
176
|
+
|
|
177
|
+
# Update statistics
|
|
178
|
+
if processed_results:
|
|
179
|
+
successful_results = [r for r in processed_results if r.success]
|
|
180
|
+
self.session_stats['successful_scrapes'] += len(successful_results)
|
|
181
|
+
self.session_stats['failed_scrapes'] += len(processed_results) - len(successful_results)
|
|
182
|
+
self.session_stats['pages_processed'] += len(processed_results)
|
|
183
|
+
|
|
184
|
+
return {
|
|
185
|
+
'target_index': target_index,
|
|
186
|
+
'target_config': target_config,
|
|
187
|
+
'request': request,
|
|
188
|
+
'recommendations': recommendations,
|
|
189
|
+
'scraping_results': [
|
|
190
|
+
{
|
|
191
|
+
'url': r.url,
|
|
192
|
+
'success': r.success,
|
|
193
|
+
'extracted_data': r.extracted_data,
|
|
194
|
+
'metadata': r.metadata,
|
|
195
|
+
'error_message': r.error_message
|
|
196
|
+
} for r in processed_results
|
|
197
|
+
],
|
|
198
|
+
'processed_at': datetime.now().isoformat()
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
async def _process_results(
|
|
202
|
+
self,
|
|
203
|
+
results: List[ScrapingResult],
|
|
204
|
+
target_config: Dict[str, Any]
|
|
205
|
+
) -> List[ScrapingResult]:
|
|
206
|
+
"""Apply filters and post-processors to results"""
|
|
207
|
+
processed_results = results.copy()
|
|
208
|
+
|
|
209
|
+
# Apply result filters
|
|
210
|
+
for result_filter in self.result_filters:
|
|
211
|
+
processed_results = [r for r in processed_results if result_filter(r, target_config)]
|
|
212
|
+
|
|
213
|
+
# Apply post-processors
|
|
214
|
+
for post_processor in self.post_processors:
|
|
215
|
+
processed_results = await post_processor(processed_results, target_config)
|
|
216
|
+
|
|
217
|
+
return processed_results
|
|
218
|
+
|
|
219
|
+
def add_result_filter(self, filter_func: Callable[[ScrapingResult, Dict[str, Any]], bool]):
    """
    Register a predicate used to exclude results.

    Args:
        filter_func: Called with a result and its target configuration;
            appended after any filters already registered.
    """
    registered = self.result_filters
    registered.append(filter_func)
|
|
222
|
+
|
|
223
|
+
def add_post_processor(self, processor_func: Callable):
    """
    Register a post-processor used to enhance results.

    Args:
        processor_func: Hook appended after any post-processors already
            registered.
    """
    registered = self.post_processors
    registered.append(processor_func)
|
|
226
|
+
|
|
227
|
+
async def _store_mission_results(self, mission_results: Dict[str, Any]):
    """Store successful scraping results in the knowledge base.

    Every result that succeeded and carries extracted data is serialised to
    JSON and passed to ``self.knowledge_base.add_document`` together with
    provenance metadata. Storage is best-effort: failures are logged and
    never raised to the caller.

    Args:
        mission_results: Mission dictionary providing ``'mission_id'`` and
            ``'targets'``; each target provides ``'request'`` (with an
            ``'objective'`` entry) and ``'scraping_results'``.
    """
    if not self.knowledge_base:
        return

    try:
        mission_id = mission_results['mission_id']
        stored_count = 0
        failed_count = 0

        for target_result in mission_results['targets']:
            for scraping_result in target_result['scraping_results']:
                if not (scraping_result['success'] and scraping_result['extracted_data']):
                    continue

                # Isolate each document: one unserialisable payload or one
                # failing add_document() call must not prevent the remaining
                # results from being stored.
                try:
                    # Tolerate results whose metadata is missing or None.
                    result_metadata = scraping_result.get('metadata') or {}
                    document = {
                        'content': json.dumps(scraping_result['extracted_data'], indent=2),
                        'metadata': {
                            'source_url': scraping_result['url'],
                            'scraping_mission_id': mission_id,
                            'scraped_at': result_metadata.get('timestamp'),
                            'content_type': 'scraped_data',
                            'target_objective': target_result['request']['objective']
                        }
                    }
                    await self.knowledge_base.add_document(document)
                    stored_count += 1
                except Exception as doc_error:
                    failed_count += 1
                    self.logger.error(
                        f"Failed to store result for {scraping_result.get('url')}: {str(doc_error)}"
                    )

        if failed_count:
            # Do not claim full success when some documents were dropped.
            self.logger.warning(
                f"Stored {stored_count} of {stored_count + failed_count} "
                f"mission results in knowledge base: {mission_id}"
            )
        else:
            self.logger.info(f"Stored mission results in knowledge base: {mission_id}")

    except Exception as e:
        # Structural problems (e.g. a missing 'mission_id' or 'targets' key)
        # still end up here; storage stays best-effort.
        self.logger.error(f"Failed to store mission results: {str(e)}")
|
|
255
|
+
|
|
256
|
+
def _calculate_mission_statistics(self, mission_results: Dict[str, Any]) -> Dict[str, Any]:
    """Summarise a mission's outcome as counts and ratios.

    Args:
        mission_results: Mission dictionary whose ``'targets'`` entries each
            hold a ``'scraping_results'`` list of result dictionaries with
            ``'success'`` and ``'extracted_data'`` keys.

    Returns:
        A dictionary with target/scrape totals, the success rate, the number
        of targets that yielded any extracted data, the average number of
        scrapes per target, and a shallow copy of ``self.session_stats``.
        Both ratios are ``0`` when their denominator is zero.
    """
    targets = mission_results['targets']
    target_count = len(targets)

    scrape_count = 0
    success_count = 0
    data_target_count = 0
    for target in targets:
        outcomes = target['scraping_results']
        scrape_count += len(outcomes)
        success_count += sum(1 for outcome in outcomes if outcome['success'])
        # A target counts as soon as one of its results has truthy data.
        if any(outcome['extracted_data'] for outcome in outcomes):
            data_target_count += 1

    if scrape_count > 0:
        success_rate = success_count / scrape_count
    else:
        success_rate = 0

    if target_count > 0:
        pages_per_target = scrape_count / target_count
    else:
        pages_per_target = 0

    return {
        'total_targets': target_count,
        'total_scrapes': scrape_count,
        'successful_scrapes': success_count,
        'success_rate': success_rate,
        'targets_with_data': data_target_count,
        'average_pages_per_target': pages_per_target,
        'session_stats': self.session_stats.copy()
    }
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# Example usage and integration patterns
|
|
278
|
+
class ScrapingMissionBuilder:
    """Fluent builder that assembles a scraping-mission configuration dict.

    Every configuration method returns the builder itself so calls can be
    chained; ``build()`` produces the final dictionary.
    """

    def __init__(self):
        # 'authentication' and 'mission_id' are added later, on demand.
        self.mission_config = {
            'targets': [],
            'constraints': {},
            'output_config': {}
        }

    def add_target(
        self,
        url: str,
        objective: str = "Extract all relevant content",
        authentication: Optional[Dict[str, Any]] = None,
        custom_steps: Optional[List[Dict[str, Any]]] = None,
        custom_selectors: Optional[List[Dict[str, Any]]] = None
    ) -> 'ScrapingMissionBuilder':
        """Append one target to the mission.

        Args:
            url: Address of the page to scrape.
            objective: Free-text description of what should be extracted.
            authentication: Optional per-target authentication settings,
                stored under ``'authentication'``.
            custom_steps: Optional steps, stored under ``'steps'``.
            custom_selectors: Optional selectors, stored under ``'selectors'``.

        Returns:
            The builder, for chaining.
        """
        target = {
            'url': url,
            'objective': objective
        }

        # Optional sections are only emitted when given and non-empty.
        if authentication:
            target['authentication'] = authentication
        if custom_steps:
            target['steps'] = custom_steps
        if custom_selectors:
            target['selectors'] = custom_selectors

        self.mission_config['targets'].append(target)
        return self

    def set_rate_limiting(
        self,
        requests_per_minute: int = 30,
        delay_between_requests: float = 2.0
    ) -> 'ScrapingMissionBuilder':
        """Record rate-limiting values in the mission constraints.

        Args:
            requests_per_minute: Stored as ``constraints['requests_per_minute']``.
            delay_between_requests: Stored as
                ``constraints['delay_between_requests']``.

        Returns:
            The builder, for chaining.
        """
        self.mission_config['constraints'].update({
            'requests_per_minute': requests_per_minute,
            'delay_between_requests': delay_between_requests
        })
        return self

    def set_authentication(
        self,
        username: str,
        password: str,
        login_url: str,
        username_selector: str = "#username",
        password_selector: str = "#password",
        submit_selector: str = "input[type=submit]"
    ) -> 'ScrapingMissionBuilder':
        """Store mission-level authentication settings.

        The settings are written to ``mission_config['authentication']``,
        replacing any previous value; they are intended to apply to all
        targets of the mission.

        Args:
            username: Login user name.
            password: Login password (kept in plain text in the config).
            login_url: Address of the login page.
            username_selector: Selector of the user-name field.
            password_selector: Selector of the password field.
            submit_selector: Selector of the submit control.

        Returns:
            The builder, for chaining.
        """
        self.mission_config['authentication'] = {
            'required': True,
            'username': username,
            'password': password,
            'login_url': login_url,
            'selectors': {
                'username': username_selector,
                'password': password_selector,
                'submit': submit_selector
            }
        }
        return self

    def enable_content_analysis(
        self,
        summarize_content: bool = True,
        extract_entities: bool = True,
        sentiment_analysis: bool = False
    ) -> 'ScrapingMissionBuilder':
        """Record content-analysis flags in the mission output configuration.

        Args:
            summarize_content: Stored as ``output_config['summarize_content']``.
            extract_entities: Stored as ``output_config['extract_entities']``.
            sentiment_analysis: Stored as ``output_config['sentiment_analysis']``.

        Returns:
            The builder, for chaining.
        """
        self.mission_config['output_config'].update({
            'summarize_content': summarize_content,
            'extract_entities': extract_entities,
            'sentiment_analysis': sentiment_analysis
        })
        return self

    def build(self) -> Dict[str, Any]:
        """Stamp the configuration with a mission id and return a snapshot.

        Returns:
            An independent deep copy of the configuration. Further calls on
            the builder do not alter a mission that was already built, and
            mutating the returned dict does not alter the builder.
        """
        import copy

        # NOTE(review): the id has one-second resolution, so two missions
        # built within the same second receive the same id.
        self.mission_config['mission_id'] = f"mission_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # A shallow dict.copy() would keep sharing the nested 'targets',
        # 'constraints' and 'output_config' containers with the builder.
        return copy.deepcopy(self.mission_config)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
# Example usage scenarios
|
|
368
|
+
async def example_ecommerce_scraping():
    """Example: Scraping product information from e-commerce sites

    Builds a two-target mission with the builder, registers a post-processor
    that normalises prices, runs the mission and returns its results.
    """

    # Build mission using the builder pattern
    mission = (ScrapingMissionBuilder()
        .add_target(
            url="https://example-store.com/products/laptops",
            objective="Extract laptop product details including name, price, specifications, and reviews",
            custom_selectors=[
                {
                    "name": "product_name",
                    "selector": "h1.product-title",
                    "extract_type": "text"
                },
                {
                    "name": "price",
                    "selector": ".price-current",
                    "extract_type": "text"
                },
                {
                    "name": "specifications",
                    "selector": ".product-specs li",
                    "extract_type": "text",
                    "multiple": True
                }
            ]
        )
        .add_target(
            url="https://competitor-store.com/laptops",
            objective="Extract competing laptop prices for comparison"
        )
        .set_rate_limiting(requests_per_minute=20, delay_between_requests=3.0)
        .enable_content_analysis(summarize_content=True, extract_entities=True)
        .build()
    )

    # Execute the mission
    orchestrator = ScrapingOrchestrator(
        driver_type='selenium',
        headless=True
    )

    # Add custom post-processor for price comparison
    async def price_comparison_processor(results, target_config):
        """Extract and normalize price data for comparison"""
        for result in results:
            extracted = result.extracted_data
            # Guard on truthiness first: a result without extracted data
            # (None or empty) would otherwise make the `in` test raise
            # TypeError and abort post-processing for every result.
            if extracted and 'price' in extracted:
                # Add price normalization logic here
                result.metadata['normalized_price'] = extract_price_number(extracted['price'])
        return results

    orchestrator.add_post_processor(price_comparison_processor)

    # Execute mission
    mission_results = await orchestrator.execute_scraping_mission(mission)

    return mission_results
|
|
425
|
+
|
|
426
|
+
async def example_news_monitoring():
    """Example: Monitor news sites for specific topics

    Builds a single-target news mission, registers a filter that keeps only
    results mentioning AI/ML terms, and returns the executed mission.
    """
    headline_selector = {
        "name": "headlines",
        "selector": "h2.article-title a",
        "extract_type": "text",
        "multiple": True
    }
    summary_selector = {
        "name": "summaries",
        "selector": ".article-summary",
        "extract_type": "text",
        "multiple": True
    }

    # Each builder method returns the builder, so step-by-step calls are
    # equivalent to one fluent chain.
    builder = ScrapingMissionBuilder()
    builder.add_target(
        url="https://news-site.com/technology",
        objective="Extract technology news articles with headlines, summaries, and publication dates",
        custom_selectors=[headline_selector, summary_selector]
    )
    builder.set_rate_limiting(requests_per_minute=15)
    builder.enable_content_analysis(
        summarize_content=True,
        extract_entities=True,
        sentiment_analysis=True
    )
    mission = builder.build()

    orchestrator = ScrapingOrchestrator()

    # Keep only articles about AI/ML
    def ai_ml_filter(result: ScrapingResult, target_config: Dict[str, Any]) -> bool:
        """Return True when a successful result mentions an AI/ML keyword."""
        if result.success and result.extracted_data:
            # Case-insensitive substring search over the stringified data.
            haystack = str(result.extracted_data).lower()
            ai_keywords = ['artificial intelligence', 'machine learning', 'deep learning', 'neural network']
            for keyword in ai_keywords:
                if keyword in haystack:
                    return True
        return False

    orchestrator.add_result_filter(ai_ml_filter)

    news_results = await orchestrator.execute_scraping_mission(mission)
    return news_results
|
|
472
|
+
|
|
473
|
+
def extract_price_number(price_text: str) -> Optional[float]:
    """Helper function to extract numeric price from text

    Commas are treated as thousands separators and removed, then the first
    decimal number in the text is returned.

    Args:
        price_text: Text such as ``"$1,299.99"``. A plain ``int``/``float``
            is passed through as a float; any other non-string value
            (including ``None``) yields ``None``.

    Returns:
        The first number found, as a float, or ``None`` when there is none.
    """
    import re

    if isinstance(price_text, (int, float)) and not isinstance(price_text, bool):
        return float(price_text)
    if not isinstance(price_text, str):
        return None

    # Drop thousands separators so "1,299.99" is read as a single number.
    cleaned = price_text.replace(',', '')
    # The second alternative accepts a leading-decimal price such as ".99",
    # which would otherwise be read as 99.
    price_match = re.search(r'\d+(?:\.\d+)?|\.\d+', cleaned)
    return float(price_match.group()) if price_match else None
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
# Integration with existing AI-parrot infrastructure
|
|
481
|
+
async def integrate_with_knowledge_base(kb_store: KnowledgeBaseStore):
    """Example of full integration with AI-parrot knowledge base

    Args:
        kb_store: Knowledge base handed to the orchestrator, which is created
            with ``auto_store_results=True``.

    Returns:
        The configured orchestrator, with a chunk-counting post-processor
        already registered.
    """

    # Post-processor that runs scraped content through the text loader
    async def knowledge_base_processor(results, target_config):
        """Process scraped content using AI-parrot text loaders"""
        from ..loaders.text import TextLoader

        for result in results:
            # Only successful results that actually carry data are processed.
            if not (result.success and result.extracted_data):
                continue

            # Serialise the scraped data; the JSON text is the loader's source
            serialized = json.dumps(result.extracted_data, indent=2)

            # Use text loader to process and chunk content
            loader = TextLoader(
                source=serialized,
                chunk_size=800,
                chunk_overlap=100
            )
            chunks = await loader.process_documents()

            # Record the outcome on the result's metadata
            result.metadata['processed_chunks'] = len(chunks)
            result.metadata['content_processed'] = True

        return results

    orchestrator = ScrapingOrchestrator(
        knowledge_base=kb_store,
        auto_store_results=True
    )
    orchestrator.add_post_processor(knowledge_base_processor)

    return orchestrator
|