ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
parrot/loaders/web.py
ADDED
|
@@ -0,0 +1,599 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import time
|
|
3
|
+
from typing import Union, List, Optional, Tuple, Dict, Any
|
|
4
|
+
from bs4 import BeautifulSoup, NavigableString
|
|
5
|
+
from markdownify import MarkdownConverter
|
|
6
|
+
from webdriver_manager.chrome import ChromeDriverManager
|
|
7
|
+
from webdriver_manager.firefox import GeckoDriverManager
|
|
8
|
+
from selenium import webdriver
|
|
9
|
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
|
10
|
+
from selenium.webdriver.firefox.service import Service as FirefoxService
|
|
11
|
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
|
12
|
+
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
|
13
|
+
from selenium.webdriver.common.by import By
|
|
14
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
15
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
16
|
+
from navconfig.logging import logging
|
|
17
|
+
from .abstract import AbstractLoader
|
|
18
|
+
from ..stores.models import Document
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Raise the 'selenium.webdriver', 'WDM' and 'matplotlib' loggers to WARNING so
# their DEBUG/INFO records are dropped (three independent calls, one per logger).
logging.getLogger(name='selenium.webdriver').setLevel(logging.WARNING)
|
|
22
|
+
logging.getLogger(name='WDM').setLevel(logging.WARNING)
|
|
23
|
+
logging.getLogger(name='matplotlib').setLevel(logging.WARNING)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Default User-Agent header value: a desktop Chrome 122 on 64-bit Windows 10
# identity. WebDriverPool falls back to it for Chrome and WebLoader uses it as
# the default for its ``user_agent`` argument.
DEFAULT_UA = " ".join((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/122.0.0.0 Safari/537.36",
))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class WebDriverPool:
    """Async pool of reusable Selenium WebDriver instances.

    Drivers are created lazily (at most ``max_drivers`` of them), handed out
    with :meth:`get_driver`, and given back with :meth:`return_driver`.
    Every blocking Selenium call is pushed to the event loop's default
    executor so the loop itself is never blocked.

    Attributes:
        max_drivers: Upper bound on the number of drivers this pool creates.
        browser: Lower-cased browser name; ``"firefox"`` selects Firefox,
            any other value selects Chrome.
        driver_kwargs: Extra settings; the keys read by this class are
            ``user_agent`` and ``page_load_strategy``.
        pool: Queue of idle drivers ready to be reused.
        active_drivers: Every driver created by the pool and not yet
            destroyed (idle ones included).
        lock: Guards ``active_drivers``. ``asyncio.Lock`` is NOT reentrant,
            so no method may await another lock-taking method while holding it.
    """

    def __init__(self, max_drivers: int = 3, browser: str = "chrome", **driver_kwargs):
        """Store the configuration; no browser is started here.

        Args:
            max_drivers: Maximum number of drivers to create.
            browser: ``"chrome"`` (default) or ``"firefox"``, case-insensitive.
            **driver_kwargs: Optional ``user_agent`` and ``page_load_strategy``.
        """
        self.max_drivers = max_drivers
        self.browser = browser.lower()
        self.driver_kwargs = driver_kwargs
        self.pool = asyncio.Queue(maxsize=max_drivers)
        self.active_drivers = set()
        self.lock = asyncio.Lock()
        self.logger = logging.getLogger(self.__class__.__name__)

    async def get_driver(self) -> "webdriver.Remote":
        """Return an idle driver, create one if under the cap, else wait.

        Returns:
            A WebDriver instance the caller must hand back through
            :meth:`return_driver`.
        """
        try:
            # Fast path: reuse an idle driver without touching the lock.
            driver = self.pool.get_nowait()
        except asyncio.QueueEmpty:
            pass
        else:
            self.logger.debug("Reusing driver from pool")
            return driver

        async with self.lock:
            # Creation stays under the lock so concurrent callers cannot
            # overshoot max_drivers.
            if len(self.active_drivers) < self.max_drivers:
                driver = await asyncio.get_running_loop().run_in_executor(
                    None, self._create_driver
                )
                self.active_drivers.add(driver)
                self.logger.debug(f"Created new driver. Active: {len(self.active_drivers)}")
                return driver

        # At capacity: wait OUTSIDE the lock. Waiting while holding it would
        # block _destroy_driver() and every other caller until a driver is
        # returned.
        self.logger.debug("Waiting for available driver")
        return await self.pool.get()

    def _create_driver(self) -> "webdriver.Remote":
        """Build a headless Firefox or Chrome driver (blocking call).

        Returns:
            A freshly started WebDriver configured from ``driver_kwargs``.
        """
        page_load_strategy = self.driver_kwargs.get('page_load_strategy', 'normal')

        if self.browser == "firefox":
            options = FirefoxOptions()
            options.add_argument("-headless")

            # Firefox has no built-in default UA here; override only on request.
            user_agent = self.driver_kwargs.get('user_agent')
            if user_agent:
                options.set_preference("general.useragent.override", user_agent)

            # Apply the strategy on the options object, exactly like the
            # Chrome branch; the value used to be written to a capabilities
            # dict that was never passed to the driver.
            options.page_load_strategy = page_load_strategy

            service = FirefoxService(GeckoDriverManager().install())
            return webdriver.Firefox(service=service, options=options)

        # Any other browser name falls back to Chrome.
        chrome_args = [
            "--headless=new",
            "--enable-automation",
            "--lang=en",
            "--disable-extensions",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ]
        options = ChromeOptions()
        for arg in chrome_args:
            options.add_argument(arg)

        user_agent = self.driver_kwargs.get('user_agent', DEFAULT_UA)
        if user_agent:
            options.add_argument(f"user-agent={user_agent}")

        options.page_load_strategy = page_load_strategy

        service = ChromeService(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)

    async def return_driver(self, driver: "webdriver.Remote"):
        """Reset ``driver`` and put it back in the idle queue.

        If anything fails while doing so, the driver is destroyed instead of
        being reused.

        Args:
            driver: A driver previously obtained from :meth:`get_driver`.
        """
        try:
            # Clean the driver (cookies, storage, blank page) off-loop.
            await asyncio.get_running_loop().run_in_executor(
                None, self._clean_driver, driver
            )
            await self.pool.put(driver)
            self.logger.debug("Returned cleaned driver to pool")
        except Exception as e:
            self.logger.error(f"Error returning driver to pool: {e}")
            await self._destroy_driver(driver)

    def _clean_driver(self, driver: "webdriver.Remote"):
        """Best-effort reset of browser state (blocking call).

        Deletes cookies, clears local/session storage and navigates to
        ``about:blank``. Failures are logged as warnings and not re-raised.
        """
        try:
            driver.delete_all_cookies()
            driver.execute_script("window.localStorage.clear();")
            driver.execute_script("window.sessionStorage.clear();")
            driver.get("about:blank")
        except Exception as e:
            self.logger.warning(f"Error cleaning driver: {e}")

    async def _destroy_driver(self, driver: "webdriver.Remote"):
        """Quit ``driver`` and stop tracking it.

        Takes ``self.lock`` to update ``active_drivers``, so it must never be
        awaited by code that already holds the lock.
        """
        try:
            await asyncio.get_running_loop().run_in_executor(None, driver.quit)
        except Exception as e:
            self.logger.error(f"Error quitting driver: {e}")
        finally:
            async with self.lock:
                self.active_drivers.discard(driver)

    async def close_all(self):
        """Quit every idle and tracked driver.

        The lock is held only long enough to snapshot ``active_drivers``.
        ``_destroy_driver`` acquires the same non-reentrant lock, so calling
        it with the lock held would hang forever.
        """
        # Drain idle drivers without awaiting, so an empty queue can't block.
        idle = []
        while True:
            try:
                idle.append(self.pool.get_nowait())
            except asyncio.QueueEmpty:
                break

        async with self.lock:
            tracked = list(self.active_drivers)

        # An idle driver is normally tracked as well; de-duplicate by identity
        # so each browser is quit exactly once.
        doomed = list({id(d): d for d in (*idle, *tracked)}.values())
        if doomed:
            outcomes = await asyncio.gather(
                *(self._destroy_driver(d) for d in doomed),
                return_exceptions=True,
            )
            for outcome in outcomes:
                if isinstance(outcome, Exception):
                    self.logger.error(f"Error closing driver: {outcome}")

        # Each _destroy_driver() call already removed its driver from
        # active_drivers, so no blanket clear() is needed.
        self.logger.info("Closed all WebDriver instances")
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class WebLoader(AbstractLoader):
|
|
170
|
+
"""Load web pages and extract HTML + Markdown + structured bits (videos/nav/tables)."""
|
|
171
|
+
|
|
172
|
+
def __init__(
    self,
    source_type: str = 'website',
    *,
    browser: str = "chrome",
    timeout: int = 60,
    page_load_strategy: str = "normal",
    user_agent: Optional[str] = DEFAULT_UA,
    max_drivers: int = 3,
    driver_pool: Optional[WebDriverPool] = None,
    **kwargs
):
    """Configure the loader and attach (or build) a WebDriver pool.

    Args:
        source_type: Forwarded to the parent loader's constructor.
        browser: Browser name; stored lower-cased.
        timeout: Stored on the instance as ``self.timeout``.
        page_load_strategy: Stored and forwarded to a pool built here.
        user_agent: Stored and forwarded to a pool built here.
        max_drivers: Size limit for a pool built here.
        driver_pool: Existing pool to share. When falsy, a private pool is
            created and flagged as owned so ``close()`` shuts it down.
        **kwargs: Forwarded untouched to the parent loader's constructor.
    """
    super().__init__(source_type=source_type, **kwargs)

    self.timeout = timeout
    self.browser = browser.lower()
    self.page_load_strategy = page_load_strategy
    self.user_agent = user_agent
    self.max_drivers = max_drivers

    # A pool we build ourselves is ours to close; a supplied one is not.
    owns_pool = not driver_pool
    self.driver_pool = (
        WebDriverPool(
            max_drivers=max_drivers,
            browser=browser,
            page_load_strategy=page_load_strategy,
            user_agent=user_agent
        )
        if owns_pool
        else driver_pool
    )
    self._own_pool = owns_pool

    self.driver = None
|
|
206
|
+
|
|
207
|
+
async def open(self):
    """Start-up hook; only emits a debug log line.

    Per the original note it is meant to be invoked from AbstractLoader's
    ``__aenter__`` (not visible in this file). The driver pool needs no
    preparation because it creates drivers on demand.
    """
    self.logger.debug("Opening WebLoader")
|
|
212
|
+
|
|
213
|
+
async def close(self):
    """Shutdown hook: close the driver pool only if this loader owns it.

    Per the original note it is meant to be invoked from AbstractLoader's
    ``__aexit__`` (not visible in this file). A pool supplied by the caller
    is left running.
    """
    self.logger.debug("Closing WebLoader")
    if not (self._own_pool and self.driver_pool):
        # Shared (or missing) pool: its lifecycle belongs to someone else.
        return
    await self.driver_pool.close_all()
|
|
218
|
+
|
|
219
|
+
def md(self, soup: BeautifulSoup, **options) -> str:
    """Render a parsed HTML tree as Markdown.

    Args:
        soup: Parsed document (or fragment) to convert.
        **options: Passed straight to the ``MarkdownConverter`` constructor.

    Returns:
        The result of ``MarkdownConverter.convert_soup`` for ``soup``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
|
|
222
|
+
|
|
223
|
+
def _text(self, node: Any) -> str:
|
|
224
|
+
"""Extract text content from a node."""
|
|
225
|
+
if node is None:
|
|
226
|
+
return ""
|
|
227
|
+
if isinstance(node, NavigableString):
|
|
228
|
+
return str(node).strip()
|
|
229
|
+
return node.get_text(" ", strip=True)
|
|
230
|
+
|
|
231
|
+
def _collect_video_links(self, soup: BeautifulSoup) -> List[str]:
    """Gather video references found in the page.

    Looks at ``<iframe src>``, ``<video src>`` and the ``<source src>``
    children of each ``<video>``. Each hit is rendered as a labelled string
    and duplicates are dropped while keeping first-seen order.
    """
    found: List[str] = []

    # Embedded players (YouTube/Vimeo/etc.) show up as iframes.
    frame_sources = (frame.get("src") for frame in soup.find_all("iframe"))
    found.extend(f"Video Link: {frame_src}" for frame_src in frame_sources if frame_src)

    # Native <video> elements and their nested <source> alternatives.
    for player in soup.find_all("video"):
        player_src = player.get("src")
        if player_src:
            found.append(f"Video Link: {player_src}")
        child_sources = (child.get("src") for child in player.find_all("source"))
        found.extend(
            f"Video Source: {child_src}" for child_src in child_sources if child_src
        )

    # dict keys keep insertion order, so this removes repeats without reordering.
    return list(dict.fromkeys(found))
|
|
260
|
+
|
|
261
|
+
def _collect_navbars(self, soup: BeautifulSoup) -> List[str]:
    """Extract navigation menus as Markdown lists.

    Every ``<nav>`` element is rendered as a bullet list. When the page has
    no ``<nav>`` that yields content, elements matching
    ``[role='navigation']``, ``.navbar``, ``.menu`` or ``.nav`` are tried
    instead.

    Nested ``<ul>``/``<ol>`` lists are rendered once, under their parent
    item, with one extra leading space per nesting level. Only the outermost
    lists of a container are used as starting points, so a nested list is
    not emitted a second time as if it were a top-level menu.

    Returns:
        One ``"Navigation:\\n<list>"`` string per container that produced
        at least one line.
    """
    list_tags = ["ul", "ol"]
    nav_texts: List[str] = []

    def is_nested(block, root) -> bool:
        # True when another list sits between `block` and the container.
        for parent in block.parents:
            if parent is root:
                return False
            if parent.name in list_tags:
                return True
        return False

    def render_list(block, depth: int, lines: List[str]) -> None:
        # Emit one line per direct <li>, then descend into its own sub-lists.
        indent = " " * depth
        for li in block.find_all("li", recursive=False):
            a = li.find("a", href=True)
            if a:
                txt = self._text(a)
                href = a.get("href", "")
                lines.append(
                    f"{indent}- {txt} (Link: {href})" if href else f"{indent}- {txt}"
                )
            else:
                t = self._text(li)
                if t:
                    lines.append(f"{indent}- {t}")

            for sub in li.find_all(list_tags, recursive=False):
                render_list(sub, depth + 1, lines)

    def nav_to_markdown(nav) -> str:
        lines: List[str] = []
        blocks = nav.find_all(list_tags, recursive=True)
        if not blocks:
            # Fallback: no list markup, so collect the links inside the container.
            for a in nav.find_all("a", href=True):
                txt = self._text(a)
                href = a.get("href", "")
                if txt or href:
                    lines.append(f"- {txt} (Link: {href})" if href else f"- {txt}")
        else:
            for block in blocks:
                # Nested lists are reached through render_list's recursion.
                if is_nested(block, nav):
                    continue
                render_list(block, 0, lines)
        return "\n".join(lines)

    def add_container(nav) -> None:
        md_list = nav_to_markdown(nav)
        if md_list.strip():
            nav_texts.append("Navigation:\n" + md_list)

    # <nav> regions
    for nav in soup.find_all("nav"):
        add_container(nav)

    # Common menu containers if no <nav> produced anything
    if not nav_texts:
        for nav in soup.select("[role='navigation'], .navbar, .menu, .nav"):
            add_container(nav)

    return nav_texts
|
|
317
|
+
|
|
318
|
+
def _table_to_markdown(self, table) -> str:
    """Convert a <table> to GitHub-flavored Markdown.

    Headers come from the ``<th>`` cells of ``<thead>``; failing that, from
    the cells of the first ``<tr>``; failing that, synthetic ``Col N`` names
    are generated from the width of the first data row. Data rows are padded
    or truncated to the header width. A ``<caption>`` is emitted as a
    ``Table: ...`` line above the table.

    A first row that was used as the header is not repeated as a data row,
    and whitespace inside each cell (including newlines) is collapsed to
    single spaces so every table row stays on one line.

    Args:
        table: Element exposing ``find``/``find_all`` for the table markup.

    Returns:
        The Markdown text, or an empty string when nothing was extracted.
    """
    # Caption
    caption_el = table.find("caption")
    caption = self._text(caption_el) if caption_el else ""

    # Headers
    headers = []
    header_row = None  # the <tr> consumed as header, if any
    thead = table.find("thead")
    if thead:
        headers = [self._text(th) for th in thead.find_all("th")]

    # If no thead, try first row as headers
    if not headers:
        first_row = table.find("tr")
        if first_row:
            headers = [self._text(c) for c in first_row.find_all(["th", "td"])]
            if headers:
                header_row = first_row

    # Rows
    rows = []
    for tr in table.find_all("tr"):
        # Identity check: an equal-looking data row must still be kept.
        if tr is header_row:
            continue
        cells = tr.find_all(["td"])
        if not cells:
            continue
        rows.append([self._text(td) for td in cells])

    if not headers and rows:
        headers = [f"Col {i+1}" for i in range(len(rows[0]))]

    # Normalize column count
    ncol = len(headers)
    norm_rows = []
    for r in rows:
        if len(r) < ncol:
            r = r + [""] * (ncol - len(r))
        elif len(r) > ncol:
            r = r[:ncol]
        norm_rows.append(r)

    def esc(cell: str) -> str:
        # Collapse whitespace first, then escape the column separator.
        return " ".join((cell or "").split()).replace("|", "\\|")

    md = []
    if caption:
        md.append(f"Table: {caption}\n")
    if headers:
        md.append("| " + " | ".join(esc(h) for h in headers) + " |")
        md.append("| " + " | ".join("---" for _ in headers) + " |")
    for r in norm_rows:
        md.append("| " + " | ".join(esc(c) for c in r) + " |")
    return "\n".join(md).strip()
|
|
372
|
+
|
|
373
|
+
def _collect_tables(self, soup: BeautifulSoup, max_tables: int = 25) -> List[str]:
    """Extract tables as Markdown.

    Converts at most ``max_tables`` ``<table>`` elements, in document order.
    Conversion is best-effort: a table that raises during conversion is
    skipped and the failure is logged instead of being dropped silently.

    Args:
        soup: Parsed HTML tree to search.
        max_tables: Upper bound on the number of tables examined.

    Returns:
        One Markdown string per successfully converted table.
    """
    out = []
    for i, table in enumerate(soup.find_all("table")):
        if i >= max_tables:
            break
        try:
            out.append(self._table_to_markdown(table))
        except Exception as exc:
            # Keep going: one malformed table must not discard the rest.
            self.logger.warning("Skipping table %d: %s", i, exc)
            continue
    return out
|
|
384
|
+
|
|
385
|
+
def _fetch_page_sync(self, driver: webdriver, url: str, args: dict) -> str:
    """Synchronously fetch page content using WebDriver.

    Navigates to ``url``, waits up to ``self.timeout`` seconds for the
    locator to be present, optionally clicks a cookie-consent element, and
    optionally sleeps before reading the page source.

    Recognised ``args`` keys:
        locator: Selenium locator tuple to wait for
            (default ``(By.TAG_NAME, 'body')``).
        accept_cookies: Locator of a button to click once it is clickable;
            falsy (the default) disables this step.
        sleep_after: Seconds to sleep after loading (default ``0``).

    Returns:
        ``driver.page_source`` after the steps above.

    Raises:
        Exception: Any error from navigation or from waiting for the
            locator is logged and re-raised.
    """
    # Waiting / cookie handling
    locator = args.get('locator', (By.TAG_NAME, 'body'))
    wait = WebDriverWait(driver, self.timeout)
    acookies = args.get('accept_cookies', False)
    sleep_after = args.get('sleep_after', 0)

    try:
        driver.get(url)
        wait.until(EC.presence_of_element_located(locator))

        if acookies:
            try:
                btn = wait.until(EC.element_to_be_clickable(acookies))
                btn.click()
            except Exception as cookie_exc:
                # Best-effort: a failed click must not fail the page load,
                # but leave a trace for debugging.
                self.logger.debug(
                    "Cookie acceptance skipped for %s: %s", url, cookie_exc
                )
    except Exception as exc:
        self.logger.error(f"Failed to load {url}: {exc}")
        raise

    if sleep_after:
        time.sleep(float(sleep_after))

    return driver.page_source
|
|
411
|
+
|
|
412
|
+
def clean_html(
    self,
    html: str,
    tags: List[str],
    objects: Optional[List[Dict[str, Dict[str, Any]]]] = None,
    *,
    parse_videos: bool = True,
    parse_navs: bool = True,
    parse_tables: bool = True
) -> Tuple[List[str], str, str]:
    """Clean and extract content from HTML.

    The markup is parsed with ``html.parser``; ``script``, ``style``,
    ``link`` and ``noscript`` elements are removed before extraction.

    Args:
        html: Raw HTML text.
        tags: Tag names whose whitespace-normalised text becomes fragments.
        objects: Optional custom extraction rules. Each entry is a
            single-item dict mapping a tag name to its attribute filters.
            If the filters contain a ``'parse_list'`` key, its value is read
            for ``'type'`` (label, default ``'List'``) and ``'find'``
            (``[tag, attrs]``, default ``'ul'`` with no attrs) and the
            matching container is rendered as categorised bullet lists.
            Otherwise every matching element has its links inlined as
            ``text (Link: href)`` and its lists and text appended.
            The rule dicts are only read, never modified, so the same
            ``objects`` value can be reused across calls.
        parse_videos: Append video links when true.
        parse_navs: Append navigation menus when true.
        parse_tables: Append Markdown tables when true.

    Returns:
        ``(content, md_text, page_title)``: the list of text fragments, the
        Markdown rendering of the whole page, and the page title (from
        ``<title>``, else ``og:title``, else an empty string).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove script/style/link early
    for el in soup(["script", "style", "link", "noscript"]):
        el.decompose()

    # Title
    page_title = ""
    try:
        if soup.title and soup.title.string:
            page_title = soup.title.string.strip()
        if not page_title:
            og = soup.find("meta", property="og:title")
            if og and og.get("content"):
                page_title = og["content"].strip()
    except Exception:
        page_title = ""

    # Full-page Markdown
    md_text = self.md(soup)

    content: List[str] = []

    # Paragraphs/headers/sections
    for p in soup.find_all(tags):
        text = ' '.join(p.get_text(" ", strip=True).split())
        if text:
            content.append(text)

    # Videos
    if parse_videos:
        content.extend(self._collect_video_links(soup))

    # Navbars
    if parse_navs:
        content.extend(self._collect_navbars(soup))

    # Tables
    if parse_tables:
        content.extend(self._collect_tables(soup))

    # Custom objects
    for obj in objects or []:
        (element, args), = obj.items()
        if 'parse_list' in args:
            parse_list = args['parse_list']
            # Build the attribute filter without touching the caller's dict.
            selector = {k: v for k, v in args.items() if k != 'parse_list'}
            container = soup.find(element, attrs=selector)
            if not container:
                continue
            name_type = parse_list.get('type', 'List')
            params = parse_list.get('find', [])
            el = params[0] if params else 'ul'
            attrs = params[1] if len(params) > 1 else {}
            structured_text = ''
            for group in container.find_all(el, attrs=attrs):
                title_el = group.find('span', class_='title')
                title = title_el.get_text(strip=True) if title_el else ''
                lists = group.find_all('ul')
                if lists:
                    if title:
                        structured_text += f"\nCategory: {title}\n{name_type}:\n"
                    for ul in lists:
                        items = [f"- {li.get_text(strip=True)}" for li in ul.select('li')]
                        structured_text += '\n'.join(items)
                        structured_text += "\n"
            if structured_text.strip():
                content.append(structured_text.strip())
        else:
            for match in soup.find_all(element, attrs=args):
                # Inline each link's target so it survives get_text() below.
                for link in match.find_all('a'):
                    link_text = link.get_text(strip=True)
                    href = link.get('href', '')
                    formatted = f"{link_text} (Link: {href})" if href else link_text
                    link.replace_with(formatted)

                for ul in match.find_all('ul'):
                    items = [li.get_text(strip=True) for li in ul.select('li')]
                    if items:
                        content.append('\n'.join(items))

                cleaned_text = ' '.join(match.get_text().split())
                if cleaned_text:
                    content.append(cleaned_text)

    return (content, md_text, page_title)
|
|
511
|
+
|
|
512
|
+
def _normalize_url_args(self, address, kwargs):
    """Split an address specification into ``(url, args)``.

    ``address`` may be a plain URL string, in which case the result's args
    are a copy of ``kwargs``, or a single-item dict ``{url: args}``, in
    which case ``kwargs`` are merged over a copy of those args.

    Raises:
        TypeError: If ``address`` is neither a ``str`` nor a ``dict``.
    """
    if not isinstance(address, (str, dict)):
        raise TypeError(f"Unsupported address type for WebLoader: {type(address)}")

    if isinstance(address, str):
        return address, (dict(kwargs) if kwargs else {})

    # Exactly one {url: args} pair is expected; unpacking enforces that.
    ((target, page_args),) = address.items()
    merged = dict(page_args or {})
    if kwargs:
        merged.update(kwargs)
    return target, merged
|
|
527
|
+
|
|
528
|
+
async def _load(self, address: Union[str, dict], **kwargs) -> List[Document]:
    """Load a single web page.

    Borrows a driver from the pool, fetches the page in the default
    executor so the blocking WebDriver calls do not run on the event loop,
    extracts content with ``clean_html`` and wraps it in documents. The
    driver is returned to the pool whether or not loading succeeds.

    Recognised per-page args (from ``address``/``kwargs``): ``tags``,
    ``objects``, ``parse_videos``, ``parse_navs``, ``parse_tables`` and
    ``source_type``, plus the keys read by ``_fetch_page_sync``.

    Returns:
        A document with the full-page Markdown (``content_kind`` of
        ``"markdown_full"``) when it is non-empty, followed by one document
        per non-empty text fragment (``content_kind`` of ``"fragment"``).

    Raises:
        Exception: Any failure while fetching or processing is logged and
            re-raised.
    """
    url, args = self._normalize_url_args(address, kwargs)
    self.logger.info(f'Loading URL: {url}')

    # Get driver from pool
    driver = await self.driver_pool.get_driver()

    try:
        # Fetch page content in executor. We are inside a coroutine, so the
        # running loop is the one to use.
        loop = asyncio.get_running_loop()
        html_content = await loop.run_in_executor(
            None, self._fetch_page_sync, driver, url, args
        )

        # Process content
        extract_tags = args.get('tags', ['p', 'title', 'h1', 'h2', 'section', 'article'])
        objects = args.get('objects', [])
        parse_videos = args.get('parse_videos', True)
        parse_navs = args.get('parse_navs', True)
        parse_tables = args.get('parse_tables', True)
        source_type = args.get('source_type', self._source_type)

        content, md_text, page_title = self.clean_html(
            html_content,
            extract_tags,
            objects,
            parse_videos=parse_videos,
            parse_navs=parse_navs,
            parse_tables=parse_tables
        )

        # Fall back to the URL so every document has a title.
        if not page_title:
            page_title = url

        metadata = {
            "source": url,
            "url": url,
            "filename": page_title,
            "source_type": source_type,
            "type": "webpage",
            "document_meta": {
                "language": "en",
                "title": page_title,
            },
        }

        docs: List[Document] = []
        if md_text:
            docs.append(
                Document(
                    page_content=md_text,
                    metadata={**metadata, "content_kind": "markdown_full"}
                )
            )

        for chunk in content:
            if chunk and isinstance(chunk, str):
                docs.append(
                    Document(
                        page_content=chunk,
                        metadata={**metadata, "content_kind": "fragment"}
                    )
                )

        return docs

    except Exception as exc:
        self.logger.error(f"Failed to load {url}: {exc}")
        raise
    finally:
        # Return driver to pool
        await self.driver_pool.return_driver(driver)
|