ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
from typing import Any, Union, List, Dict, Literal
|
|
2
|
+
import re
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from pathlib import PurePath
|
|
5
|
+
import json
|
|
6
|
+
import fitz
|
|
7
|
+
from ..stores.models import Document
|
|
8
|
+
from .basepdf import BasePDF
|
|
9
|
+
|
|
10
|
+
# Optional dependencies for enhanced table extraction
|
|
11
|
+
try:
|
|
12
|
+
from markitdown import MarkItDown
|
|
13
|
+
import pandas as pd
|
|
14
|
+
ENHANCED_BACKENDS_AVAILABLE = True
|
|
15
|
+
except ImportError:
|
|
16
|
+
ENHANCED_BACKENDS_AVAILABLE = False
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import pandas as pd
|
|
20
|
+
PANDAS_AVAILABLE = True
|
|
21
|
+
except ImportError:
|
|
22
|
+
PANDAS_AVAILABLE = False
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PDFTablesLoader(BasePDF):
|
|
26
|
+
"""
|
|
27
|
+
Specialized loader for extracting tables from PDF files.
|
|
28
|
+
|
|
29
|
+
This loader focuses on table extraction with multiple backends:
|
|
30
|
+
1. PyMuPDF (fitz) with configurable table detection settings
|
|
31
|
+
2. MarkItDown for universal table extraction (optional)
|
|
32
|
+
|
|
33
|
+
Supports output formats:
|
|
34
|
+
- JSON (via pandas DataFrame serialization)
|
|
35
|
+
- Markdown table format
|
|
36
|
+
- Raw table data (list of lists)
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(
    self,
    source: Union[str, PurePath, List[PurePath]],
    tokenizer: Callable[..., Any] = None,
    text_splitter: Callable[..., Any] = None,
    source_type: str = 'pdf',

    # Backend selection
    table_backend: str = "auto",  # "fitz", "markitdown", "auto"

    # Output format
    output_format: Literal["json", "markdown", "raw"] = "json",

    # PyMuPDF table extraction settings
    intersection_tolerance: float = 3.0,
    vertical_strategy: str = "lines",  # "lines", "text", "explicit"
    horizontal_strategy: str = "lines",  # "lines", "text", "explicit"
    min_words_vertical: int = 3,
    min_words_horizontal: int = 1,
    keep_blank_chars: bool = False,
    snap_tolerance: float = 3.0,
    join_tolerance: float = 3.0,
    edge_min_length: float = 3.0,

    # Table filtering and processing
    min_table_rows: int = 2,
    min_table_cols: int = 2,
    min_cell_content_length: int = 1,
    skip_empty_tables: bool = True,
    merge_duplicate_headers: bool = True,

    # Content processing
    clean_whitespace: bool = True,
    strip_html: bool = True,

    **kwargs
):
    """Configure the table loader and initialise the chosen backend.

    Args:
        source: Path(s) of the PDF file(s); forwarded to the base loader.
        tokenizer: Forwarded unchanged to the base loader.
        text_splitter: Forwarded unchanged to the base loader.
        source_type: Forwarded unchanged to the base loader.
        table_backend: Requested backend ("fitz", "markitdown" or "auto");
            resolved through ``_select_backend``.
        output_format: Desired output format, stored on the instance.
        intersection_tolerance, vertical_strategy, horizontal_strategy,
        min_words_vertical, min_words_horizontal, keep_blank_chars,
        snap_tolerance, join_tolerance, edge_min_length: Collected into
            ``self.table_settings`` and later passed to PyMuPDF's
            ``find_tables``.
        min_table_rows: Minimum number of rows for a table to be kept.
        min_table_cols: Minimum number of columns every row must have.
        min_cell_content_length: Minimum stripped length for a cell to
            count as non-empty.
        skip_empty_tables: Enables the non-empty-cell ratio check.
        merge_duplicate_headers: Stored on the instance.
        clean_whitespace: Collapse whitespace runs when cleaning cells.
        strip_html: Remove HTML-like tags when cleaning cells.
        **kwargs: Extra options forwarded to the base loader.
    """
    super().__init__(
        source=source,
        tokenizer=tokenizer,
        text_splitter=text_splitter,
        source_type=source_type,
        **kwargs
    )

    # Backend resolution happens after the base initialiser has run,
    # because _select_backend may emit a warning through self.logger.
    self.table_backend = self._select_backend(table_backend)
    self.output_format = output_format

    # Keyword arguments handed to PyMuPDF when searching a page for tables.
    self.table_settings = dict(
        intersection_tolerance=intersection_tolerance,
        vertical_strategy=vertical_strategy,
        horizontal_strategy=horizontal_strategy,
        min_words_vertical=min_words_vertical,
        min_words_horizontal=min_words_horizontal,
        keep_blank_chars=keep_blank_chars,
        snap_tolerance=snap_tolerance,
        join_tolerance=join_tolerance,
        edge_min_length=edge_min_length,
    )

    # Thresholds used when deciding whether an extracted table is kept.
    self.min_table_rows, self.min_table_cols = min_table_rows, min_table_cols
    self.min_cell_content_length = min_cell_content_length
    self.skip_empty_tables = skip_empty_tables
    self.merge_duplicate_headers = merge_duplicate_headers

    # Switches controlling how individual cells are normalised.
    self.clean_whitespace, self.strip_html = clean_whitespace, strip_html

    # Instantiate backend-specific helpers last, once everything is set.
    self._setup_backend()
|
|
114
|
+
|
|
115
|
+
def _select_backend(self, preferred: str) -> str:
|
|
116
|
+
"""Select the best available backend for table extraction."""
|
|
117
|
+
if preferred == "auto":
|
|
118
|
+
if ENHANCED_BACKENDS_AVAILABLE:
|
|
119
|
+
return "markitdown"
|
|
120
|
+
else:
|
|
121
|
+
return "fitz"
|
|
122
|
+
elif preferred == "markitdown" and ENHANCED_BACKENDS_AVAILABLE:
|
|
123
|
+
return "markitdown"
|
|
124
|
+
elif preferred == "fitz":
|
|
125
|
+
return "fitz"
|
|
126
|
+
else:
|
|
127
|
+
self.logger.warning(f"Backend '{preferred}' not available, falling back to fitz")
|
|
128
|
+
return "fitz"
|
|
129
|
+
|
|
130
|
+
def _setup_backend(self):
|
|
131
|
+
"""Initialize the selected backend."""
|
|
132
|
+
if self.table_backend == "markitdown":
|
|
133
|
+
self.md_converter = MarkItDown()
|
|
134
|
+
self.logger.info("Using MarkItDown backend for table extraction")
|
|
135
|
+
else:
|
|
136
|
+
self.logger.info("Using PyMuPDF (fitz) backend for table extraction")
|
|
137
|
+
|
|
138
|
+
def _clean_cell_content(self, content: str) -> str:
|
|
139
|
+
"""Clean and normalize cell content."""
|
|
140
|
+
if not content:
|
|
141
|
+
return ""
|
|
142
|
+
|
|
143
|
+
content = str(content)
|
|
144
|
+
|
|
145
|
+
if self.clean_whitespace:
|
|
146
|
+
# Normalize whitespace
|
|
147
|
+
content = " ".join(content.split())
|
|
148
|
+
|
|
149
|
+
if self.strip_html:
|
|
150
|
+
# Basic HTML tag removal
|
|
151
|
+
import re
|
|
152
|
+
content = re.sub(r'<[^>]+>', '', content)
|
|
153
|
+
|
|
154
|
+
return content.strip()
|
|
155
|
+
|
|
156
|
+
def _is_valid_table(self, table_data: List[List[str]]) -> bool:
|
|
157
|
+
"""Check if extracted table meets minimum requirements."""
|
|
158
|
+
if not table_data:
|
|
159
|
+
return False
|
|
160
|
+
|
|
161
|
+
# Check dimensions
|
|
162
|
+
if len(table_data) < self.min_table_rows:
|
|
163
|
+
return False
|
|
164
|
+
|
|
165
|
+
if not all(len(row) >= self.min_table_cols for row in table_data):
|
|
166
|
+
return False
|
|
167
|
+
|
|
168
|
+
# Check if table has meaningful content
|
|
169
|
+
if self.skip_empty_tables:
|
|
170
|
+
non_empty_cells = 0
|
|
171
|
+
total_cells = 0
|
|
172
|
+
|
|
173
|
+
for row in table_data:
|
|
174
|
+
for cell in row:
|
|
175
|
+
total_cells += 1
|
|
176
|
+
if cell and len(str(cell).strip()) >= self.min_cell_content_length:
|
|
177
|
+
non_empty_cells += 1
|
|
178
|
+
|
|
179
|
+
# Require at least 30% non-empty cells
|
|
180
|
+
if total_cells > 0 and (non_empty_cells / total_cells) < 0.3:
|
|
181
|
+
return False
|
|
182
|
+
|
|
183
|
+
return True
|
|
184
|
+
|
|
185
|
+
def _format_table_as_json(self, table_data: List[List[str]], table_index: int) -> str:
    """Serialize one table to a JSON string.

    The first row is treated as the header and the remaining rows as
    data. When pandas is available the output holds the table shape,
    column names, one dict per row and a small summary; otherwise (or if
    the pandas conversion fails) a simpler headers/data payload is
    produced.

    Args:
        table_data: Table rows, each a list of cell values.
        table_index: Index of the table, echoed in the output.

    Returns:
        A pretty-printed JSON document (2-space indent, non-ASCII kept).
    """
    # Split header and body once; both fallbacks and the pandas path use them.
    headers = table_data[0] if table_data else []
    body_rows = table_data[1:] if table_data else []

    if not PANDAS_AVAILABLE:
        # Fallback to basic JSON without pandas
        return json.dumps({
            "table_index": table_index,
            "headers": headers,
            "data": body_rows,
            "rows": len(table_data) if table_data else 0,
            "cols": len(headers)
        }, ensure_ascii=False, indent=2)

    try:
        # Use pandas for better JSON structure
        df = pd.DataFrame(body_rows, columns=headers)

        result = {
            "table_index": table_index,
            "shape": {"rows": len(df), "cols": len(df.columns)},
            "columns": df.columns.tolist(),
            "data": df.to_dict('records'),  # List of row dictionaries
            "summary": {
                "total_cells": len(df) * len(df.columns),
                # The pandas reduction yields a numpy integer, which
                # default=str would emit as a quoted string; cast it so
                # the count stays a JSON number.
                "empty_cells": int(df.isnull().sum().sum()),
                "data_types": df.dtypes.astype(str).to_dict()
            }
        }

        return json.dumps(result, ensure_ascii=False, indent=2, default=str)

    except Exception as e:
        # Best-effort: any pandas failure degrades to the plain payload
        # and records the error text instead of aborting the extraction.
        self.logger.warning(f"Pandas JSON conversion failed: {e}, using fallback")
        return json.dumps({
            "table_index": table_index,
            "headers": headers,
            "data": body_rows,
            "error": str(e)
        }, ensure_ascii=False, indent=2)
|
|
224
|
+
|
|
225
|
+
def _format_table_as_markdown(self, table_data: List[List[str]]) -> str:
|
|
226
|
+
"""Convert table data to markdown format."""
|
|
227
|
+
if not table_data or len(table_data) < 1:
|
|
228
|
+
return ""
|
|
229
|
+
|
|
230
|
+
markdown_lines = []
|
|
231
|
+
|
|
232
|
+
# Header row
|
|
233
|
+
headers = [self._clean_cell_content(cell) for cell in table_data[0]]
|
|
234
|
+
header_row = " | ".join(headers)
|
|
235
|
+
markdown_lines.append(f"| {header_row} |")
|
|
236
|
+
|
|
237
|
+
# Separator row
|
|
238
|
+
separator = " | ".join("---" for _ in headers)
|
|
239
|
+
markdown_lines.append(f"| {separator} |")
|
|
240
|
+
|
|
241
|
+
# Data rows
|
|
242
|
+
for row in table_data[1:]:
|
|
243
|
+
cleaned_row = [self._clean_cell_content(cell) for cell in row]
|
|
244
|
+
# Ensure row has same number of columns as header
|
|
245
|
+
while len(cleaned_row) < len(headers):
|
|
246
|
+
cleaned_row.append("")
|
|
247
|
+
data_row = " | ".join(cleaned_row[:len(headers)])
|
|
248
|
+
markdown_lines.append(f"| {data_row} |")
|
|
249
|
+
|
|
250
|
+
return "\n".join(markdown_lines)
|
|
251
|
+
|
|
252
|
+
def _extract_tables_with_fitz(self, path: Union[str, PurePath]) -> List[Dict]:
    """Extract tables from every page of a PDF using PyMuPDF (``fitz``).

    Each page is scanned with ``page.find_tables`` using the values stored
    in ``self.table_settings``. Tables whose extracted data is empty or is
    rejected by ``self._is_valid_table`` are skipped. A failure on a single
    table is logged as a warning and does not abort the remaining tables;
    a failure at document/page level is logged as an error and whatever was
    collected so far is returned.

    Args:
        path: Location of the PDF file.

    Returns:
        One dict per accepted table with the keys ``page_number`` (1-based),
        ``table_index`` (position among the tables found on that page),
        ``global_table_index`` (position in the returned list), ``data``,
        ``dimensions``, ``bbox``, ``extraction_backend`` and
        ``extraction_settings``.
    """
    tables_data = []
    settings = self.table_settings

    try:
        doc = fitz.open(str(path))
        try:
            # enumerate() yields (index, page); the index is the 0-based
            # page number and the second item is the page object itself.
            for page_num, page in enumerate(doc):
                # Find tables with custom settings
                tables = page.find_tables(
                    vertical_strategy=settings["vertical_strategy"],
                    horizontal_strategy=settings["horizontal_strategy"],
                    intersection_tolerance=settings["intersection_tolerance"],
                    min_words_vertical=settings["min_words_vertical"],
                    min_words_horizontal=settings["min_words_horizontal"],
                    keep_blank_chars=settings["keep_blank_chars"],
                    snap_tolerance=settings["snap_tolerance"],
                    join_tolerance=settings["join_tolerance"],
                    edge_min_length=settings["edge_min_length"],
                )

                for table_index, table in enumerate(tables):
                    try:
                        raw_data = table.extract()

                        if not raw_data or not self._is_valid_table(raw_data):
                            continue

                        cleaned_data = [
                            [self._clean_cell_content(cell) for cell in row]
                            for row in raw_data
                        ]

                        # PyMuPDF documents Table.bbox as a plain
                        # (x0, y0, x1, y1) tuple; unpacking works for that
                        # and for Rect-like objects, attribute access
                        # (bbox.x0) only for the latter.
                        x0, y0, x1, y1 = table.bbox

                        tables_data.append({
                            "page_number": page_num + 1,
                            "table_index": table_index,
                            "global_table_index": len(tables_data),
                            "data": cleaned_data,
                            "dimensions": {
                                "rows": len(cleaned_data),
                                "cols": len(cleaned_data[0]) if cleaned_data else 0
                            },
                            "bbox": {
                                "x0": x0, "y0": y0,
                                "x1": x1, "y1": y1
                            },
                            "extraction_backend": "fitz",
                            "extraction_settings": settings.copy()
                        })

                    except Exception as e:
                        self.logger.warning(
                            f"Failed to extract table {table_index} from page {page_num + 1}: {e}"
                        )
                        continue
        finally:
            # Release the document even when a page or table raised.
            doc.close()

    except Exception as e:
        self.logger.error(f"Failed to extract tables with fitz: {e}")

    return tables_data
|
|
323
|
+
|
|
324
|
+
def _extract_tables_with_markitdown(self, path: Union[str, PurePath]) -> List[Dict]:
    """Extract tables from the markdown produced by ``self.md_converter``.

    The file is converted with ``self.md_converter.convert`` and the
    resulting ``text_content`` is scanned for runs of consecutive lines
    that start and end with ``|``. Each run of at least two lines is parsed
    into rows of stripped cells; markdown delimiter rows (every cell made of
    three or more hyphens with optional colons) are dropped. Tables rejected
    by ``self._is_valid_table`` are skipped.

    Args:
        path: Location of the file to convert.

    Returns:
        One dict per accepted table with the keys ``page_number`` (always 1,
        the markdown carries no page information), ``table_index`` (position
        among the candidate tables found in the markdown),
        ``global_table_index`` (position in the returned list), ``data``,
        ``dimensions`` and ``extraction_backend``. Empty when conversion
        yields no text or fails.
    """
    tables_data = []

    try:
        result = self.md_converter.convert(str(path))
        if not result or not result.text_content:
            return tables_data

        # Group consecutive pipe-delimited lines into candidate tables.
        # Working line by line keeps multi-column rows intact (a per-cell
        # pattern that forbids "|" inside a row only ever matches
        # single-column rows).
        candidates = []
        current = []
        for raw_line in result.text_content.split('\n'):
            line = raw_line.strip()
            if len(line) >= 2 and line.startswith('|') and line.endswith('|'):
                current.append(line)
            elif current:
                candidates.append(current)
                current = []
        if current:
            candidates.append(current)

        # A delimiter cell such as "---", ":---" or "---:".
        delimiter_cell = re.compile(r':?-{3,}:?')

        for candidate_index, lines in enumerate(candidates):
            try:
                if len(lines) < 2:  # Need at least header and separator
                    continue

                table_rows = []
                for line in lines:
                    # Drop the empty fragments before the first and after
                    # the last pipe.
                    cells = [cell.strip() for cell in line.split('|')[1:-1]]
                    if not cells:
                        continue
                    # Skip only real delimiter rows, not data rows that
                    # merely contain "---" in one of their cells.
                    if all(delimiter_cell.fullmatch(cell) for cell in cells):
                        continue
                    table_rows.append(cells)

                if not table_rows or not self._is_valid_table(table_rows):
                    continue

                tables_data.append({
                    "page_number": 1,  # MarkItDown doesn't provide page info
                    "table_index": candidate_index,
                    # Position in the returned list, so indices stay
                    # contiguous when candidates are skipped.
                    "global_table_index": len(tables_data),
                    "data": table_rows,
                    "dimensions": {
                        "rows": len(table_rows),
                        "cols": len(table_rows[0]) if table_rows else 0
                    },
                    "extraction_backend": "markitdown"
                })

            except Exception as e:
                self.logger.warning(f"Failed to parse table {candidate_index}: {e}")
                continue

    except Exception as e:
        self.logger.error(f"Failed to extract tables with markitdown: {e}")

    return tables_data
|
|
381
|
+
|
|
382
|
+
async def _load(self, path: Union[str, PurePath, List[PurePath]], **kwargs) -> List[Document]:
    """
    Load tables from PDF file.

    Tables are extracted with the markitdown backend when
    ``self.table_backend == "markitdown"`` and with the fitz backend
    otherwise. Each table is rendered according to ``self.output_format``
    (``"json"``, ``"markdown"``, anything else is dumped as raw JSON rows)
    and wrapped in a ``Document`` together with table and PDF metadata.

    Args:
        path: Path to the PDF file
        **kwargs: Accepted but not used by this method.

    Returns:
        List of Document objects, one per extracted table (empty when no
        table was found)
    """
    self.logger.info(f"Extracting tables from PDF: {path}")
    docs = []

    # Extract tables using selected backend
    if self.table_backend == "markitdown":
        tables_data = self._extract_tables_with_markitdown(path)
    else:
        tables_data = self._extract_tables_with_fitz(path)

    if not tables_data:
        self.logger.warning(f"No tables found in {path}")
        return docs

    self.logger.info(f"Found {len(tables_data)} tables in {path}")

    # Extract PDF metadata
    pdf_metadata = {}
    try:
        pdf = fitz.open(str(path))
        try:
            # PyMuPDF documents ``metadata`` as a dict or None; fall back to
            # an empty dict so the .get() lookups below cannot fail.
            pdf_metadata = pdf.metadata or {}  # pylint: disable=E1101 # noqa: E1101
        finally:
            # Close the handle even if reading the metadata raised.
            pdf.close()
    except Exception as e:
        self.logger.warning(f"Could not extract PDF metadata: {e}")
        pdf_metadata = {}

    # These values are the same for every table of the file.
    filename = path.name if hasattr(path, 'name') else str(path).split('/')[-1]
    source = str(path)

    # Create documents for each table
    for table_info in tables_data:
        table_data = table_info["data"]

        # Format table content based on output format
        if self.output_format == "json":
            content = self._format_table_as_json(table_data, table_info["global_table_index"])
            content_type = "application/json"
        elif self.output_format == "markdown":
            content = self._format_table_as_markdown(table_data)
            content_type = "text/markdown"
        else:  # raw
            content = json.dumps(table_data, ensure_ascii=False, indent=2)
            content_type = "application/json"

        # Create metadata
        metadata = {
            "filename": filename,
            "source": source,
            "type": "pdf_table",
            "category": self.category,
            "source_type": self._source_type,
            "content_type": content_type,
            "output_format": self.output_format,

            # Table-specific metadata
            "table_info": {
                "page_number": table_info["page_number"],
                "table_index": table_info["table_index"],
                "global_table_index": table_info["global_table_index"],
                "dimensions": table_info["dimensions"],
                "extraction_backend": table_info["extraction_backend"]
            },

            # PDF metadata
            "document_meta": {
                "title": pdf_metadata.get("title", ""),
                "author": pdf_metadata.get("author", ""),
                "creationDate": pdf_metadata.get("creationDate", ""),
            }
        }

        # Add backend-specific metadata (only present for some backends)
        if "bbox" in table_info:
            metadata["table_info"]["bbox"] = table_info["bbox"]
        if "extraction_settings" in table_info:
            metadata["table_info"]["extraction_settings"] = table_info["extraction_settings"]

        docs.append(
            Document(
                page_content=content,
                metadata=metadata
            )
        )

    return docs
|
|
472
|
+
|
|
473
|
+
def get_table_settings(self) -> Dict[str, Any]:
    """Return a snapshot of the loader's table extraction configuration.

    Returns:
        A dict with the keys ``backend``, ``output_format``,
        ``table_settings`` (a shallow copy of ``self.table_settings``) and
        ``filtering`` (the row/column/cell thresholds and the
        ``skip_empty_tables`` flag).
    """
    filtering = {
        "min_table_rows": self.min_table_rows,
        "min_table_cols": self.min_table_cols,
        "min_cell_content_length": self.min_cell_content_length,
        "skip_empty_tables": self.skip_empty_tables,
    }
    snapshot = {
        "backend": self.table_backend,
        "output_format": self.output_format,
        # Copied so callers cannot mutate the live settings through it.
        "table_settings": self.table_settings.copy(),
    }
    snapshot["filtering"] = filtering
    return snapshot
|
|
486
|
+
|
|
487
|
+
def update_table_settings(self, **settings):
    """Apply keyword overrides to the loader configuration.

    For each keyword: if it is already a key of ``self.table_settings`` the
    entry is replaced; otherwise, if the loader has an attribute of that
    name, the attribute is reassigned; otherwise a warning is logged and
    the keyword is ignored.

    Args:
        **settings: Setting names mapped to their new values.
    """
    for name in settings:
        new_value = settings[name]

        if name in self.table_settings:
            self.table_settings[name] = new_value
            self.logger.info(f"Updated table setting {name} = {new_value}")
            continue

        if not hasattr(self, name):
            self.logger.warning(f"Unknown setting: {name}")
            continue

        setattr(self, name, new_value)
        self.logger.info(f"Updated loader setting {name} = {new_value}")
|
|
498
|
+
|
|
499
|
+
def get_supported_backends(self) -> List[str]:
    """List the table extraction backends this loader reports as usable.

    Returns:
        ``["fitz"]``, followed by ``"markitdown"`` when the module-level
        ``ENHANCED_BACKENDS_AVAILABLE`` flag is truthy.
    """
    # "fitz" is listed unconditionally; "markitdown" depends on the flag.
    optional = ["markitdown"] if ENHANCED_BACKENDS_AVAILABLE else []
    return ["fitz", *optional]
|