ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
parrot/tools/company_info/tool.py
@@ -0,0 +1,1138 @@
"""
CompanyInfoToolkit - Unified toolkit for scraping company information from multiple sources.

This toolkit extends AbstractToolkit and provides methods to scrape company data from:
- explorium.ai
- leadiq.com
- rocketreach.co
- siccode.com
- zoominfo.com

Each public async method becomes a tool that:
1. Performs a Google site search for the company
2. Fetches the first result using Selenium
3. Parses the page with BeautifulSoup
4. Extracts company information
5. Returns structured data (CompanyInfo model or JSON)

Dependencies:
- selenium
- beautifulsoup4
- pydantic
- google-api-python-client
- aiohttp

Example usage:
    toolkit = CompanyInfoToolkit(
        google_api_key="your-api-key",
        google_cse_id="your-cse-id",
        use_proxy=False,
        headless=True
    )

    # Get all tools
    tools = toolkit.get_tools()

    # Or use methods directly
    result = await toolkit.scrape_zoominfo("PetSmart")
    print(result.company_name)
"""
from __future__ import annotations

import asyncio
import json
import re
import time
from typing import Dict, List, Any, Optional, Union
from urllib.parse import urljoin
from bs4 import BeautifulSoup as bs
from pydantic import BaseModel, Field, model_validator
from googleapiclient.discovery import build
from navconfig import config
from navconfig.logging import logging
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.common.exceptions import (
        TimeoutException,
        NoSuchElementException,
        WebDriverException
    )
except ImportError as e:
    raise ImportError("Please install selenium: pip install selenium") from e

from ..toolkit import AbstractToolkit
from ..decorators import tool_schema
from ..scraping.driver import SeleniumSetup


# ===========================
# Pydantic Models
# ===========================

class CompanyInput(BaseModel):
    """Input model for company scraping tools."""
    company_name: str = Field(..., description="Name of the company to search for")
    return_json: bool = Field(
        False,
        description="If True, return JSON string instead of CompanyInfo object"
    )

class CompanyInfo(BaseModel):
    """
    Structured output model for company information.
    Homogenized across all scraping platforms.
    """
    # Search metadata
    search_term: Optional[str] = Field(None, description="Search term used")
    search_url: Optional[str] = Field(None, description="URL of the scraped page")
    source_platform: Optional[str] = Field(None, description="Source platform (e.g., zoominfo, leadiq)")
    scrape_status: str = Field("pending", description="Status: pending, success, no_data, error")

    # Company basic info
    company_name: Optional[str] = Field(None, description="Company name")
    logo_url: Optional[str] = Field(None, description="Company logo URL")
    company_description: Optional[str] = Field(None, description="Company description")

    # Location info
    headquarters: Optional[str] = Field(None, description="Headquarters address")
    address: Optional[str] = Field(None, description="Street address")
    city: Optional[str] = Field(None, description="City")
    state: Optional[str] = Field(None, description="State/Province")
    zip_code: Optional[str] = Field(None, description="ZIP/Postal code")
    country: Optional[str] = Field(None, description="Country")
    metro_area: Optional[str] = Field(None, description="Metro area")

    # Contact info
    phone_number: Optional[str] = Field(None, description="Phone number")
    website: Optional[str] = Field(None, description="Company website")

    # Business classification
    industry: Optional[Union[str, List[str]]] = Field(None, description="Industry")
    industry_category: Optional[str] = Field(None, description="Industry category")
    category: Optional[str] = Field(None, description="Business category")
    keywords: Optional[List[str]] = Field(None, description="Business keywords")
    naics_code: Optional[str] = Field(None, description="NAICS code(s)")
    sic_code: Optional[str] = Field(None, description="SIC code(s)")

    # Financial & size info
    stock_symbol: Optional[str] = Field(None, description="Stock ticker symbol")
    revenue_range: Optional[str] = Field(None, description="Revenue range")
    employee_count: Optional[str] = Field(None, description="Number of employees")
    number_employees: Optional[str] = Field(None, description="Employee count description")
    company_size: Optional[str] = Field(None, description="Company size category")
    founded: Optional[str] = Field(None, description="Year founded")
    funding: Optional[str] = Field(None, description="Funding information")
    years_in_business: Optional[str] = Field(None, description="Years in business")

    # Additional info
    executives: Optional[List[Dict[str, str]]] = Field(None, description="Executive team")
    similar_companies: Optional[Union[str, List[Dict]]] = Field(None, description="Similar companies")
    social_media: Optional[Dict[str, str]] = Field(None, description="Social media links")

    # Metadata
    timestamp: Optional[str] = Field(None, description="Scrape timestamp")
    error_message: Optional[str] = Field(None, description="Error message if any")

    def to_json(self, **kwargs) -> str:
        """Convert to JSON string."""
        return self.model_dump_json(exclude_none=True, **kwargs)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "CompanyInfo":
        """Create from dictionary."""
        return cls(**data)


class GoogleSearchResult(BaseModel):
    """Result from Google site search."""
    query: str = Field(description="Search query used")
    site: str = Field(description="Site searched")
    url: Optional[str] = Field(None, description="First result URL")
    title: Optional[str] = Field(None, description="Result title")
    snippet: Optional[str] = Field(None, description="Result snippet")
    total_results: int = Field(0, description="Total results found")

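# Example (illustrative, not part of the package source): because to_json()
# calls model_dump_json(exclude_none=True), a sparsely populated CompanyInfo
# serializes compactly, e.g.
#   CompanyInfo(company_name="PetSmart", scrape_status="success").to_json()
#   -> '{"scrape_status":"success","company_name":"PetSmart"}'
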
# ===========================
# Main Toolkit Class
# ===========================

class CompanyInfoToolkit(AbstractToolkit):
    """
    Toolkit for scraping company information from multiple platforms.

    Each public async method is automatically converted to a tool by AbstractToolkit.
    Methods perform:
    1. Google site search for company
    2. Selenium page fetch
    3. BeautifulSoup parsing
    4. Structured data extraction
    """

    def __init__(
        self,
        google_api_key: Optional[str] = None,
        google_cse_id: Optional[str] = None,
        browser: str = 'chrome',
        headless: bool = True,
        timeout: int = 30,
        auto_install: bool = True,
        mobile: bool = False,
        mobile_device: Optional[str] = None,
        use_undetected: bool = False,
        **kwargs
    ):
        """
        Initialize the CompanyInfoToolkit.

        Args:
            google_api_key: Google Custom Search API key
            google_cse_id: Google Custom Search Engine ID
            browser: Browser type ('chrome', 'firefox', 'edge', 'safari', 'undetected')
            headless: Run browser in headless mode
            timeout: Default timeout for page loads (seconds)
            auto_install: Auto-install webdriver if not found
            mobile: Enable mobile emulation (Chrome only)
            mobile_device: Specific mobile device to emulate
            use_undetected: Use undetected-chromedriver (requires package)
            **kwargs: Additional arguments passed to AbstractToolkit and SeleniumSetup
        """
        super().__init__(**kwargs)

        # Google Search configuration
        self.google_api_key = google_api_key or config.get('GOOGLE_SEARCH_API_KEY')
        self.google_cse_id = google_cse_id or config.get('GOOGLE_SEARCH_ENGINE_ID')
        # Service Selection:
        self.service = build("customsearch", "v1", developerKey=self.google_api_key)

        # Browser configuration for SeleniumSetup
        self.browser_config = {
            'browser': 'undetected' if use_undetected else browser,
            'headless': headless,
            'auto_install': auto_install,
            'mobile': mobile,
            'mobile_device': mobile_device,
            'timeout': timeout,
            **kwargs  # Pass through any additional kwargs
        }
        # Selenium setup instance and driver
        self._selenium_setup: Optional[SeleniumSetup] = None

        # Current driver instance
        self._driver = None

        # Logger
        self.logger = logging.getLogger(self.__class__.__name__)

    # ===========================
    # Core Utility Methods
    # ===========================
    async def _get_driver(self) -> webdriver.Chrome:
        """Get or create Selenium WebDriver instance using SeleniumSetup."""
        if self._driver is None:
            if SeleniumSetup is None:
                raise ImportError(
                    "SeleniumSetup not available. Please ensure parrot.tools.scraping.driver is installed."
                )

            self.logger.info("Initializing Selenium WebDriver...")

            # Create SeleniumSetup instance
            self._selenium_setup = SeleniumSetup(**self.browser_config)

            # Get driver using SeleniumSetup's async method
            self._driver = await self._selenium_setup.get_driver()

            self.logger.info("Selenium WebDriver initialized successfully")

        return self._driver

    async def _close_driver(self):
        """Close the Selenium driver if open."""
        if self._driver is not None:
            try:
                loop = asyncio.get_running_loop()
                await loop.run_in_executor(None, self._driver.quit)
                self.logger.info("Selenium WebDriver closed")
            except Exception as e:
                self.logger.warning(f"Error closing driver: {e}")
            finally:
                self._driver = None
                self._selenium_setup = None

    async def _google_site_search(
        self,
        company_name: str,
        site: str,
        additional_terms: str = "",
        max_results: int = 5
    ) -> GoogleSearchResult:
        """
        Perform Google site search for a company.

        Args:
            company_name: Company name to search for
            site: Site domain to search within (e.g., "zoominfo.com")
            additional_terms: Additional search terms (e.g., "Overview")
            max_results: Maximum number of results

        Returns:
            GoogleSearchResult with first result URL
        """
        # Build search query
        query = f"{company_name} {additional_terms}".strip()
        search_query = f"site:{site} {query}"

        self.logger.info(f"Google search: {search_query}")

        try:
            # Execute search
            loop = asyncio.get_running_loop()
            res = await loop.run_in_executor(
                None,
                lambda: self.service.cse().list(  # pylint: disable=E1101  # noqa
                    q=search_query,
                    cx=self.google_cse_id,
                    num=max_results
                ).execute()
            )

            items = res.get('items', [])

            if not items:
                self.logger.warning(
                    f"No results found for: {search_query}"
                )
                return GoogleSearchResult(
                    query=query,
                    site=site,
                    total_results=0
                )

            # Return first result
            first = items[0]
            return GoogleSearchResult(
                query=query,
                site=site,
                url=first['link'],
                title=first.get('title'),
                snippet=first.get('snippet'),
                total_results=len(items)
            )

        except Exception as e:
            self.logger.error(f"Google search error: {e}")
            return GoogleSearchResult(
                query=query,
                site=site,
                total_results=0
            )

    async def _fetch_page_with_selenium(self, url: str) -> Optional[bs]:
        """
        Fetch a page using Selenium and return BeautifulSoup object.

        Args:
            url: URL to fetch

        Returns:
            BeautifulSoup object or None if failed
        """
        driver = await self._get_driver()

        try:
            self.logger.info(f"Fetching URL: {url}")

            # Navigate to URL
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, driver.get, url)

            # Wait for page to load
            await asyncio.sleep(2)

            # Get page source
            page_source = await loop.run_in_executor(
                None,
                lambda: driver.page_source
            )
            # Parse with BeautifulSoup
            return bs(page_source, 'html.parser')

        except TimeoutException:
            self.logger.error(f"Timeout fetching: {url}")
            return None
        except Exception as e:
            self.logger.error(f"Error fetching page: {e}")
            return None

    def _parse_address(self, address_text: str) -> Dict[str, Optional[str]]:
        """
        Parse an address string into components.

        Args:
            address_text: Full address string

        Returns:
            Dictionary with address, city, state, zip_code, country
        """
        result = {
            'address': address_text,
            'city': None,
            'state': None,
            'zip_code': None,
            'country': None
        }

        # Simple parsing logic - can be enhanced
        parts = [p.strip() for p in address_text.split(',')]

        if len(parts) >= 2:
            result['city'] = parts[0]
            result['country'] = parts[-1]

        if len(parts) >= 3:
            # Try to extract state and zip
            state_zip = parts[-2].strip()
            if match := re.search(r'([A-Z]{2})\s+(\d{5}(?:-\d{4})?)', state_zip):
                result['state'] = match[1]
                result['zip_code'] = match[2]

        return result

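    # Example (illustrative, not part of the package source), assuming a
    # comma-separated US-style address:
    #   _parse_address("Houston, TX 77002, USA")
    #   -> {'address': 'Houston, TX 77002, USA', 'city': 'Houston',
    #       'state': 'TX', 'zip_code': '77002', 'country': 'USA'}
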
    def _standardize_name(self, name: str) -> str:
        """Standardize company name for searching."""
        # Remove common suffixes
        suffixes = [
            'Inc.', 'Inc', 'LLC', 'Ltd.', 'Ltd', 'Corporation',
            'Corp.', 'Corp', 'Company', 'Co.', 'Co'
        ]

        cleaned = name
        for suffix in suffixes:
            cleaned = re.sub(
                rf'\b{re.escape(suffix)}\b',
                '',
                cleaned,
                flags=re.IGNORECASE
            )

        return cleaned.strip()

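    # Example (illustrative, not part of the package source): only whole-word
    # suffixes are stripped, so
    #   _standardize_name("PetSmart Inc") -> "PetSmart"
    # while "PetSmart Inc." becomes "PetSmart ." with a stray period, because
    # the pattern for 'Inc.' (r'\bInc\.\b') only matches when a word character
    # follows the period, so the bare 'Inc' pattern fires instead.
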
|
|
425
|
+
# ===========================
|
|
426
|
+
# Platform-Specific Methods (Tools)
|
|
427
|
+
# ===========================
|
|
428
|
+
|
|
429
|
+
@tool_schema(CompanyInput)
|
|
430
|
+
async def scrape_zoominfo(
|
|
431
|
+
self,
|
|
432
|
+
company_name: str,
|
|
433
|
+
return_json: bool = False
|
|
434
|
+
) -> Union[CompanyInfo, str]:
|
|
435
|
+
"""
|
|
436
|
+
Scrape company information from ZoomInfo.
|
|
437
|
+
|
|
438
|
+
Args:
|
|
439
|
+
company_name: Name of the company to search for
|
|
440
|
+
return_json: If True, return JSON string instead of CompanyInfo object
|
|
441
|
+
|
|
442
|
+
Returns:
|
|
443
|
+
CompanyInfo object or JSON string with company data
|
|
444
|
+
"""
|
|
445
|
+
site = "zoominfo.com"
|
|
446
|
+
search_term = f"site:zoominfo.com {company_name} Overview"
|
|
447
|
+
|
|
448
|
+
# Initialize result
|
|
449
|
+
result = CompanyInfo(
|
|
450
|
+
search_term=search_term,
|
|
451
|
+
source_platform='zoominfo',
|
|
452
|
+
scrape_status='pending',
|
|
453
|
+
timestamp=str(time.time())
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
try:
|
|
457
|
+
# 1. Google site search
|
|
458
|
+
search_result = await self._google_site_search(
|
|
459
|
+
company_name=company_name,
|
|
460
|
+
site=site,
|
|
461
|
+
additional_terms="Overview"
|
|
462
|
+
)
|
|
463
|
+
|
|
464
|
+
if not search_result.url:
|
|
465
|
+
result.scrape_status = 'no_data'
|
|
466
|
+
result.error_message = 'No search results found'
|
|
467
|
+
return result.to_json() if return_json else result
|
|
468
|
+
|
|
469
|
+
result.search_url = search_result.url
|
|
470
|
+
|
|
471
|
+
# 2. Fetch page with Selenium
|
|
472
|
+
document = await self._fetch_page_with_selenium(search_result.url)
|
|
473
|
+
|
|
474
|
+
if not document:
|
|
475
|
+
result.scrape_status = 'error'
|
|
476
|
+
result.error_message = 'Failed to fetch page'
|
|
477
|
+
return result.to_json() if return_json else result
|
|
478
|
+
|
|
479
|
+
# 3. Parse company information
|
|
480
|
+
# Company name
|
|
481
|
+
if company_header := document.select_one("h2#company-description-text-header"):
|
|
482
|
+
result.company_name = company_header.text.strip()
|
|
483
|
+
|
|
484
|
+
# Headquarters
|
|
485
|
+
if hq_elem := document.select_one(".icon-label:-soup-contains('Headquarters') + .content"):
|
|
486
|
+
result.headquarters = hq_elem.text.strip()
|
|
487
|
+
|
|
488
|
+
# Phone
|
|
489
|
+
if phone_elem := document.select_one(".icon-label:-soup-contains('Phone Number') + .content"):
|
|
490
|
+
result.phone_number = phone_elem.text.strip()
|
|
491
|
+
|
|
492
|
+
# Website
|
|
493
|
+
if website_elem := document.select_one(".icon-label:-soup-contains('Website') + a"):
|
|
494
|
+
result.website = website_elem.get('href')
|
|
495
|
+
|
|
496
|
+
# Revenue
|
|
497
|
+
if revenue_elem := document.select_one(".icon-label:-soup-contains('Revenue') + .content"):
|
|
498
|
+
result.revenue_range = revenue_elem.text.strip()
|
|
499
|
+
|
|
500
|
+
# Stock symbol
|
|
501
|
+
if stock_elem := document.select_one(".icon-label:-soup-contains('Stock Symbol') + .content"):
|
|
502
|
+
result.stock_symbol = stock_elem.text.strip()
|
|
503
|
+
|
|
504
|
+
# Industry
|
|
505
|
+
if industry_elems := document.select("#company-chips-wrapper a"):
|
|
506
|
+
result.industry = [i.text.strip() for i in industry_elems]
|
|
507
|
+
|
|
508
|
+
# Description
|
|
509
|
+
if desc_elem := document.select_one("#company-description-text-content .company-desc"):
|
|
510
|
+
result.company_description = desc_elem.text.strip()
|
|
511
|
+
|
|
512
|
+
# NAICS and SIC codes
|
|
513
|
+
codes_section = document.select("#codes-wrapper .codes-content")
|
|
514
|
+
for code in codes_section:
|
|
515
|
+
text = code.text.strip()
|
|
516
|
+
if "NAICS Code" in text:
|
|
517
|
+
result.naics_code = text.replace("NAICS Code", "").strip()
|
|
518
|
+
elif "SIC Code" in text:
|
|
519
|
+
result.sic_code = text.replace("SIC Code", "").strip()
|
|
520
|
+
|
|
521
|
+
# Executives
|
|
522
|
+
exec_elems = document.select(".org-chart .person-right-content")
|
|
523
|
+
executives = []
|
|
524
|
+
for exec_elem in exec_elems:
|
|
525
|
+
if name_elem := exec_elem.select_one(".person-name"):
|
|
526
|
+
executives.append({
|
|
527
|
+
"name": name_elem.text.strip(),
|
|
528
|
+
"title": exec_elem.select_one(".job-title").text.strip() if exec_elem.select_one(".job-title") else "",
|
|
529
|
+
"profile_link": name_elem.get('href', '')
|
|
530
|
+
})
|
|
531
|
+
if executives:
|
|
532
|
+
result.executives = executives
|
|
533
|
+
|
|
534
|
+
# Check if we found meaningful data
|
|
535
|
+
has_data = any([
|
|
536
|
+
result.company_name,
|
|
537
|
+
result.headquarters,
|
|
538
|
+
result.phone_number,
|
|
539
|
+
result.website,
|
|
540
|
+
result.revenue_range
|
|
541
|
+
])
|
|
542
|
+
|
|
543
|
+
result.scrape_status = 'success' if has_data else 'no_data'
|
|
544
|
+
|
|
545
|
+
except Exception as e:
|
|
546
|
+
self.logger.error(f"Error scraping ZoomInfo: {e}")
|
|
547
|
+
result.scrape_status = 'error'
|
|
548
|
+
result.error_message = str(e)[:100]
|
|
549
|
+
finally:
|
|
550
|
+
await self._close_driver()
|
|
551
|
+
|
|
552
|
+
return result.to_json() if return_json else result
|
|
553
|
+
|
|
554
|
+
@tool_schema(CompanyInput)
|
|
555
|
+
async def scrape_explorium(
|
|
556
|
+
self,
|
|
557
|
+
company_name: str,
|
|
558
|
+
return_json: bool = False
|
|
559
|
+
) -> Union[CompanyInfo, str]:
|
|
560
|
+
"""
|
|
561
|
+
Scrape company information from Explorium.ai.
|
|
562
|
+
|
|
563
|
+
Args:
|
|
564
|
+
company_name: Name of the company to search for
|
|
565
|
+
return_json: If True, return JSON string instead of CompanyInfo object
|
|
566
|
+
|
|
567
|
+
Returns:
|
|
568
|
+
CompanyInfo object or JSON string with company data
|
|
569
|
+
"""
|
|
570
|
+
site = "explorium.ai"
|
|
571
|
+
search_term = f"site:explorium.ai {company_name}"
|
|
572
|
+
|
|
573
|
+
result = CompanyInfo(
|
|
574
|
+
search_term=search_term,
|
|
575
|
+
source_platform='explorium',
|
|
576
|
+
scrape_status='pending',
|
|
577
|
+
timestamp=str(time.time())
|
|
578
|
+
)
|
|
579
|
+
|
|
580
|
+
try:
|
|
581
|
+
# Google site search
|
|
582
|
+
search_result = await self._google_site_search(
|
|
583
|
+
company_name=company_name,
|
|
584
|
+
site=site,
|
|
585
|
+
additional_terms="overview - services"
|
|
586
|
+
)
|
|
587
|
+
|
|
588
|
+
if not search_result.url:
|
|
589
|
+
result.scrape_status = 'no_data'
|
|
590
|
+
result.error_message = 'No search results found'
|
|
591
|
+
return result.to_json() if return_json else result
|
|
592
|
+
|
|
593
|
+
result.search_url = search_result.url
|
|
594
|
+
|
|
595
|
+
# Fetch page
|
|
596
|
+
document = await self._fetch_page_with_selenium(search_result.url)
|
|
597
|
+
|
|
598
|
+
if not document:
|
|
599
|
+
result.scrape_status = 'error'
|
|
600
|
+
result.error_message = 'Failed to fetch page'
|
|
601
|
+
return result.to_json() if return_json else result
|
|
602
|
+
|
|
603
|
+
# Parse data
|
|
604
|
+
# Company name from header
|
|
605
|
+
name_elem = document.find('h1', {'data-id': 'txt-company-name'})
|
|
606
|
+
if name_elem:
|
|
607
|
+
result.company_name = name_elem.text.strip()
|
|
608
|
+
|
|
609
|
+
# Address
|
|
610
|
+
if address_section := document.find('div', {'data-id': 'info-address'}):
|
|
611
|
+
if address_elem := address_section.find('p', {'aria-label': True}):
|
|
612
|
+
address_text = address_elem.get('aria-label', '').strip()
|
|
613
|
+
result.headquarters = address_text
|
|
614
|
+
|
|
615
|
+
# Extract country
|
|
616
|
+
country = address_text.split(',')[-1].strip()
|
|
617
|
+
result.country = country or None
|
|
618
|
+
|
|
619
|
+
# Company description
|
|
620
|
+
desc_elem = document.find('p', {'class': 'ExpTypography-root ExpTypography-body1'})
|
|
621
|
+
if desc_elem and name_elem:
|
|
622
|
+
result.company_description = f"{name_elem.text.strip()}: {desc_elem.text.strip()}"
|
|
623
|
+
|
|
624
|
+
# Logo
|
|
625
|
+
if logo_elem := document.find('img', {'alt': True, 'src': True}):
|
|
626
|
+
result.logo_url = logo_elem['src']
|
|
627
|
+
|
|
628
|
+
# NAICS codes
|
|
629
|
+
if naics_section := document.find('div', {'data-id': 'company-stat-naics'}):
|
|
630
|
+
naics_entries = naics_section.find_all('p', {'class': 'ExpTypography-root'})
|
|
631
|
+
naics_codes = []
|
|
632
|
+
industries = []
|
|
633
|
+
for entry in naics_entries:
|
|
634
|
+
code = entry.text.strip().strip(',')
|
|
635
|
+
industry_desc = entry.get('aria-label', '').strip()
|
|
636
|
+
if code:
|
|
637
|
+
naics_codes.append(code)
|
|
638
|
+
if industry_desc:
|
|
639
|
+
industries.append(industry_desc)
|
|
640
|
+
|
|
641
|
+
if naics_codes:
|
|
642
|
+
result.naics_code = ', '.join(naics_codes)
|
|
643
|
+
if industries:
|
|
644
|
+
result.industry = ', '.join(industries)
|
|
645
|
+
|
|
646
|
+
# SIC codes
|
|
647
|
+
if sic_section := document.find('div', {'data-id': 'company-stat-sic'}):
|
|
648
|
+
sic_entries = sic_section.find_all('p', {'class': 'ExpTypography-root'})
|
|
649
|
+
sic_codes = []
|
|
650
|
+
for entry in sic_entries:
|
|
651
|
+
if code := entry.text.strip().strip(','):
|
|
652
|
+
sic_codes.append(code)
|
|
653
|
+
|
|
654
|
+
if sic_codes:
|
|
655
|
+
result.sic_code = ', '.join(sic_codes)
|
|
656
|
+
|
|
657
|
+
# Check for data
|
|
658
|
+
has_data = any([
|
|
659
|
+
result.company_name,
|
|
660
|
+
result.headquarters,
|
|
661
|
+
result.naics_code,
|
|
662
|
+
result.sic_code
|
|
663
|
+
])
|
|
664
|
+
|
|
665
|
+
result.scrape_status = 'success' if has_data else 'no_data'
|
|
666
|
+
|
|
667
|
+
except Exception as e:
|
|
668
|
+
self.logger.error(f"Error scraping Explorium: {e}")
|
|
669
|
+
result.scrape_status = 'error'
|
|
670
|
+
result.error_message = str(e)[:100]
|
|
671
|
+
finally:
|
|
672
|
+
await self._close_driver()
|
|
673
|
+
|
|
674
|
+
return result.to_json() if return_json else result
|
|
675
|
+
|
|
676
|
+
@tool_schema(CompanyInput)
|
|
677
|
+
async def scrape_leadiq(
|
|
678
|
+
self,
|
|
679
|
+
company_name: str,
|
|
680
|
+
return_json: bool = False
|
|
681
|
+
) -> Union[CompanyInfo, str]:
|
|
682
|
+
"""
|
|
683
|
+
Scrape company information from LeadIQ.
|
|
684
|
+
|
|
685
|
+
Args:
|
|
686
|
+
company_name: Name of the company to search for
|
|
687
|
+
return_json: If True, return JSON string instead of CompanyInfo object
|
|
688
|
+
|
|
689
|
+
Returns:
|
|
690
|
+
CompanyInfo object or JSON string with company data
|
|
691
|
+
"""
|
|
692
|
+
site = "leadiq.com"
|
|
693
|
+
standardized_name = self._standardize_name(company_name)
|
|
694
|
+
search_term = f"site:leadiq.com {standardized_name}"
|
|
695
|
+
|
|
696
|
+
result = CompanyInfo(
|
|
697
|
+
search_term=search_term,
|
|
698
|
+
source_platform='leadiq',
|
|
699
|
+
scrape_status='pending',
|
|
700
|
+
timestamp=str(time.time())
|
|
701
|
+
)
|
|
702
|
+
|
|
703
|
+
try:
|
|
704
|
+
# Google site search
|
|
705
|
+
search_result = await self._google_site_search(
|
|
706
|
+
company_name=standardized_name,
|
|
707
|
+
site=site,
|
|
708
|
+
additional_terms="Company Overview"
|
|
709
|
+
)
|
|
710
|
+
|
|
711
|
+
if not search_result.url:
|
|
712
|
+
result.scrape_status = 'no_data'
|
|
713
|
+
result.error_message = 'No search results found'
|
|
714
|
+
return result.to_json() if return_json else result
|
|
715
|
+
|
|
716
|
+
result.search_url = search_result.url
|
|
717
|
+
|
|
718
|
+
# Fetch page
|
|
719
|
+
document = await self._fetch_page_with_selenium(search_result.url)
|
|
720
|
+
|
|
721
|
+
if not document:
|
|
722
|
+
result.scrape_status = 'error'
|
|
723
|
+
result.error_message = 'Failed to fetch page'
|
|
724
|
+
return result.to_json() if return_json else result
|
|
725
|
+
|
|
726
|
+
# Parse data
|
|
727
|
+
# Company logo and name
|
|
728
|
+
if logo := document.find('img', {'alt': True, 'width': '76.747'}):
|
|
729
|
+
result.company_name = logo.get('alt')
|
|
730
|
+
result.logo_url = logo.get('src')
|
|
731
|
+
|
|
732
|
+
# Revenue range
|
|
733
|
+
if highlight_right := document.find('div', {'class': 'highlight-right'}):
|
|
734
|
+
if revenue_span := highlight_right.find('span', {'class': 'start'}):
|
|
735
|
+
start_value = revenue_span.text.strip()
|
|
736
|
+
if end_span := revenue_span.find_next_sibling('span', {'class': 'end'}):
|
|
737
|
+
end_value = end_span.text.strip()
|
|
738
|
+
result.revenue_range = f"{start_value} - {end_value}"
|
|
739
|
+
else:
|
|
740
|
+
result.revenue_range = start_value
|
|
741
|
+
|
|
742
|
+
# Company details
|
|
743
|
+
if highlight_left := document.find('div', {'class': 'highlight-left'}):
|
|
744
|
+
if overview_section := highlight_left.find('div', {'class': 'card span'}):
|
|
745
|
+
if dl_element := overview_section.find('dl'):
|
|
746
|
+
for item in dl_element.find_all('div', {'class': 'item'}):
|
|
747
|
+
dt = item.find('dt')
|
|
748
|
+
dd = item.find('dd')
|
|
749
|
+
if dt and dd:
|
|
750
|
+
field = dt.text.strip().lower()
|
|
751
|
+
value = dd.text.strip()
|
|
752
|
+
|
|
753
|
+
if field == 'headquarters':
|
|
754
|
+
address_info = self._parse_address(value)
|
|
755
|
+
result.headquarters = value
|
|
756
|
+
result.address = address_info.get('address')
|
|
757
|
+
result.city = address_info.get('city')
|
|
758
|
+
result.state = address_info.get('state')
|
|
759
|
+
result.zip_code = address_info.get('zip_code')
|
|
760
|
+
result.country = address_info.get('country')
|
|
761
|
+
elif field == 'phone number':
|
|
762
|
+
result.phone_number = value.replace('****', '0000')
|
|
763
|
+
elif field == 'website':
|
|
764
|
+
website = dd.find('a')
|
|
765
|
+
result.website = website['href'] if website else value
|
|
766
|
+
elif field == 'stock symbol':
|
|
767
|
+
result.stock_symbol = value
|
|
768
|
+
elif field == 'naics code':
|
|
769
|
+
result.naics_code = value
|
|
770
|
+
elif field == 'employees':
|
|
771
|
+
result.employee_count = value
|
|
772
|
+
elif field == 'sic code':
|
|
773
|
+
result.sic_code = value
|
|
774
|
+
|
|
775
|
+
# Hero section
|
|
776
|
+
if hero_section := document.find('div', {'class': 'card hero snug'}):
|
|
777
|
+
# Company name
|
|
778
|
+
if company_name_elem := hero_section.find('h1'):
|
|
779
|
+
result.company_name = company_name_elem.text.strip()
|
|
780
|
+
|
|
781
|
+
# Industry, location, employees
|
|
782
|
+
if info_p := hero_section.find('p', {'class': 'info'}):
|
|
783
|
+
spans = info_p.find_all('span')
|
|
784
|
+
if len(spans) >= 3:
|
|
785
|
+
if not result.industry:
|
|
786
|
+
result.industry = spans[0].text.strip()
|
|
787
|
+
result.number_employees = spans[2].text.strip()
|
|
788
|
+
|
|
789
|
+
# Description
|
|
790
|
+
if description_p := hero_section.find('pre'):
|
|
791
|
+
result.company_description = description_p.text.strip()
|
|
792
|
+
|
|
793
|
+
# Similar companies
|
|
794
|
+
similar_companies = []
|
|
795
|
+
if similar_section := document.find('div', {'id': 'similar'}):
|
|
796
|
+
for company in similar_section.find_all('li'):
|
|
797
|
+
company_link = company.find('a')
|
|
798
|
+
if not company_link:
|
|
799
|
+
continue
|
|
800
|
+
|
|
801
|
+
company_logo = company_link.find('img')
|
|
802
|
+
if company_name_elem := company_link.find('h3'):
|
|
803
|
+
similar_company = {
|
|
804
|
+
'name': company_name_elem.text.strip(),
|
|
805
|
+
'leadiq_url': company_link['href'],
|
|
806
|
+
'logo_url': company_logo['src'] if company_logo else None
|
|
807
|
+
}
|
|
808
|
+
similar_companies.append(similar_company)
|
|
809
|
+
|
|
810
|
+
if similar_companies:
|
|
811
|
+
result.similar_companies = json.dumps(
|
|
812
|
+
similar_companies,
|
|
813
|
+
ensure_ascii=False
|
|
814
|
+
)
|
|
815
|
+
|
|
816
|
+
# Check for data
|
|
817
|
+
has_data = any([
|
|
818
|
+
result.company_name,
|
|
819
|
+
result.logo_url,
|
|
820
|
+
result.headquarters,
|
|
821
|
+
result.phone_number,
|
|
822
|
+
result.website
|
|
823
|
+
])
|
|
824
|
+
|
|
825
|
+
result.scrape_status = 'success' if has_data else 'no_data'
|
|
826
|
+
|
|
827
|
+
except Exception as e:
|
|
828
|
+
self.logger.error(f"Error scraping LeadIQ: {e}")
|
|
829
|
+
result.scrape_status = 'error'
|
|
830
|
+
result.error_message = str(e)[:100]
|
|
831
|
+
finally:
|
|
832
|
+
await self._close_driver()
|
|
833
|
+
|
|
834
|
+
return result.to_json() if return_json else result

    @tool_schema(CompanyInput)
    async def scrape_rocketreach(
        self,
        company_name: str,
        return_json: bool = False
    ) -> Union[CompanyInfo, str]:
        """
        Scrape company information from RocketReach.

        Args:
            company_name: Name of the company to search for
            return_json: If True, return JSON string instead of CompanyInfo object

        Returns:
            CompanyInfo object or JSON string with company data
        """
        site = "rocketreach.co"
        search_term = f"site:rocketreach.co '{company_name}'"

        result = CompanyInfo(
            search_term=search_term,
            source_platform='rocketreach',
            scrape_status='pending',
            timestamp=str(time.time())
        )

        try:
            # Google site search
            search_result = await self._google_site_search(
                company_name=company_name,
                site=site,
                additional_terms=" Information - RocketReach"
            )

            if not search_result.url:
                result.scrape_status = 'no_data'
                result.error_message = 'No search results found'
                return result.to_json() if return_json else result

            result.search_url = search_result.url

            # Fetch page
            document = await self._fetch_page_with_selenium(search_result.url)

            if not document:
                result.scrape_status = 'error'
                result.error_message = 'Failed to fetch page'
                return result.to_json() if return_json else result

            # Parse data
            # Company header
            if company_header := document.select_one(".company-header"):
                # Logo
                img_tag = company_header.select_one(".company-logo")
                result.logo_url = img_tag["src"] if img_tag else None

                # Company name
                if title_tag := company_header.select_one(".company-title"):
                    result.company_name = title_tag.text.replace(" Information", "").strip()
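                    # e.g. a page title such as "Acme Corp Information"
                    # becomes "Acme Corp" (example input is hypothetical).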

            # Description
            headline_summary = document.select_one(".headline-summary p")
            result.company_description = headline_summary.text.strip() if headline_summary else None

            # Information table
            info_table = document.select(".headline-summary table tbody tr")
            for row in info_table:
                key = row.select_one("td strong")
                value = row.select_one("td:nth-of-type(2)")

                if key and value:
                    key_text = key.text.strip().lower()
                    value_text = value.text.strip()

                    if "website" in key_text:
                        result.website = value.select_one("a")["href"] if value.select_one("a") else value_text
                    elif "ticker" in key_text:
                        result.stock_symbol = value_text
                    elif "revenue" in key_text:
                        result.revenue_range = value_text
                    elif "funding" in key_text:
                        result.funding = value_text
                    elif "employees" in key_text:
                        result.employee_count = value_text.split()[0]
                        result.number_employees = value_text
                    elif "founded" in key_text:
                        result.founded = value_text
                    elif "address" in key_text:
                        result.headquarters = value.select_one("a").text.strip() if value.select_one("a") else value_text
                    elif "phone" in key_text:
                        result.phone_number = value.select_one("a").text.strip() if value.select_one("a") else value_text
                    elif "industry" in key_text:
                        result.industry = [i.strip() for i in value_text.split(",")]
                    elif "keywords" in key_text:
                        result.keywords = [i.strip() for i in value_text.split(",")]
                    elif "sic" in key_text:
                        # Extract codes
                        codes = []
                        for link in value.find_all("a"):
                            if match := re.search(r"\b\d+\b", link.text):
                                codes.append(match.group())
                        result.sic_code = ', '.join(codes) if codes else None
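                        # e.g. link text "SIC CODE 7372 - Prepackaged Software"
                        # yields "7372" (example value is hypothetical).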
                    elif "naics" in key_text:
                        # Extract codes
                        codes = []
                        for link in value.find_all("a"):
                            if match := re.search(r"\b\d+\b", link.text):
                                codes.append(match.group())
                        result.naics_code = ', '.join(codes) if codes else None

            # Check for data
            has_data = any([
                result.company_name,
                result.logo_url,
                result.headquarters,
                result.phone_number,
                result.website
            ])

            result.scrape_status = 'success' if has_data else 'no_data'

        except Exception as e:
            self.logger.error(f"Error scraping RocketReach: {e}")
            result.scrape_status = 'error'
            result.error_message = str(e)[:100]
        finally:
            await self._close_driver()

        return result.to_json() if return_json else result
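
    # Usage sketch (illustrative; `tool` is the assumed instance from the
    # sketch above):
    #
    #     info = await tool.scrape_rocketreach("Acme Corporation")
    #     if info.scrape_status == 'success':
    #         print(info.website, info.number_employees)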

    @tool_schema(CompanyInput)
    async def scrape_siccode(
        self,
        company_name: str,
        return_json: bool = False
    ) -> Union[CompanyInfo, str]:
        """
        Scrape company information from SICCode.com.

        Args:
            company_name: Name of the company to search for
            return_json: If True, return JSON string instead of CompanyInfo object

        Returns:
            CompanyInfo object or JSON string with company data
        """
        site = "siccode.com"
        search_term = f"site:siccode.com '{company_name}' +NAICS"

        result = CompanyInfo(
            search_term=search_term,
            source_platform='siccode',
            scrape_status='pending',
            timestamp=str(time.time())
        )

        try:
            # Google site search
            search_result = await self._google_site_search(
                company_name=company_name,
                site=site,
                additional_terms="+NAICS"
            )

            if not search_result.url:
                result.scrape_status = 'no_data'
                result.error_message = 'No search results found'
                return result.to_json() if return_json else result

            result.search_url = search_result.url

            # Fetch page
            document = await self._fetch_page_with_selenium(search_result.url)

            if not document:
                result.scrape_status = 'error'
                result.error_message = 'Failed to fetch page'
                return result.to_json() if return_json else result

            # Parse data
            if header := document.select_one("div.main-title"):
                # Company name
                if name_elem := header.select_one("h1.size-h2 a span"):
                    result.company_name = name_elem.text.strip()

                # Industry category
                if cat_elem := header.select_one("b.p-category"):
                    result.industry_category = cat_elem.text.strip()

            # SIC and NAICS codes
            if desc := document.find('div', {'id': 'description'}):
                sic_code_elem = desc.select_one("a.sic")
                naics_code_elem = desc.select_one("a.naics")

                if sic_code_elem:
                    sic_text = sic_code_elem.text.split("SIC CODE")[-1].strip()
                    if ' - ' in sic_text:
                        parts = sic_text.split(' - ')
                        result.sic_code = parts[0].strip()
                        result.industry = parts[1].strip() if len(parts) > 1 else None

                if naics_code_elem:
                    naics_text = naics_code_elem.text.split("NAICS CODE")[-1].strip()
                    if ' - ' in naics_text:
                        parts = naics_text.split(' - ')
                        result.naics_code = parts[0].strip()
                        result.category = parts[1].strip() if len(parts) > 1 else None

            # Location details
            if overview := document.find('div', {'id': 'overview'}):
                # Description
                if desc_elem := overview.select_one("p.p-note"):
                    result.company_description = desc_elem.text.strip()

                # Location fields
                city_elem = overview.select_one(".p-locality")
                state_elem = overview.select_one(".p-region")
                zip_elem = overview.select_one(".p-postal-code")
                country_elem = overview.select_one(".p-country-name")
                metro_elem = overview.select_one("div[title]")

                if city_elem:
                    result.city = city_elem.text.strip()
                if state_elem:
                    result.state = state_elem.text.strip()
                if zip_elem:
                    result.zip_code = zip_elem.text.strip()
                if country_elem:
                    result.country = country_elem.text.strip()
                if metro_elem:
                    result.metro_area = metro_elem.text.strip()

                # Construct headquarters
                parts = [result.city, result.state, result.zip_code, result.country]
                result.headquarters = ", ".join(filter(None, parts))
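                # filter(None, ...) drops any unset part, so a record with no
                # zip_code yields e.g. "Austin, Texas, United States"
                # (example values are hypothetical).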

            # Check for data
            has_data = any([
                result.company_name,
                result.sic_code,
                result.naics_code,
                result.headquarters
            ])

            result.scrape_status = 'success' if has_data else 'no_data'

        except Exception as e:
            self.logger.error(f"Error scraping SICCode: {e}")
            result.scrape_status = 'error'
            result.error_message = str(e)[:100]
        finally:
            await self._close_driver()

        return result.to_json() if return_json else result
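
    # Usage sketch (illustrative; `tool` is the assumed instance from the
    # sketches above):
    #
    #     info = await tool.scrape_siccode("Acme Corporation")
    #     print(info.sic_code, info.naics_code, info.headquarters)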

    @tool_schema(CompanyInput)
    async def scrape_all_sources(
        self,
        company_name: str,
        return_json: bool = False
    ) -> Union[List[CompanyInfo], str]:
        """
        Scrape company information from ALL available sources.

        This method runs all scraping tools in parallel and returns
        aggregated results from all platforms.

        Args:
            company_name: Name of the company to search for
            return_json: If True, return JSON string instead of list of CompanyInfo objects

        Returns:
            List of CompanyInfo objects or JSON string with all results
        """
        self.logger.info(f"Scraping all sources for: {company_name}")

        # Run all scraping methods in parallel
        tasks = [
            self.scrape_zoominfo(company_name, return_json=False),
            self.scrape_explorium(company_name, return_json=False),
            self.scrape_leadiq(company_name, return_json=False),
            self.scrape_rocketreach(company_name, return_json=False),
            self.scrape_siccode(company_name, return_json=False)
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter out exceptions and failed results
        valid_results = []
        for result in results:
            if isinstance(result, Exception):
                self.logger.error(f"Scraping error: {result}")
            elif isinstance(result, CompanyInfo):
                valid_results.append(result)

        if return_json:
            return json.dumps(
                [r.model_dump(exclude_none=True) for r in valid_results],
                ensure_ascii=False,
                indent=2
            )

        return valid_results