flowtask 5.8.4__cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowtask/__init__.py +93 -0
- flowtask/__main__.py +38 -0
- flowtask/bots/__init__.py +6 -0
- flowtask/bots/check.py +93 -0
- flowtask/bots/codebot.py +51 -0
- flowtask/components/ASPX.py +148 -0
- flowtask/components/AddDataset.py +352 -0
- flowtask/components/Amazon.py +523 -0
- flowtask/components/AutoTask.py +314 -0
- flowtask/components/Azure.py +80 -0
- flowtask/components/AzureUsers.py +106 -0
- flowtask/components/BaseAction.py +91 -0
- flowtask/components/BaseLoop.py +198 -0
- flowtask/components/BestBuy.py +800 -0
- flowtask/components/CSVToGCS.py +120 -0
- flowtask/components/CompanyScraper/__init__.py +1 -0
- flowtask/components/CompanyScraper/parsers/__init__.py +6 -0
- flowtask/components/CompanyScraper/parsers/base.py +102 -0
- flowtask/components/CompanyScraper/parsers/explorium.py +192 -0
- flowtask/components/CompanyScraper/parsers/leadiq.py +206 -0
- flowtask/components/CompanyScraper/parsers/rocket.py +133 -0
- flowtask/components/CompanyScraper/parsers/siccode.py +109 -0
- flowtask/components/CompanyScraper/parsers/visualvisitor.py +130 -0
- flowtask/components/CompanyScraper/parsers/zoominfo.py +118 -0
- flowtask/components/CompanyScraper/scrapper.py +1054 -0
- flowtask/components/CopyTo.py +177 -0
- flowtask/components/CopyToBigQuery.py +243 -0
- flowtask/components/CopyToMongoDB.py +291 -0
- flowtask/components/CopyToPg.py +609 -0
- flowtask/components/CopyToRethink.py +207 -0
- flowtask/components/CreateGCSBucket.py +102 -0
- flowtask/components/CreateReport/CreateReport.py +228 -0
- flowtask/components/CreateReport/__init__.py +9 -0
- flowtask/components/CreateReport/charts/__init__.py +15 -0
- flowtask/components/CreateReport/charts/bar.py +51 -0
- flowtask/components/CreateReport/charts/base.py +66 -0
- flowtask/components/CreateReport/charts/pie.py +64 -0
- flowtask/components/CreateReport/utils.py +9 -0
- flowtask/components/CustomerSatisfaction.py +196 -0
- flowtask/components/DataInput.py +200 -0
- flowtask/components/DateList.py +255 -0
- flowtask/components/DbClient.py +163 -0
- flowtask/components/DialPad.py +146 -0
- flowtask/components/DocumentDBQuery.py +200 -0
- flowtask/components/DownloadFrom.py +371 -0
- flowtask/components/DownloadFromD2L.py +113 -0
- flowtask/components/DownloadFromFTP.py +181 -0
- flowtask/components/DownloadFromIMAP.py +315 -0
- flowtask/components/DownloadFromS3.py +198 -0
- flowtask/components/DownloadFromSFTP.py +265 -0
- flowtask/components/DownloadFromSharepoint.py +110 -0
- flowtask/components/DownloadFromSmartSheet.py +114 -0
- flowtask/components/DownloadS3File.py +229 -0
- flowtask/components/Dummy.py +59 -0
- flowtask/components/DuplicatePhoto.py +411 -0
- flowtask/components/EmployeeEvaluation.py +237 -0
- flowtask/components/ExecuteSQL.py +323 -0
- flowtask/components/ExtractHTML.py +178 -0
- flowtask/components/FileBase.py +178 -0
- flowtask/components/FileCopy.py +181 -0
- flowtask/components/FileDelete.py +82 -0
- flowtask/components/FileExists.py +146 -0
- flowtask/components/FileIteratorDelete.py +112 -0
- flowtask/components/FileList.py +194 -0
- flowtask/components/FileOpen.py +75 -0
- flowtask/components/FileRead.py +120 -0
- flowtask/components/FileRename.py +106 -0
- flowtask/components/FilterIf.py +284 -0
- flowtask/components/FilterRows/FilterRows.py +200 -0
- flowtask/components/FilterRows/__init__.py +10 -0
- flowtask/components/FilterRows/functions.py +4 -0
- flowtask/components/GCSToBigQuery.py +103 -0
- flowtask/components/GoogleA4.py +150 -0
- flowtask/components/GoogleGeoCoding.py +344 -0
- flowtask/components/GooglePlaces.py +315 -0
- flowtask/components/GoogleSearch.py +539 -0
- flowtask/components/HTTPClient.py +268 -0
- flowtask/components/ICIMS.py +146 -0
- flowtask/components/IF.py +179 -0
- flowtask/components/IcimsFolderCopy.py +173 -0
- flowtask/components/ImageFeatures/__init__.py +5 -0
- flowtask/components/ImageFeatures/process.py +233 -0
- flowtask/components/IteratorBase.py +251 -0
- flowtask/components/LangchainLoader/__init__.py +5 -0
- flowtask/components/LangchainLoader/loader.py +194 -0
- flowtask/components/LangchainLoader/loaders/__init__.py +22 -0
- flowtask/components/LangchainLoader/loaders/abstract.py +362 -0
- flowtask/components/LangchainLoader/loaders/basepdf.py +50 -0
- flowtask/components/LangchainLoader/loaders/docx.py +91 -0
- flowtask/components/LangchainLoader/loaders/html.py +119 -0
- flowtask/components/LangchainLoader/loaders/pdfblocks.py +146 -0
- flowtask/components/LangchainLoader/loaders/pdfmark.py +79 -0
- flowtask/components/LangchainLoader/loaders/pdftables.py +135 -0
- flowtask/components/LangchainLoader/loaders/qa.py +67 -0
- flowtask/components/LangchainLoader/loaders/txt.py +55 -0
- flowtask/components/LeadIQ.py +650 -0
- flowtask/components/Loop.py +253 -0
- flowtask/components/Lowes.py +334 -0
- flowtask/components/MS365Usage.py +156 -0
- flowtask/components/MSTeamsMessages.py +320 -0
- flowtask/components/MarketClustering.py +1051 -0
- flowtask/components/MergeFiles.py +362 -0
- flowtask/components/MilvusOutput.py +87 -0
- flowtask/components/NearByStores.py +175 -0
- flowtask/components/NetworkNinja/__init__.py +6 -0
- flowtask/components/NetworkNinja/models/__init__.py +52 -0
- flowtask/components/NetworkNinja/models/abstract.py +177 -0
- flowtask/components/NetworkNinja/models/account.py +39 -0
- flowtask/components/NetworkNinja/models/client.py +19 -0
- flowtask/components/NetworkNinja/models/district.py +14 -0
- flowtask/components/NetworkNinja/models/events.py +101 -0
- flowtask/components/NetworkNinja/models/forms.py +499 -0
- flowtask/components/NetworkNinja/models/market.py +16 -0
- flowtask/components/NetworkNinja/models/organization.py +34 -0
- flowtask/components/NetworkNinja/models/photos.py +125 -0
- flowtask/components/NetworkNinja/models/project.py +44 -0
- flowtask/components/NetworkNinja/models/region.py +28 -0
- flowtask/components/NetworkNinja/models/store.py +203 -0
- flowtask/components/NetworkNinja/models/user.py +151 -0
- flowtask/components/NetworkNinja/router.py +854 -0
- flowtask/components/Odoo.py +175 -0
- flowtask/components/OdooInjector.py +192 -0
- flowtask/components/OpenFromXML.py +126 -0
- flowtask/components/OpenWeather.py +41 -0
- flowtask/components/OpenWithBase.py +616 -0
- flowtask/components/OpenWithPandas.py +715 -0
- flowtask/components/PGPDecrypt.py +199 -0
- flowtask/components/PandasIterator.py +187 -0
- flowtask/components/PandasToFile.py +189 -0
- flowtask/components/Paradox.py +339 -0
- flowtask/components/ParamIterator.py +117 -0
- flowtask/components/ParseHTML.py +84 -0
- flowtask/components/PlacerStores.py +249 -0
- flowtask/components/Pokemon.py +507 -0
- flowtask/components/PositiveBot.py +62 -0
- flowtask/components/PowerPointSlide.py +400 -0
- flowtask/components/PrintMessage.py +127 -0
- flowtask/components/ProductCompetitors/__init__.py +5 -0
- flowtask/components/ProductCompetitors/parsers/__init__.py +7 -0
- flowtask/components/ProductCompetitors/parsers/base.py +72 -0
- flowtask/components/ProductCompetitors/parsers/bestbuy.py +86 -0
- flowtask/components/ProductCompetitors/parsers/lowes.py +103 -0
- flowtask/components/ProductCompetitors/scrapper.py +155 -0
- flowtask/components/ProductCompliant.py +169 -0
- flowtask/components/ProductInfo/__init__.py +1 -0
- flowtask/components/ProductInfo/parsers/__init__.py +5 -0
- flowtask/components/ProductInfo/parsers/base.py +83 -0
- flowtask/components/ProductInfo/parsers/brother.py +97 -0
- flowtask/components/ProductInfo/parsers/canon.py +167 -0
- flowtask/components/ProductInfo/parsers/epson.py +118 -0
- flowtask/components/ProductInfo/parsers/hp.py +131 -0
- flowtask/components/ProductInfo/parsers/samsung.py +97 -0
- flowtask/components/ProductInfo/scraper.py +319 -0
- flowtask/components/ProductPricing.py +118 -0
- flowtask/components/QS.py +261 -0
- flowtask/components/QSBase.py +201 -0
- flowtask/components/QueryIterator.py +273 -0
- flowtask/components/QueryToInsert.py +327 -0
- flowtask/components/QueryToPandas.py +432 -0
- flowtask/components/RESTClient.py +195 -0
- flowtask/components/RethinkDBQuery.py +189 -0
- flowtask/components/Rsync.py +74 -0
- flowtask/components/RunSSH.py +59 -0
- flowtask/components/RunShell.py +71 -0
- flowtask/components/SalesForce.py +20 -0
- flowtask/components/SaveImageBank/__init__.py +257 -0
- flowtask/components/SchedulingVisits.py +592 -0
- flowtask/components/ScrapPage.py +216 -0
- flowtask/components/ScrapSearch.py +79 -0
- flowtask/components/SendNotify.py +257 -0
- flowtask/components/SentimentAnalysis.py +694 -0
- flowtask/components/ServiceScrapper/__init__.py +5 -0
- flowtask/components/ServiceScrapper/parsers/__init__.py +1 -0
- flowtask/components/ServiceScrapper/parsers/base.py +94 -0
- flowtask/components/ServiceScrapper/parsers/costco.py +93 -0
- flowtask/components/ServiceScrapper/scrapper.py +199 -0
- flowtask/components/SetVariables.py +156 -0
- flowtask/components/SubTask.py +182 -0
- flowtask/components/SuiteCRM.py +48 -0
- flowtask/components/Switch.py +175 -0
- flowtask/components/TableBase.py +148 -0
- flowtask/components/TableDelete.py +312 -0
- flowtask/components/TableInput.py +143 -0
- flowtask/components/TableOutput/TableOutput.py +384 -0
- flowtask/components/TableOutput/__init__.py +3 -0
- flowtask/components/TableSchema.py +534 -0
- flowtask/components/Target.py +223 -0
- flowtask/components/ThumbnailGenerator.py +156 -0
- flowtask/components/ToPandas.py +67 -0
- flowtask/components/TransformRows/TransformRows.py +507 -0
- flowtask/components/TransformRows/__init__.py +9 -0
- flowtask/components/TransformRows/functions.py +559 -0
- flowtask/components/TransposeRows.py +176 -0
- flowtask/components/UPCDatabase.py +86 -0
- flowtask/components/UnGzip.py +171 -0
- flowtask/components/Uncompress.py +172 -0
- flowtask/components/UniqueRows.py +126 -0
- flowtask/components/Unzip.py +107 -0
- flowtask/components/UpdateOperationalVars.py +147 -0
- flowtask/components/UploadTo.py +299 -0
- flowtask/components/UploadToS3.py +136 -0
- flowtask/components/UploadToSFTP.py +160 -0
- flowtask/components/UploadToSharepoint.py +205 -0
- flowtask/components/UserFunc.py +122 -0
- flowtask/components/VivaTracker.py +140 -0
- flowtask/components/WSDLClient.py +123 -0
- flowtask/components/Wait.py +18 -0
- flowtask/components/Walmart.py +199 -0
- flowtask/components/Workplace.py +134 -0
- flowtask/components/XMLToPandas.py +267 -0
- flowtask/components/Zammad/__init__.py +41 -0
- flowtask/components/Zammad/models.py +0 -0
- flowtask/components/ZoomInfoScraper.py +409 -0
- flowtask/components/__init__.py +104 -0
- flowtask/components/abstract.py +18 -0
- flowtask/components/flow.py +530 -0
- flowtask/components/google.py +335 -0
- flowtask/components/group.py +221 -0
- flowtask/components/py.typed +0 -0
- flowtask/components/reviewscrap.py +132 -0
- flowtask/components/tAutoincrement.py +117 -0
- flowtask/components/tConcat.py +109 -0
- flowtask/components/tExplode.py +119 -0
- flowtask/components/tFilter.py +184 -0
- flowtask/components/tGroup.py +236 -0
- flowtask/components/tJoin.py +270 -0
- flowtask/components/tMap/__init__.py +9 -0
- flowtask/components/tMap/functions.py +54 -0
- flowtask/components/tMap/tMap.py +450 -0
- flowtask/components/tMelt.py +112 -0
- flowtask/components/tMerge.py +114 -0
- flowtask/components/tOrder.py +93 -0
- flowtask/components/tPandas.py +94 -0
- flowtask/components/tPivot.py +71 -0
- flowtask/components/tPluckCols.py +76 -0
- flowtask/components/tUnnest.py +82 -0
- flowtask/components/user.py +401 -0
- flowtask/conf.py +457 -0
- flowtask/download.py +102 -0
- flowtask/events/__init__.py +11 -0
- flowtask/events/events/__init__.py +20 -0
- flowtask/events/events/abstract.py +95 -0
- flowtask/events/events/alerts/__init__.py +362 -0
- flowtask/events/events/alerts/colfunctions.py +131 -0
- flowtask/events/events/alerts/functions.py +158 -0
- flowtask/events/events/dummy.py +12 -0
- flowtask/events/events/exec.py +124 -0
- flowtask/events/events/file/__init__.py +7 -0
- flowtask/events/events/file/base.py +51 -0
- flowtask/events/events/file/copy.py +23 -0
- flowtask/events/events/file/delete.py +16 -0
- flowtask/events/events/interfaces/__init__.py +9 -0
- flowtask/events/events/interfaces/client.py +67 -0
- flowtask/events/events/interfaces/credentials.py +28 -0
- flowtask/events/events/interfaces/notifications.py +58 -0
- flowtask/events/events/jira.py +122 -0
- flowtask/events/events/log.py +26 -0
- flowtask/events/events/logerr.py +52 -0
- flowtask/events/events/notify.py +59 -0
- flowtask/events/events/notify_event.py +160 -0
- flowtask/events/events/publish.py +54 -0
- flowtask/events/events/sendfile.py +104 -0
- flowtask/events/events/task.py +97 -0
- flowtask/events/events/teams.py +98 -0
- flowtask/events/events/webhook.py +58 -0
- flowtask/events/manager.py +287 -0
- flowtask/exceptions.c +39393 -0
- flowtask/exceptions.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/extensions/__init__.py +3 -0
- flowtask/extensions/abstract.py +82 -0
- flowtask/extensions/logging/__init__.py +65 -0
- flowtask/hooks/__init__.py +9 -0
- flowtask/hooks/actions/__init__.py +22 -0
- flowtask/hooks/actions/abstract.py +66 -0
- flowtask/hooks/actions/dummy.py +23 -0
- flowtask/hooks/actions/jira.py +74 -0
- flowtask/hooks/actions/rest.py +320 -0
- flowtask/hooks/actions/sampledata.py +37 -0
- flowtask/hooks/actions/sensor.py +23 -0
- flowtask/hooks/actions/task.py +9 -0
- flowtask/hooks/actions/ticket.py +37 -0
- flowtask/hooks/actions/zammad.py +55 -0
- flowtask/hooks/hook.py +62 -0
- flowtask/hooks/models.py +17 -0
- flowtask/hooks/service.py +187 -0
- flowtask/hooks/step.py +91 -0
- flowtask/hooks/types/__init__.py +23 -0
- flowtask/hooks/types/base.py +129 -0
- flowtask/hooks/types/brokers/__init__.py +11 -0
- flowtask/hooks/types/brokers/base.py +54 -0
- flowtask/hooks/types/brokers/mqtt.py +35 -0
- flowtask/hooks/types/brokers/rabbitmq.py +82 -0
- flowtask/hooks/types/brokers/redis.py +83 -0
- flowtask/hooks/types/brokers/sqs.py +44 -0
- flowtask/hooks/types/fs.py +232 -0
- flowtask/hooks/types/http.py +49 -0
- flowtask/hooks/types/imap.py +200 -0
- flowtask/hooks/types/jira.py +279 -0
- flowtask/hooks/types/mail.py +205 -0
- flowtask/hooks/types/postgres.py +98 -0
- flowtask/hooks/types/responses/__init__.py +8 -0
- flowtask/hooks/types/responses/base.py +5 -0
- flowtask/hooks/types/sharepoint.py +288 -0
- flowtask/hooks/types/ssh.py +141 -0
- flowtask/hooks/types/tagged.py +59 -0
- flowtask/hooks/types/upload.py +85 -0
- flowtask/hooks/types/watch.py +71 -0
- flowtask/hooks/types/web.py +36 -0
- flowtask/interfaces/AzureClient.py +137 -0
- flowtask/interfaces/AzureGraph.py +839 -0
- flowtask/interfaces/Boto3Client.py +326 -0
- flowtask/interfaces/DropboxClient.py +173 -0
- flowtask/interfaces/ExcelHandler.py +94 -0
- flowtask/interfaces/FTPClient.py +131 -0
- flowtask/interfaces/GoogleCalendar.py +201 -0
- flowtask/interfaces/GoogleClient.py +133 -0
- flowtask/interfaces/GoogleDrive.py +127 -0
- flowtask/interfaces/GoogleGCS.py +89 -0
- flowtask/interfaces/GoogleGeocoding.py +93 -0
- flowtask/interfaces/GoogleLang.py +114 -0
- flowtask/interfaces/GooglePub.py +61 -0
- flowtask/interfaces/GoogleSheet.py +68 -0
- flowtask/interfaces/IMAPClient.py +137 -0
- flowtask/interfaces/O365Calendar.py +113 -0
- flowtask/interfaces/O365Client.py +220 -0
- flowtask/interfaces/OneDrive.py +284 -0
- flowtask/interfaces/Outlook.py +155 -0
- flowtask/interfaces/ParrotBot.py +130 -0
- flowtask/interfaces/SSHClient.py +378 -0
- flowtask/interfaces/Sharepoint.py +496 -0
- flowtask/interfaces/__init__.py +36 -0
- flowtask/interfaces/azureauth.py +119 -0
- flowtask/interfaces/cache.py +201 -0
- flowtask/interfaces/client.py +82 -0
- flowtask/interfaces/compress.py +525 -0
- flowtask/interfaces/credentials.py +124 -0
- flowtask/interfaces/d2l.py +239 -0
- flowtask/interfaces/databases/__init__.py +5 -0
- flowtask/interfaces/databases/db.py +223 -0
- flowtask/interfaces/databases/documentdb.py +55 -0
- flowtask/interfaces/databases/rethink.py +39 -0
- flowtask/interfaces/dataframes/__init__.py +11 -0
- flowtask/interfaces/dataframes/abstract.py +21 -0
- flowtask/interfaces/dataframes/arrow.py +71 -0
- flowtask/interfaces/dataframes/dt.py +69 -0
- flowtask/interfaces/dataframes/pandas.py +167 -0
- flowtask/interfaces/dataframes/polars.py +60 -0
- flowtask/interfaces/db.py +263 -0
- flowtask/interfaces/env.py +46 -0
- flowtask/interfaces/func.py +137 -0
- flowtask/interfaces/http.py +1780 -0
- flowtask/interfaces/locale.py +40 -0
- flowtask/interfaces/log.py +75 -0
- flowtask/interfaces/mask.py +143 -0
- flowtask/interfaces/notification.py +154 -0
- flowtask/interfaces/playwright.py +339 -0
- flowtask/interfaces/powerpoint.py +368 -0
- flowtask/interfaces/py.typed +0 -0
- flowtask/interfaces/qs.py +376 -0
- flowtask/interfaces/result.py +87 -0
- flowtask/interfaces/selenium_service.py +779 -0
- flowtask/interfaces/smartsheet.py +154 -0
- flowtask/interfaces/stat.py +39 -0
- flowtask/interfaces/task.py +96 -0
- flowtask/interfaces/template.py +118 -0
- flowtask/interfaces/vectorstores/__init__.py +1 -0
- flowtask/interfaces/vectorstores/abstract.py +133 -0
- flowtask/interfaces/vectorstores/milvus.py +669 -0
- flowtask/interfaces/zammad.py +107 -0
- flowtask/models.py +193 -0
- flowtask/parsers/__init__.py +15 -0
- flowtask/parsers/_yaml.c +11978 -0
- flowtask/parsers/_yaml.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/argparser.py +235 -0
- flowtask/parsers/base.c +15155 -0
- flowtask/parsers/base.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/json.c +11968 -0
- flowtask/parsers/json.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/maps.py +49 -0
- flowtask/parsers/toml.c +11968 -0
- flowtask/parsers/toml.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/plugins/__init__.py +16 -0
- flowtask/plugins/components/__init__.py +0 -0
- flowtask/plugins/handler/__init__.py +45 -0
- flowtask/plugins/importer.py +31 -0
- flowtask/plugins/sources/__init__.py +0 -0
- flowtask/runner.py +283 -0
- flowtask/scheduler/__init__.py +9 -0
- flowtask/scheduler/functions.py +493 -0
- flowtask/scheduler/handlers/__init__.py +8 -0
- flowtask/scheduler/handlers/manager.py +504 -0
- flowtask/scheduler/handlers/models.py +58 -0
- flowtask/scheduler/handlers/service.py +72 -0
- flowtask/scheduler/notifications.py +65 -0
- flowtask/scheduler/scheduler.py +993 -0
- flowtask/services/__init__.py +0 -0
- flowtask/services/bots/__init__.py +0 -0
- flowtask/services/bots/telegram.py +264 -0
- flowtask/services/files/__init__.py +11 -0
- flowtask/services/files/manager.py +522 -0
- flowtask/services/files/model.py +37 -0
- flowtask/services/files/service.py +767 -0
- flowtask/services/jira/__init__.py +3 -0
- flowtask/services/jira/jira_actions.py +191 -0
- flowtask/services/tasks/__init__.py +13 -0
- flowtask/services/tasks/launcher.py +213 -0
- flowtask/services/tasks/manager.py +323 -0
- flowtask/services/tasks/service.py +275 -0
- flowtask/services/tasks/task_manager.py +376 -0
- flowtask/services/tasks/tasks.py +155 -0
- flowtask/storages/__init__.py +16 -0
- flowtask/storages/exceptions.py +12 -0
- flowtask/storages/files/__init__.py +8 -0
- flowtask/storages/files/abstract.py +29 -0
- flowtask/storages/files/filesystem.py +66 -0
- flowtask/storages/tasks/__init__.py +19 -0
- flowtask/storages/tasks/abstract.py +26 -0
- flowtask/storages/tasks/database.py +33 -0
- flowtask/storages/tasks/filesystem.py +108 -0
- flowtask/storages/tasks/github.py +119 -0
- flowtask/storages/tasks/memory.py +45 -0
- flowtask/storages/tasks/row.py +25 -0
- flowtask/tasks/__init__.py +0 -0
- flowtask/tasks/abstract.py +526 -0
- flowtask/tasks/command.py +118 -0
- flowtask/tasks/pile.py +486 -0
- flowtask/tasks/py.typed +0 -0
- flowtask/tasks/task.py +778 -0
- flowtask/template/__init__.py +161 -0
- flowtask/tests.py +257 -0
- flowtask/types/__init__.py +8 -0
- flowtask/types/typedefs.c +11347 -0
- flowtask/types/typedefs.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/__init__.py +24 -0
- flowtask/utils/constants.py +117 -0
- flowtask/utils/encoders.py +21 -0
- flowtask/utils/executor.py +112 -0
- flowtask/utils/functions.cpp +14280 -0
- flowtask/utils/functions.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/json.cpp +13349 -0
- flowtask/utils/json.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/mail.py +63 -0
- flowtask/utils/parseqs.c +13324 -0
- flowtask/utils/parserqs.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/stats.py +308 -0
- flowtask/utils/transformations.py +74 -0
- flowtask/utils/uv.py +12 -0
- flowtask/utils/validators.py +97 -0
- flowtask/version.py +11 -0
- flowtask-5.8.4.dist-info/LICENSE +201 -0
- flowtask-5.8.4.dist-info/METADATA +209 -0
- flowtask-5.8.4.dist-info/RECORD +470 -0
- flowtask-5.8.4.dist-info/WHEEL +6 -0
- flowtask-5.8.4.dist-info/entry_points.txt +3 -0
- flowtask-5.8.4.dist-info/top_level.txt +2 -0
- plugins/components/CreateQR.py +39 -0
- plugins/components/TestComponent.py +28 -0
- plugins/components/Use1.py +13 -0
- plugins/components/Workplace.py +117 -0
- plugins/components/__init__.py +3 -0
- plugins/sources/__init__.py +0 -0
- plugins/sources/get_populartimes.py +78 -0
- plugins/sources/google.py +150 -0
- plugins/sources/hubspot.py +679 -0
- plugins/sources/icims.py +679 -0
- plugins/sources/mobileinsight.py +501 -0
- plugins/sources/newrelic.py +262 -0
- plugins/sources/uap.py +268 -0
- plugins/sources/venu.py +244 -0
- plugins/sources/vocinity.py +314 -0
@@ -0,0 +1,120 @@
|
|
1
|
+
import asyncio
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Callable, Tuple
|
4
|
+
from asyncdb import AsyncDB
|
5
|
+
from querysource.datasources.drivers.bigquery import bigquery_default
|
6
|
+
from .flow import FlowComponent
|
7
|
+
from ..exceptions import ComponentError
|
8
|
+
|
9
|
+
class CSVToGCS(FlowComponent):
    """
    CSVToGCS.

    Upload a CSV file from the local filesystem to a Google Cloud Storage
    (GCS) bucket. Optionally, the bucket is created when it does not exist.

    Keyword arguments consumed by the constructor:
        csv_path: Path of the local CSV file (required).
        bucket_uri: Target bucket URI; when omitted it is read from the task
            variables or built from ``bucket_name``.
        bucket_name: Target bucket name; when omitted it is read from the
            task variables or derived from ``bucket_uri``.
        object_name: Name of the uploaded object (defaults to the CSV file name).
        overwrite: Forwarded to the upload call (default ``False``).
        create_bucket: Create the bucket if it is missing (default ``False``).
        storage_class: Storage class used when creating the bucket
            (default ``'STANDARD'``).
        location: Location used when creating the bucket (default ``'US'``).
        delete_local: Remove the local CSV after a successful upload
            (default ``False``).
    """

    def __init__(
        self,
        loop: asyncio.AbstractEventLoop = None,
        job: Callable = None,
        stat: Callable = None,
        **kwargs,
    ):
        self.csv_path: Path = Path(kwargs.pop('csv_path'))
        # May be given directly, taken from task variables, or generated.
        self.bucket_uri: str = kwargs.pop('bucket_uri', None)
        self.object_name: str = kwargs.pop('object_name', self.csv_path.name)
        self.overwrite: bool = kwargs.pop('overwrite', False)
        self.create_bucket: bool = kwargs.pop('create_bucket', False)
        self.storage_class: str = kwargs.pop('storage_class', 'STANDARD')
        self.location: str = kwargs.pop('location', 'US')
        self.delete_local: bool = kwargs.pop('delete_local', False)
        self.bq = None  # AsyncDB instance, created in start()
        # Needed when bucket_uri is not provided.
        self.bucket_name: str = kwargs.pop('bucket_name', None)
        super(CSVToGCS, self).__init__(loop=loop, job=job, stat=stat, **kwargs)

    @staticmethod
    def _bucket_name_from_uri(bucket_uri: str) -> str:
        """Return the bucket name contained in a ``gs://bucket[/path]`` URI."""
        prefix = "gs://"
        remainder = bucket_uri[len(prefix):] if bucket_uri.startswith(prefix) else bucket_uri
        # Anything after the first "/" is an object path, not the bucket.
        return remainder.split("/", 1)[0]

    async def start(self, **kwargs):
        """Validate the inputs and create the AsyncDB BigQuery driver instance.

        Raises:
            ComponentError: If the CSV file does not exist, if
                ``bigquery_default`` is falsy, or if AsyncDB cannot be created.
        """
        # Validate required parameters.
        if not self.csv_path.exists():
            raise ComponentError(f"CSVToGCS: El archivo CSV '{self.csv_path}' no existe.")

        if not bigquery_default:
            raise ComponentError("CSVToGCS: 'bigquery_default' no está configurado correctamente.")

        # Fetch credentials/parameters from the default BigQuery driver.
        credentials = bigquery_default.get_credentials()

        # Create the AsyncDB instance backed by the BigQuery driver.
        try:
            self.bq = AsyncDB("bigquery", params=credentials)
            self._logger.info("CSVToGCS: Instancia de AsyncDB creada exitosamente.")
        except Exception as e:
            raise ComponentError(f"CSVToGCS: Error al inicializar AsyncDB: {e}") from e

    async def run(self) -> Tuple[str, str]:
        """Upload the CSV file to GCS.

        Returns:
            A ``(bucket_uri, object_uri)`` tuple. Both values are also stored
            as task variables (``bucket_uri`` and ``object_uri``).

        Raises:
            ComponentError: If ``start`` was not run, if no bucket can be
                determined, if the bucket is missing and ``create_bucket`` is
                False, or if any other error occurs during the upload.
        """
        if not self.bq:
            raise ComponentError("CSVToGCS: AsyncDB no está inicializado. Asegúrate de ejecutar 'start' antes de 'run'.")

        try:
            async with await self.bq.connection() as conn:
                # Fall back to the task variables when the bucket was not
                # configured on this component.
                if not self.bucket_uri:
                    self.bucket_uri = self.getTaskVar('bucket_uri')
                if not self.bucket_name:
                    self.bucket_name = self.getTaskVar('bucket_name')

                if not self.bucket_uri:
                    if not self.bucket_name:
                        raise ComponentError("CSVToGCS: 'bucket_uri' o 'bucket_name' deben ser proporcionados.")
                    self.bucket_uri = f"gs://{self.bucket_name}"
                elif not self.bucket_name:
                    # Only the URI is known: derive the name from it so that
                    # bucket_exists()/create_bucket() never receive None.
                    self.bucket_name = self._bucket_name_from_uri(self.bucket_uri)
                    if not self.bucket_name:
                        raise ComponentError(
                            f"CSVToGCS: No se pudo determinar el nombre del bucket a partir de 'bucket_uri' ('{self.bucket_uri}')."
                        )

                # Check whether the bucket exists.
                bucket_exists = await conn.bucket_exists(self.bucket_name)
                if not bucket_exists:
                    if self.create_bucket:
                        await conn.create_bucket(
                            bucket_name=self.bucket_name,
                            location=self.location,
                            storage_class=self.storage_class
                        )
                        self._logger.info(f"CSVToGCS: Bucket '{self.bucket_name}' creado exitosamente en la región '{self.location}' con clase de almacenamiento '{self.storage_class}'.")
                    else:
                        raise ComponentError(f"CSVToGCS: El bucket '{self.bucket_name}' no existe y 'create_bucket' está establecido en False.")
                else:
                    self._logger.info(f"CSVToGCS: Bucket '{self.bucket_name}' ya existe.")

                # Upload the CSV file to GCS.
                object_uri, message = await conn.create_gcs_from_csv(
                    bucket_uri=self.bucket_uri,
                    object_name=self.object_name,
                    csv_data=self.csv_path,
                    overwrite=self.overwrite
                )
                self._logger.info(f"CSVToGCS: {message}")

                # Publish bucket_uri and object_uri for the next component.
                self.setTaskVar('bucket_uri', self.bucket_uri)
                self.setTaskVar('object_uri', object_uri)

                # Optionally remove the local file once an object URI exists.
                if self.delete_local and object_uri:
                    self.csv_path.unlink()
                    self._logger.info(f"CSVToGCS: Archivo local '{self.csv_path}' eliminado exitosamente después de la carga.")

                return self.bucket_uri, object_uri

        except ComponentError:
            # Component-specific errors propagate unchanged; a bare raise
            # keeps the original traceback.
            raise
        except Exception as e:
            raise ComponentError(f"CSVToGCS: Error durante la carga a GCS: {e}") from e

    async def close(self):
        """Close the AsyncDB connection; failures are logged, not raised."""
        try:
            if self.bq:
                await self.bq.close()
                self._logger.info("CSVToGCS: AsyncDB cerrado exitosamente.")
        except Exception as e:
            self._logger.error(f"CSVToGCS: Error al cerrar AsyncDB: {e}")
|
@@ -0,0 +1 @@
|
|
1
|
+
from .scrapper import CompanyScraper
|
@@ -0,0 +1,102 @@
|
|
1
|
+
from typing import Any, List, Dict
|
2
|
+
from bs4 import BeautifulSoup as bs
|
3
|
+
from abc import abstractmethod
|
4
|
+
from ....interfaces import SeleniumService, HTTPService
|
5
|
+
import re
|
6
|
+
import logging
|
7
|
+
|
8
|
+
class ScrapperBase(SeleniumService, HTTPService):
    """
    ScrapperBase Model.

    Defines how scrapers should work: the hooks a concrete scraper must
    provide, plus shared helpers for fetching pages, parsing addresses and
    normalizing names.
    """
    domain: str
    search_term: str
    cookies: Any
    keywords: List[str]

    def __init__(self, *args, **kwargs):
        """Set up per-instance state, then initialize the parent services."""
        self._counter: int = 0
        self.search_term_used: str = ''
        self._logger = logging.getLogger(type(self).__name__)
        # 'cookies' is read, not removed, so the parents still receive it.
        self.cookies = kwargs.get('cookies')
        super().__init__(*args, **kwargs)

    @abstractmethod
    async def scrapping(self, document: bs, idx: int, row: dict):
        """Scrape ``document`` for the given row; provided by subclasses."""

    @abstractmethod
    def define_search_term(self, term: str):
        """Build the search term for ``term``; provided by subclasses."""

    async def get(self, url, headers: dict):
        """Fetch ``url`` via ``_get`` with the given headers and the proxy enabled."""
        response = await self._get(url, headers=headers, use_proxy=True)
        return response

    def _parse_address(self, address: str) -> Dict[str, str]:
        """
        Parse an address string to extract state, zipcode and country.

        Two regular expressions are tried in order; the three captured
        groups of the first one that matches are stored, stripped, as
        'state', 'zipcode' and 'country'.

        Args:
            address (str): Raw address string

        Returns:
            Dict with the keys 'address', 'state', 'zipcode' and 'country'.
            All four are None for an empty address; the last three are None
            when nothing matches or an error occurs while parsing.
        """
        if not address:
            return {
                'address': None,
                'state': None,
                'zipcode': None,
                'country': None
            }

        # The original address is always kept.
        parsed = {'address': address}
        unparsed = {'state': None, 'zipcode': None, 'country': None}

        candidate_patterns = (
            # Full format.
            r'^.*,\s+([^,]+?)\s+([\w\s-]+)\s+([A-Z]{2})$',
            # Fallback.
            r'^.*,\s*([^,]+?),\s+([\w\s-]+?)\s*([A-Z]{2})',
        )

        try:
            found = None
            for pattern in candidate_patterns:
                found = re.search(pattern, address)
                if found:
                    break

            if found is None:
                self._logger.warning(f"Could not parse address: {address}")
                parsed.update(unparsed)
            else:
                for position, key in enumerate(('state', 'zipcode', 'country'), start=1):
                    parsed[key] = found.group(position).strip()
        except Exception as e:
            self._logger.error(f"Error parsing address {address}: {str(e)}")
            parsed.update(unparsed)

        return parsed

    def _standardize_name(self, text: str) -> str:
        """Trim and lower-case the text, turn each space into a hyphen and wrap it in single quotes."""
        slug = '-'.join(text.strip().lower().split(' '))
        return "'" + slug + "'"
|
@@ -0,0 +1,192 @@
|
|
1
|
+
from bs4 import BeautifulSoup as bs
|
2
|
+
from .base import ScrapperBase
|
3
|
+
import json
|
4
|
+
|
5
|
+
|
6
|
+
class ExploriumScrapper(ScrapperBase):
|
7
|
+
"""
|
8
|
+
ExploriumScrapper Model.
|
9
|
+
"""
|
10
|
+
domain: str = 'explorium.ai'
|
11
|
+
search_term: str = 'site:explorium.ai {}'
|
12
|
+
keywords: list = [
|
13
|
+
'overview - services',
|
14
|
+
]
|
15
|
+
|
16
|
+
def define_search_term(self, term: str):
    """Return ``search_term`` formatted with the trimmed, lower-cased term."""
    return self.search_term.format(term.strip().lower())
|
19
|
+
|
20
|
+
async def scrapping(self, document: bs, idx: int, row: dict):
|
21
|
+
"""
|
22
|
+
Scrape company information from Explorium.
|
23
|
+
Updates the existing row with new data from Explorium.
|
24
|
+
"""
|
25
|
+
# Start with the existing row data
|
26
|
+
result = row.copy()
|
27
|
+
|
28
|
+
# Actualizamos solo los campos específicos de Explorium
|
29
|
+
result.update({
|
30
|
+
'source_platform': 'explorium',
|
31
|
+
'scrape_status': 'pending',
|
32
|
+
'search_term': self.search_term_used
|
33
|
+
})
|
34
|
+
|
35
|
+
try:
|
36
|
+
# Extraer información de la compañía
|
37
|
+
company_info = document.find('div', {'class': 'company-info'})
|
38
|
+
if company_info:
|
39
|
+
# Nombre de la compañía
|
40
|
+
company_name = company_info.find('h1', {'class': 'company-name'})
|
41
|
+
if company_name:
|
42
|
+
result['company_name'] = company_name.text.strip()
|
43
|
+
|
44
|
+
# Dirección
|
45
|
+
address = company_info.find('div', {'class': 'address'})
|
46
|
+
if address:
|
47
|
+
address_info = self._parse_address(address.text.strip())
|
48
|
+
result.update(address_info)
|
49
|
+
|
50
|
+
# Otros detalles de la compañía
|
51
|
+
details = company_info.find_all('div', {'class': 'detail-item'})
|
52
|
+
for detail in details:
|
53
|
+
label = detail.find('span', {'class': 'label'})
|
54
|
+
value = detail.find('span', {'class': 'value'})
|
55
|
+
if label and value:
|
56
|
+
field = label.text.strip().lower()
|
57
|
+
val = value.text.strip()
|
58
|
+
|
59
|
+
if 'phone' in field:
|
60
|
+
result['phone_number'] = val
|
61
|
+
elif 'website' in field:
|
62
|
+
result['website'] = val
|
63
|
+
elif 'employees' in field:
|
64
|
+
result['employee_count'] = val
|
65
|
+
elif 'revenue' in field:
|
66
|
+
result['revenue_range'] = val
|
67
|
+
elif 'naics' in field:
|
68
|
+
result['naics_code'] = val
|
69
|
+
elif 'sic' in field:
|
70
|
+
result['sic_code'] = val
|
71
|
+
|
72
|
+
# 🔍 Extract NAICS & SIC codes and industry descriptions
|
73
|
+
result.update(self._extract_naics_sic(document))
|
74
|
+
|
75
|
+
# Extract company logo, headquarters, country, and description
|
76
|
+
result.update(self._extract_company_info(document))
|
77
|
+
|
78
|
+
# Verificamos si se encontró algún dato
|
79
|
+
has_data = any([
|
80
|
+
result.get('company_name'),
|
81
|
+
result.get('headquarters'),
|
82
|
+
result.get('country'),
|
83
|
+
result.get('phone_number'),
|
84
|
+
result.get('website'),
|
85
|
+
result.get('stock_symbol'),
|
86
|
+
result.get('naics_code'),
|
87
|
+
result.get('sic_code'),
|
88
|
+
result.get('employee_count'),
|
89
|
+
result.get('revenue_range'),
|
90
|
+
result.get('company_description'),
|
91
|
+
result.get('logo_url')
|
92
|
+
])
|
93
|
+
|
94
|
+
# Establecemos el estado según si encontramos datos o no
|
95
|
+
result['scrape_status'] = 'success' if has_data else 'no_data'
|
96
|
+
|
97
|
+
# Siempre devolvemos el resultado, tenga datos o no
|
98
|
+
return idx, result
|
99
|
+
|
100
|
+
except Exception as e:
|
101
|
+
self._logger.error(f"Error parsing Explorium data: {str(e)}")
|
102
|
+
result['scrape_status'] = f'error: {str(e)[:50]}'
|
103
|
+
return idx, result
|
104
|
+
|
105
|
+
def _extract_naics_sic(self, document: bs):
|
106
|
+
"""
|
107
|
+
Extract NAICS & SIC codes along with their industry descriptions.
|
108
|
+
|
109
|
+
Returns:
|
110
|
+
dict: A dictionary containing 'naics_code', 'sic_code', and 'industry' (comma-separated).
|
111
|
+
"""
|
112
|
+
result = {
|
113
|
+
'naics_code': None,
|
114
|
+
'sic_code': None,
|
115
|
+
'industry': None
|
116
|
+
}
|
117
|
+
|
118
|
+
naics_codes = []
|
119
|
+
sic_codes = []
|
120
|
+
industries = []
|
121
|
+
|
122
|
+
# Extract NAICS section
|
123
|
+
naics_section = document.find('div', {'data-id': 'company-stat-naics'})
|
124
|
+
if naics_section:
|
125
|
+
naics_entries = naics_section.find_all('p', {'class': 'ExpTypography-root'})
|
126
|
+
for entry in naics_entries:
|
127
|
+
code = entry.text.strip().strip(',')
|
128
|
+
industry_desc = entry.get('aria-label', '').strip()
|
129
|
+
if code:
|
130
|
+
naics_codes.append(code)
|
131
|
+
if industry_desc:
|
132
|
+
industries.append(industry_desc)
|
133
|
+
|
134
|
+
# Extract SIC section
|
135
|
+
sic_section = document.find('div', {'data-id': 'company-stat-sic'})
|
136
|
+
if sic_section:
|
137
|
+
sic_entries = sic_section.find_all('p', {'class': 'ExpTypography-root'})
|
138
|
+
for entry in sic_entries:
|
139
|
+
code = entry.text.strip().strip(',')
|
140
|
+
industry_desc = entry.get('aria-label', '').strip()
|
141
|
+
if code:
|
142
|
+
sic_codes.append(code)
|
143
|
+
if industry_desc:
|
144
|
+
industries.append(industry_desc)
|
145
|
+
|
146
|
+
# Convert lists to comma-separated strings
|
147
|
+
if naics_codes:
|
148
|
+
result['naics_code'] = ', '.join(naics_codes)
|
149
|
+
if sic_codes:
|
150
|
+
result['sic_code'] = ', '.join(sic_codes)
|
151
|
+
if industries:
|
152
|
+
result['industry'] = ', '.join(industries)
|
153
|
+
|
154
|
+
return result
|
155
|
+
|
156
|
+
def _extract_company_info(self, document: bs):
|
157
|
+
"""
|
158
|
+
Extract headquarters, country, company description, and logo.
|
159
|
+
"""
|
160
|
+
result = {
|
161
|
+
'headquarters': None,
|
162
|
+
'country': None,
|
163
|
+
'company_description': None,
|
164
|
+
'logo_url': None
|
165
|
+
}
|
166
|
+
|
167
|
+
# Extract headquarters address
|
168
|
+
address_section = document.find('div', {'data-id': 'info-address'})
|
169
|
+
if address_section:
|
170
|
+
address_element = address_section.find('p', {'aria-label': True})
|
171
|
+
if address_element:
|
172
|
+
address_text = address_element.get('aria-label', '').strip()
|
173
|
+
result['headquarters'] = address_text
|
174
|
+
|
175
|
+
# Extract country (last word in the address)
|
176
|
+
country = address_text.split(',')[-1].strip()
|
177
|
+
result['country'] = country if country else None
|
178
|
+
|
179
|
+
# Extract company description
|
180
|
+
name_element = document.find('h1', {'data-id': 'txt-company-name'})
|
181
|
+
description_element = document.find('p', {'class': 'ExpTypography-root ExpTypography-body1'})
|
182
|
+
if name_element and description_element:
|
183
|
+
company_name = name_element.text.strip()
|
184
|
+
company_desc = description_element.text.strip()
|
185
|
+
result['company_description'] = f"{company_name}: {company_desc}"
|
186
|
+
|
187
|
+
# Extract company logo
|
188
|
+
logo_element = document.find('img', {'alt': True, 'src': True})
|
189
|
+
if logo_element:
|
190
|
+
result['logo_url'] = logo_element['src']
|
191
|
+
|
192
|
+
return result
|
@@ -0,0 +1,206 @@
|
|
1
|
+
from bs4 import BeautifulSoup as bs
|
2
|
+
from .base import ScrapperBase
|
3
|
+
import json
|
4
|
+
|
5
|
+
|
6
|
+
class LeadiqScrapper(ScrapperBase):
    """Scrapper for company profile pages on leadiq.com.

    Relies on ``self._logger``, ``self.search_term_used``, ``self._counter``,
    ``self._parse_address`` and ``self._standardize_name``, none of which are
    defined in this class (presumably provided by ``ScrapperBase`` --
    confirm against the base).
    """
    # Site this scrapper targets.
    domain: str = 'leadiq.com'
    # Query template; ``{}`` receives the standardized company name.
    search_term: str = "site:leadiq.com {}"
    # NOTE(review): not read anywhere in this class; presumably consumed by
    # the base class or the caller to recognise matching search results.
    keywords: list = [
        'Email Formats & Email Address',
        'Company Overview',
        'Employee Directory',
        'Contact Details & Competitors',
        'Email Format'
    ]

    def define_search_term(self, term: str):
        """Return the search query for ``term`` after standardizing it."""
        return self.search_term.format(self._standardize_name(term))

    async def scrapping(self, document: bs, idx: int, row: dict):
        """Scrape company information from a LeadIQ page.

        Args:
            document: Parsed HTML of the company page.
            idx: Row identifier, handed back unchanged.
            row: Existing row data; it is copied, never mutated.

        Returns:
            tuple: ``(idx, result)`` where ``result`` is the copy of ``row``
            enriched with whatever could be extracted. ``scrape_status`` ends
            up as ``'success'``, ``'no_data'`` or ``'error: <message>'``
            (message truncated to 50 characters); it stays ``'pending'`` when
            the overview section is missing (see the note in the body).
        """
        # Start from the existing row and stamp the LeadIQ-specific fields.
        result = row.copy()
        result.update({
            'source_platform': 'leadiq',
            'scrape_status': 'pending',
            'search_term': self.search_term_used
        })

        try:
            # The logo image carries both the company name (alt) and its URL.
            logo = document.find('img', {'alt': True, 'width': '76.747'})
            if logo:
                result['company_name'] = logo.get('alt')
                result['logo_url'] = logo.get('src')

            # Revenue range lives in the highlight-right section.
            highlight_right = document.find('div', {'class': 'highlight-right'})
            if highlight_right:
                start_span = highlight_right.find('span', {'class': 'start'})
                if start_span:
                    result['revenue_range'] = self._format_revenue_range(
                        start_span,
                        start_span.find_next_sibling('span', {'class': 'end'})
                    )

            # NOTE(review): both early exits below return with scrape_status
            # still 'pending' and without touching self._counter. Kept as-is
            # because callers may rely on that status -- confirm intent.
            highlight_left = document.find('div', {'class': 'highlight-left'})
            if not highlight_left:
                self._logger.warning("Could not find highlight-left section")
                return idx, result

            overview_section = highlight_left.find('div', {'class': 'card span'})
            if not overview_section:
                return idx, result

            self._apply_overview(overview_section, result)
            self._apply_hero(document, result)

            similar_companies = self._collect_similar_companies(document)
            if similar_companies:
                try:
                    result['similar_companies'] = json.dumps(
                        similar_companies,
                        ensure_ascii=False,
                        allow_nan=False,
                        separators=(',', ':')
                    )
                except Exception as e:
                    self._logger.error(
                        f"Error formatting similar companies JSON: {str(e)}"
                    )
                    result['similar_companies'] = None

            # Update the counter, then the status.
            self._counter += 1

            # Any non-empty field among these counts as "data found".
            tracked_fields = (
                'company_name',
                'logo_url',
                'address',
                'phone_number',
                'website',
                'stock_symbol',
                'naics_code',
                'employee_count',
                'revenue_range',
                'similar_companies',
            )
            has_data = any(result.get(field) for field in tracked_fields)
            result['scrape_status'] = 'success' if has_data else 'no_data'
            # The result is always returned, with or without data.
            return idx, result

        except Exception as e:
            self._logger.error(f"Error parsing LeadIQ data: {str(e)}")
            result['scrape_status'] = f'error: {str(e)[:50]}'
            return idx, result

    @staticmethod
    def _format_revenue_range(start_tag, end_tag) -> str:
        """Return ``"<start> - <end>"``, or just ``<start>`` without an end tag."""
        start_value = start_tag.text.strip()
        if end_tag:
            return f"{start_value} - {end_tag.text.strip()}"
        return start_value

    def _apply_overview(self, overview_section, result: dict) -> None:
        """Fill ``result`` in place from the overview card's ``<dl>`` items.

        Writing straight into ``result`` means fields parsed before an
        unexpected failure are kept by the caller's error path.
        """
        dl_element = overview_section.find('dl')
        if not dl_element:
            return

        # Labels whose value is stored verbatim under another column name.
        verbatim_columns = {
            'stock symbol': 'stock_symbol',
            'naics code': 'naics_code',
            'employees': 'employee_count',
            'sic code': 'sic_code',
        }

        for item in dl_element.find_all('div', {'class': 'item'}):
            dt = item.find('dt')
            dd = item.find('dd')
            if not (dt and dd):
                continue

            field = dt.text.strip().lower()
            value = dd.text.strip()

            if field == 'headquarters':
                result.update(self._parse_address(value))
                # Country is the last whitespace-separated token; it
                # deliberately overrides whatever _parse_address produced.
                parts = value.split()
                result['country'] = parts[-1] if len(parts) > 1 else None
            elif field == 'phone number':
                # The page masks digits with '****'; store zeros instead.
                result['phone_number'] = value.replace('****', '0000')
            elif field == 'website':
                link = dd.find('a')
                # .get() avoids a KeyError on an anchor without href; in
                # that case fall back to the visible text.
                href = link.get('href') if link else None
                result['website'] = href or value
            elif field in verbatim_columns:
                result[verbatim_columns[field]] = value

    @staticmethod
    def _apply_hero(document: bs, result: dict) -> None:
        """Fill ``result`` in place from the ``card hero snug`` section."""
        hero_section = document.find('div', {'class': 'card hero snug'})
        if not hero_section:
            return

        heading = hero_section.find('h1')
        if heading:
            result['company_name'] = heading.text.strip()

        # The info paragraph holds industry, location and employee count as
        # its first three <span> elements, in that order.
        info_p = hero_section.find('p', {'class': 'info'})
        if info_p:
            spans = info_p.find_all('span')
            if len(spans) >= 3:
                result['industry'] = spans[0].text.strip()
                result['location'] = spans[1].text.strip()
                result['number_employees'] = spans[2].text.strip()

        description = hero_section.find('pre')
        if description:
            result['company_description'] = description.text.strip()

    def _collect_similar_companies(self, document: bs) -> list:
        """Return one dict per entry of the ``#similar`` list.

        Each dict has 'name', 'leadiq_url' and 'logo_url', plus
        'revenue_range' when a revenue span is present. Entries without a
        link or without an ``<h3>`` name are skipped.
        """
        similar_section = document.find('div', {'id': 'similar'})
        if not similar_section:
            return []

        companies = []
        for entry in similar_section.find_all('li'):
            link = entry.find('a')
            if not link:
                continue

            name_tag = link.find('h3')
            if not name_tag:
                continue

            logo_tag = link.find('img')
            company = {
                'name': name_tag.text.strip(),  # quotes are not escaped here
                # .get() keeps one anchor/image lacking the attribute from
                # aborting the whole scrape with a KeyError.
                'leadiq_url': link.get('href'),
                'logo_url': logo_tag.get('src') if logo_tag else None,
            }

            # The revenue wrapper is the first span containing a 'start' span.
            revenue_wrapper = next(
                (
                    span for span in link.find_all('span')
                    if span.find('span', {'class': 'start'})
                ),
                None
            )
            if revenue_wrapper:
                start = revenue_wrapper.find('span', {'class': 'start'})
                if start:
                    company['revenue_range'] = self._format_revenue_range(
                        start,
                        revenue_wrapper.find('span', {'class': 'end'})
                    )

            companies.append(company)

        return companies
|