flowtask 5.8.4__cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowtask/__init__.py +93 -0
- flowtask/__main__.py +38 -0
- flowtask/bots/__init__.py +6 -0
- flowtask/bots/check.py +93 -0
- flowtask/bots/codebot.py +51 -0
- flowtask/components/ASPX.py +148 -0
- flowtask/components/AddDataset.py +352 -0
- flowtask/components/Amazon.py +523 -0
- flowtask/components/AutoTask.py +314 -0
- flowtask/components/Azure.py +80 -0
- flowtask/components/AzureUsers.py +106 -0
- flowtask/components/BaseAction.py +91 -0
- flowtask/components/BaseLoop.py +198 -0
- flowtask/components/BestBuy.py +800 -0
- flowtask/components/CSVToGCS.py +120 -0
- flowtask/components/CompanyScraper/__init__.py +1 -0
- flowtask/components/CompanyScraper/parsers/__init__.py +6 -0
- flowtask/components/CompanyScraper/parsers/base.py +102 -0
- flowtask/components/CompanyScraper/parsers/explorium.py +192 -0
- flowtask/components/CompanyScraper/parsers/leadiq.py +206 -0
- flowtask/components/CompanyScraper/parsers/rocket.py +133 -0
- flowtask/components/CompanyScraper/parsers/siccode.py +109 -0
- flowtask/components/CompanyScraper/parsers/visualvisitor.py +130 -0
- flowtask/components/CompanyScraper/parsers/zoominfo.py +118 -0
- flowtask/components/CompanyScraper/scrapper.py +1054 -0
- flowtask/components/CopyTo.py +177 -0
- flowtask/components/CopyToBigQuery.py +243 -0
- flowtask/components/CopyToMongoDB.py +291 -0
- flowtask/components/CopyToPg.py +609 -0
- flowtask/components/CopyToRethink.py +207 -0
- flowtask/components/CreateGCSBucket.py +102 -0
- flowtask/components/CreateReport/CreateReport.py +228 -0
- flowtask/components/CreateReport/__init__.py +9 -0
- flowtask/components/CreateReport/charts/__init__.py +15 -0
- flowtask/components/CreateReport/charts/bar.py +51 -0
- flowtask/components/CreateReport/charts/base.py +66 -0
- flowtask/components/CreateReport/charts/pie.py +64 -0
- flowtask/components/CreateReport/utils.py +9 -0
- flowtask/components/CustomerSatisfaction.py +196 -0
- flowtask/components/DataInput.py +200 -0
- flowtask/components/DateList.py +255 -0
- flowtask/components/DbClient.py +163 -0
- flowtask/components/DialPad.py +146 -0
- flowtask/components/DocumentDBQuery.py +200 -0
- flowtask/components/DownloadFrom.py +371 -0
- flowtask/components/DownloadFromD2L.py +113 -0
- flowtask/components/DownloadFromFTP.py +181 -0
- flowtask/components/DownloadFromIMAP.py +315 -0
- flowtask/components/DownloadFromS3.py +198 -0
- flowtask/components/DownloadFromSFTP.py +265 -0
- flowtask/components/DownloadFromSharepoint.py +110 -0
- flowtask/components/DownloadFromSmartSheet.py +114 -0
- flowtask/components/DownloadS3File.py +229 -0
- flowtask/components/Dummy.py +59 -0
- flowtask/components/DuplicatePhoto.py +411 -0
- flowtask/components/EmployeeEvaluation.py +237 -0
- flowtask/components/ExecuteSQL.py +323 -0
- flowtask/components/ExtractHTML.py +178 -0
- flowtask/components/FileBase.py +178 -0
- flowtask/components/FileCopy.py +181 -0
- flowtask/components/FileDelete.py +82 -0
- flowtask/components/FileExists.py +146 -0
- flowtask/components/FileIteratorDelete.py +112 -0
- flowtask/components/FileList.py +194 -0
- flowtask/components/FileOpen.py +75 -0
- flowtask/components/FileRead.py +120 -0
- flowtask/components/FileRename.py +106 -0
- flowtask/components/FilterIf.py +284 -0
- flowtask/components/FilterRows/FilterRows.py +200 -0
- flowtask/components/FilterRows/__init__.py +10 -0
- flowtask/components/FilterRows/functions.py +4 -0
- flowtask/components/GCSToBigQuery.py +103 -0
- flowtask/components/GoogleA4.py +150 -0
- flowtask/components/GoogleGeoCoding.py +344 -0
- flowtask/components/GooglePlaces.py +315 -0
- flowtask/components/GoogleSearch.py +539 -0
- flowtask/components/HTTPClient.py +268 -0
- flowtask/components/ICIMS.py +146 -0
- flowtask/components/IF.py +179 -0
- flowtask/components/IcimsFolderCopy.py +173 -0
- flowtask/components/ImageFeatures/__init__.py +5 -0
- flowtask/components/ImageFeatures/process.py +233 -0
- flowtask/components/IteratorBase.py +251 -0
- flowtask/components/LangchainLoader/__init__.py +5 -0
- flowtask/components/LangchainLoader/loader.py +194 -0
- flowtask/components/LangchainLoader/loaders/__init__.py +22 -0
- flowtask/components/LangchainLoader/loaders/abstract.py +362 -0
- flowtask/components/LangchainLoader/loaders/basepdf.py +50 -0
- flowtask/components/LangchainLoader/loaders/docx.py +91 -0
- flowtask/components/LangchainLoader/loaders/html.py +119 -0
- flowtask/components/LangchainLoader/loaders/pdfblocks.py +146 -0
- flowtask/components/LangchainLoader/loaders/pdfmark.py +79 -0
- flowtask/components/LangchainLoader/loaders/pdftables.py +135 -0
- flowtask/components/LangchainLoader/loaders/qa.py +67 -0
- flowtask/components/LangchainLoader/loaders/txt.py +55 -0
- flowtask/components/LeadIQ.py +650 -0
- flowtask/components/Loop.py +253 -0
- flowtask/components/Lowes.py +334 -0
- flowtask/components/MS365Usage.py +156 -0
- flowtask/components/MSTeamsMessages.py +320 -0
- flowtask/components/MarketClustering.py +1051 -0
- flowtask/components/MergeFiles.py +362 -0
- flowtask/components/MilvusOutput.py +87 -0
- flowtask/components/NearByStores.py +175 -0
- flowtask/components/NetworkNinja/__init__.py +6 -0
- flowtask/components/NetworkNinja/models/__init__.py +52 -0
- flowtask/components/NetworkNinja/models/abstract.py +177 -0
- flowtask/components/NetworkNinja/models/account.py +39 -0
- flowtask/components/NetworkNinja/models/client.py +19 -0
- flowtask/components/NetworkNinja/models/district.py +14 -0
- flowtask/components/NetworkNinja/models/events.py +101 -0
- flowtask/components/NetworkNinja/models/forms.py +499 -0
- flowtask/components/NetworkNinja/models/market.py +16 -0
- flowtask/components/NetworkNinja/models/organization.py +34 -0
- flowtask/components/NetworkNinja/models/photos.py +125 -0
- flowtask/components/NetworkNinja/models/project.py +44 -0
- flowtask/components/NetworkNinja/models/region.py +28 -0
- flowtask/components/NetworkNinja/models/store.py +203 -0
- flowtask/components/NetworkNinja/models/user.py +151 -0
- flowtask/components/NetworkNinja/router.py +854 -0
- flowtask/components/Odoo.py +175 -0
- flowtask/components/OdooInjector.py +192 -0
- flowtask/components/OpenFromXML.py +126 -0
- flowtask/components/OpenWeather.py +41 -0
- flowtask/components/OpenWithBase.py +616 -0
- flowtask/components/OpenWithPandas.py +715 -0
- flowtask/components/PGPDecrypt.py +199 -0
- flowtask/components/PandasIterator.py +187 -0
- flowtask/components/PandasToFile.py +189 -0
- flowtask/components/Paradox.py +339 -0
- flowtask/components/ParamIterator.py +117 -0
- flowtask/components/ParseHTML.py +84 -0
- flowtask/components/PlacerStores.py +249 -0
- flowtask/components/Pokemon.py +507 -0
- flowtask/components/PositiveBot.py +62 -0
- flowtask/components/PowerPointSlide.py +400 -0
- flowtask/components/PrintMessage.py +127 -0
- flowtask/components/ProductCompetitors/__init__.py +5 -0
- flowtask/components/ProductCompetitors/parsers/__init__.py +7 -0
- flowtask/components/ProductCompetitors/parsers/base.py +72 -0
- flowtask/components/ProductCompetitors/parsers/bestbuy.py +86 -0
- flowtask/components/ProductCompetitors/parsers/lowes.py +103 -0
- flowtask/components/ProductCompetitors/scrapper.py +155 -0
- flowtask/components/ProductCompliant.py +169 -0
- flowtask/components/ProductInfo/__init__.py +1 -0
- flowtask/components/ProductInfo/parsers/__init__.py +5 -0
- flowtask/components/ProductInfo/parsers/base.py +83 -0
- flowtask/components/ProductInfo/parsers/brother.py +97 -0
- flowtask/components/ProductInfo/parsers/canon.py +167 -0
- flowtask/components/ProductInfo/parsers/epson.py +118 -0
- flowtask/components/ProductInfo/parsers/hp.py +131 -0
- flowtask/components/ProductInfo/parsers/samsung.py +97 -0
- flowtask/components/ProductInfo/scraper.py +319 -0
- flowtask/components/ProductPricing.py +118 -0
- flowtask/components/QS.py +261 -0
- flowtask/components/QSBase.py +201 -0
- flowtask/components/QueryIterator.py +273 -0
- flowtask/components/QueryToInsert.py +327 -0
- flowtask/components/QueryToPandas.py +432 -0
- flowtask/components/RESTClient.py +195 -0
- flowtask/components/RethinkDBQuery.py +189 -0
- flowtask/components/Rsync.py +74 -0
- flowtask/components/RunSSH.py +59 -0
- flowtask/components/RunShell.py +71 -0
- flowtask/components/SalesForce.py +20 -0
- flowtask/components/SaveImageBank/__init__.py +257 -0
- flowtask/components/SchedulingVisits.py +592 -0
- flowtask/components/ScrapPage.py +216 -0
- flowtask/components/ScrapSearch.py +79 -0
- flowtask/components/SendNotify.py +257 -0
- flowtask/components/SentimentAnalysis.py +694 -0
- flowtask/components/ServiceScrapper/__init__.py +5 -0
- flowtask/components/ServiceScrapper/parsers/__init__.py +1 -0
- flowtask/components/ServiceScrapper/parsers/base.py +94 -0
- flowtask/components/ServiceScrapper/parsers/costco.py +93 -0
- flowtask/components/ServiceScrapper/scrapper.py +199 -0
- flowtask/components/SetVariables.py +156 -0
- flowtask/components/SubTask.py +182 -0
- flowtask/components/SuiteCRM.py +48 -0
- flowtask/components/Switch.py +175 -0
- flowtask/components/TableBase.py +148 -0
- flowtask/components/TableDelete.py +312 -0
- flowtask/components/TableInput.py +143 -0
- flowtask/components/TableOutput/TableOutput.py +384 -0
- flowtask/components/TableOutput/__init__.py +3 -0
- flowtask/components/TableSchema.py +534 -0
- flowtask/components/Target.py +223 -0
- flowtask/components/ThumbnailGenerator.py +156 -0
- flowtask/components/ToPandas.py +67 -0
- flowtask/components/TransformRows/TransformRows.py +507 -0
- flowtask/components/TransformRows/__init__.py +9 -0
- flowtask/components/TransformRows/functions.py +559 -0
- flowtask/components/TransposeRows.py +176 -0
- flowtask/components/UPCDatabase.py +86 -0
- flowtask/components/UnGzip.py +171 -0
- flowtask/components/Uncompress.py +172 -0
- flowtask/components/UniqueRows.py +126 -0
- flowtask/components/Unzip.py +107 -0
- flowtask/components/UpdateOperationalVars.py +147 -0
- flowtask/components/UploadTo.py +299 -0
- flowtask/components/UploadToS3.py +136 -0
- flowtask/components/UploadToSFTP.py +160 -0
- flowtask/components/UploadToSharepoint.py +205 -0
- flowtask/components/UserFunc.py +122 -0
- flowtask/components/VivaTracker.py +140 -0
- flowtask/components/WSDLClient.py +123 -0
- flowtask/components/Wait.py +18 -0
- flowtask/components/Walmart.py +199 -0
- flowtask/components/Workplace.py +134 -0
- flowtask/components/XMLToPandas.py +267 -0
- flowtask/components/Zammad/__init__.py +41 -0
- flowtask/components/Zammad/models.py +0 -0
- flowtask/components/ZoomInfoScraper.py +409 -0
- flowtask/components/__init__.py +104 -0
- flowtask/components/abstract.py +18 -0
- flowtask/components/flow.py +530 -0
- flowtask/components/google.py +335 -0
- flowtask/components/group.py +221 -0
- flowtask/components/py.typed +0 -0
- flowtask/components/reviewscrap.py +132 -0
- flowtask/components/tAutoincrement.py +117 -0
- flowtask/components/tConcat.py +109 -0
- flowtask/components/tExplode.py +119 -0
- flowtask/components/tFilter.py +184 -0
- flowtask/components/tGroup.py +236 -0
- flowtask/components/tJoin.py +270 -0
- flowtask/components/tMap/__init__.py +9 -0
- flowtask/components/tMap/functions.py +54 -0
- flowtask/components/tMap/tMap.py +450 -0
- flowtask/components/tMelt.py +112 -0
- flowtask/components/tMerge.py +114 -0
- flowtask/components/tOrder.py +93 -0
- flowtask/components/tPandas.py +94 -0
- flowtask/components/tPivot.py +71 -0
- flowtask/components/tPluckCols.py +76 -0
- flowtask/components/tUnnest.py +82 -0
- flowtask/components/user.py +401 -0
- flowtask/conf.py +457 -0
- flowtask/download.py +102 -0
- flowtask/events/__init__.py +11 -0
- flowtask/events/events/__init__.py +20 -0
- flowtask/events/events/abstract.py +95 -0
- flowtask/events/events/alerts/__init__.py +362 -0
- flowtask/events/events/alerts/colfunctions.py +131 -0
- flowtask/events/events/alerts/functions.py +158 -0
- flowtask/events/events/dummy.py +12 -0
- flowtask/events/events/exec.py +124 -0
- flowtask/events/events/file/__init__.py +7 -0
- flowtask/events/events/file/base.py +51 -0
- flowtask/events/events/file/copy.py +23 -0
- flowtask/events/events/file/delete.py +16 -0
- flowtask/events/events/interfaces/__init__.py +9 -0
- flowtask/events/events/interfaces/client.py +67 -0
- flowtask/events/events/interfaces/credentials.py +28 -0
- flowtask/events/events/interfaces/notifications.py +58 -0
- flowtask/events/events/jira.py +122 -0
- flowtask/events/events/log.py +26 -0
- flowtask/events/events/logerr.py +52 -0
- flowtask/events/events/notify.py +59 -0
- flowtask/events/events/notify_event.py +160 -0
- flowtask/events/events/publish.py +54 -0
- flowtask/events/events/sendfile.py +104 -0
- flowtask/events/events/task.py +97 -0
- flowtask/events/events/teams.py +98 -0
- flowtask/events/events/webhook.py +58 -0
- flowtask/events/manager.py +287 -0
- flowtask/exceptions.c +39393 -0
- flowtask/exceptions.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/extensions/__init__.py +3 -0
- flowtask/extensions/abstract.py +82 -0
- flowtask/extensions/logging/__init__.py +65 -0
- flowtask/hooks/__init__.py +9 -0
- flowtask/hooks/actions/__init__.py +22 -0
- flowtask/hooks/actions/abstract.py +66 -0
- flowtask/hooks/actions/dummy.py +23 -0
- flowtask/hooks/actions/jira.py +74 -0
- flowtask/hooks/actions/rest.py +320 -0
- flowtask/hooks/actions/sampledata.py +37 -0
- flowtask/hooks/actions/sensor.py +23 -0
- flowtask/hooks/actions/task.py +9 -0
- flowtask/hooks/actions/ticket.py +37 -0
- flowtask/hooks/actions/zammad.py +55 -0
- flowtask/hooks/hook.py +62 -0
- flowtask/hooks/models.py +17 -0
- flowtask/hooks/service.py +187 -0
- flowtask/hooks/step.py +91 -0
- flowtask/hooks/types/__init__.py +23 -0
- flowtask/hooks/types/base.py +129 -0
- flowtask/hooks/types/brokers/__init__.py +11 -0
- flowtask/hooks/types/brokers/base.py +54 -0
- flowtask/hooks/types/brokers/mqtt.py +35 -0
- flowtask/hooks/types/brokers/rabbitmq.py +82 -0
- flowtask/hooks/types/brokers/redis.py +83 -0
- flowtask/hooks/types/brokers/sqs.py +44 -0
- flowtask/hooks/types/fs.py +232 -0
- flowtask/hooks/types/http.py +49 -0
- flowtask/hooks/types/imap.py +200 -0
- flowtask/hooks/types/jira.py +279 -0
- flowtask/hooks/types/mail.py +205 -0
- flowtask/hooks/types/postgres.py +98 -0
- flowtask/hooks/types/responses/__init__.py +8 -0
- flowtask/hooks/types/responses/base.py +5 -0
- flowtask/hooks/types/sharepoint.py +288 -0
- flowtask/hooks/types/ssh.py +141 -0
- flowtask/hooks/types/tagged.py +59 -0
- flowtask/hooks/types/upload.py +85 -0
- flowtask/hooks/types/watch.py +71 -0
- flowtask/hooks/types/web.py +36 -0
- flowtask/interfaces/AzureClient.py +137 -0
- flowtask/interfaces/AzureGraph.py +839 -0
- flowtask/interfaces/Boto3Client.py +326 -0
- flowtask/interfaces/DropboxClient.py +173 -0
- flowtask/interfaces/ExcelHandler.py +94 -0
- flowtask/interfaces/FTPClient.py +131 -0
- flowtask/interfaces/GoogleCalendar.py +201 -0
- flowtask/interfaces/GoogleClient.py +133 -0
- flowtask/interfaces/GoogleDrive.py +127 -0
- flowtask/interfaces/GoogleGCS.py +89 -0
- flowtask/interfaces/GoogleGeocoding.py +93 -0
- flowtask/interfaces/GoogleLang.py +114 -0
- flowtask/interfaces/GooglePub.py +61 -0
- flowtask/interfaces/GoogleSheet.py +68 -0
- flowtask/interfaces/IMAPClient.py +137 -0
- flowtask/interfaces/O365Calendar.py +113 -0
- flowtask/interfaces/O365Client.py +220 -0
- flowtask/interfaces/OneDrive.py +284 -0
- flowtask/interfaces/Outlook.py +155 -0
- flowtask/interfaces/ParrotBot.py +130 -0
- flowtask/interfaces/SSHClient.py +378 -0
- flowtask/interfaces/Sharepoint.py +496 -0
- flowtask/interfaces/__init__.py +36 -0
- flowtask/interfaces/azureauth.py +119 -0
- flowtask/interfaces/cache.py +201 -0
- flowtask/interfaces/client.py +82 -0
- flowtask/interfaces/compress.py +525 -0
- flowtask/interfaces/credentials.py +124 -0
- flowtask/interfaces/d2l.py +239 -0
- flowtask/interfaces/databases/__init__.py +5 -0
- flowtask/interfaces/databases/db.py +223 -0
- flowtask/interfaces/databases/documentdb.py +55 -0
- flowtask/interfaces/databases/rethink.py +39 -0
- flowtask/interfaces/dataframes/__init__.py +11 -0
- flowtask/interfaces/dataframes/abstract.py +21 -0
- flowtask/interfaces/dataframes/arrow.py +71 -0
- flowtask/interfaces/dataframes/dt.py +69 -0
- flowtask/interfaces/dataframes/pandas.py +167 -0
- flowtask/interfaces/dataframes/polars.py +60 -0
- flowtask/interfaces/db.py +263 -0
- flowtask/interfaces/env.py +46 -0
- flowtask/interfaces/func.py +137 -0
- flowtask/interfaces/http.py +1780 -0
- flowtask/interfaces/locale.py +40 -0
- flowtask/interfaces/log.py +75 -0
- flowtask/interfaces/mask.py +143 -0
- flowtask/interfaces/notification.py +154 -0
- flowtask/interfaces/playwright.py +339 -0
- flowtask/interfaces/powerpoint.py +368 -0
- flowtask/interfaces/py.typed +0 -0
- flowtask/interfaces/qs.py +376 -0
- flowtask/interfaces/result.py +87 -0
- flowtask/interfaces/selenium_service.py +779 -0
- flowtask/interfaces/smartsheet.py +154 -0
- flowtask/interfaces/stat.py +39 -0
- flowtask/interfaces/task.py +96 -0
- flowtask/interfaces/template.py +118 -0
- flowtask/interfaces/vectorstores/__init__.py +1 -0
- flowtask/interfaces/vectorstores/abstract.py +133 -0
- flowtask/interfaces/vectorstores/milvus.py +669 -0
- flowtask/interfaces/zammad.py +107 -0
- flowtask/models.py +193 -0
- flowtask/parsers/__init__.py +15 -0
- flowtask/parsers/_yaml.c +11978 -0
- flowtask/parsers/_yaml.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/argparser.py +235 -0
- flowtask/parsers/base.c +15155 -0
- flowtask/parsers/base.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/json.c +11968 -0
- flowtask/parsers/json.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/maps.py +49 -0
- flowtask/parsers/toml.c +11968 -0
- flowtask/parsers/toml.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/plugins/__init__.py +16 -0
- flowtask/plugins/components/__init__.py +0 -0
- flowtask/plugins/handler/__init__.py +45 -0
- flowtask/plugins/importer.py +31 -0
- flowtask/plugins/sources/__init__.py +0 -0
- flowtask/runner.py +283 -0
- flowtask/scheduler/__init__.py +9 -0
- flowtask/scheduler/functions.py +493 -0
- flowtask/scheduler/handlers/__init__.py +8 -0
- flowtask/scheduler/handlers/manager.py +504 -0
- flowtask/scheduler/handlers/models.py +58 -0
- flowtask/scheduler/handlers/service.py +72 -0
- flowtask/scheduler/notifications.py +65 -0
- flowtask/scheduler/scheduler.py +993 -0
- flowtask/services/__init__.py +0 -0
- flowtask/services/bots/__init__.py +0 -0
- flowtask/services/bots/telegram.py +264 -0
- flowtask/services/files/__init__.py +11 -0
- flowtask/services/files/manager.py +522 -0
- flowtask/services/files/model.py +37 -0
- flowtask/services/files/service.py +767 -0
- flowtask/services/jira/__init__.py +3 -0
- flowtask/services/jira/jira_actions.py +191 -0
- flowtask/services/tasks/__init__.py +13 -0
- flowtask/services/tasks/launcher.py +213 -0
- flowtask/services/tasks/manager.py +323 -0
- flowtask/services/tasks/service.py +275 -0
- flowtask/services/tasks/task_manager.py +376 -0
- flowtask/services/tasks/tasks.py +155 -0
- flowtask/storages/__init__.py +16 -0
- flowtask/storages/exceptions.py +12 -0
- flowtask/storages/files/__init__.py +8 -0
- flowtask/storages/files/abstract.py +29 -0
- flowtask/storages/files/filesystem.py +66 -0
- flowtask/storages/tasks/__init__.py +19 -0
- flowtask/storages/tasks/abstract.py +26 -0
- flowtask/storages/tasks/database.py +33 -0
- flowtask/storages/tasks/filesystem.py +108 -0
- flowtask/storages/tasks/github.py +119 -0
- flowtask/storages/tasks/memory.py +45 -0
- flowtask/storages/tasks/row.py +25 -0
- flowtask/tasks/__init__.py +0 -0
- flowtask/tasks/abstract.py +526 -0
- flowtask/tasks/command.py +118 -0
- flowtask/tasks/pile.py +486 -0
- flowtask/tasks/py.typed +0 -0
- flowtask/tasks/task.py +778 -0
- flowtask/template/__init__.py +161 -0
- flowtask/tests.py +257 -0
- flowtask/types/__init__.py +8 -0
- flowtask/types/typedefs.c +11347 -0
- flowtask/types/typedefs.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/__init__.py +24 -0
- flowtask/utils/constants.py +117 -0
- flowtask/utils/encoders.py +21 -0
- flowtask/utils/executor.py +112 -0
- flowtask/utils/functions.cpp +14280 -0
- flowtask/utils/functions.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/json.cpp +13349 -0
- flowtask/utils/json.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/mail.py +63 -0
- flowtask/utils/parseqs.c +13324 -0
- flowtask/utils/parserqs.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/stats.py +308 -0
- flowtask/utils/transformations.py +74 -0
- flowtask/utils/uv.py +12 -0
- flowtask/utils/validators.py +97 -0
- flowtask/version.py +11 -0
- flowtask-5.8.4.dist-info/LICENSE +201 -0
- flowtask-5.8.4.dist-info/METADATA +209 -0
- flowtask-5.8.4.dist-info/RECORD +470 -0
- flowtask-5.8.4.dist-info/WHEEL +6 -0
- flowtask-5.8.4.dist-info/entry_points.txt +3 -0
- flowtask-5.8.4.dist-info/top_level.txt +2 -0
- plugins/components/CreateQR.py +39 -0
- plugins/components/TestComponent.py +28 -0
- plugins/components/Use1.py +13 -0
- plugins/components/Workplace.py +117 -0
- plugins/components/__init__.py +3 -0
- plugins/sources/__init__.py +0 -0
- plugins/sources/get_populartimes.py +78 -0
- plugins/sources/google.py +150 -0
- plugins/sources/hubspot.py +679 -0
- plugins/sources/icims.py +679 -0
- plugins/sources/mobileinsight.py +501 -0
- plugins/sources/newrelic.py +262 -0
- plugins/sources/uap.py +268 -0
- plugins/sources/venu.py +244 -0
- plugins/sources/vocinity.py +314 -0
@@ -0,0 +1,1054 @@
|
|
1
|
+
from collections.abc import Callable
|
2
|
+
import asyncio
|
3
|
+
import random
|
4
|
+
import backoff
|
5
|
+
import httpx
|
6
|
+
from typing import List, Optional, Dict, Any
|
7
|
+
from tqdm.asyncio import tqdm
|
8
|
+
from fuzzywuzzy import fuzz
|
9
|
+
import pandas as pd
|
10
|
+
from bs4 import BeautifulSoup
|
11
|
+
from duckduckgo_search.exceptions import RatelimitException
|
12
|
+
from ...exceptions import ComponentError, ConfigError
|
13
|
+
from ...interfaces import HTTPService, SeleniumService
|
14
|
+
from ...interfaces.http import ua, bad_gateway_exception
|
15
|
+
from ..flow import FlowComponent
|
16
|
+
from .parsers import (
|
17
|
+
LeadiqScrapper,
|
18
|
+
ExploriumScrapper,
|
19
|
+
ZoomInfoScrapper,
|
20
|
+
SicCodeScrapper,
|
21
|
+
RocketReachScrapper,
|
22
|
+
VisualVisitorScrapper
|
23
|
+
)
|
24
|
+
import json
|
25
|
+
import re
|
26
|
+
|
27
|
+
|
28
|
+
class CompanyScraper(FlowComponent, SeleniumService, HTTPService):
|
29
|
+
"""
|
30
|
+
Company Scraper Component
|
31
|
+
|
32
|
+
Overview:
|
33
|
+
|
34
|
+
This component scrapes company information from different sources using HTTPService.
|
35
|
+
It can receive URLs from a previous component (like GoogleSearch) and extract
|
36
|
+
specific company information.
|
37
|
+
|
38
|
+
.. table:: Properties
|
39
|
+
:widths: auto
|
40
|
+
|
41
|
+
+-----------------------+----------+------------------------------------------------------------------------------------------------------+
|
42
|
+
| Name | Required | Description |
|
43
|
+
+-----------------------+----------+------------------------------------------------------------------------------------------------------+
|
44
|
+
| url_column (str) | Yes | Name of the column containing URLs to scrape (default: 'search_url') |
|
45
|
+
+-----------------------+----------+------------------------------------------------------------------------------------------------------+
|
46
|
+
| wait_for (tuple) | No | Element to wait for before scraping (default: ('class', 'company-overview')) |
|
47
|
+
+-----------------------+----------+------------------------------------------------------------------------------------------------------+
|
48
|
+
|
49
|
+
Return:
|
50
|
+
|
51
|
+
The component adds new columns to the DataFrame with company information:
|
52
|
+
- headquarters
|
53
|
+
- phone_number
|
54
|
+
- website
|
55
|
+
- stock_symbol
|
56
|
+
- naics_code
|
57
|
+
- employee_count
|
58
|
+
""" # noqa: E501
|
59
|
+
|
60
|
+
def __init__(
    self,
    loop: asyncio.AbstractEventLoop = None,
    job: Callable = None,
    stat: Callable = None,
    **kwargs,
) -> None:
    """Configure the scraper from keyword arguments.

    Keyword Args:
        column_name (str): DataFrame column holding the company name to
            search for (default: ``'company_name'``).
        scrappers (list): Names of the parser backends to run
            (default: ``['leadiq']``).
        wait_for (tuple): Selenium wait condition as ``(by, value)``
            (default: ``('class', 'company-overview')``).
        chunk_size (int): Rows per processing chunk (default: 100).
        concurrently (bool): Run scraping tasks concurrently instead of
            sequentially (default: True).
        task_parts (int): Number of slices the task list is split into
            for concurrent execution (default: 10).
        headers (dict): Extra HTTP headers merged over the defaults.
    """
    # Column of the input DataFrame that drives the search.
    self.info_column: str = kwargs.get('column_name', 'company_name')
    # Parser backends to apply, in order.
    self.scrappers: list = kwargs.get('scrappers', ['leadiq'])
    # (by, value) pair handed to the Selenium wait helper.
    self.wait_for: tuple = kwargs.get('wait_for', ('class', 'company-overview'))
    self._counter: int = 0
    # Proxy policy: always proxy, paid pool only (no free proxies).
    self.use_proxy: bool = True
    self._free_proxy: bool = False
    self.paid_proxy: bool = True
    self.chunk_size: int = kwargs.get('chunk_size', 100)
    self.concurrently: bool = kwargs.get('concurrently', True)
    self.task_parts: int = kwargs.get('task_parts', 10)
    # Base-class init consumes the remaining kwargs; it runs after the
    # defaults above so it can override them if configured to.
    super().__init__(loop=loop, job=job, stat=stat, **kwargs)
    # Headers configuration: browser-like defaults with a randomized
    # User-Agent; caller-supplied headers take precedence.
    # NOTE(review): `self.accept` is presumably provided by HTTPService —
    # confirm it is set before __init__ runs.
    self.headers: dict = {
        "Accept": self.accept,
        "TE": "trailers",
        "Accept-Encoding": "gzip, deflate",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": random.choice(ua),
        **kwargs.get('headers', {})
    }
    # NOTE(review): redundant — already set to False above; kept to
    # guard against the base __init__ flipping it.
    self._free_proxy = False
|
90
|
+
|
91
|
+
def split_parts(self, task_list, num_parts: int = 5) -> list:
    """Partition *task_list* into *num_parts* contiguous slices.

    Sizes differ by at most one: when the length is not evenly
    divisible, the leftover items go to the earliest slices. Slices
    may be empty when there are fewer items than parts.

    Args:
        task_list: Sequence of awaitables (or any sliceable sequence).
        num_parts: How many slices to produce (default: 5).

    Returns:
        list: ``num_parts`` slices covering the whole input, in order.
    """
    base_size, leftover = divmod(len(task_list), num_parts)
    chunks = []
    offset = 0
    for index in range(num_parts):
        # The first `leftover` chunks absorb one extra item each.
        size = base_size + (1 if index < leftover else 0)
        chunks.append(task_list[offset:offset + size])
        offset += size
    return chunks
|
103
|
+
|
104
|
+
async def _processing_tasks(self, tasks: list) -> pd.DataFrame:
    """Run the scraping tasks and assemble their results into a DataFrame.

    Each task is expected to resolve to an ``(idx, row)`` tuple, where
    ``row`` is a dict of scraped fields. Failed tasks are converted into
    error rows via ``self._get_error_info`` so every input produces an
    output row.

    Args:
        tasks: Awaitables, one per DataFrame row to scrape.

    Returns:
        pd.DataFrame: One row per task, indexed by the original row
        index; empty DataFrame when there are no results.
    """
    results = []
    total_tasks = len(tasks)
    with tqdm(total=total_tasks, desc="Scraping Progress", unit="task") as pbar_total:  # Overall progress bar
        if self.concurrently is False:
            # run every task in a sequential manner:
            for task in tasks:
                try:
                    idx, row = await task
                    results.append((idx, row))  # Append as tuple (idx, row)
                    # Random pause between requests to look less bot-like.
                    await asyncio.sleep(
                        random.uniform(0.5, 2)
                    )
                except Exception as e:
                    self._logger.error(f"Task error: {str(e)}")
                    idx, row = self._get_error_info(e)  # Handle error
                    results.append((idx, row))  # Store the failure result
                finally:
                    pbar_total.update(1)
        else:
            # run all tasks concurrently, one slice of the task list at a time
            for chunk in self.split_parts(tasks, self.task_parts):
                chunk_size = len(chunk)
                # Use return_exceptions=True to capture errors without
                # stopping the rest of the chunk.
                chunk_results = await asyncio.gather(
                    *chunk, return_exceptions=True
                )
                for result in chunk_results:
                    if isinstance(result, Exception):
                        self._logger.error(f"Task error: {str(result)}")
                        idx, row = self._get_error_info(result)  # Extract idx, row from error
                        results.append((idx, row))
                    else:
                        results.append(result)

                # Progress advances per completed chunk, not per task.
                pbar_total.update(chunk_size)

    # Convert results to DataFrame
    if not results:
        return pd.DataFrame()

    indices, data_dicts = zip(*results) if results else ([], [])
    df = pd.DataFrame(data_dicts, index=indices)

    # Ensure all expected columns exist even when no scraper filled them.
    expected_columns = [
        'company_name',
        'logo_url',
        'address',
        'phone_number',
        'website',
        'stock_symbol',
        'naics_code',
        'sic_code',
        'employee_count',
        'revenue_range',
        'similar_companies',
        'search_term',
        'search_url'
    ]

    for col in expected_columns:
        if col not in df.columns:
            df[col] = None

    return df
|
171
|
+
|
172
|
+
async def start(self, **kwargs) -> bool:
    """Validate the input DataFrame and pre-create the result columns.

    Takes the previous component's output as the working DataFrame,
    checks it is a DataFrame containing ``self.info_column``, and adds
    every result column (initialized to ``None``) that the scrapers may
    later fill.

    Returns:
        bool: True when initialization succeeds.

    Raises:
        ComponentError: When the input is not a pandas DataFrame.
        ConfigError: When ``self.info_column`` is missing from the input.
    """
    # Adopt the upstream component's output as our working data.
    if self.previous:
        self.data = self.input

    if not isinstance(self.data, pd.DataFrame):
        raise ComponentError(
            "Input must be a DataFrame", status=404
        )

    if self.info_column not in self.data.columns:
        raise ConfigError(
            f"Column {self.info_column} not found in DataFrame"
        )

    # Result columns are created up-front so downstream code can rely
    # on their presence regardless of which scrapers run.
    new_columns = (
        'search_term', 'search_url', 'company_name', 'logo_url',
        'address', 'phone_number', 'website', 'stock_symbol',
        'naics_code', 'sic_code', 'employee_count', 'revenue_range',
        'similar_companies', 'industry_category', 'industry',
        'category', 'company_description', 'city', 'state',
        'zip_code', 'country', 'metro_area', 'headquarters',
        'location', 'number_employees', 'founded', 'search_status',
        'scrape_status',
    )
    existing = set(self.data.columns)
    for column in new_columns:
        if column not in existing:
            self.data[column] = None

    return True
|
223
|
+
|
224
|
+
def extract_company_info(self, soup: BeautifulSoup, search_term: str, search_url: str) -> Dict[str, Any]:
    """Extract company information from a LeadIQ profile page.

    Args:
        soup: Parsed company-profile page.
        search_term: Term that produced this page (stored in the result).
        search_url: URL the page was fetched from (stored in the result).

    Returns:
        Dict with whatever fields could be extracted; always contains
        ``search_term`` and ``search_url``.
    """
    result: Dict[str, Any] = {
        'search_term': search_term,
        'search_url': search_url,
    }

    # Company name and logo come from the header logo image.
    logo = soup.find('img', {'alt': True, 'width': '76.747'})
    if logo:
        result['company_name'] = logo.get('alt')
        result['logo_url'] = logo.get('src')

    # Revenue range lives in the highlight-right section.
    highlight_right = soup.find('div', {'class': 'highlight-right'})
    if highlight_right:
        revenue_span = highlight_right.find('span', {'class': 'start'})
        if revenue_span:
            start_value = revenue_span.text.strip()
            end_span = revenue_span.find_next_sibling('span', {'class': 'end'})
            if end_span:
                result['revenue_range'] = f"{start_value} - {end_span.text.strip()}"
            else:
                result['revenue_range'] = start_value

    # The overview card (dl/dt/dd pairs) sits inside highlight-left.
    highlight_left = soup.find('div', {'class': 'highlight-left'})
    if not highlight_left:
        self._logger.warning("Could not find highlight-left section")
        return result

    overview_section = highlight_left.find('div', {'class': 'card span'})
    if not overview_section:
        return result

    # dt labels that map one-to-one onto result keys.
    simple_fields = {
        'headquarters': 'address',
        'stock symbol': 'stock_symbol',
        'naics code': 'naics_code',
        'employees': 'employee_count',
        'sic code': 'sic_code',
    }
    dl_element = overview_section.find('dl')
    if dl_element:
        for item in dl_element.find_all('div', {'class': 'item'}):
            dt = item.find('dt')
            dd = item.find('dd')
            if not (dt and dd):
                continue
            field = dt.text.strip().lower()
            value = dd.text.strip()
            if field == 'phone number':
                # Masked digits are replaced with zeros.
                result['phone_number'] = value.replace('****', '0000')
            elif field == 'website':
                website = dd.find('a')
                result['website'] = website['href'] if website else value
            elif field in simple_fields:
                result[simple_fields[field]] = value

    # Similar companies are listed in the '#similar' section.
    similar_companies = []
    similar_section = soup.find('div', {'id': 'similar'})
    if similar_section:
        for company in similar_section.find_all('li'):
            company_link = company.find('a')
            if not company_link:
                continue
            company_name = company_link.find('h3')
            if not company_name:
                continue
            company_logo = company_link.find('img')
            similar_company = {
                'name': company_name.text.strip(),  # quotes intentionally not escaped
                'leadiq_url': company_link['href'],
                'logo_url': company_logo['src'] if company_logo else None,
            }
            # The first span wrapping a span.start carries the revenue range.
            revenue_span = next(
                (span for span in company_link.find_all('span')
                 if span.find('span', {'class': 'start'})),
                None,
            )
            if revenue_span:
                start = revenue_span.find('span', {'class': 'start'})
                end = revenue_span.find('span', {'class': 'end'})
                if start:
                    start_value = start.text.strip()
                    if end:
                        similar_company['revenue_range'] = f"{start_value} - {end.text.strip()}"
                    else:
                        similar_company['revenue_range'] = start_value
            similar_companies.append(similar_company)

    if similar_companies:
        try:
            # Compact JSON string with Unicode preserved, suitable for PostgreSQL.
            result['similar_companies'] = json.dumps(
                similar_companies,
                ensure_ascii=False,
                allow_nan=False,
                separators=(',', ':')
            )
        except Exception as e:
            self._logger.error(f"Error formatting similar companies JSON: {str(e)}")
            result['similar_companies'] = None

    if not result:
        self._logger.warning("No data was extracted from the page")
    else:
        self._logger.info("Successfully extracted data")

    return result
|
348
|
+
|
349
|
+
@backoff.on_exception(
    backoff.expo,
    (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError),
    max_tries=3,
    max_time=60,
    giveup=lambda e: not bad_gateway_exception(e) and not isinstance(e, (httpx.ConnectTimeout, httpx.ReadTimeout))
)
async def scrape_url(self, idx: int, url: str) -> tuple[int, Optional[Dict[str, Any]]]:
    """Dispatch *url* to the scraper that handles its domain.

    Args:
        idx: Row index in ``self.data`` that the URL belongs to.
        url: Company-profile URL (LeadIQ or Explorium).

    Returns:
        ``(idx, result_dict)`` from the domain-specific scraper, or
        ``(idx, None)`` for empty/unsupported URLs or on failure.
    """
    if not url:
        return idx, None

    try:
        # Route by domain.
        if 'leadiq.com' in url:
            return await self._scrape_leadiq(idx, url)
        if 'explorium.ai' in url:
            return await self._scrape_explorium(idx, url)
        self._logger.warning(f"Unsupported URL domain: {url}")
        return idx, None
    except Exception as e:
        self._logger.error(f"Error scraping {url}: {str(e)}")
        return idx, None
|
374
|
+
|
375
|
+
def _parse_address(self, address: str) -> Dict[str, str]:
|
376
|
+
"""Parse address string to extract state, zipcode and country."""
|
377
|
+
if not address:
|
378
|
+
return {
|
379
|
+
'address': None,
|
380
|
+
'state': None,
|
381
|
+
'zipcode': None,
|
382
|
+
'country': None
|
383
|
+
}
|
384
|
+
|
385
|
+
# Mantener la dirección original
|
386
|
+
result = {'address': address}
|
387
|
+
|
388
|
+
# Primera regex para formato completo
|
389
|
+
pattern1 = r'^.*,\s+([^,]+?)\s+([\w\s-]+)\s+([A-Z]{2})$'
|
390
|
+
# Segunda regex como fallback
|
391
|
+
pattern2 = r'^.*,\s*([^,]+?),\s+([\w\s-]+?)\s*([A-Z]{2})'
|
392
|
+
|
393
|
+
try:
|
394
|
+
# Intentar con la primera regex
|
395
|
+
match = re.search(pattern1, address)
|
396
|
+
if not match:
|
397
|
+
# Si no hay match, intentar con la segunda
|
398
|
+
match = re.search(pattern2, address)
|
399
|
+
|
400
|
+
if match:
|
401
|
+
result['state'] = match.group(1).strip()
|
402
|
+
result['zipcode'] = match.group(2).strip()
|
403
|
+
result['country'] = match.group(3).strip()
|
404
|
+
else:
|
405
|
+
self._logger.warning(f"Could not parse address: {address}")
|
406
|
+
result.update({
|
407
|
+
'state': None,
|
408
|
+
'zipcode': None,
|
409
|
+
'country': None
|
410
|
+
})
|
411
|
+
except Exception as e:
|
412
|
+
self._logger.error(f"Error parsing address {address}: {str(e)}")
|
413
|
+
result.update({
|
414
|
+
'state': None,
|
415
|
+
'zipcode': None,
|
416
|
+
'country': None
|
417
|
+
})
|
418
|
+
|
419
|
+
return result
|
420
|
+
|
421
|
+
@backoff.on_exception(
    backoff.expo,
    (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError),
    max_tries=3,
    max_time=60,
    giveup=lambda e: not bad_gateway_exception(e) and not isinstance(e, (httpx.ConnectTimeout, httpx.ReadTimeout))
)
async def _scrape_explorium(self, idx: int, url: str) -> tuple[int, Optional[Dict[str, Any]]]:
    """Scrape company information from Explorium.ai.

    Fetches *url*, parses the page and maps Explorium's labelled detail
    fields onto the same result schema the LeadIQ scraper produces.
    Retries (via backoff) on connect/read timeouts and retryable HTTP
    status errors.

    Args:
        idx: Row index in ``self.data`` the URL belongs to.
        url: Explorium company-profile URL.

    Returns:
        ``(idx, result_dict)`` with ``scrape_status`` set to 'success',
        'timeout' or 'error: ...'; ``(idx, None)`` on a non-200 response.
    """
    # Initialize the result with defaults so every column exists even when
    # scraping fails part-way through.
    result = {
        'search_term': self.data.iloc[idx].get('search_term', ''),
        'search_url': url,
        'source_platform': 'explorium',
        'company_name': None,
        'logo_url': None,
        'address': None,
        'state': None,
        'zipcode': None,
        'country': None,
        'phone_number': None,
        'website': None,
        'stock_symbol': None,
        'naics_code': None,
        'sic_code': None,
        'employee_count': None,
        'revenue_range': None,
        'similar_companies': None,
        'scrape_status': 'pending'
    }

    try:
        self._logger.notice(f"Scraping Explorium URL: {url}")

        # Rotate the User-Agent for each request.
        self.headers["User-Agent"] = random.choice(ua)

        # HTTP client with a 30s timeout.
        # NOTE(review): `client` is never used — the request goes through
        # self._get below; confirm whether the AsyncClient is needed at all.
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await self._get(url, headers=self.headers)

            if response.status_code != 200:
                self._logger.error(f"Failed to fetch URL {url}: {response.status_code}")
                return idx, None

            # Random pause to avoid hammering the site.
            await asyncio.sleep(random.uniform(1, 3))

            content = response.text
            soup = BeautifulSoup(content, 'html.parser')

            # Extract the company name.
            title = soup.find('h1')
            if title:
                result['company_name'] = title.text.strip()

            # Extract the logo, if present.
            logo = soup.find('img', {'class': 'company-logo'})  # adjust selector to the actual HTML
            if logo:
                result['logo_url'] = logo.get('src')

            # Extract the remaining labelled details.
            details = soup.find_all('div', {'class': 'company-detail'})
            for detail in details:
                label = detail.find('span', {'class': 'label'})
                value = detail.find('span', {'class': 'value'})
                if label and value:
                    label_text = label.text.strip().lower()
                    value_text = value.text.strip()

                    # Map Explorium fields onto the LeadIQ result structure.
                    if 'website' in label_text:
                        result['website'] = value_text
                    elif 'location' in label_text:
                        address_info = self._parse_address(value_text)
                        result.update(address_info)
                    elif 'size' in label_text or 'employees' in label_text:
                        result['employee_count'] = value_text
                    elif 'revenue' in label_text:
                        result['revenue_range'] = value_text
                    elif 'naics' in label_text:
                        result['naics_code'] = value_text
                    elif 'sic' in label_text:
                        result['sic_code'] = value_text
                    elif 'phone' in label_text:
                        result['phone_number'] = value_text
                    elif 'stock' in label_text:
                        result['stock_symbol'] = value_text

            # Extract similar companies, if any.
            similar_section = soup.find('div', {'class': 'similar-companies'})  # adjust selector
            if similar_section:
                similar_companies = []
                for company in similar_section.find_all('div', {'class': 'company-card'}):  # adjust selector
                    company_name = company.find('h3')
                    if company_name:
                        similar_company = {
                            'name': company_name.text.strip(),
                            'explorium_url': company.find('a')['href'] if company.find('a') else None,
                            'logo_url': company.find('img')['src'] if company.find('img') else None,
                        }
                        similar_companies.append(similar_company)

                if similar_companies:
                    try:
                        # Compact JSON string, Unicode preserved (PostgreSQL-friendly).
                        result['similar_companies'] = json.dumps(
                            similar_companies,
                            ensure_ascii=False,
                            allow_nan=False,
                            separators=(',', ':')
                        )
                    except Exception as e:
                        self._logger.error(f"Error formatting similar companies JSON: {str(e)}")

            # NOTE(review): `result` is initialized non-empty, so this branch
            # always runs and the `return idx, None` below is unreachable.
            if result:
                self._counter += 1
                result['scrape_status'] = 'success'
                return idx, result

            return idx, None

    except httpx.TimeoutException as e:
        self._logger.error(f"Timeout scraping Explorium URL {url}: {str(e)}")
        result['scrape_status'] = 'timeout'
        return idx, result
    except Exception as e:
        self._logger.error(f"Error scraping Explorium URL {url}: {str(e)}")
        result['scrape_status'] = f'error: {str(e)[:50]}'
        return idx, result
|
548
|
+
|
549
|
+
@backoff.on_exception(
    backoff.expo,
    (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError),
    max_tries=3,
    max_time=60,
    giveup=lambda e: not bad_gateway_exception(e) and not isinstance(e, (httpx.ConnectTimeout, httpx.ReadTimeout))
)
async def _scrape_leadiq(self, idx: int, url: str) -> tuple[int, Optional[Dict[str, Any]]]:
    """Scrape company information from LeadIQ.

    Fetches *url*, parses the profile page and fills the standard result
    schema. Retries (via backoff) on connect/read timeouts and retryable
    HTTP status errors.

    Args:
        idx: Row index in ``self.data`` the URL belongs to.
        url: LeadIQ company-profile URL.

    Returns:
        ``(idx, result_dict)`` with ``scrape_status`` describing the
        outcome, or ``(idx, None)`` on a non-200 response.
    """
    # Initialize the result with defaults so every column exists even when
    # scraping fails part-way through.
    result = {
        'search_term': self.data.iloc[idx].get('search_term', ''),
        'search_url': url,
        'source_platform': 'leadiq',
        'company_name': None,
        'logo_url': None,
        'address': None,
        'state': None,
        'zipcode': None,
        'country': None,
        'phone_number': None,
        'website': None,
        'stock_symbol': None,
        'naics_code': None,
        'sic_code': None,
        'employee_count': None,
        'revenue_range': None,
        'similar_companies': None,
        'scrape_status': 'pending'
    }

    try:
        self._logger.notice(f"Scraping LeadIQ URL: {url}")

        # Rotate the User-Agent for each request.
        self.headers["User-Agent"] = random.choice(ua)

        # Requests go through self._get; the previous unused
        # `async with httpx.AsyncClient(...)` wrapper was removed.
        response = await self._get(url, headers=self.headers)

        if response.status_code != 200:
            self._logger.error(f"Failed to fetch URL {url}: {response.status_code}")
            return idx, None

        # Random pause to avoid hammering the site.
        await asyncio.sleep(random.uniform(1, 3))

        content = response.text
        soup = BeautifulSoup(content, 'html.parser')

        # Company name and logo come from the header logo image.
        logo = soup.find('img', {'alt': True, 'width': '76.747'})
        if logo:
            result['company_name'] = logo.get('alt')
            result['logo_url'] = logo.get('src')

        # Revenue range lives in the highlight-right section.
        highlight_right = soup.find('div', {'class': 'highlight-right'})
        if highlight_right:
            revenue_span = highlight_right.find('span', {'class': 'start'})
            if revenue_span:
                start_value = revenue_span.text.strip()
                end_span = revenue_span.find_next_sibling('span', {'class': 'end'})
                if end_span:
                    end_value = end_span.text.strip()
                    result['revenue_range'] = f"{start_value} - {end_value}"
                else:
                    result['revenue_range'] = start_value

        # The overview card (dl/dt/dd pairs) sits inside highlight-left.
        highlight_left = soup.find('div', {'class': 'highlight-left'})
        if not highlight_left:
            self._logger.warning("Could not find highlight-left section")
            # BUGFIX: previously `return result` — a bare dict that broke
            # the (idx, result) tuple contract callers unpack.
            return idx, result

        overview_section = highlight_left.find('div', {'class': 'card span'})
        if not overview_section:
            # BUGFIX: previously `return result` (see above).
            return idx, result

        # Extract information from the dl/dt/dd elements.
        dl_element = overview_section.find('dl')
        if dl_element:
            for item in dl_element.find_all('div', {'class': 'item'}):
                dt = item.find('dt')
                dd = item.find('dd')
                if dt and dd:
                    field = dt.text.strip().lower()
                    value = dd.text.strip()

                    # Map LeadIQ labels onto our column names.
                    if field == 'headquarters':
                        address_info = self._parse_address(value)
                        result.update(address_info)
                    elif field == 'phone number':
                        # Masked digits are replaced with zeros.
                        phone = value.replace('****', '0000')
                        result['phone_number'] = phone
                    elif field == 'website':
                        website = dd.find('a')
                        result['website'] = website['href'] if website else value
                    elif field == 'stock symbol':
                        result['stock_symbol'] = value
                    elif field == 'naics code':
                        result['naics_code'] = value
                    elif field == 'employees':
                        result['employee_count'] = value
                    elif field == 'sic code':
                        result['sic_code'] = value

        # Extract similar companies from the '#similar' section.
        similar_companies = []
        similar_section = soup.find('div', {'id': 'similar'})
        if similar_section:
            for company in similar_section.find_all('li'):
                company_link = company.find('a')
                if not company_link:
                    continue

                company_logo = company_link.find('img')
                company_name = company_link.find('h3')

                # Find the span wrapping a span.start — it holds the revenue.
                revenue_spans = company_link.find_all('span')
                revenue_span = None
                for span in revenue_spans:
                    if span.find('span', {'class': 'start'}):
                        revenue_span = span
                        break

                if company_name:
                    similar_company = {
                        'name': company_name.text.strip(),  # quotes intentionally not escaped
                        'leadiq_url': company_link['href'],
                        'logo_url': company_logo['src'] if company_logo else None,
                    }

                    # Extract the revenue range.
                    if revenue_span:
                        start = revenue_span.find('span', {'class': 'start'})
                        end = revenue_span.find('span', {'class': 'end'})

                        if start:
                            start_value = start.text.strip()
                            if end:
                                end_value = end.text.strip()
                                similar_company['revenue_range'] = f"{start_value} - {end_value}"
                            else:
                                similar_company['revenue_range'] = start_value

                    similar_companies.append(similar_company)

        if similar_companies:
            try:
                # Compact JSON string, Unicode preserved (PostgreSQL-friendly).
                result['similar_companies'] = json.dumps(
                    similar_companies,
                    ensure_ascii=False,
                    allow_nan=False,
                    separators=(',', ':')
                )
            except Exception as e:
                self._logger.error(f"Error formatting similar companies JSON: {str(e)}")
                result['similar_companies'] = None

        # `result` is initialized non-empty, so reaching this point always
        # counts as a successful scrape.
        self._counter += 1
        result['scrape_status'] = 'success'
        return idx, result

    except httpx.TimeoutException as e:
        self._logger.error(f"Timeout scraping LeadIQ URL {url}: {str(e)}")
        result['scrape_status'] = 'timeout'
        return idx, result
    except Exception as e:
        self._logger.error(f"Error scraping LeadIQ URL {url}: {str(e)}")
        result['scrape_status'] = f'error: {str(e)[:50]}'
        return idx, result
|
727
|
+
|
728
|
+
def _check_company_name(self, company_name: str, title: str, scrapper: Any):
|
729
|
+
# Extract the Company Name from the title provided
|
730
|
+
pattern = r'\b(' + '|'.join(re.escape(kw) for kw in scrapper.keywords) + r')\b'
|
731
|
+
# Search for the first occurrence of any keyword
|
732
|
+
match = re.search(pattern, title, re.IGNORECASE)
|
733
|
+
if not match:
|
734
|
+
return False
|
735
|
+
|
736
|
+
result = title[:match.start()].strip()
|
737
|
+
if not result: # Si result está vacío
|
738
|
+
return False
|
739
|
+
|
740
|
+
company = company_name.strip()
|
741
|
+
# print('Company Name: ', company_name)
|
742
|
+
# print("COMPANY > ", result)
|
743
|
+
if company.lower() == result.lower():
|
744
|
+
return True
|
745
|
+
|
746
|
+
# second way, normalize names reducing to one element each:
|
747
|
+
cp = result.split()[0]
|
748
|
+
cp2 = company.split()[0]
|
749
|
+
if cp.lower() == cp2.lower():
|
750
|
+
return True
|
751
|
+
|
752
|
+
# Check with Fuzzy Search if Company matches.
|
753
|
+
score = fuzz.ratio(company.lower(), result.lower())
|
754
|
+
if score > 85:
|
755
|
+
return True
|
756
|
+
|
757
|
+
return False
|
758
|
+
|
759
|
+
def _standardize_name(self, text: str) -> str:
|
760
|
+
"""Estandariza el formato del texto: lowercase y guiones en lugar de espacios."""
|
761
|
+
# Primero limpiamos caracteres especiales y espacios extras
|
762
|
+
cleaned = text.strip().lower().replace(' ', '-')
|
763
|
+
return f"\'{cleaned}\'"
|
764
|
+
|
765
|
+
async def search_in_ddg(
    self,
    search_term: str,
    company_name: str,
    scrapper: Any,
    backend: str = 'html',
    region: str = 'wt-wt'
):
    """Search DuckDuckGo for *search_term* and return the matching result.

    Args:
        search_term: Query string built by the scraper.
        company_name: Company to match against the result titles.
        scrapper: Scraper whose keywords drive the title matching.
        backend: DuckDuckGo backend to use.
        region: DuckDuckGo region code.

    Returns:
        The first search result matching *company_name*.

    Raises:
        RuntimeError: when no results come back, no result matches the
            company, or DuckDuckGo rate-limits the request. The original
            exception is chained as ``__cause__``.
    """
    try:
        results = await self._search_duckduckgo(
            search_term,
            use_proxy=True,
            headers=self.headers,
            max_results=10,
            backend=backend,
            region=region,
        )
        if not results:
            raise RuntimeError("Could not find any results")
        if company := self._company_exists(results, company_name, scrapper):
            return company
        raise RuntimeError(
            "Could not find a company matching the search term"
        )
    except (RatelimitException, RuntimeError) as e:
        self._logger.warning(f'Search Error: {e}')
        # Chain the original exception so callers keep the root cause.
        raise RuntimeError('Search Error') from e
|
794
|
+
|
795
|
+
async def search_in_google(
    self,
    search_term: str,
    company_name: str,
    scrapper: Any,
    use_selenium: bool = False
):
    """Search Google for *search_term* and return the matching result.

    Uses the Google search API by default; falls back to the
    Selenium-backed CSE search on connection errors (or when
    *use_selenium* is True from the start).

    Raises:
        RuntimeError: when no result matches the company.
    """
    # Try to find company on Google Search:
    try:
        if use_selenium:
            results = await self.search_google_cse(search_term, max_results=10)
        else:
            try:
                response = await self._search_google(
                    search_term,
                    use_proxy=True,
                    headers=self.headers,
                    max_results=10,
                    region='us',
                    language='lang_en',
                    country='countryUS'
                )
                results = response.get('items', [])
            except (httpx.ConnectError, httpx.RemoteProtocolError, httpx.WriteTimeout) as e:
                self._logger.warning(
                    f"Connection error with Google API: {str(e)}, trying with Selenium..."
                )
                try:
                    results = await self.search_google_cse(search_term, max_results=10)
                except (RuntimeError, ComponentError):
                    raise RuntimeError("Could not find any results")
        if company := self._company_exists(results, company_name, scrapper):
            return company
        else:
            raise RuntimeError(
                "Could not find a company matching the search term"
            )
    except RuntimeError as e:
        # NOTE(review): the "No results found" message only gates the extra
        # warning log; every RuntimeError is re-raised with a uniform
        # message below — confirm this matches the intended control flow.
        if str(e) == "No results found":
            self._logger.warning(f"No results found for search term: {search_term}")
        raise RuntimeError(
            "Could not find a company matching the search term"
        )
|
838
|
+
|
839
|
+
def _company_exists(self, results: list, company: str, scrapper: Any):
|
840
|
+
# Check if the Company Name is present in the title of the search results.
|
841
|
+
for r in results:
|
842
|
+
title = r.get('title', None)
|
843
|
+
# print('TITLE > ', title)
|
844
|
+
if not title:
|
845
|
+
continue
|
846
|
+
if any(keyword in title for keyword in scrapper.keywords):
|
847
|
+
# print('KEYword > ', title)
|
848
|
+
if self._check_company_name(company, title, scrapper):
|
849
|
+
self._logger.debug(f"Company Found: {company}")
|
850
|
+
return r
|
851
|
+
return None
|
852
|
+
|
853
|
+
async def _search_company(self, idx, row, cookies):
    """Search one DataFrame row's company across all configured scrapers.

    For each scraper: builds a search term, queries DuckDuckGo first, then
    Google (API, then Selenium-backed CSE) as fallbacks; on a hit, fetches
    the company page and scrapes it.

    Args:
        idx: Row index in ``self.data``.
        row: The row (mutated in place with the scraped fields).
        cookies: NOTE(review): currently unused inside this method.

    Returns:
        ``(idx, row)`` with ``row['search_status']`` describing the outcome.

    Raises:
        RuntimeError: on unexpected failure; the exception carries ``idx``
            and ``row`` attributes so the caller can recover the partial row.
    """
    try:
        async with self._semaphore:
            # Extract the company name.
            company_name = row[self.info_column]
            # Start by marking this company as not found.
            row['search_status'] = 'Not Found'
            # Wait a random amount of time between 1 and 2 seconds to avoid
            # DuckDuckGo rate limiting.
            await asyncio.sleep(
                random.uniform(1, 2)
            )
            # First step, search for Company in DuckDuckGo or fallback in Google (GSE):
            for scrapper in self.scrappers:
                search_term = scrapper.define_search_term(company_name)
                scrapper.search_term_used = search_term
                self._logger.notice(f"Searching for: {search_term}")

                try:
                    company = await self.search_in_ddg(
                        search_term, company_name, scrapper
                    )
                except RuntimeError as e:
                    self._logger.warning(f'Search Error: {e}')
                    try:
                        company = await self.search_in_google(
                            search_term, company_name, scrapper
                        )
                    except RuntimeError:
                        # Last resort: Selenium-backed Google CSE.
                        try:
                            company = await self.search_in_google(
                                search_term,
                                company_name,
                                scrapper,
                                use_selenium=True
                            )
                        except Exception as e:
                            self._logger.error(f"Search failed: {str(e)}")
                            row['search_status'] = f'Failed: {str(e)}'
                            continue
                if not company:
                    continue

                # Second, extract the URL from the search result; different
                # backends use different keys.
                url = company.get('link', None)
                if not url:
                    url = company.get('href', company.get('url', None))
                if not url:
                    row['search_status'] = 'URL not found'
                    continue

                # Strip unwanted suffixes from the URL.
                if '/employee-directory' in url:
                    url = url.replace('/employee-directory', '')
                elif '/email-format' in url:
                    url = url.replace('/email-format', '')

                try:
                    row['search_url'] = url
                    company_page = await scrapper.get(url, headers=self.headers)
                    if not company_page:
                        continue
                except (httpx.WriteTimeout, httpx.ConnectError, httpx.RemoteProtocolError, httpx.HTTPError) as e:
                    self._logger.warning(f"HTTP error accessing {url}: {str(e)}")
                    # Fall back to Selenium when plain HTTP fails.
                    try:
                        driver = await self.get_driver()
                        await asyncio.sleep(2)  # give the page time to load
                        driver.get(url)
                        company_page_text = driver.page_source
                        company = BeautifulSoup(company_page_text, 'html.parser')
                        _, scraped_data = await scrapper.scrapping(company, idx, row)
                        if scraped_data is not None and scraped_data['scrape_status'] == 'success':
                            row.update(scraped_data)
                            row['search_status'] = f'Found in {scrapper.domain}'
                            return idx, row
                    except Exception as se:
                        self._logger.error(f"Selenium fallback failed: {str(se)}")
                        continue
                    finally:
                        # Always release the Selenium driver.
                        self.close_driver()
                    continue

                # Third, scrape company information from the fetched content:
                company = BeautifulSoup(company_page.text, 'html.parser')
                scraped_idx, scraped_data = await scrapper.scrapping(company, idx, row)
                if scraped_data is not None and scraped_data['scrape_status'] == 'success':
                    await asyncio.sleep(1.5)
                    row.update(scraped_data)
                    row['search_status'] = f'Found in {scrapper.domain}'
                    return idx, row
            # NOTE(review): reached after trying every scraper without a
            # successful scrape — confirm against the original indentation.
            row['search_status'] = 'Not Found on any website'
            return idx, row
    except Exception as e:
        # Mark the row as failed and preserve its data.
        row['search_status'] = f'Failed: {str(e)}'
        # Attach idx and row to the exception so the caller can recover them.
        error = RuntimeError(f"Search failed: {str(e)}")
        error.idx = idx
        error.row = row
        raise error
|
956
|
+
|
957
|
+
async def run(self):
    """Execute the company search/scrape for every row of the DataFrame.

    Instantiates one scraper per configured name (each with cookies scoped
    to its own domain), then runs ``_search_company`` concurrently over all
    rows.

    Returns:
        The collected search results (also stored in ``self._result``).

    Raises:
        ConfigError: when no configured scraper name is supported.
    """
    httpx_cookies = self.get_httpx_cookies(
        domain='leadiq.com', cookies=self.cookies
    )
    # Supported scraper names mapped to (cookie domain, scraper class).
    registry = {
        'leadiq': ('.leadiq.com', LeadiqScrapper),
        'explorium': ('explorium.ai', ExploriumScrapper),
        'zoominfo': ('zoominfo.com', ZoomInfoScrapper),
        'siccode': ('siccode.com', SicCodeScrapper),
        'rocketreach': ('rocketreach.co', RocketReachScrapper),
        'visualvisitor': ('visualvisitor.com', VisualVisitorScrapper),
    }
    scrappers = []
    for name in self.scrappers:
        if name not in registry:
            # Unsupported scraper names are silently ignored (original behavior).
            continue
        domain, scrapper_cls = registry[name]
        # NOTE: httpx_cookies is deliberately rebound on every iteration;
        # the last scraper's cookie jar is the one passed to the search
        # tasks below (preserves the original behavior).
        httpx_cookies = self.get_httpx_cookies(
            domain=domain, cookies=self.cookies
        )
        scrappers.append(scrapper_cls(cookies=httpx_cookies))
    # Replace the configured names with the instantiated scrapers.
    self.scrappers = scrappers
    if not scrappers:
        raise ConfigError(
            "No valid scrappers were found or provided in configuration"
        )
    tasks = [
        self._search_company(
            idx, row, httpx_cookies
        ) for idx, row in self.data.iterrows()
    ]
    companies_info = await self._processing_tasks(tasks)
    self._print_data_(companies_info, 'Company Search Results')

    self._result = companies_info
    return self._result
|
1044
|
+
|
1045
|
+
async def close(self):
    """Release component resources (nothing to release at the moment)."""
    return True
|
1048
|
+
|
1049
|
+
def _get_error_info(self, error):
|
1050
|
+
"""Extrae idx y row de un error."""
|
1051
|
+
if hasattr(error, 'idx') and hasattr(error, 'row'):
|
1052
|
+
return error.idx, error.row
|
1053
|
+
# Si no podemos obtener la info, crear una fila con información básica
|
1054
|
+
return None, {'search_status': f'Failed: {str(error)}'}
|