flowtask-5.8.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowtask/__init__.py +93 -0
- flowtask/__main__.py +38 -0
- flowtask/bots/__init__.py +6 -0
- flowtask/bots/check.py +93 -0
- flowtask/bots/codebot.py +51 -0
- flowtask/components/ASPX.py +148 -0
- flowtask/components/AddDataset.py +352 -0
- flowtask/components/Amazon.py +523 -0
- flowtask/components/AutoTask.py +314 -0
- flowtask/components/Azure.py +80 -0
- flowtask/components/AzureUsers.py +106 -0
- flowtask/components/BaseAction.py +91 -0
- flowtask/components/BaseLoop.py +198 -0
- flowtask/components/BestBuy.py +800 -0
- flowtask/components/CSVToGCS.py +120 -0
- flowtask/components/CompanyScraper/__init__.py +1 -0
- flowtask/components/CompanyScraper/parsers/__init__.py +6 -0
- flowtask/components/CompanyScraper/parsers/base.py +102 -0
- flowtask/components/CompanyScraper/parsers/explorium.py +192 -0
- flowtask/components/CompanyScraper/parsers/leadiq.py +206 -0
- flowtask/components/CompanyScraper/parsers/rocket.py +133 -0
- flowtask/components/CompanyScraper/parsers/siccode.py +109 -0
- flowtask/components/CompanyScraper/parsers/visualvisitor.py +130 -0
- flowtask/components/CompanyScraper/parsers/zoominfo.py +118 -0
- flowtask/components/CompanyScraper/scrapper.py +1054 -0
- flowtask/components/CopyTo.py +177 -0
- flowtask/components/CopyToBigQuery.py +243 -0
- flowtask/components/CopyToMongoDB.py +291 -0
- flowtask/components/CopyToPg.py +609 -0
- flowtask/components/CopyToRethink.py +207 -0
- flowtask/components/CreateGCSBucket.py +102 -0
- flowtask/components/CreateReport/CreateReport.py +228 -0
- flowtask/components/CreateReport/__init__.py +9 -0
- flowtask/components/CreateReport/charts/__init__.py +15 -0
- flowtask/components/CreateReport/charts/bar.py +51 -0
- flowtask/components/CreateReport/charts/base.py +66 -0
- flowtask/components/CreateReport/charts/pie.py +64 -0
- flowtask/components/CreateReport/utils.py +9 -0
- flowtask/components/CustomerSatisfaction.py +196 -0
- flowtask/components/DataInput.py +200 -0
- flowtask/components/DateList.py +255 -0
- flowtask/components/DbClient.py +163 -0
- flowtask/components/DialPad.py +146 -0
- flowtask/components/DocumentDBQuery.py +200 -0
- flowtask/components/DownloadFrom.py +371 -0
- flowtask/components/DownloadFromD2L.py +113 -0
- flowtask/components/DownloadFromFTP.py +181 -0
- flowtask/components/DownloadFromIMAP.py +315 -0
- flowtask/components/DownloadFromS3.py +198 -0
- flowtask/components/DownloadFromSFTP.py +265 -0
- flowtask/components/DownloadFromSharepoint.py +110 -0
- flowtask/components/DownloadFromSmartSheet.py +114 -0
- flowtask/components/DownloadS3File.py +229 -0
- flowtask/components/Dummy.py +59 -0
- flowtask/components/DuplicatePhoto.py +411 -0
- flowtask/components/EmployeeEvaluation.py +237 -0
- flowtask/components/ExecuteSQL.py +323 -0
- flowtask/components/ExtractHTML.py +178 -0
- flowtask/components/FileBase.py +178 -0
- flowtask/components/FileCopy.py +181 -0
- flowtask/components/FileDelete.py +82 -0
- flowtask/components/FileExists.py +146 -0
- flowtask/components/FileIteratorDelete.py +112 -0
- flowtask/components/FileList.py +194 -0
- flowtask/components/FileOpen.py +75 -0
- flowtask/components/FileRead.py +120 -0
- flowtask/components/FileRename.py +106 -0
- flowtask/components/FilterIf.py +284 -0
- flowtask/components/FilterRows/FilterRows.py +200 -0
- flowtask/components/FilterRows/__init__.py +10 -0
- flowtask/components/FilterRows/functions.py +4 -0
- flowtask/components/GCSToBigQuery.py +103 -0
- flowtask/components/GoogleA4.py +150 -0
- flowtask/components/GoogleGeoCoding.py +344 -0
- flowtask/components/GooglePlaces.py +315 -0
- flowtask/components/GoogleSearch.py +539 -0
- flowtask/components/HTTPClient.py +268 -0
- flowtask/components/ICIMS.py +146 -0
- flowtask/components/IF.py +179 -0
- flowtask/components/IcimsFolderCopy.py +173 -0
- flowtask/components/ImageFeatures/__init__.py +5 -0
- flowtask/components/ImageFeatures/process.py +233 -0
- flowtask/components/IteratorBase.py +251 -0
- flowtask/components/LangchainLoader/__init__.py +5 -0
- flowtask/components/LangchainLoader/loader.py +194 -0
- flowtask/components/LangchainLoader/loaders/__init__.py +22 -0
- flowtask/components/LangchainLoader/loaders/abstract.py +362 -0
- flowtask/components/LangchainLoader/loaders/basepdf.py +50 -0
- flowtask/components/LangchainLoader/loaders/docx.py +91 -0
- flowtask/components/LangchainLoader/loaders/html.py +119 -0
- flowtask/components/LangchainLoader/loaders/pdfblocks.py +146 -0
- flowtask/components/LangchainLoader/loaders/pdfmark.py +79 -0
- flowtask/components/LangchainLoader/loaders/pdftables.py +135 -0
- flowtask/components/LangchainLoader/loaders/qa.py +67 -0
- flowtask/components/LangchainLoader/loaders/txt.py +55 -0
- flowtask/components/LeadIQ.py +650 -0
- flowtask/components/Loop.py +253 -0
- flowtask/components/Lowes.py +334 -0
- flowtask/components/MS365Usage.py +156 -0
- flowtask/components/MSTeamsMessages.py +320 -0
- flowtask/components/MarketClustering.py +1051 -0
- flowtask/components/MergeFiles.py +362 -0
- flowtask/components/MilvusOutput.py +87 -0
- flowtask/components/NearByStores.py +175 -0
- flowtask/components/NetworkNinja/__init__.py +6 -0
- flowtask/components/NetworkNinja/models/__init__.py +52 -0
- flowtask/components/NetworkNinja/models/abstract.py +177 -0
- flowtask/components/NetworkNinja/models/account.py +39 -0
- flowtask/components/NetworkNinja/models/client.py +19 -0
- flowtask/components/NetworkNinja/models/district.py +14 -0
- flowtask/components/NetworkNinja/models/events.py +101 -0
- flowtask/components/NetworkNinja/models/forms.py +499 -0
- flowtask/components/NetworkNinja/models/market.py +16 -0
- flowtask/components/NetworkNinja/models/organization.py +34 -0
- flowtask/components/NetworkNinja/models/photos.py +125 -0
- flowtask/components/NetworkNinja/models/project.py +44 -0
- flowtask/components/NetworkNinja/models/region.py +28 -0
- flowtask/components/NetworkNinja/models/store.py +203 -0
- flowtask/components/NetworkNinja/models/user.py +151 -0
- flowtask/components/NetworkNinja/router.py +854 -0
- flowtask/components/Odoo.py +175 -0
- flowtask/components/OdooInjector.py +192 -0
- flowtask/components/OpenFromXML.py +126 -0
- flowtask/components/OpenWeather.py +41 -0
- flowtask/components/OpenWithBase.py +616 -0
- flowtask/components/OpenWithPandas.py +715 -0
- flowtask/components/PGPDecrypt.py +199 -0
- flowtask/components/PandasIterator.py +187 -0
- flowtask/components/PandasToFile.py +189 -0
- flowtask/components/Paradox.py +339 -0
- flowtask/components/ParamIterator.py +117 -0
- flowtask/components/ParseHTML.py +84 -0
- flowtask/components/PlacerStores.py +249 -0
- flowtask/components/Pokemon.py +507 -0
- flowtask/components/PositiveBot.py +62 -0
- flowtask/components/PowerPointSlide.py +400 -0
- flowtask/components/PrintMessage.py +127 -0
- flowtask/components/ProductCompetitors/__init__.py +5 -0
- flowtask/components/ProductCompetitors/parsers/__init__.py +7 -0
- flowtask/components/ProductCompetitors/parsers/base.py +72 -0
- flowtask/components/ProductCompetitors/parsers/bestbuy.py +86 -0
- flowtask/components/ProductCompetitors/parsers/lowes.py +103 -0
- flowtask/components/ProductCompetitors/scrapper.py +155 -0
- flowtask/components/ProductCompliant.py +169 -0
- flowtask/components/ProductInfo/__init__.py +1 -0
- flowtask/components/ProductInfo/parsers/__init__.py +5 -0
- flowtask/components/ProductInfo/parsers/base.py +83 -0
- flowtask/components/ProductInfo/parsers/brother.py +97 -0
- flowtask/components/ProductInfo/parsers/canon.py +167 -0
- flowtask/components/ProductInfo/parsers/epson.py +118 -0
- flowtask/components/ProductInfo/parsers/hp.py +131 -0
- flowtask/components/ProductInfo/parsers/samsung.py +97 -0
- flowtask/components/ProductInfo/scraper.py +319 -0
- flowtask/components/ProductPricing.py +118 -0
- flowtask/components/QS.py +261 -0
- flowtask/components/QSBase.py +201 -0
- flowtask/components/QueryIterator.py +273 -0
- flowtask/components/QueryToInsert.py +327 -0
- flowtask/components/QueryToPandas.py +432 -0
- flowtask/components/RESTClient.py +195 -0
- flowtask/components/RethinkDBQuery.py +189 -0
- flowtask/components/Rsync.py +74 -0
- flowtask/components/RunSSH.py +59 -0
- flowtask/components/RunShell.py +71 -0
- flowtask/components/SalesForce.py +20 -0
- flowtask/components/SaveImageBank/__init__.py +257 -0
- flowtask/components/SchedulingVisits.py +592 -0
- flowtask/components/ScrapPage.py +216 -0
- flowtask/components/ScrapSearch.py +79 -0
- flowtask/components/SendNotify.py +257 -0
- flowtask/components/SentimentAnalysis.py +694 -0
- flowtask/components/ServiceScrapper/__init__.py +5 -0
- flowtask/components/ServiceScrapper/parsers/__init__.py +1 -0
- flowtask/components/ServiceScrapper/parsers/base.py +94 -0
- flowtask/components/ServiceScrapper/parsers/costco.py +93 -0
- flowtask/components/ServiceScrapper/scrapper.py +199 -0
- flowtask/components/SetVariables.py +156 -0
- flowtask/components/SubTask.py +182 -0
- flowtask/components/SuiteCRM.py +48 -0
- flowtask/components/Switch.py +175 -0
- flowtask/components/TableBase.py +148 -0
- flowtask/components/TableDelete.py +312 -0
- flowtask/components/TableInput.py +143 -0
- flowtask/components/TableOutput/TableOutput.py +384 -0
- flowtask/components/TableOutput/__init__.py +3 -0
- flowtask/components/TableSchema.py +534 -0
- flowtask/components/Target.py +223 -0
- flowtask/components/ThumbnailGenerator.py +156 -0
- flowtask/components/ToPandas.py +67 -0
- flowtask/components/TransformRows/TransformRows.py +507 -0
- flowtask/components/TransformRows/__init__.py +9 -0
- flowtask/components/TransformRows/functions.py +559 -0
- flowtask/components/TransposeRows.py +176 -0
- flowtask/components/UPCDatabase.py +86 -0
- flowtask/components/UnGzip.py +171 -0
- flowtask/components/Uncompress.py +172 -0
- flowtask/components/UniqueRows.py +126 -0
- flowtask/components/Unzip.py +107 -0
- flowtask/components/UpdateOperationalVars.py +147 -0
- flowtask/components/UploadTo.py +299 -0
- flowtask/components/UploadToS3.py +136 -0
- flowtask/components/UploadToSFTP.py +160 -0
- flowtask/components/UploadToSharepoint.py +205 -0
- flowtask/components/UserFunc.py +122 -0
- flowtask/components/VivaTracker.py +140 -0
- flowtask/components/WSDLClient.py +123 -0
- flowtask/components/Wait.py +18 -0
- flowtask/components/Walmart.py +199 -0
- flowtask/components/Workplace.py +134 -0
- flowtask/components/XMLToPandas.py +267 -0
- flowtask/components/Zammad/__init__.py +41 -0
- flowtask/components/Zammad/models.py +0 -0
- flowtask/components/ZoomInfoScraper.py +409 -0
- flowtask/components/__init__.py +104 -0
- flowtask/components/abstract.py +18 -0
- flowtask/components/flow.py +530 -0
- flowtask/components/google.py +335 -0
- flowtask/components/group.py +221 -0
- flowtask/components/py.typed +0 -0
- flowtask/components/reviewscrap.py +132 -0
- flowtask/components/tAutoincrement.py +117 -0
- flowtask/components/tConcat.py +109 -0
- flowtask/components/tExplode.py +119 -0
- flowtask/components/tFilter.py +184 -0
- flowtask/components/tGroup.py +236 -0
- flowtask/components/tJoin.py +270 -0
- flowtask/components/tMap/__init__.py +9 -0
- flowtask/components/tMap/functions.py +54 -0
- flowtask/components/tMap/tMap.py +450 -0
- flowtask/components/tMelt.py +112 -0
- flowtask/components/tMerge.py +114 -0
- flowtask/components/tOrder.py +93 -0
- flowtask/components/tPandas.py +94 -0
- flowtask/components/tPivot.py +71 -0
- flowtask/components/tPluckCols.py +76 -0
- flowtask/components/tUnnest.py +82 -0
- flowtask/components/user.py +401 -0
- flowtask/conf.py +457 -0
- flowtask/download.py +102 -0
- flowtask/events/__init__.py +11 -0
- flowtask/events/events/__init__.py +20 -0
- flowtask/events/events/abstract.py +95 -0
- flowtask/events/events/alerts/__init__.py +362 -0
- flowtask/events/events/alerts/colfunctions.py +131 -0
- flowtask/events/events/alerts/functions.py +158 -0
- flowtask/events/events/dummy.py +12 -0
- flowtask/events/events/exec.py +124 -0
- flowtask/events/events/file/__init__.py +7 -0
- flowtask/events/events/file/base.py +51 -0
- flowtask/events/events/file/copy.py +23 -0
- flowtask/events/events/file/delete.py +16 -0
- flowtask/events/events/interfaces/__init__.py +9 -0
- flowtask/events/events/interfaces/client.py +67 -0
- flowtask/events/events/interfaces/credentials.py +28 -0
- flowtask/events/events/interfaces/notifications.py +58 -0
- flowtask/events/events/jira.py +122 -0
- flowtask/events/events/log.py +26 -0
- flowtask/events/events/logerr.py +52 -0
- flowtask/events/events/notify.py +59 -0
- flowtask/events/events/notify_event.py +160 -0
- flowtask/events/events/publish.py +54 -0
- flowtask/events/events/sendfile.py +104 -0
- flowtask/events/events/task.py +97 -0
- flowtask/events/events/teams.py +98 -0
- flowtask/events/events/webhook.py +58 -0
- flowtask/events/manager.py +287 -0
- flowtask/exceptions.c +39393 -0
- flowtask/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/extensions/__init__.py +3 -0
- flowtask/extensions/abstract.py +82 -0
- flowtask/extensions/logging/__init__.py +65 -0
- flowtask/hooks/__init__.py +9 -0
- flowtask/hooks/actions/__init__.py +22 -0
- flowtask/hooks/actions/abstract.py +66 -0
- flowtask/hooks/actions/dummy.py +23 -0
- flowtask/hooks/actions/jira.py +74 -0
- flowtask/hooks/actions/rest.py +320 -0
- flowtask/hooks/actions/sampledata.py +37 -0
- flowtask/hooks/actions/sensor.py +23 -0
- flowtask/hooks/actions/task.py +9 -0
- flowtask/hooks/actions/ticket.py +37 -0
- flowtask/hooks/actions/zammad.py +55 -0
- flowtask/hooks/hook.py +62 -0
- flowtask/hooks/models.py +17 -0
- flowtask/hooks/service.py +187 -0
- flowtask/hooks/step.py +91 -0
- flowtask/hooks/types/__init__.py +23 -0
- flowtask/hooks/types/base.py +129 -0
- flowtask/hooks/types/brokers/__init__.py +11 -0
- flowtask/hooks/types/brokers/base.py +54 -0
- flowtask/hooks/types/brokers/mqtt.py +35 -0
- flowtask/hooks/types/brokers/rabbitmq.py +82 -0
- flowtask/hooks/types/brokers/redis.py +83 -0
- flowtask/hooks/types/brokers/sqs.py +44 -0
- flowtask/hooks/types/fs.py +232 -0
- flowtask/hooks/types/http.py +49 -0
- flowtask/hooks/types/imap.py +200 -0
- flowtask/hooks/types/jira.py +279 -0
- flowtask/hooks/types/mail.py +205 -0
- flowtask/hooks/types/postgres.py +98 -0
- flowtask/hooks/types/responses/__init__.py +8 -0
- flowtask/hooks/types/responses/base.py +5 -0
- flowtask/hooks/types/sharepoint.py +288 -0
- flowtask/hooks/types/ssh.py +141 -0
- flowtask/hooks/types/tagged.py +59 -0
- flowtask/hooks/types/upload.py +85 -0
- flowtask/hooks/types/watch.py +71 -0
- flowtask/hooks/types/web.py +36 -0
- flowtask/interfaces/AzureClient.py +137 -0
- flowtask/interfaces/AzureGraph.py +839 -0
- flowtask/interfaces/Boto3Client.py +326 -0
- flowtask/interfaces/DropboxClient.py +173 -0
- flowtask/interfaces/ExcelHandler.py +94 -0
- flowtask/interfaces/FTPClient.py +131 -0
- flowtask/interfaces/GoogleCalendar.py +201 -0
- flowtask/interfaces/GoogleClient.py +133 -0
- flowtask/interfaces/GoogleDrive.py +127 -0
- flowtask/interfaces/GoogleGCS.py +89 -0
- flowtask/interfaces/GoogleGeocoding.py +93 -0
- flowtask/interfaces/GoogleLang.py +114 -0
- flowtask/interfaces/GooglePub.py +61 -0
- flowtask/interfaces/GoogleSheet.py +68 -0
- flowtask/interfaces/IMAPClient.py +137 -0
- flowtask/interfaces/O365Calendar.py +113 -0
- flowtask/interfaces/O365Client.py +220 -0
- flowtask/interfaces/OneDrive.py +284 -0
- flowtask/interfaces/Outlook.py +155 -0
- flowtask/interfaces/ParrotBot.py +130 -0
- flowtask/interfaces/SSHClient.py +378 -0
- flowtask/interfaces/Sharepoint.py +496 -0
- flowtask/interfaces/__init__.py +36 -0
- flowtask/interfaces/azureauth.py +119 -0
- flowtask/interfaces/cache.py +201 -0
- flowtask/interfaces/client.py +82 -0
- flowtask/interfaces/compress.py +525 -0
- flowtask/interfaces/credentials.py +124 -0
- flowtask/interfaces/d2l.py +239 -0
- flowtask/interfaces/databases/__init__.py +5 -0
- flowtask/interfaces/databases/db.py +223 -0
- flowtask/interfaces/databases/documentdb.py +55 -0
- flowtask/interfaces/databases/rethink.py +39 -0
- flowtask/interfaces/dataframes/__init__.py +11 -0
- flowtask/interfaces/dataframes/abstract.py +21 -0
- flowtask/interfaces/dataframes/arrow.py +71 -0
- flowtask/interfaces/dataframes/dt.py +69 -0
- flowtask/interfaces/dataframes/pandas.py +167 -0
- flowtask/interfaces/dataframes/polars.py +60 -0
- flowtask/interfaces/db.py +263 -0
- flowtask/interfaces/env.py +46 -0
- flowtask/interfaces/func.py +137 -0
- flowtask/interfaces/http.py +1780 -0
- flowtask/interfaces/locale.py +40 -0
- flowtask/interfaces/log.py +75 -0
- flowtask/interfaces/mask.py +143 -0
- flowtask/interfaces/notification.py +154 -0
- flowtask/interfaces/playwright.py +339 -0
- flowtask/interfaces/powerpoint.py +368 -0
- flowtask/interfaces/py.typed +0 -0
- flowtask/interfaces/qs.py +376 -0
- flowtask/interfaces/result.py +87 -0
- flowtask/interfaces/selenium_service.py +779 -0
- flowtask/interfaces/smartsheet.py +154 -0
- flowtask/interfaces/stat.py +39 -0
- flowtask/interfaces/task.py +96 -0
- flowtask/interfaces/template.py +118 -0
- flowtask/interfaces/vectorstores/__init__.py +1 -0
- flowtask/interfaces/vectorstores/abstract.py +133 -0
- flowtask/interfaces/vectorstores/milvus.py +669 -0
- flowtask/interfaces/zammad.py +107 -0
- flowtask/models.py +193 -0
- flowtask/parsers/__init__.py +15 -0
- flowtask/parsers/_yaml.c +11978 -0
- flowtask/parsers/_yaml.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/argparser.py +235 -0
- flowtask/parsers/base.c +15155 -0
- flowtask/parsers/base.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/json.c +11968 -0
- flowtask/parsers/json.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/maps.py +49 -0
- flowtask/parsers/toml.c +11968 -0
- flowtask/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/plugins/__init__.py +16 -0
- flowtask/plugins/components/__init__.py +0 -0
- flowtask/plugins/handler/__init__.py +45 -0
- flowtask/plugins/importer.py +31 -0
- flowtask/plugins/sources/__init__.py +0 -0
- flowtask/runner.py +283 -0
- flowtask/scheduler/__init__.py +9 -0
- flowtask/scheduler/functions.py +493 -0
- flowtask/scheduler/handlers/__init__.py +8 -0
- flowtask/scheduler/handlers/manager.py +504 -0
- flowtask/scheduler/handlers/models.py +58 -0
- flowtask/scheduler/handlers/service.py +72 -0
- flowtask/scheduler/notifications.py +65 -0
- flowtask/scheduler/scheduler.py +993 -0
- flowtask/services/__init__.py +0 -0
- flowtask/services/bots/__init__.py +0 -0
- flowtask/services/bots/telegram.py +264 -0
- flowtask/services/files/__init__.py +11 -0
- flowtask/services/files/manager.py +522 -0
- flowtask/services/files/model.py +37 -0
- flowtask/services/files/service.py +767 -0
- flowtask/services/jira/__init__.py +3 -0
- flowtask/services/jira/jira_actions.py +191 -0
- flowtask/services/tasks/__init__.py +13 -0
- flowtask/services/tasks/launcher.py +213 -0
- flowtask/services/tasks/manager.py +323 -0
- flowtask/services/tasks/service.py +275 -0
- flowtask/services/tasks/task_manager.py +376 -0
- flowtask/services/tasks/tasks.py +155 -0
- flowtask/storages/__init__.py +16 -0
- flowtask/storages/exceptions.py +12 -0
- flowtask/storages/files/__init__.py +8 -0
- flowtask/storages/files/abstract.py +29 -0
- flowtask/storages/files/filesystem.py +66 -0
- flowtask/storages/tasks/__init__.py +19 -0
- flowtask/storages/tasks/abstract.py +26 -0
- flowtask/storages/tasks/database.py +33 -0
- flowtask/storages/tasks/filesystem.py +108 -0
- flowtask/storages/tasks/github.py +119 -0
- flowtask/storages/tasks/memory.py +45 -0
- flowtask/storages/tasks/row.py +25 -0
- flowtask/tasks/__init__.py +0 -0
- flowtask/tasks/abstract.py +526 -0
- flowtask/tasks/command.py +118 -0
- flowtask/tasks/pile.py +486 -0
- flowtask/tasks/py.typed +0 -0
- flowtask/tasks/task.py +778 -0
- flowtask/template/__init__.py +161 -0
- flowtask/tests.py +257 -0
- flowtask/types/__init__.py +8 -0
- flowtask/types/typedefs.c +11347 -0
- flowtask/types/typedefs.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/__init__.py +24 -0
- flowtask/utils/constants.py +117 -0
- flowtask/utils/encoders.py +21 -0
- flowtask/utils/executor.py +112 -0
- flowtask/utils/functions.cpp +14280 -0
- flowtask/utils/functions.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/json.cpp +13349 -0
- flowtask/utils/json.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/mail.py +63 -0
- flowtask/utils/parseqs.c +13324 -0
- flowtask/utils/parserqs.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/stats.py +308 -0
- flowtask/utils/transformations.py +74 -0
- flowtask/utils/uv.py +12 -0
- flowtask/utils/validators.py +97 -0
- flowtask/version.py +11 -0
- flowtask-5.8.4.dist-info/LICENSE +201 -0
- flowtask-5.8.4.dist-info/METADATA +209 -0
- flowtask-5.8.4.dist-info/RECORD +470 -0
- flowtask-5.8.4.dist-info/WHEEL +6 -0
- flowtask-5.8.4.dist-info/entry_points.txt +3 -0
- flowtask-5.8.4.dist-info/top_level.txt +2 -0
- plugins/components/CreateQR.py +39 -0
- plugins/components/TestComponent.py +28 -0
- plugins/components/Use1.py +13 -0
- plugins/components/Workplace.py +117 -0
- plugins/components/__init__.py +3 -0
- plugins/sources/__init__.py +0 -0
- plugins/sources/get_populartimes.py +78 -0
- plugins/sources/google.py +150 -0
- plugins/sources/hubspot.py +679 -0
- plugins/sources/icims.py +679 -0
- plugins/sources/mobileinsight.py +501 -0
- plugins/sources/newrelic.py +262 -0
- plugins/sources/uap.py +268 -0
- plugins/sources/venu.py +244 -0
- plugins/sources/vocinity.py +314 -0
flowtask/components/Amazon.py
@@ -0,0 +1,523 @@
+"""
+Scrapping a Web Page Using Selenium + ChromeDriver + BeautifulSoup.
+
+
+Example:
+
+```yaml
+Amazon:
+  type: product_info
+  use_proxies: true
+  paid_proxy: true
+```
+
+"""
+from typing import Any
+import asyncio
+from collections.abc import Callable
+import re
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+import random
+import httpx
+import pandas as pd
+import backoff
+# Internals
+from ..exceptions import (
+    ComponentError,
+    ConfigError,
+    NotSupported,
+    DataNotFound,
+    DataError
+)
+from ..interfaces.http import ua
+from .reviewscrap import ReviewScrapper, on_backoff, bad_gateway_exception
+
+
+class Amazon(ReviewScrapper):
+    """Amazon.
+
+    Combining API Key and Web Scrapping, this component will be able to extract
+    Amazon Product Information (reviews, etc).
+    """
+    def __init__(
+        self,
+        loop: asyncio.AbstractEventLoop = None,
+        job: Callable = None,
+        stat: Callable = None,
+        **kwargs,
+    ):
+        super(Amazon, self).__init__(
+            loop=loop,
+            job=job,
+            stat=stat,
+            **kwargs
+        )
+        # Always use proxies:
+        self.use_proxy: bool = True
+        self._free_proxy: bool = False
+        self.cookies = {
+            # "aws-session-id": "241-9979986-0092756",
+        }
+        self.headers: dict = {
+            'authority': 'www.amazon.com',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+            "Accept-Encoding": "gzip, deflate, br, zstd",
+            "Accept-Language": "es-US,es;q=0.9,en-US;q=0.8,en;q=0.7,es-419;q=0.6",
+            "Origin": "https://www.amazon.com",
+            "Referer": "https://www.amazon.com/dp/",
+            "Sec-CH-UA": '"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
+            "Sec-CH-UA-Mobile": "?0",
+            "Sec-CH-UA-Platform": '"Linux"',
+            'sec-fetch-site': 'none',
+            'sec-fetch-mode': 'navigate',
+            'sec-fetch-dest': 'document',
+            "Sec-Fetch-Site": "same-origin",
+            "User-Agent": random.choice(ua),
+            "Connection": "keep-alive",
+            'dnt': '1',
+            'upgrade-insecure-requests': '1',
+        }
+        self.semaphore = asyncio.Semaphore(10)
+
+    def _extract_reviews_from_page(self, soup: BeautifulSoup) -> list:
+        """
+        Given a BeautifulSoup-parsed Amazon reviews page, extract individual reviews.
+        Returns a list of dictionaries.
+        """
+        reviews = []
+        # Reviews are contained within the element with id 'cm-cr-review_list'
+        reviews_container = soup.find(
+            "ul", id="cm-cr-review_list"
+        ) or soup.find("div", {"data-hook": "reviews-medley-widget"})
+        if reviews_container:
+            # Each review is typically in a <li> element with data-hook "review"
+            for review_el in reviews_container.find_all("li", {"data-hook": "review"}):
+                try:
+                    # Extract review title
+                    title_el = review_el.select_one("[data-hook=review-title] > span")
+                    title = title_el.get_text(strip=True) if title_el else None
+                    # Extract review body text
+                    body_el = review_el.select_one("[data-hook=review-body]")
+                    body = " ".join(body_el.stripped_strings) if body_el else None
+                    # Extract review date and/or location/date information
+                    date_el = review_el.select_one("[data-hook=review-date]")
+                    date_text = date_el.get_text(strip=True) if date_el else None
+                    # Extract rating (look for an element with data-hook containing 'review-star-rating')
+                    if rating_el := review_el.select_one("[data-hook*='review-star-rating'] span.a-icon-alt"):
+                        # Extract numeric rating (first match of digits possibly with a decimal)
+                        import re
+                        rating_match = re.search(r"(\d+\.?\d*) out", rating_el.get_text(strip=True))
+                        rating = rating_match.group(1) if rating_match else None
+                    else:
+                        rating = None
+                    # Extract Verified Purchase badge (if exists)
+                    verified = bool(review_el.select_one("[data-hook=avp-badge]"))
+
+                    review_dict = {
+                        "title": title,
+                        "review": body,
+                        "location_and_date": date_text,
+                        "rating": rating,
+                        "verified": verified
+                    }
+                    reviews.append(review_dict)
+                except Exception as e:
+                    # Log exception for this review, but continue extracting others
+                    self._logger.error(
+                        f"Failed to parse a review: {e}"
+                    )
+        return reviews
+
+    def _extract_next_page_url(self, soup: BeautifulSoup, base_url: str) -> str:
+        """
+        Look for a 'Next' page link in the pagination (typically via the CSS selector
+        '.a-pagination .a-last > a').
+        Returns an absolute URL string if found, otherwise returns None.
+        """
+        pagination_el = soup.select_one(".a-pagination .a-last > a")
+        next_page_relative = pagination_el.get("href") if pagination_el else None
+        return urljoin(base_url, next_page_relative) if next_page_relative else None
+
+    @backoff.on_exception(
+        backoff.expo,
+        (httpx.TimeoutException, httpx.ConnectTimeout, httpx.HTTPStatusError, httpx.HTTPError),
+        max_tries=3,
+        jitter=backoff.full_jitter,
+        on_backoff=on_backoff,
+        giveup=lambda e: not bad_gateway_exception(e) and not isinstance(e, httpx.ConnectTimeout)
+    )
+    async def _fetch_product_page(self, asin: str, cookies: httpx.Cookies, for_reviews: bool = False) -> tuple:
+        product_page_url = f"https://www.amazon.com/dp/{asin}"
+        response = await self._get(url=product_page_url, cookies=cookies, headers=self.headers)
+        if response.status_code != 200:
+            raise DataError(
+                f"Failed to fetch product page, status code: {response.status_code}"
+            )
+        html = response.text
+        soup = BeautifulSoup(html, "html.parser")
+        if for_reviews:
+            if medley := soup.find("div", id="reviewsMedley"):
+                return product_page_url, html, soup
+        elif title_div := soup.find("div", id="title_feature_div"):
+            product_name = title_div.get_text(separator=" ", strip=True)
+            self._logger.info(f"Extracted product name: {product_name} from {product_page_url}")
+            return product_page_url, html, soup
+        else:
+            await asyncio.sleep(1.5)
+            raise httpx.HTTPError(
+                f"Failed to find product name on product page: {product_page_url}"
+            )
+
+    def _extract_reviews_from_product_page(self, url: str, row: Any, soup: BeautifulSoup) -> list:
+        """Extract review snippet(s) from the product page (fallback)."""
+        reviews = []
+        if medley := soup.find("div", id="reviewsMedley"):
+            for li in medley.find_all("li", {"data-hook": "review"}):
+                try:
+                    profile_user = li.find("div", {"class": "a-profile-content"})
+                    profile_name = profile_user.find("span", {"class": "a-profile-name"}).get_text(strip=True)
+                    customer_reviews = ""
+                    title_text = ""
+                    if title := li.find("a", {"data-hook": "review-title"}):
+                        customer_reviews = title["href"]
+                        title_text = title.find_all("span")[-1].text.strip()
+
+                    body = li.select_one("[data-hook=review-body]")
+                    body_text = " ".join(body.stripped_strings) if body else None
+
+                    date_el = li.select_one("[data-hook=review-date]")
+                    date_text = date_el.get_text(strip=True) if date_el else None
+
+                    rating_el = li.select_one("[data-hook*='review-star-rating'] span.a-icon-alt")
+                    rating_match = re.search(r"(\d+\.?\d*) out", rating_el.get_text(strip=True)) if rating_el else None
+                    rating = rating_match.group(1) if rating_match else None
+
+                    verified = bool(li.select_one("[data-hook=avp-badge]"))
+                    _data = row.to_dict()
+                    review_dict = {
+                        "url": url,
+                        "user": profile_name,
+                        "customer_reviews": customer_reviews,
+                        "title": title_text,
+                        "review": body_text,
+                        "location_and_date": date_text,
+                        "rating": rating,
+                        "verified": verified,
+                        **_data
+                    }
+                    reviews.append(review_dict)
+                except Exception as e:
+                    self._logger.error(f"Error parsing a fallback review: {e}")
+        return reviews
+
+    async def _fetch_review_page(self, url: str, cookies: httpx.Cookies) -> str:
+        """
+        Fetches the review page HTML for a given URL.
+        Returns the HTML text.
+        """
+        try:
+            response = await self._get(url=url, cookies=cookies, headers=self.headers)
+            if response.status_code != 200:
+                raise DataError(f"Failed to fetch reviews page (status code: {response.status_code})")
+            return response.text
+        except Exception as e:
+            raise DataError(f"Failed to fetch reviews page: {e}") from e
+
+    async def _product_reviews(self, idx, row, cookies, max_pages: int = 5) -> list:
+        async with self.semaphore:
+            # Prepare payload for the API request
+            asin = row['asin']
+            reviews = []
+            # base_review_url = f"https://www.amazon.com/product-reviews/{asin}/"
+            #
+            # try:
+            #     # Try fetching the reviews page
+            #     html = await self._fetch_review_page(base_review_url, cookies)
+            #     soup = BeautifulSoup(html, "html.parser")
+            #     reviews.extend(self._extract_reviews_from_page(soup))
+            #     self._logger.info(f"Fetched reviews from reviews URL for ASIN {asin}")
+            # except DataError as e:
+            #     # If a redirect (or other error) is detected, log and fall back to the product page.
+            #     self._logger.warning(
+            #         f"Direct reviews page fetch failed ({e}); falling back to product page for ASIN {asin}"
+            #     )
+            try:
+                url, _, soup = await self._fetch_product_page(asin, cookies=cookies, for_reviews=True)
+                reviews.extend(
+                    self._extract_reviews_from_product_page(url, row, soup)
+                )
+            except Exception as ee:
+                self._logger.error(
+                    f"Fallback product page review extraction failed: {ee}"
+                )
+                return []
+            self._logger.info(
+                f"Fetched {len(reviews)} reviews for ASIN {asin}."
+            )
+            await asyncio.sleep(random.randint(3, 5))
+            return reviews
+
+    async def reviews(self):
+        """reviews.
+
+        Target Product Reviews.
+        """
+        httpx_cookies = httpx.Cookies()
+        for key, value in self.cookies.items():
+            httpx_cookies.set(
+                key, value,
+                domain='.amazon.com',
+                path='/'
+            )
+
+        # Iterate over each row in the DataFrame
+        print('starting ...')
+        tasks = [
+            self._product_reviews(
+                idx,
+                row,
+                httpx_cookies,
+                max_pages=2
+            ) for idx, row in self.data.iterrows()
+        ]
+        # Gather results concurrently
+        all_reviews_nested = await self._processing_tasks(tasks)
+        # Flatten the nested list: one item per review, and add the asin as reference.
+        reviews_flat = []
+        for idx, review_list in enumerate(all_reviews_nested):
+            asin = self.data.iloc[idx]['asin']
+            for review in review_list:
+                review['asin'] = asin
+                reviews_flat.append(review)
+
+        reviews_df = pd.DataFrame(reviews_flat)
+        self._logger.notice(f"Extracted total {len(reviews_df)} reviews.")
+
+        # at the end, adding a column for origin of reviews:
+        reviews_df['origin'] = 'amazon'
+        self.data = reviews_df  # or store separately
+        return self.data
+
+    def _extract_product_name(self, soup: BeautifulSoup) -> str:
+        if title_div := soup.find("div", id="title_feature_div"):
+            return title_div.get_text(separator=" ", strip=True)
+        return None
+
+    def _extract_price(self, soup: BeautifulSoup) -> str:
+        price_element = soup.select_one("span.a-offscreen")
+        return price_element.get_text(strip=True) if price_element else None
+
+    def _extract_product_description(self, soup: BeautifulSoup) -> str:
+        if desc_div := soup.find("div", id="productDescription_feature_div"):
+            # Sometimes there is an inner div with id="productDescription"
+            if desc_inner := desc_div.find("div", id="productDescription"):
+                # Join all paragraph texts into one string
+                paragraphs = [p.get_text(separator=" ", strip=True) for p in desc_inner.find_all("p")]
+                product_description = "\n".join([p for p in paragraphs if p])
+            else:
+                product_description = desc_div.get_text(separator=" ", strip=True)
+            return product_description
+        return None
+
+    def _extract_rating(self, soup: BeautifulSoup) -> tuple:
+        """
+        Extract the average rating and review count from an Amazon product page.
+
+        This function parses the BeautifulSoup object to find and extract the average
+        customer rating and the total number of reviews for a product.
+
+        Args:
+            soup (BeautifulSoup): A BeautifulSoup object representing the parsed HTML
+                of an Amazon product page.
+
+        Returns:
+            tuple: A tuple containing two elements:
+                - review_rating (str or None): The average rating of the product
+                  (e.g., "4.5 out of 5 stars"), or None if not found.
+                - review_count (str or None): The total number of reviews for the
+                  product (e.g., "1,234"), or None if not found.
+        """
+        review_rating = None
+        review_count = None
+        if acr_div := soup.find("div", id="averageCustomerReviews_feature_div"):
+            # The star rating is contained inside a span within the "acrPopover"
+            if acr_popover := acr_div.find("span", id="acrPopover"):
+                if rating_span := acr_popover.find("span", class_="a-color-base"):
+                    review_rating = rating_span.get_text(strip=True)
+            # The review count is extracted from the anchor "acrCustomerReviewLink"
+            if review_link := acr_div.find("a", id="acrCustomerReviewLink"):
+                if count_span := review_link.find("span", id="acrCustomerReviewText"):
+                    review_count = count_span.get_text(strip=True).replace('ratings', '').strip()
+            return review_rating, review_count
+        return None, None
+
+    def _extract_product_overview(self, soup: BeautifulSoup) -> dict:
+        overview = {}
+        # Check if the overview container is present
+        if overview_container := soup.find("div", id="productOverview_hoc_view_div"):
+            # Iterate over each row in the container. Each row is typically a div with class "a-row"
+            for row in overview_container.find_all("div", class_="a-row"):
+                # Amazon structure: each row contains at least 2 columns.
+                columns = row.find_all("div", class_="a-column")
+                if len(columns) >= 2:
+                    # The first column typically contains the label (e.g., "Screen Size")
+                    label = columns[0].get_text(separator=" ", strip=True)
+                    # The second column typically contains the value (e.g., "86 Inches")
+                    value = columns[1].get_text(separator=" ", strip=True)
+                    if label and value:
+                        overview[label] = value
+        elif overview_div := soup.find("div", id="productOverview_feature_div"):
+            if table := overview_div.find("table", {"class": "a-spacing-micro"}):
+                for row in table.find_all("tr"):
+                    th = row.find("th")
+                    td = row.find("td")
+                    if th and td:
+                        key = th.get_text(separator=" ", strip=True)
+                        value = td.get_text(separator=" ", strip=True)
+                        overview[key] = value
+        return overview
+
+    def _extract_product_details(self, soup: BeautifulSoup) -> tuple:
+        # Extract technical specifications from "productDetails_techSpec_section_1"
+        tech_details = {}
+        if details_table := soup.find("table", id="productDetails_techSpec_section_1"):
+            for tr in details_table.find_all("tr"):
+                th = tr.find("th")
+                td = tr.find("td")
+                if th and td:
+                    key = th.get_text(separator=" ", strip=True)
+                    value = td.get_text(separator=" ", strip=True)
+                    tech_details[key] = value
+
+        # Extract additional product details from "productDetails_detailBullets_sections1"
+        additional_details = {}
+        if additional_table := soup.find("table", id="productDetails_detailBullets_sections1"):
+            for tr in additional_table.find_all("tr"):
+                th = tr.find("th")
+                td = tr.find("td")
+                if th and td:
+                    key = th.get_text(separator=" ", strip=True)
+                    value = td.get_text(separator=" ", strip=True)
+                    additional_details[key] = value
+
+        return tech_details, additional_details
+
+    def _extract_product_info(self, url: str, row: Any, soup: BeautifulSoup) -> dict:
+        """
+        Extract product information from the Amazon product page.
+        Returns a dictionary with:
+        - productName: from 'title_feature_div'
+        - overview: (e.g., screen size, brand, display technology, resolution, refresh rate)
+        - reviewRating: from 'acrPopover' (if available)
+        - reviewCount: from 'acrCustomerReviewText' (if available)
+        - technicalDetails: from table "productDetails_techSpec_section_1"
+        - additionalDetails: from table "productDetails_detailBullets_sections1"
+        """
+        # Extract product information here
+        # Return a dictionary with relevant fields
+        # Extract product name from "title_feature_div"
+        # --- Product Name ---
+        product_name = self._extract_product_name(soup)
+        prince = self._extract_price(soup)
+
+        # Extract review rating and count from "averageCustomerReviews"
+        review_rating, review_count = self._extract_rating(soup)
+
+        # --- Overview (revised for dynamic content) ---
+        overview = self._extract_product_overview(soup)
+
+        # --- Technical Details ---
+        tech_details, additional_details = self._extract_product_details(soup)
+
+        # --- Product Description ---
+        product_description = self._extract_product_description(soup)
+
+        # --- About This Item (feature bullets) ---
+        about_this_item = []
+        if featurebullets_div := soup.find("div", id="featurebullets_feature_div"):
+            if ul := featurebullets_div.find("ul", {"class": "a-unordered-list"}):
+                # Extract each bullet text and add to list
+                for li in ul.find_all("li"):
+                    if text := li.get_text(separator=" ", strip=True):
+                        about_this_item.append(text)
+        _data = row.to_dict()
+        return {
+            "product_name": product_name,
+            "price": prince,
+            "url": url,
+            "about_this_item": about_this_item,
+            "rating": review_rating,
+            "review_count": review_count,
+            "overview": overview,
+            "description": product_description,
+            "tech_details": tech_details,
+            "additional_details": additional_details,
+            **_data
+        }
+
+    async def _product_information(self, idx, row, cookies):
+        async with self.semaphore:
+            # Prepare payload for the API request
+            asin = row['asin']
+            try:
+                # Fetch the product page
+                url, html, soup = await self._fetch_product_page(asin, cookies=cookies, for_reviews=False)
+                if not html:
+                    self._logger.warning(
+                        f"No Product Information found for {asin}."
+                    )
+                    return {}
+            except (httpx.TimeoutException, httpx.HTTPError) as ex:
+                self._logger.warning(f"Request failed: {ex}")
+                return []
+            except Exception as ex:
+                self._logger.error(f"An error occurred: {ex}")
+                return []
+
+            # Extract the product information using BeautifulSoup
+            if product_info := self._extract_product_info(url, row, soup):
+                return product_info
+            raise DataNotFound(
+                f"Failed to extract product information for {asin}"
+            )
+
+    async def product_info(self):
+        """product_info.
+
+        Product Information.
+        """
+        httpx_cookies = httpx.Cookies()
+        for key, value in self.cookies.items():
+            httpx_cookies.set(
+                key, value,
+                domain='.amazon.com',
+                path='/'
+            )
+
+        # Iterate over each row in the DataFrame
+        print('starting ...')
+
+        tasks = [
+            self._product_information(
+                idx,
+                row,
+                httpx_cookies
+            ) for idx, row in self.data.iterrows()
+        ]
+        # Gather results concurrently
+        all_products = await self._processing_tasks(tasks)
+
+        # Convert to DataFrame
+        df = pd.DataFrame(all_products)
+
+        # show the num of rows in final dataframe:
+        self._logger.notice(
+            f"Ending Product Info: {len(df)}"
+        )
+
+        # Override previous dataframe:
+        self.data = df
+
+        # return existing data
+        return self.data
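For readers who want to poke at the parsing logic without hitting Amazon, the sketch below exercises the same CSS selectors the diff's `_extract_reviews_from_page` uses (`[data-hook=review-title] > span`, `[data-hook=review-body]`, `[data-hook*='review-star-rating'] span.a-icon-alt`, `[data-hook=avp-badge]`) against a static HTML fragment. It is not part of the package; the markup is an invented stand-in for Amazon's review list, and only BeautifulSoup is required.

```python
# Minimal, standalone sketch: run the review selectors from the Amazon
# component against a hand-written HTML fragment (not real Amazon markup).
import re
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<ul id="cm-cr-review_list">
  <li data-hook="review">
    <a data-hook="review-title" href="#"><span>Great TV</span></a>
    <span data-hook="review-date">Reviewed in the United States on May 1, 2024</span>
    <i data-hook="review-star-rating"><span class="a-icon-alt">5.0 out of 5 stars</span></i>
    <span data-hook="review-body"><span>Excellent picture quality.</span></span>
    <span data-hook="avp-badge">Verified Purchase</span>
  </li>
</ul>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
for review_el in soup.find_all("li", {"data-hook": "review"}):
    # Same selectors as the component's review extraction
    title_el = review_el.select_one("[data-hook=review-title] > span")
    body_el = review_el.select_one("[data-hook=review-body]")
    date_el = review_el.select_one("[data-hook=review-date]")
    rating_el = review_el.select_one("[data-hook*='review-star-rating'] span.a-icon-alt")
    rating = None
    if rating_el:
        # Pull the leading numeric value out of "5.0 out of 5 stars"
        m = re.search(r"(\d+\.?\d*) out", rating_el.get_text(strip=True))
        rating = m.group(1) if m else None
    print({
        "title": title_el.get_text(strip=True) if title_el else None,
        "review": " ".join(body_el.stripped_strings) if body_el else None,
        "location_and_date": date_el.get_text(strip=True) if date_el else None,
        "rating": rating,
        "verified": bool(review_el.select_one("[data-hook=avp-badge]")),
    })
```

Running it prints one dictionary per `<li data-hook="review">`, mirroring the fields the component collects (title, review, location_and_date, rating, verified).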