flowtask 5.8.4__cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowtask/__init__.py +93 -0
- flowtask/__main__.py +38 -0
- flowtask/bots/__init__.py +6 -0
- flowtask/bots/check.py +93 -0
- flowtask/bots/codebot.py +51 -0
- flowtask/components/ASPX.py +148 -0
- flowtask/components/AddDataset.py +352 -0
- flowtask/components/Amazon.py +523 -0
- flowtask/components/AutoTask.py +314 -0
- flowtask/components/Azure.py +80 -0
- flowtask/components/AzureUsers.py +106 -0
- flowtask/components/BaseAction.py +91 -0
- flowtask/components/BaseLoop.py +198 -0
- flowtask/components/BestBuy.py +800 -0
- flowtask/components/CSVToGCS.py +120 -0
- flowtask/components/CompanyScraper/__init__.py +1 -0
- flowtask/components/CompanyScraper/parsers/__init__.py +6 -0
- flowtask/components/CompanyScraper/parsers/base.py +102 -0
- flowtask/components/CompanyScraper/parsers/explorium.py +192 -0
- flowtask/components/CompanyScraper/parsers/leadiq.py +206 -0
- flowtask/components/CompanyScraper/parsers/rocket.py +133 -0
- flowtask/components/CompanyScraper/parsers/siccode.py +109 -0
- flowtask/components/CompanyScraper/parsers/visualvisitor.py +130 -0
- flowtask/components/CompanyScraper/parsers/zoominfo.py +118 -0
- flowtask/components/CompanyScraper/scrapper.py +1054 -0
- flowtask/components/CopyTo.py +177 -0
- flowtask/components/CopyToBigQuery.py +243 -0
- flowtask/components/CopyToMongoDB.py +291 -0
- flowtask/components/CopyToPg.py +609 -0
- flowtask/components/CopyToRethink.py +207 -0
- flowtask/components/CreateGCSBucket.py +102 -0
- flowtask/components/CreateReport/CreateReport.py +228 -0
- flowtask/components/CreateReport/__init__.py +9 -0
- flowtask/components/CreateReport/charts/__init__.py +15 -0
- flowtask/components/CreateReport/charts/bar.py +51 -0
- flowtask/components/CreateReport/charts/base.py +66 -0
- flowtask/components/CreateReport/charts/pie.py +64 -0
- flowtask/components/CreateReport/utils.py +9 -0
- flowtask/components/CustomerSatisfaction.py +196 -0
- flowtask/components/DataInput.py +200 -0
- flowtask/components/DateList.py +255 -0
- flowtask/components/DbClient.py +163 -0
- flowtask/components/DialPad.py +146 -0
- flowtask/components/DocumentDBQuery.py +200 -0
- flowtask/components/DownloadFrom.py +371 -0
- flowtask/components/DownloadFromD2L.py +113 -0
- flowtask/components/DownloadFromFTP.py +181 -0
- flowtask/components/DownloadFromIMAP.py +315 -0
- flowtask/components/DownloadFromS3.py +198 -0
- flowtask/components/DownloadFromSFTP.py +265 -0
- flowtask/components/DownloadFromSharepoint.py +110 -0
- flowtask/components/DownloadFromSmartSheet.py +114 -0
- flowtask/components/DownloadS3File.py +229 -0
- flowtask/components/Dummy.py +59 -0
- flowtask/components/DuplicatePhoto.py +411 -0
- flowtask/components/EmployeeEvaluation.py +237 -0
- flowtask/components/ExecuteSQL.py +323 -0
- flowtask/components/ExtractHTML.py +178 -0
- flowtask/components/FileBase.py +178 -0
- flowtask/components/FileCopy.py +181 -0
- flowtask/components/FileDelete.py +82 -0
- flowtask/components/FileExists.py +146 -0
- flowtask/components/FileIteratorDelete.py +112 -0
- flowtask/components/FileList.py +194 -0
- flowtask/components/FileOpen.py +75 -0
- flowtask/components/FileRead.py +120 -0
- flowtask/components/FileRename.py +106 -0
- flowtask/components/FilterIf.py +284 -0
- flowtask/components/FilterRows/FilterRows.py +200 -0
- flowtask/components/FilterRows/__init__.py +10 -0
- flowtask/components/FilterRows/functions.py +4 -0
- flowtask/components/GCSToBigQuery.py +103 -0
- flowtask/components/GoogleA4.py +150 -0
- flowtask/components/GoogleGeoCoding.py +344 -0
- flowtask/components/GooglePlaces.py +315 -0
- flowtask/components/GoogleSearch.py +539 -0
- flowtask/components/HTTPClient.py +268 -0
- flowtask/components/ICIMS.py +146 -0
- flowtask/components/IF.py +179 -0
- flowtask/components/IcimsFolderCopy.py +173 -0
- flowtask/components/ImageFeatures/__init__.py +5 -0
- flowtask/components/ImageFeatures/process.py +233 -0
- flowtask/components/IteratorBase.py +251 -0
- flowtask/components/LangchainLoader/__init__.py +5 -0
- flowtask/components/LangchainLoader/loader.py +194 -0
- flowtask/components/LangchainLoader/loaders/__init__.py +22 -0
- flowtask/components/LangchainLoader/loaders/abstract.py +362 -0
- flowtask/components/LangchainLoader/loaders/basepdf.py +50 -0
- flowtask/components/LangchainLoader/loaders/docx.py +91 -0
- flowtask/components/LangchainLoader/loaders/html.py +119 -0
- flowtask/components/LangchainLoader/loaders/pdfblocks.py +146 -0
- flowtask/components/LangchainLoader/loaders/pdfmark.py +79 -0
- flowtask/components/LangchainLoader/loaders/pdftables.py +135 -0
- flowtask/components/LangchainLoader/loaders/qa.py +67 -0
- flowtask/components/LangchainLoader/loaders/txt.py +55 -0
- flowtask/components/LeadIQ.py +650 -0
- flowtask/components/Loop.py +253 -0
- flowtask/components/Lowes.py +334 -0
- flowtask/components/MS365Usage.py +156 -0
- flowtask/components/MSTeamsMessages.py +320 -0
- flowtask/components/MarketClustering.py +1051 -0
- flowtask/components/MergeFiles.py +362 -0
- flowtask/components/MilvusOutput.py +87 -0
- flowtask/components/NearByStores.py +175 -0
- flowtask/components/NetworkNinja/__init__.py +6 -0
- flowtask/components/NetworkNinja/models/__init__.py +52 -0
- flowtask/components/NetworkNinja/models/abstract.py +177 -0
- flowtask/components/NetworkNinja/models/account.py +39 -0
- flowtask/components/NetworkNinja/models/client.py +19 -0
- flowtask/components/NetworkNinja/models/district.py +14 -0
- flowtask/components/NetworkNinja/models/events.py +101 -0
- flowtask/components/NetworkNinja/models/forms.py +499 -0
- flowtask/components/NetworkNinja/models/market.py +16 -0
- flowtask/components/NetworkNinja/models/organization.py +34 -0
- flowtask/components/NetworkNinja/models/photos.py +125 -0
- flowtask/components/NetworkNinja/models/project.py +44 -0
- flowtask/components/NetworkNinja/models/region.py +28 -0
- flowtask/components/NetworkNinja/models/store.py +203 -0
- flowtask/components/NetworkNinja/models/user.py +151 -0
- flowtask/components/NetworkNinja/router.py +854 -0
- flowtask/components/Odoo.py +175 -0
- flowtask/components/OdooInjector.py +192 -0
- flowtask/components/OpenFromXML.py +126 -0
- flowtask/components/OpenWeather.py +41 -0
- flowtask/components/OpenWithBase.py +616 -0
- flowtask/components/OpenWithPandas.py +715 -0
- flowtask/components/PGPDecrypt.py +199 -0
- flowtask/components/PandasIterator.py +187 -0
- flowtask/components/PandasToFile.py +189 -0
- flowtask/components/Paradox.py +339 -0
- flowtask/components/ParamIterator.py +117 -0
- flowtask/components/ParseHTML.py +84 -0
- flowtask/components/PlacerStores.py +249 -0
- flowtask/components/Pokemon.py +507 -0
- flowtask/components/PositiveBot.py +62 -0
- flowtask/components/PowerPointSlide.py +400 -0
- flowtask/components/PrintMessage.py +127 -0
- flowtask/components/ProductCompetitors/__init__.py +5 -0
- flowtask/components/ProductCompetitors/parsers/__init__.py +7 -0
- flowtask/components/ProductCompetitors/parsers/base.py +72 -0
- flowtask/components/ProductCompetitors/parsers/bestbuy.py +86 -0
- flowtask/components/ProductCompetitors/parsers/lowes.py +103 -0
- flowtask/components/ProductCompetitors/scrapper.py +155 -0
- flowtask/components/ProductCompliant.py +169 -0
- flowtask/components/ProductInfo/__init__.py +1 -0
- flowtask/components/ProductInfo/parsers/__init__.py +5 -0
- flowtask/components/ProductInfo/parsers/base.py +83 -0
- flowtask/components/ProductInfo/parsers/brother.py +97 -0
- flowtask/components/ProductInfo/parsers/canon.py +167 -0
- flowtask/components/ProductInfo/parsers/epson.py +118 -0
- flowtask/components/ProductInfo/parsers/hp.py +131 -0
- flowtask/components/ProductInfo/parsers/samsung.py +97 -0
- flowtask/components/ProductInfo/scraper.py +319 -0
- flowtask/components/ProductPricing.py +118 -0
- flowtask/components/QS.py +261 -0
- flowtask/components/QSBase.py +201 -0
- flowtask/components/QueryIterator.py +273 -0
- flowtask/components/QueryToInsert.py +327 -0
- flowtask/components/QueryToPandas.py +432 -0
- flowtask/components/RESTClient.py +195 -0
- flowtask/components/RethinkDBQuery.py +189 -0
- flowtask/components/Rsync.py +74 -0
- flowtask/components/RunSSH.py +59 -0
- flowtask/components/RunShell.py +71 -0
- flowtask/components/SalesForce.py +20 -0
- flowtask/components/SaveImageBank/__init__.py +257 -0
- flowtask/components/SchedulingVisits.py +592 -0
- flowtask/components/ScrapPage.py +216 -0
- flowtask/components/ScrapSearch.py +79 -0
- flowtask/components/SendNotify.py +257 -0
- flowtask/components/SentimentAnalysis.py +694 -0
- flowtask/components/ServiceScrapper/__init__.py +5 -0
- flowtask/components/ServiceScrapper/parsers/__init__.py +1 -0
- flowtask/components/ServiceScrapper/parsers/base.py +94 -0
- flowtask/components/ServiceScrapper/parsers/costco.py +93 -0
- flowtask/components/ServiceScrapper/scrapper.py +199 -0
- flowtask/components/SetVariables.py +156 -0
- flowtask/components/SubTask.py +182 -0
- flowtask/components/SuiteCRM.py +48 -0
- flowtask/components/Switch.py +175 -0
- flowtask/components/TableBase.py +148 -0
- flowtask/components/TableDelete.py +312 -0
- flowtask/components/TableInput.py +143 -0
- flowtask/components/TableOutput/TableOutput.py +384 -0
- flowtask/components/TableOutput/__init__.py +3 -0
- flowtask/components/TableSchema.py +534 -0
- flowtask/components/Target.py +223 -0
- flowtask/components/ThumbnailGenerator.py +156 -0
- flowtask/components/ToPandas.py +67 -0
- flowtask/components/TransformRows/TransformRows.py +507 -0
- flowtask/components/TransformRows/__init__.py +9 -0
- flowtask/components/TransformRows/functions.py +559 -0
- flowtask/components/TransposeRows.py +176 -0
- flowtask/components/UPCDatabase.py +86 -0
- flowtask/components/UnGzip.py +171 -0
- flowtask/components/Uncompress.py +172 -0
- flowtask/components/UniqueRows.py +126 -0
- flowtask/components/Unzip.py +107 -0
- flowtask/components/UpdateOperationalVars.py +147 -0
- flowtask/components/UploadTo.py +299 -0
- flowtask/components/UploadToS3.py +136 -0
- flowtask/components/UploadToSFTP.py +160 -0
- flowtask/components/UploadToSharepoint.py +205 -0
- flowtask/components/UserFunc.py +122 -0
- flowtask/components/VivaTracker.py +140 -0
- flowtask/components/WSDLClient.py +123 -0
- flowtask/components/Wait.py +18 -0
- flowtask/components/Walmart.py +199 -0
- flowtask/components/Workplace.py +134 -0
- flowtask/components/XMLToPandas.py +267 -0
- flowtask/components/Zammad/__init__.py +41 -0
- flowtask/components/Zammad/models.py +0 -0
- flowtask/components/ZoomInfoScraper.py +409 -0
- flowtask/components/__init__.py +104 -0
- flowtask/components/abstract.py +18 -0
- flowtask/components/flow.py +530 -0
- flowtask/components/google.py +335 -0
- flowtask/components/group.py +221 -0
- flowtask/components/py.typed +0 -0
- flowtask/components/reviewscrap.py +132 -0
- flowtask/components/tAutoincrement.py +117 -0
- flowtask/components/tConcat.py +109 -0
- flowtask/components/tExplode.py +119 -0
- flowtask/components/tFilter.py +184 -0
- flowtask/components/tGroup.py +236 -0
- flowtask/components/tJoin.py +270 -0
- flowtask/components/tMap/__init__.py +9 -0
- flowtask/components/tMap/functions.py +54 -0
- flowtask/components/tMap/tMap.py +450 -0
- flowtask/components/tMelt.py +112 -0
- flowtask/components/tMerge.py +114 -0
- flowtask/components/tOrder.py +93 -0
- flowtask/components/tPandas.py +94 -0
- flowtask/components/tPivot.py +71 -0
- flowtask/components/tPluckCols.py +76 -0
- flowtask/components/tUnnest.py +82 -0
- flowtask/components/user.py +401 -0
- flowtask/conf.py +457 -0
- flowtask/download.py +102 -0
- flowtask/events/__init__.py +11 -0
- flowtask/events/events/__init__.py +20 -0
- flowtask/events/events/abstract.py +95 -0
- flowtask/events/events/alerts/__init__.py +362 -0
- flowtask/events/events/alerts/colfunctions.py +131 -0
- flowtask/events/events/alerts/functions.py +158 -0
- flowtask/events/events/dummy.py +12 -0
- flowtask/events/events/exec.py +124 -0
- flowtask/events/events/file/__init__.py +7 -0
- flowtask/events/events/file/base.py +51 -0
- flowtask/events/events/file/copy.py +23 -0
- flowtask/events/events/file/delete.py +16 -0
- flowtask/events/events/interfaces/__init__.py +9 -0
- flowtask/events/events/interfaces/client.py +67 -0
- flowtask/events/events/interfaces/credentials.py +28 -0
- flowtask/events/events/interfaces/notifications.py +58 -0
- flowtask/events/events/jira.py +122 -0
- flowtask/events/events/log.py +26 -0
- flowtask/events/events/logerr.py +52 -0
- flowtask/events/events/notify.py +59 -0
- flowtask/events/events/notify_event.py +160 -0
- flowtask/events/events/publish.py +54 -0
- flowtask/events/events/sendfile.py +104 -0
- flowtask/events/events/task.py +97 -0
- flowtask/events/events/teams.py +98 -0
- flowtask/events/events/webhook.py +58 -0
- flowtask/events/manager.py +287 -0
- flowtask/exceptions.c +39393 -0
- flowtask/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/extensions/__init__.py +3 -0
- flowtask/extensions/abstract.py +82 -0
- flowtask/extensions/logging/__init__.py +65 -0
- flowtask/hooks/__init__.py +9 -0
- flowtask/hooks/actions/__init__.py +22 -0
- flowtask/hooks/actions/abstract.py +66 -0
- flowtask/hooks/actions/dummy.py +23 -0
- flowtask/hooks/actions/jira.py +74 -0
- flowtask/hooks/actions/rest.py +320 -0
- flowtask/hooks/actions/sampledata.py +37 -0
- flowtask/hooks/actions/sensor.py +23 -0
- flowtask/hooks/actions/task.py +9 -0
- flowtask/hooks/actions/ticket.py +37 -0
- flowtask/hooks/actions/zammad.py +55 -0
- flowtask/hooks/hook.py +62 -0
- flowtask/hooks/models.py +17 -0
- flowtask/hooks/service.py +187 -0
- flowtask/hooks/step.py +91 -0
- flowtask/hooks/types/__init__.py +23 -0
- flowtask/hooks/types/base.py +129 -0
- flowtask/hooks/types/brokers/__init__.py +11 -0
- flowtask/hooks/types/brokers/base.py +54 -0
- flowtask/hooks/types/brokers/mqtt.py +35 -0
- flowtask/hooks/types/brokers/rabbitmq.py +82 -0
- flowtask/hooks/types/brokers/redis.py +83 -0
- flowtask/hooks/types/brokers/sqs.py +44 -0
- flowtask/hooks/types/fs.py +232 -0
- flowtask/hooks/types/http.py +49 -0
- flowtask/hooks/types/imap.py +200 -0
- flowtask/hooks/types/jira.py +279 -0
- flowtask/hooks/types/mail.py +205 -0
- flowtask/hooks/types/postgres.py +98 -0
- flowtask/hooks/types/responses/__init__.py +8 -0
- flowtask/hooks/types/responses/base.py +5 -0
- flowtask/hooks/types/sharepoint.py +288 -0
- flowtask/hooks/types/ssh.py +141 -0
- flowtask/hooks/types/tagged.py +59 -0
- flowtask/hooks/types/upload.py +85 -0
- flowtask/hooks/types/watch.py +71 -0
- flowtask/hooks/types/web.py +36 -0
- flowtask/interfaces/AzureClient.py +137 -0
- flowtask/interfaces/AzureGraph.py +839 -0
- flowtask/interfaces/Boto3Client.py +326 -0
- flowtask/interfaces/DropboxClient.py +173 -0
- flowtask/interfaces/ExcelHandler.py +94 -0
- flowtask/interfaces/FTPClient.py +131 -0
- flowtask/interfaces/GoogleCalendar.py +201 -0
- flowtask/interfaces/GoogleClient.py +133 -0
- flowtask/interfaces/GoogleDrive.py +127 -0
- flowtask/interfaces/GoogleGCS.py +89 -0
- flowtask/interfaces/GoogleGeocoding.py +93 -0
- flowtask/interfaces/GoogleLang.py +114 -0
- flowtask/interfaces/GooglePub.py +61 -0
- flowtask/interfaces/GoogleSheet.py +68 -0
- flowtask/interfaces/IMAPClient.py +137 -0
- flowtask/interfaces/O365Calendar.py +113 -0
- flowtask/interfaces/O365Client.py +220 -0
- flowtask/interfaces/OneDrive.py +284 -0
- flowtask/interfaces/Outlook.py +155 -0
- flowtask/interfaces/ParrotBot.py +130 -0
- flowtask/interfaces/SSHClient.py +378 -0
- flowtask/interfaces/Sharepoint.py +496 -0
- flowtask/interfaces/__init__.py +36 -0
- flowtask/interfaces/azureauth.py +119 -0
- flowtask/interfaces/cache.py +201 -0
- flowtask/interfaces/client.py +82 -0
- flowtask/interfaces/compress.py +525 -0
- flowtask/interfaces/credentials.py +124 -0
- flowtask/interfaces/d2l.py +239 -0
- flowtask/interfaces/databases/__init__.py +5 -0
- flowtask/interfaces/databases/db.py +223 -0
- flowtask/interfaces/databases/documentdb.py +55 -0
- flowtask/interfaces/databases/rethink.py +39 -0
- flowtask/interfaces/dataframes/__init__.py +11 -0
- flowtask/interfaces/dataframes/abstract.py +21 -0
- flowtask/interfaces/dataframes/arrow.py +71 -0
- flowtask/interfaces/dataframes/dt.py +69 -0
- flowtask/interfaces/dataframes/pandas.py +167 -0
- flowtask/interfaces/dataframes/polars.py +60 -0
- flowtask/interfaces/db.py +263 -0
- flowtask/interfaces/env.py +46 -0
- flowtask/interfaces/func.py +137 -0
- flowtask/interfaces/http.py +1780 -0
- flowtask/interfaces/locale.py +40 -0
- flowtask/interfaces/log.py +75 -0
- flowtask/interfaces/mask.py +143 -0
- flowtask/interfaces/notification.py +154 -0
- flowtask/interfaces/playwright.py +339 -0
- flowtask/interfaces/powerpoint.py +368 -0
- flowtask/interfaces/py.typed +0 -0
- flowtask/interfaces/qs.py +376 -0
- flowtask/interfaces/result.py +87 -0
- flowtask/interfaces/selenium_service.py +779 -0
- flowtask/interfaces/smartsheet.py +154 -0
- flowtask/interfaces/stat.py +39 -0
- flowtask/interfaces/task.py +96 -0
- flowtask/interfaces/template.py +118 -0
- flowtask/interfaces/vectorstores/__init__.py +1 -0
- flowtask/interfaces/vectorstores/abstract.py +133 -0
- flowtask/interfaces/vectorstores/milvus.py +669 -0
- flowtask/interfaces/zammad.py +107 -0
- flowtask/models.py +193 -0
- flowtask/parsers/__init__.py +15 -0
- flowtask/parsers/_yaml.c +11978 -0
- flowtask/parsers/_yaml.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/argparser.py +235 -0
- flowtask/parsers/base.c +15155 -0
- flowtask/parsers/base.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/json.c +11968 -0
- flowtask/parsers/json.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/maps.py +49 -0
- flowtask/parsers/toml.c +11968 -0
- flowtask/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/plugins/__init__.py +16 -0
- flowtask/plugins/components/__init__.py +0 -0
- flowtask/plugins/handler/__init__.py +45 -0
- flowtask/plugins/importer.py +31 -0
- flowtask/plugins/sources/__init__.py +0 -0
- flowtask/runner.py +283 -0
- flowtask/scheduler/__init__.py +9 -0
- flowtask/scheduler/functions.py +493 -0
- flowtask/scheduler/handlers/__init__.py +8 -0
- flowtask/scheduler/handlers/manager.py +504 -0
- flowtask/scheduler/handlers/models.py +58 -0
- flowtask/scheduler/handlers/service.py +72 -0
- flowtask/scheduler/notifications.py +65 -0
- flowtask/scheduler/scheduler.py +993 -0
- flowtask/services/__init__.py +0 -0
- flowtask/services/bots/__init__.py +0 -0
- flowtask/services/bots/telegram.py +264 -0
- flowtask/services/files/__init__.py +11 -0
- flowtask/services/files/manager.py +522 -0
- flowtask/services/files/model.py +37 -0
- flowtask/services/files/service.py +767 -0
- flowtask/services/jira/__init__.py +3 -0
- flowtask/services/jira/jira_actions.py +191 -0
- flowtask/services/tasks/__init__.py +13 -0
- flowtask/services/tasks/launcher.py +213 -0
- flowtask/services/tasks/manager.py +323 -0
- flowtask/services/tasks/service.py +275 -0
- flowtask/services/tasks/task_manager.py +376 -0
- flowtask/services/tasks/tasks.py +155 -0
- flowtask/storages/__init__.py +16 -0
- flowtask/storages/exceptions.py +12 -0
- flowtask/storages/files/__init__.py +8 -0
- flowtask/storages/files/abstract.py +29 -0
- flowtask/storages/files/filesystem.py +66 -0
- flowtask/storages/tasks/__init__.py +19 -0
- flowtask/storages/tasks/abstract.py +26 -0
- flowtask/storages/tasks/database.py +33 -0
- flowtask/storages/tasks/filesystem.py +108 -0
- flowtask/storages/tasks/github.py +119 -0
- flowtask/storages/tasks/memory.py +45 -0
- flowtask/storages/tasks/row.py +25 -0
- flowtask/tasks/__init__.py +0 -0
- flowtask/tasks/abstract.py +526 -0
- flowtask/tasks/command.py +118 -0
- flowtask/tasks/pile.py +486 -0
- flowtask/tasks/py.typed +0 -0
- flowtask/tasks/task.py +778 -0
- flowtask/template/__init__.py +161 -0
- flowtask/tests.py +257 -0
- flowtask/types/__init__.py +8 -0
- flowtask/types/typedefs.c +11347 -0
- flowtask/types/typedefs.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/__init__.py +24 -0
- flowtask/utils/constants.py +117 -0
- flowtask/utils/encoders.py +21 -0
- flowtask/utils/executor.py +112 -0
- flowtask/utils/functions.cpp +14280 -0
- flowtask/utils/functions.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/json.cpp +13349 -0
- flowtask/utils/json.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/mail.py +63 -0
- flowtask/utils/parseqs.c +13324 -0
- flowtask/utils/parserqs.cpython-310-x86_64-linux-gnu.so +0 -0
- flowtask/utils/stats.py +308 -0
- flowtask/utils/transformations.py +74 -0
- flowtask/utils/uv.py +12 -0
- flowtask/utils/validators.py +97 -0
- flowtask/version.py +11 -0
- flowtask-5.8.4.dist-info/LICENSE +201 -0
- flowtask-5.8.4.dist-info/METADATA +209 -0
- flowtask-5.8.4.dist-info/RECORD +470 -0
- flowtask-5.8.4.dist-info/WHEEL +6 -0
- flowtask-5.8.4.dist-info/entry_points.txt +3 -0
- flowtask-5.8.4.dist-info/top_level.txt +2 -0
- plugins/components/CreateQR.py +39 -0
- plugins/components/TestComponent.py +28 -0
- plugins/components/Use1.py +13 -0
- plugins/components/Workplace.py +117 -0
- plugins/components/__init__.py +3 -0
- plugins/sources/__init__.py +0 -0
- plugins/sources/get_populartimes.py +78 -0
- plugins/sources/google.py +150 -0
- plugins/sources/hubspot.py +679 -0
- plugins/sources/icims.py +679 -0
- plugins/sources/mobileinsight.py +501 -0
- plugins/sources/newrelic.py +262 -0
- plugins/sources/uap.py +268 -0
- plugins/sources/venu.py +244 -0
- plugins/sources/vocinity.py +314 -0
@@ -0,0 +1,194 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import List
|
3
|
+
from collections.abc import Callable
|
4
|
+
import importlib
|
5
|
+
from pathlib import Path, PurePath
|
6
|
+
from parrot.loaders import AbstractLoader, Document
|
7
|
+
from parrot.llms.vertex import VertexLLM
|
8
|
+
from ..flow import FlowComponent
|
9
|
+
from ...exceptions import ConfigError, ComponentError
|
10
|
+
from ...conf import (
|
11
|
+
DEFAULT_LLM_MODEL,
|
12
|
+
DEFAULT_LLM_TEMPERATURE
|
13
|
+
)
|
14
|
+
|
15
|
+
class LangchainLoader(FlowComponent):
    """LangchainLoader.

    Overview:

        Takes a file or a directory of files and converts them into
        Langchain Documents, stored as the component result.

    Example:

    ```yaml
    LangchainLoader:
      path: /home/ubuntu/symbits/lg/bot/products_positive
      source_type: Product-Top-Reviews
      loader: HTMLLoader
      chunk_size: 2048
      elements:
        - div: .product
    ```

    """

    def __init__(
        self,
        loop: asyncio.AbstractEventLoop = None,
        job: Callable = None,
        stat: Callable = None,
        **kwargs,
    ):
        """Read the component options from *kwargs* and initialise the base class.

        Options consumed here (removed from *kwargs*): ``extensions``, ``path``,
        ``skip_directories``, ``embed_size``, ``source_type``, ``doctype`` and
        ``llm``. ``encoding``, ``chunk_size``, ``device`` and ``cuda_device``
        are read but left in *kwargs* for the base class.
        """
        self.extensions: list = kwargs.pop('extensions', [])
        self.encoding: str = kwargs.get('encoding', 'utf-8')
        self.path: str = kwargs.pop('path', None)
        self.skip_directories: List[str] = kwargs.pop('skip_directories', [])
        self._chunk_size = kwargs.get('chunk_size', 2048)
        self._embed_size: int = kwargs.pop('embed_size', 768)
        self.source_type: str = kwargs.pop('source_type', 'document')
        self.doctype: str = kwargs.pop('doctype', 'document')
        # LLM (if required)
        self._llm = kwargs.pop('llm', None)
        super().__init__(
            loop=loop, job=job, stat=stat, **kwargs
        )
        self._device: str = kwargs.get('device', 'cpu')
        self._cuda_number: int = kwargs.get('cuda_device', 0)
        # Cache of loader instances, to avoid instantiating the same loader
        # several times.
        self._caching_loaders: dict = {}

    async def close(self):
        """Release resources held by the component (currently nothing)."""
        # Destroy effectively all Models.
        pass

    def get_default_llm(self):
        """Return a VertexLLM instance built from the configured defaults."""
        return VertexLLM(
            model=DEFAULT_LLM_MODEL,
            temperature=DEFAULT_LLM_TEMPERATURE,
            top_k=30,
            top_p=0.5,
        )

    async def start(self, **kwargs):
        """Resolve and validate *path* before the component runs.

        Raises:
            ConfigError: if no *path* was configured.
            ComponentError: if the resolved path does not exist.
        """
        await super().start(**kwargs)
        if self.path:
            if isinstance(self.path, str):
                self.path = self.mask_replacement_recursively(self.path)
            self.path = Path(self.path).resolve()
            if not self.path.exists():
                raise ComponentError(
                    f"Langchain: {self.path} doesn't exists."
                )
        else:
            raise ConfigError(
                "Provide at least one directory or filename in *path* attribute."
            )

    def _loader_arguments(self, **kwargs) -> dict:
        """Build the keyword arguments passed to a loader constructor.

        Entries in *kwargs* override the common arguments of the same name.
        """
        return {
            "markdown_splitter": self._md_splitter,
            "summarization_model": self.summarization_model,
            "device": self._device,
            "cuda_number": self._cuda_number,
            "source_type": self.source_type,
            "encoding": self.encoding,
            "llm": self._llm,
            **kwargs,
        }

    def _matching_files(self):
        """Yield the files directly under ``self.path`` that should be loaded.

        One glob per configured extension is used, or ``*.*`` when no
        extension was configured. Entries that are not regular files, or whose
        path contains a component listed in ``skip_directories``, are skipped.
        """
        if self.extensions:
            patterns = [f'*{ext}' for ext in self.extensions]
        else:
            patterns = ['*.*']
        for pattern in patterns:
            for item in self.path.glob(pattern):
                if item.is_file() and set(item.parts).isdisjoint(self.skip_directories):
                    yield item

    def _get_loader(self, suffix: str, **kwargs):
        """Get a Document Loader based on the file suffix.

        Not implemented yet: no suffix-to-loader mapping exists, so this
        always returns ``None`` and callers must handle that.

        TODO: a more automated way using importlib.
        """
        return None

    def _load_loader(self, name: str, **kwargs):
        """Dynamically import a loader class from the loaders module and instantiate it.

        Args:
            name: The name of the loader class to import (e.g., 'QALoader').
            **kwargs: Extra constructor arguments; they override the common ones.

        Returns:
            The loader instance, which is also stored in the loader cache
            under *name*.

        Raises:
            ImportError: if the loaders module or the class cannot be resolved,
                or if an AttributeError is raised while building the loader.
        """
        # NOTE(review): the loaders/__init__.py shipped in this package appears
        # to have every export commented out -- confirm *name* can resolve.
        try:
            module = importlib.import_module(".loaders", package=__package__)
            cls = getattr(module, name)
            if not cls:
                # Previously fell through and returned None silently.
                raise ImportError(
                    f"Unable to load the loader '{name}': empty attribute."
                )
            loader = cls(**self._loader_arguments(**kwargs))
            self._caching_loaders[name] = loader
            return loader
        except (ModuleNotFoundError, AttributeError) as e:
            raise ImportError(
                f"Unable to load the loader '{name}': {e}"
            ) from e

    async def _load_document(self, path: PurePath) -> List[Document]:
        """Load one file with the loader associated to its suffix.

        Raises:
            ComponentError: if no loader can be resolved for the suffix.
        """
        documents = []
        suffix = path.suffix
        loader = self._caching_loaders.get(suffix)
        if loader is None:
            loader = self._get_loader(suffix)
            if loader is None:
                # Fail with a clear message instead of caching None and
                # crashing on ``async with None``.
                raise ComponentError(
                    f"Langchain: no loader available for '{suffix}' files, "
                    "set the *loader* attribute."
                )
            self._caching_loaders[suffix] = loader
        async with loader as ld:
            documents = await ld.load(path)
        # split or not split?
        return documents

    async def run(self):
        """Load every selected file and store the documents as the result.

        When a ``loader`` attribute is configured, that loader is used for all
        files; otherwise a loader is resolved per file suffix.

        Returns:
            True; the documents are left in ``self._result``.

        Raises:
            ValueError: if *path* is neither a directory nor a regular file.
        """
        documents = []
        if hasattr(self, 'loader'):
            print('PARAMS >> ', self._attrs)
            loader = self._load_loader(self.loader, **self._attrs)
            async with loader as ld:
                if self.path.is_dir():
                    for item in self._matching_files():
                        documents.extend(await ld.load(item))
                else:
                    documents = await ld.load(self.path)
        elif self.path.is_dir():
            for item in self._matching_files():
                documents.extend(await self._load_document(item))
        elif self.path.is_file():
            # A single file is only loaded when its suffix is explicitly
            # listed in *extensions* and it is not inside a skipped directory.
            if self.path.suffix in self.extensions:
                if set(self.path.parts).isdisjoint(self.skip_directories):
                    documents = await self._load_document(self.path)
        else:
            # NOTE(review): assumed to pair with the is_dir/is_file chain --
            # confirm against the original indentation.
            raise ValueError(
                f"Langchain Loader: Invalid path: {self.path}"
            )
        self._result = documents
        self.add_metric('NUM_DOCUMENTS', len(documents))
        # return self._result
        return True
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# """
|
2
|
+
# Langchain Loaders.
|
3
|
+
|
4
|
+
# Basic Documents Loaders, adapted to be used in Flowtask Tasks.
|
5
|
+
# """
|
6
|
+
# from .docx import MSWordLoader
|
7
|
+
# from .qa import QAFileLoader
|
8
|
+
# from .pdfmark import PDFMarkdown
|
9
|
+
# from .pdftables import PDFTables
|
10
|
+
# from .pdfblocks import PDFBlocks
|
11
|
+
# from .txt import TXTLoader
|
12
|
+
# from .html import HTMLLoader
|
13
|
+
|
14
|
+
# __all__ = (
|
15
|
+
# "MSWordLoader",
|
16
|
+
# "QAFileLoader",
|
17
|
+
# "PDFMarkdown",
|
18
|
+
# "PDFTables",
|
19
|
+
# "TXTLoader",
|
20
|
+
# "PDFBlocks",
|
21
|
+
# "HTMLLoader",
|
22
|
+
# )
|
@@ -0,0 +1,362 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Union, List, Optional
|
3
|
+
from collections.abc import Callable
|
4
|
+
from datetime import datetime
|
5
|
+
from pathlib import Path, PurePath
|
6
|
+
import torch
|
7
|
+
from langchain.docstore.document import Document
|
8
|
+
from langchain.chains.summarize import load_summarize_chain
|
9
|
+
from langchain.text_splitter import (
|
10
|
+
TokenTextSplitter
|
11
|
+
)
|
12
|
+
from langchain_huggingface import HuggingFacePipeline
|
13
|
+
from transformers import (
|
14
|
+
AutoModelForSeq2SeqLM,
|
15
|
+
AutoTokenizer,
|
16
|
+
pipeline
|
17
|
+
)
|
18
|
+
from langchain_core.prompts import PromptTemplate
|
19
|
+
from navconfig.logging import logging
|
20
|
+
from navigator.libs.json import JSONContent # pylint: disable=E0611
|
21
|
+
from parrot.llms.vertex import VertexLLM
|
22
|
+
from ....conf import (
|
23
|
+
EMBEDDING_DEVICE,
|
24
|
+
DEFAULT_LLM_MODEL,
|
25
|
+
DEFAULT_LLM_TEMPERATURE,
|
26
|
+
)
|
27
|
+
|
28
|
+
class AbstractLoader(ABC):
|
29
|
+
"""
|
30
|
+
Abstract class for Document loaders.
|
31
|
+
"""
|
32
|
+
_extension: List[str] = []
|
33
|
+
|
34
|
+
def __init__(
    self,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    summarizer: Union[str, Callable] = None,
    markdown_splitter: Union[str, Callable] = None,
    source_type: str = 'file',
    doctype: Optional[str] = 'document',
    device: str = None,
    cuda_number: int = 0,
    llm: Callable = None,
    **kwargs
):
    """Store the loader configuration.

    The explicit arguments are kept as received. The torch device is not
    resolved here: ``_device`` starts as ``None``.

    Keyword options read from *kwargs*: ``path`` (removed from *kwargs*),
    ``encoding``, ``summarization_model``, ``no_summarization``,
    ``translation`` and ``category``.
    """
    # Text-processing collaborators.
    self.tokenizer = tokenizer
    self.text_splitter = text_splitter
    self.markdown_splitter = markdown_splitter
    self._summary_model = summarizer
    # LLM (if required)
    self._llm = llm

    # Document classification.
    self.doctype = doctype
    self._source_type = source_type
    self.category: str = kwargs.get('category', 'document')

    # Device selection: name and CUDA index only.
    self.device_name = device
    self.cuda_number = cuda_number
    self._device = None

    # Remaining options taken from the keyword arguments.
    self.path = kwargs.pop('path', None)
    self.encoding: str = kwargs.get('encoding', 'utf-8')
    self.summarization_model = kwargs.get(
        'summarization_model',
        "facebook/bart-large-cnn"
    )
    self._no_summarization = kwargs.get('no_summarization', False)
    self._translation = kwargs.get('translation', False)

    # Per-class logger and JSON encoder.
    logger_name = f"Loader.{self.__class__.__name__}"
    self.logger = logging.getLogger(logger_name)
    self._encoder = JSONContent()
|
72
|
+
|
73
|
+
async def __aenter__(self):
    """Enter the async context: resolve the torch device and return the loader."""
    selected = self._get_device(self.device_name, self.cuda_number)
    self._device = selected
    return self
|
80
|
+
|
81
|
+
def supported_extensions(self):
    """Return the list of file suffixes declared in ``_extension``."""
    extensions = self._extension
    return extensions
|
83
|
+
|
84
|
+
async def __aexit__(self, *exc_info):
    """Leave the async context by running ``post_load()``.

    Returns ``None``, so an exception raised inside the ``async with``
    body is not suppressed.
    """
    cleanup = self.post_load
    cleanup()
|
86
|
+
|
87
|
+
def post_load(self):
    """Drop per-load helpers and release cached GPU memory.

    The tokenizer and text splitter references are cleared. CUDA
    clean-up only runs when CUDA is available: calling
    ``torch.cuda.synchronize()`` on a host without CUDA raises, which
    previously made leaving the async context fail on CPU-only machines.
    """
    self.tokenizer = None  # Reset the tokenizer
    self.text_splitter = None  # Reset the text splitter
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # Wait for all kernels to finish
        torch.cuda.empty_cache()  # Clear unused memory
|
92
|
+
|
93
|
+
def _get_device(self, device_type: str = None, cuda_number: int = 0):
    """Pick the torch device used by Torch and transformers.

    An explicit ``'cpu'`` or ``'cuda'`` request is honoured as-is. Any
    other value triggers auto-detection: CUDA first, then the MPS
    backend, and finally the configured ``EMBEDDING_DEVICE``.

    Args:
        device_type: Requested device name, or None to auto-detect.
        cuda_number: Index appended to ``cuda:`` when a CUDA device is chosen.

    Returns:
        torch.device: The selected device.
    """
    cuda_name = f'cuda:{cuda_number}'
    if device_type == 'cpu':
        return torch.device('cpu')
    if device_type == 'cuda' or torch.cuda.is_available():
        # Explicit request, or a CUDA GPU was detected.
        return torch.device(cuda_name)
    if torch.backends.mps.is_available():
        # MPS backend (Metal Performance Shaders on Apple hardware).
        return torch.device("mps")
    # Nothing detected: fall back to the configured embedding device.
    fallback = cuda_name if EMBEDDING_DEVICE == 'cuda' else EMBEDDING_DEVICE
    return torch.device(fallback)
|
112
|
+
|
113
|
+
def _check_path(
    self,
    path: PurePath,
    suffix: Optional[List[str]] = None
) -> bool:
    """Check that a path is an existing file with an accepted suffix.

    Args:
        path (PurePath): The path to the file. Strings are resolved to an
            absolute ``Path``; pure paths are converted to a concrete
            ``Path`` so the filesystem can be queried.
        suffix (Optional[List[str]]): Accepted suffixes (with the leading
            dot). Defaults to the loader's ``_extension`` list.
    Returns:
        bool: True if the path is an existing regular file whose suffix
            is in ``suffix``, False otherwise.
    """
    if isinstance(path, str):
        path = Path(path).resolve()
    elif isinstance(path, PurePath) and not isinstance(path, Path):
        # A pure path has no exists()/is_file(); make it concrete first.
        path = Path(path)
    if not suffix:
        suffix = self._extension
    # is_file() is already False for a missing path.
    return path.is_file() and path.suffix in suffix
|
129
|
+
|
130
|
+
def create_metadata(
    self,
    path: Union[str, PurePath],
    doctype: str = 'document',
    source_type: str = 'source',
    doc_metadata: Optional[dict] = None,
    summary: Optional[str] = '',
    **kwargs
):
    """Build the metadata dictionary attached to a loaded document.

    Args:
        path: Origin of the document. A path object contributes its file
            name to ``source`` and ``url``; any other value is used
            verbatim for ``source`` and ``url``.
        doctype: Stored under ``type``.
        source_type: Stored under ``source_type``.
        doc_metadata: Extra fields copied into ``document_meta``.
        summary: Stored under ``summary``.
        **kwargs: Additional top-level entries; on a key clash they
            replace the computed value.

    Returns:
        dict: The metadata; ``created_at`` holds the current local time.
    """
    if isinstance(path, PurePath):
        origin, url, filename = path.name, f'file://{path.name}', path
    else:
        origin, url, filename = path, path, f'file://{path}'
    nested = {**doc_metadata} if doc_metadata else {}
    stamp = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
    metadata = {
        "url": url,
        "source": origin,
        "filename": str(filename),
        "type": doctype,
        "question": '',
        "answer": '',
        "summary": summary,
        "source_type": source_type,
        "created_at": stamp,
        "category": self.category,
        "document_meta": nested,
    }
    # Caller-supplied extras win over the computed entries.
    metadata.update(kwargs)
    return metadata
|
166
|
+
|
167
|
+
def get_default_llm(self):
    """Create a VertexLLM using the configured default model and temperature."""
    settings = {
        "model": DEFAULT_LLM_MODEL,
        "temperature": DEFAULT_LLM_TEMPERATURE,
        "top_k": 30,
        "top_p": 0.5,
    }
    return VertexLLM(**settings)
|
175
|
+
|
176
|
+
def get_summary_from_text(self, text: str, use_gpu: bool = False) -> str:
    """Summarize a text with a "refine" summarization chain.

    The chain always runs on the LLM returned by ``get_default_llm()``;
    ``self._llm`` and ``use_gpu`` are currently not consulted.

    Args:
        text: The text to summarize.
        use_gpu: Accepted for interface compatibility; unused.

    Returns:
        str: The summary, or an empty string when ``text`` is empty or
            the chain fails (the failure is logged, not raised).
    """
    if not text:
        # NO data to be summarized
        return ''
    prompt_template = (
        "Write a summary of the following, please also identify the main theme:\n"
        "{text}\n"
        "SUMMARY:"
    )
    prompt = PromptTemplate.from_template(prompt_template)
    # Adjacent literals are concatenated verbatim, so sentences that
    # continue on the next literal need an explicit trailing space.
    refine_template = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary adding more explanation. "
        "If the context isn't useful, return the original summary."
    )
    refine_prompt = PromptTemplate.from_template(refine_template)
    llm = self.get_default_llm()
    llm = llm.get_llm()
    summarize_chain = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    doc = Document(page_content=text)
    try:
        summary = summarize_chain.invoke(
            {"input_documents": [doc]}, return_only_outputs=True
        )
        return summary.get('output_text', '')
    except Exception as e:
        # Best effort: a failed summary must not abort the load, but the
        # failure goes through the loader's logger instead of stdout.
        self.logger.error('ERROR in get_summary_from_text: %s', e)
        return ""
|
232
|
+
|
233
|
+
def get_translator(self, model_name: str = 'Helsinki-NLP/opus-mt-en-es'):
    """Build a translation pipeline, or return None when translation is off.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for
            both the seq2seq model and its tokenizer.

    Returns:
        The "translation" pipeline, or None if ``self._translation`` is falsy.
    """
    if not self._translation:
        return None
    # NOTE(review): trust_remote_code=True lets the model repository run
    # its own code at load time; confirm model_name always comes from a
    # trusted source.
    trans_model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    trans_tokenizer = AutoTokenizer.from_pretrained(model_name)
    translator = pipeline(
        "translation",
        model=trans_model,
        tokenizer=trans_tokenizer,
        # batch_size is an integer; the former `True` only worked where a
        # bool is accepted in place of 1.
        batch_size=1,
        max_new_tokens=500,
        min_new_tokens=300,
        use_fast=True
    )
    return translator
|
253
|
+
|
254
|
+
def get_summarization_model(
    self,
    model_name: str = 'facebook/bart-large-cnn',
    use_gpu: bool = False
):
    """Return the summarization model, building and caching it on first use.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_gpu: When True, the freshly loaded model is moved to CUDA.

    Returns:
        The cached model stored on ``self._summary_model``, or None when
        ``self._no_summarization`` is exactly True.
    """
    if self._no_summarization is True:
        return None
    if self._summary_model:
        # Already supplied by the caller or built by an earlier call.
        return self._summary_model

    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    if use_gpu:
        model.cuda()
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="left"
    )
    summary_pipeline = pipeline(
        "summarization",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        use_fast=True
    )
    self._summary_model = HuggingFacePipeline(
        model_id=model_name,
        pipeline=summary_pipeline,
        verbose=True
    )
    return self._summary_model
|
291
|
+
|
292
|
+
def resolve_paths(self, path: Union[str, PurePath, List[PurePath]]) -> List[Path]:
    """
    Resolve the input path into a list of file paths.
    Handles lists, directories, glob patterns, and single file paths.

    A string containing ``*`` is treated as a glob pattern. Absolute
    patterns are globbed from their filesystem root, because
    ``Path.glob`` rejects non-relative patterns. Directories are expanded
    (non-recursively) to the entries matching the loader's
    ``_extension`` suffixes, or to every entry when no suffix is
    declared. Entries that are neither a directory nor a file are dropped.

    Args:
        path (Union[str, PurePath, List[PurePath]]): Input path(s).

    Returns:
        List[Path]: A list of resolved file paths.
    """
    if isinstance(path, str):
        if "*" in path:
            # Glob pattern
            pattern = Path(path)
            if pattern.is_absolute():
                # Split off the anchor so the rest is a relative pattern.
                root = Path(pattern.anchor)
                candidates = list(root.glob(str(pattern.relative_to(root))))
            else:
                candidates = list(Path().glob(path))
        else:
            # Single path as string
            candidates = [Path(path)]
    elif isinstance(path, PurePath):
        # Single Path
        candidates = [Path(path)]
    elif isinstance(path, list):
        # List of paths
        candidates = [Path(p) for p in path]
    else:
        candidates = []

    final_paths = []
    for p in candidates:
        if p.is_dir():
            # Add all matching files in the directory
            patterns = (
                [f"*{ext}" for ext in self._extension]
                if self._extension else ["*"]
            )
            for file_pattern in patterns:
                final_paths.extend(p.glob(file_pattern))
        elif p.is_file():
            final_paths.append(p)

    return final_paths
|
332
|
+
|
333
|
+
async def load(self, path: Union[str, PurePath, List[PurePath]]) -> List[Document]:
    """Load every file resolved from ``path`` into Langchain Documents.

    Files are processed one after another through ``_load_document``;
    resolved paths that no longer exist are skipped.

    Args:
        path (Union[str, PurePath, List[PurePath]]): The source of the data.

    Returns:
        List[Document]: The documents of all files, in resolution order.
    """
    self.logger.info(f"Loading file: {path}")
    documents = []
    for candidate in self.resolve_paths(path):
        if not candidate.exists():
            continue
        loaded = await self._load_document(candidate)
        documents.extend(loaded)
    return documents
|
351
|
+
|
352
|
+
async def _load_document(self, path: Path) -> List:
    """
    Abstract hook for loading a single document.

    Loaders that rely on the default ``load()`` must override this
    method. The base version raises instead of returning ``None``, so a
    missing override is reported clearly rather than surfacing as a
    ``TypeError`` when ``load()`` tries to extend its result list.

    Args:
        path (Path): The path to the file.

    Returns:
        List: A list of Langchain documents.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError(
        f"{type(self).__name__} must implement _load_document()"
    )
|
@@ -0,0 +1,50 @@
|
|
1
|
+
from abc import abstractmethod
|
2
|
+
from typing import List, Union
|
3
|
+
from pathlib import Path, PurePath
|
4
|
+
from markdownify import markdownify as md
|
5
|
+
from langchain.docstore.document import Document
|
6
|
+
from .abstract import AbstractLoader
|
7
|
+
|
8
|
+
|
9
|
+
class BasePDF(AbstractLoader):
    """
    Common base for PDF loaders: subclasses supply ``_load_pdf``.
    """
    _extension = ['.pdf']
    chunk_size = 768

    def __init__(self, **kwargs):
        """Set the default language code, then run the base loader setup."""
        self._lang = 'eng'
        super().__init__(**kwargs)

    @abstractmethod
    def _load_pdf(self, path: Path) -> list:
        """
        Load one PDF file; implemented by concrete loaders.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """

    async def load(self, path: Union[str, PurePath, List[PurePath]]) -> List[Document]:
        """Load every PDF resolved from ``path`` into Langchain Documents.

        Each existing file is passed to ``_load_pdf`` (a synchronous
        call); resolved paths that no longer exist are skipped.

        Args:
            path (Union[str, PurePath, List[PurePath]]): The source of the data.

        Returns:
            List[Document]: The documents of all files, in resolution order.
        """
        self.logger.info(f"Loading file: {path}")
        return [
            document
            for pdf_path in self.resolve_paths(path)
            if pdf_path.exists()
            for document in self._load_pdf(pdf_path)
        ]
|