flowtask 5.8.4 (cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowtask/__init__.py +93 -0
- flowtask/__main__.py +38 -0
- flowtask/bots/__init__.py +6 -0
- flowtask/bots/check.py +93 -0
- flowtask/bots/codebot.py +51 -0
- flowtask/components/ASPX.py +148 -0
- flowtask/components/AddDataset.py +352 -0
- flowtask/components/Amazon.py +523 -0
- flowtask/components/AutoTask.py +314 -0
- flowtask/components/Azure.py +80 -0
- flowtask/components/AzureUsers.py +106 -0
- flowtask/components/BaseAction.py +91 -0
- flowtask/components/BaseLoop.py +198 -0
- flowtask/components/BestBuy.py +800 -0
- flowtask/components/CSVToGCS.py +120 -0
- flowtask/components/CompanyScraper/__init__.py +1 -0
- flowtask/components/CompanyScraper/parsers/__init__.py +6 -0
- flowtask/components/CompanyScraper/parsers/base.py +102 -0
- flowtask/components/CompanyScraper/parsers/explorium.py +192 -0
- flowtask/components/CompanyScraper/parsers/leadiq.py +206 -0
- flowtask/components/CompanyScraper/parsers/rocket.py +133 -0
- flowtask/components/CompanyScraper/parsers/siccode.py +109 -0
- flowtask/components/CompanyScraper/parsers/visualvisitor.py +130 -0
- flowtask/components/CompanyScraper/parsers/zoominfo.py +118 -0
- flowtask/components/CompanyScraper/scrapper.py +1054 -0
- flowtask/components/CopyTo.py +177 -0
- flowtask/components/CopyToBigQuery.py +243 -0
- flowtask/components/CopyToMongoDB.py +291 -0
- flowtask/components/CopyToPg.py +609 -0
- flowtask/components/CopyToRethink.py +207 -0
- flowtask/components/CreateGCSBucket.py +102 -0
- flowtask/components/CreateReport/CreateReport.py +228 -0
- flowtask/components/CreateReport/__init__.py +9 -0
- flowtask/components/CreateReport/charts/__init__.py +15 -0
- flowtask/components/CreateReport/charts/bar.py +51 -0
- flowtask/components/CreateReport/charts/base.py +66 -0
- flowtask/components/CreateReport/charts/pie.py +64 -0
- flowtask/components/CreateReport/utils.py +9 -0
- flowtask/components/CustomerSatisfaction.py +196 -0
- flowtask/components/DataInput.py +200 -0
- flowtask/components/DateList.py +255 -0
- flowtask/components/DbClient.py +163 -0
- flowtask/components/DialPad.py +146 -0
- flowtask/components/DocumentDBQuery.py +200 -0
- flowtask/components/DownloadFrom.py +371 -0
- flowtask/components/DownloadFromD2L.py +113 -0
- flowtask/components/DownloadFromFTP.py +181 -0
- flowtask/components/DownloadFromIMAP.py +315 -0
- flowtask/components/DownloadFromS3.py +198 -0
- flowtask/components/DownloadFromSFTP.py +265 -0
- flowtask/components/DownloadFromSharepoint.py +110 -0
- flowtask/components/DownloadFromSmartSheet.py +114 -0
- flowtask/components/DownloadS3File.py +229 -0
- flowtask/components/Dummy.py +59 -0
- flowtask/components/DuplicatePhoto.py +411 -0
- flowtask/components/EmployeeEvaluation.py +237 -0
- flowtask/components/ExecuteSQL.py +323 -0
- flowtask/components/ExtractHTML.py +178 -0
- flowtask/components/FileBase.py +178 -0
- flowtask/components/FileCopy.py +181 -0
- flowtask/components/FileDelete.py +82 -0
- flowtask/components/FileExists.py +146 -0
- flowtask/components/FileIteratorDelete.py +112 -0
- flowtask/components/FileList.py +194 -0
- flowtask/components/FileOpen.py +75 -0
- flowtask/components/FileRead.py +120 -0
- flowtask/components/FileRename.py +106 -0
- flowtask/components/FilterIf.py +284 -0
- flowtask/components/FilterRows/FilterRows.py +200 -0
- flowtask/components/FilterRows/__init__.py +10 -0
- flowtask/components/FilterRows/functions.py +4 -0
- flowtask/components/GCSToBigQuery.py +103 -0
- flowtask/components/GoogleA4.py +150 -0
- flowtask/components/GoogleGeoCoding.py +344 -0
- flowtask/components/GooglePlaces.py +315 -0
- flowtask/components/GoogleSearch.py +539 -0
- flowtask/components/HTTPClient.py +268 -0
- flowtask/components/ICIMS.py +146 -0
- flowtask/components/IF.py +179 -0
- flowtask/components/IcimsFolderCopy.py +173 -0
- flowtask/components/ImageFeatures/__init__.py +5 -0
- flowtask/components/ImageFeatures/process.py +233 -0
- flowtask/components/IteratorBase.py +251 -0
- flowtask/components/LangchainLoader/__init__.py +5 -0
- flowtask/components/LangchainLoader/loader.py +194 -0
- flowtask/components/LangchainLoader/loaders/__init__.py +22 -0
- flowtask/components/LangchainLoader/loaders/abstract.py +362 -0
- flowtask/components/LangchainLoader/loaders/basepdf.py +50 -0
- flowtask/components/LangchainLoader/loaders/docx.py +91 -0
- flowtask/components/LangchainLoader/loaders/html.py +119 -0
- flowtask/components/LangchainLoader/loaders/pdfblocks.py +146 -0
- flowtask/components/LangchainLoader/loaders/pdfmark.py +79 -0
- flowtask/components/LangchainLoader/loaders/pdftables.py +135 -0
- flowtask/components/LangchainLoader/loaders/qa.py +67 -0
- flowtask/components/LangchainLoader/loaders/txt.py +55 -0
- flowtask/components/LeadIQ.py +650 -0
- flowtask/components/Loop.py +253 -0
- flowtask/components/Lowes.py +334 -0
- flowtask/components/MS365Usage.py +156 -0
- flowtask/components/MSTeamsMessages.py +320 -0
- flowtask/components/MarketClustering.py +1051 -0
- flowtask/components/MergeFiles.py +362 -0
- flowtask/components/MilvusOutput.py +87 -0
- flowtask/components/NearByStores.py +175 -0
- flowtask/components/NetworkNinja/__init__.py +6 -0
- flowtask/components/NetworkNinja/models/__init__.py +52 -0
- flowtask/components/NetworkNinja/models/abstract.py +177 -0
- flowtask/components/NetworkNinja/models/account.py +39 -0
- flowtask/components/NetworkNinja/models/client.py +19 -0
- flowtask/components/NetworkNinja/models/district.py +14 -0
- flowtask/components/NetworkNinja/models/events.py +101 -0
- flowtask/components/NetworkNinja/models/forms.py +499 -0
- flowtask/components/NetworkNinja/models/market.py +16 -0
- flowtask/components/NetworkNinja/models/organization.py +34 -0
- flowtask/components/NetworkNinja/models/photos.py +125 -0
- flowtask/components/NetworkNinja/models/project.py +44 -0
- flowtask/components/NetworkNinja/models/region.py +28 -0
- flowtask/components/NetworkNinja/models/store.py +203 -0
- flowtask/components/NetworkNinja/models/user.py +151 -0
- flowtask/components/NetworkNinja/router.py +854 -0
- flowtask/components/Odoo.py +175 -0
- flowtask/components/OdooInjector.py +192 -0
- flowtask/components/OpenFromXML.py +126 -0
- flowtask/components/OpenWeather.py +41 -0
- flowtask/components/OpenWithBase.py +616 -0
- flowtask/components/OpenWithPandas.py +715 -0
- flowtask/components/PGPDecrypt.py +199 -0
- flowtask/components/PandasIterator.py +187 -0
- flowtask/components/PandasToFile.py +189 -0
- flowtask/components/Paradox.py +339 -0
- flowtask/components/ParamIterator.py +117 -0
- flowtask/components/ParseHTML.py +84 -0
- flowtask/components/PlacerStores.py +249 -0
- flowtask/components/Pokemon.py +507 -0
- flowtask/components/PositiveBot.py +62 -0
- flowtask/components/PowerPointSlide.py +400 -0
- flowtask/components/PrintMessage.py +127 -0
- flowtask/components/ProductCompetitors/__init__.py +5 -0
- flowtask/components/ProductCompetitors/parsers/__init__.py +7 -0
- flowtask/components/ProductCompetitors/parsers/base.py +72 -0
- flowtask/components/ProductCompetitors/parsers/bestbuy.py +86 -0
- flowtask/components/ProductCompetitors/parsers/lowes.py +103 -0
- flowtask/components/ProductCompetitors/scrapper.py +155 -0
- flowtask/components/ProductCompliant.py +169 -0
- flowtask/components/ProductInfo/__init__.py +1 -0
- flowtask/components/ProductInfo/parsers/__init__.py +5 -0
- flowtask/components/ProductInfo/parsers/base.py +83 -0
- flowtask/components/ProductInfo/parsers/brother.py +97 -0
- flowtask/components/ProductInfo/parsers/canon.py +167 -0
- flowtask/components/ProductInfo/parsers/epson.py +118 -0
- flowtask/components/ProductInfo/parsers/hp.py +131 -0
- flowtask/components/ProductInfo/parsers/samsung.py +97 -0
- flowtask/components/ProductInfo/scraper.py +319 -0
- flowtask/components/ProductPricing.py +118 -0
- flowtask/components/QS.py +261 -0
- flowtask/components/QSBase.py +201 -0
- flowtask/components/QueryIterator.py +273 -0
- flowtask/components/QueryToInsert.py +327 -0
- flowtask/components/QueryToPandas.py +432 -0
- flowtask/components/RESTClient.py +195 -0
- flowtask/components/RethinkDBQuery.py +189 -0
- flowtask/components/Rsync.py +74 -0
- flowtask/components/RunSSH.py +59 -0
- flowtask/components/RunShell.py +71 -0
- flowtask/components/SalesForce.py +20 -0
- flowtask/components/SaveImageBank/__init__.py +257 -0
- flowtask/components/SchedulingVisits.py +592 -0
- flowtask/components/ScrapPage.py +216 -0
- flowtask/components/ScrapSearch.py +79 -0
- flowtask/components/SendNotify.py +257 -0
- flowtask/components/SentimentAnalysis.py +694 -0
- flowtask/components/ServiceScrapper/__init__.py +5 -0
- flowtask/components/ServiceScrapper/parsers/__init__.py +1 -0
- flowtask/components/ServiceScrapper/parsers/base.py +94 -0
- flowtask/components/ServiceScrapper/parsers/costco.py +93 -0
- flowtask/components/ServiceScrapper/scrapper.py +199 -0
- flowtask/components/SetVariables.py +156 -0
- flowtask/components/SubTask.py +182 -0
- flowtask/components/SuiteCRM.py +48 -0
- flowtask/components/Switch.py +175 -0
- flowtask/components/TableBase.py +148 -0
- flowtask/components/TableDelete.py +312 -0
- flowtask/components/TableInput.py +143 -0
- flowtask/components/TableOutput/TableOutput.py +384 -0
- flowtask/components/TableOutput/__init__.py +3 -0
- flowtask/components/TableSchema.py +534 -0
- flowtask/components/Target.py +223 -0
- flowtask/components/ThumbnailGenerator.py +156 -0
- flowtask/components/ToPandas.py +67 -0
- flowtask/components/TransformRows/TransformRows.py +507 -0
- flowtask/components/TransformRows/__init__.py +9 -0
- flowtask/components/TransformRows/functions.py +559 -0
- flowtask/components/TransposeRows.py +176 -0
- flowtask/components/UPCDatabase.py +86 -0
- flowtask/components/UnGzip.py +171 -0
- flowtask/components/Uncompress.py +172 -0
- flowtask/components/UniqueRows.py +126 -0
- flowtask/components/Unzip.py +107 -0
- flowtask/components/UpdateOperationalVars.py +147 -0
- flowtask/components/UploadTo.py +299 -0
- flowtask/components/UploadToS3.py +136 -0
- flowtask/components/UploadToSFTP.py +160 -0
- flowtask/components/UploadToSharepoint.py +205 -0
- flowtask/components/UserFunc.py +122 -0
- flowtask/components/VivaTracker.py +140 -0
- flowtask/components/WSDLClient.py +123 -0
- flowtask/components/Wait.py +18 -0
- flowtask/components/Walmart.py +199 -0
- flowtask/components/Workplace.py +134 -0
- flowtask/components/XMLToPandas.py +267 -0
- flowtask/components/Zammad/__init__.py +41 -0
- flowtask/components/Zammad/models.py +0 -0
- flowtask/components/ZoomInfoScraper.py +409 -0
- flowtask/components/__init__.py +104 -0
- flowtask/components/abstract.py +18 -0
- flowtask/components/flow.py +530 -0
- flowtask/components/google.py +335 -0
- flowtask/components/group.py +221 -0
- flowtask/components/py.typed +0 -0
- flowtask/components/reviewscrap.py +132 -0
- flowtask/components/tAutoincrement.py +117 -0
- flowtask/components/tConcat.py +109 -0
- flowtask/components/tExplode.py +119 -0
- flowtask/components/tFilter.py +184 -0
- flowtask/components/tGroup.py +236 -0
- flowtask/components/tJoin.py +270 -0
- flowtask/components/tMap/__init__.py +9 -0
- flowtask/components/tMap/functions.py +54 -0
- flowtask/components/tMap/tMap.py +450 -0
- flowtask/components/tMelt.py +112 -0
- flowtask/components/tMerge.py +114 -0
- flowtask/components/tOrder.py +93 -0
- flowtask/components/tPandas.py +94 -0
- flowtask/components/tPivot.py +71 -0
- flowtask/components/tPluckCols.py +76 -0
- flowtask/components/tUnnest.py +82 -0
- flowtask/components/user.py +401 -0
- flowtask/conf.py +457 -0
- flowtask/download.py +102 -0
- flowtask/events/__init__.py +11 -0
- flowtask/events/events/__init__.py +20 -0
- flowtask/events/events/abstract.py +95 -0
- flowtask/events/events/alerts/__init__.py +362 -0
- flowtask/events/events/alerts/colfunctions.py +131 -0
- flowtask/events/events/alerts/functions.py +158 -0
- flowtask/events/events/dummy.py +12 -0
- flowtask/events/events/exec.py +124 -0
- flowtask/events/events/file/__init__.py +7 -0
- flowtask/events/events/file/base.py +51 -0
- flowtask/events/events/file/copy.py +23 -0
- flowtask/events/events/file/delete.py +16 -0
- flowtask/events/events/interfaces/__init__.py +9 -0
- flowtask/events/events/interfaces/client.py +67 -0
- flowtask/events/events/interfaces/credentials.py +28 -0
- flowtask/events/events/interfaces/notifications.py +58 -0
- flowtask/events/events/jira.py +122 -0
- flowtask/events/events/log.py +26 -0
- flowtask/events/events/logerr.py +52 -0
- flowtask/events/events/notify.py +59 -0
- flowtask/events/events/notify_event.py +160 -0
- flowtask/events/events/publish.py +54 -0
- flowtask/events/events/sendfile.py +104 -0
- flowtask/events/events/task.py +97 -0
- flowtask/events/events/teams.py +98 -0
- flowtask/events/events/webhook.py +58 -0
- flowtask/events/manager.py +287 -0
- flowtask/exceptions.c +39393 -0
- flowtask/exceptions.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/extensions/__init__.py +3 -0
- flowtask/extensions/abstract.py +82 -0
- flowtask/extensions/logging/__init__.py +65 -0
- flowtask/hooks/__init__.py +9 -0
- flowtask/hooks/actions/__init__.py +22 -0
- flowtask/hooks/actions/abstract.py +66 -0
- flowtask/hooks/actions/dummy.py +23 -0
- flowtask/hooks/actions/jira.py +74 -0
- flowtask/hooks/actions/rest.py +320 -0
- flowtask/hooks/actions/sampledata.py +37 -0
- flowtask/hooks/actions/sensor.py +23 -0
- flowtask/hooks/actions/task.py +9 -0
- flowtask/hooks/actions/ticket.py +37 -0
- flowtask/hooks/actions/zammad.py +55 -0
- flowtask/hooks/hook.py +62 -0
- flowtask/hooks/models.py +17 -0
- flowtask/hooks/service.py +187 -0
- flowtask/hooks/step.py +91 -0
- flowtask/hooks/types/__init__.py +23 -0
- flowtask/hooks/types/base.py +129 -0
- flowtask/hooks/types/brokers/__init__.py +11 -0
- flowtask/hooks/types/brokers/base.py +54 -0
- flowtask/hooks/types/brokers/mqtt.py +35 -0
- flowtask/hooks/types/brokers/rabbitmq.py +82 -0
- flowtask/hooks/types/brokers/redis.py +83 -0
- flowtask/hooks/types/brokers/sqs.py +44 -0
- flowtask/hooks/types/fs.py +232 -0
- flowtask/hooks/types/http.py +49 -0
- flowtask/hooks/types/imap.py +200 -0
- flowtask/hooks/types/jira.py +279 -0
- flowtask/hooks/types/mail.py +205 -0
- flowtask/hooks/types/postgres.py +98 -0
- flowtask/hooks/types/responses/__init__.py +8 -0
- flowtask/hooks/types/responses/base.py +5 -0
- flowtask/hooks/types/sharepoint.py +288 -0
- flowtask/hooks/types/ssh.py +141 -0
- flowtask/hooks/types/tagged.py +59 -0
- flowtask/hooks/types/upload.py +85 -0
- flowtask/hooks/types/watch.py +71 -0
- flowtask/hooks/types/web.py +36 -0
- flowtask/interfaces/AzureClient.py +137 -0
- flowtask/interfaces/AzureGraph.py +839 -0
- flowtask/interfaces/Boto3Client.py +326 -0
- flowtask/interfaces/DropboxClient.py +173 -0
- flowtask/interfaces/ExcelHandler.py +94 -0
- flowtask/interfaces/FTPClient.py +131 -0
- flowtask/interfaces/GoogleCalendar.py +201 -0
- flowtask/interfaces/GoogleClient.py +133 -0
- flowtask/interfaces/GoogleDrive.py +127 -0
- flowtask/interfaces/GoogleGCS.py +89 -0
- flowtask/interfaces/GoogleGeocoding.py +93 -0
- flowtask/interfaces/GoogleLang.py +114 -0
- flowtask/interfaces/GooglePub.py +61 -0
- flowtask/interfaces/GoogleSheet.py +68 -0
- flowtask/interfaces/IMAPClient.py +137 -0
- flowtask/interfaces/O365Calendar.py +113 -0
- flowtask/interfaces/O365Client.py +220 -0
- flowtask/interfaces/OneDrive.py +284 -0
- flowtask/interfaces/Outlook.py +155 -0
- flowtask/interfaces/ParrotBot.py +130 -0
- flowtask/interfaces/SSHClient.py +378 -0
- flowtask/interfaces/Sharepoint.py +496 -0
- flowtask/interfaces/__init__.py +36 -0
- flowtask/interfaces/azureauth.py +119 -0
- flowtask/interfaces/cache.py +201 -0
- flowtask/interfaces/client.py +82 -0
- flowtask/interfaces/compress.py +525 -0
- flowtask/interfaces/credentials.py +124 -0
- flowtask/interfaces/d2l.py +239 -0
- flowtask/interfaces/databases/__init__.py +5 -0
- flowtask/interfaces/databases/db.py +223 -0
- flowtask/interfaces/databases/documentdb.py +55 -0
- flowtask/interfaces/databases/rethink.py +39 -0
- flowtask/interfaces/dataframes/__init__.py +11 -0
- flowtask/interfaces/dataframes/abstract.py +21 -0
- flowtask/interfaces/dataframes/arrow.py +71 -0
- flowtask/interfaces/dataframes/dt.py +69 -0
- flowtask/interfaces/dataframes/pandas.py +167 -0
- flowtask/interfaces/dataframes/polars.py +60 -0
- flowtask/interfaces/db.py +263 -0
- flowtask/interfaces/env.py +46 -0
- flowtask/interfaces/func.py +137 -0
- flowtask/interfaces/http.py +1780 -0
- flowtask/interfaces/locale.py +40 -0
- flowtask/interfaces/log.py +75 -0
- flowtask/interfaces/mask.py +143 -0
- flowtask/interfaces/notification.py +154 -0
- flowtask/interfaces/playwright.py +339 -0
- flowtask/interfaces/powerpoint.py +368 -0
- flowtask/interfaces/py.typed +0 -0
- flowtask/interfaces/qs.py +376 -0
- flowtask/interfaces/result.py +87 -0
- flowtask/interfaces/selenium_service.py +779 -0
- flowtask/interfaces/smartsheet.py +154 -0
- flowtask/interfaces/stat.py +39 -0
- flowtask/interfaces/task.py +96 -0
- flowtask/interfaces/template.py +118 -0
- flowtask/interfaces/vectorstores/__init__.py +1 -0
- flowtask/interfaces/vectorstores/abstract.py +133 -0
- flowtask/interfaces/vectorstores/milvus.py +669 -0
- flowtask/interfaces/zammad.py +107 -0
- flowtask/models.py +193 -0
- flowtask/parsers/__init__.py +15 -0
- flowtask/parsers/_yaml.c +11978 -0
- flowtask/parsers/_yaml.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/argparser.py +235 -0
- flowtask/parsers/base.c +15155 -0
- flowtask/parsers/base.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/json.c +11968 -0
- flowtask/parsers/json.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/parsers/maps.py +49 -0
- flowtask/parsers/toml.c +11968 -0
- flowtask/parsers/toml.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/plugins/__init__.py +16 -0
- flowtask/plugins/components/__init__.py +0 -0
- flowtask/plugins/handler/__init__.py +45 -0
- flowtask/plugins/importer.py +31 -0
- flowtask/plugins/sources/__init__.py +0 -0
- flowtask/runner.py +283 -0
- flowtask/scheduler/__init__.py +9 -0
- flowtask/scheduler/functions.py +493 -0
- flowtask/scheduler/handlers/__init__.py +8 -0
- flowtask/scheduler/handlers/manager.py +504 -0
- flowtask/scheduler/handlers/models.py +58 -0
- flowtask/scheduler/handlers/service.py +72 -0
- flowtask/scheduler/notifications.py +65 -0
- flowtask/scheduler/scheduler.py +993 -0
- flowtask/services/__init__.py +0 -0
- flowtask/services/bots/__init__.py +0 -0
- flowtask/services/bots/telegram.py +264 -0
- flowtask/services/files/__init__.py +11 -0
- flowtask/services/files/manager.py +522 -0
- flowtask/services/files/model.py +37 -0
- flowtask/services/files/service.py +767 -0
- flowtask/services/jira/__init__.py +3 -0
- flowtask/services/jira/jira_actions.py +191 -0
- flowtask/services/tasks/__init__.py +13 -0
- flowtask/services/tasks/launcher.py +213 -0
- flowtask/services/tasks/manager.py +323 -0
- flowtask/services/tasks/service.py +275 -0
- flowtask/services/tasks/task_manager.py +376 -0
- flowtask/services/tasks/tasks.py +155 -0
- flowtask/storages/__init__.py +16 -0
- flowtask/storages/exceptions.py +12 -0
- flowtask/storages/files/__init__.py +8 -0
- flowtask/storages/files/abstract.py +29 -0
- flowtask/storages/files/filesystem.py +66 -0
- flowtask/storages/tasks/__init__.py +19 -0
- flowtask/storages/tasks/abstract.py +26 -0
- flowtask/storages/tasks/database.py +33 -0
- flowtask/storages/tasks/filesystem.py +108 -0
- flowtask/storages/tasks/github.py +119 -0
- flowtask/storages/tasks/memory.py +45 -0
- flowtask/storages/tasks/row.py +25 -0
- flowtask/tasks/__init__.py +0 -0
- flowtask/tasks/abstract.py +526 -0
- flowtask/tasks/command.py +118 -0
- flowtask/tasks/pile.py +486 -0
- flowtask/tasks/py.typed +0 -0
- flowtask/tasks/task.py +778 -0
- flowtask/template/__init__.py +161 -0
- flowtask/tests.py +257 -0
- flowtask/types/__init__.py +8 -0
- flowtask/types/typedefs.c +11347 -0
- flowtask/types/typedefs.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/__init__.py +24 -0
- flowtask/utils/constants.py +117 -0
- flowtask/utils/encoders.py +21 -0
- flowtask/utils/executor.py +112 -0
- flowtask/utils/functions.cpp +14280 -0
- flowtask/utils/functions.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/json.cpp +13349 -0
- flowtask/utils/json.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/mail.py +63 -0
- flowtask/utils/parseqs.c +13324 -0
- flowtask/utils/parserqs.cpython-312-x86_64-linux-gnu.so +0 -0
- flowtask/utils/stats.py +308 -0
- flowtask/utils/transformations.py +74 -0
- flowtask/utils/uv.py +12 -0
- flowtask/utils/validators.py +97 -0
- flowtask/version.py +11 -0
- flowtask-5.8.4.dist-info/LICENSE +201 -0
- flowtask-5.8.4.dist-info/METADATA +209 -0
- flowtask-5.8.4.dist-info/RECORD +470 -0
- flowtask-5.8.4.dist-info/WHEEL +6 -0
- flowtask-5.8.4.dist-info/entry_points.txt +3 -0
- flowtask-5.8.4.dist-info/top_level.txt +2 -0
- plugins/components/CreateQR.py +39 -0
- plugins/components/TestComponent.py +28 -0
- plugins/components/Use1.py +13 -0
- plugins/components/Workplace.py +117 -0
- plugins/components/__init__.py +3 -0
- plugins/sources/__init__.py +0 -0
- plugins/sources/get_populartimes.py +78 -0
- plugins/sources/google.py +150 -0
- plugins/sources/hubspot.py +679 -0
- plugins/sources/icims.py +679 -0
- plugins/sources/mobileinsight.py +501 -0
- plugins/sources/newrelic.py +262 -0
- plugins/sources/uap.py +268 -0
- plugins/sources/venu.py +244 -0
- plugins/sources/vocinity.py +314 -0
@@ -0,0 +1,91 @@
from typing import List
from pathlib import PurePath
import mammoth
import docx
from markdownify import markdownify as md
from langchain.docstore.document import Document
from .abstract import AbstractLoader


class MSWordLoader(AbstractLoader):
    """
    Load Microsoft Docx as Langchain Documents.
    """
    def extract_text(self, path):
        """Extract text from a docx file.

        Args:
            path (Path): The source of the data.

        Returns:
            str: The extracted text.
        """
        doc = docx.Document(str(path))
        text = []
        for paragraph in doc.paragraphs:
            text.append(paragraph.text)
        return "\n".join(text)

    async def _load_document(self, path: PurePath) -> List[Document]:
        """Load data from a source and return it as a Langchain Document.

        Args:
            path (Path): The source of the data.

        Returns:
            List[Document]: A list of Langchain Documents.
        """
        self.logger.info(
            f"Loading Word file: {path}"
        )
        docs = []
        with open(path, "rb") as docx_file:
            doc = docx.Document(str(path))
            properties = doc.core_properties
            result = mammoth.convert_to_html(docx_file)
            # text_file = mammoth.extract_raw_text(docx_file)  # Use text File for summary
            html = result.value  # The generated HTML
            md_text = md(html)  # The generated Markdown

            print(f"Type of HTML result: {type(html)}")
            print(f"Length of HTML: {len(html)}")
            print(f"First 100 characters: {html[:100]}")
            print(f"Messages from conversion: {result.messages}")
            # TODO: add summarization and translation if requested
            summary = ''
            # try:
            #     summary = self.get_summary_from_text(md_text, use_gpu=True)
            # except Exception:
            #     summary = ''
            document_meta = {
                "author": properties.author,
                "version": properties.version,
                "title": properties.title,
                # "created": properties.created.strftime("%Y-%m-%d %H:%M:%S"),
                # "last_modified": properties.modified.strftime("%Y-%m-%d %H:%M:%S")
            }
            metadata = self.create_metadata(
                path=path,
                doctype=self.doctype,
                source_type=self._source_type,
                summary=summary,
                doc_metadata=document_meta
            )
            # Create document-level context
            document_context = f"File Name: {path.name}\n"
            document_context += f"Document Type: {self.doctype}\n"
            document_context += f"Source Type: {self._source_type}\n"
            document_context += f"Summary: {summary}\n"
            document_context += "======\n"
            # splitting the content:
            for chunk in self.markdown_splitter.split_text(md_text):
                _idx = {
                    **metadata
                }
                docs.append(
                    Document(
                        page_content=document_context + chunk,
                        metadata=_idx
                    )
                )
        return docs

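
Below is a minimal, hedged usage sketch for the `MSWordLoader` shown above (per the file listing, this 91-line hunk appears to be `flowtask/components/LangchainLoader/loaders/docx.py`). It assumes the `AbstractLoader` base class can be constructed without further required arguments; the file name is purely illustrative.

```python
# Hedged sketch, not part of the package: drive MSWordLoader._load_document directly.
import asyncio
from pathlib import Path
from flowtask.components.LangchainLoader.loaders.docx import MSWordLoader

async def main():
    loader = MSWordLoader()  # assumption: no required constructor kwargs
    docs = await loader._load_document(Path("report.docx"))  # hypothetical file
    for doc in docs:
        # each chunk carries the docx core properties via the loader's metadata
        print(len(doc.page_content), list(doc.metadata))

asyncio.run(main())
```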
@@ -0,0 +1,119 @@
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from .abstract import AbstractLoader
from pathlib import Path, PurePath
from markdownify import markdownify as md
from datetime import datetime


class HTMLLoader(AbstractLoader):
    """
    Loader for HTML files to convert into Langchain Documents.

    Processes HTML files, extracts relevant content, converts to Markdown,
    and associates metadata with each document.
    """

    _extension = ['.html', '.htm']

    def __init__(self, **kwargs):
        """Initialize the HTMLLoader."""
        self.elements: list = kwargs.pop('elements', [])
        super().__init__(**kwargs)

    async def _load_document(self, path: PurePath) -> list[Document]:
        """
        Load an HTML file and convert its content into Langchain documents.

        Args:
            path (PurePath): Path to the HTML file.

        Returns:
            list[Document]: A list of Langchain documents with content and metadata.
        """
        documents = []

        # Check if the file exists and is valid
        if not self._check_path(path):
            raise ValueError(f"File {path} does not exist or is not a valid HTML file.")

        # Read and parse the HTML file
        with open(path, 'r', encoding=self.encoding) as file:
            html_content = file.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract the entire <body> content or
        # Determine the top-level element to process
        top_element = soup.body or soup
        if not top_element:
            raise ValueError(
                "The HTML file does not contain a <body> or Top element tag."
            )

        extracted_elements = []

        if self.elements:
            # Extract content from specific elements
            for element in self.elements:
                for tag, selector in element.items():
                    extracted_elements.extend(top_element.find_all(tag, class_=selector.lstrip('.')))

        if not extracted_elements:
            extracted_elements = [top_element]

        # Process each extracted element
        for elem in extracted_elements:
            # Get the plain text content
            text = elem.get_text(separator="\n", strip=True)

            # Generate a summary for the extracted text
            try:
                summary = self.get_summary_from_text(text, use_gpu=True)
            except Exception as e:
                if self.logger:
                    self.logger.error(f"Error generating summary: {e}")
                summary = "Summary not available."

            # Create document-level context
            document_context = f"File Name: {path.name}\n"
            document_context += f"Document Type: {self.doctype}\n"
            document_context += f"Source Type: {self._source_type}\n"
            document_context += f"Element: {elem.name}\n"
            document_context += f"Summary: {summary}\n\n"

            # Convert the entire <body> to Markdown for better structure
            markdown_content = md(str(elem))

            # Metadata preparation
            document_meta = self.create_metadata(
                path=path,
                doctype=self.doctype,
                source_type=self._source_type,
                summary=summary,
                doc_metadata={
                    "type": "html",
                    "category": self.category,
                }
            )

            # Create a single Langchain Document with the full body content
            document = Document(
                page_content=document_context + markdown_content,
                metadata=document_meta
            )
            documents.append(document)

            # splitting the content:
            for chunk in self.markdown_splitter.split_text(text):
                _idx = {
                    **document_meta
                }
                # Create a Langchain Document
                documents.append(
                    Document(
                        page_content=document_context + chunk,
                        metadata=_idx
                    )
                )
        return documents

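
A hedged sketch of how the `elements` option above might be supplied: each entry maps a tag name to a CSS class, and the leading dot is stripped before the `find_all(tag, class_=...)` call. Per the file listing this 119-line hunk appears to be `loaders/html.py`; constructor arguments beyond `elements`, and the input file, are illustrative assumptions.

```python
# Hedged sketch, not part of the package: restrict extraction to specific elements.
import asyncio
from pathlib import Path
from flowtask.components.LangchainLoader.loaders.html import HTMLLoader

async def main():
    loader = HTMLLoader(
        elements=[{"div": ".article-body"}, {"section": ".faq"}],  # tag -> CSS class
    )
    docs = await loader._load_document(Path("page.html"))  # hypothetical file
    # one full-element Document per match, plus chunked splits of its text
    print(f"{len(docs)} documents produced")

asyncio.run(main())
```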
@@ -0,0 +1,146 @@
from io import StringIO
from pathlib import Path
from datetime import datetime
import fitz
import pandas as pd
from langchain.docstore.document import Document
from .basepdf import BasePDF


class PDFBlocks(BasePDF):
    """
    Load a PDF Table as Blocks of text.
    """
    _extension = ['.pdf']

    def __init__(
        self,
        table_settings: dict = {},
        **kwargs
    ):
        self._skiprows = kwargs.pop('skiprows', None)
        super().__init__(**kwargs)
        # Table Settings:
        self.table_settings = {
            # "vertical_strategy": "text",
            # "horizontal_strategy": "text",
            "intersection_x_tolerance": 5,
            "intersection_y_tolerance": 5
        }
        if table_settings:
            self.table_settings.update(table_settings)

    def unique_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rename duplicate columns in the DataFrame to ensure they are unique.

        Args:
            df (pd.DataFrame): The DataFrame with potential duplicate column names.

        Returns:
            pd.DataFrame: A DataFrame with unique column names.
        """
        seen = {}
        new_columns = []
        for col in df.columns:
            new_col = col
            count = seen.get(col, 0)
            while new_col in new_columns:
                count += 1
                new_col = f"{col}_{count}"
            new_columns.append(new_col)
            seen[col] = count
        df.columns = new_columns
        return df

    def get_markdown(self, df: pd.DataFrame) -> str:
        """
        Convert a DataFrame to a Markdown string.

        Args:
            df (pd.DataFrame): The DataFrame to convert.

        Returns:
            str: The JSON string.
        """
        buffer = StringIO()
        df = self.unique_columns(df)
        df.to_markdown(buffer)
        buffer.seek(0)
        return buffer.getvalue()

    def parse_table(self, table_idx, table, page_number, path) -> pd.DataFrame:
        df = table.to_pandas()  # convert to pandas DataFrame
        df = df.dropna(axis=1, how='all')
        df = df.dropna(how='all', axis=0)  # Drop empty rows
        page = page_number + 1
        table_meta = {
            "url": '',
            "source": f"{path.name} Page.#{page} Table.#{table_idx}",
            "filename": path.name,
            "question": '',
            "answer": '',
            "type": 'table',
            "summary": '',
            "category": self.category,
            "source_type": self._source_type,
            "created_at": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
            "document_meta": {
                "table_index": table_idx,
                "table_shape": df.shape,
                "table_columns": df.columns.tolist(),
                "description": f"Extracted from Page.#{page}."
            }
        }
        return df, table_meta

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the Fitz library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            pdf = fitz.open(str(path))  # Open the PDF file
            docs = []
            # Create document-level context
            document_context = f"File Name: {path.name}\n"
            document_context += f"Document Type: {self.doctype}\n"
            document_context += f"Source Type: {self._source_type}\n\n"
            for page_number in range(pdf.page_count):
                page = pdf[page_number]
                try:
                    tabs = page.find_tables(**self.table_settings)
                    for tab_idx, tab in enumerate(tabs):
                        df, _meta = self.parse_table(tab_idx, tab, page_number, path)
                        try:
                            markdown_table = self.get_markdown(df)
                            docs.append(
                                Document(
                                    page_content=document_context + markdown_table,
                                    metadata=_meta
                                )
                            )
                        except Exception as exc:
                            print(exc)
                            ## Sample information:
                            print('::: Printing Table Information === ')
                            print(df)
                            print("::: Printing Column Information === ")
                            for column, t in df.dtypes.items():
                                print(column, "->", t, "->", df[column].iloc[0])
                            # convert into markdown:
                            txt = df.to_markdown()
                            if txt:
                                docs.append(
                                    Document(page_content=document_context + txt, metadata=_meta)
                                )
                except Exception as exc:
                    print(exc)
                    continue
            return docs

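
Since the `table_settings` dict above is forwarded verbatim to PyMuPDF's `page.find_tables()`, callers can override the detection strategy along the lines of the commented-out keys. A hedged sketch follows (the file listing suggests this 146-line hunk is `loaders/pdfblocks.py`; it also assumes `BasePDF` needs no other constructor arguments, and the values and input path are illustrative).

```python
# Hedged sketch, not part of the package: tune table detection before loading.
from pathlib import Path
from flowtask.components.LangchainLoader.loaders.pdfblocks import PDFBlocks

loader = PDFBlocks(
    table_settings={
        "vertical_strategy": "text",    # mirrors the commented-out defaults above
        "horizontal_strategy": "text",
        "intersection_x_tolerance": 3,
    },
    skiprows=1,  # popped from kwargs and stored as self._skiprows
)
# one Document per detected table, rendered as Markdown
docs = loader._load_pdf(Path("catalog.pdf"))  # hypothetical file
```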
@@ -0,0 +1,79 @@
from pathlib import Path
import fitz
from pdf4llm import to_markdown
from langchain.docstore.document import Document
from langchain.text_splitter import MarkdownTextSplitter
from .basepdf import BasePDF


class PDFMarkdown(BasePDF):
    """
    Loader for PDF files converted content to markdown.
    """
    def __init__(
        self,
        **kwargs
    ):
        super().__init__(**kwargs)
        self._splitter = MarkdownTextSplitter(chunk_size=2048, chunk_overlap=10)

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the PDFMiner library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            docs = []
            pdf = fitz.open(str(path))
            md_text = to_markdown(pdf)
            try:
                summary = self.get_summary_from_text(md_text, use_gpu=True)
            except Exception:
                summary = ''
            document_meta = {
                "title": pdf.metadata.get("title", ""),
                "creationDate": pdf.metadata.get("creationDate", ""),
                "author": pdf.metadata.get("author", ""),
            }
            metadata = self.create_metadata(
                path=path,
                doctype=self.doctype,
                source_type=self._source_type,
                summary=summary,
                doc_metadata=document_meta
            )

            # Prepend document-level context
            document_context = f"Document Title: {document_meta.get('title', '')}\n"
            # document_context += f"Document Author: {document_meta.get('author', '')}\n"
            document_context += f"File Path: {str(path)}\n"
            document_context += f"Summary: {summary}\n\n"

            for _, chunk in enumerate(self._splitter.split_text(md_text)):
                docs.append(
                    Document(
                        page_content=document_context + chunk,
                        metadata=metadata
                    )
                )
            # also, creating a document for summary:
            if summary:
                _info = {
                    "category": "Summary",
                    **metadata
                }
                docs.append(
                    Document(
                        page_content=f"**Summary:** {summary}",
                        metadata=_info
                    )
                )
            return docs
        else:
            return []

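
The splitter above is a plain attribute set in `__init__` with a fixed 2048-character chunk size, so different chunking can be obtained by reassigning it after construction. A hedged sketch, assuming this 79-line hunk is `loaders/pdfmark.py` from the listing, that the constructor needs no required kwargs, and that the file path is illustrative:

```python
# Hedged sketch, not part of the package: swap in a finer-grained splitter.
from pathlib import Path
from langchain.text_splitter import MarkdownTextSplitter
from flowtask.components.LangchainLoader.loaders.pdfmark import PDFMarkdown

loader = PDFMarkdown()  # assumption: no required constructor kwargs
loader._splitter = MarkdownTextSplitter(chunk_size=512, chunk_overlap=32)
# chunks of the markdown conversion, plus one summary Document when a summary is produced
docs = loader._load_pdf(Path("manual.pdf"))  # hypothetical file
print(len(docs))
```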
@@ -0,0 +1,135 @@
from io import StringIO
from pathlib import Path
from datetime import datetime
import fitz
import pandas as pd
from langchain.docstore.document import Document
from .basepdf import BasePDF


class PDFTables(BasePDF):
    """
    Loader for Tables present on PDF Files.
    """
    _extension = ['.pdf']

    def __init__(
        self,
        table_settings: dict = {},
        **kwargs
    ):
        self._skiprows = kwargs.pop('skiprows', None)
        super().__init__(**kwargs)
        # Table Settings:
        self.table_settings = {
            # "vertical_strategy": "text",
            # "horizontal_strategy": "text",
            "intersection_x_tolerance": 5,
            "intersection_y_tolerance": 5
        }
        if table_settings:
            self.table_settings.update(table_settings)

    def unique_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rename duplicate columns in the DataFrame to ensure they are unique.

        Args:
            df (pd.DataFrame): The DataFrame with potential duplicate column names.

        Returns:
            pd.DataFrame: A DataFrame with unique column names.
        """
        seen = {}
        new_columns = []
        for col in df.columns:
            new_col = col
            count = seen.get(col, 0)
            while new_col in new_columns:
                count += 1
                new_col = f"{col}_{count}"
            new_columns.append(new_col)
            seen[col] = count
        df.columns = new_columns
        return df

    def get_markdown(self, df: pd.DataFrame) -> str:
        """
        Convert a DataFrame to a Markdown string.

        Args:
            df (pd.DataFrame): The DataFrame to convert.

        Returns:
            str: The JSON string.
        """
        buffer = StringIO()
        df = self.unique_columns(df)
        df.to_markdown(buffer)
        buffer.seek(0)
        return buffer.getvalue()

    def parse_table(self, table_idx, table, page_number, path) -> pd.DataFrame:
        df = table.to_pandas()  # convert to pandas DataFrame
        df = df.dropna(axis=1, how='all')
        df = df.dropna(how='all', axis=0)  # Drop empty rows
        page = page_number + 1
        table_meta = {
            "url": '',
            "source": f"{path.name} Page.#{page} Table.#{table_idx}",
            "filename": path.name,
            "question": '',
            "answer": '',
            "type": 'table',
            "summary": '',
            "category": self.category,
            "source_type": self._source_type,
            "created_at": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
            "document_meta": {
                "table_index": table_idx,
                "table_shape": df.shape,
                "table_columns": df.columns.tolist(),
                "description": f"Extracted from Page.#{page}."
            }
        }
        return df, table_meta

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the Fitz library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            pdf = fitz.open(str(path))  # Open the PDF file
            docs = []
            document_context = f"File Name: {path.name}\n"
            document_context += f"Document Type: {self.doctype}\n"
            document_context += f"Source Type: {self._source_type}\n\n"
            for page_number in range(pdf.page_count):
                page = pdf[page_number]
                try:
                    tabs = page.find_tables(**self.table_settings)
                    for tab_idx, tab in enumerate(tabs):
                        df, _meta = self.parse_table(tab_idx, tab, page_number, path)
                        ## Sample information:
                        print('::: Printing Table Information === ')
                        print(df)
                        print("::: Printing Column Information === ")
                        for column, t in df.dtypes.items():
                            print(column, "->", t, "->", df[column].iloc[0])
                        # convert into markdown:
                        txt = df.to_markdown()
                        if txt:
                            docs.append(
                                Document(page_content=document_context + txt, metadata=_meta)
                            )
                except Exception as exc:
                    print(exc)
                    continue
            return docs

@@ -0,0 +1,67 @@

from pathlib import PurePath
from typing import List
import pandas as pd
from langchain.docstore.document import Document
from .abstract import AbstractLoader


class QAFileLoader(AbstractLoader):
    """
    Question and Answers File based on Excel, coverted to Langchain Documents.
    """
    _extension = ['.xlsx']
    chunk_size = 768

    def __init__(
        self,
        columns: list = ['Question', 'Answer'],
        **kwargs
    ):
        super().__init__(**kwargs)
        self._columns = columns

    def _load_document(self, path: PurePath) -> list:
        df = pd.read_excel(path)
        q = self._columns[0]
        a = self._columns[1]
        docs = []
        for idx, row in df.iterrows():
            # Question Document
            document_meta = {
                "question": row[q],
                "answer": row[a],
            }
            metadata = self.create_metadata(
                path=path,
                doctype=self.doctype,
                source_type=self._source_type,
                summary=f"Question: {row[q]}?: **{row[a]}**",
                doc_metadata=document_meta,
                type="QA",
                question=row[q],
                answer=row[a],
            )
            doc = Document(
                page_content=f"**Question:** {row[q]}: **Answer:** {row[a]}",
                metadata=metadata,
            )
            docs.append(doc)
        return docs

    async def load(self, path: PurePath) -> List[Document]:
        """Load data from a source and return it as a Langchain Document.

        Args:
            path (Path): The source of the data.

        Returns:
            List[Document]: A list of Langchain Documents.
        """
        self.logger.info(
            f"Loading Excel FAQ file: {path}"
        )
        docs = []
        if path.exists():
            docs = self._load_document(path)
        return docs

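
A hedged usage sketch for the `QAFileLoader` above (the 67-line `loaders/qa.py` in the listing): the `columns` argument names the question and answer columns of the spreadsheet, and `load()` yields one Document per row. The column names and path are illustrative, and the base class is assumed to need no further arguments.

```python
# Hedged sketch, not part of the package: load an FAQ spreadsheet with custom column names.
import asyncio
from pathlib import Path
from flowtask.components.LangchainLoader.loaders.qa import QAFileLoader

async def main():
    loader = QAFileLoader(columns=["Query", "Response"])  # [question_column, answer_column]
    docs = await loader.load(Path("faq.xlsx"))  # hypothetical file; one Document per row
    print(docs[0].page_content if docs else "no rows found")

asyncio.run(main())
```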
@@ -0,0 +1,55 @@
from typing import List
from pathlib import PurePath
from langchain.docstore.document import Document
from .abstract import AbstractLoader


class TXTLoader(AbstractLoader):
    """
    Loader for PDF files.
    """
    _extension = ['.txt']

    def _load_document(self, path: PurePath) -> List[Document]:
        """
        Load a TXT file.

        Args:
            path (Path): The path to the TXT file.

        Returns:
            list: A list of Langchain Documents.
        """
        docs = []
        if self._check_path(path):
            self.logger.info(f"Loading TXT file: {path}")
            with open(path, 'r') as file:
                text = file.read()
            try:
                summary = self.get_summary_from_text(text, use_gpu=True)
            except Exception:
                summary = ''
            metadata = self.create_metadata(
                path=path,
                doctype=self.doctype,
                source_type=self._source_type,
                summary=summary,
                doc_metadata={}
            )
            # Create document-level context
            document_context = f"File Name: {path.name}\n"
            document_context += f"Document Type: {self.doctype}\n"
            document_context += f"Source Type: {self._source_type}\n"
            document_context += f"Summary: {summary}\n\n"
            # splitting the content:
            for chunk in self.markdown_splitter.split_text(text):
                _idx = {
                    **metadata
                }
                docs.append(
                    Document(
                        page_content=document_context + chunk,
                        metadata=_idx
                    )
                )
        return docs