webscout 8.2.9__py3-none-any.whl → 2026.1.19__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- webscout/AIauto.py +524 -251
- webscout/AIbase.py +247 -319
- webscout/AIutel.py +68 -703
- webscout/Bard.py +1072 -1026
- webscout/Extra/GitToolkit/__init__.py +10 -10
- webscout/Extra/GitToolkit/gitapi/__init__.py +20 -12
- webscout/Extra/GitToolkit/gitapi/gist.py +142 -0
- webscout/Extra/GitToolkit/gitapi/organization.py +91 -0
- webscout/Extra/GitToolkit/gitapi/repository.py +308 -195
- webscout/Extra/GitToolkit/gitapi/search.py +162 -0
- webscout/Extra/GitToolkit/gitapi/trending.py +236 -0
- webscout/Extra/GitToolkit/gitapi/user.py +128 -96
- webscout/Extra/GitToolkit/gitapi/utils.py +82 -62
- webscout/Extra/YTToolkit/README.md +443 -375
- webscout/Extra/YTToolkit/YTdownloader.py +953 -957
- webscout/Extra/YTToolkit/__init__.py +3 -3
- webscout/Extra/YTToolkit/transcriber.py +595 -476
- webscout/Extra/YTToolkit/ytapi/README.md +230 -44
- webscout/Extra/YTToolkit/ytapi/__init__.py +22 -6
- webscout/Extra/YTToolkit/ytapi/captions.py +190 -0
- webscout/Extra/YTToolkit/ytapi/channel.py +302 -307
- webscout/Extra/YTToolkit/ytapi/errors.py +13 -13
- webscout/Extra/YTToolkit/ytapi/extras.py +178 -118
- webscout/Extra/YTToolkit/ytapi/hashtag.py +120 -0
- webscout/Extra/YTToolkit/ytapi/https.py +89 -88
- webscout/Extra/YTToolkit/ytapi/patterns.py +61 -61
- webscout/Extra/YTToolkit/ytapi/playlist.py +59 -59
- webscout/Extra/YTToolkit/ytapi/pool.py +8 -8
- webscout/Extra/YTToolkit/ytapi/query.py +143 -40
- webscout/Extra/YTToolkit/ytapi/shorts.py +122 -0
- webscout/Extra/YTToolkit/ytapi/stream.py +68 -63
- webscout/Extra/YTToolkit/ytapi/suggestions.py +97 -0
- webscout/Extra/YTToolkit/ytapi/utils.py +66 -62
- webscout/Extra/YTToolkit/ytapi/video.py +403 -232
- webscout/Extra/__init__.py +2 -3
- webscout/Extra/gguf.py +1298 -684
- webscout/Extra/tempmail/README.md +487 -487
- webscout/Extra/tempmail/__init__.py +28 -28
- webscout/Extra/tempmail/async_utils.py +143 -141
- webscout/Extra/tempmail/base.py +172 -161
- webscout/Extra/tempmail/cli.py +191 -187
- webscout/Extra/tempmail/emailnator.py +88 -84
- webscout/Extra/tempmail/mail_tm.py +378 -361
- webscout/Extra/tempmail/temp_mail_io.py +304 -292
- webscout/Extra/weather.py +196 -194
- webscout/Extra/weather_ascii.py +17 -15
- webscout/Provider/AISEARCH/PERPLEXED_search.py +175 -0
- webscout/Provider/AISEARCH/Perplexity.py +292 -333
- webscout/Provider/AISEARCH/README.md +106 -279
- webscout/Provider/AISEARCH/__init__.py +16 -9
- webscout/Provider/AISEARCH/brave_search.py +298 -0
- webscout/Provider/AISEARCH/iask_search.py +357 -410
- webscout/Provider/AISEARCH/monica_search.py +200 -220
- webscout/Provider/AISEARCH/webpilotai_search.py +242 -255
- webscout/Provider/Algion.py +413 -0
- webscout/Provider/Andi.py +74 -69
- webscout/Provider/Apriel.py +313 -0
- webscout/Provider/Ayle.py +323 -0
- webscout/Provider/ChatSandbox.py +329 -342
- webscout/Provider/ClaudeOnline.py +365 -0
- webscout/Provider/Cohere.py +232 -208
- webscout/Provider/DeepAI.py +367 -0
- webscout/Provider/Deepinfra.py +467 -340
- webscout/Provider/EssentialAI.py +217 -0
- webscout/Provider/ExaAI.py +274 -261
- webscout/Provider/Gemini.py +175 -169
- webscout/Provider/GithubChat.py +385 -369
- webscout/Provider/Gradient.py +286 -0
- webscout/Provider/Groq.py +556 -801
- webscout/Provider/HadadXYZ.py +323 -0
- webscout/Provider/HeckAI.py +392 -375
- webscout/Provider/HuggingFace.py +387 -0
- webscout/Provider/IBM.py +340 -0
- webscout/Provider/Jadve.py +317 -291
- webscout/Provider/K2Think.py +306 -0
- webscout/Provider/Koboldai.py +221 -384
- webscout/Provider/Netwrck.py +273 -270
- webscout/Provider/Nvidia.py +310 -0
- webscout/Provider/OPENAI/DeepAI.py +489 -0
- webscout/Provider/OPENAI/K2Think.py +423 -0
- webscout/Provider/OPENAI/PI.py +463 -0
- webscout/Provider/OPENAI/README.md +890 -952
- webscout/Provider/OPENAI/TogetherAI.py +405 -0
- webscout/Provider/OPENAI/TwoAI.py +255 -357
- webscout/Provider/OPENAI/__init__.py +148 -40
- webscout/Provider/OPENAI/ai4chat.py +348 -293
- webscout/Provider/OPENAI/akashgpt.py +436 -0
- webscout/Provider/OPENAI/algion.py +303 -0
- webscout/Provider/OPENAI/{exachat.py → ayle.py} +365 -444
- webscout/Provider/OPENAI/base.py +253 -249
- webscout/Provider/OPENAI/cerebras.py +296 -0
- webscout/Provider/OPENAI/chatgpt.py +870 -556
- webscout/Provider/OPENAI/chatsandbox.py +233 -173
- webscout/Provider/OPENAI/deepinfra.py +403 -322
- webscout/Provider/OPENAI/e2b.py +2370 -1414
- webscout/Provider/OPENAI/elmo.py +278 -0
- webscout/Provider/OPENAI/exaai.py +452 -417
- webscout/Provider/OPENAI/freeassist.py +446 -0
- webscout/Provider/OPENAI/gradient.py +448 -0
- webscout/Provider/OPENAI/groq.py +380 -364
- webscout/Provider/OPENAI/hadadxyz.py +292 -0
- webscout/Provider/OPENAI/heckai.py +333 -308
- webscout/Provider/OPENAI/huggingface.py +321 -0
- webscout/Provider/OPENAI/ibm.py +425 -0
- webscout/Provider/OPENAI/llmchat.py +253 -0
- webscout/Provider/OPENAI/llmchatco.py +378 -335
- webscout/Provider/OPENAI/meta.py +541 -0
- webscout/Provider/OPENAI/netwrck.py +374 -357
- webscout/Provider/OPENAI/nvidia.py +317 -0
- webscout/Provider/OPENAI/oivscode.py +348 -287
- webscout/Provider/OPENAI/openrouter.py +328 -0
- webscout/Provider/OPENAI/pydantic_imports.py +1 -172
- webscout/Provider/OPENAI/sambanova.py +397 -0
- webscout/Provider/OPENAI/sonus.py +305 -304
- webscout/Provider/OPENAI/textpollinations.py +370 -339
- webscout/Provider/OPENAI/toolbaz.py +375 -413
- webscout/Provider/OPENAI/typefully.py +419 -355
- webscout/Provider/OPENAI/typliai.py +279 -0
- webscout/Provider/OPENAI/utils.py +314 -318
- webscout/Provider/OPENAI/wisecat.py +359 -387
- webscout/Provider/OPENAI/writecream.py +185 -163
- webscout/Provider/OPENAI/x0gpt.py +462 -365
- webscout/Provider/OPENAI/zenmux.py +380 -0
- webscout/Provider/OpenRouter.py +386 -0
- webscout/Provider/Openai.py +337 -496
- webscout/Provider/PI.py +443 -429
- webscout/Provider/QwenLM.py +346 -254
- webscout/Provider/STT/__init__.py +28 -0
- webscout/Provider/STT/base.py +303 -0
- webscout/Provider/STT/elevenlabs.py +264 -0
- webscout/Provider/Sambanova.py +317 -0
- webscout/Provider/TTI/README.md +69 -82
- webscout/Provider/TTI/__init__.py +37 -7
- webscout/Provider/TTI/base.py +147 -64
- webscout/Provider/TTI/claudeonline.py +393 -0
- webscout/Provider/TTI/magicstudio.py +292 -201
- webscout/Provider/TTI/miragic.py +180 -0
- webscout/Provider/TTI/pollinations.py +331 -221
- webscout/Provider/TTI/together.py +334 -0
- webscout/Provider/TTI/utils.py +14 -11
- webscout/Provider/TTS/README.md +186 -192
- webscout/Provider/TTS/__init__.py +43 -10
- webscout/Provider/TTS/base.py +523 -159
- webscout/Provider/TTS/deepgram.py +286 -156
- webscout/Provider/TTS/elevenlabs.py +189 -111
- webscout/Provider/TTS/freetts.py +218 -0
- webscout/Provider/TTS/murfai.py +288 -113
- webscout/Provider/TTS/openai_fm.py +364 -129
- webscout/Provider/TTS/parler.py +203 -111
- webscout/Provider/TTS/qwen.py +334 -0
- webscout/Provider/TTS/sherpa.py +286 -0
- webscout/Provider/TTS/speechma.py +693 -580
- webscout/Provider/TTS/streamElements.py +275 -333
- webscout/Provider/TTS/utils.py +280 -280
- webscout/Provider/TextPollinationsAI.py +331 -308
- webscout/Provider/TogetherAI.py +450 -0
- webscout/Provider/TwoAI.py +309 -475
- webscout/Provider/TypliAI.py +311 -305
- webscout/Provider/UNFINISHED/ChatHub.py +219 -209
- webscout/Provider/{OPENAI/glider.py → UNFINISHED/ChutesAI.py} +331 -326
- webscout/Provider/{GizAI.py → UNFINISHED/GizAI.py} +300 -295
- webscout/Provider/{Marcus.py → UNFINISHED/Marcus.py} +218 -198
- webscout/Provider/UNFINISHED/Qodo.py +481 -0
- webscout/Provider/{MCPCore.py → UNFINISHED/XenAI.py} +330 -315
- webscout/Provider/UNFINISHED/Youchat.py +347 -330
- webscout/Provider/UNFINISHED/aihumanizer.py +41 -0
- webscout/Provider/UNFINISHED/grammerchecker.py +37 -0
- webscout/Provider/UNFINISHED/liner.py +342 -0
- webscout/Provider/UNFINISHED/liner_api_request.py +246 -263
- webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +231 -224
- webscout/Provider/WiseCat.py +256 -233
- webscout/Provider/WrDoChat.py +390 -370
- webscout/Provider/__init__.py +115 -174
- webscout/Provider/ai4chat.py +181 -174
- webscout/Provider/akashgpt.py +330 -335
- webscout/Provider/cerebras.py +397 -290
- webscout/Provider/cleeai.py +236 -213
- webscout/Provider/elmo.py +291 -283
- webscout/Provider/geminiapi.py +343 -208
- webscout/Provider/julius.py +245 -223
- webscout/Provider/learnfastai.py +333 -325
- webscout/Provider/llama3mitril.py +230 -215
- webscout/Provider/llmchat.py +308 -258
- webscout/Provider/llmchatco.py +321 -306
- webscout/Provider/meta.py +996 -801
- webscout/Provider/oivscode.py +332 -309
- webscout/Provider/searchchat.py +316 -292
- webscout/Provider/sonus.py +264 -258
- webscout/Provider/toolbaz.py +359 -353
- webscout/Provider/turboseek.py +332 -266
- webscout/Provider/typefully.py +262 -202
- webscout/Provider/x0gpt.py +332 -299
- webscout/__init__.py +31 -39
- webscout/__main__.py +5 -5
- webscout/cli.py +585 -524
- webscout/client.py +1497 -70
- webscout/conversation.py +140 -436
- webscout/exceptions.py +383 -362
- webscout/litagent/__init__.py +29 -29
- webscout/litagent/agent.py +492 -455
- webscout/litagent/constants.py +60 -60
- webscout/models.py +505 -181
- webscout/optimizers.py +74 -420
- webscout/prompt_manager.py +376 -288
- webscout/sanitize.py +1514 -0
- webscout/scout/README.md +452 -404
- webscout/scout/__init__.py +8 -8
- webscout/scout/core/__init__.py +7 -7
- webscout/scout/core/crawler.py +330 -210
- webscout/scout/core/scout.py +800 -607
- webscout/scout/core/search_result.py +51 -96
- webscout/scout/core/text_analyzer.py +64 -63
- webscout/scout/core/text_utils.py +412 -277
- webscout/scout/core/web_analyzer.py +54 -52
- webscout/scout/element.py +872 -478
- webscout/scout/parsers/__init__.py +70 -69
- webscout/scout/parsers/html5lib_parser.py +182 -172
- webscout/scout/parsers/html_parser.py +238 -236
- webscout/scout/parsers/lxml_parser.py +203 -178
- webscout/scout/utils.py +38 -37
- webscout/search/__init__.py +47 -0
- webscout/search/base.py +201 -0
- webscout/search/bing_main.py +45 -0
- webscout/search/brave_main.py +92 -0
- webscout/search/duckduckgo_main.py +57 -0
- webscout/search/engines/__init__.py +127 -0
- webscout/search/engines/bing/__init__.py +15 -0
- webscout/search/engines/bing/base.py +35 -0
- webscout/search/engines/bing/images.py +114 -0
- webscout/search/engines/bing/news.py +96 -0
- webscout/search/engines/bing/suggestions.py +36 -0
- webscout/search/engines/bing/text.py +109 -0
- webscout/search/engines/brave/__init__.py +19 -0
- webscout/search/engines/brave/base.py +47 -0
- webscout/search/engines/brave/images.py +213 -0
- webscout/search/engines/brave/news.py +353 -0
- webscout/search/engines/brave/suggestions.py +318 -0
- webscout/search/engines/brave/text.py +167 -0
- webscout/search/engines/brave/videos.py +364 -0
- webscout/search/engines/duckduckgo/__init__.py +25 -0
- webscout/search/engines/duckduckgo/answers.py +80 -0
- webscout/search/engines/duckduckgo/base.py +189 -0
- webscout/search/engines/duckduckgo/images.py +100 -0
- webscout/search/engines/duckduckgo/maps.py +183 -0
- webscout/search/engines/duckduckgo/news.py +70 -0
- webscout/search/engines/duckduckgo/suggestions.py +22 -0
- webscout/search/engines/duckduckgo/text.py +221 -0
- webscout/search/engines/duckduckgo/translate.py +48 -0
- webscout/search/engines/duckduckgo/videos.py +80 -0
- webscout/search/engines/duckduckgo/weather.py +84 -0
- webscout/search/engines/mojeek.py +61 -0
- webscout/search/engines/wikipedia.py +77 -0
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +19 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +323 -0
- webscout/search/engines/yahoo/maps.py +19 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +19 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/engines/yandex.py +67 -0
- webscout/search/engines/yep/__init__.py +13 -0
- webscout/search/engines/yep/base.py +34 -0
- webscout/search/engines/yep/images.py +101 -0
- webscout/search/engines/yep/suggestions.py +38 -0
- webscout/search/engines/yep/text.py +99 -0
- webscout/search/http_client.py +172 -0
- webscout/search/results.py +141 -0
- webscout/search/yahoo_main.py +57 -0
- webscout/search/yep_main.py +48 -0
- webscout/server/__init__.py +48 -0
- webscout/server/config.py +78 -0
- webscout/server/exceptions.py +69 -0
- webscout/server/providers.py +286 -0
- webscout/server/request_models.py +131 -0
- webscout/server/request_processing.py +404 -0
- webscout/server/routes.py +642 -0
- webscout/server/server.py +351 -0
- webscout/server/ui_templates.py +1171 -0
- webscout/swiftcli/__init__.py +79 -95
- webscout/swiftcli/core/__init__.py +7 -7
- webscout/swiftcli/core/cli.py +574 -297
- webscout/swiftcli/core/context.py +98 -104
- webscout/swiftcli/core/group.py +268 -241
- webscout/swiftcli/decorators/__init__.py +28 -28
- webscout/swiftcli/decorators/command.py +243 -221
- webscout/swiftcli/decorators/options.py +247 -220
- webscout/swiftcli/decorators/output.py +392 -252
- webscout/swiftcli/exceptions.py +21 -21
- webscout/swiftcli/plugins/__init__.py +9 -9
- webscout/swiftcli/plugins/base.py +134 -135
- webscout/swiftcli/plugins/manager.py +269 -269
- webscout/swiftcli/utils/__init__.py +58 -59
- webscout/swiftcli/utils/formatting.py +251 -252
- webscout/swiftcli/utils/parsing.py +368 -267
- webscout/update_checker.py +280 -136
- webscout/utils.py +28 -14
- webscout/version.py +2 -1
- webscout/version.py.bak +3 -0
- webscout/zeroart/__init__.py +218 -135
- webscout/zeroart/base.py +70 -66
- webscout/zeroart/effects.py +155 -101
- webscout/zeroart/fonts.py +1799 -1239
- webscout-2026.1.19.dist-info/METADATA +638 -0
- webscout-2026.1.19.dist-info/RECORD +312 -0
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/WHEEL +1 -1
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/entry_points.txt +1 -1
- webscout/DWEBS.py +0 -520
- webscout/Extra/Act.md +0 -309
- webscout/Extra/GitToolkit/gitapi/README.md +0 -110
- webscout/Extra/autocoder/__init__.py +0 -9
- webscout/Extra/autocoder/autocoder.py +0 -1105
- webscout/Extra/autocoder/autocoder_utiles.py +0 -332
- webscout/Extra/gguf.md +0 -430
- webscout/Extra/weather.md +0 -281
- webscout/Litlogger/README.md +0 -10
- webscout/Litlogger/__init__.py +0 -15
- webscout/Litlogger/formats.py +0 -4
- webscout/Litlogger/handlers.py +0 -103
- webscout/Litlogger/levels.py +0 -13
- webscout/Litlogger/logger.py +0 -92
- webscout/Provider/AI21.py +0 -177
- webscout/Provider/AISEARCH/DeepFind.py +0 -254
- webscout/Provider/AISEARCH/felo_search.py +0 -202
- webscout/Provider/AISEARCH/genspark_search.py +0 -324
- webscout/Provider/AISEARCH/hika_search.py +0 -186
- webscout/Provider/AISEARCH/scira_search.py +0 -298
- webscout/Provider/Aitopia.py +0 -316
- webscout/Provider/AllenAI.py +0 -440
- webscout/Provider/Blackboxai.py +0 -791
- webscout/Provider/ChatGPTClone.py +0 -237
- webscout/Provider/ChatGPTGratis.py +0 -194
- webscout/Provider/Cloudflare.py +0 -324
- webscout/Provider/ExaChat.py +0 -358
- webscout/Provider/Flowith.py +0 -217
- webscout/Provider/FreeGemini.py +0 -250
- webscout/Provider/Glider.py +0 -225
- webscout/Provider/HF_space/__init__.py +0 -0
- webscout/Provider/HF_space/qwen_qwen2.py +0 -206
- webscout/Provider/HuggingFaceChat.py +0 -469
- webscout/Provider/Hunyuan.py +0 -283
- webscout/Provider/LambdaChat.py +0 -411
- webscout/Provider/Llama3.py +0 -259
- webscout/Provider/Nemotron.py +0 -218
- webscout/Provider/OLLAMA.py +0 -396
- webscout/Provider/OPENAI/BLACKBOXAI.py +0 -766
- webscout/Provider/OPENAI/Cloudflare.py +0 -378
- webscout/Provider/OPENAI/FreeGemini.py +0 -283
- webscout/Provider/OPENAI/NEMOTRON.py +0 -232
- webscout/Provider/OPENAI/Qwen3.py +0 -283
- webscout/Provider/OPENAI/api.py +0 -969
- webscout/Provider/OPENAI/c4ai.py +0 -373
- webscout/Provider/OPENAI/chatgptclone.py +0 -494
- webscout/Provider/OPENAI/copilot.py +0 -242
- webscout/Provider/OPENAI/flowith.py +0 -162
- webscout/Provider/OPENAI/freeaichat.py +0 -359
- webscout/Provider/OPENAI/mcpcore.py +0 -389
- webscout/Provider/OPENAI/multichat.py +0 -376
- webscout/Provider/OPENAI/opkfc.py +0 -496
- webscout/Provider/OPENAI/scirachat.py +0 -477
- webscout/Provider/OPENAI/standardinput.py +0 -433
- webscout/Provider/OPENAI/typegpt.py +0 -364
- webscout/Provider/OPENAI/uncovrAI.py +0 -463
- webscout/Provider/OPENAI/venice.py +0 -431
- webscout/Provider/OPENAI/yep.py +0 -382
- webscout/Provider/OpenGPT.py +0 -209
- webscout/Provider/Perplexitylabs.py +0 -415
- webscout/Provider/Reka.py +0 -214
- webscout/Provider/StandardInput.py +0 -290
- webscout/Provider/TTI/aiarta.py +0 -365
- webscout/Provider/TTI/artbit.py +0 -0
- webscout/Provider/TTI/fastflux.py +0 -200
- webscout/Provider/TTI/piclumen.py +0 -203
- webscout/Provider/TTI/pixelmuse.py +0 -225
- webscout/Provider/TTS/gesserit.py +0 -128
- webscout/Provider/TTS/sthir.py +0 -94
- webscout/Provider/TeachAnything.py +0 -229
- webscout/Provider/UNFINISHED/puterjs.py +0 -635
- webscout/Provider/UNFINISHED/test_lmarena.py +0 -119
- webscout/Provider/Venice.py +0 -258
- webscout/Provider/VercelAI.py +0 -253
- webscout/Provider/Writecream.py +0 -246
- webscout/Provider/WritingMate.py +0 -269
- webscout/Provider/asksteve.py +0 -220
- webscout/Provider/chatglm.py +0 -215
- webscout/Provider/copilot.py +0 -425
- webscout/Provider/freeaichat.py +0 -285
- webscout/Provider/granite.py +0 -235
- webscout/Provider/hermes.py +0 -266
- webscout/Provider/koala.py +0 -170
- webscout/Provider/lmarena.py +0 -198
- webscout/Provider/multichat.py +0 -364
- webscout/Provider/scira_chat.py +0 -299
- webscout/Provider/scnet.py +0 -243
- webscout/Provider/talkai.py +0 -194
- webscout/Provider/typegpt.py +0 -289
- webscout/Provider/uncovr.py +0 -368
- webscout/Provider/yep.py +0 -389
- webscout/litagent/Readme.md +0 -276
- webscout/litprinter/__init__.py +0 -59
- webscout/swiftcli/Readme.md +0 -323
- webscout/tempid.py +0 -128
- webscout/webscout_search.py +0 -1184
- webscout/webscout_search_async.py +0 -654
- webscout/yep_search.py +0 -347
- webscout/zeroart/README.md +0 -89
- webscout-8.2.9.dist-info/METADATA +0 -1033
- webscout-8.2.9.dist-info/RECORD +0 -289
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/top_level.txt +0 -0
webscout/sanitize.py
ADDED
@@ -0,0 +1,1514 @@

```python
"""Stream sanitization and processing utilities for handling various data formats.

This module provides utilities for processing streaming data from various sources,
including support for byte streams, text streams, JSON parsing, regex filtering,
marker-based extraction, and customizable output response formatting.
"""

import asyncio
import codecs
import functools
import json
import re
import sys
from itertools import chain
from typing import (
    Any,
    AsyncGenerator,
    AsyncIterable,
    Callable,
    Dict,
    Generator,
    Iterable,
    List,
    Literal,
    Optional,
    Union,
    overload,
)

# Expanded encoding types
EncodingType = Literal[
    "utf-8",
    "utf-16",
    "utf-32",
    "ascii",
    "latin1",
    "cp1252",
    "iso-8859-1",
    "iso-8859-2",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "gbk",
    "big5",
    "shift_jis",
    "euc-jp",
    "euc-kr",
]

# Public API
__all__ = [
    "sanitize_stream",
    "LITSTREAM",
    "sanitize_stream_decorator",
    "lit_streamer",
    "EncodingType",
]
```
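For orientation, a minimal usage sketch (illustrative, not part of the diff; it assumes the public `sanitize_stream` entry point declared at the end of this file dispatches to the synchronous path shown below and is importable as `webscout.sanitize`):

```python
from webscout.sanitize import sanitize_stream

# A typical SSE-style payload as it might arrive from an HTTP chunk iterator.
chunks = [
    'data: {"choices": [{"delta": {"content": "Hel"}}]}',
    'data: {"choices": [{"delta": {"content": "lo"}}]}',
    "data: [DONE]",
]

for event in sanitize_stream(
    chunks,
    intro_value="data:",      # prefix to strip from each line
    to_json=True,             # parse the remainder as JSON
    skip_markers=["[DONE]"],  # sentinel lines to drop entirely
    content_extractor=lambda d: d["choices"][0]["delta"].get("content")
    if isinstance(d, dict)
    else None,
):
    print(event, end="")      # -> "Hello"
```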
```python
def _compile_regexes(
    patterns: Optional[List[Union[str, re.Pattern[str]]]],
) -> Optional[List[re.Pattern[str]]]:
    """
    Compile regex patterns from strings or return compiled patterns as-is.

    Args:
        patterns: List of regex patterns as strings or compiled Pattern objects.

    Returns:
        List of compiled Pattern objects, or None if input is None.

    Raises:
        ValueError: If any pattern is invalid.
    """
    if not patterns:
        return None

    compiled_patterns = []
    for i, pattern in enumerate(patterns):
        try:
            if isinstance(pattern, str):
                compiled_patterns.append(re.compile(pattern))
            elif isinstance(pattern, re.Pattern):
                compiled_patterns.append(pattern)
            else:
                raise ValueError(
                    f"Pattern at index {i} must be a string or compiled regex pattern, "
                    f"got {type(pattern).__name__}"
                )
        except re.error as e:
            raise ValueError(f"Invalid regex pattern at index {i}: '{pattern}' - {e}")

    return compiled_patterns
```
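The helper's contract, sketched for illustration (not from the diff): string patterns are compiled, pre-compiled patterns pass through unchanged, and an invalid pattern surfaces as `ValueError` rather than a bare `re.error`:

```python
import re

pats = _compile_regexes([r"^data:", re.compile(r"\[DONE\]")])
assert all(isinstance(p, re.Pattern) for p in pats)

try:
    _compile_regexes(["("])   # unbalanced paren -> invalid regex
except ValueError as exc:
    print(exc)                # "Invalid regex pattern at index 0: '(' - ..."
```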
```python
def _process_chunk(
    chunk: str,
    intro_value: Optional[str],
    to_json: bool,
    skip_markers: List[str],
    strip_chars: Optional[str],
    yield_raw_on_error: bool,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
    skip_regexes: Optional[List[re.Pattern[str]]] = None,
    extract_regexes: Optional[List[re.Pattern[str]]] = None,
) -> Union[str, Dict[str, Any], None]:
    """
    Sanitizes and potentially parses a single chunk of text.

    This function performs several operations on the input chunk:
    - Removes a specified prefix (`intro_value`).
    - Strips leading/trailing characters (`strip_chars`).
    - Skips chunks matching specific markers (`skip_markers`).
    - Skips chunks matching regex patterns (`skip_regexes`).
    - Extracts content using regex capturing groups (`extract_regexes`).
    - Optionally parses the chunk as JSON (`to_json`).
    - Handles JSON parsing errors with an optional callback (`error_handler`).

    Args:
        chunk (str): The chunk of text to process.
        intro_value (str): The prefix to remove from the chunk.
        to_json (bool): If True, attempts to parse the chunk as JSON.
        skip_markers (List[str]): A list of markers; chunks matching these are skipped.
        strip_chars (Optional[str]): Characters to strip from the beginning and end of the chunk.
        yield_raw_on_error (bool): If True, returns the raw chunk when JSON parsing fails; otherwise, returns None.
        error_handler (Optional[Callable[[Exception, str], Optional[Any]]]): An optional callback function that is called when JSON parsing fails.
            It receives the exception and the sanitized chunk as arguments. It should return a value to yield instead of the raw chunk, or None to ignore.
        skip_regexes (Optional[List[Pattern[str]]]): A list of compiled regex patterns; chunks matching any of these are skipped.
        extract_regexes (Optional[List[Pattern[str]]]): A list of compiled regex patterns for extracting content using capturing groups.
    """
    if not isinstance(chunk, str):
        return None

    # Fast path for empty chunks
    if not chunk:
        return None

    # Use slicing for prefix removal (faster than startswith+slicing)
    sanitized_chunk = chunk
    if intro_value and len(chunk) >= len(intro_value) and chunk[: len(intro_value)] == intro_value:
        sanitized_chunk = chunk[len(intro_value) :]

    # Optimize string stripping operations
    if strip_chars is not None:
        sanitized_chunk = sanitized_chunk.strip(strip_chars)
    else:
        # lstrip() is faster than strip() when we only need leading whitespace removed
        sanitized_chunk = sanitized_chunk.lstrip()

    # Skip empty chunks and markers
    if not sanitized_chunk or any(marker == sanitized_chunk for marker in skip_markers):
        return None

    # Apply regex-based extraction first (if provided)
    if extract_regexes:
        extracted_content = None
        for regex in extract_regexes:
            match = regex.search(sanitized_chunk)
            if match:
                # If there are capturing groups, return the first group or all groups as a tuple
                if match.groups():
                    if len(match.groups()) == 1:
                        extracted_content = match.group(1)
                    else:
                        # Multiple groups - return as tuple converted to string for JSON compatibility
                        extracted_content = str(match.groups())
                else:
                    # No capturing groups, return the full match
                    extracted_content = match.group(0)
                break  # Use first matching extraction regex

        if extracted_content is None:
            if to_json:
                pass
            else:
                return None
        else:
            sanitized_chunk = extracted_content

    if skip_regexes:
        if any(regex.search(sanitized_chunk) for regex in skip_regexes):
            return None

    if to_json:
        try:
            # Only strip before JSON parsing if both boundaries are incorrect
            if (
                len(sanitized_chunk) >= 2
                and sanitized_chunk[0] not in "{["
                and sanitized_chunk[-1] not in "}]"
            ):
                sanitized_chunk = sanitized_chunk.strip()
            return json.loads(sanitized_chunk)
        except (json.JSONDecodeError, Exception) as e:
            if error_handler:
                try:
                    handled = error_handler(e, sanitized_chunk)
                    if handled is not None:
                        return handled
                except Exception:
                    pass
            return sanitized_chunk if yield_raw_on_error else None

    return sanitized_chunk
```
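To make the per-chunk pipeline concrete, a hypothetical walk-through of three representative SSE lines (illustrative only; the calls pass the positional parameters in the order declared above):

```python
# Prefixed JSON survives prefix removal and parses to a dict.
print(_process_chunk('data: {"delta": "hi"}', "data:", True, ["[DONE]"], None, True))
# -> {'delta': 'hi'}

# A marker line is dropped once the prefix and whitespace are stripped.
print(_process_chunk("data: [DONE]", "data:", True, ["[DONE]"], None, True))
# -> None

# Non-JSON falls back to the raw text because yield_raw_on_error is True.
print(_process_chunk("data: not json", "data:", True, [], None, True))
# -> 'not json'
```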
```python
def _decode_byte_stream(
    byte_iterator: Any,
    encoding: EncodingType = "utf-8",
    errors: str = "replace",
    buffer_size: int = 8192,
) -> Generator[str, None, None]:
    """
    Decodes a byte stream in realtime with flexible encoding support.

    This function takes an iterator of bytes and decodes it into a stream of strings
    using the specified character encoding. It handles encoding errors gracefully
    and can be tuned for performance with the `buffer_size` parameter.

    Args:
        byte_iterator (Iterable[bytes]): An iterator that yields chunks of bytes.
        encoding (EncodingType): The character encoding to use for decoding.
            Defaults to 'utf-8'. Supports a wide range of encodings, including:
            'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
            'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
            'shift_jis', 'euc-jp', 'euc-kr'.
        errors (str): Specifies how encoding errors should be handled.
            Options are 'strict' (raises an error), 'ignore' (skips the error), and
            'replace' (replaces the erroneous byte with a replacement character).
            Defaults to 'replace'.
        buffer_size (int): The size of the internal buffer used for decoding.
    """
    # Initialize decoder with the specified encoding
    try:
        decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
    except LookupError:
        # Fallback to utf-8 if the encoding is not supported
        decoder = codecs.getincrementaldecoder("utf-8")(errors=errors)

    # Process byte stream in realtime
    buffer = bytearray(buffer_size)
    buffer_view = memoryview(buffer)

    for chunk_bytes in byte_iterator:
        if not chunk_bytes:
            continue

        try:
            # Use buffer for processing if chunk size is appropriate
            if len(chunk_bytes) <= buffer_size:
                buffer[: len(chunk_bytes)] = chunk_bytes
                text = decoder.decode(buffer_view[: len(chunk_bytes)], final=False)
            else:
                text = decoder.decode(chunk_bytes, final=False)

            if text:
                yield text
        except UnicodeDecodeError:
            yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"

    # Final flush
    try:
        final_text = decoder.decode(b"", final=True)
        if final_text:
            yield final_text
    except UnicodeDecodeError:
        yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"


async def _decode_byte_stream_async(
    byte_iterator: AsyncIterable[bytes],
    encoding: EncodingType = "utf-8",
    errors: str = "replace",
    buffer_size: int = 8192,
) -> AsyncGenerator[str, None]:
    """
    Asynchronously decodes a byte stream with flexible encoding support.

    This function is the asynchronous counterpart to `_decode_byte_stream`. It takes
    an asynchronous iterator of bytes and decodes it into a stream of strings using
    the specified character encoding. It handles encoding errors gracefully and can
    be tuned for performance with the `buffer_size` parameter.

    Args:
        byte_iterator (AsyncIterable[bytes]): An asynchronous iterator that yields chunks of bytes.
        encoding (EncodingType): The character encoding to use for decoding.
            Defaults to 'utf-8'. Supports a wide range of encodings, including:
            'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
            'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
            'shift_jis', 'euc-jp', 'euc-kr'.
        errors (str): Specifies how encoding errors should be handled.
            Options are 'strict' (raises an error), 'ignore' (skips the error), and
            'replace' (replaces the erroneous byte with a replacement character).
            Defaults to 'replace'.
        buffer_size (int): The size of the internal buffer used for decoding.

    Yields:
        str: Decoded text chunks from the byte stream.
    """
    try:
        decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
    except LookupError:
        decoder = codecs.getincrementaldecoder("utf-8")(errors=errors)

    buffer = bytearray(buffer_size)
    buffer_view = memoryview(buffer)

    async for chunk_bytes in byte_iterator:
        if not chunk_bytes:
            continue
        try:
            if len(chunk_bytes) <= buffer_size:
                buffer[: len(chunk_bytes)] = chunk_bytes
                text = decoder.decode(buffer_view[: len(chunk_bytes)], final=False)
            else:
                text = decoder.decode(chunk_bytes, final=False)
            if text:
                yield text
        except UnicodeDecodeError:
            yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"

    try:
        final_text = decoder.decode(b"", final=True)
        if final_text:
            yield final_text
    except UnicodeDecodeError:
        yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
```
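Both decoders rely on `codecs`' incremental decoders, which hold an incomplete multi-byte sequence across chunk boundaries instead of corrupting it. A minimal sketch (illustrative, not part of the diff):

```python
# "é" is two bytes in UTF-8 (0xC3 0xA9); split it across two network chunks.
chunks = [b"caf\xc3", b"\xa9 au lait"]

print("".join(_decode_byte_stream(chunks)))  # -> "café au lait"
```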
```python
def _sanitize_stream_sync(
    data: Any,
    intro_value: Optional[str] = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = "utf-8",
    encoding_errors: str = "replace",
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
    skip_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    extract_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    raw: bool = False,
    output_formatter: Optional[Callable[[Any], Any]] = None,
) -> Generator[Any, None, None]:
    """
    Processes a stream of data (strings or bytes) in real-time, applying various transformations and filtering.

    This function is designed to handle streaming data, allowing for operations such as
    prefix removal, JSON parsing, skipping lines based on markers, regex-based filtering,
    and extracting specific content. It also supports custom error handling for JSON
    parsing failures and output response formatting.

    Args:
        data: String, iterable of strings, or iterable of bytes to process.
        intro_value: Prefix indicating the start of meaningful data.
        to_json: Parse the chunk as JSON if True.
        skip_markers: Lines containing any of these markers are skipped.
        strip_chars: Characters to strip from each line.
        start_marker: Begin processing only after this marker is found.
        end_marker: Stop processing once this marker is found.
        content_extractor: Optional callable to transform parsed content before yielding.
        yield_raw_on_error: Yield raw lines when JSON parsing fails.
        encoding: Byte stream encoding.
        encoding_errors: How to handle encoding errors.
        buffer_size: Buffer size for byte decoding.
        line_delimiter: Delimiter used to split incoming text into lines. ``None``
            uses ``str.splitlines()``.
        error_handler: Callback invoked with ``(Exception, str)`` when JSON
            parsing fails. If the callback returns a value, it is yielded instead of the raw line.
        skip_regexes: List of regex patterns (strings or compiled) for skipping lines that match.
        extract_regexes: List of regex patterns (strings or compiled) for extracting content using capturing groups.
        raw: If True, yields the raw response as returned by the API, chunk by chunk (no processing).
        output_formatter: Custom callable to format/transform each output item before yielding.

    Yields:
        Any: Processed data, which can be a string, a dictionary (if `to_json` is True),
        the result of `content_extractor`, or formatted by output_formatter.

    Raises:
        TypeError: If the input `data` is not a string or an iterable.
        ValueError: If any regex pattern is invalid.
    """
    # --- RAW MODE: yield each chunk exactly as returned by the API ---
    if raw:
        if isinstance(data, str):
            yield data
            return
        elif hasattr(data, "__iter__"):
            for chunk in data:
                if isinstance(chunk, (bytes, bytearray)):
                    yield chunk.decode(encoding, encoding_errors)
                elif chunk is not None:
                    yield chunk
            return
        else:
            if data is not None:
                yield data
            return
    # --- END RAW MODE ---

    # --- OUTPUT FORMATTING SETUP ---
    def _apply_output_format(item: Any) -> Any:
        """Apply output formatting to a processed item."""
        if output_formatter is not None:
            return output_formatter(item)
        return item

    # --- END OUTPUT FORMATTING SETUP ---

    effective_skip_markers = skip_markers or []
    # Compile regex patterns
    compiled_skip_regexes = _compile_regexes(skip_regexes)
    compiled_extract_regexes = _compile_regexes(extract_regexes)

    processing_active = start_marker is None
    buffer = ""
    found_start = False if start_marker else True
    line_iterator: Iterable[str]

    if isinstance(data, str):
        # If data is a string, decide whether to split it into lines
        # or treat it as an iterable containing a single chunk.
        temp_lines: List[str]
        if line_delimiter is None:  # Default: split by newlines if present
            if "\n" in data or "\r" in data:
                temp_lines = data.splitlines()
            else:
                temp_lines = [data]  # Treat as a single line/chunk
        elif line_delimiter in data:  # Custom delimiter found in string
            temp_lines = data.split(line_delimiter)
        else:  # Custom delimiter not found, or string is effectively a single segment
            temp_lines = [data]
        line_iterator = iter(temp_lines)
    elif hasattr(data, "__iter__"):  # data is an iterable (but not a string)
        _iter = iter(data)
        first_item = next(_iter, None)

        if first_item is None:  # Iterable was empty
            return

        # Reconstruct the full iterable including the first_item
        stream_input_iterable = chain([first_item], _iter)

        if isinstance(first_item, bytes):
            # Ensure stream_input_iterable is typed as Iterable[bytes] for _decode_byte_stream
            line_iterator = _decode_byte_stream(
                stream_input_iterable,
                encoding=encoding,
                errors=encoding_errors,
                buffer_size=buffer_size,
            )
        elif isinstance(first_item, str):
            # Ensure stream_input_iterable is typed as Iterable[str]
            line_iterator = stream_input_iterable
        else:
            raise TypeError(
                f"Iterable must yield strings or bytes, not {type(first_item).__name__}"
            )
    else:  # Not a string and not an iterable
        raise TypeError(f"Input must be a string or an iterable, not {type(data).__name__}")

    try:
        for line in line_iterator:
            if not line:
                continue
            buffer += line
            while True:
                # Look for start marker if needed
                if not found_start and start_marker:
                    idx = buffer.find(start_marker)
                    if idx != -1:
                        found_start = True
                        buffer = buffer[idx + len(start_marker) :]
                    else:
                        # Not found, keep buffering
                        buffer = buffer[-max(len(start_marker), 256) :]  # avoid unbounded growth
                        break
                # Look for end marker if needed
                if found_start and end_marker:
                    idx = buffer.find(end_marker)
                    if idx != -1:
                        chunk = buffer[:idx]
                        buffer = buffer[idx + len(end_marker) :]
                        processing_active = False
                    else:
                        chunk = buffer
                        buffer = ""
                        processing_active = True
                    # Process chunk if we are in active region
                    if chunk and processing_active:
                        for subline in (
                            chunk.split(line_delimiter)
                            if line_delimiter is not None
                            else chunk.splitlines()
                        ):
                            use_extract_in_process = (
                                compiled_extract_regexes if not content_extractor else None
                            )

                            result = _process_chunk(
                                subline,
                                intro_value,
                                to_json,
                                effective_skip_markers,
                                strip_chars,
                                yield_raw_on_error,
                                error_handler,
                                compiled_skip_regexes,
                                use_extract_in_process,
                            )
                            if result is None:
                                continue
                            if content_extractor:
                                try:
                                    final_content = content_extractor(result)
                                    if final_content is not None:
                                        if compiled_extract_regexes and isinstance(
                                            final_content, str
                                        ):
                                            extracted = None
                                            for regex in compiled_extract_regexes:
                                                match = regex.search(final_content)
                                                if match:
                                                    if match.groups():
                                                        extracted = (
                                                            match.group(1)
                                                            if len(match.groups()) == 1
                                                            else str(match.groups())
                                                        )
                                                    else:
                                                        extracted = match.group(0)
                                                    break
                                            if extracted is not None:
                                                yield _apply_output_format(extracted)
                                        else:
                                            yield _apply_output_format(final_content)
                                except Exception:
                                    pass
                            else:
                                yield _apply_output_format(result)
                    if not processing_active:
                        found_start = False
                    if idx == -1:
                        break
                elif found_start:
                    # No end marker, process all buffered content
                    chunk = buffer
                    buffer = ""
                    if chunk:
                        for subline in (
                            chunk.split(line_delimiter)
                            if line_delimiter is not None
                            else chunk.splitlines()
                        ):
                            use_extract_in_process = (
                                compiled_extract_regexes if not content_extractor else None
                            )

                            result = _process_chunk(
                                subline,
                                intro_value,
                                to_json,
                                effective_skip_markers,
                                strip_chars,
                                yield_raw_on_error,
                                error_handler,
                                compiled_skip_regexes,
                                use_extract_in_process,
                            )
                            if result is None:
                                continue
                            if content_extractor:
                                try:
                                    final_content = content_extractor(result)
                                    if final_content is not None:
                                        if compiled_extract_regexes and isinstance(
                                            final_content, str
                                        ):
                                            extracted = None
                                            for regex in compiled_extract_regexes:
                                                match = regex.search(final_content)
                                                if match:
                                                    if match.groups():
                                                        extracted = (
                                                            match.group(1)
                                                            if len(match.groups()) == 1
                                                            else str(match.groups())
                                                        )
                                                    else:
                                                        extracted = match.group(0)
                                                    break
                                            if extracted is not None:
                                                yield _apply_output_format(extracted)
                                        else:
                                            yield _apply_output_format(final_content)
                                except Exception:
                                    pass
                            else:
                                yield _apply_output_format(result)
                    break
                else:
                    break
    except Exception as e:
        print(f"Stream processing error: {e}", file=sys.stderr)
```
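A sketch of the start/end marker buffering (illustrative, not part of the diff), assuming the markers arrive as their own stream chunks:

```python
chunks = ["noise ", "<json>", '{"a": 1}', "</json>", " more noise"]

items = list(_sanitize_stream_sync(
    chunks,
    intro_value=None,        # no prefix to strip
    to_json=True,
    start_marker="<json>",
    end_marker="</json>",
))
print(items)                 # -> [{'a': 1}]
```

Note that, as the `if chunk and processing_active:` branch suggests, any text sharing a buffered flush with the end marker is discarded once `processing_active` flips to False, so marker placement relative to chunk boundaries matters.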
async def _sanitize_stream_async(
|
|
636
|
+
data: Any,
|
|
637
|
+
intro_value: Optional[str] = "data:",
|
|
638
|
+
to_json: bool = True,
|
|
639
|
+
skip_markers: Optional[List[str]] = None,
|
|
640
|
+
strip_chars: Optional[str] = None,
|
|
641
|
+
start_marker: Optional[str] = None,
|
|
642
|
+
end_marker: Optional[str] = None,
|
|
643
|
+
content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
|
|
644
|
+
yield_raw_on_error: bool = True,
|
|
645
|
+
encoding: EncodingType = "utf-8",
|
|
646
|
+
encoding_errors: str = "replace",
|
|
647
|
+
buffer_size: int = 8192,
|
|
648
|
+
line_delimiter: Optional[str] = None,
|
|
649
|
+
error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
|
|
650
|
+
skip_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
|
|
651
|
+
extract_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
|
|
652
|
+
raw: bool = False,
|
|
653
|
+
output_formatter: Optional[Callable[[Any], Any]] = None,
|
|
654
|
+
) -> AsyncGenerator[Any, None]:
|
|
655
|
+
"""
|
|
656
|
+
Asynchronously processes a stream of data (strings or bytes), applying transformations and filtering.
|
|
657
|
+
|
|
658
|
+
This function is the asynchronous counterpart to `_sanitize_stream_sync`. It handles
|
|
659
|
+
streaming data, allowing for operations such as prefix removal, JSON parsing,
|
|
660
|
+
skipping lines based on markers, regex-based filtering, and extracting specific content.
|
|
661
|
+
It also supports custom error handling for JSON parsing failures and output response formatting.
|
|
662
|
+
|
|
663
|
+
Args:
|
|
664
|
+
data: String, iterable of strings, or iterable of bytes to process.
|
|
665
|
+
intro_value: Prefix indicating the start of meaningful data.
|
|
666
|
+
to_json: Parse JSON content if ``True``.
|
|
667
|
+
skip_markers: Lines containing any of these markers are skipped.
|
|
668
|
+
strip_chars: Characters to strip from each line.
|
|
669
|
+
start_marker: Begin processing only after this marker is found.
|
|
670
|
+
end_marker: Stop processing once this marker is found.
|
|
671
|
+
content_extractor: Optional callable to transform parsed content before yielding.
|
|
672
|
+
yield_raw_on_error: Yield raw lines when JSON parsing fails.
|
|
673
|
+
encoding: Byte stream encoding.
|
|
674
|
+
encoding_errors: How to handle encoding errors.
|
|
675
|
+
buffer_size: Buffer size for byte decoding.
|
|
676
|
+
line_delimiter: Delimiter used to split incoming text into lines. ``None`` uses ``str.splitlines()``.
|
|
677
|
+
error_handler: Callback invoked with ``(Exception, str)`` when JSON parsing fails. If the callback returns a value, it is yielded in place of the raw line.
|
|
678
|
+
skip_regexes: List of regex patterns (strings or compiled) for skipping lines that match.
|
|
679
|
+
extract_regexes: List of regex patterns (strings or compiled) for extracting content using capturing groups.
|
|
680
|
+
raw: If True, yields the raw response as returned by the API, chunk by chunk (no processing).
|
|
681
|
+
output_formatter: Custom callable to format/transform each output item before yielding.
|
|
682
|
+
"""
|
|
683
|
+
# --- RAW MODE: yield each chunk exactly as returned by the API ---
|
|
684
|
+
if raw:
|
|
685
|
+
if isinstance(data, str):
|
|
686
|
+
yield data
|
|
687
|
+
return
|
|
688
|
+
elif hasattr(data, "__aiter__"):
|
|
689
|
+
async for chunk in data:
|
|
690
|
+
if isinstance(chunk, (bytes, bytearray)):
|
|
691
|
+
yield chunk.decode(encoding, encoding_errors)
|
|
692
|
+
elif chunk is not None:
|
|
693
|
+
yield chunk
|
|
694
|
+
return
|
|
695
|
+
elif hasattr(data, "__iter__"):
|
|
696
|
+
for chunk in data:
|
|
697
|
+
if isinstance(chunk, (bytes, bytearray)):
|
|
698
|
+
yield chunk.decode(encoding, encoding_errors)
|
|
699
|
+
elif chunk is not None:
|
|
700
|
+
yield chunk
|
|
701
|
+
return
|
|
702
|
+
else:
|
|
703
|
+
if data is not None:
|
|
704
|
+
yield data
|
|
705
|
+
return
|
|
706
|
+
# --- END RAW MODE ---
|
|
707
|
+
|
|
708
|
+
if isinstance(data, str):
|
|
709
|
+
for item in _sanitize_stream_sync(
|
|
710
|
+
data,
|
|
711
|
+
intro_value=intro_value,
|
|
712
|
+
to_json=to_json,
|
|
713
|
+
skip_markers=skip_markers,
|
|
714
|
+
strip_chars=strip_chars,
|
|
715
|
+
start_marker=start_marker,
|
|
716
|
+
end_marker=end_marker,
|
|
717
|
+
content_extractor=content_extractor,
|
|
718
|
+
yield_raw_on_error=yield_raw_on_error,
|
|
719
|
+
encoding=encoding,
|
|
720
|
+
encoding_errors=encoding_errors,
|
|
721
|
+
buffer_size=buffer_size,
|
|
722
|
+
line_delimiter=line_delimiter,
|
|
723
|
+
error_handler=error_handler,
|
|
724
|
+
skip_regexes=skip_regexes,
|
|
725
|
+
extract_regexes=extract_regexes,
|
|
726
|
+
raw=raw,
|
|
727
|
+
output_formatter=output_formatter,
|
|
728
|
+
):
|
|
729
|
+
yield item
|
|
730
|
+
return
|
|
731
|
+
|
|
732
|
+
if not hasattr(data, "__aiter__"):
|
|
733
|
+
# Fallback to synchronous processing if possible
|
|
734
|
+
for item in _sanitize_stream_sync(
|
|
735
|
+
data,
|
|
736
|
+
intro_value=intro_value,
|
|
737
|
+
to_json=to_json,
|
|
738
|
+
skip_markers=skip_markers,
|
|
739
|
+
strip_chars=strip_chars,
|
|
740
|
+
start_marker=start_marker,
|
|
741
|
+
end_marker=end_marker,
|
|
742
|
+
content_extractor=content_extractor,
|
|
743
|
+
yield_raw_on_error=yield_raw_on_error,
|
|
744
|
+
encoding=encoding,
|
|
745
|
+
encoding_errors=encoding_errors,
|
|
746
|
+
buffer_size=buffer_size,
|
|
747
|
+
line_delimiter=line_delimiter,
|
|
748
|
+
error_handler=error_handler,
|
|
749
|
+
skip_regexes=skip_regexes,
|
|
750
|
+
extract_regexes=extract_regexes,
|
|
751
|
+
raw=raw,
|
|
752
|
+
output_formatter=output_formatter,
|
|
753
|
+
):
|
|
754
|
+
yield item
|
|
755
|
+
return
|
|
756
|
+
|
|
757
|
+
# --- OUTPUT FORMATTING SETUP FOR ASYNC ---
|
|
758
|
+
def _apply_output_format(item: Any) -> Any:
|
|
759
|
+
"""Apply output formatting to a processed item."""
|
|
760
|
+
if output_formatter is not None:
|
|
761
|
+
return output_formatter(item)
|
|
762
|
+
return item
|
|
763
|
+
|
|
764
|
+
# --- END OUTPUT FORMATTING SETUP ---
|
|
765
|
+
|
|
766
|
+
effective_skip_markers = skip_markers or []
|
|
767
|
+
# Compile regex patterns
|
|
768
|
+
compiled_skip_regexes = _compile_regexes(skip_regexes)
|
|
769
|
+
compiled_extract_regexes = _compile_regexes(extract_regexes)
|
|
770
|
+
|
|
771
|
+
processing_active = start_marker is None
|
|
772
|
+
buffer = ""
|
|
773
|
+
found_start = False if start_marker else True
|
|
774
|
+
|
|
775
|
+
iterator = data.__aiter__()
|
|
776
|
+
first_item = None
|
|
777
|
+
async for first_item in iterator:
|
|
778
|
+
break
|
|
779
|
+
if first_item is None:
|
|
780
|
+
return
|
|
781
|
+
|
|
782
|
+
async def _chain(first: Any, it: AsyncIterable[Any]) -> AsyncGenerator[Any, None]:
|
|
783
|
+
"""Chain the first item with the rest of the async iterator."""
|
|
784
|
+
yield first
|
|
785
|
+
async for x in it:
|
|
786
|
+
yield x
|
|
787
|
+
|
|
788
|
+
stream: AsyncGenerator[Any, None] = _chain(first_item, iterator)
|
|
789
|
+
|
|
790
|
+
if isinstance(first_item, bytes):
|
|
791
|
+
line_iterator = _decode_byte_stream_async(
|
|
792
|
+
stream,
|
|
793
|
+
encoding=encoding,
|
|
794
|
+
errors=encoding_errors,
|
|
795
|
+
buffer_size=buffer_size,
|
|
796
|
+
)
|
|
797
|
+
elif isinstance(first_item, str):
|
|
798
|
+
line_iterator = stream
|
|
799
|
+
else:
|
|
800
|
+
raise TypeError(f"Stream must yield strings or bytes, not {type(first_item).__name__}")
|
|
801
|
+
|
|
802
|
+
try:
|
|
803
        async for line in line_iterator:
            if not line:
                continue
            buffer += line
            while True:
                # Look for start marker if needed
                if not found_start and start_marker:
                    idx = buffer.find(start_marker)
                    if idx != -1:
                        found_start = True
                        buffer = buffer[idx + len(start_marker) :]
                    else:
                        # Not found, keep buffering
                        buffer = buffer[-max(len(start_marker), 256) :]
                        break
                # Look for end marker if needed
                if found_start and end_marker:
                    idx = buffer.find(end_marker)
                    if idx != -1:
                        chunk = buffer[:idx]
                        buffer = buffer[idx + len(end_marker) :]
                        processing_active = False
                    else:
                        chunk = buffer
                        buffer = ""
                        processing_active = True
                    # Process chunk if we are in active region
                    if chunk and processing_active:
                        for subline in (
                            chunk.split(line_delimiter)
                            if line_delimiter is not None
                            else chunk.splitlines()
                        ):
                            use_extract_in_process = (
                                compiled_extract_regexes if not content_extractor else None
                            )

                            result = _process_chunk(
                                subline,
                                intro_value,
                                to_json,
                                effective_skip_markers,
                                strip_chars,
                                yield_raw_on_error,
                                error_handler,
                                compiled_skip_regexes,
                                use_extract_in_process,
                            )
                            if result is None:
                                continue
                            if content_extractor:
                                try:
                                    final_content = content_extractor(result)
                                    if final_content is not None:
                                        if compiled_extract_regexes and isinstance(
                                            final_content, str
                                        ):
                                            extracted = None
                                            for regex in compiled_extract_regexes:
                                                match = regex.search(final_content)
                                                if match:
                                                    if match.groups():
                                                        extracted = (
                                                            match.group(1)
                                                            if len(match.groups()) == 1
                                                            else str(match.groups())
                                                        )
                                                    else:
                                                        extracted = match.group(0)
                                                    break
                                            if extracted is not None:
                                                yield _apply_output_format(extracted)
                                        else:
                                            yield _apply_output_format(final_content)
                                except Exception:
                                    pass
                            else:
                                yield _apply_output_format(result)
                    if not processing_active:
                        found_start = False
                    if idx == -1:
                        break
                elif found_start:
                    # No end marker, process all buffered content
                    chunk = buffer
                    buffer = ""
                    if chunk:
                        for subline in (
                            chunk.split(line_delimiter)
                            if line_delimiter is not None
                            else chunk.splitlines()
                        ):
                            use_extract_in_process = (
                                compiled_extract_regexes if not content_extractor else None
                            )

                            result = _process_chunk(
                                subline,
                                intro_value,
                                to_json,
                                effective_skip_markers,
                                strip_chars,
                                yield_raw_on_error,
                                error_handler,
                                compiled_skip_regexes,
                                use_extract_in_process,
                            )
                            if result is None:
                                continue
                            if content_extractor:
                                try:
                                    final_content = content_extractor(result)
                                    if final_content is not None:
                                        # Apply extract_regexes to extracted content if provided
                                        if compiled_extract_regexes and isinstance(
                                            final_content, str
                                        ):
                                            extracted = None
                                            for regex in compiled_extract_regexes:
                                                match = regex.search(final_content)
                                                if match:
                                                    if match.groups():
                                                        extracted = (
                                                            match.group(1)
                                                            if len(match.groups()) == 1
                                                            else str(match.groups())
                                                        )
                                                    else:
                                                        extracted = match.group(0)
                                                    break
                                            if extracted is not None:
                                                yield _apply_output_format(extracted)
                                        else:
                                            yield _apply_output_format(final_content)
                                except Exception:
                                    pass
                            else:
                                yield _apply_output_format(result)
                    break
                else:
                    break
    except Exception as e:
        print(f"Async stream processing error: {e}", file=sys.stderr)

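# A minimal, self-contained sketch (illustrative, not part of the module API) of
# the marker-windowing idea used in the loop above: buffer incoming chunks and
# keep only the region between a start and an end marker. A real implementation,
# like the one above, must also re-buffer so markers split across chunk
# boundaries are still found; this sketch skips that for brevity.
def _example_marker_window(lines, start_marker="<think>", end_marker="</think>"):
    """Yield only the text between start_marker and end_marker (illustrative)."""
    buffer = ""
    found_start = False
    for line in lines:
        buffer += line
        if not found_start:
            idx = buffer.find(start_marker)
            if idx == -1:
                continue  # still waiting for the start marker
            found_start = True
            buffer = buffer[idx + len(start_marker) :]
        end_idx = buffer.find(end_marker)
        if end_idx != -1:
            yield buffer[:end_idx]  # emit the tail of the region and stop
            return
        if buffer:
            yield buffer  # inside the region: emit what we have so far
        buffer = ""

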
@overload
def sanitize_stream(
    data: Union[
        str,
        bytes,
        Iterable[str],
        Iterable[bytes],
        dict,
        list,
        int,
        float,
        bool,
        None,
    ],
    intro_value: Optional[str] = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = "utf-8",
    encoding_errors: str = "replace",
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
    skip_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    extract_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    object_mode: Literal["as_is", "json", "str"] = "json",
    raw: bool = False,
    output_formatter: Optional[Callable[[Any], Any]] = None,
) -> Generator[Any, None, None]: ...


@overload
def sanitize_stream(
    data: Union[
        AsyncIterable[str],
        AsyncIterable[bytes],
    ],
    intro_value: Optional[str] = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = "utf-8",
    encoding_errors: str = "replace",
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
    skip_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    extract_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    object_mode: Literal["as_is", "json", "str"] = "json",
    raw: bool = False,
    output_formatter: Optional[Callable[[Any], Any]] = None,
) -> AsyncGenerator[Any, None]: ...


def sanitize_stream(
    data: Any,
    intro_value: Optional[str] = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = "utf-8",
    encoding_errors: str = "replace",
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
    skip_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    extract_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    object_mode: Literal["as_is", "json", "str"] = "json",
    raw: bool = False,
    output_formatter: Optional[Callable[[Any], Any]] = None,
) -> Union[Generator[Any, None, None], AsyncGenerator[Any, None]]:
"""
|
|
1032
|
+
Processes streaming data (strings or bytes) in either synchronous or asynchronous mode.
|
|
1033
|
+
Now supports non-iterable and miscellaneous input types (dict, list, int, float, bool, None).
|
|
1034
|
+
Includes regex-based content filtering, extraction capabilities, and customizable output response formatting.
|
|
1035
|
+
|
|
1036
|
+
Args:
|
|
1037
|
+
data: The data to be processed. Can be a string, bytes, a synchronous iterable of strings or bytes,
|
|
1038
|
+
an asynchronous iterable of strings or bytes, or a single object (dict, list, int, float, bool, None).
|
|
1039
|
+
intro_value (str): Prefix indicating the start of meaningful data. Defaults to "data:".
|
|
1040
|
+
to_json (bool): Parse JSON content if ``True``. Defaults to True.
|
|
1041
|
+
skip_markers (Optional[List[str]]): Lines containing any of these markers are skipped. Defaults to None.
|
|
1042
|
+
strip_chars (Optional[str]): Characters to strip from each line. Defaults to None.
|
|
1043
|
+
start_marker (Optional[str]): Begin processing only after this marker is found. Defaults to None.
|
|
1044
|
+
end_marker (Optional[str]): Stop processing once this marker is found. Defaults to None.
|
|
1045
|
+
content_extractor (Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]]):
|
|
1046
|
+
Optional callable to transform parsed content before yielding. Defaults to None.
|
|
1047
|
+
yield_raw_on_error (bool): Yield raw lines when JSON parsing fails. Defaults to True.
|
|
1048
|
+
encoding (EncodingType): Byte stream encoding. Defaults to "utf-8".
|
|
1049
|
+
encoding_errors (str): How to handle encoding errors. Defaults to "replace".
|
|
1050
|
+
buffer_size (int): Buffer size for byte decoding. Defaults to 8192.
|
|
1051
|
+
line_delimiter (Optional[str]): Delimiter used to split incoming text into lines.
|
|
1052
|
+
``None`` uses ``str.splitlines()``. Defaults to None.
|
|
1053
|
+
error_handler (Optional[Callable[[Exception, str], Optional[Any]]]):
|
|
1054
|
+
Callback invoked with ``(Exception, str)`` when JSON parsing fails.
|
|
1055
|
+
If the callback returns a value, it is yielded in place of the raw line. Defaults to None.
|
|
1056
|
+
skip_regexes (Optional[List[Union[str, Pattern[str]]]]): List of regex patterns (strings or compiled)
|
|
1057
|
+
for skipping lines that match any pattern. Defaults to None.
|
|
1058
|
+
extract_regexes (Optional[List[Union[str, Pattern[str]]]]): List of regex patterns (strings or compiled)
|
|
1059
|
+
for extracting content using capturing groups. If multiple groups are captured, they are returned as a tuple string. Defaults to None.
|
|
1060
|
+
object_mode (Literal["as_is", "json", "str"]): How to handle non-string, non-iterable objects.
|
|
1061
|
+
"json" (default) yields as JSON string, "str" yields as str(obj), "as_is" yields the object as-is.
|
|
1062
|
+
raw (bool): If True, yields the raw response as returned by the API, chunk by chunk (no splitting or joining).
|
|
1063
|
+
output_formatter (Optional[Callable[[Any], Any]]): Custom callable to format/transform each output item
|
|
1064
|
+
before yielding. Use this to structure output into any desired format (e.g., OpenAI-like responses,
|
|
1065
|
+
custom dictionaries, etc.). The formatter receives the processed content and returns the formatted output.
|
|
1066
|
+
|
|
1067
|
+
Returns:
|
|
1068
|
+
Union[Generator[Any, None, None], AsyncGenerator[Any, None]]:
|
|
1069
|
+
A generator or an asynchronous generator yielding the processed data, or raw data if raw=True,
|
|
1070
|
+
optionally transformed by output_formatter.
|
|
1071
|
+
|
|
1072
|
+
Raises:
|
|
1073
|
+
ValueError: If any regex pattern is invalid.
|
|
1074
|
+
|
|
1075
|
+
Examples:
|
|
1076
|
+
# Use custom formatter for simple dict structure
|
|
1077
|
+
>>> def my_formatter(content):
|
|
1078
|
+
... return {'text': content, 'timestamp': time.time()}
|
|
1079
|
+
>>> for chunk in sanitize_stream(data, output_formatter=my_formatter):
|
|
1080
|
+
... print(chunk)
|
|
1081
|
+
|
|
1082
|
+
# Format as message with role
|
|
1083
|
+
>>> def message_formatter(content):
|
|
1084
|
+
... return {'role': 'assistant', 'content': content}
|
|
1085
|
+
>>> for chunk in sanitize_stream(data, output_formatter=message_formatter):
|
|
1086
|
+
... print(chunk)
|
|
1087
|
+
"""
|
|
1088
|
+
    if raw:

        def _raw_passthrough_sync(source_iter: Iterable[Any]) -> Generator[Any, None, None]:
            """Pass through sync iterable, decoding bytes to strings."""
            for chunk in source_iter:
                if isinstance(chunk, (bytes, bytearray)):
                    yield chunk.decode(encoding, encoding_errors)
                elif chunk is not None:
                    yield chunk
                # Skip None chunks entirely

        async def _raw_passthrough_async(source_aiter: AsyncIterable[Any]) -> AsyncGenerator[Any, None]:
            """Pass through async iterable, decoding bytes to strings."""
            async for chunk in source_aiter:
                if isinstance(chunk, (bytes, bytearray)):
                    # Decode bytes preserving all whitespace and newlines
                    yield chunk.decode(encoding, encoding_errors)
                elif chunk is not None:
                    yield chunk

        if hasattr(data, "__iter__") and not isinstance(data, (str, bytes)):
            return _raw_passthrough_sync(data)
        # Async iterable
        if hasattr(data, "__aiter__"):
            return _raw_passthrough_async(data)
        # Single string or bytes
        if isinstance(data, (bytes, bytearray)):

            def _yield_single_bytes() -> Generator[str, None, None]:
                yield data.decode(encoding, encoding_errors)

            return _yield_single_bytes()
        else:

            def _yield_single_any() -> Generator[Any, None, None]:
                if data is not None:
                    yield data

            return _yield_single_any()
    # --- END RAW MODE ---

    text_attr = getattr(data, "text", None)
    content_attr = getattr(data, "content", None)

    # Handle None
    if data is None:

        def _empty_gen() -> Generator[None, None, None]:
            if False:
                yield None

        return _empty_gen()

    # Handle bytes directly
    if isinstance(data, bytes):
        try:
            payload = data.decode(encoding, encoding_errors)
        except Exception:
            payload = str(data)
        return _sanitize_stream_sync(
            payload,
            intro_value,
            to_json,
            skip_markers,
            strip_chars,
            start_marker,
            end_marker,
            content_extractor,
            yield_raw_on_error,
            encoding,
            encoding_errors,
            buffer_size,
            line_delimiter,
            error_handler,
            skip_regexes,
            extract_regexes,
            raw,
            output_formatter,
        )

    # Handle string directly
    if isinstance(data, str):
        return _sanitize_stream_sync(
            data,
            intro_value,
            to_json,
            skip_markers,
            strip_chars,
            start_marker,
            end_marker,
            content_extractor,
            yield_raw_on_error,
            encoding,
            encoding_errors,
            buffer_size,
            line_delimiter,
            error_handler,
            skip_regexes,
            extract_regexes,
            raw,
            output_formatter,
        )

    # Handle dict, list, int, float, bool (non-iterable, non-string/bytes)
    if isinstance(data, (dict, list, int, float, bool)):
        if object_mode == "as_is":

            def _as_is_gen() -> Generator[Any, None, None]:
                yield data

            return _as_is_gen()
        elif object_mode == "str":
            return _sanitize_stream_sync(
                str(data),
                intro_value,
                to_json,
                skip_markers,
                strip_chars,
                start_marker,
                end_marker,
                content_extractor,
                yield_raw_on_error,
                encoding,
                encoding_errors,
                buffer_size,
                line_delimiter,
                error_handler,
                skip_regexes,
                extract_regexes,
                raw,
                output_formatter,
            )
        else:  # "json"
            try:
                json_str = json.dumps(data)
            except Exception:
                json_str = str(data)
            return _sanitize_stream_sync(
                json_str,
                intro_value,
                to_json,
                skip_markers,
                strip_chars,
                start_marker,
                end_marker,
                content_extractor,
                yield_raw_on_error,
                encoding,
                encoding_errors,
                buffer_size,
                line_delimiter,
                error_handler,
                skip_regexes,
                extract_regexes,
                raw,
                output_formatter,
            )

    # Handle file-like objects (optional, treat as string if .read exists)
    if hasattr(data, "read") and callable(data.read):
        try:
            file_content = data.read()
            if isinstance(file_content, bytes):
                file_content = file_content.decode(encoding, encoding_errors)
            return _sanitize_stream_sync(
                file_content,
                intro_value,
                to_json,
                skip_markers,
                strip_chars,
                start_marker,
                end_marker,
                content_extractor,
                yield_raw_on_error,
                encoding,
                encoding_errors,
                buffer_size,
                line_delimiter,
                error_handler,
                skip_regexes,
                extract_regexes,
                raw,
                output_formatter,
            )
        except Exception:
            pass  # fallback to next

    # Handle .text or .content attributes
    if isinstance(text_attr, str):
        payload = text_attr
        return _sanitize_stream_sync(
            payload,
            intro_value,
            to_json,
            skip_markers,
            strip_chars,
            start_marker,
            end_marker,
            content_extractor,
            yield_raw_on_error,
            encoding,
            encoding_errors,
            buffer_size,
            line_delimiter,
            error_handler,
            skip_regexes,
            extract_regexes,
            raw,
            output_formatter,
        )
    elif isinstance(content_attr, bytes):
        try:
            payload = content_attr.decode(encoding, encoding_errors)
        except Exception:
            payload = str(content_attr)
        return _sanitize_stream_sync(
            payload,
            intro_value,
            to_json,
            skip_markers,
            strip_chars,
            start_marker,
            end_marker,
            content_extractor,
            yield_raw_on_error,
            encoding,
            encoding_errors,
            buffer_size,
            line_delimiter,
            error_handler,
            skip_regexes,
            extract_regexes,
            raw,
            output_formatter,
        )

    # Handle async iterables
    if hasattr(data, "__aiter__"):
        return _sanitize_stream_async(
            data,
            intro_value,
            to_json,
            skip_markers,
            strip_chars,
            start_marker,
            end_marker,
            content_extractor,
            yield_raw_on_error,
            encoding,
            encoding_errors,
            buffer_size,
            line_delimiter,
            error_handler,
            skip_regexes,
            extract_regexes,
            raw,
            output_formatter,
        )
    # Handle sync iterables (but not strings/bytes)
    if hasattr(data, "__iter__"):
        return _sanitize_stream_sync(
            data,
            intro_value,
            to_json,
            skip_markers,
            strip_chars,
            start_marker,
            end_marker,
            content_extractor,
            yield_raw_on_error,
            encoding,
            encoding_errors,
            buffer_size,
            line_delimiter,
            error_handler,
            skip_regexes,
            extract_regexes,
            raw,
            output_formatter,
        )
    # Fallback: treat as string
    return _sanitize_stream_sync(
        str(data),
        intro_value,
        to_json,
        skip_markers,
        strip_chars,
        start_marker,
        end_marker,
        content_extractor,
        yield_raw_on_error,
        encoding,
        encoding_errors,
        buffer_size,
        line_delimiter,
        error_handler,
        skip_regexes,
        extract_regexes,
        raw,
        output_formatter,
    )


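# Usage sketch (illustrative; the event payloads and field names below are
# invented): feed sanitize_stream an SSE-style iterable and pull one field out
# of each JSON event, then shape the output with output_formatter.
def _example_sse_usage() -> None:
    events = [
        'data: {"text": "Hello"}',
        "data: [DONE]",
    ]
    for chunk in sanitize_stream(
        events,
        intro_value="data:",
        to_json=True,
        skip_markers=["[DONE]"],
        content_extractor=lambda d: d.get("text") if isinstance(d, dict) else None,
        output_formatter=lambda text: {"role": "assistant", "content": text},
    ):
        print(chunk)  # expected shape: {'role': 'assistant', 'content': 'Hello'}

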
# --- Decorator version of sanitize_stream ---


def _sanitize_stream_decorator(
    _func=None,
    *,
    intro_value: Optional[str] = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = "utf-8",
    encoding_errors: str = "replace",
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
    skip_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    extract_regexes: Optional[List[Union[str, re.Pattern[str]]]] = None,
    object_mode: Literal["as_is", "json", "str"] = "json",
    raw: bool = False,
    output_formatter: Optional[Callable[[Any], Any]] = None,
):
    """
    Decorator form of sanitize_stream. Can be used bare (@lit_streamer) or with
    arguments (@lit_streamer(...)). All arguments are the same as sanitize_stream(),
    including output_formatter.
    """

    def decorator(func) -> Callable:
        if asyncio.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs) -> AsyncGenerator[Any, None]:
                result = await func(*args, **kwargs)
                return sanitize_stream(
                    result,
                    intro_value=intro_value,
                    to_json=to_json,
                    skip_markers=skip_markers,
                    strip_chars=strip_chars,
                    start_marker=start_marker,
                    end_marker=end_marker,
                    content_extractor=content_extractor,
                    yield_raw_on_error=yield_raw_on_error,
                    encoding=encoding,
                    encoding_errors=encoding_errors,
                    buffer_size=buffer_size,
                    line_delimiter=line_delimiter,
                    error_handler=error_handler,
                    skip_regexes=skip_regexes,
                    extract_regexes=extract_regexes,
                    object_mode=object_mode,
                    raw=raw,
                    output_formatter=output_formatter,
                )

            return async_wrapper
        else:

            @functools.wraps(func)
            def sync_wrapper(*args, **kwargs) -> Generator[Any, None, None]:
                result = func(*args, **kwargs)
                return sanitize_stream(
                    result,
                    intro_value=intro_value,
                    to_json=to_json,
                    skip_markers=skip_markers,
                    strip_chars=strip_chars,
                    start_marker=start_marker,
                    end_marker=end_marker,
                    content_extractor=content_extractor,
                    yield_raw_on_error=yield_raw_on_error,
                    encoding=encoding,
                    encoding_errors=encoding_errors,
                    buffer_size=buffer_size,
                    line_delimiter=line_delimiter,
                    error_handler=error_handler,
                    skip_regexes=skip_regexes,
                    extract_regexes=extract_regexes,
                    object_mode=object_mode,
                    raw=raw,
                    output_formatter=output_formatter,
                )

            return sync_wrapper

    if _func is None:
        return decorator
    else:
        return decorator(_func)


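# Decorator usage sketch (illustrative): wrap a function that returns a raw
# stream so its callers receive already-sanitized chunks. fetch_events and its
# payload are invented for this example.
def _example_decorator_usage() -> None:
    @_sanitize_stream_decorator(intro_value="data:", to_json=True)
    def fetch_events():
        return iter(['data: {"msg": "hi"}'])

    for item in fetch_events():
        print(item)  # expected: the parsed {'msg': 'hi'} payload

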
# Alias for decorator usage
LITSTREAM = sanitize_stream

# Decorator aliases
sanitize_stream_decorator = _sanitize_stream_decorator
lit_streamer = _sanitize_stream_decorator

# Allow @sanitize_stream and @lit_streamer as decorators
try:
    sanitize_stream.__decorator__ = _sanitize_stream_decorator  # type: ignore[attr-defined]
except AttributeError:
    pass
try:
    LITSTREAM.__decorator__ = _sanitize_stream_decorator  # type: ignore[attr-defined]
except AttributeError:
    pass
try:
    lit_streamer.__decorator__ = _sanitize_stream_decorator  # type: ignore[attr-defined]
except AttributeError:
    pass


def __getattr__(name) -> Any:
    if name == "sanitize_stream":
        return sanitize_stream
    if name == "LITSTREAM":
        return LITSTREAM
    if name == "sanitize_stream_decorator":
        return _sanitize_stream_decorator
    if name == "lit_streamer":
        return _sanitize_stream_decorator
    raise AttributeError(f"module {__name__} has no attribute {name}")
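

# Illustrative smoke check: run the module directly to see how object_mode
# routes bare Python objects (behavior sketched from the dispatch code above).
if __name__ == "__main__":
    # Default object_mode="json": the dict is serialized with json.dumps and
    # then run through the normal string pipeline.
    for item in sanitize_stream({"a": 1}):
        print("json mode:", item)
    # object_mode="as_is": the dict is yielded unchanged, bypassing processing.
    for item in sanitize_stream({"a": 1}, object_mode="as_is"):
        print("as_is mode:", item)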