webscout 8.3.7 (py3-none-any.whl) → 2025.10.13 (py3-none-any.whl)

This diff compares publicly available package versions as released to their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of webscout might be problematic.

Files changed (306)
  1. webscout/AIauto.py +250 -250
  2. webscout/AIbase.py +379 -379
  3. webscout/AIutel.py +60 -60
  4. webscout/Bard.py +1012 -1012
  5. webscout/Bing_search.py +417 -417
  6. webscout/DWEBS.py +529 -529
  7. webscout/Extra/Act.md +309 -309
  8. webscout/Extra/GitToolkit/__init__.py +10 -10
  9. webscout/Extra/GitToolkit/gitapi/README.md +110 -110
  10. webscout/Extra/GitToolkit/gitapi/__init__.py +11 -11
  11. webscout/Extra/GitToolkit/gitapi/repository.py +195 -195
  12. webscout/Extra/GitToolkit/gitapi/user.py +96 -96
  13. webscout/Extra/GitToolkit/gitapi/utils.py +61 -61
  14. webscout/Extra/YTToolkit/README.md +375 -375
  15. webscout/Extra/YTToolkit/YTdownloader.py +956 -956
  16. webscout/Extra/YTToolkit/__init__.py +2 -2
  17. webscout/Extra/YTToolkit/transcriber.py +475 -475
  18. webscout/Extra/YTToolkit/ytapi/README.md +44 -44
  19. webscout/Extra/YTToolkit/ytapi/__init__.py +6 -6
  20. webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
  21. webscout/Extra/YTToolkit/ytapi/errors.py +13 -13
  22. webscout/Extra/YTToolkit/ytapi/extras.py +118 -118
  23. webscout/Extra/YTToolkit/ytapi/https.py +88 -88
  24. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -61
  25. webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
  26. webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
  27. webscout/Extra/YTToolkit/ytapi/query.py +39 -39
  28. webscout/Extra/YTToolkit/ytapi/stream.py +62 -62
  29. webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
  30. webscout/Extra/YTToolkit/ytapi/video.py +232 -232
  31. webscout/Extra/autocoder/__init__.py +9 -9
  32. webscout/Extra/autocoder/autocoder.py +1105 -1105
  33. webscout/Extra/autocoder/autocoder_utiles.py +332 -332
  34. webscout/Extra/gguf.md +429 -429
  35. webscout/Extra/gguf.py +1213 -1213
  36. webscout/Extra/tempmail/README.md +487 -487
  37. webscout/Extra/tempmail/__init__.py +27 -27
  38. webscout/Extra/tempmail/async_utils.py +140 -140
  39. webscout/Extra/tempmail/base.py +160 -160
  40. webscout/Extra/tempmail/cli.py +186 -186
  41. webscout/Extra/tempmail/emailnator.py +84 -84
  42. webscout/Extra/tempmail/mail_tm.py +360 -360
  43. webscout/Extra/tempmail/temp_mail_io.py +291 -291
  44. webscout/Extra/weather.md +281 -281
  45. webscout/Extra/weather.py +193 -193
  46. webscout/Litlogger/README.md +10 -10
  47. webscout/Litlogger/__init__.py +15 -15
  48. webscout/Litlogger/formats.py +13 -13
  49. webscout/Litlogger/handlers.py +121 -121
  50. webscout/Litlogger/levels.py +13 -13
  51. webscout/Litlogger/logger.py +134 -134
  52. webscout/Provider/AISEARCH/Perplexity.py +332 -332
  53. webscout/Provider/AISEARCH/README.md +279 -279
  54. webscout/Provider/AISEARCH/__init__.py +16 -1
  55. webscout/Provider/AISEARCH/felo_search.py +206 -206
  56. webscout/Provider/AISEARCH/genspark_search.py +323 -323
  57. webscout/Provider/AISEARCH/hika_search.py +185 -185
  58. webscout/Provider/AISEARCH/iask_search.py +410 -410
  59. webscout/Provider/AISEARCH/monica_search.py +219 -219
  60. webscout/Provider/AISEARCH/scira_search.py +316 -316
  61. webscout/Provider/AISEARCH/stellar_search.py +177 -177
  62. webscout/Provider/AISEARCH/webpilotai_search.py +255 -255
  63. webscout/Provider/Aitopia.py +314 -314
  64. webscout/Provider/Andi.py +1 -1
  65. webscout/Provider/Apriel.py +306 -0
  66. webscout/Provider/ChatGPTClone.py +237 -236
  67. webscout/Provider/ChatSandbox.py +343 -343
  68. webscout/Provider/Cloudflare.py +324 -324
  69. webscout/Provider/Cohere.py +208 -208
  70. webscout/Provider/Deepinfra.py +370 -366
  71. webscout/Provider/ExaAI.py +260 -260
  72. webscout/Provider/ExaChat.py +308 -308
  73. webscout/Provider/Flowith.py +221 -221
  74. webscout/Provider/GMI.py +293 -0
  75. webscout/Provider/Gemini.py +164 -164
  76. webscout/Provider/GeminiProxy.py +167 -167
  77. webscout/Provider/GithubChat.py +371 -372
  78. webscout/Provider/Groq.py +800 -800
  79. webscout/Provider/HeckAI.py +383 -383
  80. webscout/Provider/Jadve.py +282 -282
  81. webscout/Provider/K2Think.py +307 -307
  82. webscout/Provider/Koboldai.py +205 -205
  83. webscout/Provider/LambdaChat.py +423 -423
  84. webscout/Provider/Nemotron.py +244 -244
  85. webscout/Provider/Netwrck.py +248 -248
  86. webscout/Provider/OLLAMA.py +395 -395
  87. webscout/Provider/OPENAI/Cloudflare.py +393 -393
  88. webscout/Provider/OPENAI/FalconH1.py +451 -451
  89. webscout/Provider/OPENAI/FreeGemini.py +296 -296
  90. webscout/Provider/OPENAI/K2Think.py +431 -431
  91. webscout/Provider/OPENAI/NEMOTRON.py +240 -240
  92. webscout/Provider/OPENAI/PI.py +427 -427
  93. webscout/Provider/OPENAI/README.md +959 -959
  94. webscout/Provider/OPENAI/TogetherAI.py +345 -345
  95. webscout/Provider/OPENAI/TwoAI.py +465 -465
  96. webscout/Provider/OPENAI/__init__.py +33 -18
  97. webscout/Provider/OPENAI/base.py +248 -248
  98. webscout/Provider/OPENAI/chatglm.py +528 -0
  99. webscout/Provider/OPENAI/chatgpt.py +592 -592
  100. webscout/Provider/OPENAI/chatgptclone.py +521 -521
  101. webscout/Provider/OPENAI/chatsandbox.py +202 -202
  102. webscout/Provider/OPENAI/deepinfra.py +318 -314
  103. webscout/Provider/OPENAI/e2b.py +1665 -1665
  104. webscout/Provider/OPENAI/exaai.py +420 -420
  105. webscout/Provider/OPENAI/exachat.py +452 -452
  106. webscout/Provider/OPENAI/friendli.py +232 -232
  107. webscout/Provider/OPENAI/{refact.py → gmi.py} +324 -274
  108. webscout/Provider/OPENAI/groq.py +364 -364
  109. webscout/Provider/OPENAI/heckai.py +314 -314
  110. webscout/Provider/OPENAI/llmchatco.py +337 -337
  111. webscout/Provider/OPENAI/netwrck.py +355 -355
  112. webscout/Provider/OPENAI/oivscode.py +290 -290
  113. webscout/Provider/OPENAI/opkfc.py +518 -518
  114. webscout/Provider/OPENAI/pydantic_imports.py +1 -1
  115. webscout/Provider/OPENAI/scirachat.py +535 -535
  116. webscout/Provider/OPENAI/sonus.py +308 -308
  117. webscout/Provider/OPENAI/standardinput.py +442 -442
  118. webscout/Provider/OPENAI/textpollinations.py +340 -340
  119. webscout/Provider/OPENAI/toolbaz.py +419 -416
  120. webscout/Provider/OPENAI/typefully.py +362 -362
  121. webscout/Provider/OPENAI/utils.py +295 -295
  122. webscout/Provider/OPENAI/venice.py +436 -436
  123. webscout/Provider/OPENAI/wisecat.py +387 -387
  124. webscout/Provider/OPENAI/writecream.py +166 -166
  125. webscout/Provider/OPENAI/x0gpt.py +378 -378
  126. webscout/Provider/OPENAI/yep.py +389 -389
  127. webscout/Provider/OpenGPT.py +230 -230
  128. webscout/Provider/Openai.py +243 -243
  129. webscout/Provider/PI.py +405 -405
  130. webscout/Provider/Perplexitylabs.py +430 -430
  131. webscout/Provider/QwenLM.py +272 -272
  132. webscout/Provider/STT/__init__.py +16 -1
  133. webscout/Provider/Sambanova.py +257 -257
  134. webscout/Provider/StandardInput.py +309 -309
  135. webscout/Provider/TTI/README.md +82 -82
  136. webscout/Provider/TTI/__init__.py +33 -18
  137. webscout/Provider/TTI/aiarta.py +413 -413
  138. webscout/Provider/TTI/base.py +136 -136
  139. webscout/Provider/TTI/bing.py +243 -243
  140. webscout/Provider/TTI/gpt1image.py +149 -149
  141. webscout/Provider/TTI/imagen.py +196 -196
  142. webscout/Provider/TTI/infip.py +211 -211
  143. webscout/Provider/TTI/magicstudio.py +232 -232
  144. webscout/Provider/TTI/monochat.py +219 -219
  145. webscout/Provider/TTI/piclumen.py +214 -214
  146. webscout/Provider/TTI/pixelmuse.py +232 -232
  147. webscout/Provider/TTI/pollinations.py +232 -232
  148. webscout/Provider/TTI/together.py +288 -288
  149. webscout/Provider/TTI/utils.py +12 -12
  150. webscout/Provider/TTI/venice.py +367 -367
  151. webscout/Provider/TTS/README.md +192 -192
  152. webscout/Provider/TTS/__init__.py +33 -18
  153. webscout/Provider/TTS/parler.py +110 -110
  154. webscout/Provider/TTS/streamElements.py +333 -333
  155. webscout/Provider/TTS/utils.py +280 -280
  156. webscout/Provider/TeachAnything.py +237 -237
  157. webscout/Provider/TextPollinationsAI.py +310 -310
  158. webscout/Provider/TogetherAI.py +356 -356
  159. webscout/Provider/TwoAI.py +312 -312
  160. webscout/Provider/TypliAI.py +311 -311
  161. webscout/Provider/UNFINISHED/ChatHub.py +208 -208
  162. webscout/Provider/UNFINISHED/ChutesAI.py +313 -313
  163. webscout/Provider/UNFINISHED/GizAI.py +294 -294
  164. webscout/Provider/UNFINISHED/Marcus.py +198 -198
  165. webscout/Provider/UNFINISHED/Qodo.py +477 -477
  166. webscout/Provider/UNFINISHED/VercelAIGateway.py +338 -338
  167. webscout/Provider/UNFINISHED/XenAI.py +324 -324
  168. webscout/Provider/UNFINISHED/Youchat.py +330 -330
  169. webscout/Provider/UNFINISHED/liner.py +334 -0
  170. webscout/Provider/UNFINISHED/liner_api_request.py +262 -262
  171. webscout/Provider/UNFINISHED/puterjs.py +634 -634
  172. webscout/Provider/UNFINISHED/samurai.py +223 -223
  173. webscout/Provider/UNFINISHED/test_lmarena.py +119 -119
  174. webscout/Provider/Venice.py +250 -250
  175. webscout/Provider/VercelAI.py +256 -256
  176. webscout/Provider/WiseCat.py +231 -231
  177. webscout/Provider/WrDoChat.py +366 -366
  178. webscout/Provider/__init__.py +33 -18
  179. webscout/Provider/ai4chat.py +174 -174
  180. webscout/Provider/akashgpt.py +331 -331
  181. webscout/Provider/cerebras.py +446 -446
  182. webscout/Provider/chatglm.py +394 -301
  183. webscout/Provider/cleeai.py +211 -211
  184. webscout/Provider/elmo.py +282 -282
  185. webscout/Provider/geminiapi.py +208 -208
  186. webscout/Provider/granite.py +261 -261
  187. webscout/Provider/hermes.py +263 -263
  188. webscout/Provider/julius.py +223 -223
  189. webscout/Provider/learnfastai.py +309 -309
  190. webscout/Provider/llama3mitril.py +214 -214
  191. webscout/Provider/llmchat.py +243 -243
  192. webscout/Provider/llmchatco.py +290 -290
  193. webscout/Provider/meta.py +801 -801
  194. webscout/Provider/oivscode.py +309 -309
  195. webscout/Provider/scira_chat.py +383 -383
  196. webscout/Provider/searchchat.py +292 -292
  197. webscout/Provider/sonus.py +258 -258
  198. webscout/Provider/toolbaz.py +370 -367
  199. webscout/Provider/turboseek.py +273 -273
  200. webscout/Provider/typefully.py +207 -207
  201. webscout/Provider/yep.py +372 -372
  202. webscout/__init__.py +27 -31
  203. webscout/__main__.py +5 -5
  204. webscout/auth/api_key_manager.py +189 -189
  205. webscout/auth/config.py +175 -175
  206. webscout/auth/models.py +185 -185
  207. webscout/auth/routes.py +663 -664
  208. webscout/auth/simple_logger.py +236 -236
  209. webscout/cli.py +523 -523
  210. webscout/conversation.py +438 -438
  211. webscout/exceptions.py +361 -361
  212. webscout/litagent/Readme.md +298 -298
  213. webscout/litagent/__init__.py +28 -28
  214. webscout/litagent/agent.py +581 -581
  215. webscout/litagent/constants.py +59 -59
  216. webscout/litprinter/__init__.py +58 -58
  217. webscout/models.py +181 -181
  218. webscout/optimizers.py +419 -419
  219. webscout/prompt_manager.py +288 -288
  220. webscout/sanitize.py +1078 -1078
  221. webscout/scout/README.md +401 -401
  222. webscout/scout/__init__.py +8 -8
  223. webscout/scout/core/__init__.py +6 -6
  224. webscout/scout/core/crawler.py +297 -297
  225. webscout/scout/core/scout.py +706 -706
  226. webscout/scout/core/search_result.py +95 -95
  227. webscout/scout/core/text_analyzer.py +62 -62
  228. webscout/scout/core/text_utils.py +277 -277
  229. webscout/scout/core/web_analyzer.py +51 -51
  230. webscout/scout/element.py +599 -599
  231. webscout/scout/parsers/__init__.py +69 -69
  232. webscout/scout/parsers/html5lib_parser.py +172 -172
  233. webscout/scout/parsers/html_parser.py +236 -236
  234. webscout/scout/parsers/lxml_parser.py +178 -178
  235. webscout/scout/utils.py +37 -37
  236. webscout/search/__init__.py +51 -0
  237. webscout/search/base.py +195 -0
  238. webscout/search/duckduckgo_main.py +54 -0
  239. webscout/search/engines/__init__.py +48 -0
  240. webscout/search/engines/bing.py +84 -0
  241. webscout/search/engines/bing_news.py +52 -0
  242. webscout/search/engines/brave.py +43 -0
  243. webscout/search/engines/duckduckgo/__init__.py +25 -0
  244. webscout/search/engines/duckduckgo/answers.py +78 -0
  245. webscout/search/engines/duckduckgo/base.py +187 -0
  246. webscout/search/engines/duckduckgo/images.py +97 -0
  247. webscout/search/engines/duckduckgo/maps.py +168 -0
  248. webscout/search/engines/duckduckgo/news.py +68 -0
  249. webscout/search/engines/duckduckgo/suggestions.py +21 -0
  250. webscout/search/engines/duckduckgo/text.py +211 -0
  251. webscout/search/engines/duckduckgo/translate.py +47 -0
  252. webscout/search/engines/duckduckgo/videos.py +63 -0
  253. webscout/search/engines/duckduckgo/weather.py +74 -0
  254. webscout/search/engines/mojeek.py +37 -0
  255. webscout/search/engines/wikipedia.py +56 -0
  256. webscout/search/engines/yahoo.py +65 -0
  257. webscout/search/engines/yahoo_news.py +64 -0
  258. webscout/search/engines/yandex.py +43 -0
  259. webscout/search/engines/yep/__init__.py +13 -0
  260. webscout/search/engines/yep/base.py +32 -0
  261. webscout/search/engines/yep/images.py +99 -0
  262. webscout/search/engines/yep/suggestions.py +35 -0
  263. webscout/search/engines/yep/text.py +114 -0
  264. webscout/search/http_client.py +156 -0
  265. webscout/search/results.py +137 -0
  266. webscout/search/yep_main.py +44 -0
  267. webscout/swiftcli/Readme.md +323 -323
  268. webscout/swiftcli/__init__.py +95 -95
  269. webscout/swiftcli/core/__init__.py +7 -7
  270. webscout/swiftcli/core/cli.py +308 -308
  271. webscout/swiftcli/core/context.py +104 -104
  272. webscout/swiftcli/core/group.py +241 -241
  273. webscout/swiftcli/decorators/__init__.py +28 -28
  274. webscout/swiftcli/decorators/command.py +221 -221
  275. webscout/swiftcli/decorators/options.py +220 -220
  276. webscout/swiftcli/decorators/output.py +302 -302
  277. webscout/swiftcli/exceptions.py +21 -21
  278. webscout/swiftcli/plugins/__init__.py +9 -9
  279. webscout/swiftcli/plugins/base.py +135 -135
  280. webscout/swiftcli/plugins/manager.py +269 -269
  281. webscout/swiftcli/utils/__init__.py +59 -59
  282. webscout/swiftcli/utils/formatting.py +252 -252
  283. webscout/swiftcli/utils/parsing.py +267 -267
  284. webscout/update_checker.py +117 -117
  285. webscout/version.py +1 -1
  286. webscout/version.py.bak +2 -0
  287. webscout/zeroart/README.md +89 -89
  288. webscout/zeroart/__init__.py +134 -134
  289. webscout/zeroart/base.py +66 -66
  290. webscout/zeroart/effects.py +100 -100
  291. webscout/zeroart/fonts.py +1238 -1238
  292. {webscout-8.3.7.dist-info → webscout-2025.10.13.dist-info}/METADATA +936 -937
  293. webscout-2025.10.13.dist-info/RECORD +329 -0
  294. webscout/Provider/AISEARCH/DeepFind.py +0 -254
  295. webscout/Provider/OPENAI/Qwen3.py +0 -303
  296. webscout/Provider/OPENAI/qodo.py +0 -630
  297. webscout/Provider/OPENAI/xenai.py +0 -514
  298. webscout/tempid.py +0 -134
  299. webscout/webscout_search.py +0 -1183
  300. webscout/webscout_search_async.py +0 -649
  301. webscout/yep_search.py +0 -346
  302. webscout-8.3.7.dist-info/RECORD +0 -301
  303. {webscout-8.3.7.dist-info → webscout-2025.10.13.dist-info}/WHEEL +0 -0
  304. {webscout-8.3.7.dist-info → webscout-2025.10.13.dist-info}/entry_points.txt +0 -0
  305. {webscout-8.3.7.dist-info → webscout-2025.10.13.dist-info}/licenses/LICENSE.md +0 -0
  306. {webscout-8.3.7.dist-info → webscout-2025.10.13.dist-info}/top_level.txt +0 -0
webscout/sanitize.py CHANGED
@@ -1,1078 +1,1078 @@
- import codecs
- import json
- import re
- from typing import (
-     Any,
-     AsyncGenerator,
-     AsyncIterable,
-     Callable,
-     Dict,
-     Generator,
-     Iterable,
-     List,
-     Literal,
-     Optional,
-     Pattern,
-     Union,
- )
-
- # Expanded encoding types
- EncodingType = Literal['utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
-                        'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
-                        'shift_jis', 'euc-jp', 'euc-kr']
-
- def _compile_regexes(patterns: Optional[List[Union[str, Pattern[str]]]]) -> Optional[List[Pattern[str]]]:
-     """
-     Compile regex patterns from strings or return compiled patterns as-is.
-
-     Args:
-         patterns: List of regex patterns as strings or compiled Pattern objects.
-
-     Returns:
-         List of compiled Pattern objects, or None if input is None.
-
-     Raises:
-         ValueError: If any pattern is invalid.
-     """
-     if not patterns:
-         return None
-
-     compiled_patterns = []
-     for i, pattern in enumerate(patterns):
-         try:
-             if isinstance(pattern, str):
-                 compiled_patterns.append(re.compile(pattern))
-             elif isinstance(pattern, Pattern):
-                 compiled_patterns.append(pattern)
-             else:
-                 raise ValueError(f"Pattern at index {i} must be a string or compiled regex pattern, got {type(pattern)}")
-         except re.error as e:
-             raise ValueError(f"Invalid regex pattern at index {i}: '{pattern}' - {str(e)}")
-
-     return compiled_patterns
-
- def _process_chunk(
-     chunk: str,
-     intro_value: str,
-     to_json: bool,
-     skip_markers: List[str],
-     strip_chars: Optional[str],
-     yield_raw_on_error: bool,
-     error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
-     skip_regexes: Optional[List[Pattern[str]]] = None,
-     extract_regexes: Optional[List[Pattern[str]]] = None,
- ) -> Union[str, Dict[str, Any], None]:
-     """
-     Sanitizes and potentially parses a single chunk of text.
-
-     This function performs several operations on the input chunk:
-     - Removes a specified prefix (`intro_value`).
-     - Strips leading/trailing characters (`strip_chars`).
-     - Skips chunks matching specific markers (`skip_markers`).
-     - Skips chunks matching regex patterns (`skip_regexes`).
-     - Extracts content using regex capturing groups (`extract_regexes`).
-     - Optionally parses the chunk as JSON (`to_json`).
-     - Handles JSON parsing errors with an optional callback (`error_handler`).
-
-     Args:
-         chunk (str): The chunk of text to process.
-         intro_value (str): The prefix to remove from the chunk.
-         to_json (bool): If True, attempts to parse the chunk as JSON.
-         skip_markers (List[str]): A list of markers; chunks matching these are skipped.
-         strip_chars (Optional[str]): Characters to strip from the beginning and end of the chunk.
-         yield_raw_on_error (bool): If True, returns the raw chunk when JSON parsing fails; otherwise, returns None.
-         error_handler (Optional[Callable[[Exception, str], Optional[Any]]]): An optional callback function that is called when JSON parsing fails.
-             It receives the exception and the sanitized chunk as arguments. It should return a value to yield instead of the raw chunk, or None to ignore.
-         skip_regexes (Optional[List[Pattern[str]]]): A list of compiled regex patterns; chunks matching any of these are skipped.
-         extract_regexes (Optional[List[Pattern[str]]]): A list of compiled regex patterns for extracting content using capturing groups.
-
-     """
-     if not isinstance(chunk, str):
-         return None
-
-     # Fast path for empty chunks
-     if not chunk:
-         return None
-
-     # Use slicing for prefix removal (faster than startswith+slicing)
-     sanitized_chunk = chunk
-     if intro_value and len(chunk) >= len(intro_value) and chunk[:len(intro_value)] == intro_value:
-         sanitized_chunk = chunk[len(intro_value):]
-
-     # Optimize string stripping operations
-     if strip_chars is not None:
-         sanitized_chunk = sanitized_chunk.strip(strip_chars)
-     else:
-         # lstrip() is faster than strip() when we only need leading whitespace removed
-         sanitized_chunk = sanitized_chunk.lstrip()
-
-     # Skip empty chunks and markers
-     if not sanitized_chunk or any(marker == sanitized_chunk for marker in skip_markers):
-         return None
-
-     # Apply regex-based extraction first (if provided)
-     if extract_regexes:
-         extracted_content = None
-         for regex in extract_regexes:
-             match = regex.search(sanitized_chunk)
-             if match:
-                 # If there are capturing groups, return the first group or all groups as a tuple
-                 if match.groups():
-                     if len(match.groups()) == 1:
-                         extracted_content = match.group(1)
-                     else:
-                         # Multiple groups - return as tuple converted to string for JSON compatibility
-                         extracted_content = str(match.groups())
-                 else:
-                     # No capturing groups, return the full match
-                     extracted_content = match.group(0)
-                 break  # Use first matching extraction regex
-
-         # If extract_regexes are provided but no match found, skip this chunk entirely
-         if extracted_content is None:
-             return None
-
-         sanitized_chunk = extracted_content
-
-     # Apply regex-based skipping (after extraction)
-     if skip_regexes:
-         if any(regex.search(sanitized_chunk) for regex in skip_regexes):
-             return None
-
-     # JSON parsing with optimized error handling
-     if to_json:
-         try:
-             # Only strip before JSON parsing if both boundaries are incorrect
-             if len(sanitized_chunk) >= 2 and sanitized_chunk[0] not in '{[' and sanitized_chunk[-1] not in '}]':
-                 sanitized_chunk = sanitized_chunk.strip()
-             return json.loads(sanitized_chunk)
-         except (json.JSONDecodeError, Exception) as e:
-             if error_handler:
-                 try:
-                     handled = error_handler(e, sanitized_chunk)
-                     if handled is not None:
-                         return handled
-                 except Exception:
-                     pass
-             return sanitized_chunk if yield_raw_on_error else None
-
-     return sanitized_chunk
-
- def _decode_byte_stream(
-     byte_iterator: Iterable[bytes],
-     encoding: EncodingType = 'utf-8',
-     errors: str = 'replace',
-     buffer_size: int = 8192
- ) -> Generator[str, None, None]:
-     """
-     Decodes a byte stream in realtime with flexible encoding support.
-
-     This function takes an iterator of bytes and decodes it into a stream of strings
-     using the specified character encoding. It handles encoding errors gracefully
-     and can be tuned for performance with the `buffer_size` parameter.
-
-     Args:
-         byte_iterator (Iterable[bytes]): An iterator that yields chunks of bytes.
-         encoding (EncodingType): The character encoding to use for decoding.
-             Defaults to 'utf-8'. Supports a wide range of encodings, including:
-             'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
-             'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
-             'shift_jis', 'euc-jp', 'euc-kr'.
-         errors (str): Specifies how encoding errors should be handled.
-             Options are 'strict' (raises an error), 'ignore' (skips the error), and
-             'replace' (replaces the erroneous byte with a replacement character).
-             Defaults to 'replace'.
-         buffer_size (int): The size of the internal buffer used for decoding.
-
-     """
-     # Initialize decoder with the specified encoding
-     try:
-         decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
-     except LookupError:
-         # Fallback to utf-8 if the encoding is not supported
-         decoder = codecs.getincrementaldecoder('utf-8')(errors=errors)
-
-     # Process byte stream in realtime
-     buffer = bytearray(buffer_size)
-     buffer_view = memoryview(buffer)
-
-     for chunk_bytes in byte_iterator:
-         if not chunk_bytes:
-             continue
-
-         try:
-             # Use buffer for processing if chunk size is appropriate
-             if len(chunk_bytes) <= buffer_size:
-                 buffer[:len(chunk_bytes)] = chunk_bytes
-                 text = decoder.decode(buffer_view[:len(chunk_bytes)], final=False)
-             else:
-                 text = decoder.decode(chunk_bytes, final=False)
-
-             if text:
-                 yield text
-         except UnicodeDecodeError:
-             yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"
-
-     # Final flush
-     try:
-         final_text = decoder.decode(b'', final=True)
-         if final_text:
-             yield final_text
-     except UnicodeDecodeError:
-         yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
-
- async def _decode_byte_stream_async(
-     byte_iterator: Iterable[bytes],
-     encoding: EncodingType = 'utf-8',
-     errors: str = 'replace',
-     buffer_size: int = 8192
- ) -> AsyncGenerator[str, None]:
-     """
-     Asynchronously decodes a byte stream with flexible encoding support.
-
-     This function is the asynchronous counterpart to `_decode_byte_stream`. It takes
-     an asynchronous iterator of bytes and decodes it into a stream of strings using
-     the specified character encoding. It handles encoding errors gracefully and can
-     be tuned for performance with the `buffer_size` parameter.
-
-     Args:
-         byte_iterator (Iterable[bytes]): An asynchronous iterator that yields chunks of bytes.
-         encoding (EncodingType): The character encoding to use for decoding.
-             Defaults to 'utf-8'. Supports a wide range of encodings, including:
-             'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
-             'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
-             'shift_jis', 'euc-jp', 'euc-kr'.
-         errors (str): Specifies how encoding errors should be handled.
-             Options are 'strict' (raises an error), 'ignore' (skips the error), and
-             'replace' (replaces the erroneous byte with a replacement character).
-             Defaults to 'replace'.
-         buffer_size (int): The size of the internal buffer used for decoding.
-     """
-     try:
-         decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
-     except LookupError:
-         decoder = codecs.getincrementaldecoder('utf-8')(errors=errors)
-
-     buffer = bytearray(buffer_size)
-     buffer_view = memoryview(buffer)
-
-     async for chunk_bytes in byte_iterator:
-         if not chunk_bytes:
-             continue
-         try:
-             if len(chunk_bytes) <= buffer_size:
-                 buffer[:len(chunk_bytes)] = chunk_bytes
-                 text = decoder.decode(buffer_view[:len(chunk_bytes)], final=False)
-             else:
-                 text = decoder.decode(chunk_bytes, final=False)
-             if text:
-                 yield text
-         except UnicodeDecodeError:
-             yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"
-
-     try:
-         final_text = decoder.decode(b'', final=True)
-         if final_text:
-             yield final_text
-     except UnicodeDecodeError:
-         yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
-
- def _sanitize_stream_sync(
-     data: Union[str, Iterable[str], Iterable[bytes]],
-     intro_value: str = "data:",
-     to_json: bool = True,
-     skip_markers: Optional[List[str]] = None,
-     strip_chars: Optional[str] = None,
-     start_marker: Optional[str] = None,
-     end_marker: Optional[str] = None,
-     content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
-     yield_raw_on_error: bool = True,
-     encoding: EncodingType = 'utf-8',
-     encoding_errors: str = 'replace',
-     buffer_size: int = 8192,
-     line_delimiter: Optional[str] = None,
-     error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
-     skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     raw: bool = False,
- ) -> Generator[Any, None, None]:
-     """
-     Processes a stream of data (strings or bytes) in real-time, applying various transformations and filtering.
-
-     This function is designed to handle streaming data, allowing for operations such as
-     prefix removal, JSON parsing, skipping lines based on markers, regex-based filtering,
-     and extracting specific content. It also supports custom error handling for JSON parsing failures.
-
-     Args:
-         data: String, iterable of strings, or iterable of bytes to process.
-         intro_value: Prefix indicating the start of meaningful data.
-         to_json: Parse the chunk as JSON if True.
-         skip_markers: Lines containing any of these markers are skipped.
-         strip_chars: Characters to strip from each line.
-         start_marker: Begin processing only after this marker is found.
-         end_marker: Stop processing once this marker is found.
-         content_extractor: Optional callable to transform parsed content before yielding.
-         yield_raw_on_error: Yield raw lines when JSON parsing fails.
-         encoding: Byte stream encoding.
-         encoding_errors: How to handle encoding errors.
-         buffer_size: Buffer size for byte decoding.
-         line_delimiter: Delimiter used to split incoming text into lines. ``None``
-             uses ``str.splitlines()``.
-         error_handler: Callback invoked with ``(Exception, str)`` when JSON
-             parsing fails. If the callback returns a value, it is yielded instead of the raw line.
-         skip_regexes: List of regex patterns (strings or compiled) for skipping lines that match.
-         extract_regexes: List of regex patterns (strings or compiled) for extracting content using capturing groups.
-         raw: If True, yields the raw response as returned by the API, chunk by chunk (no processing).
-
-     Yields:
-         Any: Processed data, which can be a string, a dictionary (if `to_json` is True), or the result of `content_extractor`.
-
-     Raises:
-         TypeError: If the input `data` is not a string or an iterable.
-         ValueError: If any regex pattern is invalid.
-     """
-     # --- RAW MODE: yield each chunk exactly as returned by the API ---
-     if raw:
-         if isinstance(data, str):
-             yield data
-             return
-         elif hasattr(data, '__iter__'):
-             for chunk in data:
-                 if isinstance(chunk, (bytes, bytearray)):
-                     yield chunk.decode(encoding, encoding_errors)
-                 elif chunk is not None:
-                     yield chunk
-             return
-         else:
-             if data is not None:
-                 yield data
-             return
-     # --- END RAW MODE ---
-
-     effective_skip_markers = skip_markers or []
-     # Compile regex patterns
-     compiled_skip_regexes = _compile_regexes(skip_regexes)
-     compiled_extract_regexes = _compile_regexes(extract_regexes)
-
-     processing_active = start_marker is None
-     buffer = ""
-     found_start = False if start_marker else True
-     line_iterator: Iterable[str]
-
-     if isinstance(data, str):
-         # If data is a string, decide whether to split it into lines
-         # or treat it as an iterable containing a single chunk.
-         temp_lines: List[str]
-         if line_delimiter is None:  # Default: split by newlines if present
-             if '\n' in data or '\r' in data:
-                 temp_lines = data.splitlines()
-             else:
-                 temp_lines = [data]  # Treat as a single line/chunk
-         elif line_delimiter in data:  # Custom delimiter found in string
-             temp_lines = data.split(line_delimiter)
-         else:  # Custom delimiter not found, or string is effectively a single segment
-             temp_lines = [data]
-         line_iterator = iter(temp_lines)
-     elif hasattr(data, '__iter__'):  # data is an iterable (but not a string)
-         _iter = iter(data)
-         first_item = next(_iter, None)
-
-         if first_item is None:  # Iterable was empty
-             return
-
-         from itertools import chain
-         # Reconstruct the full iterable including the first_item
-         stream_input_iterable = chain([first_item], _iter)
-
-         if isinstance(first_item, bytes):
-             # Ensure stream_input_iterable is typed as Iterable[bytes] for _decode_byte_stream
-             line_iterator = _decode_byte_stream(
-                 stream_input_iterable,  # type: ignore
-                 encoding=encoding,
-                 errors=encoding_errors,
-                 buffer_size=buffer_size
-             )
-         elif isinstance(first_item, str):
-             # Ensure stream_input_iterable is typed as Iterable[str]
-             line_iterator = stream_input_iterable  # type: ignore
-         else:
-             raise TypeError(f"Iterable must yield strings or bytes, not {type(first_item).__name__}")
-     else:  # Not a string and not an iterable
-         raise TypeError(f"Input must be a string or an iterable, not {type(data).__name__}")
-
-     try:
-         for line in line_iterator:
-             if not line:
-                 continue
-             buffer += line
-             while True:
-                 # Look for start marker if needed
-                 if not found_start and start_marker:
-                     idx = buffer.find(start_marker)
-                     if idx != -1:
-                         found_start = True
-                         buffer = buffer[idx + len(start_marker):]
-                     else:
-                         # Not found, keep buffering
-                         buffer = buffer[-max(len(start_marker), 256):]  # avoid unbounded growth
-                         break
-                 # Look for end marker if needed
-                 if found_start and end_marker:
-                     idx = buffer.find(end_marker)
-                     if idx != -1:
-                         chunk = buffer[:idx]
-                         buffer = buffer[idx + len(end_marker):]
-                         processing_active = False
-                     else:
-                         chunk = buffer
-                         buffer = ""
-                         processing_active = True
-                     # Process chunk if we are in active region
-                     if chunk and processing_active:
-                         for subline in (chunk.split(line_delimiter) if line_delimiter is not None else chunk.splitlines()):
-                             result = _process_chunk(
-                                 subline,
-                                 intro_value,
-                                 to_json,
-                                 effective_skip_markers,
-                                 strip_chars,
-                                 yield_raw_on_error,
-                                 error_handler,
-                                 compiled_skip_regexes,
-                                 compiled_extract_regexes,
-                             )
-                             if result is None:
-                                 continue
-                             if content_extractor:
-                                 try:
-                                     final_content = content_extractor(result)
-                                     if final_content is not None:
-                                         yield final_content
-                                 except Exception:
-                                     pass
-                             else:
-                                 yield result
-                     if not processing_active:
-                         found_start = False
-                     if idx == -1:
-                         break
-                 elif found_start:
-                     # No end marker, process all buffered content
-                     chunk = buffer
-                     buffer = ""
-                     if chunk:
-                         for subline in (chunk.split(line_delimiter) if line_delimiter is not None else chunk.splitlines()):
-                             result = _process_chunk(
-                                 subline,
-                                 intro_value,
-                                 to_json,
-                                 effective_skip_markers,
-                                 strip_chars,
-                                 yield_raw_on_error,
-                                 error_handler,
-                                 compiled_skip_regexes,
-                                 compiled_extract_regexes,
-                             )
-                             if result is None:
-                                 continue
-                             if content_extractor:
-                                 try:
-                                     final_content = content_extractor(result)
-                                     if final_content is not None:
-                                         yield final_content
-                                 except Exception:
-                                     pass
-                             else:
-                                 yield result
-                     break
-                 else:
-                     break
-     except Exception as e:
-         import sys
-         print(f"Stream processing error: {str(e)}", file=sys.stderr)
-
-
- async def _sanitize_stream_async(
-     data: Union[str, Iterable[str], Iterable[bytes]],
-     intro_value: str = "data:",
-     to_json: bool = True,
-     skip_markers: Optional[List[str]] = None,
-     strip_chars: Optional[str] = None,
-     start_marker: Optional[str] = None,
-     end_marker: Optional[str] = None,
-     content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
-     yield_raw_on_error: bool = True,
-     encoding: EncodingType = 'utf-8',
-     encoding_errors: str = 'replace',
-     buffer_size: int = 8192,
-     line_delimiter: Optional[str] = None,
-     error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
-     skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     raw: bool = False,
- ) -> AsyncGenerator[Any, None]:
-     """
-     Asynchronously processes a stream of data (strings or bytes), applying transformations and filtering.
-
-     This function is the asynchronous counterpart to `_sanitize_stream_sync`. It handles
-     streaming data, allowing for operations such as prefix removal, JSON parsing,
-     skipping lines based on markers, regex-based filtering, and extracting specific content.
-     It also supports custom error handling for JSON parsing failures.
-
-     Args:
-         data: String, iterable of strings, or iterable of bytes to process.
-         intro_value: Prefix indicating the start of meaningful data.
-         to_json: Parse JSON content if ``True``.
-         skip_markers: Lines containing any of these markers are skipped.
-         strip_chars: Characters to strip from each line.
-         start_marker: Begin processing only after this marker is found.
-         end_marker: Stop processing once this marker is found.
-         content_extractor: Optional callable to transform parsed content before yielding.
-         yield_raw_on_error: Yield raw lines when JSON parsing fails.
-         encoding: Byte stream encoding.
-         encoding_errors: How to handle encoding errors.
-         buffer_size: Buffer size for byte decoding.
-         line_delimiter: Delimiter used to split incoming text into lines. ``None`` uses ``str.splitlines()``.
-         error_handler: Callback invoked with ``(Exception, str)`` when JSON parsing fails. If the callback returns a value, it is yielded in place of the raw line.
-         skip_regexes: List of regex patterns (strings or compiled) for skipping lines that match.
-         extract_regexes: List of regex patterns (strings or compiled) for extracting content using capturing groups.
-         raw: If True, yields the raw response as returned by the API, chunk by chunk (no processing).
-     """
-     # --- RAW MODE: yield each chunk exactly as returned by the API ---
-     if raw:
-         if isinstance(data, str):
-             yield data
-             return
-         elif hasattr(data, "__aiter__"):
-             async for chunk in data:
-                 if isinstance(chunk, (bytes, bytearray)):
-                     yield chunk.decode(encoding, encoding_errors)
-                 elif chunk is not None:
-                     yield chunk
-             return
-         elif hasattr(data, "__iter__"):
-             for chunk in data:
-                 if isinstance(chunk, (bytes, bytearray)):
-                     yield chunk.decode(encoding, encoding_errors)
-                 elif chunk is not None:
-                     yield chunk
-             return
-         else:
-             if data is not None:
-                 yield data
-             return
-     # --- END RAW MODE ---
-
-     if isinstance(data, str):
-         for item in _sanitize_stream_sync(
-             data,
-             intro_value=intro_value,
-             to_json=to_json,
-             skip_markers=skip_markers,
-             strip_chars=strip_chars,
-             start_marker=start_marker,
-             end_marker=end_marker,
-             content_extractor=content_extractor,
-             yield_raw_on_error=yield_raw_on_error,
-             encoding=encoding,
-             encoding_errors=encoding_errors,
-             buffer_size=buffer_size,
-             line_delimiter=line_delimiter,
-             error_handler=error_handler,
-             skip_regexes=skip_regexes,
-             extract_regexes=extract_regexes,
-             raw=raw,
-         ):
-             yield item
-         return
-
-     if not hasattr(data, "__aiter__"):
-         # Fallback to synchronous processing if possible
-         for item in _sanitize_stream_sync(
-             data,
-             intro_value=intro_value,
-             to_json=to_json,
-             skip_markers=skip_markers,
-             strip_chars=strip_chars,
-             start_marker=start_marker,
-             end_marker=end_marker,
-             content_extractor=content_extractor,
-             yield_raw_on_error=yield_raw_on_error,
-             encoding=encoding,
-             encoding_errors=encoding_errors,
-             buffer_size=buffer_size,
-             line_delimiter=line_delimiter,
-             error_handler=error_handler,
-             skip_regexes=skip_regexes,
-             extract_regexes=extract_regexes,
-             raw=raw,
-         ):
-             yield item
-         return
-
-     effective_skip_markers = skip_markers or []
-     # Compile regex patterns
-     compiled_skip_regexes = _compile_regexes(skip_regexes)
-     compiled_extract_regexes = _compile_regexes(extract_regexes)
-
-     processing_active = start_marker is None
-     buffer = ""
-     found_start = False if start_marker else True
-
-     iterator = data.__aiter__()
-     first_item = None
-     async for first_item in iterator:
-         break
-     if first_item is None:
-         return
-     async def _chain(first, it):
-         yield first
-         async for x in it:
-             yield x
-
-     stream = _chain(first_item, iterator)
-
-     if isinstance(first_item, bytes):
-         line_iterator = _decode_byte_stream_async(
-             stream,
-             encoding=encoding,
-             errors=encoding_errors,
-             buffer_size=buffer_size,
-         )
-     elif isinstance(first_item, str):
-         line_iterator = stream
-     else:
-         raise TypeError(
-             f"Stream must yield strings or bytes, not {type(first_item).__name__}"
-         )
-
-     try:
-         async for line in line_iterator:
-             if not line:
-                 continue
-             buffer += line
-             while True:
-                 if not found_start and start_marker:
-                     idx = buffer.find(start_marker)
-                     if idx != -1:
-                         found_start = True
-                         buffer = buffer[idx + len(start_marker) :]
-                     else:
-                         buffer = buffer[-max(len(start_marker), 256) :]
-                         break
-                 if found_start and end_marker:
-                     idx = buffer.find(end_marker)
-                     if idx != -1:
-                         chunk = buffer[:idx]
-                         buffer = buffer[idx + len(end_marker) :]
-                         processing_active = False
-                     else:
-                         chunk = buffer
-                         buffer = ""
-                         processing_active = True
-                     if chunk and processing_active:
-                         for subline in (
-                             chunk.split(line_delimiter)
-                             if line_delimiter is not None
-                             else chunk.splitlines()
-                         ):
-                             result = _process_chunk(
-                                 subline,
-                                 intro_value,
-                                 to_json,
-                                 effective_skip_markers,
-                                 strip_chars,
-                                 yield_raw_on_error,
-                                 error_handler,
-                                 compiled_skip_regexes,
-                                 compiled_extract_regexes,
-                             )
-                             if result is None:
-                                 continue
-                             if content_extractor:
-                                 try:
-                                     final_content = content_extractor(result)
-                                     if final_content is not None:
-                                         yield final_content
-                                 except Exception:
-                                     pass
-                             else:
-                                 yield result
-                     if not processing_active:
-                         found_start = False
-                     if idx == -1:
-                         break
-                 elif found_start:
-                     chunk = buffer
-                     buffer = ""
-                     if chunk:
-                         for subline in (
-                             chunk.split(line_delimiter)
-                             if line_delimiter is not None
-                             else chunk.splitlines()
-                         ):
-                             result = _process_chunk(
-                                 subline,
-                                 intro_value,
-                                 to_json,
-                                 effective_skip_markers,
-                                 strip_chars,
-                                 yield_raw_on_error,
-                                 error_handler,
-                                 compiled_skip_regexes,
-                                 compiled_extract_regexes,
-                             )
-                             if result is None:
-                                 continue
-                             if content_extractor:
-                                 try:
-                                     final_content = content_extractor(result)
-                                     if final_content is not None:
-                                         yield final_content
-                                 except Exception:
-                                     pass
-                             else:
-                                 yield result
-                     break
-                 else:
-                     break
-     except Exception as e:
-         import sys
-         print(f"Async stream processing error: {str(e)}", file=sys.stderr)
-
-
- def sanitize_stream(
-     data: Union[
-         str,
-         bytes,
-         Iterable[str],
-         Iterable[bytes],
-         AsyncIterable[str],
-         AsyncIterable[bytes],
-         dict,
-         list,
-         int,
-         float,
-         bool,
-         None,
-     ],
-     intro_value: str = "data:",
-     to_json: bool = True,
-     skip_markers: Optional[List[str]] = None,
-     strip_chars: Optional[str] = None,
-     start_marker: Optional[str] = None,
-     end_marker: Optional[str] = None,
-     content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
-     yield_raw_on_error: bool = True,
-     encoding: EncodingType = "utf-8",
-     encoding_errors: str = "replace",
-     buffer_size: int = 8192,
-     line_delimiter: Optional[str] = None,
-     error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
-     skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     object_mode: Literal["as_is", "json", "str"] = "json",
-     raw: bool = False,
- ) -> Union[Generator[Any, None, None], AsyncGenerator[Any, None]]:
-     """
-     Processes streaming data (strings or bytes) in either synchronous or asynchronous mode.
-     Now supports non-iterable and miscellaneous input types (dict, list, int, float, bool, None).
-     Includes regex-based content filtering and extraction capabilities.
-
-     Args:
-         data: The data to be processed. Can be a string, bytes, a synchronous iterable of strings or bytes,
-             an asynchronous iterable of strings or bytes, or a single object (dict, list, int, float, bool, None).
-         intro_value (str): Prefix indicating the start of meaningful data. Defaults to "data:".
-         to_json (bool): Parse JSON content if ``True``. Defaults to True.
-         skip_markers (Optional[List[str]]): Lines containing any of these markers are skipped. Defaults to None.
-         strip_chars (Optional[str]): Characters to strip from each line. Defaults to None.
-         start_marker (Optional[str]): Begin processing only after this marker is found. Defaults to None.
-         end_marker (Optional[str]): Stop processing once this marker is found. Defaults to None.
-         content_extractor (Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]]):
-             Optional callable to transform parsed content before yielding. Defaults to None.
-         yield_raw_on_error (bool): Yield raw lines when JSON parsing fails. Defaults to True.
-         encoding (EncodingType): Byte stream encoding. Defaults to "utf-8".
-         encoding_errors (str): How to handle encoding errors. Defaults to "replace".
-         buffer_size (int): Buffer size for byte decoding. Defaults to 8192.
-         line_delimiter (Optional[str]): Delimiter used to split incoming text into lines.
-             ``None`` uses ``str.splitlines()``. Defaults to None.
-         error_handler (Optional[Callable[[Exception, str], Optional[Any]]]):
-             Callback invoked with ``(Exception, str)`` when JSON parsing fails.
-             If the callback returns a value, it is yielded in place of the raw line. Defaults to None.
-         skip_regexes (Optional[List[Union[str, Pattern[str]]]]): List of regex patterns (strings or compiled)
-             for skipping lines that match any pattern. Defaults to None.
-         extract_regexes (Optional[List[Union[str, Pattern[str]]]]): List of regex patterns (strings or compiled)
-             for extracting content using capturing groups. If multiple groups are captured, they are returned as a tuple string. Defaults to None.
-         object_mode (Literal["as_is", "json", "str"]): How to handle non-string, non-iterable objects.
-             "json" (default) yields as JSON string, "str" yields as str(obj), "as_is" yields the object as-is.
-         raw (bool): If True, yields the raw response as returned by the API, chunk by chunk (no splitting or joining).
-
-     Returns:
-         Union[Generator[Any, None, None], AsyncGenerator[Any, None]]:
-             A generator or an asynchronous generator yielding the processed data, or raw data if raw=True.
-
-     Raises:
-         ValueError: If any regex pattern is invalid.
-     """
-     # --- RAW MODE: yield each chunk exactly as returned by the API ---
-     if raw:
-         def _raw_passthrough_sync(source_iter):
-             for chunk in source_iter:
-                 if isinstance(chunk, (bytes, bytearray)):
-                     # Decode bytes preserving all whitespace and newlines
-                     yield chunk.decode(encoding, encoding_errors)
-                 elif chunk is not None:
-                     # Yield string chunks as-is, preserving all formatting
-                     yield chunk
-                 # Skip None chunks entirely
-         async def _raw_passthrough_async(source_aiter):
-             async for chunk in source_aiter:
-                 if isinstance(chunk, (bytes, bytearray)):
-                     # Decode bytes preserving all whitespace and newlines
-                     yield chunk.decode(encoding, encoding_errors)
-                 elif chunk is not None:
-                     # Yield string chunks as-is, preserving all formatting
-                     yield chunk
-                 # Skip None chunks entirely
-         # Sync iterable (but not str/bytes)
-         if hasattr(data, "__iter__") and not isinstance(data, (str, bytes)):
-             return _raw_passthrough_sync(data)
-         # Async iterable
-         if hasattr(data, "__aiter__"):
-             return _raw_passthrough_async(data)
-         # Single string or bytes
-         if isinstance(data, (bytes, bytearray)):
-             def _yield_single():
-                 yield data.decode(encoding, encoding_errors)
-             return _yield_single()
-         else:
-             def _yield_single():
-                 if data is not None:
-                     yield data
-             return _yield_single()
-     # --- END RAW MODE ---
-
-     text_attr = getattr(data, "text", None)
-     content_attr = getattr(data, "content", None)
-
-     # Handle None
-     if data is None:
-         def _empty_gen():
-             if False:
-                 yield None
-         return _empty_gen()
-
-     # Handle bytes directly
-     if isinstance(data, bytes):
-         try:
-             payload = data.decode(encoding, encoding_errors)
-         except Exception:
-             payload = str(data)
-         return _sanitize_stream_sync(
-             payload, intro_value, to_json, skip_markers, strip_chars,
-             start_marker, end_marker, content_extractor, yield_raw_on_error,
-             encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-             skip_regexes, extract_regexes, raw,
-         )
-
-     # Handle string directly
-     if isinstance(data, str):
-         return _sanitize_stream_sync(
-             data, intro_value, to_json, skip_markers, strip_chars,
-             start_marker, end_marker, content_extractor, yield_raw_on_error,
-             encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-             skip_regexes, extract_regexes, raw,
-         )
-
-     # Handle dict, list, int, float, bool (non-iterable, non-string/bytes)
-     if isinstance(data, (dict, list, int, float, bool)):
-         if object_mode == "as_is":
-             def _as_is_gen():
-                 yield data
-             return _as_is_gen()
-         elif object_mode == "str":
-             return _sanitize_stream_sync(
-                 str(data), intro_value, to_json, skip_markers, strip_chars,
-                 start_marker, end_marker, content_extractor, yield_raw_on_error,
-                 encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-                 skip_regexes, extract_regexes, raw,
-             )
-         else:  # "json"
-             try:
-                 json_str = json.dumps(data)
-             except Exception:
-                 json_str = str(data)
-             return _sanitize_stream_sync(
-                 json_str, intro_value, to_json, skip_markers, strip_chars,
-                 start_marker, end_marker, content_extractor, yield_raw_on_error,
-                 encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-                 skip_regexes, extract_regexes, raw,
-             )
-
-     # Handle file-like objects (optional, treat as string if .read exists)
-     if hasattr(data, "read") and callable(data.read):
-         try:
-             file_content = data.read()
-             if isinstance(file_content, bytes):
-                 file_content = file_content.decode(encoding, encoding_errors)
-             return _sanitize_stream_sync(
-                 file_content, intro_value, to_json, skip_markers, strip_chars,
-                 start_marker, end_marker, content_extractor, yield_raw_on_error,
-                 encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-                 skip_regexes, extract_regexes, raw,
-             )
-         except Exception:
-             pass  # fallback to next
-
-     # Handle .text or .content attributes
-     if isinstance(text_attr, str):
-         payload = text_attr
-         return _sanitize_stream_sync(
-             payload, intro_value, to_json, skip_markers, strip_chars,
-             start_marker, end_marker, content_extractor, yield_raw_on_error,
-             encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-             skip_regexes, extract_regexes, raw,
-         )
-     elif isinstance(content_attr, bytes):
-         try:
-             payload = content_attr.decode(encoding, encoding_errors)
-         except Exception:
-             payload = str(content_attr)
-         return _sanitize_stream_sync(
-             payload, intro_value, to_json, skip_markers, strip_chars,
-             start_marker, end_marker, content_extractor, yield_raw_on_error,
-             encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-             skip_regexes, extract_regexes, raw,
-         )
-
-     # Handle async iterables
-     if hasattr(data, "__aiter__"):
-         return _sanitize_stream_async(
-             data, intro_value, to_json, skip_markers, strip_chars,
-             start_marker, end_marker, content_extractor, yield_raw_on_error,
-             encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-             skip_regexes, extract_regexes, raw,
-         )
-     # Handle sync iterables (but not strings/bytes)
-     if hasattr(data, "__iter__"):
-         return _sanitize_stream_sync(
-             data, intro_value, to_json, skip_markers, strip_chars,
-             start_marker, end_marker, content_extractor, yield_raw_on_error,
-             encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-             skip_regexes, extract_regexes, raw,
-         )
-     # Fallback: treat as string
-     return _sanitize_stream_sync(
-         str(data), intro_value, to_json, skip_markers, strip_chars,
-         start_marker, end_marker, content_extractor, yield_raw_on_error,
-         encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-         skip_regexes, extract_regexes, raw,
-     )
-
- # --- Decorator version of sanitize_stream ---
- import functools
- import asyncio
- from typing import overload
-
- def _sanitize_stream_decorator(
-     _func=None,
-     *,
-     intro_value: str = "data:",
-     to_json: bool = True,
-     skip_markers: Optional[List[str]] = None,
-     strip_chars: Optional[str] = None,
-     start_marker: Optional[str] = None,
-     end_marker: Optional[str] = None,
-     content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
-     yield_raw_on_error: bool = True,
-     encoding: EncodingType = "utf-8",
-     encoding_errors: str = "replace",
-     buffer_size: int = 8192,
-     line_delimiter: Optional[str] = None,
-     error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
-     skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
-     object_mode: Literal["as_is", "json", "str"] = "json",
-     raw: bool = False,
- ):
-     """
-     Decorator for sanitize_stream. Can be used as @sanitize_stream or @sanitize_stream(...).
-     All arguments are the same as sanitize_stream().
-     """
-     def decorator(func):
-         if asyncio.iscoroutinefunction(func):
-             @functools.wraps(func)
-             async def async_wrapper(*args, **kwargs):
-                 result = await func(*args, **kwargs)
-                 return sanitize_stream(
-                     result,
-                     intro_value=intro_value,
-                     to_json=to_json,
-                     skip_markers=skip_markers,
-                     strip_chars=strip_chars,
-                     start_marker=start_marker,
-                     end_marker=end_marker,
-                     content_extractor=content_extractor,
-                     yield_raw_on_error=yield_raw_on_error,
-                     encoding=encoding,
-                     encoding_errors=encoding_errors,
-                     buffer_size=buffer_size,
-                     line_delimiter=line_delimiter,
-                     error_handler=error_handler,
-                     skip_regexes=skip_regexes,
-                     extract_regexes=extract_regexes,
-                     object_mode=object_mode,
-                     raw=raw,
-                 )
-             return async_wrapper
-         else:
-             @functools.wraps(func)
-             def sync_wrapper(*args, **kwargs):
-                 result = func(*args, **kwargs)
-                 return sanitize_stream(
-                     result,
-                     intro_value=intro_value,
-                     to_json=to_json,
-                     skip_markers=skip_markers,
-                     strip_chars=strip_chars,
-                     start_marker=start_marker,
-                     end_marker=end_marker,
-                     content_extractor=content_extractor,
-                     yield_raw_on_error=yield_raw_on_error,
-                     encoding=encoding,
-                     encoding_errors=encoding_errors,
-                     buffer_size=buffer_size,
-                     line_delimiter=line_delimiter,
-                     error_handler=error_handler,
-                     skip_regexes=skip_regexes,
-                     extract_regexes=extract_regexes,
-                     object_mode=object_mode,
-                     raw=raw,
-                 )
-             return sync_wrapper
-     if _func is None:
-         return decorator
-     else:
-         return decorator(_func)
-
- # Alias for decorator usage
- LITSTREAM = sanitize_stream
-
- # Decorator aliases
- sanitize_stream_decorator = _sanitize_stream_decorator
- lit_streamer = _sanitize_stream_decorator
-
- # Allow @sanitize_stream and @lit_streamer as decorators
- sanitize_stream.__decorator__ = _sanitize_stream_decorator
- LITSTREAM.__decorator__ = _sanitize_stream_decorator
- lit_streamer.__decorator__ = _sanitize_stream_decorator
-
- def __getattr__(name):
-     if name == 'sanitize_stream':
-         return sanitize_stream
-     if name == 'LITSTREAM':
-         return LITSTREAM
-     if name == 'sanitize_stream_decorator':
-         return _sanitize_stream_decorator
-     if name == 'lit_streamer':
-         return _sanitize_stream_decorator
-     raise AttributeError(f"module {__name__} has no attribute {name}")
1
+ import codecs
2
+ import json
3
+ import re
4
+ from typing import (
5
+ Any,
6
+ AsyncGenerator,
7
+ AsyncIterable,
8
+ Callable,
9
+ Dict,
10
+ Generator,
11
+ Iterable,
12
+ List,
13
+ Literal,
14
+ Optional,
15
+ Pattern,
16
+ Union,
17
+ )
18
+
19
+ # Supported byte-stream encodings
20
+ EncodingType = Literal['utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
21
+ 'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
22
+ 'shift_jis', 'euc-jp', 'euc-kr']
23
+
24
+ def _compile_regexes(patterns: Optional[List[Union[str, Pattern[str]]]]) -> Optional[List[Pattern[str]]]:
25
+ """
26
+ Compile regex patterns from strings or return compiled patterns as-is.
27
+
28
+ Args:
29
+ patterns: List of regex patterns as strings or compiled Pattern objects.
30
+
31
+ Returns:
32
+ List of compiled Pattern objects, or None if input is None.
33
+
34
+ Raises:
35
+ ValueError: If any pattern is invalid.
36
+ """
37
+ if not patterns:
38
+ return None
39
+
40
+ compiled_patterns = []
41
+ for i, pattern in enumerate(patterns):
42
+ try:
43
+ if isinstance(pattern, str):
44
+ compiled_patterns.append(re.compile(pattern))
45
+ elif isinstance(pattern, Pattern):
46
+ compiled_patterns.append(pattern)
47
+ else:
48
+ raise ValueError(f"Pattern at index {i} must be a string or compiled regex pattern, got {type(pattern)}")
49
+ except re.error as e:
50
+ raise ValueError(f"Invalid regex pattern at index {i}: '{pattern}' - {str(e)}")
51
+
52
+ return compiled_patterns
53
+
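A minimal sketch of the normalization `_compile_regexes` performs, assuming the helper above is in scope; the patterns themselves are illustrative:

```python
import re

# One raw string and one pre-compiled pattern are both accepted.
patterns = _compile_regexes([r"^\[DONE\]$", re.compile(r"^\s*$")])
assert all(isinstance(p, re.Pattern) for p in patterns)

# None (or an empty list) is passed through as None.
assert _compile_regexes(None) is None

# A malformed pattern such as "(" raises ValueError naming its index.
```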
54
+ def _process_chunk(
55
+ chunk: str,
56
+ intro_value: str,
57
+ to_json: bool,
58
+ skip_markers: List[str],
59
+ strip_chars: Optional[str],
60
+ yield_raw_on_error: bool,
61
+ error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
62
+ skip_regexes: Optional[List[Pattern[str]]] = None,
63
+ extract_regexes: Optional[List[Pattern[str]]] = None,
64
+ ) -> Union[str, Dict[str, Any], None]:
65
+ """
66
+ Sanitizes and potentially parses a single chunk of text.
67
+
68
+ This function performs several operations on the input chunk:
69
+ - Removes a specified prefix (`intro_value`).
70
+ - Strips leading/trailing characters (`strip_chars`).
71
+ - Skips chunks matching specific markers (`skip_markers`).
72
+ - Skips chunks matching regex patterns (`skip_regexes`).
73
+ - Extracts content using regex capturing groups (`extract_regexes`).
74
+ - Optionally parses the chunk as JSON (`to_json`).
75
+ - Handles JSON parsing errors with an optional callback (`error_handler`).
76
+
77
+ Args:
78
+ chunk (str): The chunk of text to process.
79
+ intro_value (str): The prefix to remove from the chunk.
80
+ to_json (bool): If True, attempts to parse the chunk as JSON.
81
+ skip_markers (List[str]): A list of markers; chunks exactly equal to any of these are skipped.
82
+ strip_chars (Optional[str]): Characters to strip from the beginning and end of the chunk.
83
+ yield_raw_on_error (bool): If True, returns the raw chunk when JSON parsing fails; otherwise, returns None.
84
+ error_handler (Optional[Callable[[Exception, str], Optional[Any]]]): An optional callback function that is called when JSON parsing fails.
85
+ It receives the exception and the sanitized chunk as arguments. It should return a value to yield instead of the raw chunk, or None to ignore.
86
+ skip_regexes (Optional[List[Pattern[str]]]): A list of compiled regex patterns; chunks matching any of these are skipped.
87
+ extract_regexes (Optional[List[Pattern[str]]]): A list of compiled regex patterns for extracting content using capturing groups.
88
+
89
+ """
90
+ if not isinstance(chunk, str):
91
+ return None
92
+
93
+ # Fast path for empty chunks
94
+ if not chunk:
95
+ return None
96
+
97
+ # Strip the prefix when present (slice comparison, then a single tail slice)
98
+ sanitized_chunk = chunk
99
+ if intro_value and len(chunk) >= len(intro_value) and chunk[:len(intro_value)] == intro_value:
100
+ sanitized_chunk = chunk[len(intro_value):]
101
+
102
+ # Optimize string stripping operations
103
+ if strip_chars is not None:
104
+ sanitized_chunk = sanitized_chunk.strip(strip_chars)
105
+ else:
106
+ # lstrip() is faster than strip() when we only need leading whitespace removed
107
+ sanitized_chunk = sanitized_chunk.lstrip()
108
+
109
+ # Skip empty chunks and markers
110
+ if not sanitized_chunk or any(marker == sanitized_chunk for marker in skip_markers):
111
+ return None
112
+
113
+ # Apply regex-based extraction first (if provided)
114
+ if extract_regexes:
115
+ extracted_content = None
116
+ for regex in extract_regexes:
117
+ match = regex.search(sanitized_chunk)
118
+ if match:
119
+ # If there are capturing groups, return the first group or all groups as a tuple
120
+ if match.groups():
121
+ if len(match.groups()) == 1:
122
+ extracted_content = match.group(1)
123
+ else:
124
+ # Multiple groups - return as tuple converted to string for JSON compatibility
125
+ extracted_content = str(match.groups())
126
+ else:
127
+ # No capturing groups, return the full match
128
+ extracted_content = match.group(0)
129
+ break # Use first matching extraction regex
130
+
131
+ # If extract_regexes are provided but no match found, skip this chunk entirely
132
+ if extracted_content is None:
133
+ return None
134
+
135
+ sanitized_chunk = extracted_content
136
+
137
+ # Apply regex-based skipping (after extraction)
138
+ if skip_regexes:
139
+ if any(regex.search(sanitized_chunk) for regex in skip_regexes):
140
+ return None
141
+
142
+ # JSON parsing with optimized error handling
143
+ if to_json:
144
+ try:
145
+ # Strip only when neither boundary already looks like a JSON delimiter
146
+ if len(sanitized_chunk) >= 2 and sanitized_chunk[0] not in '{[' and sanitized_chunk[-1] not in '}]':
147
+ sanitized_chunk = sanitized_chunk.strip()
148
+ return json.loads(sanitized_chunk)
149
+ except Exception as e:  # includes json.JSONDecodeError
150
+ if error_handler:
151
+ try:
152
+ handled = error_handler(e, sanitized_chunk)
153
+ if handled is not None:
154
+ return handled
155
+ except Exception:
156
+ pass
157
+ return sanitized_chunk if yield_raw_on_error else None
158
+
159
+ return sanitized_chunk
160
+
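For example, a single SSE-style line passes through prefix stripping, marker filtering, and JSON parsing in one call; a hypothetical payload, assuming `_process_chunk` is in scope:

```python
# "data:" is stripped, the remainder is lstripped, then parsed as JSON.
parsed = _process_chunk(
    'data: {"delta": "Hi"}',
    intro_value="data:",
    to_json=True,
    skip_markers=["[DONE]"],
    strip_chars=None,
    yield_raw_on_error=True,
)
assert parsed == {"delta": "Hi"}

# Sentinel lines that exactly match a skip marker are dropped (None returned).
assert _process_chunk("data: [DONE]", "data:", True, ["[DONE]"], None, True) is None
```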
161
+ def _decode_byte_stream(
162
+ byte_iterator: Iterable[bytes],
163
+ encoding: EncodingType = 'utf-8',
164
+ errors: str = 'replace',
165
+ buffer_size: int = 8192
166
+ ) -> Generator[str, None, None]:
167
+ """
168
+ Decodes a byte stream in real time with flexible encoding support.
169
+
170
+ This function takes an iterator of bytes and decodes it into a stream of strings
171
+ using the specified character encoding. It handles encoding errors gracefully
172
+ and can be tuned for performance with the `buffer_size` parameter.
173
+
174
+ Args:
175
+ byte_iterator (Iterable[bytes]): An iterator that yields chunks of bytes.
176
+ encoding (EncodingType): The character encoding to use for decoding.
177
+ Defaults to 'utf-8'. Supports a wide range of encodings, including:
178
+ 'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
179
+ 'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
180
+ 'shift_jis', 'euc-jp', 'euc-kr'.
181
+ errors (str): Specifies how encoding errors should be handled.
182
+ Options are 'strict' (raises an error), 'ignore' (skips the error), and
183
+ 'replace' (replaces the erroneous byte with a replacement character).
184
+ Defaults to 'replace'.
185
+ buffer_size (int): The size of the internal buffer used for decoding.
186
+
187
+ """
188
+ # Initialize decoder with the specified encoding
189
+ try:
190
+ decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
191
+ except LookupError:
192
+ # Fallback to utf-8 if the encoding is not supported
193
+ decoder = codecs.getincrementaldecoder('utf-8')(errors=errors)
194
+
195
+ # Process byte stream in real time
196
+ buffer = bytearray(buffer_size)
197
+ buffer_view = memoryview(buffer)
198
+
199
+ for chunk_bytes in byte_iterator:
200
+ if not chunk_bytes:
201
+ continue
202
+
203
+ try:
204
+ # Use buffer for processing if chunk size is appropriate
205
+ if len(chunk_bytes) <= buffer_size:
206
+ buffer[:len(chunk_bytes)] = chunk_bytes
207
+ text = decoder.decode(buffer_view[:len(chunk_bytes)], final=False)
208
+ else:
209
+ text = decoder.decode(chunk_bytes, final=False)
210
+
211
+ if text:
212
+ yield text
213
+ except UnicodeDecodeError:
214
+ yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"
215
+
216
+ # Final flush
217
+ try:
218
+ final_text = decoder.decode(b'', final=True)
219
+ if final_text:
220
+ yield final_text
221
+ except UnicodeDecodeError:
222
+ yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
223
+
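The incremental decoder is what makes this safe for multi-byte characters split across network chunks; a small sketch, assuming `_decode_byte_stream` is in scope:

```python
# "é" (0xC3 0xA9) is split across two chunks; the decoder buffers the lone
# 0xC3 byte until the continuation byte arrives in the next chunk.
chunks = [b"caf\xc3", b"\xa9 au lait"]
assert "".join(_decode_byte_stream(chunks, encoding="utf-8")) == "café au lait"
```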
224
+ async def _decode_byte_stream_async(
225
+ byte_iterator: AsyncIterable[bytes],
226
+ encoding: EncodingType = 'utf-8',
227
+ errors: str = 'replace',
228
+ buffer_size: int = 8192
229
+ ) -> AsyncGenerator[str, None]:
230
+ """
231
+ Asynchronously decodes a byte stream with flexible encoding support.
232
+
233
+ This function is the asynchronous counterpart to `_decode_byte_stream`. It takes
234
+ an asynchronous iterator of bytes and decodes it into a stream of strings using
235
+ the specified character encoding. It handles encoding errors gracefully and can
236
+ be tuned for performance with the `buffer_size` parameter.
237
+
238
+ Args:
239
+ byte_iterator (AsyncIterable[bytes]): An asynchronous iterator that yields chunks of bytes.
240
+ encoding (EncodingType): The character encoding to use for decoding.
241
+ Defaults to 'utf-8'. Supports a wide range of encodings, including:
242
+ 'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
243
+ 'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
244
+ 'shift_jis', 'euc-jp', 'euc-kr'.
245
+ errors (str): Specifies how encoding errors should be handled.
246
+ Options are 'strict' (raises an error), 'ignore' (skips the error), and
247
+ 'replace' (replaces the erroneous byte with a replacement character).
248
+ Defaults to 'replace'.
249
+ buffer_size (int): The size of the internal buffer used for decoding.
250
+ """
251
+ try:
252
+ decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
253
+ except LookupError:
254
+ decoder = codecs.getincrementaldecoder('utf-8')(errors=errors)
255
+
256
+ buffer = bytearray(buffer_size)
257
+ buffer_view = memoryview(buffer)
258
+
259
+ async for chunk_bytes in byte_iterator:
260
+ if not chunk_bytes:
261
+ continue
262
+ try:
263
+ if len(chunk_bytes) <= buffer_size:
264
+ buffer[:len(chunk_bytes)] = chunk_bytes
265
+ text = decoder.decode(buffer_view[:len(chunk_bytes)], final=False)
266
+ else:
267
+ text = decoder.decode(chunk_bytes, final=False)
268
+ if text:
269
+ yield text
270
+ except UnicodeDecodeError:
271
+ yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"
272
+
273
+ try:
274
+ final_text = decoder.decode(b'', final=True)
275
+ if final_text:
276
+ yield final_text
277
+ except UnicodeDecodeError:
278
+ yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
279
+
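The async variant behaves identically over an async iterator; a sketch assuming `_decode_byte_stream_async` is in scope:

```python
import asyncio

async def demo() -> str:
    async def byte_stream():
        for chunk in (b"caf\xc3", b"\xa9"):  # same split multi-byte sequence
            yield chunk
    return "".join([t async for t in _decode_byte_stream_async(byte_stream())])

assert asyncio.run(demo()) == "café"
```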
280
+ def _sanitize_stream_sync(
281
+ data: Union[str, Iterable[str], Iterable[bytes]],
282
+ intro_value: str = "data:",
283
+ to_json: bool = True,
284
+ skip_markers: Optional[List[str]] = None,
285
+ strip_chars: Optional[str] = None,
286
+ start_marker: Optional[str] = None,
287
+ end_marker: Optional[str] = None,
288
+ content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
289
+ yield_raw_on_error: bool = True,
290
+ encoding: EncodingType = 'utf-8',
291
+ encoding_errors: str = 'replace',
292
+ buffer_size: int = 8192,
293
+ line_delimiter: Optional[str] = None,
294
+ error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
295
+ skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
296
+ extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
297
+ raw: bool = False,
298
+ ) -> Generator[Any, None, None]:
299
+ """
300
+ Processes a stream of data (strings or bytes) in real-time, applying various transformations and filtering.
301
+
302
+ This function is designed to handle streaming data, allowing for operations such as
303
+ prefix removal, JSON parsing, skipping lines based on markers, regex-based filtering,
304
+ and extracting specific content. It also supports custom error handling for JSON parsing failures.
305
+
306
+ Args:
307
+ data: String, iterable of strings, or iterable of bytes to process.
308
+ intro_value: Prefix indicating the start of meaningful data.
309
+ to_json: Parse the chunk as JSON if True.
310
+ skip_markers: Lines exactly matching any of these markers are skipped.
311
+ strip_chars: Characters to strip from each line.
312
+ start_marker: Begin processing only after this marker is found.
313
+ end_marker: Stop processing once this marker is found.
314
+ content_extractor: Optional callable to transform parsed content before yielding.
315
+ yield_raw_on_error: Yield raw lines when JSON parsing fails.
316
+ encoding: Byte stream encoding.
317
+ encoding_errors: How to handle encoding errors.
318
+ buffer_size: Buffer size for byte decoding.
319
+ line_delimiter: Delimiter used to split incoming text into lines. ``None``
320
+ uses ``str.splitlines()``.
321
+ error_handler: Callback invoked with ``(Exception, str)`` when JSON
322
+ parsing fails. If the callback returns a value, it is yielded instead of the raw line.
323
+ skip_regexes: List of regex patterns (strings or compiled) for skipping lines that match.
324
+ extract_regexes: List of regex patterns (strings or compiled) for extracting content using capturing groups.
325
+ raw: If True, yields the raw response as returned by the API, chunk by chunk (no processing).
326
+
327
+ Yields:
328
+ Any: Processed data, which can be a string, a dictionary (if `to_json` is True), or the result of `content_extractor`.
329
+
330
+ Raises:
331
+ TypeError: If the input `data` is not a string or an iterable.
332
+ ValueError: If any regex pattern is invalid.
333
+ """
334
+ # --- RAW MODE: yield each chunk exactly as returned by the API ---
335
+ if raw:
336
+ if isinstance(data, (str, bytes, bytearray)):
337
+ yield data.decode(encoding, encoding_errors) if isinstance(data, (bytes, bytearray)) else data
338
+ return
339
+ elif hasattr(data, '__iter__'):
340
+ for chunk in data:
341
+ if isinstance(chunk, (bytes, bytearray)):
342
+ yield chunk.decode(encoding, encoding_errors)
343
+ elif chunk is not None:
344
+ yield chunk
345
+ return
346
+ else:
347
+ if data is not None:
348
+ yield data
349
+ return
350
+ # --- END RAW MODE ---
351
+
352
+ effective_skip_markers = skip_markers or []
353
+ # Compile regex patterns
354
+ compiled_skip_regexes = _compile_regexes(skip_regexes)
355
+ compiled_extract_regexes = _compile_regexes(extract_regexes)
356
+
357
+ processing_active = start_marker is None
358
+ buffer = ""
359
+ found_start = start_marker is None
360
+ line_iterator: Iterable[str]
361
+
362
+ if isinstance(data, str):
363
+ # If data is a string, decide whether to split it into lines
364
+ # or treat it as an iterable containing a single chunk.
365
+ temp_lines: List[str]
366
+ if line_delimiter is None: # Default: split by newlines if present
367
+ if '\n' in data or '\r' in data:
368
+ temp_lines = data.splitlines()
369
+ else:
370
+ temp_lines = [data] # Treat as a single line/chunk
371
+ elif line_delimiter in data: # Custom delimiter found in string
372
+ temp_lines = data.split(line_delimiter)
373
+ else: # Custom delimiter not found, or string is effectively a single segment
374
+ temp_lines = [data]
375
+ line_iterator = iter(temp_lines)
376
+ elif hasattr(data, '__iter__'): # data is an iterable (but not a string)
377
+ _iter = iter(data)
378
+ first_item = next(_iter, None)
379
+
380
+ if first_item is None: # Iterable was empty
381
+ return
382
+
383
+ from itertools import chain
384
+ # Reconstruct the full iterable including the first_item
385
+ stream_input_iterable = chain([first_item], _iter)
386
+
387
+ if isinstance(first_item, bytes):
388
+ # Ensure stream_input_iterable is typed as Iterable[bytes] for _decode_byte_stream
389
+ line_iterator = _decode_byte_stream(
390
+ stream_input_iterable, # type: ignore
391
+ encoding=encoding,
392
+ errors=encoding_errors,
393
+ buffer_size=buffer_size
394
+ )
395
+ elif isinstance(first_item, str):
396
+ # Ensure stream_input_iterable is typed as Iterable[str]
397
+ line_iterator = stream_input_iterable # type: ignore
398
+ else:
399
+ raise TypeError(f"Iterable must yield strings or bytes, not {type(first_item).__name__}")
400
+ else: # Not a string and not an iterable
401
+ raise TypeError(f"Input must be a string or an iterable, not {type(data).__name__}")
402
+
403
+ try:
404
+ for line in line_iterator:
405
+ if not line:
406
+ continue
407
+ buffer += line
408
+ while True:
409
+ # Look for start marker if needed
410
+ if not found_start and start_marker:
411
+ idx = buffer.find(start_marker)
412
+ if idx != -1:
413
+ found_start = True
414
+ buffer = buffer[idx + len(start_marker):]
415
+ else:
416
+ # Not found, keep buffering
417
+ buffer = buffer[-max(len(start_marker), 256):] # avoid unbounded growth
418
+ break
419
+ # Look for end marker if needed
420
+ if found_start and end_marker:
421
+ idx = buffer.find(end_marker)
422
+ if idx != -1:
423
+ chunk = buffer[:idx]
424
+ buffer = buffer[idx + len(end_marker):]
425
+ processing_active = False
426
+ else:
427
+ chunk = buffer
428
+ buffer = ""
429
+ processing_active = True
430
+ # Process chunk if we are in active region
431
+ if chunk and processing_active:
432
+ for subline in (chunk.split(line_delimiter) if line_delimiter is not None else chunk.splitlines()):
433
+ result = _process_chunk(
434
+ subline,
435
+ intro_value,
436
+ to_json,
437
+ effective_skip_markers,
438
+ strip_chars,
439
+ yield_raw_on_error,
440
+ error_handler,
441
+ compiled_skip_regexes,
442
+ compiled_extract_regexes,
443
+ )
444
+ if result is None:
445
+ continue
446
+ if content_extractor:
447
+ try:
448
+ final_content = content_extractor(result)
449
+ if final_content is not None:
450
+ yield final_content
451
+ except Exception:
452
+ pass
453
+ else:
454
+ yield result
455
+ if not processing_active:
456
+ found_start = False
457
+ if idx == -1:
458
+ break
459
+ elif found_start:
460
+ # No end marker, process all buffered content
461
+ chunk = buffer
462
+ buffer = ""
463
+ if chunk:
464
+ for subline in (chunk.split(line_delimiter) if line_delimiter is not None else chunk.splitlines()):
465
+ result = _process_chunk(
466
+ subline,
467
+ intro_value,
468
+ to_json,
469
+ effective_skip_markers,
470
+ strip_chars,
471
+ yield_raw_on_error,
472
+ error_handler,
473
+ compiled_skip_regexes,
474
+ compiled_extract_regexes,
475
+ )
476
+ if result is None:
477
+ continue
478
+ if content_extractor:
479
+ try:
480
+ final_content = content_extractor(result)
481
+ if final_content is not None:
482
+ yield final_content
483
+ except Exception:
484
+ pass
485
+ else:
486
+ yield result
487
+ break
488
+ else:
489
+ break
490
+ except Exception as e:
491
+ import sys
492
+ print(f"Stream processing error: {str(e)}", file=sys.stderr)
493
+
494
+
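Putting the pieces together, a sketch of the synchronous pipeline over SSE-like lines with start/end markers and a content extractor (assuming `_sanitize_stream_sync` is in scope; the markers and payloads are illustrative):

```python
def sse_lines():
    yield "noise before\n"
    yield "<begin>\n"
    yield 'data: {"delta": "Hel"}\n'
    yield 'data: {"delta": "lo"}\n'
    yield "<end>\n"
    yield "noise after\n"  # ignored: processing stopped at <end>

pieces = list(_sanitize_stream_sync(
    sse_lines(),
    intro_value="data:",
    to_json=True,
    start_marker="<begin>",
    end_marker="<end>",
    content_extractor=lambda obj: obj.get("delta") if isinstance(obj, dict) else None,
))
assert pieces == ["Hel", "lo"]
```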
495
+ async def _sanitize_stream_async(
496
+ data: Union[str, Iterable[str], Iterable[bytes], AsyncIterable[str], AsyncIterable[bytes]],
497
+ intro_value: str = "data:",
498
+ to_json: bool = True,
499
+ skip_markers: Optional[List[str]] = None,
500
+ strip_chars: Optional[str] = None,
501
+ start_marker: Optional[str] = None,
502
+ end_marker: Optional[str] = None,
503
+ content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
504
+ yield_raw_on_error: bool = True,
505
+ encoding: EncodingType = 'utf-8',
506
+ encoding_errors: str = 'replace',
507
+ buffer_size: int = 8192,
508
+ line_delimiter: Optional[str] = None,
509
+ error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
510
+ skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
511
+ extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
512
+ raw: bool = False,
513
+ ) -> AsyncGenerator[Any, None]:
514
+ """
515
+ Asynchronously processes a stream of data (strings or bytes), applying transformations and filtering.
516
+
517
+ This function is the asynchronous counterpart to `_sanitize_stream_sync`. It handles
518
+ streaming data, allowing for operations such as prefix removal, JSON parsing,
519
+ skipping lines based on markers, regex-based filtering, and extracting specific content.
520
+ It also supports custom error handling for JSON parsing failures.
521
+
522
+ Args:
523
+ data: String, or a synchronous or asynchronous iterable of strings or bytes to process.
524
+ intro_value: Prefix indicating the start of meaningful data.
525
+ to_json: Parse JSON content if ``True``.
526
+ skip_markers: Lines exactly matching any of these markers are skipped.
527
+ strip_chars: Characters to strip from each line.
528
+ start_marker: Begin processing only after this marker is found.
529
+ end_marker: Stop processing once this marker is found.
530
+ content_extractor: Optional callable to transform parsed content before yielding.
531
+ yield_raw_on_error: Yield raw lines when JSON parsing fails.
532
+ encoding: Byte stream encoding.
533
+ encoding_errors: How to handle encoding errors.
534
+ buffer_size: Buffer size for byte decoding.
535
+ line_delimiter: Delimiter used to split incoming text into lines. ``None`` uses ``str.splitlines()``.
536
+ error_handler: Callback invoked with ``(Exception, str)`` when JSON parsing fails. If the callback returns a value, it is yielded in place of the raw line.
537
+ skip_regexes: List of regex patterns (strings or compiled) for skipping lines that match.
538
+ extract_regexes: List of regex patterns (strings or compiled) for extracting content using capturing groups.
539
+ raw: If True, yields the raw response as returned by the API, chunk by chunk (no processing).
540
+ """
541
+ # --- RAW MODE: yield each chunk exactly as returned by the API ---
542
+ if raw:
543
+ if isinstance(data, (str, bytes, bytearray)):
544
+ yield data.decode(encoding, encoding_errors) if isinstance(data, (bytes, bytearray)) else data
545
+ return
546
+ elif hasattr(data, "__aiter__"):
547
+ async for chunk in data:
548
+ if isinstance(chunk, (bytes, bytearray)):
549
+ yield chunk.decode(encoding, encoding_errors)
550
+ elif chunk is not None:
551
+ yield chunk
552
+ return
553
+ elif hasattr(data, "__iter__"):
554
+ for chunk in data:
555
+ if isinstance(chunk, (bytes, bytearray)):
556
+ yield chunk.decode(encoding, encoding_errors)
557
+ elif chunk is not None:
558
+ yield chunk
559
+ return
560
+ else:
561
+ if data is not None:
562
+ yield data
563
+ return
564
+ # --- END RAW MODE ---
565
+
566
+ if isinstance(data, str):
567
+ for item in _sanitize_stream_sync(
568
+ data,
569
+ intro_value=intro_value,
570
+ to_json=to_json,
571
+ skip_markers=skip_markers,
572
+ strip_chars=strip_chars,
573
+ start_marker=start_marker,
574
+ end_marker=end_marker,
575
+ content_extractor=content_extractor,
576
+ yield_raw_on_error=yield_raw_on_error,
577
+ encoding=encoding,
578
+ encoding_errors=encoding_errors,
579
+ buffer_size=buffer_size,
580
+ line_delimiter=line_delimiter,
581
+ error_handler=error_handler,
582
+ skip_regexes=skip_regexes,
583
+ extract_regexes=extract_regexes,
584
+ raw=raw,
585
+ ):
586
+ yield item
587
+ return
588
+
589
+ if not hasattr(data, "__aiter__"):
590
+ # Fallback to synchronous processing if possible
591
+ for item in _sanitize_stream_sync(
592
+ data,
593
+ intro_value=intro_value,
594
+ to_json=to_json,
595
+ skip_markers=skip_markers,
596
+ strip_chars=strip_chars,
597
+ start_marker=start_marker,
598
+ end_marker=end_marker,
599
+ content_extractor=content_extractor,
600
+ yield_raw_on_error=yield_raw_on_error,
601
+ encoding=encoding,
602
+ encoding_errors=encoding_errors,
603
+ buffer_size=buffer_size,
604
+ line_delimiter=line_delimiter,
605
+ error_handler=error_handler,
606
+ skip_regexes=skip_regexes,
607
+ extract_regexes=extract_regexes,
608
+ raw=raw,
609
+ ):
610
+ yield item
611
+ return
612
+
613
+ effective_skip_markers = skip_markers or []
614
+ # Compile regex patterns
615
+ compiled_skip_regexes = _compile_regexes(skip_regexes)
616
+ compiled_extract_regexes = _compile_regexes(extract_regexes)
617
+
618
+ processing_active = start_marker is None
619
+ buffer = ""
620
+ found_start = start_marker is None
621
+
622
+ iterator = data.__aiter__()
623
+ first_item = None
624
+ async for first_item in iterator:
625
+ break
626
+ if first_item is None:
627
+ return
628
+ async def _chain(first, it):
629
+ yield first
630
+ async for x in it:
631
+ yield x
632
+
633
+ stream = _chain(first_item, iterator)
634
+
635
+ if isinstance(first_item, bytes):
636
+ line_iterator = _decode_byte_stream_async(
637
+ stream,
638
+ encoding=encoding,
639
+ errors=encoding_errors,
640
+ buffer_size=buffer_size,
641
+ )
642
+ elif isinstance(first_item, str):
643
+ line_iterator = stream
644
+ else:
645
+ raise TypeError(
646
+ f"Stream must yield strings or bytes, not {type(first_item).__name__}"
647
+ )
648
+
649
+ try:
650
+ async for line in line_iterator:
651
+ if not line:
652
+ continue
653
+ buffer += line
654
+ while True:
655
+ if not found_start and start_marker:
656
+ idx = buffer.find(start_marker)
657
+ if idx != -1:
658
+ found_start = True
659
+ buffer = buffer[idx + len(start_marker) :]
660
+ else:
661
+ buffer = buffer[-max(len(start_marker), 256) :]
662
+ break
663
+ if found_start and end_marker:
664
+ idx = buffer.find(end_marker)
665
+ if idx != -1:
666
+ chunk = buffer[:idx]
667
+ buffer = buffer[idx + len(end_marker) :]
668
+ processing_active = False
669
+ else:
670
+ chunk = buffer
671
+ buffer = ""
672
+ processing_active = True
673
+ if chunk and processing_active:
674
+ for subline in (
675
+ chunk.split(line_delimiter)
676
+ if line_delimiter is not None
677
+ else chunk.splitlines()
678
+ ):
679
+ result = _process_chunk(
680
+ subline,
681
+ intro_value,
682
+ to_json,
683
+ effective_skip_markers,
684
+ strip_chars,
685
+ yield_raw_on_error,
686
+ error_handler,
687
+ compiled_skip_regexes,
688
+ compiled_extract_regexes,
689
+ )
690
+ if result is None:
691
+ continue
692
+ if content_extractor:
693
+ try:
694
+ final_content = content_extractor(result)
695
+ if final_content is not None:
696
+ yield final_content
697
+ except Exception:
698
+ pass
699
+ else:
700
+ yield result
701
+ if not processing_active:
702
+ found_start = False
703
+ if idx == -1:
704
+ break
705
+ elif found_start:
706
+ chunk = buffer
707
+ buffer = ""
708
+ if chunk:
709
+ for subline in (
710
+ chunk.split(line_delimiter)
711
+ if line_delimiter is not None
712
+ else chunk.splitlines()
713
+ ):
714
+ result = _process_chunk(
715
+ subline,
716
+ intro_value,
717
+ to_json,
718
+ effective_skip_markers,
719
+ strip_chars,
720
+ yield_raw_on_error,
721
+ error_handler,
722
+ compiled_skip_regexes,
723
+ compiled_extract_regexes,
724
+ )
725
+ if result is None:
726
+ continue
727
+ if content_extractor:
728
+ try:
729
+ final_content = content_extractor(result)
730
+ if final_content is not None:
731
+ yield final_content
732
+ except Exception:
733
+ pass
734
+ else:
735
+ yield result
736
+ break
737
+ else:
738
+ break
739
+ except Exception as e:
740
+ import sys
741
+ print(f"Async stream processing error: {str(e)}", file=sys.stderr)
742
+
743
+
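The async path accepts async byte streams directly, decoding and then sanitizing; a sketch assuming `_sanitize_stream_async` is in scope:

```python
import asyncio

async def demo():
    async def byte_stream():
        yield b'data: {"token": "a"}\n'
        yield b"data: [DONE]\n"

    return [
        item
        async for item in _sanitize_stream_async(
            byte_stream(), intro_value="data:", to_json=True, skip_markers=["[DONE]"]
        )
    ]

assert asyncio.run(demo()) == [{"token": "a"}]
```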
744
+ def sanitize_stream(
745
+ data: Union[
746
+ str,
747
+ bytes,
748
+ Iterable[str],
749
+ Iterable[bytes],
750
+ AsyncIterable[str],
751
+ AsyncIterable[bytes],
752
+ dict,
753
+ list,
754
+ int,
755
+ float,
756
+ bool,
757
+ None,
758
+ ],
759
+ intro_value: str = "data:",
760
+ to_json: bool = True,
761
+ skip_markers: Optional[List[str]] = None,
762
+ strip_chars: Optional[str] = None,
763
+ start_marker: Optional[str] = None,
764
+ end_marker: Optional[str] = None,
765
+ content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
766
+ yield_raw_on_error: bool = True,
767
+ encoding: EncodingType = "utf-8",
768
+ encoding_errors: str = "replace",
769
+ buffer_size: int = 8192,
770
+ line_delimiter: Optional[str] = None,
771
+ error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
772
+ skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
773
+ extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
774
+ object_mode: Literal["as_is", "json", "str"] = "json",
775
+ raw: bool = False,
776
+ ) -> Union[Generator[Any, None, None], AsyncGenerator[Any, None]]:
777
+ """
778
+ Processes streaming data (strings or bytes) in either synchronous or asynchronous mode.
779
+ Also accepts single non-iterable objects (dict, list, int, float, bool, None).
780
+ Includes regex-based content filtering and extraction capabilities.
781
+
782
+ Args:
783
+ data: The data to be processed. Can be a string, bytes, a synchronous iterable of strings or bytes,
784
+ an asynchronous iterable of strings or bytes, or a single object (dict, list, int, float, bool, None).
785
+ intro_value (str): Prefix indicating the start of meaningful data. Defaults to "data:".
786
+ to_json (bool): Parse JSON content if ``True``. Defaults to True.
787
+ skip_markers (Optional[List[str]]): Lines exactly matching any of these markers are skipped. Defaults to None.
788
+ strip_chars (Optional[str]): Characters to strip from each line. Defaults to None.
789
+ start_marker (Optional[str]): Begin processing only after this marker is found. Defaults to None.
790
+ end_marker (Optional[str]): Stop processing once this marker is found. Defaults to None.
791
+ content_extractor (Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]]):
792
+ Optional callable to transform parsed content before yielding. Defaults to None.
793
+ yield_raw_on_error (bool): Yield raw lines when JSON parsing fails. Defaults to True.
794
+ encoding (EncodingType): Byte stream encoding. Defaults to "utf-8".
795
+ encoding_errors (str): How to handle encoding errors. Defaults to "replace".
796
+ buffer_size (int): Buffer size for byte decoding. Defaults to 8192.
797
+ line_delimiter (Optional[str]): Delimiter used to split incoming text into lines.
798
+ ``None`` uses ``str.splitlines()``. Defaults to None.
799
+ error_handler (Optional[Callable[[Exception, str], Optional[Any]]]):
800
+ Callback invoked with ``(Exception, str)`` when JSON parsing fails.
801
+ If the callback returns a value, it is yielded in place of the raw line. Defaults to None.
802
+ skip_regexes (Optional[List[Union[str, Pattern[str]]]]): List of regex patterns (strings or compiled)
803
+ for skipping lines that match any pattern. Defaults to None.
804
+ extract_regexes (Optional[List[Union[str, Pattern[str]]]]): List of regex patterns (strings or compiled)
805
+ for extracting content using capturing groups. If multiple groups are captured, they are returned as a tuple string. Defaults to None.
806
+ object_mode (Literal["as_is", "json", "str"]): How to handle non-string, non-iterable objects.
807
+ "json" (default) yields as JSON string, "str" yields as str(obj), "as_is" yields the object as-is.
808
+ raw (bool): If True, yields the raw response as returned by the API, chunk by chunk (no splitting or joining).
809
+
810
+ Returns:
811
+ Union[Generator[Any, None, None], AsyncGenerator[Any, None]]:
812
+ A generator or an asynchronous generator yielding the processed data, or raw data if raw=True.
813
+
814
+ Raises:
815
+ ValueError: If any regex pattern is invalid.
816
+ """ # --- RAW MODE: yield each chunk exactly as returned by the API ---
817
+ if raw:
818
+ def _raw_passthrough_sync(source_iter):
819
+ for chunk in source_iter:
820
+ if isinstance(chunk, (bytes, bytearray)):
821
+ # Decode bytes preserving all whitespace and newlines
822
+ yield chunk.decode(encoding, encoding_errors)
823
+ elif chunk is not None:
824
+ # Yield string chunks as-is, preserving all formatting
825
+ yield chunk
826
+ # Skip None chunks entirely
827
+ async def _raw_passthrough_async(source_aiter):
828
+ async for chunk in source_aiter:
829
+ if isinstance(chunk, (bytes, bytearray)):
830
+ # Decode bytes preserving all whitespace and newlines
831
+ yield chunk.decode(encoding, encoding_errors)
832
+ elif chunk is not None:
833
+ # Yield string chunks as-is, preserving all formatting
834
+ yield chunk
835
+ # Skip None chunks entirely
836
+ # Sync iterable (but not str/bytes)
837
+ if hasattr(data, "__iter__") and not isinstance(data, (str, bytes)):
838
+ return _raw_passthrough_sync(data)
839
+ # Async iterable
840
+ if hasattr(data, "__aiter__"):
841
+ return _raw_passthrough_async(data)
842
+ # Single string or bytes
843
+ if isinstance(data, (bytes, bytearray)):
844
+ def _yield_single():
845
+ yield data.decode(encoding, encoding_errors)
846
+ return _yield_single()
847
+ else:
848
+ def _yield_single():
849
+ if data is not None:
850
+ yield data
851
+ return _yield_single()
852
+ # --- END RAW MODE ---
853
+
854
+ text_attr = getattr(data, "text", None)
855
+ content_attr = getattr(data, "content", None)
856
+
857
+ # Handle None
858
+ if data is None:
859
+ def _empty_gen():
860
+ if False:
861
+ yield None
862
+ return _empty_gen()
863
+
864
+ # Handle bytes directly
865
+ if isinstance(data, bytes):
866
+ try:
867
+ payload = data.decode(encoding, encoding_errors)
868
+ except Exception:
869
+ payload = str(data)
870
+ return _sanitize_stream_sync(
871
+ payload, intro_value, to_json, skip_markers, strip_chars,
872
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
873
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
874
+ skip_regexes, extract_regexes, raw,
875
+ )
876
+
877
+ # Handle string directly
878
+ if isinstance(data, str):
879
+ return _sanitize_stream_sync(
880
+ data, intro_value, to_json, skip_markers, strip_chars,
881
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
882
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
883
+ skip_regexes, extract_regexes, raw,
884
+ )
885
+
886
+ # Handle dict, list, int, float, bool (non-iterable, non-string/bytes)
887
+ if isinstance(data, (dict, list, int, float, bool)):
888
+ if object_mode == "as_is":
889
+ def _as_is_gen():
890
+ yield data
891
+ return _as_is_gen()
892
+ elif object_mode == "str":
893
+ return _sanitize_stream_sync(
894
+ str(data), intro_value, to_json, skip_markers, strip_chars,
895
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
896
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
897
+ skip_regexes, extract_regexes, raw,
898
+ )
899
+ else: # "json"
900
+ try:
901
+ json_str = json.dumps(data)
902
+ except Exception:
903
+ json_str = str(data)
904
+ return _sanitize_stream_sync(
905
+ json_str, intro_value, to_json, skip_markers, strip_chars,
906
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
907
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
908
+ skip_regexes, extract_regexes, raw,
909
+ )
910
+
911
+ # Handle file-like objects (optional, treat as string if .read exists)
912
+ if hasattr(data, "read") and callable(data.read):
913
+ try:
914
+ file_content = data.read()
915
+ if isinstance(file_content, bytes):
916
+ file_content = file_content.decode(encoding, encoding_errors)
917
+ return _sanitize_stream_sync(
918
+ file_content, intro_value, to_json, skip_markers, strip_chars,
919
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
920
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
921
+ skip_regexes, extract_regexes, raw,
922
+ )
923
+ except Exception:
924
+ pass # fallback to next
925
+
926
+ # Handle .text or .content attributes
927
+ if isinstance(text_attr, str):
928
+ payload = text_attr
929
+ return _sanitize_stream_sync(
930
+ payload, intro_value, to_json, skip_markers, strip_chars,
931
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
932
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
933
+ skip_regexes, extract_regexes, raw,
934
+ )
935
+ elif isinstance(content_attr, bytes):
936
+ try:
937
+ payload = content_attr.decode(encoding, encoding_errors)
938
+ except Exception:
939
+ payload = str(content_attr)
940
+ return _sanitize_stream_sync(
941
+ payload, intro_value, to_json, skip_markers, strip_chars,
942
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
943
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
944
+ skip_regexes, extract_regexes, raw,
945
+ )
946
+
947
+ # Handle async iterables
948
+ if hasattr(data, "__aiter__"):
949
+ return _sanitize_stream_async(
950
+ data, intro_value, to_json, skip_markers, strip_chars,
951
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
952
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
953
+ skip_regexes, extract_regexes, raw,
954
+ )
955
+ # Handle sync iterables (but not strings/bytes)
956
+ if hasattr(data, "__iter__"):
957
+ return _sanitize_stream_sync(
958
+ data, intro_value, to_json, skip_markers, strip_chars,
959
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
960
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
961
+ skip_regexes, extract_regexes, raw,
962
+ )
963
+ # Fallback: treat as string
964
+ return _sanitize_stream_sync(
965
+ str(data), intro_value, to_json, skip_markers, strip_chars,
966
+ start_marker, end_marker, content_extractor, yield_raw_on_error,
967
+ encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
968
+ skip_regexes, extract_regexes, raw,
969
+ )
970
+
971
+ # --- Decorator version of sanitize_stream ---
972
+ import functools
973
+ import asyncio
974
+ from typing import overload
975
+
976
+ def _sanitize_stream_decorator(
977
+ _func=None,
978
+ *,
979
+ intro_value: str = "data:",
980
+ to_json: bool = True,
981
+ skip_markers: Optional[List[str]] = None,
982
+ strip_chars: Optional[str] = None,
983
+ start_marker: Optional[str] = None,
984
+ end_marker: Optional[str] = None,
985
+ content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
986
+ yield_raw_on_error: bool = True,
987
+ encoding: EncodingType = "utf-8",
988
+ encoding_errors: str = "replace",
989
+ buffer_size: int = 8192,
990
+ line_delimiter: Optional[str] = None,
991
+ error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
992
+ skip_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
993
+ extract_regexes: Optional[List[Union[str, Pattern[str]]]] = None,
994
+ object_mode: Literal["as_is", "json", "str"] = "json",
995
+ raw: bool = False,
996
+ ):
997
+ """
998
+ Decorator form of sanitize_stream. Use as @sanitize_stream_decorator / @lit_streamer, with or without arguments.
999
+ All arguments are the same as sanitize_stream().
1000
+ """
1001
+ def decorator(func):
1002
+ if asyncio.iscoroutinefunction(func):
1003
+ @functools.wraps(func)
1004
+ async def async_wrapper(*args, **kwargs):
1005
+ result = await func(*args, **kwargs)
1006
+ return sanitize_stream(
1007
+ result,
1008
+ intro_value=intro_value,
1009
+ to_json=to_json,
1010
+ skip_markers=skip_markers,
1011
+ strip_chars=strip_chars,
1012
+ start_marker=start_marker,
1013
+ end_marker=end_marker,
1014
+ content_extractor=content_extractor,
1015
+ yield_raw_on_error=yield_raw_on_error,
1016
+ encoding=encoding,
1017
+ encoding_errors=encoding_errors,
1018
+ buffer_size=buffer_size,
1019
+ line_delimiter=line_delimiter,
1020
+ error_handler=error_handler,
1021
+ skip_regexes=skip_regexes,
1022
+ extract_regexes=extract_regexes,
1023
+ object_mode=object_mode,
1024
+ raw=raw,
1025
+ )
1026
+ return async_wrapper
1027
+ else:
1028
+ @functools.wraps(func)
1029
+ def sync_wrapper(*args, **kwargs):
1030
+ result = func(*args, **kwargs)
1031
+ return sanitize_stream(
1032
+ result,
1033
+ intro_value=intro_value,
1034
+ to_json=to_json,
1035
+ skip_markers=skip_markers,
1036
+ strip_chars=strip_chars,
1037
+ start_marker=start_marker,
1038
+ end_marker=end_marker,
1039
+ content_extractor=content_extractor,
1040
+ yield_raw_on_error=yield_raw_on_error,
1041
+ encoding=encoding,
1042
+ encoding_errors=encoding_errors,
1043
+ buffer_size=buffer_size,
1044
+ line_delimiter=line_delimiter,
1045
+ error_handler=error_handler,
1046
+ skip_regexes=skip_regexes,
1047
+ extract_regexes=extract_regexes,
1048
+ object_mode=object_mode,
1049
+ raw=raw,
1050
+ )
1051
+ return sync_wrapper
1052
+ if _func is None:
1053
+ return decorator
1054
+ else:
1055
+ return decorator(_func)
1056
+
1057
+ # Alias for the main sanitize_stream entry point
1058
+ LITSTREAM = sanitize_stream
1059
+
1060
+ # Decorator aliases
1061
+ sanitize_stream_decorator = _sanitize_stream_decorator
1062
+ lit_streamer = _sanitize_stream_decorator
1063
+
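With the aliases in place, a function that returns a stream can be wrapped declaratively; a sketch using `lit_streamer` with an illustrative payload:

```python
@lit_streamer(intro_value="data:", to_json=True, skip_markers=["[DONE]"])
def fetch_events():
    # Stand-in for a network call that yields SSE-style lines.
    return iter(['data: {"msg": "hello"}', "data: [DONE]"])

assert list(fetch_events()) == [{"msg": "hello"}]
```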
1064
+ # Expose the decorator through a __decorator__ attribute on each public name
1065
+ sanitize_stream.__decorator__ = _sanitize_stream_decorator
1066
+ LITSTREAM.__decorator__ = _sanitize_stream_decorator
1067
+ lit_streamer.__decorator__ = _sanitize_stream_decorator
1068
+
1069
+ def __getattr__(name):
1070
+ if name == 'sanitize_stream':
1071
+ return sanitize_stream
1072
+ if name == 'LITSTREAM':
1073
+ return LITSTREAM
1074
+ if name == 'sanitize_stream_decorator':
1075
+ return _sanitize_stream_decorator
1076
+ if name == 'lit_streamer':
1077
+ return _sanitize_stream_decorator
1078
+ raise AttributeError(f"module {__name__} has no attribute {name}")