webscout 8.2.7__py3-none-any.whl → 8.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. webscout/AIauto.py +33 -15
  2. webscout/AIbase.py +96 -37
  3. webscout/AIutel.py +703 -250
  4. webscout/Bard.py +441 -323
  5. webscout/Extra/Act.md +309 -0
  6. webscout/Extra/GitToolkit/__init__.py +10 -0
  7. webscout/Extra/GitToolkit/gitapi/README.md +110 -0
  8. webscout/Extra/GitToolkit/gitapi/__init__.py +12 -0
  9. webscout/Extra/GitToolkit/gitapi/repository.py +195 -0
  10. webscout/Extra/GitToolkit/gitapi/user.py +96 -0
  11. webscout/Extra/GitToolkit/gitapi/utils.py +62 -0
  12. webscout/Extra/YTToolkit/README.md +375 -0
  13. webscout/Extra/YTToolkit/YTdownloader.py +957 -0
  14. webscout/Extra/YTToolkit/__init__.py +3 -0
  15. webscout/Extra/YTToolkit/transcriber.py +476 -0
  16. webscout/Extra/YTToolkit/ytapi/README.md +44 -0
  17. webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
  18. webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
  19. webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
  20. webscout/Extra/YTToolkit/ytapi/extras.py +118 -0
  21. webscout/Extra/YTToolkit/ytapi/https.py +88 -0
  22. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
  23. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
  24. webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
  25. webscout/Extra/YTToolkit/ytapi/query.py +40 -0
  26. webscout/Extra/YTToolkit/ytapi/stream.py +63 -0
  27. webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
  28. webscout/Extra/YTToolkit/ytapi/video.py +232 -0
  29. webscout/Extra/__init__.py +7 -0
  30. webscout/Extra/autocoder/__init__.py +9 -0
  31. webscout/Extra/autocoder/autocoder.py +1105 -0
  32. webscout/Extra/autocoder/autocoder_utiles.py +332 -0
  33. webscout/Extra/gguf.md +430 -0
  34. webscout/Extra/gguf.py +684 -0
  35. webscout/Extra/tempmail/README.md +488 -0
  36. webscout/Extra/tempmail/__init__.py +28 -0
  37. webscout/Extra/tempmail/async_utils.py +141 -0
  38. webscout/Extra/tempmail/base.py +161 -0
  39. webscout/Extra/tempmail/cli.py +187 -0
  40. webscout/Extra/tempmail/emailnator.py +84 -0
  41. webscout/Extra/tempmail/mail_tm.py +361 -0
  42. webscout/Extra/tempmail/temp_mail_io.py +292 -0
  43. webscout/Extra/weather.md +281 -0
  44. webscout/Extra/weather.py +194 -0
  45. webscout/Extra/weather_ascii.py +76 -0
  46. webscout/Litlogger/README.md +10 -0
  47. webscout/Litlogger/__init__.py +15 -0
  48. webscout/Litlogger/formats.py +4 -0
  49. webscout/Litlogger/handlers.py +103 -0
  50. webscout/Litlogger/levels.py +13 -0
  51. webscout/Litlogger/logger.py +92 -0
  52. webscout/Provider/AI21.py +177 -0
  53. webscout/Provider/AISEARCH/DeepFind.py +254 -0
  54. webscout/Provider/AISEARCH/Perplexity.py +333 -0
  55. webscout/Provider/AISEARCH/README.md +279 -0
  56. webscout/Provider/AISEARCH/__init__.py +9 -0
  57. webscout/Provider/AISEARCH/felo_search.py +202 -0
  58. webscout/Provider/AISEARCH/genspark_search.py +324 -0
  59. webscout/Provider/AISEARCH/hika_search.py +186 -0
  60. webscout/Provider/AISEARCH/iask_search.py +410 -0
  61. webscout/Provider/AISEARCH/monica_search.py +220 -0
  62. webscout/Provider/AISEARCH/scira_search.py +298 -0
  63. webscout/Provider/AISEARCH/webpilotai_search.py +255 -0
  64. webscout/Provider/Aitopia.py +316 -0
  65. webscout/Provider/AllenAI.py +440 -0
  66. webscout/Provider/Andi.py +228 -0
  67. webscout/Provider/Blackboxai.py +791 -0
  68. webscout/Provider/ChatGPTClone.py +237 -0
  69. webscout/Provider/ChatGPTGratis.py +194 -0
  70. webscout/Provider/ChatSandbox.py +342 -0
  71. webscout/Provider/Cloudflare.py +324 -0
  72. webscout/Provider/Cohere.py +208 -0
  73. webscout/Provider/Deepinfra.py +340 -0
  74. webscout/Provider/ExaAI.py +261 -0
  75. webscout/Provider/ExaChat.py +358 -0
  76. webscout/Provider/Flowith.py +217 -0
  77. webscout/Provider/FreeGemini.py +250 -0
  78. webscout/Provider/Gemini.py +169 -0
  79. webscout/Provider/GithubChat.py +369 -0
  80. webscout/Provider/GizAI.py +295 -0
  81. webscout/Provider/Glider.py +225 -0
  82. webscout/Provider/Groq.py +801 -0
  83. webscout/Provider/HF_space/__init__.py +0 -0
  84. webscout/Provider/HF_space/qwen_qwen2.py +206 -0
  85. webscout/Provider/HeckAI.py +375 -0
  86. webscout/Provider/HuggingFaceChat.py +469 -0
  87. webscout/Provider/Hunyuan.py +283 -0
  88. webscout/Provider/Jadve.py +291 -0
  89. webscout/Provider/Koboldai.py +384 -0
  90. webscout/Provider/LambdaChat.py +411 -0
  91. webscout/Provider/Llama3.py +259 -0
  92. webscout/Provider/MCPCore.py +315 -0
  93. webscout/Provider/Marcus.py +198 -0
  94. webscout/Provider/Nemotron.py +218 -0
  95. webscout/Provider/Netwrck.py +270 -0
  96. webscout/Provider/OLLAMA.py +396 -0
  97. webscout/Provider/OPENAI/BLACKBOXAI.py +766 -0
  98. webscout/Provider/OPENAI/Cloudflare.py +378 -0
  99. webscout/Provider/OPENAI/FreeGemini.py +283 -0
  100. webscout/Provider/OPENAI/NEMOTRON.py +232 -0
  101. webscout/Provider/OPENAI/Qwen3.py +283 -0
  102. webscout/Provider/OPENAI/README.md +952 -0
  103. webscout/Provider/OPENAI/TwoAI.py +357 -0
  104. webscout/Provider/OPENAI/__init__.py +40 -0
  105. webscout/Provider/OPENAI/ai4chat.py +293 -0
  106. webscout/Provider/OPENAI/api.py +969 -0
  107. webscout/Provider/OPENAI/base.py +249 -0
  108. webscout/Provider/OPENAI/c4ai.py +373 -0
  109. webscout/Provider/OPENAI/chatgpt.py +556 -0
  110. webscout/Provider/OPENAI/chatgptclone.py +494 -0
  111. webscout/Provider/OPENAI/chatsandbox.py +173 -0
  112. webscout/Provider/OPENAI/copilot.py +242 -0
  113. webscout/Provider/OPENAI/deepinfra.py +322 -0
  114. webscout/Provider/OPENAI/e2b.py +1414 -0
  115. webscout/Provider/OPENAI/exaai.py +417 -0
  116. webscout/Provider/OPENAI/exachat.py +444 -0
  117. webscout/Provider/OPENAI/flowith.py +162 -0
  118. webscout/Provider/OPENAI/freeaichat.py +359 -0
  119. webscout/Provider/OPENAI/glider.py +326 -0
  120. webscout/Provider/OPENAI/groq.py +364 -0
  121. webscout/Provider/OPENAI/heckai.py +308 -0
  122. webscout/Provider/OPENAI/llmchatco.py +335 -0
  123. webscout/Provider/OPENAI/mcpcore.py +389 -0
  124. webscout/Provider/OPENAI/multichat.py +376 -0
  125. webscout/Provider/OPENAI/netwrck.py +357 -0
  126. webscout/Provider/OPENAI/oivscode.py +287 -0
  127. webscout/Provider/OPENAI/opkfc.py +496 -0
  128. webscout/Provider/OPENAI/pydantic_imports.py +172 -0
  129. webscout/Provider/OPENAI/scirachat.py +477 -0
  130. webscout/Provider/OPENAI/sonus.py +304 -0
  131. webscout/Provider/OPENAI/standardinput.py +433 -0
  132. webscout/Provider/OPENAI/textpollinations.py +339 -0
  133. webscout/Provider/OPENAI/toolbaz.py +413 -0
  134. webscout/Provider/OPENAI/typefully.py +355 -0
  135. webscout/Provider/OPENAI/typegpt.py +364 -0
  136. webscout/Provider/OPENAI/uncovrAI.py +463 -0
  137. webscout/Provider/OPENAI/utils.py +318 -0
  138. webscout/Provider/OPENAI/venice.py +431 -0
  139. webscout/Provider/OPENAI/wisecat.py +387 -0
  140. webscout/Provider/OPENAI/writecream.py +163 -0
  141. webscout/Provider/OPENAI/x0gpt.py +365 -0
  142. webscout/Provider/OPENAI/yep.py +382 -0
  143. webscout/Provider/OpenGPT.py +209 -0
  144. webscout/Provider/Openai.py +496 -0
  145. webscout/Provider/PI.py +429 -0
  146. webscout/Provider/Perplexitylabs.py +415 -0
  147. webscout/Provider/QwenLM.py +254 -0
  148. webscout/Provider/Reka.py +214 -0
  149. webscout/Provider/StandardInput.py +290 -0
  150. webscout/Provider/TTI/README.md +82 -0
  151. webscout/Provider/TTI/__init__.py +7 -0
  152. webscout/Provider/TTI/aiarta.py +365 -0
  153. webscout/Provider/TTI/artbit.py +0 -0
  154. webscout/Provider/TTI/base.py +64 -0
  155. webscout/Provider/TTI/fastflux.py +200 -0
  156. webscout/Provider/TTI/magicstudio.py +201 -0
  157. webscout/Provider/TTI/piclumen.py +203 -0
  158. webscout/Provider/TTI/pixelmuse.py +225 -0
  159. webscout/Provider/TTI/pollinations.py +221 -0
  160. webscout/Provider/TTI/utils.py +11 -0
  161. webscout/Provider/TTS/README.md +192 -0
  162. webscout/Provider/TTS/__init__.py +10 -0
  163. webscout/Provider/TTS/base.py +159 -0
  164. webscout/Provider/TTS/deepgram.py +156 -0
  165. webscout/Provider/TTS/elevenlabs.py +111 -0
  166. webscout/Provider/TTS/gesserit.py +128 -0
  167. webscout/Provider/TTS/murfai.py +113 -0
  168. webscout/Provider/TTS/openai_fm.py +129 -0
  169. webscout/Provider/TTS/parler.py +111 -0
  170. webscout/Provider/TTS/speechma.py +580 -0
  171. webscout/Provider/TTS/sthir.py +94 -0
  172. webscout/Provider/TTS/streamElements.py +333 -0
  173. webscout/Provider/TTS/utils.py +280 -0
  174. webscout/Provider/TeachAnything.py +229 -0
  175. webscout/Provider/TextPollinationsAI.py +308 -0
  176. webscout/Provider/TwoAI.py +475 -0
  177. webscout/Provider/TypliAI.py +305 -0
  178. webscout/Provider/UNFINISHED/ChatHub.py +209 -0
  179. webscout/Provider/UNFINISHED/Youchat.py +330 -0
  180. webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
  181. webscout/Provider/UNFINISHED/puterjs.py +635 -0
  182. webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
  183. webscout/Provider/Venice.py +258 -0
  184. webscout/Provider/VercelAI.py +253 -0
  185. webscout/Provider/WiseCat.py +233 -0
  186. webscout/Provider/WrDoChat.py +370 -0
  187. webscout/Provider/Writecream.py +246 -0
  188. webscout/Provider/WritingMate.py +269 -0
  189. webscout/Provider/__init__.py +174 -0
  190. webscout/Provider/ai4chat.py +174 -0
  191. webscout/Provider/akashgpt.py +335 -0
  192. webscout/Provider/asksteve.py +220 -0
  193. webscout/Provider/cerebras.py +290 -0
  194. webscout/Provider/chatglm.py +215 -0
  195. webscout/Provider/cleeai.py +213 -0
  196. webscout/Provider/copilot.py +425 -0
  197. webscout/Provider/elmo.py +283 -0
  198. webscout/Provider/freeaichat.py +285 -0
  199. webscout/Provider/geminiapi.py +208 -0
  200. webscout/Provider/granite.py +235 -0
  201. webscout/Provider/hermes.py +266 -0
  202. webscout/Provider/julius.py +223 -0
  203. webscout/Provider/koala.py +170 -0
  204. webscout/Provider/learnfastai.py +325 -0
  205. webscout/Provider/llama3mitril.py +215 -0
  206. webscout/Provider/llmchat.py +258 -0
  207. webscout/Provider/llmchatco.py +306 -0
  208. webscout/Provider/lmarena.py +198 -0
  209. webscout/Provider/meta.py +801 -0
  210. webscout/Provider/multichat.py +364 -0
  211. webscout/Provider/oivscode.py +309 -0
  212. webscout/Provider/samurai.py +224 -0
  213. webscout/Provider/scira_chat.py +299 -0
  214. webscout/Provider/scnet.py +243 -0
  215. webscout/Provider/searchchat.py +292 -0
  216. webscout/Provider/sonus.py +258 -0
  217. webscout/Provider/talkai.py +194 -0
  218. webscout/Provider/toolbaz.py +353 -0
  219. webscout/Provider/turboseek.py +266 -0
  220. webscout/Provider/typefully.py +202 -0
  221. webscout/Provider/typegpt.py +289 -0
  222. webscout/Provider/uncovr.py +368 -0
  223. webscout/Provider/x0gpt.py +299 -0
  224. webscout/Provider/yep.py +389 -0
  225. webscout/__init__.py +4 -2
  226. webscout/cli.py +3 -28
  227. webscout/client.py +70 -0
  228. webscout/conversation.py +35 -35
  229. webscout/litagent/Readme.md +276 -0
  230. webscout/litagent/__init__.py +29 -0
  231. webscout/litagent/agent.py +455 -0
  232. webscout/litagent/constants.py +60 -0
  233. webscout/litprinter/__init__.py +59 -0
  234. webscout/optimizers.py +419 -419
  235. webscout/scout/README.md +404 -0
  236. webscout/scout/__init__.py +8 -0
  237. webscout/scout/core/__init__.py +7 -0
  238. webscout/scout/core/crawler.py +210 -0
  239. webscout/scout/core/scout.py +607 -0
  240. webscout/scout/core/search_result.py +96 -0
  241. webscout/scout/core/text_analyzer.py +63 -0
  242. webscout/scout/core/text_utils.py +277 -0
  243. webscout/scout/core/web_analyzer.py +52 -0
  244. webscout/scout/element.py +478 -0
  245. webscout/scout/parsers/__init__.py +69 -0
  246. webscout/scout/parsers/html5lib_parser.py +172 -0
  247. webscout/scout/parsers/html_parser.py +236 -0
  248. webscout/scout/parsers/lxml_parser.py +178 -0
  249. webscout/scout/utils.py +37 -0
  250. webscout/swiftcli/Readme.md +323 -0
  251. webscout/swiftcli/__init__.py +95 -0
  252. webscout/swiftcli/core/__init__.py +7 -0
  253. webscout/swiftcli/core/cli.py +297 -0
  254. webscout/swiftcli/core/context.py +104 -0
  255. webscout/swiftcli/core/group.py +241 -0
  256. webscout/swiftcli/decorators/__init__.py +28 -0
  257. webscout/swiftcli/decorators/command.py +221 -0
  258. webscout/swiftcli/decorators/options.py +220 -0
  259. webscout/swiftcli/decorators/output.py +252 -0
  260. webscout/swiftcli/exceptions.py +21 -0
  261. webscout/swiftcli/plugins/__init__.py +9 -0
  262. webscout/swiftcli/plugins/base.py +135 -0
  263. webscout/swiftcli/plugins/manager.py +269 -0
  264. webscout/swiftcli/utils/__init__.py +59 -0
  265. webscout/swiftcli/utils/formatting.py +252 -0
  266. webscout/swiftcli/utils/parsing.py +267 -0
  267. webscout/version.py +1 -1
  268. webscout/webscout_search.py +2 -182
  269. webscout/webscout_search_async.py +1 -179
  270. webscout/zeroart/README.md +89 -0
  271. webscout/zeroart/__init__.py +135 -0
  272. webscout/zeroart/base.py +66 -0
  273. webscout/zeroart/effects.py +101 -0
  274. webscout/zeroart/fonts.py +1239 -0
  275. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/METADATA +262 -83
  276. webscout-8.2.9.dist-info/RECORD +289 -0
  277. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/WHEEL +1 -1
  278. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/entry_points.txt +1 -0
  279. webscout-8.2.7.dist-info/RECORD +0 -26
  280. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/licenses/LICENSE.md +0 -0
  281. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/top_level.txt +0 -0
webscout/AIutel.py CHANGED
@@ -1,250 +1,703 @@
1
- import json
2
- from typing import Union, Optional, Dict, Any, Iterable, Generator, List, Callable, Literal
3
- import codecs
4
-
5
- # Expanded encoding types
6
- EncodingType = Literal['utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
7
- 'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
8
- 'shift_jis', 'euc-jp', 'euc-kr']
9
-
10
- def _process_chunk(
11
- chunk: str,
12
- intro_value: str,
13
- to_json: bool,
14
- skip_markers: List[str],
15
- strip_chars: Optional[str],
16
- yield_raw_on_error: bool,
17
- ) -> Union[str, Dict[str, Any], None]:
18
- """Internal helper to sanitize and potentially parse a single chunk."""
19
- if not isinstance(chunk, str):
20
- return None
21
-
22
- # Fast path for empty chunks
23
- if not chunk:
24
- return None
25
-
26
- # Use slicing for prefix removal (faster than startswith+slicing)
27
- sanitized_chunk = chunk
28
- if intro_value and len(chunk) >= len(intro_value) and chunk[:len(intro_value)] == intro_value:
29
- sanitized_chunk = chunk[len(intro_value):]
30
-
31
- # Optimize string stripping operations
32
- if strip_chars is not None:
33
- sanitized_chunk = sanitized_chunk.strip(strip_chars)
34
- else:
35
- # lstrip() is faster than strip() when we only need leading whitespace removed
36
- sanitized_chunk = sanitized_chunk.lstrip()
37
-
38
- # Skip empty chunks and markers
39
- if not sanitized_chunk or any(marker == sanitized_chunk for marker in skip_markers):
40
- return None
41
-
42
- # JSON parsing with optimized error handling
43
- if to_json:
44
- try:
45
- # Only strip before JSON parsing if needed
46
- if sanitized_chunk[0] not in '{[' or sanitized_chunk[-1] not in '}]':
47
- sanitized_chunk = sanitized_chunk.strip()
48
- return json.loads(sanitized_chunk)
49
- except (json.JSONDecodeError, Exception):
50
- return sanitized_chunk if yield_raw_on_error else None
51
-
52
- return sanitized_chunk
53
-
54
- def _decode_byte_stream(
55
- byte_iterator: Iterable[bytes],
56
- encoding: EncodingType = 'utf-8',
57
- errors: str = 'replace',
58
- buffer_size: int = 8192
59
- ) -> Generator[str, None, None]:
60
- """
61
- Realtime byte stream decoder with flexible encoding support.
62
-
63
- Args:
64
- byte_iterator: Iterator yielding bytes
65
- encoding: Character encoding to use
66
- errors: How to handle encoding errors ('strict', 'ignore', 'replace')
67
- buffer_size: Size of internal buffer for performance tuning
68
- """
69
- # Initialize decoder with the specified encoding
70
- try:
71
- decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
72
- except LookupError:
73
- # Fallback to utf-8 if the encoding is not supported
74
- decoder = codecs.getincrementaldecoder('utf-8')(errors=errors)
75
-
76
- # Process byte stream in realtime
77
- buffer = bytearray(buffer_size)
78
- buffer_view = memoryview(buffer)
79
-
80
- for chunk_bytes in byte_iterator:
81
- if not chunk_bytes:
82
- continue
83
-
84
- try:
85
- # Use buffer for processing if chunk size is appropriate
86
- if len(chunk_bytes) <= buffer_size:
87
- buffer[:len(chunk_bytes)] = chunk_bytes
88
- text = decoder.decode(buffer_view[:len(chunk_bytes)], final=False)
89
- else:
90
- text = decoder.decode(chunk_bytes, final=False)
91
-
92
- if text:
93
- yield text
94
- except UnicodeDecodeError:
95
- yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"
96
-
97
- # Final flush
98
- try:
99
- final_text = decoder.decode(b'', final=True)
100
- if final_text:
101
- yield final_text
102
- except UnicodeDecodeError:
103
- yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
104
-
105
- def sanitize_stream(
106
- data: Union[str, Iterable[str], Iterable[bytes]],
107
- intro_value: str = "data:",
108
- to_json: bool = True,
109
- skip_markers: Optional[List[str]] = None,
110
- strip_chars: Optional[str] = None,
111
- start_marker: Optional[str] = None,
112
- end_marker: Optional[str] = None,
113
- content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
114
- yield_raw_on_error: bool = True,
115
- encoding: EncodingType = 'utf-8',
116
- encoding_errors: str = 'replace',
117
- buffer_size: int = 8192,
118
- ) -> Generator[Any, None, None]:
119
- """
120
- Optimized realtime stream processor that handles string/byte streams with minimal latency.
121
-
122
- Features:
123
- - Direct realtime processing of byte streams
124
- - Optimized string handling and JSON parsing
125
- - Robust error handling and validation
126
- - Flexible encoding support with memory-efficient buffering
127
- - High performance for large streams
128
-
129
- Args:
130
- data: Input data (string, string iterator, or bytes iterator)
131
- intro_value: Prefix to remove from each chunk
132
- to_json: Whether to parse chunks as JSON
133
- skip_markers: Markers to skip
134
- strip_chars: Characters to strip
135
- start_marker: Processing start marker
136
- end_marker: Processing end marker
137
- content_extractor: Function to extract content
138
- yield_raw_on_error: Yield raw content on JSON errors
139
- encoding: Character encoding for byte streams
140
- encoding_errors: How to handle encoding errors
141
- buffer_size: Size of internal processing buffer
142
-
143
- Yields:
144
- Processed chunks (string or dictionary)
145
- """
146
- effective_skip_markers = skip_markers or []
147
- processing_active = start_marker is None
148
-
149
- # Fast path for single string processing
150
- if isinstance(data, str):
151
- processed_item = None
152
- if processing_active:
153
- # Optimize JSON parsing for large strings
154
- if to_json:
155
- try:
156
- # Use faster JSON parser for large strings
157
- data = data.strip()
158
- if data:
159
- processed_item = json.loads(data)
160
- except json.JSONDecodeError:
161
- processed_item = data if yield_raw_on_error else None
162
- else:
163
- processed_item = _process_chunk(
164
- data, intro_value, False, effective_skip_markers,
165
- strip_chars, yield_raw_on_error
166
- )
167
-
168
- if processed_item is not None:
169
- if content_extractor:
170
- try:
171
- final_content = content_extractor(processed_item)
172
- if final_content is not None:
173
- yield final_content
174
- except Exception:
175
- pass
176
- else:
177
- yield processed_item
178
- return
179
-
180
- # Stream processing path
181
- if not hasattr(data, '__iter__'):
182
- raise TypeError(f"Input must be a string or an iterable, not {type(data).__name__}")
183
-
184
- try:
185
- iterator = iter(data)
186
- first_item = next(iterator, None)
187
- if first_item is None:
188
- return
189
-
190
- # Efficient streaming with itertools
191
- from itertools import chain
192
- stream = chain([first_item], iterator)
193
-
194
- # Determine if we're dealing with bytes or strings
195
- if isinstance(first_item, bytes):
196
- line_iterator = _decode_byte_stream(
197
- stream,
198
- encoding=encoding,
199
- errors=encoding_errors,
200
- buffer_size=buffer_size
201
- )
202
- elif isinstance(first_item, str):
203
- line_iterator = stream
204
- else:
205
- raise TypeError(f"Stream must yield strings or bytes, not {type(first_item).__name__}")
206
-
207
- # Process stream with minimal allocations
208
- for line in line_iterator:
209
- if not line:
210
- continue
211
-
212
- # Handle markers efficiently
213
- if not processing_active and start_marker is not None:
214
- if line.strip() == start_marker:
215
- processing_active = True
216
- continue
217
-
218
- if processing_active and end_marker is not None and line.strip() == end_marker:
219
- processing_active = False
220
- continue
221
-
222
- if processing_active:
223
- # Process chunk with optimized function
224
- processed = _process_chunk(
225
- line, intro_value, to_json, effective_skip_markers,
226
- strip_chars, yield_raw_on_error
227
- )
228
-
229
- if processed is not None:
230
- if content_extractor:
231
- try:
232
- final_content = content_extractor(processed)
233
- if final_content is not None:
234
- yield final_content
235
- except Exception:
236
- # Continue on extraction errors
237
- pass
238
- else:
239
- yield processed
240
-
241
- except Exception as e:
242
- # Log error but don't crash on stream processing exceptions
243
- import sys
244
- print(f"Stream processing error: {str(e)}", file=sys.stderr)
245
-
246
-
247
- from .conversation import Conversation
248
- from .optimizers import Optimizers
249
- from .Extra.autocoder import AutoCoder
250
- from .prompt_manager import AwesomePrompts
1
+ import codecs
2
+ import json
3
+ from typing import (
4
+ Any,
5
+ AsyncGenerator,
6
+ AsyncIterable,
7
+ Callable,
8
+ Dict,
9
+ Generator,
10
+ Iterable,
11
+ List,
12
+ Literal,
13
+ Optional,
14
+ Union,
15
+ )
16
+
17
+ # Expanded encoding types
18
+ EncodingType = Literal['utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
19
+ 'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
20
+ 'shift_jis', 'euc-jp', 'euc-kr']
21
+
22
+ def _process_chunk(
23
+ chunk: str,
24
+ intro_value: str,
25
+ to_json: bool,
26
+ skip_markers: List[str],
27
+ strip_chars: Optional[str],
28
+ yield_raw_on_error: bool,
29
+ error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
30
+ ) -> Union[str, Dict[str, Any], None]:
31
+ """
32
+ Sanitizes and potentially parses a single chunk of text.
33
+
34
+ This function performs several operations on the input chunk:
35
+ - Removes a specified prefix (`intro_value`).
36
+ - Strips leading/trailing characters (`strip_chars`).
37
+ - Skips chunks matching specific markers (`skip_markers`).
38
+ - Optionally parses the chunk as JSON (`to_json`).
39
+ - Handles JSON parsing errors with an optional callback (`error_handler`).
40
+
41
+ Args:
42
+ chunk (str): The chunk of text to process.
43
+ intro_value (str): The prefix to remove from the chunk.
44
+ to_json (bool): If True, attempts to parse the chunk as JSON.
45
+ skip_markers (List[str]): A list of markers; chunks matching these are skipped.
46
+ strip_chars (Optional[str]): Characters to strip from the beginning and end of the chunk.
47
+ yield_raw_on_error (bool): If True, returns the raw chunk when JSON parsing fails; otherwise, returns None.
48
+ error_handler (Optional[Callable[[Exception, str], Optional[Any]]]): An optional callback function that is called when JSON parsing fails.
49
+ It receives the exception and the sanitized chunk as arguments. It should return a value to yield instead of the raw chunk, or None to ignore.
50
+
51
+
52
+ Args:
53
+ chunk: Chunk of text to process.
54
+ intro_value: Prefix to remove from the chunk.
55
+ to_json: Parse the chunk as JSON if True.
56
+ skip_markers: List of markers to skip.
57
+ strip_chars: Characters to strip from the chunk.
58
+ yield_raw_on_error: Whether to return the raw chunk on parse errors.
59
+ error_handler: Optional callback ``Callable[[Exception, str], Optional[Any]]``
60
+ invoked when JSON parsing fails. The callback should return a value to
61
+ yield instead of the raw chunk, or ``None`` to ignore.
62
+ """
63
+ if not isinstance(chunk, str):
64
+ return None
65
+
66
+ # Fast path for empty chunks
67
+ if not chunk:
68
+ return None
69
+
70
+ # Use slicing for prefix removal (faster than startswith+slicing)
71
+ sanitized_chunk = chunk
72
+ if intro_value and len(chunk) >= len(intro_value) and chunk[:len(intro_value)] == intro_value:
73
+ sanitized_chunk = chunk[len(intro_value):]
74
+
75
+ # Optimize string stripping operations
76
+ if strip_chars is not None:
77
+ sanitized_chunk = sanitized_chunk.strip(strip_chars)
78
+ else:
79
+ # lstrip() is faster than strip() when we only need leading whitespace removed
80
+ sanitized_chunk = sanitized_chunk.lstrip()
81
+
82
+ # Skip empty chunks and markers
83
+ if not sanitized_chunk or any(marker == sanitized_chunk for marker in skip_markers):
84
+ return None
85
+
86
+ # JSON parsing with optimized error handling
87
+ if to_json:
88
+ try:
89
+ # Only strip before JSON parsing if needed
90
+ if sanitized_chunk[0] not in '{[' or sanitized_chunk[-1] not in '}]':
91
+ sanitized_chunk = sanitized_chunk.strip()
92
+ return json.loads(sanitized_chunk)
93
+ except (json.JSONDecodeError, Exception) as e:
94
+ if error_handler:
95
+ try:
96
+ handled = error_handler(e, sanitized_chunk)
97
+ if handled is not None:
98
+ return handled
99
+ except Exception:
100
+ pass
101
+ return sanitized_chunk if yield_raw_on_error else None
102
+
103
+ return sanitized_chunk
104
+
105
+ def _decode_byte_stream(
106
+ byte_iterator: Iterable[bytes],
107
+ encoding: EncodingType = 'utf-8',
108
+ errors: str = 'replace',
109
+ buffer_size: int = 8192
110
+ ) -> Generator[str, None, None]:
111
+ """
112
+ Decodes a byte stream in realtime with flexible encoding support.
113
+
114
+ This function takes an iterator of bytes and decodes it into a stream of strings
115
+ using the specified character encoding. It handles encoding errors gracefully
116
+ and can be tuned for performance with the `buffer_size` parameter.
117
+
118
+ Args:
119
+ byte_iterator (Iterable[bytes]): An iterator that yields chunks of bytes.
120
+ encoding (EncodingType): The character encoding to use for decoding.
121
+ Defaults to 'utf-8'. Supports a wide range of encodings, including:
122
+ 'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
123
+ 'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
124
+ 'shift_jis', 'euc-jp', 'euc-kr'.
125
+ errors (str): Specifies how encoding errors should be handled.
126
+ Options are 'strict' (raises an error), 'ignore' (skips the error), and
127
+ 'replace' (replaces the erroneous byte with a replacement character).
128
+ Defaults to 'replace'.
129
+ buffer_size (int): The size of the internal buffer used for decoding.
130
+
131
+ Args:
132
+ byte_iterator: Iterator yielding bytes
133
+ encoding: Character encoding to use
134
+ errors: How to handle encoding errors ('strict', 'ignore', 'replace')
135
+ buffer_size: Size of internal buffer for performance tuning
136
+ """
137
+ # Initialize decoder with the specified encoding
138
+ try:
139
+ decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
140
+ except LookupError:
141
+ # Fallback to utf-8 if the encoding is not supported
142
+ decoder = codecs.getincrementaldecoder('utf-8')(errors=errors)
143
+
144
+ # Process byte stream in realtime
145
+ buffer = bytearray(buffer_size)
146
+ buffer_view = memoryview(buffer)
147
+
148
+ for chunk_bytes in byte_iterator:
149
+ if not chunk_bytes:
150
+ continue
151
+
152
+ try:
153
+ # Use buffer for processing if chunk size is appropriate
154
+ if len(chunk_bytes) <= buffer_size:
155
+ buffer[:len(chunk_bytes)] = chunk_bytes
156
+ text = decoder.decode(buffer_view[:len(chunk_bytes)], final=False)
157
+ else:
158
+ text = decoder.decode(chunk_bytes, final=False)
159
+
160
+ if text:
161
+ yield text
162
+ except UnicodeDecodeError:
163
+ yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"
164
+
165
+ # Final flush
166
+ try:
167
+ final_text = decoder.decode(b'', final=True)
168
+ if final_text:
169
+ yield final_text
170
+ except UnicodeDecodeError:
171
+ yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
172
+
173
+ async def _decode_byte_stream_async(
174
+ byte_iterator: Iterable[bytes],
175
+ encoding: EncodingType = 'utf-8',
176
+ errors: str = 'replace',
177
+ buffer_size: int = 8192
178
+ ) -> AsyncGenerator[str, None]:
179
+ """
180
+ Asynchronously decodes a byte stream with flexible encoding support.
181
+
182
+ This function is the asynchronous counterpart to `_decode_byte_stream`. It takes
183
+ an asynchronous iterator of bytes and decodes it into a stream of strings using
184
+ the specified character encoding. It handles encoding errors gracefully and can
185
+ be tuned for performance with the `buffer_size` parameter.
186
+
187
+ Args:
188
+ byte_iterator (Iterable[bytes]): An asynchronous iterator that yields chunks of bytes.
189
+ encoding (EncodingType): The character encoding to use for decoding.
190
+ Defaults to 'utf-8'. Supports a wide range of encodings, including:
191
+ 'utf-8', 'utf-16', 'utf-32', 'ascii', 'latin1', 'cp1252', 'iso-8859-1',
192
+ 'iso-8859-2', 'windows-1250', 'windows-1251', 'windows-1252', 'gbk', 'big5',
193
+ 'shift_jis', 'euc-jp', 'euc-kr'.
194
+ errors (str): Specifies how encoding errors should be handled.
195
+ Options are 'strict' (raises an error), 'ignore' (skips the error), and
196
+ 'replace' (replaces the erroneous byte with a replacement character).
197
+ Defaults to 'replace'.
198
+ buffer_size (int): The size of the internal buffer used for decoding.
199
+ """
200
+ try:
201
+ decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
202
+ except LookupError:
203
+ decoder = codecs.getincrementaldecoder('utf-8')(errors=errors)
204
+
205
+ buffer = bytearray(buffer_size)
206
+ buffer_view = memoryview(buffer)
207
+
208
+ async for chunk_bytes in byte_iterator:
209
+ if not chunk_bytes:
210
+ continue
211
+ try:
212
+ if len(chunk_bytes) <= buffer_size:
213
+ buffer[:len(chunk_bytes)] = chunk_bytes
214
+ text = decoder.decode(buffer_view[:len(chunk_bytes)], final=False)
215
+ else:
216
+ text = decoder.decode(chunk_bytes, final=False)
217
+ if text:
218
+ yield text
219
+ except UnicodeDecodeError:
220
+ yield f"[Encoding Error: Could not decode bytes with {encoding}]\n"
221
+
222
+ try:
223
+ final_text = decoder.decode(b'', final=True)
224
+ if final_text:
225
+ yield final_text
226
+ except UnicodeDecodeError:
227
+ yield f"[Encoding Error: Could not decode final bytes with {encoding}]\n"
228
+
229
def _sanitize_stream_sync(
    data: Union[str, Iterable[str], Iterable[bytes]],
    intro_value: str = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = 'utf-8',
    encoding_errors: str = 'replace',
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
) -> Generator[Any, None, None]:
    """
    Process a stream of strings or bytes, applying filtering and transformations.

    Handles prefix removal (``intro_value``), optional JSON parsing, marker-based
    region selection (``start_marker`` / ``end_marker``), line skipping, and an
    optional ``content_extractor`` applied to every parsed item before yielding.

    Args:
        data: String, iterable of strings, or iterable of bytes to process.
        intro_value: Prefix indicating the start of meaningful data.
        to_json: Parse JSON content if ``True``.
        skip_markers: Lines containing any of these markers are skipped.
        strip_chars: Characters to strip from each line.
        start_marker: Begin processing only after this marker is found.
        end_marker: Stop processing once this marker is found.
        content_extractor: Optional callable to transform parsed content before yielding.
        yield_raw_on_error: Yield raw lines when JSON parsing fails.
        encoding: Byte stream encoding.
        encoding_errors: How to handle encoding errors.
        buffer_size: Buffer size for byte decoding.
        line_delimiter: Delimiter used to split incoming text into lines. ``None``
            uses ``str.splitlines()``.
        error_handler: Callback invoked with ``(Exception, str)`` when JSON parsing
            fails. If the callback returns a value, it is yielded instead of the raw line.

    Yields:
        Any: Processed data — a string, a dict (when ``to_json`` is True), or the
        result of ``content_extractor``.

    Raises:
        TypeError: If ``data`` is neither a string nor an iterable, or if the
            iterable yields items that are neither ``str`` nor ``bytes``.
    """
    import sys
    from itertools import chain

    effective_skip_markers = skip_markers or []
    buffer = ""
    found_start = start_marker is None

    def _emit(chunk: str) -> Generator[Any, None, None]:
        # Split a chunk into logical lines, run each through _process_chunk and
        # the optional content_extractor, yielding every non-None result.
        sublines = (
            chunk.split(line_delimiter) if line_delimiter is not None
            else chunk.splitlines()
        )
        for subline in sublines:
            result = _process_chunk(
                subline, intro_value, to_json, effective_skip_markers,
                strip_chars, yield_raw_on_error, error_handler,
            )
            if result is None:
                continue
            if content_extractor:
                try:
                    extracted = content_extractor(result)
                    if extracted is not None:
                        yield extracted
                except Exception:
                    # A failing extractor drops the item; it must not kill the stream.
                    pass
            else:
                yield result

    # ---- Fast path: a single string is processed in one shot. ----
    if isinstance(data, str):
        processed_item = None
        # A start_marker can never be satisfied by a one-shot string, so only
        # process when no start marker was requested.
        if start_marker is None:
            if to_json:
                text = data.strip()
                try:
                    if text:
                        processed_item = json.loads(text)
                except Exception as exc:
                    if error_handler:
                        try:
                            handled = error_handler(exc, text)
                            if handled is not None:
                                processed_item = handled
                        except Exception:
                            # Error handler itself failed; fall through to raw yield.
                            pass
                    if processed_item is None:
                        processed_item = text if yield_raw_on_error else None
            else:
                # Non-JSON mode: run the full line pipeline with to_json disabled.
                processed_item = _process_chunk(
                    data, intro_value, False, effective_skip_markers,
                    strip_chars, yield_raw_on_error, error_handler,
                )
        if processed_item is not None:
            if content_extractor:
                try:
                    final_content = content_extractor(processed_item)
                    if final_content is not None:
                        yield final_content
                except Exception:
                    pass
            else:
                yield processed_item
        return

    # ---- Streaming path. ----
    if not hasattr(data, '__iter__'):
        raise TypeError(f"Input must be a string or an iterable, not {type(data).__name__}")

    # Sentinel instead of None: distinguishes an empty stream from a stream
    # whose first item happens to be None.
    _MISSING = object()
    iterator = iter(data)
    first_item = next(iterator, _MISSING)
    if first_item is _MISSING:
        return
    stream = chain([first_item], iterator)

    # Dispatch on the first item's type *outside* the try below, so the
    # documented TypeError reaches the caller instead of being swallowed and
    # printed (defect in the previous revision).
    if isinstance(first_item, bytes):
        line_iterator = _decode_byte_stream(
            stream,
            encoding=encoding,
            errors=encoding_errors,
            buffer_size=buffer_size,
        )
    elif isinstance(first_item, str):
        line_iterator = stream
    else:
        raise TypeError(f"Stream must yield strings or bytes, not {type(first_item).__name__}")

    try:
        for line in line_iterator:
            if not line:
                continue
            buffer += line
            while True:
                if not found_start and start_marker:
                    idx = buffer.find(start_marker)
                    if idx == -1:
                        # Marker not seen yet: keep only a bounded tail so a
                        # marker split across chunks can still match without
                        # the buffer growing without limit.
                        buffer = buffer[-max(len(start_marker), 256):]
                        break
                    found_start = True
                    buffer = buffer[idx + len(start_marker):]
                if found_start and end_marker:
                    idx = buffer.find(end_marker)
                    if idx != -1:
                        # Region closed: emit everything before the marker.
                        # (The previous revision dropped this final fragment.)
                        chunk, buffer = buffer[:idx], buffer[idx + len(end_marker):]
                        if chunk:
                            yield from _emit(chunk)
                        # Re-arm the start_marker search for the next region.
                        found_start = False
                    else:
                        # End marker not in the buffer yet: stream what we have.
                        chunk, buffer = buffer, ""
                        if chunk:
                            yield from _emit(chunk)
                        break
                elif found_start:
                    # No end marker configured: everything buffered is emitted.
                    chunk, buffer = buffer, ""
                    if chunk:
                        yield from _emit(chunk)
                    break
                else:
                    break
    except Exception as exc:
        # Best-effort: a failure mid-stream is reported, not raised, so a
        # long-running consumer loop is not torn down by one bad stream.
        print(f"Stream processing error: {str(exc)}", file=sys.stderr)
428
+
429
+
430
async def _sanitize_stream_async(
    data: Union[str, Iterable[str], Iterable[bytes]],
    intro_value: str = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = 'utf-8',
    encoding_errors: str = 'replace',
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
) -> AsyncGenerator[Any, None]:
    """
    Asynchronously process a stream of strings or bytes, applying filtering and transformations.

    Asynchronous counterpart of :func:`_sanitize_stream_sync`. Plain strings and
    synchronous iterables are delegated to the sync implementation; async
    iterables are consumed natively.

    Args:
        data: String, iterable, or async iterable of strings or bytes.
        intro_value: Prefix indicating the start of meaningful data.
        to_json: Parse JSON content if ``True``.
        skip_markers: Lines containing any of these markers are skipped.
        strip_chars: Characters to strip from each line.
        start_marker: Begin processing only after this marker is found.
        end_marker: Stop processing once this marker is found.
        content_extractor: Optional callable to transform parsed content before yielding.
        yield_raw_on_error: Yield raw lines when JSON parsing fails.
        encoding: Byte stream encoding.
        encoding_errors: How to handle encoding errors.
        buffer_size: Buffer size for byte decoding.
        line_delimiter: Delimiter used to split incoming text into lines. ``None``
            uses ``str.splitlines()``.
        error_handler: Callback invoked with ``(Exception, str)`` when JSON parsing
            fails. If the callback returns a value, it is yielded in place of the raw line.

    Raises:
        TypeError: If the async stream yields items that are neither ``str`` nor ``bytes``.
    """
    # Strings and plain (synchronous) iterables share one delegation path;
    # the previous revision duplicated this call site verbatim.
    if isinstance(data, str) or not hasattr(data, "__aiter__"):
        for item in _sanitize_stream_sync(
            data,
            intro_value=intro_value,
            to_json=to_json,
            skip_markers=skip_markers,
            strip_chars=strip_chars,
            start_marker=start_marker,
            end_marker=end_marker,
            content_extractor=content_extractor,
            yield_raw_on_error=yield_raw_on_error,
            encoding=encoding,
            encoding_errors=encoding_errors,
            buffer_size=buffer_size,
            line_delimiter=line_delimiter,
            error_handler=error_handler,
        ):
            yield item
        return

    effective_skip_markers = skip_markers or []
    buffer = ""
    found_start = start_marker is None

    def _emit(chunk: str):
        # Split a chunk into logical lines, run each through _process_chunk and
        # the optional content_extractor, yielding every non-None result.
        sublines = (
            chunk.split(line_delimiter) if line_delimiter is not None
            else chunk.splitlines()
        )
        for subline in sublines:
            result = _process_chunk(
                subline, intro_value, to_json, effective_skip_markers,
                strip_chars, yield_raw_on_error, error_handler,
            )
            if result is None:
                continue
            if content_extractor:
                try:
                    extracted = content_extractor(result)
                    if extracted is not None:
                        yield extracted
                except Exception:
                    # A failing extractor drops the item; it must not kill the stream.
                    pass
            else:
                yield result

    iterator = data.__aiter__()
    # Sentinel instead of None: the previous revision treated a first item of
    # None as an empty stream and silently dropped everything.
    _MISSING = object()
    first_item = _MISSING
    async for item in iterator:
        first_item = item
        break
    if first_item is _MISSING:
        return

    async def _chain(first, rest):
        # Re-attach the peeked first item in front of the remaining iterator.
        yield first
        async for x in rest:
            yield x

    stream = _chain(first_item, iterator)

    if isinstance(first_item, bytes):
        line_iterator = _decode_byte_stream_async(
            stream,
            encoding=encoding,
            errors=encoding_errors,
            buffer_size=buffer_size,
        )
    elif isinstance(first_item, str):
        line_iterator = stream
    else:
        raise TypeError(
            f"Stream must yield strings or bytes, not {type(first_item).__name__}"
        )

    async for line in line_iterator:
        if not line:
            continue
        buffer += line
        while True:
            if not found_start and start_marker:
                idx = buffer.find(start_marker)
                if idx == -1:
                    # Marker not seen yet: keep a bounded tail so a marker split
                    # across chunks can still match without unbounded growth.
                    buffer = buffer[-max(len(start_marker), 256):]
                    break
                found_start = True
                buffer = buffer[idx + len(start_marker):]
            if found_start and end_marker:
                idx = buffer.find(end_marker)
                if idx != -1:
                    # Region closed: emit everything before the marker.
                    # (The previous revision dropped this final fragment.)
                    chunk, buffer = buffer[:idx], buffer[idx + len(end_marker):]
                    if chunk:
                        for item in _emit(chunk):
                            yield item
                    # Re-arm the start_marker search for the next region.
                    found_start = False
                else:
                    # End marker not in the buffer yet: stream what we have.
                    chunk, buffer = buffer, ""
                    if chunk:
                        for item in _emit(chunk):
                            yield item
                    break
            elif found_start:
                # No end marker configured: everything buffered is emitted.
                chunk, buffer = buffer, ""
                if chunk:
                    for item in _emit(chunk):
                        yield item
                break
            else:
                break
628
+
629
+
630
def sanitize_stream(
    data: Union[
        str,
        Iterable[str],
        Iterable[bytes],
        AsyncIterable[str],
        AsyncIterable[bytes],
    ],
    intro_value: str = "data:",
    to_json: bool = True,
    skip_markers: Optional[List[str]] = None,
    strip_chars: Optional[str] = None,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    content_extractor: Optional[Callable[[Union[str, Dict[str, Any]]], Optional[Any]]] = None,
    yield_raw_on_error: bool = True,
    encoding: EncodingType = "utf-8",
    encoding_errors: str = "replace",
    buffer_size: int = 8192,
    line_delimiter: Optional[str] = None,
    error_handler: Optional[Callable[[Exception, str], Optional[Any]]] = None,
) -> Union[Generator[Any, None, None], AsyncGenerator[Any, None]]:
    """
    Unified entry point for sanitizing streaming data, sync or async.

    Inspects *data* and dispatches to ``_sanitize_stream_async`` when it is an
    asynchronous iterable, otherwise to ``_sanitize_stream_sync``. All other
    parameters are forwarded unchanged.

    Args:
        data: A string, a synchronous iterable of strings/bytes, or an
            asynchronous iterable of strings/bytes.
        intro_value: Prefix indicating the start of meaningful data. Defaults to "data:".
        to_json: Parse JSON content if ``True``. Defaults to True.
        skip_markers: Lines containing any of these markers are skipped.
        strip_chars: Characters to strip from each line.
        start_marker: Begin processing only after this marker is found.
        end_marker: Stop processing once this marker is found.
        content_extractor: Optional callable to transform parsed content before yielding.
        yield_raw_on_error: Yield raw lines when JSON parsing fails. Defaults to True.
        encoding: Byte stream encoding. Defaults to "utf-8".
        encoding_errors: How to handle encoding errors. Defaults to "replace".
        buffer_size: Buffer size for byte decoding. Defaults to 8192.
        line_delimiter: Delimiter used to split incoming text into lines.
            ``None`` uses ``str.splitlines()``.
        error_handler: Callback invoked with ``(Exception, str)`` when JSON
            parsing fails; a non-None return value is yielded in place of the raw line.

    Returns:
        A generator (sync input) or async generator (async input) yielding the
        processed items.
    """
    # Bundle the shared options once; both implementations take the same set.
    options = dict(
        intro_value=intro_value,
        to_json=to_json,
        skip_markers=skip_markers,
        strip_chars=strip_chars,
        start_marker=start_marker,
        end_marker=end_marker,
        content_extractor=content_extractor,
        yield_raw_on_error=yield_raw_on_error,
        encoding=encoding,
        encoding_errors=encoding_errors,
        buffer_size=buffer_size,
        line_delimiter=line_delimiter,
        error_handler=error_handler,
    )
    if hasattr(data, "__aiter__"):
        return _sanitize_stream_async(data, **options)
    return _sanitize_stream_sync(data, **options)
698
+
699
+
700
+ from .conversation import Conversation # noqa: E402,F401
701
+ from .Extra.autocoder import AutoCoder # noqa: E402,F401
702
+ from .optimizers import Optimizers # noqa: E402,F401
703
+ from .prompt_manager import AwesomePrompts # noqa: E402,F401