kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_quality.py ADDED
@@ -0,0 +1,237 @@
+ """Quality post-processing utilities for extracted text."""
+
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+
+ # Pre-compiled patterns for performance
+ _OCR_ARTIFACTS = {
+     # Common OCR misreads
+     "scattered_chars": re.compile(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b"),
+     "repeated_punctuation": re.compile(r"[.]{3,}|[-]{3,}|[_]{3,}"),
+     "isolated_punctuation": re.compile(r"\s[.,;:!?]\s"),
+     "malformed_words": re.compile(r"\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b"),
+     "excessive_whitespace": re.compile(r"\s{3,}"),
+     "broken_sentences": re.compile(r"[a-z]\s{3,}[A-Z][a-z]"),
+ }
+
+ # Combined pattern for faster OCR penalty calculation
+ _COMBINED_OCR_PATTERN = re.compile(
+     r"(?P<scattered>\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b)|"
+     r"(?P<repeated>[.]{3,}|[-]{3,}|[_]{3,})|"
+     r"(?P<isolated>\s[.,;:!?]\s)|"
+     r"(?P<malformed>\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b)|"
+     r"(?P<whitespace>\s{3,})|"
+     r"(?P<broken>[a-z]\s{3,}[A-Z][a-z])"
+ )
+
+ # Pre-compiled patterns for text normalization
+ _WHITESPACE_NORMALIZE = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
+ _NEWLINE_NORMALIZE = re.compile(r"\n\s*\n\s*\n+")
+ _SENTENCE_DETECT = re.compile(r"[.!?]\s+[A-Z]")
+ _PUNCTUATION_DETECT = re.compile(r"[.!?]")
+
+ _SCRIPT_PATTERNS = {
+     # JavaScript and CSS content
+     "js_functions": re.compile(r"function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}", re.IGNORECASE),
+     "css_rules": re.compile(r"\.[a-zA-Z][\w-]*\s*\{[^}]*\}", re.IGNORECASE),
+     "script_tags": re.compile(r"<script[^>]*>.*?</script>", re.DOTALL | re.IGNORECASE),
+     "style_tags": re.compile(r"<style[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE),
+ }
+
+ _NAVIGATION_PATTERNS = {
+     "nav_words": re.compile(r"\b(?:Skip to main content|Back to top|Main navigation|Site navigation)\b", re.IGNORECASE),
+     "breadcrumbs": re.compile(r"(?:Home\s*[>»]\s*|[>»]\s*){2,}"),
+     "pagination": re.compile(
+         r"\b(?:Page \d+ of \d+|First page|Last page|Previous page|Next page|^\d+ of \d+$)\b", re.IGNORECASE
+     ),
+ }
+
+
+ def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
+     """Calculate overall quality score for extracted text.
+
+     Args:
+         text: The extracted text content
+         metadata: Optional metadata for additional scoring
+
+     Returns:
+         Quality score between 0.0 and 1.0
+     """
+     if not text or not text.strip():
+         return 0.0
+
+     # Initialize score
+     score = 1.0
+     total_chars = len(text)
+
+     # Penalize OCR artifacts
+     ocr_penalty = _calculate_ocr_penalty(text, total_chars)
+     score -= ocr_penalty * 0.3
+
+     # Penalize script/style content
+     script_penalty = _calculate_script_penalty(text, total_chars)
+     score -= script_penalty * 0.2
+
+     # Penalize navigation content
+     nav_penalty = _calculate_navigation_penalty(text, total_chars)
+     score -= nav_penalty * 0.1
+
+     # Bonus for structure (sentences, paragraphs)
+     structure_bonus = _calculate_structure_bonus(text)
+     score += structure_bonus * 0.2
+
+     # Bonus for metadata richness
+     if metadata:
+         metadata_bonus = _calculate_metadata_bonus(metadata)
+         score += metadata_bonus * 0.1
+
+     return max(0.0, min(1.0, score))
+
+
+ def clean_extracted_text(text: str) -> str:
+     """Clean extracted text by removing artifacts and improving quality.
+
+     Args:
+         text: The raw extracted text
+
+     Returns:
+         Cleaned text with artifacts removed
+     """
+     if not text:
+         return text
+
+     # Remove script and style content
+     for pattern in _SCRIPT_PATTERNS.values():
+         text = pattern.sub(" ", text)
+
+     # Clean OCR artifacts
+     text = _clean_ocr_artifacts(text)
+
+     # Clean navigation elements
+     text = _clean_navigation_elements(text)
+
+     # Normalize whitespace using pre-compiled patterns
+     text = _WHITESPACE_NORMALIZE.sub(" ", text)
+     text = _NEWLINE_NORMALIZE.sub("\n\n", text)
+
+     return text.strip()
+
+
+ def _calculate_ocr_penalty(text: str, total_chars: int) -> float:
+     """Calculate penalty for OCR artifacts."""
+     if total_chars == 0:
+         return 0.0
+
+     # Use combined pattern for single-pass processing
+     artifact_chars = sum(len(match.group()) for match in _COMBINED_OCR_PATTERN.finditer(text))
+     return min(1.0, artifact_chars / total_chars)
+
+
+ def _calculate_script_penalty(text: str, total_chars: int) -> float:
+     """Calculate penalty for script/style content."""
+     if total_chars == 0:
+         return 0.0
+
+     script_chars = 0
+     for pattern in _SCRIPT_PATTERNS.values():
+         matches = pattern.findall(text)
+         script_chars += sum(len(match) for match in matches)
+
+     return min(1.0, script_chars / total_chars)
+
+
+ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
+     """Calculate penalty for navigation content."""
+     if total_chars == 0:
+         return 0.0
+
+     nav_chars = 0
+     for pattern in _NAVIGATION_PATTERNS.values():
+         matches = pattern.findall(text)
+         nav_chars += sum(len(match) for match in matches)
+
+     return min(1.0, nav_chars / total_chars)
+
+
+ def _calculate_structure_bonus(text: str) -> float:
+     """Calculate bonus for proper text structure."""
+     if not text:
+         return 0.0
+
+     # Count sentences (rough heuristic)
+     sentence_count = len(_SENTENCE_DETECT.findall(text))
+
+     # Count paragraphs
+     paragraph_count = len(text.split("\n\n"))
+
+     # Calculate structure score
+     words = len(text.split())
+     if words == 0:
+         return 0.0
+
+     # Good structure: reasonable sentence and paragraph distribution
+     avg_words_per_sentence = words / max(1, sentence_count)
+     avg_words_per_paragraph = words / max(1, paragraph_count)
+
+     structure_score = 0.0
+
+     # Bonus for reasonable sentence length (10-30 words)
+     if 10 <= avg_words_per_sentence <= 30:
+         structure_score += 0.3
+
+     # Bonus for reasonable paragraph length (50-300 words)
+     if 50 <= avg_words_per_paragraph <= 300:
+         structure_score += 0.3
+
+     # Bonus for having multiple paragraphs
+     if paragraph_count > 1:
+         structure_score += 0.2
+
+     # Bonus for having punctuation
+     if _PUNCTUATION_DETECT.search(text):
+         structure_score += 0.2
+
+     return min(1.0, structure_score)
+
+
+ def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
+     """Calculate bonus for rich metadata."""
+     if not metadata:
+         return 0.0
+
+     important_fields = {"title", "author", "subject", "description", "keywords"}
+     present_fields = sum(1 for field in important_fields if metadata.get(field))
+
+     return present_fields / len(important_fields)
+
+
+ def _clean_ocr_artifacts(text: str) -> str:
+     """Remove common OCR artifacts from text."""
+     # Fix scattered characters (likely OCR errors)
+     text = _OCR_ARTIFACTS["scattered_chars"].sub(lambda m: m.group().replace(" ", ""), text)
+
+     # Clean repeated punctuation
+     text = _OCR_ARTIFACTS["repeated_punctuation"].sub("...", text)
+
+     # Fix isolated punctuation
+     text = _OCR_ARTIFACTS["isolated_punctuation"].sub(" ", text)
+
+     # Remove malformed words with numbers mixed in
+     text = _OCR_ARTIFACTS["malformed_words"].sub(" ", text)
+
+     # Normalize excessive whitespace
+     return _OCR_ARTIFACTS["excessive_whitespace"].sub(" ", text)
+
+
+ def _clean_navigation_elements(text: str) -> str:
+     """Remove navigation elements from text."""
+     # Remove navigation words
+     text = _NAVIGATION_PATTERNS["nav_words"].sub(" ", text)
+
+     # Remove breadcrumbs
+     text = _NAVIGATION_PATTERNS["breadcrumbs"].sub(" ", text)
+
+     # Remove pagination
+     return _NAVIGATION_PATTERNS["pagination"].sub(" ", text)
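For orientation, a minimal usage sketch of the new quality helpers, assuming the module path shown in the file list (kreuzberg/_utils/_quality.py); the sample text and metadata are illustrative only:

    # Illustrative only: exercises the two public helpers from the new module.
    from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text

    raw = "Skip to main content\n\nThe   quarterly report ----- covers revenue.\n\nPage 3 of 10"
    cleaned = clean_extracted_text(raw)  # strips navigation text, OCR artifacts, extra whitespace

    # title/author/subject/description/keywords in metadata add a small bonus to the score.
    score = calculate_quality_score(cleaned, metadata={"title": "Quarterly report", "author": "Example"})
    print(round(score, 2), cleaned)  # the score is clamped to the 0.0-1.0 range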
kreuzberg/_utils/_serialization.py CHANGED
@@ -29,8 +29,10 @@ def encode_hook(obj: Any) -> Any:
          "to_list",
          "tolist",
      ):
-         if hasattr(obj, key) and callable(getattr(obj, key)):
-             return getattr(obj, key)()
+         if hasattr(obj, key):
+             method = getattr(obj, key)  # Cache the attribute lookup
+             if callable(method):
+                 return method()
 
      if is_dataclass(obj) and not isinstance(obj, type):
          return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
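As context for the change above, a hedged sketch of the duck-typed fallback that the cached lookup serves; Report is a made-up class, and calling encode_hook directly is only for illustration:

    from kreuzberg._utils._serialization import encode_hook


    class Report:  # hypothetical object exposing a to_dict() method
        def __init__(self, title: str) -> None:
            self.title = title

        def to_dict(self) -> dict[str, str]:
            return {"title": self.title}


    # encode_hook looks each candidate attribute up once, then calls it only if it is callable.
    print(encode_hook(Report("Q3")))  # expected: {'title': 'Q3'}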
kreuzberg/_utils/_string.py CHANGED
@@ -1,39 +1,182 @@
  from __future__ import annotations
 
+ import hashlib
+ import re
  from contextlib import suppress
+ from functools import lru_cache
 
- from charset_normalizer import detect
+ import chardetng_py
+
+ # Compile regex patterns once at module level for performance
+ _WHITESPACE_PATTERN = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
+ _NEWLINES_PATTERN = re.compile(r"\n+")
+ _MOJIBAKE_PATTERNS = {
+     # Hebrew as Cyrillic patterns
+     "hebrew_as_cyrillic": re.compile(r"[\u0400-\u04FF]{3,}"),
+     # Control characters that shouldn't appear in text
+     "control_chars": re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]"),
+     # Unicode replacement characters
+     "replacement_chars": re.compile(r"\uFFFD+"),
+     # Isolated combining marks (likely encoding issues)
+     "isolated_combining": re.compile(r"[\u0300-\u036F](?![^\u0300-\u036F])"),
+ }
+
+ # Simple cache for encoding detection (in-memory, session-scoped)
+ _encoding_cache: dict[str, str] = {}
+
+
+ @lru_cache(maxsize=128)
+ def _get_encoding_cache_key(data_hash: str, size: int) -> str:
+     """Generate cache key for encoding detection."""
+     return f"{data_hash}:{size}"
 
 
  def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
-     """Decode a byte string safely, removing invalid sequences.
+     """Decode a byte string safely with mojibake detection and correction.
 
      Args:
          byte_data: The byte string to decode.
          encoding: The encoding to use when decoding the byte string.
 
      Returns:
-         The decoded string.
+         The decoded string with mojibake detection and correction.
      """
      if not byte_data:
          return ""
 
-     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
+     # Try provided encoding first (fastest path)
+     if encoding:
+         with suppress(UnicodeDecodeError, LookupError):
+             decoded = byte_data.decode(encoding)
+             return _fix_mojibake(decoded)
 
-     for enc in [e for e in encodings if e]:
+     # Check cache for similar content (performance optimization)
+     data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]  # Hash first 1KB
+     cache_key = _get_encoding_cache_key(data_hash, len(byte_data))
+
+     if cache_key in _encoding_cache:
+         cached_encoding = _encoding_cache[cache_key]
+         with suppress(UnicodeDecodeError, LookupError):
+             decoded = byte_data.decode(cached_encoding)
+             return _fix_mojibake(decoded)
+
+     # Use chardetng for better performance than charset-normalizer
+     detected_encoding = chardetng_py.detect(byte_data)
+     if detected_encoding:
          with suppress(UnicodeDecodeError, LookupError):
-             return byte_data.decode(enc)
+             decoded = byte_data.decode(detected_encoding)
+             # Cache successful encoding detection
+             if len(_encoding_cache) < 1000:  # Prevent unlimited growth
+                 _encoding_cache[cache_key] = detected_encoding
+             return _fix_mojibake(decoded)
+
+     # Try multiple encodings with confidence scoring
+     encodings_to_try = [
+         "utf-8",
+         "windows-1255",  # Hebrew
+         "iso-8859-8",  # Hebrew
+         "windows-1256",  # Arabic
+         "iso-8859-6",  # Arabic
+         "windows-1252",  # Western European
+         "cp1251",  # Cyrillic
+     ]
 
+     best_result = None
+     best_confidence = 0.0
+
+     for enc in encodings_to_try:
+         with suppress(UnicodeDecodeError, LookupError):
+             decoded = byte_data.decode(enc)
+             confidence = _calculate_text_confidence(decoded)
+             if confidence > best_confidence:
+                 best_confidence = confidence
+                 best_result = decoded
+
+     if best_result and best_confidence > 0.5:
+         return _fix_mojibake(best_result)
+
+     # Final fallback
      return byte_data.decode("latin-1", errors="replace")
 
 
+ def _calculate_text_confidence(text: str) -> float:
+     """Calculate confidence score for decoded text quality."""
+     if not text:
+         return 0.0
+
+     # Check for common encoding problems
+     replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
+     control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
+     total_chars = len(text)
+
+     if total_chars == 0:
+         return 0.0
+
+     # Penalize replacement and control characters
+     penalty = (replacement_count + control_count * 2) / total_chars
+
+     # Bonus for readable character ranges
+     readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
+     readability_score = readable_chars / total_chars
+
+     # Check for suspicious Cyrillic that might be misencoded Hebrew
+     cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
+     if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
+         penalty += 0.3  # Heavy penalty for likely mojibake
+
+     return max(0.0, min(1.0, readability_score - penalty))
+
+
+ def _fix_mojibake(text: str) -> str:
+     """Attempt to fix common mojibake patterns."""
+     if not text:
+         return text
+
+     # Remove control characters
+     text = _MOJIBAKE_PATTERNS["control_chars"].sub("", text)
+
+     # Remove replacement characters
+     text = _MOJIBAKE_PATTERNS["replacement_chars"].sub("", text)
+
+     # Remove isolated combining marks
+     text = _MOJIBAKE_PATTERNS["isolated_combining"].sub("", text)
+
+     # Try to fix Hebrew encoded as Cyrillic (common Windows-1255 -> CP1251 confusion)
+     if _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].search(text):
+         # This is a heuristic fix - in practice, you'd need actual character mapping
+         # For now, we flag it for manual review by keeping the text but adding a marker
+         pass
+
+     return text
+
+
  def normalize_spaces(text: str) -> str:
-     """Normalize the spaces in a string.
+     """Normalize spaces while preserving line breaks and paragraph structure.
 
      Args:
-         text: The text to sanitize.
+         text: The text to normalize.
 
      Returns:
-         The sanitized text.
+         The normalized text with proper spacing.
      """
-     return " ".join(text.strip().split())
+     if not text or not text.strip():
+         return ""
+
+     # Split by double newlines to preserve paragraph breaks
+     paragraphs = text.split("\n\n")
+     normalized_paragraphs = []
+
+     for paragraph in paragraphs:
+         # Use pre-compiled patterns for better performance
+         # Replace multiple whitespace (except newlines) with single space
+         cleaned = _WHITESPACE_PATTERN.sub(" ", paragraph)
+         # Clean up multiple newlines within paragraph (keep single newlines)
+         cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
+
+         # Strip and filter empty lines efficiently
+         lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
+
+         if lines:
+             normalized_paragraphs.append("\n".join(lines))
+
+     return "\n\n".join(normalized_paragraphs)
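A brief usage sketch of the reworked string helpers; the byte strings are illustrative, and the guessed encoding ultimately depends on chardetng:

    from kreuzberg._utils._string import normalize_spaces, safe_decode

    # An explicit encoding takes the fast path; otherwise the per-content cache,
    # chardetng detection, and the confidence-scored fallbacks run in turn.
    hebrew = safe_decode("שלום עולם".encode("windows-1255"), encoding="windows-1255")
    guessed = safe_decode("plain ascii bytes".encode("utf-8"))

    # Paragraph breaks (double newlines) survive; runs of spaces collapse.
    print(normalize_spaces("a  b\n\n\n\nc   d"))  # -> "a b\n\nc d"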
kreuzberg/_utils/_sync.py CHANGED
@@ -1,6 +1,5 @@
  from __future__ import annotations
 
- import sys
  from functools import partial
  from inspect import isawaitable, iscoroutinefunction
  from typing import TYPE_CHECKING, Any, TypeVar, cast
@@ -12,10 +11,7 @@ from anyio.to_thread import run_sync as any_io_run_sync
  if TYPE_CHECKING:  # pragma: no cover
      from collections.abc import Awaitable, Callable
 
- if sys.version_info >= (3, 10):
-     from typing import ParamSpec
- else:  # pragma: no cover
-     from typing_extensions import ParamSpec
+ from typing import ParamSpec
 
  T = TypeVar("T")
  P = ParamSpec("P")
@@ -32,8 +28,11 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
      Returns:
          The result of the synchronous function.
      """
-     handler = partial(sync_fn, **kwargs)
-     return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+     # Optimize: only create partial if we have kwargs
+     if kwargs:
+         handler = partial(sync_fn, **kwargs)
+         return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+     return cast("T", await any_io_run_sync(sync_fn, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
 
 
  async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
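Finally, a small sketch of the two call shapes that the new run_sync branch distinguishes; blocking_add is a made-up helper:

    import anyio

    from kreuzberg._utils._sync import run_sync


    def blocking_add(a: int, b: int = 0) -> int:  # hypothetical blocking helper
        return a + b


    async def main() -> None:
        # No keyword arguments: sync_fn is handed to the worker thread directly.
        print(await run_sync(blocking_add, 1, 2))
        # Keyword arguments: the call is wrapped in functools.partial first.
        print(await run_sync(blocking_add, 1, b=41))


    anyio.run(main)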