aleph-rlm 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aleph/__init__.py +49 -0
- aleph/cache/__init__.py +6 -0
- aleph/cache/base.py +20 -0
- aleph/cache/memory.py +27 -0
- aleph/cli.py +1044 -0
- aleph/config.py +154 -0
- aleph/core.py +874 -0
- aleph/mcp/__init__.py +30 -0
- aleph/mcp/local_server.py +3527 -0
- aleph/mcp/server.py +20 -0
- aleph/prompts/__init__.py +5 -0
- aleph/prompts/system.py +45 -0
- aleph/providers/__init__.py +14 -0
- aleph/providers/anthropic.py +253 -0
- aleph/providers/base.py +59 -0
- aleph/providers/openai.py +224 -0
- aleph/providers/registry.py +22 -0
- aleph/repl/__init__.py +5 -0
- aleph/repl/helpers.py +1068 -0
- aleph/repl/sandbox.py +777 -0
- aleph/sub_query/__init__.py +166 -0
- aleph/sub_query/api_backend.py +166 -0
- aleph/sub_query/cli_backend.py +327 -0
- aleph/types.py +216 -0
- aleph/utils/__init__.py +6 -0
- aleph/utils/logging.py +79 -0
- aleph/utils/tokens.py +43 -0
- aleph_rlm-0.6.0.dist-info/METADATA +358 -0
- aleph_rlm-0.6.0.dist-info/RECORD +32 -0
- aleph_rlm-0.6.0.dist-info/WHEEL +4 -0
- aleph_rlm-0.6.0.dist-info/entry_points.txt +3 -0
- aleph_rlm-0.6.0.dist-info/licenses/LICENSE +21 -0
aleph/repl/helpers.py
ADDED
@@ -0,0 +1,1068 @@
"""Built-in helper functions exposed inside the Aleph REPL.

These helpers provide powerful text analysis capabilities for any kind of document:
- Code, logs, configs, legal docs, financial reports, research papers, etc.

The REPL injects wrappers so that the LLM can call these directly.
"""

from __future__ import annotations

import hashlib
import math
import re
from collections import Counter
from typing import TypedDict, Any, Callable, Sequence, Iterable
# =============================================================================
# Type definitions
# =============================================================================

class SearchResult(TypedDict):
    match: str
    line_num: int
    context: str


class Citation(TypedDict):
    """Manual citation for evidence tracking."""
    snippet: str
    line_range: tuple[int, int] | None
    note: str | None


class ExtractedMatch(TypedDict):
    """Result from extraction functions."""
    value: str
    line_num: int
    start: int
    end: int


# =============================================================================
# Core helpers (original)
# =============================================================================

def _to_text(ctx: object) -> str:
    """Best-effort conversion of context into a string."""
    if ctx is None:
        return ""
    if isinstance(ctx, str):
        return ctx
    if isinstance(ctx, bytes):
        try:
            return ctx.decode("utf-8", errors="replace")
        except Exception:
            return repr(ctx)
    if isinstance(ctx, (dict, list, tuple)):
        try:
            import json
            return json.dumps(ctx, indent=2, ensure_ascii=False)
        except Exception:
            return str(ctx)
    return str(ctx)


def peek(ctx: object, start: int = 0, end: int | None = None) -> str:
    """Get a character slice of the context."""
    text = _to_text(ctx)
    return text[start:end]


def lines(ctx: object, start: int = 0, end: int | None = None) -> str:
    """Get a line slice of the context."""
    text = _to_text(ctx)
    parts = text.splitlines()
    return "\n".join(parts[start:end])


def search(
    ctx: object,
    pattern: str,
    context_lines: int = 2,
    flags: int = 0,
    max_results: int = 20,
) -> list[SearchResult]:
    """Regex search returning matching lines with surrounding context.

    Returns list of dicts: {"match": str, "line_num": int, "context": str}
    """
    text = _to_text(ctx)
    lines_list = text.splitlines()
    results: list[SearchResult] = []
    rx = re.compile(pattern, flags=flags)

    for i, line in enumerate(lines_list):
        if rx.search(line):
            start = max(0, i - context_lines)
            end = min(len(lines_list), i + context_lines + 1)
            results.append({
                "match": line,
                "line_num": i,
                "context": "\n".join(lines_list[start:end]),
            })
            if len(results) >= max_results:
                break

    return results


def chunk(ctx: object, chunk_size: int, overlap: int = 0) -> list[str]:
    """Split context into chunks by character count."""
    if chunk_size <= 0:
        raise ValueError("chunk_size must be > 0")
    if overlap < 0:
        raise ValueError("overlap must be >= 0")
    if overlap >= chunk_size:
        raise ValueError("overlap must be < chunk_size")

    text = _to_text(ctx)
    out: list[str] = []
    i = 0
    n = len(text)
    while i < n:
        j = min(n, i + chunk_size)
        out.append(text[i:j])
        if j == n:
            break
        i = j - overlap
    return out


def cite(
    snippet: str,
    line_range: tuple[int, int] | None = None,
    note: str | None = None,
) -> Citation:
    """Manually cite evidence for provenance tracking."""
    return Citation(
        snippet=snippet[:500],
        line_range=line_range,
        note=note,
    )
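# --- Usage sketch (editor's illustration; not part of the packaged file) ---
# A minimal example of how the core helpers compose, assuming the module is
# importable as aleph.repl.helpers:
#
#     from aleph.repl.helpers import search, chunk, cite
#
#     log = "boot ok\nERROR disk full\nretrying"
#     hits = search(log, r"ERROR", context_lines=1)  # [{'match': 'ERROR disk full', 'line_num': 1, ...}]
#     pieces = chunk(log, chunk_size=16, overlap=4)  # overlapping character windows
#     note = cite(hits[0]["match"], line_range=(1, 1), note="disk failure")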
# =============================================================================
# Extraction helpers - pull structured data from text
# =============================================================================

def _extract_with_pattern(
    ctx: object,
    pattern: str,
    flags: int = 0,
    max_results: int = 100,
) -> list[ExtractedMatch]:
    """Generic extraction helper."""
    text = _to_text(ctx)
    lines_list = text.splitlines()
    results: list[ExtractedMatch] = []
    rx = re.compile(pattern, flags=flags)

    for line_num, line in enumerate(lines_list):
        for m in rx.finditer(line):
            results.append({
                "value": m.group(0),
                "line_num": line_num,
                "start": m.start(),
                "end": m.end(),
            })
            if len(results) >= max_results:
                return results
    return results


def extract_numbers(ctx: object, include_negative: bool = True, include_decimals: bool = True) -> list[ExtractedMatch]:
    """Extract all numbers from text.

    Returns list of {"value": str, "line_num": int, "start": int, "end": int}
    """
    if include_decimals and include_negative:
        pattern = r'-?\d+\.?\d*'
    elif include_decimals:
        pattern = r'\d+\.?\d*'
    elif include_negative:
        pattern = r'-?\d+'
    else:
        pattern = r'\d+'
    return _extract_with_pattern(ctx, pattern)


def extract_money(ctx: object, currencies: str = r'[$€£¥₹]') -> list[ExtractedMatch]:
    """Extract monetary amounts like $1,234.56 or €100."""
    pattern = rf'{currencies}\s*[\d,]+\.?\d*|\d+\.?\d*\s*{currencies}'
    return _extract_with_pattern(ctx, pattern)


def extract_percentages(ctx: object) -> list[ExtractedMatch]:
    """Extract percentages like 45%, 3.14%, -2.5%."""
    pattern = r'-?\d+\.?\d*\s*%'
    return _extract_with_pattern(ctx, pattern)


def extract_dates(ctx: object) -> list[ExtractedMatch]:
    """Extract dates in common formats (YYYY-MM-DD, MM/DD/YYYY, etc.)."""
    patterns = [
        r'\d{4}-\d{2}-\d{2}',  # ISO: 2024-01-15
        r'\d{1,2}/\d{1,2}/\d{2,4}',  # US: 1/15/2024
        r'\d{1,2}-\d{1,2}-\d{2,4}',  # 15-01-2024
        r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}',  # Jan 15, 2024
        r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4}',  # 15 Jan 2024
    ]
    combined = '|'.join(f'({p})' for p in patterns)
    return _extract_with_pattern(ctx, combined, flags=re.IGNORECASE)


def extract_times(ctx: object) -> list[ExtractedMatch]:
    """Extract times like 14:30, 2:30 PM, 14:30:45."""
    pattern = r'\d{1,2}:\d{2}(?::\d{2})?(?:\s*[AaPp][Mm])?'
    return _extract_with_pattern(ctx, pattern)


def extract_timestamps(ctx: object) -> list[ExtractedMatch]:
    """Extract ISO timestamps and common log formats."""
    patterns = [
        r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?',  # ISO 8601
        r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}',  # Common log format
        r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}',  # Syslog
    ]
    combined = '|'.join(f'({p})' for p in patterns)
    return _extract_with_pattern(ctx, combined, flags=re.IGNORECASE)


def extract_emails(ctx: object) -> list[ExtractedMatch]:
    """Extract email addresses."""
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return _extract_with_pattern(ctx, pattern)


def extract_urls(ctx: object) -> list[ExtractedMatch]:
    """Extract URLs (http, https, ftp)."""
    pattern = r'https?://[^\s<>"\']+|ftp://[^\s<>"\']+|www\.[^\s<>"\']+'
    return _extract_with_pattern(ctx, pattern)


def extract_ips(ctx: object, include_ipv6: bool = False) -> list[ExtractedMatch]:
    """Extract IP addresses."""
    ipv4 = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    if include_ipv6:
        ipv6 = r'(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}'
        pattern = f'{ipv4}|{ipv6}'
    else:
        pattern = ipv4
    return _extract_with_pattern(ctx, pattern)


def extract_phones(ctx: object) -> list[ExtractedMatch]:
    """Extract phone numbers in various formats."""
    pattern = r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    return _extract_with_pattern(ctx, pattern)


def extract_hex(ctx: object) -> list[ExtractedMatch]:
    """Extract hexadecimal values like 0x1F, #FF5733, etc."""
    pattern = r'0x[0-9a-fA-F]+|#[0-9a-fA-F]{3,8}\b'
    return _extract_with_pattern(ctx, pattern)


def extract_uuids(ctx: object) -> list[ExtractedMatch]:
    """Extract UUIDs."""
    pattern = r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}'
    return _extract_with_pattern(ctx, pattern)


def extract_paths(ctx: object) -> list[ExtractedMatch]:
    """Extract file paths (Unix and Windows)."""
    patterns = [
        r'/(?:[^/\s]+/)*[^/\s]+',  # Unix: /path/to/file
        r'[A-Za-z]:\\(?:[^\\:\s]+\\)*[^\\:\s]+',  # Windows: C:\path\to\file
        r'\.{1,2}/(?:[^/\s]+/)*[^/\s]*',  # Relative: ./path or ../path
    ]
    combined = '|'.join(f'({p})' for p in patterns)
    return _extract_with_pattern(ctx, combined)


def extract_env_vars(ctx: object) -> list[ExtractedMatch]:
    """Extract environment variable references like $VAR, ${VAR}, %VAR%."""
    pattern = r'\$\{[A-Za-z_][A-Za-z0-9_]*\}|\$[A-Za-z_][A-Za-z0-9_]*|%[A-Za-z_][A-Za-z0-9_]*%'
    return _extract_with_pattern(ctx, pattern)


def extract_versions(ctx: object) -> list[ExtractedMatch]:
    """Extract version numbers like v1.2.3, 2.0.0-beta, etc."""
    pattern = r'v?\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?(?:\+[a-zA-Z0-9.]+)?'
    return _extract_with_pattern(ctx, pattern)


def extract_hashes(ctx: object) -> list[ExtractedMatch]:
    """Extract common hash formats (MD5, SHA1, SHA256)."""
    patterns = [
        r'\b[a-fA-F0-9]{32}\b',  # MD5
        r'\b[a-fA-F0-9]{40}\b',  # SHA1
        r'\b[a-fA-F0-9]{64}\b',  # SHA256
    ]
    combined = '|'.join(patterns)
    return _extract_with_pattern(ctx, combined)
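# --- Usage sketch (editor's illustration; not part of the packaged file) ---
# The extractors all funnel through _extract_with_pattern, so they share the
# same result shape:
#
#     report = "Q3 revenue was $1,200.50 (up 4.5%) as of 2024-10-01."
#     extract_money(report)[0]["value"]        # '$1,200.50'
#     extract_percentages(report)[0]["value"]  # '4.5%'
#     extract_dates(report)[0]["value"]        # '2024-10-01'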
# =============================================================================
# Code-specific extraction
# =============================================================================

def extract_functions(ctx: object, lang: str = "python") -> list[ExtractedMatch]:
    """Extract function definitions."""
    patterns = {
        "python": r'(?:async\s+)?def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(',
        "javascript": r'(?:async\s+)?function\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\(|(?:const|let|var)\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>',
        "go": r'func\s+(?:\([^)]+\)\s+)?([a-zA-Z_][a-zA-Z0-9_]*)\s*\(',
        "rust": r'(?:pub\s+)?(?:async\s+)?fn\s+([a-zA-Z_][a-zA-Z0-9_]*)',
        "java": r'(?:public|private|protected)?\s*(?:static\s+)?(?:\w+\s+)+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(',
    }
    pattern = patterns.get(lang.lower(), patterns["python"])
    return _extract_with_pattern(ctx, pattern)


def extract_classes(ctx: object, lang: str = "python") -> list[ExtractedMatch]:
    """Extract class definitions."""
    patterns = {
        "python": r'class\s+([A-Za-z_][A-Za-z0-9_]*)',
        "javascript": r'class\s+([A-Za-z_$][A-Za-z0-9_$]*)',
        "java": r'(?:public\s+)?(?:abstract\s+)?(?:final\s+)?class\s+([A-Za-z_][A-Za-z0-9_]*)',
        "go": r'type\s+([A-Za-z_][A-Za-z0-9_]*)\s+struct',
        "rust": r'(?:pub\s+)?struct\s+([A-Za-z_][A-Za-z0-9_]*)',
    }
    pattern = patterns.get(lang.lower(), patterns["python"])
    return _extract_with_pattern(ctx, pattern)


def extract_imports(ctx: object, lang: str = "python") -> list[ExtractedMatch]:
    """Extract import statements."""
    patterns = {
        "python": r'(?:from\s+[\w.]+\s+)?import\s+[\w., ]+',
        "javascript": r'import\s+.*?from\s+[\'"][^\'"]+[\'"]|require\s*\([\'"][^\'"]+[\'"]\)',
        "go": r'import\s+(?:\(\s*(?:"[^"]+"\s*)+\)|"[^"]+")',
        "java": r'import\s+[\w.]+;',
        "rust": r'use\s+[\w:]+;',
    }
    pattern = patterns.get(lang.lower(), patterns["python"])
    return _extract_with_pattern(ctx, pattern)


def extract_comments(ctx: object, lang: str = "python") -> list[ExtractedMatch]:
    """Extract comments."""
    patterns = {
        "python": r'#.*$|\'\'\'[\s\S]*?\'\'\'|"""[\s\S]*?"""',
        "javascript": r'//.*$|/\*[\s\S]*?\*/',
        "go": r'//.*$|/\*[\s\S]*?\*/',
        "java": r'//.*$|/\*[\s\S]*?\*/',
        "rust": r'//.*$|/\*[\s\S]*?\*/',
        "c": r'//.*$|/\*[\s\S]*?\*/',
        "html": r'<!--[\s\S]*?-->',
        "css": r'/\*[\s\S]*?\*/',
    }
    pattern = patterns.get(lang.lower(), patterns["python"])
    return _extract_with_pattern(ctx, pattern, flags=re.MULTILINE)


def extract_routes(ctx: object, lang: str = "auto") -> list[ExtractedMatch]:
    """Extract route definitions from common web frameworks."""
    patterns = {
        "python": r'@(?:app|router)\.(?:get|post|put|delete|patch|options|head)\(\s*["\'][^"\']+',
        "django": r'\b(?:path|re_path)\(\s*r?["\'][^"\']+',
        "javascript": r'\b(?:app|router)\.(?:get|post|put|delete|patch|options|head|use)\(\s*["\'][^"\']+',
        "ruby": r'\b(?:get|post|put|delete|patch|match)\s+["\'][^"\']+',
    }
    key = lang.lower().strip() if isinstance(lang, str) else "auto"
    if key in patterns:
        pattern = patterns[key]
    else:
        pattern = "|".join(f"({p})" for p in patterns.values())
    return _extract_with_pattern(ctx, pattern, flags=re.IGNORECASE | re.MULTILINE)


def extract_strings(ctx: object) -> list[ExtractedMatch]:
    """Extract string literals (single, double, backtick quotes)."""
    pattern = r'"(?:[^"\\]|\\.)*"|\'(?:[^\'\\]|\\.)*\'|`(?:[^`\\]|\\.)*`'
    return _extract_with_pattern(ctx, pattern)


def extract_todos(ctx: object) -> list[ExtractedMatch]:
    """Extract TODO, FIXME, HACK, XXX comments."""
    pattern = r'(?:TODO|FIXME|HACK|XXX|BUG|NOTE)[\s:]+.*'
    return _extract_with_pattern(ctx, pattern, flags=re.IGNORECASE)
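# --- Usage sketch (editor's illustration; not part of the packaged file) ---
# Matching runs line by line, so multi-line alternatives (such as the Python
# docstring branch in extract_comments) only fire when the whole literal sits
# on one line:
#
#     src = "async def fetch(url):  # TODO: add retries"
#     extract_functions(src)[0]["value"]  # 'async def fetch('
#     extract_todos(src)[0]["value"]      # 'TODO: add retries'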
# =============================================================================
# Log-specific extraction
# =============================================================================

def extract_log_levels(ctx: object) -> list[ExtractedMatch]:
    """Extract log levels (ERROR, WARN, INFO, DEBUG, etc.)."""
    pattern = r'\b(?:FATAL|ERROR|WARN(?:ING)?|INFO|DEBUG|TRACE)\b'
    return _extract_with_pattern(ctx, pattern, flags=re.IGNORECASE)


def extract_exceptions(ctx: object) -> list[ExtractedMatch]:
    """Extract exception/error messages from logs or stack traces."""
    patterns = [
        r'(?:Exception|Error|Traceback).*',
        r'at\s+[\w.$]+\([\w.:]+\)',  # Java stack trace
        r'File ".*", line \d+',  # Python stack trace
    ]
    combined = '|'.join(patterns)
    return _extract_with_pattern(ctx, combined)


def extract_json_objects(ctx: object) -> list[ExtractedMatch]:
    """Extract JSON-like objects {...} from text."""
    # Simple brace matching (doesn't handle deep nesting perfectly, but good enough)
    pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    return _extract_with_pattern(ctx, pattern)
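# --- Usage sketch (editor's illustration; not part of the packaged file) ---
#
#     log = 'ERROR payment failed {"code": 402, "retry": false}'
#     extract_log_levels(log)[0]["value"]    # 'ERROR'
#     extract_json_objects(log)[0]["value"]  # '{"code": 402, "retry": false}'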
# =============================================================================
# Text statistics
# =============================================================================

def word_count(ctx: object) -> int:
    """Count total words in text."""
    text = _to_text(ctx)
    return len(text.split())


def char_count(ctx: object, include_whitespace: bool = True) -> int:
    """Count characters in text."""
    text = _to_text(ctx)
    if include_whitespace:
        return len(text)
    return len(re.sub(r"\s", "", text))  # strip all whitespace, not just space/tab/newline


def line_count(ctx: object) -> int:
    """Count lines in text."""
    text = _to_text(ctx)
    return len(text.splitlines())


def sentence_count(ctx: object) -> int:
    """Estimate sentence count (splits on .!?)."""
    text = _to_text(ctx)
    sentences = re.split(r'[.!?]+', text)
    return len([s for s in sentences if s.strip()])


def paragraph_count(ctx: object) -> int:
    """Count paragraphs (separated by blank lines)."""
    text = _to_text(ctx)
    paragraphs = re.split(r'\n\s*\n', text)
    return len([p for p in paragraphs if p.strip()])


def unique_words(ctx: object, case_insensitive: bool = True) -> list[str]:
    """Get list of unique words."""
    text = _to_text(ctx)
    if case_insensitive:
        text = text.lower()
    words = re.findall(r'\b\w+\b', text)
    return list(dict.fromkeys(words))  # preserves order


def word_frequency(ctx: object, top_n: int = 20, case_insensitive: bool = True) -> list[tuple[str, int]]:
    """Get word frequency distribution."""
    text = _to_text(ctx)
    if case_insensitive:
        text = text.lower()
    words = re.findall(r'\b\w+\b', text)
    return Counter(words).most_common(top_n)


def ngrams(ctx: object, n: int = 2, top_k: int = 20) -> list[tuple[tuple[str, ...], int]]:
    """Get most common n-grams."""
    text = _to_text(ctx)
    words = re.findall(r'\b\w+\b', text.lower())
    grams = [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]
    return Counter(grams).most_common(top_k)
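# --- Usage sketch (editor's illustration; not part of the packaged file) ---
#
#     text = "to be or not to be"
#     word_frequency(text, top_n=2)  # [('to', 2), ('be', 2)]
#     ngrams(text, n=2, top_k=1)     # [(('to', 'be'), 2)]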
# =============================================================================
# Line operations (grep-like)
# =============================================================================

def head(ctx: object, n: int = 10) -> str:
    """Get first n lines."""
    text = _to_text(ctx)
    return "\n".join(text.splitlines()[:n])


def tail(ctx: object, n: int = 10) -> str:
    """Get last n lines."""
    text = _to_text(ctx)
    return "\n".join(text.splitlines()[-n:])


def grep(ctx: object, pattern: str, flags: int = 0) -> list[str]:
    """Filter lines matching pattern (like grep)."""
    text = _to_text(ctx)
    rx = re.compile(pattern, flags=flags)
    return [line for line in text.splitlines() if rx.search(line)]


def grep_v(ctx: object, pattern: str, flags: int = 0) -> list[str]:
    """Filter lines NOT matching pattern (like grep -v)."""
    text = _to_text(ctx)
    rx = re.compile(pattern, flags=flags)
    return [line for line in text.splitlines() if not rx.search(line)]


def grep_c(ctx: object, pattern: str, flags: int = 0) -> int:
    """Count lines matching pattern (like grep -c)."""
    return len(grep(ctx, pattern, flags))


def uniq(ctx: object) -> list[str]:
    """Remove duplicate consecutive lines (like uniq)."""
    text = _to_text(ctx)
    lines_list = text.splitlines()
    result = []
    prev = None
    for line in lines_list:
        if line != prev:
            result.append(line)
        prev = line
    return result


def sort_lines(ctx: object, reverse: bool = False, numeric: bool = False) -> list[str]:
    """Sort lines alphabetically or numerically."""
    text = _to_text(ctx)
    lines_list = text.splitlines()
    if numeric:
        def key(x: str) -> float:
            nums = re.findall(r'-?\d+\.?\d*', x)
            return float(nums[0]) if nums else 0
        return sorted(lines_list, key=key, reverse=reverse)
    return sorted(lines_list, reverse=reverse)


def number_lines(ctx: object, start: int = 1) -> str:
    """Add line numbers to text."""
    text = _to_text(ctx)
    lines_list = text.splitlines()
    width = len(str(start + max(len(lines_list) - 1, 0)))  # width of the largest line number
    return "\n".join(f"{i:{width}d}: {line}" for i, line in enumerate(lines_list, start))


def strip_lines(ctx: object) -> list[str]:
    """Strip whitespace from each line."""
    text = _to_text(ctx)
    return [line.strip() for line in text.splitlines()]


def blank_lines(ctx: object) -> list[int]:
    """Get indices of blank lines."""
    text = _to_text(ctx)
    return [i for i, line in enumerate(text.splitlines()) if not line.strip()]


def non_blank_lines(ctx: object) -> list[str]:
    """Filter out blank lines."""
    text = _to_text(ctx)
    return [line for line in text.splitlines() if line.strip()]


def columns(ctx: object, col: int, delim: str = r'\s+') -> list[str]:
    """Extract a column from delimited text (0-indexed)."""
    text = _to_text(ctx)
    result = []
    for line in text.splitlines():
        parts = re.split(delim, line)
        if col < len(parts):
            result.append(parts[col])
    return result
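# --- Usage sketch (editor's illustration; not part of the packaged file) ---
# These compose like a shell pipeline:
#
#     ps = "root 1 init\nalice 42 bash\nalice 43 vim"
#     columns("\n".join(grep(ps, r"^alice")), 2)  # ['bash', 'vim']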
# =============================================================================
# Text manipulation
# =============================================================================

def replace_all(ctx: object, pattern: str, replacement: str, flags: int = 0) -> str:
    """Replace all occurrences of pattern."""
    text = _to_text(ctx)
    return re.sub(pattern, replacement, text, flags=flags)


def split_by(ctx: object, pattern: str, flags: int = 0) -> list[str]:
    """Split text by regex pattern."""
    text = _to_text(ctx)
    return re.split(pattern, text, flags=flags)


def between(ctx: object, start_pattern: str, end_pattern: str, include_markers: bool = False) -> list[str]:
    """Extract text between start and end patterns."""
    text = _to_text(ctx)
    if include_markers:
        pattern = f'({start_pattern}.*?{end_pattern})'
    else:
        pattern = f'{start_pattern}(.*?){end_pattern}'
    return re.findall(pattern, text, flags=re.DOTALL)


def before(ctx: object, pattern: str) -> str:
    """Get text before first occurrence of pattern."""
    text = _to_text(ctx)
    match = re.search(pattern, text)
    if match:
        return text[:match.start()]
    return text


def after(ctx: object, pattern: str) -> str:
    """Get text after first occurrence of pattern."""
    text = _to_text(ctx)
    match = re.search(pattern, text)
    if match:
        return text[match.end():]
    return ""


def truncate(ctx: object, max_len: int = 100, suffix: str = "...") -> str:
    """Truncate text to max length with suffix."""
    text = _to_text(ctx)
    if len(text) <= max_len:
        return text
    return text[:max_len - len(suffix)] + suffix


def wrap_text(ctx: object, width: int = 80) -> str:
    """Wrap text to specified width."""
    import textwrap
    text = _to_text(ctx)
    return textwrap.fill(text, width=width)


def indent_text(ctx: object, prefix: str = " ") -> str:
    """Indent all lines with prefix."""
    text = _to_text(ctx)
    return "\n".join(prefix + line for line in text.splitlines())


def dedent_text(ctx: object) -> str:
    """Remove common leading whitespace."""
    import textwrap
    text = _to_text(ctx)
    return textwrap.dedent(text)


def normalize_whitespace(ctx: object) -> str:
    """Normalize whitespace (collapse multiple spaces, trim lines)."""
    text = _to_text(ctx)
    lines_list = [" ".join(line.split()) for line in text.splitlines()]
    return "\n".join(lines_list)


def remove_punctuation(ctx: object) -> str:
    """Remove all punctuation."""
    text = _to_text(ctx)
    return re.sub(r'[^\w\s]', '', text)
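# --- Usage sketch (editor's illustration; not part of the packaged file) ---
#
#     cfg = "[core]\neditor = vim\n[remote]"
#     between(cfg, r"\[core\]", r"\[remote\]")  # ['\neditor = vim\n']
#     after(cfg, r"editor = ")                  # 'vim\n[remote]'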
# =============================================================================
# Text comparison
# =============================================================================

def diff(ctx1: object, ctx2: object, context_lines: int = 3) -> str:
    """Get unified diff between two texts."""
    import difflib
    text1 = _to_text(ctx1)
    text2 = _to_text(ctx2)
    lines1 = text1.splitlines(keepends=True)
    lines2 = text2.splitlines(keepends=True)
    return "".join(difflib.unified_diff(lines1, lines2, n=context_lines))


def similarity(ctx1: object, ctx2: object) -> float:
    """Get similarity ratio between two texts (0.0 to 1.0)."""
    import difflib
    text1 = _to_text(ctx1)
    text2 = _to_text(ctx2)
    return difflib.SequenceMatcher(None, text1, text2).ratio()


def common_lines(ctx1: object, ctx2: object) -> list[str]:
    """Get lines common to both texts."""
    text1 = _to_text(ctx1)
    text2 = _to_text(ctx2)
    set1 = set(text1.splitlines())
    set2 = set(text2.splitlines())
    return list(set1 & set2)


def diff_lines(ctx1: object, ctx2: object) -> dict[str, list[str]]:
    """Get lines unique to each text."""
    text1 = _to_text(ctx1)
    text2 = _to_text(ctx2)
    set1 = set(text1.splitlines())
    set2 = set(text2.splitlines())
    return {
        "only_in_first": list(set1 - set2),
        "only_in_second": list(set2 - set1),
    }
# =============================================================================
# Pattern matching helpers
# =============================================================================

def contains(ctx: object, pattern: str, flags: int = 0) -> bool:
    """Check if text contains pattern."""
    text = _to_text(ctx)
    return bool(re.search(pattern, text, flags=flags))


def contains_any(ctx: object, patterns: list[str], flags: int = 0) -> bool:
    """Check if text contains any of the patterns."""
    text = _to_text(ctx)
    return any(re.search(p, text, flags=flags) for p in patterns)


def contains_all(ctx: object, patterns: list[str], flags: int = 0) -> bool:
    """Check if text contains all patterns."""
    text = _to_text(ctx)
    return all(re.search(p, text, flags=flags) for p in patterns)


def count_matches(ctx: object, pattern: str, flags: int = 0) -> int:
    """Count regex matches in text."""
    text = _to_text(ctx)
    return len(re.findall(pattern, text, flags=flags))


def find_all(ctx: object, pattern: str, flags: int = 0) -> list[str]:
    """Find all matches of pattern."""
    text = _to_text(ctx)
    return re.findall(pattern, text, flags=flags)


def first_match(ctx: object, pattern: str, flags: int = 0) -> str | None:
    """Get first match of pattern or None."""
    text = _to_text(ctx)
    match = re.search(pattern, text, flags=flags)
    return match.group(0) if match else None
# =============================================================================
# Semantic search (lightweight embeddings)
# =============================================================================

def _tokenize(text: str) -> list[str]:
    return re.findall(r"[A-Za-z0-9_]+", text.lower())


def embed_text(text: str, dim: int = 256) -> list[float]:
    """Create a lightweight hashed embedding for text."""
    if dim <= 0:
        raise ValueError("dim must be > 0")
    vec = [0.0] * dim
    for token in _tokenize(text):
        if len(token) < 2:
            continue
        digest = hashlib.blake2b(token.encode("utf-8"), digest_size=4).digest()
        idx = int.from_bytes(digest, "little") % dim
        vec[idx] += 1.0
    norm = math.sqrt(sum(v * v for v in vec))
    if norm > 0:
        vec = [v / norm for v in vec]
    return vec


def _cosine_similarity(a: list[float], b: list[float]) -> float:
    # Vectors from embed_text are unit-normalized, so a plain dot product
    # is the cosine similarity.
    if not a or not b or len(a) != len(b):
        return 0.0
    return sum(x * y for x, y in zip(a, b))


def semantic_search(
    ctx: object,
    query: str,
    chunk_size: int = 1000,
    overlap: int = 100,
    top_k: int = 5,
    embed_dim: int = 256,
) -> list[dict[str, Any]]:
    """Semantic search over context using lightweight embeddings."""
    if not query:
        return []
    chunks = chunk(ctx, chunk_size, overlap)
    if not chunks:
        return []
    q_vec = embed_text(query, dim=embed_dim)

    results: list[dict[str, Any]] = []
    pos = 0
    for i, chunk_text in enumerate(chunks):
        c_vec = embed_text(chunk_text, dim=embed_dim)
        score = _cosine_similarity(q_vec, c_vec)
        start_char = pos
        end_char = pos + len(chunk_text)
        results.append({
            "index": i,
            "score": score,
            "start_char": start_char,
            "end_char": end_char,
            "preview": chunk_text[:200] + ("..." if len(chunk_text) > 200 else ""),
        })
        pos += (len(chunk_text) - overlap) if i < len(chunks) - 1 else len(chunk_text)

    results.sort(key=lambda r: r["score"], reverse=True)
    if top_k <= 0:
        return []
    return results[:top_k]
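# --- Usage sketch (editor's illustration; not part of the packaged file) ---
# embed_text is a hashed bag-of-words model: each token is hashed into one of
# `dim` buckets and the counts are L2-normalized, so shared vocabulary (not
# meaning) is what drives the score:
#
#     doc = ("billing retry failed for invoice 42. " * 30
#            + "gpu kernel launch succeeded. " * 30)
#     top = semantic_search(doc, "invoice billing", chunk_size=300, top_k=1)
#     # top[0]["start_char"] falls in the billing half of the document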
# =============================================================================
# Collection utilities
# =============================================================================

def dedupe(items: Sequence[Any]) -> list[Any]:
    """Remove duplicates while preserving order."""
    seen: set[Any] = set()
    result: list[Any] = []
    for item in items:
        hashable = item if isinstance(item, (str, int, float, tuple)) else str(item)
        if hashable not in seen:
            seen.add(hashable)
            result.append(item)
    return result


def flatten(nested: Sequence[Any], depth: int = -1) -> list[Any]:
    """Flatten nested lists/tuples. depth=-1 means fully flatten."""
    result: list[Any] = []
    for item in nested:
        if isinstance(item, (list, tuple)) and depth != 0:
            result.extend(flatten(item, depth - 1 if depth > 0 else -1))
        else:
            result.append(item)
    return result


def first(items: Sequence[Any], default: Any = None) -> Any:
    """Get first item or default."""
    return items[0] if items else default


def last(items: Sequence[Any], default: Any = None) -> Any:
    """Get last item or default."""
    return items[-1] if items else default


def take(n: int, items: Sequence[Any]) -> list[Any]:
    """Get first n items."""
    return list(items[:n])


def drop(n: int, items: Sequence[Any]) -> list[Any]:
    """Skip first n items."""
    return list(items[n:])


def partition(items: Sequence[Any], predicate: Callable[[Any], bool]) -> tuple[list[Any], list[Any]]:
    """Split items into (matches, non-matches) based on predicate."""
    matches: list[Any] = []
    non_matches: list[Any] = []
    for item in items:
        if predicate(item):
            matches.append(item)
        else:
            non_matches.append(item)
    return matches, non_matches


def group_by(items: Sequence[Any], key_fn: Callable[[Any], Any]) -> dict[Any, list[Any]]:
    """Group items by key function."""
    result: dict[Any, list[Any]] = {}
    for item in items:
        k = key_fn(item)
        if k not in result:
            result[k] = []
        result[k].append(item)
    return result


def frequency(items: Sequence[Any], top_n: int | None = None) -> list[tuple[Any, int]]:
    """Get frequency distribution of items."""
    counter = Counter(items)
    if top_n:
        return counter.most_common(top_n)
    return counter.most_common()


def sample_items(items: Sequence[Any], n: int, seed: int | None = None) -> list[Any]:
    """Random sample of n items."""
    import random
    if seed is not None:
        random.seed(seed)
    return random.sample(list(items), min(n, len(items)))


def shuffle_items(items: Sequence[Any], seed: int | None = None) -> list[Any]:
    """Shuffle items randomly."""
    import random
    if seed is not None:
        random.seed(seed)
    result = list(items)
    random.shuffle(result)
    return result
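# --- Usage sketch (editor's illustration; not part of the packaged file) ---
# These pair naturally with the extractors above; log_text here stands in for
# any log string:
#
#     levels = [m["value"] for m in extract_log_levels(log_text)]
#     errors, rest = partition(levels, lambda lv: lv.upper() == "ERROR")
#     frequency(levels, top_n=3)  # e.g. [('INFO', 120), ('WARN', 7), ('ERROR', 2)]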
# =============================================================================
# Validation helpers
# =============================================================================

def is_numeric(text: str) -> bool:
    """Check if text represents a number."""
    try:
        float(text.replace(",", ""))
        return True
    except (ValueError, AttributeError):
        return False


def is_email(text: str) -> bool:
    """Check if text is a valid email format."""
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, text.strip()))


def is_url(text: str) -> bool:
    """Check if text is a valid URL format."""
    pattern = r'^https?://[^\s<>"\']+$'
    return bool(re.match(pattern, text.strip()))


def is_ip(text: str) -> bool:
    """Check if text is a valid IPv4 address."""
    parts = text.strip().split(".")
    if len(parts) != 4:
        return False
    try:
        return all(0 <= int(p) <= 255 for p in parts)
    except ValueError:
        return False


def is_uuid(text: str) -> bool:
    """Check if text is a valid UUID format."""
    pattern = r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'
    return bool(re.match(pattern, text.strip()))


def is_json(text: str) -> bool:
    """Check if text is valid JSON."""
    import json
    try:
        json.loads(text)
        return True
    except (json.JSONDecodeError, TypeError):
        return False


def is_blank(text: str) -> bool:
    """Check if text is empty or only whitespace."""
    return not text or not text.strip()
# =============================================================================
# Conversion helpers
# =============================================================================

def to_json(obj: Any, indent: int = 2) -> str:
    """Convert object to JSON string."""
    import json
    return json.dumps(obj, indent=indent, ensure_ascii=False, default=str)


def from_json(text: str) -> Any:
    """Parse JSON string to object."""
    import json
    return json.loads(text)


def to_csv_row(items: Sequence[Any], delim: str = ",") -> str:
    """Convert items to CSV row."""
    return delim.join(str(item) for item in items)


def from_csv_row(text: str, delim: str = ",") -> list[str]:
    """Parse CSV row to list."""
    import csv
    from io import StringIO
    reader = csv.reader(StringIO(text), delimiter=delim)
    return next(reader, [])


def to_int(text: str, default: int = 0) -> int:
    """Convert text to int with default."""
    try:
        return int(text.replace(",", "").strip())
    except (ValueError, AttributeError):
        return default


def to_float(text: str, default: float = 0.0) -> float:
    """Convert text to float with default."""
    try:
        return float(text.replace(",", "").strip())
    except (ValueError, AttributeError):
        return default
# =============================================================================
# Case conversion
# =============================================================================

def to_lower(ctx: object) -> str:
    """Convert to lowercase."""
    return _to_text(ctx).lower()


def to_upper(ctx: object) -> str:
    """Convert to uppercase."""
    return _to_text(ctx).upper()


def to_title(ctx: object) -> str:
    """Convert to title case."""
    return _to_text(ctx).title()


def to_snake_case(text: str) -> str:
    """Convert to snake_case."""
    s1 = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', text)
    s2 = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s1)
    return re.sub(r'[-\s]+', '_', s2).lower()


def to_camel_case(text: str) -> str:
    """Convert to camelCase."""
    parts = re.split(r'[-_\s]+', text)
    return parts[0].lower() + "".join(p.title() for p in parts[1:])


def to_pascal_case(text: str) -> str:
    """Convert to PascalCase."""
    parts = re.split(r'[-_\s]+', text)
    return "".join(p.title() for p in parts)


def to_kebab_case(text: str) -> str:
    """Convert to kebab-case."""
    return to_snake_case(text).replace("_", "-")


def slugify(text: str) -> str:
    """Convert to URL-safe slug."""
    text = text.lower()
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'[-\s]+', '-', text)
    return text.strip('-')
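# --- Usage sketch (editor's illustration; not part of the packaged file) ---
# The two-pass regex in to_snake_case first splits acronym/word boundaries,
# then lowercase-to-uppercase boundaries:
#
#     to_snake_case("HTTPResponseCode")  # 'http_response_code'
#     to_kebab_case("parseJSONBody")     # 'parse-json-body'
#     slugify("Hello, World! 2024")      # 'hello-world-2024'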