flock-core 0.5.0b3-py3-none-any.whl → 0.5.0b6-py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of flock-core might be problematic.
- flock/cli/execute_flock.py +1 -1
- flock/core/api/endpoints.py +1 -1
- flock/core/api/service.py +2 -2
- flock/core/evaluation/utils.py +2 -1
- flock/core/execution/batch_executor.py +1 -1
- flock/core/execution/evaluation_executor.py +1 -1
- flock/core/execution/opik_executor.py +1 -1
- flock/core/flock.py +5 -5
- flock/core/flock_scheduler.py +1 -1
- flock/core/orchestration/flock_execution.py +10 -10
- flock/core/util/hydrator.py +1 -1
- flock/webapp/app/chat.py +1 -1
- flock/webapp/app/services/flock_service.py +1 -1
- {flock_core-0.5.0b3.dist-info → flock_core-0.5.0b6.dist-info}/METADATA +2 -41
- {flock_core-0.5.0b3.dist-info → flock_core-0.5.0b6.dist-info}/RECORD +18 -28
- flock/tools/__init__.py +0 -0
- flock/tools/azure_tools.py +0 -781
- flock/tools/code_tools.py +0 -167
- flock/tools/file_tools.py +0 -149
- flock/tools/github_tools.py +0 -157
- flock/tools/markdown_tools.py +0 -204
- flock/tools/system_tools.py +0 -9
- flock/tools/text_tools.py +0 -809
- flock/tools/web_tools.py +0 -90
- flock/tools/zendesk_tools.py +0 -147
- {flock_core-0.5.0b3.dist-info → flock_core-0.5.0b6.dist-info}/WHEEL +0 -0
- {flock_core-0.5.0b3.dist-info → flock_core-0.5.0b6.dist-info}/entry_points.txt +0 -0
- {flock_core-0.5.0b3.dist-info → flock_core-0.5.0b6.dist-info}/licenses/LICENSE +0 -0
flock/tools/text_tools.py
DELETED
@@ -1,809 +0,0 @@
-import hashlib
-import json
-import re
-from collections.abc import Callable
-from typing import Any
-
-import nltk
-
-from flock.core.logging.trace_and_logged import traced_and_logged
-
-# Ensure NLTK data is downloaded
-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")
-
-try:
-    nltk.data.find("corpora/stopwords")
-except LookupError:
-    nltk.download("stopwords")
-
-
-@traced_and_logged
-def text_split_by_sentences(text: str) -> list[str]:
-    return nltk.sent_tokenize(text)
-
-
-@traced_and_logged
-def text_split_by_characters(
-    text: str, chunk_size: int = 4000, overlap: int = 200
-) -> list[str]:
-    if chunk_size <= 0:
-        raise ValueError("chunk_size must be positive")
-
-    if overlap >= chunk_size:
-        raise ValueError("overlap must be smaller than chunk_size")
-
-    if not text:
-        return []
-
-    chunks = []
-    start = 0
-    text_length = len(text)
-
-    while start < text_length:
-        end = min(start + chunk_size, text_length)
-
-        # If we're not at the end and the next character isn't a space, try to find a suitable break point
-        if end < text_length and text[end] not in [
-            " ",
-            "\n",
-            ".",
-            ",",
-            "!",
-            "?",
-            ";",
-            ":",
-            "-",
-        ]:
-            # Look for the last occurrence of a good break character
-            break_chars = [" ", "\n", ".", ",", "!", "?", ";", ":", "-"]
-            for i in range(end, max(start, end - 100), -1):
-                if text[i] in break_chars:
-                    end = i + 1  # Include the break character
-                    break
-
-        chunks.append(text[start:end])
-        start = end - overlap if end < text_length else text_length
-
-    return chunks
-
-
-@traced_and_logged
-def text_split_by_tokens(
-    text: str,
-    tokenizer: Callable[[str], list[str]],
-    max_tokens: int = 1024,
-    overlap_tokens: int = 100,
-) -> list[str]:
-    tokens = tokenizer(text)
-    chunks = []
-
-    i = 0
-    while i < len(tokens):
-        chunk = tokens[i : i + max_tokens]
-        chunks.append("".join(chunk))
-        i += max_tokens - overlap_tokens
-
-    return chunks
-
-
-@traced_and_logged
-def text_split_by_separator(text: str, separator: str = "\n\n") -> list[str]:
-    if not text:
-        return []
-
-    chunks = text.split(separator)
-    return [chunk for chunk in chunks if chunk.strip()]
-
-
-@traced_and_logged
-def text_recursive_splitter(
-    text: str,
-    chunk_size: int = 4000,
-    separators: list[str] = ["\n\n", "\n", ". ", ", ", " ", ""],
-    keep_separator: bool = True,
-) -> list[str]:
-    if not text:
-        return []
-
-    if len(text) <= chunk_size:
-        return [text]
-
-    if not separators:
-        return [
-            text[:chunk_size],
-            *text_recursive_splitter(text[chunk_size:], chunk_size, separators),
-        ]
-
-    separator = separators[0]
-    new_separators = separators[1:]
-
-    if separator == "":
-        # If we're at the character level, just split by characters
-        return text_split_by_characters(text, chunk_size=chunk_size, overlap=0)
-
-    splits = text.split(separator)
-    separator_len = len(separator) if keep_separator else 0
-
-    # Add separator back to the chunks if needed
-    if keep_separator and separator:
-        splits = [f"{split}{separator}" for split in splits[:-1]] + [splits[-1]]
-
-    # Process each split
-    result = []
-    current_chunk = []
-    current_length = 0
-
-    for split in splits:
-        split_len = len(split)
-
-        if split_len > chunk_size:
-            # If current split is too large, handle current chunk and recursively split this large piece
-            if current_chunk:
-                result.append("".join(current_chunk))
-                current_chunk = []
-                current_length = 0
-
-            # Recursively split this large piece
-            smaller_chunks = text_recursive_splitter(
-                split, chunk_size, new_separators, keep_separator
-            )
-            result.extend(smaller_chunks)
-        elif current_length + split_len <= chunk_size:
-            # If we can fit this split in the current chunk, add it
-            current_chunk.append(split)
-            current_length += split_len
-        else:
-            # If we can't fit this split, complete the current chunk and start a new one
-            result.append("".join(current_chunk))
-            current_chunk = [split]
-            current_length = split_len
-
-    # Don't forget the last chunk
-    if current_chunk:
-        result.append("".join(current_chunk))
-
-    return result
-
-
-@traced_and_logged
-def text_chunking_for_embedding(
-    text: str, file_name: str, chunk_size: int = 1000, overlap: int = 100
-) -> list[dict[str, Any]]:
-    chunks = text_split_by_characters(text, chunk_size=chunk_size, overlap=overlap)
-
-    # Create metadata for each chunk
-    result = []
-    for i, chunk in enumerate(chunks):
-        result.append(
-            {
-                "chunk_id": file_name + "_" + str(i),
-                "text": chunk,
-                "file": file_name,
-                "total_chunks": len(chunks),
-            }
-        )
-
-    return result
-
-
-@traced_and_logged
-def text_split_code_by_functions(code: str) -> list[dict[str, Any]]:
-    if not code:
-        return []
-
-    # Basic pattern for Python functions
-    function_pattern = re.compile(
-        r"(^|\n)def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\((.*?)\)(?:\s*->.*?)?:"
-    )
-    matches = list(function_pattern.finditer(code))
-
-    if not matches:
-        return [{"name": "Main", "content": code, "type": "code"}]
-
-    functions = []
-
-    # Process each function
-    for i, current_match in enumerate(matches):
-        function_name = current_match.group(2)
-
-        # Determine function content
-        if i < len(matches) - 1:
-            next_function_start = matches[i + 1].start()
-            content = code[current_match.start() : next_function_start]
-        else:
-            content = code[current_match.start() :]
-
-        functions.append(
-            {
-                "name": function_name,
-                "content": content.strip(),
-                "type": "function",
-            }
-        )
-
-    # Check if there's content before the first function
-    if matches[0].start() > 0:
-        preamble = code[: matches[0].start()].strip()
-        if preamble:
-            functions.insert(
-                0,
-                {"name": "Imports/Setup", "content": preamble, "type": "code"},
-            )
-
-    return functions
-
-
-@traced_and_logged
-def text_count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
-    """Count tokens using tiktoken."""
-    if not text:
-        return 0
-
-    try:
-        import tiktoken
-
-        # Map model names to encoding types
-        if model.startswith(("gpt-4", "gpt-3.5")):
-            encoding_name = "cl100k_base"  # For newer OpenAI models
-        elif model.startswith("text-davinci"):
-            encoding_name = "p50k_base"  # For older OpenAI models
-        elif "llama" in model.lower() or "mistral" in model.lower():
-            encoding_name = (
-                "cl100k_base"  # Best approximation for LLaMA/Mistral
-            )
-        else:
-            # Default to cl100k_base as fallback
-            encoding_name = "cl100k_base"
-
-        # Try to get the specific encoder for the model if available
-        try:
-            encoding = tiktoken.encoding_for_model(model)
-        except KeyError:
-            # Fall back to the encoding name
-            encoding = tiktoken.get_encoding(encoding_name)
-
-        # Count tokens
-        token_integers = encoding.encode(text)
-        return len(token_integers)
-
-    except ImportError:
-        # Fallback to character-based estimation if tiktoken is not installed
-        return text_count_tokens_estimate(text, model)
-
-
-@traced_and_logged
-def text_count_tokens_estimate(text: str, model: str = "gpt-3.5-turbo") -> int:
-    """Estimate token count for different models."""
-    if not text:
-        return 0
-
-    # Rough token estimations for different models
-    if model.startswith(("gpt-3", "gpt-4")):
-        # OpenAI models: ~4 chars per token
-        return len(text) // 4 + 1
-    elif model.startswith("claude"):
-        # Anthropic models: ~3.5 chars per token
-        return len(text) // 3.5 + 1
-    elif "llama" in model.lower():
-        # LLaMA-based models: ~3.7 chars per token
-        return len(text) // 3.7 + 1
-    else:
-        # Default estimation
-        return len(text) // 4 + 1
-
-
-@traced_and_logged
-def text_truncate_to_token_limit(
-    text: str, max_tokens: int = 4000, model: str = "gpt-3.5-turbo"
-) -> str:
-    if not text:
-        return ""
-
-    # Try to use tiktoken for accurate truncation
-    try:
-        import tiktoken
-
-        # Get appropriate encoding
-        try:
-            encoding = tiktoken.encoding_for_model(model)
-        except KeyError:
-            # Fall back to cl100k_base (used by most newer models)
-            encoding = tiktoken.get_encoding("cl100k_base")
-
-        # Encode the text to tokens
-        tokens = encoding.encode(text)
-
-        # If we're already under the limit, return the original text
-        if len(tokens) <= max_tokens:
-            return text
-
-        # Truncate tokens and decode back to text
-        truncated_tokens = tokens[:max_tokens]
-        return encoding.decode(truncated_tokens)
-
-    except ImportError:
-        # Fallback to the character-based method if tiktoken is not available
-        estimated_tokens = text_count_tokens_estimate(text, model)
-
-        if estimated_tokens <= max_tokens:
-            return text
-
-        # Calculate approximate character limit
-        char_per_token = 4  # Default for most models
-        if model.startswith("claude"):
-            char_per_token = 3.5
-        elif "llama" in model.lower():
-            char_per_token = 3.7
-
-        char_limit = int(max_tokens * char_per_token)
-
-        # Try to find a good breaking point
-        if char_limit < len(text):
-            # Look for sentence or paragraph break near the limit
-            for i in range(char_limit - 1, max(0, char_limit - 100), -1):
-                if i < len(text) and text[i] in [".", "!", "?", "\n"]:
-                    return text[: i + 1]
-
-        # Fallback to hard truncation
-        return text[:char_limit]
-
-
-@traced_and_logged
-def text_extract_keywords(text: str, top_n: int = 10) -> list[str]:
-    if not text:
-        return []
-
-    # Get stopwords
-    try:
-        from nltk.corpus import stopwords
-
-        stop_words = set(stopwords.words("english"))
-    except:
-        # Fallback basic stopwords if NLTK data isn't available
-        stop_words = {
-            "i",
-            "me",
-            "my",
-            "myself",
-            "we",
-            "our",
-            "ours",
-            "ourselves",
-            "you",
-            "you're",
-            "you've",
-            "you'll",
-            "you'd",
-            "your",
-            "yours",
-            "yourself",
-            "yourselves",
-            "he",
-            "him",
-            "his",
-            "himself",
-            "she",
-            "she's",
-            "her",
-            "hers",
-            "herself",
-            "it",
-            "it's",
-            "its",
-            "itself",
-            "they",
-            "them",
-            "their",
-            "theirs",
-            "themselves",
-            "what",
-            "which",
-            "who",
-            "whom",
-            "this",
-            "that",
-            "that'll",
-            "these",
-            "those",
-            "am",
-            "is",
-            "are",
-            "was",
-            "were",
-            "be",
-            "been",
-            "being",
-            "have",
-            "has",
-            "had",
-            "having",
-            "do",
-            "does",
-            "did",
-            "doing",
-            "a",
-            "an",
-            "the",
-            "and",
-            "but",
-            "if",
-            "or",
-            "because",
-            "as",
-            "until",
-            "while",
-            "of",
-            "at",
-            "by",
-            "for",
-            "with",
-            "about",
-            "against",
-            "between",
-            "into",
-            "through",
-            "during",
-            "before",
-            "after",
-            "above",
-            "below",
-            "to",
-            "from",
-            "up",
-            "down",
-            "in",
-            "out",
-            "on",
-            "off",
-            "over",
-            "under",
-            "again",
-            "further",
-            "then",
-            "once",
-        }
-
-    # Tokenize and remove punctuation
-    words = re.findall(r"\b[a-zA-Z]{3,}\b", text.lower())
-
-    # Remove stopwords
-    words = [word for word in words if word not in stop_words]
-
-    # Count word frequencies
-    word_freq = {}
-    for word in words:
-        if word in word_freq:
-            word_freq[word] += 1
-        else:
-            word_freq[word] = 1
-
-    # Sort by frequency
-    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
-
-    # Return top N keywords
-    return [word for word, freq in sorted_words[:top_n]]
-
-
-@traced_and_logged
-def text_clean_text(
-    text: str,
-    remove_urls: bool = True,
-    remove_html: bool = True,
-    normalize_whitespace: bool = True,
-) -> str:
-    if not text:
-        return ""
-
-    result = text
-
-    # Remove URLs
-    if remove_urls:
-        result = re.sub(r"https?://\S+|www\.\S+", "", result)
-
-    # Remove HTML tags
-    if remove_html:
-        result = re.sub(r"<.*?>", "", result)
-
-    # Normalize whitespace
-    if normalize_whitespace:
-        # Replace multiple spaces, tabs, newlines with a single space
-        result = re.sub(r"\s+", " ", result)
-        result = result.strip()
-
-    return result
-
-
-@traced_and_logged
-def text_format_chat_history(
-    messages: list[dict[str, str]],
-    format_type: str = "text",
-    system_prefix: str = "System: ",
-    user_prefix: str = "User: ",
-    assistant_prefix: str = "Assistant: ",
-) -> str:
-    if not messages:
-        return ""
-
-    result = []
-
-    if format_type == "text":
-        for msg in messages:
-            role = msg.get("role", "").lower()
-            content = msg.get("content", "")
-
-            if role == "system":
-                result.append(f"{system_prefix}{content}")
-            elif role == "user":
-                result.append(f"{user_prefix}{content}")
-            elif role == "assistant":
-                result.append(f"{assistant_prefix}{content}")
-            else:
-                result.append(f"{role.capitalize()}: {content}")
-
-        return "\n\n".join(result)
-
-    elif format_type == "markdown":
-        for msg in messages:
-            role = msg.get("role", "").lower()
-            content = msg.get("content", "")
-
-            if role == "system":
-                result.append(f"**{system_prefix.strip()}** {content}")
-            elif role == "user":
-                result.append(f"**{user_prefix.strip()}** {content}")
-            elif role == "assistant":
-                result.append(f"**{assistant_prefix.strip()}** {content}")
-            else:
-                result.append(f"**{role.capitalize()}:** {content}")
-
-        return "\n\n".join(result)
-
-    else:
-        raise ValueError(f"Unsupported format type: {format_type}")
-
-
-@traced_and_logged
-def text_extract_json_from_text(text: str) -> dict[str, Any] | None:
-    if not text:
-        return None
-
-    # Find JSON-like patterns between curly braces
-    json_pattern = re.compile(r"({[\s\S]*?})")
-    json_matches = json_pattern.findall(text)
-
-    # Try to parse each match
-    for json_str in json_matches:
-        try:
-            return json.loads(json_str)
-        except json.JSONDecodeError:
-            continue
-
-    # Try to find JSON with markdown code blocks
-    code_block_pattern = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*```")
-    code_blocks = code_block_pattern.findall(text)
-
-    for block in code_blocks:
-        # Clean up any trailing ``` that might have been captured
-        block = block.replace("```", "")
-        try:
-            return json.loads(block)
-        except json.JSONDecodeError:
-            continue
-
-    # No valid JSON found
-    return None
-
-
-@traced_and_logged
-def text_calculate_hash(text: str, algorithm: str = "sha256") -> str:
-    if not text:
-        return ""
-
-    if algorithm == "md5":
-        return hashlib.md5(text.encode()).hexdigest()
-    elif algorithm == "sha1":
-        return hashlib.sha1(text.encode()).hexdigest()
-    elif algorithm == "sha256":
-        return hashlib.sha256(text.encode()).hexdigest()
-    else:
-        raise ValueError(f"Unsupported hash algorithm: {algorithm}")
-
-
-@traced_and_logged
-def text_format_table_from_dicts(data: list[dict[str, Any]]) -> str:
-    if not data:
-        return ""
-
-    # Extract all possible keys
-    keys = set()
-    for item in data:
-        keys.update(item.keys())
-
-    # Convert to list and sort for consistent output
-    keys = sorted(list(keys))
-
-    # Calculate column widths
-    widths = {key: len(key) for key in keys}
-    for item in data:
-        for key in keys:
-            if key in item:
-                value_str = str(item[key])
-                widths[key] = max(widths[key], len(value_str))
-
-    # Create header
-    header = " | ".join(f"{key:{widths[key]}}" for key in keys)
-    separator = "-+-".join("-" * widths[key] for key in keys)
-
-    # Create rows
-    rows = []
-    for item in data:
-        row = " | ".join(f"{item.get(key, '')!s:{widths[key]}}" for key in keys)
-        rows.append(row)
-
-    # Combine everything
-    return f"{header}\n{separator}\n" + "\n".join(rows)
-
-
-@traced_and_logged
-def text_detect_language(text: str) -> str:
-    """Simple language detection"""
-    if not text or len(text.strip()) < 10:
-        return "unknown"
-
-    try:
-        # Try to use langdetect if available
-        from langdetect import detect
-
-        return detect(text)
-    except ImportError:
-        # Fallback to simple detection based on character frequency
-        # This is very simplistic and only works for a few common languages
-        text = text.lower()
-
-        # Count character frequencies that may indicate certain languages
-        special_chars = {
-            "á": 0,
-            "é": 0,
-            "í": 0,
-            "ó": 0,
-            "ú": 0,
-            "ü": 0,
-            "ñ": 0,  # Spanish
-            "ä": 0,
-            "ö": 0,
-            "ß": 0,  # German
-            "ç": 0,
-            "à": 0,
-            "è": 0,
-            "ù": 0,  # French
-            "å": 0,
-            "ø": 0,  # Nordic
-            "й": 0,
-            "ы": 0,
-            "ъ": 0,
-            "э": 0,  # Russian/Cyrillic
-            "的": 0,
-            "是": 0,
-            "在": 0,  # Chinese
-            "の": 0,
-            "は": 0,
-            "で": 0,  # Japanese
-            "한": 0,
-            "국": 0,
-            "어": 0,  # Korean
-        }
-
-        for char in text:
-            if char in special_chars:
-                special_chars[char] += 1
-
-        # Detect based on character frequencies
-        spanish = sum(
-            special_chars[c] for c in ["á", "é", "í", "ó", "ú", "ü", "ñ"]
-        )
-        german = sum(special_chars[c] for c in ["ä", "ö", "ß"])
-        french = sum(special_chars[c] for c in ["ç", "à", "è", "ù"])
-        nordic = sum(special_chars[c] for c in ["å", "ø"])
-        russian = sum(special_chars[c] for c in ["й", "ы", "ъ", "э"])
-        chinese = sum(special_chars[c] for c in ["的", "是", "在"])
-        japanese = sum(special_chars[c] for c in ["の", "は", "で"])
-        korean = sum(special_chars[c] for c in ["한", "국", "어"])
-
-        scores = {
-            "es": spanish,
-            "de": german,
-            "fr": french,
-            "no": nordic,
-            "ru": russian,
-            "zh": chinese,
-            "ja": japanese,
-            "ko": korean,
-        }
-
-        # If we have a clear signal from special characters
-        max_score = max(scores.values())
-        if max_score > 0:
-            return max(scores, key=scores.get)
-
-        # Otherwise assume English (very simplistic)
-        return "en"
-
-
-@traced_and_logged
-def text_tiktoken_split(
-    text: str,
-    model: str = "gpt-3.5-turbo",
-    chunk_size: int = 1000,
-    overlap: int = 50,
-) -> list[str]:
-    """Split text based on tiktoken tokens with proper overlap handling."""
-    if not text:
-        return []
-
-    try:
-        import tiktoken
-
-        try:
-            encoding = tiktoken.encoding_for_model(model)
-        except KeyError:
-            encoding = tiktoken.get_encoding("cl100k_base")
-
-        # Encode the text to tokens
-        tokens = encoding.encode(text)
-        total_tokens = len(tokens)
-
-        # Check if we need to split at all
-        if total_tokens <= chunk_size:
-            return [text]
-
-        # Create chunks with overlap
-        chunks = []
-        start_idx = 0
-
-        while start_idx < total_tokens:
-            # Define the end of this chunk
-            end_idx = min(start_idx + chunk_size, total_tokens)
-
-            # Decode this chunk of tokens back to text
-            chunk_tokens = tokens[start_idx:end_idx]
-            chunk_text = encoding.decode(chunk_tokens)
-            chunks.append(chunk_text)
-
-            # Move to the next chunk, accounting for overlap
-            start_idx += chunk_size - overlap
-
-            # Avoid tiny final chunks
-            if start_idx < total_tokens and start_idx + overlap >= total_tokens:
-                break
-
-        return chunks
-    except ImportError:
-        # Fallback to character-based chunking if tiktoken is not available
-        return text_split_by_characters(
-            text, chunk_size=chunk_size * 4, overlap=overlap * 4
-        )
-
-
-@traced_and_logged
-def text_count_words(text: str) -> int:
-    if not text:
-        return 0
-    return len(text.split())
-
-
-@traced_and_logged
-def text_extract_urls(text: str) -> list[str]:
-    if not text:
-        return []
-    # A more robust regex might be needed for complex cases
-    return re.findall(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", text)
-
-
-@traced_and_logged
-def text_extract_numbers(text: str) -> list[float]:
-    if not text:
-        return []
-    return [float(num) for num in re.findall(r"[-+]?\d*\.?\d+", text)]