flock-core 0.4.528__py3-none-any.whl → 0.5.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of flock-core might be problematic.

Files changed (130)
  1. flock/cli/execute_flock.py +1 -1
  2. flock/cli/manage_agents.py +6 -6
  3. flock/components/__init__.py +30 -0
  4. flock/components/evaluation/__init__.py +9 -0
  5. flock/components/evaluation/declarative_evaluation_component.py +222 -0
  6. flock/components/routing/__init__.py +15 -0
  7. flock/{routers/conditional/conditional_router.py → components/routing/conditional_routing_component.py} +61 -53
  8. flock/components/routing/default_routing_component.py +103 -0
  9. flock/components/routing/llm_routing_component.py +206 -0
  10. flock/components/utility/__init__.py +15 -0
  11. flock/{modules/enterprise_memory/enterprise_memory_module.py → components/utility/memory_utility_component.py} +195 -173
  12. flock/{modules/performance/metrics_module.py → components/utility/metrics_utility_component.py} +110 -95
  13. flock/{modules/output/output_module.py → components/utility/output_utility_component.py} +47 -45
  14. flock/core/__init__.py +26 -18
  15. flock/core/agent/__init__.py +16 -0
  16. flock/core/agent/flock_agent_components.py +104 -0
  17. flock/core/agent/flock_agent_execution.py +101 -0
  18. flock/core/agent/flock_agent_integration.py +206 -0
  19. flock/core/agent/flock_agent_lifecycle.py +177 -0
  20. flock/core/agent/flock_agent_serialization.py +381 -0
  21. flock/core/api/endpoints.py +2 -2
  22. flock/core/api/service.py +2 -2
  23. flock/core/component/__init__.py +15 -0
  24. flock/core/{flock_module.py → component/agent_component_base.py} +136 -34
  25. flock/core/component/evaluation_component.py +56 -0
  26. flock/core/component/routing_component.py +74 -0
  27. flock/core/component/utility_component.py +69 -0
  28. flock/core/config/flock_agent_config.py +49 -2
  29. flock/core/evaluation/utils.py +3 -2
  30. flock/core/execution/batch_executor.py +1 -1
  31. flock/core/execution/evaluation_executor.py +2 -2
  32. flock/core/execution/opik_executor.py +1 -1
  33. flock/core/flock.py +147 -493
  34. flock/core/flock_agent.py +195 -1032
  35. flock/core/flock_factory.py +114 -90
  36. flock/core/flock_scheduler.py +1 -1
  37. flock/core/flock_server_manager.py +8 -8
  38. flock/core/logging/logging.py +1 -0
  39. flock/core/mcp/flock_mcp_server.py +53 -48
  40. flock/core/mcp/{flock_mcp_tool_base.py → flock_mcp_tool.py} +2 -2
  41. flock/core/mcp/mcp_client.py +9 -9
  42. flock/core/mcp/mcp_client_manager.py +9 -9
  43. flock/core/mcp/mcp_config.py +24 -24
  44. flock/core/mixin/dspy_integration.py +5 -5
  45. flock/core/orchestration/__init__.py +18 -0
  46. flock/core/orchestration/flock_batch_processor.py +94 -0
  47. flock/core/orchestration/flock_evaluator.py +113 -0
  48. flock/core/orchestration/flock_execution.py +288 -0
  49. flock/core/orchestration/flock_initialization.py +125 -0
  50. flock/core/orchestration/flock_server_manager.py +67 -0
  51. flock/core/orchestration/flock_web_server.py +117 -0
  52. flock/core/registry/__init__.py +45 -0
  53. flock/core/registry/agent_registry.py +69 -0
  54. flock/core/registry/callable_registry.py +139 -0
  55. flock/core/registry/component_discovery.py +142 -0
  56. flock/core/registry/component_registry.py +64 -0
  57. flock/core/registry/config_mapping.py +64 -0
  58. flock/core/registry/decorators.py +137 -0
  59. flock/core/registry/registry_hub.py +205 -0
  60. flock/core/registry/server_registry.py +57 -0
  61. flock/core/registry/type_registry.py +86 -0
  62. flock/core/serialization/flock_serializer.py +36 -32
  63. flock/core/serialization/serialization_utils.py +28 -25
  64. flock/core/util/hydrator.py +1 -1
  65. flock/core/util/input_resolver.py +29 -2
  66. flock/mcp/servers/sse/flock_sse_server.py +10 -10
  67. flock/mcp/servers/stdio/flock_stdio_server.py +10 -10
  68. flock/mcp/servers/streamable_http/flock_streamable_http_server.py +10 -10
  69. flock/mcp/servers/websockets/flock_websocket_server.py +10 -10
  70. flock/platform/docker_tools.py +3 -3
  71. flock/webapp/app/chat.py +1 -1
  72. flock/webapp/app/main.py +9 -5
  73. flock/webapp/app/services/flock_service.py +1 -1
  74. flock/webapp/app/services/sharing_store.py +1 -0
  75. flock/workflow/activities.py +67 -92
  76. flock/workflow/agent_execution_activity.py +6 -6
  77. flock/workflow/flock_workflow.py +1 -1
  78. flock_core-0.5.0b0.dist-info/METADATA +272 -0
  79. {flock_core-0.4.528.dist-info → flock_core-0.5.0b0.dist-info}/RECORD +82 -95
  80. flock/core/flock_evaluator.py +0 -60
  81. flock/core/flock_registry.py +0 -702
  82. flock/core/flock_router.py +0 -83
  83. flock/evaluators/__init__.py +0 -1
  84. flock/evaluators/declarative/__init__.py +0 -1
  85. flock/evaluators/declarative/declarative_evaluator.py +0 -217
  86. flock/evaluators/memory/memory_evaluator.py +0 -90
  87. flock/evaluators/test/test_case_evaluator.py +0 -38
  88. flock/evaluators/zep/zep_evaluator.py +0 -59
  89. flock/modules/__init__.py +0 -1
  90. flock/modules/assertion/__init__.py +0 -1
  91. flock/modules/assertion/assertion_module.py +0 -286
  92. flock/modules/callback/__init__.py +0 -1
  93. flock/modules/callback/callback_module.py +0 -91
  94. flock/modules/enterprise_memory/README.md +0 -99
  95. flock/modules/mem0/__init__.py +0 -1
  96. flock/modules/mem0/mem0_module.py +0 -126
  97. flock/modules/mem0_async/__init__.py +0 -1
  98. flock/modules/mem0_async/async_mem0_module.py +0 -126
  99. flock/modules/memory/__init__.py +0 -1
  100. flock/modules/memory/memory_module.py +0 -429
  101. flock/modules/memory/memory_parser.py +0 -125
  102. flock/modules/memory/memory_storage.py +0 -736
  103. flock/modules/output/__init__.py +0 -1
  104. flock/modules/performance/__init__.py +0 -1
  105. flock/modules/zep/__init__.py +0 -1
  106. flock/modules/zep/zep_module.py +0 -192
  107. flock/routers/__init__.py +0 -1
  108. flock/routers/agent/__init__.py +0 -1
  109. flock/routers/agent/agent_router.py +0 -236
  110. flock/routers/agent/handoff_agent.py +0 -58
  111. flock/routers/default/__init__.py +0 -1
  112. flock/routers/default/default_router.py +0 -80
  113. flock/routers/feedback/feedback_router.py +0 -114
  114. flock/routers/list_generator/list_generator_router.py +0 -166
  115. flock/routers/llm/__init__.py +0 -1
  116. flock/routers/llm/llm_router.py +0 -365
  117. flock/tools/__init__.py +0 -0
  118. flock/tools/azure_tools.py +0 -781
  119. flock/tools/code_tools.py +0 -167
  120. flock/tools/file_tools.py +0 -149
  121. flock/tools/github_tools.py +0 -157
  122. flock/tools/markdown_tools.py +0 -205
  123. flock/tools/system_tools.py +0 -9
  124. flock/tools/text_tools.py +0 -810
  125. flock/tools/web_tools.py +0 -90
  126. flock/tools/zendesk_tools.py +0 -147
  127. flock_core-0.4.528.dist-info/METADATA +0 -675
  128. {flock_core-0.4.528.dist-info → flock_core-0.5.0b0.dist-info}/WHEEL +0 -0
  129. {flock_core-0.4.528.dist-info → flock_core-0.5.0b0.dist-info}/entry_points.txt +0 -0
  130. {flock_core-0.4.528.dist-info → flock_core-0.5.0b0.dist-info}/licenses/LICENSE +0 -0
flock/tools/text_tools.py DELETED
@@ -1,810 +0,0 @@
- import hashlib
- import json
- import re
- from collections.abc import Callable
- from typing import Any
-
- import nltk
-
- from flock.core.logging.trace_and_logged import traced_and_logged
-
- # Ensure NLTK data is downloaded
- try:
-     nltk.data.find("tokenizers/punkt")
- except LookupError:
-     nltk.download("punkt")
-
- try:
-     nltk.data.find("corpora/stopwords")
- except LookupError:
-     nltk.download("stopwords")
-
-
- @traced_and_logged
- def text_split_by_sentences(text: str) -> list[str]:
-     return nltk.sent_tokenize(text)
-
-
- @traced_and_logged
- def text_split_by_characters(
-     text: str, chunk_size: int = 4000, overlap: int = 200
- ) -> list[str]:
-     if chunk_size <= 0:
-         raise ValueError("chunk_size must be positive")
-
-     if overlap >= chunk_size:
-         raise ValueError("overlap must be smaller than chunk_size")
-
-     if not text:
-         return []
-
-     chunks = []
-     start = 0
-     text_length = len(text)
-
-     while start < text_length:
-         end = min(start + chunk_size, text_length)
-
-         # If we're not at the end and the next character isn't a space, try to find a suitable break point
-         if end < text_length and text[end] not in [" ", "\n", ".", ",", "!", "?", ";", ":", "-"]:
-             # Look for the last occurrence of a good break character
-             break_chars = [" ", "\n", ".", ",", "!", "?", ";", ":", "-"]
-             for i in range(end, max(start, end - 100), -1):
-                 if text[i] in break_chars:
-                     end = i + 1  # Include the break character
-                     break
-
-         chunks.append(text[start:end])
-         start = end - overlap if end < text_length else text_length
-
-     return chunks
-
-
- @traced_and_logged
- def text_split_by_tokens(
-     text: str,
-     tokenizer: Callable[[str], list[str]],
-     max_tokens: int = 1024,
-     overlap_tokens: int = 100,
- ) -> list[str]:
-     tokens = tokenizer(text)
-     chunks = []
-
-     i = 0
-     while i < len(tokens):
-         chunk = tokens[i : i + max_tokens]
-         chunks.append("".join(chunk))
-         i += max_tokens - overlap_tokens
-
-     return chunks
-
-
- @traced_and_logged
- def text_split_by_separator(text: str, separator: str = "\n\n") -> list[str]:
-     if not text:
-         return []
-
-     chunks = text.split(separator)
-     return [chunk for chunk in chunks if chunk.strip()]
-
-
- @traced_and_logged
- def text_recursive_splitter(
-     text: str,
-     chunk_size: int = 4000,
-     separators: list[str] = ["\n\n", "\n", ". ", ", ", " ", ""],
-     keep_separator: bool = True,
- ) -> list[str]:
-     if not text:
-         return []
-
-     if len(text) <= chunk_size:
-         return [text]
-
-     if not separators:
-         return [
-             text[:chunk_size],
-             *text_recursive_splitter(text[chunk_size:], chunk_size, separators),
-         ]
-
-     separator = separators[0]
-     new_separators = separators[1:]
-
-     if separator == "":
-         # If we're at the character level, just split by characters
-         return text_split_by_characters(text, chunk_size=chunk_size, overlap=0)
-
-     splits = text.split(separator)
-     separator_len = len(separator) if keep_separator else 0
-
-     # Add separator back to the chunks if needed
-     if keep_separator and separator:
-         splits = [f"{split}{separator}" for split in splits[:-1]] + [splits[-1]]
-
-     # Process each split
-     result = []
-     current_chunk = []
-     current_length = 0
-
-     for split in splits:
-         split_len = len(split)
-
-         if split_len > chunk_size:
-             # If current split is too large, handle current chunk and recursively split this large piece
-             if current_chunk:
-                 result.append("".join(current_chunk))
-                 current_chunk = []
-                 current_length = 0
-
-             # Recursively split this large piece
-             smaller_chunks = text_recursive_splitter(
-                 split, chunk_size, new_separators, keep_separator
-             )
-             result.extend(smaller_chunks)
-         elif current_length + split_len <= chunk_size:
-             # If we can fit this split in the current chunk, add it
-             current_chunk.append(split)
-             current_length += split_len
-         else:
-             # If we can't fit this split, complete the current chunk and start a new one
-             result.append("".join(current_chunk))
-             current_chunk = [split]
-             current_length = split_len
-
-     # Don't forget the last chunk
-     if current_chunk:
-         result.append("".join(current_chunk))
-
-     return result
-
-
- @traced_and_logged
- def text_chunking_for_embedding(
-     text: str, file_name: str, chunk_size: int = 1000, overlap: int = 100
- ) -> list[dict[str, Any]]:
-     chunks = text_split_by_characters(text, chunk_size=chunk_size, overlap=overlap)
-
-     # Create metadata for each chunk
-     result = []
-     for i, chunk in enumerate(chunks):
-         result.append(
-             {
-                 "chunk_id": file_name + "_" + str(i),
-                 "text": chunk,
-                 "file": file_name,
-                 "total_chunks": len(chunks),
-             }
-         )
-
-     return result
-
-
- @traced_and_logged
- def text_split_code_by_functions(code: str) -> list[dict[str, Any]]:
-     if not code:
-         return []
-
-     # Basic pattern for Python functions
-     function_pattern = re.compile(
-         r"(^|\n)def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\((.*?)\)(?:\s*->.*?)?:"
-     )
-     matches = list(function_pattern.finditer(code))
-
-     if not matches:
-         return [{"name": "Main", "content": code, "type": "code"}]
-
-     functions = []
-
-     # Process each function
-     for i in range(len(matches)):
-         current_match = matches[i]
-         function_name = current_match.group(2)
-
-         # Determine function content
-         if i < len(matches) - 1:
-             next_function_start = matches[i + 1].start()
-             content = code[current_match.start() : next_function_start]
-         else:
-             content = code[current_match.start() :]
-
-         functions.append(
-             {
-                 "name": function_name,
-                 "content": content.strip(),
-                 "type": "function",
-             }
-         )
-
-     # Check if there's content before the first function
-     if matches[0].start() > 0:
-         preamble = code[: matches[0].start()].strip()
-         if preamble:
-             functions.insert(
-                 0,
-                 {"name": "Imports/Setup", "content": preamble, "type": "code"},
-             )
-
-     return functions
-
-
- @traced_and_logged
- def text_count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
-     """Count tokens using tiktoken."""
-     if not text:
-         return 0
-
-     try:
-         import tiktoken
-
-         # Map model names to encoding types
-         if model.startswith(("gpt-4", "gpt-3.5")):
-             encoding_name = "cl100k_base"  # For newer OpenAI models
-         elif model.startswith("text-davinci"):
-             encoding_name = "p50k_base"  # For older OpenAI models
-         elif "llama" in model.lower() or "mistral" in model.lower():
-             encoding_name = "cl100k_base"  # Best approximation for LLaMA/Mistral
-         else:
-             # Default to cl100k_base as fallback
-             encoding_name = "cl100k_base"
-
-         # Try to get the specific encoder for the model if available
-         try:
-             encoding = tiktoken.encoding_for_model(model)
-         except KeyError:
-             # Fall back to the encoding name
-             encoding = tiktoken.get_encoding(encoding_name)
-
-         # Count tokens
-         token_integers = encoding.encode(text)
-         return len(token_integers)
-
-     except ImportError:
-         # Fallback to character-based estimation if tiktoken is not installed
-         return text_count_tokens_estimate(text, model)
-
-
- @traced_and_logged
- def text_count_tokens_estimate(text: str, model: str = "gpt-3.5-turbo") -> int:
-     """Estimate token count for different models."""
-     if not text:
-         return 0
-
-     # Rough token estimations for different models
-     if model.startswith(("gpt-3", "gpt-4")):
-         # OpenAI models: ~4 chars per token
-         return len(text) // 4 + 1
-     elif model.startswith("claude"):
-         # Anthropic models: ~3.5 chars per token
-         return len(text) // 3.5 + 1
-     elif "llama" in model.lower():
-         # LLaMA-based models: ~3.7 chars per token
-         return len(text) // 3.7 + 1
-     else:
-         # Default estimation
-         return len(text) // 4 + 1
-
-
- @traced_and_logged
- def text_truncate_to_token_limit(
-     text: str, max_tokens: int = 4000, model: str = "gpt-3.5-turbo"
- ) -> str:
-     if not text:
-         return ""
-
-     # Try to use tiktoken for accurate truncation
-     try:
-         import tiktoken
-
-         # Get appropriate encoding
-         try:
-             encoding = tiktoken.encoding_for_model(model)
-         except KeyError:
-             # Fall back to cl100k_base (used by most newer models)
-             encoding = tiktoken.get_encoding("cl100k_base")
-
-         # Encode the text to tokens
-         tokens = encoding.encode(text)
-
-         # If we're already under the limit, return the original text
-         if len(tokens) <= max_tokens:
-             return text
-
-         # Truncate tokens and decode back to text
-         truncated_tokens = tokens[:max_tokens]
-         return encoding.decode(truncated_tokens)
-
-     except ImportError:
-         # Fallback to the character-based method if tiktoken is not available
-         estimated_tokens = text_count_tokens_estimate(text, model)
-
-         if estimated_tokens <= max_tokens:
-             return text
-
-         # Calculate approximate character limit
-         char_per_token = 4  # Default for most models
-         if model.startswith("claude"):
-             char_per_token = 3.5
-         elif "llama" in model.lower():
-             char_per_token = 3.7
-
-         char_limit = int(max_tokens * char_per_token)
-
-         # Try to find a good breaking point
-         if char_limit < len(text):
-             # Look for sentence or paragraph break near the limit
-             for i in range(char_limit - 1, max(0, char_limit - 100), -1):
-                 if i < len(text) and text[i] in [".", "!", "?", "\n"]:
-                     return text[: i + 1]
-
-         # Fallback to hard truncation
-         return text[:char_limit]
-
-
- @traced_and_logged
- def text_extract_keywords(text: str, top_n: int = 10) -> list[str]:
-     if not text:
-         return []
-
-     # Get stopwords
-     try:
-         from nltk.corpus import stopwords
-
-         stop_words = set(stopwords.words("english"))
-     except:
-         # Fallback basic stopwords if NLTK data isn't available
-         stop_words = {
-             "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
-             "you", "you're", "you've", "you'll", "you'd", "your", "yours",
-             "yourself", "yourselves", "he", "him", "his", "himself",
-             "she", "she's", "her", "hers", "herself", "it", "it's", "its",
-             "itself", "they", "them", "their", "theirs", "themselves",
-             "what", "which", "who", "whom", "this", "that", "that'll",
-             "these", "those", "am", "is", "are", "was", "were", "be",
-             "been", "being", "have", "has", "had", "having", "do", "does",
-             "did", "doing", "a", "an", "the", "and", "but", "if", "or",
-             "because", "as", "until", "while", "of", "at", "by", "for",
-             "with", "about", "against", "between", "into", "through",
-             "during", "before", "after", "above", "below", "to", "from",
-             "up", "down", "in", "out", "on", "off", "over", "under",
-             "again", "further", "then", "once",
-         }
-
-     # Tokenize and remove punctuation
-     words = re.findall(r"\b[a-zA-Z]{3,}\b", text.lower())
-
-     # Remove stopwords
-     words = [word for word in words if word not in stop_words]
-
-     # Count word frequencies
-     word_freq = {}
-     for word in words:
-         if word in word_freq:
-             word_freq[word] += 1
-         else:
-             word_freq[word] = 1
-
-     # Sort by frequency
-     sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
-
-     # Return top N keywords
-     return [word for word, freq in sorted_words[:top_n]]
-
-
- @traced_and_logged
- def text_clean_text(
-     text: str,
-     remove_urls: bool = True,
-     remove_html: bool = True,
-     normalize_whitespace: bool = True,
- ) -> str:
-     if not text:
-         return ""
-
-     result = text
-
-     # Remove URLs
-     if remove_urls:
-         result = re.sub(r"https?://\S+|www\.\S+", "", result)
-
-     # Remove HTML tags
-     if remove_html:
-         result = re.sub(r"<.*?>", "", result)
-
-     # Normalize whitespace
-     if normalize_whitespace:
-         # Replace multiple spaces, tabs, newlines with a single space
-         result = re.sub(r"\s+", " ", result)
-         result = result.strip()
-
-     return result
-
-
- @traced_and_logged
- def text_format_chat_history(
-     messages: list[dict[str, str]],
-     format_type: str = "text",
-     system_prefix: str = "System: ",
-     user_prefix: str = "User: ",
-     assistant_prefix: str = "Assistant: ",
- ) -> str:
-     if not messages:
-         return ""
-
-     result = []
-
-     if format_type == "text":
-         for msg in messages:
-             role = msg.get("role", "").lower()
-             content = msg.get("content", "")
-
-             if role == "system":
-                 result.append(f"{system_prefix}{content}")
-             elif role == "user":
-                 result.append(f"{user_prefix}{content}")
-             elif role == "assistant":
-                 result.append(f"{assistant_prefix}{content}")
-             else:
-                 result.append(f"{role.capitalize()}: {content}")
-
-         return "\n\n".join(result)
-
-     elif format_type == "markdown":
-         for msg in messages:
-             role = msg.get("role", "").lower()
-             content = msg.get("content", "")
-
-             if role == "system":
-                 result.append(f"**{system_prefix.strip()}** {content}")
-             elif role == "user":
-                 result.append(f"**{user_prefix.strip()}** {content}")
-             elif role == "assistant":
-                 result.append(f"**{assistant_prefix.strip()}** {content}")
-             else:
-                 result.append(f"**{role.capitalize()}:** {content}")
-
-         return "\n\n".join(result)
-
-     else:
-         raise ValueError(f"Unsupported format type: {format_type}")
-
-
- @traced_and_logged
- def text_extract_json_from_text(text: str) -> dict[str, Any] | None:
-     if not text:
-         return None
-
-     # Find JSON-like patterns between curly braces
-     json_pattern = re.compile(r"({[\s\S]*?})")
-     json_matches = json_pattern.findall(text)
-
-     # Try to parse each match
-     for json_str in json_matches:
-         try:
-             return json.loads(json_str)
-         except json.JSONDecodeError:
-             continue
-
-     # Try to find JSON with markdown code blocks
-     code_block_pattern = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*```")
-     code_blocks = code_block_pattern.findall(text)
-
-     for block in code_blocks:
-         # Clean up any trailing ``` that might have been captured
-         block = block.replace("```", "")
-         try:
-             return json.loads(block)
-         except json.JSONDecodeError:
-             continue
-
-     # No valid JSON found
-     return None
-
-
- @traced_and_logged
- def text_calculate_hash(text: str, algorithm: str = "sha256") -> str:
-     if not text:
-         return ""
-
-     if algorithm == "md5":
-         return hashlib.md5(text.encode()).hexdigest()
-     elif algorithm == "sha1":
-         return hashlib.sha1(text.encode()).hexdigest()
-     elif algorithm == "sha256":
-         return hashlib.sha256(text.encode()).hexdigest()
-     else:
-         raise ValueError(f"Unsupported hash algorithm: {algorithm}")
-
-
- @traced_and_logged
- def text_format_table_from_dicts(data: list[dict[str, Any]]) -> str:
-     if not data:
-         return ""
-
-     # Extract all possible keys
-     keys = set()
-     for item in data:
-         keys.update(item.keys())
-
-     # Convert to list and sort for consistent output
-     keys = sorted(list(keys))
-
-     # Calculate column widths
-     widths = {key: len(key) for key in keys}
-     for item in data:
-         for key in keys:
-             if key in item:
-                 value_str = str(item[key])
-                 widths[key] = max(widths[key], len(value_str))
-
-     # Create header
-     header = " | ".join(f"{key:{widths[key]}}" for key in keys)
-     separator = "-+-".join("-" * widths[key] for key in keys)
-
-     # Create rows
-     rows = []
-     for item in data:
-         row = " | ".join(f"{item.get(key, '')!s:{widths[key]}}" for key in keys)
-         rows.append(row)
-
-     # Combine everything
-     return f"{header}\n{separator}\n" + "\n".join(rows)
-
-
- @traced_and_logged
- def text_detect_language(text: str) -> str:
-     """Simple language detection"""
-     if not text or len(text.strip()) < 10:
-         return "unknown"
-
-     try:
-         # Try to use langdetect if available
-         from langdetect import detect
-
-         return detect(text)
-     except ImportError:
-         # Fallback to simple detection based on character frequency
-         # This is very simplistic and only works for a few common languages
-         text = text.lower()
-
-         # Count character frequencies that may indicate certain languages
-         special_chars = {
-             "á": 0, "é": 0, "í": 0, "ó": 0, "ú": 0, "ü": 0, "ñ": 0,  # Spanish
-             "ä": 0, "ö": 0, "ß": 0,  # German
-             "ç": 0, "à": 0, "è": 0, "ù": 0,  # French
-             "å": 0, "ø": 0,  # Nordic
-             "й": 0, "ы": 0, "ъ": 0, "э": 0,  # Russian/Cyrillic
-             "的": 0, "是": 0, "在": 0,  # Chinese
-             "の": 0, "は": 0, "で": 0,  # Japanese
-             "한": 0, "국": 0, "어": 0,  # Korean
-         }
-
-         for char in text:
-             if char in special_chars:
-                 special_chars[char] += 1
-
-         # Detect based on character frequencies
-         spanish = sum(special_chars[c] for c in ["á", "é", "í", "ó", "ú", "ü", "ñ"])
-         german = sum(special_chars[c] for c in ["ä", "ö", "ß"])
-         french = sum(special_chars[c] for c in ["ç", "à", "è", "ù"])
-         nordic = sum(special_chars[c] for c in ["å", "ø"])
-         russian = sum(special_chars[c] for c in ["й", "ы", "ъ", "э"])
-         chinese = sum(special_chars[c] for c in ["的", "是", "在"])
-         japanese = sum(special_chars[c] for c in ["の", "は", "で"])
-         korean = sum(special_chars[c] for c in ["한", "국", "어"])
-
-         scores = {
-             "es": spanish,
-             "de": german,
-             "fr": french,
-             "no": nordic,
-             "ru": russian,
-             "zh": chinese,
-             "ja": japanese,
-             "ko": korean,
-         }
-
-         # If we have a clear signal from special characters
-         max_score = max(scores.values())
-         if max_score > 0:
-             return max(scores, key=scores.get)
-
-         # Otherwise assume English (very simplistic)
-         return "en"
-
-
- @traced_and_logged
- def text_tiktoken_split(
-     text: str,
-     model: str = "gpt-3.5-turbo",
-     chunk_size: int = 1000,
-     overlap: int = 50,
- ) -> list[str]:
-     """Split text based on tiktoken tokens with proper overlap handling."""
-     if not text:
-         return []
-
-     try:
-         import tiktoken
-
-         try:
-             encoding = tiktoken.encoding_for_model(model)
-         except KeyError:
-             encoding = tiktoken.get_encoding("cl100k_base")
-
-         # Encode the text to tokens
-         tokens = encoding.encode(text)
-         total_tokens = len(tokens)
-
-         # Check if we need to split at all
-         if total_tokens <= chunk_size:
-             return [text]
-
-         # Create chunks with overlap
-         chunks = []
-         start_idx = 0
-
-         while start_idx < total_tokens:
-             # Define the end of this chunk
-             end_idx = min(start_idx + chunk_size, total_tokens)
-
-             # Decode this chunk of tokens back to text
-             chunk_tokens = tokens[start_idx:end_idx]
-             chunk_text = encoding.decode(chunk_tokens)
-             chunks.append(chunk_text)
-
-             # Move to the next chunk, accounting for overlap
-             start_idx += chunk_size - overlap
-
-             # Avoid tiny final chunks
-             if start_idx < total_tokens and start_idx + overlap >= total_tokens:
-                 break
-
-         return chunks
-     except ImportError:
-         # Fallback to character-based chunking if tiktoken is not available
-         return text_split_by_characters(
-             text, chunk_size=chunk_size * 4, overlap=overlap * 4
-         )
-
-
- @traced_and_logged
- def text_count_words(text: str) -> int:
-     if not text:
-         return 0
-     return len(text.split())
-
-
- @traced_and_logged
- def text_extract_urls(text: str) -> list[str]:
-     if not text:
-         return []
-     # A more robust regex might be needed for complex cases
-     return re.findall(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", text)
-
-
- @traced_and_logged
- def text_extract_numbers(text: str) -> list[float]:
-     if not text:
-         return []
-     return [float(num) for num in re.findall(r"[-+]?\d*\.?\d+", text)]
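
For orientation, below is a minimal usage sketch of a few of the helpers removed with this file. It is illustrative only and assumes the 0.4.x layout, where these functions were importable from flock.tools.text_tools; that import path no longer exists in 0.5.0b0, and the example inputs are made up for demonstration.

    # Hypothetical 0.4.x-era usage of the deleted helpers; these imports break after
    # upgrading to 0.5.0b0 because flock/tools/text_tools.py has been removed.
    from flock.tools.text_tools import (
        text_count_tokens_estimate,
        text_extract_json_from_text,
        text_split_by_characters,
    )

    doc = "First sentence. Second sentence! A third one follows here."

    # Character-based chunking with a soft break on whitespace/punctuation.
    chunks = text_split_by_characters(doc, chunk_size=30, overlap=5)

    # Rough token estimate: ~4 chars per token for GPT-style models (58 // 4 + 1 == 15).
    tokens = text_count_tokens_estimate(doc, model="gpt-3.5-turbo")

    # Pull the first parseable JSON object out of free-form model output.
    payload = text_extract_json_from_text('reply: {"ok": true}')  # -> {'ok': True}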