flock-core 0.5.0b3-py3-none-any.whl → 0.5.0b6-py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.

Potentially problematic release.

This version of flock-core might be problematic.

flock/tools/text_tools.py DELETED
@@ -1,809 +0,0 @@
- import hashlib
- import json
- import re
- from collections.abc import Callable
- from typing import Any
-
- import nltk
-
- from flock.core.logging.trace_and_logged import traced_and_logged
-
- # Ensure NLTK data is downloaded
- try:
-     nltk.data.find("tokenizers/punkt")
- except LookupError:
-     nltk.download("punkt")
-
- try:
-     nltk.data.find("corpora/stopwords")
- except LookupError:
-     nltk.download("stopwords")
-
-
- @traced_and_logged
- def text_split_by_sentences(text: str) -> list[str]:
-     return nltk.sent_tokenize(text)
-
-
- @traced_and_logged
- def text_split_by_characters(
-     text: str, chunk_size: int = 4000, overlap: int = 200
- ) -> list[str]:
-     if chunk_size <= 0:
-         raise ValueError("chunk_size must be positive")
-
-     if overlap >= chunk_size:
-         raise ValueError("overlap must be smaller than chunk_size")
-
-     if not text:
-         return []
-
-     chunks = []
-     start = 0
-     text_length = len(text)
-
-     while start < text_length:
-         end = min(start + chunk_size, text_length)
-
-         # If we're not at the end and the next character isn't a space, try to find a suitable break point
-         if end < text_length and text[end] not in [
-             " ",
-             "\n",
-             ".",
-             ",",
-             "!",
-             "?",
-             ";",
-             ":",
-             "-",
-         ]:
-             # Look for the last occurrence of a good break character
-             break_chars = [" ", "\n", ".", ",", "!", "?", ";", ":", "-"]
-             for i in range(end, max(start, end - 100), -1):
-                 if text[i] in break_chars:
-                     end = i + 1  # Include the break character
-                     break
-
-         chunks.append(text[start:end])
-         start = end - overlap if end < text_length else text_length
-
-     return chunks
-
-
- @traced_and_logged
- def text_split_by_tokens(
-     text: str,
-     tokenizer: Callable[[str], list[str]],
-     max_tokens: int = 1024,
-     overlap_tokens: int = 100,
- ) -> list[str]:
-     tokens = tokenizer(text)
-     chunks = []
-
-     i = 0
-     while i < len(tokens):
-         chunk = tokens[i : i + max_tokens]
-         chunks.append("".join(chunk))
-         i += max_tokens - overlap_tokens
-
-     return chunks
-
-
- @traced_and_logged
- def text_split_by_separator(text: str, separator: str = "\n\n") -> list[str]:
-     if not text:
-         return []
-
-     chunks = text.split(separator)
-     return [chunk for chunk in chunks if chunk.strip()]
-
-
- @traced_and_logged
- def text_recursive_splitter(
-     text: str,
-     chunk_size: int = 4000,
-     separators: list[str] = ["\n\n", "\n", ". ", ", ", " ", ""],
-     keep_separator: bool = True,
- ) -> list[str]:
-     if not text:
-         return []
-
-     if len(text) <= chunk_size:
-         return [text]
-
-     if not separators:
-         return [
-             text[:chunk_size],
-             *text_recursive_splitter(text[chunk_size:], chunk_size, separators),
-         ]
-
-     separator = separators[0]
-     new_separators = separators[1:]
-
-     if separator == "":
-         # If we're at the character level, just split by characters
-         return text_split_by_characters(text, chunk_size=chunk_size, overlap=0)
-
-     splits = text.split(separator)
-     separator_len = len(separator) if keep_separator else 0
-
-     # Add separator back to the chunks if needed
-     if keep_separator and separator:
-         splits = [f"{split}{separator}" for split in splits[:-1]] + [splits[-1]]
-
-     # Process each split
-     result = []
-     current_chunk = []
-     current_length = 0
-
-     for split in splits:
-         split_len = len(split)
-
-         if split_len > chunk_size:
-             # If current split is too large, handle current chunk and recursively split this large piece
-             if current_chunk:
-                 result.append("".join(current_chunk))
-                 current_chunk = []
-                 current_length = 0
-
-             # Recursively split this large piece
-             smaller_chunks = text_recursive_splitter(
-                 split, chunk_size, new_separators, keep_separator
-             )
-             result.extend(smaller_chunks)
-         elif current_length + split_len <= chunk_size:
-             # If we can fit this split in the current chunk, add it
-             current_chunk.append(split)
-             current_length += split_len
-         else:
-             # If we can't fit this split, complete the current chunk and start a new one
-             result.append("".join(current_chunk))
-             current_chunk = [split]
-             current_length = split_len
-
-     # Don't forget the last chunk
-     if current_chunk:
-         result.append("".join(current_chunk))
-
-     return result
-
-
- @traced_and_logged
- def text_chunking_for_embedding(
-     text: str, file_name: str, chunk_size: int = 1000, overlap: int = 100
- ) -> list[dict[str, Any]]:
-     chunks = text_split_by_characters(text, chunk_size=chunk_size, overlap=overlap)
-
-     # Create metadata for each chunk
-     result = []
-     for i, chunk in enumerate(chunks):
-         result.append(
-             {
-                 "chunk_id": file_name + "_" + str(i),
-                 "text": chunk,
-                 "file": file_name,
-                 "total_chunks": len(chunks),
-             }
-         )
-
-     return result
-
-
- @traced_and_logged
- def text_split_code_by_functions(code: str) -> list[dict[str, Any]]:
-     if not code:
-         return []
-
-     # Basic pattern for Python functions
-     function_pattern = re.compile(
-         r"(^|\n)def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\((.*?)\)(?:\s*->.*?)?:"
-     )
-     matches = list(function_pattern.finditer(code))
-
-     if not matches:
-         return [{"name": "Main", "content": code, "type": "code"}]
-
-     functions = []
-
-     # Process each function
-     for i, current_match in enumerate(matches):
-         function_name = current_match.group(2)
-
-         # Determine function content
-         if i < len(matches) - 1:
-             next_function_start = matches[i + 1].start()
-             content = code[current_match.start() : next_function_start]
-         else:
-             content = code[current_match.start() :]
-
-         functions.append(
-             {
-                 "name": function_name,
-                 "content": content.strip(),
-                 "type": "function",
-             }
-         )
-
-     # Check if there's content before the first function
-     if matches[0].start() > 0:
-         preamble = code[: matches[0].start()].strip()
-         if preamble:
-             functions.insert(
-                 0,
-                 {"name": "Imports/Setup", "content": preamble, "type": "code"},
-             )
-
-     return functions
-
-
- @traced_and_logged
- def text_count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
-     """Count tokens using tiktoken."""
-     if not text:
-         return 0
-
-     try:
-         import tiktoken
-
-         # Map model names to encoding types
-         if model.startswith(("gpt-4", "gpt-3.5")):
-             encoding_name = "cl100k_base"  # For newer OpenAI models
-         elif model.startswith("text-davinci"):
-             encoding_name = "p50k_base"  # For older OpenAI models
-         elif "llama" in model.lower() or "mistral" in model.lower():
-             encoding_name = (
-                 "cl100k_base"  # Best approximation for LLaMA/Mistral
-             )
-         else:
-             # Default to cl100k_base as fallback
-             encoding_name = "cl100k_base"
-
-         # Try to get the specific encoder for the model if available
-         try:
-             encoding = tiktoken.encoding_for_model(model)
-         except KeyError:
-             # Fall back to the encoding name
-             encoding = tiktoken.get_encoding(encoding_name)
-
-         # Count tokens
-         token_integers = encoding.encode(text)
-         return len(token_integers)
-
-     except ImportError:
-         # Fallback to character-based estimation if tiktoken is not installed
-         return text_count_tokens_estimate(text, model)
-
-
- @traced_and_logged
- def text_count_tokens_estimate(text: str, model: str = "gpt-3.5-turbo") -> int:
-     """Estimate token count for different models."""
-     if not text:
-         return 0
-
-     # Rough token estimations for different models
-     if model.startswith(("gpt-3", "gpt-4")):
-         # OpenAI models: ~4 chars per token
-         return len(text) // 4 + 1
-     elif model.startswith("claude"):
-         # Anthropic models: ~3.5 chars per token
-         return len(text) // 3.5 + 1
-     elif "llama" in model.lower():
-         # LLaMA-based models: ~3.7 chars per token
-         return len(text) // 3.7 + 1
-     else:
-         # Default estimation
-         return len(text) // 4 + 1
-
-
- @traced_and_logged
- def text_truncate_to_token_limit(
-     text: str, max_tokens: int = 4000, model: str = "gpt-3.5-turbo"
- ) -> str:
-     if not text:
-         return ""
-
-     # Try to use tiktoken for accurate truncation
-     try:
-         import tiktoken
-
-         # Get appropriate encoding
-         try:
-             encoding = tiktoken.encoding_for_model(model)
-         except KeyError:
-             # Fall back to cl100k_base (used by most newer models)
-             encoding = tiktoken.get_encoding("cl100k_base")
-
-         # Encode the text to tokens
-         tokens = encoding.encode(text)
-
-         # If we're already under the limit, return the original text
-         if len(tokens) <= max_tokens:
-             return text
-
-         # Truncate tokens and decode back to text
-         truncated_tokens = tokens[:max_tokens]
-         return encoding.decode(truncated_tokens)
-
-     except ImportError:
-         # Fallback to the character-based method if tiktoken is not available
-         estimated_tokens = text_count_tokens_estimate(text, model)
-
-         if estimated_tokens <= max_tokens:
-             return text
-
-         # Calculate approximate character limit
-         char_per_token = 4  # Default for most models
-         if model.startswith("claude"):
-             char_per_token = 3.5
-         elif "llama" in model.lower():
-             char_per_token = 3.7
-
-         char_limit = int(max_tokens * char_per_token)
-
-         # Try to find a good breaking point
-         if char_limit < len(text):
-             # Look for sentence or paragraph break near the limit
-             for i in range(char_limit - 1, max(0, char_limit - 100), -1):
-                 if i < len(text) and text[i] in [".", "!", "?", "\n"]:
-                     return text[: i + 1]
-
-         # Fallback to hard truncation
-         return text[:char_limit]
-
-
- @traced_and_logged
- def text_extract_keywords(text: str, top_n: int = 10) -> list[str]:
-     if not text:
-         return []
-
-     # Get stopwords
-     try:
-         from nltk.corpus import stopwords
-
-         stop_words = set(stopwords.words("english"))
-     except:
-         # Fallback basic stopwords if NLTK data isn't available
-         stop_words = {
-             "i",
-             "me",
-             "my",
-             "myself",
-             "we",
-             "our",
-             "ours",
-             "ourselves",
-             "you",
-             "you're",
-             "you've",
-             "you'll",
-             "you'd",
-             "your",
-             "yours",
-             "yourself",
-             "yourselves",
-             "he",
-             "him",
-             "his",
-             "himself",
-             "she",
-             "she's",
-             "her",
-             "hers",
-             "herself",
-             "it",
-             "it's",
-             "its",
-             "itself",
-             "they",
-             "them",
-             "their",
-             "theirs",
-             "themselves",
-             "what",
-             "which",
-             "who",
-             "whom",
-             "this",
-             "that",
-             "that'll",
-             "these",
-             "those",
-             "am",
-             "is",
-             "are",
-             "was",
-             "were",
-             "be",
-             "been",
-             "being",
-             "have",
-             "has",
-             "had",
-             "having",
-             "do",
-             "does",
-             "did",
-             "doing",
-             "a",
-             "an",
-             "the",
-             "and",
-             "but",
-             "if",
-             "or",
-             "because",
-             "as",
-             "until",
-             "while",
-             "of",
-             "at",
-             "by",
-             "for",
-             "with",
-             "about",
-             "against",
-             "between",
-             "into",
-             "through",
-             "during",
-             "before",
-             "after",
-             "above",
-             "below",
-             "to",
-             "from",
-             "up",
-             "down",
-             "in",
-             "out",
-             "on",
-             "off",
-             "over",
-             "under",
-             "again",
-             "further",
-             "then",
-             "once",
-         }
-
-     # Tokenize and remove punctuation
-     words = re.findall(r"\b[a-zA-Z]{3,}\b", text.lower())
-
-     # Remove stopwords
-     words = [word for word in words if word not in stop_words]
-
-     # Count word frequencies
-     word_freq = {}
-     for word in words:
-         if word in word_freq:
-             word_freq[word] += 1
-         else:
-             word_freq[word] = 1
-
-     # Sort by frequency
-     sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
-
-     # Return top N keywords
-     return [word for word, freq in sorted_words[:top_n]]
-
-
- @traced_and_logged
- def text_clean_text(
-     text: str,
-     remove_urls: bool = True,
-     remove_html: bool = True,
-     normalize_whitespace: bool = True,
- ) -> str:
-     if not text:
-         return ""
-
-     result = text
-
-     # Remove URLs
-     if remove_urls:
-         result = re.sub(r"https?://\S+|www\.\S+", "", result)
-
-     # Remove HTML tags
-     if remove_html:
-         result = re.sub(r"<.*?>", "", result)
-
-     # Normalize whitespace
-     if normalize_whitespace:
-         # Replace multiple spaces, tabs, newlines with a single space
-         result = re.sub(r"\s+", " ", result)
-         result = result.strip()
-
-     return result
-
-
- @traced_and_logged
- def text_format_chat_history(
-     messages: list[dict[str, str]],
-     format_type: str = "text",
-     system_prefix: str = "System: ",
-     user_prefix: str = "User: ",
-     assistant_prefix: str = "Assistant: ",
- ) -> str:
-     if not messages:
-         return ""
-
-     result = []
-
-     if format_type == "text":
-         for msg in messages:
-             role = msg.get("role", "").lower()
-             content = msg.get("content", "")
-
-             if role == "system":
-                 result.append(f"{system_prefix}{content}")
-             elif role == "user":
-                 result.append(f"{user_prefix}{content}")
-             elif role == "assistant":
-                 result.append(f"{assistant_prefix}{content}")
-             else:
-                 result.append(f"{role.capitalize()}: {content}")
-
-         return "\n\n".join(result)
-
-     elif format_type == "markdown":
-         for msg in messages:
-             role = msg.get("role", "").lower()
-             content = msg.get("content", "")
-
-             if role == "system":
-                 result.append(f"**{system_prefix.strip()}** {content}")
-             elif role == "user":
-                 result.append(f"**{user_prefix.strip()}** {content}")
-             elif role == "assistant":
-                 result.append(f"**{assistant_prefix.strip()}** {content}")
-             else:
-                 result.append(f"**{role.capitalize()}:** {content}")
-
-         return "\n\n".join(result)
-
-     else:
-         raise ValueError(f"Unsupported format type: {format_type}")
-
-
- @traced_and_logged
- def text_extract_json_from_text(text: str) -> dict[str, Any] | None:
-     if not text:
-         return None
-
-     # Find JSON-like patterns between curly braces
-     json_pattern = re.compile(r"({[\s\S]*?})")
-     json_matches = json_pattern.findall(text)
-
-     # Try to parse each match
-     for json_str in json_matches:
-         try:
-             return json.loads(json_str)
-         except json.JSONDecodeError:
-             continue
-
-     # Try to find JSON with markdown code blocks
-     code_block_pattern = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*```")
-     code_blocks = code_block_pattern.findall(text)
-
-     for block in code_blocks:
-         # Clean up any trailing ``` that might have been captured
-         block = block.replace("```", "")
-         try:
-             return json.loads(block)
-         except json.JSONDecodeError:
-             continue
-
-     # No valid JSON found
-     return None
-
-
- @traced_and_logged
- def text_calculate_hash(text: str, algorithm: str = "sha256") -> str:
-     if not text:
-         return ""
-
-     if algorithm == "md5":
-         return hashlib.md5(text.encode()).hexdigest()
-     elif algorithm == "sha1":
-         return hashlib.sha1(text.encode()).hexdigest()
-     elif algorithm == "sha256":
-         return hashlib.sha256(text.encode()).hexdigest()
-     else:
-         raise ValueError(f"Unsupported hash algorithm: {algorithm}")
-
-
- @traced_and_logged
- def text_format_table_from_dicts(data: list[dict[str, Any]]) -> str:
-     if not data:
-         return ""
-
-     # Extract all possible keys
-     keys = set()
-     for item in data:
-         keys.update(item.keys())
-
-     # Convert to list and sort for consistent output
-     keys = sorted(list(keys))
-
-     # Calculate column widths
-     widths = {key: len(key) for key in keys}
-     for item in data:
-         for key in keys:
-             if key in item:
-                 value_str = str(item[key])
-                 widths[key] = max(widths[key], len(value_str))
-
-     # Create header
-     header = " | ".join(f"{key:{widths[key]}}" for key in keys)
-     separator = "-+-".join("-" * widths[key] for key in keys)
-
-     # Create rows
-     rows = []
-     for item in data:
-         row = " | ".join(f"{item.get(key, '')!s:{widths[key]}}" for key in keys)
-         rows.append(row)
-
-     # Combine everything
-     return f"{header}\n{separator}\n" + "\n".join(rows)
-
-
- @traced_and_logged
- def text_detect_language(text: str) -> str:
-     """Simple language detection"""
-     if not text or len(text.strip()) < 10:
-         return "unknown"
-
-     try:
-         # Try to use langdetect if available
-         from langdetect import detect
-
-         return detect(text)
-     except ImportError:
-         # Fallback to simple detection based on character frequency
-         # This is very simplistic and only works for a few common languages
-         text = text.lower()
-
-         # Count character frequencies that may indicate certain languages
-         special_chars = {
-             "á": 0,
-             "é": 0,
-             "í": 0,
-             "ó": 0,
-             "ú": 0,
-             "ü": 0,
-             "ñ": 0,  # Spanish
-             "ä": 0,
-             "ö": 0,
-             "ß": 0,  # German
-             "ç": 0,
-             "à": 0,
-             "è": 0,
-             "ù": 0,  # French
-             "å": 0,
-             "ø": 0,  # Nordic
-             "й": 0,
-             "ы": 0,
-             "ъ": 0,
-             "э": 0,  # Russian/Cyrillic
-             "的": 0,
-             "是": 0,
-             "在": 0,  # Chinese
-             "の": 0,
-             "は": 0,
-             "で": 0,  # Japanese
-             "한": 0,
-             "국": 0,
-             "어": 0,  # Korean
-         }
-
-         for char in text:
-             if char in special_chars:
-                 special_chars[char] += 1
-
-         # Detect based on character frequencies
-         spanish = sum(
-             special_chars[c] for c in ["á", "é", "í", "ó", "ú", "ü", "ñ"]
-         )
-         german = sum(special_chars[c] for c in ["ä", "ö", "ß"])
-         french = sum(special_chars[c] for c in ["ç", "à", "è", "ù"])
-         nordic = sum(special_chars[c] for c in ["å", "ø"])
-         russian = sum(special_chars[c] for c in ["й", "ы", "ъ", "э"])
-         chinese = sum(special_chars[c] for c in ["的", "是", "在"])
-         japanese = sum(special_chars[c] for c in ["の", "は", "で"])
-         korean = sum(special_chars[c] for c in ["한", "국", "어"])
-
-         scores = {
-             "es": spanish,
-             "de": german,
-             "fr": french,
-             "no": nordic,
-             "ru": russian,
-             "zh": chinese,
-             "ja": japanese,
-             "ko": korean,
-         }
-
-         # If we have a clear signal from special characters
-         max_score = max(scores.values())
-         if max_score > 0:
-             return max(scores, key=scores.get)
-
-         # Otherwise assume English (very simplistic)
-         return "en"
-
-
- @traced_and_logged
- def text_tiktoken_split(
-     text: str,
-     model: str = "gpt-3.5-turbo",
-     chunk_size: int = 1000,
-     overlap: int = 50,
- ) -> list[str]:
-     """Split text based on tiktoken tokens with proper overlap handling."""
-     if not text:
-         return []
-
-     try:
-         import tiktoken
-
-         try:
-             encoding = tiktoken.encoding_for_model(model)
-         except KeyError:
-             encoding = tiktoken.get_encoding("cl100k_base")
-
-         # Encode the text to tokens
-         tokens = encoding.encode(text)
-         total_tokens = len(tokens)
-
-         # Check if we need to split at all
-         if total_tokens <= chunk_size:
-             return [text]
-
-         # Create chunks with overlap
-         chunks = []
-         start_idx = 0
-
-         while start_idx < total_tokens:
-             # Define the end of this chunk
-             end_idx = min(start_idx + chunk_size, total_tokens)
-
-             # Decode this chunk of tokens back to text
-             chunk_tokens = tokens[start_idx:end_idx]
-             chunk_text = encoding.decode(chunk_tokens)
-             chunks.append(chunk_text)
-
-             # Move to the next chunk, accounting for overlap
-             start_idx += chunk_size - overlap
-
-             # Avoid tiny final chunks
-             if start_idx < total_tokens and start_idx + overlap >= total_tokens:
-                 break
-
-         return chunks
-     except ImportError:
-         # Fallback to character-based chunking if tiktoken is not available
-         return text_split_by_characters(
-             text, chunk_size=chunk_size * 4, overlap=overlap * 4
-         )
-
-
- @traced_and_logged
- def text_count_words(text: str) -> int:
-     if not text:
-         return 0
-     return len(text.split())
-
-
- @traced_and_logged
- def text_extract_urls(text: str) -> list[str]:
-     if not text:
-         return []
-     # A more robust regex might be needed for complex cases
-     return re.findall(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", text)
-
-
- @traced_and_logged
- def text_extract_numbers(text: str) -> list[float]:
-     if not text:
-         return []
-     return [float(num) for num in re.findall(r"[-+]?\d*\.?\d+", text)]
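
Note for consumers of the removed module: these were plain module-level helpers, so code importing them will break on 0.5.0b6 unless the functions were relocated elsewhere in the package (not visible in this file's diff). The sketch below is illustrative only and not part of the diff; it exercises a few of the deleted functions as they existed in 0.5.0b3, using the import path implied by flock/tools/text_tools.py, with made-up sample text and parameters.

# Illustrative sketch (assumes an environment still pinned to flock-core 0.5.0b3,
# where flock.tools.text_tools exists; sample text and values are made up).
from flock.tools.text_tools import (
    text_chunking_for_embedding,
    text_count_tokens,
    text_truncate_to_token_limit,
)

sample = "Flock pipelines often pass long documents between agents. " * 200

# Character-window chunks plus metadata ("chunk_id", "file", "total_chunks").
chunks = text_chunking_for_embedding(
    sample, file_name="notes.txt", chunk_size=1000, overlap=100
)
print(len(chunks), chunks[0]["chunk_id"])  # first chunk id is "notes.txt_0"

# Token count via tiktoken, falling back to a character-based estimate if it is absent.
print(text_count_tokens(sample, model="gpt-3.5-turbo"))

# Cap a prompt at 4000 tokens before handing it to a model.
prompt = text_truncate_to_token_limit(sample, max_tokens=4000, model="gpt-3.5-turbo")
print(len(prompt))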