skylos 1.0.10-py3-none-any.whl → 1.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skylos might be problematic.

@@ -0,0 +1,783 @@
+import os
+import fnmatch
+import re
+import concurrent.futures
+from typing import Optional, List, Tuple
+import fitz
+
+BUILTIN_IGNORES = [
+    "**/.git/**",
+    "**/.svn/**",
+    "**/.hg/**",
+    "**/.idea/**",
+    "**/.vscode/**",
+    "**/__pycache__/**",
+    "**/*.pyc",
+    "**/*.pyo",
+    "**/.pytest_cache/**",
+    "**/.coverage",
+    "**/.tox/**",
+    "**/.eggs/**",
+    "**/Cython/Debugger/**",
+    "**/*.egg-info/**",
+    "**/build/**",
+    "**/dist/**",
+    "**/venv/**",
+    "**/.venv/**",
+    "**/env/**",
+    "**/ENV/**",
+    "**/virtualenv/**",
+    "**/site-packages/**",
+    "**/pip/**",
+    "**/.DS_Store",
+    "**/Thumbs.db",
+    "**/node_modules/**",
+    "**/*.env",
+    "**/.env",
+    "**/*.png",
+    "**/*.jpg",
+    "**/*.jpeg",
+    "**/*.gif",
+    "**/*.webp",
+    "**/*.bmp",
+    "**/*.mp3",
+    "**/*.mp4",
+    "**/tmp/**",
+    "**/temp/**",
+    "**/var/folders/**",
+    "**/test/data/**",
+    "**/tests/data/**",
+    "**/test_data/**",
+    "**/tests_data/**",
+    "__pycache__",
+    "*.pyc",
+    "*.pyo",
+    "target",
+    "venv",
+]
+
+class PriorityRule:
+    def __init__(self, pattern, score):
+        self.pattern = pattern
+        self.score = score
+
+class ParallelChunker:
+    DIR_IGNORE_NAMES = [
+        "venv",
+        ".venv",
+        "env",
+        "node_modules",
+        ".git",
+        ".svn",
+        ".hg",
+        "__pycache__",
+        ".pytest_cache",
+        ".tox",
+        ".eggs",
+        "build",
+        "dist",
+    ]
+    def __init__(
+        self,
+        equal_chunks: Optional[int] = None,
+        max_chunk_size: Optional[int] = None,
+        output_dir: str = "chunks",
+        user_ignore: Optional[List[str]] = None,
+        user_unignore: Optional[List[str]] = None,
+        binary_extensions: Optional[List[str]] = None,
+        priority_rules: Optional[List[Tuple[str, int]]] = None,
+        num_threads: int = 4,
+        dry_run: bool = False,
+        semantic_chunking: bool = False,
+        file_type: Optional[str] = None,
+        verbose: bool = False,
+    ) -> None:
+        if equal_chunks is not None and max_chunk_size is not None:
+            raise ValueError("Cannot specify both equal_chunks and max_chunk_size")
+        if equal_chunks is None and max_chunk_size is None:
+            raise ValueError("Must specify either equal_chunks or max_chunk_size")
+        self.dir_ignore_names = self.DIR_IGNORE_NAMES
+        self.equal_chunks = equal_chunks
+        self.max_chunk_size = max_chunk_size
+        self.output_dir = output_dir
+        self.num_threads = num_threads
+        self.dry_run = dry_run
+        self.semantic_chunking = semantic_chunking
+        self.file_type = file_type.lower() if file_type else None
+        self.verbose = verbose
+
+        if user_ignore is None:
+            user_ignore = []
+        if user_unignore is None:
+            user_unignore = []
+
+        self.ignore_patterns = BUILTIN_IGNORES[:]
+        self.ignore_patterns.extend(user_ignore)
+        self.unignore_patterns = list(user_unignore)
+        # Unless the caller explicitly unignores vendored code, keep "*.py"
+        # unignored so ordinary project sources survive the builtin filters.
+        if not any("site-packages" in pattern or "venv" in pattern for pattern in user_unignore):
+            self.unignore_patterns.append("*.py")
+
+        if binary_extensions is None:
+            binary_extensions = ["exe", "dll", "so"]
+        self.binary_exts = set(ext.lower() for ext in binary_extensions)
+
+        self.priority_rules = []
+        if priority_rules:
+            for rule_data in priority_rules:
+                if isinstance(rule_data, PriorityRule):
+                    self.priority_rules.append(rule_data)
+                else:
+                    pat, score = rule_data
+                    self.priority_rules.append(PriorityRule(pat, score))
+
+        self.loaded_files = []
+        self.current_walk_root = None
+
+    def _get_text_content(self, path, content_bytes):
+        if path.endswith(".pdf"):
+            try:
+                doc = fitz.open(path)
+                text = ""
+                for page in doc:
+                    text += page.get_text("text")
+                doc.close()
+                return text
+            except Exception as e:
+                print(f"Error extracting text from PDF {path}: {e}")
+                return ""
+        else:
+            text = content_bytes.decode("utf-8", errors="replace")
+            text = self._filter_api_keys(text)
+            return text
+
+    def is_absolute_pattern(self, pattern):
+        if pattern.startswith("/"):
+            return True
+        if re.match(r"^[a-zA-Z]:\\", pattern):
+            return True
+        return False
+
+    def _contains_api_key(self, line: str) -> bool:
+        # Crude heuristic: any quoted token containing a run of 20+
+        # word characters is treated as a potential secret.
+        pattern = r'[\'"].*[a-zA-Z0-9_-]{20,}.*[\'"]'
+        return bool(re.search(pattern, line))
+
+    def _filter_api_keys(self, text: str) -> str:
+        filtered_lines = []
+        for line in text.splitlines():
+            if self._contains_api_key(line):
+                filtered_lines.append("[API_KEY_REDACTED]")
+            else:
+                filtered_lines.append(line)
+        return "\n".join(filtered_lines)
+
+    def _match_segments(self, path_segs, pattern_segs, pi=0, pj=0):
+        # Recursive glob matching with "**" support: a "**" segment may
+        # match zero segments (advance the pattern) or one or more
+        # segments (advance the path and retry).
+        if pj == len(pattern_segs):
+            return pi == len(path_segs)
+        if pi == len(path_segs):
+            return all(seg == '**' for seg in pattern_segs[pj:])
+        seg_pat = pattern_segs[pj]
+        if seg_pat == "**":
+            if self._match_segments(path_segs, pattern_segs, pi, pj + 1):
+                return True
+            return self._match_segments(path_segs, pattern_segs, pi + 1, pj)
+        if fnmatch.fnmatch(path_segs[pi], seg_pat):
+            return self._match_segments(path_segs, pattern_segs, pi + 1, pj + 1)
+        return False
+
+    def _double_star_fnmatch(self, path, pattern):
+        path = path.replace("\\", "/")
+        pattern = pattern.replace("\\", "/")
+        return self._match_segments(path.split("/"), pattern.split("/"))
+
+    def _matches_pattern(self, abs_path, rel_path, pattern):
+        # Absolute patterns match against the absolute path, everything
+        # else against the path relative to the current walk root.
+        target = abs_path if self.is_absolute_pattern(pattern) else rel_path
+
+        if "**" in pattern:
+            if self._double_star_fnmatch(target, pattern):
+                return True
+        else:
+            if fnmatch.fnmatch(target, pattern):
+                return True
+        # Bare names like "venv" or "*.pyc" also match on the basename alone.
+        if not self.is_absolute_pattern(pattern) and "/" not in pattern:
+            if fnmatch.fnmatch(os.path.basename(abs_path), pattern):
+                return True
+        return False
+
+    def _read_ignore_file(self, directory):
+        """Read .pykomodo-ignore and .gitignore files in the given directory
+        and add their patterns to ignore_patterns."""
+        for filename in ['.pykomodo-ignore', '.gitignore']:
+            ignore_file_path = os.path.join(directory, filename)
+            if os.path.exists(ignore_file_path):
+                try:
+                    with open(ignore_file_path, 'r') as f:
+                        for line in f:
+                            line = line.strip()
+                            if line and not line.startswith('#'):
+                                # Normalize plain .gitignore entries to the
+                                # "**" glob style used internally.
+                                if filename == '.gitignore' and '**' not in line:
+                                    if not line.startswith('/'):
+                                        line = f"**/{line}"
+                                    if line.endswith('/'):
+                                        line = f"{line}**"
+                                self.ignore_patterns.append(line)
+                except Exception as e:
+                    print(f"Error reading {filename} file: {e}")
+
+    def should_ignore_file(self, path):
+        abs_path = os.path.abspath(path)
+        root = self.current_walk_root or os.path.dirname(abs_path)
+        rel_path = os.path.relpath(abs_path, start=root).replace("\\", "/")
+        for pat in self.ignore_patterns:
+            if self._matches_pattern(abs_path, rel_path, pat):
+                # Unignore patterns take precedence over ignore patterns.
+                for unignore_pat in self.unignore_patterns:
+                    if self._matches_pattern(abs_path, rel_path, unignore_pat):
+                        return False
+                return True
+        return False
+
+    def is_binary_file(self, path):
+        ext = os.path.splitext(path)[1].lstrip(".").lower()
+        if ext in {"py", "pdf"}:
+            return False
+        if ext in self.binary_exts:
+            return True
+        try:
+            with open(path, "rb") as f:
+                chunk = f.read(8192)
+                if b"\0" in chunk:
+                    return True
+        except OSError:
+            return True
+        return False
+
+    def _collect_paths(self, dir_list):
+        collected = []
+        out_dir = os.path.abspath(self.output_dir)
+        for directory in dir_list:
+            self.current_walk_root = os.path.abspath(directory)
+            for root, dirs, files in os.walk(directory):
+                dirs[:] = [d for d in dirs if d not in self.dir_ignore_names]
+                for filename in files:
+                    full_path = os.path.join(root, filename)
+                    if self.file_type:
+                        _, ext = os.path.splitext(full_path)
+                        if ext.lower() != f".{self.file_type}":
+                            continue
+                    # Skip anything inside the output directory itself.
+                    abs_path = os.path.abspath(full_path)
+                    if abs_path == out_dir or abs_path.startswith(out_dir + os.sep):
+                        continue
+                    if self.should_ignore_file(full_path):
+                        continue
+                    collected.append(full_path)
+        return collected
+
+    def _load_file_data(self, path):
+        try:
+            with open(path, "rb") as f:
+                content = f.read()
+            return path, content, self.calculate_priority(path)
+        except OSError:
+            return path, None, 0
+
+    def calculate_priority(self, path):
+        highest = 0
+        basename = os.path.basename(path)
+        for rule in self.priority_rules:
+            if fnmatch.fnmatch(basename, rule.pattern):
+                highest = max(highest, rule.score)
+        return highest
+
+    def process_directories(self, dirs: List[str]) -> None:
+        for directory in dirs:
+            self._read_ignore_file(directory)
+        all_paths = self._collect_paths(dirs)
+        self.loaded_files.clear()
+        if self.dry_run:
+            print("[DRY-RUN] The following files would be processed (in priority order):")
+            tmp_loaded = []
+            for p in all_paths:
+                priority = self.calculate_priority(p)
+                tmp_loaded.append((p, priority))
+            tmp_loaded.sort(key=lambda x: -x[1])
+            for path, pr in tmp_loaded:
+                print(f" - {path} (priority={pr})")
+            return
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.num_threads) as ex:
+            future_map = {ex.submit(self._load_file_data, p): p for p in all_paths}
+            for fut in concurrent.futures.as_completed(future_map):
+                path, content, priority = fut.result()
+                if content is not None and not self.is_binary_file(path):
+                    self.loaded_files.append((path, content, priority))
+        self.loaded_files.sort(key=lambda x: (-x[2], x[0]))
+        self._process_chunks()
+
+    def process_file(self, file_path: str, custom_chunk_size: Optional[int] = None, force_process: bool = False) -> None:
+        """
+        Process a single file and create chunks from it.
+
+        Args:
+            file_path: Path to the file to process
+            custom_chunk_size: Optional custom chunk size for this specific file, overriding the global setting
+            force_process: If True, process the file even if it would normally be ignored
+        """
+        if not os.path.isfile(file_path):
+            raise ValueError(f"File not found: {file_path}")
+
+        if self.should_ignore_file(file_path) and not force_process and not self.dry_run:
+            print(f"Skipping ignored file: {file_path}")
+            return
+
+        if self.dry_run:
+            priority = self.calculate_priority(file_path)
+            print(f"[DRY-RUN] Would process file: {file_path} (priority={priority})")
+            return
+
+        if self.is_binary_file(file_path) and not file_path.endswith(".pdf") and not force_process:
+            print(f"Skipping binary file: {file_path}")
+            return
+
+        path, content, priority = self._load_file_data(file_path)
+        if content is None:
+            print(f"Error loading file: {file_path}")
+            return
+
+        self.loaded_files = [(path, content, priority)]
+
+        original_max_chunk_size = None
+        if custom_chunk_size is not None and not self.equal_chunks:
+            original_max_chunk_size = self.max_chunk_size
+            self.max_chunk_size = custom_chunk_size
+
+        try:
+            self._process_chunks()
+        finally:
+            if original_max_chunk_size is not None:
+                self.max_chunk_size = original_max_chunk_size
+
+    def process_directory(self, directory):
+        self.process_directories([directory])
+
+    def _split_tokens(self, content_bytes):
+        try:
+            return content_bytes.decode("utf-8", errors="replace").split()
+        except Exception:
+            return []
+
+    def _write_chunk(self, content_bytes, chunk_num):
+        os.makedirs(self.output_dir, exist_ok=True)
+        p = os.path.join(self.output_dir, f"chunk-{chunk_num}.txt")
+        try:
+            with open(p, "wb") as f:
+                f.write(content_bytes)
+        except OSError as e:
+            print(f"Error writing chunk file {p}: {e}")
+
+    def _improved_pdf_chunking(self, path, idx):
+        """
+        Process a PDF file with improved text formatting for academic papers.
+        Uses multiple extraction methods to get the best text representation.
+
+        Args:
+            path: Path to the PDF file
+            idx: Starting chunk index
+
+        Returns:
+            Updated chunk index
+        """
+        try:
+            doc = fitz.open(path)
+
+            all_pages_content = []
+
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+
+                text_as_text = page.get_text("text")
+                text_as_html = page.get_text("html")
+                text_as_dict = page.get_text("dict")
+
+                if "<p>" in text_as_html:
+                    # Prefer the HTML rendering when it carries paragraph
+                    # markup: strip tags and entities, collapse whitespace.
+                    paragraphs = re.findall(r'<p>(.*?)</p>', text_as_html, re.DOTALL)
+                    processed_text = []
+
+                    for p in paragraphs:
+                        clean_p = re.sub(r'<.*?>', ' ', p)
+                        clean_p = re.sub(r'&[a-zA-Z]+;', ' ', clean_p)
+                        clean_p = re.sub(r'\s+', ' ', clean_p).strip()
+                        if clean_p:
+                            processed_text.append(clean_p)
+
+                    page_text = "\n\n".join(processed_text)
+
+                elif len(text_as_dict.get("blocks", [])) > 0:
+                    # Otherwise use layout blocks, sorted top to bottom.
+                    blocks = sorted(text_as_dict["blocks"], key=lambda b: b["bbox"][1])
+                    processed_text = []
+
+                    for block in blocks:
+                        if "lines" not in block:
+                            continue
+
+                        block_lines = []
+                        for line in block["lines"]:
+                            if "spans" not in line:
+                                continue
+
+                            line_text = " ".join(span["text"] for span in line["spans"] if "text" in span)
+                            if line_text.strip():
+                                block_lines.append(line_text)
+
+                        if block_lines:
+                            processed_text.append(" ".join(block_lines))
+
+                    page_text = "\n\n".join(processed_text)
+
+                else:
+                    # Last resort: rebuild paragraphs from plain-text lines.
+                    lines = text_as_text.split('\n')
+                    paragraphs = []
+                    current_paragraph = []
+
+                    for line in lines:
+                        line = line.strip()
+                        words = line.split()
+                        if len(words) <= 2 and not line.endswith('.') and not line.endswith(':'):
+                            current_paragraph.append(line)
+                        else:
+                            if current_paragraph:
+                                paragraphs.append(" ".join(current_paragraph))
+                                current_paragraph = []
+                            if line:
+                                paragraphs.append(line)
+
+                    if current_paragraph:
+                        paragraphs.append(" ".join(current_paragraph))
+
+                    page_text = "\n\n".join(paragraphs)
+
+                page_content = f"--- Page {page_num + 1} ---\n\n{page_text}"
+                all_pages_content.append(page_content)
+
+            doc.close()
+            full_document = "\n\n".join(all_pages_content)
+
+            paragraphs = full_document.split("\n\n")
+            current_chunk = []
+            current_size = 0
+
+            for paragraph in paragraphs:
+                if not paragraph.strip():
+                    continue
+
+                para_size = len(paragraph.split())
+
+                if current_size + para_size > self.max_chunk_size and current_chunk:
+                    chunk_text = "\n\n".join(current_chunk)
+                    final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_text}"
+                    self._write_chunk(final_text.encode("utf-8"), idx)
+                    idx += 1
+                    current_chunk = []
+                    current_size = 0
+
+                current_chunk.append(paragraph)
+                current_size += para_size
+
+            if current_chunk:
+                chunk_text = "\n\n".join(current_chunk)
+                final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_text}"
+                self._write_chunk(final_text.encode("utf-8"), idx)
+                idx += 1
+
+            return idx
+
+        except Exception as e:
+            print(f"Error processing PDF {path}: {e}")
+            t = (
+                "="*80 + "\n"
+                + f"CHUNK {idx + 1}\n"
+                + "="*80 + "\n\n"
+                + "="*40 + "\n"
+                + f"File: {path}\n"
+                + "="*40 + "\n"
+                + f"[Error processing PDF: {str(e)}]\n"
+            )
+            self._write_chunk(t.encode("utf-8"), idx)
+            return idx + 1
+
+    def _process_chunks(self):
+        if not self.loaded_files:
+            return
+        if self.semantic_chunking:
+            self._chunk_by_semantic()
+        elif self.equal_chunks:
+            self._chunk_by_equal_parts()
+        else:
+            self._chunk_by_size()
+
+    def _extract_pdf_paragraphs(self, path):
+        try:
+            doc = fitz.open(path)
+            paragraphs = []
+            for page in doc:
+                text = page.get_text("text")
+                page_paras = text.split("\n\n")
+                paragraphs.extend([para.strip() for para in page_paras if para.strip()])
+            doc.close()
+            return paragraphs
+        except Exception as e:
+            print(f"Error extracting paragraphs from PDF {path}: {e}")
+            return []
+
+    def _chunk_by_equal_parts(self) -> None:
+        text_blocks = []
+        for (path, content_bytes, _) in self.loaded_files:
+            if path.endswith(".pdf"):
+                paragraphs = self._extract_pdf_paragraphs(path)
+                for para in paragraphs:
+                    s = len(para.split())
+                    if s > 0:
+                        text_blocks.append((path, para, s))
+            else:
+                text = self._get_text_content(path, content_bytes)
+                if text:
+                    s = len(text.split())
+                    text_blocks.append((path, text, s))
+        if not text_blocks:
+            return
+        n_chunks = self.equal_chunks
+        # Greedy balancing: place each block (largest first) into the chunk
+        # that is currently smallest, so chunk sizes stay roughly equal.
+        text_blocks.sort(key=lambda x: -x[2])
+        chunk_contents = [[] for _ in range(n_chunks)]
+        chunk_sizes = [0] * n_chunks
+        for block in text_blocks:
+            min_idx = 0
+            min_size = chunk_sizes[0]
+            for i in range(1, n_chunks):
+                if chunk_sizes[i] < min_size:
+                    min_size = chunk_sizes[i]
+                    min_idx = i
+            chunk_contents[min_idx].append(block)
+            chunk_sizes[min_idx] += block[2]
+        for i, chunk in enumerate(chunk_contents):
+            if chunk:
+                self._write_equal_chunk([(path, text) for path, text, _ in chunk], i)
+
+    def _write_equal_chunk(self, chunk_data, chunk_num):
+        txt = "="*80 + "\n" + f"CHUNK {chunk_num + 1} OF {self.equal_chunks}\n" + "="*80 + "\n\n"
+        for path, text in chunk_data:
+            txt += "="*40 + "\n" + f"File: {path}\n" + "="*40 + "\n" + text + "\n"
+        self._write_chunk(txt.encode("utf-8"), chunk_num)
+
+    def _chunk_by_size(self):
+        idx = 0
+        for (path, content_bytes, _) in self.loaded_files:
+            text = self._get_text_content(path, content_bytes)
+            if not text:
+                t = (
+                    "="*80 + "\n"
+                    + f"CHUNK {idx + 1}\n"
+                    + "="*80 + "\n\n"
+                    + "="*40 + "\n"
+                    + f"File: {path}\n"
+                    + "="*40 + "\n"
+                    + "[Empty File]\n"
+                )
+                self._write_chunk(t.encode("utf-8"), idx)
+                idx += 1
+                continue
+
+            if path.endswith(".pdf"):
+                idx = self._improved_pdf_chunking(path, idx)
+            else:
+                lines = text.splitlines()
+                current_chunk_lines = []
+                current_size = 0
+                for line in lines:
+                    line_size = len(line.split())
+                    if current_size + line_size > self.max_chunk_size and current_chunk_lines:
+                        chunk_data = self._format_chunk_content(path, current_chunk_lines, idx)
+                        self._write_chunk(chunk_data.encode("utf-8"), idx)
+                        idx += 1
+                        current_chunk_lines = []
+                        current_size = 0
+                    if line.strip():
+                        current_chunk_lines.append(line)
+                        current_size += line_size
+                if current_chunk_lines:
+                    chunk_data = self._format_chunk_content(path, current_chunk_lines, idx)
+                    self._write_chunk(chunk_data.encode("utf-8"), idx)
+                    idx += 1
+
+    def _chunk_by_semantic(self):
+        chunk_index = 0
+        for (path, content_bytes, priority) in self.loaded_files:
+            text = self._get_text_content(path, content_bytes)
+            if not text and not path.endswith(".pdf"):
+                continue
+            if path.endswith(".py"):
+                chunk_index = self._chunk_python_file_ast(path, text, chunk_index)
+            else:
+                chunk_index = self._chunk_nonpython_file_by_size(path, text, chunk_index)
+
+    def _chunk_nonpython_file_by_size(self, path, text, chunk_index):
+        lines = text.splitlines()
+        if not lines:
+            t = (
+                "="*80 + "\n"
+                + f"CHUNK {chunk_index + 1}\n"
+                + "="*80 + "\n\n"
+                + "="*40 + "\n"
+                + f"File: {path}\n"
+                + "="*40 + "\n"
+                + "[Empty File]\n"
+            )
+            self._write_chunk(t.encode("utf-8"), chunk_index)
+            return chunk_index + 1
+
+        current_chunk_lines = []
+        current_size = 0
+        idx = chunk_index
+        for line in lines:
+            line_size = len(line.split())
+            if self.max_chunk_size and (current_size + line_size) > self.max_chunk_size and current_chunk_lines:
+                chunk_data = self._format_chunk_content(path, current_chunk_lines, idx)
+                self._write_chunk(chunk_data.encode("utf-8"), idx)
+                idx += 1
+                current_chunk_lines = []
+                current_size = 0
+            current_chunk_lines.append(line)
+            current_size += line_size
+
+        if current_chunk_lines:
+            chunk_data = self._format_chunk_content(path, current_chunk_lines, idx)
+            self._write_chunk(chunk_data.encode("utf-8"), idx)
+            idx += 1
+
+        return idx
+
+    def _format_chunk_content(self, path, lines, idx):
+        h = [
+            "="*80,
+            f"CHUNK {idx + 1}",
+            "="*80,
+            "",
+            "="*40,
+            f"File: {path}",
+            "="*40,
+            "",
+        ]
+        return "\n".join(h + lines) + "\n"
+
+    def _chunk_python_file_ast(self, path, text, chunk_index):
+        import ast
+        try:
+            tree = ast.parse(text, filename=path)
+        except SyntaxError:
+            # Unparseable source: emit the whole file as a single chunk.
+            chunk_data = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{text}"
+            self._write_chunk(chunk_data.encode("utf-8"), chunk_index)
+            return chunk_index + 1
+
+        lines = text.splitlines()
+
+        # Record the line span of every top-level function and class.
+        node_boundaries = []
+        for node in tree.body:
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                label = f"Function: {node.name}"
+            elif isinstance(node, ast.ClassDef):
+                label = f"Class: {node.name}"
+            else:
+                continue
+            start = node.lineno
+            end = getattr(node, 'end_lineno', start)
+            node_boundaries.append((start, end, label))
+
+        node_boundaries.sort(key=lambda x: x[0])
+
+        # Fill the gaps between definitions with "GLOBAL CODE" blocks.
+        expanded_blocks = []
+        prev_end = 1
+        for (start, end, label) in node_boundaries:
+            if start > prev_end:
+                expanded_blocks.append((prev_end, start - 1, "GLOBAL CODE"))
+            expanded_blocks.append((start, end, label))
+            prev_end = end + 1
+        if prev_end <= len(lines):
+            expanded_blocks.append((prev_end, len(lines), "GLOBAL CODE"))
+
+        code_blocks = []
+        for (start, end, label) in expanded_blocks:
+            snippet = lines[start - 1 : end]
+            block_text = f"{label} (lines {start}-{end})\n" + "\n".join(snippet)
+            code_blocks.append(block_text)
+
+        current_lines = []
+        current_count = 0
+
+        for block in code_blocks:
+            block_size = len(block.splitlines())
+
+            if not self.max_chunk_size:
+                current_lines.append(block)
+                current_count += block_size
+                continue
+
+            if block_size > self.max_chunk_size:
+                # A single oversized block is flushed as its own chunk.
+                if current_lines:
+                    chunk_data = "\n\n".join(current_lines)
+                    final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_data}"
+                    self._write_chunk(final_text.encode("utf-8"), chunk_index)
+                    chunk_index += 1
+                    current_lines = []
+                    current_count = 0
+
+                big_block_data = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{block}"
+                self._write_chunk(big_block_data.encode("utf-8"), chunk_index)
+                chunk_index += 1
+                continue
+
+            if current_count + block_size > self.max_chunk_size and current_lines:
+                chunk_data = "\n\n".join(current_lines)
+                final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_data}"
+                self._write_chunk(final_text.encode("utf-8"), chunk_index)
+                chunk_index += 1
+
+                current_lines = []
+                current_count = 0
+
+            current_lines.append(block)
+            current_count += block_size
+
+        if current_lines:
+            chunk_data = "\n\n".join(current_lines)
+            final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_data}"
+            self._write_chunk(final_text.encode("utf-8"), chunk_index)
+            chunk_index += 1
+
+        return chunk_index
+
+    def close(self):
+        pass
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return False
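
For context, the file above is the chunker added in this release. A minimal usage sketch, assuming ParallelChunker is importable from this module (the directory name, file name, and chunk sizes below are hypothetical, chosen only for illustration):

    # Hedged sketch: the exact import path is not shown in this diff.
    # Dry-run first: lists the files that would be chunked, in priority
    # order, without writing anything to output_dir.
    with ParallelChunker(equal_chunks=5, output_dir="chunks", dry_run=True) as chunker:
        chunker.process_directory("my_project")

    # Size-based mode, with a per-file override of the word budget.
    with ParallelChunker(max_chunk_size=800, output_dir="chunks") as chunker:
        chunker.process_file("paper.pdf", custom_chunk_size=400)

Note that the constructor requires exactly one of equal_chunks or max_chunk_size, and the with-statement form works because the class defines __enter__ and __exit__.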