skylos 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Potentially problematic release: this version of skylos might be problematic.
- skylos/__init__.py +1 -1
- skylos/analyzer.py +89 -17
- skylos/cli.py +24 -1
- skylos/visitor.py +76 -16
- {skylos-1.0.9.dist-info → skylos-1.0.11.dist-info}/METADATA +1 -1
- skylos-1.0.11.dist-info/RECORD +30 -0
- {skylos-1.0.9.dist-info → skylos-1.0.11.dist-info}/WHEEL +1 -1
- test/pykomodo/__init__.py +0 -0
- test/pykomodo/command_line.py +176 -0
- test/pykomodo/config.py +20 -0
- test/pykomodo/core.py +121 -0
- test/pykomodo/dashboard.py +608 -0
- test/pykomodo/enhanced_chunker.py +304 -0
- test/pykomodo/multi_dirs_chunker.py +783 -0
- test/pykomodo/pykomodo_config.py +68 -0
- test/pykomodo/token_chunker.py +470 -0
- skylos-1.0.9.dist-info/RECORD +0 -21
- {skylos-1.0.9.dist-info → skylos-1.0.11.dist-info}/entry_points.txt +0 -0
- {skylos-1.0.9.dist-info → skylos-1.0.11.dist-info}/top_level.txt +0 -0
test/pykomodo/multi_dirs_chunker.py
@@ -0,0 +1,783 @@
+import os
+import fnmatch
+import re
+import concurrent.futures
+from typing import Optional, List, Tuple
+import fitz
+
+BUILTIN_IGNORES = [
+    "**/.git/**",
+    "**/.svn/**",
+    "**/.hg/**",
+    "**/.idea/**",
+    "**/.vscode/**",
+    "**/__pycache__/**",
+    "**/*.pyc",
+    "**/*.pyo",
+    "**/.pytest_cache/**",
+    "**/.coverage",
+    "**/.tox/**",
+    "**/.eggs/**",
+    "**/Cython/Debugger/**",
+    "**/*.egg-info/**",
+    "**/build/**",
+    "**/dist/**",
+    "**/venv/**",
+    "**/.venv/**",
+    "**/env/**",
+    "**/ENV/**",
+    "**/virtualenv/**",
+    "**/site-packages/**",
+    "**/pip/**",
+    "**/.DS_Store",
+    "**/Thumbs.db",
+    "**/node_modules/**",
+    "**/*.env",
+    "**/.env",
+    "**/*.png",
+    "**/*.jpg",
+    "**/*.jpeg",
+    "**/*.gif",
+    "**/*.webp",
+    "**/*.bmp",
+    "**/*.mp3",
+    "**/*.mp4",
+    "**/tmp/**",
+    "**/temp/**",
+    "**/var/folders/**",
+    "**/test/data/**",
+    "**/tests/data/**",
+    "**/test_data/**",
+    "**/tests_data/**",
+    "__pycache__",
+    "*.pyc",
+    "*.pyo",
+    "target",
+    "venv"
+]
+
+class PriorityRule:
+    def __init__(self, pattern, score):
+        self.pattern = pattern
+        self.score = score
+
+class ParallelChunker:
+    DIR_IGNORE_NAMES = [
+        "venv",
+        ".venv",
+        "env",
+        "node_modules",
+        ".git",
+        ".svn",
+        ".hg",
+        "__pycache__",
+        ".pytest_cache",
+        ".tox",
+        ".eggs",
+        "build",
+        "dist"
+    ]
+    def __init__(
+        self,
+        equal_chunks: Optional[int] = None,
+        max_chunk_size: Optional[int] = None,
+        output_dir: str = "chunks",
+        user_ignore: Optional[List[str]] = None,
+        user_unignore: Optional[List[str]] = None,
+        binary_extensions: Optional[List[str]] = None,
+        priority_rules: Optional[List[Tuple[str,int]]] = None,
+        num_threads: int = 4,
+        dry_run: bool = False,
+        semantic_chunking: bool = False,
+        file_type: Optional[str] = None,
+        verbose: bool = False
+    ) -> None:
+        if equal_chunks is not None and max_chunk_size is not None:
+            raise ValueError("Cannot specify both equal_chunks and max_chunk_size")
+        if equal_chunks is None and max_chunk_size is None:
+            raise ValueError("Must specify either equal_chunks or max_chunk_size")
+        self.dir_ignore_names = self.DIR_IGNORE_NAMES
+        self.equal_chunks = equal_chunks
+        self.max_chunk_size = max_chunk_size
+        self.output_dir = output_dir
+        self.num_threads = num_threads
+        self.dry_run = dry_run
+        self.semantic_chunking = semantic_chunking
+        self.file_type = file_type.lower() if file_type else None
+        self.verbose = verbose
+
+        if user_ignore is None:
+            user_ignore = []
+        if user_unignore is None:
+            user_unignore = []
+
+        self.ignore_patterns = BUILTIN_IGNORES[:]
+        self.ignore_patterns.extend(user_ignore)
+        self.unignore_patterns = list(user_unignore)
+        if not any("site-packages" in pattern or "venv" in pattern for pattern in user_unignore or []):
+            self.unignore_patterns.append("*.py")
+
+        if binary_extensions is None:
+            binary_extensions = ["exe", "dll", "so"]
+        self.binary_exts = set(ext.lower() for ext in binary_extensions)
+
+        self.priority_rules = []
+        if priority_rules:
+            for rule_data in priority_rules:
+                if isinstance(rule_data, PriorityRule):
+                    self.priority_rules.append(rule_data)
+                else:
+                    pat, score = rule_data
+                    self.priority_rules.append(PriorityRule(pat, score))
+
+        self.loaded_files = []
+        self.current_walk_root = None
+
+    def _get_text_content(self, path, content_bytes):
+        if path.endswith(".pdf"):
+            try:
+                doc = fitz.open(path)
+                text = ""
+                for page in doc:
+                    text += page.get_text("text")
+                return text
+            except Exception as e:
+                print(f"Error extracting text from PDF {path}: {e}")
+                return ""
+        else:
+            text = content_bytes.decode("utf-8", errors="replace")
+            text = self._filter_api_keys(text)
+            return text
+
+    def is_absolute_pattern(self, pattern):
+        if pattern.startswith("/"):
+            return True
+        if re.match(r"^[a-zA-Z]:\\", pattern):
+            return True
+        return False
+
+    def _contains_api_key(self, line: str) -> bool:
+        pattern = r'[\'"].*[a-zA-Z0-9_-]{20,}.*[\'"]'
+        return bool(re.search(pattern, line))
+
+    def _filter_api_keys(self, text: str) -> str:
+        lines = text.splitlines()
+        filtered_lines = []
+        for line in lines:
+            contains_key = self._contains_api_key(line)
+            if contains_key:
+                filtered_lines.append("[API_KEY_REDACTED]")
+            else:
+                filtered_lines.append(line)
+        result = "\n".join(filtered_lines)
+        return result
+
+    def _match_segments(self, path_segs, pattern_segs, pi=0, pj=0):
+        if pj == len(pattern_segs):
+            return pi == len(path_segs)
+        if pi == len(path_segs):
+            return all(seg == '**' for seg in pattern_segs[pj:])
+        seg_pat = pattern_segs[pj]
+        if seg_pat == "**":
+            if self._match_segments(path_segs, pattern_segs, pi, pj + 1):
+                return True
+            return self._match_segments(path_segs, pattern_segs, pi + 1, pj)
+        if fnmatch.fnmatch(path_segs[pi], seg_pat):
+            return self._match_segments(path_segs, pattern_segs, pi + 1, pj + 1)
+        return False
+
+    def _double_star_fnmatch(self, path, pattern):
+        path = path.replace("\\", "/")
+        pattern = pattern.replace("\\", "/")
+        return self._match_segments(path.split("/"), pattern.split("/"))
+
+    def _matches_pattern(self, abs_path, rel_path, pattern):
+        target = abs_path if self.is_absolute_pattern(pattern) else rel_path
+
+        if "**" in pattern:
+            if self._double_star_fnmatch(target, pattern):
+                return True
+        else:
+            if fnmatch.fnmatch(target, pattern):
+                return True
+        if not self.is_absolute_pattern(pattern) and "/" not in pattern:
+            if fnmatch.fnmatch(os.path.basename(abs_path), pattern):
+                return True
+        return False
+
+    def _read_ignore_file(self, directory):
+        """Read .pykomodo-ignore file in the given directory and add patterns to ignore_patterns."""
+        for filename in ['.pykomodo-ignore', '.gitignore']:
+            ignore_file_path = os.path.join(directory, filename)
+            if os.path.exists(ignore_file_path):
+                try:
+                    with open(ignore_file_path, 'r') as f:
+                        for line in f:
+                            line = line.strip()
+                            if line and not line.startswith('#'):
+                                if filename == '.gitignore' and '**' not in line:
+                                    if not line.startswith('/'):
+                                        line = f"**/{line}"
+                                    if line.endswith('/'):
+                                        line = f"{line}**"
+                                self.ignore_patterns.append(line)
+                except Exception as e:
+                    print(f"Error reading {filename} file: {e}")
+
+    def should_ignore_file(self, path):
+        abs_path = os.path.abspath(path)
+        root = self.current_walk_root or os.path.dirname(abs_path)
+        rel_path = os.path.relpath(abs_path, start=root).replace("\\", "/")
+        for pat in self.ignore_patterns:
+            if self._matches_pattern(abs_path, rel_path, pat):
+                for unignore_pat in self.unignore_patterns:
+                    if self._matches_pattern(abs_path, rel_path, unignore_pat):
+                        return False
+                return True
+
+        return False
+
+    def is_binary_file(self, path):
+        ext = path.split(".")[-1].lower()
+        if ext in {"py", "pdf"}:
+            return False
+        if ext in self.binary_exts:
+            return True
+        try:
+            with open(path, "rb") as f:
+                chunk = f.read(8192)
+                if b"\0" in chunk:
+                    return True
+        except OSError:
+            return True
+        return False
+
+    def _collect_paths(self, dir_list):
+        collected = []
+        for directory in dir_list:
+            self.current_walk_root = os.path.abspath(directory)
+            for root, dirs, files in os.walk(directory):
+                dirs[:] = [d for d in dirs if d not in self.dir_ignore_names]
+                for filename in files:
+                    full_path = os.path.join(root, filename)
+                    if self.file_type:
+                        _, ext = os.path.splitext(full_path)
+                        if ext.lower() != f".{self.file_type}":
+                            continue
+                    if os.path.commonprefix([os.path.abspath(self.output_dir), os.path.abspath(full_path)]) == os.path.abspath(self.output_dir):
+                        continue
+                    if self.should_ignore_file(full_path):
+                        continue
+                    collected.append(full_path)
+        return collected
+
+    def _load_file_data(self, path):
+        try:
+            with open(path, "rb") as f:
+                content = f.read()
+            return path, content, self.calculate_priority(path)
+        except:
+            return path, None, 0
+
+    def calculate_priority(self, path):
+        highest = 0
+        basename = os.path.basename(path)
+        for rule in self.priority_rules:
+            if fnmatch.fnmatch(basename, rule.pattern):
+                highest = max(highest, rule.score)
+        return highest
+
+    def process_directories(self, dirs: List[str]) -> None:
+        for directory in dirs:
+            self._read_ignore_file(directory)
+        all_paths = self._collect_paths(dirs)
+        self.loaded_files.clear()
+        if self.dry_run:
+            print("[DRY-RUN] The following files would be processed (in priority order):")
+            tmp_loaded = []
+            for p in all_paths:
+                priority = self.calculate_priority(p)
+                tmp_loaded.append((p, priority))
+            tmp_loaded.sort(key=lambda x: -x[1])
+            for path, pr in tmp_loaded:
+                print(f" - {path} (priority={pr})")
+            return
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.num_threads) as ex:
+            future_map = {ex.submit(self._load_file_data, p): p for p in all_paths}
+            for fut in concurrent.futures.as_completed(future_map):
+                path, content, priority = fut.result()
+                if content is not None and not self.is_binary_file(path):
+                    self.loaded_files.append((path, content, priority))
+        self.loaded_files.sort(key=lambda x: (-x[2], x[0]))
+        self._process_chunks()
+
+    def process_file(self, file_path: str, custom_chunk_size: Optional[int] = None, force_process: bool = False) -> None:
+        """
+        Process a single file and create chunks from it.
+
+        Args:
+            file_path: Path to the file to process
+            custom_chunk_size: Optional custom chunk size for this specific file, overriding the global setting
+            force_process: If True, process the file even if it would normally be ignored
+        """
+        if not os.path.isfile(file_path):
+            raise ValueError(f"File not found: {file_path}")
+
+        if self.should_ignore_file(file_path) and not force_process and not self.dry_run:
+            print(f"Skipping ignored file: {file_path}")
+            return
+
+        if self.dry_run:
+            priority = self.calculate_priority(file_path)
+            print(f"[DRY-RUN] Would process file: {file_path} (priority={priority})")
+            return
+
+        if self.is_binary_file(file_path) and not file_path.endswith(".pdf") and not force_process:
+            print(f"Skipping binary file: {file_path}")
+            return
+
+        path, content, priority = self._load_file_data(file_path)
+        if content is None:
+            print(f"Error loading file: {file_path}")
+            return
+
+        self.loaded_files = [(path, content, priority)]
+
+        original_max_chunk_size = None
+        if custom_chunk_size is not None and not self.equal_chunks:
+            original_max_chunk_size = self.max_chunk_size
+            self.max_chunk_size = custom_chunk_size
+
+        try:
+            self._process_chunks()
+        finally:
+            if original_max_chunk_size is not None:
+                self.max_chunk_size = original_max_chunk_size
+
+    def process_directory(self, directory):
+        self.process_directories([directory])
+
+    def _split_tokens(self, content_bytes):
+        try:
+            return content_bytes.decode("utf-8", errors="replace").split()
+        except:
+            return []
+
+    def _write_chunk(self, content_bytes, chunk_num):
+        os.makedirs(self.output_dir, exist_ok=True)
+        p = os.path.join(self.output_dir, f"chunk-{chunk_num}.txt")
+        try:
+            with open(p, "wb") as f:
+                f.write(content_bytes)
+        except:
+            pass
+
+    def _improved_pdf_chunking(self, path, idx):
+        """
+        Process a PDF file with improved text formatting for academic papers.
+        Uses multiple extraction methods to get the best text representation.
+
+        Args:
+            path: Path to the PDF file
+            idx: Starting chunk index
+
+        Returns:
+            Updated chunk index
+        """
+        try:
+            doc = fitz.open(path)
+
+            all_pages_content = []
+
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+
+                text_as_text = page.get_text("text")
+                text_as_html = page.get_text("html")
+                text_as_dict = page.get_text("dict")
+
+                if "<p>" in text_as_html:
+                    import re
+                    paragraphs = re.findall(r'<p>(.*?)</p>', text_as_html, re.DOTALL)
+                    processed_text = []
+
+                    for p in paragraphs:
+                        clean_p = re.sub(r'<.*?>', ' ', p)
+                        clean_p = re.sub(r'&[a-zA-Z]+;', ' ', clean_p)
+                        clean_p = re.sub(r'\s+', ' ', clean_p).strip()
+                        if clean_p:
+                            processed_text.append(clean_p)
+
+                    page_text = "\n\n".join(processed_text)
+
+                elif len(text_as_dict.get("blocks", [])) > 0:
+                    blocks = sorted(text_as_dict["blocks"], key=lambda b: b["bbox"][1])
+                    processed_text = []
+
+                    for block in blocks:
+                        if "lines" not in block:
+                            continue
+
+                        block_lines = []
+                        for line in block["lines"]:
+                            if "spans" not in line:
+                                continue
+
+                            line_text = " ".join(span["text"] for span in line["spans"] if "text" in span)
+                            if line_text.strip():
+                                block_lines.append(line_text)
+
+                        if block_lines:
+                            processed_text.append(" ".join(block_lines))
+
+                    page_text = "\n\n".join(processed_text)
+
+                else:
+                    lines = text_as_text.split('\n')
+                    paragraphs = []
+                    current_paragraph = []
+
+                    for line in lines:
+                        line = line.strip()
+                        words = line.split()
+                        if len(words) <= 2 and not line.endswith('.') and not line.endswith(':'):
+                            current_paragraph.append(line)
+                        else:
+                            if current_paragraph:
+                                paragraphs.append(" ".join(current_paragraph))
+                                current_paragraph = []
+                            if line:
+                                paragraphs.append(line)
+
+                    if current_paragraph:
+                        paragraphs.append(" ".join(current_paragraph))
+
+                    page_text = "\n\n".join(paragraphs)
+
+                page_content = f"--- Page {page_num + 1} ---\n\n{page_text}"
+                all_pages_content.append(page_content)
+
+            full_document = "\n\n".join(all_pages_content)
+
+            paragraphs = full_document.split("\n\n")
+            current_chunk = []
+            current_size = 0
+
+            for paragraph in paragraphs:
+                if not paragraph.strip():
+                    continue
+
+                para_size = len(paragraph.split())
+
+                if current_size + para_size > self.max_chunk_size and current_chunk:
+                    chunk_text = "\n\n".join(current_chunk)
+                    final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_text}"
+                    self._write_chunk(final_text.encode("utf-8"), idx)
+                    idx += 1
+                    current_chunk = []
+                    current_size = 0
+
+                current_chunk.append(paragraph)
+                current_size += para_size
+
+            if current_chunk:
+                chunk_text = "\n\n".join(current_chunk)
+                final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_text}"
+                self._write_chunk(final_text.encode("utf-8"), idx)
+                idx += 1
+
+            return idx
+
+        except Exception as e:
+            print(f"Error processing PDF {path}: {e}")
+            t = (
+                "="*80 + "\n"
+                + f"CHUNK {idx + 1}\n"
+                + "="*80 + "\n\n"
+                + "="*40 + "\n"
+                + f"File: {path}\n"
+                + "="*40 + "\n"
+                + f"[Error processing PDF: {str(e)}]\n"
+            )
+            self._write_chunk(t.encode("utf-8"), idx)
+            return idx + 1
+
+    def _process_chunks(self):
+        if not self.loaded_files:
+            return
+        if self.semantic_chunking:
+            self._chunk_by_semantic()
+        elif self.equal_chunks:
+            self._chunk_by_equal_parts()
+        else:
+            self._chunk_by_size()
+
+    def _extract_pdf_paragraphs(self, path):
+        try:
+            doc = fitz.open(path)
+            paragraphs = []
+            for page in doc:
+                text = page.get_text("text")
+                page_paras = text.split("\n\n")
+                paragraphs.extend([para.strip() for para in page_paras if para.strip()])
+            return paragraphs
+        except Exception as e:
+            print(f"Error extracting paragraphs from PDF {path}: {e}")
+            return []
+
+    def _chunk_by_equal_parts(self) -> None:
+        text_blocks = []
+        for (path, content_bytes, _) in self.loaded_files:
+            if path.endswith(".pdf"):
+                paragraphs = self._extract_pdf_paragraphs(path)
+                for para in paragraphs:
+                    s = len(para.split())
+                    if s > 0:
+                        text_blocks.append((path, para, s))
+            else:
+                text = self._get_text_content(path, content_bytes)
+                if text:
+                    s = len(text.split())
+                    text_blocks.append((path, text, s))
+        if not text_blocks:
+            return
+        n_chunks = self.equal_chunks
+        text_blocks.sort(key=lambda x: -x[2])
+        chunk_contents = [[] for _ in range(n_chunks)]
+        chunk_sizes = [0] * n_chunks
+        for block in text_blocks:
+            min_idx = 0
+            min_size = chunk_sizes[0]
+            for i in range(1, n_chunks):
+                if chunk_sizes[i] < min_size:
+                    min_size = chunk_sizes[i]
+                    min_idx = i
+            chunk_contents[min_idx].append(block)
+            chunk_sizes[min_idx] += block[2]
+        for i, chunk in enumerate(chunk_contents):
+            if chunk:
+                self._write_equal_chunk([(path, text) for path, text, _ in chunk], i)
+
+    def _write_equal_chunk(self, chunk_data, chunk_num):
+        txt = "="*80 + "\n" + f"CHUNK {chunk_num + 1} OF {self.equal_chunks}\n" + "="*80 + "\n\n"
+        for path, text in chunk_data:
+            txt += "="*40 + "\n" + f"File: {path}\n" + "="*40 + "\n" + text + "\n"
+        self._write_chunk(txt.encode("utf-8"), chunk_num)
+
+    def _chunk_by_size(self):
+        idx = 0
+        for (path, content_bytes, _) in self.loaded_files:
+            text = self._get_text_content(path, content_bytes)
+            if not text:
+                t = (
+                    "="*80 + "\n"
+                    + f"CHUNK {idx + 1}\n"
+                    + "="*80 + "\n\n"
+                    + "="*40 + "\n"
+                    + f"File: {path}\n"
+                    + "="*40 + "\n"
+                    + "[Empty File]\n"
+                )
+                self._write_chunk(t.encode("utf-8"), idx)
+                idx += 1
+                continue
+
+            if path.endswith(".pdf"):
+                idx = self._improved_pdf_chunking(path, idx)
+            else:
+                lines = text.splitlines()
+                current_chunk_lines = []
+                current_size = 0
+                for line in lines:
+                    line_size = len(line.split())
+                    if current_size + line_size > self.max_chunk_size and current_chunk_lines:
+                        h = [
+                            "="*80,
+                            f"CHUNK {idx + 1}",
+                            "="*80,
+                            "",
+                            "="*40,
+                            f"File: {path}",
+                            "="*40,
+                            ""
+                        ]
+                        chunk_data = "\n".join(h + current_chunk_lines) + "\n"
+                        self._write_chunk(chunk_data.encode("utf-8"), idx)
+                        idx += 1
+                        current_chunk_lines = []
+                        current_size = 0
+                    if line.strip():
+                        current_chunk_lines.append(line)
+                        current_size += line_size
+                if current_chunk_lines:
+                    h = [
+                        "="*80,
+                        f"CHUNK {idx + 1}",
+                        "="*80,
+                        "",
+                        "="*40,
+                        f"File: {path}",
+                        "="*40,
+                        ""
+                    ]
+                    chunk_data = "\n".join(h + current_chunk_lines) + "\n"
+                    self._write_chunk(chunk_data.encode("utf-8"), idx)
+                    idx += 1
+
+    def _chunk_by_semantic(self):
+        chunk_index = 0
+        for (path, content_bytes, priority) in self.loaded_files:
+            text = self._get_text_content(path, content_bytes)
+            if not text and not path.endswith(".pdf"):
+                continue
+            if path.endswith(".py"):
+                chunk_index = self._chunk_python_file_ast(path, text, chunk_index)
+            else:
+                chunk_index = self._chunk_nonpython_file_by_size(path, text, chunk_index)
+
+    def _chunk_nonpython_file_by_size(self, path, text, chunk_index):
+        lines = text.splitlines()
+        if not lines:
+            t = (
+                "="*80 + "\n"
+                + f"CHUNK {chunk_index + 1}\n"
+                + "="*80 + "\n\n"
+                + "="*40 + "\n"
+                + f"File: {path}\n"
+                + "="*40 + "\n"
+                + "[Empty File]\n"
+            )
+            self._write_chunk(t.encode("utf-8"), chunk_index)
+            return chunk_index + 1
+
+        current_chunk_lines = []
+        current_size = 0
+        idx = chunk_index
+        for line in lines:
+            line_size = len(line.split())
+            if self.max_chunk_size and (current_size + line_size) > self.max_chunk_size and current_chunk_lines:
+                chunk_data = self._format_chunk_content(path, current_chunk_lines, idx)
+                self._write_chunk(chunk_data.encode("utf-8"), idx)
+                idx += 1
+                current_chunk_lines = []
+                current_size = 0
+            current_chunk_lines.append(line)
+            current_size += line_size
+
+        if current_chunk_lines:
+            chunk_data = self._format_chunk_content(path, current_chunk_lines, idx)
+            self._write_chunk(chunk_data.encode("utf-8"), idx)
+            idx += 1
+
+        return idx
+
+    def _format_chunk_content(self, path, lines, idx):
+        h = [
+            "="*80,
+            f"CHUNK {idx + 1}",
+            "="*80,
+            "",
+            "="*40,
+            f"File: {path}",
+            "="*40,
+            ""
+        ]
+        return "\n".join(h + lines) + "\n"
+
+    def _chunk_python_file_ast(self, path, text, chunk_index):
+        import ast
+        try:
+            tree = ast.parse(text, filename=path)
+        except SyntaxError:
+            chunk_data = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{text}"
+            self._write_chunk(chunk_data.encode("utf-8"), chunk_index)
+            return chunk_index + 1
+
+        lines = text.splitlines()
+
+        node_boundaries = []
+        for node in tree.body:
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                node_type = "Function"
+                label = f"{node_type}: {node.name}"
+            elif isinstance(node, ast.ClassDef):
+                label = f"Class: {node.name}"
+            else:
+                continue
+            start = node.lineno
+            end = getattr(node, 'end_lineno', start)
+            node_boundaries.append((start, end, label))
+
+        node_boundaries.sort(key=lambda x: x[0])
+
+        expanded_blocks = []
+        prev_end = 1
+        for (start, end, label) in node_boundaries:
+            if start > prev_end:
+                expanded_blocks.append((prev_end, start - 1, "GLOBAL CODE"))
+            expanded_blocks.append((start, end, label))
+            prev_end = end + 1
+        if prev_end <= len(lines):
+            expanded_blocks.append((prev_end, len(lines), "GLOBAL CODE"))
+
+        code_blocks = []
+        for (start, end, label) in expanded_blocks:
+            snippet = lines[start - 1 : end]
+            block_text = f"{label} (lines {start}-{end})\n" + "\n".join(snippet)
+            code_blocks.append(block_text)
+
+        current_lines = []
+        current_count = 0
+
+        for block in code_blocks:
+            block_size = len(block.splitlines())
+
+            if not self.max_chunk_size:
+                current_lines.append(block)
+                current_count += block_size
+                continue
+
+            if block_size > self.max_chunk_size:
+                if current_lines:
+                    chunk_data = "\n\n".join(current_lines)
+                    final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_data}"
+                    self._write_chunk(final_text.encode("utf-8"), chunk_index)
+                    chunk_index += 1
+                    current_lines = []
+                    current_count = 0
+
+                big_block_data = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{block}"
+                self._write_chunk(big_block_data.encode("utf-8"), chunk_index)
+                chunk_index += 1
+                continue
+
+            if current_count + block_size > self.max_chunk_size and current_lines:
+                chunk_data = "\n\n".join(current_lines)
+                final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_data}"
+                self._write_chunk(final_text.encode("utf-8"), chunk_index)
+                chunk_index += 1
+
+                current_lines = []
+                current_count = 0
+
+            current_lines.append(block)
+            current_count += block_size
+
+        if current_lines:
+            chunk_data = "\n\n".join(current_lines)
+            final_text = f"{'='*80}\nFILE: {path}\n{'='*80}\n\n{chunk_data}"
+            self._write_chunk(final_text.encode("utf-8"), chunk_index)
+            chunk_index += 1
+
+        return chunk_index
+
+    def close(self):
+        pass
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return False
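
For reference, a minimal usage sketch of the ParallelChunker class added above. The import path and the directory name are assumptions based on the file's location inside this wheel, and PyMuPDF must be installed because the module imports fitz at import time; this is an illustration, not part of the package diff.

# Hypothetical usage sketch; import path assumed from this wheel's layout
# (test/pykomodo/multi_dirs_chunker.py), PyMuPDF ("fitz") required.
from test.pykomodo.multi_dirs_chunker import ParallelChunker

# The constructor accepts exactly one of equal_chunks / max_chunk_size
# (it raises ValueError otherwise); max_chunk_size is counted in
# whitespace-separated words per chunk.
with ParallelChunker(max_chunk_size=500, output_dir="chunks") as chunker:
    chunker.process_directory("my_project")  # hypothetical source tree
# Chunks are written to chunks/chunk-0.txt, chunks/chunk-1.txt, ...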