skylos-1.0.10-py3-none-any.whl → skylos-1.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skylos might be problematic.
- skylos/__init__.py +1 -1
- skylos/analyzer.py +14 -2
- skylos/cli.py +24 -1
- skylos/visitor.py +76 -16
- {skylos-1.0.10.dist-info → skylos-1.0.11.dist-info}/METADATA +1 -1
- skylos-1.0.11.dist-info/RECORD +30 -0
- {skylos-1.0.10.dist-info → skylos-1.0.11.dist-info}/WHEEL +1 -1
- test/pykomodo/__init__.py +0 -0
- test/pykomodo/command_line.py +176 -0
- test/pykomodo/config.py +20 -0
- test/pykomodo/core.py +121 -0
- test/pykomodo/dashboard.py +608 -0
- test/pykomodo/enhanced_chunker.py +304 -0
- test/pykomodo/multi_dirs_chunker.py +783 -0
- test/pykomodo/pykomodo_config.py +68 -0
- test/pykomodo/token_chunker.py +470 -0
- skylos-1.0.10.dist-info/RECORD +0 -21
- {skylos-1.0.10.dist-info → skylos-1.0.11.dist-info}/entry_points.txt +0 -0
- {skylos-1.0.10.dist-info → skylos-1.0.11.dist-info}/top_level.txt +0 -0
test/pykomodo/pykomodo_config.py ADDED
@@ -0,0 +1,68 @@
+from pydantic import BaseModel, Field
+from typing import List, Optional
+
+from pykomodo.enhanced_chunker import EnhancedParallelChunker
+from pykomodo.multi_dirs_chunker import ParallelChunker
+
+class KomodoConfig(BaseModel):
+    """
+    A Pydantic model representing pykomodo settings.
+    """
+    directories: List[str] = Field(
+        default_factory=lambda: ["."],
+        description="Which directories to process."
+    )
+    equal_chunks: Optional[int] = Field(
+        default=None,
+        description="Number of equal chunks to produce (mutually exclusive with max_chunk_size)."
+    )
+    max_chunk_size: Optional[int] = Field(
+        default=None,
+        description="Max tokens/lines per chunk (mutually exclusive with equal_chunks)."
+    )
+    output_dir: str = Field(
+        default="chunks",
+        description="Where chunked files will be stored."
+    )
+    semantic_chunking: bool = Field(
+        default=False,
+        description="If True, chunk .py files at function/class boundaries."
+    )
+    enhanced: bool = Field(
+        default=False,
+        description="If True, use EnhancedParallelChunker for LLM-related features."
+    )
+    context_window: int = 4096
+    min_relevance_score: float = 0.3
+    remove_redundancy: bool = True
+    extract_metadata: bool = True
+
+def run_chunker_with_config(config: KomodoConfig):
+    """
+    Build and run a pykomodo chunker (Enhanced or basic) from config.
+    """
+    ChunkerClass = EnhancedParallelChunker if config.enhanced else ParallelChunker
+
+    chunker = ChunkerClass(
+        equal_chunks=config.equal_chunks,
+        max_chunk_size=config.max_chunk_size,
+        output_dir=config.output_dir,
+        semantic_chunking=config.semantic_chunking,
+        context_window=config.context_window if config.enhanced else None,
+        min_relevance_score=config.min_relevance_score if config.enhanced else None,
+        remove_redundancy=config.remove_redundancy if config.enhanced else None,
+        extract_metadata=config.extract_metadata if config.enhanced else None,
+    )
+
+    chunker.process_directories(config.directories)
+    chunker.close()
+
+if __name__ == "__main__":
+    my_config = KomodoConfig(
+        directories=["src/", "docs/"], # or wherever
+        equal_chunks=5,
+        output_dir="my_chunks",
+        semantic_chunking=True,
+        enhanced=True
+    )
+    run_chunker_with_config(my_config)
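
Because KomodoConfig is a Pydantic model, the same settings can also be loaded from an external file rather than hard-coded as in the __main__ block above. A minimal sketch, assuming the module is importable as pykomodo_config and that a komodo.json file (a hypothetical name) holds the fields defined by the model:

import json

from pykomodo_config import KomodoConfig, run_chunker_with_config  # assumed import path

# Read settings from a JSON file and validate them through the Pydantic model;
# values of the wrong type raise a pydantic ValidationError.
with open("komodo.json", "r", encoding="utf-8") as f:
    data = json.load(f)

config = KomodoConfig(**data)
run_chunker_with_config(config)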
test/pykomodo/token_chunker.py ADDED
@@ -0,0 +1,470 @@
+import os
+from typing import Optional, List, Tuple
+import ast
+import fitz
+import re
+
+try:
+    import tiktoken
+    TIKTOKEN_AVAILABLE = True
+except ImportError:
+    TIKTOKEN_AVAILABLE = False
+
+from pykomodo.multi_dirs_chunker import ParallelChunker
+
+class TokenBasedChunker(ParallelChunker):
+    def __init__(
+        self,
+        equal_chunks: Optional[int] = None,
+        max_tokens_per_chunk: Optional[int] = None,
+        output_dir: str = "chunks",
+        user_ignore: Optional[List[str]] = None,
+        user_unignore: Optional[List[str]] = None,
+        binary_extensions: Optional[List[str]] = None,
+        priority_rules: Optional[List[Tuple[str, int]]] = None,
+        num_threads: int = 4,
+        dry_run: bool = False,
+        semantic_chunking: bool = False,
+        file_type: Optional[str] = None,
+        encoding_name: str = "cl100k_base",
+        verbose: bool = False
+    ) -> None:
+        super().__init__(
+            equal_chunks=equal_chunks,
+            max_chunk_size=max_tokens_per_chunk,
+            output_dir=output_dir,
+            user_ignore=user_ignore,
+            user_unignore=user_unignore,
+            binary_extensions=binary_extensions,
+            priority_rules=priority_rules,
+            num_threads=num_threads,
+            dry_run=dry_run,
+            semantic_chunking=semantic_chunking,
+            file_type=file_type
+        )
+
+        self.max_tokens_per_chunk = max_tokens_per_chunk
+        self.encoding_name = encoding_name
+        self.verbose = verbose
+
+        self.encoding = None
+        if TIKTOKEN_AVAILABLE:
+            try:
+                self.encoding = tiktoken.get_encoding(encoding_name)
+                if verbose:
+                    print(f"Using {encoding_name} tokenizer", flush=True)
+            except Exception as e:
+                print(f"Error initializing tokenizer: {e}. Falling back to word-splitting.")
+        elif verbose:
+            print("tiktoken not available. Install with: pip install tiktoken")
+            print("Falling back to word-based token counting")
+
+    def count_tokens(self, text: str) -> int:
+        if self.encoding:
+            return len(self.encoding.encode(text))
+        else:
+            return len(text.split())
+
+    def _chunk_by_equal_parts(self) -> None:
+        if not self.loaded_files:
+            return
+
+        if self.verbose:
+            print(f"Creating {self.equal_chunks} equal chunks based on token counts")
+
+        chunks = [[] for _ in range(self.equal_chunks)]
+        chunk_token_counts = [0] * self.equal_chunks
+
+        text_blocks = []
+        for path, content_bytes, priority in self.loaded_files:
+            if path.endswith(".pdf"):
+                try:
+                    doc = fitz.open(path)
+                    text = ""
+                    for page in doc:
+                        text += page.get_text("text")
+                    token_count = self.count_tokens(text)
+                    text_blocks.append((path, text, token_count))
+                except Exception as e:
+                    if self.verbose:
+                        print(f"Error extracting text from PDF {path}: {e}")
+            else:
+                try:
+                    text = content_bytes.decode("utf-8", errors="replace")
+                    text = self._filter_api_keys(text)
+                    token_count = self.count_tokens(text)
+                    text_blocks.append((path, text, token_count))
+                except Exception as e:
+                    if self.verbose:
+                        print(f"Error processing {path}: {e}")
+
+        text_blocks.sort(key=lambda x: -x[2])
+
+        for path, text, tokens in text_blocks:
+            min_idx = chunk_token_counts.index(min(chunk_token_counts))
+            chunks[min_idx].append((path, text))
+            chunk_token_counts[min_idx] += tokens
+
+        for i, chunk_files in enumerate(chunks):
+            if not chunk_files:
+                continue
+
+            chunk_text = f"{'=' * 80}\nCHUNK {i + 1} OF {self.equal_chunks}\n{'=' * 80}\n\n"
+
+            for path, text in chunk_files:
+                chunk_text += f"{'=' * 40}\nFile: {path}\n{'=' * 40}\n{text}\n\n"
+
+            chunk_path = os.path.join(self.output_dir, f"chunk-{i}.txt")
+            with open(chunk_path, "w", encoding="utf-8") as f:
+                f.write(chunk_text)
+
+            if self.verbose:
+                print(f"Created chunk {i+1} with approximately {chunk_token_counts[i]} tokens")
+
+    def _chunk_by_size(self) -> None:
+        if not self.loaded_files:
+            return
+
+        if self.verbose:
+            print(f"Creating chunks with maximum {self.max_tokens_per_chunk} tokens per chunk")
+
+        chunk_index = 0
+
+        for path, content_bytes, _ in self.loaded_files:
+            if path.endswith(".pdf"):
+                chunk_index = self._chunk_pdf_file(path, chunk_index)
+                continue
+
+            if self.semantic_chunking and path.endswith(".py"):
+                text = content_bytes.decode("utf-8", errors="replace")
+                text = self._filter_api_keys(text)
+                chunk_index = self._chunk_python_file_semantic(path, text, chunk_index)
+                continue
+
+            try:
+                text = content_bytes.decode("utf-8", errors="replace")
+                text = self._filter_api_keys(text)
+            except Exception as e:
+                if self.verbose:
+                    print(f"Error decoding {path}: {e}")
+                continue
+
+            lines = text.splitlines()
+            current_chunk_lines = []
+            current_tokens = 0
+
+            for line in lines:
+                line_tokens = self.count_tokens(line)
+
+                if current_tokens + line_tokens > self.max_tokens_per_chunk and current_chunk_lines:
+                    chunk_text = f"{'=' * 80}\nCHUNK {chunk_index + 1}\n{'=' * 80}\n\n"
+                    chunk_text += f"{'=' * 40}\nFile: {path}\n{'=' * 40}\n"
+                    chunk_text += "\n".join(current_chunk_lines) + "\n"
+
+                    chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                    with open(chunk_path, "w", encoding="utf-8") as f:
+                        f.write(chunk_text)
+
+                    chunk_index += 1
+                    current_chunk_lines = []
+                    current_tokens = 0
+
+                if line_tokens > self.max_tokens_per_chunk:
+                    if self.verbose:
+                        print(f"Warning: Line in {path} exceeds token limit ({line_tokens} tokens)")
+
+                    if current_chunk_lines:
+                        chunk_text = f"{'=' * 80}\nCHUNK {chunk_index + 1}\n{'=' * 80}\n\n"
+                        chunk_text += f"{'=' * 40}\nFile: {path}\n{'=' * 40}\n"
+                        chunk_text += "\n".join(current_chunk_lines) + "\n"
+
+                        chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                        with open(chunk_path, "w", encoding="utf-8") as f:
+                            f.write(chunk_text)
+
+                        chunk_index += 1
+                        current_chunk_lines = []
+                        current_tokens = 0
+
+                    words = line.split()
+                    word_chunks = []
+                    current_word_chunk = []
+                    current_word_tokens = 0
+
+                    for word in words:
+                        word_tokens = self.count_tokens(word + ' ')
+                        if current_word_tokens + word_tokens > self.max_tokens_per_chunk:
+                            word_chunks.append(' '.join(current_word_chunk))
+                            current_word_chunk = [word]
+                            current_word_tokens = word_tokens
+                        else:
+                            current_word_chunk.append(word)
+                            current_word_tokens += word_tokens
+
+                    if current_word_chunk:
+                        word_chunks.append(' '.join(current_word_chunk))
+
+                    for i, word_chunk in enumerate(word_chunks):
+                        chunk_text = f"{'=' * 80}\nCHUNK {chunk_index + 1}\n{'=' * 80}\n\n"
+                        chunk_text += f"{'=' * 40}\nFile: {path}\n{'=' * 40}\n"
+                        chunk_text += f"[Long line part {i+1}/{len(word_chunks)}]\n{word_chunk}\n"
+
+                        chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                        with open(chunk_path, "w", encoding="utf-8") as f:
+                            f.write(chunk_text)
+
+                        chunk_index += 1
+
+                    continue
+
+                if line.strip():
+                    current_chunk_lines.append(line)
+                    current_tokens += line_tokens
+
+            if current_chunk_lines:
+                chunk_text = f"{'=' * 80}\nCHUNK {chunk_index + 1}\n{'=' * 80}\n\n"
+                chunk_text += f"{'=' * 40}\nFile: {path}\n{'=' * 40}\n"
+                chunk_text += "\n".join(current_chunk_lines) + "\n"
+
+                chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                with open(chunk_path, "w", encoding="utf-8") as f:
+                    f.write(chunk_text)
+
+                chunk_index += 1
+
+    def _chunk_python_file_semantic(self, path: str, text: str, chunk_index: int) -> int:
+        try:
+            tree = ast.parse(text, filename=path)
+        except SyntaxError:
+            if self.verbose:
+                print(f"Syntax error in {path}, falling back to token-based chunking")
+
+            lines = text.splitlines()
+            current_chunk_lines = []
+            current_tokens = 0
+
+            for line in lines:
+                line_tokens = self.count_tokens(line)
+
+                if current_tokens + line_tokens > self.max_tokens_per_chunk and current_chunk_lines:
+                    chunk_text = f"{'=' * 80}\nCHUNK {chunk_index + 1}\n{'=' * 80}\n\n"
+                    chunk_text += f"{'=' * 40}\nFile: {path}\n{'=' * 40}\n"
+                    chunk_text += "\n".join(current_chunk_lines) + "\n"
+
+                    chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                    with open(chunk_path, "w", encoding="utf-8") as f:
+                        f.write(chunk_text)
+
+                    chunk_index += 1
+                    current_chunk_lines = []
+                    current_tokens = 0
+
+                current_chunk_lines.append(line)
+                current_tokens += line_tokens
+
+            if current_chunk_lines:
+                chunk_text = f"{'=' * 80}\nCHUNK {chunk_index + 1}\n{'=' * 80}\n\n"
+                chunk_text += f"{'=' * 40}\nFile: {path}\n{'=' * 40}\n"
+                chunk_text += "\n".join(current_chunk_lines) + "\n"
+
+                chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                with open(chunk_path, "w", encoding="utf-8") as f:
+                    f.write(chunk_text)
+
+                chunk_index += 1
+
+            return chunk_index
+
+        nodes = []
+        for node in tree.body:
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+                start_line = node.lineno
+                end_line = getattr(node, 'end_lineno', start_line)
+
+                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                    node_type = "Function"
+                    name = node.name
+                else:
+                    node_type = "Class"
+                    name = node.name
+
+                nodes.append({
+                    'type': node_type,
+                    'name': name,
+                    'start': start_line,
+                    'end': end_line
+                })
+
+        nodes.sort(key=lambda x: x['start'])
+
+        lines = text.splitlines()
+        line_ranges = []
+        prev_end = 0
+
+        for node in nodes:
+            if node['start'] > prev_end + 1:
+                line_ranges.append({
+                    'type': 'Global',
+                    'name': 'Code',
+                    'start': prev_end + 1,
+                    'end': node['start'] - 1
+                })
+            line_ranges.append(node)
+            prev_end = node['end']
+
+        if prev_end < len(lines):
+            line_ranges.append({
+                'type': 'Global',
+                'name': 'Code',
+                'start': prev_end + 1,
+                'end': len(lines)
+            })
+
+        current_chunk_blocks = []
+        current_tokens = 0
+
+        for block in line_ranges:
+            block_lines = lines[block['start']-1:block['end']]
+            block_text = f"{block['type']}: {block['name']} (lines {block['start']}-{block['end']})\n"
+            block_text += "\n".join(block_lines)
+
+            block_tokens = self.count_tokens(block_text)
+
+            if block_tokens > self.max_tokens_per_chunk:
+                if self.verbose:
+                    print(f"Warning: {block['type']} {block['name']} in {path} exceeds token limit ({block_tokens} tokens)")
+
+                if current_chunk_blocks:
+                    chunk_text = f"{'=' * 80}\nFILE: {path}\n{'=' * 80}\n\n"
+                    chunk_text += "\n\n".join(current_chunk_blocks)
+
+                    chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                    with open(chunk_path, "w", encoding="utf-8") as f:
+                        f.write(chunk_text)
+
+                    chunk_index += 1
+                    current_chunk_blocks = []
+                    current_tokens = 0
+
+                chunk_text = f"{'=' * 80}\nFILE: {path}\n{'=' * 80}\n\n{block_text}"
+
+                chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                with open(chunk_path, "w", encoding="utf-8") as f:
+                    f.write(chunk_text)
+
+                chunk_index += 1
+                continue
+
+            if current_tokens + block_tokens > self.max_tokens_per_chunk and current_chunk_blocks:
+                chunk_text = f"{'=' * 80}\nFILE: {path}\n{'=' * 80}\n\n"
+                chunk_text += "\n\n".join(current_chunk_blocks)
+
+                chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                with open(chunk_path, "w", encoding="utf-8") as f:
+                    f.write(chunk_text)
+
+                chunk_index += 1
+                current_chunk_blocks = []
+                current_tokens = 0
+
+            current_chunk_blocks.append(block_text)
+            current_tokens += block_tokens
+
+        if current_chunk_blocks:
+            chunk_text = f"{'=' * 80}\nFILE: {path}\n{'=' * 80}\n\n"
+            chunk_text += "\n\n".join(current_chunk_blocks)
+
+            chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+            with open(chunk_path, "w", encoding="utf-8") as f:
+                f.write(chunk_text)
+
+            chunk_index += 1
+
+        return chunk_index
+
+    def _chunk_pdf_file(self, path: str, chunk_index: int) -> int:
+        try:
+            doc = fitz.open(path)
+
+            all_paragraphs = []
+
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text("text")
+
+                try:
+                    html = page.get_text("html")
+                    if "<p>" in html:
+                        paragraphs = re.findall(r'<p>(.*?)</p>', html, re.DOTALL)
+                        clean_paras = []
+
+                        for p in paragraphs:
+                            clean_p = re.sub(r'<.*?>', ' ', p)
+                            clean_p = re.sub(r'&[a-zA-Z]+;', ' ', clean_p)
+                            clean_p = re.sub(r'\s+', ' ', clean_p).strip()
+
+                            if clean_p:
+                                clean_paras.append(clean_p)
+
+                        all_paragraphs.append(f"--- Page {page_num + 1} ---")
+                        all_paragraphs.extend(clean_paras)
+                        continue
+                except Exception:
+                    pass
+
+                page_paragraphs = text.split("\n\n")
+
+                all_paragraphs.append(f"--- Page {page_num + 1} ---")
+
+                for para in page_paragraphs:
+                    if para.strip():
+                        all_paragraphs.append(para.strip())
+
+            current_chunk_paras = []
+            current_tokens = 0
+
+            for para in all_paragraphs:
+                para_tokens = self.count_tokens(para)
+
+                if para_tokens == 0:
+                    continue
+
+                if current_tokens + para_tokens > self.max_tokens_per_chunk and current_chunk_paras:
+                    chunk_text = f"{'=' * 80}\nFILE: {path}\n{'=' * 80}\n\n"
+                    chunk_text += "\n\n".join(current_chunk_paras)
+
+                    chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                    with open(chunk_path, "w", encoding="utf-8") as f:
+                        f.write(chunk_text)
+
+                    chunk_index += 1
+                    current_chunk_paras = []
+                    current_tokens = 0
+
+                current_chunk_paras.append(para)
+                current_tokens += para_tokens
+
+            if current_chunk_paras:
+                chunk_text = f"{'=' * 80}\nFILE: {path}\n{'=' * 80}\n\n"
+                chunk_text += "\n\n".join(current_chunk_paras)
+
+                chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+                with open(chunk_path, "w", encoding="utf-8") as f:
+                    f.write(chunk_text)
+
+                chunk_index += 1
+
+            return chunk_index
+
+        except Exception as e:
+            if self.verbose:
+                print(f"Error processing PDF {path}: {e}")
+
+            chunk_text = f"{'=' * 80}\nFILE: {path}\n{'=' * 80}\n\n"
+            chunk_text += f"[Error processing PDF: {str(e)}]\n"
+
+            chunk_path = os.path.join(self.output_dir, f"chunk-{chunk_index}.txt")
+            with open(chunk_path, "w", encoding="utf-8") as f:
+                f.write(chunk_text)
+
+            return chunk_index + 1
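
For reference, a minimal usage sketch of the new TokenBasedChunker, assuming the upstream pykomodo package exposes this module as pykomodo.token_chunker (the file itself imports from pykomodo.multi_dirs_chunker); the directory and output names below are placeholders, and process_directories()/close() are the inherited ParallelChunker calls used in pykomodo_config.py above:

from pykomodo.token_chunker import TokenBasedChunker  # assumed import path

# Split everything under src/ into chunks of at most ~1000 tokens each,
# counted with tiktoken's cl100k_base encoding when available,
# otherwise by whitespace word count.
chunker = TokenBasedChunker(
    max_tokens_per_chunk=1000,
    output_dir="token_chunks",
    verbose=True,
)
chunker.process_directories(["src/"])
chunker.close()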
skylos-1.0.10.dist-info/RECORD DELETED
@@ -1,21 +0,0 @@
-skylos/__init__.py,sha256=U8NngtBsmaUGNLstSCYzDjfSCT1KyFOKExYRX7VmZfw,152
-skylos/analyzer.py,sha256=1joG0Ek3jNj7qv7w3gFiNeb2BD8CXgX5PF6fVc_HIG4,9710
-skylos/cli.py,sha256=l-qfaC0RUH2L9YgjlMOvlQrCPD5hcV3McHIk1az-CI4,13525
-skylos/visitor.py,sha256=uHNHKf7Kf8Qg1sIa-PsH2NHCQD6R9Bd_NELs-41deE8,9339
-test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/compare_tools.py,sha256=0g9PDeJlbst-7hOaQzrL4MiJFQKpqM8q8VeBGzpPczg,22738
-test/diagnostics.py,sha256=ExuFOCVpc9BDwNYapU96vj9RXLqxji32Sv6wVF4nJYU,13802
-test/test_skylos.py,sha256=kz77STrS4k3Eez5RDYwGxOg2WH3e7zNZPUYEaTLbGTs,15608
-test/test_visitor.py,sha256=bxUY_Zn_gLadZlz_n3Mu6rhVcExqElISwwVBo4eqVAY,7337
-test/sample_repo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/sample_repo/app.py,sha256=M5XgoAn-LPz50mKAj_ZacRKf-Pg7I4HbjWP7Z9jE4a0,226
-test/sample_repo/sample_repo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/sample_repo/sample_repo/commands.py,sha256=b6gQ9YDabt2yyfqGbOpLo0osF7wya8O4Lm7m8gtCr3g,2575
-test/sample_repo/sample_repo/models.py,sha256=xXIg3pToEZwKuUCmKX2vTlCF_VeFA0yZlvlBVPIy5Qw,3320
-test/sample_repo/sample_repo/routes.py,sha256=8yITrt55BwS01G7nWdESdx8LuxmReqop1zrGUKPeLi8,2475
-test/sample_repo/sample_repo/utils.py,sha256=S56hEYh8wkzwsD260MvQcmUFOkw2EjFU27nMLFE6G2k,1103
-skylos-1.0.10.dist-info/METADATA,sha256=iVIhmRsXWo0WPlMjYVrZpsi2mUVdEbKYrXDvOW4hbIk,225
-skylos-1.0.10.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
-skylos-1.0.10.dist-info/entry_points.txt,sha256=zzRpN2ByznlQoLeuLolS_TFNYSQxUGBL1EXQsAd6bIA,43
-skylos-1.0.10.dist-info/top_level.txt,sha256=f8GA_7KwfaEopPMP8-EXDQXaqd4IbsOQPakZy01LkdQ,12
-skylos-1.0.10.dist-info/RECORD,,
{skylos-1.0.10.dist-info → skylos-1.0.11.dist-info}/entry_points.txt: File without changes
{skylos-1.0.10.dist-info → skylos-1.0.11.dist-info}/top_level.txt: File without changes