skylos 1.0.10__tar.gz → 1.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skylos might be problematic. Click here for more details.
- {skylos-1.0.10 → skylos-1.0.11}/PKG-INFO +1 -1
- {skylos-1.0.10 → skylos-1.0.11}/README.md +1 -1
- {skylos-1.0.10 → skylos-1.0.11}/pyproject.toml +1 -1
- {skylos-1.0.10 → skylos-1.0.11}/setup.py +1 -1
- {skylos-1.0.10 → skylos-1.0.11}/skylos/__init__.py +1 -1
- {skylos-1.0.10 → skylos-1.0.11}/skylos/analyzer.py +14 -2
- {skylos-1.0.10 → skylos-1.0.11}/skylos/cli.py +24 -1
- {skylos-1.0.10 → skylos-1.0.11}/skylos/visitor.py +76 -16
- {skylos-1.0.10 → skylos-1.0.11}/skylos.egg-info/PKG-INFO +1 -1
- {skylos-1.0.10 → skylos-1.0.11}/skylos.egg-info/SOURCES.txt +9 -0
- skylos-1.0.11/test/pykomodo/command_line.py +176 -0
- skylos-1.0.11/test/pykomodo/config.py +20 -0
- skylos-1.0.11/test/pykomodo/core.py +121 -0
- skylos-1.0.11/test/pykomodo/dashboard.py +608 -0
- skylos-1.0.11/test/pykomodo/enhanced_chunker.py +304 -0
- skylos-1.0.11/test/pykomodo/multi_dirs_chunker.py +783 -0
- skylos-1.0.11/test/pykomodo/pykomodo_config.py +68 -0
- skylos-1.0.11/test/pykomodo/token_chunker.py +470 -0
- skylos-1.0.11/test/sample_repo/sample_repo/__init__.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/setup.cfg +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/skylos.egg-info/dependency_links.txt +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/skylos.egg-info/entry_points.txt +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/skylos.egg-info/requires.txt +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/skylos.egg-info/top_level.txt +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/__init__.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/compare_tools.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/diagnostics.py +0 -0
- {skylos-1.0.10/test/sample_repo → skylos-1.0.11/test/pykomodo}/__init__.py +0 -0
- {skylos-1.0.10/test/sample_repo → skylos-1.0.11/test}/sample_repo/__init__.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/sample_repo/app.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/sample_repo/sample_repo/commands.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/sample_repo/sample_repo/models.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/sample_repo/sample_repo/routes.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/sample_repo/sample_repo/utils.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/test_skylos.py +0 -0
- {skylos-1.0.10 → skylos-1.0.11}/test/test_visitor.py +0 -0
|
@@ -212,7 +212,7 @@ We welcome contributions! Please read our [Contributing Guidelines](CONTRIBUTING
|
|
|
212
212
|
5. Open a Pull Request
|
|
213
213
|
|
|
214
214
|
## Roadmap
|
|
215
|
-
|
|
215
|
+
- [ ] Add a production flag to include dead code that is used in tests but not in actual execution
|
|
216
216
|
- [ ] Expand our test cases
|
|
217
217
|
- [ ] Configuration file support
|
|
218
218
|
- [ ] Custom analysis rules
|
|
@@ -97,6 +97,12 @@ class Skylos:
|
|
|
97
97
|
if d.simple_name in MAGIC_METHODS or (d.simple_name.startswith("__") and d.simple_name.endswith("__")):
|
|
98
98
|
d.confidence = 0
|
|
99
99
|
|
|
100
|
+
if d.type == "parameter" and d.simple_name in ("self", "cls"):
|
|
101
|
+
d.confidence = 0
|
|
102
|
+
|
|
103
|
+
if d.type != "parameter" and (d.simple_name in MAGIC_METHODS or (d.simple_name.startswith("__") and d.simple_name.endswith("__"))):
|
|
104
|
+
d.confidence = 0
|
|
105
|
+
|
|
100
106
|
if not d.simple_name.startswith("_") and d.type in ("function", "method", "class"):
|
|
101
107
|
d.confidence = min(d.confidence, 90)
|
|
102
108
|
|
|
@@ -106,6 +112,9 @@ class Skylos:
|
|
|
106
112
|
if d.name.split(".")[0] in self.dynamic:
|
|
107
113
|
d.confidence = min(d.confidence, 60)
|
|
108
114
|
|
|
115
|
+
if d.type == "variable" and d.simple_name == "_":
|
|
116
|
+
d.confidence = 0
|
|
117
|
+
|
|
109
118
|
if d.type == "method" and TEST_METHOD_PATTERN.match(d.simple_name):
|
|
110
119
|
class_name = d.name.rsplit(".", 1)[0]
|
|
111
120
|
class_simple_name = class_name.split(".")[-1]
|
|
@@ -134,7 +143,7 @@ class Skylos:
|
|
|
134
143
|
self._mark_refs()
|
|
135
144
|
self._apply_heuristics()
|
|
136
145
|
self._mark_exports()
|
|
137
|
-
|
|
146
|
+
|
|
138
147
|
thr = max(0, thr)
|
|
139
148
|
|
|
140
149
|
unused = []
|
|
@@ -146,7 +155,8 @@ class Skylos:
|
|
|
146
155
|
"unused_functions": [],
|
|
147
156
|
"unused_imports": [],
|
|
148
157
|
"unused_classes": [],
|
|
149
|
-
"unused_variables": []
|
|
158
|
+
"unused_variables": [],
|
|
159
|
+
"unused_parameters": []
|
|
150
160
|
}
|
|
151
161
|
|
|
152
162
|
for u in unused:
|
|
@@ -158,6 +168,8 @@ class Skylos:
|
|
|
158
168
|
result["unused_classes"].append(u)
|
|
159
169
|
elif u["type"] == "variable":
|
|
160
170
|
result["unused_variables"].append(u)
|
|
171
|
+
elif u["type"] == "parameter":
|
|
172
|
+
result["unused_parameters"].append(u)
|
|
161
173
|
|
|
162
174
|
return json.dumps(result, indent=2)
|
|
163
175
|
|
|
@@ -247,6 +247,8 @@ def main() -> None:
|
|
|
247
247
|
|
|
248
248
|
unused_functions = result.get("unused_functions", [])
|
|
249
249
|
unused_imports = result.get("unused_imports", [])
|
|
250
|
+
unused_parameters = result.get("unused_parameters", [])
|
|
251
|
+
unused_variables = result.get("unused_variables", [])
|
|
250
252
|
|
|
251
253
|
logger.info(f"{Colors.CYAN}{Colors.BOLD}🔍 Python Static Analysis Results{Colors.RESET}")
|
|
252
254
|
logger.info(f"{Colors.CYAN}{'=' * 35}{Colors.RESET}")
|
|
@@ -254,7 +256,10 @@ def main() -> None:
|
|
|
254
256
|
logger.info(f"\n{Colors.BOLD}Summary:{Colors.RESET}")
|
|
255
257
|
logger.info(f" • Unreachable functions: {Colors.YELLOW}{len(unused_functions)}{Colors.RESET}")
|
|
256
258
|
logger.info(f" • Unused imports: {Colors.YELLOW}{len(unused_imports)}{Colors.RESET}")
|
|
257
|
-
|
|
259
|
+
logger.info(f" • Unused parameters: {Colors.YELLOW}{len(unused_parameters)}{Colors.RESET}")
|
|
260
|
+
logger.info(f" • Unused variables: {Colors.YELLOW}{len(unused_variables)}{Colors.RESET}")
|
|
261
|
+
|
|
262
|
+
|
|
258
263
|
if args.interactive and (unused_functions or unused_imports):
|
|
259
264
|
logger.info(f"\n{Colors.BOLD}Interactive Mode:{Colors.RESET}")
|
|
260
265
|
selected_functions, selected_imports = interactive_selection(logger, unused_functions, unused_imports)
|
|
@@ -324,6 +329,24 @@ def main() -> None:
|
|
|
324
329
|
else:
|
|
325
330
|
logger.info(f"\n{Colors.GREEN}✓ All imports are being used!{Colors.RESET}")
|
|
326
331
|
|
|
332
|
+
if unused_parameters:
|
|
333
|
+
logger.info(f"\n{Colors.BLUE}{Colors.BOLD}🔧 Unused Parameters{Colors.RESET}")
|
|
334
|
+
logger.info(f"{Colors.BLUE}{'=' * 18}{Colors.RESET}")
|
|
335
|
+
for i, item in enumerate(unused_parameters, 1):
|
|
336
|
+
logger.info(f"{Colors.GRAY}{i:2d}. {Colors.RESET}{Colors.BLUE}{item['name']}{Colors.RESET}")
|
|
337
|
+
logger.info(f" {Colors.GRAY}└─ {item['file']}:{item['line']}{Colors.RESET}")
|
|
338
|
+
else:
|
|
339
|
+
logger.info(f"\n{Colors.GREEN}✓ All parameters are being used!{Colors.RESET}")
|
|
340
|
+
|
|
341
|
+
if unused_variables:
|
|
342
|
+
logger.info(f"\n{Colors.YELLOW}{Colors.BOLD}📊 Unused Variables{Colors.RESET}")
|
|
343
|
+
logger.info(f"{Colors.YELLOW}{'=' * 18}{Colors.RESET}")
|
|
344
|
+
for i, item in enumerate(unused_variables, 1):
|
|
345
|
+
logger.info(f"{Colors.GRAY}{i:2d}. {Colors.RESET}{Colors.YELLOW}{item['name']}{Colors.RESET}")
|
|
346
|
+
logger.info(f" {Colors.GRAY}└─ {item['file']}:{item['line']}{Colors.RESET}")
|
|
347
|
+
else:
|
|
348
|
+
logger.info(f"\n{Colors.GREEN}✓ All variables are being used!{Colors.RESET}")
|
|
349
|
+
|
|
327
350
|
dead_code_count = len(unused_functions) + len(unused_imports)
|
|
328
351
|
print_badge(dead_code_count, logger)
|
|
329
352
|
|
|
@@ -52,6 +52,7 @@ class Visitor(ast.NodeVisitor):
|
|
|
52
52
|
self.dyn=set()
|
|
53
53
|
self.exports=set()
|
|
54
54
|
self.current_function_scope = []
|
|
55
|
+
self.current_function_params = []
|
|
55
56
|
|
|
56
57
|
def add_def(self,n,t,l):
|
|
57
58
|
if n not in{d.name for d in self.defs}:self.defs.append(Definition(n,t,self.file,l))
|
|
@@ -85,17 +86,27 @@ class Visitor(ast.NodeVisitor):
|
|
|
85
86
|
self.alias[a.asname or a.name.split(".")[-1]]=full
|
|
86
87
|
self.add_def(full,"import",node.lineno)
|
|
87
88
|
|
|
88
|
-
def visit_ImportFrom(self,node):
|
|
89
|
-
if node.module is None:
|
|
89
|
+
def visit_ImportFrom(self, node):
|
|
90
|
+
if node.module is None:
|
|
91
|
+
return
|
|
90
92
|
for a in node.names:
|
|
91
|
-
if a.name=="*":
|
|
92
|
-
|
|
93
|
+
if a.name == "*":
|
|
94
|
+
continue
|
|
95
|
+
base = node.module
|
|
93
96
|
if node.level:
|
|
94
|
-
parts=self.mod.split(".")
|
|
95
|
-
base=".".join(parts[:-node.level])+(f".{node.module}"if node.module else"")
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
97
|
+
parts = self.mod.split(".")
|
|
98
|
+
base = ".".join(parts[:-node.level]) + (f".{node.module}" if node.module else "")
|
|
99
|
+
|
|
100
|
+
full = f"{base}.{a.name}"
|
|
101
|
+
|
|
102
|
+
if a.asname:
|
|
103
|
+
alias_full = f"{self.mod}.{a.asname}" if self.mod else a.asname
|
|
104
|
+
self.add_def(alias_full, "import", node.lineno)
|
|
105
|
+
self.alias[a.asname] = full
|
|
106
|
+
self.add_ref(full)
|
|
107
|
+
else:
|
|
108
|
+
self.alias[a.name] = full
|
|
109
|
+
self.add_def(full, "import", node.lineno)
|
|
99
110
|
|
|
100
111
|
def visit_arguments(self, args):
|
|
101
112
|
for arg in args.args:
|
|
@@ -128,15 +139,25 @@ class Visitor(ast.NodeVisitor):
|
|
|
128
139
|
|
|
129
140
|
self.current_function_scope.append(node.name)
|
|
130
141
|
|
|
142
|
+
old_params = self.current_function_params
|
|
143
|
+
self.current_function_params = []
|
|
144
|
+
|
|
131
145
|
for d_node in node.decorator_list:
|
|
132
146
|
self.visit(d_node)
|
|
133
147
|
|
|
148
|
+
for arg in node.args.args:
|
|
149
|
+
param_name = f"{qualified_name}.{arg.arg}"
|
|
150
|
+
self.add_def(param_name, "parameter", node.lineno)
|
|
151
|
+
self.current_function_params.append((arg.arg, param_name))
|
|
152
|
+
|
|
134
153
|
self.visit_arguments(node.args)
|
|
135
154
|
self.visit_annotation(node.returns)
|
|
136
155
|
|
|
137
156
|
for stmt in node.body:
|
|
138
157
|
self.visit(stmt)
|
|
158
|
+
|
|
139
159
|
self.current_function_scope.pop()
|
|
160
|
+
self.current_function_params = old_params
|
|
140
161
|
|
|
141
162
|
visit_AsyncFunctionDef=visit_FunctionDef
|
|
142
163
|
|
|
@@ -178,6 +199,30 @@ class Visitor(ast.NodeVisitor):
|
|
|
178
199
|
self.visit(node.step)
|
|
179
200
|
|
|
180
201
|
def visit_Assign(self, node):
|
|
202
|
+
def process_target_for_def(target_node):
|
|
203
|
+
if isinstance(target_node, ast.Name):
|
|
204
|
+
var_name_simple = target_node.id
|
|
205
|
+
if var_name_simple == "__all__" and not self.current_function_scope and not self.cls:
|
|
206
|
+
return
|
|
207
|
+
|
|
208
|
+
scope_parts = [self.mod]
|
|
209
|
+
if self.cls:
|
|
210
|
+
scope_parts.append(self.cls)
|
|
211
|
+
if self.current_function_scope:
|
|
212
|
+
scope_parts.extend(self.current_function_scope)
|
|
213
|
+
|
|
214
|
+
prefix = '.'.join(filter(None, scope_parts))
|
|
215
|
+
qualified_var_name = f"{prefix}.{var_name_simple}" if prefix else var_name_simple
|
|
216
|
+
|
|
217
|
+
self.add_def(qualified_var_name, "variable", target_node.lineno)
|
|
218
|
+
|
|
219
|
+
elif isinstance(target_node, (ast.Tuple, ast.List)):
|
|
220
|
+
for elt in target_node.elts:
|
|
221
|
+
process_target_for_def(elt)
|
|
222
|
+
|
|
223
|
+
for t in node.targets:
|
|
224
|
+
process_target_for_def(t)
|
|
225
|
+
|
|
181
226
|
for target in node.targets:
|
|
182
227
|
if isinstance(target, ast.Name) and target.id == "__all__":
|
|
183
228
|
if isinstance(node.value, (ast.List, ast.Tuple)):
|
|
@@ -189,9 +234,10 @@ class Visitor(ast.NodeVisitor):
|
|
|
189
234
|
value = elt.s
|
|
190
235
|
|
|
191
236
|
if value is not None:
|
|
192
|
-
|
|
193
|
-
self.add_ref(
|
|
194
|
-
self.add_ref(value)
|
|
237
|
+
full_name_export = f"{self.mod}.{value}" if self.mod else value
|
|
238
|
+
self.add_ref(full_name_export)
|
|
239
|
+
self.add_ref(value)
|
|
240
|
+
|
|
195
241
|
self.generic_visit(node)
|
|
196
242
|
|
|
197
243
|
def visit_Call(self, node):
|
|
@@ -219,12 +265,26 @@ class Visitor(ast.NodeVisitor):
|
|
|
219
265
|
|
|
220
266
|
def visit_Name(self,node):
|
|
221
267
|
if isinstance(node.ctx,ast.Load):
|
|
222
|
-
self.
|
|
223
|
-
|
|
268
|
+
for param_name, param_full_name in self.current_function_params:
|
|
269
|
+
if node.id == param_name:
|
|
270
|
+
self.add_ref(param_full_name)
|
|
271
|
+
break
|
|
272
|
+
else:
|
|
273
|
+
# not parameter, handle normally
|
|
274
|
+
self.add_ref(self.qual(node.id))
|
|
275
|
+
if node.id in DYNAMIC_PATTERNS:
|
|
276
|
+
self.dyn.add(self.mod.split(".")[0])
|
|
224
277
|
|
|
225
|
-
def visit_Attribute(self,node):
|
|
278
|
+
def visit_Attribute(self, node):
|
|
226
279
|
self.generic_visit(node)
|
|
227
|
-
if isinstance(node.ctx,ast.Load)and isinstance(node.value,ast.Name):
|
|
280
|
+
if isinstance(node.ctx, ast.Load) and isinstance(node.value, ast.Name):
|
|
281
|
+
if node.value.id in [param_name for param_name, _ in self.current_function_params]:
|
|
282
|
+
# mark parameter as referenced
|
|
283
|
+
for param_name, param_full_name in self.current_function_params:
|
|
284
|
+
if node.value.id == param_name:
|
|
285
|
+
self.add_ref(param_full_name)
|
|
286
|
+
break
|
|
287
|
+
|
|
228
288
|
self.add_ref(f"{self.qual(node.value.id)}.{node.attr}")
|
|
229
289
|
|
|
230
290
|
def visit_keyword(self, node):
|
|
@@ -16,6 +16,15 @@ test/compare_tools.py
|
|
|
16
16
|
test/diagnostics.py
|
|
17
17
|
test/test_skylos.py
|
|
18
18
|
test/test_visitor.py
|
|
19
|
+
test/pykomodo/__init__.py
|
|
20
|
+
test/pykomodo/command_line.py
|
|
21
|
+
test/pykomodo/config.py
|
|
22
|
+
test/pykomodo/core.py
|
|
23
|
+
test/pykomodo/dashboard.py
|
|
24
|
+
test/pykomodo/enhanced_chunker.py
|
|
25
|
+
test/pykomodo/multi_dirs_chunker.py
|
|
26
|
+
test/pykomodo/pykomodo_config.py
|
|
27
|
+
test/pykomodo/token_chunker.py
|
|
19
28
|
test/sample_repo/__init__.py
|
|
20
29
|
test/sample_repo/app.py
|
|
21
30
|
test/sample_repo/sample_repo/__init__.py
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import argparse
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
KOMODO_VERSION = "0.2.5"
|
|
6
|
+
|
|
7
|
+
def launch_dashboard():
    """Launch the dashboard interface.

    The dashboard factory is imported lazily from ``pykomodo.dashboard`` so
    that a missing dependency is reported at call time. The object the
    factory returns is started via its ``launch`` method on host
    ``0.0.0.0``, port ``7860``.

    Any failure is reported on stderr and terminates the process with exit
    status 1.
    """
    try:
        # Aliased on import so the factory does not shadow this function's
        # own name inside its body.
        from pykomodo.dashboard import launch_dashboard as build_dashboard

        print("Starting Komodo Dashboard...")
        demo = build_dashboard()
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,
        )
    except ImportError as e:
        print(f"[Error] Dashboard dependencies not available: {e}", file=sys.stderr)
        print("Please install gradio: pip install gradio", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        print(f"[Error] Failed to launch dashboard: {e}", file=sys.stderr)
        sys.exit(1)
|
|
26
|
+
|
|
27
|
+
def _build_parser():
    """Create the argument parser that declares every komodo CLI option."""
    parser = argparse.ArgumentParser(
        description="Process and chunk codebase content with advanced chunking strategies."
    )

    parser.add_argument("--version", action="version", version=f"komodo {KOMODO_VERSION}")

    parser.add_argument("--dashboard", action="store_true",
                        help="Launch the web-based dashboard interface")

    parser.add_argument("dirs", nargs="*", default=["."],
                        help="Directories to process (default: current directory)")

    # Not marked required here: main() enforces it only when --dashboard is absent.
    chunk_group = parser.add_mutually_exclusive_group(required=False)
    chunk_group.add_argument("--equal-chunks", type=int,
                             help="Split into N equal chunks")
    chunk_group.add_argument("--max-chunk-size", type=int,
                             help="Maximum tokens/lines per chunk")
    chunk_group.add_argument("--max-tokens", type=int,
                             help="Maximum tokens per chunk (token-based chunking)")

    parser.add_argument("--output-dir", default="chunks",
                        help="Output directory for chunks (default: chunks)")

    parser.add_argument("--ignore", action="append", default=[],
                        help="Repeatable. Each usage adds one ignore pattern. Example: --ignore '**/node_modules/**' --ignore 'venv'")
    parser.add_argument("--unignore", action="append", default=[],
                        help="Repeatable. Each usage adds one unignore pattern. Example: --unignore '*.md'")

    parser.add_argument("--dry-run", action="store_true",
                        help="Show which files would be processed, but do not generate any chunks.")

    parser.add_argument("--priority", action="append", default=[],
                        help="Priority rules in format 'pattern,score' (repeatable). Example: --priority '*.py,10' --priority 'file2.txt,20'")

    parser.add_argument("--num-threads", type=int, default=4,
                        help="Number of processing threads (default: 4)")

    parser.add_argument("--enhanced", action="store_true",
                        help="Enable LLM optimizations")

    parser.add_argument("--semantic-chunks", action="store_true",
                        help="Use AST-based chunking for .py files (splits by top-level functions/classes)")

    parser.add_argument("--context-window", type=int, default=4096,
                        help="Target LLM context window size (default: 4096)")
    parser.add_argument("--min-relevance", type=float, default=0.3,
                        help="Minimum relevance score 0.0-1.0 (default: 0.3)")
    parser.add_argument("--no-metadata", action="store_true",
                        help="Disable metadata extraction")
    parser.add_argument("--keep-redundant", action="store_true",
                        help="Keep redundant content")
    parser.add_argument("--no-summaries", action="store_true",
                        help="Disable summary generation")

    parser.add_argument("--file-type", type=str,
                        help="Only chunk files of this type (e.g., 'pdf', 'py')")

    parser.add_argument("--verbose", action="store_true",
                        help="Enable verbose output")

    return parser


def _parse_priority_rules(raw_rules):
    """Turn 'pattern,score' strings into (pattern, int score) tuples.

    Empty entries are skipped. A malformed entry (no comma, or a score that
    is not an integer) is reported on stderr and exits with status 1.
    """
    rules = []
    for rule in raw_rules:
        if not rule:
            continue
        try:
            # Split on the first comma only, so the score part keeps any later commas
            # (and then fails int() below, which is reported as a malformed rule).
            pattern, score = rule.split(",", 1)
            rules.append((pattern.strip(), int(score.strip())))
        except ValueError:
            print(f"[Error] Priority rule must be 'pattern,score': {rule}",
                  file=sys.stderr)
            sys.exit(1)
    return rules


def _select_chunker(args, priority_rules):
    """Pick the chunker class for the parsed options and build its kwargs.

    Returns a ``(chunker_class, kwargs)`` pair. The chunker modules are
    imported lazily so only the selected implementation has to be importable.
    """
    # Options accepted by every chunker variant.
    shared = {
        "output_dir": args.output_dir,
        "user_ignore": args.ignore,
        "user_unignore": args.unignore,
        "priority_rules": priority_rules,
        "num_threads": args.num_threads,
        "dry_run": args.dry_run,
        "semantic_chunking": args.semantic_chunks,
        "file_type": args.file_type,
    }

    if args.max_tokens:
        try:
            from pykomodo.token_chunker import TokenBasedChunker
        except ImportError:
            print("[Error] TokenBasedChunker not available. Please install tiktoken or update pykomodo.",
                  file=sys.stderr)
            sys.exit(1)
        if args.verbose:
            print("Using TokenBasedChunker for token-based chunking")
        return TokenBasedChunker, {
            "max_tokens_per_chunk": args.max_tokens,
            **shared,
            "verbose": args.verbose,
        }

    chunker_args = {
        "equal_chunks": args.equal_chunks,
        "max_chunk_size": args.max_chunk_size,
        **shared,
    }

    if args.enhanced:
        from pykomodo.enhanced_chunker import EnhancedParallelChunker

        # The --no-* / --keep-* flags are negative on the command line but
        # positive as constructor options, hence the inversions.
        chunker_args.update({
            "extract_metadata": not args.no_metadata,
            "add_summaries": not args.no_summaries,
            "remove_redundancy": not args.keep_redundant,
            "context_window": args.context_window,
            "min_relevance_score": args.min_relevance,
        })
        return EnhancedParallelChunker, chunker_args

    from pykomodo.multi_dirs_chunker import ParallelChunker

    return ParallelChunker, chunker_args


def main(argv=None):
    """Main entry point for the komodo CLI.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None`` the arguments are taken from ``sys.argv``,
            which is what a plain ``main()`` call does.

    Exits with status 1 on a malformed priority rule, a missing token
    chunker, or any processing failure; argparse itself exits on usage
    errors.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.dashboard:
        launch_dashboard()
        return

    if not any([args.equal_chunks, args.max_chunk_size, args.max_tokens]):
        parser.error("One of --equal-chunks, --max-chunk-size, or --max-tokens is required (unless using --dashboard)")

    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    priority_rules = _parse_priority_rules(args.priority)

    chunker = None
    try:
        chunker_class, chunker_args = _select_chunker(args, priority_rules)
        chunker = chunker_class(**chunker_args)
        chunker.process_directories(args.dirs)
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        print(f"[Error] Processing failed: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # Identity check rather than truthiness, so a chunker object that
        # happens to evaluate as falsy is still closed.
        if chunker is not None and hasattr(chunker, "close"):
            chunker.close()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# src/config.py
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
@dataclass
class PriorityRule:
    """Pairs a pattern string with an integer score."""

    pattern: str
    score: int


@dataclass
class KomodoConfig:
    """Settings container in which every field has a default value."""

    # 10 * 1024 * 1024 == 10485760; the unit is not stated here
    # (presumably bytes - confirm against the consumer of this field).
    max_size: int = 10 * 1024 * 1024
    token_mode: bool = False
    output_dir: Optional[Path] = None
    stream: bool = False
    # The three list fields default to None, not to an empty list, so they
    # are annotated Optional; readers must handle the None case.
    ignore_patterns: Optional[list[str]] = None
    priority_rules: Optional[list[PriorityRule]] = None
    binary_extensions: Optional[list[str]] = None
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import fnmatch
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
|
|
5
|
+
class PriorityRule:
    """
    Simple Python container for (pattern, score).
    """

    def __init__(self, pattern: str, score: int) -> None:
        self.pattern: str = pattern
        self.score: int = score

    def __repr__(self) -> str:
        # Show both fields so logged or printed rules are self-describing.
        return f"{type(self).__name__}(pattern={self.pattern!r}, score={self.score!r})"
|
|
12
|
+
|
|
13
|
+
class PyCConfig:
    """
    A pure Python equivalent of the 'PyCConfig' that in Cython
    wrapped the 'CConfig' struct. This class maintains the same
    conceptual fields but in Pythonic form (lists, strings, booleans).
    """

    def __init__(self) -> None:
        self.max_size: int = 0
        self.token_mode: bool = False
        self.output_dir: Optional[str] = None
        self.stream: bool = False

        self.ignore_patterns: List[str] = []
        self.unignore_patterns: List[str] = []
        self.priority_rules: List[PriorityRule] = []
        self.binary_exts: List[str] = []

    def add_ignore_pattern(self, pattern: str) -> None:
        """Append ``pattern`` to the list of ignore patterns."""
        self.ignore_patterns.append(pattern)

    def add_unignore_pattern(self, pattern: str) -> None:
        """Append ``pattern`` to the list of unignore patterns."""
        self.unignore_patterns.append(pattern)

    def add_priority_rule(self, pattern: str, score: int) -> None:
        """Append a new ``PriorityRule(pattern, score)`` to the priority rules."""
        self.priority_rules.append(PriorityRule(pattern, score))

    def should_ignore(self, path: str) -> bool:
        """
        Return True if path matches one of the ignore_patterns,
        unless it matches unignore_patterns first.

        Matching is done with ``fnmatch.fnmatch`` against the path as given.
        """
        # Unignore patterns are checked first and always win.
        if any(fnmatch.fnmatch(path, pat) for pat in self.unignore_patterns):
            return False
        return any(fnmatch.fnmatch(path, pat) for pat in self.ignore_patterns)

    def calculate_priority(self, path: str) -> int:
        """
        Returns the highest score among any matching priority rule.

        The result is never below 0: 0 is returned when no rule matches or
        when every matching rule has a negative score.
        """
        matching = (
            rule.score
            for rule in self.priority_rules
            if fnmatch.fnmatch(path, rule.pattern)
        )
        return max(0, max(matching, default=0))

    def is_binary_file(self, path: str) -> bool:
        """
        1) If extension is in self.binary_exts -> True
        2) Else read up to 512 bytes, if it has a null byte -> True
        3) If can't open -> True

        The extension comparison is case-insensitive, and entries in
        ``binary_exts`` may be written with or without a leading dot.
        """
        _, ext = os.path.splitext(path)
        ext = ext.lstrip(".").lower()

        lowered = [b.lower() for b in self.binary_exts]
        known = set(lowered)
        # The path's extension has its dot stripped above, so strip it from
        # configured entries too; otherwise an entry like ".png" never matches.
        # Entries that are only dots are not added in stripped form, so they
        # do not start matching extension-less paths.
        known.update(b.lstrip(".") for b in lowered if b.lstrip("."))
        if ext in known:
            return True

        try:
            with open(path, "rb") as f:
                chunk = f.read(512)
        except OSError:
            # An unreadable file is reported as binary.
            return True

        return b"\0" in chunk

    def read_file_contents(self, path: str) -> str:
        """
        Reads the entire file as text, returns it.

        The bytes are decoded as UTF-8 with undecodable sequences replaced.
        If the file can't be opened or read (OSError), "<NULL>" is returned.
        """
        try:
            with open(path, "rb") as f:
                data = f.read()
            return data.decode("utf-8", errors="replace")
        except OSError:
            return "<NULL>"

    def count_tokens(self, text: str) -> int:
        """
        Replicates py_count_tokens:
        Simple whitespace-based token counting in pure Python.
        """
        return len(text.split())

    def make_c_string(self, text: Optional[str]) -> str:
        """Return ``text`` unchanged, or "<NULL>" when ``text`` is None."""
        return "<NULL>" if text is None else text

    def __repr__(self) -> str:
        return (f"PyCConfig(max_size={self.max_size}, token_mode={self.token_mode}, "
                f"output_dir={self.output_dir!r}, stream={self.stream}, "
                f"ignore_patterns={self.ignore_patterns}, "
                f"unignore_patterns={self.unignore_patterns}, "
                f"priority_rules={[ (r.pattern, r.score) for r in self.priority_rules ]}, "
                f"binary_exts={self.binary_exts})")
|