kekkai_cli-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kekkai/__init__.py +7 -0
- kekkai/cli.py +1038 -0
- kekkai/config.py +403 -0
- kekkai/dojo.py +419 -0
- kekkai/dojo_import.py +213 -0
- kekkai/github/__init__.py +16 -0
- kekkai/github/commenter.py +198 -0
- kekkai/github/models.py +56 -0
- kekkai/github/sanitizer.py +112 -0
- kekkai/installer/__init__.py +39 -0
- kekkai/installer/errors.py +23 -0
- kekkai/installer/extract.py +161 -0
- kekkai/installer/manager.py +252 -0
- kekkai/installer/manifest.py +189 -0
- kekkai/installer/verify.py +86 -0
- kekkai/manifest.py +77 -0
- kekkai/output.py +218 -0
- kekkai/paths.py +46 -0
- kekkai/policy.py +326 -0
- kekkai/runner.py +70 -0
- kekkai/scanners/__init__.py +67 -0
- kekkai/scanners/backends/__init__.py +14 -0
- kekkai/scanners/backends/base.py +73 -0
- kekkai/scanners/backends/docker.py +178 -0
- kekkai/scanners/backends/native.py +240 -0
- kekkai/scanners/base.py +110 -0
- kekkai/scanners/container.py +144 -0
- kekkai/scanners/falco.py +237 -0
- kekkai/scanners/gitleaks.py +237 -0
- kekkai/scanners/semgrep.py +227 -0
- kekkai/scanners/trivy.py +246 -0
- kekkai/scanners/url_policy.py +163 -0
- kekkai/scanners/zap.py +340 -0
- kekkai/threatflow/__init__.py +94 -0
- kekkai/threatflow/artifacts.py +476 -0
- kekkai/threatflow/chunking.py +361 -0
- kekkai/threatflow/core.py +438 -0
- kekkai/threatflow/mermaid.py +374 -0
- kekkai/threatflow/model_adapter.py +491 -0
- kekkai/threatflow/prompts.py +277 -0
- kekkai/threatflow/redaction.py +228 -0
- kekkai/threatflow/sanitizer.py +643 -0
- kekkai/triage/__init__.py +33 -0
- kekkai/triage/app.py +168 -0
- kekkai/triage/audit.py +203 -0
- kekkai/triage/ignore.py +269 -0
- kekkai/triage/models.py +185 -0
- kekkai/triage/screens.py +341 -0
- kekkai/triage/widgets.py +169 -0
- kekkai_cli-1.0.0.dist-info/METADATA +135 -0
- kekkai_cli-1.0.0.dist-info/RECORD +90 -0
- kekkai_cli-1.0.0.dist-info/WHEEL +5 -0
- kekkai_cli-1.0.0.dist-info/entry_points.txt +3 -0
- kekkai_cli-1.0.0.dist-info/top_level.txt +3 -0
- kekkai_core/__init__.py +3 -0
- kekkai_core/ci/__init__.py +11 -0
- kekkai_core/ci/benchmarks.py +354 -0
- kekkai_core/ci/metadata.py +104 -0
- kekkai_core/ci/validators.py +92 -0
- kekkai_core/docker/__init__.py +17 -0
- kekkai_core/docker/metadata.py +153 -0
- kekkai_core/docker/sbom.py +173 -0
- kekkai_core/docker/security.py +158 -0
- kekkai_core/docker/signing.py +135 -0
- kekkai_core/redaction.py +84 -0
- kekkai_core/slsa/__init__.py +13 -0
- kekkai_core/slsa/verify.py +121 -0
- kekkai_core/windows/__init__.py +29 -0
- kekkai_core/windows/chocolatey.py +335 -0
- kekkai_core/windows/installer.py +256 -0
- kekkai_core/windows/scoop.py +165 -0
- kekkai_core/windows/validators.py +220 -0
- portal/__init__.py +19 -0
- portal/api.py +155 -0
- portal/auth.py +103 -0
- portal/enterprise/__init__.py +32 -0
- portal/enterprise/audit.py +435 -0
- portal/enterprise/licensing.py +342 -0
- portal/enterprise/rbac.py +276 -0
- portal/enterprise/saml.py +595 -0
- portal/ops/__init__.py +53 -0
- portal/ops/backup.py +553 -0
- portal/ops/log_shipper.py +469 -0
- portal/ops/monitoring.py +517 -0
- portal/ops/restore.py +469 -0
- portal/ops/secrets.py +408 -0
- portal/ops/upgrade.py +591 -0
- portal/tenants.py +340 -0
- portal/uploads.py +259 -0
- portal/web.py +384 -0
kekkai/threatflow/chunking.py
@@ -0,0 +1,361 @@
"""Safe file chunking for ThreatFlow.

Splits repository files into manageable chunks for LLM processing while:
- Respecting token limits
- Maintaining context boundaries (don't split mid-function)
- Handling various file types appropriately
- Never executing code

ASVS V13.1.3: Resource management and timeouts.
"""

from __future__ import annotations

import os
from collections.abc import Iterator
from dataclasses import dataclass, field
from pathlib import Path

# Approximate chars per token (conservative estimate)
CHARS_PER_TOKEN = 4

# Default limits
DEFAULT_MAX_TOKENS_PER_CHUNK = 2000
DEFAULT_MAX_FILE_SIZE_BYTES = 1_000_000  # 1MB
DEFAULT_MAX_FILES = 500

# File extensions to include by default
DEFAULT_INCLUDE_EXTENSIONS = frozenset(
    {
        ".py",
        ".js",
        ".ts",
        ".jsx",
        ".tsx",
        ".java",
        ".go",
        ".rs",
        ".c",
        ".cpp",
        ".h",
        ".hpp",
        ".cs",
        ".rb",
        ".php",
        ".swift",
        ".kt",
        ".scala",
        ".sql",
        ".sh",
        ".bash",
        ".yaml",
        ".yml",
        ".json",
        ".toml",
        ".xml",
        ".html",
        ".css",
        ".md",
        ".txt",
        ".dockerfile",
        ".tf",
        ".hcl",
    }
)

# Directories to exclude
DEFAULT_EXCLUDE_DIRS = frozenset(
    {
        ".git",
        ".svn",
        ".hg",
        "node_modules",
        "__pycache__",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        "venv",
        ".venv",
        "env",
        ".env",
        "dist",
        "build",
        "target",
        ".tox",
        ".eggs",
        "*.egg-info",
        "vendor",
        "third_party",
    }
)


@dataclass(frozen=True)
class ChunkingConfig:
    """Configuration for file chunking."""

    max_tokens_per_chunk: int = DEFAULT_MAX_TOKENS_PER_CHUNK
    max_file_size_bytes: int = DEFAULT_MAX_FILE_SIZE_BYTES
    max_files: int = DEFAULT_MAX_FILES
    include_extensions: frozenset[str] = DEFAULT_INCLUDE_EXTENSIONS
    exclude_dirs: frozenset[str] = DEFAULT_EXCLUDE_DIRS
    overlap_lines: int = 3  # Lines to overlap between chunks for context


@dataclass(frozen=True)
class FileChunk:
    """A chunk of file content with metadata."""

    file_path: str
    content: str
    start_line: int
    end_line: int
    chunk_index: int
    total_chunks: int
    language: str | None = None

    @property
    def token_estimate(self) -> int:
        """Estimate token count for this chunk."""
        return len(self.content) // CHARS_PER_TOKEN


@dataclass
class ChunkingResult:
    """Result of chunking a repository."""

    chunks: list[FileChunk] = field(default_factory=list)
    skipped_files: list[tuple[str, str]] = field(default_factory=list)  # (path, reason)
    total_files_processed: int = 0
    total_tokens_estimated: int = 0
    warnings: list[str] = field(default_factory=list)


def _detect_language(file_path: str) -> str | None:
    """Detect programming language from file extension."""
    ext_to_lang = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".jsx": "javascript",
        ".tsx": "typescript",
        ".java": "java",
        ".go": "go",
        ".rs": "rust",
        ".c": "c",
        ".cpp": "cpp",
        ".h": "c",
        ".hpp": "cpp",
        ".cs": "csharp",
        ".rb": "ruby",
        ".php": "php",
        ".swift": "swift",
        ".kt": "kotlin",
        ".scala": "scala",
        ".sql": "sql",
        ".sh": "bash",
        ".bash": "bash",
        ".yaml": "yaml",
        ".yml": "yaml",
        ".json": "json",
        ".toml": "toml",
        ".xml": "xml",
        ".html": "html",
        ".css": "css",
        ".md": "markdown",
        ".tf": "terraform",
        ".hcl": "hcl",
    }
    ext = Path(file_path).suffix.lower()
    return ext_to_lang.get(ext)


def _should_include_file(
    file_path: Path,
    config: ChunkingConfig,
) -> tuple[bool, str | None]:
    """Check if a file should be included in chunking.

    Returns (should_include, skip_reason).
    """
    # Check extension
    special_files = ("dockerfile", "makefile", "jenkinsfile", "vagrantfile")
    ext_ok = file_path.suffix.lower() in config.include_extensions
    name_ok = file_path.name.lower() in special_files
    if not ext_ok and not name_ok:
        return False, "unsupported_extension"

    # Check file size
    try:
        size = file_path.stat().st_size
        if size > config.max_file_size_bytes:
            return False, f"file_too_large ({size} bytes)"
        if size == 0:
            return False, "empty_file"
    except OSError:
        return False, "cannot_read"

    return True, None


def _should_exclude_dir(dir_name: str, config: ChunkingConfig) -> bool:
    """Check if a directory should be excluded."""
    return dir_name in config.exclude_dirs or dir_name.startswith(".")


def _find_chunk_boundary(lines: list[str], target_line: int, direction: int = 1) -> int:
    """Find a good boundary for splitting chunks.

    Looks for natural breakpoints like blank lines, function definitions.
    """
    # Try to find a blank line near the target
    search_range = 5
    for offset in range(search_range):
        check_line = target_line + (offset * direction)
        if 0 <= check_line < len(lines):
            line = lines[check_line].strip()
            # Good break points: blank lines, class/function definitions
            if not line or line.startswith(("def ", "class ", "function ", "async def ")):
                return check_line

    return target_line


def _chunk_file_content(
    file_path: str,
    content: str,
    config: ChunkingConfig,
) -> list[FileChunk]:
    """Split file content into chunks."""
    lines = content.splitlines(keepends=True)
    if not lines:
        return []

    max_chars = config.max_tokens_per_chunk * CHARS_PER_TOKEN
    language = _detect_language(file_path)
    chunks: list[FileChunk] = []

    current_start = 0
    chunk_index = 0

    while current_start < len(lines):
        # Find end of chunk based on character count
        current_chars = 0
        current_end = current_start

        while current_end < len(lines) and current_chars < max_chars:
            current_chars += len(lines[current_end])
            current_end += 1

        # Try to find a good boundary if we're not at the end
        if current_end < len(lines):
            boundary = _find_chunk_boundary(lines, current_end - 1, direction=-1)
            if boundary > current_start:
                current_end = boundary + 1

        # Build chunk content
        chunk_lines = lines[current_start:current_end]
        chunk_content = "".join(chunk_lines)

        chunks.append(
            FileChunk(
                file_path=file_path,
                content=chunk_content,
                start_line=current_start + 1,  # 1-indexed
                end_line=current_end,
                chunk_index=chunk_index,
                total_chunks=0,  # Updated later
                language=language,
            )
        )

        # Move to next chunk, with overlap
        current_start = max(current_start + 1, current_end - config.overlap_lines)
        chunk_index += 1

    # Update total_chunks for all chunks
    total = len(chunks)
    return [
        FileChunk(
            file_path=c.file_path,
            content=c.content,
            start_line=c.start_line,
            end_line=c.end_line,
            chunk_index=c.chunk_index,
            total_chunks=total,
            language=c.language,
        )
        for c in chunks
    ]


def _iter_repo_files(
    repo_path: Path,
    config: ChunkingConfig,
) -> Iterator[tuple[Path, str | None]]:
    """Iterate over files in the repository.

    Yields (file_path, skip_reason) tuples.
    """
    for root, dirs, files in os.walk(repo_path):
        # Filter out excluded directories in-place
        dirs[:] = [d for d in dirs if not _should_exclude_dir(d, config)]

        for filename in files:
            file_path = Path(root) / filename
            should_include, skip_reason = _should_include_file(file_path, config)

            if should_include:
                yield file_path, None
            else:
                yield file_path, skip_reason


def chunk_files(
    repo_path: Path,
    config: ChunkingConfig | None = None,
) -> ChunkingResult:
    """Chunk all eligible files in a repository.

    Args:
        repo_path: Path to the repository root
        config: Chunking configuration (uses defaults if not provided)

    Returns:
        ChunkingResult with all chunks and metadata
    """
    config = config or ChunkingConfig()
    result = ChunkingResult()

    files_processed = 0
    for file_path, skip_reason in _iter_repo_files(repo_path, config):
        if skip_reason:
            rel_path = str(file_path.relative_to(repo_path))
            result.skipped_files.append((rel_path, skip_reason))
            continue

        # Enforce file limit
        if files_processed >= config.max_files:
            result.warnings.append(
                f"Reached max file limit ({config.max_files}). Some files were not processed."
            )
            break

        # Read file content (text only - never execute)
        try:
            content = file_path.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            rel_path = str(file_path.relative_to(repo_path))
            result.skipped_files.append((rel_path, f"read_error: {e}"))
            continue

        # Chunk the file
        rel_path = str(file_path.relative_to(repo_path))
        file_chunks = _chunk_file_content(rel_path, content, config)
        result.chunks.extend(file_chunks)
        files_processed += 1

    result.total_files_processed = files_processed
    result.total_tokens_estimated = sum(c.token_estimate for c in result.chunks)

    return result
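For orientation, here is a minimal usage sketch of the chunking API added in this version. It is illustrative only: the import path is inferred from the package layout above, and the repository path and config values are hypothetical.

from pathlib import Path

from kekkai.threatflow.chunking import ChunkingConfig, chunk_files

# Chunk a local checkout with a smaller per-chunk token budget (hypothetical values).
result = chunk_files(Path("./my-repo"), ChunkingConfig(max_tokens_per_chunk=1000))

for chunk in result.chunks:
    # chunk_index is 0-based; total_chunks is backfilled after splitting.
    print(
        f"{chunk.file_path} lines {chunk.start_line}-{chunk.end_line} "
        f"(chunk {chunk.chunk_index + 1}/{chunk.total_chunks}, ~{chunk.token_estimate} tokens)"
    )

for path, reason in result.skipped_files:
    print(f"skipped {path}: {reason}")

Note that token_estimate is a character-count heuristic (len(content) // CHARS_PER_TOKEN, with CHARS_PER_TOKEN = 4), so per-chunk budgets are approximate rather than exact model token counts.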