kekkai-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. kekkai/__init__.py +7 -0
  2. kekkai/cli.py +1038 -0
  3. kekkai/config.py +403 -0
  4. kekkai/dojo.py +419 -0
  5. kekkai/dojo_import.py +213 -0
  6. kekkai/github/__init__.py +16 -0
  7. kekkai/github/commenter.py +198 -0
  8. kekkai/github/models.py +56 -0
  9. kekkai/github/sanitizer.py +112 -0
  10. kekkai/installer/__init__.py +39 -0
  11. kekkai/installer/errors.py +23 -0
  12. kekkai/installer/extract.py +161 -0
  13. kekkai/installer/manager.py +252 -0
  14. kekkai/installer/manifest.py +189 -0
  15. kekkai/installer/verify.py +86 -0
  16. kekkai/manifest.py +77 -0
  17. kekkai/output.py +218 -0
  18. kekkai/paths.py +46 -0
  19. kekkai/policy.py +326 -0
  20. kekkai/runner.py +70 -0
  21. kekkai/scanners/__init__.py +67 -0
  22. kekkai/scanners/backends/__init__.py +14 -0
  23. kekkai/scanners/backends/base.py +73 -0
  24. kekkai/scanners/backends/docker.py +178 -0
  25. kekkai/scanners/backends/native.py +240 -0
  26. kekkai/scanners/base.py +110 -0
  27. kekkai/scanners/container.py +144 -0
  28. kekkai/scanners/falco.py +237 -0
  29. kekkai/scanners/gitleaks.py +237 -0
  30. kekkai/scanners/semgrep.py +227 -0
  31. kekkai/scanners/trivy.py +246 -0
  32. kekkai/scanners/url_policy.py +163 -0
  33. kekkai/scanners/zap.py +340 -0
  34. kekkai/threatflow/__init__.py +94 -0
  35. kekkai/threatflow/artifacts.py +476 -0
  36. kekkai/threatflow/chunking.py +361 -0
  37. kekkai/threatflow/core.py +438 -0
  38. kekkai/threatflow/mermaid.py +374 -0
  39. kekkai/threatflow/model_adapter.py +491 -0
  40. kekkai/threatflow/prompts.py +277 -0
  41. kekkai/threatflow/redaction.py +228 -0
  42. kekkai/threatflow/sanitizer.py +643 -0
  43. kekkai/triage/__init__.py +33 -0
  44. kekkai/triage/app.py +168 -0
  45. kekkai/triage/audit.py +203 -0
  46. kekkai/triage/ignore.py +269 -0
  47. kekkai/triage/models.py +185 -0
  48. kekkai/triage/screens.py +341 -0
  49. kekkai/triage/widgets.py +169 -0
  50. kekkai_cli-1.0.0.dist-info/METADATA +135 -0
  51. kekkai_cli-1.0.0.dist-info/RECORD +90 -0
  52. kekkai_cli-1.0.0.dist-info/WHEEL +5 -0
  53. kekkai_cli-1.0.0.dist-info/entry_points.txt +3 -0
  54. kekkai_cli-1.0.0.dist-info/top_level.txt +3 -0
  55. kekkai_core/__init__.py +3 -0
  56. kekkai_core/ci/__init__.py +11 -0
  57. kekkai_core/ci/benchmarks.py +354 -0
  58. kekkai_core/ci/metadata.py +104 -0
  59. kekkai_core/ci/validators.py +92 -0
  60. kekkai_core/docker/__init__.py +17 -0
  61. kekkai_core/docker/metadata.py +153 -0
  62. kekkai_core/docker/sbom.py +173 -0
  63. kekkai_core/docker/security.py +158 -0
  64. kekkai_core/docker/signing.py +135 -0
  65. kekkai_core/redaction.py +84 -0
  66. kekkai_core/slsa/__init__.py +13 -0
  67. kekkai_core/slsa/verify.py +121 -0
  68. kekkai_core/windows/__init__.py +29 -0
  69. kekkai_core/windows/chocolatey.py +335 -0
  70. kekkai_core/windows/installer.py +256 -0
  71. kekkai_core/windows/scoop.py +165 -0
  72. kekkai_core/windows/validators.py +220 -0
  73. portal/__init__.py +19 -0
  74. portal/api.py +155 -0
  75. portal/auth.py +103 -0
  76. portal/enterprise/__init__.py +32 -0
  77. portal/enterprise/audit.py +435 -0
  78. portal/enterprise/licensing.py +342 -0
  79. portal/enterprise/rbac.py +276 -0
  80. portal/enterprise/saml.py +595 -0
  81. portal/ops/__init__.py +53 -0
  82. portal/ops/backup.py +553 -0
  83. portal/ops/log_shipper.py +469 -0
  84. portal/ops/monitoring.py +517 -0
  85. portal/ops/restore.py +469 -0
  86. portal/ops/secrets.py +408 -0
  87. portal/ops/upgrade.py +591 -0
  88. portal/tenants.py +340 -0
  89. portal/uploads.py +259 -0
  90. portal/web.py +384 -0
@@ -0,0 +1,361 @@
1
+ """Safe file chunking for ThreatFlow.
2
+
3
+ Splits repository files into manageable chunks for LLM processing while:
4
+ - Respecting token limits
5
+ - Maintaining context boundaries (don't split mid-function)
6
+ - Handling various file types appropriately
7
+ - Never executing code
8
+
9
+ ASVS V13.1.3: Resource management and timeouts.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from collections.abc import Iterator
16
+ from dataclasses import dataclass, field
17
+ from pathlib import Path
18
+
19
# Approximate chars per token (conservative estimate); used to convert the
# per-chunk token budget into a character budget.
CHARS_PER_TOKEN = 4

# Default limits
DEFAULT_MAX_TOKENS_PER_CHUNK = 2000  # per-chunk token budget for the LLM
DEFAULT_MAX_FILE_SIZE_BYTES = 1_000_000  # 1MB; larger files are skipped
DEFAULT_MAX_FILES = 500  # hard cap on files processed in one run

# File extensions to include by default (source, config, markup, and docs).
# Extension-less special files (Dockerfile, Makefile, ...) are handled
# separately by name in _should_include_file.
DEFAULT_INCLUDE_EXTENSIONS = frozenset(
    {
        ".py",
        ".js",
        ".ts",
        ".jsx",
        ".tsx",
        ".java",
        ".go",
        ".rs",
        ".c",
        ".cpp",
        ".h",
        ".hpp",
        ".cs",
        ".rb",
        ".php",
        ".swift",
        ".kt",
        ".scala",
        ".sql",
        ".sh",
        ".bash",
        ".yaml",
        ".yml",
        ".json",
        ".toml",
        ".xml",
        ".html",
        ".css",
        ".md",
        ".txt",
        ".dockerfile",
        ".tf",
        ".hcl",
    }
)

# Directories to exclude from the repository walk (VCS internals, caches,
# virtualenvs, build output, vendored code).
# NOTE(review): "*.egg-info" is written as a glob pattern; it is only honored
# if the exclusion check performs glob matching — verify against
# _should_exclude_dir.
DEFAULT_EXCLUDE_DIRS = frozenset(
    {
        ".git",
        ".svn",
        ".hg",
        "node_modules",
        "__pycache__",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        "venv",
        ".venv",
        "env",
        ".env",
        "dist",
        "build",
        "target",
        ".tox",
        ".eggs",
        "*.egg-info",
        "vendor",
        "third_party",
    }
)
91
+
92
+
93
@dataclass(frozen=True)
class ChunkingConfig:
    """Configuration for file chunking.

    Frozen so one config instance can be shared safely across calls; all
    limits default to the module-level ``DEFAULT_*`` constants.
    """

    # Upper bound on the estimated token count of a single chunk.
    max_tokens_per_chunk: int = DEFAULT_MAX_TOKENS_PER_CHUNK
    # Files larger than this many bytes are skipped entirely.
    max_file_size_bytes: int = DEFAULT_MAX_FILE_SIZE_BYTES
    # Hard cap on how many files are chunked in one run.
    max_files: int = DEFAULT_MAX_FILES
    # Only files with these suffixes (plus special names like Dockerfile) are read.
    include_extensions: frozenset[str] = DEFAULT_INCLUDE_EXTENSIONS
    # Directory names pruned from the repository walk.
    exclude_dirs: frozenset[str] = DEFAULT_EXCLUDE_DIRS
    overlap_lines: int = 3  # Lines to overlap between chunks for context
103
+
104
+
105
@dataclass(frozen=True)
class FileChunk:
    """A chunk of file content with metadata.

    ``start_line``/``end_line`` are 1-indexed and inclusive; ``chunk_index``
    is 0-based within the file, and ``total_chunks`` is the number of chunks
    the file was split into.
    """

    file_path: str  # path of the source file (relative, as given by the chunker)
    content: str  # raw chunk text; lines keep their original endings
    start_line: int  # first line of the chunk, 1-indexed
    end_line: int  # last line of the chunk, inclusive
    chunk_index: int  # 0-based position of this chunk within its file
    total_chunks: int  # total number of chunks produced from the file
    language: str | None = None  # detected language, or None if unknown

    @property
    def token_estimate(self) -> int:
        """Estimate token count for this chunk (len(content) // CHARS_PER_TOKEN)."""
        return len(self.content) // CHARS_PER_TOKEN
121
+
122
+
123
@dataclass
class ChunkingResult:
    """Result of chunking a repository."""

    # All chunks from every processed file, in walk order.
    chunks: list[FileChunk] = field(default_factory=list)
    # (relative path, reason) for every file that was skipped.
    skipped_files: list[tuple[str, str]] = field(default_factory=list)
    # Number of files that were actually read and chunked.
    total_files_processed: int = 0
    # Sum of token_estimate over all chunks.
    total_tokens_estimated: int = 0
    # Non-fatal issues, e.g. hitting the max-files limit.
    warnings: list[str] = field(default_factory=list)
132
+
133
+
134
+ def _detect_language(file_path: str) -> str | None:
135
+ """Detect programming language from file extension."""
136
+ ext_to_lang = {
137
+ ".py": "python",
138
+ ".js": "javascript",
139
+ ".ts": "typescript",
140
+ ".jsx": "javascript",
141
+ ".tsx": "typescript",
142
+ ".java": "java",
143
+ ".go": "go",
144
+ ".rs": "rust",
145
+ ".c": "c",
146
+ ".cpp": "cpp",
147
+ ".h": "c",
148
+ ".hpp": "cpp",
149
+ ".cs": "csharp",
150
+ ".rb": "ruby",
151
+ ".php": "php",
152
+ ".swift": "swift",
153
+ ".kt": "kotlin",
154
+ ".scala": "scala",
155
+ ".sql": "sql",
156
+ ".sh": "bash",
157
+ ".bash": "bash",
158
+ ".yaml": "yaml",
159
+ ".yml": "yaml",
160
+ ".json": "json",
161
+ ".toml": "toml",
162
+ ".xml": "xml",
163
+ ".html": "html",
164
+ ".css": "css",
165
+ ".md": "markdown",
166
+ ".tf": "terraform",
167
+ ".hcl": "hcl",
168
+ }
169
+ ext = Path(file_path).suffix.lower()
170
+ return ext_to_lang.get(ext)
171
+
172
+
173
+ def _should_include_file(
174
+ file_path: Path,
175
+ config: ChunkingConfig,
176
+ ) -> tuple[bool, str | None]:
177
+ """Check if a file should be included in chunking.
178
+
179
+ Returns (should_include, skip_reason).
180
+ """
181
+ # Check extension
182
+ special_files = ("dockerfile", "makefile", "jenkinsfile", "vagrantfile")
183
+ ext_ok = file_path.suffix.lower() in config.include_extensions
184
+ name_ok = file_path.name.lower() in special_files
185
+ if not ext_ok and not name_ok:
186
+ return False, "unsupported_extension"
187
+
188
+ # Check file size
189
+ try:
190
+ size = file_path.stat().st_size
191
+ if size > config.max_file_size_bytes:
192
+ return False, f"file_too_large ({size} bytes)"
193
+ if size == 0:
194
+ return False, "empty_file"
195
+ except OSError:
196
+ return False, "cannot_read"
197
+
198
+ return True, None
199
+
200
+
201
+ def _should_exclude_dir(dir_name: str, config: ChunkingConfig) -> bool:
202
+ """Check if a directory should be excluded."""
203
+ return dir_name in config.exclude_dirs or dir_name.startswith(".")
204
+
205
+
206
+ def _find_chunk_boundary(lines: list[str], target_line: int, direction: int = 1) -> int:
207
+ """Find a good boundary for splitting chunks.
208
+
209
+ Looks for natural breakpoints like blank lines, function definitions.
210
+ """
211
+ # Try to find a blank line near the target
212
+ search_range = 5
213
+ for offset in range(search_range):
214
+ check_line = target_line + (offset * direction)
215
+ if 0 <= check_line < len(lines):
216
+ line = lines[check_line].strip()
217
+ # Good break points: blank lines, class/function definitions
218
+ if not line or line.startswith(("def ", "class ", "function ", "async def ")):
219
+ return check_line
220
+
221
+ return target_line
222
+
223
+
224
def _chunk_file_content(
    file_path: str,
    content: str,
    config: ChunkingConfig,
) -> list[FileChunk]:
    """Split one file's text into token-bounded chunks.

    Chunks are cut near natural breakpoints (blank lines, definitions) when
    possible, and consecutive chunks overlap by ``config.overlap_lines``
    lines for context. Line numbers in the resulting chunks are 1-indexed.
    """
    lines = content.splitlines(keepends=True)
    if not lines:
        return []

    char_budget = config.max_tokens_per_chunk * CHARS_PER_TOKEN
    language = _detect_language(file_path)
    total_lines = len(lines)

    provisional: list[FileChunk] = []
    start = 0
    index = 0
    while start < total_lines:
        # Advance until the character budget for this chunk is spent.
        end = start
        used = 0
        while end < total_lines and used < char_budget:
            used += len(lines[end])
            end += 1

        # Prefer a natural boundary just before the hard cut-off.
        if end < total_lines:
            split_at = _find_chunk_boundary(lines, end - 1, direction=-1)
            if split_at > start:
                end = split_at + 1

        provisional.append(
            FileChunk(
                file_path=file_path,
                content="".join(lines[start:end]),
                start_line=start + 1,  # 1-indexed
                end_line=end,
                chunk_index=index,
                total_chunks=0,  # filled in below once the count is known
                language=language,
            )
        )

        index += 1
        # Step forward with overlap; max() guarantees progress.
        start = max(start + 1, end - config.overlap_lines)

    # FileChunk is frozen, so rebuild each chunk with the final count.
    total = len(provisional)
    return [
        FileChunk(
            file_path=chunk.file_path,
            content=chunk.content,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            chunk_index=chunk.chunk_index,
            total_chunks=total,
            language=chunk.language,
        )
        for chunk in provisional
    ]
290
+
291
+
292
def _iter_repo_files(
    repo_path: Path,
    config: ChunkingConfig,
) -> Iterator[tuple[Path, str | None]]:
    """Walk the repository, yielding every file with its skip status.

    Yields (file_path, skip_reason); skip_reason is None for files that
    should be chunked. Excluded directories are pruned from the walk.
    """
    for current_dir, subdirs, filenames in os.walk(repo_path):
        # Mutate dirs in place so os.walk never descends into excluded trees.
        subdirs[:] = [name for name in subdirs if not _should_exclude_dir(name, config)]

        for filename in filenames:
            candidate = Path(current_dir) / filename
            include, reason = _should_include_file(candidate, config)
            yield candidate, None if include else reason
312
+
313
+
314
def chunk_files(
    repo_path: Path,
    config: ChunkingConfig | None = None,
) -> ChunkingResult:
    """Chunk every eligible file under a repository root.

    Args:
        repo_path: Path to the repository root
        config: Chunking configuration (uses defaults if not provided)

    Returns:
        ChunkingResult with all chunks, skipped files, and totals
    """
    cfg = config or ChunkingConfig()
    result = ChunkingResult()

    processed = 0
    for path, skip_reason in _iter_repo_files(repo_path, cfg):
        rel_path = str(path.relative_to(repo_path))

        if skip_reason:
            result.skipped_files.append((rel_path, skip_reason))
            continue

        # Stop once the configured file budget is exhausted.
        if processed >= cfg.max_files:
            result.warnings.append(
                f"Reached max file limit ({cfg.max_files}). Some files were not processed."
            )
            break

        # Read file content (text only - never execute).
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            result.skipped_files.append((rel_path, f"read_error: {exc}"))
            continue

        result.chunks.extend(_chunk_file_content(rel_path, text, cfg))
        processed += 1

    result.total_files_processed = processed
    result.total_tokens_estimated = sum(chunk.token_estimate for chunk in result.chunks)

    return result