draft-board 0.1.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/backend/.env.example +9 -0
- package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_patch.txt +195 -0
- package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_stat.txt +6 -0
- package/app/backend/CURL_EXAMPLES.md +335 -0
- package/app/backend/ENV_SETUP.md +65 -0
- package/app/backend/alembic/env.py +71 -0
- package/app/backend/alembic/script.py.mako +28 -0
- package/app/backend/alembic/versions/001_initial_schema.py +104 -0
- package/app/backend/alembic/versions/002_add_jobs_table.py +52 -0
- package/app/backend/alembic/versions/003_add_workspace_table.py +48 -0
- package/app/backend/alembic/versions/004_add_evidence_table.py +56 -0
- package/app/backend/alembic/versions/005_add_verification_commands.py +32 -0
- package/app/backend/alembic/versions/006_add_planner_lock_table.py +39 -0
- package/app/backend/alembic/versions/007_add_revision_review_tables.py +126 -0
- package/app/backend/alembic/versions/008_add_revision_idempotency_and_traceability.py +52 -0
- package/app/backend/alembic/versions/009_add_job_health_fields.py +46 -0
- package/app/backend/alembic/versions/010_add_review_comment_line_content.py +36 -0
- package/app/backend/alembic/versions/011_add_analysis_cache.py +47 -0
- package/app/backend/alembic/versions/012_add_boards_table.py +102 -0
- package/app/backend/alembic/versions/013_add_ticket_blocking.py +45 -0
- package/app/backend/alembic/versions/014_add_agent_sessions.py +220 -0
- package/app/backend/alembic/versions/015_add_ticket_sort_order.py +33 -0
- package/app/backend/alembic/versions/03220f0b93ae_add_pr_fields_to_ticket.py +49 -0
- package/app/backend/alembic/versions/0c2d89fff3b1_seed_board_configs_from_yaml.py +206 -0
- package/app/backend/alembic/versions/3348e5cf54c1_add_merge_checklist_table.py +67 -0
- package/app/backend/alembic/versions/357c780ee445_add_goal_status.py +34 -0
- package/app/backend/alembic/versions/553340b7e26c_add_autonomy_fields_to_goal.py +65 -0
- package/app/backend/alembic/versions/774dc335c679_merge_migration_heads.py +23 -0
- package/app/backend/alembic/versions/7b307e847cbd_merge_heads.py +23 -0
- package/app/backend/alembic/versions/82ecd978cc70_add_missing_indexes.py +48 -0
- package/app/backend/alembic/versions/8ef5054dc280_add_normalized_log_entries.py +173 -0
- package/app/backend/alembic/versions/8f3e2bd8ea3b_merge_migration_heads.py +23 -0
- package/app/backend/alembic/versions/9d17f0698d3b_add_config_column_to_boards_table.py +30 -0
- package/app/backend/alembic/versions/add_agent_conversation_history.py +72 -0
- package/app/backend/alembic/versions/add_job_variant.py +34 -0
- package/app/backend/alembic/versions/add_performance_indexes.py +95 -0
- package/app/backend/alembic/versions/add_repos_and_board_repos.py +174 -0
- package/app/backend/alembic/versions/add_session_id_to_jobs.py +27 -0
- package/app/backend/alembic/versions/add_sqlite_backend_tables.py +104 -0
- package/app/backend/alembic/versions/b10fb0b62240_add_diff_content_to_revisions.py +34 -0
- package/app/backend/alembic.ini +89 -0
- package/app/backend/app/__init__.py +3 -0
- package/app/backend/app/data_dir.py +85 -0
- package/app/backend/app/database.py +70 -0
- package/app/backend/app/database_sync.py +64 -0
- package/app/backend/app/dependencies/__init__.py +5 -0
- package/app/backend/app/dependencies/auth.py +80 -0
- package/app/backend/app/dependencies.py +43 -0
- package/app/backend/app/exceptions.py +178 -0
- package/app/backend/app/executors/__init__.py +1 -0
- package/app/backend/app/executors/adapters/__init__.py +1 -0
- package/app/backend/app/executors/adapters/aider.py +152 -0
- package/app/backend/app/executors/adapters/amazon_q.py +103 -0
- package/app/backend/app/executors/adapters/amp.py +123 -0
- package/app/backend/app/executors/adapters/claude.py +177 -0
- package/app/backend/app/executors/adapters/cline.py +127 -0
- package/app/backend/app/executors/adapters/codex.py +167 -0
- package/app/backend/app/executors/adapters/copilot.py +202 -0
- package/app/backend/app/executors/adapters/cursor.py +87 -0
- package/app/backend/app/executors/adapters/droid.py +123 -0
- package/app/backend/app/executors/adapters/gemini.py +132 -0
- package/app/backend/app/executors/adapters/goose.py +131 -0
- package/app/backend/app/executors/adapters/opencode.py +123 -0
- package/app/backend/app/executors/adapters/qwen.py +123 -0
- package/app/backend/app/executors/plugins/__init__.py +1 -0
- package/app/backend/app/executors/registry.py +202 -0
- package/app/backend/app/executors/spec.py +226 -0
- package/app/backend/app/main.py +486 -0
- package/app/backend/app/middleware/__init__.py +13 -0
- package/app/backend/app/middleware/idempotency.py +426 -0
- package/app/backend/app/middleware/rate_limit.py +312 -0
- package/app/backend/app/middleware/security_headers.py +43 -0
- package/app/backend/app/middleware/timeout.py +37 -0
- package/app/backend/app/models/__init__.py +56 -0
- package/app/backend/app/models/agent_conversation_history.py +56 -0
- package/app/backend/app/models/agent_session.py +127 -0
- package/app/backend/app/models/analysis_cache.py +49 -0
- package/app/backend/app/models/base.py +9 -0
- package/app/backend/app/models/board.py +79 -0
- package/app/backend/app/models/board_repo.py +68 -0
- package/app/backend/app/models/cost_budget.py +42 -0
- package/app/backend/app/models/enums.py +40 -0
- package/app/backend/app/models/evidence.py +132 -0
- package/app/backend/app/models/goal.py +102 -0
- package/app/backend/app/models/idempotency_entry.py +30 -0
- package/app/backend/app/models/job.py +163 -0
- package/app/backend/app/models/job_queue.py +39 -0
- package/app/backend/app/models/kv_store.py +28 -0
- package/app/backend/app/models/merge_checklist.py +87 -0
- package/app/backend/app/models/normalized_log.py +100 -0
- package/app/backend/app/models/planner_lock.py +43 -0
- package/app/backend/app/models/rate_limit_entry.py +25 -0
- package/app/backend/app/models/repo.py +66 -0
- package/app/backend/app/models/review_comment.py +91 -0
- package/app/backend/app/models/review_summary.py +69 -0
- package/app/backend/app/models/revision.py +130 -0
- package/app/backend/app/models/ticket.py +223 -0
- package/app/backend/app/models/ticket_event.py +83 -0
- package/app/backend/app/models/user.py +47 -0
- package/app/backend/app/models/workspace.py +71 -0
- package/app/backend/app/redis_client.py +119 -0
- package/app/backend/app/routers/__init__.py +29 -0
- package/app/backend/app/routers/agents.py +296 -0
- package/app/backend/app/routers/auth.py +94 -0
- package/app/backend/app/routers/board.py +885 -0
- package/app/backend/app/routers/dashboard.py +351 -0
- package/app/backend/app/routers/debug.py +528 -0
- package/app/backend/app/routers/evidence.py +96 -0
- package/app/backend/app/routers/executors.py +324 -0
- package/app/backend/app/routers/goals.py +574 -0
- package/app/backend/app/routers/jobs.py +448 -0
- package/app/backend/app/routers/maintenance.py +172 -0
- package/app/backend/app/routers/merge.py +360 -0
- package/app/backend/app/routers/planner.py +537 -0
- package/app/backend/app/routers/pull_requests.py +382 -0
- package/app/backend/app/routers/repos.py +263 -0
- package/app/backend/app/routers/revisions.py +939 -0
- package/app/backend/app/routers/settings.py +267 -0
- package/app/backend/app/routers/tickets.py +2003 -0
- package/app/backend/app/routers/webhooks.py +143 -0
- package/app/backend/app/routers/websocket.py +249 -0
- package/app/backend/app/schemas/__init__.py +109 -0
- package/app/backend/app/schemas/board.py +87 -0
- package/app/backend/app/schemas/common.py +33 -0
- package/app/backend/app/schemas/evidence.py +87 -0
- package/app/backend/app/schemas/goal.py +90 -0
- package/app/backend/app/schemas/job.py +97 -0
- package/app/backend/app/schemas/merge.py +139 -0
- package/app/backend/app/schemas/planner.py +500 -0
- package/app/backend/app/schemas/repo.py +187 -0
- package/app/backend/app/schemas/review.py +137 -0
- package/app/backend/app/schemas/revision.py +114 -0
- package/app/backend/app/schemas/ticket.py +238 -0
- package/app/backend/app/schemas/ticket_event.py +72 -0
- package/app/backend/app/schemas/workspace.py +19 -0
- package/app/backend/app/services/__init__.py +31 -0
- package/app/backend/app/services/agent_memory_service.py +223 -0
- package/app/backend/app/services/agent_registry.py +346 -0
- package/app/backend/app/services/agent_session_manager.py +318 -0
- package/app/backend/app/services/agent_session_service.py +219 -0
- package/app/backend/app/services/agent_tools.py +379 -0
- package/app/backend/app/services/auth_service.py +98 -0
- package/app/backend/app/services/autonomy_service.py +380 -0
- package/app/backend/app/services/board_repo_service.py +201 -0
- package/app/backend/app/services/board_service.py +326 -0
- package/app/backend/app/services/cleanup_service.py +1085 -0
- package/app/backend/app/services/config_service.py +908 -0
- package/app/backend/app/services/context_gatherer.py +557 -0
- package/app/backend/app/services/cost_tracking_service.py +293 -0
- package/app/backend/app/services/cursor_log_normalizer.py +536 -0
- package/app/backend/app/services/delivery_pipeline.py +440 -0
- package/app/backend/app/services/executor_service.py +634 -0
- package/app/backend/app/services/git_host/__init__.py +11 -0
- package/app/backend/app/services/git_host/factory.py +87 -0
- package/app/backend/app/services/git_host/github.py +270 -0
- package/app/backend/app/services/git_host/gitlab.py +194 -0
- package/app/backend/app/services/git_host/protocol.py +75 -0
- package/app/backend/app/services/git_merge_simple.py +346 -0
- package/app/backend/app/services/git_ops.py +384 -0
- package/app/backend/app/services/github_service.py +233 -0
- package/app/backend/app/services/goal_service.py +113 -0
- package/app/backend/app/services/job_service.py +423 -0
- package/app/backend/app/services/job_watchdog_service.py +424 -0
- package/app/backend/app/services/langchain_adapter.py +122 -0
- package/app/backend/app/services/llm_provider_clients.py +351 -0
- package/app/backend/app/services/llm_service.py +285 -0
- package/app/backend/app/services/log_normalizer.py +342 -0
- package/app/backend/app/services/log_stream_service.py +276 -0
- package/app/backend/app/services/merge_checklist_service.py +264 -0
- package/app/backend/app/services/merge_service.py +784 -0
- package/app/backend/app/services/orchestrator_log.py +84 -0
- package/app/backend/app/services/planner_service.py +1662 -0
- package/app/backend/app/services/planner_tick_sync.py +1040 -0
- package/app/backend/app/services/queued_message_service.py +156 -0
- package/app/backend/app/services/reliability_wrapper.py +389 -0
- package/app/backend/app/services/repo_discovery_service.py +318 -0
- package/app/backend/app/services/review_service.py +334 -0
- package/app/backend/app/services/revision_service.py +389 -0
- package/app/backend/app/services/safe_autopilot.py +510 -0
- package/app/backend/app/services/sqlite_worker.py +372 -0
- package/app/backend/app/services/task_dispatch.py +135 -0
- package/app/backend/app/services/ticket_generation_service.py +1781 -0
- package/app/backend/app/services/ticket_service.py +486 -0
- package/app/backend/app/services/udar_planner_service.py +1007 -0
- package/app/backend/app/services/webhook_service.py +126 -0
- package/app/backend/app/services/workspace_service.py +465 -0
- package/app/backend/app/services/worktree_file_service.py +92 -0
- package/app/backend/app/services/worktree_validator.py +213 -0
- package/app/backend/app/sqlite_kv.py +278 -0
- package/app/backend/app/state_machine.py +128 -0
- package/app/backend/app/templates/__init__.py +5 -0
- package/app/backend/app/templates/registry.py +243 -0
- package/app/backend/app/utils/__init__.py +5 -0
- package/app/backend/app/utils/artifact_reader.py +87 -0
- package/app/backend/app/utils/circuit_breaker.py +229 -0
- package/app/backend/app/utils/db_retry.py +136 -0
- package/app/backend/app/utils/ignored_fields.py +123 -0
- package/app/backend/app/utils/validators.py +54 -0
- package/app/backend/app/websocket/__init__.py +5 -0
- package/app/backend/app/websocket/manager.py +179 -0
- package/app/backend/app/websocket/state_tracker.py +113 -0
- package/app/backend/app/worker.py +3190 -0
- package/app/backend/calculator_tickets.json +40 -0
- package/app/backend/canary_tests.sh +591 -0
- package/app/backend/celerybeat-schedule +0 -0
- package/app/backend/celerybeat-schedule-shm +0 -0
- package/app/backend/celerybeat-schedule-wal +0 -0
- package/app/backend/logs/.gitkeep +3 -0
- package/app/backend/multiplication_division_implementation_tickets.json +55 -0
- package/app/backend/multiplication_division_tickets.json +42 -0
- package/app/backend/pyproject.toml +45 -0
- package/app/backend/requirements-dev.txt +8 -0
- package/app/backend/requirements.txt +20 -0
- package/app/backend/run.sh +30 -0
- package/app/backend/run_with_logs.sh +10 -0
- package/app/backend/scientific_calculator_tickets.json +40 -0
- package/app/backend/scripts/extract_openapi.py +21 -0
- package/app/backend/scripts/seed_demo.py +187 -0
- package/app/backend/setup_demo_review.py +302 -0
- package/app/backend/test_actual_parse.py +41 -0
- package/app/backend/test_agent_streaming.py +61 -0
- package/app/backend/test_parse.py +51 -0
- package/app/backend/test_streaming.py +51 -0
- package/app/backend/test_subprocess_streaming.py +50 -0
- package/app/backend/tests/__init__.py +1 -0
- package/app/backend/tests/conftest.py +46 -0
- package/app/backend/tests/test_auth.py +341 -0
- package/app/backend/tests/test_autonomy_service.py +391 -0
- package/app/backend/tests/test_cleanup_service_safety.py +417 -0
- package/app/backend/tests/test_middleware.py +279 -0
- package/app/backend/tests/test_planner_providers.py +290 -0
- package/app/backend/tests/test_planner_unblock.py +183 -0
- package/app/backend/tests/test_revision_invariants.py +618 -0
- package/app/backend/tests/test_sqlite_kv.py +290 -0
- package/app/backend/tests/test_sqlite_worker.py +353 -0
- package/app/backend/tests/test_task_dispatch.py +100 -0
- package/app/backend/tests/test_ticket_validation.py +304 -0
- package/app/backend/tests/test_udar_agent.py +693 -0
- package/app/backend/tests/test_webhook_service.py +184 -0
- package/app/backend/tickets_output.json +59 -0
- package/app/backend/user_management_tickets.json +50 -0
- package/app/backend/uvicorn.log +0 -0
- package/app/draft.yaml +313 -0
- package/app/frontend/dist/assets/index-LcjCczu5.js +155 -0
- package/app/frontend/dist/assets/index-_FP_279e.css +1 -0
- package/app/frontend/dist/index.html +14 -0
- package/app/frontend/dist/vite.svg +1 -0
- package/app/frontend/package.json +101 -0
- package/bin/cli.js +527 -0
- package/package.json +37 -0
|
@@ -0,0 +1,557 @@
|
|
|
1
|
+
"""Secure, metadata-first repository context gathering.
|
|
2
|
+
|
|
3
|
+
This module provides safe context gathering for LLM-based ticket generation.
|
|
4
|
+
It follows these principles:
|
|
5
|
+
|
|
6
|
+
1. METADATA-FIRST: Returns file paths, line counts, and small excerpts only.
|
|
7
|
+
Never returns full file contents except for small, capped excerpts.
|
|
8
|
+
|
|
9
|
+
2. STRICT CAPS: Hard limits on files scanned, bytes read, and excerpt sizes
|
|
10
|
+
to prevent runaway prompts and cost explosions.
|
|
11
|
+
|
|
12
|
+
3. SECURITY: Excludes sensitive paths (.env, keys, secrets) and skips symlinks
|
|
13
|
+
to prevent secret leakage to third-party LLM providers.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import fnmatch
|
|
17
|
+
import logging
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
|
|
26
|
+
class FileMetadata:
|
|
27
|
+
"""Metadata about a single file."""
|
|
28
|
+
|
|
29
|
+
path: str # Relative to repo root
|
|
30
|
+
line_count: int
|
|
31
|
+
language: str | None # Detected from extension
|
|
32
|
+
size_bytes: int
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class GatherStats:
    """Counters collected while a repository is being scanned."""

    files_scanned: int = 0
    bytes_read: int = 0
    skipped_excluded: int = 0
    skipped_symlinks: int = 0
    skipped_binary: int = 0
    skipped_too_large: int = 0
    todo_lines_found: int = 0
    # Observability maps. Each instance gets its own fresh dict:
    #   excluded_by_pattern: exclusion pattern -> number of paths it rejected
    #   extensions_scanned:  file extension    -> number of files seen
    excluded_by_pattern: dict[str, int] = field(default_factory=dict)
    extensions_scanned: dict[str, int] = field(default_factory=dict)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class RepoContext:
    """Repository summary assembled for inclusion in an LLM prompt."""

    # Metadata records for the files that were indexed.
    file_tree: list[FileMetadata]
    # Detected project flavour, e.g. "python", "node", "mixed", "unknown".
    project_type: str
    # Number of TODO-style comment lines found.
    todo_count: int
    # Sample TODO excerpts (max 50, each max 200 chars).
    todo_excerpts: list[str]
    # Capped README excerpt if enabled, else None.
    readme_excerpt: str | None
    # Counters describing the scan that produced this context.
    stats: GatherStats = field(default_factory=GatherStats)

    def to_prompt_string(self) -> str:
        """Render the gathered context as newline-separated prompt text.

        The output always starts with the project type and ends with the scan
        statistics. Directory/file-type summaries, the README excerpt and the
        TODO samples are included only when the corresponding data is present.
        """
        lines = [f"Project type: {self.project_type}"]

        if self.file_tree:
            per_dir: dict[str, int] = {}
            per_lang: dict[str, int] = {}
            for meta in self.file_tree:
                segments = meta.path.split("/")
                # Files sitting directly in the root have no top-level directory.
                if len(segments) > 1:
                    per_dir[segments[0]] = per_dir.get(segments[0], 0) + 1
                if meta.language:
                    per_lang[meta.language] = per_lang.get(meta.language, 0) + 1

            if per_dir:
                # Ten most populated top-level directories.
                ranked_dirs = sorted(per_dir.items(), key=lambda pair: -pair[1])[:10]
                dir_summary = ", ".join(
                    f"{name} ({count} files)" for name, count in ranked_dirs
                )
                lines.append(f"Top directories: {dir_summary}")

            if per_lang:
                # Eight most common detected languages.
                ranked_langs = sorted(per_lang.items(), key=lambda pair: -pair[1])[:8]
                lang_summary = ", ".join(
                    f"{name} ({count})" for name, count in ranked_langs
                )
                lines.append(f"File types: {lang_summary}")

            lines.append(f"Total files indexed: {len(self.file_tree)}")

        if self.readme_excerpt:
            lines.append(f"README excerpt:\n{self.readme_excerpt}")

        if self.todo_count > 0:
            lines.append(f"TODO/FIXME comments found: {self.todo_count}")
            if self.todo_excerpts:
                lines.append("Sample TODOs:")
                # Only the first ten excerpts are shown in the prompt.
                lines.extend(f" - {excerpt}" for excerpt in self.todo_excerpts[:10])

        scan = self.stats
        lines.append(
            f"Scan stats: {scan.files_scanned} files scanned, "
            f"{scan.skipped_excluded} excluded, "
            f"{scan.skipped_symlinks} symlinks skipped"
        )

        return "\n".join(lines)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class ContextGatherer:
|
|
125
|
+
"""Metadata-first repo context with strict caps and exclusions.
|
|
126
|
+
|
|
127
|
+
This class gathers repository context safely for LLM consumption:
|
|
128
|
+
- Never reads full file contents (only line counts and small excerpts)
|
|
129
|
+
- Excludes sensitive files (secrets, env, keys)
|
|
130
|
+
- Enforces hard caps on all operations
|
|
131
|
+
- Skips symlinks entirely
|
|
132
|
+
"""
|
|
133
|
+
|
|
134
|
+
# Hard caps - non-negotiable limits
|
|
135
|
+
MAX_FILES_SCANNED = 500
|
|
136
|
+
MAX_BYTES_TOTAL = 50_000 # ~50KB of excerpts
|
|
137
|
+
MAX_TODO_LINES = 50
|
|
138
|
+
MAX_EXCERPT_CHARS = 200
|
|
139
|
+
MAX_README_CHARS = 500
|
|
140
|
+
MAX_FILE_SIZE_FOR_SCAN = 100_000 # Skip files > 100KB for TODO scanning
|
|
141
|
+
|
|
142
|
+
# Sensitive path patterns to exclude (glob-style)
|
|
143
|
+
EXCLUDED_PATTERNS = [
|
|
144
|
+
# Environment and secrets
|
|
145
|
+
".env",
|
|
146
|
+
".env.*",
|
|
147
|
+
"*.env",
|
|
148
|
+
".envrc",
|
|
149
|
+
"secrets.*",
|
|
150
|
+
"*secret*",
|
|
151
|
+
"*password*",
|
|
152
|
+
# Keys and certificates
|
|
153
|
+
"*.pem",
|
|
154
|
+
"*.key",
|
|
155
|
+
"*.crt",
|
|
156
|
+
"*.p12",
|
|
157
|
+
"*.pfx",
|
|
158
|
+
"id_rsa*",
|
|
159
|
+
"id_ed25519*",
|
|
160
|
+
"*.pub",
|
|
161
|
+
# Config files that might contain secrets
|
|
162
|
+
"credentials*",
|
|
163
|
+
"*_credentials*",
|
|
164
|
+
"auth.json",
|
|
165
|
+
"config.local.*",
|
|
166
|
+
# Package directories
|
|
167
|
+
"node_modules/",
|
|
168
|
+
"venv/",
|
|
169
|
+
".venv/",
|
|
170
|
+
"__pycache__/",
|
|
171
|
+
".git/",
|
|
172
|
+
".svn/",
|
|
173
|
+
".hg/",
|
|
174
|
+
# Build artifacts
|
|
175
|
+
"dist/",
|
|
176
|
+
"build/",
|
|
177
|
+
"*.pyc",
|
|
178
|
+
"*.pyo",
|
|
179
|
+
"*.so",
|
|
180
|
+
"*.dylib",
|
|
181
|
+
"*.dll",
|
|
182
|
+
# Logs and data
|
|
183
|
+
"*.log",
|
|
184
|
+
"*.sqlite",
|
|
185
|
+
"*.db",
|
|
186
|
+
# IDE and editor
|
|
187
|
+
".idea/",
|
|
188
|
+
".vscode/",
|
|
189
|
+
"*.swp",
|
|
190
|
+
"*.swo",
|
|
191
|
+
# Coverage and test artifacts
|
|
192
|
+
"coverage/",
|
|
193
|
+
".coverage",
|
|
194
|
+
"htmlcov/",
|
|
195
|
+
".pytest_cache/",
|
|
196
|
+
".mypy_cache/",
|
|
197
|
+
# Binary files
|
|
198
|
+
"*.jpg",
|
|
199
|
+
"*.jpeg",
|
|
200
|
+
"*.png",
|
|
201
|
+
"*.gif",
|
|
202
|
+
"*.ico",
|
|
203
|
+
"*.pdf",
|
|
204
|
+
"*.zip",
|
|
205
|
+
"*.tar",
|
|
206
|
+
"*.gz",
|
|
207
|
+
"*.woff",
|
|
208
|
+
"*.woff2",
|
|
209
|
+
"*.ttf",
|
|
210
|
+
"*.eot",
|
|
211
|
+
]
|
|
212
|
+
|
|
213
|
+
# Extension to language mapping
|
|
214
|
+
EXTENSION_LANGUAGES = {
|
|
215
|
+
".py": "python",
|
|
216
|
+
".js": "javascript",
|
|
217
|
+
".ts": "typescript",
|
|
218
|
+
".tsx": "typescript",
|
|
219
|
+
".jsx": "javascript",
|
|
220
|
+
".go": "go",
|
|
221
|
+
".rs": "rust",
|
|
222
|
+
".java": "java",
|
|
223
|
+
".kt": "kotlin",
|
|
224
|
+
".rb": "ruby",
|
|
225
|
+
".php": "php",
|
|
226
|
+
".c": "c",
|
|
227
|
+
".cpp": "cpp",
|
|
228
|
+
".h": "c",
|
|
229
|
+
".hpp": "cpp",
|
|
230
|
+
".cs": "csharp",
|
|
231
|
+
".swift": "swift",
|
|
232
|
+
".sh": "shell",
|
|
233
|
+
".bash": "shell",
|
|
234
|
+
".zsh": "shell",
|
|
235
|
+
".sql": "sql",
|
|
236
|
+
".md": "markdown",
|
|
237
|
+
".yaml": "yaml",
|
|
238
|
+
".yml": "yaml",
|
|
239
|
+
".json": "json",
|
|
240
|
+
".toml": "toml",
|
|
241
|
+
".xml": "xml",
|
|
242
|
+
".html": "html",
|
|
243
|
+
".css": "css",
|
|
244
|
+
".scss": "scss",
|
|
245
|
+
".less": "less",
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
def __init__(
|
|
249
|
+
self,
|
|
250
|
+
max_files: int | None = None,
|
|
251
|
+
max_bytes: int | None = None,
|
|
252
|
+
max_todos: int | None = None,
|
|
253
|
+
additional_exclusions: list[str] | None = None,
|
|
254
|
+
):
|
|
255
|
+
"""Initialize the context gatherer.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
max_files: Override MAX_FILES_SCANNED (can only decrease).
|
|
259
|
+
max_bytes: Override MAX_BYTES_TOTAL (can only decrease).
|
|
260
|
+
max_todos: Override MAX_TODO_LINES (can only decrease).
|
|
261
|
+
additional_exclusions: Additional glob patterns to exclude.
|
|
262
|
+
"""
|
|
263
|
+
self.max_files = min(
|
|
264
|
+
max_files or self.MAX_FILES_SCANNED, self.MAX_FILES_SCANNED
|
|
265
|
+
)
|
|
266
|
+
self.max_bytes = min(max_bytes or self.MAX_BYTES_TOTAL, self.MAX_BYTES_TOTAL)
|
|
267
|
+
self.max_todos = min(max_todos or self.MAX_TODO_LINES, self.MAX_TODO_LINES)
|
|
268
|
+
|
|
269
|
+
self.exclusions = list(self.EXCLUDED_PATTERNS)
|
|
270
|
+
if additional_exclusions:
|
|
271
|
+
self.exclusions.extend(additional_exclusions)
|
|
272
|
+
|
|
273
|
+
def gather(
|
|
274
|
+
self,
|
|
275
|
+
repo_root: Path | str,
|
|
276
|
+
include_readme_excerpt: bool = False,
|
|
277
|
+
) -> RepoContext:
|
|
278
|
+
"""Gather repository context.
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
repo_root: Path to the repository root.
|
|
282
|
+
include_readme_excerpt: Whether to include README excerpt (default OFF).
|
|
283
|
+
|
|
284
|
+
Returns:
|
|
285
|
+
RepoContext with metadata about the repository.
|
|
286
|
+
"""
|
|
287
|
+
repo_root = Path(repo_root).resolve()
|
|
288
|
+
if not repo_root.exists():
|
|
289
|
+
logger.warning(f"Repository root does not exist: {repo_root}")
|
|
290
|
+
return RepoContext(
|
|
291
|
+
file_tree=[],
|
|
292
|
+
project_type="unknown",
|
|
293
|
+
todo_count=0,
|
|
294
|
+
todo_excerpts=[],
|
|
295
|
+
readme_excerpt=None,
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
stats = GatherStats()
|
|
299
|
+
file_tree: list[FileMetadata] = []
|
|
300
|
+
todo_excerpts: list[str] = []
|
|
301
|
+
readme_excerpt: str | None = None
|
|
302
|
+
|
|
303
|
+
# Detect project type
|
|
304
|
+
project_type = self._detect_project_type(repo_root)
|
|
305
|
+
|
|
306
|
+
# Scan files
|
|
307
|
+
bytes_read = 0
|
|
308
|
+
for file_path in self._walk_files(repo_root, stats):
|
|
309
|
+
if stats.files_scanned >= self.max_files:
|
|
310
|
+
break
|
|
311
|
+
|
|
312
|
+
try:
|
|
313
|
+
# Get file metadata (no content read yet)
|
|
314
|
+
rel_path = str(file_path.relative_to(repo_root))
|
|
315
|
+
file_size = file_path.stat().st_size
|
|
316
|
+
extension = file_path.suffix.lower()
|
|
317
|
+
language = self.EXTENSION_LANGUAGES.get(extension)
|
|
318
|
+
|
|
319
|
+
# Count lines without reading entire file into memory
|
|
320
|
+
line_count = self._count_lines(file_path)
|
|
321
|
+
if line_count is None:
|
|
322
|
+
stats.skipped_binary += 1
|
|
323
|
+
continue
|
|
324
|
+
|
|
325
|
+
file_tree.append(
|
|
326
|
+
FileMetadata(
|
|
327
|
+
path=rel_path,
|
|
328
|
+
line_count=line_count,
|
|
329
|
+
language=language,
|
|
330
|
+
size_bytes=file_size,
|
|
331
|
+
)
|
|
332
|
+
)
|
|
333
|
+
stats.files_scanned += 1
|
|
334
|
+
|
|
335
|
+
# Scan for TODOs if file is small enough and we haven't hit the cap
|
|
336
|
+
if (
|
|
337
|
+
len(todo_excerpts) < self.max_todos
|
|
338
|
+
and file_size < self.MAX_FILE_SIZE_FOR_SCAN
|
|
339
|
+
and bytes_read < self.max_bytes
|
|
340
|
+
and language
|
|
341
|
+
in ("python", "javascript", "typescript", "go", "rust", "java")
|
|
342
|
+
):
|
|
343
|
+
new_todos, bytes_used = self._extract_todos(
|
|
344
|
+
file_path, rel_path, self.max_todos - len(todo_excerpts)
|
|
345
|
+
)
|
|
346
|
+
todo_excerpts.extend(new_todos)
|
|
347
|
+
bytes_read += bytes_used
|
|
348
|
+
stats.todo_lines_found += len(new_todos)
|
|
349
|
+
|
|
350
|
+
except (OSError, PermissionError) as e:
|
|
351
|
+
logger.debug(f"Failed to read {file_path}: {e}")
|
|
352
|
+
continue
|
|
353
|
+
|
|
354
|
+
# Get README excerpt if requested
|
|
355
|
+
if include_readme_excerpt:
|
|
356
|
+
readme_excerpt = self._get_readme_excerpt(repo_root)
|
|
357
|
+
if readme_excerpt:
|
|
358
|
+
bytes_read += len(readme_excerpt.encode("utf-8", errors="replace"))
|
|
359
|
+
|
|
360
|
+
stats.bytes_read = bytes_read
|
|
361
|
+
|
|
362
|
+
return RepoContext(
|
|
363
|
+
file_tree=file_tree,
|
|
364
|
+
project_type=project_type,
|
|
365
|
+
todo_count=stats.todo_lines_found,
|
|
366
|
+
todo_excerpts=todo_excerpts,
|
|
367
|
+
readme_excerpt=readme_excerpt,
|
|
368
|
+
stats=stats,
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
def _walk_files(self, repo_root: Path, stats: GatherStats):
|
|
372
|
+
"""Walk repository files, respecting exclusions and caps.
|
|
373
|
+
|
|
374
|
+
Yields file paths, updating stats as it goes.
|
|
375
|
+
"""
|
|
376
|
+
for item in repo_root.rglob("*"):
|
|
377
|
+
# Skip directories
|
|
378
|
+
if item.is_dir():
|
|
379
|
+
continue
|
|
380
|
+
|
|
381
|
+
# Skip symlinks entirely (security)
|
|
382
|
+
if item.is_symlink():
|
|
383
|
+
stats.skipped_symlinks += 1
|
|
384
|
+
continue
|
|
385
|
+
|
|
386
|
+
# Check exclusions (returns matching pattern if excluded)
|
|
387
|
+
rel_path = str(item.relative_to(repo_root))
|
|
388
|
+
matched_pattern = self._get_exclusion_match(rel_path, item.name)
|
|
389
|
+
if matched_pattern:
|
|
390
|
+
stats.skipped_excluded += 1
|
|
391
|
+
# Track which patterns are matching (for debugging bad suggestions)
|
|
392
|
+
stats.excluded_by_pattern[matched_pattern] = (
|
|
393
|
+
stats.excluded_by_pattern.get(matched_pattern, 0) + 1
|
|
394
|
+
)
|
|
395
|
+
continue
|
|
396
|
+
|
|
397
|
+
# Track extension for filetype histogram
|
|
398
|
+
ext = item.suffix.lower() or "(no extension)"
|
|
399
|
+
stats.extensions_scanned[ext] = stats.extensions_scanned.get(ext, 0) + 1
|
|
400
|
+
|
|
401
|
+
yield item
|
|
402
|
+
|
|
403
|
+
def _get_exclusion_match(self, rel_path: str, filename: str) -> str | None:
|
|
404
|
+
"""Check if a path matches any exclusion pattern.
|
|
405
|
+
|
|
406
|
+
Returns the matching pattern if excluded, None otherwise.
|
|
407
|
+
"""
|
|
408
|
+
for pattern in self.exclusions:
|
|
409
|
+
# Check against full relative path
|
|
410
|
+
if fnmatch.fnmatch(rel_path, pattern):
|
|
411
|
+
return pattern
|
|
412
|
+
if fnmatch.fnmatch(rel_path, f"**/{pattern}"):
|
|
413
|
+
return pattern
|
|
414
|
+
# Check against filename only
|
|
415
|
+
if fnmatch.fnmatch(filename, pattern):
|
|
416
|
+
return pattern
|
|
417
|
+
# Check if path contains the pattern as a directory
|
|
418
|
+
if pattern.endswith("/") and pattern[:-1] in rel_path.split("/"):
|
|
419
|
+
return pattern
|
|
420
|
+
return None
|
|
421
|
+
|
|
422
|
+
def _is_excluded(self, rel_path: str, filename: str) -> bool:
|
|
423
|
+
"""Check if a path matches any exclusion pattern."""
|
|
424
|
+
return self._get_exclusion_match(rel_path, filename) is not None
|
|
425
|
+
|
|
426
|
+
def _count_lines(self, file_path: Path) -> int | None:
|
|
427
|
+
"""Count lines in a file without loading it all into memory.
|
|
428
|
+
|
|
429
|
+
Returns None if the file appears to be binary.
|
|
430
|
+
"""
|
|
431
|
+
try:
|
|
432
|
+
line_count = 0
|
|
433
|
+
with open(file_path, "rb") as f:
|
|
434
|
+
# Read first 8KB to check if binary
|
|
435
|
+
sample = f.read(8192)
|
|
436
|
+
if b"\x00" in sample:
|
|
437
|
+
return None # Binary file
|
|
438
|
+
|
|
439
|
+
# Count newlines in sample
|
|
440
|
+
line_count = sample.count(b"\n")
|
|
441
|
+
|
|
442
|
+
# Continue counting for rest of file
|
|
443
|
+
for chunk in iter(lambda: f.read(65536), b""):
|
|
444
|
+
line_count += chunk.count(b"\n")
|
|
445
|
+
|
|
446
|
+
return line_count
|
|
447
|
+
except Exception:
|
|
448
|
+
return None
|
|
449
|
+
|
|
450
|
+
def _extract_todos(
|
|
451
|
+
self, file_path: Path, rel_path: str, max_count: int
|
|
452
|
+
) -> tuple[list[str], int]:
|
|
453
|
+
"""Extract TODO/FIXME comments from a file.
|
|
454
|
+
|
|
455
|
+
Returns (list of excerpts, bytes read).
|
|
456
|
+
"""
|
|
457
|
+
todos: list[str] = []
|
|
458
|
+
bytes_read = 0
|
|
459
|
+
todo_pattern = re.compile(
|
|
460
|
+
r"#\s*(TODO|FIXME|XXX|HACK)\b[:\s]*(.*)", re.IGNORECASE
|
|
461
|
+
)
|
|
462
|
+
|
|
463
|
+
try:
|
|
464
|
+
with open(file_path, encoding="utf-8", errors="replace") as f:
|
|
465
|
+
for line_num, line in enumerate(f, 1):
|
|
466
|
+
bytes_read += len(line.encode("utf-8", errors="replace"))
|
|
467
|
+
|
|
468
|
+
match = todo_pattern.search(line)
|
|
469
|
+
if match:
|
|
470
|
+
tag = match.group(1).upper()
|
|
471
|
+
message = match.group(2).strip()[: self.MAX_EXCERPT_CHARS]
|
|
472
|
+
# Sanitize - remove any potential secrets
|
|
473
|
+
if not self._looks_like_secret(message):
|
|
474
|
+
excerpt = f"{rel_path}:{line_num} [{tag}] {message}"
|
|
475
|
+
todos.append(excerpt[: self.MAX_EXCERPT_CHARS])
|
|
476
|
+
|
|
477
|
+
if len(todos) >= max_count:
|
|
478
|
+
break
|
|
479
|
+
|
|
480
|
+
# Cap bytes read per file
|
|
481
|
+
if bytes_read > self.MAX_FILE_SIZE_FOR_SCAN:
|
|
482
|
+
break
|
|
483
|
+
|
|
484
|
+
except Exception as e:
|
|
485
|
+
logger.debug(f"Failed to extract TODOs from {file_path}: {e}")
|
|
486
|
+
|
|
487
|
+
return todos, bytes_read
|
|
488
|
+
|
|
489
|
+
def _looks_like_secret(self, text: str) -> bool:
|
|
490
|
+
"""Check if text looks like it might contain a secret."""
|
|
491
|
+
text_lower = text.lower()
|
|
492
|
+
secret_indicators = [
|
|
493
|
+
"password",
|
|
494
|
+
"secret",
|
|
495
|
+
"api_key",
|
|
496
|
+
"apikey",
|
|
497
|
+
"token",
|
|
498
|
+
"credential",
|
|
499
|
+
"auth",
|
|
500
|
+
"bearer",
|
|
501
|
+
"private_key",
|
|
502
|
+
]
|
|
503
|
+
# Check for key=value patterns with these words
|
|
504
|
+
for indicator in secret_indicators:
|
|
505
|
+
if indicator in text_lower and "=" in text:
|
|
506
|
+
return True
|
|
507
|
+
# Check for long hex strings (possible keys/tokens)
|
|
508
|
+
if re.search(r"[a-fA-F0-9]{32,}", text):
|
|
509
|
+
return True
|
|
510
|
+
return False
|
|
511
|
+
|
|
512
|
+
def _detect_project_type(self, repo_root: Path) -> str:
|
|
513
|
+
"""Detect the project type from configuration files."""
|
|
514
|
+
indicators = {
|
|
515
|
+
"python": ["requirements.txt", "pyproject.toml", "setup.py", "Pipfile"],
|
|
516
|
+
"node": ["package.json", "yarn.lock", "pnpm-lock.yaml"],
|
|
517
|
+
"go": ["go.mod", "go.sum"],
|
|
518
|
+
"rust": ["Cargo.toml"],
|
|
519
|
+
"java": ["pom.xml", "build.gradle", "build.gradle.kts"],
|
|
520
|
+
"ruby": ["Gemfile", "Rakefile"],
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
detected = []
|
|
524
|
+
for lang, files in indicators.items():
|
|
525
|
+
for f in files:
|
|
526
|
+
if (repo_root / f).exists():
|
|
527
|
+
detected.append(lang)
|
|
528
|
+
break
|
|
529
|
+
|
|
530
|
+
if not detected:
|
|
531
|
+
return "unknown"
|
|
532
|
+
if len(detected) == 1:
|
|
533
|
+
return detected[0]
|
|
534
|
+
return "mixed"
|
|
535
|
+
|
|
536
|
+
def _get_readme_excerpt(self, repo_root: Path) -> str | None:
|
|
537
|
+
"""Get a capped excerpt from the README file."""
|
|
538
|
+
readme_names = ["README.md", "README.rst", "README.txt", "README"]
|
|
539
|
+
|
|
540
|
+
for name in readme_names:
|
|
541
|
+
readme_path = repo_root / name
|
|
542
|
+
if readme_path.exists() and readme_path.is_file():
|
|
543
|
+
try:
|
|
544
|
+
with open(readme_path, encoding="utf-8", errors="replace") as f:
|
|
545
|
+
content = f.read(self.MAX_README_CHARS + 100)
|
|
546
|
+
if len(content) > self.MAX_README_CHARS:
|
|
547
|
+
# Truncate at word boundary
|
|
548
|
+
content = content[: self.MAX_README_CHARS]
|
|
549
|
+
last_space = content.rfind(" ")
|
|
550
|
+
if last_space > self.MAX_README_CHARS - 50:
|
|
551
|
+
content = content[:last_space]
|
|
552
|
+
content += "..."
|
|
553
|
+
return content
|
|
554
|
+
except Exception as e:
|
|
555
|
+
logger.debug(f"Failed to read README {readme_path}: {e}")
|
|
556
|
+
|
|
557
|
+
return None
|