draft-board 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/app/backend/.env.example +9 -0
  2. package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_patch.txt +195 -0
  3. package/app/backend/.smartkanban/evidence/8b383839-cbec-45af-86ee-c7708d075cbe/bddf2ed5-2e21-4d46-a62b-10b87f1642a6_stat.txt +6 -0
  4. package/app/backend/CURL_EXAMPLES.md +335 -0
  5. package/app/backend/ENV_SETUP.md +65 -0
  6. package/app/backend/alembic/env.py +71 -0
  7. package/app/backend/alembic/script.py.mako +28 -0
  8. package/app/backend/alembic/versions/001_initial_schema.py +104 -0
  9. package/app/backend/alembic/versions/002_add_jobs_table.py +52 -0
  10. package/app/backend/alembic/versions/003_add_workspace_table.py +48 -0
  11. package/app/backend/alembic/versions/004_add_evidence_table.py +56 -0
  12. package/app/backend/alembic/versions/005_add_verification_commands.py +32 -0
  13. package/app/backend/alembic/versions/006_add_planner_lock_table.py +39 -0
  14. package/app/backend/alembic/versions/007_add_revision_review_tables.py +126 -0
  15. package/app/backend/alembic/versions/008_add_revision_idempotency_and_traceability.py +52 -0
  16. package/app/backend/alembic/versions/009_add_job_health_fields.py +46 -0
  17. package/app/backend/alembic/versions/010_add_review_comment_line_content.py +36 -0
  18. package/app/backend/alembic/versions/011_add_analysis_cache.py +47 -0
  19. package/app/backend/alembic/versions/012_add_boards_table.py +102 -0
  20. package/app/backend/alembic/versions/013_add_ticket_blocking.py +45 -0
  21. package/app/backend/alembic/versions/014_add_agent_sessions.py +220 -0
  22. package/app/backend/alembic/versions/015_add_ticket_sort_order.py +33 -0
  23. package/app/backend/alembic/versions/03220f0b93ae_add_pr_fields_to_ticket.py +49 -0
  24. package/app/backend/alembic/versions/0c2d89fff3b1_seed_board_configs_from_yaml.py +206 -0
  25. package/app/backend/alembic/versions/3348e5cf54c1_add_merge_checklist_table.py +67 -0
  26. package/app/backend/alembic/versions/357c780ee445_add_goal_status.py +34 -0
  27. package/app/backend/alembic/versions/553340b7e26c_add_autonomy_fields_to_goal.py +65 -0
  28. package/app/backend/alembic/versions/774dc335c679_merge_migration_heads.py +23 -0
  29. package/app/backend/alembic/versions/7b307e847cbd_merge_heads.py +23 -0
  30. package/app/backend/alembic/versions/82ecd978cc70_add_missing_indexes.py +48 -0
  31. package/app/backend/alembic/versions/8ef5054dc280_add_normalized_log_entries.py +173 -0
  32. package/app/backend/alembic/versions/8f3e2bd8ea3b_merge_migration_heads.py +23 -0
  33. package/app/backend/alembic/versions/9d17f0698d3b_add_config_column_to_boards_table.py +30 -0
  34. package/app/backend/alembic/versions/add_agent_conversation_history.py +72 -0
  35. package/app/backend/alembic/versions/add_job_variant.py +34 -0
  36. package/app/backend/alembic/versions/add_performance_indexes.py +95 -0
  37. package/app/backend/alembic/versions/add_repos_and_board_repos.py +174 -0
  38. package/app/backend/alembic/versions/add_session_id_to_jobs.py +27 -0
  39. package/app/backend/alembic/versions/add_sqlite_backend_tables.py +104 -0
  40. package/app/backend/alembic/versions/b10fb0b62240_add_diff_content_to_revisions.py +34 -0
  41. package/app/backend/alembic.ini +89 -0
  42. package/app/backend/app/__init__.py +3 -0
  43. package/app/backend/app/data_dir.py +85 -0
  44. package/app/backend/app/database.py +70 -0
  45. package/app/backend/app/database_sync.py +64 -0
  46. package/app/backend/app/dependencies/__init__.py +5 -0
  47. package/app/backend/app/dependencies/auth.py +80 -0
  48. package/app/backend/app/dependencies.py +43 -0
  49. package/app/backend/app/exceptions.py +178 -0
  50. package/app/backend/app/executors/__init__.py +1 -0
  51. package/app/backend/app/executors/adapters/__init__.py +1 -0
  52. package/app/backend/app/executors/adapters/aider.py +152 -0
  53. package/app/backend/app/executors/adapters/amazon_q.py +103 -0
  54. package/app/backend/app/executors/adapters/amp.py +123 -0
  55. package/app/backend/app/executors/adapters/claude.py +177 -0
  56. package/app/backend/app/executors/adapters/cline.py +127 -0
  57. package/app/backend/app/executors/adapters/codex.py +167 -0
  58. package/app/backend/app/executors/adapters/copilot.py +202 -0
  59. package/app/backend/app/executors/adapters/cursor.py +87 -0
  60. package/app/backend/app/executors/adapters/droid.py +123 -0
  61. package/app/backend/app/executors/adapters/gemini.py +132 -0
  62. package/app/backend/app/executors/adapters/goose.py +131 -0
  63. package/app/backend/app/executors/adapters/opencode.py +123 -0
  64. package/app/backend/app/executors/adapters/qwen.py +123 -0
  65. package/app/backend/app/executors/plugins/__init__.py +1 -0
  66. package/app/backend/app/executors/registry.py +202 -0
  67. package/app/backend/app/executors/spec.py +226 -0
  68. package/app/backend/app/main.py +486 -0
  69. package/app/backend/app/middleware/__init__.py +13 -0
  70. package/app/backend/app/middleware/idempotency.py +426 -0
  71. package/app/backend/app/middleware/rate_limit.py +312 -0
  72. package/app/backend/app/middleware/security_headers.py +43 -0
  73. package/app/backend/app/middleware/timeout.py +37 -0
  74. package/app/backend/app/models/__init__.py +56 -0
  75. package/app/backend/app/models/agent_conversation_history.py +56 -0
  76. package/app/backend/app/models/agent_session.py +127 -0
  77. package/app/backend/app/models/analysis_cache.py +49 -0
  78. package/app/backend/app/models/base.py +9 -0
  79. package/app/backend/app/models/board.py +79 -0
  80. package/app/backend/app/models/board_repo.py +68 -0
  81. package/app/backend/app/models/cost_budget.py +42 -0
  82. package/app/backend/app/models/enums.py +40 -0
  83. package/app/backend/app/models/evidence.py +132 -0
  84. package/app/backend/app/models/goal.py +102 -0
  85. package/app/backend/app/models/idempotency_entry.py +30 -0
  86. package/app/backend/app/models/job.py +163 -0
  87. package/app/backend/app/models/job_queue.py +39 -0
  88. package/app/backend/app/models/kv_store.py +28 -0
  89. package/app/backend/app/models/merge_checklist.py +87 -0
  90. package/app/backend/app/models/normalized_log.py +100 -0
  91. package/app/backend/app/models/planner_lock.py +43 -0
  92. package/app/backend/app/models/rate_limit_entry.py +25 -0
  93. package/app/backend/app/models/repo.py +66 -0
  94. package/app/backend/app/models/review_comment.py +91 -0
  95. package/app/backend/app/models/review_summary.py +69 -0
  96. package/app/backend/app/models/revision.py +130 -0
  97. package/app/backend/app/models/ticket.py +223 -0
  98. package/app/backend/app/models/ticket_event.py +83 -0
  99. package/app/backend/app/models/user.py +47 -0
  100. package/app/backend/app/models/workspace.py +71 -0
  101. package/app/backend/app/redis_client.py +119 -0
  102. package/app/backend/app/routers/__init__.py +29 -0
  103. package/app/backend/app/routers/agents.py +296 -0
  104. package/app/backend/app/routers/auth.py +94 -0
  105. package/app/backend/app/routers/board.py +885 -0
  106. package/app/backend/app/routers/dashboard.py +351 -0
  107. package/app/backend/app/routers/debug.py +528 -0
  108. package/app/backend/app/routers/evidence.py +96 -0
  109. package/app/backend/app/routers/executors.py +324 -0
  110. package/app/backend/app/routers/goals.py +574 -0
  111. package/app/backend/app/routers/jobs.py +448 -0
  112. package/app/backend/app/routers/maintenance.py +172 -0
  113. package/app/backend/app/routers/merge.py +360 -0
  114. package/app/backend/app/routers/planner.py +537 -0
  115. package/app/backend/app/routers/pull_requests.py +382 -0
  116. package/app/backend/app/routers/repos.py +263 -0
  117. package/app/backend/app/routers/revisions.py +939 -0
  118. package/app/backend/app/routers/settings.py +267 -0
  119. package/app/backend/app/routers/tickets.py +2003 -0
  120. package/app/backend/app/routers/webhooks.py +143 -0
  121. package/app/backend/app/routers/websocket.py +249 -0
  122. package/app/backend/app/schemas/__init__.py +109 -0
  123. package/app/backend/app/schemas/board.py +87 -0
  124. package/app/backend/app/schemas/common.py +33 -0
  125. package/app/backend/app/schemas/evidence.py +87 -0
  126. package/app/backend/app/schemas/goal.py +90 -0
  127. package/app/backend/app/schemas/job.py +97 -0
  128. package/app/backend/app/schemas/merge.py +139 -0
  129. package/app/backend/app/schemas/planner.py +500 -0
  130. package/app/backend/app/schemas/repo.py +187 -0
  131. package/app/backend/app/schemas/review.py +137 -0
  132. package/app/backend/app/schemas/revision.py +114 -0
  133. package/app/backend/app/schemas/ticket.py +238 -0
  134. package/app/backend/app/schemas/ticket_event.py +72 -0
  135. package/app/backend/app/schemas/workspace.py +19 -0
  136. package/app/backend/app/services/__init__.py +31 -0
  137. package/app/backend/app/services/agent_memory_service.py +223 -0
  138. package/app/backend/app/services/agent_registry.py +346 -0
  139. package/app/backend/app/services/agent_session_manager.py +318 -0
  140. package/app/backend/app/services/agent_session_service.py +219 -0
  141. package/app/backend/app/services/agent_tools.py +379 -0
  142. package/app/backend/app/services/auth_service.py +98 -0
  143. package/app/backend/app/services/autonomy_service.py +380 -0
  144. package/app/backend/app/services/board_repo_service.py +201 -0
  145. package/app/backend/app/services/board_service.py +326 -0
  146. package/app/backend/app/services/cleanup_service.py +1085 -0
  147. package/app/backend/app/services/config_service.py +908 -0
  148. package/app/backend/app/services/context_gatherer.py +557 -0
  149. package/app/backend/app/services/cost_tracking_service.py +293 -0
  150. package/app/backend/app/services/cursor_log_normalizer.py +536 -0
  151. package/app/backend/app/services/delivery_pipeline.py +440 -0
  152. package/app/backend/app/services/executor_service.py +634 -0
  153. package/app/backend/app/services/git_host/__init__.py +11 -0
  154. package/app/backend/app/services/git_host/factory.py +87 -0
  155. package/app/backend/app/services/git_host/github.py +270 -0
  156. package/app/backend/app/services/git_host/gitlab.py +194 -0
  157. package/app/backend/app/services/git_host/protocol.py +75 -0
  158. package/app/backend/app/services/git_merge_simple.py +346 -0
  159. package/app/backend/app/services/git_ops.py +384 -0
  160. package/app/backend/app/services/github_service.py +233 -0
  161. package/app/backend/app/services/goal_service.py +113 -0
  162. package/app/backend/app/services/job_service.py +423 -0
  163. package/app/backend/app/services/job_watchdog_service.py +424 -0
  164. package/app/backend/app/services/langchain_adapter.py +122 -0
  165. package/app/backend/app/services/llm_provider_clients.py +351 -0
  166. package/app/backend/app/services/llm_service.py +285 -0
  167. package/app/backend/app/services/log_normalizer.py +342 -0
  168. package/app/backend/app/services/log_stream_service.py +276 -0
  169. package/app/backend/app/services/merge_checklist_service.py +264 -0
  170. package/app/backend/app/services/merge_service.py +784 -0
  171. package/app/backend/app/services/orchestrator_log.py +84 -0
  172. package/app/backend/app/services/planner_service.py +1662 -0
  173. package/app/backend/app/services/planner_tick_sync.py +1040 -0
  174. package/app/backend/app/services/queued_message_service.py +156 -0
  175. package/app/backend/app/services/reliability_wrapper.py +389 -0
  176. package/app/backend/app/services/repo_discovery_service.py +318 -0
  177. package/app/backend/app/services/review_service.py +334 -0
  178. package/app/backend/app/services/revision_service.py +389 -0
  179. package/app/backend/app/services/safe_autopilot.py +510 -0
  180. package/app/backend/app/services/sqlite_worker.py +372 -0
  181. package/app/backend/app/services/task_dispatch.py +135 -0
  182. package/app/backend/app/services/ticket_generation_service.py +1781 -0
  183. package/app/backend/app/services/ticket_service.py +486 -0
  184. package/app/backend/app/services/udar_planner_service.py +1007 -0
  185. package/app/backend/app/services/webhook_service.py +126 -0
  186. package/app/backend/app/services/workspace_service.py +465 -0
  187. package/app/backend/app/services/worktree_file_service.py +92 -0
  188. package/app/backend/app/services/worktree_validator.py +213 -0
  189. package/app/backend/app/sqlite_kv.py +278 -0
  190. package/app/backend/app/state_machine.py +128 -0
  191. package/app/backend/app/templates/__init__.py +5 -0
  192. package/app/backend/app/templates/registry.py +243 -0
  193. package/app/backend/app/utils/__init__.py +5 -0
  194. package/app/backend/app/utils/artifact_reader.py +87 -0
  195. package/app/backend/app/utils/circuit_breaker.py +229 -0
  196. package/app/backend/app/utils/db_retry.py +136 -0
  197. package/app/backend/app/utils/ignored_fields.py +123 -0
  198. package/app/backend/app/utils/validators.py +54 -0
  199. package/app/backend/app/websocket/__init__.py +5 -0
  200. package/app/backend/app/websocket/manager.py +179 -0
  201. package/app/backend/app/websocket/state_tracker.py +113 -0
  202. package/app/backend/app/worker.py +3190 -0
  203. package/app/backend/calculator_tickets.json +40 -0
  204. package/app/backend/canary_tests.sh +591 -0
  205. package/app/backend/celerybeat-schedule +0 -0
  206. package/app/backend/celerybeat-schedule-shm +0 -0
  207. package/app/backend/celerybeat-schedule-wal +0 -0
  208. package/app/backend/logs/.gitkeep +3 -0
  209. package/app/backend/multiplication_division_implementation_tickets.json +55 -0
  210. package/app/backend/multiplication_division_tickets.json +42 -0
  211. package/app/backend/pyproject.toml +45 -0
  212. package/app/backend/requirements-dev.txt +8 -0
  213. package/app/backend/requirements.txt +20 -0
  214. package/app/backend/run.sh +30 -0
  215. package/app/backend/run_with_logs.sh +10 -0
  216. package/app/backend/scientific_calculator_tickets.json +40 -0
  217. package/app/backend/scripts/extract_openapi.py +21 -0
  218. package/app/backend/scripts/seed_demo.py +187 -0
  219. package/app/backend/setup_demo_review.py +302 -0
  220. package/app/backend/test_actual_parse.py +41 -0
  221. package/app/backend/test_agent_streaming.py +61 -0
  222. package/app/backend/test_parse.py +51 -0
  223. package/app/backend/test_streaming.py +51 -0
  224. package/app/backend/test_subprocess_streaming.py +50 -0
  225. package/app/backend/tests/__init__.py +1 -0
  226. package/app/backend/tests/conftest.py +46 -0
  227. package/app/backend/tests/test_auth.py +341 -0
  228. package/app/backend/tests/test_autonomy_service.py +391 -0
  229. package/app/backend/tests/test_cleanup_service_safety.py +417 -0
  230. package/app/backend/tests/test_middleware.py +279 -0
  231. package/app/backend/tests/test_planner_providers.py +290 -0
  232. package/app/backend/tests/test_planner_unblock.py +183 -0
  233. package/app/backend/tests/test_revision_invariants.py +618 -0
  234. package/app/backend/tests/test_sqlite_kv.py +290 -0
  235. package/app/backend/tests/test_sqlite_worker.py +353 -0
  236. package/app/backend/tests/test_task_dispatch.py +100 -0
  237. package/app/backend/tests/test_ticket_validation.py +304 -0
  238. package/app/backend/tests/test_udar_agent.py +693 -0
  239. package/app/backend/tests/test_webhook_service.py +184 -0
  240. package/app/backend/tickets_output.json +59 -0
  241. package/app/backend/user_management_tickets.json +50 -0
  242. package/app/backend/uvicorn.log +0 -0
  243. package/app/draft.yaml +313 -0
  244. package/app/frontend/dist/assets/index-LcjCczu5.js +155 -0
  245. package/app/frontend/dist/assets/index-_FP_279e.css +1 -0
  246. package/app/frontend/dist/index.html +14 -0
  247. package/app/frontend/dist/vite.svg +1 -0
  248. package/app/frontend/package.json +101 -0
  249. package/bin/cli.js +527 -0
  250. package/package.json +37 -0
@@ -0,0 +1,557 @@
1
+ """Secure, metadata-first repository context gathering.
2
+
3
+ This module provides safe context gathering for LLM-based ticket generation.
4
+ It follows these principles:
5
+
6
+ 1. METADATA-FIRST: Returns file paths, line counts, and small excerpts only.
7
+ Never returns full file contents except for small, capped excerpts.
8
+
9
+ 2. STRICT CAPS: Hard limits on files scanned, bytes read, and excerpt sizes
10
+ to prevent runaway prompts and cost explosions.
11
+
12
+ 3. SECURITY: Excludes sensitive paths (.env, keys, secrets) and skips symlinks
13
+ to prevent secret leakage to third-party LLM providers.
14
+ """
15
+
16
+ import fnmatch
17
+ import logging
18
+ import re
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ @dataclass
26
+ class FileMetadata:
27
+ """Metadata about a single file."""
28
+
29
+ path: str # Relative to repo root
30
+ line_count: int
31
+ language: str | None # Detected from extension
32
+ size_bytes: int
33
+
34
+
35
@dataclass
class GatherStats:
    """Counters collected while a repository is being scanned.

    Attributes:
        files_scanned: Files that were indexed into the file tree.
        bytes_read: Bytes consumed while extracting excerpts.
        skipped_excluded: Files dropped because an exclusion pattern matched.
        skipped_symlinks: Symbolic links that were not followed.
        skipped_binary: Files whose line count could not be taken
            (binary content or unreadable).
        skipped_too_large: Counter reserved for oversized files.
        todo_lines_found: TODO-style excerpts collected.
        excluded_by_pattern: Observability map of pattern -> hit count.
        extensions_scanned: Observability map of extension -> file count.
    """

    files_scanned: int = 0
    bytes_read: int = 0
    skipped_excluded: int = 0
    skipped_symlinks: int = 0
    skipped_binary: int = 0
    skipped_too_large: int = 0
    todo_lines_found: int = 0
    # Each instance gets its own dict; a shared mutable default would leak
    # counts between scans.
    excluded_by_pattern: dict[str, int] = field(default_factory=dict)
    extensions_scanned: dict[str, int] = field(default_factory=dict)
53
+
54
+
55
@dataclass
class RepoContext:
    """Repository summary that is handed to the LLM.

    Attributes:
        file_tree: Metadata for every indexed file.
        project_type: One of "python", "node", "mixed", "unknown" (or
            another detected ecosystem name).
        todo_count: Number of TODO-style comments that were collected.
        todo_excerpts: Short TODO excerpts (max 50, each max 200 chars).
        readme_excerpt: README excerpt (max 500 chars) when enabled.
        stats: Counters describing the scan that produced this context.
    """

    file_tree: list[FileMetadata]
    project_type: str
    todo_count: int
    todo_excerpts: list[str]
    readme_excerpt: str | None
    stats: GatherStats = field(default_factory=GatherStats)

    def to_prompt_string(self) -> str:
        """Render the context as newline-separated text for an LLM prompt.

        Returns:
            A string containing the project type, a summary of the busiest
            top-level directories and most common languages, the README
            excerpt (if any), up to ten sample TODOs, and the scan counters.
        """
        lines = [f"Project type: {self.project_type}"]

        if self.file_tree:
            per_dir: dict[str, int] = {}
            per_language: dict[str, int] = {}
            for entry in self.file_tree:
                # Only files inside a directory contribute to the directory
                # tally; files at the repository root are left out.
                segments = entry.path.split("/")
                if len(segments) > 1:
                    head = segments[0]
                    per_dir[head] = per_dir.get(head, 0) + 1
                if entry.language:
                    per_language[entry.language] = (
                        per_language.get(entry.language, 0) + 1
                    )

            if per_dir:
                # Stable descending sort: ties keep first-seen order.
                busiest = sorted(
                    per_dir.items(), key=lambda pair: pair[1], reverse=True
                )[:10]
                listing = ", ".join(f"{d} ({c} files)" for d, c in busiest)
                lines.append(f"Top directories: {listing}")

            if per_language:
                commonest = sorted(
                    per_language.items(), key=lambda pair: pair[1], reverse=True
                )[:8]
                listing = ", ".join(f"{e} ({c})" for e, c in commonest)
                lines.append(f"File types: {listing}")

            lines.append(f"Total files indexed: {len(self.file_tree)}")

        if self.readme_excerpt:
            lines.append(f"README excerpt:\n{self.readme_excerpt}")

        if self.todo_count > 0:
            lines.append(f"TODO/FIXME comments found: {self.todo_count}")
            if self.todo_excerpts:
                lines.append("Sample TODOs:")
                lines.extend(f" - {item}" for item in self.todo_excerpts[:10])

        scan = self.stats
        lines.append(
            f"Scan stats: {scan.files_scanned} files scanned, "
            f"{scan.skipped_excluded} excluded, "
            f"{scan.skipped_symlinks} symlinks skipped"
        )

        return "\n".join(lines)
122
+
123
+
124
class ContextGatherer:
    """Metadata-first repo context with strict caps and exclusions.

    This class gathers repository context safely for LLM consumption:
    - Never returns full file contents (only line counts and small excerpts)
    - Excludes sensitive files (secrets, env, keys), case-insensitively
    - Enforces hard caps on all operations
    - Skips symlinks entirely, including a symlinked README
    """

    # Hard caps - non-negotiable limits
    MAX_FILES_SCANNED = 500
    MAX_BYTES_TOTAL = 50_000  # ~50KB of excerpts
    MAX_TODO_LINES = 50
    MAX_EXCERPT_CHARS = 200
    MAX_README_CHARS = 500
    MAX_FILE_SIZE_FOR_SCAN = 100_000  # Skip files > 100KB for TODO scanning

    # Sensitive path patterns to exclude (glob-style)
    EXCLUDED_PATTERNS = [
        # Environment and secrets
        ".env",
        ".env.*",
        "*.env",
        ".envrc",
        "secrets.*",
        "*secret*",
        "*password*",
        # Keys and certificates
        "*.pem",
        "*.key",
        "*.crt",
        "*.p12",
        "*.pfx",
        "id_rsa*",
        "id_ed25519*",
        "*.pub",
        # Config files that might contain secrets
        "credentials*",
        "*_credentials*",
        "auth.json",
        "config.local.*",
        # Package directories
        "node_modules/",
        "venv/",
        ".venv/",
        "__pycache__/",
        ".git/",
        ".svn/",
        ".hg/",
        # Build artifacts
        "dist/",
        "build/",
        "*.pyc",
        "*.pyo",
        "*.so",
        "*.dylib",
        "*.dll",
        # Logs and data
        "*.log",
        "*.sqlite",
        "*.db",
        # IDE and editor
        ".idea/",
        ".vscode/",
        "*.swp",
        "*.swo",
        # Coverage and test artifacts
        "coverage/",
        ".coverage",
        "htmlcov/",
        ".pytest_cache/",
        ".mypy_cache/",
        # Binary files
        "*.jpg",
        "*.jpeg",
        "*.png",
        "*.gif",
        "*.ico",
        "*.pdf",
        "*.zip",
        "*.tar",
        "*.gz",
        "*.woff",
        "*.woff2",
        "*.ttf",
        "*.eot",
    ]

    # Extension to language mapping
    EXTENSION_LANGUAGES = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".tsx": "typescript",
        ".jsx": "javascript",
        ".go": "go",
        ".rs": "rust",
        ".java": "java",
        ".kt": "kotlin",
        ".rb": "ruby",
        ".php": "php",
        ".c": "c",
        ".cpp": "cpp",
        ".h": "c",
        ".hpp": "cpp",
        ".cs": "csharp",
        ".swift": "swift",
        ".sh": "shell",
        ".bash": "shell",
        ".zsh": "shell",
        ".sql": "sql",
        ".md": "markdown",
        ".yaml": "yaml",
        ".yml": "yaml",
        ".json": "json",
        ".toml": "toml",
        ".xml": "xml",
        ".html": "html",
        ".css": "css",
        ".scss": "scss",
        ".less": "less",
    }

    # Languages whose files are scanned for TODO-style comments.
    _TODO_LANGUAGES = frozenset(
        {"python", "javascript", "typescript", "go", "rust", "java"}
    )

    # A TODO tag must follow a comment opener. "#" covers Python, while
    # "//", "/*" and a leading "*" (block-comment continuation line) cover
    # the C-family languages listed in _TODO_LANGUAGES.
    _TODO_RE = re.compile(
        r"(?:#|//|/\*+|^\s*\*)\s*(TODO|FIXME|XXX|HACK)\b[:\s]*(.*)",
        re.IGNORECASE,
    )

    # Words that, combined with "=", mark text as a likely credential.
    _SECRET_INDICATORS = (
        "password",
        "secret",
        "api_key",
        "apikey",
        "token",
        "credential",
        "auth",
        "bearer",
        "private_key",
    )

    # Long hex runs are treated as possible keys/tokens.
    _HEX_RUN_RE = re.compile(r"[a-fA-F0-9]{32,}")

    # README candidates, tried in order.
    _README_NAMES = ("README.md", "README.rst", "README.txt", "README")

    def __init__(
        self,
        max_files: int | None = None,
        max_bytes: int | None = None,
        max_todos: int | None = None,
        additional_exclusions: list[str] | None = None,
    ):
        """Initialize the context gatherer.

        Args:
            max_files: Override MAX_FILES_SCANNED (can only decrease).
            max_bytes: Override MAX_BYTES_TOTAL (can only decrease).
            max_todos: Override MAX_TODO_LINES (can only decrease).
            additional_exclusions: Additional glob patterns to exclude.

        ``None`` selects the class-level cap. Any other value is clamped to
        the range ``0..cap``, so an explicit ``0`` disables that part of the
        scan and a negative value cannot produce a negative limit.
        """
        self.max_files = self._clamp_cap(max_files, self.MAX_FILES_SCANNED)
        self.max_bytes = self._clamp_cap(max_bytes, self.MAX_BYTES_TOTAL)
        self.max_todos = self._clamp_cap(max_todos, self.MAX_TODO_LINES)

        # Copy so per-instance additions never mutate the class-level list.
        self.exclusions = list(self.EXCLUDED_PATTERNS)
        if additional_exclusions:
            self.exclusions.extend(additional_exclusions)

    @staticmethod
    def _clamp_cap(requested: int | None, hard_cap: int) -> int:
        """Return ``requested`` limited to ``0..hard_cap`` (``None`` -> cap)."""
        if requested is None:
            return hard_cap
        return max(0, min(requested, hard_cap))

    def gather(
        self,
        repo_root: Path | str,
        include_readme_excerpt: bool = False,
    ) -> RepoContext:
        """Gather repository context.

        Args:
            repo_root: Path to the repository root.
            include_readme_excerpt: Whether to include README excerpt (default OFF).

        Returns:
            RepoContext with metadata about the repository. An empty context
            with project type "unknown" is returned when ``repo_root`` does
            not exist.
        """
        repo_root = Path(repo_root).resolve()
        if not repo_root.exists():
            logger.warning(f"Repository root does not exist: {repo_root}")
            return RepoContext(
                file_tree=[],
                project_type="unknown",
                todo_count=0,
                todo_excerpts=[],
                readme_excerpt=None,
            )

        stats = GatherStats()
        file_tree: list[FileMetadata] = []
        todo_excerpts: list[str] = []
        readme_excerpt: str | None = None

        project_type = self._detect_project_type(repo_root)

        bytes_read = 0
        for file_path in self._walk_files(repo_root, stats):
            if stats.files_scanned >= self.max_files:
                break

            try:
                # Forward slashes on every platform: RepoContext splits
                # paths on "/" when summarising directories.
                rel_path = file_path.relative_to(repo_root).as_posix()
                file_size = file_path.stat().st_size
                language = self.EXTENSION_LANGUAGES.get(file_path.suffix.lower())

                line_count = self._count_lines(file_path)
                if line_count is None:
                    stats.skipped_binary += 1
                    continue

                file_tree.append(
                    FileMetadata(
                        path=rel_path,
                        line_count=line_count,
                        language=language,
                        size_bytes=file_size,
                    )
                )
                stats.files_scanned += 1

                if language not in self._TODO_LANGUAGES:
                    continue
                if file_size >= self.MAX_FILE_SIZE_FOR_SCAN:
                    # Indexed, but too big to scan for TODO comments.
                    stats.skipped_too_large += 1
                    continue

                todos_left = self.max_todos - len(todo_excerpts)
                bytes_left = self.max_bytes - bytes_read
                if todos_left > 0 and bytes_left > 0:
                    # Hand over the remaining budget so one file cannot
                    # push the total far past max_bytes.
                    new_todos, bytes_used = self._extract_todos(
                        file_path, rel_path, todos_left, max_bytes=bytes_left
                    )
                    todo_excerpts.extend(new_todos)
                    bytes_read += bytes_used
                    stats.todo_lines_found += len(new_todos)

            except OSError as e:
                logger.debug(f"Failed to read {file_path}: {e}")
                continue

        if include_readme_excerpt:
            readme_excerpt = self._get_readme_excerpt(repo_root)
            if readme_excerpt:
                bytes_read += len(readme_excerpt.encode("utf-8", errors="replace"))

        stats.bytes_read = bytes_read

        return RepoContext(
            file_tree=file_tree,
            project_type=project_type,
            todo_count=stats.todo_lines_found,
            todo_excerpts=todo_excerpts,
            readme_excerpt=readme_excerpt,
            stats=stats,
        )

    def _walk_files(self, repo_root: Path, stats: GatherStats):
        """Walk repository files, respecting exclusions.

        Yields the path of every regular, non-symlink, non-excluded file
        under ``repo_root`` and updates ``stats`` as it goes.
        """
        for item in repo_root.rglob("*"):
            # Test for symlinks before is_dir(): is_dir() follows links, so
            # a link to a directory would otherwise be dropped uncounted.
            if item.is_symlink():
                stats.skipped_symlinks += 1
                continue

            # Keep regular files only. This skips directories and also
            # FIFOs/sockets/devices, which could block when opened.
            if not item.is_file():
                continue

            rel_path = item.relative_to(repo_root).as_posix()
            matched_pattern = self._get_exclusion_match(rel_path, item.name)
            if matched_pattern:
                stats.skipped_excluded += 1
                # Track which patterns are matching (for debugging bad suggestions)
                stats.excluded_by_pattern[matched_pattern] = (
                    stats.excluded_by_pattern.get(matched_pattern, 0) + 1
                )
                continue

            # Track extension for filetype histogram
            ext = item.suffix.lower() or "(no extension)"
            stats.extensions_scanned[ext] = stats.extensions_scanned.get(ext, 0) + 1

            yield item

    def _get_exclusion_match(self, rel_path: str, filename: str) -> str | None:
        """Check if a path matches any exclusion pattern.

        Matching is case-insensitive on every platform and accepts both
        "/" and "\\" as path separators, so ".ENV" or "Secrets.yaml" are
        excluded just like their lower-case spellings.

        Args:
            rel_path: Path relative to the repository root.
            filename: Final component of the path.

        Returns:
            The matching pattern (as configured) if excluded, None otherwise.
        """
        path_lower = rel_path.replace("\\", "/").lower()
        name_lower = filename.lower()
        segments = path_lower.split("/")

        for pattern in self.exclusions:
            needle = pattern.lower()
            # Full relative path, path at any depth, or bare file name.
            if (
                fnmatch.fnmatchcase(path_lower, needle)
                or fnmatch.fnmatchcase(path_lower, f"**/{needle}")
                or fnmatch.fnmatchcase(name_lower, needle)
            ):
                return pattern
            # A trailing "/" marks a directory pattern: exclude when any
            # path component equals it or matches it as a glob.
            if needle.endswith("/"):
                dir_glob = needle[:-1]
                if dir_glob in segments or any(
                    fnmatch.fnmatchcase(segment, dir_glob) for segment in segments
                ):
                    return pattern
        return None

    def _is_excluded(self, rel_path: str, filename: str) -> bool:
        """Check if a path matches any exclusion pattern."""
        return self._get_exclusion_match(rel_path, filename) is not None

    def _count_lines(self, file_path: Path) -> int | None:
        """Count lines in a file without loading it all into memory.

        A final line without a trailing newline is counted, so "a\\nb" has
        two lines and an empty file has zero.

        Returns:
            The line count, or None if the file appears to be binary (NUL
            byte in the first 8KB) or cannot be read.
        """
        try:
            with open(file_path, "rb") as f:
                # Read first 8KB to check if binary
                sample = f.read(8192)
                if b"\x00" in sample:
                    return None

                line_count = sample.count(b"\n")
                last_byte = sample[-1:]

                while chunk := f.read(65536):
                    line_count += chunk.count(b"\n")
                    last_byte = chunk[-1:]
        except OSError as e:
            logger.debug(f"Failed to count lines in {file_path}: {e}")
            return None

        if last_byte and last_byte != b"\n":
            line_count += 1
        return line_count

    def _extract_todos(
        self,
        file_path: Path,
        rel_path: str,
        max_count: int,
        max_bytes: int | None = None,
    ) -> tuple[list[str], int]:
        """Extract TODO/FIXME/XXX/HACK comments from a file.

        Recognises "#" comments as well as "//", "/* ... */" and
        block-comment continuation lines that start with "*".

        Args:
            file_path: File to scan.
            rel_path: Path used as the prefix of each excerpt.
            max_count: Maximum number of excerpts to return.
            max_bytes: Optional byte budget for this file. Reading stops
                after the line that exceeds it. The per-file cap
                MAX_FILE_SIZE_FOR_SCAN always applies as well.

        Returns:
            (list of excerpts, bytes read).
        """
        todos: list[str] = []
        bytes_read = 0
        if max_count <= 0:
            return todos, bytes_read

        byte_limit = self.MAX_FILE_SIZE_FOR_SCAN
        if max_bytes is not None:
            byte_limit = min(byte_limit, max_bytes)

        try:
            with open(file_path, encoding="utf-8", errors="replace") as f:
                for line_num, line in enumerate(f, 1):
                    bytes_read += len(line.encode("utf-8", errors="replace"))

                    match = self._TODO_RE.search(line)
                    if match:
                        tag = match.group(1).upper()
                        # Drop the closer of a one-line block comment.
                        message = match.group(2).strip().removesuffix("*/").rstrip()
                        # Check the whole message before truncating, so a
                        # secret cut in half by the cap is still rejected.
                        if not self._looks_like_secret(message):
                            message = message[: self.MAX_EXCERPT_CHARS]
                            excerpt = f"{rel_path}:{line_num} [{tag}] {message}"
                            todos.append(excerpt[: self.MAX_EXCERPT_CHARS])

                    if len(todos) >= max_count:
                        break

                    # Cap bytes read per file
                    if bytes_read > byte_limit:
                        break

        except OSError as e:
            logger.debug(f"Failed to extract TODOs from {file_path}: {e}")

        return todos, bytes_read

    def _looks_like_secret(self, text: str) -> bool:
        """Check if text looks like it might contain a secret.

        Text is flagged when it contains "=" together with a
        credential-related word, or when it contains a run of 32 or more
        hexadecimal characters.
        """
        lowered = text.lower()
        if "=" in text and any(word in lowered for word in self._SECRET_INDICATORS):
            return True
        return self._HEX_RUN_RE.search(text) is not None

    def _detect_project_type(self, repo_root: Path) -> str:
        """Detect the project type from configuration files.

        Returns:
            The single ecosystem whose marker file exists in ``repo_root``,
            "mixed" when several ecosystems are present, or "unknown" when
            none are.
        """
        indicators = {
            "python": ["requirements.txt", "pyproject.toml", "setup.py", "Pipfile"],
            "node": ["package.json", "yarn.lock", "pnpm-lock.yaml"],
            "go": ["go.mod", "go.sum"],
            "rust": ["Cargo.toml"],
            "java": ["pom.xml", "build.gradle", "build.gradle.kts"],
            "ruby": ["Gemfile", "Rakefile"],
        }

        detected = [
            lang
            for lang, markers in indicators.items()
            if any((repo_root / marker).exists() for marker in markers)
        ]

        if not detected:
            return "unknown"
        if len(detected) == 1:
            return detected[0]
        return "mixed"

    def _get_readme_excerpt(self, repo_root: Path) -> str | None:
        """Get a capped excerpt from the README file.

        Candidates are tried in a fixed order. Symlinked READMEs are
        ignored so a link cannot pull content from outside the repository
        (or from an excluded file) into the prompt.

        Returns:
            At most MAX_README_CHARS characters, cut at a word boundary
            when one is near the limit and suffixed with "..." if the file
            was truncated, or None if no readable README exists.
        """
        for name in self._README_NAMES:
            readme_path = repo_root / name
            if readme_path.is_symlink() or not readme_path.is_file():
                continue

            try:
                with open(readme_path, encoding="utf-8", errors="replace") as f:
                    # Read slightly past the cap to learn whether the file
                    # is longer than the excerpt.
                    content = f.read(self.MAX_README_CHARS + 100)
            except OSError as e:
                logger.debug(f"Failed to read README {readme_path}: {e}")
                continue

            if len(content) > self.MAX_README_CHARS:
                # Truncate at word boundary
                content = content[: self.MAX_README_CHARS]
                last_space = content.rfind(" ")
                if last_space > self.MAX_README_CHARS - 50:
                    content = content[:last_space]
                content += "..."
            return content

        return None