codeboarding 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. agents/__init__.py +0 -0
  2. agents/abstraction_agent.py +150 -0
  3. agents/agent.py +467 -0
  4. agents/agent_responses.py +363 -0
  5. agents/cluster_methods_mixin.py +281 -0
  6. agents/constants.py +13 -0
  7. agents/dependency_discovery.py +159 -0
  8. agents/details_agent.py +174 -0
  9. agents/llm_config.py +309 -0
  10. agents/meta_agent.py +105 -0
  11. agents/planner_agent.py +105 -0
  12. agents/prompts/__init__.py +85 -0
  13. agents/prompts/abstract_prompt_factory.py +63 -0
  14. agents/prompts/claude_prompts.py +381 -0
  15. agents/prompts/deepseek_prompts.py +389 -0
  16. agents/prompts/gemini_flash_prompts.py +362 -0
  17. agents/prompts/glm_prompts.py +407 -0
  18. agents/prompts/gpt_prompts.py +470 -0
  19. agents/prompts/kimi_prompts.py +400 -0
  20. agents/prompts/prompt_factory.py +179 -0
  21. agents/tools/__init__.py +8 -0
  22. agents/tools/base.py +96 -0
  23. agents/tools/get_external_deps.py +47 -0
  24. agents/tools/get_method_invocations.py +47 -0
  25. agents/tools/read_cfg.py +60 -0
  26. agents/tools/read_docs.py +132 -0
  27. agents/tools/read_file.py +90 -0
  28. agents/tools/read_file_structure.py +156 -0
  29. agents/tools/read_git_diff.py +131 -0
  30. agents/tools/read_packages.py +60 -0
  31. agents/tools/read_source.py +105 -0
  32. agents/tools/read_structure.py +49 -0
  33. agents/tools/toolkit.py +119 -0
  34. agents/validation.py +383 -0
  35. caching/__init__.py +4 -0
  36. caching/cache.py +29 -0
  37. caching/meta_cache.py +227 -0
  38. codeboarding-0.9.0.dist-info/METADATA +223 -0
  39. codeboarding-0.9.0.dist-info/RECORD +126 -0
  40. codeboarding-0.9.0.dist-info/WHEEL +5 -0
  41. codeboarding-0.9.0.dist-info/entry_points.txt +3 -0
  42. codeboarding-0.9.0.dist-info/licenses/LICENSE +21 -0
  43. codeboarding-0.9.0.dist-info/top_level.txt +18 -0
  44. core/__init__.py +101 -0
  45. core/plugin_loader.py +46 -0
  46. core/protocols.py +27 -0
  47. core/registry.py +46 -0
  48. diagram_analysis/__init__.py +4 -0
  49. diagram_analysis/analysis_json.py +346 -0
  50. diagram_analysis/diagram_generator.py +486 -0
  51. diagram_analysis/file_coverage.py +212 -0
  52. diagram_analysis/incremental/__init__.py +63 -0
  53. diagram_analysis/incremental/component_checker.py +236 -0
  54. diagram_analysis/incremental/file_manager.py +217 -0
  55. diagram_analysis/incremental/impact_analyzer.py +238 -0
  56. diagram_analysis/incremental/io_utils.py +281 -0
  57. diagram_analysis/incremental/models.py +72 -0
  58. diagram_analysis/incremental/path_patching.py +164 -0
  59. diagram_analysis/incremental/reexpansion.py +166 -0
  60. diagram_analysis/incremental/scoped_analysis.py +227 -0
  61. diagram_analysis/incremental/updater.py +464 -0
  62. diagram_analysis/incremental/validation.py +48 -0
  63. diagram_analysis/manifest.py +152 -0
  64. diagram_analysis/version.py +6 -0
  65. duckdb_crud.py +125 -0
  66. github_action.py +172 -0
  67. health/__init__.py +3 -0
  68. health/checks/__init__.py +11 -0
  69. health/checks/circular_deps.py +48 -0
  70. health/checks/cohesion.py +93 -0
  71. health/checks/coupling.py +140 -0
  72. health/checks/function_size.py +85 -0
  73. health/checks/god_class.py +167 -0
  74. health/checks/inheritance.py +104 -0
  75. health/checks/instability.py +77 -0
  76. health/checks/unused_code_diagnostics.py +338 -0
  77. health/config.py +172 -0
  78. health/constants.py +19 -0
  79. health/models.py +186 -0
  80. health/runner.py +236 -0
  81. install.py +518 -0
  82. logging_config.py +105 -0
  83. main.py +529 -0
  84. monitoring/__init__.py +12 -0
  85. monitoring/callbacks.py +163 -0
  86. monitoring/context.py +158 -0
  87. monitoring/mixin.py +16 -0
  88. monitoring/paths.py +47 -0
  89. monitoring/stats.py +50 -0
  90. monitoring/writers.py +172 -0
  91. output_generators/__init__.py +0 -0
  92. output_generators/html.py +163 -0
  93. output_generators/html_template.py +382 -0
  94. output_generators/markdown.py +140 -0
  95. output_generators/mdx.py +171 -0
  96. output_generators/sphinx.py +175 -0
  97. repo_utils/__init__.py +277 -0
  98. repo_utils/change_detector.py +289 -0
  99. repo_utils/errors.py +6 -0
  100. repo_utils/git_diff.py +74 -0
  101. repo_utils/ignore.py +341 -0
  102. static_analyzer/__init__.py +335 -0
  103. static_analyzer/analysis_cache.py +699 -0
  104. static_analyzer/analysis_result.py +269 -0
  105. static_analyzer/cluster_change_analyzer.py +391 -0
  106. static_analyzer/cluster_helpers.py +79 -0
  107. static_analyzer/constants.py +166 -0
  108. static_analyzer/git_diff_analyzer.py +224 -0
  109. static_analyzer/graph.py +746 -0
  110. static_analyzer/incremental_orchestrator.py +671 -0
  111. static_analyzer/java_config_scanner.py +232 -0
  112. static_analyzer/java_utils.py +227 -0
  113. static_analyzer/lsp_client/__init__.py +12 -0
  114. static_analyzer/lsp_client/client.py +1642 -0
  115. static_analyzer/lsp_client/diagnostics.py +62 -0
  116. static_analyzer/lsp_client/java_client.py +517 -0
  117. static_analyzer/lsp_client/language_settings.py +97 -0
  118. static_analyzer/lsp_client/typescript_client.py +235 -0
  119. static_analyzer/programming_language.py +152 -0
  120. static_analyzer/reference_resolve_mixin.py +166 -0
  121. static_analyzer/scanner.py +95 -0
  122. static_analyzer/typescript_config_scanner.py +54 -0
  123. tool_registry.py +433 -0
  124. user_config.py +134 -0
  125. utils.py +56 -0
  126. vscode_constants.py +124 -0
main.py ADDED
@@ -0,0 +1,529 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ import requests
9
+ from tqdm import tqdm
10
+
11
+ from agents.llm_config import configure_models, validate_api_key_provided
12
+ from user_config import ensure_config_template, load_user_config
13
+ from core import get_registries, load_plugins
14
+ from diagram_analysis import DiagramGenerator
15
+ from diagram_analysis.analysis_json import build_id_to_name_map, parse_unified_analysis
16
+ from diagram_analysis.incremental.io_utils import load_full_analysis, save_sub_analysis
17
+ from logging_config import setup_logging
18
+ from monitoring import monitor_execution
19
+ from monitoring.paths import generate_run_id, get_monitoring_run_dir
20
+ from output_generators.markdown import generate_markdown_file
21
+ from repo_utils import (
22
+ clone_repository,
23
+ get_branch,
24
+ get_repo_name,
25
+ store_token,
26
+ upload_onboarding_materials,
27
+ )
28
+ from repo_utils.ignore import initialize_codeboardingignore
29
+ from utils import (
30
+ create_temp_repo_folder,
31
+ monitoring_enabled,
32
+ remove_temp_repo_folder,
33
+ sanitize,
34
+ )
35
+ from vscode_constants import update_config
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
def onboarding_materials_exist(project_name: str) -> bool:
    """Check whether onboarding docs for *project_name* were already published.

    Performs a best-effort HTTP probe against the public GeneratedOnBoardings
    repository. Network failures are treated as a cache miss rather than a
    fatal error, because this check is only an optimization to skip
    regeneration.
    """
    generated_repo_url = f"https://github.com/CodeBoarding/GeneratedOnBoardings/tree/main/{project_name}"
    try:
        # A finite timeout keeps a slow/unreachable network from hanging the CLI.
        response = requests.get(generated_repo_url, timeout=10)
    except requests.RequestException as exc:
        logger.warning(f"Could not check for existing onboarding materials: {exc}")
        return False
    if response.status_code == 200:
        logger.info(f"Repository has already been generated, please check {generated_repo_url}")
        return True
    return False
47
+
48
+
49
def generate_analysis(
    repo_name: str,
    repo_path: Path,
    output_dir: Path,
    depth_level: int = 1,
    run_id: str | None = None,
    monitoring_enabled: bool = False,
    force_full: bool = False,
) -> list[Path]:
    """Run the diagram generator for a repository.

    Returns the generated analysis file paths as `Path` objects.
    """
    diagram_generator = DiagramGenerator(
        repo_location=repo_path,
        temp_folder=output_dir,
        repo_name=repo_name,
        output_dir=output_dir,
        depth_level=depth_level,
        run_id=run_id,
        monitoring_enabled=monitoring_enabled,
    )
    # force_full is an attribute toggle rather than a constructor argument.
    diagram_generator.force_full_analysis = force_full
    return [Path(generated) for generated in diagram_generator.generate_analysis()]
70
+
71
+
72
def generate_markdown_docs(
    repo_name: str,
    repo_path: Path,
    repo_url: str,
    analysis_files: list[Path],
    output_dir: Path,
    demo_mode: bool = False,
):
    """Render markdown documentation from a unified analysis.json.

    Args:
        repo_name: Display name of the repository.
        repo_path: Local checkout, used only to resolve the current branch.
        repo_url: Base URL used to build source links in the markdown.
        analysis_files: Paths produced by the generator; only the first entry
            (the unified analysis.json) is read.
        output_dir: Directory the markdown files are written to.
        demo_mode: Forwarded to the markdown generator.
    """
    # Guard against an empty generator result instead of raising IndexError.
    if not analysis_files:
        logger.error("No analysis files available; skipping markdown generation")
        return

    target_branch = get_branch(repo_path)
    repo_ref = f"{repo_url}/blob/{target_branch}/"

    # Load the single unified analysis.json
    with open(analysis_files[0], "r") as f:
        data = json.load(f)

    root_analysis, sub_analyses = parse_unified_analysis(data)

    # Root page: every component that has a sub-analysis is rendered expanded.
    root_expanded = set(sub_analyses.keys())
    generate_markdown_file(
        "on_boarding",
        root_analysis,
        repo_name,
        repo_ref=repo_ref,
        expanded_components=root_expanded,
        temp_dir=output_dir,
        demo=demo_mode,
    )

    # Build id-to-name mapping across all levels for file naming
    id_to_name = build_id_to_name_map(root_analysis, sub_analyses)

    # One page per sub-analysis, named after the component it expands.
    for comp_id, sub_analysis in sub_analyses.items():
        sub_expanded = {c.component_id for c in sub_analysis.components if c.component_id in sub_analyses}
        comp_name = id_to_name.get(comp_id, comp_id)
        generate_markdown_file(
            sanitize(comp_name),
            sub_analysis,
            repo_name,
            repo_ref=repo_ref,
            expanded_components=sub_expanded,
            temp_dir=output_dir,
            demo=demo_mode,
        )
119
+
120
+
121
def _find_component_by_id(root_analysis, sub_analyses, component_id: str):
    """Return the component matching *component_id*, or None.

    Searches the root analysis first, then every nested sub-analysis,
    mirroring the precedence of the original lookup.
    """
    for component in root_analysis.components:
        if component.component_id == component_id:
            return component
    for sub_analysis in sub_analyses.values():
        for component in sub_analysis.components:
            if component.component_id == component_id:
                return component
    return None


def partial_update(
    repo_path: Path,
    output_dir: Path,
    project_name: str,
    component_id: str,
    depth_level: int = 1,
):
    """
    Update a specific component in an existing analysis.

    Loads the unified analysis.json from *output_dir*, locates the component
    with *component_id* at any nesting level, regenerates its sub-analysis,
    and saves it back. Logs an error and returns early when the analysis file
    or the component cannot be found.
    """
    generator = DiagramGenerator(
        repo_location=repo_path,
        temp_folder=output_dir,
        repo_name=project_name,
        output_dir=output_dir,
        depth_level=depth_level,
    )
    generator.pre_analysis()

    # Load the full unified analysis (root + all sub-analyses)
    full_analysis = load_full_analysis(output_dir)
    if full_analysis is None:
        logger.error(f"No analysis.json found in '{output_dir}'. Please ensure the file exists.")
        return

    root_analysis, sub_analyses = full_analysis

    component_to_analyze = _find_component_by_id(root_analysis, sub_analyses, component_id)
    if component_to_analyze is None:
        logger.error(f"Component with ID '{component_id}' not found in analysis")
        return

    logger.info(f"Updating analysis for component: {component_to_analyze.name}")

    # Only the regenerated sub-analysis is needed here; the returned id and
    # newly discovered components are handled inside the generator.
    _, sub_analysis, _ = generator.process_component(component_to_analyze)

    if sub_analysis:
        save_sub_analysis(sub_analysis, output_dir, component_id)
        logger.info(f"Updated component '{component_id}' in analysis.json")
    else:
        logger.error(f"Failed to generate sub-analysis for component '{component_id}'")
176
+
177
+
178
def generate_docs_remote(
    repo_url: str,
    temp_repo_folder: Path,
    local_dev: bool = False,
    run_id: str | None = None,
    monitoring_enabled: bool = False,
):
    """
    Clone a git repo and generate documentation (backward compatibility wrapper used by local_app).
    """
    depth = int(os.getenv("DIAGRAM_DEPTH_LEVEL", "1"))
    process_remote_repository(
        repo_url=repo_url,
        output_dir=temp_repo_folder,
        depth_level=depth,
        upload=not local_dev,  # Only upload if not in local dev mode
        cache_check=True,
        run_id=run_id,
        monitoring_enabled=monitoring_enabled,
    )
197
+
198
+
199
def process_remote_repository(
    repo_url: str,
    output_dir: Path | None = None,
    depth_level: int = 1,
    upload: bool = False,
    cache_check: bool = True,
    run_id: str | None = None,
    monitoring_enabled: bool = False,
):
    """
    Process a remote repository by cloning and generating documentation.
    """
    repo_root = Path("repos")
    repo_name = get_repo_name(repo_url)

    # Skip all work when the docs were already published upstream.
    if cache_check and onboarding_materials_exist(repo_name):
        logger.info(f"Cache hit for '{repo_name}', skipping documentation generation.")
        return

    # The clone may normalize the name, so trust its return value.
    repo_name = clone_repository(repo_url, repo_root)
    repo_path = repo_root / repo_name

    temp_folder = create_temp_repo_folder()
    try:
        analysis_files = generate_analysis(
            repo_name=repo_name,
            repo_path=repo_path,
            output_dir=temp_folder,
            depth_level=depth_level,
            run_id=run_id,
            monitoring_enabled=monitoring_enabled,
        )

        # Generate markdown documentation for remote repo
        generate_markdown_docs(
            repo_name=repo_name,
            repo_path=repo_path,
            repo_url=repo_url,
            analysis_files=analysis_files,
            output_dir=temp_folder,
            demo_mode=True,
        )

        # Copy files to output directory if specified
        if output_dir:
            copy_files(temp_folder, output_dir)

        # Upload if requested
        if upload:
            upload_onboarding_materials(repo_name, temp_folder, "results")
    finally:
        # Always remove the scratch directory, even on failure.
        remove_temp_repo_folder(str(temp_folder))
255
+
256
+
257
def process_local_repository(
    repo_path: Path,
    output_dir: Path,
    project_name: str,
    depth_level: int = 1,
    component_id: str | None = None,
    monitoring_enabled: bool = False,
    incremental: bool = False,
    force_full: bool = False,
):
    """Generate (or partially/incrementally update) analysis for a local repo."""
    # A component id means the caller wants a partial update and nothing else.
    if component_id:
        partial_update(
            repo_path=repo_path,
            output_dir=output_dir,
            project_name=project_name,
            component_id=component_id,
            depth_level=depth_level,
        )
        return

    # Smart incremental path: attempt it first, fall through to full on failure.
    if incremental and not force_full:
        generator = DiagramGenerator(
            repo_location=repo_path,
            temp_folder=output_dir,
            repo_name=project_name,
            output_dir=output_dir,
            depth_level=depth_level,
            monitoring_enabled=monitoring_enabled,
        )
        generator.force_full_analysis = force_full
        incremental_result = generator.generate_analysis_smart()
        if incremental_result:
            logger.info(f"Incremental analysis completed: {len(incremental_result)} files")
            return

    # Full analysis (local repo - no markdown generation)
    generate_analysis(
        repo_name=project_name,
        repo_path=repo_path,
        output_dir=output_dir,
        depth_level=depth_level,
        monitoring_enabled=monitoring_enabled,
        force_full=force_full,
    )
305
+
306
+
307
def copy_files(temp_folder: Path, output_dir: Path):
    """Copy all markdown and JSON files from temp folder to output directory.

    Logs a warning and returns when there is nothing to copy. Creates
    *output_dir* if it does not yet exist so a missing destination no longer
    raises FileNotFoundError.
    """
    all_files = [*temp_folder.glob("*.md"), *temp_folder.glob("*.json")]

    if not all_files:
        logger.warning(f"No markdown or JSON files found in {temp_folder}")
        return

    # Defensive: callers normally create the destination, but don't rely on it.
    output_dir.mkdir(parents=True, exist_ok=True)

    for file in all_files:
        dest_file = output_dir / file.name
        shutil.copy2(file, dest_file)
        logger.info(f"Copied {file.name} to {dest_file}")
324
+
325
+
326
def validate_arguments(args, parser, is_local: bool):
    """Enforce mutually exclusive remote/local modes and partial-update rules.

    Calls ``parser.error`` (which exits) when the combination is invalid.
    """
    remote_given = bool(getattr(args, "repositories", None))
    local_given = args.local is not None

    # Exactly one of the two run modes must be selected.
    if remote_given == local_given:
        parser.error("Provide either one or more remote repositories or --local, but not both.")

    # Partial updates make sense only against a local checkout.
    if args.partial_component_id and not is_local:
        parser.error("--partial-component-id only works with local repositories")
337
+
338
+
339
def define_cli_arguments(parser: argparse.ArgumentParser):
    """
    Adds all command-line arguments and groups to the ArgumentParser.
    """
    # Positional: zero or more remote repository URLs.
    parser.add_argument(
        "repositories",
        nargs="*",
        help="One or more Git repository URLs to generate documentation for",
    )
    parser.add_argument("--local", type=Path, help="Path to a local repository")

    # Partial update options
    parser.add_argument(
        "--partial-component-id",
        type=str,
        help="Component ID to update (for partial updates only)",
    )

    # Binary/tool configuration
    parser.add_argument(
        "--binary-location",
        type=Path,
        help="Path to the binary directory for language servers (overrides ~/.codeboarding/servers/)",
    )

    # Analysis options
    parser.add_argument(
        "--depth-level",
        type=int,
        default=1,
        help="Depth level for diagram generation (default: 1)",
    )
    parser.add_argument(
        "--upload",
        action="store_true",
        help="Upload onboarding materials to GeneratedOnBoardings repo (remote repos only)",
    )
    parser.add_argument("--enable-monitoring", action="store_true", help="Enable monitoring")

    # Incremental update options
    parser.add_argument(
        "--full",
        action="store_true",
        help="Force full reanalysis, skipping incremental update detection",
    )
    parser.add_argument(
        "--incremental",
        action="store_true",
        help="Use smart incremental updates (tries incremental first, falls back to full)",
    )
389
+
390
+
391
def main():
    """Main entry point for the unified CodeBoarding CLI."""
    parser = argparse.ArgumentParser(
        description="Generate onboarding documentation for Git repositories (local or remote)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Local repository (output written to <repo>/.codeboarding/)
codeboarding --local /path/to/repo

# Local repository with custom depth level
codeboarding --local /path/to/repo --depth-level 2

# Remote repository (cloned to cwd/<repo_name>/, output to cwd/<repo_name>/.codeboarding/)
codeboarding https://github.com/user/repo

# Partial update (update single component by ID)
codeboarding --local /path/to/repo --partial-component-id "a3f2b1c4d5e6f789"

# Incremental update (smart - detects changes automatically)
codeboarding --local /path/to/repo --incremental

# Force full reanalysis (skip incremental detection)
codeboarding --local /path/to/repo --full

# Use custom binary location (e.g. VS Code extension)
codeboarding --local /path/to/repo --binary-location /path/to/binaries
""",
    )
    define_cli_arguments(parser)
    args = parser.parse_args()

    # Validate interdependent arguments
    is_local = args.local is not None
    validate_arguments(args, parser, is_local)

    # Local runs write next to the repo; remote runs derive a directory per repo later.
    output_dir = args.local.resolve() / ".codeboarding" if is_local else None

    # Setup logging
    setup_logging(log_dir=output_dir)
    logger.info("Starting CodeBoarding documentation generation...")

    # Ensure ~/.codeboarding/config.toml exists (writes template on first run)
    ensure_config_template()

    # Load ~/.codeboarding/config.toml: inject provider keys into env and store model overrides
    user_cfg = load_user_config()
    user_cfg.apply_to_env()
    configure_models(agent_model=user_cfg.llm.agent_model, parsing_model=user_cfg.llm.parsing_model)

    # Fail fast if no LLM provider key is configured, before any heavy work.
    try:
        validate_api_key_provided()
    except ValueError as e:
        logger.error(str(e))
        raise SystemExit(1)

    load_plugins(get_registries())

    if args.binary_location:
        update_config(args.binary_location)
    else:
        from tool_registry import ensure_tools, needs_install

        if needs_install():
            logger.info("First run: downloading language server binaries to ~/.codeboarding/servers/ ...")
            ensure_tools(auto_install_npm=True)

    should_monitor = args.enable_monitoring or monitoring_enabled()

    if is_local:
        output_dir.mkdir(parents=True, exist_ok=True)
        initialize_codeboardingignore(output_dir)

        # Derive project name from the repo directory name
        project_name = args.local.resolve().name

        process_local_repository(
            repo_path=args.local,
            output_dir=output_dir,
            project_name=project_name,
            depth_level=args.depth_level,
            component_id=args.partial_component_id,
            monitoring_enabled=should_monitor,
            incremental=args.incremental,
            force_full=args.full,
        )
        logger.info(f"Documentation generated successfully in {output_dir}")
        return

    if not args.repositories:
        logger.error("No repositories specified")
        return

    if args.upload:
        # Best-effort: a missing token only disables upload, it is not fatal.
        try:
            store_token()
        except Exception as e:
            logger.warning(f"Could not store GitHub token: {e}")

    for repo in tqdm(args.repositories, desc="Generating docs for repos"):
        repo_name = get_repo_name(repo)
        # Clone to cwd/<repo_name>/, output to cwd/<repo_name>/.codeboarding/
        repo_output_dir = Path.cwd() / repo_name / ".codeboarding"
        repo_output_dir.mkdir(parents=True, exist_ok=True)
        initialize_codeboardingignore(repo_output_dir)

        run_id = generate_run_id(repo_name)
        monitoring_dir = get_monitoring_run_dir(run_id, create=should_monitor)

        with monitor_execution(
            run_id=run_id,
            output_dir=str(monitoring_dir),
            enabled=should_monitor,
        ) as mon:
            mon.step(f"processing_{repo_name}")

            # One failed repo must not abort the whole batch.
            try:
                process_remote_repository(
                    repo_url=repo,
                    output_dir=repo_output_dir,
                    depth_level=args.depth_level,
                    upload=args.upload,
                    run_id=run_id,
                    monitoring_enabled=should_monitor,
                )
            except Exception as e:
                logger.error(f"Failed to process repository {repo}: {e}")
                continue

    logger.info("All repositories processed successfully!")


if __name__ == "__main__":
    main()
monitoring/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """
2
+ Monitoring package for tracking LLM usage, tool calls, and static analysis metrics.
3
+
4
+ Usage:
5
+ from monitoring import RunStats, MonitoringCallback, StreamingStatsWriter
6
+ from monitoring import monitor_execution, trace, current_step
7
+ """
8
+
9
+ from .stats import RunStats
10
+ from .callbacks import MonitoringCallback
11
+ from .writers import StreamingStatsWriter
12
+ from .context import monitor_execution, trace, current_step
@@ -0,0 +1,163 @@
1
+ import json
2
+ import logging
3
+ import time
4
+ from typing import Any, Mapping, MutableMapping, cast
5
+ from uuid import UUID
6
+
7
+ from langchain_core.callbacks import BaseCallbackHandler
8
+ from langchain_core.outputs import LLMResult
9
+
10
+ from monitoring.stats import RunStats, current_stats
11
+ from monitoring.context import current_step
12
+
13
+ logger = logging.getLogger("monitoring")
14
+
15
+
16
class MonitoringCallback(BaseCallbackHandler):
    """
    Captures LLM events, tags them with the current step, and updates stats.
    """

    def __init__(self, stats_container: RunStats | None = None, log_results: bool = True):
        # Per-call bookkeeping keyed by the stringified LangChain run_id.
        self._tool_start_times: dict[str, float] = {}  # run_id -> start_time
        self._tool_names: dict[str, str] = {}  # run_id -> tool_name
        self._stats_container = stats_container
        self.log_results = log_results

    @property
    def stats(self) -> RunStats:
        # An explicitly supplied container wins; otherwise use the
        # context-local stats for the current run.
        if self._stats_container:
            return self._stats_container
        return current_stats.get()

    @property
    def model_name(self) -> str | None:
        return self.stats.model_name

    @model_name.setter
    def model_name(self, value: str | None) -> None:
        with self.stats._lock:
            self.stats.model_name = value

    def on_llm_end(self, response: LLMResult, **_kwargs: Any) -> None:
        step_name = current_step.get()

        usage = self._extract_usage(response)
        if not usage:
            return

        # Fold the token counts into the shared stats under its lock.
        stats = self.stats
        with stats._lock:
            stats.total_tokens += usage.get("total_tokens", 0)
            stats.input_tokens += usage.get("input_tokens", 0)
            stats.output_tokens += usage.get("output_tokens", 0)

        if self.log_results:
            model = self.model_name or "unknown"
            # IMPORTANT: Do not change this log line format. Any change must be approved by IVAN.
            logger.info(f"Token Usage: step={step_name} model={model} usage={json.dumps(usage)}")

    def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: Any) -> None:
        raw_run_id = kwargs.get("run_id")
        run_id: str | None = str(raw_run_id) if raw_run_id else None
        # Prefer the explicit name, then the id, then the last namespace part.
        tool_name = (
            serialized.get("name")
            or serialized.get("id")
            or serialized.get("lc_namespace", ["tool"])[-1]
            or "unknown_tool"
        )
        with self.stats._lock:
            self.stats.tool_counts[tool_name] += 1

        if run_id:
            self._tool_start_times[run_id] = time.time()
            self._tool_names[run_id] = tool_name

    def on_tool_end(self, output: Any, **kwargs: Any) -> None:
        raw_run_id = kwargs.get("run_id")
        run_id: str | None = str(raw_run_id) if raw_run_id else None
        if not run_id or run_id not in self._tool_start_times:
            return
        started = self._tool_start_times.pop(run_id)
        tool_name = self._tool_names.pop(run_id, "unknown_tool")
        elapsed_ms = int((time.time() - started) * 1000)
        with self.stats._lock:
            self.stats.tool_latency_ms[tool_name].append(elapsed_ms)

    def on_tool_error(
        self, error: BaseException, *, run_id: UUID, parent_run_id: UUID | None = None, **kwargs: Any
    ) -> Any:
        key = str(run_id)
        tool_name = self._tool_names.get(key, "unknown_tool")
        with self.stats._lock:
            self.stats.tool_errors[tool_name] += 1

        # Drop any in-flight timing for the failed call.
        if key in self._tool_start_times:
            self._tool_start_times.pop(key, None)
            self._tool_names.pop(key, None)

    def _extract_usage(self, response: LLMResult) -> dict[str, int]:
        """Pull token usage out of an LLMResult, normalizing provider key styles.

        Returns a dict with input/output/total token counts, or {} when the
        response carries no usage information at all.
        """

        def _as_int(value: Any) -> int:
            try:
                return int(value)
            except (TypeError, ValueError):
                return 0

        def _normalize(mapping: Mapping[str, Any]) -> dict[str, int]:
            # Accept both prompt/completion and input/output key styles.
            prompt_i = _as_int(mapping.get("prompt_tokens", mapping.get("input_tokens", 0)))
            completion_i = _as_int(mapping.get("completion_tokens", mapping.get("output_tokens", 0)))
            total_raw = mapping.get("total_tokens", mapping.get("total_token_count", None))
            total_i = prompt_i + completion_i if total_raw is None else _as_int(total_raw)
            return {
                "input_tokens": prompt_i,
                "output_tokens": completion_i,
                "total_tokens": total_i,
            }

        found: MutableMapping[str, Any] = {}

        # 1) Provider-level usage reported on llm_output ("token_usage" wins).
        llm_output = response.llm_output or {}
        for key in ("token_usage", "usage"):
            if key in llm_output:
                found = dict(cast(Mapping[str, Any], llm_output.get(key) or {}))
                break

        # 2) Fall back to the first generation's message metadata.
        if not found and response.generations:
            first_gen = response.generations[0][0]
            message = getattr(first_gen, "message", None) or getattr(first_gen, "text", None)
            meta: Mapping[str, Any] = getattr(message, "response_metadata", {}) or {}
            usage_meta: Mapping[str, Any] = getattr(message, "usage_metadata", {}) or {}

            if "token_usage" in meta:
                found = dict(cast(Mapping[str, Any], meta.get("token_usage") or {}))
            elif "usage" in meta:
                found = dict(cast(Mapping[str, Any], meta.get("usage") or {}))
            elif usage_meta:
                found = dict(usage_meta)

        return _normalize(found) if found else {}