gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116):
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,444 @@
1
+ """Configuration schema definitions and defaults for GitFlow Analytics."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Optional
6
+
7
+ if TYPE_CHECKING:
8
+ from ..qualitative.models.schemas import QualitativeConfig
9
+
10
+
11
@dataclass
class RepositoryConfig:
    """Configuration for a single repository.

    On construction, ``path`` is normalized to an absolute path and a
    ``project_key`` is derived from ``name`` when none was supplied.
    """

    name: str
    path: Path
    github_repo: Optional[str] = None
    project_key: Optional[str] = None
    branch: Optional[str] = None

    def __post_init__(self) -> None:
        # Normalize the repository path: expand "~" and make it absolute.
        resolved = Path(self.path).expanduser().resolve()
        self.path = resolved
        # Derive a default project key from the name, e.g. "my-repo" -> "MY_REPO".
        self.project_key = self.project_key or self.name.upper().replace("-", "_")
25
+
26
+
27
@dataclass
class GitHubConfig:
    """GitHub API configuration."""

    token: Optional[str] = None
    owner: Optional[str] = None
    organization: Optional[str] = None
    base_url: str = "https://api.github.com"
    max_retries: int = 3
    backoff_factor: int = 2

    def get_repo_full_name(self, repo_name: str) -> str:
        """Return the ``owner/name`` form of *repo_name*.

        A name that already contains a slash is assumed to be fully
        qualified and is returned unchanged; otherwise the configured
        ``owner`` is prepended.

        Raises:
            ValueError: if the name is bare and no ``owner`` is configured.
        """
        if "/" in repo_name:
            return repo_name
        if not self.owner:
            raise ValueError(f"Repository {repo_name} needs owner specified")
        return f"{self.owner}/{repo_name}"
45
+
46
+
47
@dataclass
class MLCategorization:
    """ML-based commit categorization configuration."""

    # Master switch for ML-based categorization.
    enabled: bool = True
    # Minimum prediction confidence required to accept an ML category.
    min_confidence: float = 0.6
    # Relative weight of semantic (message-text) features.
    semantic_weight: float = 0.7
    # Relative weight of file-path pattern features.
    # NOTE(review): presumably intended to sum with semantic_weight to 1.0 — confirm in consumer code.
    file_pattern_weight: float = 0.3
    hybrid_threshold: float = 0.5  # Confidence threshold for using ML vs rule-based
    # How long cached categorization results remain valid.
    cache_duration_days: int = 30
    # Number of commits processed per batch.
    batch_size: int = 100
    # Whether to cache categorization results at all.
    enable_caching: bool = True
    spacy_model: str = "en_core_web_sm"  # Preferred spaCy model
60
+
61
+
62
@dataclass
class LLMClassificationConfig:
    """LLM-based commit classification configuration.

    This configuration enables Large Language Model-based commit classification
    via OpenRouter API for more accurate and context-aware categorization.
    """

    # Enable/disable LLM classification
    enabled: bool = False  # Disabled by default to avoid unexpected API costs

    # OpenRouter API configuration
    api_key: Optional[str] = None  # Set via environment variable or config
    api_base_url: str = "https://openrouter.ai/api/v1"
    model: str = "mistralai/mistral-7b-instruct"  # Fast, affordable model

    # Alternative models for different use cases:
    # - "meta-llama/llama-3-8b-instruct" (Higher accuracy, slightly more expensive)
    # - "openai/gpt-3.5-turbo" (Good balance, more expensive)

    # Classification parameters
    confidence_threshold: float = 0.7  # Minimum confidence for LLM predictions
    max_tokens: int = 50  # Keep responses short for cost optimization
    temperature: float = 0.1  # Low temperature for consistent results
    timeout_seconds: float = 30.0  # API request timeout

    # Caching configuration (aggressive caching for cost optimization)
    cache_duration_days: int = 90  # Long cache duration
    enable_caching: bool = True

    # Cost and rate limiting
    max_daily_requests: int = 1000  # Daily API request limit

    # Domain-specific terms for better classification accuracy.
    # Keys are domain labels; values are keyword lists fed to the classifier.
    domain_terms: dict[str, list[str]] = field(
        default_factory=lambda: {
            "media": [
                "video",
                "audio",
                "streaming",
                "player",
                "media",
                "content",
                "broadcast",
                "live",
                "recording",
                "episode",
                "program",
                "tv",
                "radio",
                "podcast",
                "channel",
                "playlist",
            ],
            "localization": [
                "translation",
                "i18n",
                "l10n",
                "locale",
                "language",
                "spanish",
                "french",
                "german",
                "italian",
                "portuguese",
                "multilingual",
                "translate",
                "localize",
                "regional",
            ],
            "integration": [
                "api",
                "webhook",
                "third-party",
                "external",
                "service",
                "integration",
                "sync",
                "import",
                "export",
                "connector",
                "oauth",
                "auth",
                "authentication",
                "sso",
            ],
            "content": [
                "copy",
                "text",
                "wording",
                "messaging",
                "editorial",
                "article",
                "blog",
                "news",
                "story",
                "caption",
                "title",
                "headline",
                "description",
                "summary",
                "metadata",
            ],
        }
    )

    # Fallback behavior when LLM is unavailable
    fallback_to_rules: bool = True  # Fall back to rule-based classification
    fallback_to_ml: bool = True  # Fall back to existing ML classification
171
+
172
+
173
@dataclass
class CommitClassificationConfig:
    """Configuration for commit classification system.

    This configuration controls the Random Forest-based commit classification
    system that analyzes commits to categorize them into types like feature,
    bugfix, refactor, docs, test, etc.
    """

    # Master switch for the classification system.
    enabled: bool = True
    confidence_threshold: float = 0.5  # Minimum confidence for reliable predictions
    batch_size: int = 100  # Commits processed per batch
    auto_retrain: bool = True  # Automatically check if model needs retraining
    retrain_threshold_days: int = 30  # Days after which to suggest retraining

    # Model hyperparameters (passed to the random-forest model).
    model: dict[str, Any] = field(
        default_factory=lambda: {
            "n_estimators": 100,  # Number of trees in random forest
            "max_depth": 20,  # Maximum depth of trees
            "min_samples_split": 5,  # Minimum samples to split a node
            "min_samples_leaf": 2,  # Minimum samples at leaf node
            "random_state": 42,  # For reproducible results
            "n_jobs": -1,  # Use all available CPU cores
        }
    )

    # Feature extraction settings
    feature_extraction: dict[str, Any] = field(
        default_factory=lambda: {
            "enable_temporal_features": True,
            "enable_author_features": True,
            "enable_file_analysis": True,
            "keyword_categories": [
                "feature",
                "bugfix",
                "refactor",
                "docs",
                "test",
                "config",
                "security",
                "performance",
                "ui",
                "api",
                "database",
                "deployment",
            ],
        }
    )

    # Training settings
    training: dict[str, Any] = field(
        default_factory=lambda: {
            "validation_split": 0.2,  # Fraction for validation
            "min_training_samples": 20,  # Minimum samples needed for training
            "cross_validation_folds": 5,  # K-fold cross validation
            "class_weight": "balanced",  # Handle class imbalance
        }
    )

    # Supported classification categories: label -> human-readable description.
    categories: dict[str, str] = field(
        default_factory=lambda: {
            "feature": "New functionality or capabilities",
            "bugfix": "Bug fixes and error corrections",
            "refactor": "Code restructuring and optimization",
            "docs": "Documentation changes and updates",
            "test": "Testing-related changes",
            "config": "Configuration and settings changes",
            "chore": "Maintenance and housekeeping tasks",
            "security": "Security-related changes",
            "hotfix": "Emergency production fixes",
            "style": "Code style and formatting changes",
            "build": "Build system and dependency changes",
            "ci": "Continuous integration changes",
            "revert": "Reverts of previous changes",
            "merge": "Merge commits and integration",
            "wip": "Work in progress commits",
        }
    )
253
+
254
+
255
@dataclass
class BranchAnalysisConfig:
    """Configuration for branch analysis optimization.

    This configuration controls how branches are analyzed to prevent performance
    issues on large organizations with many repositories and branches.
    """

    # Branch analysis strategy
    strategy: str = "smart"  # Options: "all", "smart", "main_only"

    # Smart analysis parameters
    max_branches_per_repo: int = 50  # Maximum branches to analyze per repository
    active_days_threshold: int = 90  # Days to consider a branch "active"
    include_main_branches: bool = True  # Always include main/master branches

    # Branch name regex patterns to always include, regardless of activity.
    always_include_patterns: list[str] = field(
        default_factory=lambda: [
            r"^(main|master|develop|dev)$",  # Main development branches
            r"^release/.*",  # Release branches
            r"^hotfix/.*",  # Hotfix branches
        ]
    )

    # Branch name regex patterns to always exclude (bot and scratch branches).
    always_exclude_patterns: list[str] = field(
        default_factory=lambda: [
            r"^dependabot/.*",  # Dependabot branches
            r"^renovate/.*",  # Renovate branches
            r".*-backup$",  # Backup branches
            r".*-temp$",  # Temporary branches
        ]
    )

    # Performance limits
    enable_progress_logging: bool = True  # Log branch analysis progress
    branch_commit_limit: int = 1000  # Max commits to analyze per branch
292
+
293
+
294
@dataclass
class AnalysisConfig:
    """Analysis-specific configuration."""

    # Regex patterns used to extract story points from commit messages.
    story_point_patterns: list[str] = field(default_factory=list)
    # Author names/emails excluded from analysis (e.g. bots).
    exclude_authors: list[str] = field(default_factory=list)
    # Commits whose message matches any of these patterns are skipped.
    exclude_message_patterns: list[str] = field(default_factory=list)
    # File paths excluded from metrics.
    exclude_paths: list[str] = field(default_factory=list)
    # Threshold for fuzzy identity matching (0.0-1.0).
    similarity_threshold: float = 0.85
    # Explicit identity merge rules supplied by the user.
    manual_identity_mappings: list[dict[str, Any]] = field(default_factory=list)
    # Platform assumed for bare ticket references when none can be inferred.
    default_ticket_platform: Optional[str] = None
    # Maps logical branch groups to lists of branch-name patterns.
    branch_mapping_rules: dict[str, list[str]] = field(default_factory=dict)
    # Restrict ticket extraction to these platforms (None = all).
    ticket_platforms: Optional[list[str]] = None
    auto_identity_analysis: bool = True  # Enable automatic identity analysis by default
    branch_patterns: Optional[list[str]] = (
        None  # Branch patterns to analyze (e.g., ["*"] for all branches)
    )
    # Nested configuration sections (see the corresponding dataclasses).
    branch_analysis: BranchAnalysisConfig = field(default_factory=BranchAnalysisConfig)
    ml_categorization: MLCategorization = field(default_factory=MLCategorization)
    commit_classification: CommitClassificationConfig = field(
        default_factory=CommitClassificationConfig
    )
    llm_classification: LLMClassificationConfig = field(default_factory=LLMClassificationConfig)
317
+
318
+
319
@dataclass
class OutputConfig:
    """Output configuration."""

    # Directory reports are written to (None = decided by caller).
    directory: Optional[Path] = None
    # Report formats to generate; validated elsewhere against csv/markdown/json.
    formats: list[str] = field(default_factory=lambda: ["csv", "markdown"])
    csv_delimiter: str = ","
    csv_encoding: str = "utf-8"
    # Anonymization of report fields.
    anonymize_enabled: bool = False
    # Field names to anonymize when enabled.
    anonymize_fields: list[str] = field(default_factory=list)
    # Anonymization strategy; validated elsewhere against hash/random/sequential.
    anonymize_method: str = "hash"
330
+
331
+
332
@dataclass
class CacheConfig:
    """Cache configuration."""

    # Cache location; relative paths resolve against the working directory.
    directory: Path = Path(".gitflow-cache")
    # Cache entry time-to-live; 168 hours = one week.
    ttl_hours: int = 168
    # Soft cap on cache size in megabytes.
    max_size_mb: int = 500
339
+
340
+
341
@dataclass
class JIRAConfig:
    """JIRA configuration.

    Both ``access_user`` and ``access_token`` are required; ``base_url``
    identifies the JIRA instance when set.
    """

    access_user: str
    access_token: str
    base_url: Optional[str] = None
348
+
349
+
350
@dataclass
class JIRAIntegrationConfig:
    """JIRA integration specific configuration."""

    enabled: bool = True
    fetch_story_points: bool = True
    # Restrict lookups to these JIRA project keys (empty = all projects).
    project_keys: list[str] = field(default_factory=list)
    # Custom field ids / names checked, in order, for story point values.
    story_point_fields: list[str] = field(
        default_factory=lambda: ["customfield_10016", "customfield_10021", "Story Points"]
    )
360
+
361
+
362
@dataclass
class PMPlatformConfig:
    """Base PM platform configuration."""

    enabled: bool = True
    # Identifier of the platform adapter (e.g. "jira"); empty until set.
    platform_type: str = ""
    # Free-form, platform-specific settings passed through to the adapter.
    config: dict[str, Any] = field(default_factory=dict)
369
+
370
+
371
@dataclass
class PMIntegrationConfig:
    """PM framework integration configuration."""

    # Disabled by default; PM integration is opt-in.
    enabled: bool = False
    # Name of the platform used as source of truth when several are configured.
    primary_platform: Optional[str] = None
    # Settings controlling commit/ticket correlation (free-form).
    correlation: dict[str, Any] = field(default_factory=dict)
    # Per-platform configurations keyed by platform name.
    platforms: dict[str, PMPlatformConfig] = field(default_factory=dict)
379
+
380
+
381
@dataclass
class Config:
    """Main configuration container.

    Aggregates every configuration section; the first five fields are
    required, the integrations are optional.
    """

    repositories: list[RepositoryConfig]
    github: GitHubConfig
    analysis: AnalysisConfig
    output: OutputConfig
    cache: CacheConfig
    jira: Optional[JIRAConfig] = None
    jira_integration: Optional[JIRAIntegrationConfig] = None
    pm: Optional[Any] = None  # Modern PM framework config
    pm_integration: Optional[PMIntegrationConfig] = None
    qualitative: Optional["QualitativeConfig"] = None

    def discover_organization_repositories(
        self, clone_base_path: Optional[Path] = None
    ) -> list[RepositoryConfig]:
        """Discover repositories from GitHub organization.

        Args:
            clone_base_path: Base directory where repos should be cloned/found.
                If None, uses output directory.

        Returns:
            List of discovered repository configurations.

        Raises:
            ValueError: when discovery fails or no base path is available.
        """
        # Discovery requires both an organization name and a token.
        if not self.github.organization or not self.github.token:
            return []

        # Imported lazily so the dependency is only needed when discovery runs.
        from github import Github

        client = Github(self.github.token, base_url=self.github.base_url)

        try:
            org = client.get_organization(self.github.organization)

            base_path = clone_base_path or self.output.directory
            if base_path is None:
                # NOTE: raised inside the try so it is re-wrapped below,
                # matching the single ValueError surface of this method.
                raise ValueError("No base path available for repository cloning")

            # One config per non-archived repository in the organization.
            return [
                RepositoryConfig(
                    name=repo.name,
                    path=base_path / repo.name,
                    github_repo=repo.full_name,
                    project_key=repo.name.upper().replace("-", "_"),
                    branch=repo.default_branch,
                )
                for repo in org.get_repos()
                if not repo.archived
            ]
        except Exception as e:
            raise ValueError(
                f"Failed to discover repositories from organization {self.github.organization}: {e}"
            ) from e
@@ -0,0 +1,154 @@
1
+ """Configuration validation logic for GitFlow Analytics."""
2
+
3
+ from pathlib import Path
4
+
5
+ from .schema import Config
6
+
7
+
8
class ConfigValidator:
    """Validates configuration settings."""

    @staticmethod
    def validate_config(config: Config) -> list[str]:
        """Validate configuration and return list of warnings.

        Args:
            config: Configuration to validate

        Returns:
            List of warning messages
        """
        warnings = []

        # Check repository paths exist
        for repo in config.repositories:
            if not repo.path.exists():
                warnings.append(f"Repository path does not exist: {repo.path}")
            elif not (repo.path / ".git").exists():
                warnings.append(f"Path is not a git repository: {repo.path}")

        # Check GitHub token if GitHub repos are specified
        has_github_repos = any(r.github_repo for r in config.repositories)
        if has_github_repos and not config.github.token:
            warnings.append("GitHub repositories specified but no GitHub token provided")

        # Check if owner is needed for short-form ("name" without "owner/") repos
        for repo in config.repositories:
            if repo.github_repo and "/" not in repo.github_repo and not config.github.owner:
                warnings.append(f"Repository {repo.github_repo} needs owner specified")

        # Check cache directory can be created. Catch OSError (not just
        # PermissionError) so that e.g. a plain file occupying the cache
        # path (FileExistsError) or a read-only filesystem also produces a
        # warning instead of crashing validation.
        try:
            config.cache.directory.mkdir(exist_ok=True, parents=True)
        except OSError:
            warnings.append(f"Cannot create cache directory: {config.cache.directory}")

        return warnings

    @staticmethod
    def validate_analysis_config(analysis_config: dict, config_path: Path) -> None:
        """Validate analysis configuration section.

        Args:
            analysis_config: Analysis configuration dictionary
            config_path: Path to configuration file (for error messages)

        Raises:
            InvalidValueError: If configuration values are invalid
        """
        from .errors import InvalidValueError

        # Validate similarity threshold (fuzzy identity matching, 0.0-1.0)
        if "identity" in analysis_config:
            threshold = analysis_config["identity"].get("similarity_threshold")
            if threshold is not None and not (0.0 <= threshold <= 1.0):
                raise InvalidValueError(
                    "similarity_threshold",
                    threshold,
                    "must be between 0.0 and 1.0",
                    config_path,
                    valid_values=["0.0 to 1.0"],
                )

        # Validate ML categorization settings (confidence/weight are fractions)
        if "ml_categorization" in analysis_config:
            ml_config = analysis_config["ml_categorization"]

            if "min_confidence" in ml_config:
                conf = ml_config["min_confidence"]
                if not (0.0 <= conf <= 1.0):
                    raise InvalidValueError(
                        "ml_categorization.min_confidence",
                        conf,
                        "must be between 0.0 and 1.0",
                        config_path,
                        valid_values=["0.0 to 1.0"],
                    )

            if "semantic_weight" in ml_config:
                weight = ml_config["semantic_weight"]
                if not (0.0 <= weight <= 1.0):
                    raise InvalidValueError(
                        "ml_categorization.semantic_weight",
                        weight,
                        "must be between 0.0 and 1.0",
                        config_path,
                        valid_values=["0.0 to 1.0"],
                    )

        # Validate branch analysis strategy against the known strategies
        if "branch_analysis" in analysis_config:
            branch_config = analysis_config["branch_analysis"]
            if "strategy" in branch_config:
                strategy = branch_config["strategy"]
                valid_strategies = ["all", "smart", "main_only"]
                if strategy not in valid_strategies:
                    raise InvalidValueError(
                        "branch_analysis.strategy",
                        strategy,
                        "invalid branch analysis strategy",
                        config_path,
                        valid_values=valid_strategies,
                    )

    @staticmethod
    def validate_output_config(output_config: dict, config_path: Path) -> None:
        """Validate output configuration section.

        Args:
            output_config: Output configuration dictionary
            config_path: Path to configuration file (for error messages)

        Raises:
            InvalidValueError: If configuration values are invalid
        """
        from .errors import InvalidValueError

        # Validate output formats against the supported report writers
        if "formats" in output_config:
            formats = output_config["formats"]
            valid_formats = ["csv", "markdown", "json"]
            for fmt in formats:
                if fmt not in valid_formats:
                    raise InvalidValueError(
                        "output.formats",
                        fmt,
                        "invalid output format",
                        config_path,
                        valid_values=valid_formats,
                    )

        # Validate anonymization method
        if "anonymization" in output_config:
            anon_config = output_config["anonymization"]
            if "method" in anon_config:
                method = anon_config["method"]
                valid_methods = ["hash", "random", "sequential"]
                if method not in valid_methods:
                    raise InvalidValueError(
                        "output.anonymization.method",
                        method,
                        "invalid anonymization method",
                        config_path,
                        valid_values=valid_methods,
                    )
+ )