gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,441 @@
1
+ """Configuration schema definitions and defaults for GitFlow Analytics."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Optional
6
+
7
+ if TYPE_CHECKING:
8
+ from ..qualitative.models.schemas import QualitativeConfig
9
+
10
+
11
+ @dataclass
12
+ class RepositoryConfig:
13
+ """Configuration for a single repository."""
14
+
15
+ name: str
16
+ path: Path
17
+ github_repo: Optional[str] = None
18
+ project_key: Optional[str] = None
19
+ branch: Optional[str] = None
20
+
21
+ def __post_init__(self) -> None:
22
+ self.path = Path(self.path).expanduser().resolve()
23
+ if not self.project_key:
24
+ self.project_key = self.name.upper().replace("-", "_")
25
+
26
+
27
@dataclass
class GitHubConfig:
    """GitHub API connection settings (token, default owner, endpoint)."""

    token: Optional[str] = None
    owner: Optional[str] = None
    organization: Optional[str] = None
    base_url: str = "https://api.github.com"
    max_retries: int = 3
    backoff_factor: int = 2

    def get_repo_full_name(self, repo_name: str) -> str:
        """Get full repository name including owner.

        An already-qualified "owner/repo" name is returned unchanged;
        otherwise the configured owner is prepended.

        Raises:
            ValueError: if the name is unqualified and no owner is set.
        """
        # Names containing "/" are treated as fully qualified already.
        if "/" in repo_name:
            return repo_name
        if not self.owner:
            raise ValueError(f"Repository {repo_name} needs owner specified")
        return f"{self.owner}/{repo_name}"
45
+
46
+
47
@dataclass
class MLCategorization:
    """ML-based commit categorization configuration.

    Holds the tunables for the hybrid categorizer: confidence gating,
    the relative weighting of semantic vs. file-pattern signals, and
    result caching. The weights default to 0.7/0.3 (summing to 1.0);
    enforcement of that invariant presumably lives in the consumer —
    TODO confirm against the categorizer implementation.
    """

    enabled: bool = True  # Master switch for ML categorization
    min_confidence: float = 0.6  # Minimum confidence (0.0-1.0); validated by ConfigValidator
    semantic_weight: float = 0.7  # Weight of the semantic (NLP) signal; validated 0.0-1.0
    file_pattern_weight: float = 0.3  # Weight of the file-path pattern signal
    hybrid_threshold: float = 0.5  # Confidence threshold for using ML vs rule-based
    cache_duration_days: int = 30  # How long cached categorization results stay valid
    batch_size: int = 100  # Commits processed per categorization batch
    enable_caching: bool = True  # Persist categorization results between runs
    spacy_model: str = "en_core_web_sm"  # Preferred spaCy model
60
+
61
+
62
@dataclass
class LLMClassificationConfig:
    """LLM-based commit classification configuration.

    This configuration enables Large Language Model-based commit classification
    via OpenRouter API for more accurate and context-aware categorization.
    Disabled by default so no API calls (and costs) happen unless the user
    opts in; when the LLM is unavailable, the fallback_* flags control
    degradation to rule-based or ML classification.
    """

    # Enable/disable LLM classification
    enabled: bool = False  # Disabled by default to avoid unexpected API costs

    # OpenRouter API configuration
    api_key: Optional[str] = None  # Set via environment variable or config
    api_base_url: str = "https://openrouter.ai/api/v1"
    model: str = "mistralai/mistral-7b-instruct"  # Fast, affordable model

    # Alternative models for different use cases:
    # - "meta-llama/llama-3-8b-instruct" (Higher accuracy, slightly more expensive)
    # - "openai/gpt-3.5-turbo" (Good balance, more expensive)

    # Classification parameters
    confidence_threshold: float = 0.7  # Minimum confidence for LLM predictions
    max_tokens: int = 50  # Keep responses short for cost optimization
    temperature: float = 0.1  # Low temperature for consistent results
    timeout_seconds: float = 30.0  # API request timeout

    # Caching configuration (aggressive caching for cost optimization)
    cache_duration_days: int = 90  # Long cache duration
    enable_caching: bool = True

    # Cost and rate limiting
    max_daily_requests: int = 1000  # Daily API request limit

    # Domain-specific terms for better classification accuracy.
    # Keys are domain names, values are keyword lists; presumably consumed
    # when building classification prompts — confirm against the LLM client.
    domain_terms: dict[str, list[str]] = field(
        default_factory=lambda: {
            "media": [
                "video",
                "audio",
                "streaming",
                "player",
                "media",
                "content",
                "broadcast",
                "live",
                "recording",
                "episode",
                "program",
                "tv",
                "radio",
                "podcast",
                "channel",
                "playlist",
            ],
            "localization": [
                "translation",
                "i18n",
                "l10n",
                "locale",
                "language",
                "spanish",
                "french",
                "german",
                "italian",
                "portuguese",
                "multilingual",
                "translate",
                "localize",
                "regional",
            ],
            "integration": [
                "api",
                "webhook",
                "third-party",
                "external",
                "service",
                "integration",
                "sync",
                "import",
                "export",
                "connector",
                "oauth",
                "auth",
                "authentication",
                "sso",
            ],
            "content": [
                "copy",
                "text",
                "wording",
                "messaging",
                "editorial",
                "article",
                "blog",
                "news",
                "story",
                "caption",
                "title",
                "headline",
                "description",
                "summary",
                "metadata",
            ],
        }
    )

    # Fallback behavior when LLM is unavailable
    fallback_to_rules: bool = True  # Fall back to rule-based classification
    fallback_to_ml: bool = True  # Fall back to existing ML classification
171
+
172
+
173
@dataclass
class CommitClassificationConfig:
    """Configuration for commit classification system.

    This configuration controls the Random Forest-based commit classification
    system that analyzes commits to categorize them into types like feature,
    bugfix, refactor, docs, test, etc.

    The model/feature_extraction/training dicts are passed through as free-form
    hyperparameter bundles; their keys mirror scikit-learn RandomForest
    parameter names — consumers should validate unknown keys.
    """

    enabled: bool = True  # Master switch for commit classification
    confidence_threshold: float = 0.5  # Minimum confidence for reliable predictions
    batch_size: int = 100  # Commits processed per batch
    auto_retrain: bool = True  # Automatically check if model needs retraining
    retrain_threshold_days: int = 30  # Days after which to suggest retraining

    # Model hyperparameters
    model: dict[str, Any] = field(
        default_factory=lambda: {
            "n_estimators": 100,  # Number of trees in random forest
            "max_depth": 20,  # Maximum depth of trees
            "min_samples_split": 5,  # Minimum samples to split a node
            "min_samples_leaf": 2,  # Minimum samples at leaf node
            "random_state": 42,  # For reproducible results
            "n_jobs": -1,  # Use all available CPU cores
        }
    )

    # Feature extraction settings
    feature_extraction: dict[str, Any] = field(
        default_factory=lambda: {
            "enable_temporal_features": True,
            "enable_author_features": True,
            "enable_file_analysis": True,
            "keyword_categories": [
                "feature",
                "bugfix",
                "refactor",
                "docs",
                "test",
                "config",
                "security",
                "performance",
                "ui",
                "api",
                "database",
                "deployment",
            ],
        }
    )

    # Training settings
    training: dict[str, Any] = field(
        default_factory=lambda: {
            "validation_split": 0.2,  # Fraction for validation
            "min_training_samples": 20,  # Minimum samples needed for training
            "cross_validation_folds": 5,  # K-fold cross validation
            "class_weight": "balanced",  # Handle class imbalance
        }
    )

    # Supported classification categories (label -> human-readable description)
    categories: dict[str, str] = field(
        default_factory=lambda: {
            "feature": "New functionality or capabilities",
            "bugfix": "Bug fixes and error corrections",
            "refactor": "Code restructuring and optimization",
            "docs": "Documentation changes and updates",
            "test": "Testing-related changes",
            "config": "Configuration and settings changes",
            "chore": "Maintenance and housekeeping tasks",
            "security": "Security-related changes",
            "hotfix": "Emergency production fixes",
            "style": "Code style and formatting changes",
            "build": "Build system and dependency changes",
            "ci": "Continuous integration changes",
            "revert": "Reverts of previous changes",
            "merge": "Merge commits and integration",
            "wip": "Work in progress commits",
        }
    )
253
+
254
+
255
@dataclass
class BranchAnalysisConfig:
    """Configuration for branch analysis optimization.

    This configuration controls how branches are analyzed to prevent performance
    issues on large organizations with many repositories and branches.

    The include/exclude patterns are regular-expression strings; include
    patterns win for main/release/hotfix branches, exclude patterns drop
    bot-generated and throwaway branches.
    """

    # Branch analysis strategy
    strategy: str = "smart"  # Options: "all", "smart", "main_only" (validated by ConfigValidator)

    # Smart analysis parameters
    max_branches_per_repo: int = 50  # Maximum branches to analyze per repository
    active_days_threshold: int = 90  # Days to consider a branch "active"
    include_main_branches: bool = True  # Always include main/master branches

    # Branch name patterns to always include/exclude
    always_include_patterns: list[str] = field(
        default_factory=lambda: [
            r"^(main|master|develop|dev)$",  # Main development branches
            r"^release/.*",  # Release branches
            r"^hotfix/.*",  # Hotfix branches
        ]
    )

    always_exclude_patterns: list[str] = field(
        default_factory=lambda: [
            r"^dependabot/.*",  # Dependabot branches
            r"^renovate/.*",  # Renovate branches
            r".*-backup$",  # Backup branches
            r".*-temp$",  # Temporary branches
        ]
    )

    # Performance limits
    enable_progress_logging: bool = True  # Log branch analysis progress
    branch_commit_limit: int = 1000  # Max commits to analyze per branch
292
+
293
+
294
@dataclass
class AnalysisConfig:
    """Analysis-specific configuration.

    Aggregates the tunables for commit analysis: exclusion filters, identity
    resolution, ticket-platform selection, and the nested branch-analysis /
    ML-categorization / classification sub-configurations.
    """

    # Extra regex patterns for story-point extraction from commit messages.
    story_point_patterns: list[str] = field(default_factory=list)
    # Authors excluded from analysis.
    exclude_authors: list[str] = field(default_factory=list)
    # Commit-message patterns excluded from analysis.
    exclude_message_patterns: list[str] = field(default_factory=list)
    # File-path patterns excluded from analysis.
    exclude_paths: list[str] = field(default_factory=list)
    # Identity similarity threshold (0.0-1.0); validated by ConfigValidator.
    similarity_threshold: float = 0.85
    # Explicit identity mappings that override automatic resolution.
    manual_identity_mappings: list[dict[str, Any]] = field(default_factory=list)
    # Ticket platform assumed when a reference has no explicit prefix.
    default_ticket_platform: Optional[str] = None
    # Mapping of branch categories to name patterns.
    branch_mapping_rules: dict[str, list[str]] = field(default_factory=dict)
    # Restrict ticket extraction to these platforms; None means all.
    ticket_platforms: Optional[list[str]] = None
    auto_identity_analysis: bool = True  # Enable automatic identity analysis by default
    # Nested sub-configurations (each carries its own defaults).
    branch_analysis: BranchAnalysisConfig = field(default_factory=BranchAnalysisConfig)
    ml_categorization: MLCategorization = field(default_factory=MLCategorization)
    commit_classification: CommitClassificationConfig = field(
        default_factory=CommitClassificationConfig
    )
    llm_classification: LLMClassificationConfig = field(default_factory=LLMClassificationConfig)
314
+
315
+
316
@dataclass
class OutputConfig:
    """Output configuration for generated reports."""

    # Directory reports are written to; None until supplied by the loader.
    directory: Optional[Path] = None
    # Report formats to emit; "csv", "markdown", "json" are accepted by ConfigValidator.
    formats: list[str] = field(default_factory=lambda: ["csv", "markdown"])
    csv_delimiter: str = ","
    csv_encoding: str = "utf-8"
    anonymize_enabled: bool = False  # Master switch for anonymization
    anonymize_fields: list[str] = field(default_factory=list)  # Field names to anonymize
    # Anonymization method; "hash", "random", "sequential" are accepted by ConfigValidator.
    anonymize_method: str = "hash"
327
+
328
+
329
@dataclass
class CacheConfig:
    """Cache configuration."""

    # Cache location; relative default resolves against the current working
    # directory unless the loader rewrites it — TODO confirm loader behavior.
    directory: Path = Path(".gitflow-cache")
    ttl_hours: int = 168  # Cache time-to-live: 168 hours = one week
    max_size_mb: int = 500  # Upper bound on cache size in megabytes
336
+
337
+
338
@dataclass
class JIRAConfig:
    """JIRA connection credentials.

    Both access_user and access_token are required; base_url is optional
    (e.g. for self-hosted JIRA instances).
    """

    access_user: str  # JIRA account user (typically an email address — confirm)
    access_token: str  # JIRA API token
    base_url: Optional[str] = None  # JIRA instance URL; None when unset
345
+
346
+
347
@dataclass
class JIRAIntegrationConfig:
    """JIRA integration specific configuration."""

    enabled: bool = True  # Master switch for the JIRA integration
    fetch_story_points: bool = True  # Pull story points from JIRA issues
    project_keys: list[str] = field(default_factory=list)  # JIRA project keys to query
    # Candidate JIRA fields to read story points from; the customfield_*
    # ids are common Atlassian defaults for the story-point estimate field.
    story_point_fields: list[str] = field(
        default_factory=lambda: ["customfield_10016", "customfield_10021", "Story Points"]
    )
357
+
358
+
359
@dataclass
class PMPlatformConfig:
    """Base PM platform configuration.

    Generic container for one project-management platform entry inside
    PMIntegrationConfig; platform-specific settings live in the free-form
    config dict.
    """

    enabled: bool = True  # Whether this platform entry is active
    platform_type: str = ""  # Platform identifier (e.g. adapter name); empty when unset
    config: dict[str, Any] = field(default_factory=dict)  # Platform-specific settings
366
+
367
+
368
@dataclass
class PMIntegrationConfig:
    """PM framework integration configuration."""

    enabled: bool = False  # Disabled by default; opt-in feature
    primary_platform: Optional[str] = None  # Key into `platforms` for the primary source
    correlation: dict[str, Any] = field(default_factory=dict)  # Free-form correlation settings
    platforms: dict[str, PMPlatformConfig] = field(default_factory=dict)  # Platform name -> config
376
+
377
+
378
@dataclass
class Config:
    """Top-level configuration aggregating every sub-configuration.

    The first five fields are required; the remaining integrations are
    optional and default to None when not configured.
    """

    repositories: list[RepositoryConfig]
    github: GitHubConfig
    analysis: AnalysisConfig
    output: OutputConfig
    cache: CacheConfig
    jira: Optional[JIRAConfig] = None
    jira_integration: Optional[JIRAIntegrationConfig] = None
    pm: Optional[Any] = None  # Modern PM framework config
    pm_integration: Optional[PMIntegrationConfig] = None
    qualitative: Optional["QualitativeConfig"] = None

    def discover_organization_repositories(
        self, clone_base_path: Optional[Path] = None
    ) -> list[RepositoryConfig]:
        """Discover repositories from GitHub organization.

        Args:
            clone_base_path: Base directory where repos should be cloned/found.
                If None, uses output directory.

        Returns:
            List of discovered repository configurations.
        """
        # Without both an organization and a token there is nothing to discover.
        if not self.github.organization or not self.github.token:
            return []

        # Imported lazily so the GitHub dependency is only needed when used.
        from github import Github

        client = Github(self.github.token, base_url=self.github.base_url)

        try:
            organization = client.get_organization(self.github.organization)

            root = clone_base_path or self.output.directory
            if root is None:
                raise ValueError("No base path available for repository cloning")

            # One RepositoryConfig per non-archived repository in the org.
            return [
                RepositoryConfig(
                    name=remote.name,
                    path=root / remote.name,
                    github_repo=remote.full_name,
                    project_key=remote.name.upper().replace("-", "_"),
                    branch=remote.default_branch,
                )
                for remote in organization.get_repos()
                if not remote.archived
            ]

        except Exception as e:
            # Wrap any API/lookup failure with the organization for context.
            raise ValueError(
                f"Failed to discover repositories from organization {self.github.organization}: {e}"
            ) from e
@@ -0,0 +1,154 @@
1
+ """Configuration validation logic for GitFlow Analytics."""
2
+
3
+ from pathlib import Path
4
+
5
+ from .schema import Config
6
+
7
+
8
+ class ConfigValidator:
9
+ """Validates configuration settings."""
10
+
11
+ @staticmethod
12
+ def validate_config(config: Config) -> list[str]:
13
+ """Validate configuration and return list of warnings.
14
+
15
+ Args:
16
+ config: Configuration to validate
17
+
18
+ Returns:
19
+ List of warning messages
20
+ """
21
+ warnings = []
22
+
23
+ # Check repository paths exist
24
+ for repo in config.repositories:
25
+ if not repo.path.exists():
26
+ warnings.append(f"Repository path does not exist: {repo.path}")
27
+ elif not (repo.path / ".git").exists():
28
+ warnings.append(f"Path is not a git repository: {repo.path}")
29
+
30
+ # Check GitHub token if GitHub repos are specified
31
+ has_github_repos = any(r.github_repo for r in config.repositories)
32
+ if has_github_repos and not config.github.token:
33
+ warnings.append("GitHub repositories specified but no GitHub token provided")
34
+
35
+ # Check if owner is needed
36
+ for repo in config.repositories:
37
+ if repo.github_repo and "/" not in repo.github_repo and not config.github.owner:
38
+ warnings.append(f"Repository {repo.github_repo} needs owner specified")
39
+
40
+ # Check cache directory permissions
41
+ try:
42
+ config.cache.directory.mkdir(exist_ok=True, parents=True)
43
+ except PermissionError:
44
+ warnings.append(f"Cannot create cache directory: {config.cache.directory}")
45
+
46
+ return warnings
47
+
48
+ @staticmethod
49
+ def validate_analysis_config(analysis_config: dict, config_path: Path) -> None:
50
+ """Validate analysis configuration section.
51
+
52
+ Args:
53
+ analysis_config: Analysis configuration dictionary
54
+ config_path: Path to configuration file (for error messages)
55
+
56
+ Raises:
57
+ InvalidValueError: If configuration values are invalid
58
+ """
59
+ from .errors import InvalidValueError
60
+
61
+ # Validate similarity threshold
62
+ if "identity" in analysis_config:
63
+ threshold = analysis_config["identity"].get("similarity_threshold")
64
+ if threshold is not None and not (0.0 <= threshold <= 1.0):
65
+ raise InvalidValueError(
66
+ "similarity_threshold",
67
+ threshold,
68
+ "must be between 0.0 and 1.0",
69
+ config_path,
70
+ valid_values=["0.0 to 1.0"],
71
+ )
72
+
73
+ # Validate ML categorization settings
74
+ if "ml_categorization" in analysis_config:
75
+ ml_config = analysis_config["ml_categorization"]
76
+
77
+ if "min_confidence" in ml_config:
78
+ conf = ml_config["min_confidence"]
79
+ if not (0.0 <= conf <= 1.0):
80
+ raise InvalidValueError(
81
+ "ml_categorization.min_confidence",
82
+ conf,
83
+ "must be between 0.0 and 1.0",
84
+ config_path,
85
+ valid_values=["0.0 to 1.0"],
86
+ )
87
+
88
+ if "semantic_weight" in ml_config:
89
+ weight = ml_config["semantic_weight"]
90
+ if not (0.0 <= weight <= 1.0):
91
+ raise InvalidValueError(
92
+ "ml_categorization.semantic_weight",
93
+ weight,
94
+ "must be between 0.0 and 1.0",
95
+ config_path,
96
+ valid_values=["0.0 to 1.0"],
97
+ )
98
+
99
+ # Validate branch analysis strategy
100
+ if "branch_analysis" in analysis_config:
101
+ branch_config = analysis_config["branch_analysis"]
102
+ if "strategy" in branch_config:
103
+ strategy = branch_config["strategy"]
104
+ valid_strategies = ["all", "smart", "main_only"]
105
+ if strategy not in valid_strategies:
106
+ raise InvalidValueError(
107
+ "branch_analysis.strategy",
108
+ strategy,
109
+ "invalid branch analysis strategy",
110
+ config_path,
111
+ valid_values=valid_strategies,
112
+ )
113
+
114
+ @staticmethod
115
+ def validate_output_config(output_config: dict, config_path: Path) -> None:
116
+ """Validate output configuration section.
117
+
118
+ Args:
119
+ output_config: Output configuration dictionary
120
+ config_path: Path to configuration file (for error messages)
121
+
122
+ Raises:
123
+ InvalidValueError: If configuration values are invalid
124
+ """
125
+ from .errors import InvalidValueError
126
+
127
+ # Validate output formats
128
+ if "formats" in output_config:
129
+ formats = output_config["formats"]
130
+ valid_formats = ["csv", "markdown", "json"]
131
+ for fmt in formats:
132
+ if fmt not in valid_formats:
133
+ raise InvalidValueError(
134
+ "output.formats",
135
+ fmt,
136
+ "invalid output format",
137
+ config_path,
138
+ valid_values=valid_formats,
139
+ )
140
+
141
+ # Validate anonymization method
142
+ if "anonymization" in output_config:
143
+ anon_config = output_config["anonymization"]
144
+ if "method" in anon_config:
145
+ method = anon_config["method"]
146
+ valid_methods = ["hash", "random", "sequential"]
147
+ if method not in valid_methods:
148
+ raise InvalidValueError(
149
+ "output.anonymization.method",
150
+ method,
151
+ "invalid anonymization method",
152
+ config_path,
153
+ valid_values=valid_methods,
154
+ )