gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/gitflow_analytics/classification/classifier.py
@@ -0,0 +1,464 @@
+"""Main commit classification orchestrator.
+
+This module provides the primary interface for commit classification,
+orchestrating feature extraction, model training, and prediction.
+It integrates with GitFlow Analytics' existing infrastructure and
+provides both training and inference capabilities.
+"""
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+
+from .feature_extractor import FeatureExtractor
+from .linguist_analyzer import LinguistAnalyzer
+from .model import CommitClassificationModel
+
+logger = logging.getLogger(__name__)
+
+
+class CommitClassifier:
+    """Main interface for commit classification.
+
+    This class provides a high-level interface for commit classification,
+    handling the entire pipeline from feature extraction to prediction.
+    It's designed to integrate seamlessly with GitFlow Analytics while
+    providing standalone functionality for other use cases.
+
+    Key capabilities:
+    - Automated feature extraction from git commits
+    - Model training with cross-validation
+    - Batch and single commit prediction
+    - Performance monitoring and metrics
+    - Integration with existing GitFlow Analytics caching
+    """
+
+    def __init__(self, config: Optional[dict[str, Any]] = None, cache_dir: Optional[Path] = None):
+        """Initialize the commit classifier.
+
+        Args:
+            config: Configuration dictionary for classification parameters
+            cache_dir: Directory for caching models and intermediate results
+        """
+        self.config = config or {}
+
+        # Setup paths
+        self.cache_dir = cache_dir or Path(".gitflow-cache")
+        self.model_path = self.cache_dir / "classification"
+        self.model_path.mkdir(parents=True, exist_ok=True)
+
+        # Initialize components
+        self.feature_extractor = FeatureExtractor()
+        self.linguist_analyzer = LinguistAnalyzer()
+        self.model = CommitClassificationModel(
+            model_path=self.model_path, config=self.config.get("model", {})
+        )
+
+        # Classification configuration
+        self.enabled = self.config.get("enabled", True)
+        self.confidence_threshold = self.config.get("confidence_threshold", 0.5)
+        self.batch_size = self.config.get("batch_size", 100)
+        self.auto_retrain = self.config.get("auto_retrain", True)
+        self.retrain_threshold_days = self.config.get("retrain_threshold_days", 30)
+
+        # Supported classification categories
+        self.classification_categories = {
+            "feature": "New functionality or capabilities",
+            "bugfix": "Bug fixes and error corrections",
+            "refactor": "Code restructuring and optimization",
+            "docs": "Documentation changes and updates",
+            "test": "Testing-related changes",
+            "config": "Configuration and settings changes",
+            "chore": "Maintenance and housekeeping tasks",
+            "security": "Security-related changes",
+            "hotfix": "Emergency production fixes",
+            "style": "Code style and formatting changes",
+            "build": "Build system and dependency changes",
+            "ci": "Continuous integration changes",
+            "revert": "Reverts of previous changes",
+            "merge": "Merge commits and integration",
+            "wip": "Work in progress commits",
+        }
+
+        logger.info(
+            f"CommitClassifier initialized with {len(self.classification_categories)} categories"
+        )
+
+    def train_model(
+        self, training_data: list[tuple[dict[str, Any], str]], validation_split: float = 0.2
+    ) -> dict[str, Any]:
+        """Train the classification model on labeled data.
+
+        Args:
+            training_data: List of (commit_data, label) tuples
+            validation_split: Fraction of data to use for validation
+
+        Returns:
+            Dictionary containing training results and metrics
+        """
+        if not self.enabled:
+            raise RuntimeError("Classification is disabled in configuration")
+
+        if len(training_data) < 20:
+            raise ValueError("Need at least 20 labeled examples for reliable training")
+
+        logger.info(f"Training commit classifier on {len(training_data)} examples")
+
+        # Separate commits and labels
+        commits = [item[0] for item in training_data]
+        labels = [item[1] for item in training_data]
+
+        # Validate labels
+        valid_labels = set(self.classification_categories.keys())
+        invalid_labels = set(labels) - valid_labels
+        if invalid_labels:
+            logger.warning(f"Found invalid labels: {invalid_labels}. Using fallback mapping.")
+            labels = [self._map_fallback_label(label) for label in labels]
+
+        # Train the model
+        training_results = self.model.train(commits, labels, validation_split)
+
+        # Log training summary
+        accuracy = training_results.get("accuracy", 0.0)
+        logger.info(f"Model training completed with accuracy: {accuracy:.3f}")
+
+        return training_results
+
+    def classify_commits(self, commits: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Classify a batch of commits.
+
+        Args:
+            commits: List of commit data dictionaries
+
+        Returns:
+            List of classification results with predictions and metadata
+        """
+        if not self.enabled:
+            logger.info("Classification disabled, returning empty results")
+            return []
+
+        if not commits:
+            return []
+
+        logger.info(f"Classifying {len(commits)} commits")
+
+        # Check if model needs retraining
+        if self.auto_retrain and self.model.retrain_needed(self.retrain_threshold_days):
+            logger.warning("Model may need retraining - consider updating with recent data")
+
+        # Process commits in batches for memory efficiency
+        results = []
+        for i in range(0, len(commits), self.batch_size):
+            batch = commits[i : i + self.batch_size]
+            batch_results = self._classify_batch(batch)
+            results.extend(batch_results)
+
+        logger.info(f"Classification completed for {len(results)} commits")
+        return results
+
+    def _classify_batch(self, commit_batch: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Classify a single batch of commits.
+
+        Args:
+            commit_batch: Batch of commit data dictionaries
+
+        Returns:
+            List of classification results for the batch
+        """
+        # Get model predictions
+        predictions = self.model.predict(commit_batch)
+
+        # Enhance results with additional analysis
+        enhanced_results = []
+        for _i, (commit, prediction) in enumerate(zip(commit_batch, predictions)):
+            # Add file analysis context
+            file_analysis = self.linguist_analyzer.analyze_commit_files(
+                commit.get("files_changed", [])
+            )
+
+            # Determine if prediction is reliable
+            confidence = prediction["confidence"]
+            is_reliable = confidence >= self.confidence_threshold
+
+            # Create enhanced result
+            result = {
+                "commit_hash": commit.get("hash", ""),
+                "commit_message": commit.get("message", ""),
+                "predicted_class": prediction["predicted_class"],
+                "confidence": confidence,
+                "is_reliable_prediction": is_reliable,
+                "class_probabilities": prediction["class_probabilities"],
+                "file_analysis": {
+                    "primary_language": file_analysis["primary_language"],
+                    "primary_activity": file_analysis["primary_activity"],
+                    "file_count": file_analysis["file_count"],
+                    "is_multilingual": file_analysis["is_multilingual"],
+                    "is_cross_functional": file_analysis["is_cross_functional"],
+                },
+                "classification_metadata": {
+                    "model_timestamp": self.model.training_timestamp,
+                    "feature_count": 68,
+                    "categories_available": len(self.classification_categories),
+                },
+            }
+
+            enhanced_results.append(result)
+
+        return enhanced_results
+
+    def classify_single_commit(self, commit: dict[str, Any]) -> dict[str, Any]:
+        """Classify a single commit.
+
+        Args:
+            commit: Commit data dictionary
+
+        Returns:
+            Classification result dictionary
+        """
+        results = self.classify_commits([commit])
+        return results[0] if results else {}
+
+    def get_feature_importance(self, top_n: int = 20) -> list[tuple[str, float]]:
+        """Get feature importance rankings from the trained model.
+
+        Args:
+            top_n: Number of top features to return
+
+        Returns:
+            List of (feature_name, importance_score) tuples
+        """
+        return self.model.get_feature_importance(top_n)
+
+    def analyze_commit_patterns(self, commits: list[dict[str, Any]]) -> dict[str, Any]:
+        """Analyze patterns in a collection of commits.
+
+        Args:
+            commits: List of commit data dictionaries
+
+        Returns:
+            Dictionary with pattern analysis results
+        """
+        if not commits:
+            return {}
+
+        # Classify all commits
+        classifications = self.classify_commits(commits)
+
+        # Aggregate pattern statistics
+        class_counts = {}
+        language_usage = {}
+        activity_patterns = {}
+        confidence_distribution = []
+
+        for result in classifications:
+            # Count classifications
+            predicted_class = result["predicted_class"]
+            class_counts[predicted_class] = class_counts.get(predicted_class, 0) + 1
+
+            # Track confidence scores
+            confidence_distribution.append(result["confidence"])
+
+            # Aggregate language usage
+            primary_lang = result["file_analysis"]["primary_language"]
+            if primary_lang:
+                if primary_lang not in language_usage:
+                    language_usage[primary_lang] = {}
+                if predicted_class not in language_usage[primary_lang]:
+                    language_usage[primary_lang][predicted_class] = 0
+                language_usage[primary_lang][predicted_class] += 1
+
+            # Aggregate activity patterns
+            primary_activity = result["file_analysis"]["primary_activity"]
+            if primary_activity:
+                if primary_activity not in activity_patterns:
+                    activity_patterns[primary_activity] = {}
+                if predicted_class not in activity_patterns[primary_activity]:
+                    activity_patterns[primary_activity][predicted_class] = 0
+                activity_patterns[primary_activity][predicted_class] += 1
+
+        # Calculate statistics
+        total_commits = len(classifications)
+        avg_confidence = (
+            sum(confidence_distribution) / len(confidence_distribution)
+            if confidence_distribution
+            else 0.0
+        )
+
+        return {
+            "total_commits_analyzed": total_commits,
+            "classification_distribution": class_counts,
+            "average_confidence": avg_confidence,
+            "high_confidence_ratio": sum(
+                1 for c in confidence_distribution if c >= self.confidence_threshold
+            )
+            / total_commits,
+            "language_usage_patterns": language_usage,
+            "activity_patterns": activity_patterns,
+            "most_common_class": (
+                max(class_counts.items(), key=lambda x: x[1])[0] if class_counts else None
+            ),
+            "classification_diversity": len(class_counts),
+            "supported_categories": list(self.classification_categories.keys()),
+        }
+
+    def _map_fallback_label(self, label: str) -> str:
+        """Map unknown labels to supported categories.
+
+        Args:
+            label: Original label
+
+        Returns:
+            Mapped label from supported categories
+        """
+        label_lower = label.lower()
+
+        # Common mappings
+        mappings = {
+            "feat": "feature",
+            "fix": "bugfix",
+            "bug_fix": "bugfix",  # From training pipeline
+            "doc": "docs",
+            "documentation": "docs",
+            "testing": "test",
+            "tests": "test",
+            "maintenance": "chore",  # From training pipeline
+            "cleanup": "chore",
+            "optimization": "refactor",
+            "optimize": "refactor",
+            "enhancement": "feature",
+            "improvement": "refactor",
+            "styling": "style",
+            "format": "style",
+        }
+
+        return mappings.get(label_lower, "chore")  # Default to chore
+
+    def get_model_status(self) -> dict[str, Any]:
+        """Get comprehensive status of the classification system.
+
+        Returns:
+            Dictionary with system status and capabilities
+        """
+        model_info = self.model.get_model_info()
+
+        return {
+            "enabled": self.enabled,
+            "model_trained": model_info["is_trained"],
+            "sklearn_available": model_info["sklearn_available"],
+            "training_timestamp": model_info["training_timestamp"],
+            "supported_categories": list(self.classification_categories.keys()),
+            "confidence_threshold": self.confidence_threshold,
+            "batch_size": self.batch_size,
+            "model_path": str(self.model_path),
+            "auto_retrain_enabled": self.auto_retrain,
+            "needs_retraining": self.model.retrain_needed(self.retrain_threshold_days),
+            "training_metrics": model_info.get("training_metrics", {}),
+            "cache_directory": str(self.cache_dir),
+        }
+
+    def export_training_data(self, commits: list[dict[str, Any]], output_path: Path) -> None:
+        """Export commits in a format suitable for manual labeling.
+
+        Args:
+            commits: List of commit data dictionaries
+            output_path: Path to save the training data CSV
+        """
+        import csv
+
+        with open(output_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.writer(f)
+
+            # Write header
+            writer.writerow(
+                [
+                    "hash",
+                    "message",
+                    "author",
+                    "timestamp",
+                    "files_changed",
+                    "insertions",
+                    "deletions",
+                    "primary_language",
+                    "primary_activity",
+                    "suggested_class",
+                    "manual_label",
+                ]
+            )
+
+            # Analyze commits for suggestions
+            for commit in commits:
+                file_analysis = self.linguist_analyzer.analyze_commit_files(
+                    commit.get("files_changed", [])
+                )
+
+                # Get a prediction for suggestion
+                if self.model.is_trained:
+                    prediction = self.classify_single_commit(commit)
+                    suggested_class = prediction.get("predicted_class", "unknown")
+                else:
+                    suggested_class = "unknown"
+
+                # Write row
+                writer.writerow(
+                    [
+                        commit.get("hash", ""),
+                        commit.get("message", ""),
+                        commit.get("author_name", ""),
+                        commit.get("timestamp", ""),
+                        len(commit.get("files_changed", [])),
+                        commit.get("insertions", 0),
+                        commit.get("deletions", 0),
+                        file_analysis["primary_language"] or "unknown",
+                        file_analysis["primary_activity"] or "unknown",
+                        suggested_class,
+                        "",  # Empty column for manual labeling
+                    ]
+                )
+
+        logger.info(f"Training data exported to {output_path}")
+
+    def load_training_data(self, csv_path: Path) -> list[tuple[dict[str, Any], str]]:
+        """Load manually labeled training data from CSV.
+
+        Args:
+            csv_path: Path to CSV file with labeled data
+
+        Returns:
+            List of (commit_data, label) tuples
+        """
+        import csv
+
+        training_data = []
+
+        with open(csv_path, encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+
+            for row in reader:
+                # Skip rows without manual labels
+                if not row.get("manual_label", "").strip():
+                    continue
+
+                # Parse timestamp
+                timestamp_str = row.get("timestamp", "")
+                try:
+                    timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
+                except (ValueError, AttributeError):
+                    timestamp = datetime.now()
+
+                # Create commit data structure
+                commit_data = {
+                    "hash": row.get("hash", ""),
+                    "message": row.get("message", ""),
+                    "author_name": row.get("author", ""),
+                    "timestamp": timestamp,
+                    "files_changed": [],  # Would need to be reconstructed from git
+                    "insertions": int(row.get("insertions", 0) or 0),
+                    "deletions": int(row.get("deletions", 0) or 0),
+                }
+
+                label = row["manual_label"].strip()
+                training_data.append((commit_data, label))
+
+        logger.info(f"Loaded {len(training_data)} labeled examples from {csv_path}")
+        return training_data
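
For orientation, here is a minimal usage sketch of the new CommitClassifier API shown in the diff above. It is not taken from the package's documentation: the commit dictionaries are hand-built with the keys the class reads (hash, message, files_changed, insertions, deletions), and it assumes a previously trained model is already cached under .gitflow-cache/classification.

# Hypothetical usage sketch for CommitClassifier (new in 1.3.11).
# Assumes a trained model is already cached; commit dicts are
# hand-built here, while the real pipeline assembles them from git.
from pathlib import Path

from gitflow_analytics.classification.classifier import CommitClassifier

classifier = CommitClassifier(
    config={"confidence_threshold": 0.6, "batch_size": 50},
    cache_dir=Path(".gitflow-cache"),
)

commits = [
    {
        "hash": "abc1234",
        "message": "fix: handle empty config file",
        "files_changed": ["gitflow_analytics/config.py"],
        "insertions": 12,
        "deletions": 3,
    },
]

# classify_commits batches internally (batch_size) and flags predictions
# below confidence_threshold as unreliable rather than dropping them.
for result in classifier.classify_commits(commits):
    print(
        result["predicted_class"],
        f"{result['confidence']:.2f}",
        result["is_reliable_prediction"],
    )

# get_model_status() reports training state, scikit-learn availability,
# and whether the model is stale per retrain_threshold_days.
status = classifier.get_model_status()
print(status["model_trained"], status["needs_retraining"])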
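
The export_training_data / load_training_data / train_model methods form a manual-labeling round trip. A sketch of that workflow under the same assumptions, with illustrative file paths:

# Labeling round trip sketch (file paths are illustrative).
from pathlib import Path

from gitflow_analytics.classification.classifier import CommitClassifier

classifier = CommitClassifier()
commits = [
    {
        "hash": "def5678",
        "message": "docs: update README",
        "files_changed": ["README.md"],
        "insertions": 5,
        "deletions": 1,
    },
]

# 1. Export commits to CSV; suggested_class is pre-filled when a trained
#    model exists, and manual_label is left empty for a human reviewer.
classifier.export_training_data(commits, Path("commits_to_label.csv"))

# 2. After labeling, load back only rows whose manual_label was filled in.
training_data = classifier.load_training_data(Path("commits_to_label.csv"))

# 3. Train; raises ValueError below 20 examples, and unrecognized labels
#    (e.g. "feat") are mapped to supported categories, defaulting to "chore".
results = classifier.train_model(training_data, validation_split=0.2)
print(results.get("accuracy"))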