gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4158 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +905 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +444 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1285 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
gitflow_analytics/training/pipeline.py (new file)

@@ -0,0 +1,550 @@

```python
"""Training pipeline for commit classification using PM platform data."""

import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import pandas as pd
from sqlalchemy import Column, DateTime, Float, ForeignKey, Integer, String, Text, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

from ..classification.classifier import CommitClassifier
from ..config import Config
from ..core.analyzer import GitAnalyzer
from ..core.cache import GitAnalysisCache

logger = logging.getLogger(__name__)

# Database models for training data
TrainingBase = declarative_base()


class TrainingSession(TrainingBase):
    """Store training session metadata."""

    __tablename__ = "training_sessions"

    id = Column(String, primary_key=True)
    name = Column(String)
    created_at = Column(DateTime)
    model_type = Column(String)
    training_examples = Column(Integer)
    validation_split = Column(Float)
    accuracy = Column(Float)
    precision = Column(Float)
    recall = Column(Float)
    f1_score = Column(Float)
    model_path = Column(String)
    config_hash = Column(String)


class TrainingData(TrainingBase):
    """Store individual training examples."""

    __tablename__ = "training_data"

    id = Column(Integer, primary_key=True, autoincrement=True)
    session_id = Column(String, ForeignKey("training_sessions.id"))
    commit_hash = Column(String, index=True)
    repository = Column(String)
    message = Column(Text)
    author = Column(String)
    timestamp = Column(DateTime)
    files_changed = Column(Integer)
    insertions = Column(Integer)
    deletions = Column(Integer)
    ticket_id = Column(String)
    ticket_type = Column(String)
    ticket_platform = Column(String)
    label = Column(String)
    confidence = Column(Float)
    created_at = Column(DateTime)


class CommitClassificationTrainer:
    """Train commit classification models using PM platform data."""

    # Mapping from PM ticket types to classification categories
    TICKET_TYPE_MAPPING = {
        # Bug types
        "bug": "bug_fix",
        "defect": "bug_fix",
        "issue": "bug_fix",
        "incident": "bug_fix",
        "problem": "bug_fix",
        # Feature types
        "feature": "feature",
        "story": "feature",
        "user story": "feature",
        "new feature": "feature",
        "enhancement": "feature",
        "epic": "feature",
        "historia": "feature",  # EWTN custom type (Spanish for Story)
        # Task/maintenance types
        "task": "maintenance",
        "chore": "maintenance",
        "subtask": "maintenance",
        "sub-task": "maintenance",
        # Documentation types
        "documentation": "documentation",
        "docs": "documentation",
        # Improvement/refactoring types
        "improvement": "refactor",
        "refactoring": "refactor",
        "technical debt": "refactor",
        "optimization": "refactor",
        # Test types
        "test": "test",
        "testing": "test",
        "qa": "test",
        # Other types
        "security": "security",
        "hotfix": "hotfix",
        "research": "other",
        "spike": "other",
    }

    def __init__(
        self,
        config: Config,
        cache: GitAnalysisCache,
        orchestrator: Any,
        training_config: Optional[dict[str, Any]] = None,
    ):
        """Initialize the training pipeline.

        Args:
            config: GitFlow Analytics configuration
            cache: Cache instance
            orchestrator: Integration orchestrator with PM platforms
            training_config: Training-specific configuration
        """
        self.config = config
        self.cache = cache
        self.orchestrator = orchestrator
        self.training_config = training_config or {}

        # Initialize database for training data
        self.db_path = cache.cache_dir / "training_data.db"
        self.engine = create_engine(f"sqlite:///{self.db_path}")
        TrainingBase.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

        # Initialize classifier
        self.classifier = CommitClassifier(
            config=(
                config.analysis.commit_classification.__dict__
                if hasattr(config.analysis, "commit_classification")
                else {}
            ),
            cache_dir=cache.cache_dir,
        )

        logger.info(f"Initialized training pipeline with cache at {cache.cache_dir}")

    def train(
        self, repositories: list[Any], since: datetime, session_name: Optional[str] = None
    ) -> dict[str, Any]:
        """Train a classification model using PM platform data.

        Args:
            repositories: List of repository configurations
            since: Start date for commit extraction
            session_name: Optional name for this training session

        Returns:
            Training results dictionary
        """
        session_id = self._create_training_session(session_name)

        try:
            # Step 1: Extract commits with ticket references
            logger.info("Extracting commits with ticket references...")
            labeled_commits = self._extract_labeled_commits(repositories, since)

            if len(labeled_commits) < self.training_config.get("min_training_examples", 50):
                raise ValueError(
                    f"Insufficient training data: {len(labeled_commits)} examples found, "
                    f"minimum {self.training_config.get('min_training_examples', 50)} required"
                )

            # Step 2: Store training data
            logger.info(f"Storing {len(labeled_commits)} training examples...")
            self._store_training_data(session_id, labeled_commits)

            # Step 3: Train the model
            logger.info("Training classification model...")
            training_data = [(commit["commit_data"], commit["label"]) for commit in labeled_commits]
            results = self.classifier.train_model(
                training_data, validation_split=self.training_config.get("validation_split", 0.2)
            )

            # Step 4: Update session with results
            self._update_training_session(session_id, results, len(labeled_commits))

            # Step 5: Save training data CSV if requested
            if self.training_config.get("save_training_data", False):
                self._export_training_data(session_id)

            return {
                "session_id": session_id,
                "training_examples": len(labeled_commits),
                "accuracy": results.get("accuracy", 0.0),
                "results": results,
            }

        except Exception as e:
            logger.error(f"Training failed: {e}")
            self._mark_session_failed(session_id, str(e))
            raise

    def _extract_labeled_commits(
        self, repositories: list[Any], since: datetime
    ) -> list[dict[str, Any]]:
        """Extract commits with PM platform labels.

        Args:
            repositories: List of repository configurations
            since: Start date for commit extraction

        Returns:
            List of labeled commit dictionaries
        """
        labeled_commits = []
        analyzer = GitAnalyzer(
            self.cache,
            batch_size=getattr(self.config.analysis, "batch_size", 1000),
            allowed_ticket_platforms=getattr(
                self.config.analysis, "allowed_ticket_platforms", None
            ),
            story_point_patterns=getattr(self.config.analysis, "story_point_patterns", None),
        )

        for repo_config in repositories:
            if not repo_config.path.exists():
                logger.warning(f"Repository path does not exist: {repo_config.path}")
                continue

            logger.info(f"Analyzing repository: {repo_config.path}")

            # Extract commits
            try:
                commits = analyzer.analyze_repository(
                    repo_config.path, since=since, branch=repo_config.branch
                )

                # Filter commits with ticket references
                for commit in commits:
                    ticket_refs = commit.get("ticket_references", [])
                    if not ticket_refs:
                        continue

                    # Get ticket data from PM platforms
                    ticket_data = self._fetch_ticket_data(ticket_refs)
                    if not ticket_data:
                        continue

                    # Determine label from ticket type
                    label = self._determine_label(ticket_data)
                    if label:
                        # Normalize commit data to ensure files_changed is a list
                        normalized_commit = self._normalize_commit_data(commit)
                        labeled_commits.append(
                            {
                                "commit_data": normalized_commit,
                                "ticket_data": ticket_data,
                                "label": label,
                                "repository": repo_config.name,
                            }
                        )

            except Exception as e:
                logger.error(f"Failed to analyze repository {repo_config.path}: {e}")
                continue

        return labeled_commits

    def _normalize_commit_data(self, commit: dict[str, Any]) -> dict[str, Any]:
        """Normalize commit data to ensure consistency.

        Args:
            commit: Original commit data

        Returns:
            Normalized commit data with files_changed as a list
        """
        normalized = commit.copy()

        # Ensure files_changed is a list
        files_changed = commit.get("files_changed", [])
        if isinstance(files_changed, int):
            # If it's an integer count, we can't reconstruct the file list
            # Store the count separately and use empty list for files
            normalized["files_changed_count"] = files_changed
            normalized["files_changed"] = []
        elif isinstance(files_changed, list):
            # If it's already a list, keep it and also store the count
            normalized["files_changed"] = files_changed
            normalized["files_changed_count"] = len(files_changed)
        else:
            # Fallback for unexpected types
            normalized["files_changed"] = []
            normalized["files_changed_count"] = 0

        return normalized

    def _fetch_ticket_data(self, ticket_refs: list[dict[str, str]]) -> list[dict[str, Any]]:
        """Fetch ticket data from PM platforms.

        Args:
            ticket_refs: List of ticket references

        Returns:
            List of ticket data dictionaries
        """
        if not self.orchestrator.pm_orchestrator:
            return []

        # Get list of configured platforms
        configured_platforms = self.orchestrator.pm_orchestrator.get_active_platforms()
        ticket_data = []

        for ref in ticket_refs:
            platform = ref.get("platform", "")
            ticket_id = ref.get("id", "")

            if not platform or not ticket_id:
                continue

            # Skip platforms that aren't configured
            if platform not in configured_platforms:
                logger.debug(f"Skipping ticket {ticket_id} from unconfigured platform {platform}")
                continue

            try:
                # Fetch ticket from PM platform
                tickets = self.orchestrator.pm_orchestrator.get_issues_by_keys(
                    platform, [ticket_id]
                )

                if tickets and ticket_id in tickets:
                    ticket = tickets[ticket_id]
                    ticket_data.append(
                        {
                            "id": ticket_id,
                            "platform": platform,
                            "type": ticket.issue_type.value if ticket.issue_type else "unknown",
                            "title": ticket.title,
                            "status": ticket.status.value if ticket.status else "unknown",
                        }
                    )

            except Exception as e:
                logger.warning(f"Failed to fetch ticket {ticket_id} from {platform}: {e}")
                continue

        return ticket_data

    def _determine_label(self, ticket_data: list[dict[str, Any]]) -> Optional[str]:
        """Determine classification label from ticket data.

        Args:
            ticket_data: List of ticket data dictionaries

        Returns:
            Classification label or None
        """
        if not ticket_data:
            return None

        # Count ticket types
        type_counts = {}
        for ticket in ticket_data:
            ticket_type = ticket.get("type", "").lower()
            mapped_type = self.TICKET_TYPE_MAPPING.get(ticket_type, None)

            if mapped_type:
                type_counts[mapped_type] = type_counts.get(mapped_type, 0) + 1

        if not type_counts:
            return None

        # Return most common type
        return max(type_counts.items(), key=lambda x: x[1])[0]

    def _create_training_session(self, name: Optional[str] = None) -> str:
        """Create a new training session.

        Args:
            name: Optional session name

        Returns:
            Session ID
        """
        import uuid

        session_id = str(uuid.uuid4())
        session = TrainingSession(
            id=session_id,
            name=name or f"training_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            created_at=datetime.now(timezone.utc),
            model_type=self.training_config.get("model_type", "random_forest"),
            validation_split=self.training_config.get("validation_split", 0.2),
        )

        with self.Session() as db_session:
            db_session.add(session)
            db_session.commit()

        return session_id

    def _store_training_data(self, session_id: str, labeled_commits: list[dict[str, Any]]) -> None:
        """Store training data in database.

        Args:
            session_id: Training session ID
            labeled_commits: List of labeled commit data
        """
        with self.Session() as db_session:
            for item in labeled_commits:
                commit = item["commit_data"]
                ticket_data = item["ticket_data"]

                # Use first ticket for primary data
                primary_ticket = ticket_data[0] if ticket_data else {}

                # Handle files_changed being either int or list
                files_changed_value = commit.get("files_changed", 0)
                if isinstance(files_changed_value, int):
                    files_changed_count = files_changed_value
                elif isinstance(files_changed_value, list):
                    files_changed_count = len(files_changed_value)
                else:
                    files_changed_count = 0

                training_example = TrainingData(
                    session_id=session_id,
                    commit_hash=commit.get("hash", ""),
                    repository=item["repository"],
                    message=commit.get("message", ""),
                    author=commit.get("author_name", ""),
                    timestamp=commit.get("timestamp"),
                    files_changed=files_changed_count,
                    insertions=commit.get("insertions", 0),
                    deletions=commit.get("deletions", 0),
                    ticket_id=primary_ticket.get("id", ""),
                    ticket_type=primary_ticket.get("type", ""),
                    ticket_platform=primary_ticket.get("platform", ""),
                    label=item["label"],
                    confidence=1.0,  # High confidence for PM-based labels
                    created_at=datetime.now(timezone.utc),
                )

                db_session.add(training_example)

            db_session.commit()

    def _update_training_session(
        self, session_id: str, results: dict[str, Any], num_examples: int
    ) -> None:
        """Update training session with results.

        Args:
            session_id: Training session ID
            results: Training results
            num_examples: Number of training examples
        """
        with self.Session() as db_session:
            session = db_session.query(TrainingSession).filter_by(id=session_id).first()
            if session:
                session.training_examples = num_examples
                session.accuracy = results.get("accuracy", 0.0)
                session.precision = results.get("precision", 0.0)
                session.recall = results.get("recall", 0.0)
                session.f1_score = results.get("f1_score", 0.0)
                session.model_path = str(self.classifier.model_path)
                db_session.commit()

    def _mark_session_failed(self, session_id: str, error: str) -> None:
        """Mark a training session as failed.

        Args:
            session_id: Training session ID
            error: Error message
        """
        with self.Session() as db_session:
            session = db_session.query(TrainingSession).filter_by(id=session_id).first()
            if session:
                session.accuracy = -1.0  # Indicates failure
                db_session.commit()

    def _export_training_data(self, session_id: str) -> Path:
        """Export training data to CSV.

        Args:
            session_id: Training session ID

        Returns:
            Path to exported CSV file
        """
        output_path = self.cache.cache_dir / f"training_data_{session_id[:8]}.csv"

        with self.Session() as db_session:
            data = db_session.query(TrainingData).filter_by(session_id=session_id).all()

            rows = []
            for item in data:
                rows.append(
                    {
                        "commit_hash": item.commit_hash,
                        "repository": item.repository,
                        "message": item.message,
                        "author": item.author,
                        "timestamp": item.timestamp,
                        "files_changed": item.files_changed,
                        "insertions": item.insertions,
                        "deletions": item.deletions,
                        "ticket_id": item.ticket_id,
                        "ticket_type": item.ticket_type,
                        "ticket_platform": item.ticket_platform,
                        "label": item.label,
                        "confidence": item.confidence,
                    }
                )

        df = pd.DataFrame(rows)
        df.to_csv(output_path, index=False)

        logger.info(f"Exported training data to {output_path}")

        return output_path

    def get_training_history(self) -> list[dict[str, Any]]:
        """Get history of training sessions.

        Returns:
            List of training session summaries
        """
        with self.Session() as db_session:
            sessions = (
                db_session.query(TrainingSession).order_by(TrainingSession.created_at.desc()).all()
            )

            history = []
            for session in sessions:
                history.append(
                    {
                        "id": session.id,
                        "name": session.name,
                        "created_at": session.created_at,
                        "model_type": session.model_type,
                        "training_examples": session.training_examples,
                        "accuracy": session.accuracy,
                        "precision": session.precision,
                        "recall": session.recall,
                        "f1_score": session.f1_score,
                    }
                )

        return history
```