gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between these versions as published in their respective public registries.
- gitflow_analytics/__init__.py +11 -11
- gitflow_analytics/_version.py +2 -2
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4490 -378
- gitflow_analytics/cli_rich.py +503 -0
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -398
- gitflow_analytics/core/analyzer.py +1320 -172
- gitflow_analytics/core/branch_mapper.py +132 -132
- gitflow_analytics/core/cache.py +1554 -175
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +571 -185
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/base.py +13 -11
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +77 -59
- gitflow_analytics/extractors/tickets.py +841 -89
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +258 -87
- gitflow_analytics/integrations/jira_integration.py +572 -123
- gitflow_analytics/integrations/orchestrator.py +206 -82
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +542 -179
- gitflow_analytics/models/database.py +986 -59
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +29 -0
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
- gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
- gitflow_analytics/qualitative/core/__init__.py +13 -0
- gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
- gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
- gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
- gitflow_analytics/qualitative/core/processor.py +673 -0
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +25 -0
- gitflow_analytics/qualitative/models/schemas.py +306 -0
- gitflow_analytics/qualitative/utils/__init__.py +13 -0
- gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
- gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
- gitflow_analytics/qualitative/utils/metrics.py +361 -0
- gitflow_analytics/qualitative/utils/text_processing.py +285 -0
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +550 -18
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1700 -216
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2289 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +5 -0
- gitflow_analytics/tui/app.py +724 -0
- gitflow_analytics/tui/screens/__init__.py +8 -0
- gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
- gitflow_analytics/tui/screens/configuration_screen.py +523 -0
- gitflow_analytics/tui/screens/loading_screen.py +348 -0
- gitflow_analytics/tui/screens/main_screen.py +321 -0
- gitflow_analytics/tui/screens/results_screen.py +722 -0
- gitflow_analytics/tui/widgets/__init__.py +7 -0
- gitflow_analytics/tui/widgets/data_table.py +255 -0
- gitflow_analytics/tui/widgets/export_modal.py +301 -0
- gitflow_analytics/tui/widgets/progress_widget.py +187 -0
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
- gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0

gitflow_analytics/classification/batch_classifier.py
@@ -0,0 +1,752 @@
"""Batch LLM classifier for intelligent commit categorization with context.

This module implements the second step of the two-step fetch/analyze process,
providing intelligent batch classification of commits using LLM with ticket context.
"""

import logging
import uuid
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Optional

from ..core.progress import get_progress_service
from ..models.database import CachedCommit, DailyCommitBatch, Database, DetailedTicketData
from ..qualitative.classifiers.llm_commit_classifier import LLMCommitClassifier, LLMConfig

logger = logging.getLogger(__name__)


class BatchCommitClassifier:
    """Intelligent batch classifier using LLM with ticket context.

    WHY: This class implements the second step of the two-step process by:
    - Reading cached commit data organized by day/week
    - Adding ticket context to improve classification accuracy
    - Sending batches of commits to LLM for intelligent classification
    - Falling back to rule-based classification when LLM fails
    - Storing results with confidence tracking

    DESIGN DECISION: Uses batch processing to reduce API calls and costs
    while providing better context for classification accuracy.

    PROGRESS REPORTING: Provides granular progress feedback with nested progress bars:
    - Repository level: Shows which repository is being processed (position 0)
    - Weekly level: Shows week being processed within repository (position 1)
    - API batch level: Shows LLM API batches being processed (position 2)
    Each level shows commit counts and progress indicators for user feedback.
    """

    def __init__(
        self,
        cache_dir: Path,
        llm_config: Optional[dict[str, Any]] = None,
        batch_size: int = 50,
        confidence_threshold: float = 0.7,
        fallback_enabled: bool = True,
    ):
        """Initialize the batch classifier.

        Args:
            cache_dir: Path to cache directory containing database
            llm_config: Configuration for LLM classifier
            batch_size: Number of commits per batch (max 50 for token limits)
            confidence_threshold: Minimum confidence for LLM classification
            fallback_enabled: Whether to fall back to rule-based classification
        """
        self.cache_dir = cache_dir
        self.database = Database(cache_dir / "gitflow_cache.db")
        self.batch_size = min(batch_size, 50)  # Limit for token constraints
        self.confidence_threshold = confidence_threshold
        self.fallback_enabled = fallback_enabled

        # Initialize LLM classifier
        # Handle different config types
        if isinstance(llm_config, dict):
            # Convert dict config to LLMConfig object
            llm_config_obj = LLMConfig(
                api_key=llm_config.get("api_key", ""),
                model=llm_config.get("model", "mistralai/mistral-7b-instruct"),
                max_tokens=llm_config.get("max_tokens", 50),
                temperature=llm_config.get("temperature", 0.1),
                confidence_threshold=llm_config.get("confidence_threshold", 0.7),
                timeout_seconds=llm_config.get("timeout_seconds", 30),
                cache_duration_days=llm_config.get("cache_duration_days", 7),
                enable_caching=llm_config.get("enable_caching", True),
                max_daily_requests=llm_config.get("max_daily_requests", 1000),
            )
        elif hasattr(llm_config, "api_key"):
            # Use provided config object (e.g., mock config for testing)
            llm_config_obj = llm_config
        else:
            # Use default LLMConfig
            llm_config_obj = LLMConfig()

        self.llm_classifier = LLMCommitClassifier(config=llm_config_obj, cache_dir=cache_dir)
        logger.info(
            f"LLM Classifier initialized with API key: {'Yes' if llm_config_obj.api_key else 'No'}"
        )

        # Rule-based fallback patterns for when LLM fails
        self.fallback_patterns = {
            "feature": [
                r"feat(?:ure)?[\(\:]",
                r"add(?:ed|ing)?.*(?:feature|functionality|capability)",
                r"implement(?:ed|ing|s)?",
                r"introduce(?:d|s)?",
            ],
            "bug_fix": [
                r"fix(?:ed|es|ing)?[\(\:]",
                r"bug[\(\:]",
                r"resolve(?:d|s)?",
                r"repair(?:ed|ing|s)?",
                r"correct(?:ed|ing|s)?",
            ],
            "refactor": [
                r"refactor(?:ed|ing|s)?[\(\:]",
                r"restructure(?:d|ing|s)?",
                r"optimize(?:d|ing|s)?",
                r"improve(?:d|ing|s)?",
                r"clean(?:ed|ing)?\s+up",
            ],
            "documentation": [
                r"docs?[\(\:]",
                r"documentation[\(\:]",
                r"readme",
                r"update.*(?:comment|docs?|documentation)",
            ],
            "maintenance": [
                r"chore[\(\:]",
                r"maintenance[\(\:]",
                r"update.*(?:dependencies|deps)",
                r"bump.*version",
                r"cleanup",
            ],
            "test": [
                r"test(?:s|ing)?[\(\:]",
                r"spec[\(\:]",
                r"add.*(?:test|spec)",
                r"fix.*test",
            ],
            "style": [
                r"style[\(\:]",
                r"format(?:ted|ting)?[\(\:]",
                r"lint(?:ed|ing)?",
                r"prettier",
                r"whitespace",
            ],
            "build": [
                r"build[\(\:]",
                r"ci[\(\:]",
                r"deploy(?:ed|ment)?",
                r"docker",
                r"webpack",
                r"package\.json",
            ],
        }

    def classify_date_range(
        self,
        start_date: datetime,
        end_date: datetime,
        project_keys: Optional[list[str]] = None,
        force_reclassify: bool = False,
    ) -> dict[str, Any]:
        """Classify all commits in a date range using batch processing.

        Args:
            start_date: Start date for classification
            end_date: End date for classification
            project_keys: Optional list of specific projects to classify
            force_reclassify: Whether to reclassify already processed batches

        Returns:
            Dictionary containing classification results and statistics
        """
        logger.info(f"Starting batch classification from {start_date.date()} to {end_date.date()}")

        # Get daily batches to process
        batches_to_process = self._get_batches_to_process(
            start_date, end_date, project_keys, force_reclassify
        )

        if not batches_to_process:
            logger.info("No batches need classification")
            return {"processed_batches": 0, "total_commits": 0}

        # Group batches by repository first for better progress reporting
        repo_batches = self._group_batches_by_repository(batches_to_process)

        total_processed = 0
        total_commits = 0

        # Use centralized progress service
        progress = get_progress_service()

        # Add progress bar for repository processing
        with progress.progress(
            total=len(repo_batches),
            description="AI Classification",
            unit="repo",
            nested=False,
            leave=True,
        ) as repo_ctx:
            for repo_num, (repo_info, repo_batch_list) in enumerate(repo_batches.items(), 1):
                project_key, repo_path = repo_info
                repo_name = Path(repo_path).name if repo_path else project_key

                # Count commits in this repository for detailed progress
                repo_commit_count = sum(batch.commit_count for batch in repo_batch_list)

                progress.set_description(
                    repo_ctx, f"Classifying {repo_name} ({repo_commit_count} commits)"
                )
                logger.info(
                    f"Processing repository {repo_num}/{len(repo_batches)}: {repo_name} ({len(repo_batch_list)} batches, {repo_commit_count} commits)"
                )

                # Process this repository's batches by week for optimal context
                weekly_batches = self._group_batches_by_week(repo_batch_list)

                repo_processed = 0
                repo_commits_processed = 0

                # Add nested progress bar for weekly processing within repository
                with progress.progress(
                    total=len(weekly_batches),
                    description=" Processing weeks",
                    unit="week",
                    nested=True,
                    leave=False,
                ) as week_ctx:
                    for week_num, (week_start, week_batches) in enumerate(
                        weekly_batches.items(), 1
                    ):
                        progress.set_description(
                            week_ctx,
                            f" Week {week_num}/{len(weekly_batches)} ({week_start.strftime('%Y-%m-%d')})",
                        )
                        logger.info(
                            f" Processing week starting {week_start}: {len(week_batches)} daily batches"
                        )

                        week_result = self._classify_weekly_batches(week_batches)
                        repo_processed += week_result["batches_processed"]
                        repo_commits_processed += week_result["commits_processed"]

                        progress.update(week_ctx, 1)
                        # Update description to show commits processed
                        progress.set_description(
                            week_ctx,
                            f" Week {week_num}/{len(weekly_batches)} - {week_result['commits_processed']} commits",
                        )

                total_processed += repo_processed
                total_commits += repo_commits_processed

                progress.update(repo_ctx, 1)
                # Update description to show total progress
                progress.set_description(
                    repo_ctx,
                    f"AI Classification [{repo_num}/{len(repo_batches)} repos, {total_commits} commits]",
                )

                logger.info(
                    f" Repository {repo_name} completed: {repo_processed} batches, {repo_commits_processed} commits"
                )

        # Store daily metrics from classification results
        self._store_daily_metrics(start_date, end_date, project_keys)

        logger.info(
            f"Batch classification completed: {total_processed} batches, {total_commits} commits"
        )

        return {
            "processed_batches": total_processed,
            "total_commits": total_commits,
            "date_range": {"start": start_date, "end": end_date},
            "project_keys": project_keys or [],
        }

    def _get_batches_to_process(
        self,
        start_date: datetime,
        end_date: datetime,
        project_keys: Optional[list[str]],
        force_reclassify: bool,
    ) -> list[DailyCommitBatch]:
        """Get daily commit batches that need classification."""
        session = self.database.get_session()

        try:
            query = session.query(DailyCommitBatch).filter(
                DailyCommitBatch.date >= start_date.date(), DailyCommitBatch.date <= end_date.date()
            )

            if project_keys:
                query = query.filter(DailyCommitBatch.project_key.in_(project_keys))

            if not force_reclassify:
                # Only get batches that haven't been classified or failed
                query = query.filter(
                    DailyCommitBatch.classification_status.in_(["pending", "failed"])
                )

            batches = query.order_by(DailyCommitBatch.date).all()
            logger.info(f"Found {len(batches)} batches needing classification")

            # Debug: Log filtering criteria
            logger.debug(
                f"Query criteria: start_date={start_date.date()}, end_date={end_date.date()}"
            )
            if project_keys:
                logger.debug(f"Project key filter: {project_keys}")
            logger.debug(f"Force reclassify: {force_reclassify}")

            return batches

        except Exception as e:
            logger.error(f"Error getting batches to process: {e}")
            return []
        finally:
            session.close()

    def _group_batches_by_repository(
        self, batches: list[DailyCommitBatch]
    ) -> dict[tuple[str, str], list[DailyCommitBatch]]:
        """Group daily batches by repository for granular progress reporting."""
        repo_batches = defaultdict(list)

        for batch in batches:
            # Use (project_key, repo_path) as the key for unique repository identification
            repo_key = (batch.project_key, batch.repo_path)
            repo_batches[repo_key].append(batch)

        # Sort each repository's batches by date
        for batches_list in repo_batches.values():
            batches_list.sort(key=lambda b: b.date)

        return dict(repo_batches)

    def _group_batches_by_week(
        self, batches: list[DailyCommitBatch]
    ) -> dict[datetime, list[DailyCommitBatch]]:
        """Group daily batches by week for optimal context window."""
        weekly_batches = defaultdict(list)

        for batch in batches:
            # Get Monday of the week
            batch_date = datetime.combine(batch.date, datetime.min.time())
            days_since_monday = batch_date.weekday()
            week_start = batch_date - timedelta(days=days_since_monday)

            weekly_batches[week_start].append(batch)

        # Sort each week's batches by date
        for week_batches in weekly_batches.values():
            week_batches.sort(key=lambda b: b.date)

        return dict(weekly_batches)

    def _classify_weekly_batches(self, weekly_batches: list[DailyCommitBatch]) -> dict[str, Any]:
        """Classify all batches for a single week with shared context."""
        session = self.database.get_session()
        batches_processed = 0
        commits_processed = 0

        try:
            # Collect all commits for the week
            week_commits = []
            batch_commit_map = {}  # Maps commit hash to batch

            for batch in weekly_batches:
                # Mark batch as processing
                batch.classification_status = "processing"

                # Get commits for this day
                daily_commits = self._get_commits_for_batch(session, batch)
                week_commits.extend(daily_commits)

                # Track which batch each commit belongs to
                for commit in daily_commits:
                    batch_commit_map[commit["commit_hash"]] = batch

            if not week_commits:
                logger.warning(
                    f"No commits found for weekly batches (expected {sum(batch.commit_count for batch in weekly_batches)} commits)"
                )
                # Mark batches as failed due to missing commits
                for batch in weekly_batches:
                    batch.classification_status = "failed"
                    batch.classified_at = datetime.utcnow()
                session.commit()
                return {"batches_processed": 0, "commits_processed": 0}

            # Get ticket context for the week
            week_tickets = self._get_ticket_context_for_commits(session, week_commits)

            # Process commits in batches (respecting API limits)
            classified_commits = []
            num_batches = (len(week_commits) + self.batch_size - 1) // self.batch_size

            # Use centralized progress service for batch processing
            progress = get_progress_service()

            # Add progress bar for batch processing within the week
            with progress.progress(
                total=num_batches,
                description=" Processing batches",
                unit="batch",
                nested=True,
                leave=False,
            ) as batch_ctx:
                for i in range(0, len(week_commits), self.batch_size):
                    batch_num = i // self.batch_size + 1
                    batch_commits = week_commits[i : i + self.batch_size]
                    progress.set_description(
                        batch_ctx,
                        f" API batch {batch_num}/{num_batches} ({len(batch_commits)} commits)",
                    )
                    logger.info(f"Classifying batch {batch_num}: {len(batch_commits)} commits")

                    # Classify this batch with LLM
                    batch_results = self._classify_commit_batch_with_llm(
                        batch_commits, week_tickets
                    )
                    classified_commits.extend(batch_results)

                    progress.update(batch_ctx, 1)
                    # Update description to show total classified commits
                    progress.set_description(
                        batch_ctx,
                        f" API batch {batch_num}/{num_batches} - Total: {len(classified_commits)} commits",
                    )

            # Store classification results
            for commit_result in classified_commits:
                self._store_commit_classification(session, commit_result)
                commits_processed += 1

            # Mark all daily batches as completed
            for batch in weekly_batches:
                batch.classification_status = "completed"
                batch.classified_at = datetime.utcnow()
                batches_processed += 1

            session.commit()

            logger.info(
                f"Week classification completed: {batches_processed} batches, {commits_processed} commits"
            )

        except Exception as e:
            logger.error(f"Error in weekly batch classification: {e}")
            # Mark batches as failed
            for batch in weekly_batches:
                batch.classification_status = "failed"
            session.rollback()
        finally:
            session.close()

        return {
            "batches_processed": batches_processed,
            "commits_processed": commits_processed,
        }

    def _get_commits_for_batch(self, session: Any, batch: DailyCommitBatch) -> list[dict[str, Any]]:
        """Get all commits for a daily batch."""
        try:
            # Get cached commits for this batch
            # CRITICAL FIX: CachedCommit.timestamp is timezone-aware UTC (from analyzer.py line 806)
            # but we were creating timezone-naive boundaries, causing comparison to fail
            # Create timezone-aware UTC boundaries to match CachedCommit.timestamp format
            start_of_day = datetime.combine(batch.date, datetime.min.time(), tzinfo=timezone.utc)
            end_of_day = datetime.combine(batch.date, datetime.max.time(), tzinfo=timezone.utc)

            logger.debug(
                f"Searching for commits in {batch.repo_path} between {start_of_day} and {end_of_day}"
            )

            commits = (
                session.query(CachedCommit)
                .filter(
                    CachedCommit.repo_path == batch.repo_path,
                    CachedCommit.timestamp >= start_of_day,
                    CachedCommit.timestamp < end_of_day,
                )
                .all()
            )

            logger.debug(f"Found {len(commits)} commits for batch on {batch.date}")

            commit_list = []
            for commit in commits:
                commit_data = {
                    "commit_hash": commit.commit_hash,
                    "commit_hash_short": commit.commit_hash[:7],
                    "message": commit.message,
                    "author_name": commit.author_name,
                    "author_email": commit.author_email,
                    "timestamp": commit.timestamp,
                    "branch": commit.branch,
                    "project_key": batch.project_key,
                    "repo_path": commit.repo_path,
                    "files_changed": commit.files_changed or 0,
                    "lines_added": commit.insertions or 0,
                    "lines_deleted": commit.deletions or 0,
                    "story_points": commit.story_points,
                    "ticket_references": commit.ticket_references or [],
                }
                commit_list.append(commit_data)

            return commit_list

        except Exception as e:
            logger.error(f"Error getting commits for batch {batch.id}: {e}")
            return []

    def _get_ticket_context_for_commits(
        self, session: Any, commits: list[dict[str, Any]]
    ) -> dict[str, dict[str, Any]]:
        """Get ticket context for a list of commits."""
        # Extract all ticket references from commits
        all_ticket_ids = set()
        for commit in commits:
            ticket_refs = commit.get("ticket_references", [])
            all_ticket_ids.update(ticket_refs)

        if not all_ticket_ids:
            return {}

        try:
            # Get detailed ticket information
            tickets = (
                session.query(DetailedTicketData)
                .filter(DetailedTicketData.ticket_id.in_(all_ticket_ids))
                .all()
            )

            ticket_context = {}
            for ticket in tickets:
                ticket_context[ticket.ticket_id] = {
                    "title": ticket.title,
                    "description": (
                        ticket.summary or ticket.description[:200] if ticket.description else ""
                    ),
                    "ticket_type": ticket.ticket_type,
                    "status": ticket.status,
                    "labels": ticket.labels or [],
                    "classification_hints": ticket.classification_hints or [],
                    "business_domain": ticket.business_domain,
                }

            logger.info(f"Retrieved context for {len(ticket_context)} tickets")
            return ticket_context

        except Exception as e:
            logger.error(f"Error getting ticket context: {e}")
            return {}

    def _classify_commit_batch_with_llm(
        self,
        commits: list[dict[str, Any]],
        ticket_context: dict[str, dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Classify a batch of commits using LLM with ticket context."""
        batch_id = str(uuid.uuid4())
        logger.info(f"Starting LLM classification for batch {batch_id} with {len(commits)} commits")

        # Prepare batch for LLM classification
        enhanced_commits = []
        for commit in commits:
            enhanced_commit = commit.copy()

            # Add ticket context to commit
            ticket_refs = commit.get("ticket_references", [])
            relevant_tickets = []
            for ticket_id in ticket_refs:
                if ticket_id in ticket_context:
                    relevant_tickets.append(ticket_context[ticket_id])

            enhanced_commit["ticket_context"] = relevant_tickets
            enhanced_commits.append(enhanced_commit)

        try:
            # Use LLM classifier with enhanced context
            llm_results = self.llm_classifier.classify_commits_batch(
                enhanced_commits, batch_id=batch_id, include_confidence=True
            )

            # Process LLM results and add fallbacks
            processed_results = []
            for _i, (commit, llm_result) in enumerate(zip(commits, llm_results)):
                confidence = llm_result.get("confidence", 0.0)
                predicted_category = llm_result.get("category", "other")

                # Apply confidence threshold and fallback
                if confidence < self.confidence_threshold and self.fallback_enabled:
                    fallback_category = self._fallback_classify_commit(commit)
                    processed_results.append(
                        {
                            "commit_hash": commit["commit_hash"],
                            "category": fallback_category,
                            "confidence": 0.5,  # Medium confidence for rule-based
                            "method": "fallback",
                            "llm_category": predicted_category,
                            "llm_confidence": confidence,
                            "batch_id": batch_id,
                        }
                    )
                else:
                    processed_results.append(
                        {
                            "commit_hash": commit["commit_hash"],
                            "category": predicted_category,
                            "confidence": confidence,
                            "method": "llm",
                            "batch_id": batch_id,
                        }
                    )

            logger.info(
                f"LLM classification completed for batch {batch_id}: {len(processed_results)} commits"
            )
            return processed_results

        except Exception as e:
            logger.error(f"LLM classification failed for batch {batch_id}: {e}")

            # Fall back to rule-based classification for entire batch
            if self.fallback_enabled:
                fallback_results = []
                for commit in commits:
                    category = self._fallback_classify_commit(commit)
                    fallback_results.append(
                        {
                            "commit_hash": commit["commit_hash"],
                            "category": category,
                            "confidence": 0.3,  # Low confidence for fallback
                            "method": "fallback_only",
                            "error": str(e),
                            "batch_id": batch_id,
                        }
                    )

                logger.info(f"Fallback classification completed for batch {batch_id}")
                return fallback_results

            return []

    def _fallback_classify_commit(self, commit: dict[str, Any]) -> str:
        """Classify commit using rule-based patterns."""
        import re

        message = commit.get("message", "").lower()

        # Check patterns in order of specificity
        for category, patterns in self.fallback_patterns.items():
            for pattern in patterns:
                if re.search(pattern, message, re.IGNORECASE):
                    return category

        # Default category
        return "other"

    def _store_commit_classification(
        self, session: Any, classification_result: dict[str, Any]
    ) -> None:
        """Store classification result in cached commit record."""
        try:
            commit_hash = classification_result["commit_hash"]

            # Find the cached commit record
            cached_commit = (
                session.query(CachedCommit).filter(CachedCommit.commit_hash == commit_hash).first()
            )

            if cached_commit:
                # Store classification in ticket_references as temporary solution
                # In production, this would go in a separate classification table
                if not hasattr(cached_commit, "classification_data"):
                    cached_commit.ticket_references = cached_commit.ticket_references or []

                # Add classification data to the record
                # Note: This is a simplified approach - in production you'd want a separate table
                {
                    "category": classification_result["category"],
                    "confidence": classification_result["confidence"],
                    "method": classification_result["method"],
                    "classified_at": datetime.utcnow().isoformat(),
                    "batch_id": classification_result.get("batch_id"),
                }

                # Store in a JSON field or separate table in production
                logger.debug(
                    f"Classified commit {commit_hash[:7]} as {classification_result['category']}"
                )

        except Exception as e:
            logger.error(
                f"Error storing classification for {classification_result.get('commit_hash', 'unknown')}: {e}"
            )

    def _store_daily_metrics(
        self,
        start_date: datetime,
        end_date: datetime,
        project_keys: Optional[list[str]],
    ) -> None:
        """Store aggregated daily metrics from classification results."""
        from ..core.metrics_storage import DailyMetricsStorage

        try:
            DailyMetricsStorage(self.cache_dir / "gitflow_cache.db")

            # This would typically aggregate from the classification results
            # For now, we'll let the existing system handle this
            logger.info("Daily metrics storage integration placeholder")

        except Exception as e:
            logger.error(f"Error storing daily metrics: {e}")

    def get_classification_status(
        self,
        start_date: datetime,
        end_date: datetime,
        project_keys: Optional[list[str]] = None,
    ) -> dict[str, Any]:
        """Get classification status for a date range."""
        session = self.database.get_session()

        try:
            query = session.query(DailyCommitBatch).filter(
                DailyCommitBatch.date >= start_date.date(), DailyCommitBatch.date <= end_date.date()
            )

            if project_keys:
                query = query.filter(DailyCommitBatch.project_key.in_(project_keys))

            batches = query.all()

            status_counts = defaultdict(int)
            total_commits = 0

            for batch in batches:
                status_counts[batch.classification_status] += 1
                total_commits += batch.commit_count

            return {
                "total_batches": len(batches),
                "total_commits": total_commits,
                "status_breakdown": dict(status_counts),
                "completion_rate": status_counts["completed"] / len(batches) if batches else 0.0,
                "date_range": {"start": start_date, "end": end_date},
            }

        except Exception as e:
            logger.error(f"Error getting classification status: {e}")
            return {}
        finally:
            session.close()
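
Usage sketch: a minimal driver for the BatchCommitClassifier shown above, assuming the fetch step has already populated gitflow_cache.db under the cache directory; the cache path, API key, date range, and project key below are illustrative placeholders, not values taken from the package.

from datetime import datetime, timedelta, timezone
from pathlib import Path

from gitflow_analytics.classification.batch_classifier import BatchCommitClassifier

# Illustrative values; adjust to your environment.
cache_dir = Path(".gitflow-cache")  # must already contain gitflow_cache.db from the fetch step
llm_config = {
    "api_key": "YOUR_API_KEY",  # placeholder; converted into an LLMConfig by the constructor
    "model": "mistralai/mistral-7b-instruct",
    "confidence_threshold": 0.7,
}

classifier = BatchCommitClassifier(
    cache_dir=cache_dir,
    llm_config=llm_config,
    batch_size=50,               # capped at 50 commits per API batch
    confidence_threshold=0.7,
    fallback_enabled=True,       # use the regex patterns when the LLM fails or is unsure
)

end = datetime.now(timezone.utc)
start = end - timedelta(weeks=4)

# Second step of the two-step fetch/analyze process: classify cached commits in the window.
result = classifier.classify_date_range(start, end, project_keys=["PROJ"])
print(result["processed_batches"], result["total_commits"])

# Per-batch status breakdown (pending/processing/completed/failed) for the same window.
print(classifier.get_classification_status(start, end, project_keys=["PROJ"]))

classify_date_range drives the nested repository/week/API-batch progress bars described in the class docstring, while get_classification_status only reads DailyCommitBatch rows and can be called before or after classification.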