gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4108 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +904 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +441 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1193 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,30 @@
1
1
  """Database models for GitFlow Analytics using SQLAlchemy."""
2
2
 
3
+ import logging
4
+ import os
5
+ import tempfile
3
6
  from datetime import datetime
4
7
  from pathlib import Path
5
-
6
- from sqlalchemy import JSON, Boolean, Column, DateTime, Float, ForeignKey, Index, Integer, String, create_engine
7
- from sqlalchemy.orm import Session, sessionmaker, declarative_base
8
8
  from typing import Any
9
9
 
10
+ from sqlalchemy import (
11
+ JSON,
12
+ Boolean,
13
+ Column,
14
+ DateTime,
15
+ Float,
16
+ ForeignKey,
17
+ Index,
18
+ Integer,
19
+ String,
20
+ create_engine,
21
+ text,
22
+ )
23
+ from sqlalchemy.exc import OperationalError
24
+ from sqlalchemy.orm import Session, declarative_base, sessionmaker
25
+
26
+ logger = logging.getLogger(__name__)
27
+
10
28
  Base: Any = declarative_base()
11
29
 
12
30
 
@@ -164,140 +182,912 @@ class IssueCache(Base):
164
182
 
165
183
  class QualitativeCommitData(Base):
166
184
  """Extended commit data with qualitative analysis results.
167
-
185
+
168
186
  This table stores the results of qualitative analysis performed on commits,
169
187
  including change type classification, domain analysis, risk assessment,
170
188
  and processing metadata.
171
189
  """
172
- __tablename__ = 'qualitative_commits'
173
-
190
+
191
+ __tablename__ = "qualitative_commits"
192
+
174
193
  # Link to existing commit
175
- commit_id = Column(Integer, ForeignKey('cached_commits.id'), primary_key=True)
176
-
194
+ commit_id = Column(Integer, ForeignKey("cached_commits.id"), primary_key=True)
195
+
177
196
  # Classification results
178
197
  change_type = Column(String, nullable=False)
179
198
  change_type_confidence = Column(Float, nullable=False)
180
- business_domain = Column(String, nullable=False)
199
+ business_domain = Column(String, nullable=False)
181
200
  domain_confidence = Column(Float, nullable=False)
182
201
  risk_level = Column(String, nullable=False)
183
202
  risk_factors = Column(JSON) # List of risk factors
184
-
203
+
185
204
  # Intent and context analysis
186
205
  intent_signals = Column(JSON) # Intent analysis results
187
206
  collaboration_patterns = Column(JSON) # Team interaction patterns
188
207
  technical_context = Column(JSON) # Technical context information
189
-
208
+
190
209
  # Processing metadata
191
210
  processing_method = Column(String, nullable=False) # 'nlp' or 'llm'
192
211
  processing_time_ms = Column(Float)
193
212
  confidence_score = Column(Float, nullable=False)
194
-
213
+
195
214
  # Timestamps
196
215
  analyzed_at = Column(DateTime, default=datetime.utcnow)
197
216
  analysis_version = Column(String, default="1.0")
198
-
217
+
199
218
  # Indexes for efficient querying
200
219
  __table_args__ = (
201
- Index('idx_change_type', 'change_type'),
202
- Index('idx_business_domain', 'business_domain'),
203
- Index('idx_risk_level', 'risk_level'),
204
- Index('idx_qualitative_confidence', 'confidence_score'),
205
- Index('idx_processing_method', 'processing_method'),
206
- Index('idx_analyzed_at', 'analyzed_at'),
220
+ Index("idx_change_type", "change_type"),
221
+ Index("idx_business_domain", "business_domain"),
222
+ Index("idx_risk_level", "risk_level"),
223
+ Index("idx_qualitative_confidence", "confidence_score"),
224
+ Index("idx_processing_method", "processing_method"),
225
+ Index("idx_analyzed_at", "analyzed_at"),
207
226
  )
208
227
 
209
228
 
210
229
  class PatternCache(Base):
211
230
  """Cache for learned patterns and classifications.
212
-
231
+
213
232
  This table stores frequently occurring patterns to avoid reprocessing
214
233
  similar commits and to improve classification accuracy over time.
215
234
  """
216
- __tablename__ = 'pattern_cache'
217
-
235
+
236
+ __tablename__ = "pattern_cache"
237
+
218
238
  id = Column(Integer, primary_key=True)
219
-
239
+
220
240
  # Pattern identification
221
241
  message_hash = Column(String, nullable=False, unique=True)
222
242
  semantic_fingerprint = Column(String, nullable=False)
223
-
243
+
224
244
  # Cached classification results
225
245
  classification_result = Column(JSON, nullable=False)
226
246
  confidence_score = Column(Float, nullable=False)
227
-
247
+
228
248
  # Usage tracking for cache management
229
249
  hit_count = Column(Integer, default=1)
230
250
  last_used = Column(DateTime, default=datetime.utcnow)
231
251
  created_at = Column(DateTime, default=datetime.utcnow)
232
-
252
+
233
253
  # Source tracking
234
254
  source_method = Column(String, nullable=False) # 'nlp' or 'llm'
235
255
  source_model = Column(String) # Model/method that created this pattern
236
-
256
+
237
257
  # Performance tracking
238
258
  avg_processing_time_ms = Column(Float)
239
-
259
+
240
260
  # Indexes for pattern matching and cleanup
241
261
  __table_args__ = (
242
- Index('idx_semantic_fingerprint', 'semantic_fingerprint'),
243
- Index('idx_pattern_confidence', 'confidence_score'),
244
- Index('idx_hit_count', 'hit_count'),
245
- Index('idx_last_used', 'last_used'),
246
- Index('idx_source_method', 'source_method'),
262
+ Index("idx_semantic_fingerprint", "semantic_fingerprint"),
263
+ Index("idx_pattern_confidence", "confidence_score"),
264
+ Index("idx_hit_count", "hit_count"),
265
+ Index("idx_last_used", "last_used"),
266
+ Index("idx_source_method", "source_method"),
247
267
  )
248
268
 
249
269
 
250
270
  class LLMUsageStats(Base):
251
271
  """Track LLM usage statistics for cost monitoring and optimization.
252
-
272
+
253
273
  This table helps monitor LLM API usage, costs, and performance to
254
274
  optimize the balance between speed, accuracy, and cost.
255
275
  """
256
- __tablename__ = 'llm_usage_stats'
257
-
276
+
277
+ __tablename__ = "llm_usage_stats"
278
+
258
279
  id = Column(Integer, primary_key=True)
259
-
280
+
260
281
  # API call metadata
261
282
  model_name = Column(String, nullable=False)
262
- api_provider = Column(String, default='openrouter')
283
+ api_provider = Column(String, default="openrouter")
263
284
  timestamp = Column(DateTime, default=datetime.utcnow)
264
-
285
+
265
286
  # Usage metrics
266
287
  input_tokens = Column(Integer, nullable=False)
267
288
  output_tokens = Column(Integer, nullable=False)
268
289
  processing_time_ms = Column(Float, nullable=False)
269
-
290
+
270
291
  # Cost tracking
271
292
  estimated_cost_usd = Column(Float)
272
293
  cost_per_token = Column(Float)
273
-
294
+
274
295
  # Batch information
275
296
  batch_size = Column(Integer, default=1) # Number of commits processed
276
297
  batch_id = Column(String) # Group related calls
277
-
298
+
278
299
  # Quality metrics
279
300
  avg_confidence_score = Column(Float)
280
301
  success = Column(Boolean, default=True)
281
302
  error_message = Column(String)
282
-
303
+
283
304
  # Indexes for analysis and monitoring
284
305
  __table_args__ = (
285
- Index('idx_model_timestamp', 'model_name', 'timestamp'),
286
- Index('idx_llm_timestamp', 'timestamp'),
287
- Index('idx_batch_id', 'batch_id'),
288
- Index('idx_success', 'success'),
306
+ Index("idx_model_timestamp", "model_name", "timestamp"),
307
+ Index("idx_llm_timestamp", "timestamp"),
308
+ Index("idx_llm_batch_id", "batch_id"),
309
+ Index("idx_success", "success"),
310
+ )
311
+
312
+
313
+ class TrainingData(Base):
314
+ """Training data for commit classification models.
315
+
316
+ This table stores labeled training examples collected from PM platforms
317
+ and manual annotations for training and improving classification models.
318
+ """
319
+
320
+ __tablename__ = "training_data"
321
+
322
+ id = Column(Integer, primary_key=True)
323
+
324
+ # Commit identification
325
+ commit_hash = Column(String, nullable=False)
326
+ commit_message = Column(String, nullable=False)
327
+ files_changed = Column(JSON) # List of changed files
328
+ repo_path = Column(String, nullable=False)
329
+
330
+ # Classification labels
331
+ category = Column(String, nullable=False) # feature, bug_fix, refactor, etc.
332
+ confidence = Column(Float, nullable=False, default=1.0) # Label confidence (0-1)
333
+
334
+ # Source information
335
+ source_type = Column(String, nullable=False) # 'pm_platform', 'manual', 'inferred'
336
+ source_platform = Column(String) # 'jira', 'github', 'clickup', etc.
337
+ source_ticket_id = Column(String) # Original ticket/issue ID
338
+ source_ticket_type = Column(String) # Bug, Story, Task, etc.
339
+
340
+ # Training metadata
341
+ training_session_id = Column(String, nullable=False) # Groups related training data
342
+ created_at = Column(DateTime, default=datetime.utcnow)
343
+ updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
344
+
345
+ # Quality assurance
346
+ validated = Column(Boolean, default=False) # Human validation flag
347
+ validation_notes = Column(String) # Notes from validation process
348
+
349
+ # Feature extraction (for ML training)
350
+ extracted_features = Column(JSON) # Pre-computed features for ML
351
+
352
+ # Indexes for efficient querying and training
353
+ __table_args__ = (
354
+ Index("idx_training_commit_hash", "commit_hash"),
355
+ Index("idx_training_category", "category"),
356
+ Index("idx_training_source", "source_type", "source_platform"),
357
+ Index("idx_training_session", "training_session_id"),
358
+ Index("idx_training_created", "created_at"),
359
+ Index("idx_training_validated", "validated"),
360
+ Index("idx_commit_repo", "commit_hash", "repo_path", unique=True),
361
+ )
362
+
363
+
364
+ class RepositoryAnalysisStatus(Base):
365
+ """Track repository-level analysis completion status for cache-first workflow.
366
+
367
+ WHY: This table enables "fetch once, report many" behavior by tracking
368
+ which repositories have been fully analyzed for specific time periods.
369
+ Prevents re-fetching Git data when only generating different reports.
370
+ """
371
+
372
+ __tablename__ = "repository_analysis_status"
373
+
374
+ id = Column(Integer, primary_key=True)
375
+
376
+ # Repository identification
377
+ repo_path = Column(String, nullable=False)
378
+ repo_name = Column(String, nullable=False) # For display purposes
379
+ project_key = Column(String, nullable=False)
380
+
381
+ # Analysis period
382
+ analysis_start = Column(DateTime, nullable=False) # Start of analysis period
383
+ analysis_end = Column(DateTime, nullable=False) # End of analysis period
384
+ weeks_analyzed = Column(Integer, nullable=False) # Number of weeks
385
+
386
+ # Completion tracking
387
+ git_analysis_complete = Column(Boolean, default=False)
388
+ commit_count = Column(Integer, default=0)
389
+ pr_analysis_complete = Column(Boolean, default=False)
390
+ pr_count = Column(Integer, default=0)
391
+ ticket_analysis_complete = Column(Boolean, default=False)
392
+ ticket_count = Column(Integer, default=0)
393
+
394
+ # Developer identity resolution
395
+ identity_resolution_complete = Column(Boolean, default=False)
396
+ unique_developers = Column(Integer, default=0)
397
+
398
+ # Analysis metadata
399
+ last_updated = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
400
+ analysis_version = Column(String, default="2.0") # For tracking schema changes
401
+
402
+ # Configuration hash to detect config changes
403
+ config_hash = Column(String, nullable=True) # MD5 hash of relevant config
404
+
405
+ # Analysis performance metrics
406
+ processing_time_seconds = Column(Float, nullable=True)
407
+ cache_hit_rate_percent = Column(Float, nullable=True)
408
+
409
+ # Status tracking
410
+ status = Column(String, default="pending") # pending, in_progress, completed, failed
411
+ error_message = Column(String, nullable=True)
412
+
413
+ # Indexes for efficient querying
414
+ __table_args__ = (
415
+ Index("idx_repo_analysis_path", "repo_path"),
416
+ Index("idx_repo_analysis_period", "analysis_start", "analysis_end"),
417
+ Index("idx_repo_analysis_status", "status"),
418
+ Index(
419
+ "idx_repo_analysis_unique", "repo_path", "analysis_start", "analysis_end", unique=True
420
+ ),
421
+ Index("idx_repo_analysis_updated", "last_updated"),
422
+ )
423
+
424
+
425
+ class TrainingSession(Base):
426
+ """Training session metadata and results.
427
+
428
+ This table tracks individual training runs, their configurations,
429
+ and performance metrics for model versioning and comparison.
430
+ """
431
+
432
+ __tablename__ = "training_sessions"
433
+
434
+ id = Column(Integer, primary_key=True)
435
+ session_id = Column(String, unique=True, nullable=False)
436
+
437
+ # Session metadata
438
+ started_at = Column(DateTime, default=datetime.utcnow)
439
+ completed_at = Column(DateTime)
440
+ status = Column(String, default="running") # running, completed, failed
441
+
442
+ # Configuration
443
+ config = Column(JSON, nullable=False) # Training configuration
444
+ weeks_analyzed = Column(Integer) # Time period covered
445
+ repositories = Column(JSON) # List of repositories analyzed
446
+
447
+ # Data statistics
448
+ total_commits = Column(Integer, default=0)
449
+ labeled_commits = Column(Integer, default=0)
450
+ training_examples = Column(Integer, default=0)
451
+ validation_examples = Column(Integer, default=0)
452
+
453
+ # PM platform coverage
454
+ pm_platforms = Column(JSON) # List of PM platforms used
455
+ ticket_coverage_pct = Column(Float) # Percentage of commits with tickets
456
+
457
+ # Training results
458
+ model_accuracy = Column(Float) # Overall accuracy
459
+ category_metrics = Column(JSON) # Per-category precision/recall/f1
460
+ validation_loss = Column(Float) # Validation loss
461
+
462
+ # Model storage
463
+ model_path = Column(String) # Path to saved model
464
+ model_version = Column(String) # Version identifier
465
+ model_size_mb = Column(Float) # Model file size
466
+
467
+ # Performance metrics
468
+ training_time_minutes = Column(Float)
469
+ prediction_time_ms = Column(Float) # Average prediction time
470
+
471
+ # Notes and errors
472
+ notes = Column(String)
473
+ error_message = Column(String)
474
+
475
+ # Indexes for session management
476
+ __table_args__ = (
477
+ Index("idx_session_id", "session_id"),
478
+ Index("idx_session_status", "status"),
479
+ Index("idx_session_started", "started_at"),
480
+ Index("idx_session_model_version", "model_version"),
481
+ )
482
+
483
+
484
+ class ClassificationModel(Base):
485
+ """Versioned storage for trained classification models.
486
+
487
+ This table manages different versions of trained models with
488
+ metadata for model selection and performance tracking.
489
+ """
490
+
491
+ __tablename__ = "classification_models"
492
+
493
+ id = Column(Integer, primary_key=True)
494
+ model_id = Column(String, unique=True, nullable=False)
495
+
496
+ # Model metadata
497
+ name = Column(String, nullable=False)
498
+ version = Column(String, nullable=False)
499
+ model_type = Column(String, nullable=False) # 'sklearn', 'spacy', 'custom'
500
+ created_at = Column(DateTime, default=datetime.utcnow)
501
+
502
+ # Training information
503
+ training_session_id = Column(String, ForeignKey("training_sessions.session_id"))
504
+ trained_on_commits = Column(Integer, nullable=False)
505
+ training_accuracy = Column(Float, nullable=False)
506
+ validation_accuracy = Column(Float, nullable=False)
507
+
508
+ # Model performance
509
+ categories = Column(JSON, nullable=False) # List of supported categories
510
+ performance_metrics = Column(JSON) # Detailed performance metrics
511
+ feature_importance = Column(JSON) # Feature importance scores
512
+
513
+ # Model storage and configuration
514
+ model_binary = Column(JSON) # Serialized model (for small models)
515
+ model_file_path = Column(String) # Path to model file (for large models)
516
+ model_config = Column(JSON) # Model hyperparameters and settings
517
+
518
+ # Usage tracking
519
+ active = Column(Boolean, default=True) # Whether model is active
520
+ usage_count = Column(Integer, default=0) # Number of times used
521
+ last_used = Column(DateTime)
522
+
523
+ # Model validation
524
+ cross_validation_scores = Column(JSON) # Cross-validation results
525
+ test_accuracy = Column(Float) # Hold-out test set accuracy
526
+
527
+ # Indexes for model management
528
+ __table_args__ = (
529
+ Index("idx_model_id", "model_id"),
530
+ Index("idx_model_version", "version"),
531
+ Index("idx_model_active", "active"),
532
+ Index("idx_model_accuracy", "validation_accuracy"),
533
+ Index("idx_model_created", "created_at"),
534
+ )
535
+
536
+
537
+ class DailyCommitBatch(Base):
538
+ """Daily batches of commits organized for efficient data collection and retrieval.
539
+
540
+ WHY: This table enables the two-step fetch/analyze process by storing raw commit data
541
+ in daily batches with full metadata before classification. Each row represents
542
+ one day's worth of commits for a specific project, enabling efficient batch retrieval.
543
+ """
544
+
545
+ __tablename__ = "daily_commit_batches"
546
+
547
+ # Primary key components
548
+ id = Column(Integer, primary_key=True)
549
+ date = Column(DateTime, nullable=False) # Date for the commit batch (YYYY-MM-DD)
550
+ project_key = Column(String, nullable=False) # Project identifier
551
+ repo_path = Column(String, nullable=False) # Repository path for identification
552
+
553
+ # Batch metadata
554
+ commit_count = Column(Integer, default=0) # Number of commits in this batch
555
+ total_files_changed = Column(Integer, default=0)
556
+ total_lines_added = Column(Integer, default=0)
557
+ total_lines_deleted = Column(Integer, default=0)
558
+
559
+ # Developers active on this day
560
+ active_developers = Column(JSON) # List of developer canonical IDs
561
+ unique_tickets = Column(JSON) # List of ticket IDs referenced on this day
562
+
563
+ # Processing status
564
+ fetched_at = Column(DateTime, default=datetime.utcnow)
565
+ classification_status = Column(
566
+ String, default="pending"
567
+ ) # pending, processing, completed, failed
568
+ classified_at = Column(DateTime, nullable=True)
569
+
570
+ # Batch context for LLM classification
571
+ context_summary = Column(String, nullable=True) # Brief summary of day's activity
572
+
573
+ # Indexes for efficient retrieval by date range and project
574
+ __table_args__ = (
575
+ Index("idx_batch_date", "date"),
576
+ Index("idx_daily_batch_project", "project_key"),
577
+ Index("idx_batch_repo", "repo_path"),
578
+ Index("idx_daily_batch_status", "classification_status"),
579
+ Index("idx_batch_unique", "date", "project_key", "repo_path", unique=True),
580
+ Index("idx_batch_date_range", "date", "project_key"),
581
+ )
582
+
583
+
584
+ class DetailedTicketData(Base):
585
+ """Enhanced ticket storage with full metadata for context-aware classification.
586
+
587
+ WHY: The two-step process requires full ticket context (descriptions, types, etc.)
588
+ to improve classification accuracy. This extends the existing IssueCache with
589
+ fields specifically needed for classification context.
590
+ """
591
+
592
+ __tablename__ = "detailed_tickets"
593
+
594
+ id = Column(Integer, primary_key=True)
595
+
596
+ # Ticket identification (enhanced from IssueCache)
597
+ platform = Column(String, nullable=False) # 'jira', 'github', 'clickup', 'linear'
598
+ ticket_id = Column(String, nullable=False)
599
+ project_key = Column(String, nullable=False)
600
+
601
+ # Core ticket data
602
+ title = Column(String)
603
+ description = Column(String) # Full description for context
604
+ summary = Column(String) # Brief summary extracted from description
605
+ ticket_type = Column(String) # Bug, Story, Task, Epic, etc.
606
+ status = Column(String)
607
+ priority = Column(String)
608
+ labels = Column(JSON) # List of labels/tags
609
+
610
+ # People and dates
611
+ assignee = Column(String, nullable=True)
612
+ reporter = Column(String, nullable=True)
613
+ created_at = Column(DateTime)
614
+ updated_at = Column(DateTime)
615
+ resolved_at = Column(DateTime, nullable=True)
616
+
617
+ # Metrics for classification context
618
+ story_points = Column(Integer, nullable=True)
619
+ original_estimate = Column(String, nullable=True) # Time estimate
620
+ time_spent = Column(String, nullable=True)
621
+
622
+ # Relationships for context
623
+ epic_key = Column(String, nullable=True) # Parent epic
624
+ parent_key = Column(String, nullable=True) # Parent issue
625
+ subtasks = Column(JSON) # List of subtask keys
626
+ linked_issues = Column(JSON) # List of linked issue keys
627
+
628
+ # Classification hints from ticket type/labels
629
+ classification_hints = Column(JSON) # Extracted hints for commit classification
630
+ business_domain = Column(String, nullable=True) # Domain extracted from ticket
631
+
632
+ # Platform-specific data
633
+ platform_data = Column(JSON) # Additional platform-specific fields
634
+
635
+ # Fetch metadata
636
+ fetched_at = Column(DateTime, default=datetime.utcnow)
637
+ fetch_version = Column(String, default="2.0") # Version for schema evolution
638
+
639
+ # Indexes for efficient lookup and context building
640
+ __table_args__ = (
641
+ Index("idx_detailed_platform_ticket", "platform", "ticket_id", unique=True),
642
+ Index("idx_detailed_project", "project_key"),
643
+ Index("idx_detailed_type", "ticket_type"),
644
+ Index("idx_detailed_epic", "epic_key"),
645
+ Index("idx_detailed_created", "created_at"),
646
+ Index("idx_detailed_status", "status"),
647
+ )
648
+
649
+
650
+ class CommitClassificationBatch(Base):
651
+ """Batch classification results with context and confidence tracking.
652
+
653
+ WHY: This table stores the results of batch LLM classification with full
654
+ context about what information was used and confidence levels achieved.
655
+ Enables iterative improvement and debugging of classification quality.
656
+ """
657
+
658
+ __tablename__ = "classification_batches"
659
+
660
+ id = Column(Integer, primary_key=True)
661
+ batch_id = Column(String, unique=True, nullable=False) # UUID for this batch
662
+
663
+ # Batch context
664
+ project_key = Column(String, nullable=False)
665
+ week_start = Column(DateTime, nullable=False) # Monday of the week
666
+ week_end = Column(DateTime, nullable=False) # Sunday of the week
667
+ commit_count = Column(Integer, nullable=False)
668
+
669
+ # Context provided to LLM
670
+ ticket_context = Column(JSON) # Tickets included in context
671
+ developer_context = Column(JSON) # Active developers in this batch
672
+ project_context = Column(String) # Project description/domain
673
+
674
+ # LLM processing details
675
+ model_used = Column(String, nullable=False) # Model identifier
676
+ prompt_template = Column(String, nullable=False) # Template used
677
+ context_tokens = Column(Integer, default=0) # Tokens used for context
678
+ completion_tokens = Column(Integer, default=0) # Tokens in response
679
+ total_tokens = Column(Integer, default=0)
680
+
681
+ # Processing results
682
+ processing_status = Column(String, default="pending") # pending, processing, completed, failed
683
+ started_at = Column(DateTime, default=datetime.utcnow)
684
+ completed_at = Column(DateTime, nullable=True)
685
+ processing_time_ms = Column(Float, nullable=True)
686
+
687
+ # Quality metrics
688
+ avg_confidence = Column(Float, nullable=True) # Average confidence across commits
689
+ low_confidence_count = Column(Integer, default=0) # Commits with confidence < 0.7
690
+ fallback_count = Column(Integer, default=0) # Commits that fell back to rules
691
+
692
+ # Cost tracking
693
+ estimated_cost_usd = Column(Float, nullable=True)
694
+ cost_per_commit = Column(Float, nullable=True)
695
+
696
+ # Error handling
697
+ error_message = Column(String, nullable=True)
698
+ retry_count = Column(Integer, default=0)
699
+
700
+ # Indexes for batch management and analysis
701
+ __table_args__ = (
702
+ Index("idx_classification_batch_id", "batch_id"),
703
+ Index("idx_classification_batch_project", "project_key"),
704
+ Index("idx_batch_week", "week_start", "week_end"),
705
+ Index("idx_classification_batch_status", "processing_status"),
706
+ Index("idx_batch_completed", "completed_at"),
707
+ Index("idx_batch_model", "model_used"),
708
+ )
709
+
710
+
711
class CommitTicketCorrelation(Base):
    """Correlations between commits and tickets for context-aware classification.

    WHY: This table explicitly tracks which commits reference which tickets,
    enabling the batch classifier to include relevant ticket context when
    classifying related commits. Improves accuracy by providing business context.

    One row represents a single (commit, ticket) link; the unique index below
    guarantees at most one row per commit/repo/ticket/platform combination.
    """

    __tablename__ = "commit_ticket_correlations"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)

    # Commit identification - the hash alone is not unique across repositories,
    # so the repository path is part of the commit's identity here.
    commit_hash = Column(String, nullable=False)
    repo_path = Column(String, nullable=False)

    # Ticket identification
    ticket_id = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    project_key = Column(String, nullable=False)

    # Correlation metadata
    correlation_type = Column(String, default="direct")  # direct, inferred, related
    confidence = Column(Float, default=1.0)  # Confidence in correlation (1.0 = certain)
    extracted_from = Column(String, nullable=False)  # commit_message, branch_name, pr_title

    # Pattern that created this correlation
    matching_pattern = Column(String, nullable=True)  # Regex pattern that matched

    # Timestamps
    # NOTE(review): datetime.utcnow stores naive UTC timestamps and is
    # deprecated since Python 3.12 - confirm before switching to aware datetimes.
    created_at = Column(DateTime, default=datetime.utcnow)
    validated = Column(Boolean, default=False)  # Manual validation flag

    # Indexes for efficient correlation lookup; the last one enforces uniqueness
    # of each commit/ticket pairing.
    __table_args__ = (
        Index("idx_corr_commit", "commit_hash", "repo_path"),
        Index("idx_corr_ticket", "ticket_id", "platform"),
        Index("idx_corr_project", "project_key"),
        Index("idx_corr_unique", "commit_hash", "repo_path", "ticket_id", "platform", unique=True),
    )
751
+
752
+
753
class DailyMetrics(Base):
    """Daily activity metrics per developer per project with classification data.

    WHY: This table stores daily aggregated metrics for each developer-project combination,
    enabling quick retrieval by date range for reporting and trend analysis.
    Each row represents one developer's activity in one project for one day;
    the unique index below enforces that granularity.
    """

    __tablename__ = "daily_metrics"

    # Primary key components
    id = Column(Integer, primary_key=True)
    date = Column(DateTime, nullable=False)  # Date for the metrics (YYYY-MM-DD)
    developer_id = Column(String, nullable=False)  # Canonical developer ID
    project_key = Column(String, nullable=False)  # Project identifier

    # Developer information (denormalized here so reports need no extra join)
    developer_name = Column(String, nullable=False)  # Display name for reports
    developer_email = Column(String, nullable=False)  # Primary email

    # Classification counts - commit counts by category
    feature_commits = Column(Integer, default=0)
    bug_fix_commits = Column(Integer, default=0)
    refactor_commits = Column(Integer, default=0)
    documentation_commits = Column(Integer, default=0)
    maintenance_commits = Column(Integer, default=0)
    test_commits = Column(Integer, default=0)
    style_commits = Column(Integer, default=0)
    build_commits = Column(Integer, default=0)
    other_commits = Column(Integer, default=0)

    # Aggregate metrics
    total_commits = Column(Integer, default=0)
    files_changed = Column(Integer, default=0)
    lines_added = Column(Integer, default=0)
    lines_deleted = Column(Integer, default=0)
    story_points = Column(Integer, default=0)

    # Ticket tracking metrics
    tracked_commits = Column(Integer, default=0)  # Commits with ticket references
    untracked_commits = Column(Integer, default=0)  # Commits without ticket references
    unique_tickets = Column(Integer, default=0)  # Number of unique tickets referenced

    # Work pattern indicators
    merge_commits = Column(Integer, default=0)
    complex_commits = Column(Integer, default=0)  # Commits with >5 files changed

    # Metadata
    # NOTE(review): datetime.utcnow yields naive UTC timestamps and is
    # deprecated since Python 3.12 - confirm before migrating.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Indexes for efficient querying; idx_daily_unique guarantees one row
    # per (date, developer, project).
    __table_args__ = (
        Index("idx_daily_date", "date"),
        Index("idx_daily_developer", "developer_id"),
        Index("idx_daily_project", "project_key"),
        Index("idx_daily_date_range", "date", "developer_id", "project_key"),
        Index("idx_daily_unique", "date", "developer_id", "project_key", unique=True),
    )
812
+
813
+
814
class WeeklyTrends(Base):
    """Weekly trend analysis for developer-project combinations.

    WHY: Pre-calculated weekly trends improve report performance by avoiding
    repeated calculations. Stores week-over-week changes in activity patterns.
    One row per (week_start, developer, project), enforced by the unique index.
    """

    __tablename__ = "weekly_trends"

    id = Column(Integer, primary_key=True)
    week_start = Column(DateTime, nullable=False)  # Monday of the week
    week_end = Column(DateTime, nullable=False)  # Sunday of the week
    developer_id = Column(String, nullable=False)
    project_key = Column(String, nullable=False)

    # Week totals
    total_commits = Column(Integer, default=0)
    feature_commits = Column(Integer, default=0)
    bug_fix_commits = Column(Integer, default=0)
    refactor_commits = Column(Integer, default=0)

    # Week-over-week changes (percentage)
    total_commits_change = Column(Float, default=0.0)
    feature_commits_change = Column(Float, default=0.0)
    bug_fix_commits_change = Column(Float, default=0.0)
    refactor_commits_change = Column(Float, default=0.0)

    # Activity indicators
    days_active = Column(Integer, default=0)  # Number of days with commits
    avg_commits_per_day = Column(Float, default=0.0)

    # Metadata
    # NOTE(review): naive UTC timestamp; datetime.utcnow is deprecated in 3.12+.
    calculated_at = Column(DateTime, default=datetime.utcnow)

    # Indexes for trend queries
    __table_args__ = (
        Index("idx_weekly_start", "week_start"),
        Index("idx_weekly_dev_proj", "developer_id", "project_key"),
        Index("idx_weekly_unique", "week_start", "developer_id", "project_key", unique=True),
    )
290
854
 
291
855
 
292
856
  class Database:
293
- """Database connection manager."""
857
+ """Database connection manager with robust permission handling."""
294
858
 
295
859
  def __init__(self, db_path: Path):
296
- """Initialize database connection."""
297
- db_path.parent.mkdir(parents=True, exist_ok=True)
298
- self.engine = create_engine(f"sqlite:///{db_path}")
299
- Base.metadata.create_all(self.engine)
300
- self.SessionLocal = sessionmaker(bind=self.engine)
860
+ """
861
+ Initialize database connection with proper error handling.
862
+
863
+ WHY: This method handles various permission scenarios that can occur
864
+ in different deployment environments:
865
+ - Readonly filesystems (Docker containers, CI/CD)
866
+ - Permission denied on directory creation
867
+ - Database file creation failures
868
+ - Fallback to memory database when persistence isn't possible
869
+
870
+ DESIGN DECISION: Uses fallback mechanisms rather than failing hard,
871
+ allowing the application to continue running even in restricted environments.
872
+
873
+ Args:
874
+ db_path: Path to the SQLite database file
875
+
876
+ Raises:
877
+ RuntimeError: If database initialization fails completely
878
+ """
879
+ self.db_path = db_path
880
+ self.is_readonly_fallback = False
881
+ self.engine = None
882
+ self.SessionLocal = None
883
+
884
+ # Try to create database with proper error handling
885
+ self._initialize_database()
886
+
887
+ def _initialize_database(self) -> None:
888
+ """
889
+ Initialize database with comprehensive error handling.
890
+
891
+ WHY: Database initialization can fail for multiple reasons:
892
+ 1. Directory doesn't exist and can't be created (permissions)
893
+ 2. Directory exists but database file can't be created (readonly filesystem)
894
+ 3. Database file exists but is readonly
895
+ 4. Filesystem is completely readonly (containers, CI)
896
+
897
+ APPROACH: Try primary location first, then fallback strategies
898
+ """
899
+ # Strategy 1: Try primary database location
900
+ if self._try_primary_database():
901
+ return
902
+
903
+ # Strategy 2: Try temp directory fallback
904
+ if self._try_temp_database_fallback():
905
+ return
906
+
907
+ # Strategy 3: Use in-memory database as last resort
908
+ self._use_memory_database_fallback()
909
+
910
+ def _try_primary_database(self) -> bool:
911
+ """
912
+ Attempt to create database at the primary location.
913
+
914
+ Returns:
915
+ True if successful, False if fallback needed
916
+ """
917
+ try:
918
+ # Check if we can create the directory
919
+ if not self._ensure_directory_writable(self.db_path.parent):
920
+ return False
921
+
922
+ # Check if database file can be created/accessed
923
+ if not self._ensure_database_writable(self.db_path):
924
+ return False
925
+
926
+ # Try to create the database
927
+ self.engine = create_engine(
928
+ f"sqlite:///{self.db_path}",
929
+ # Add connection args to handle locked databases better
930
+ connect_args={
931
+ "timeout": 30, # 30 second timeout for database locks
932
+ "check_same_thread": False, # Allow multi-threading
933
+ },
934
+ )
935
+
936
+ # Test the connection and create tables
937
+ Base.metadata.create_all(self.engine)
938
+ self.SessionLocal = sessionmaker(bind=self.engine)
939
+
940
+ # Test that we can actually write to the database
941
+ self._test_database_write()
942
+
943
+ logger.info(f"Database initialized successfully at: {self.db_path}")
944
+ return True
945
+
946
+ except (OperationalError, OSError, PermissionError) as e:
947
+ logger.warning(f"Failed to initialize primary database at {self.db_path}: {e}")
948
+ return False
949
+
950
+ def _try_temp_database_fallback(self) -> bool:
951
+ """
952
+ Try to create database in system temp directory as fallback.
953
+
954
+ Returns:
955
+ True if successful, False if fallback needed
956
+ """
957
+ try:
958
+ # Create a temp file that will persist for the session
959
+ temp_dir = Path(tempfile.gettempdir()) / "gitflow-analytics-cache"
960
+ temp_dir.mkdir(exist_ok=True, parents=True)
961
+
962
+ # Use the same filename but in temp directory
963
+ temp_db_path = temp_dir / self.db_path.name
964
+
965
+ self.engine = create_engine(
966
+ f"sqlite:///{temp_db_path}",
967
+ connect_args={
968
+ "timeout": 30,
969
+ "check_same_thread": False,
970
+ },
971
+ )
972
+
973
+ Base.metadata.create_all(self.engine)
974
+ self.SessionLocal = sessionmaker(bind=self.engine)
975
+
976
+ # Test write capability
977
+ self._test_database_write()
978
+
979
+ logger.warning(
980
+ f"Primary database location not writable. Using temp fallback: {temp_db_path}"
981
+ )
982
+ self.db_path = temp_db_path # Update path for reference
983
+ return True
984
+
985
+ except (OperationalError, OSError, PermissionError) as e:
986
+ logger.warning(f"Temp database fallback failed: {e}")
987
+ return False
988
+
989
+ def _use_memory_database_fallback(self) -> None:
990
+ """
991
+ Use in-memory SQLite database as last resort.
992
+
993
+ This allows the application to function even in completely readonly environments,
994
+ but data will not persist between runs.
995
+ """
996
+ try:
997
+ logger.warning(
998
+ "All persistent database options failed. Using in-memory database. "
999
+ "Data will not persist between runs."
1000
+ )
1001
+
1002
+ self.engine = create_engine(
1003
+ "sqlite:///:memory:", connect_args={"check_same_thread": False}
1004
+ )
1005
+
1006
+ Base.metadata.create_all(self.engine)
1007
+ self.SessionLocal = sessionmaker(bind=self.engine)
1008
+
1009
+ self.is_readonly_fallback = True
1010
+
1011
+ # Test that memory database works
1012
+ self._test_database_write()
1013
+
1014
+ except Exception as e:
1015
+ raise RuntimeError(
1016
+ f"Failed to initialize any database (including in-memory fallback): {e}. "
1017
+ "This may indicate a deeper system issue."
1018
+ ) from e
1019
+
1020
+ def _ensure_directory_writable(self, directory: Path) -> bool:
1021
+ """
1022
+ Ensure directory exists and is writable.
1023
+
1024
+ Args:
1025
+ directory: Directory to check/create
1026
+
1027
+ Returns:
1028
+ True if directory is writable, False otherwise
1029
+ """
1030
+ try:
1031
+ # Create directory if it doesn't exist
1032
+ directory.mkdir(parents=True, exist_ok=True)
1033
+
1034
+ # Test write permissions by creating a temporary file
1035
+ test_file = directory / ".write_test"
1036
+ test_file.touch()
1037
+ test_file.unlink() # Clean up
1038
+
1039
+ return True
1040
+
1041
+ except (PermissionError, OSError) as e:
1042
+ logger.debug(f"Directory {directory} is not writable: {e}")
1043
+ return False
1044
+
1045
+ def _ensure_database_writable(self, db_path: Path) -> bool:
1046
+ """
1047
+ Check if database file can be created or is writable if it exists.
1048
+
1049
+ Args:
1050
+ db_path: Path to the database file
1051
+
1052
+ Returns:
1053
+ True if database file is writable, False otherwise
1054
+ """
1055
+ try:
1056
+ if db_path.exists():
1057
+ # Check if existing file is writable
1058
+ if not os.access(db_path, os.W_OK):
1059
+ logger.debug(f"Database file {db_path} exists but is not writable")
1060
+ return False
1061
+ else:
1062
+ # Test if we can create the file
1063
+ db_path.touch()
1064
+ db_path.unlink() # Clean up test file
1065
+
1066
+ return True
1067
+
1068
+ except (PermissionError, OSError) as e:
1069
+ logger.debug(f"Cannot create/write database file {db_path}: {e}")
1070
+ return False
1071
+
1072
+ def _test_database_write(self) -> None:
1073
+ """
1074
+ Test that we can actually write to the database.
1075
+
1076
+ Raises:
1077
+ OperationalError: If database write test fails
1078
+ """
1079
+ try:
1080
+ # Try a simple write operation to verify database is writable
1081
+ session = self.get_session()
1082
+ try:
1083
+ # Just test that we can begin a transaction and rollback
1084
+ session.execute(text("SELECT 1"))
1085
+ session.rollback()
1086
+ finally:
1087
+ session.close()
1088
+
1089
+ except Exception as e:
1090
+ raise OperationalError(f"Database write test failed: {e}", None, None) from e
301
1091
 
302
1092
  def get_session(self) -> Session:
303
1093
  """Get a new database session."""