gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/__init__.py +11 -11
- gitflow_analytics/_version.py +2 -2
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4490 -378
- gitflow_analytics/cli_rich.py +503 -0
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -398
- gitflow_analytics/core/analyzer.py +1320 -172
- gitflow_analytics/core/branch_mapper.py +132 -132
- gitflow_analytics/core/cache.py +1554 -175
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +571 -185
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/base.py +13 -11
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +77 -59
- gitflow_analytics/extractors/tickets.py +841 -89
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +258 -87
- gitflow_analytics/integrations/jira_integration.py +572 -123
- gitflow_analytics/integrations/orchestrator.py +206 -82
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +542 -179
- gitflow_analytics/models/database.py +986 -59
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +29 -0
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
- gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
- gitflow_analytics/qualitative/core/__init__.py +13 -0
- gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
- gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
- gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
- gitflow_analytics/qualitative/core/processor.py +673 -0
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +25 -0
- gitflow_analytics/qualitative/models/schemas.py +306 -0
- gitflow_analytics/qualitative/utils/__init__.py +13 -0
- gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
- gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
- gitflow_analytics/qualitative/utils/metrics.py +361 -0
- gitflow_analytics/qualitative/utils/text_processing.py +285 -0
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +550 -18
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1700 -216
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2289 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +5 -0
- gitflow_analytics/tui/app.py +724 -0
- gitflow_analytics/tui/screens/__init__.py +8 -0
- gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
- gitflow_analytics/tui/screens/configuration_screen.py +523 -0
- gitflow_analytics/tui/screens/loading_screen.py +348 -0
- gitflow_analytics/tui/screens/main_screen.py +321 -0
- gitflow_analytics/tui/screens/results_screen.py +722 -0
- gitflow_analytics/tui/widgets/__init__.py +7 -0
- gitflow_analytics/tui/widgets/data_table.py +255 -0
- gitflow_analytics/tui/widgets/export_modal.py +301 -0
- gitflow_analytics/tui/widgets/progress_widget.py +187 -0
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
- gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/models/schemas.py (new file)
@@ -0,0 +1,306 @@
"""Data models and configuration schemas for qualitative analysis."""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any


@dataclass
class QualitativeCommitData:
    """Enhanced commit data with qualitative analysis results.

    This class extends basic commit information with semantic analysis results
    including change type, business domain, risk assessment, and processing metadata.
    """

    # Existing commit data from GitFlow Analytics
    hash: str
    message: str
    author_name: str
    author_email: str
    timestamp: datetime
    files_changed: list[str]
    insertions: int
    deletions: int

    # New qualitative analysis fields
    change_type: str  # feature|bugfix|refactor|docs|test|chore|security|hotfix|config
    change_type_confidence: float  # 0.0-1.0
    business_domain: str  # frontend|backend|database|infrastructure|mobile|devops|unknown
    domain_confidence: float  # 0.0-1.0
    risk_level: str  # low|medium|high|critical
    risk_factors: list[str]  # List of identified risk factors
    intent_signals: dict[str, Any]  # Intent analysis results
    collaboration_patterns: dict[str, Any]  # Team interaction patterns
    technical_context: dict[str, Any]  # Technical context information

    # Processing metadata
    processing_method: str  # 'nlp' or 'llm'
    processing_time_ms: float
    confidence_score: float  # Overall confidence in analysis

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "hash": self.hash,
            "message": self.message,
            "author_name": self.author_name,
            "author_email": self.author_email,
            "timestamp": self.timestamp.isoformat(),
            "files_changed": self.files_changed,
            "insertions": self.insertions,
            "deletions": self.deletions,
            "change_type": self.change_type,
            "change_type_confidence": self.change_type_confidence,
            "business_domain": self.business_domain,
            "domain_confidence": self.domain_confidence,
            "risk_level": self.risk_level,
            "risk_factors": self.risk_factors,
            "intent_signals": self.intent_signals,
            "collaboration_patterns": self.collaboration_patterns,
            "technical_context": self.technical_context,
            "processing_method": self.processing_method,
            "processing_time_ms": self.processing_time_ms,
            "confidence_score": self.confidence_score,
        }


@dataclass
class ChangeTypeConfig:
    """Configuration for change type classification."""

    min_confidence: float = 0.7
    semantic_weight: float = 0.6  # Weight for semantic features
    file_pattern_weight: float = 0.4  # Weight for file pattern signals
    enable_custom_patterns: bool = True
    custom_patterns: dict[str, dict[str, list[str]]] = field(default_factory=dict)


@dataclass
class IntentConfig:
    """Configuration for intent analysis."""

    urgency_keywords: dict[str, list[str]] = field(
        default_factory=lambda: {
            "critical": ["critical", "urgent", "hotfix", "emergency", "immediate"],
            "important": ["important", "priority", "asap", "needed"],
            "routine": ["routine", "regular", "normal", "standard"],
        }
    )
    confidence_threshold: float = 0.6
    sentiment_analysis: bool = True


@dataclass
class DomainConfig:
    """Configuration for domain classification."""

    file_patterns: dict[str, list[str]] = field(
        default_factory=lambda: {
            "frontend": ["*.js", "*.jsx", "*.ts", "*.tsx", "*.vue", "*.html", "*.css", "*.scss"],
            "backend": ["*.py", "*.java", "*.go", "*.rb", "*.php", "*.cs", "*.cpp"],
            "database": ["*.sql", "migrations/*", "schema/*", "**/models/**"],
            "infrastructure": ["Dockerfile", "*.yaml", "*.yml", "terraform/*", "*.tf"],
            "mobile": ["*.swift", "*.kt", "*.java", "android/*", "ios/*"],
            "devops": ["*.yml", "*.yaml", "ci/*", ".github/*", "docker/*"],
        }
    )
    keyword_patterns: dict[str, list[str]] = field(
        default_factory=lambda: {
            "frontend": ["ui", "component", "styling", "interface", "layout"],
            "backend": ["api", "endpoint", "service", "server", "logic"],
            "database": ["query", "schema", "migration", "data", "model"],
            "infrastructure": ["deploy", "config", "environment", "setup"],
            "mobile": ["android", "ios", "mobile", "app"],
            "devops": ["build", "pipeline", "deploy", "ci", "docker"],
        }
    )
    min_confidence: float = 0.6


@dataclass
class RiskConfig:
    """Configuration for risk analysis."""

    high_risk_patterns: list[str] = field(
        default_factory=lambda: [
            # Security-related patterns
            "password",
            "secret",
            "key",
            "token",
            "auth",
            "security",
            # Critical system patterns
            "production",
            "prod",
            "critical",
            "emergency",
            # Infrastructure patterns
            "database",
            "migration",
            "schema",
            "deploy",
            # Large change patterns
            "refactor",
            "rewrite",
            "restructure",
        ]
    )
    medium_risk_patterns: list[str] = field(
        default_factory=lambda: [
            "config",
            "configuration",
            "settings",
            "environment",
            "api",
            "endpoint",
            "service",
            "integration",
        ]
    )
    file_risk_patterns: dict[str, str] = field(
        default_factory=lambda: {
            # High risk file patterns
            "**/*prod*": "high",
            "**/migrations/**": "high",
            "**/schema/**": "high",
            "Dockerfile": "medium",
            "*.yml": "medium",
            "*.yaml": "medium",
            "**/*config*": "medium",
        }
    )
    size_thresholds: dict[str, int] = field(
        default_factory=lambda: {
            "large_commit_files": 20,  # Files changed
            "large_commit_lines": 500,  # Lines changed
            "massive_commit_lines": 2000,  # Very large changes
        }
    )


@dataclass
class NLPConfig:
    """Configuration for NLP processing engine."""

    spacy_model: str = "en_core_web_sm"
    spacy_batch_size: int = 1000
    fast_mode: bool = True  # Disable parser/NER for speed

    # Component configurations
    change_type_config: ChangeTypeConfig = field(default_factory=ChangeTypeConfig)
    intent_config: IntentConfig = field(default_factory=IntentConfig)
    domain_config: DomainConfig = field(default_factory=DomainConfig)
    risk_config: RiskConfig = field(default_factory=RiskConfig)

    # Performance settings
    enable_parallel_processing: bool = True
    max_workers: int = 4


@dataclass
class LLMConfig:
    """Configuration for LLM fallback processing via OpenRouter."""

    # OpenRouter API settings
    openrouter_api_key: str = "${OPENROUTER_API_KEY}"
    base_url: str = "https://openrouter.ai/api/v1"

    # Model selection strategy
    primary_model: str = "anthropic/claude-3-haiku"  # Fast, cheap classification
    fallback_model: str = "meta-llama/llama-3.1-8b-instruct:free"  # Free fallback
    complex_model: str = "anthropic/claude-3-sonnet"  # For complex cases

    # Model routing thresholds
    complexity_threshold: float = 0.5  # Route complex cases to better model
    cost_threshold_per_1k: float = 0.01  # Max cost per 1k commits

    # Processing settings
    max_tokens: int = 1000
    temperature: float = 0.1

    # Batching settings
    max_group_size: int = 10  # Process up to 10 commits per batch
    similarity_threshold: float = 0.8  # Group similar commits together

    # Rate limiting
    requests_per_minute: int = 200  # Higher limit with OpenRouter
    max_retries: int = 3

    # Cost control
    max_daily_cost: float = 5.0  # Max daily spend in USD
    enable_cost_tracking: bool = True


@dataclass
class CacheConfig:
    """Configuration for qualitative analysis caching."""

    cache_dir: str = ".qualitative_cache"
    semantic_cache_size: int = 10000  # Max cached patterns
    pattern_cache_ttl_hours: int = 168  # 1 week

    # Learning settings
    enable_pattern_learning: bool = True
    learning_threshold: int = 10  # Min examples to learn pattern
    confidence_boost_factor: float = 0.1  # Boost for learned patterns

    # Cache optimization
    enable_compression: bool = True
    max_cache_size_mb: int = 100


@dataclass
class QualitativeConfig:
    """Main configuration for qualitative analysis system.

    This configuration orchestrates the entire qualitative analysis pipeline,
    balancing performance, accuracy, and cost through intelligent NLP and
    strategic LLM usage.
    """

    # Processing settings
    enabled: bool = True
    batch_size: int = 1000  # Commits processed per batch
    max_llm_fallback_pct: float = 0.15  # Max 15% of commits use LLM
    confidence_threshold: float = 0.7  # Min confidence for NLP results

    # Component configurations
    nlp_config: NLPConfig = field(default_factory=NLPConfig)
    llm_config: LLMConfig = field(default_factory=LLMConfig)
    cache_config: CacheConfig = field(default_factory=CacheConfig)

    # Performance monitoring
    enable_performance_tracking: bool = True
    target_processing_time_ms: float = 2.0  # Target per-commit processing time

    # Quality settings
    min_overall_confidence: float = 0.6  # Min confidence for any result
    enable_quality_feedback: bool = True  # Learn from corrections

    def validate(self) -> list[str]:
        """Validate configuration and return any warnings.

        Returns:
            List of validation warning messages.
        """
        warnings = []

        if self.max_llm_fallback_pct > 0.3:
            warnings.append("LLM fallback percentage > 30% may result in high costs")

        if self.confidence_threshold > 0.9:
            warnings.append("Very high confidence threshold may route too many commits to LLM")

        if self.batch_size > 5000:
            warnings.append("Large batch size may cause memory issues")

        # Validate LLM config if API key is set
        if (
            self.llm_config.openrouter_api_key
            and self.llm_config.openrouter_api_key != "${OPENROUTER_API_KEY}"
        ) and self.llm_config.max_daily_cost < 1.0:
            warnings.append("Very low daily cost limit may restrict LLM usage")

        return warnings
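For orientation, the sketch below shows how these dataclasses compose in practice: build a QualitativeConfig with tuned defaults, then call validate() before running analysis. This is a minimal illustration, not code from the package; the field names and validate() behavior come from the schemas.py addition above, while the import path is inferred from the file's location in the wheel.

    # Minimal sketch; module path assumed from gitflow_analytics/qualitative/models/schemas.py.
    from gitflow_analytics.qualitative.models.schemas import LLMConfig, QualitativeConfig

    config = QualitativeConfig(
        batch_size=2000,
        max_llm_fallback_pct=0.10,  # keep LLM fallback under 10% of commits
        llm_config=LLMConfig(max_daily_cost=2.0),
    )

    # validate() returns human-readable warnings rather than raising.
    for warning in config.validate():
        print(f"config warning: {warning}")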
gitflow_analytics/qualitative/utils/__init__.py (new file)
@@ -0,0 +1,13 @@
"""Utility functions for qualitative analysis."""

from .batch_processor import BatchProcessor
from .cost_tracker import CostTracker
from .metrics import PerformanceMetrics
from .text_processing import TextProcessor

__all__ = [
    "TextProcessor",
    "BatchProcessor",
    "PerformanceMetrics",
    "CostTracker",
]
gitflow_analytics/qualitative/utils/batch_processor.py (new file)
@@ -0,0 +1,339 @@
"""Batch processing utilities for efficient commit analysis."""

import logging
import time
from collections.abc import Iterator
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Any, Callable, Optional, TypeVar

T = TypeVar("T")
R = TypeVar("R")


class BatchProcessor:
    """Efficient batch processing for commit analysis.

    This class provides utilities for processing large numbers of commits
    in batches with parallel execution, progress tracking, and error handling.
    """

    def __init__(self, batch_size: int = 1000, max_workers: int = 4):
        """Initialize batch processor.

        Args:
            batch_size: Number of items to process per batch
            max_workers: Maximum number of worker threads
        """
        self.batch_size = batch_size
        self.max_workers = max_workers
        self.logger = logging.getLogger(__name__)
        self._stats_lock = Lock()
        self._processing_stats = {
            "total_processed": 0,
            "total_errors": 0,
            "batch_times": [],
            "start_time": None,
        }

    def create_batches(self, items: list[T], batch_size: Optional[int] = None) -> Iterator[list[T]]:
        """Split items into batches for processing.

        Args:
            items: List of items to batch
            batch_size: Override default batch size

        Yields:
            Batches of items
        """
        batch_size = batch_size or self.batch_size

        for i in range(0, len(items), batch_size):
            yield items[i : i + batch_size]

    def process_batches(
        self, items: list[T], processor_func: Callable[[list[T]], list[R]], parallel: bool = True
    ) -> list[R]:
        """Process items in batches with optional parallelization.

        Args:
            items: Items to process
            processor_func: Function that processes a batch and returns results
            parallel: Whether to use parallel processing

        Returns:
            List of all processing results
        """
        if not items:
            return []

        self._reset_stats()
        self._processing_stats["start_time"] = time.time()

        batches = list(self.create_batches(items))
        self.logger.info(f"Processing {len(items)} items in {len(batches)} batches")

        all_results = []

        if parallel and len(batches) > 1:
            all_results = self._process_parallel(batches, processor_func)
        else:
            all_results = self._process_sequential(batches, processor_func)

        self._log_final_stats(len(items))
        return all_results

    def process_with_callback(
        self,
        items: list[T],
        processor_func: Callable[[list[T]], list[R]],
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> list[R]:
        """Process batches with progress callback.

        Args:
            items: Items to process
            processor_func: Function that processes a batch
            progress_callback: Callback for progress updates (processed, total)

        Returns:
            List of all processing results
        """
        if not items:
            return []

        self._reset_stats()
        batches = list(self.create_batches(items))
        all_results = []
        processed_count = 0

        for i, batch in enumerate(batches):
            batch_start = time.time()

            try:
                batch_results = processor_func(batch)
                all_results.extend(batch_results)
                processed_count += len(batch)

                with self._stats_lock:
                    self._processing_stats["total_processed"] += len(batch)
                    self._processing_stats["batch_times"].append(time.time() - batch_start)

            except Exception as e:
                self.logger.error(f"Error processing batch {i}: {e}")
                with self._stats_lock:
                    self._processing_stats["total_errors"] += len(batch)

            # Call progress callback if provided
            if progress_callback:
                progress_callback(processed_count, len(items))

        return all_results

    def _process_parallel(
        self, batches: list[list[T]], processor_func: Callable[[list[T]], list[R]]
    ) -> list[R]:
        """Process batches in parallel using ThreadPoolExecutor.

        Args:
            batches: List of batches to process
            processor_func: Function to process each batch

        Returns:
            Combined results from all batches
        """
        all_results = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all batches
            future_to_batch = {
                executor.submit(self._process_batch_with_timing, batch, processor_func): i
                for i, batch in enumerate(batches)
            }

            # Collect results as they complete
            for future in as_completed(future_to_batch):
                batch_idx = future_to_batch[future]

                try:
                    batch_results, batch_time = future.result()
                    all_results.extend(batch_results)

                    with self._stats_lock:
                        self._processing_stats["total_processed"] += len(batches[batch_idx])
                        self._processing_stats["batch_times"].append(batch_time)

                except Exception as e:
                    self.logger.error(f"Error processing batch {batch_idx}: {e}")
                    with self._stats_lock:
                        self._processing_stats["total_errors"] += len(batches[batch_idx])

        return all_results

    def _process_sequential(
        self, batches: list[list[T]], processor_func: Callable[[list[T]], list[R]]
    ) -> list[R]:
        """Process batches sequentially.

        Args:
            batches: List of batches to process
            processor_func: Function to process each batch

        Returns:
            Combined results from all batches
        """
        all_results = []

        for i, batch in enumerate(batches):
            try:
                batch_results, batch_time = self._process_batch_with_timing(batch, processor_func)
                all_results.extend(batch_results)

                self._processing_stats["total_processed"] += len(batch)
                self._processing_stats["batch_times"].append(batch_time)

            except Exception as e:
                self.logger.error(f"Error processing batch {i}: {e}")
                self._processing_stats["total_errors"] += len(batch)

        return all_results

    def _process_batch_with_timing(
        self, batch: list[T], processor_func: Callable[[list[T]], list[R]]
    ) -> tuple[list[R], float]:
        """Process a single batch with timing.

        Args:
            batch: Batch to process
            processor_func: Processing function

        Returns:
            Tuple of (results, processing_time_seconds)
        """
        start_time = time.time()
        results = processor_func(batch)
        processing_time = time.time() - start_time

        return results, processing_time

    def _reset_stats(self) -> None:
        """Reset processing statistics."""
        with self._stats_lock:
            self._processing_stats = {
                "total_processed": 0,
                "total_errors": 0,
                "batch_times": [],
                "start_time": time.time(),
            }

    def _log_final_stats(self, total_items: int) -> None:
        """Log final processing statistics.

        Args:
            total_items: Total number of items processed
        """
        with self._stats_lock:
            stats = self._processing_stats.copy()

        if not stats["batch_times"]:
            return

        total_time = time.time() - stats["start_time"]
        avg_batch_time = sum(stats["batch_times"]) / len(stats["batch_times"])
        items_per_second = stats["total_processed"] / total_time if total_time > 0 else 0

        self.logger.info(
            f"Batch processing complete: {stats['total_processed']}/{total_items} items processed "
            f"in {total_time:.2f}s ({items_per_second:.1f} items/s), "
            f"{stats['total_errors']} errors, avg batch time: {avg_batch_time:.2f}s"
        )

    def get_processing_stats(self) -> dict[str, Any]:
        """Get current processing statistics.

        Returns:
            Dictionary with processing statistics
        """
        with self._stats_lock:
            stats = self._processing_stats.copy()

        if stats["start_time"] and stats["batch_times"]:
            elapsed_time = time.time() - stats["start_time"]
            avg_batch_time = sum(stats["batch_times"]) / len(stats["batch_times"])
            items_per_second = stats["total_processed"] / elapsed_time if elapsed_time > 0 else 0

            return {
                "total_processed": stats["total_processed"],
                "total_errors": stats["total_errors"],
                "elapsed_time_seconds": elapsed_time,
                "avg_batch_time_seconds": avg_batch_time,
                "items_per_second": items_per_second,
                "batches_completed": len(stats["batch_times"]),
                "error_rate": (
                    stats["total_errors"] / (stats["total_processed"] + stats["total_errors"])
                    if (stats["total_processed"] + stats["total_errors"]) > 0
                    else 0.0
                ),
            }
        else:
            return {
                "total_processed": 0,
                "total_errors": 0,
                "elapsed_time_seconds": 0,
                "avg_batch_time_seconds": 0,
                "items_per_second": 0,
                "batches_completed": 0,
                "error_rate": 0.0,
            }


class ProgressTracker:
    """Simple progress tracking for long-running operations."""

    def __init__(self, total: int, description: str = "Processing"):
        """Initialize progress tracker.

        Args:
            total: Total number of items to process
            description: Description of the operation
        """
        self.total = total
        self.description = description
        self.processed = 0
        self.start_time = time.time()
        self.last_report = 0
        self.logger = logging.getLogger(__name__)

    def update(self, count: int = 1) -> None:
        """Update progress count.

        Args:
            count: Number of items processed since last update
        """
        self.processed += count

        # Report progress every 10% or every 1000 items, whichever is less frequent
        report_interval = max(self.total // 10, 1000)

        if self.processed - self.last_report >= report_interval or self.processed >= self.total:
            self._report_progress()
            self.last_report = self.processed

    def _report_progress(self) -> None:
        """Report current progress."""
        elapsed_time = time.time() - self.start_time
        percentage = (self.processed / self.total) * 100 if self.total > 0 else 0
        rate = self.processed / elapsed_time if elapsed_time > 0 else 0

        # Estimate time remaining
        if rate > 0 and self.processed < self.total:
            remaining_items = self.total - self.processed
            eta_seconds = remaining_items / rate
            eta_str = f", ETA: {eta_seconds:.0f}s"
        else:
            eta_str = ""

        self.logger.info(
            f"{self.description}: {self.processed}/{self.total} ({percentage:.1f}%) "
            f"at {rate:.1f} items/s{eta_str}"
        )