gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4158 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +905 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +444 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1285 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
|
@@ -1,29 +1,29 @@
|
|
|
1
1
|
"""Domain classifier for identifying technical domains of commits."""
|
|
2
2
|
|
|
3
|
+
import importlib.util
|
|
3
4
|
import logging
|
|
4
5
|
import re
|
|
5
6
|
from collections import defaultdict
|
|
6
|
-
from typing import
|
|
7
|
-
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
8
|
|
|
9
9
|
from ..models.schemas import DomainConfig
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
11
|
+
# Check if spacy is available without importing it
|
|
12
|
+
SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
|
|
13
|
+
|
|
14
|
+
if SPACY_AVAILABLE:
|
|
13
15
|
from spacy.tokens import Doc
|
|
14
|
-
|
|
15
|
-
except ImportError:
|
|
16
|
-
SPACY_AVAILABLE = False
|
|
16
|
+
else:
|
|
17
17
|
Doc = Any
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
class DomainClassifier:
|
|
21
21
|
"""Classify commits by technical domain (frontend, backend, etc.).
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
This classifier determines the technical domain or business area
|
|
24
24
|
affected by a commit by analyzing both the commit message content
|
|
25
25
|
and the patterns of files that were changed.
|
|
26
|
-
|
|
26
|
+
|
|
27
27
|
Domains identified:
|
|
28
28
|
- frontend: UI/UX, client-side code
|
|
29
29
|
- backend: Server-side logic, APIs
|
|
@@ -32,82 +32,180 @@ class DomainClassifier:
|
|
|
32
32
|
- mobile: Mobile app development
|
|
33
33
|
- devops: CI/CD, build tools, automation
|
|
34
34
|
"""
|
|
35
|
-
|
|
35
|
+
|
|
36
36
|
def __init__(self, config: DomainConfig):
|
|
37
37
|
"""Initialize domain classifier.
|
|
38
|
-
|
|
38
|
+
|
|
39
39
|
Args:
|
|
40
40
|
config: Configuration for domain classification
|
|
41
41
|
"""
|
|
42
42
|
self.config = config
|
|
43
43
|
self.logger = logging.getLogger(__name__)
|
|
44
|
-
|
|
44
|
+
|
|
45
45
|
# Compile file patterns for efficient matching
|
|
46
46
|
self._compile_file_patterns()
|
|
47
|
-
|
|
47
|
+
|
|
48
48
|
# Keyword patterns for semantic analysis
|
|
49
49
|
self.keyword_patterns = config.keyword_patterns
|
|
50
|
-
|
|
50
|
+
|
|
51
51
|
# Directory patterns that strongly indicate domains
|
|
52
52
|
self.directory_indicators = {
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
53
|
+
"frontend": {
|
|
54
|
+
"src/components",
|
|
55
|
+
"src/pages",
|
|
56
|
+
"src/views",
|
|
57
|
+
"public",
|
|
58
|
+
"assets",
|
|
59
|
+
"static",
|
|
60
|
+
"styles",
|
|
61
|
+
"css",
|
|
62
|
+
"scss",
|
|
63
|
+
"ui",
|
|
64
|
+
"components",
|
|
65
|
+
"pages",
|
|
56
66
|
},
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
67
|
+
"backend": {
|
|
68
|
+
"src/controllers",
|
|
69
|
+
"src/services",
|
|
70
|
+
"src/api",
|
|
71
|
+
"api",
|
|
72
|
+
"server",
|
|
73
|
+
"controllers",
|
|
74
|
+
"services",
|
|
75
|
+
"handlers",
|
|
76
|
+
"routes",
|
|
77
|
+
"middleware",
|
|
60
78
|
},
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
79
|
+
"database": {
|
|
80
|
+
"migrations",
|
|
81
|
+
"models",
|
|
82
|
+
"schemas",
|
|
83
|
+
"seeds",
|
|
84
|
+
"data",
|
|
85
|
+
"sql",
|
|
86
|
+
"database",
|
|
87
|
+
"db",
|
|
88
|
+
"repositories",
|
|
64
89
|
},
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
90
|
+
"infrastructure": {
|
|
91
|
+
"terraform",
|
|
92
|
+
"ansible",
|
|
93
|
+
"k8s",
|
|
94
|
+
"kubernetes",
|
|
95
|
+
"helm",
|
|
96
|
+
"charts",
|
|
97
|
+
"infrastructure",
|
|
98
|
+
"deploy",
|
|
99
|
+
"deployment",
|
|
100
|
+
"ops",
|
|
68
101
|
},
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
102
|
+
"mobile": {
|
|
103
|
+
"android",
|
|
104
|
+
"ios",
|
|
105
|
+
"mobile",
|
|
106
|
+
"app",
|
|
107
|
+
"native",
|
|
108
|
+
"react-native",
|
|
109
|
+
"flutter",
|
|
110
|
+
"swift",
|
|
111
|
+
"kotlin",
|
|
112
|
+
},
|
|
113
|
+
"devops": {
|
|
114
|
+
".github",
|
|
115
|
+
".gitlab",
|
|
116
|
+
"ci",
|
|
117
|
+
"cd",
|
|
118
|
+
"scripts",
|
|
119
|
+
"build",
|
|
120
|
+
"docker",
|
|
121
|
+
"jenkins",
|
|
122
|
+
"actions",
|
|
123
|
+
"workflows",
|
|
72
124
|
},
|
|
73
|
-
'devops': {
|
|
74
|
-
'.github', '.gitlab', 'ci', 'cd', 'scripts', 'build', 'docker',
|
|
75
|
-
'jenkins', 'actions', 'workflows'
|
|
76
|
-
}
|
|
77
125
|
}
|
|
78
|
-
|
|
126
|
+
|
|
79
127
|
# Technology stack indicators
|
|
80
128
|
self.tech_indicators = {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
129
|
+
"frontend": {
|
|
130
|
+
"react",
|
|
131
|
+
"vue",
|
|
132
|
+
"angular",
|
|
133
|
+
"svelte",
|
|
134
|
+
"jquery",
|
|
135
|
+
"bootstrap",
|
|
136
|
+
"tailwind",
|
|
137
|
+
"css",
|
|
138
|
+
"html",
|
|
139
|
+
"javascript",
|
|
140
|
+
"typescript",
|
|
141
|
+
"jsx",
|
|
142
|
+
"tsx",
|
|
84
143
|
},
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
144
|
+
"backend": {
|
|
145
|
+
"django",
|
|
146
|
+
"flask",
|
|
147
|
+
"fastapi",
|
|
148
|
+
"express",
|
|
149
|
+
"spring",
|
|
150
|
+
"rails",
|
|
151
|
+
"laravel",
|
|
152
|
+
"api",
|
|
153
|
+
"endpoint",
|
|
154
|
+
"service",
|
|
155
|
+
"controller",
|
|
88
156
|
},
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
157
|
+
"database": {
|
|
158
|
+
"mysql",
|
|
159
|
+
"postgresql",
|
|
160
|
+
"mongodb",
|
|
161
|
+
"redis",
|
|
162
|
+
"elasticsearch",
|
|
163
|
+
"migration",
|
|
164
|
+
"schema",
|
|
165
|
+
"query",
|
|
166
|
+
"orm",
|
|
167
|
+
"sql",
|
|
92
168
|
},
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
169
|
+
"infrastructure": {
|
|
170
|
+
"aws",
|
|
171
|
+
"gcp",
|
|
172
|
+
"azure",
|
|
173
|
+
"docker",
|
|
174
|
+
"kubernetes",
|
|
175
|
+
"terraform",
|
|
176
|
+
"ansible",
|
|
177
|
+
"helm",
|
|
178
|
+
"nginx",
|
|
179
|
+
"apache",
|
|
96
180
|
},
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
181
|
+
"mobile": {
|
|
182
|
+
"android",
|
|
183
|
+
"ios",
|
|
184
|
+
"swift",
|
|
185
|
+
"kotlin",
|
|
186
|
+
"flutter",
|
|
187
|
+
"react-native",
|
|
188
|
+
"xamarin",
|
|
189
|
+
"cordova",
|
|
190
|
+
"ionic",
|
|
191
|
+
},
|
|
192
|
+
"devops": {
|
|
193
|
+
"jenkins",
|
|
194
|
+
"gitlab-ci",
|
|
195
|
+
"github-actions",
|
|
196
|
+
"circleci",
|
|
197
|
+
"travis",
|
|
198
|
+
"docker",
|
|
199
|
+
"kubernetes",
|
|
200
|
+
"helm",
|
|
201
|
+
"terraform",
|
|
100
202
|
},
|
|
101
|
-
'devops': {
|
|
102
|
-
'jenkins', 'gitlab-ci', 'github-actions', 'circleci', 'travis',
|
|
103
|
-
'docker', 'kubernetes', 'helm', 'terraform'
|
|
104
|
-
}
|
|
105
203
|
}
|
|
106
|
-
|
|
204
|
+
|
|
107
205
|
def _compile_file_patterns(self) -> None:
|
|
108
206
|
"""Compile file extension patterns for efficient matching."""
|
|
109
207
|
self.compiled_file_patterns = {}
|
|
110
|
-
|
|
208
|
+
|
|
111
209
|
for domain, patterns in self.config.file_patterns.items():
|
|
112
210
|
compiled_patterns = []
|
|
113
211
|
for pattern in patterns:
|
|
@@ -116,116 +214,118 @@ class DomainClassifier:
|
|
|
116
214
|
regex_pattern = self._glob_to_regex(pattern)
|
|
117
215
|
compiled_patterns.append(re.compile(regex_pattern, re.IGNORECASE))
|
|
118
216
|
except re.error as e:
|
|
119
|
-
self.logger.warning(
|
|
120
|
-
|
|
217
|
+
self.logger.warning(
|
|
218
|
+
f"Invalid file pattern '{pattern}' for domain {domain}: {e}"
|
|
219
|
+
)
|
|
220
|
+
|
|
121
221
|
self.compiled_file_patterns[domain] = compiled_patterns
|
|
122
|
-
|
|
222
|
+
|
|
123
223
|
def _glob_to_regex(self, pattern: str) -> str:
|
|
124
224
|
"""Convert glob pattern to regex.
|
|
125
|
-
|
|
225
|
+
|
|
126
226
|
Args:
|
|
127
227
|
pattern: Glob pattern (e.g., '*.js', '**/models/**')
|
|
128
|
-
|
|
228
|
+
|
|
129
229
|
Returns:
|
|
130
230
|
Equivalent regex pattern
|
|
131
231
|
"""
|
|
132
232
|
# Simple glob to regex conversion
|
|
133
|
-
pattern = pattern.replace(
|
|
134
|
-
pattern = pattern.replace(
|
|
135
|
-
pattern = pattern.replace(
|
|
136
|
-
pattern = f
|
|
233
|
+
pattern = pattern.replace(".", r"\.")
|
|
234
|
+
pattern = pattern.replace("*", ".*")
|
|
235
|
+
pattern = pattern.replace("?", ".")
|
|
236
|
+
pattern = f"^{pattern}$"
|
|
137
237
|
return pattern
|
|
138
|
-
|
|
139
|
-
def classify(self, message: str, doc: Doc, files:
|
|
238
|
+
|
|
239
|
+
def classify(self, message: str, doc: Doc, files: list[str]) -> tuple[str, float]:
|
|
140
240
|
"""Classify commit domain with confidence score.
|
|
141
|
-
|
|
241
|
+
|
|
142
242
|
Args:
|
|
143
243
|
message: Commit message
|
|
144
244
|
doc: spaCy processed document (may be None)
|
|
145
245
|
files: List of changed files
|
|
146
|
-
|
|
246
|
+
|
|
147
247
|
Returns:
|
|
148
248
|
Tuple of (domain, confidence_score)
|
|
149
249
|
"""
|
|
150
250
|
if not message and not files:
|
|
151
|
-
return
|
|
152
|
-
|
|
251
|
+
return "unknown", 0.0
|
|
252
|
+
|
|
153
253
|
# Analyze file patterns (primary signal)
|
|
154
254
|
file_scores = self._analyze_file_patterns(files)
|
|
155
|
-
|
|
255
|
+
|
|
156
256
|
# Analyze directory patterns
|
|
157
257
|
dir_scores = self._analyze_directory_patterns(files)
|
|
158
|
-
|
|
258
|
+
|
|
159
259
|
# Analyze message content
|
|
160
260
|
message_scores = self._analyze_message_content(message, doc)
|
|
161
|
-
|
|
261
|
+
|
|
162
262
|
# Combine all signals
|
|
163
263
|
combined_scores = self._combine_domain_scores(file_scores, dir_scores, message_scores)
|
|
164
|
-
|
|
264
|
+
|
|
165
265
|
if not combined_scores:
|
|
166
|
-
return
|
|
167
|
-
|
|
266
|
+
return "unknown", 0.0
|
|
267
|
+
|
|
168
268
|
# Select best domain
|
|
169
269
|
best_domain = max(combined_scores.keys(), key=lambda k: combined_scores[k])
|
|
170
270
|
confidence = combined_scores[best_domain]
|
|
171
|
-
|
|
271
|
+
|
|
172
272
|
# Apply confidence threshold
|
|
173
273
|
if confidence < self.config.min_confidence:
|
|
174
|
-
return
|
|
175
|
-
|
|
274
|
+
return "unknown", confidence
|
|
275
|
+
|
|
176
276
|
return best_domain, confidence
|
|
177
|
-
|
|
178
|
-
def _analyze_file_patterns(self, files:
|
|
277
|
+
|
|
278
|
+
def _analyze_file_patterns(self, files: list[str]) -> dict[str, float]:
|
|
179
279
|
"""Analyze file patterns to determine domain.
|
|
180
|
-
|
|
280
|
+
|
|
181
281
|
Args:
|
|
182
282
|
files: List of file paths
|
|
183
|
-
|
|
283
|
+
|
|
184
284
|
Returns:
|
|
185
285
|
Dictionary of domain -> confidence_score
|
|
186
286
|
"""
|
|
187
287
|
if not files:
|
|
188
288
|
return {}
|
|
189
|
-
|
|
289
|
+
|
|
190
290
|
domain_matches = defaultdict(int)
|
|
191
|
-
|
|
291
|
+
|
|
192
292
|
for file_path in files:
|
|
193
293
|
for domain, patterns in self.compiled_file_patterns.items():
|
|
194
294
|
for pattern in patterns:
|
|
195
295
|
if pattern.search(file_path):
|
|
196
296
|
domain_matches[domain] += 1
|
|
197
297
|
break # Don't double-count same file for same domain
|
|
198
|
-
|
|
298
|
+
|
|
199
299
|
# Convert to confidence scores
|
|
200
300
|
scores = {}
|
|
201
301
|
total_files = len(files)
|
|
202
|
-
|
|
302
|
+
|
|
203
303
|
for domain, matches in domain_matches.items():
|
|
204
304
|
# Confidence based on proportion of matching files
|
|
205
305
|
confidence = matches / total_files
|
|
206
306
|
scores[domain] = min(1.0, confidence * 2) # Boost confidence for strong signals
|
|
207
|
-
|
|
307
|
+
|
|
208
308
|
return scores
|
|
209
|
-
|
|
210
|
-
def _analyze_directory_patterns(self, files:
|
|
309
|
+
|
|
310
|
+
def _analyze_directory_patterns(self, files: list[str]) -> dict[str, float]:
|
|
211
311
|
"""Analyze directory patterns for domain signals.
|
|
212
|
-
|
|
312
|
+
|
|
213
313
|
Args:
|
|
214
314
|
files: List of file paths
|
|
215
|
-
|
|
315
|
+
|
|
216
316
|
Returns:
|
|
217
317
|
Dictionary of domain -> confidence_score
|
|
218
318
|
"""
|
|
219
319
|
if not files:
|
|
220
320
|
return {}
|
|
221
|
-
|
|
321
|
+
|
|
222
322
|
domain_scores = defaultdict(float)
|
|
223
|
-
|
|
323
|
+
|
|
224
324
|
for file_path in files:
|
|
225
325
|
# Normalize path separators and convert to lowercase
|
|
226
|
-
normalized_path = file_path.replace(
|
|
227
|
-
path_parts = normalized_path.split(
|
|
228
|
-
|
|
326
|
+
normalized_path = file_path.replace("\\", "/").lower()
|
|
327
|
+
path_parts = normalized_path.split("/")
|
|
328
|
+
|
|
229
329
|
# Check each domain's directory indicators
|
|
230
330
|
for domain, indicators in self.directory_indicators.items():
|
|
231
331
|
for indicator in indicators:
|
|
@@ -236,164 +336,171 @@ class DomainClassifier:
|
|
|
236
336
|
# Also check full path contains indicator
|
|
237
337
|
elif indicator in normalized_path:
|
|
238
338
|
domain_scores[domain] += 0.5
|
|
239
|
-
|
|
339
|
+
|
|
240
340
|
# Normalize scores
|
|
241
341
|
scores = {}
|
|
242
342
|
max_score = max(domain_scores.values()) if domain_scores else 0
|
|
243
|
-
|
|
343
|
+
|
|
244
344
|
if max_score > 0:
|
|
245
345
|
for domain, score in domain_scores.items():
|
|
246
346
|
scores[domain] = min(1.0, score / max_score)
|
|
247
|
-
|
|
347
|
+
|
|
248
348
|
return scores
|
|
249
|
-
|
|
250
|
-
def _analyze_message_content(self, message: str, doc: Doc) ->
|
|
349
|
+
|
|
350
|
+
def _analyze_message_content(self, message: str, doc: Doc) -> dict[str, float]:
|
|
251
351
|
"""Analyze commit message content for domain keywords.
|
|
252
|
-
|
|
352
|
+
|
|
253
353
|
Args:
|
|
254
354
|
message: Commit message
|
|
255
355
|
doc: spaCy processed document (may be None)
|
|
256
|
-
|
|
356
|
+
|
|
257
357
|
Returns:
|
|
258
358
|
Dictionary of domain -> confidence_score
|
|
259
359
|
"""
|
|
260
360
|
if not message:
|
|
261
361
|
return {}
|
|
262
|
-
|
|
362
|
+
|
|
263
363
|
# Convert message to lowercase for analysis
|
|
264
364
|
message_lower = message.lower()
|
|
265
|
-
|
|
365
|
+
|
|
266
366
|
# Extract keywords from message
|
|
267
367
|
if SPACY_AVAILABLE and doc:
|
|
268
368
|
# Use spaCy for better keyword extraction
|
|
269
369
|
keywords = self._extract_keywords_from_doc(doc)
|
|
270
370
|
else:
|
|
271
371
|
# Fallback to simple word extraction
|
|
272
|
-
keywords = set(re.findall(r
|
|
273
|
-
|
|
372
|
+
keywords = set(re.findall(r"\b\w+\b", message_lower))
|
|
373
|
+
|
|
274
374
|
# Score domains based on keyword matches
|
|
275
375
|
domain_scores = {}
|
|
276
|
-
|
|
376
|
+
|
|
277
377
|
for domain, domain_keywords in self.keyword_patterns.items():
|
|
278
|
-
keyword_matches = len(
|
|
279
|
-
|
|
378
|
+
keyword_matches = len(
|
|
379
|
+
keywords.intersection(set(word.lower() for word in domain_keywords))
|
|
380
|
+
)
|
|
381
|
+
|
|
280
382
|
if keyword_matches > 0:
|
|
281
383
|
# Base score from keyword matches
|
|
282
384
|
base_score = min(1.0, keyword_matches / 3.0) # Scale to 0-1
|
|
283
|
-
|
|
385
|
+
|
|
284
386
|
# Boost score for technology indicators
|
|
285
387
|
tech_keywords = self.tech_indicators.get(domain, set())
|
|
286
388
|
tech_matches = len(keywords.intersection(tech_keywords))
|
|
287
389
|
tech_boost = min(0.3, tech_matches * 0.1)
|
|
288
|
-
|
|
390
|
+
|
|
289
391
|
domain_scores[domain] = min(1.0, base_score + tech_boost)
|
|
290
|
-
|
|
392
|
+
|
|
291
393
|
return domain_scores
|
|
292
|
-
|
|
293
|
-
def _extract_keywords_from_doc(self, doc: Doc) ->
|
|
394
|
+
|
|
395
|
+
def _extract_keywords_from_doc(self, doc: Doc) -> set[str]:
|
|
294
396
|
"""Extract meaningful keywords from spaCy document.
|
|
295
|
-
|
|
397
|
+
|
|
296
398
|
Args:
|
|
297
399
|
doc: spaCy processed document
|
|
298
|
-
|
|
400
|
+
|
|
299
401
|
Returns:
|
|
300
402
|
Set of extracted keywords
|
|
301
403
|
"""
|
|
302
404
|
keywords = set()
|
|
303
|
-
|
|
405
|
+
|
|
304
406
|
for token in doc:
|
|
305
|
-
if (
|
|
306
|
-
not token.
|
|
307
|
-
|
|
308
|
-
token.
|
|
407
|
+
if (
|
|
408
|
+
not token.is_stop
|
|
409
|
+
and not token.is_punct
|
|
410
|
+
and len(token.text) > 2
|
|
411
|
+
and token.pos_ in ["NOUN", "PROPN", "ADJ", "VERB"]
|
|
412
|
+
):
|
|
309
413
|
keywords.add(token.lemma_.lower())
|
|
310
|
-
|
|
414
|
+
|
|
311
415
|
# Add named entities
|
|
312
416
|
for ent in doc.ents:
|
|
313
417
|
if len(ent.text) > 2:
|
|
314
418
|
keywords.add(ent.text.lower())
|
|
315
|
-
|
|
419
|
+
|
|
316
420
|
return keywords
|
|
317
|
-
|
|
318
|
-
def _combine_domain_scores(
|
|
319
|
-
|
|
320
|
-
|
|
421
|
+
|
|
422
|
+
def _combine_domain_scores(
|
|
423
|
+
self,
|
|
424
|
+
file_scores: dict[str, float],
|
|
425
|
+
dir_scores: dict[str, float],
|
|
426
|
+
message_scores: dict[str, float],
|
|
427
|
+
) -> dict[str, float]:
|
|
321
428
|
"""Combine scores from different analysis methods.
|
|
322
|
-
|
|
429
|
+
|
|
323
430
|
Args:
|
|
324
431
|
file_scores: Scores from file pattern analysis
|
|
325
|
-
dir_scores: Scores from directory pattern analysis
|
|
432
|
+
dir_scores: Scores from directory pattern analysis
|
|
326
433
|
message_scores: Scores from message content analysis
|
|
327
|
-
|
|
434
|
+
|
|
328
435
|
Returns:
|
|
329
436
|
Combined scores dictionary
|
|
330
437
|
"""
|
|
331
438
|
all_domains = set(file_scores.keys()) | set(dir_scores.keys()) | set(message_scores.keys())
|
|
332
439
|
combined_scores = {}
|
|
333
|
-
|
|
440
|
+
|
|
334
441
|
# Weights for different signal types
|
|
335
442
|
weights = {
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
443
|
+
"file": 0.5, # File patterns are strongest signal
|
|
444
|
+
"directory": 0.3, # Directory patterns are also strong
|
|
445
|
+
"message": 0.2, # Message content provides additional context
|
|
339
446
|
}
|
|
340
|
-
|
|
447
|
+
|
|
341
448
|
for domain in all_domains:
|
|
342
449
|
file_score = file_scores.get(domain, 0.0)
|
|
343
450
|
dir_score = dir_scores.get(domain, 0.0)
|
|
344
451
|
message_score = message_scores.get(domain, 0.0)
|
|
345
|
-
|
|
452
|
+
|
|
346
453
|
# Weighted combination
|
|
347
454
|
combined_score = (
|
|
348
|
-
file_score * weights[
|
|
349
|
-
dir_score * weights[
|
|
350
|
-
message_score * weights[
|
|
455
|
+
file_score * weights["file"]
|
|
456
|
+
+ dir_score * weights["directory"]
|
|
457
|
+
+ message_score * weights["message"]
|
|
351
458
|
)
|
|
352
|
-
|
|
459
|
+
|
|
353
460
|
# Bonus for multiple signal types agreeing
|
|
354
461
|
signal_count = sum(1 for score in [file_score, dir_score, message_score] if score > 0)
|
|
355
462
|
if signal_count > 1:
|
|
356
|
-
combined_score *=
|
|
357
|
-
|
|
463
|
+
combined_score *= 1.0 + (signal_count - 1) * 0.1 # 10% bonus per additional signal
|
|
464
|
+
|
|
358
465
|
if combined_score > 0:
|
|
359
466
|
combined_scores[domain] = min(1.0, combined_score)
|
|
360
|
-
|
|
467
|
+
|
|
361
468
|
return combined_scores
|
|
362
|
-
|
|
363
|
-
def get_domain_statistics(self, files:
|
|
469
|
+
|
|
470
|
+
def get_domain_statistics(self, files: list[str]) -> dict[str, Any]:
|
|
364
471
|
"""Get detailed domain analysis statistics for debugging.
|
|
365
|
-
|
|
472
|
+
|
|
366
473
|
Args:
|
|
367
474
|
files: List of file paths
|
|
368
|
-
|
|
475
|
+
|
|
369
476
|
Returns:
|
|
370
477
|
Dictionary with detailed analysis breakdown
|
|
371
478
|
"""
|
|
372
479
|
stats = {
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
480
|
+
"total_files": len(files),
|
|
481
|
+
"file_analysis": self._analyze_file_patterns(files),
|
|
482
|
+
"directory_analysis": self._analyze_directory_patterns(files),
|
|
483
|
+
"file_extensions": {},
|
|
484
|
+
"directory_breakdown": {},
|
|
378
485
|
}
|
|
379
|
-
|
|
486
|
+
|
|
380
487
|
# File extension breakdown
|
|
381
488
|
extensions = defaultdict(int)
|
|
382
489
|
directories = defaultdict(int)
|
|
383
|
-
|
|
490
|
+
|
|
384
491
|
for file_path in files:
|
|
385
492
|
# Extract extension
|
|
386
|
-
if
|
|
387
|
-
ext = file_path.split(
|
|
493
|
+
if "." in file_path:
|
|
494
|
+
ext = file_path.split(".")[-1].lower()
|
|
388
495
|
extensions[ext] += 1
|
|
389
|
-
|
|
496
|
+
|
|
390
497
|
# Extract directories
|
|
391
|
-
path_parts = file_path.split(
|
|
498
|
+
path_parts = file_path.split("/")
|
|
392
499
|
for part in path_parts[:-1]: # Exclude filename
|
|
393
500
|
if part:
|
|
394
501
|
directories[part] += 1
|
|
395
|
-
|
|
396
|
-
stats[
|
|
397
|
-
stats[
|
|
398
|
-
|
|
399
|
-
return stats
|
|
502
|
+
|
|
503
|
+
stats["file_extensions"] = dict(extensions)
|
|
504
|
+
stats["directory_breakdown"] = dict(directories)
|
|
505
|
+
|
|
506
|
+
return stats
|