gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects each version as it appears in its public registry.
- gitflow_analytics/__init__.py +11 -11
- gitflow_analytics/_version.py +2 -2
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4490 -378
- gitflow_analytics/cli_rich.py +503 -0
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -398
- gitflow_analytics/core/analyzer.py +1320 -172
- gitflow_analytics/core/branch_mapper.py +132 -132
- gitflow_analytics/core/cache.py +1554 -175
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +571 -185
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/base.py +13 -11
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +77 -59
- gitflow_analytics/extractors/tickets.py +841 -89
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +258 -87
- gitflow_analytics/integrations/jira_integration.py +572 -123
- gitflow_analytics/integrations/orchestrator.py +206 -82
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +542 -179
- gitflow_analytics/models/database.py +986 -59
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +29 -0
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
- gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
- gitflow_analytics/qualitative/core/__init__.py +13 -0
- gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
- gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
- gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
- gitflow_analytics/qualitative/core/processor.py +673 -0
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +25 -0
- gitflow_analytics/qualitative/models/schemas.py +306 -0
- gitflow_analytics/qualitative/utils/__init__.py +13 -0
- gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
- gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
- gitflow_analytics/qualitative/utils/metrics.py +361 -0
- gitflow_analytics/qualitative/utils/text_processing.py +285 -0
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +550 -18
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1700 -216
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2289 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +5 -0
- gitflow_analytics/tui/app.py +724 -0
- gitflow_analytics/tui/screens/__init__.py +8 -0
- gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
- gitflow_analytics/tui/screens/configuration_screen.py +523 -0
- gitflow_analytics/tui/screens/loading_screen.py +348 -0
- gitflow_analytics/tui/screens/main_screen.py +321 -0
- gitflow_analytics/tui/screens/results_screen.py +722 -0
- gitflow_analytics/tui/widgets/__init__.py +7 -0
- gitflow_analytics/tui/widgets/data_table.py +255 -0
- gitflow_analytics/tui/widgets/export_modal.py +301 -0
- gitflow_analytics/tui/widgets/progress_widget.py +187 -0
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
- gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/classifiers/domain_classifier.py (new file)
@@ -0,0 +1,506 @@

```python
"""Domain classifier for identifying technical domains of commits."""

import importlib.util
import logging
import re
from collections import defaultdict
from typing import Any

from ..models.schemas import DomainConfig

# Check if spacy is available without importing it
SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None

if SPACY_AVAILABLE:
    from spacy.tokens import Doc
else:
    Doc = Any


class DomainClassifier:
    """Classify commits by technical domain (frontend, backend, etc.).

    This classifier determines the technical domain or business area
    affected by a commit by analyzing both the commit message content
    and the patterns of files that were changed.

    Domains identified:
    - frontend: UI/UX, client-side code
    - backend: Server-side logic, APIs
    - database: Data models, migrations, queries
    - infrastructure: Deployment, configuration, DevOps
    - mobile: Mobile app development
    - devops: CI/CD, build tools, automation
    """

    def __init__(self, config: DomainConfig):
        """Initialize domain classifier.

        Args:
            config: Configuration for domain classification
        """
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Compile file patterns for efficient matching
        self._compile_file_patterns()

        # Keyword patterns for semantic analysis
        self.keyword_patterns = config.keyword_patterns

        # Directory patterns that strongly indicate domains
        self.directory_indicators = {
            "frontend": {
                "src/components", "src/pages", "src/views", "public", "assets",
                "static", "styles", "css", "scss", "ui", "components", "pages",
            },
            "backend": {
                "src/controllers", "src/services", "src/api", "api", "server",
                "controllers", "services", "handlers", "routes", "middleware",
            },
            "database": {
                "migrations", "models", "schemas", "seeds", "data", "sql",
                "database", "db", "repositories",
            },
            "infrastructure": {
                "terraform", "ansible", "k8s", "kubernetes", "helm", "charts",
                "infrastructure", "deploy", "deployment", "ops",
            },
            "mobile": {
                "android", "ios", "mobile", "app", "native", "react-native",
                "flutter", "swift", "kotlin",
            },
            "devops": {
                ".github", ".gitlab", "ci", "cd", "scripts", "build", "docker",
                "jenkins", "actions", "workflows",
            },
        }

        # Technology stack indicators
        self.tech_indicators = {
            "frontend": {
                "react", "vue", "angular", "svelte", "jquery", "bootstrap",
                "tailwind", "css", "html", "javascript", "typescript", "jsx", "tsx",
            },
            "backend": {
                "django", "flask", "fastapi", "express", "spring", "rails",
                "laravel", "api", "endpoint", "service", "controller",
            },
            "database": {
                "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
                "migration", "schema", "query", "orm", "sql",
            },
            "infrastructure": {
                "aws", "gcp", "azure", "docker", "kubernetes", "terraform",
                "ansible", "helm", "nginx", "apache",
            },
            "mobile": {
                "android", "ios", "swift", "kotlin", "flutter", "react-native",
                "xamarin", "cordova", "ionic",
            },
            "devops": {
                "jenkins", "gitlab-ci", "github-actions", "circleci", "travis",
                "docker", "kubernetes", "helm", "terraform",
            },
        }

    def _compile_file_patterns(self) -> None:
        """Compile file extension patterns for efficient matching."""
        self.compiled_file_patterns = {}

        for domain, patterns in self.config.file_patterns.items():
            compiled_patterns = []
            for pattern in patterns:
                try:
                    # Convert glob patterns to regex
                    regex_pattern = self._glob_to_regex(pattern)
                    compiled_patterns.append(re.compile(regex_pattern, re.IGNORECASE))
                except re.error as e:
                    self.logger.warning(
                        f"Invalid file pattern '{pattern}' for domain {domain}: {e}"
                    )

            self.compiled_file_patterns[domain] = compiled_patterns

    def _glob_to_regex(self, pattern: str) -> str:
        """Convert glob pattern to regex.

        Args:
            pattern: Glob pattern (e.g., '*.js', '**/models/**')

        Returns:
            Equivalent regex pattern
        """
        # Simple glob to regex conversion
        pattern = pattern.replace(".", r"\.")
        pattern = pattern.replace("*", ".*")
        pattern = pattern.replace("?", ".")
        pattern = f"^{pattern}$"
        return pattern

    def classify(self, message: str, doc: Doc, files: list[str]) -> tuple[str, float]:
        """Classify commit domain with confidence score.

        Args:
            message: Commit message
            doc: spaCy processed document (may be None)
            files: List of changed files

        Returns:
            Tuple of (domain, confidence_score)
        """
        if not message and not files:
            return "unknown", 0.0

        # Analyze file patterns (primary signal)
        file_scores = self._analyze_file_patterns(files)

        # Analyze directory patterns
        dir_scores = self._analyze_directory_patterns(files)

        # Analyze message content
        message_scores = self._analyze_message_content(message, doc)

        # Combine all signals
        combined_scores = self._combine_domain_scores(file_scores, dir_scores, message_scores)

        if not combined_scores:
            return "unknown", 0.0

        # Select best domain
        best_domain = max(combined_scores.keys(), key=lambda k: combined_scores[k])
        confidence = combined_scores[best_domain]

        # Apply confidence threshold
        if confidence < self.config.min_confidence:
            return "unknown", confidence

        return best_domain, confidence

    def _analyze_file_patterns(self, files: list[str]) -> dict[str, float]:
        """Analyze file patterns to determine domain.

        Args:
            files: List of file paths

        Returns:
            Dictionary of domain -> confidence_score
        """
        if not files:
            return {}

        domain_matches = defaultdict(int)

        for file_path in files:
            for domain, patterns in self.compiled_file_patterns.items():
                for pattern in patterns:
                    if pattern.search(file_path):
                        domain_matches[domain] += 1
                        break  # Don't double-count same file for same domain

        # Convert to confidence scores
        scores = {}
        total_files = len(files)

        for domain, matches in domain_matches.items():
            # Confidence based on proportion of matching files
            confidence = matches / total_files
            scores[domain] = min(1.0, confidence * 2)  # Boost confidence for strong signals

        return scores

    def _analyze_directory_patterns(self, files: list[str]) -> dict[str, float]:
        """Analyze directory patterns for domain signals.

        Args:
            files: List of file paths

        Returns:
            Dictionary of domain -> confidence_score
        """
        if not files:
            return {}

        domain_scores = defaultdict(float)

        for file_path in files:
            # Normalize path separators and convert to lowercase
            normalized_path = file_path.replace("\\", "/").lower()
            path_parts = normalized_path.split("/")

            # Check each domain's directory indicators
            for domain, indicators in self.directory_indicators.items():
                for indicator in indicators:
                    # Check if indicator appears in any part of the path
                    if any(indicator in part for part in path_parts):
                        domain_scores[domain] += 1.0
                        break
                    # Also check full path contains indicator
                    elif indicator in normalized_path:
                        domain_scores[domain] += 0.5

        # Normalize scores
        scores = {}
        max_score = max(domain_scores.values()) if domain_scores else 0

        if max_score > 0:
            for domain, score in domain_scores.items():
                scores[domain] = min(1.0, score / max_score)

        return scores

    def _analyze_message_content(self, message: str, doc: Doc) -> dict[str, float]:
        """Analyze commit message content for domain keywords.

        Args:
            message: Commit message
            doc: spaCy processed document (may be None)

        Returns:
            Dictionary of domain -> confidence_score
        """
        if not message:
            return {}

        # Convert message to lowercase for analysis
        message_lower = message.lower()

        # Extract keywords from message
        if SPACY_AVAILABLE and doc:
            # Use spaCy for better keyword extraction
            keywords = self._extract_keywords_from_doc(doc)
        else:
            # Fallback to simple word extraction
            keywords = set(re.findall(r"\b\w+\b", message_lower))

        # Score domains based on keyword matches
        domain_scores = {}

        for domain, domain_keywords in self.keyword_patterns.items():
            keyword_matches = len(
                keywords.intersection(set(word.lower() for word in domain_keywords))
            )

            if keyword_matches > 0:
                # Base score from keyword matches
                base_score = min(1.0, keyword_matches / 3.0)  # Scale to 0-1

                # Boost score for technology indicators
                tech_keywords = self.tech_indicators.get(domain, set())
                tech_matches = len(keywords.intersection(tech_keywords))
                tech_boost = min(0.3, tech_matches * 0.1)

                domain_scores[domain] = min(1.0, base_score + tech_boost)

        return domain_scores

    def _extract_keywords_from_doc(self, doc: Doc) -> set[str]:
        """Extract meaningful keywords from spaCy document.

        Args:
            doc: spaCy processed document

        Returns:
            Set of extracted keywords
        """
        keywords = set()

        for token in doc:
            if (
                not token.is_stop
                and not token.is_punct
                and len(token.text) > 2
                and token.pos_ in ["NOUN", "PROPN", "ADJ", "VERB"]
            ):
                keywords.add(token.lemma_.lower())

        # Add named entities
        for ent in doc.ents:
            if len(ent.text) > 2:
                keywords.add(ent.text.lower())

        return keywords

    def _combine_domain_scores(
        self,
        file_scores: dict[str, float],
        dir_scores: dict[str, float],
        message_scores: dict[str, float],
    ) -> dict[str, float]:
        """Combine scores from different analysis methods.

        Args:
            file_scores: Scores from file pattern analysis
            dir_scores: Scores from directory pattern analysis
            message_scores: Scores from message content analysis

        Returns:
            Combined scores dictionary
        """
        all_domains = set(file_scores.keys()) | set(dir_scores.keys()) | set(message_scores.keys())
        combined_scores = {}

        # Weights for different signal types
        weights = {
            "file": 0.5,  # File patterns are strongest signal
            "directory": 0.3,  # Directory patterns are also strong
            "message": 0.2,  # Message content provides additional context
        }

        for domain in all_domains:
            file_score = file_scores.get(domain, 0.0)
            dir_score = dir_scores.get(domain, 0.0)
            message_score = message_scores.get(domain, 0.0)

            # Weighted combination
            combined_score = (
                file_score * weights["file"]
                + dir_score * weights["directory"]
                + message_score * weights["message"]
            )

            # Bonus for multiple signal types agreeing
            signal_count = sum(1 for score in [file_score, dir_score, message_score] if score > 0)
            if signal_count > 1:
                combined_score *= 1.0 + (signal_count - 1) * 0.1  # 10% bonus per additional signal

            if combined_score > 0:
                combined_scores[domain] = min(1.0, combined_score)

        return combined_scores

    def get_domain_statistics(self, files: list[str]) -> dict[str, Any]:
        """Get detailed domain analysis statistics for debugging.

        Args:
            files: List of file paths

        Returns:
            Dictionary with detailed analysis breakdown
        """
        stats = {
            "total_files": len(files),
            "file_analysis": self._analyze_file_patterns(files),
            "directory_analysis": self._analyze_directory_patterns(files),
            "file_extensions": {},
            "directory_breakdown": {},
        }

        # File extension breakdown
        extensions = defaultdict(int)
        directories = defaultdict(int)

        for file_path in files:
            # Extract extension
            if "." in file_path:
                ext = file_path.split(".")[-1].lower()
                extensions[ext] += 1

            # Extract directories
            path_parts = file_path.split("/")
            for part in path_parts[:-1]:  # Exclude filename
                if part:
                    directories[part] += 1

        stats["file_extensions"] = dict(extensions)
        stats["directory_breakdown"] = dict(directories)

        return stats
```
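As a rough illustration of how the pieces above fit together, the sketch below instantiates the classifier and classifies one commit. It is a minimal sketch, not the package's documented API: the real `DomainConfig` lives in `gitflow_analytics.qualitative.models.schemas` and its constructor may differ, so a `SimpleNamespace` stand-in carrying only the three attributes the classifier actually reads (`file_patterns`, `keyword_patterns`, `min_confidence`) is used here, and the example patterns, keywords, and threshold are invented for the demo.

```python
from types import SimpleNamespace

# Module path confirmed by the file listing above.
from gitflow_analytics.qualitative.classifiers.domain_classifier import DomainClassifier

# Hypothetical stand-in for DomainConfig: only the attributes the
# classifier reads are provided, with made-up values.
config = SimpleNamespace(
    file_patterns={
        "frontend": ["*.tsx", "*.css"],
        "backend": ["*.py", "**/api/**"],
    },
    keyword_patterns={
        "frontend": ["ui", "react", "style"],
        "backend": ["api", "endpoint", "service"],
    },
    min_confidence=0.3,
)

classifier = DomainClassifier(config)
domain, confidence = classifier.classify(
    message="Fix button styling in react component",
    doc=None,  # no spaCy doc supplied: the regex keyword fallback is used
    files=["src/components/Button.tsx", "src/styles/button.css"],
)
print(domain, round(confidence, 2))  # frontend 1.0 for this input
```

Tracing the arithmetic: both files match the frontend glob patterns (file score 1.0), the `components` and `styles` directory indicators fire (directory score 1.0), and the single keyword hit on "react" plus its 0.1 tech-indicator boost gives a message score of about 0.43. The weighted sum 1.0 * 0.5 + 1.0 * 0.3 + 0.43 * 0.2 is roughly 0.89, the three-signal agreement bonus multiplies it by 1.2, and the result is capped at 1.0, comfortably above the configured `min_confidence`.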