gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4108 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +904 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +441 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1193 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/classifiers/domain_classifier.py

@@ -1,29 +1,29 @@
 """Domain classifier for identifying technical domains of commits."""
 
+import importlib.util
 import logging
 import re
 from collections import defaultdict
-from typing import Dict, List, Tuple, Set, Any
-from pathlib import Path
+from typing import Any
 
 from ..models.schemas import DomainConfig
 
-try:
-    import spacy
+# Check if spacy is available without importing it
+SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
+
+if SPACY_AVAILABLE:
     from spacy.tokens import Doc
-    SPACY_AVAILABLE = True
-except ImportError:
-    SPACY_AVAILABLE = False
+else:
     Doc = Any
 
 
 class DomainClassifier:
     """Classify commits by technical domain (frontend, backend, etc.).
-
+
     This classifier determines the technical domain or business area
     affected by a commit by analyzing both the commit message content
     and the patterns of files that were changed.
-
+
     Domains identified:
     - frontend: UI/UX, client-side code
     - backend: Server-side logic, APIs
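
Note on the hunk above: 1.3.6 no longer imports spacy at module load just to probe for it; `importlib.util.find_spec` asks the import machinery whether the package is installed without executing it. A minimal standalone sketch of the same optional-dependency pattern (the fallback alias mirrors the diff):

    import importlib.util
    from typing import Any

    # find_spec returns a ModuleSpec when the package is installed, else None,
    # without running the package's import-time code (unlike `import spacy`).
    SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None

    if SPACY_AVAILABLE:
        from spacy.tokens import Doc
    else:
        Doc = Any  # keeps type hints resolvable when spacy is absent
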
@@ -32,82 +32,180 @@ class DomainClassifier:
     - mobile: Mobile app development
     - devops: CI/CD, build tools, automation
     """
-
+
     def __init__(self, config: DomainConfig):
         """Initialize domain classifier.
-
+
         Args:
             config: Configuration for domain classification
         """
         self.config = config
         self.logger = logging.getLogger(__name__)
-
+
         # Compile file patterns for efficient matching
        self._compile_file_patterns()
-
+
        # Keyword patterns for semantic analysis
        self.keyword_patterns = config.keyword_patterns
-
+
        # Directory patterns that strongly indicate domains
        self.directory_indicators = {
-            'frontend': {
-                'src/components', 'src/pages', 'src/views', 'public', 'assets',
-                'static', 'styles', 'css', 'scss', 'ui', 'components', 'pages'
+            "frontend": {
+                "src/components",
+                "src/pages",
+                "src/views",
+                "public",
+                "assets",
+                "static",
+                "styles",
+                "css",
+                "scss",
+                "ui",
+                "components",
+                "pages",
             },
-            'backend': {
-                'src/controllers', 'src/services', 'src/api', 'api', 'server',
-                'controllers', 'services', 'handlers', 'routes', 'middleware'
+            "backend": {
+                "src/controllers",
+                "src/services",
+                "src/api",
+                "api",
+                "server",
+                "controllers",
+                "services",
+                "handlers",
+                "routes",
+                "middleware",
             },
-            'database': {
-                'migrations', 'models', 'schemas', 'seeds', 'data', 'sql',
-                'database', 'db', 'repositories'
+            "database": {
+                "migrations",
+                "models",
+                "schemas",
+                "seeds",
+                "data",
+                "sql",
+                "database",
+                "db",
+                "repositories",
             },
-            'infrastructure': {
-                'terraform', 'ansible', 'k8s', 'kubernetes', 'helm', 'charts',
-                'infrastructure', 'deploy', 'deployment', 'ops'
+            "infrastructure": {
+                "terraform",
+                "ansible",
+                "k8s",
+                "kubernetes",
+                "helm",
+                "charts",
+                "infrastructure",
+                "deploy",
+                "deployment",
+                "ops",
             },
-            'mobile': {
-                'android', 'ios', 'mobile', 'app', 'native', 'react-native',
-                'flutter', 'swift', 'kotlin'
+            "mobile": {
+                "android",
+                "ios",
+                "mobile",
+                "app",
+                "native",
+                "react-native",
+                "flutter",
+                "swift",
+                "kotlin",
+            },
+            "devops": {
+                ".github",
+                ".gitlab",
+                "ci",
+                "cd",
+                "scripts",
+                "build",
+                "docker",
+                "jenkins",
+                "actions",
+                "workflows",
             },
-            'devops': {
-                '.github', '.gitlab', 'ci', 'cd', 'scripts', 'build', 'docker',
-                'jenkins', 'actions', 'workflows'
-            }
         }
-
+
         # Technology stack indicators
         self.tech_indicators = {
-            'frontend': {
-                'react', 'vue', 'angular', 'svelte', 'jquery', 'bootstrap',
-                'tailwind', 'css', 'html', 'javascript', 'typescript', 'jsx', 'tsx'
+            "frontend": {
+                "react",
+                "vue",
+                "angular",
+                "svelte",
+                "jquery",
+                "bootstrap",
+                "tailwind",
+                "css",
+                "html",
+                "javascript",
+                "typescript",
+                "jsx",
+                "tsx",
             },
-            'backend': {
-                'django', 'flask', 'fastapi', 'express', 'spring', 'rails',
-                'laravel', 'api', 'endpoint', 'service', 'controller'
+            "backend": {
+                "django",
+                "flask",
+                "fastapi",
+                "express",
+                "spring",
+                "rails",
+                "laravel",
+                "api",
+                "endpoint",
+                "service",
+                "controller",
             },
-            'database': {
-                'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
-                'migration', 'schema', 'query', 'orm', 'sql'
+            "database": {
+                "mysql",
+                "postgresql",
+                "mongodb",
+                "redis",
+                "elasticsearch",
+                "migration",
+                "schema",
+                "query",
+                "orm",
+                "sql",
             },
-            'infrastructure': {
-                'aws', 'gcp', 'azure', 'docker', 'kubernetes', 'terraform',
-                'ansible', 'helm', 'nginx', 'apache'
+            "infrastructure": {
+                "aws",
+                "gcp",
+                "azure",
+                "docker",
+                "kubernetes",
+                "terraform",
+                "ansible",
+                "helm",
+                "nginx",
+                "apache",
             },
-            'mobile': {
-                'android', 'ios', 'swift', 'kotlin', 'flutter', 'react-native',
-                'xamarin', 'cordova', 'ionic'
+            "mobile": {
+                "android",
+                "ios",
+                "swift",
+                "kotlin",
+                "flutter",
+                "react-native",
+                "xamarin",
+                "cordova",
+                "ionic",
+            },
+            "devops": {
+                "jenkins",
+                "gitlab-ci",
+                "github-actions",
+                "circleci",
+                "travis",
+                "docker",
+                "kubernetes",
+                "helm",
+                "terraform",
             },
-            'devops': {
-                'jenkins', 'gitlab-ci', 'github-actions', 'circleci', 'travis',
-                'docker', 'kubernetes', 'helm', 'terraform'
-            }
         }
-
+
     def _compile_file_patterns(self) -> None:
         """Compile file extension patterns for efficient matching."""
         self.compiled_file_patterns = {}
-
+
         for domain, patterns in self.config.file_patterns.items():
             compiled_patterns = []
             for pattern in patterns:
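
The hunk above is formatting-only: the indicator sets are unchanged, just rewritten one element per line with double quotes and trailing commas (Black's style), with the `devops` entries moved ahead of the closing brace. Since the indicators are plain sets, matching a path against them is a cheap membership test per segment. A hedged sketch of that lookup, using a hypothetical `domains_for` helper and a trimmed-down table:

    # Hypothetical standalone lookup mirroring the class's data layout.
    directory_indicators = {
        "frontend": {"src/components", "ui", "styles"},
        "devops": {".github", "workflows", "docker"},
    }

    def domains_for(path: str) -> set[str]:
        # Normalize separators the way _analyze_directory_patterns does.
        parts = path.replace("\\", "/").lower().split("/")
        return {
            domain
            for domain, indicators in directory_indicators.items()
            if any(part in indicators for part in parts)
        }

    print(domains_for(".github/workflows/ci.yml"))  # {'devops'}
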
@@ -116,116 +214,118 @@ class DomainClassifier:
                     regex_pattern = self._glob_to_regex(pattern)
                     compiled_patterns.append(re.compile(regex_pattern, re.IGNORECASE))
                 except re.error as e:
-                    self.logger.warning(f"Invalid file pattern '{pattern}' for domain {domain}: {e}")
-
+                    self.logger.warning(
+                        f"Invalid file pattern '{pattern}' for domain {domain}: {e}"
+                    )
+
             self.compiled_file_patterns[domain] = compiled_patterns
-
+
     def _glob_to_regex(self, pattern: str) -> str:
         """Convert glob pattern to regex.
-
+
         Args:
             pattern: Glob pattern (e.g., '*.js', '**/models/**')
-
+
         Returns:
             Equivalent regex pattern
         """
         # Simple glob to regex conversion
-        pattern = pattern.replace('.', r'\.')
-        pattern = pattern.replace('*', '.*')
-        pattern = pattern.replace('?', '.')
-        pattern = f'^{pattern}$'
+        pattern = pattern.replace(".", r"\.")
+        pattern = pattern.replace("*", ".*")
+        pattern = pattern.replace("?", ".")
+        pattern = f"^{pattern}$"
         return pattern
-
-    def classify(self, message: str, doc: Doc, files: List[str]) -> Tuple[str, float]:
+
+    def classify(self, message: str, doc: Doc, files: list[str]) -> tuple[str, float]:
         """Classify commit domain with confidence score.
-
+
         Args:
             message: Commit message
             doc: spaCy processed document (may be None)
             files: List of changed files
-
+
         Returns:
             Tuple of (domain, confidence_score)
         """
         if not message and not files:
-            return 'unknown', 0.0
-
+            return "unknown", 0.0
+
         # Analyze file patterns (primary signal)
         file_scores = self._analyze_file_patterns(files)
-
+
         # Analyze directory patterns
         dir_scores = self._analyze_directory_patterns(files)
-
+
         # Analyze message content
         message_scores = self._analyze_message_content(message, doc)
-
+
         # Combine all signals
         combined_scores = self._combine_domain_scores(file_scores, dir_scores, message_scores)
-
+
         if not combined_scores:
-            return 'unknown', 0.0
-
+            return "unknown", 0.0
+
         # Select best domain
         best_domain = max(combined_scores.keys(), key=lambda k: combined_scores[k])
         confidence = combined_scores[best_domain]
-
+
         # Apply confidence threshold
         if confidence < self.config.min_confidence:
-            return 'unknown', confidence
-
+            return "unknown", confidence
+
         return best_domain, confidence
-
-    def _analyze_file_patterns(self, files: List[str]) -> Dict[str, float]:
+
+    def _analyze_file_patterns(self, files: list[str]) -> dict[str, float]:
         """Analyze file patterns to determine domain.
-
+
         Args:
             files: List of file paths
-
+
         Returns:
             Dictionary of domain -> confidence_score
         """
         if not files:
             return {}
-
+
         domain_matches = defaultdict(int)
-
+
         for file_path in files:
             for domain, patterns in self.compiled_file_patterns.items():
                 for pattern in patterns:
                     if pattern.search(file_path):
                         domain_matches[domain] += 1
                         break  # Don't double-count same file for same domain
-
+
         # Convert to confidence scores
         scores = {}
         total_files = len(files)
-
+
         for domain, matches in domain_matches.items():
             # Confidence based on proportion of matching files
             confidence = matches / total_files
             scores[domain] = min(1.0, confidence * 2)  # Boost confidence for strong signals
-
+
         return scores
-
-    def _analyze_directory_patterns(self, files: List[str]) -> Dict[str, float]:
+
+    def _analyze_directory_patterns(self, files: list[str]) -> dict[str, float]:
         """Analyze directory patterns for domain signals.
-
+
         Args:
             files: List of file paths
-
+
         Returns:
             Dictionary of domain -> confidence_score
         """
         if not files:
             return {}
-
+
         domain_scores = defaultdict(float)
-
+
         for file_path in files:
             # Normalize path separators and convert to lowercase
-            normalized_path = file_path.replace('\\', '/').lower()
-            path_parts = normalized_path.split('/')
-
+            normalized_path = file_path.replace("\\", "/").lower()
+            path_parts = normalized_path.split("/")
+
             # Check each domain's directory indicators
             for domain, indicators in self.directory_indicators.items():
                 for indicator in indicators:
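
`_glob_to_regex` in the hunk above is a deliberately simple conversion: escape dots, map `*` to `.*` and `?` to `.`, then anchor. One consequence worth noting is that `*` also matches across `/`, unlike shell globbing. A quick check reproducing the method's steps:

    import re

    def glob_to_regex(pattern: str) -> str:
        # Same substitutions the method applies, in the same order
        # (dots are escaped first, so the dots introduced by '*' survive).
        pattern = pattern.replace(".", r"\.")
        pattern = pattern.replace("*", ".*")
        pattern = pattern.replace("?", ".")
        return f"^{pattern}$"

    print(glob_to_regex("*.js"))  # ^.*\.js$
    print(bool(re.match(glob_to_regex("*.js"), "src/app.js")))  # True: '*' crosses '/'
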
@@ -236,164 +336,171 @@ class DomainClassifier:
                     # Also check full path contains indicator
                     elif indicator in normalized_path:
                         domain_scores[domain] += 0.5
-
+
         # Normalize scores
         scores = {}
         max_score = max(domain_scores.values()) if domain_scores else 0
-
+
         if max_score > 0:
             for domain, score in domain_scores.items():
                 scores[domain] = min(1.0, score / max_score)
-
+
         return scores
-
-    def _analyze_message_content(self, message: str, doc: Doc) -> Dict[str, float]:
+
+    def _analyze_message_content(self, message: str, doc: Doc) -> dict[str, float]:
         """Analyze commit message content for domain keywords.
-
+
         Args:
             message: Commit message
             doc: spaCy processed document (may be None)
-
+
         Returns:
             Dictionary of domain -> confidence_score
         """
         if not message:
             return {}
-
+
         # Convert message to lowercase for analysis
         message_lower = message.lower()
-
+
         # Extract keywords from message
         if SPACY_AVAILABLE and doc:
             # Use spaCy for better keyword extraction
             keywords = self._extract_keywords_from_doc(doc)
         else:
             # Fallback to simple word extraction
-            keywords = set(re.findall(r'\b\w+\b', message_lower))
-
+            keywords = set(re.findall(r"\b\w+\b", message_lower))
+
         # Score domains based on keyword matches
         domain_scores = {}
-
+
         for domain, domain_keywords in self.keyword_patterns.items():
-            keyword_matches = len(keywords.intersection(set(word.lower() for word in domain_keywords)))
-
+            keyword_matches = len(
+                keywords.intersection(set(word.lower() for word in domain_keywords))
+            )
+
             if keyword_matches > 0:
                 # Base score from keyword matches
                 base_score = min(1.0, keyword_matches / 3.0)  # Scale to 0-1
-
+
                 # Boost score for technology indicators
                 tech_keywords = self.tech_indicators.get(domain, set())
                 tech_matches = len(keywords.intersection(tech_keywords))
                 tech_boost = min(0.3, tech_matches * 0.1)
-
+
                 domain_scores[domain] = min(1.0, base_score + tech_boost)
-
+
         return domain_scores
-
-    def _extract_keywords_from_doc(self, doc: Doc) -> Set[str]:
+
+    def _extract_keywords_from_doc(self, doc: Doc) -> set[str]:
         """Extract meaningful keywords from spaCy document.
-
+
         Args:
             doc: spaCy processed document
-
+
         Returns:
             Set of extracted keywords
         """
         keywords = set()
-
+
         for token in doc:
-            if (not token.is_stop and
-                not token.is_punct and
-                len(token.text) > 2 and
-                token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'VERB']):
+            if (
+                not token.is_stop
+                and not token.is_punct
+                and len(token.text) > 2
+                and token.pos_ in ["NOUN", "PROPN", "ADJ", "VERB"]
+            ):
                 keywords.add(token.lemma_.lower())
-
+
         # Add named entities
         for ent in doc.ents:
             if len(ent.text) > 2:
                 keywords.add(ent.text.lower())
-
+
         return keywords
-
-    def _combine_domain_scores(self, file_scores: Dict[str, float],
-                               dir_scores: Dict[str, float],
-                               message_scores: Dict[str, float]) -> Dict[str, float]:
+
+    def _combine_domain_scores(
+        self,
+        file_scores: dict[str, float],
+        dir_scores: dict[str, float],
+        message_scores: dict[str, float],
+    ) -> dict[str, float]:
         """Combine scores from different analysis methods.
-
+
         Args:
             file_scores: Scores from file pattern analysis
-            dir_scores: Scores from directory pattern analysis
+            dir_scores: Scores from directory pattern analysis
             message_scores: Scores from message content analysis
-
+
         Returns:
             Combined scores dictionary
         """
         all_domains = set(file_scores.keys()) | set(dir_scores.keys()) | set(message_scores.keys())
         combined_scores = {}
-
+
         # Weights for different signal types
         weights = {
-            'file': 0.5,  # File patterns are strongest signal
-            'directory': 0.3,  # Directory patterns are also strong
-            'message': 0.2  # Message content provides additional context
+            "file": 0.5,  # File patterns are strongest signal
+            "directory": 0.3,  # Directory patterns are also strong
+            "message": 0.2,  # Message content provides additional context
         }
-
+
         for domain in all_domains:
             file_score = file_scores.get(domain, 0.0)
             dir_score = dir_scores.get(domain, 0.0)
             message_score = message_scores.get(domain, 0.0)
-
+
             # Weighted combination
             combined_score = (
-                file_score * weights['file'] +
-                dir_score * weights['directory'] +
-                message_score * weights['message']
+                file_score * weights["file"]
+                + dir_score * weights["directory"]
+                + message_score * weights["message"]
             )
-
+
             # Bonus for multiple signal types agreeing
             signal_count = sum(1 for score in [file_score, dir_score, message_score] if score > 0)
             if signal_count > 1:
-                combined_score *= (1.0 + (signal_count - 1) * 0.1)  # 10% bonus per additional signal
-
+                combined_score *= 1.0 + (signal_count - 1) * 0.1  # 10% bonus per additional signal
+
             if combined_score > 0:
                 combined_scores[domain] = min(1.0, combined_score)
-
+
         return combined_scores
-
-    def get_domain_statistics(self, files: List[str]) -> Dict[str, Any]:
+
+    def get_domain_statistics(self, files: list[str]) -> dict[str, Any]:
         """Get detailed domain analysis statistics for debugging.
-
+
         Args:
             files: List of file paths
-
+
         Returns:
             Dictionary with detailed analysis breakdown
         """
         stats = {
-            'total_files': len(files),
-            'file_analysis': self._analyze_file_patterns(files),
-            'directory_analysis': self._analyze_directory_patterns(files),
-            'file_extensions': {},
-            'directory_breakdown': {},
+            "total_files": len(files),
+            "file_analysis": self._analyze_file_patterns(files),
+            "directory_analysis": self._analyze_directory_patterns(files),
+            "file_extensions": {},
+            "directory_breakdown": {},
         }
-
+
         # File extension breakdown
         extensions = defaultdict(int)
         directories = defaultdict(int)
-
+
         for file_path in files:
             # Extract extension
-            if '.' in file_path:
-                ext = file_path.split('.')[-1].lower()
+            if "." in file_path:
+                ext = file_path.split(".")[-1].lower()
                 extensions[ext] += 1
-
+
             # Extract directories
-            path_parts = file_path.split('/')
+            path_parts = file_path.split("/")
             for part in path_parts[:-1]:  # Exclude filename
                 if part:
                     directories[part] += 1
-
-        stats['file_extensions'] = dict(extensions)
-        stats['directory_breakdown'] = dict(directories)
-
-        return stats
+
+        stats["file_extensions"] = dict(extensions)
+        stats["directory_breakdown"] = dict(directories)
+
+        return stats
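
For reference, `_combine_domain_scores` above blends the three signals with fixed weights (0.5 file, 0.3 directory, 0.2 message), adds a 10% bonus per extra agreeing signal, and caps the result at 1.0. A worked example of that arithmetic with assumed scores:

    weights = {"file": 0.5, "directory": 0.3, "message": 0.2}

    # Suppose 'backend' scored 0.8 on file patterns, 0.5 on directories,
    # and 0.0 on the commit message (illustrative values).
    file_score, dir_score, message_score = 0.8, 0.5, 0.0

    combined = (
        file_score * weights["file"]
        + dir_score * weights["directory"]
        + message_score * weights["message"]
    )  # 0.40 + 0.15 + 0.00 = 0.55

    signals = sum(1 for s in (file_score, dir_score, message_score) if s > 0)
    if signals > 1:
        combined *= 1.0 + (signals - 1) * 0.1  # two signals agree -> 0.55 * 1.1

    print(min(1.0, round(combined, 3)))  # 0.605
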