sourcefire 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sourcefire/__init__.py +0 -0
- sourcefire/api/__init__.py +0 -0
- sourcefire/api/models.py +24 -0
- sourcefire/api/routes.py +166 -0
- sourcefire/chain/__init__.py +0 -0
- sourcefire/chain/prompts.py +195 -0
- sourcefire/chain/rag_chain.py +967 -0
- sourcefire/cli.py +293 -0
- sourcefire/config.py +148 -0
- sourcefire/db.py +196 -0
- sourcefire/indexer/__init__.py +0 -0
- sourcefire/indexer/embeddings.py +27 -0
- sourcefire/indexer/language_profiles.py +448 -0
- sourcefire/indexer/metadata.py +289 -0
- sourcefire/indexer/pipeline.py +406 -0
- sourcefire/init.py +189 -0
- sourcefire/prompts/system.md +28 -0
- sourcefire/retriever/__init__.py +0 -0
- sourcefire/retriever/graph.py +162 -0
- sourcefire/retriever/search.py +86 -0
- sourcefire/static/.DS_Store +0 -0
- sourcefire/static/app.js +414 -0
- sourcefire/static/index.html +102 -0
- sourcefire/static/styles.css +607 -0
- sourcefire/watcher.py +105 -0
- sourcefire-0.2.0.dist-info/METADATA +145 -0
- sourcefire-0.2.0.dist-info/RECORD +31 -0
- sourcefire-0.2.0.dist-info/WHEEL +5 -0
- sourcefire-0.2.0.dist-info/entry_points.txt +2 -0
- sourcefire-0.2.0.dist-info/licenses/LICENSE +21 -0
- sourcefire-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
"""Language profiles for language-agnostic code analysis.
|
|
2
|
+
|
|
3
|
+
Each profile defines the patterns, AST node types, and regex needed to
|
|
4
|
+
parse imports, exports, and chunk boundaries for a specific language.
|
|
5
|
+
Auto-detection scans the project directory to count file extensions and
|
|
6
|
+
picks the dominant language.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
from collections import Counter
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class LanguageProfile:
    """Declarative description of how to analyse one programming language.

    Only ``language`` and ``file_extensions`` are required. Every other field
    defaults to an empty container, ``None``, or a neutral string, so a
    profile only needs to spell out the settings it actually uses.
    """

    # --- Identity (required, positional order matters) ----------------------
    language: str
    file_extensions: list[str]

    # --- File collection: glob-style include / exclude patterns -------------
    include_patterns: list[str] = field(default_factory=list)
    exclude_patterns: list[str] = field(default_factory=list)

    # --- Tree-sitter configuration (optional) -------------------------------
    # Grammar name plus the node type names that mark imports, exported
    # symbols, and chunk boundaries.
    tree_sitter_language: Optional[str] = None
    import_node_types: list[str] = field(default_factory=list)
    export_node_types: list[str] = field(default_factory=list)
    boundary_node_types: list[str] = field(default_factory=list)
    string_literal_type: str = "string_literal"

    # --- Regex fallbacks for the same three concerns ------------------------
    import_pattern: Optional[str] = None
    export_pattern: Optional[str] = None
    boundary_pattern: Optional[str] = None

    # --- Metadata inferred from file paths (optional) -----------------------
    layer_parts: list[str] = field(default_factory=list)
    feature_regex: Optional[str] = None
    # (filename suffix, file type) pairs, e.g. ("_service", "service").
    file_type_suffixes: list[tuple[str, str]] = field(default_factory=list)
    # Path fragment -> file type, e.g. {"/widgets/": "widget"}.
    directory_type_patterns: dict[str, str] = field(default_factory=dict)

    # --- Import graph -------------------------------------------------------
    # Import prefixes treated as external, e.g. ("package:", "dart:").
    external_import_prefixes: tuple[str, ...] = ()

    # --- Presentation -------------------------------------------------------
    # Language tag used when syntax-highlighting code blocks.
    highlight_language: str = "text"

    # --- Stack trace / file reference regexes -------------------------------
    file_ref_patterns: list[str] = field(default_factory=list)

    # --- Extensions covered by the keyword search tool ----------------------
    searchable_extensions: list[str] = field(default_factory=list)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Built-in language profiles
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
# Profile for Dart projects (paths such as lib/, features/ and pubspec.yaml
# are taken from the patterns below).
DART_PROFILE = LanguageProfile(
    language="dart",
    file_extensions=[".dart"],
    # File collection: library sources plus the two project config files;
    # generated sources (*.g.dart, *.freezed.dart) and tests are excluded.
    include_patterns=["lib/**/*.dart", "pubspec.yaml", "analysis_options.yaml"],
    exclude_patterns=["*.g.dart", "*.freezed.dart", "test/**"],
    # Tree-sitter configuration.
    tree_sitter_language="dart",
    import_node_types=["import_specification"],
    export_node_types=[
        "class_definition",
        "enum_declaration",
        "mixin_declaration",
        "extension_declaration",
    ],
    boundary_node_types=[
        "class_definition",
        "enum_declaration",
        "mixin_declaration",
        "extension_declaration",
    ],
    # Regex fallbacks. Dart string literals may use either quote style, so
    # the import URI is accepted in single OR double quotes (previously only
    # single-quoted imports were recognised). Group 1 is still the URI.
    import_pattern=r"""import\s+['"]([^'"]+)['"]""",
    export_pattern=r"""^(?:abstract\s+)?(?:class|enum|mixin|extension)\s+(\w+)""",
    boundary_pattern=r"""^(?:abstract\s+)?(?:class|enum|mixin|extension)\s+\w+""",
    # Path-based metadata inference.
    layer_parts=["presentation", "domain", "data", "core"],
    feature_regex=r"features/(\w+)/",
    # More specific suffixes are listed before the generic ones they end with
    # (e.g. "_remote_datasource" before "_datasource").
    # NOTE(review): presumably the consumer takes the first match -- confirm.
    file_type_suffixes=[
        ("_remote_datasource", "datasource"),
        ("_datasource", "datasource"),
        ("_repository_impl", "repository"),
        ("_repository", "repository"),
        ("_notifier", "notifier"),
        ("_provider", "provider"),
        ("_interceptor", "interceptor"),
        ("_screen", "screen"),
        ("_widget", "widget"),
        ("_model", "model"),
        ("_entity", "entity"),
    ],
    directory_type_patterns={"/widgets/": "widget"},
    # Import graph: URIs with these prefixes are treated as external.
    external_import_prefixes=("package:", "dart:"),
    highlight_language="dart",
    # Stack trace / file reference patterns: group 1 is the path, group 2 the
    # line number (optional in the second pattern).
    file_ref_patterns=[
        r"package:[^/]+/((?:features|core|lib)[^\s:)]+\.dart):(\d+)",
        r"\b(lib/[^\s:)]+\.dart)(?::(\d+))?",
    ],
    searchable_extensions=[".dart", ".yaml", ".json", ".md"],
)
|
|
111
|
+
|
|
112
|
+
# Profile for Python projects.
PYTHON_PROFILE = LanguageProfile(
    language="python",
    file_extensions=[".py"],
    # File collection.
    include_patterns=[
        "src/**/*.py",
        "**/*.py",
    ],
    exclude_patterns=[
        "__pycache__/**",
        "*.pyc",
        ".venv/**",
        "venv/**",
        "test/**",
        "tests/**",
    ],
    # Tree-sitter configuration.
    tree_sitter_language="python",
    import_node_types=[
        "import_statement",
        "import_from_statement",
    ],
    export_node_types=[
        "class_definition",
        "function_definition",
    ],
    boundary_node_types=[
        "class_definition",
        "function_definition",
    ],
    # Regex fallbacks. The import pattern has two alternative groups:
    # group 1 for "from X import ...", group 2 for "import X".
    import_pattern=r"""^(?:from\s+([\w.]+)\s+import|import\s+([\w.]+))""",
    export_pattern=r"""^(?:class|def|async\s+def)\s+(\w+)""",
    boundary_pattern=r"""^(?:class|def|async\s+def)\s+\w+""",
    # Path-based metadata inference.
    layer_parts=[
        "api",
        "services",
        "models",
        "core",
        "utils",
    ],
    feature_regex=r"(?:features|modules|apps)/(\w+)/",
    file_type_suffixes=[
        ("_test", "test"),
        ("_service", "service"),
        ("_handler", "handler"),
        ("_model", "model"),
        ("_schema", "schema"),
        ("_router", "router"),
        ("_view", "view"),
        ("_serializer", "serializer"),
    ],
    # No import prefixes are declared as external for Python.
    external_import_prefixes=(),
    highlight_language="python",
    # File references: traceback frames first, then bare "path.py[:line]".
    file_ref_patterns=[
        r'File "([^"]+\.py)", line (\d+)',
        r"\b([\w/]+\.py)(?::(\d+))?",
    ],
    searchable_extensions=[
        ".py",
        ".yaml",
        ".yml",
        ".json",
        ".md",
        ".toml",
    ],
)
|
|
144
|
+
|
|
145
|
+
# Shared profile for JavaScript and TypeScript sources.
# NOTE(review): file_extensions lists .mjs/.cjs but the include, file-ref and
# searchable lists do not -- confirm whether that omission is intentional.
JAVASCRIPT_PROFILE = LanguageProfile(
    language="javascript",
    file_extensions=[".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs"],
    # File collection.
    include_patterns=["src/**/*.{js,jsx,ts,tsx}", "**/*.{js,jsx,ts,tsx}"],
    exclude_patterns=["node_modules/**", "dist/**", "build/**", ".next/**", "*.test.*", "*.spec.*"],
    # Tree-sitter configuration.
    tree_sitter_language="typescript",
    import_node_types=["import_statement"],
    export_node_types=["export_statement", "class_declaration", "function_declaration"],
    boundary_node_types=["export_statement", "class_declaration", "function_declaration"],
    # Regex fallbacks. The module specifier (group 1) is recognised after
    # "from" as well as directly after "import"/"require", so the common
    # `import X from 'mod'` and `export ... from 'mod'` forms are captured
    # in addition to `import 'mod'`, `import('mod')` and `require('mod')`.
    import_pattern=r"""(?:from|import|require)\s*\(?['"]([\w@./][^'"]*)['"]\)?""",
    export_pattern=r"""^(?:export\s+)?(?:default\s+)?(?:class|function|const|let|var)\s+(\w+)""",
    boundary_pattern=r"""^(?:export\s+)?(?:default\s+)?(?:class|function|const|let|var)\s+\w+""",
    # Path-based metadata inference.
    layer_parts=["components", "pages", "hooks", "services", "utils", "api", "lib"],
    feature_regex=r"(?:features|modules)/(\w+)/",
    # Dotted suffixes, e.g. "user.service.ts" -> "service".
    file_type_suffixes=[
        (".test", "test"),
        (".spec", "test"),
        (".hook", "hook"),
        (".service", "service"),
        (".controller", "controller"),
        (".middleware", "middleware"),
        (".component", "component"),
        (".page", "page"),
        (".route", "route"),
    ],
    # No import prefixes are declared as external for JavaScript/TypeScript.
    external_import_prefixes=(),
    highlight_language="typescript",
    # File references: "src/...ext[:line]".
    file_ref_patterns=[
        r"\b(src/[^\s:)]+\.(?:ts|tsx|js|jsx))(?::(\d+))?",
    ],
    searchable_extensions=[".js", ".jsx", ".ts", ".tsx", ".json", ".md"],
)
|
|
177
|
+
|
|
178
|
+
# Profile for Go projects.
GO_PROFILE = LanguageProfile(
    language="go",
    file_extensions=[".go"],
    # File collection: vendored code and *_test.go files are excluded.
    include_patterns=["**/*.go"],
    exclude_patterns=["vendor/**", "*_test.go"],
    # Tree-sitter configuration.
    tree_sitter_language="go",
    import_node_types=["import_declaration"],
    export_node_types=["function_declaration", "type_declaration", "method_declaration"],
    boundary_node_types=["function_declaration", "type_declaration", "method_declaration"],
    # Regex fallbacks. The import pattern matches a bare quoted path on its
    # own line (an entry inside an `import ( ... )` group) and also the
    # single-line forms `import "path"` and `import alias "path"`, which the
    # previous pattern missed. Group 1 is the import path. An alias is only
    # accepted after the `import` keyword so lines such as `return "x"` do
    # not match.
    import_pattern=r"""^\s*(?:import\s+(?:[\w.]+\s+)?)?"([^"]+)"[ \t]*$""",
    # The optional "(...)" skips a method receiver before the captured name.
    export_pattern=r"""^(?:func|type)\s+(?:\(.*?\)\s+)?(\w+)""",
    boundary_pattern=r"""^(?:func|type)\s+""",
    # Path-based metadata inference.
    layer_parts=["cmd", "internal", "pkg", "api"],
    feature_regex=r"(?:internal|pkg)/(\w+)/",
    file_type_suffixes=[
        ("_handler", "handler"),
        ("_service", "service"),
        ("_repository", "repository"),
        ("_model", "model"),
        ("_middleware", "middleware"),
    ],
    # No import prefixes are declared as external for Go.
    external_import_prefixes=(),
    highlight_language="go",
    # File references: "path.go:line" (line number required).
    file_ref_patterns=[
        r"\b([\w/]+\.go):(\d+)",
    ],
    searchable_extensions=[".go", ".yaml", ".yml", ".json", ".md"],
)
|
|
206
|
+
|
|
207
|
+
# Profile for Rust projects.
RUST_PROFILE = LanguageProfile(
    language="rust",
    file_extensions=[".rs"],
    # File collection: the build output directory is excluded.
    include_patterns=[
        "src/**/*.rs",
        "**/*.rs",
    ],
    exclude_patterns=[
        "target/**",
    ],
    # Tree-sitter configuration.
    tree_sitter_language="rust",
    import_node_types=[
        "use_declaration",
    ],
    export_node_types=[
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
    ],
    boundary_node_types=[
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
    ],
    # Regex fallbacks.
    import_pattern=r"""^use\s+([\w:]+)""",
    export_pattern=r"""^(?:pub\s+)?(?:fn|struct|enum|impl|trait)\s+(\w+)""",
    boundary_pattern=r"""^(?:pub\s+)?(?:fn|struct|enum|impl|trait)\s+\w+""",
    # Path-based metadata inference; no feature regex is defined for Rust.
    layer_parts=[
        "api",
        "domain",
        "infrastructure",
        "lib",
    ],
    feature_regex=None,
    file_type_suffixes=[
        ("_test", "test"),
        ("_handler", "handler"),
        ("_service", "service"),
    ],
    # No import prefixes are declared as external for Rust.
    external_import_prefixes=(),
    highlight_language="rust",
    # File references: "path.rs:line" (line number required).
    file_ref_patterns=[
        r"\b([\w/]+\.rs):(\d+)",
    ],
    searchable_extensions=[
        ".rs",
        ".toml",
        ".yaml",
        ".md",
    ],
)
|
|
233
|
+
|
|
234
|
+
# Profile for Java projects.
JAVA_PROFILE = LanguageProfile(
    language="java",
    file_extensions=[".java"],
    # File collection: build outputs and *Test(s).java files are excluded.
    include_patterns=["src/**/*.java", "**/*.java"],
    exclude_patterns=["build/**", "target/**", "*Test.java", "*Tests.java"],
    # Tree-sitter configuration.
    tree_sitter_language="java",
    import_node_types=["import_declaration"],
    export_node_types=["class_declaration", "interface_declaration", "enum_declaration"],
    boundary_node_types=["class_declaration", "interface_declaration", "enum_declaration"],
    # Regex fallbacks.
    # Import: also accepts `import static ...;` and on-demand imports
    # (`import a.b.*;`), both of which the previous pattern skipped. Group 1
    # is the dotted name; for an on-demand import it is the part before the
    # trailing ".*".
    import_pattern=r"""^import\s+(?:static\s+)?([\w.]+?)(?:\.\*)?\s*;""",
    # Export / boundary: an optional `abstract` or `final` modifier may sit
    # between the visibility keyword and the type keyword.
    export_pattern=r"""^(?:public\s+)?(?:(?:abstract|final)\s+)?(?:class|interface|enum)\s+(\w+)""",
    boundary_pattern=r"""^(?:public\s+)?(?:(?:abstract|final)\s+)?(?:class|interface|enum)\s+\w+""",
    # Path-based metadata inference; no feature regex is defined for Java.
    layer_parts=["controller", "service", "repository", "model", "dto", "config"],
    feature_regex=None,
    # CamelCase class-name suffixes, e.g. "UserService" -> "service".
    file_type_suffixes=[
        ("Controller", "controller"),
        ("Service", "service"),
        ("Repository", "repository"),
        ("Dto", "dto"),
        ("Entity", "entity"),
        ("Config", "config"),
    ],
    # Import graph: names with these prefixes are treated as external.
    external_import_prefixes=("java.", "javax.", "jakarta."),
    highlight_language="java",
    # File references: "path.java:line" (line number required).
    file_ref_patterns=[
        r"\b([\w/]+\.java):(\d+)",
    ],
    searchable_extensions=[".java", ".xml", ".yaml", ".yml", ".json", ".md"],
)
|
|
263
|
+
|
|
264
|
+
# Profile for C projects.
C_PROFILE = LanguageProfile(
    language="c",
    file_extensions=[".c", ".h"],
    # File collection.
    include_patterns=[
        "src/**/*.c",
        "src/**/*.h",
        "include/**/*.h",
        "**/*.c",
        "**/*.h",
    ],
    exclude_patterns=[
        "build/**",
        "cmake-build-*/**",
        "third_party/**",
        "vendor/**",
    ],
    # Tree-sitter configuration.
    tree_sitter_language="c",
    import_node_types=[
        "preproc_include",
    ],
    export_node_types=[
        "function_definition",
        "struct_specifier",
        "enum_specifier",
        "type_definition",
    ],
    boundary_node_types=[
        "function_definition",
        "struct_specifier",
        "enum_specifier",
        "type_definition",
    ],
    # Regex fallbacks. The export pattern captures a function name followed
    # by "("; the boundary pattern additionally matches struct/enum/union
    # declarations, optionally preceded by "typedef".
    import_pattern=r"""^#include\s+[<"]([\w/.]+)[>"]""",
    export_pattern=r"""^(?:static\s+)?(?:inline\s+)?(?:extern\s+)?(?:\w+[\s*]+)+(\w+)\s*\(""",
    boundary_pattern=r"""^(?:static\s+)?(?:inline\s+)?(?:extern\s+)?(?:\w+[\s*]+)+\w+\s*\(|^(?:typedef\s+)?(?:struct|enum|union)\s+\w+""",
    # Path-based metadata inference.
    layer_parts=[
        "src",
        "include",
        "lib",
        "drivers",
        "core",
        "hal",
    ],
    feature_regex=r"(?:src|modules)/(\w+)/",
    file_type_suffixes=[
        ("_test", "test"),
        ("_hal", "hal"),
        ("_driver", "driver"),
        ("_util", "util"),
    ],
    # No import prefixes are declared as external for C.
    external_import_prefixes=(),
    highlight_language="c",
    # File references: "path.c:line" or "path.h:line".
    file_ref_patterns=[
        r"\b([\w/]+\.[ch]):(\d+)",
    ],
    searchable_extensions=[
        ".c",
        ".h",
        ".md",
        ".txt",
        ".cmake",
    ],
)
|
|
291
|
+
|
|
292
|
+
# Profile for C++ projects. ".h" appears here as well as in the C profile.
CPP_PROFILE = LanguageProfile(
    language="cpp",
    file_extensions=[
        ".cpp",
        ".cc",
        ".cxx",
        ".hpp",
        ".hxx",
        ".hh",
        ".h",
    ],
    # File collection.
    include_patterns=[
        # Sources under src/
        "src/**/*.cpp",
        "src/**/*.cc",
        "src/**/*.cxx",
        # Headers under src/
        "src/**/*.hpp",
        "src/**/*.hxx",
        "src/**/*.hh",
        # Public headers under include/
        "include/**/*.hpp",
        "include/**/*.hxx",
        "include/**/*.hh",
        "include/**/*.h",
        # Anywhere else in the tree
        "**/*.cpp",
        "**/*.cc",
        "**/*.hpp",
    ],
    exclude_patterns=[
        "build/**",
        "cmake-build-*/**",
        "third_party/**",
        "vendor/**",
    ],
    # Tree-sitter configuration.
    tree_sitter_language="cpp",
    import_node_types=[
        "preproc_include",
    ],
    export_node_types=[
        "function_definition",
        "class_specifier",
        "struct_specifier",
        "enum_specifier",
        "namespace_definition",
        "template_declaration",
    ],
    boundary_node_types=[
        "function_definition",
        "class_specifier",
        "struct_specifier",
        "enum_specifier",
        "namespace_definition",
        "template_declaration",
    ],
    # Regex fallbacks. The export pattern has two alternative groups:
    # group 1 for type/namespace names, group 2 for function names.
    import_pattern=r"""^#include\s+[<"]([\w/.]+)[>"]""",
    export_pattern=r"""^(?:template\s*<[^>]*>\s*)?(?:class|struct|enum(?:\s+class)?|namespace)\s+(\w+)|^(?:[\w:*&<>\s]+)\s+(\w+)\s*\(""",
    boundary_pattern=r"""^(?:template\s*<[^>]*>\s*)?(?:class|struct|enum|namespace)\s+\w+|^(?:[\w:*&<>\s]+)\s+\w+\s*\(""",
    # Path-based metadata inference.
    layer_parts=[
        "src",
        "include",
        "lib",
        "core",
        "engine",
        "modules",
    ],
    feature_regex=r"(?:src|modules)/(\w+)/",
    file_type_suffixes=[
        ("_test", "test"),
        ("_impl", "implementation"),
        ("_factory", "factory"),
        ("_manager", "manager"),
        ("_handler", "handler"),
        ("_util", "util"),
    ],
    # No import prefixes are declared as external for C++.
    external_import_prefixes=(),
    highlight_language="cpp",
    # File references: "path.<ext>:line" for any C++ source/header extension.
    file_ref_patterns=[
        r"\b([\w/]+\.(?:cpp|cc|cxx|hpp|hxx|hh|h)):(\d+)",
    ],
    searchable_extensions=[
        ".cpp",
        ".cc",
        ".cxx",
        ".hpp",
        ".hxx",
        ".hh",
        ".h",
        ".cmake",
        ".md",
    ],
)
|
|
332
|
+
|
|
333
|
+
# ---------------------------------------------------------------------------
|
|
334
|
+
# Profile registry
|
|
335
|
+
# ---------------------------------------------------------------------------
|
|
336
|
+
|
|
337
|
+
# Built-in profiles keyed by language name. "javascript" and "typescript"
# intentionally share a single profile object.
LANGUAGE_PROFILES: dict[str, LanguageProfile] = dict(
    dart=DART_PROFILE,
    python=PYTHON_PROFILE,
    javascript=JAVASCRIPT_PROFILE,
    typescript=JAVASCRIPT_PROFILE,
    go=GO_PROFILE,
    rust=RUST_PROFILE,
    java=JAVA_PROFILE,
    c=C_PROFILE,
    cpp=CPP_PROFILE,
)

# Reverse index: file extension -> language name.
# The first profile to claim an extension keeps it, following registry
# order. Consequently ".h" (listed by both the C and C++ profiles) resolves
# to "c", and the shared JS/TS extensions resolve to "javascript".
_EXTENSION_TO_LANGUAGE: dict[str, str] = {}
for _name, _profile in LANGUAGE_PROFILES.items():
    for _ext in _profile.file_extensions:
        if _ext not in _EXTENSION_TO_LANGUAGE:
            _EXTENSION_TO_LANGUAGE[_ext] = _name
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def get_profile_for_extension(ext: str) -> Optional[LanguageProfile]:
    """Return the language profile registered for a file extension.

    Args:
        ext: File extension including the leading dot (e.g. ``".py"``).
            Matching is case-insensitive, consistent with ``detect_language``
            which lower-cases extensions before counting them.

    Returns:
        The matching ``LanguageProfile``, or ``None`` when the extension is
        not claimed by any registered profile.
    """
    # Registered extensions are all lower-case, so normalise the query.
    lang = _EXTENSION_TO_LANGUAGE.get(ext.lower())
    return LANGUAGE_PROFILES.get(lang) if lang else None
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
# ---------------------------------------------------------------------------
|
|
363
|
+
# Auto-detection by scanning the project directory
|
|
364
|
+
# ---------------------------------------------------------------------------
|
|
365
|
+
|
|
366
|
+
# Directories to skip during the file scan
|
|
367
|
+
_SKIP_DIRS: set[str] = {
    ".git", "node_modules", "__pycache__", "build", "dist", "target",
    ".dart_tool", ".next", "venv", ".venv", ".idea", ".vs", "vendor",
    "cmake-build-debug", "cmake-build-release", ".cache", ".gradle",
    "Pods", ".build", "egg-info",
}

# All known code extensions mapped to their language.
# ".h" is deliberately absent: it is shared by C and C++ and is attributed
# after the scan (see _AMBIGUOUS_HEADER_EXT).
_EXT_TO_LANG: dict[str, str] = {
    ".dart": "dart",
    ".py": "python",
    ".js": "javascript", ".jsx": "javascript", ".mjs": "javascript", ".cjs": "javascript",
    ".ts": "typescript", ".tsx": "typescript",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    ".c": "c",
    ".cpp": "cpp", ".cc": "cpp", ".cxx": "cpp",
    ".hpp": "cpp", ".hxx": "cpp", ".hh": "cpp",
}

# Header extension used by both C and C++ projects.
_AMBIGUOUS_HEADER_EXT: str = ".h"

# Max files to scan before stopping (avoid huge repos taking forever).
# Every file visited counts towards this limit, not only recognised code files.
_MAX_SCAN_FILES: int = 5000


def detect_language(codebase_path: Path) -> str:
    """Auto-detect the primary language by scanning files in the project.

    Walks the directory tree (skipping hidden directories, the names in
    ``_SKIP_DIRS`` and ``*.egg-info`` directories), counts recognised source
    file extensions, and returns the language with the most files. The scan
    stops after ``_MAX_SCAN_FILES`` files.

    ``.h`` headers are ambiguous: they are counted separately and credited
    to ``"cpp"`` when at least one C++ file was seen, otherwise to ``"c"``.
    TypeScript counts are merged into ``"javascript"`` because both share
    one profile.

    Args:
        codebase_path: Root directory of the project to scan.

    Returns:
        The winning language name, or ``"generic"`` when no known code files
        were found.
    """
    counts: Counter[str] = Counter()
    header_count = 0
    scanned = 0

    for _root, dirs, files in os.walk(codebase_path):
        # Prune in place so os.walk never descends into unwanted directories.
        # Real egg-info directories are named "<pkg>.egg-info", which the
        # exact-name check against _SKIP_DIRS would not catch.
        dirs[:] = [
            d for d in dirs
            if d not in _SKIP_DIRS
            and not d.startswith(".")
            and not d.endswith(".egg-info")
        ]

        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext == _AMBIGUOUS_HEADER_EXT:
                header_count += 1
            else:
                lang = _EXT_TO_LANG.get(ext)
                if lang:
                    counts[lang] += 1

            scanned += 1
            if scanned >= _MAX_SCAN_FILES:
                break
        if scanned >= _MAX_SCAN_FILES:
            break

    # Attribute the ambiguous headers now that the C++ count is known.
    # Reading a missing Counter key yields 0 without inserting it.
    if header_count:
        counts["cpp" if counts["cpp"] > 0 else "c"] += header_count

    if not counts:
        return "generic"

    # Merge typescript into javascript (same profile)
    if "typescript" in counts:
        counts["javascript"] += counts.pop("typescript")

    winner = counts.most_common(1)[0][0]

    print(f"[detect] Scanned {scanned} files — language breakdown: {dict(counts.most_common())}")

    return winner
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def get_profile(codebase_path: Path, language_override: str | None = None) -> LanguageProfile | None:
    """Get the language profile for a codebase.

    Args:
        codebase_path: Root of the target codebase. Only scanned when no
            usable override is given.
        language_override: If set, use this language instead of
            auto-detecting. Surrounding whitespace and letter case are
            ignored, so ``"Python"`` and ``" go "`` resolve like ``"python"``
            and ``"go"``. An empty or whitespace-only value falls back to
            auto-detection.

    Returns:
        A LanguageProfile, or None if the language is "generic" or otherwise
        has no registered profile.
    """
    # Registry keys are lower-case; normalise user-supplied overrides so a
    # differently-cased or padded name does not silently yield no profile.
    requested = language_override.strip().lower() if language_override else ""
    lang = requested or detect_language(codebase_path)
    return LANGUAGE_PROFILES.get(lang)
|