sourcefire 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,448 @@
1
+ """Language profiles for language-agnostic code analysis.
2
+
3
+ Each profile defines the patterns, AST node types, and regex needed to
4
+ parse imports, exports, and chunk boundaries for a specific language.
5
+ Auto-detection scans the project directory to count file extensions and
6
+ picks the dominant language.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import re
13
+ from collections import Counter
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+ from typing import Any, Optional
17
+
18
+
19
@dataclass
class LanguageProfile:
    """Per-language settings consumed by the language-agnostic analysis code.

    ``language`` and ``file_extensions`` are the only required fields; every
    other field falls back to an empty container, ``None``, or a neutral
    string so a profile only needs to spell out what it actually uses.
    Mutable defaults are created per instance via ``default_factory``.
    """

    # Name of the language and the file extensions that belong to it.
    language: str
    file_extensions: list[str]

    # Which files to collect and which to leave out.
    include_patterns: list[str] = field(default_factory=list)
    exclude_patterns: list[str] = field(default_factory=list)

    # Optional tree-sitter setup: grammar name plus the node types that
    # represent imports, exports, and chunk boundaries.
    tree_sitter_language: Optional[str] = None
    import_node_types: list[str] = field(default_factory=list)
    export_node_types: list[str] = field(default_factory=list)
    boundary_node_types: list[str] = field(default_factory=list)
    string_literal_type: str = "string_literal"

    # Regular expressions used as a fallback for the same three concerns.
    import_pattern: Optional[str] = None
    export_pattern: Optional[str] = None
    boundary_pattern: Optional[str] = None

    # Optional hints for inferring metadata (layer, feature, file type)
    # from a file's path.
    layer_parts: list[str] = field(default_factory=list)
    feature_regex: Optional[str] = None
    file_type_suffixes: list[tuple[str, str]] = field(default_factory=list)
    directory_type_patterns: dict[str, str] = field(default_factory=dict)

    # Import-graph setting: prefixes that mark an import as external.
    external_import_prefixes: tuple[str, ...] = ()

    # Language tag to use when rendering highlighted code blocks.
    highlight_language: str = "text"

    # Regexes that recognise file references (e.g. in stack traces).
    file_ref_patterns: list[str] = field(default_factory=list)

    # Extensions the keyword search tool should look at.
    searchable_extensions: list[str] = field(default_factory=list)
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Built-in language profiles
63
+ # ---------------------------------------------------------------------------
64
+
65
# Profile for Dart sources.
DART_PROFILE = LanguageProfile(
    language="dart",
    file_extensions=[".dart"],
    highlight_language="dart",
    searchable_extensions=[".dart", ".yaml", ".json", ".md"],
    # File collection: sources under lib/ plus two project config files;
    # *.g.dart / *.freezed.dart files and the test tree are left out.
    include_patterns=["lib/**/*.dart", "pubspec.yaml", "analysis_options.yaml"],
    exclude_patterns=["*.g.dart", "*.freezed.dart", "test/**"],
    # Tree-sitter: the same declaration kinds serve as exports and as
    # chunk boundaries.
    tree_sitter_language="dart",
    import_node_types=["import_specification"],
    export_node_types=[
        "class_definition",
        "enum_declaration",
        "mixin_declaration",
        "extension_declaration",
    ],
    boundary_node_types=[
        "class_definition",
        "enum_declaration",
        "mixin_declaration",
        "extension_declaration",
    ],
    # Regex fallback.  Note the import pattern only recognises
    # single-quoted URIs.
    import_pattern=r"import\s+'([^']+)'",
    export_pattern=r"^(?:abstract\s+)?(?:class|enum|mixin|extension)\s+(\w+)",
    boundary_pattern=r"^(?:abstract\s+)?(?:class|enum|mixin|extension)\s+\w+",
    # Path-based metadata inference.
    layer_parts=["presentation", "domain", "data", "core"],
    feature_regex=r"features/(\w+)/",
    # More specific suffixes are listed before the generic ones they end with
    # (e.g. "_remote_datasource" before "_datasource").
    file_type_suffixes=[
        ("_remote_datasource", "datasource"),
        ("_datasource", "datasource"),
        ("_repository_impl", "repository"),
        ("_repository", "repository"),
        ("_notifier", "notifier"),
        ("_provider", "provider"),
        ("_interceptor", "interceptor"),
        ("_screen", "screen"),
        ("_widget", "widget"),
        ("_model", "model"),
        ("_entity", "entity"),
    ],
    directory_type_patterns={"/widgets/": "widget"},
    # Import graph: prefixes treated as external.
    external_import_prefixes=("package:", "dart:"),
    # File references: "package:<name>/<path>.dart:<line>" and bare
    # "lib/<path>.dart[:<line>]".
    file_ref_patterns=[
        r"package:[^/]+/((?:features|core|lib)[^\s:)]+\.dart):(\d+)",
        r"\b(lib/[^\s:)]+\.dart)(?::(\d+))?",
    ],
)
111
+
112
# Profile for Python sources.
PYTHON_PROFILE = LanguageProfile(
    language="python",
    file_extensions=[".py"],
    highlight_language="python",
    searchable_extensions=[".py", ".yaml", ".yml", ".json", ".md", ".toml"],
    # File collection: every .py file, minus bytecode caches, virtualenvs
    # and test directories.
    include_patterns=["src/**/*.py", "**/*.py"],
    exclude_patterns=[
        "__pycache__/**",
        "*.pyc",
        ".venv/**",
        "venv/**",
        "test/**",
        "tests/**",
    ],
    # Tree-sitter: classes and functions act as both exports and boundaries.
    tree_sitter_language="python",
    import_node_types=["import_statement", "import_from_statement"],
    export_node_types=["class_definition", "function_definition"],
    boundary_node_types=["class_definition", "function_definition"],
    # Regex fallback.  The import pattern has two groups: group 1 is set for
    # "from X import ...", group 2 for "import X".
    import_pattern=r"^(?:from\s+([\w.]+)\s+import|import\s+([\w.]+))",
    export_pattern=r"^(?:class|def|async\s+def)\s+(\w+)",
    boundary_pattern=r"^(?:class|def|async\s+def)\s+\w+",
    # Path-based metadata inference.
    layer_parts=["api", "services", "models", "core", "utils"],
    feature_regex=r"(?:features|modules|apps)/(\w+)/",
    file_type_suffixes=[
        ("_test", "test"),
        ("_service", "service"),
        ("_handler", "handler"),
        ("_model", "model"),
        ("_schema", "schema"),
        ("_router", "router"),
        ("_view", "view"),
        ("_serializer", "serializer"),
    ],
    # No import prefixes are declared external for this profile.
    external_import_prefixes=(),
    # File references: traceback style ('File "x.py", line N') and bare
    # "path/x.py[:N]".
    file_ref_patterns=[
        r'File "([^"]+\.py)", line (\d+)',
        r"\b([\w/]+\.py)(?::(\d+))?",
    ],
)
144
+
145
# Profile shared by JavaScript and TypeScript sources.
JAVASCRIPT_PROFILE = LanguageProfile(
    language="javascript",
    file_extensions=[".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs"],
    include_patterns=["src/**/*.{js,jsx,ts,tsx}", "**/*.{js,jsx,ts,tsx}"],
    exclude_patterns=["node_modules/**", "dist/**", "build/**", ".next/**", "*.test.*", "*.spec.*"],
    tree_sitter_language="typescript",
    import_node_types=["import_statement"],
    export_node_types=["export_statement", "class_declaration", "function_declaration"],
    boundary_node_types=["export_statement", "class_declaration", "function_declaration"],
    # Regex fallback for module specifiers.  The quoted specifier must follow
    # one of three keywords (optionally with "(" in between):
    #   import 'x'  /  import('x')      -> side-effect and dynamic imports
    #   require('x')                    -> CommonJS
    #   ... from 'x'                    -> `import a from 'x'`, `export * from 'x'`
    # Without the `from` alternative the most common ES-module form
    # (`import React from 'react'`) would never match, because the binding
    # list sits between `import` and the quote.
    import_pattern=r"""(?:import|require|from)\s*\(?['"]([\w@./][^'"]*)['"]\)?""",
    export_pattern=r"""^(?:export\s+)?(?:default\s+)?(?:class|function|const|let|var)\s+(\w+)""",
    boundary_pattern=r"""^(?:export\s+)?(?:default\s+)?(?:class|function|const|let|var)\s+\w+""",
    # Path-based metadata inference.
    layer_parts=["components", "pages", "hooks", "services", "utils", "api", "lib"],
    feature_regex=r"(?:features|modules)/(\w+)/",
    file_type_suffixes=[
        (".test", "test"),
        (".spec", "test"),
        (".hook", "hook"),
        (".service", "service"),
        (".controller", "controller"),
        (".middleware", "middleware"),
        (".component", "component"),
        (".page", "page"),
        (".route", "route"),
    ],
    # No import prefixes are declared external for this profile.
    external_import_prefixes=(),
    highlight_language="typescript",
    # File references of the form "src/<path>.<ext>[:<line>]".
    file_ref_patterns=[
        r"\b(src/[^\s:)]+\.(?:ts|tsx|js|jsx))(?::(\d+))?",
    ],
    searchable_extensions=[".js", ".jsx", ".ts", ".tsx", ".json", ".md"],
)
177
+
178
# Profile for Go sources.
GO_PROFILE = LanguageProfile(
    language="go",
    file_extensions=[".go"],
    include_patterns=["**/*.go"],
    exclude_patterns=["vendor/**", "*_test.go"],
    tree_sitter_language="go",
    import_node_types=["import_declaration"],
    export_node_types=["function_declaration", "type_declaration", "method_declaration"],
    boundary_node_types=["function_declaration", "type_declaration", "method_declaration"],
    # Regex fallback for import paths.  Matches a line that is either
    #   "pkg/path"                 -> entry inside an `import ( ... )` group
    #   import "pkg/path"          -> single-line import
    #   import alias "pkg/path"    -> single-line import with alias / _ / .
    # An alias is only accepted after the `import` keyword; allowing it on a
    # bare line would also match statements such as `return "text"`.
    import_pattern=r"""^\s*(?:import\s+(?:[\w.]+\s+)?)?"([^"]+)"$""",
    # Group 1 is the declared name; an optional method receiver "(...)" is
    # skipped first.
    export_pattern=r"""^(?:func|type)\s+(?:\(.*?\)\s+)?(\w+)""",
    boundary_pattern=r"""^(?:func|type)\s+""",
    # Path-based metadata inference.
    layer_parts=["cmd", "internal", "pkg", "api"],
    feature_regex=r"(?:internal|pkg)/(\w+)/",
    file_type_suffixes=[
        ("_handler", "handler"),
        ("_service", "service"),
        ("_repository", "repository"),
        ("_model", "model"),
        ("_middleware", "middleware"),
    ],
    # No import prefixes are declared external for this profile.
    external_import_prefixes=(),
    highlight_language="go",
    # File references of the form "<path>.go:<line>".
    file_ref_patterns=[
        r"\b([\w/]+\.go):(\d+)",
    ],
    searchable_extensions=[".go", ".yaml", ".yml", ".json", ".md"],
)
206
+
207
# Profile for Rust sources.
RUST_PROFILE = LanguageProfile(
    language="rust",
    file_extensions=[".rs"],
    highlight_language="rust",
    searchable_extensions=[".rs", ".toml", ".yaml", ".md"],
    # File collection: every .rs file outside the target/ directory.
    include_patterns=["src/**/*.rs", "**/*.rs"],
    exclude_patterns=["target/**"],
    # Tree-sitter: items act as both exports and chunk boundaries.
    tree_sitter_language="rust",
    import_node_types=["use_declaration"],
    export_node_types=[
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
    ],
    boundary_node_types=[
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
    ],
    # Regex fallback.
    import_pattern=r"^use\s+([\w:]+)",
    export_pattern=r"^(?:pub\s+)?(?:fn|struct|enum|impl|trait)\s+(\w+)",
    boundary_pattern=r"^(?:pub\s+)?(?:fn|struct|enum|impl|trait)\s+\w+",
    # Path-based metadata inference; no feature regex for this profile.
    layer_parts=["api", "domain", "infrastructure", "lib"],
    feature_regex=None,
    file_type_suffixes=[
        ("_test", "test"),
        ("_handler", "handler"),
        ("_service", "service"),
    ],
    # No import prefixes are declared external for this profile.
    external_import_prefixes=(),
    # File references of the form "<path>.rs:<line>".
    file_ref_patterns=[
        r"\b([\w/]+\.rs):(\d+)",
    ],
)
233
+
234
# Profile for Java sources.
JAVA_PROFILE = LanguageProfile(
    language="java",
    file_extensions=[".java"],
    highlight_language="java",
    searchable_extensions=[".java", ".xml", ".yaml", ".yml", ".json", ".md"],
    # File collection: every .java file, minus build output directories and
    # files named *Test.java / *Tests.java.
    include_patterns=["src/**/*.java", "**/*.java"],
    exclude_patterns=["build/**", "target/**", "*Test.java", "*Tests.java"],
    # Tree-sitter: type declarations act as both exports and boundaries.
    tree_sitter_language="java",
    import_node_types=["import_declaration"],
    export_node_types=[
        "class_declaration",
        "interface_declaration",
        "enum_declaration",
    ],
    boundary_node_types=[
        "class_declaration",
        "interface_declaration",
        "enum_declaration",
    ],
    # Regex fallback.
    import_pattern=r"^import\s+([\w.]+);",
    export_pattern=r"^(?:public\s+)?(?:class|interface|enum)\s+(\w+)",
    boundary_pattern=r"^(?:public\s+)?(?:class|interface|enum)\s+\w+",
    # Path-based metadata inference; no feature regex for this profile.
    # Suffixes are CamelCase here, unlike the snake_case used by other profiles.
    layer_parts=["controller", "service", "repository", "model", "dto", "config"],
    feature_regex=None,
    file_type_suffixes=[
        ("Controller", "controller"),
        ("Service", "service"),
        ("Repository", "repository"),
        ("Dto", "dto"),
        ("Entity", "entity"),
        ("Config", "config"),
    ],
    # Import graph: prefixes treated as external.
    external_import_prefixes=("java.", "javax.", "jakarta."),
    # File references of the form "<path>.java:<line>".
    file_ref_patterns=[
        r"\b([\w/]+\.java):(\d+)",
    ],
)
263
+
264
# Profile for C sources and headers.
C_PROFILE = LanguageProfile(
    language="c",
    file_extensions=[".c", ".h"],
    highlight_language="c",
    searchable_extensions=[".c", ".h", ".md", ".txt", ".cmake"],
    # File collection: every .c / .h file, minus build output and
    # third-party directories.
    include_patterns=[
        "src/**/*.c",
        "src/**/*.h",
        "include/**/*.h",
        "**/*.c",
        "**/*.h",
    ],
    exclude_patterns=["build/**", "cmake-build-*/**", "third_party/**", "vendor/**"],
    # Tree-sitter: definitions act as both exports and chunk boundaries.
    tree_sitter_language="c",
    import_node_types=["preproc_include"],
    export_node_types=[
        "function_definition",
        "struct_specifier",
        "enum_specifier",
        "type_definition",
    ],
    boundary_node_types=[
        "function_definition",
        "struct_specifier",
        "enum_specifier",
        "type_definition",
    ],
    # Regex fallback.  The include pattern accepts both <...> and "..." forms.
    import_pattern=r'^#include\s+[<"]([\w/.]+)[>"]',
    export_pattern=r"^(?:static\s+)?(?:inline\s+)?(?:extern\s+)?(?:\w+[\s*]+)+(\w+)\s*\(",
    # Two alternatives: a function-like signature, or a struct/enum/union
    # declaration (optionally behind typedef).
    boundary_pattern=(
        r"^(?:static\s+)?(?:inline\s+)?(?:extern\s+)?(?:\w+[\s*]+)+\w+\s*\("
        r"|^(?:typedef\s+)?(?:struct|enum|union)\s+\w+"
    ),
    # Path-based metadata inference.
    layer_parts=["src", "include", "lib", "drivers", "core", "hal"],
    feature_regex=r"(?:src|modules)/(\w+)/",
    file_type_suffixes=[
        ("_test", "test"),
        ("_hal", "hal"),
        ("_driver", "driver"),
        ("_util", "util"),
    ],
    # No import prefixes are declared external for this profile.
    external_import_prefixes=(),
    # File references of the form "<path>.c:<line>" or "<path>.h:<line>".
    file_ref_patterns=[
        r"\b([\w/]+\.[ch]):(\d+)",
    ],
)
291
+
292
# Profile for C++ sources and headers.
CPP_PROFILE = LanguageProfile(
    language="cpp",
    file_extensions=[".cpp", ".cc", ".cxx", ".hpp", ".hxx", ".hh", ".h"],
    highlight_language="cpp",
    searchable_extensions=[
        ".cpp", ".cc", ".cxx", ".hpp", ".hxx", ".hh", ".h", ".cmake", ".md",
    ],
    # File collection.  Plain ".h" headers are only picked up under include/;
    # the catch-all globs cover .cpp, .cc and .hpp.
    include_patterns=[
        "src/**/*.cpp",
        "src/**/*.cc",
        "src/**/*.cxx",
        "src/**/*.hpp",
        "src/**/*.hxx",
        "src/**/*.hh",
        "include/**/*.hpp",
        "include/**/*.hxx",
        "include/**/*.hh",
        "include/**/*.h",
        "**/*.cpp",
        "**/*.cc",
        "**/*.hpp",
    ],
    exclude_patterns=["build/**", "cmake-build-*/**", "third_party/**", "vendor/**"],
    # Tree-sitter: definitions act as both exports and chunk boundaries.
    tree_sitter_language="cpp",
    import_node_types=["preproc_include"],
    export_node_types=[
        "function_definition",
        "class_specifier",
        "struct_specifier",
        "enum_specifier",
        "namespace_definition",
        "template_declaration",
    ],
    boundary_node_types=[
        "function_definition",
        "class_specifier",
        "struct_specifier",
        "enum_specifier",
        "namespace_definition",
        "template_declaration",
    ],
    # Regex fallback.  The include pattern accepts both <...> and "..." forms.
    import_pattern=r'^#include\s+[<"]([\w/.]+)[>"]',
    # Two alternatives, each with its own group: group 1 names a
    # class/struct/enum/namespace, group 2 names a function-like signature.
    export_pattern=(
        r"^(?:template\s*<[^>]*>\s*)?(?:class|struct|enum(?:\s+class)?|namespace)\s+(\w+)"
        r"|^(?:[\w:*&<>\s]+)\s+(\w+)\s*\("
    ),
    boundary_pattern=(
        r"^(?:template\s*<[^>]*>\s*)?(?:class|struct|enum|namespace)\s+\w+"
        r"|^(?:[\w:*&<>\s]+)\s+\w+\s*\("
    ),
    # Path-based metadata inference.
    layer_parts=["src", "include", "lib", "core", "engine", "modules"],
    feature_regex=r"(?:src|modules)/(\w+)/",
    file_type_suffixes=[
        ("_test", "test"),
        ("_impl", "implementation"),
        ("_factory", "factory"),
        ("_manager", "manager"),
        ("_handler", "handler"),
        ("_util", "util"),
    ],
    # No import prefixes are declared external for this profile.
    external_import_prefixes=(),
    # File references of the form "<path>.<ext>:<line>".
    file_ref_patterns=[
        r"\b([\w/]+\.(?:cpp|cc|cxx|hpp|hxx|hh|h)):(\d+)",
    ],
)
332
+
333
+ # ---------------------------------------------------------------------------
334
+ # Profile registry
335
+ # ---------------------------------------------------------------------------
336
+
337
# Registry of built-in profiles keyed by language name.  "javascript" and
# "typescript" intentionally point at the same profile object.
LANGUAGE_PROFILES: dict[str, LanguageProfile] = {
    "dart": DART_PROFILE,
    "python": PYTHON_PROFILE,
    "javascript": JAVASCRIPT_PROFILE,
    "typescript": JAVASCRIPT_PROFILE,
    "go": GO_PROFILE,
    "rust": RUST_PROFILE,
    "java": JAVA_PROFILE,
    "c": C_PROFILE,
    "cpp": CPP_PROFILE,
}

# Reverse index: file extension -> language name.  When several registry
# entries claim the same extension, the entry registered first keeps it
# (so ".h" resolves to "c" and ".ts" to "javascript").
_EXTENSION_TO_LANGUAGE: dict[str, str] = {}
for _name, _profile in LANGUAGE_PROFILES.items():
    for _ext in _profile.file_extensions:
        if _ext not in _EXTENSION_TO_LANGUAGE:
            _EXTENSION_TO_LANGUAGE[_ext] = _name
354
+
355
+
356
def get_profile_for_extension(ext: str) -> Optional[LanguageProfile]:
    """Return the language profile registered for a file extension, or None.

    Args:
        ext: File extension including the leading dot, e.g. ``".py"``.
            Matching is case-insensitive, so ``".PY"`` and ``".py"`` resolve
            to the same profile (consistent with ``detect_language``, which
            lower-cases extensions before looking them up).

    Returns:
        The matching ``LanguageProfile``, or ``None`` when the extension is
        not claimed by any registered profile.
    """
    # Registered extensions are all lower-case; fold the query to match.
    # A falsy value (empty string / None) is passed through untouched and
    # simply misses the lookup.
    key = ext.lower() if ext else ext
    lang = _EXTENSION_TO_LANGUAGE.get(key)
    return LANGUAGE_PROFILES.get(lang) if lang else None
360
+
361
+
362
+ # ---------------------------------------------------------------------------
363
+ # Auto-detection by scanning the project directory
364
+ # ---------------------------------------------------------------------------
365
+
366
# Directory names that are never descended into during the file scan.
# Hidden directories (names starting with ".") and "*.egg-info" directories
# are pruned separately inside detect_language().
_SKIP_DIRS: set[str] = {
    ".git", "node_modules", "__pycache__", "build", "dist", "target",
    ".dart_tool", ".next", "venv", ".venv", ".idea", ".vs", "vendor",
    "cmake-build-debug", "cmake-build-release", ".cache", ".gradle",
    "Pods", ".build", "egg-info",
}

# All unambiguous code extensions mapped to their language.  ".h" is left out
# on purpose: it may belong to C or C++ and is resolved in detect_language().
_EXT_TO_LANG: dict[str, str] = {
    ".dart": "dart",
    ".py": "python",
    ".js": "javascript", ".jsx": "javascript", ".mjs": "javascript", ".cjs": "javascript",
    ".ts": "typescript", ".tsx": "typescript",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    ".c": "c",
    ".cpp": "cpp", ".cc": "cpp", ".cxx": "cpp",
    ".hpp": "cpp", ".hxx": "cpp", ".hh": "cpp",
}

# Max files to scan before stopping (avoid huge repos taking forever)
_MAX_SCAN_FILES: int = 5000


def detect_language(codebase_path: Path) -> str:
    """Auto-detect the primary language by scanning files in the project.

    Walks the directory tree (skipping common non-source dirs), counts code
    file extensions, and returns the language with the most source files.
    The scan stops after ``_MAX_SCAN_FILES`` files of any kind.

    ``.h`` headers are ambiguous between C and C++.  They are tallied
    separately and, once the scan is done, credited to "cpp" when more C++
    files than C files were seen, otherwise to "c".

    TypeScript counts are merged into "javascript" because both languages
    share one profile.

    Args:
        codebase_path: Root directory of the project to scan.

    Returns:
        The winning language name, or "generic" if no known code files
        (including headers) are found.
    """
    counts: Counter[str] = Counter()
    header_files = 0
    scanned = 0

    for _root, dirs, files in os.walk(codebase_path):
        # Prune in place so os.walk does not descend into unwanted trees.
        # "egg-info" directories are really named "<pkg>.egg-info", so they
        # need a suffix check rather than set membership.
        dirs[:] = [
            d for d in dirs
            if d not in _SKIP_DIRS
            and not d.startswith(".")
            and not d.endswith(".egg-info")
        ]

        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext == ".h":
                header_files += 1
            else:
                lang = _EXT_TO_LANG.get(ext)
                if lang:
                    counts[lang] += 1

            scanned += 1
            if scanned >= _MAX_SCAN_FILES:
                break
        if scanned >= _MAX_SCAN_FILES:
            break

    # Resolve ambiguous headers now that the C / C++ source counts are known.
    # Reading a missing Counter key yields 0 without inserting it.
    if header_files:
        header_lang = "cpp" if counts["cpp"] > counts["c"] else "c"
        counts[header_lang] += header_files

    if not counts:
        return "generic"

    # Merge typescript into javascript (same profile)
    if "typescript" in counts:
        counts["javascript"] += counts.pop("typescript")

    winner = counts.most_common(1)[0][0]

    print(f"[detect] Scanned {scanned} files — language breakdown: {dict(counts.most_common())}")

    return winner
435
+
436
+
437
def get_profile(codebase_path: Path, language_override: str | None = None) -> LanguageProfile | None:
    """Get the language profile for a codebase.

    Args:
        codebase_path: Root of the target codebase.  Only used when no
            override is given.
        language_override: If set, use this language instead of
            auto-detecting.  Surrounding whitespace and letter case are
            ignored, so "Python" and " go " select the same profiles as
            "python" and "go".  An empty or whitespace-only value is treated
            as "not set".

    Returns:
        A LanguageProfile, or None if the resolved language has no registered
        profile (for example the "generic" fallback of auto-detection, or an
        unknown override).
    """
    # Registry keys are lower-case; normalise the override so a differently
    # cased value does not silently fall through to "no profile".
    override = language_override.strip().lower() if language_override else ""
    lang = override or detect_language(codebase_path)
    return LANGUAGE_PROFILES.get(lang)