codebase-mcp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,462 @@
1
+ """
2
+ Lightweight parsers for non-code files:
3
+ - Markdown (.md) → headings as symbols
4
+ - JSON (.json) → top-level keys as symbols
5
+ - TOML (.toml) → sections + keys as symbols
6
+ - YAML (.yaml, .yml) → top-level keys as symbols
7
+ - Makefile → targets as symbols
8
+ - Dockerfile → stages and major instructions as symbols
9
+ - .env → key names ONLY (never values) as symbols
10
+ - .proto (Protocol Buffers) → messages, enums, services as symbols
11
+ - .graphql / .gql → types, queries, mutations as symbols
12
+ - .sql → CREATE TABLE/FUNCTION/PROCEDURE/VIEW as symbols
13
+
14
+ These use stdlib + regex, NOT tree-sitter, so they work without any extra packages.
15
+ """
16
+
17
+ from __future__ import annotations
18
+ import json
19
+ import re
20
+ from ..models import ParseResult, ParsedSymbol, ParsedImport
21
+ from .base import BaseParser, register
22
+
23
+
24
+ # ─── Markdown ─────────────────────────────────────────────────────────────────
25
+
26
+ @register("markdown")
27
+ class MarkdownParser(BaseParser):
28
+ language_name = "markdown"
29
+
30
+ def parse(self, path: str, content: str) -> ParseResult:
31
+ result = ParseResult(language="markdown", line_count=content.count("\n") + 1)
32
+ lines = content.splitlines()
33
+ for i, line in enumerate(lines, 1):
34
+ m = re.match(r'^(#{1,6})\s+(.+)', line)
35
+ if m:
36
+ level = len(m.group(1))
37
+ heading = m.group(2).strip()
38
+ result.symbols.append(ParsedSymbol(
39
+ kind="heading",
40
+ name=heading[:120],
41
+ line_start=i,
42
+ line_end=i,
43
+ signature=line.strip()[:120],
44
+ is_exported=True,
45
+ ))
46
+ # Also extract code fence language hints as import-like refs
47
+ m2 = re.match(r'^```(\w+)', line)
48
+ if m2:
49
+ result.imports.append(ParsedImport(
50
+ imported_name=m2.group(1),
51
+ line_number=i,
52
+ import_kind="from",
53
+ ))
54
+ return result
55
+
56
+
57
+ # ─── JSON ─────────────────────────────────────────────────────────────────────
58
+
59
+ @register("json")
60
+ class JSONParser(BaseParser):
61
+ language_name = "json"
62
+
63
+ @staticmethod
64
+ def _strip_comments(text: str) -> str:
65
+ """Strip // and /* */ comments for JSONC/tsconfig/launch.json files."""
66
+ import re
67
+ # Remove block comments /* ... */
68
+ text = re.sub(r'/\*.*?\*/', '', text, flags=re.DOTALL)
69
+ # Remove line comments // ... (not inside strings — best-effort)
70
+ text = re.sub(r'(?<!:)//[^\n]*', '', text)
71
+ # Remove trailing commas before } or ] (common in JSONC)
72
+ text = re.sub(r',(\s*[}\]])', r'\1', text)
73
+ return text
74
+
75
+ def parse(self, path: str, content: str) -> ParseResult:
76
+ result = ParseResult(language="json", line_count=content.count("\n") + 1)
77
+ try:
78
+ data = json.loads(content)
79
+ except (json.JSONDecodeError, ValueError):
80
+ # Retry as JSONC (JSON with comments) — tsconfig.json, launch.json, etc.
81
+ try:
82
+ data = json.loads(self._strip_comments(content))
83
+ except (json.JSONDecodeError, ValueError):
84
+ result.parse_error = "Invalid JSON"
85
+ return result
86
+
87
+ if not isinstance(data, dict):
88
+ return result
89
+
90
+ # package.json special handling
91
+ is_package_json = path.endswith("package.json")
92
+ for key, value in data.items():
93
+ kind = "variable"
94
+ if key == "scripts" and isinstance(value, dict):
95
+ for script_name in value:
96
+ result.symbols.append(ParsedSymbol(
97
+ kind="function",
98
+ name=f"script:{script_name}",
99
+ line_start=1,
100
+ line_end=1,
101
+ signature=f"npm run {script_name}",
102
+ ))
103
+ continue
104
+ if key in ("dependencies", "devDependencies", "peerDependencies") and isinstance(value, dict):
105
+ for dep in value:
106
+ result.imports.append(ParsedImport(
107
+ imported_name=dep,
108
+ line_number=1,
109
+ import_kind="module",
110
+ ))
111
+ continue
112
+ val_preview = json.dumps(value)[:60] if value is not None else "null"
113
+ result.symbols.append(ParsedSymbol(
114
+ kind="variable",
115
+ name=key,
116
+ line_start=1,
117
+ line_end=1,
118
+ signature=f"{key}: {val_preview}",
119
+ ))
120
+ return result
121
+
122
+
123
+ # ─── TOML ─────────────────────────────────────────────────────────────────────
124
+
125
+ @register("toml")
126
+ class TOMLParser(BaseParser):
127
+ language_name = "toml"
128
+
129
+ def parse(self, path: str, content: str) -> ParseResult:
130
+ result = ParseResult(language="toml", line_count=content.count("\n") + 1)
131
+ lines = content.splitlines()
132
+ current_section = ""
133
+
134
+ for i, line in enumerate(lines, 1):
135
+ stripped = line.strip()
136
+ if not stripped or stripped.startswith("#"):
137
+ continue
138
+
139
+ # [section] or [[array]]
140
+ m = re.match(r'^\[{1,2}([^\]]+)\]{1,2}', stripped)
141
+ if m:
142
+ current_section = m.group(1).strip()
143
+ result.symbols.append(ParsedSymbol(
144
+ kind="class",
145
+ name=current_section,
146
+ line_start=i,
147
+ line_end=i,
148
+ signature=stripped,
149
+ ))
150
+ continue
151
+
152
+ # key = value
153
+ m2 = re.match(r'^([a-zA-Z_][a-zA-Z0-9_\-\.]*)\s*=', stripped)
154
+ if m2:
155
+ key = m2.group(1)
156
+ full_key = f"{current_section}.{key}" if current_section else key
157
+ result.symbols.append(ParsedSymbol(
158
+ kind="variable",
159
+ name=key,
160
+ line_start=i,
161
+ line_end=i,
162
+ signature=stripped[:80],
163
+ parent_name=current_section if current_section else None,
164
+ ))
165
+
166
+ return result
167
+
168
+
169
+ # ─── YAML ─────────────────────────────────────────────────────────────────────
170
+
171
+ @register("yaml")
172
+ class YAMLParser(BaseParser):
173
+ language_name = "yaml"
174
+
175
+ def parse(self, path: str, content: str) -> ParseResult:
176
+ result = ParseResult(language="yaml", line_count=content.count("\n") + 1)
177
+
178
+ # Try PyYAML if available
179
+ try:
180
+ import yaml
181
+ try:
182
+ data = yaml.safe_load(content)
183
+ if isinstance(data, dict):
184
+ self._extract_yaml_keys(data, result, depth=0, max_depth=2)
185
+ return result
186
+ except yaml.YAMLError:
187
+ pass
188
+ except ImportError:
189
+ pass
190
+
191
+ # Fallback: regex-based top-level key extraction
192
+ lines = content.splitlines()
193
+ for i, line in enumerate(lines, 1):
194
+ m = re.match(r'^([a-zA-Z_][a-zA-Z0-9_\-]*):', line)
195
+ if m:
196
+ result.symbols.append(ParsedSymbol(
197
+ kind="variable",
198
+ name=m.group(1),
199
+ line_start=i,
200
+ line_end=i,
201
+ signature=line.strip()[:80],
202
+ ))
203
+ return result
204
+
205
+ def _extract_yaml_keys(self, data: dict, result: ParseResult,
206
+ depth: int, max_depth: int, parent: str = "") -> None:
207
+ for key, value in data.items():
208
+ name = str(key)
209
+ result.symbols.append(ParsedSymbol(
210
+ kind="class" if isinstance(value, dict) else "variable",
211
+ name=name,
212
+ line_start=1,
213
+ line_end=1,
214
+ signature=f"{name}: {str(value)[:60]}" if not isinstance(value, dict) else f"{name}:",
215
+ parent_name=parent or None,
216
+ ))
217
+ if isinstance(value, dict) and depth < max_depth:
218
+ self._extract_yaml_keys(value, result, depth + 1, max_depth, name)
219
+
220
+
221
+ # ─── Makefile ─────────────────────────────────────────────────────────────────
222
+
223
+ @register("makefile")
224
+ class MakefileParser(BaseParser):
225
+ language_name = "makefile"
226
+
227
+ def parse(self, path: str, content: str) -> ParseResult:
228
+ result = ParseResult(language="makefile", line_count=content.count("\n") + 1)
229
+ lines = content.splitlines()
230
+ for i, line in enumerate(lines, 1):
231
+ # Target: target_name: [dependencies]
232
+ m = re.match(r'^([a-zA-Z_][a-zA-Z0-9_\-\./]*)(?:\s*:(?!=))', line)
233
+ if m and not line.startswith("\t"):
234
+ target = m.group(1)
235
+ if not target.startswith(".") or target in (".PHONY", ".DEFAULT", ".SILENT"):
236
+ result.symbols.append(ParsedSymbol(
237
+ kind="function",
238
+ name=target,
239
+ line_start=i,
240
+ line_end=i,
241
+ signature=line.strip()[:120],
242
+ is_exported=not target.startswith("_"),
243
+ ))
244
+ # Variable definitions
245
+ m2 = re.match(r'^([A-Z_][A-Z0-9_]*)\s*[:?!]?=', line)
246
+ if m2:
247
+ result.symbols.append(ParsedSymbol(
248
+ kind="variable",
249
+ name=m2.group(1),
250
+ line_start=i,
251
+ line_end=i,
252
+ signature=line.strip()[:80],
253
+ ))
254
+ return result
255
+
256
+
257
+ # ─── Dockerfile ───────────────────────────────────────────────────────────────
258
+
259
+ @register("dockerfile")
260
+ class DockerfileParser(BaseParser):
261
+ language_name = "dockerfile"
262
+
263
+ def parse(self, path: str, content: str) -> ParseResult:
264
+ result = ParseResult(language="dockerfile", line_count=content.count("\n") + 1)
265
+ lines = content.splitlines()
266
+ stage_count = 0
267
+ for i, line in enumerate(lines, 1):
268
+ stripped = line.strip()
269
+ if not stripped or stripped.startswith("#"):
270
+ continue
271
+
272
+ upper = stripped.upper()
273
+ # FROM image AS stage_name
274
+ m = re.match(r'FROM\s+\S+(?:\s+AS\s+(\S+))?', stripped, re.IGNORECASE)
275
+ if m:
276
+ stage_name = m.group(1) or f"stage_{stage_count}"
277
+ stage_count += 1
278
+ result.symbols.append(ParsedSymbol(
279
+ kind="class",
280
+ name=stage_name,
281
+ line_start=i,
282
+ line_end=i,
283
+ signature=stripped[:120],
284
+ ))
285
+ continue
286
+
287
+ # ARG / ENV variable declarations
288
+ for keyword in ("ARG", "ENV"):
289
+ if upper.startswith(keyword + " "):
290
+ m2 = re.match(rf'{keyword}\s+([A-Z_][A-Z0-9_]*)', stripped, re.IGNORECASE)
291
+ if m2:
292
+ result.symbols.append(ParsedSymbol(
293
+ kind="variable",
294
+ name=m2.group(1),
295
+ line_start=i,
296
+ line_end=i,
297
+ signature=stripped[:80],
298
+ ))
299
+
300
+ return result
301
+
302
+
303
+ # ─── .env files ───────────────────────────────────────────────────────────────
304
+
305
+ @register("env")
306
+ class EnvParser(BaseParser):
307
+ language_name = "env"
308
+
309
+ def parse(self, path: str, content: str) -> ParseResult:
310
+ result = ParseResult(language="env", line_count=content.count("\n") + 1)
311
+ lines = content.splitlines()
312
+ for i, line in enumerate(lines, 1):
313
+ stripped = line.strip()
314
+ if not stripped or stripped.startswith("#"):
315
+ continue
316
+ m = re.match(r'^export\s+([A-Za-z_][A-Za-z0-9_]*)=?', stripped)
317
+ if not m:
318
+ m = re.match(r'^([A-Za-z_][A-Za-z0-9_]*)=', stripped)
319
+ if m:
320
+ key_name = m.group(1)
321
+ result.symbols.append(ParsedSymbol(
322
+ kind="variable",
323
+ name=key_name,
324
+ line_start=i,
325
+ line_end=i,
326
+ # NEVER store the value — only the key name
327
+ signature=f"{key_name}=<value>",
328
+ is_exported=True,
329
+ ))
330
+ return result
331
+
332
+
333
+ # ─── Protocol Buffers (.proto) ────────────────────────────────────────────────
334
+
335
+ @register("proto")
336
+ class ProtoParser(BaseParser):
337
+ language_name = "proto"
338
+
339
+ def parse(self, path: str, content: str) -> ParseResult:
340
+ result = ParseResult(language="proto", line_count=content.count("\n") + 1)
341
+ lines = content.splitlines()
342
+ for i, line in enumerate(lines, 1):
343
+ stripped = line.strip()
344
+ for keyword, kind in [("message", "class"), ("enum", "enum"),
345
+ ("service", "interface"), ("rpc", "function")]:
346
+ m = re.match(rf'^{keyword}\s+(\w+)', stripped)
347
+ if m:
348
+ result.symbols.append(ParsedSymbol(
349
+ kind=kind,
350
+ name=m.group(1),
351
+ line_start=i,
352
+ line_end=i,
353
+ signature=stripped[:120],
354
+ ))
355
+ # import "foo.proto"
356
+ m2 = re.match(r'^import\s+"([^"]+)"', stripped)
357
+ if m2:
358
+ result.imports.append(ParsedImport(imported_name=m2.group(1), line_number=i))
359
+ return result
360
+
361
+
362
+ # ─── GraphQL (.graphql / .gql) ───────────────────────────────────────────────
363
+
364
+ @register("graphql")
365
+ class GraphQLParser(BaseParser):
366
+ language_name = "graphql"
367
+
368
+ _KEYWORDS = {
369
+ "type": "class", "interface": "interface", "enum": "enum",
370
+ "union": "type_alias", "input": "struct", "scalar": "type_alias",
371
+ "query": "function", "mutation": "function", "subscription": "function",
372
+ "fragment": "function", "directive": "function", "schema": "class",
373
+ "extend": "class",
374
+ }
375
+
376
+ def parse(self, path: str, content: str) -> ParseResult:
377
+ result = ParseResult(language="graphql", line_count=content.count("\n") + 1)
378
+ lines = content.splitlines()
379
+ for i, line in enumerate(lines, 1):
380
+ stripped = line.strip()
381
+ for kw, kind in self._KEYWORDS.items():
382
+ m = re.match(rf'^{kw}\s+(\w+)', stripped)
383
+ if m:
384
+ result.symbols.append(ParsedSymbol(
385
+ kind=kind,
386
+ name=m.group(1),
387
+ line_start=i,
388
+ line_end=i,
389
+ signature=stripped[:120],
390
+ ))
391
+ break
392
+ return result
393
+
394
+
395
+ # ─── SQL ──────────────────────────────────────────────────────────────────────
396
+
397
+ @register("sql")
398
+ class SQLParser(BaseParser):
399
+ language_name = "sql"
400
+
401
+ def parse(self, path: str, content: str) -> ParseResult:
402
+ result = ParseResult(language="sql", line_count=content.count("\n") + 1)
403
+ # Multi-line so we join and scan
404
+ combined = " ".join(l.strip() for l in content.splitlines() if not l.strip().startswith("--"))
405
+ patterns = [
406
+ (r'CREATE\s+(?:OR\s+REPLACE\s+)?TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?([`"\[\w\.]+)', "class"),
407
+ (r'CREATE\s+(?:OR\s+REPLACE\s+)?(?:DEFINER[^F]*)?FUNCTION\s+([`"\[\w\.]+)', "function"),
408
+ (r'CREATE\s+(?:OR\s+REPLACE\s+)?PROCEDURE\s+([`"\[\w\.]+)', "function"),
409
+ (r'CREATE\s+(?:OR\s+REPLACE\s+)?VIEW\s+([`"\[\w\.]+)', "class"),
410
+ (r'CREATE\s+(?:UNIQUE\s+)?INDEX\s+(\w+)', "variable"),
411
+ (r'CREATE\s+(?:OR\s+REPLACE\s+)?TRIGGER\s+(\w+)', "function"),
412
+ ]
413
+ for pattern, kind in patterns:
414
+ for m in re.finditer(pattern, combined, re.IGNORECASE):
415
+ name = m.group(1).strip('`"[]')
416
+ result.symbols.append(ParsedSymbol(
417
+ kind=kind,
418
+ name=name,
419
+ line_start=1,
420
+ line_end=1,
421
+ signature=combined[m.start():m.start() + 80].strip(),
422
+ ))
423
+ return result
424
+
425
+
426
+ # ─── HCL / Terraform (.tf) ───────────────────────────────────────────────────
427
+
428
+ @register("hcl")
429
+ class HCLParser(BaseParser):
430
+ language_name = "hcl"
431
+
432
+ def parse(self, path: str, content: str) -> ParseResult:
433
+ result = ParseResult(language="hcl", line_count=content.count("\n") + 1)
434
+ lines = content.splitlines()
435
+ for i, line in enumerate(lines, 1):
436
+ stripped = line.strip()
437
+ # resource "type" "name" {
438
+ m = re.match(r'^(\w+)\s+"([^"]+)"\s+"([^"]+)"\s*\{?', stripped)
439
+ if m:
440
+ block_type, res_type, res_name = m.groups()
441
+ result.symbols.append(ParsedSymbol(
442
+ kind="class" if block_type == "resource" else "function",
443
+ name=f"{res_type}.{res_name}",
444
+ line_start=i,
445
+ line_end=i,
446
+ signature=stripped[:120],
447
+ ))
448
+ continue
449
+ # variable "name" {, output "name" {, module "name" {
450
+ m2 = re.match(r'^(\w+)\s+"([^"]+)"\s*\{?', stripped)
451
+ if m2:
452
+ block_type, name = m2.groups()
453
+ if block_type in ("variable", "output", "module", "data", "locals",
454
+ "provider", "terraform"):
455
+ result.symbols.append(ParsedSymbol(
456
+ kind="variable" if block_type in ("variable", "output") else "class",
457
+ name=f"{block_type}.{name}" if block_type not in ("variable", "output") else name,
458
+ line_start=i,
459
+ line_end=i,
460
+ signature=stripped[:120],
461
+ ))
462
+ return result
@@ -0,0 +1,95 @@
1
+ """
2
+ Generic regex-based fallback parser.
3
+ Used for languages without a tree-sitter grammar, or when tree-sitter is unavailable.
4
+ Extracts function/class-like patterns with simple regex — better than nothing.
5
+ """
6
+
7
from __future__ import annotations

import bisect
import re

from ..models import ParseResult, ParsedSymbol, ParsedImport
from .base import BaseParser, register
11
+
12
+
13
+ # Patterns that work across most C-style and Python-style languages
14
+ _FUNC_PATTERNS = [
15
+ # Python
16
+ re.compile(r"^\s*(?:async\s+)?def\s+(\w+)\s*\(", re.M),
17
+ # JS/TS
18
+ re.compile(r"^\s*(?:async\s+)?function\s+(\w+)\s*\(", re.M),
19
+ re.compile(r"^\s*(?:export\s+)?(?:async\s+)?(?:function\s+)?(\w+)\s*=\s*(?:async\s*)?\(", re.M),
20
+ # Go
21
+ re.compile(r"^func\s+(?:\(\w+\s+\*?\w+\)\s+)?(\w+)\s*\(", re.M),
22
+ # Rust
23
+ re.compile(r"^\s*(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*[(<]", re.M),
24
+ # Java/C#
25
+ re.compile(r"^\s*(?:public|private|protected|static|async)[\s\w]*\s+(\w+)\s*\(", re.M),
26
+ ]
27
+
28
+ _CLASS_PATTERNS = [
29
+ re.compile(r"^\s*class\s+(\w+)", re.M),
30
+ re.compile(r"^\s*(?:pub\s+)?struct\s+(\w+)", re.M),
31
+ re.compile(r"^\s*(?:pub\s+)?enum\s+(\w+)", re.M),
32
+ re.compile(r"^\s*(?:pub\s+)?trait\s+(\w+)", re.M),
33
+ re.compile(r"^\s*interface\s+(\w+)", re.M),
34
+ ]
35
+
36
+ _IMPORT_PATTERNS = [
37
+ re.compile(r"^(?:import|from)\s+([\w.]+)", re.M),
38
+ re.compile(r'^import\s+"([\w./]+)"', re.M),
39
+ re.compile(r'^use\s+([\w:]+)', re.M),
40
+ re.compile(r'^require\s*\(\s*["\']([^"\']+)', re.M),
41
+ re.compile(r'^import\s+.*\s+from\s+["\']([^"\']+)', re.M),
42
+ ]
43
+
44
+
45
class GenericParser(BaseParser):
    """Regex-based fallback: best-effort extraction of functions, classes,
    and imports for files with no dedicated parser."""
    language_name = "generic"

    def parse(self, path: str, content: str) -> ParseResult:
        result = ParseResult(language="generic")
        lines = content.splitlines()
        result.line_count = len(lines)

        # Offsets of every newline, computed once, so a match offset maps to
        # its line number via an O(log n) bisect.  The original recounted
        # the entire prefix (`content[:m.start()].count("\n")`) for every
        # match — accidentally O(n^2) on large files.
        nl_offsets = [idx for idx, ch in enumerate(content) if ch == "\n"]

        def line_of(offset: int) -> int:
            # Number of newlines strictly before `offset`, plus one.
            return bisect.bisect_left(nl_offsets, offset) + 1

        def signature_at(line_no: int) -> str:
            return lines[line_no - 1].strip() if line_no <= len(lines) else ""

        seen_names: set[str] = set()

        # Functions: public names only (leading '_' treated as private).
        for pat in _FUNC_PATTERNS:
            for m in pat.finditer(content):
                name = m.group(1)
                if name and name not in seen_names and not name.startswith("_"):
                    seen_names.add(name)
                    line_no = line_of(m.start())
                    result.symbols.append(ParsedSymbol(
                        kind="function",
                        name=name,
                        line_start=line_no,
                        line_end=line_no,
                        signature=signature_at(line_no),
                    ))

        # Class-like declarations; `seen_names` is shared so a name already
        # reported as a function is not duplicated as a class.
        for pat in _CLASS_PATTERNS:
            for m in pat.finditer(content):
                name = m.group(1)
                if name and name not in seen_names:
                    seen_names.add(name)
                    line_no = line_of(m.start())
                    result.symbols.append(ParsedSymbol(
                        kind="class",
                        name=name,
                        line_start=line_no,
                        line_end=line_no,
                        signature=signature_at(line_no),
                    ))

        # Imports, deduplicated by imported name.
        seen_imports: set[str] = set()
        for pat in _IMPORT_PATTERNS:
            for m in pat.finditer(content):
                name = m.group(1)
                if name and name not in seen_imports:
                    seen_imports.add(name)
                    result.imports.append(ParsedImport(
                        imported_name=name,
                        line_number=line_of(m.start()),
                    ))

        return result