codebeacon 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. codebeacon/__init__.py +1 -0
  2. codebeacon/__main__.py +3 -0
  3. codebeacon/cache.py +136 -0
  4. codebeacon/cli.py +391 -0
  5. codebeacon/common/__init__.py +0 -0
  6. codebeacon/common/filters.py +170 -0
  7. codebeacon/common/symbols.py +121 -0
  8. codebeacon/common/types.py +98 -0
  9. codebeacon/config.py +144 -0
  10. codebeacon/contextmap/__init__.py +0 -0
  11. codebeacon/contextmap/generator.py +602 -0
  12. codebeacon/discover/__init__.py +0 -0
  13. codebeacon/discover/detector.py +388 -0
  14. codebeacon/discover/scanner.py +192 -0
  15. codebeacon/export/__init__.py +0 -0
  16. codebeacon/export/mcp.py +515 -0
  17. codebeacon/export/obsidian.py +812 -0
  18. codebeacon/extract/__init__.py +22 -0
  19. codebeacon/extract/base.py +372 -0
  20. codebeacon/extract/components.py +357 -0
  21. codebeacon/extract/dependencies.py +140 -0
  22. codebeacon/extract/entities.py +575 -0
  23. codebeacon/extract/queries/README.md +116 -0
  24. codebeacon/extract/queries/actix.scm +115 -0
  25. codebeacon/extract/queries/angular.scm +155 -0
  26. codebeacon/extract/queries/aspnet.scm +159 -0
  27. codebeacon/extract/queries/django.scm +122 -0
  28. codebeacon/extract/queries/express.scm +124 -0
  29. codebeacon/extract/queries/fastapi.scm +152 -0
  30. codebeacon/extract/queries/flask.scm +120 -0
  31. codebeacon/extract/queries/gin.scm +142 -0
  32. codebeacon/extract/queries/ktor.scm +144 -0
  33. codebeacon/extract/queries/laravel.scm +172 -0
  34. codebeacon/extract/queries/nestjs.scm +183 -0
  35. codebeacon/extract/queries/rails.scm +114 -0
  36. codebeacon/extract/queries/react.scm +111 -0
  37. codebeacon/extract/queries/spring_boot.scm +204 -0
  38. codebeacon/extract/queries/svelte.scm +73 -0
  39. codebeacon/extract/queries/vapor.scm +130 -0
  40. codebeacon/extract/queries/vue.scm +123 -0
  41. codebeacon/extract/routes.py +910 -0
  42. codebeacon/extract/semantic.py +280 -0
  43. codebeacon/extract/services.py +597 -0
  44. codebeacon/graph/__init__.py +1 -0
  45. codebeacon/graph/analyze.py +281 -0
  46. codebeacon/graph/build.py +320 -0
  47. codebeacon/graph/cluster.py +160 -0
  48. codebeacon/graph/enrich.py +206 -0
  49. codebeacon/skill/SKILL.md +127 -0
  50. codebeacon/wave.py +292 -0
  51. codebeacon/wiki/__init__.py +0 -0
  52. codebeacon/wiki/generator.py +376 -0
  53. codebeacon/wiki/index.py +95 -0
  54. codebeacon/wiki/templates.py +467 -0
  55. codebeacon-0.1.2.dist-info/METADATA +319 -0
  56. codebeacon-0.1.2.dist-info/RECORD +59 -0
  57. codebeacon-0.1.2.dist-info/WHEEL +4 -0
  58. codebeacon-0.1.2.dist-info/entry_points.txt +2 -0
  59. codebeacon-0.1.2.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,280 @@
1
+ """Semantic extraction: structured comment/docstring parsing → Edge objects.
2
+
3
+ Activated with `--semantic` flag. Does NOT require an LLM by default — parses
4
+ structured documentation comments (Javadoc, Python docstrings, JSDoc) to infer
5
+ additional "references" relationships between code entities.
6
+
7
+ LLM-based deeper inference is available via extract_semantic_llm() when an
8
+ ANTHROPIC_API_KEY is set.
9
+
10
+ Public API:
11
+ extract_semantic_refs(file_path, framework, source_node_id="") -> list[Edge]
12
+ extract_semantic_llm(file_path, framework, source_node_id="") -> list[Edge]
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from pathlib import Path
19
+ from typing import Optional
20
+
21
+ from codebeacon.common.types import Edge
22
+
23
+
24
# ── Patterns ──────────────────────────────────────────────────────────────────

# Javadoc / KDoc: @see ClassName, {@link ClassName#method}, @throws ClassName
_JAVADOC_SEE = re.compile(r"@see\s+([\w.]+)")
_JAVADOC_LINK = re.compile(r"\{@link\s+([\w.#]+)\}")
# NOTE(review): _JAVADOC_PARAM_TYPE is not referenced anywhere in this module —
# possibly reserved for future use or dead; confirm against other modules before removing.
_JAVADOC_PARAM_TYPE = re.compile(r"@param\s+\{?([\w<>\[\]]+)\}?\s+\w+")
_JAVADOC_THROWS = re.compile(r"@throws\s+([\w.]+)")

# Python docstring: :class:`ClassName`, :func:`name`, :meth:`ClassName.method`
_PY_CROSS_REF = re.compile(r":(?:class|func|meth|exc|attr):`([\w.]+)`")
# "See Also" section in NumPy/Google style docstrings: a header line followed
# by an optional dashed underline, then one or more indented name lines.
_PY_SEE_ALSO = re.compile(r"(?:See Also|See also)\s*\n\s*[-–]*\s*\n((?:\s+[\w., ]+\n)+)", re.MULTILINE)
# NOTE(review): _PY_SEE_ALSO_INLINE is not referenced in this module.
_PY_SEE_ALSO_INLINE = re.compile(r"See[: ]+`?([\w]+)`?")

# JSDoc: @see ClassName, @param {ClassName} name, @returns {ClassName}
_JSDOC_SEE = re.compile(r"@see\s+\{?([\w.]+)\}?")
# Type expression inside braces for @param/@return(s)/@type/@throws; may contain
# union (|), generics (<>) and comma-separated parts — split downstream.
_JSDOC_TYPE = re.compile(r"@(?:param|returns?|type|throws)\s+\{([\w<>|, ]+)\}")

# Strip leading * from comment lines (the decorative column inside /** */ blocks)
_COMMENT_STAR = re.compile(r"^\s*\*+\s?", re.MULTILINE)

# Extract block comments /** ... */ and /* ... */
_BLOCK_COMMENT = re.compile(r"/\*\*?(.*?)\*/", re.DOTALL)
# Extract Python triple-quoted docstrings (either quote style; two capture groups)
_PY_DOCSTRING = re.compile(r'"""(.*?)"""|\'\'\'(.*?)\'\'\'', re.DOTALL)
# Extract line comments // ...
# NOTE(review): _LINE_COMMENT is not referenced in this module.
_LINE_COMMENT = re.compile(r"//[^\n]*")
# Extract Python # comments
# NOTE(review): _PY_LINE_COMMENT and _HASH_COMMENT are identical patterns and
# neither is referenced in this module.
_PY_LINE_COMMENT = re.compile(r"#[^\n]*")
# Extract Ruby/Shell # comments
_HASH_COMMENT = re.compile(r"#[^\n]*")
55
+
56
+
57
+ def _is_type_name(token: str) -> bool:
58
+ """Heuristic: is this token a meaningful class/type name (not a primitive)?"""
59
+ token = token.strip()
60
+ if not token or len(token) < 2:
61
+ return False
62
+ primitives = {
63
+ "int", "long", "float", "double", "boolean", "void", "string",
64
+ "String", "Integer", "Long", "Float", "Double", "Boolean", "Object",
65
+ "any", "unknown", "never", "undefined", "null", "true", "false",
66
+ "str", "bytes", "list", "dict", "tuple", "set", "bool", "type",
67
+ }
68
+ return token not in primitives and token[0].isupper()
69
+
70
+
71
def _make_ref_edge(source_node_id: str, target_name: str, source_file: str) -> Edge:
    """Build a low-confidence 'references' edge inferred from documentation text."""
    attrs = {
        "source": source_node_id,
        "target": target_name,
        "relation": "references",
        "confidence": "INFERRED",
        "confidence_score": 0.5,
        "source_file": source_file,
    }
    return Edge(**attrs)
80
+
81
+
82
def _extract_java_refs(content: str, source_node_id: str, source_file: str) -> list[Edge]:
    """Collect inferred reference edges from Javadoc/KDoc block comments.

    Scans every /* ... */ block for @see, {@link ...} and @throws tags; each
    unique, non-primitive type name produces one edge. Deduplication spans
    all comment blocks in the file.
    """
    found: set[str] = set()
    result: list[Edge] = []

    for comment in _BLOCK_COMMENT.finditer(content):
        text = _COMMENT_STAR.sub("", comment.group(1))

        # Gather raw (possibly dotted/qualified) candidates per tag kind,
        # preserving the original scan order: @see, then {@link}, then @throws.
        candidates = (
            [m.group(1) for m in _JAVADOC_SEE.finditer(text)]
            + [m.group(1).split("#")[0] for m in _JAVADOC_LINK.finditer(text)]
            + [m.group(1) for m in _JAVADOC_THROWS.finditer(text)]
        )

        for qualified in candidates:
            simple = qualified.split(".")[-1]  # keep only the class name
            if _is_type_name(simple) and simple not in found:
                found.add(simple)
                result.append(_make_ref_edge(source_node_id, simple, source_file))

    return result
108
+
109
+
110
def _extract_python_refs(content: str, source_node_id: str, source_file: str) -> list[Edge]:
    """Collect inferred reference edges from Python docstrings.

    Handles Sphinx cross-references (:class:`X`, :func:`x`, :meth:`C.m`, ...)
    and the name lists under NumPy/Google-style "See Also" sections.
    Deduplication spans all docstrings in the file.
    """
    found: set[str] = set()
    result: list[Edge] = []

    def _add(candidate: str) -> None:
        # Record one edge per unique meaningful type name.
        if _is_type_name(candidate) and candidate not in found:
            found.add(candidate)
            result.append(_make_ref_edge(source_node_id, candidate, source_file))

    for ds in _PY_DOCSTRING.finditer(content):
        # Either quote style matched; exactly one of the two groups is set.
        text = ds.group(1) or ds.group(2) or ""

        for ref in _PY_CROSS_REF.finditer(text):
            _add(ref.group(1).split(".")[-1])

        for section in _PY_SEE_ALSO.finditer(text):
            for raw in re.split(r"[,\s]+", section.group(1)):
                _add(raw.strip().rstrip("()"))

    return result
131
+
132
+
133
def _extract_js_refs(content: str, source_node_id: str, source_file: str) -> list[Edge]:
    """Collect inferred reference edges from JSDoc block comments.

    Handles @see tags plus brace-wrapped type expressions on @param,
    @return(s), @type and @throws; union/generic expressions are split into
    their component names. Deduplication spans all comment blocks.
    """
    found: set[str] = set()
    result: list[Edge] = []

    def _add(candidate: str) -> None:
        candidate = candidate.strip()
        if _is_type_name(candidate) and candidate not in found:
            found.add(candidate)
            result.append(_make_ref_edge(source_node_id, candidate, source_file))

    for comment in _BLOCK_COMMENT.finditer(content):
        text = _COMMENT_STAR.sub("", comment.group(1))

        for see in _JSDOC_SEE.finditer(text):
            _add(see.group(1).split(".")[-1])

        for type_expr in _JSDOC_TYPE.finditer(text):
            # Split union types like {UserService | AdminService} and
            # generics like {Map<string, Order>} into individual names.
            for piece in re.split(r"[|,<> ]+", type_expr.group(1)):
                _add(piece)

    return result
155
+
156
+
157
# ── Extension → extractor dispatch ──────────────────────────────────────────

# File extensions grouped by which comment-dialect extractor handles them.
# Kotlin and Swift share the Javadoc extractor — presumably because KDoc and
# Swift doc comments use the same /** ... */ + @tag shapes matched above.
_JAVA_EXTS = frozenset({".java", ".kt", ".kts", ".swift"})
_PY_EXTS = frozenset({".py"})
# .vue/.svelte single-file components are routed to the JSDoc extractor —
# assumes their embedded <script> sections carry JSDoc-style comments.
_JS_EXTS = frozenset({".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", ".vue", ".svelte"})
162
+
163
+
164
+ def extract_semantic_refs(
165
+ file_path: str,
166
+ framework: str,
167
+ source_node_id: str = "",
168
+ ) -> list[Edge]:
169
+ """Parse structured comments in a source file and return inferred reference edges.
170
+
171
+ Args:
172
+ file_path: absolute path to the source file
173
+ framework: detected framework name (used for routing dispatch)
174
+ source_node_id: the node ID that 'owns' these references
175
+ (defaults to file_path if empty)
176
+
177
+ Returns:
178
+ list of Edge objects with relation="references", confidence="INFERRED"
179
+ """
180
+ path = Path(file_path)
181
+ if not path.exists():
182
+ return []
183
+
184
+ if not source_node_id:
185
+ source_node_id = str(path)
186
+
187
+ try:
188
+ content = path.read_text(encoding="utf-8", errors="replace")
189
+ except OSError:
190
+ return []
191
+
192
+ ext = path.suffix.lower()
193
+ if ext in _JAVA_EXTS:
194
+ return _extract_java_refs(content, source_node_id, file_path)
195
+ elif ext in _PY_EXTS:
196
+ return _extract_python_refs(content, source_node_id, file_path)
197
+ elif ext in _JS_EXTS:
198
+ return _extract_js_refs(content, source_node_id, file_path)
199
+ return []
200
+
201
+
202
def extract_semantic_llm(
    file_path: str,
    framework: str,
    source_node_id: str = "",
    model: str = "claude-haiku-4-5-20251001",
) -> list[Edge]:
    """LLM-based deeper semantic inference.

    Requires ANTHROPIC_API_KEY environment variable.
    Falls back to extract_semantic_refs() if the API key is not set, or if
    the API call or response parsing fails for any reason.

    Args:
        file_path: source file path
        framework: detected framework (included in the prompt as context)
        source_node_id: node ID for the source of edges
            (defaults to file_path if empty)
        model: Claude model to use (defaults to Haiku for cost efficiency)

    Returns:
        list of Edge objects with confidence="INFERRED", confidence_score=0.7
    """
    import os

    if not os.environ.get("ANTHROPIC_API_KEY"):
        return extract_semantic_refs(file_path, framework, source_node_id)

    path = Path(file_path)
    if not path.exists():
        return []

    if not source_node_id:
        source_node_id = str(path)

    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return []

    # Truncate to avoid large token counts
    MAX_CHARS = 4000
    excerpt = content[:MAX_CHARS]

    prompt = (
        "Analyze this source file and list ONLY explicit class/type references that appear "
        "in comments, docstrings, or type annotations — NOT in code logic.\n"
        "Return a JSON array of strings (type/class names only). Example: [\"UserService\", \"OrderRepository\"]\n"
        "If none found, return []\n\n"
        f"File: {path.name}\nFramework: {framework}\n\n```\n{excerpt}\n```"
    )

    try:
        import json

        import anthropic

        client = anthropic.Anthropic()
        message = client.messages.create(
            model=model,
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}],
        )
        # Guard against an empty content list instead of raising IndexError
        # (which the broad except below would silently turn into a fallback).
        if not message.content:
            return []
        raw = message.content[0].text.strip()

        # Models often wrap JSON in markdown code fences (```json ... ```).
        # Previously this made json.loads raise and silently discarded the
        # LLM result via the fallback; strip fences so good answers survive.
        if raw.startswith("```"):
            raw = raw.strip("`")
            if raw.startswith("json"):
                raw = raw[len("json"):]
            raw = raw.strip()

        names = json.loads(raw)
        if not isinstance(names, list):
            return []

        edges: list[Edge] = []
        seen: set[str] = set()
        for name in names:
            if isinstance(name, str) and _is_type_name(name) and name not in seen:
                seen.add(name)
                edges.append(Edge(
                    source=source_node_id,
                    target=name,
                    relation="references",
                    confidence="INFERRED",
                    confidence_score=0.7,  # LLM inference is trusted more than regex (0.5)
                    source_file=file_path,
                ))
        return edges
    except Exception:
        # Fall back to regex parsing (best-effort by design: network errors,
        # malformed JSON, or a missing anthropic package must not crash scans)
        return extract_semantic_refs(file_path, framework, source_node_id)