flurryx-code-memory 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. code_memory/__init__.py +1 -0
  2. code_memory/claims/__init__.py +32 -0
  3. code_memory/claims/extractor.py +325 -0
  4. code_memory/claims/indexer.py +258 -0
  5. code_memory/claims/resolver.py +186 -0
  6. code_memory/claims/store.py +424 -0
  7. code_memory/cli.py +1192 -0
  8. code_memory/config.py +268 -0
  9. code_memory/embed/__init__.py +224 -0
  10. code_memory/embed/cache.py +204 -0
  11. code_memory/embed/m3.py +174 -0
  12. code_memory/embed/ollama.py +92 -0
  13. code_memory/embed/tei.py +106 -0
  14. code_memory/episodic/__init__.py +3 -0
  15. code_memory/episodic/sqlite_store.py +278 -0
  16. code_memory/extractor/__init__.py +3 -0
  17. code_memory/extractor/csproj.py +166 -0
  18. code_memory/extractor/dll.py +385 -0
  19. code_memory/extractor/gitignore.py +162 -0
  20. code_memory/extractor/nuget.py +275 -0
  21. code_memory/extractor/sanity.py +124 -0
  22. code_memory/extractor/sln.py +108 -0
  23. code_memory/extractor/treesitter.py +1172 -0
  24. code_memory/graph/__init__.py +3 -0
  25. code_memory/graph/falkor_store.py +740 -0
  26. code_memory/mcp_server.py +1816 -0
  27. code_memory/metrics.py +260 -0
  28. code_memory/orchestrator/__init__.py +13 -0
  29. code_memory/orchestrator/git_delta.py +211 -0
  30. code_memory/orchestrator/ingest_state.py +71 -0
  31. code_memory/orchestrator/pipeline.py +1478 -0
  32. code_memory/orchestrator/reset.py +130 -0
  33. code_memory/orchestrator/resolver.py +825 -0
  34. code_memory/orchestrator/retrieve.py +505 -0
  35. code_memory/resilience.py +73 -0
  36. code_memory/sync/__init__.py +20 -0
  37. code_memory/sync/autostart/__init__.py +42 -0
  38. code_memory/sync/autostart/base.py +106 -0
  39. code_memory/sync/autostart/launchd.py +115 -0
  40. code_memory/sync/autostart/schtasks.py +155 -0
  41. code_memory/sync/autostart/systemd.py +113 -0
  42. code_memory/sync/hooks.py +164 -0
  43. code_memory/sync/safety.py +65 -0
  44. code_memory/sync/snapshot.py +461 -0
  45. code_memory/sync/store.py +399 -0
  46. code_memory/sync/sync.py +405 -0
  47. code_memory/sync/watcher.py +320 -0
  48. code_memory/vector/__init__.py +3 -0
  49. code_memory/vector/qdrant_store.py +302 -0
  50. flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
  51. flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
  52. flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
  53. flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,162 @@
1
+ """Lightweight .gitignore matcher with no external deps.
2
+
3
+ Supports the subset of gitignore syntax that matters in practice:
4
+
5
+ - blank lines + ``#`` comments
6
+ - ``!`` negation
7
+ - ``/`` anchored (root-relative) vs unanchored patterns
8
+ - trailing ``/`` directory-only patterns
9
+ - ``*`` (no slash), ``?``, ``**`` globs
10
+ - collects ``.gitignore`` files recursively from the root downward
11
+
12
+ This is deliberately not a full re-implementation of git's matcher; the goal
13
+ is to avoid indexing build artifacts (``.angular/``, ``tmp/``, ``dist/``…)
14
+ that repos already ignore.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import fnmatch
20
+ import re
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+
24
+
25
+ @dataclass(frozen=True)
26
+ class _Rule:
27
+ base: Path # directory containing the .gitignore that owns the rule
28
+ pattern: str
29
+ negate: bool
30
+ dir_only: bool
31
+ anchored: bool
32
+ regex: re.Pattern[str]
33
+
34
+
35
+ def _compile(pattern: str) -> re.Pattern[str]:
36
+ parts = pattern.split("/")
37
+ out: list[str] = []
38
+ for i, seg in enumerate(parts):
39
+ if seg == "**":
40
+ out.append(".*")
41
+ else:
42
+ # fnmatch.translate adds anchors we don't want — strip them
43
+ tr = fnmatch.translate(seg)
44
+ # python's translate yields "(?s:...)\\Z"; extract inner
45
+ m = re.match(r"\(\?s:(.*)\)\\Z", tr)
46
+ inner = m.group(1) if m else tr
47
+ out.append(inner)
48
+ if i != len(parts) - 1:
49
+ out.append("/")
50
+ body = "".join(out)
51
+ # collapse leading "**/" -> "(?:.*/)?" to allow zero-segment match
52
+ body = re.sub(r"^\.\*/", "(?:.*/)?", body)
53
+ return re.compile(rf"^{body}$")
54
+
55
+
56
def _parse_line(line: str, base: Path) -> _Rule | None:
    """Turn one ``.gitignore`` line into a rule, or ``None`` if it has none.

    Blank lines and ``#`` comments yield ``None``.  A leading ``!`` marks
    a negation, a trailing ``/`` marks a directory-only rule, and a ``/``
    anywhere else anchors the rule to ``base`` unless the pattern starts
    with ``**/``.  A leading ``/`` is dropped before compilation.
    """
    text = line.rstrip("\n").rstrip("\r")
    if not text:
        return None
    if text.lstrip().startswith("#"):
        return None

    is_negation = text.startswith("!")
    body = (text[1:] if is_negation else text).strip()
    if not body:
        return None

    directory_only = body.endswith("/")
    if directory_only:
        body = body[:-1]

    # Anchoring is decided before the leading slash is removed.
    is_anchored = not body.startswith("**/") and "/" in body
    if body.startswith("/"):
        body = body[1:]

    return _Rule(
        base=base,
        pattern=body,
        negate=is_negation,
        dir_only=directory_only,
        anchored=is_anchored,
        regex=_compile(body),
    )
80
+
81
+
82
class GitignoreMatcher:
    """Walk-aware gitignore matcher.

    Load all ``.gitignore`` files under ``root`` upfront, then call
    :meth:`match` per candidate path during the walk.
    """

    def __init__(self, root: Path, rules: list[_Rule]) -> None:
        """Store the resolved ``root`` and the ordered rule list.

        Rule order matters: within one path, the last matching rule wins.
        """
        self._root = root.resolve()
        self._rules = rules

    @classmethod
    def from_root(cls, root: str | Path) -> GitignoreMatcher:
        """Build a matcher from every ``.gitignore`` found under ``root``.

        Returns a matcher with no rules when ``root`` is not a directory.
        Unreadable ``.gitignore`` files are skipped.
        """
        root_path = Path(root).resolve()
        rules: list[_Rule] = []
        if not root_path.is_dir():
            return cls(root_path, rules)
        # always seed with .git/ so we never index the git directory
        rules.append(
            _Rule(
                base=root_path,
                pattern=".git",
                negate=False,
                dir_only=True,
                anchored=False,
                regex=_compile(".git"),
            )
        )
        # rglob order is unspecified.  Sort shallow-to-deep so rules from a
        # nested .gitignore come after (and therefore override) rules from
        # its parents, and so the result is deterministic across runs.
        gitignores = sorted(
            root_path.rglob(".gitignore"),
            key=lambda p: (len(p.parts), p.as_posix()),
        )
        for gi in gitignores:
            try:
                lines = gi.read_text(encoding="utf-8", errors="replace").splitlines()
            except OSError:
                continue
            base = gi.parent.resolve()
            for line in lines:
                rule = _parse_line(line, base)
                if rule is not None:
                    rules.append(rule)
        return cls(root_path, rules)

    def _is_ignored(self, candidate: Path, candidate_is_dir: bool) -> bool:
        """Evaluate every rule against one path; the last matching rule wins."""
        ignored = False
        for rule in self._rules:
            if rule.dir_only and not candidate_is_dir:
                continue
            try:
                rel_from_base = candidate.relative_to(rule.base)
            except ValueError:
                # Rule belongs to a .gitignore that does not govern this path.
                continue
            rel_str = rel_from_base.as_posix()
            if rel_str in ("", "."):
                continue
            hit = rule.regex.match(rel_str) is not None
            if not hit and not rule.anchored:
                # Unanchored patterns may match any single path component.
                hit = any(rule.regex.match(part) for part in rel_from_base.parts)
            if hit:
                ignored = not rule.negate
        return ignored

    def match(self, path: Path, *, is_dir: bool) -> bool:
        """Return True if ``path`` is ignored. Last matching rule wins.

        Per gitignore semantics, a file is ignored if any of its ancestor
        directories is ignored, so the chain root -> path is evaluated in
        order and the first ignored entry settles the answer: a negation
        cannot re-include something below an excluded directory.

        Returns False when ``path`` cannot be resolved or lies outside
        the matcher's root.
        """
        try:
            abs_path = path.resolve()
        except OSError:
            return False
        # Ancestors at or below the root, ordered from the root toward the leaf.
        chain: list[tuple[Path, bool]] = [
            (parent, True)
            for parent in reversed(abs_path.parents)
            if parent == self._root or self._root in parent.parents
        ]
        chain.append((abs_path, is_dir))

        for candidate, candidate_is_dir in chain:
            if candidate == self._root:
                continue
            if self._is_ignored(candidate, candidate_is_dir):
                return True
        return False
@@ -0,0 +1,275 @@
1
+ """Resolve `.csproj` references to physical DLL paths.
2
+
3
+ The graph layer wants Assembly nodes keyed by `Name, Version=…`.
4
+ Csproj parsing gives us PackageReference + ProjectReference logical
5
+ identities; this module turns those into concrete `.dll` files on
6
+ disk so the metadata reader has something to open.
7
+
8
+ Two sources, in priority order:
9
+
10
+ 1. **NuGet global cache** at ``$NUGET_PACKAGES`` or ``~/.nuget/packages``.
11
+ Standard layout:
12
+ ``{cache}/{pkg_lower}/{version}/lib/{tfm}/{Foo}.dll``.
13
+ 2. **Build output** of project references:
14
+ ``{ref_csproj_dir}/bin/{config}/{tfm}/{AssemblyName}.dll``.
15
+ We pick Debug if both exist; the contents differ only in PDBs /
16
+ optimisation, not in the public type surface that we index.
17
+
18
+ Either source can be missing (offline machine, build never run);
19
+ unresolved references are skipped — we never emit fictional paths.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import os
25
+ from collections.abc import Iterable
26
+ from dataclasses import dataclass
27
+ from pathlib import Path
28
+
29
+ from .csproj import CsprojInfo, PackageRef
30
+
31
+
32
def nuget_cache_dirs() -> list[Path]:
    """Return the existing NuGet package roots, best first, without duplicates.

    The ``NUGET_PACKAGES`` environment variable, when set, is tried
    first; the per-user default ``~/.nuget/packages`` follows.  Roots
    that do not exist on disk are left out, and existing roots are
    returned in resolved form so two spellings of one directory collapse
    into a single entry.
    """
    roots: list[Path] = []
    override = os.environ.get("NUGET_PACKAGES")
    if override:
        roots.append(Path(override).expanduser())
    roots.append(Path.home() / ".nuget" / "packages")

    visited: set[Path] = set()
    found: list[Path] = []
    for root in roots:
        key = root.resolve() if root.exists() else root
        if key in visited:
            continue
        visited.add(key)
        if key.exists():
            found.append(key)
    return found
55
+
56
+
57
# TFM compatibility table — only the broad strokes. Real NuGet
# compatibility is a directed graph of moniker rules; we cover the
# common parent-fallback so resolution doesn't silently miss anything
# obvious. Order matters: earlier = preferred.
#
# Keys are the target frameworks a project may declare; each value is
# the ordered list of ``lib/<tfm>`` folder names to probe for that
# target, starting with the target itself. A target that is not a key
# here has no fallback chain in this table.
_TFM_FALLBACKS: dict[str, tuple[str, ...]] = {
    "net8.0": (
        "net8.0",
        "net7.0",
        "net6.0",
        "net5.0",
        "netstandard2.1",
        "netstandard2.0",
        "netstandard1.6",
    ),
    "net7.0": (
        "net7.0",
        "net6.0",
        "net5.0",
        "netstandard2.1",
        "netstandard2.0",
    ),
    "net6.0": (
        "net6.0",
        "net5.0",
        "netstandard2.1",
        "netstandard2.0",
    ),
    "net5.0": ("net5.0", "netstandard2.1", "netstandard2.0"),
    "netstandard2.1": ("netstandard2.1", "netstandard2.0"),
    "netstandard2.0": ("netstandard2.0", "netstandard1.6", "netstandard1.4"),
    "net48": ("net48", "net472", "net471", "net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
    "net472": ("net472", "net471", "net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
    "net471": ("net471", "net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
    "net47": ("net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
    "net46": ("net46", "net45", "netstandard1.4", "netstandard1.3"),
    "net45": ("net45", "netstandard1.3", "netstandard1.2"),
}
94
+
95
+
96
+ def _candidate_tfms(target: str | None) -> list[str]:
97
+ """Return TFMs to try, in priority order, for a project's <TargetFramework>.
98
+
99
+ Multi-target projects come in as a semicolon-joined string
100
+ (``net6.0;net8.0``); we expand each side and concatenate fallback
101
+ chains, then dedupe.
102
+ """
103
+ if not target:
104
+ return ["netstandard2.0", "netstandard2.1", "net8.0", "net6.0"]
105
+ chunks = [t.strip() for t in target.split(";") if t.strip()]
106
+ out: list[str] = []
107
+ seen: set[str] = set()
108
+ for c in chunks:
109
+ for tfm in _TFM_FALLBACKS.get(c, (c,)):
110
+ if tfm not in seen:
111
+ seen.add(tfm)
112
+ out.append(tfm)
113
+ return out
114
+
115
+
116
def resolve_package_dlls(
    pkg: PackageRef, target_framework: str | None
) -> list[Path]:
    """Locate the assemblies of one ``PackageReference`` in the local caches.

    Caches, then candidate versions, then candidate TFMs are walked in
    priority order and the search stops at the first location that
    yields any DLL: either a ``lib/<tfm>/`` folder or, for packages
    without TFM sub-folders, ``lib/`` itself.  Returns ``[]`` when no
    cache exists or nothing matches.
    """
    caches = nuget_cache_dirs()
    if not caches:
        return []

    package_dir_name = pkg.name.lower()
    versions = _versions_for(caches, package_dir_name, pkg.version)
    tfms = _candidate_tfms(target_framework)

    for cache in caches:
        for version in versions:
            lib_root = cache / package_dir_name / version / "lib"
            if not lib_root.is_dir():
                continue
            for tfm in tfms:
                tfm_dir = lib_root / tfm
                if not tfm_dir.is_dir():
                    continue
                per_tfm = sorted(tfm_dir.glob("*.dll"))
                if per_tfm:
                    return per_tfm
            # No TFM folder produced anything: accept DLLs sitting
            # directly in lib/, as very old packages ship them.
            flat = [dll for dll in sorted(lib_root.glob("*.dll")) if dll.is_file()]
            if flat:
                return flat
    return []
154
+
155
+
156
+ def _versions_for(
157
+ caches: list[Path], name_lower: str, requested: str | None
158
+ ) -> list[str]:
159
+ """Pick which on-disk version directories to consider, best first.
160
+
161
+ Requested version present on disk → use it directly. Otherwise
162
+ return all versions sorted descending so we try newer first
163
+ (lexicographic on padded chunks is good enough for the
164
+ Major.Minor.Patch[-suffix] layout NuGet emits).
165
+ """
166
+ if requested:
167
+ # Direct hit if the requested version directory exists in any cache.
168
+ for cache in caches:
169
+ if (cache / name_lower / requested).is_dir():
170
+ return [requested]
171
+ # Fall back to anything we have on disk.
172
+ seen: set[str] = set()
173
+ for cache in caches:
174
+ d = cache / name_lower
175
+ if not d.is_dir():
176
+ continue
177
+ for child in d.iterdir():
178
+ if child.is_dir():
179
+ seen.add(child.name)
180
+ return sorted(seen, key=_version_sort_key, reverse=True)
181
+
182
+
183
+ def _version_sort_key(version: str) -> tuple[int, ...]:
184
+ """Coarse semver-ish sort: split on dots, zero-pad missing chunks.
185
+
186
+ Pre-release suffixes (``1.2.3-beta``) compare lower than their
187
+ release equivalents because the int parse strips at the first
188
+ non-digit. Good enough for "newest first" without dragging in a
189
+ real packaging dep.
190
+ """
191
+ parts = version.split(".")
192
+ out: list[int] = []
193
+ for p in parts[:4]: # cap to four segments
194
+ digits = ""
195
+ for ch in p:
196
+ if ch.isdigit():
197
+ digits += ch
198
+ else:
199
+ break
200
+ out.append(int(digits) if digits else 0)
201
+ while len(out) < 4:
202
+ out.append(0)
203
+ return tuple(out)
204
+
205
+
206
def resolve_project_reference_dlls(
    ref_csproj: Path, target_framework: str | None
) -> list[Path]:
    """Return the built assemblies of a referenced project, if present.

    Probes ``bin/<config>/<tfm>/`` next to the ``.csproj`` for every
    candidate TFM, trying all of Debug before any of Release, and
    returns the DLLs of the first folder that contains some.  When none
    does, DLLs lying directly in ``bin/`` are returned (possibly an
    empty list); without a ``bin/`` folder the result is ``[]``.
    """
    bin_dir = ref_csproj.parent / "bin"
    tfms = _candidate_tfms(target_framework)
    for config in ("Debug", "Release"):
        for tfm in tfms:
            out_dir = bin_dir / config / tfm
            if not out_dir.is_dir():
                continue
            built = sorted(out_dir.glob("*.dll"))
            if built:
                return built
    # Older non-SDK projects sometimes drop output straight in bin/.
    return sorted(bin_dir.glob("*.dll")) if bin_dir.is_dir() else []
230
+
231
+
232
@dataclass
class ResolvedRefs:
    """DLL paths found for a single csproj, grouped by reference."""

    # Package name (``PackageRef.name``) -> assemblies found for it.
    package_dlls: dict[str, list[Path]]
    # Referenced csproj path -> assemblies found in its build output.
    project_dlls: dict[str, list[Path]]

    def all_paths(self) -> list[Path]:
        """Flatten both mappings: package DLLs first, then project DLLs."""
        groups = (*self.package_dlls.values(), *self.project_dlls.values())
        return [dll for group in groups for dll in group]
246
+
247
+
248
def resolve_refs(info: CsprojInfo) -> ResolvedRefs:
    """Resolve the package and project references of one csproj to DLLs.

    References for which nothing was found are simply absent from the
    result; an entirely unresolved project yields a ``ResolvedRefs``
    with two empty mappings rather than an exception.
    """
    by_package: dict[str, list[Path]] = {}
    for package in info.package_references:
        found = resolve_package_dlls(package, info.target_framework)
        if not found:
            continue
        by_package[package.name] = found

    by_project: dict[str, list[Path]] = {}
    for csproj in info.project_references:
        built = resolve_project_reference_dlls(Path(csproj), info.target_framework)
        if not built:
            continue
        by_project[csproj] = built

    return ResolvedRefs(package_dlls=by_package, project_dlls=by_project)
267
+
268
+
269
def all_referenced_dlls(infos: Iterable[CsprojInfo]) -> set[Path]:
    """Collect, without duplicates, every DLL resolved for the given projects."""
    return {dll for info in infos for dll in resolve_refs(info).all_paths()}
@@ -0,0 +1,124 @@
1
+ """Extraction sanity checks — catch UTF-8 / parser drift at ingest time.
2
+
3
+ The historical UTF-8 byte-vs-str slicing bug silently chopped every
4
+ identifier in non-ASCII files, and nobody noticed until a user got
5
+ empty callers/callees a year later. This module exists so that class
6
+ of regression fails loudly at ingest, not at user-report.
7
+
8
+ The check is intentionally narrow: for each extracted Symbol whose
9
+ name is a plain identifier, the snippet must contain that identifier
10
+ as a whole word. Anything more sophisticated would have to mirror the
11
+ extractor's logic, which is exactly what we're trying to validate —
12
+ so we keep this check independent and dumb on purpose.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from dataclasses import dataclass
19
+
20
+ from .treesitter import ExtractedFile, Symbol
21
+
22
# Plain identifier — ASCII letters / digits / underscore only, not
# starting with a digit. Skips generics (``Foo<T>``), operator
# overloads (``operator +``), F# parameterised names, and anything
# else the extractor reasonably emits but where a literal substring
# check would false-positive. Names containing non-ASCII letters do
# not match this pattern either.
_PLAIN_IDENT = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
27
+
28
+
29
@dataclass(frozen=True)
class SanityViolation:
    """A symbol whose name does not occur as a whole word in its snippet."""

    # ``path`` of the extracted file the symbol came from.
    path: str
    # Symbol name that failed the round-trip check.
    name: str
    # Symbol kind, copied from the extracted symbol.
    kind: str
    # ``start_line`` copied from the extracted symbol.
    start_line: int
37
+
38
+
39
def is_checkable(symbol: Symbol) -> bool:
    """Tell whether the symbol's name is a plain identifier we can verify."""
    matched = _PLAIN_IDENT.match(symbol.name)
    return matched is not None
42
+
43
+
44
+ def _contains_as_word(haystack: str, needle: str) -> bool:
45
+ """``needle`` appears in ``haystack`` as a complete word.
46
+
47
+ A plain substring check would miss the historical UTF-8 chop bug:
48
+ when ``CommandeRules`` got truncated to ``mmandeRules``, the
49
+ truncated name is still a substring of the snippet containing the
50
+ real ``CommandeRules`` declaration. Word boundaries close that
51
+ hole — ``mmandeRules`` is not a whole-word occurrence inside
52
+ ``CommandeRules``.
53
+ """
54
+ pattern = r"(?<![A-Za-z0-9_])" + re.escape(needle) + r"(?![A-Za-z0-9_])"
55
+ return re.search(pattern, haystack) is not None
56
+
57
+
58
def violations_in(ex: ExtractedFile) -> list[SanityViolation]:
    """List the symbols of ``ex`` whose snippet lacks their own name.

    Only symbols with a plain-identifier name are examined; the others
    (generics, operators, F# parameterised names) are skipped rather
    than flagged.  A clean file produces ``[]``.
    """
    return [
        SanityViolation(
            path=ex.path,
            name=sym.name,
            kind=sym.kind,
            start_line=sym.start_line,
        )
        for sym in ex.symbols
        if is_checkable(sym) and not _contains_as_word(sym.snippet, sym.name)
    ]
80
+
81
+
82
@dataclass
class SanitySummary:
    """Running totals of the sanity check over one ingest run."""

    # Number of plain-identifier symbols examined so far.
    symbols_checked: int = 0
    # Number of examined symbols whose snippet lacked their name.
    symbols_failed: int = 0
    # First few violations, kept for reporting; ``None`` becomes a fresh
    # list in ``__post_init__`` so instances never share one.
    sample_violations: list[SanityViolation] = None  # type: ignore[assignment]

    def __post_init__(self) -> None:
        if self.sample_violations is None:
            self.sample_violations = []

    @property
    def failure_rate(self) -> float:
        """Failed / checked, or ``0.0`` when nothing has been checked."""
        if not self.symbols_checked:
            return 0.0
        return self.symbols_failed / self.symbols_checked

    def record(self, ex: ExtractedFile, *, keep_samples: int = 10) -> None:
        """Fold the symbols of one extracted file into the totals.

        Symbols with non-plain names are not counted at all.  At most
        ``keep_samples`` violations are retained in ``sample_violations``;
        later failures are counted but not stored.
        """
        for sym in ex.symbols:
            if is_checkable(sym):
                self.symbols_checked += 1
                if not _contains_as_word(sym.snippet, sym.name):
                    self.symbols_failed += 1
                    if len(self.sample_violations) < keep_samples:
                        violation = SanityViolation(
                            path=ex.path,
                            name=sym.name,
                            kind=sym.kind,
                            start_line=sym.start_line,
                        )
                        self.sample_violations.append(violation)
117
+
118
+
119
# Threshold above which the ingest run flags itself as suspect. It is
# a fraction in the same units as ``SanitySummary.failure_rate``
# (0.02 == 2%). Tuned from real corpora: a healthy ingest sits at 0%
# (every plain identifier round-trips). The historical UTF-8 bug
# pushed the failure rate close to 100% on French C# repos. Anything
# above ~2% almost certainly means a real bug, not edge-case syntax.
SUSPECT_THRESHOLD = 0.02
@@ -0,0 +1,108 @@
1
+ """Minimal Visual Studio `.sln` parser.
2
+
3
+ A solution file is a project group. The parser pulls out just the
4
+ project list — name, csproj path, and a stable GUID — which is enough
5
+ to wire ``Solution`` graph nodes and ``MEMBER_OF`` edges from each
6
+ indexed Project to its containing Solution.
7
+
8
+ Folders ("solution items") are skipped; we want code projects, not
9
+ the IDE's tree organization. Unparseable lines are dropped silently
10
+ so a single corrupted entry never aborts the ingest.
11
+
12
+ Format reference (informal — Microsoft never published a real
13
+ grammar): https://learn.microsoft.com/en-us/visualstudio/extensibility/internals/solution-dot-sln-file
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import re
20
+ from dataclasses import dataclass, field
21
+ from pathlib import Path
22
+
23
+ log = logging.getLogger(__name__)
24
+
25
# ``Project("{type-guid}") = "Name", "RelativePath.csproj", "{proj-guid}"``
# Named groups: ``type`` (project-type GUID), ``name``, ``path`` and
# ``guid`` (project GUID). GUIDs are captured without their braces.
# The pattern is anchored at the start only, so trailing text after
# the project GUID is tolerated.
_PROJECT_LINE = re.compile(
    r'^Project\("\{(?P<type>[0-9A-Fa-f-]+)\}"\)\s*=\s*'
    r'"(?P<name>[^"]+)",\s*'
    r'"(?P<path>[^"]+)",\s*'
    r'"\{(?P<guid>[0-9A-Fa-f-]+)\}"'
)

# "Solution folder" type GUID per MS docs — these have no buildable
# output and shouldn't be indexed as projects. Stored upper-case;
# compare case-insensitively against the captured ``type`` group.
_FOLDER_TYPE_GUID = "2150E333-8FDC-42A3-9474-1A3956D46DE8"
36
+
37
+
38
@dataclass(frozen=True)
class SolutionProject:
    """A single project entry read from a ``.sln`` file."""

    # Project display name as written in the solution.
    name: str
    # Absolute, resolved path of the project file.
    csproj_path: str
    # Project GUID, without braces.
    guid: str
    # Project-type GUID, without braces.
    type_guid: str
46
+
47
+
48
@dataclass
class SolutionInfo:
    """What was extracted from one ``.sln`` file."""

    # Path of the solution file, as a string.
    path: str
    # Solution name.
    name: str
    # Project entries kept from the solution; starts empty, one fresh
    # list per instance.
    projects: list[SolutionProject] = field(default_factory=list)
55
+
56
+
57
def parse_sln(sln_path: str | Path) -> SolutionInfo | None:
    """Parse one `.sln`. Returns ``None`` on read failure.

    Every ``Project(...)`` line that is not a solution folder and whose
    path resolves to an existing *file* becomes a ``SolutionProject``.
    Entries pointing at a directory (web-site projects), at something
    missing from disk, or at a path the OS rejects are skipped, so one
    bad entry never aborts the parse.
    """
    p = Path(sln_path).resolve()
    try:
        # .sln files are Windows-encoded MBCS-or-UTF8 with BOM in
        # practice; ``utf-8-sig`` strips the BOM if present without
        # caring otherwise.
        text = p.read_text(encoding="utf-8-sig", errors="replace")
    except OSError as e:
        log.warning("sln: failed to read %s — %s", p, e)
        return None

    info = SolutionInfo(path=str(p), name=p.stem)
    base = p.parent
    folder_guid = _FOLDER_TYPE_GUID.lower()
    for line in text.splitlines():
        m = _PROJECT_LINE.match(line.strip())
        if not m:
            continue
        type_guid = m.group("type").lower()
        if type_guid == folder_guid:
            continue
        rel = m.group("path").replace("\\", "/")
        try:
            candidate = (base / rel).resolve()
            on_disk = candidate.is_file()
        except (OSError, ValueError):
            # URL-style or otherwise invalid paths can be rejected by the
            # OS while resolving; treat them like any other missing entry.
            continue
        if not on_disk:
            # Some solutions reference projects outside the cloned
            # working tree (shared infra), or list a directory instead
            # of a project file. Skip — we can't index what we don't
            # have on disk.
            continue
        info.projects.append(
            SolutionProject(
                name=m.group("name"),
                csproj_path=str(candidate),
                guid=m.group("guid").lower(),
                type_guid=type_guid,
            )
        )
    return info
94
+
95
+
96
def walk_solutions(root: str | Path) -> list[SolutionInfo]:
    """Find every `.sln` under ``root`` and parse it.

    Solutions located inside a ``bin``, ``obj`` or ``node_modules``
    directory *below* ``root`` are skipped.  Directories above ``root``
    are deliberately not considered, so a repository checked out under
    e.g. ``~/bin/repo`` is still scanned.  Solutions that fail to parse
    are omitted from the result.
    """
    out: list[SolutionInfo] = []
    root_path = Path(root).resolve()
    skipped_dirs = {"bin", "obj", "node_modules"}
    for sln in root_path.rglob("*.sln"):
        # Skip artifacts in obvious build outputs; .sln rarely lives
        # there but the filter is cheap and matches the csproj rule.
        # Only the directories between root and the file are inspected.
        if any(part in skipped_dirs for part in sln.relative_to(root_path).parent.parts):
            continue
        info = parse_sln(sln)
        if info is not None:
            out.append(info)
    return out