flurryx-code-memory 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_memory/__init__.py +1 -0
- code_memory/claims/__init__.py +32 -0
- code_memory/claims/extractor.py +325 -0
- code_memory/claims/indexer.py +258 -0
- code_memory/claims/resolver.py +186 -0
- code_memory/claims/store.py +424 -0
- code_memory/cli.py +1192 -0
- code_memory/config.py +268 -0
- code_memory/embed/__init__.py +224 -0
- code_memory/embed/cache.py +204 -0
- code_memory/embed/m3.py +174 -0
- code_memory/embed/ollama.py +92 -0
- code_memory/embed/tei.py +106 -0
- code_memory/episodic/__init__.py +3 -0
- code_memory/episodic/sqlite_store.py +278 -0
- code_memory/extractor/__init__.py +3 -0
- code_memory/extractor/csproj.py +166 -0
- code_memory/extractor/dll.py +385 -0
- code_memory/extractor/gitignore.py +162 -0
- code_memory/extractor/nuget.py +275 -0
- code_memory/extractor/sanity.py +124 -0
- code_memory/extractor/sln.py +108 -0
- code_memory/extractor/treesitter.py +1172 -0
- code_memory/graph/__init__.py +3 -0
- code_memory/graph/falkor_store.py +740 -0
- code_memory/mcp_server.py +1816 -0
- code_memory/metrics.py +260 -0
- code_memory/orchestrator/__init__.py +13 -0
- code_memory/orchestrator/git_delta.py +211 -0
- code_memory/orchestrator/ingest_state.py +71 -0
- code_memory/orchestrator/pipeline.py +1478 -0
- code_memory/orchestrator/reset.py +130 -0
- code_memory/orchestrator/resolver.py +825 -0
- code_memory/orchestrator/retrieve.py +505 -0
- code_memory/resilience.py +73 -0
- code_memory/sync/__init__.py +20 -0
- code_memory/sync/autostart/__init__.py +42 -0
- code_memory/sync/autostart/base.py +106 -0
- code_memory/sync/autostart/launchd.py +115 -0
- code_memory/sync/autostart/schtasks.py +155 -0
- code_memory/sync/autostart/systemd.py +113 -0
- code_memory/sync/hooks.py +164 -0
- code_memory/sync/safety.py +65 -0
- code_memory/sync/snapshot.py +461 -0
- code_memory/sync/store.py +399 -0
- code_memory/sync/sync.py +405 -0
- code_memory/sync/watcher.py +320 -0
- code_memory/vector/__init__.py +3 -0
- code_memory/vector/qdrant_store.py +302 -0
- flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
- flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
- flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
- flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Lightweight .gitignore matcher with no external deps.
|
|
2
|
+
|
|
3
|
+
Supports the subset of gitignore syntax that matters in practice:
|
|
4
|
+
|
|
5
|
+
- blank lines + ``#`` comments
|
|
6
|
+
- ``!`` negation
|
|
7
|
+
- ``/`` anchored (root-relative) vs unanchored patterns
|
|
8
|
+
- trailing ``/`` directory-only patterns
|
|
9
|
+
- ``*`` (no slash), ``?``, ``**`` globs
|
|
10
|
+
- collects ``.gitignore`` files recursively from the root downward
|
|
11
|
+
|
|
12
|
+
This is deliberately not a full re-implementation of git's matcher; the goal
|
|
13
|
+
is to avoid indexing build artifacts (``.angular/``, ``tmp/``, ``dist/``…)
|
|
14
|
+
that repos already ignore.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import fnmatch
|
|
20
|
+
import re
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
|
|
26
|
+
class _Rule:
|
|
27
|
+
base: Path # directory containing the .gitignore that owns the rule
|
|
28
|
+
pattern: str
|
|
29
|
+
negate: bool
|
|
30
|
+
dir_only: bool
|
|
31
|
+
anchored: bool
|
|
32
|
+
regex: re.Pattern[str]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _compile(pattern: str) -> re.Pattern[str]:
|
|
36
|
+
parts = pattern.split("/")
|
|
37
|
+
out: list[str] = []
|
|
38
|
+
for i, seg in enumerate(parts):
|
|
39
|
+
if seg == "**":
|
|
40
|
+
out.append(".*")
|
|
41
|
+
else:
|
|
42
|
+
# fnmatch.translate adds anchors we don't want — strip them
|
|
43
|
+
tr = fnmatch.translate(seg)
|
|
44
|
+
# python's translate yields "(?s:...)\\Z"; extract inner
|
|
45
|
+
m = re.match(r"\(\?s:(.*)\)\\Z", tr)
|
|
46
|
+
inner = m.group(1) if m else tr
|
|
47
|
+
out.append(inner)
|
|
48
|
+
if i != len(parts) - 1:
|
|
49
|
+
out.append("/")
|
|
50
|
+
body = "".join(out)
|
|
51
|
+
# collapse leading "**/" -> "(?:.*/)?" to allow zero-segment match
|
|
52
|
+
body = re.sub(r"^\.\*/", "(?:.*/)?", body)
|
|
53
|
+
return re.compile(rf"^{body}$")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _parse_line(line: str, base: Path) -> _Rule | None:
    """Turn one ``.gitignore`` line into a rule, or ``None`` if it has none.

    Blank lines, comment lines (first non-blank character is ``#``) and
    lines that are empty once ``!`` and surrounding whitespace are removed
    yield ``None``. ``base`` is recorded on the rule unchanged.
    """
    text = line.rstrip("\n").rstrip("\r")
    if text == "" or text.lstrip()[:1] == "#":
        return None

    # "!" is only recognised as the very first character of the line.
    is_negated = text[:1] == "!"
    body = (text[1:] if is_negated else text).strip()
    if body == "":
        return None

    # A trailing "/" restricts the rule to directories and is not part of
    # the pattern itself.
    wants_dir = body[-1] == "/"
    if wants_dir:
        body = body[:-1]

    # Any remaining "/" pins the pattern to the owning directory, except for
    # the "**/" prefix which explicitly floats.
    is_anchored = ("/" in body) and body[:3] != "**/"
    glob = body[1:] if body[:1] == "/" else body

    return _Rule(
        base=base,
        pattern=glob,
        negate=is_negated,
        dir_only=wants_dir,
        anchored=is_anchored,
        regex=_compile(glob),
    )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class GitignoreMatcher:
    """Walk-aware gitignore matcher.

    Load all ``.gitignore`` files under ``root`` upfront, then call
    :meth:`match` per candidate path during the walk.
    """

    def __init__(self, root: Path, rules: list[_Rule]) -> None:
        """Store the resolved ``root`` and the ordered rule list.

        Rule order matters: for a given path the last matching rule wins.
        """
        self._root = root.resolve()
        self._rules = rules

    @classmethod
    def from_root(cls, root: str | Path) -> GitignoreMatcher:
        """Build a matcher from every ``.gitignore`` found under ``root``.

        If ``root`` is not a directory the matcher has no rules at all.
        Otherwise a built-in directory rule for ``.git`` is seeded first,
        followed by the rules of each ``.gitignore`` file. Files are loaded
        shallowest first so that rules from nested directories come later
        and therefore take precedence over their parents'. Unreadable files
        are skipped.
        """
        root_path = Path(root).resolve()
        rules: list[_Rule] = []
        if not root_path.is_dir():
            return cls(root_path, rules)
        # always seed with .git/ so we never index the git directory
        rules.append(
            _Rule(
                base=root_path,
                pattern=".git",
                negate=False,
                dir_only=True,
                anchored=False,
                regex=_compile(".git"),
            )
        )
        # Sort explicitly instead of relying on rglob's traversal order.
        ignore_files = sorted(
            root_path.rglob(".gitignore"),
            key=lambda p: (len(p.parts), p.as_posix()),
        )
        for gi in ignore_files:
            try:
                lines = gi.read_text(encoding="utf-8", errors="replace").splitlines()
            except OSError:
                continue
            base = gi.parent.resolve()
            for line in lines:
                rule = _parse_line(line, base)
                if rule is not None:
                    rules.append(rule)
        return cls(root_path, rules)

    def _verdict(self, candidate: Path, candidate_is_dir: bool) -> bool:
        """Return whether ``candidate`` itself is ignored (last match wins).

        Ancestors are not considered here; :meth:`match` walks them.
        """
        ignored = False
        for rule in self._rules:
            if rule.dir_only and not candidate_is_dir:
                continue
            try:
                rel_from_base = candidate.relative_to(rule.base)
            except ValueError:
                # Rule belongs to a directory that does not contain candidate.
                continue
            rel_str = rel_from_base.as_posix()
            if rel_str in ("", "."):
                continue
            hit = bool(rule.regex.match(rel_str))
            if not hit and not rule.anchored:
                # Unanchored patterns float: test the candidate's own name.
                # Ancestor components are covered by match() walking the
                # chain, so a negation cannot fire on a parent's name.
                hit = bool(rule.regex.match(candidate.name))
            if hit:
                ignored = not rule.negate
        return ignored

    def match(self, path: Path, *, is_dir: bool) -> bool:
        """Return True if ``path`` is ignored. Last matching rule wins.

        Per gitignore semantics, a file is ignored if any of its ancestor
        directories is ignored, so the chain root -> path is evaluated top
        down and the first ignored element decides: a negation cannot
        re-include something below an ignored directory. Paths outside the
        root, the root itself, and paths that cannot be resolved are
        reported as not ignored.
        """
        try:
            abs_path = path.resolve()
        except OSError:
            return False
        # Ancestors strictly below the root, ordered from root toward leaf.
        chain: list[tuple[Path, bool]] = [
            (parent, True)
            for parent in reversed(abs_path.parents)
            if parent != self._root and self._root in parent.parents
        ]
        chain.append((abs_path, is_dir))

        for candidate, candidate_is_dir in chain:
            if candidate == self._root:
                continue
            if self._verdict(candidate, candidate_is_dir):
                return True
        return False
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""Resolve `.csproj` references to physical DLL paths.
|
|
2
|
+
|
|
3
|
+
The graph layer wants Assembly nodes keyed by `Name, Version=…`.
|
|
4
|
+
Csproj parsing gives us PackageReference + ProjectReference logical
|
|
5
|
+
identities; this module turns those into concrete `.dll` files on
|
|
6
|
+
disk so the metadata reader has something to open.
|
|
7
|
+
|
|
8
|
+
Two sources, in priority order:
|
|
9
|
+
|
|
10
|
+
1. **NuGet global cache** at ``$NUGET_PACKAGES`` or ``~/.nuget/packages``.
|
|
11
|
+
Standard layout:
|
|
12
|
+
``{cache}/{pkg_lower}/{version}/lib/{tfm}/{Foo}.dll``.
|
|
13
|
+
2. **Build output** of project references:
|
|
14
|
+
``{ref_csproj_dir}/bin/{config}/{tfm}/{AssemblyName}.dll``.
|
|
15
|
+
We pick Debug if both exist; the contents differ only in PDBs /
|
|
16
|
+
optimisation, not in the public type surface that we index.
|
|
17
|
+
|
|
18
|
+
Either source can be missing (offline machine, build never run);
|
|
19
|
+
unresolved references are skipped — we never emit fictional paths.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import os
|
|
25
|
+
from collections.abc import Iterable
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
from .csproj import CsprojInfo, PackageRef
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def nuget_cache_dirs() -> list[Path]:
    """Return the existing NuGet package cache roots, best first, deduped.

    Two locations are probed: the directory named by the ``NUGET_PACKAGES``
    environment variable (if set and non-empty, with ``~`` expanded), then
    ``<home>/.nuget/packages``. Locations that do not exist are dropped;
    existing ones are returned resolved.
    """
    override = os.environ.get("NUGET_PACKAGES")
    probes: list[Path] = [Path(override).expanduser()] if override else []
    probes.append(Path.home() / ".nuget" / "packages")

    visited: set[Path] = set()
    found: list[Path] = []
    for probe in probes:
        # Resolve only what exists so both probes compare equal when the
        # override points at the default location.
        location = probe.resolve() if probe.exists() else probe
        if location in visited:
            continue
        visited.add(location)
        if location.exists():
            found.append(location)
    return found
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# TFM compatibility table — only the broad strokes. Real NuGet
|
|
58
|
+
# compatibility is a directed graph of moniker rules; we cover the
|
|
59
|
+
# common parent-fallback so resolution doesn't silently miss anything
|
|
60
|
+
# obvious. Order matters: earlier = preferred.
|
|
61
|
+
_TFM_FALLBACKS: dict[str, tuple[str, ...]] = {
|
|
62
|
+
"net8.0": (
|
|
63
|
+
"net8.0",
|
|
64
|
+
"net7.0",
|
|
65
|
+
"net6.0",
|
|
66
|
+
"net5.0",
|
|
67
|
+
"netstandard2.1",
|
|
68
|
+
"netstandard2.0",
|
|
69
|
+
"netstandard1.6",
|
|
70
|
+
),
|
|
71
|
+
"net7.0": (
|
|
72
|
+
"net7.0",
|
|
73
|
+
"net6.0",
|
|
74
|
+
"net5.0",
|
|
75
|
+
"netstandard2.1",
|
|
76
|
+
"netstandard2.0",
|
|
77
|
+
),
|
|
78
|
+
"net6.0": (
|
|
79
|
+
"net6.0",
|
|
80
|
+
"net5.0",
|
|
81
|
+
"netstandard2.1",
|
|
82
|
+
"netstandard2.0",
|
|
83
|
+
),
|
|
84
|
+
"net5.0": ("net5.0", "netstandard2.1", "netstandard2.0"),
|
|
85
|
+
"netstandard2.1": ("netstandard2.1", "netstandard2.0"),
|
|
86
|
+
"netstandard2.0": ("netstandard2.0", "netstandard1.6", "netstandard1.4"),
|
|
87
|
+
"net48": ("net48", "net472", "net471", "net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
|
|
88
|
+
"net472": ("net472", "net471", "net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
|
|
89
|
+
"net471": ("net471", "net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
|
|
90
|
+
"net47": ("net47", "net462", "net461", "net46", "net45", "netstandard2.0"),
|
|
91
|
+
"net46": ("net46", "net45", "netstandard1.4", "netstandard1.3"),
|
|
92
|
+
"net45": ("net45", "netstandard1.3", "netstandard1.2"),
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _candidate_tfms(target: str | None) -> list[str]:
|
|
97
|
+
"""Return TFMs to try, in priority order, for a project's <TargetFramework>.
|
|
98
|
+
|
|
99
|
+
Multi-target projects come in as a semicolon-joined string
|
|
100
|
+
(``net6.0;net8.0``); we expand each side and concatenate fallback
|
|
101
|
+
chains, then dedupe.
|
|
102
|
+
"""
|
|
103
|
+
if not target:
|
|
104
|
+
return ["netstandard2.0", "netstandard2.1", "net8.0", "net6.0"]
|
|
105
|
+
chunks = [t.strip() for t in target.split(";") if t.strip()]
|
|
106
|
+
out: list[str] = []
|
|
107
|
+
seen: set[str] = set()
|
|
108
|
+
for c in chunks:
|
|
109
|
+
for tfm in _TFM_FALLBACKS.get(c, (c,)):
|
|
110
|
+
if tfm not in seen:
|
|
111
|
+
seen.add(tfm)
|
|
112
|
+
out.append(tfm)
|
|
113
|
+
return out
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def resolve_package_dlls(
    pkg: PackageRef, target_framework: str | None
) -> list[Path]:
    """Locate the DLLs of one ``PackageReference`` in the local NuGet caches.

    Caches, candidate versions and candidate TFMs are each tried in priority
    order. The first ``lib/<tfm>/`` directory that contains at least one
    ``*.dll`` wins and all of its DLLs are returned, sorted. If no TFM
    directory of a version yields anything, DLLs lying directly in that
    version's ``lib/`` are used instead (older packages that ship
    ``lib/*.dll`` without a TFM subdirectory). Returns ``[]`` when nothing
    is found.
    """
    caches = nuget_cache_dirs()
    if not caches:
        return []

    package_dir = pkg.name.lower()
    versions = _versions_for(caches, package_dir, pkg.version)
    tfms = _candidate_tfms(target_framework)

    for cache in caches:
        for version in versions:
            lib_root = cache / package_dir / version / "lib"
            if not lib_root.is_dir():
                continue
            for tfm in tfms:
                tfm_dir = lib_root / tfm
                if not tfm_dir.is_dir():
                    continue
                per_tfm = sorted(tfm_dir.glob("*.dll"))
                if per_tfm:
                    return per_tfm
            # No exact/fallback TFM match: accept flat lib/*.dll files.
            flat = [dll for dll in sorted(lib_root.glob("*.dll")) if dll.is_file()]
            if flat:
                return flat
    return []
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _versions_for(
|
|
157
|
+
caches: list[Path], name_lower: str, requested: str | None
|
|
158
|
+
) -> list[str]:
|
|
159
|
+
"""Pick which on-disk version directories to consider, best first.
|
|
160
|
+
|
|
161
|
+
Requested version present on disk → use it directly. Otherwise
|
|
162
|
+
return all versions sorted descending so we try newer first
|
|
163
|
+
(lexicographic on padded chunks is good enough for the
|
|
164
|
+
Major.Minor.Patch[-suffix] layout NuGet emits).
|
|
165
|
+
"""
|
|
166
|
+
if requested:
|
|
167
|
+
# Direct hit if the requested version directory exists in any cache.
|
|
168
|
+
for cache in caches:
|
|
169
|
+
if (cache / name_lower / requested).is_dir():
|
|
170
|
+
return [requested]
|
|
171
|
+
# Fall back to anything we have on disk.
|
|
172
|
+
seen: set[str] = set()
|
|
173
|
+
for cache in caches:
|
|
174
|
+
d = cache / name_lower
|
|
175
|
+
if not d.is_dir():
|
|
176
|
+
continue
|
|
177
|
+
for child in d.iterdir():
|
|
178
|
+
if child.is_dir():
|
|
179
|
+
seen.add(child.name)
|
|
180
|
+
return sorted(seen, key=_version_sort_key, reverse=True)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _version_sort_key(version: str) -> tuple[int, ...]:
|
|
184
|
+
"""Coarse semver-ish sort: split on dots, zero-pad missing chunks.
|
|
185
|
+
|
|
186
|
+
Pre-release suffixes (``1.2.3-beta``) compare lower than their
|
|
187
|
+
release equivalents because the int parse strips at the first
|
|
188
|
+
non-digit. Good enough for "newest first" without dragging in a
|
|
189
|
+
real packaging dep.
|
|
190
|
+
"""
|
|
191
|
+
parts = version.split(".")
|
|
192
|
+
out: list[int] = []
|
|
193
|
+
for p in parts[:4]: # cap to four segments
|
|
194
|
+
digits = ""
|
|
195
|
+
for ch in p:
|
|
196
|
+
if ch.isdigit():
|
|
197
|
+
digits += ch
|
|
198
|
+
else:
|
|
199
|
+
break
|
|
200
|
+
out.append(int(digits) if digits else 0)
|
|
201
|
+
while len(out) < 4:
|
|
202
|
+
out.append(0)
|
|
203
|
+
return tuple(out)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def resolve_project_reference_dlls(
    ref_csproj: Path, target_framework: str | None
) -> list[Path]:
    """Return the DLLs from a referenced project's build output, if any.

    Probes ``<project dir>/bin/<config>/<tfm>/`` with every Debug
    directory tried before any Release one, and within a configuration the
    candidate TFMs in priority order. The first directory holding at least
    one ``*.dll`` wins and all of its DLLs are returned, sorted. If none
    qualifies, DLLs lying directly in ``bin/`` are returned (possibly an
    empty list); without a ``bin/`` directory the result is ``[]``.
    """
    bin_dir = ref_csproj.parent / "bin"
    tfms = _candidate_tfms(target_framework)
    output_dirs = [
        bin_dir / config / tfm
        for config in ("Debug", "Release")
        for tfm in tfms
    ]
    for output_dir in output_dirs:
        if not output_dir.is_dir():
            continue
        built = sorted(output_dir.glob("*.dll"))
        if built:
            return built
    # Older non-SDK projects sometimes drop output straight in bin/.
    return sorted(bin_dir.glob("*.dll")) if bin_dir.is_dir() else []
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
@dataclass
class ResolvedRefs:
    """DLL paths resolved for one csproj, grouped by the reference they came from."""

    # PackageRef.name -> DLLs found for that package
    package_dlls: dict[str, list[Path]]
    # referenced csproj path -> DLLs found in its build output
    project_dlls: dict[str, list[Path]]

    def all_paths(self) -> list[Path]:
        """Return every resolved DLL: package DLLs first, then project DLLs.

        Order within each group follows dict insertion order; duplicates
        are kept.
        """
        groups = [*self.package_dlls.values(), *self.project_dlls.values()]
        return [dll for group in groups for dll in group]
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def resolve_refs(info: CsprojInfo) -> ResolvedRefs:
    """Resolve the package and project references of one csproj to DLLs.

    Each reference is looked up with the project's own target framework.
    References for which no DLL is found are left out of the result, so
    both mappings may be empty.
    """
    found_packages: dict[str, list[Path]] = {}
    for package in info.package_references:
        package_hits = resolve_package_dlls(package, info.target_framework)
        if not package_hits:
            continue
        found_packages[package.name] = package_hits

    found_projects: dict[str, list[Path]] = {}
    for reference in info.project_references:
        project_hits = resolve_project_reference_dlls(
            Path(reference), info.target_framework
        )
        if not project_hits:
            continue
        found_projects[reference] = project_hits

    return ResolvedRefs(package_dlls=found_packages, project_dlls=found_projects)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def all_referenced_dlls(infos: Iterable[CsprojInfo]) -> set[Path]:
    """Return the union of all DLL paths resolved for the given projects."""
    collected: set[Path] = set()
    for project in infos:
        collected |= set(resolve_refs(project).all_paths())
    return collected
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Extraction sanity checks — catch UTF-8 / parser drift at ingest time.
|
|
2
|
+
|
|
3
|
+
The historical UTF-8 byte-vs-str slicing bug silently chopped every
|
|
4
|
+
identifier in non-ASCII files, and nobody noticed until a user got
|
|
5
|
+
empty callers/callees a year later. This module exists so that class
|
|
6
|
+
of regression fails loudly at ingest, not at user-report.
|
|
7
|
+
|
|
8
|
+
The check is intentionally narrow: for each extracted Symbol whose
|
|
9
|
+
name is a plain identifier, the snippet must contain that identifier
|
|
10
|
+
as a substring. Anything more sophisticated would have to mirror the
|
|
11
|
+
extractor's logic, which is exactly what we're trying to validate —
|
|
12
|
+
so we keep this check independent and dumb on purpose.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
|
|
20
|
+
from .treesitter import ExtractedFile, Symbol
|
|
21
|
+
|
|
22
|
+
# Plain identifier — letters / digits / underscore. Skips generics
|
|
23
|
+
# (``Foo<T>``), operator overloads (``operator +``), F# parameterised
|
|
24
|
+
# names, and anything else the extractor reasonably emits but where a
|
|
25
|
+
# literal substring check would false-positive.
|
|
26
|
+
_PLAIN_IDENT = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class SanityViolation:
    """A symbol whose name was not found verbatim in its own snippet."""

    # Path of the extracted file the symbol belongs to.
    path: str
    # Symbol name as emitted by the extractor.
    name: str
    # Symbol kind as emitted by the extractor.
    kind: str
    # Line on which the extractor says the symbol starts.
    start_line: int
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def is_checkable(symbol: Symbol) -> bool:
    """Tell whether ``symbol.name`` is a plain identifier worth checking.

    Only plain identifiers are looked up in the snippet; any other name
    shape is reported as not checkable.
    """
    return _PLAIN_IDENT.match(symbol.name) is not None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _contains_as_word(haystack: str, needle: str) -> bool:
|
|
45
|
+
"""``needle`` appears in ``haystack`` as a complete word.
|
|
46
|
+
|
|
47
|
+
A plain substring check would miss the historical UTF-8 chop bug:
|
|
48
|
+
when ``CommandeRules`` got truncated to ``mmandeRules``, the
|
|
49
|
+
truncated name is still a substring of the snippet containing the
|
|
50
|
+
real ``CommandeRules`` declaration. Word boundaries close that
|
|
51
|
+
hole — ``mmandeRules`` is not a whole-word occurrence inside
|
|
52
|
+
``CommandeRules``.
|
|
53
|
+
"""
|
|
54
|
+
pattern = r"(?<![A-Za-z0-9_])" + re.escape(needle) + r"(?![A-Za-z0-9_])"
|
|
55
|
+
return re.search(pattern, haystack) is not None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def violations_in(ex: ExtractedFile) -> list[SanityViolation]:
    """List the checkable symbols of ``ex`` whose name is absent from their snippet.

    Symbols that are not checkable are skipped rather than flagged. The
    result keeps the order of ``ex.symbols`` and is ``[]`` for a clean
    file.
    """
    return [
        SanityViolation(
            path=ex.path,
            name=symbol.name,
            kind=symbol.kind,
            start_line=symbol.start_line,
        )
        for symbol in ex.symbols
        if is_checkable(symbol)
        and not _contains_as_word(symbol.snippet, symbol.name)
    ]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
class SanitySummary:
    """Running totals of the name-in-snippet check, fed one file at a time."""

    # Number of checkable symbols seen so far.
    symbols_checked: int = 0
    # Number of those whose name was missing from the snippet.
    symbols_failed: int = 0
    # First few violations, kept for reporting; ``None`` becomes a fresh
    # list in ``__post_init__`` so instances never share one.
    sample_violations: list[SanityViolation] = None  # type: ignore[assignment]

    def __post_init__(self) -> None:
        """Replace a missing sample list with a new empty one."""
        if self.sample_violations is None:
            self.sample_violations = []

    @property
    def failure_rate(self) -> float:
        """Fraction of checked symbols that failed; ``0.0`` if none were checked."""
        if not self.symbols_checked:
            return 0.0
        return self.symbols_failed / self.symbols_checked

    def record(self, ex: ExtractedFile, *, keep_samples: int = 10) -> None:
        """Fold the symbols of ``ex`` into the totals.

        Symbols that are not checkable are ignored. A failing symbol is
        also stored in ``sample_violations`` while the list holds fewer
        than ``keep_samples`` entries.
        """
        for symbol in ex.symbols:
            if is_checkable(symbol):
                self.symbols_checked += 1
                if not _contains_as_word(symbol.snippet, symbol.name):
                    self.symbols_failed += 1
                    if len(self.sample_violations) < keep_samples:
                        self.sample_violations.append(
                            SanityViolation(
                                path=ex.path,
                                name=symbol.name,
                                kind=symbol.kind,
                                start_line=symbol.start_line,
                            )
                        )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Failure rate above which an ingest run should flag itself as suspect.
# Tuned from real corpora: a healthy ingest sits at 0% (every plain
# identifier round-trips), while the historical UTF-8 bug pushed the
# rate close to 100% on French C# repos. Anything above ~2% almost
# certainly means a real bug, not edge-case syntax.
SUSPECT_THRESHOLD: float = 0.02
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Minimal Visual Studio `.sln` parser.
|
|
2
|
+
|
|
3
|
+
A solution file is a project group. The parser pulls out just the
|
|
4
|
+
project list — name, csproj path, and a stable GUID — which is enough
|
|
5
|
+
to wire ``Solution`` graph nodes and ``MEMBER_OF`` edges from each
|
|
6
|
+
indexed Project to its containing Solution.
|
|
7
|
+
|
|
8
|
+
Folders ("solution items") are skipped; we want code projects, not
|
|
9
|
+
the IDE's tree organization. Unparseable lines are dropped silently
|
|
10
|
+
so a single corrupted entry never aborts the ingest.
|
|
11
|
+
|
|
12
|
+
Format reference (informal — Microsoft never published a real
|
|
13
|
+
grammar): https://learn.microsoft.com/en-us/visualstudio/extensibility/internals/solution-dot-sln-file
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import re
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# ``Project("{type-guid}") = "Name", "RelativePath.csproj", "{proj-guid}"``
|
|
26
|
+
_PROJECT_LINE = re.compile(
|
|
27
|
+
r'^Project\("\{(?P<type>[0-9A-Fa-f-]+)\}"\)\s*=\s*'
|
|
28
|
+
r'"(?P<name>[^"]+)",\s*'
|
|
29
|
+
r'"(?P<path>[^"]+)",\s*'
|
|
30
|
+
r'"\{(?P<guid>[0-9A-Fa-f-]+)\}"'
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# "Solution folder" type GUID per MS docs — these have no buildable
|
|
34
|
+
# output and shouldn't be indexed as projects.
|
|
35
|
+
_FOLDER_TYPE_GUID = "2150E333-8FDC-42A3-9474-1A3956D46DE8"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class SolutionProject:
    """A single project listed by a .sln file."""

    # Display name from the solution entry.
    name: str
    # Project file location: absolute, resolved.
    csproj_path: str
    # Project GUID from the solution entry.
    guid: str
    # Project type GUID from the solution entry.
    type_guid: str
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class SolutionInfo:
    """What was extracted from one .sln file."""

    # Location of the .sln file.
    path: str
    # Solution name.
    name: str
    # Project entries in file order; every instance gets its own list.
    projects: list[SolutionProject] = field(default_factory=list)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def parse_sln(sln_path: str | Path) -> SolutionInfo | None:
    """Parse one `.sln` and return its project list.

    Returns ``None`` (after logging a warning) when the file cannot be
    read. Lines that are not project entries, solution-folder entries, and
    entries whose path does not resolve to an existing file are skipped.
    Directory entries are skipped too, because ``csproj_path`` is meant to
    name a project file. GUIDs are stored lower-cased; the solution name is
    the file name without its suffix.
    """
    p = Path(sln_path).resolve()
    try:
        # .sln files are Windows-encoded MBCS-or-UTF8 with BOM in
        # practice; ``utf-8-sig`` strips the BOM if present without
        # caring otherwise. Undecodable bytes are replaced, so a path
        # containing them will simply fail the on-disk check below.
        text = p.read_text(encoding="utf-8-sig", errors="replace")
    except OSError as e:
        log.warning("sln: failed to read %s — %s", p, e)
        return None

    info = SolutionInfo(path=str(p), name=p.stem)
    base = p.parent
    folder_type = _FOLDER_TYPE_GUID.lower()
    for line in text.splitlines():
        m = _PROJECT_LINE.match(line.strip())
        if not m:
            continue
        type_guid = m.group("type").lower()
        if type_guid == folder_type:
            continue
        # Solution files use Windows separators regardless of platform.
        rel = m.group("path").replace("\\", "/")
        try:
            candidate = (base / rel).resolve()
        except (OSError, RuntimeError, ValueError):
            # A single malformed entry must not abort the whole solution.
            continue
        if not candidate.is_file():
            # Some solutions reference projects outside the cloned
            # working tree (shared infra). Skip — we can't index what
            # we don't have on disk.
            continue
        info.projects.append(
            SolutionProject(
                name=m.group("name"),
                csproj_path=str(candidate),
                guid=m.group("guid").lower(),
                type_guid=type_guid,
            )
        )
    return info
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def walk_solutions(root: str | Path) -> list[SolutionInfo]:
    """Find every `.sln` under ``root`` and parse it.

    Solutions located below a ``bin``, ``obj`` or ``node_modules``
    directory are skipped. Only path components below ``root`` are
    inspected, so a checkout that itself lives under a directory with one
    of those names is still scanned. Files that fail to parse are left out.
    """
    skipped_dirs = {"bin", "obj", "node_modules"}
    root_path = Path(root).resolve()
    out: list[SolutionInfo] = []
    for sln in root_path.rglob("*.sln"):
        # Skip artifacts in obvious build outputs; .sln rarely lives
        # there but the filter is cheap and matches the csproj rule.
        if skipped_dirs.intersection(sln.relative_to(root_path).parts):
            continue
        info = parse_sln(sln)
        if info is not None:
            out.append(info)
    return out
|