PyPI - sourcecode - Versions diffs - 0.33.0__tar.gz → 0.35.0__tar.gz - Mend

sourcecode 0.33.0__tar.gz → 0.35.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

{sourcecode-0.33.0 → sourcecode-0.35.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sourcecode
-Version: 0.33.0
+Version: 0.35.0
 Summary: Deterministic codebase context for AI coding agents
 License:                                  Apache License
                                    Version 2.0, January 2004

{sourcecode-0.33.0 → sourcecode-0.35.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "sourcecode"
-version = "0.33.0"
+version = "0.35.0"
 description = "Deterministic codebase context for AI coding agents"
 readme = "README.md"
 requires-python = ">=3.9"

{sourcecode-0.33.0 → sourcecode-0.35.0}/src/sourcecode/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """sourcecode — Deterministic codebase context maps for AI coding agents."""
-__version__ = "0.33.0"
+__version__ = "0.35.0"

{sourcecode-0.33.0 → sourcecode-0.35.0}/src/sourcecode/ast_extractor.py RENAMED Viewed

@@ -13,6 +13,7 @@ Install tree-sitter for best TS/JS results:
 import ast
 import re
+import sys
 from pathlib import Path
 from typing import Any, Iterator, Optional
@@ -31,6 +32,45 @@ from sourcecode.contract_model import (
 _MAX_FILE_SIZE = 200_000  # bytes — skip files larger than this
+# Python stdlib module names — used to filter noise from import lists.
+# sys.stdlib_module_names is available in Python 3.10+; fall back to a
+# curated set for 3.9 compatibility.
+if hasattr(sys, "stdlib_module_names"):
+    _PY_STDLIB: frozenset[str] = sys.stdlib_module_names  # type: ignore[attr-defined]
+else:
+    _PY_STDLIB: frozenset[str] = frozenset({  # type: ignore[no-redef]
+        "__future__", "_thread", "abc", "aifc", "argparse", "array", "ast",
+        "asynchat", "asyncio", "asyncore", "atexit", "audioop", "base64",
+        "bdb", "binascii", "binhex", "bisect", "builtins", "bz2", "calendar",
+        "cgi", "cgitb", "chunk", "cmath", "cmd", "code", "codecs", "codeop",
+        "collections", "colorsys", "compileall", "concurrent", "configparser",
+        "contextlib", "contextvars", "copy", "copyreg", "cProfile", "csv",
+        "ctypes", "curses", "dataclasses", "datetime", "dbm", "decimal",
+        "difflib", "dis", "doctest", "email", "encodings", "enum", "errno",
+        "faulthandler", "fcntl", "filecmp", "fileinput", "fnmatch", "fractions",
+        "ftplib", "functools", "gc", "getopt", "getpass", "gettext", "glob",
+        "grp", "gzip", "hashlib", "heapq", "hmac", "html", "http", "idlelib",
+        "imaplib", "importlib", "inspect", "io", "ipaddress", "itertools",
+        "json", "keyword", "lib2to3", "linecache", "locale", "logging", "lzma",
+        "mailbox", "marshal", "math", "mimetypes", "mmap", "modulefinder",
+        "multiprocessing", "netrc", "nntplib", "numbers", "operator", "optparse",
+        "os", "pathlib", "pdb", "pickle", "pickletools", "pipes", "pkgutil",
+        "platform", "plistlib", "poplib", "posix", "posixpath", "pprint",
+        "profile", "pstats", "pty", "pwd", "py_compile", "pyclbr", "pydoc",
+        "queue", "quopri", "random", "re", "readline", "reprlib", "resource",
+        "rlcompleter", "runpy", "sched", "secrets", "select", "selectors",
+        "shelve", "shlex", "shutil", "signal", "site", "smtpd", "smtplib",
+        "sndhdr", "socket", "socketserver", "sqlite3", "ssl", "stat",
+        "statistics", "string", "stringprep", "struct", "subprocess", "sunau",
+        "symtable", "sys", "sysconfig", "syslog", "tabnanny", "tarfile",
+        "tempfile", "termios", "test", "textwrap", "threading", "time",
+        "timeit", "tkinter", "token", "tokenize", "tomllib", "trace",
+        "traceback", "tracemalloc", "tty", "types", "typing", "unicodedata",
+        "unittest", "urllib", "uuid", "venv", "warnings", "wave", "weakref",
+        "webbrowser", "wsgiref", "xml", "xmlrpc", "zipapp", "zipfile",
+        "zipimport", "zlib", "zoneinfo",
+    })
 _LANGUAGE_MAP: dict[str, str] = {
     ".py": "python",
     ".ts": "typescript",
@@ -333,7 +373,8 @@ def _ts_types(root: Any, src: bytes) -> list[TypeDefinition]:
                 continue
             name = _text(name_n, src)
             fields: list[TypeField] = []
-            body_n = _find_child(node, "object_type")
+            # "interface_body" in tree-sitter-typescript >= 0.21; "object_type" in older builds
+            body_n = _find_child(node, "interface_body", "object_type")
             if body_n:
                 for prop in _walk(body_n):
                     if prop.type in ("property_signature", "method_signature"):
@@ -345,7 +386,7 @@ def _ts_types(root: Any, src: bytes) -> list[TypeDefinition]:
                             required = not any(c.type == "?" for c in prop.children)
                             fields.append(TypeField(name=prop_name, type=type_text, required=required))
             extends: list[str] = []
-            heritage_n = _find_child(node, "extends_type_clause", "class_heritage")
+            heritage_n = _find_child(node, "extends_type_clause", "extends_clause", "class_heritage")
             if heritage_n:
                 for ext_n in _walk(heritage_n):
                     if ext_n.type == "type_identifier":
@@ -389,6 +430,25 @@ def _ts_hooks(root: Any, src: bytes) -> list[str]:
     return sorted(used)
+def _merge_imports(imports: list[ImportRecord]) -> list[ImportRecord]:
+    """Merge multiple ImportRecords with the same source into one.
+    Tree-sitter correctly captures `import { A }` and `import type { B }` from
+    the same module as two separate statements.  Merging them produces a compact,
+    predictable contract where each source appears exactly once.
+    """
+    merged: dict[str, ImportRecord] = {}
+    for imp in imports:
+        if imp.source in merged:
+            existing = merged[imp.source]
+            combined_symbols = sorted(set(existing.symbols) | set(imp.symbols))
+            kind = existing.kind if existing.kind != "side_effect" else imp.kind
+            merged[imp.source] = ImportRecord(source=imp.source, symbols=combined_symbols, kind=kind)
+        else:
+            merged[imp.source] = imp
+    return list(merged.values())
 def _extract_ts_js_tree_sitter(path: str, source: str, lang_obj: Any, language: str) -> FileContract:
     try:
         parser = _get_parser(lang_obj)
@@ -396,7 +456,7 @@ def _extract_ts_js_tree_sitter(path: str, source: str, lang_obj: Any, language:
         tree = parser.parse(src_bytes)
         root = tree.root_node
-        imports = _ts_imports(root, src_bytes)
+        imports = _merge_imports(_ts_imports(root, src_bytes))
         exports = _ts_exports(root, src_bytes)
         exported_names = {e.name for e in exports}
         functions = _ts_functions(root, src_bytes, exported_names)
@@ -729,9 +789,10 @@ def _py_signature(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
             sig += f" -> {ast.unparse(node.returns)}"
         except Exception:
             pass
-    # Truncate runaway signatures (e.g. typer.Option() defaults)
-    if len(sig) > 300:
-        sig = sig[:297] + "..."
+    # Keep full signature — serializer applies per-mode compression.
+    # Hard cap at 2000 to prevent pathological cases.
+    if len(sig) > 2000:
+        sig = sig[:1997] + "..."
     return sig
@@ -840,6 +901,10 @@ def _extract_python(path: str, source: str) -> FileContract:
             if exported or name in all_names:
                 exports.append(ExportRecord(name=name, kind="class"))
+    # Filter stdlib from imports — they add noise without signal for agents
+    _stdlib_roots = {m.split(".")[0] for m in _PY_STDLIB}
+    imports = [i for i in imports if i.source.split(".")[0] not in _stdlib_roots]
     deps = sorted({
         imp.source.split(".")[0]
         for imp in imports

{sourcecode-0.33.0 → sourcecode-0.35.0}/src/sourcecode/cli.py RENAMED Viewed

@@ -516,11 +516,13 @@ def main(
         "contract",
         "--mode",
         help=(
-            "Output mode: contract (default) | hybrid | raw. "
-            "contract: per-file semantic contracts — exports, signatures, types, imports. No bodies. "
+            "Output mode: contract|minimal (default) | standard | deep | hybrid | raw. "
+            "contract/minimal: minimal per-file contracts — exports, signatures, deps. Smallest output. "
+            "standard: full per-file detail with imports, relevance scores, extraction method. "
+            "deep: standard + optional analysis sections (deps, env, git). "
             "hybrid: contracts + compact bodies for top-ranked files. "
             "raw: legacy project-level analysis (stacks, entry points, dependencies). "
-            "contract mode is the recommended default for AI coding agents."
+            "contract/minimal is the recommended default for AI coding agents."
         ),
     ),
     max_symbols: Optional[int] = typer.Option(
@@ -587,7 +589,7 @@ def main(
     _t0 = time.monotonic()
     # Validate new flag choices
-    _MODE_CHOICES = ("contract", "hybrid", "raw")
+    _MODE_CHOICES = ("contract", "minimal", "standard", "deep", "hybrid", "raw")
     if mode not in _MODE_CHOICES:
         typer.echo(
             f"Error: invalid value '{mode}' for --mode. Valid options: {', '.join(_MODE_CHOICES)}",
@@ -631,6 +633,13 @@ def main(
         typer.echo(f"Error: '{target}' is not a directory.", err=True)
         raise typer.Exit(code=1)
+    # Normalize mode aliases
+    _CONTRACT_MODES = frozenset({"contract", "minimal", "standard", "deep", "hybrid"})
+    if mode == "minimal":
+        mode = "contract"   # minimal is the canonical default contract rendering
+    elif mode not in _CONTRACT_MODES and mode != "raw":
+        mode = "contract"   # unknown → safe default
     # Legacy flags imply raw mode unless --mode was explicitly overridden.
     # These flags produce standard_view-only output sections not in contract_view.
     # Preserves backward compat: callers using any legacy flag get their previous format.
@@ -639,9 +648,17 @@ def main(
         compact or agent or tree or format == "yaml" or trace_pipeline
         or docs or semantics or graph_modules or full_metrics or architecture
     )
-    if mode == "contract" and _legacy_flags_active:
+    if mode in ("contract", "standard", "deep") and _legacy_flags_active:
         mode = "raw"
+    # Map mode to contract_view depth
+    _CONTRACT_DEPTH = {
+        "contract": "minimal",
+        "standard": "standard",
+        "deep": "deep",
+        "hybrid": "minimal",  # hybrid adds bodies via pipeline, minimal header
+    }
     # --- Import analysis modules ---
     from dataclasses import asdict, replace
@@ -1226,8 +1243,9 @@ def main(
                         ))
         sm = _replace(sm, pipeline_trace=_trace.build_trace())
-    # Contract pipeline — runs for mode=contract|hybrid (skip for raw)
-    if mode in ("contract", "hybrid"):
+    # Contract pipeline — runs for mode=contract|standard|deep|hybrid (skip for raw)
+    _is_contract_mode = mode in ("contract", "standard", "deep", "hybrid")
+    if _is_contract_mode:
         from sourcecode.contract_pipeline import ContractPipeline
         _cp = ContractPipeline()
         _contracts, _contract_summary = _cp.run(
@@ -1249,9 +1267,10 @@ def main(
             typer.echo(f"[contract] {len(_contracts)} files extracted ({_contract_summary.method_breakdown})", err=True)
     # 4. Serialize
-    if mode in ("contract", "hybrid"):
+    if _is_contract_mode:
         from sourcecode.serializer import contract_view as _contract_view
-        data = _contract_view(sm, emit_graph=emit_graph)
+        _depth = _CONTRACT_DEPTH.get(mode, "minimal")
+        data = _contract_view(sm, emit_graph=emit_graph, depth=_depth)
         if not no_redact:
             data = redact_dict(data)
         content = json.dumps(data, indent=2, ensure_ascii=False)

{sourcecode-0.33.0 → sourcecode-0.35.0}/src/sourcecode/contract_pipeline.py RENAMED Viewed

@@ -25,6 +25,23 @@ from sourcecode.schema import EntryPoint, MonorepoPackageInfo
 _MAX_FILES = 500      # hard cap on files extracted per run
 _SRC_EXTENSIONS: frozenset[str] = frozenset(_LANGUAGE_MAP.keys())
+# Role-based score adjustments applied after contract extraction.
+# Runtime roles get a boost; config/util are neutral or penalized.
+_ROLE_SCORE: dict[str, float] = {
+    "entrypoint": 0.15,
+    "service":    0.10,
+    "route":      0.10,
+    "api":        0.08,
+    "middleware": 0.06,
+    "store":      0.05,
+    "model":      0.05,
+    "hook":       0.05,
+    "component":  0.03,
+    "util":       0.00,
+    "config":    -0.10,
+    "unknown":    0.00,
+}
 RankStrategy = Literal["relevance", "centrality", "git-churn"]
@@ -206,9 +223,9 @@ class ContractPipeline:
         if changed_only:
             src_paths = [p for p in src_paths if p in changed_files]
-        # Apply max_files cap
-        if len(src_paths) > self.max_files:
-            # Pre-rank by static relevance to pick best candidates
+        # Apply max_files cap — bypass when symbol search to ensure defining files are found.
+        # A symbol query over a large repo needs all files; result set is small after filtering.
+        if symbol is None and len(src_paths) > self.max_files:
             src_paths = sorted(
                 src_paths,
                 key=lambda p: (p in entry_paths, scorer.score(p)),
@@ -255,23 +272,9 @@ class ContractPipeline:
         # 7. Rank
         contracts = self._rank(contracts, rank_by)
-        # 8. Symbol filter — keep files that export or use the symbol
+        # 8. Symbol filter — keep files that define or import the symbol
         if symbol:
-            symbol_contracts = [
-                c for c in contracts
-                if any(e.name == symbol for e in c.exports)
-                or any(f.name == symbol for f in c.functions)
-                or symbol in {t.name for t in c.types}
-            ]
-            # Also pull in direct importers (fan_in sourcing)
-            importer_paths = {
-                c.path for c in contracts
-                for imp in c.imports
-                if symbol in imp.symbols
-            }
-            importer_contracts = [c for c in contracts if c.path in importer_paths]
-            symbol_contracts = list({c.path: c for c in symbol_contracts + importer_contracts}.values())
-            contracts = sorted(symbol_contracts, key=lambda c: -c.relevance_score)
+            contracts = _filter_by_symbol(contracts, symbol)
         # 9. Entrypoints-only filter
         if entrypoints_only and not symbol:
@@ -323,6 +326,9 @@ class ContractPipeline:
         churn_score = min(churn.get(c.path, 0) / 20.0, 0.1)
         base += churn_score
+        # Role-based boost: runtime roles score higher than auxiliary
+        base += _ROLE_SCORE.get(c.role, 0.0)
         return min(1.0, base)
     def _rank(self, contracts: list[FileContract], rank_by: RankStrategy) -> list[FileContract]:
@@ -385,6 +391,48 @@ def _limit_symbols(contracts: list[FileContract], max_symbols: int) -> list[File
     return result
+# ---------------------------------------------------------------------------
+# Symbol-aware filter
+# ---------------------------------------------------------------------------
+def _filter_by_symbol(contracts: list[FileContract], symbol: str) -> list[FileContract]:
+    """Return contracts that define or import *symbol*.
+    Matching strategy:
+    1. Exact match on export/function/type names.
+    2. Case-insensitive fallback when exact match yields nothing.
+    3. Importer contracts: files that name the symbol in their imports.
+    Defining contracts are ranked first; importers follow.
+    """
+    def _defines(c: FileContract, sym: str, case: bool) -> bool:
+        cmp = (lambda a, b: a.lower() == b.lower()) if case else (lambda a, b: a == b)
+        return (
+            any(cmp(e.name, sym) for e in c.exports)
+            or any(cmp(f.name, sym) for f in c.functions)
+            or any(cmp(t.name, sym) for t in c.types)
+        )
+    def _imports(c: FileContract, sym: str, case: bool) -> bool:
+        if case:
+            sym_l = sym.lower()
+            return any(sym_l == s.lower() for imp in c.imports for s in imp.symbols)
+        return any(sym in imp.symbols for imp in c.imports)
+    # Exact match first
+    defining = [c for c in contracts if _defines(c, symbol, case=False)]
+    if not defining:
+        defining = [c for c in contracts if _defines(c, symbol, case=True)]
+    importer_paths = {c.path for c in contracts if _imports(c, symbol, case=len(defining) == 0)}
+    # Exclude files already in defining set
+    defining_paths = {c.path for c in defining}
+    importers = [c for c in contracts if c.path in importer_paths and c.path not in defining_paths]
+    merged = list({c.path: c for c in defining + importers}.values())
+    return sorted(merged, key=lambda c: (c.path not in defining_paths, -c.relevance_score))
 # ---------------------------------------------------------------------------
 # Dependency graph emission
 # ---------------------------------------------------------------------------

{sourcecode-0.33.0 → sourcecode-0.35.0}/src/sourcecode/relevance_scorer.py RENAMED Viewed

@@ -82,8 +82,42 @@ _AUXILIARY_DIR_PATTERNS: list[re.Pattern[str]] = [
     re.compile(r"(?:^|/)scripts?(?:/|$)"),
     re.compile(r"(?:^|/)tools?(?:/|$)"),
     re.compile(r"(?:^|/)ci(?:/|$)"),
+    re.compile(r"(?:^|/)migrations?(?:/|$)"),
+    re.compile(r"(?:^|/)generated?(?:/|$)"),
+    re.compile(r"(?:^|/)storybook(?:/|$)"),
+    re.compile(r"(?:^|/)stories(?:/|$)"),
 ]
+# Test file patterns — scored low, excluded from default contract output
+_TEST_FILE_PATTERNS: tuple[str, ...] = (
+    "_test.", ".test.", ".spec.", "test_", "conftest", "_spec.",
+)
+_TEST_DIR_MARKERS: frozenset[str] = frozenset({
+    "/test/", "/tests/", "/spec/", "/specs/", "/__tests__/", "/__mocks__/",
+})
+# Config/tooling filenames that are low runtime-relevance
+_LOW_RUNTIME_STEMS: frozenset[str] = frozenset({
+    "setup", "setup.cfg", "pyproject", "package", "package-lock",
+    "yarn.lock", "pnpm-lock", "composer", "gemfile", "podfile",
+    "dockerfile", "docker-compose", "makefile", "rakefile",
+    "gruntfile", "gulpfile", "webpack.config", "vite.config",
+    "rollup.config", "babel.config", "jest.config", "vitest.config",
+    "tsconfig", "jsconfig", ".eslintrc", ".prettierrc", ".editorconfig",
+    # doc-site tooling
+    "rspress", "rspress.config", "docusaurus.config", "docusaurus",
+    "vuepress.config", "vuepress", "nextra.config",
+    "astro.config", "gatsby.config", "gatsby-config",
+    # build/workspace orchestration
+    "turbo", "turbo.config", "nx", "nx.config", "lerna",
+    "esbuild.config", "swc.config", "postcss.config",
+    "tailwind.config", "tailwind",
+    # storybook
+    "main.storybook", "preview.storybook",
+    # playwright / cypress / e2e
+    "playwright.config", "cypress.config",
+})
 _HIGH_VALUE_SUFFIXES: frozenset[str] = frozenset({
     ".py", ".ts", ".tsx", ".js", ".jsx", ".mjs",
     ".go", ".java", ".kt", ".rs", ".rb", ".cs",
@@ -114,7 +148,7 @@ class RelevanceScorer:
         base = 0.3
-        # Package role boost
+        # Package role boost — runtime code scores high, tooling/docs low
         role = self._package_role(norm)
         role_boost = {
             "runtime_core": 0.4,
@@ -124,10 +158,10 @@ class RelevanceScorer:
             "composition_layer": 0.2,
             "plugin_package": 0.15,
             "infrastructure_layer": 0.15,
-            "tooling_layer": -0.1,
-            "docs_layer": -0.15,
-            "test_layer": 0.05,
-            "benchmark_layer": -0.2,
+            "tooling_layer": -0.15,
+            "docs_layer": -0.25,
+            "test_layer": -0.1,
+            "benchmark_layer": -0.25,
         }.get(role, 0.0)
         base += role_boost
@@ -141,9 +175,22 @@ class RelevanceScorer:
         if stem in _ENTRYPOINT_STEMS:
             base += 0.15
-        # Penalize auxiliary dirs
+        # Test file penalty — tests are useful for coverage but not for
+        # understanding architecture or editing production code
+        fname = Path(norm).name.lower()
+        if (any(m in f"/{norm}/" for m in _TEST_DIR_MARKERS)
+                or any(fname.startswith(p.strip(".")) or p in fname
+                       for p in _TEST_FILE_PATTERNS)):
+            base -= 0.30
+        # Config/tooling filename penalty — stronger than before
+        if stem.lower() in _LOW_RUNTIME_STEMS:
+            base -= 0.30
+        # Auxiliary dir penalty (docs, examples, demos, fixtures, scripts…)
+        # Aggressive: these almost never belong in top-ranked agent context
         if self._is_auxiliary(norm):
-            base -= 0.2
+            base -= 0.40
         return max(0.0, min(1.0, base))

sourcecode 0.33.0__tar.gz → 0.35.0__tar.gz

sourcecode 0.33.0__tar.gz → 0.35.0__tar.gz