PyPI - refactorika - Versions diffs - 0.2.0__py3-none-any.whl - Mend

refactorika 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

refactorika/__init__.py +3 -0
refactorika/agents/__init__.py +0 -0
refactorika/agents/base.py +23 -0
refactorika/agents/complexity_agent.py +28 -0
refactorika/agents/dead_code_agent.py +23 -0
refactorika/agents/duplicate_agent.py +27 -0
refactorika/agents/import_agent.py +15 -0
refactorika/agents/orchestrator.py +82 -0
refactorika/analysis/__init__.py +0 -0
refactorika/analysis/audit.py +86 -0
refactorika/analysis/call_graph.py +411 -0
refactorika/analysis/dead_code.py +248 -0
refactorika/analysis/duplicates.py +337 -0
refactorika/analysis/embeddings.py +164 -0
refactorika/analysis/parser.py +129 -0
refactorika/analysis/related.py +159 -0
refactorika/cli.py +382 -0
refactorika/core/__init__.py +1 -0
refactorika/core/analyze.py +137 -0
refactorika/core/apply.py +161 -0
refactorika/core/gates.py +126 -0
refactorika/core/schema.py +275 -0
refactorika/core/storage.py +157 -0
refactorika/dashboard.py +165 -0
refactorika/docs_gen.py +286 -0
refactorika/harness.py +266 -0
refactorika/languages/__init__.py +18 -0
refactorika/languages/base.py +45 -0
refactorika/languages/generic_adapter.py +18 -0
refactorika/languages/python_adapter.py +49 -0
refactorika/languages/registry.py +29 -0
refactorika/mcp_server.py +193 -0
refactorika/memory/__init__.py +0 -0
refactorika/memory/agent_memory.py +116 -0
refactorika/memory/context.py +113 -0
refactorika/memory/vector_index.py +325 -0
refactorika/observability.py +152 -0
refactorika/transforms/__init__.py +0 -0
refactorika/transforms/dead.py +94 -0
refactorika/transforms/imports.py +95 -0
refactorika-0.2.0.dist-info/METADATA +541 -0
refactorika-0.2.0.dist-info/RECORD +45 -0
refactorika-0.2.0.dist-info/WHEEL +4 -0
refactorika-0.2.0.dist-info/entry_points.txt +3 -0
refactorika-0.2.0.dist-info/licenses/LICENSE +21 -0

refactorika/analysis/call_graph.py ADDED Viewed

@@ -0,0 +1,411 @@
+"""Call graph builder for dead-code reachability analysis.
+Walks all *.py files in a directory (or a single file), builds a directed graph
+of qualname -> set[qualname] edges, and exposes entry-point heuristics.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Optional
+from refactorika.analysis.parser import get_tree, iter_imports, iter_symbols
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+# Directory names excluded from scanning: _collect_py_files drops any *.py file
+# whose path has one of these names as a component (virtualenv, VCS metadata,
+# interpreter and tool caches).
+_SKIP_DIRS = {
+    ".venv",
+    "__pycache__",
+    ".git",
+    ".mypy_cache",
+    ".pytest_cache",
+    ".ruff_cache",
+}
+# Decorator-text prefixes that mark a symbol as an entry point. CallGraph.build
+# compares each decorator's text (leading "@" removed) against these with
+# str.startswith, so "app.route" also matches "app.route('/x')".
+_ENTRY_DECORATORS = {"app.route", "click.command", "pytest.fixture"}
+def _module_name(file_path: Path, root: Path) -> str:
+    """Derive dotted module name from file path relative to root."""
+    try:
+        rel = file_path.relative_to(root)
+    except ValueError:
+        rel = file_path
+    parts = list(rel.parts)
+    if parts and parts[-1].endswith(".py"):
+        parts[-1] = parts[-1][:-3]
+    if parts and parts[-1] == "__init__":
+        parts = parts[:-1]
+    return ".".join(parts) if parts else file_path.stem
+def _collect_py_files(path: str) -> tuple[list[Path], Path]:
+    """Return (list of .py files to scan, root directory for module naming)."""
+    p = Path(path)
+    if p.is_file():
+        return [p], p.parent
+    files: list[Path] = []
+    for f in p.rglob("*.py"):
+        if any(part in _SKIP_DIRS for part in f.parts):
+            continue
+        files.append(f)
+    return files, p
+def _string_literal_text(node) -> Optional[str]:
+    """Return the decoded inner text of a string node, or None if not a string."""
+    if node.type != "string":
+        return None
+    parts: list[str] = []
+    for child in node.children:
+        if child.type == "string_content" and child.text:
+            parts.append(child.text.decode())
+    if parts:
+        return "".join(parts)
+    # Fallback: strip the surrounding quotes from the raw text.
+    raw = node.text.decode() if node.text else ""
+    return raw.strip("\"'")
+def _parse_all_from_tree(tree) -> set[str]:
+    """Collect names listed in a module-level ``__all__`` via the AST.
+    Handles list **and** tuple (and set) literals, including multi-line ones —
+    anything regex-over-source missed.
+    """
+    names: set[str] = set()
+    root = tree.root_node
+    for node in root.children:
+        # __all__ = [...] / (...) is an expression_statement wrapping an assignment.
+        assign = node
+        if node.type == "expression_statement" and node.children:
+            assign = node.children[0]
+        if assign.type != "assignment":
+            continue
+        left = assign.child_by_field_name("left")
+        right = assign.child_by_field_name("right")
+        if left is None or right is None:
+            continue
+        if not (
+            left.type == "identifier" and left.text and left.text.decode() == "__all__"
+        ):
+            continue
+        if right.type not in ("list", "tuple", "set"):
+            continue
+        for elem in right.children:
+            text = _string_literal_text(elem)
+            if text and text.isidentifier():
+                names.add(text)
+    return names
+def _find_main_block(tree):
+    """Return the ``if __name__ == "__main__":`` if_statement node, or None."""
+    root = tree.root_node
+    for node in root.children:
+        if node.type != "if_statement":
+            continue
+        cond = node.child_by_field_name("condition")
+        if cond is None or cond.type != "comparison_operator":
+            continue
+        cond_text = cond.text.decode() if cond.text else ""
+        # Normalize quotes/spacing: __name__ == "__main__" or '__main__'.
+        normalized = cond_text.replace(" ", "")
+        if "__name__==" in normalized and "__main__" in normalized:
+            return node
+    return None
+def _has_main_block(tree) -> bool:
+    return _find_main_block(tree) is not None
+def _main_block_calls(tree) -> set[str]:
+    """Extract function names called anywhere inside the ``__main__`` block.
+    Walks the full block subtree (multi-line and nested calls included) via the
+    AST instead of a single-line regex.
+    """
+    block = _find_main_block(tree)
+    if block is None:
+        return set()
+    return set(_iter_calls_from_node(block))
+def _decorator_texts(node) -> list[str]:
+    """Return decorator expression texts for a function/class node."""
+    decorators: list[str] = []
+    for child in node.children:
+        if child.type == "decorator":
+            # decorator -> '@' followed by the expression
+            text = child.text.decode() if child.text else ""
+            text = text.lstrip("@").strip()
+            decorators.append(text)
+    return decorators
+# ---------------------------------------------------------------------------
+# CallGraph
+# ---------------------------------------------------------------------------
+class CallGraph:
+    """Directed call graph over all symbols in a Python project."""
+    def __init__(self) -> None:
+        # qualname -> (kind, file_path_str, line)
+        self._nodes: dict[str, tuple[str, str, int]] = {}
+        # qualname -> set of qualnames it calls
+        self._edges: dict[str, set[str]] = {}
+        # qualnames considered entry points
+        self._entry_points: set[str] = set()
+    # ------------------------------------------------------------------
+    # Builder
+    # ------------------------------------------------------------------
+    @classmethod
+    def build(cls, path: str) -> "CallGraph":
+        """Parse all *.py files under *path* and construct the call graph."""
+        cg = cls()
+        files, root = _collect_py_files(path)
+        # Pass 1: collect all symbols and build per-file data needed for edge resolution.
+        # per_file: module -> { local_name -> qualname,  import_alias -> qualname }
+        per_file_symbols: dict[str, dict[str, str]] = {}  # module -> {name: qualname}
+        per_file_imports: dict[str, dict[str, str]] = {}  # module -> {alias: qualname}
+        file_trees: dict[str, object] = {}  # module -> tree
+        file_paths: dict[str, str] = {}  # module -> filesystem path
+        for fpath in files:
+            try:
+                source = fpath.read_text(encoding="utf-8", errors="replace")
+                tree = get_tree(source)
+            except Exception:
+                continue
+            module = _module_name(fpath, root)
+            file_trees[module] = tree
+            file_paths[module] = str(fpath)
+            sym_map: dict[str, str] = {}
+            for node, kind, name, line in iter_symbols(tree):
+                qualname = f"{module}.{name}"
+                cg._nodes[qualname] = (kind, str(fpath), line)
+                sym_map[name] = qualname
+            per_file_symbols[module] = sym_map
+        # Pass 1b: collect import aliases per module
+        for module, tree in file_trees.items():
+            import_map: dict[str, str] = {}
+            try:
+                for mod, names in iter_imports(tree):
+                    if names:
+                        for nm in names:
+                            # e.g. "from orders import compute_total" -> compute_total: orders.compute_total
+                            import_map[nm] = f"{mod}.{nm}"
+                    else:
+                        # bare "import foo" -> foo: foo
+                        top = mod.split(".")[0]
+                        import_map[top] = mod
+            except Exception:
+                pass
+            per_file_imports[module] = import_map
+        # Build a project-wide unqualified-name -> qualname map, but ONLY for
+        # names that are unique across the whole project. Ambiguous names (e.g.
+        # two modules each defining `compute`) are deliberately excluded so a
+        # bare call to an ambiguous name resolves to no edge instead of guessing.
+        _unq_counts: dict[str, list[str]] = {}
+        for qualname in cg._nodes:
+            _unq_counts.setdefault(qualname.split(".")[-1], []).append(qualname)
+        unique_by_unqualified: dict[str, str] = {
+            unq: quals[0] for unq, quals in _unq_counts.items() if len(quals) == 1
+        }
+        # Pass 2: build edges + detect entry points
+        for module, tree in file_trees.items():
+            sym_map = per_file_symbols.get(module, {})
+            import_map = per_file_imports.get(module, {})
+            all_dunder_names = _parse_all_from_tree(tree)
+            main_calls = _main_block_calls(tree)
+            is_test_file = Path(file_paths[module]).name.startswith("test_") or Path(
+                file_paths[module]
+            ).name.endswith("_test.py")
+            for node, kind, name, line in iter_symbols(tree):
+                qualname = f"{module}.{name}"
+                # Determine entry point
+                is_entry = False
+                # Public name -> conservative entry point
+                if not name.startswith("_"):
+                    is_entry = True
+                # __all__ inclusion
+                if name in all_dunder_names:
+                    is_entry = True
+                # inside __main__ block call
+                if name in main_calls:
+                    is_entry = True
+                # test_ prefix or in test file
+                if name.startswith("test_") or is_test_file:
+                    is_entry = True
+                # decorator heuristic
+                for deco_text in _decorator_texts(node):
+                    for ep_deco in _ENTRY_DECORATORS:
+                        if deco_text.startswith(ep_deco):
+                            is_entry = True
+                            break
+                if is_entry:
+                    cg._entry_points.add(qualname)
+                # Build edges: collect call names from this node's body
+                try:
+                    # iter_calls walks the whole tree; we scope it to this node
+                    sub_tree_calls = list(_iter_calls_from_node(node))
+                except Exception:
+                    sub_tree_calls = []
+                edge_set: set[str] = set()
+                for call_name in sub_tree_calls:
+                    resolved = _resolve_name(
+                        call_name,
+                        module,
+                        sym_map,
+                        import_map,
+                        cg._nodes,
+                        unique_by_unqualified,
+                    )
+                    if resolved:
+                        edge_set.add(resolved)
+                cg._edges.setdefault(qualname, set()).update(edge_set)
+        # Ensure every node has an (possibly empty) edge set
+        for qualname in cg._nodes:
+            cg._edges.setdefault(qualname, set())
+        return cg
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def call_sites(self, name: str) -> int:
+        """Count how many edges point TO *name* (exact qualname only).
+        Edges store fully-resolved qualnames, so an exact match is the correct
+        count. We deliberately do **not** match on the unqualified suffix —
+        doing so would credit calls aimed at a *different* same-named symbol in
+        another module, inflating the count and masking genuinely-dead code.
+        """
+        count = 0
+        for targets in self._edges.values():
+            if name in targets:
+                count += 1
+        return count
+    def edges_from(self, qualname: str) -> set[str]:
+        """Outbound references (qualnames) from *qualname*."""
+        return self._edges.get(qualname, set())
+    def all_symbols(self) -> set[str]:
+        """All known qualnames."""
+        return set(self._nodes.keys())
+    def dependents_of(self, module: str) -> list[str]:
+        """Modules referencing *module* (matched by final segment) via call-graph edges."""
+        target = module.split(".")[-1]
+        dependents: set[str] = set()
+        for qualname in self.all_symbols():
+            src_module = qualname.rsplit(".", 1)[0] if "." in qualname else qualname
+            if src_module.split(".")[-1] == target:
+                continue  # references within the same module aren't "dependents"
+            for t in self.edges_from(qualname):
+                t_module = t.rsplit(".", 1)[0] if "." in t else t
+                if t_module.split(".")[-1] == target:
+                    dependents.add(src_module)
+                    break
+        return sorted(dependents)
+    def dependent_count(self, module: str) -> int:
+        """How many other modules depend on *module* (blast radius)."""
+        return len(self.dependents_of(module))
+    def entry_points(self) -> set[str]:
+        """Conservatively reachable anchors."""
+        return set(self._entry_points)
+    def node_info(self, qualname: str) -> Optional[tuple[str, str, int]]:
+        """Return (kind, file, line) for a qualname, or None if unknown."""
+        return self._nodes.get(qualname)
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+def _iter_calls_from_node(node) -> list[str]:
+    """Collect call target names from a single AST node (and its descendants)."""
+    results: list[str] = []
+    def _walk(n) -> None:
+        if n.type == "call":
+            fn = n.child_by_field_name("function")
+            if fn is not None:
+                if fn.type == "identifier" and fn.text:
+                    results.append(fn.text.decode())
+                elif fn.type == "attribute":
+                    attr = fn.child_by_field_name("attribute")
+                    if attr is not None and attr.text:
+                        results.append(attr.text.decode())
+        for child in n.children:
+            _walk(child)
+    _walk(node)
+    return results
+def _resolve_name(
+    name: str,
+    current_module: str,
+    sym_map: dict[str, str],
+    import_map: dict[str, str],
+    all_nodes: dict[str, tuple],
+    unique_by_unqualified: dict[str, str],
+) -> Optional[str]:
+    """Resolve a bare call name to a fully qualified name, or None.
+    Resolution is *scoped* — we never credit a call to an arbitrary same-named
+    symbol in another module (that invents false edges and makes genuinely-dead
+    code look alive). Order:
+    1. Same-module symbol table.
+    2. Real imported-name map (the name was explicitly imported into this module).
+    3. A project-wide unqualified-name match **only when it is unambiguous**
+       (exactly one symbol anywhere bears that unqualified name). When the name
+       is ambiguous across modules, we record **no edge** rather than guessing.
+    """
+    # 1. Same-module symbol
+    if name in sym_map:
+        return sym_map[name]
+    # 2. Imported alias -> the real target it was imported as
+    if name in import_map:
+        candidate = import_map[name]
+        if candidate in all_nodes:
+            return candidate
+    # 3. Unambiguous project-wide match (one and only one symbol has this name).
+    #    Ambiguous names resolve to None -> no edge.
+    return unique_by_unqualified.get(name)

refactorika/analysis/dead_code.py ADDED Viewed

@@ -0,0 +1,248 @@
+"""Dead-code detection via call-graph reachability.
+BFS/DFS from entry points; anything not reachable is a dead-code candidate.
+Confidence is assigned based on naming conventions and string-literal reflection risk.
+"""
+from __future__ import annotations
+import hashlib
+from refactorika.analysis.call_graph import CallGraph, _collect_py_files
+from refactorika.analysis.parser import get_tree
+from refactorika.core.schema import DeadSymbol
+from refactorika.core.storage import Storage
+# Builtins that take a string attribute/key name and dynamically resolve a symbol.
+# _collect_reflection_call only inspects calls whose callee is a bare identifier
+# from this set, and records every identifier-like string literal among the
+# call's arguments.
+_REFLECTION_FUNCS = {"getattr", "setattr", "hasattr", "delattr"}
+def find_dead_code(path: str, storage: Storage) -> dict:
+    """Detect unreachable symbols in *path* via call-graph reachability.
+    Parameters
+    ----------
+    path:
+        File or directory to analyse.
+    storage:
+        Storage instance. Used to cache the result on an AST/content signature
+        of the analysed files (Redis primary, JSON fallback) so a re-run over an
+        unchanged tree skips the whole call-graph build.
+    Returns
+    -------
+    dict with keys:
+        "path"          - the analysed path
+        "entry_points"  - list of qualnames used as BFS roots
+        "dead_symbols"  - list of DeadSymbol.to_dict() sorted by rank descending
+    """
+    # Cache on a signature of every analysed file (path + content). A re-seen,
+    # unchanged tree returns the prior result without re-parsing.
+    cache_key = _dir_signature(path)
+    if cache_key is not None:
+        cached = storage.cache_get(cache_key)
+        if cached is not None:
+            return cached
+    # Build call graph
+    try:
+        call_graph = CallGraph.build(path)
+    except Exception as exc:
+        return {
+            "path": path,
+            "entry_points": [],
+            "dead_symbols": [],
+            "error": str(exc),
+        }
+    all_symbols = call_graph.all_symbols()
+    entry_pts = call_graph.entry_points()
+    # BFS/DFS reachability from entry points
+    reachable: set[str] = set()
+    frontier = list(entry_pts & all_symbols)
+    while frontier:
+        node = frontier.pop()
+        if node in reachable:
+            continue
+        reachable.add(node)
+        for child in call_graph.edges_from(node):
+            if child not in reachable:
+                frontier.append(child)
+    # Collect names that appear in *actual* reflection / dynamic-dispatch
+    # patterns (getattr("name"), string dispatch-dict keys) — not every string.
+    reflection_names = _collect_reflection_names(path)
+    # Identify dead symbols
+    dead: list[DeadSymbol] = []
+    for qualname in all_symbols:
+        if qualname in reachable:
+            continue
+        info = call_graph.node_info(qualname)
+        if info is None:
+            continue
+        kind, file_str, line = info
+        unqualified = qualname.split(".")[-1]
+        sites = call_graph.call_sites(qualname)
+        # Assign confidence + reason.
+        # Reflection wins over everything: a name resolved dynamically
+        # (getattr / dispatch-dict key) can't be trusted as dead — flag low,
+        # even for a private name with zero static call sites.
+        if unqualified in reflection_names:
+            confidence = "low"
+            rank = 30
+            reason = (
+                f"Name '{unqualified}' has {sites} static call site(s) but appears as a "
+                "reflection/dispatch string (getattr/dispatch key) — possible dynamic usage."
+            )
+        elif unqualified.startswith("_") and sites == 0:
+            confidence = "high"
+            rank = 90
+            reason = f"Private name '{unqualified}' with zero call sites and unreachable from entry points."
+        elif sites == 0:
+            confidence = "medium"
+            rank = 60
+            reason = (
+                f"Public name '{unqualified}' has zero call sites within the analysed codebase "
+                "and is unreachable from entry points."
+            )
+        else:
+            # Has call sites but still unreachable — unusual; treat as medium.
+            confidence = "medium"
+            rank = 60
+            reason = (
+                f"Symbol '{unqualified}' is unreachable from entry points "
+                f"(call_sites={sites})."
+            )
+        dead.append(
+            DeadSymbol(
+                kind=kind,
+                name=qualname,
+                file=file_str,
+                line=line,
+                confidence=confidence,
+                reason=reason,
+                rank=rank,
+            )
+        )
+    # Sort by rank descending (highest confidence first)
+    dead.sort(key=lambda d: d.rank, reverse=True)
+    result = {
+        "path": path,
+        "entry_points": sorted(entry_pts),
+        "dead_symbols": [d.to_dict() for d in dead],
+    }
+    if cache_key is not None:
+        storage.cache_set(cache_key, result)
+    return result
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _dir_signature(path: str) -> str | None:
+    """Sha1 over every analysed file's relative path + content.
+    Returns ``None`` if no files are readable (nothing to cache on). Sorting the
+    inputs keeps the signature stable regardless of filesystem walk order.
+    """
+    files, root = _collect_py_files(path)
+    items: list[str] = []
+    for fpath in sorted(files):
+        try:
+            content = fpath.read_text(encoding="utf-8", errors="replace")
+        except Exception:
+            continue
+        try:
+            rel = str(fpath.relative_to(root))
+        except ValueError:
+            rel = str(fpath)
+        items.append(f"{rel}\0{content}")
+    if not items:
+        return None
+    digest = hashlib.sha1("\0\0".join(items).encode()).hexdigest()
+    return f"dead_code:{digest}"
+def _collect_reflection_names(path: str) -> set[str]:
+    """Return identifiers that appear in *actual reflection / dynamic-dispatch* sites.
+    Narrow on purpose (the old version matched any identifier-like substring in
+    any string/comment, which demoted far too many symbols to ``low``). We only
+    collect a name when it is used in a way that could dynamically resolve a
+    symbol:
+    * a string-literal argument to ``getattr`` / ``setattr`` / ``hasattr`` /
+      ``delattr`` (e.g. ``getattr(obj, "handle_event")``);
+    * a string-literal key in a dict literal — a dispatch table
+      (e.g. ``{"create": create_user, "delete": delete_user}``).
+    """
+    names: set[str] = set()
+    files, _ = _collect_py_files(path)
+    for fpath in files:
+        try:
+            source = fpath.read_text(encoding="utf-8", errors="replace")
+            tree = get_tree(source)
+        except Exception:
+            continue
+        _walk_reflection(tree.root_node, names)
+    return names
+def _walk_reflection(node, names: set[str]) -> None:
+    """Recursively collect reflection/dispatch string names from a subtree."""
+    if node.type == "call":
+        _collect_reflection_call(node, names)
+    elif node.type == "dictionary":
+        for child in node.children:
+            if child.type == "pair":
+                key = child.child_by_field_name("key")
+                text = _string_identifier(key) if key is not None else None
+                if text:
+                    names.add(text)
+    for child in node.children:
+        _walk_reflection(child, names)
+def _collect_reflection_call(call_node, names: set[str]) -> None:
+    """If *call_node* is getattr/setattr/..., collect its string-literal name args."""
+    fn = call_node.child_by_field_name("function")
+    fn_name = (
+        fn.text.decode()
+        if (fn is not None and fn.type == "identifier" and fn.text)
+        else ""
+    )
+    if fn_name not in _REFLECTION_FUNCS:
+        return
+    args = call_node.child_by_field_name("arguments")
+    if args is None:
+        return
+    for arg in args.children:
+        text = _string_identifier(arg)
+        if text:
+            names.add(text)
+def _string_identifier(node) -> str | None:
+    """Return the inner text of a string node iff it is a valid identifier, else None."""
+    if node is None or node.type != "string":
+        return None
+    parts: list[str] = []
+    for child in node.children:
+        if child.type == "string_content" and child.text:
+            parts.append(child.text.decode())
+    text = (
+        "".join(parts)
+        if parts
+        else (node.text.decode().strip("\"'") if node.text else "")
+    )
+    return text if text.isidentifier() else None