zerottmm-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zerottmm/gitutils.py ADDED
@@ -0,0 +1,117 @@
+ """Utilities for computing git churn.
+
+ The hotspot score used by ttmm combines cyclomatic complexity and recent
+ development activity. ``compute_churn`` uses ``git log --numstat`` to
+ approximate how much each file has changed recently. A half‑life decay
+ is applied so that more recent commits contribute more to the churn.
+
+ If ``git`` is not installed or the repository is not a git repository,
+ ``compute_churn`` will return an empty dictionary, in which case
+ complexity alone determines the hotspot score.
+ """
+
+ from __future__ import annotations
+
+ import subprocess
+ import time
+ import os
+ from typing import Dict, Optional
+
+
+ def _run_git(args, cwd: str) -> Optional[str]:
+     try:
+         result = subprocess.run(
+             ["git"] + args,
+             cwd=cwd,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+             check=True,
+         )
+         return result.stdout
+     except Exception:
+         return None
+
+
+ def compute_churn(
+     repo_path: str,
+     half_life_days: float = 90.0,
+     since_days: float = 365.0,
+ ) -> Dict[str, float]:
+     """Compute a recency‑weighted churn score for each file in a git repository.
+
+     Parameters
+     ----------
+     repo_path: str
+         Path to the repository root. Must contain a ``.git`` directory.
+     half_life_days: float
+         The half‑life for decay in days. Commits older than this
+         contribute half as much churn as very recent commits.
+     since_days: float
+         Limit churn calculation to this many days in the past. Setting
+         a finite window improves performance on large repositories.
+
+     Returns
+     -------
+     Dict[str, float]
+         Mapping from relative file path to a churn value. Paths use
+         forward slashes regardless of platform. If ``git`` is not
+         available or no repository exists, returns an empty dict.
+     """
+     # Verify .git exists
+     if not os.path.isdir(os.path.join(repo_path, ".git")):
+         return {}
+     # Build git log command
+     now = time.time()
+     since_timestamp = now - since_days * 86400
+     since_date = time.strftime("%Y-%m-%d", time.gmtime(since_timestamp))
+     output = _run_git([
+         "-c", "log.showSignature=false",  # suppress GPG signature noise
+         "log",
+         "--numstat",
+         f"--since={since_date}",
+         "--pretty=format:%ct",
+     ], cwd=repo_path)
+     if output is None:
+         return {}
+     churn: Dict[str, float] = {}
+     lines = output.splitlines()
+     commit_time: Optional[int] = None
+     for line in lines:
+         line = line.strip()
+         if not line:
+             continue
+         if line.isdigit():
+             # Start of a commit: epoch time
+             try:
+                 commit_time = int(line)
+             except Exception:
+                 commit_time = None
+         else:
+             if commit_time is None:
+                 continue
+             parts = line.split("\t")
+             if len(parts) != 3:
+                 continue
+             adds_str, dels_str, path = parts
+             # Convert additions/deletions; "-" marks binary files with no line counts
+             try:
+                 adds = int(adds_str) if adds_str != "-" else 0
+             except Exception:
+                 adds = 0
+             try:
+                 dels = int(dels_str) if dels_str != "-" else 0
+             except Exception:
+                 dels = 0
+             total = adds + dels
+             # Weight by recency
+             age_days = (now - commit_time) / 86400.0
+             weight = 0.0
+             if half_life_days > 0:
+                 # Exponential decay: half‑life h -> weight = 0.5^(age/h)
+                 weight = 0.5 ** (age_days / half_life_days)
+             else:
+                 weight = 1.0
+             churn.setdefault(path, 0.0)
+             churn[path] += total * weight
+     return churn
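
For orientation, the recency weighting above halves a change's contribution for every `half_life_days` of age. A minimal, standalone sketch of that behaviour (the `decay_weight` helper is illustrative only and simply mirrors the `0.5 ** (age_days / half_life_days)` expression in `compute_churn`):

```python
# Illustrative helper (not part of the package) mirroring compute_churn's weighting.
def decay_weight(age_days: float, half_life_days: float = 90.0) -> float:
    # weight = 0.5 ** (age / half_life); no decay when half_life_days <= 0
    return 0.5 ** (age_days / half_life_days) if half_life_days > 0 else 1.0

# A 100-line change made today adds 100 churn points, roughly 50 at 90 days old
# and roughly 25 at 180 days old (with the default 90-day half-life).
for age in (0.0, 90.0, 180.0):
    print(age, round(100 * decay_weight(age), 1))
```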
zerottmm/index.py ADDED
@@ -0,0 +1,160 @@
+ """Repository indexer for ttmm.
+
+ The indexer walks a Python repository, parses functions and methods with
+ ``ast`` and stores them in the ttmm database via :mod:`ttmm.store`.
+ It computes cyclomatic complexity and lines of code for each symbol via
+ :mod:`ttmm.metrics`, and merges in git churn information via
+ :mod:`ttmm.gitutils` to produce a hotspot score later on.
+
+ Only top‑level functions and class methods are indexed. Nested
+ functions are ignored to keep the mental model focused on API surfaces.
+ Calls made within functions are collected, but attribute calls that
+ cannot be resolved statically are marked as unresolved.
+
+ Example usage:
+
+ ```
+ from ttmm.index import index_repo
+ index_repo("/path/to/repo")
+ ```
+ """
+
+ from __future__ import annotations
+
+ import ast
+ import os
+ from typing import Dict, List, Tuple
+
+ from . import metrics, gitutils, store
+
+
+ def _iter_python_files(repo_path: str) -> List[str]:
+     """Recursively find all Python files in a repository.
+
+     Ignores the ``.ttmm`` directory. Returns relative paths with
+     forward slashes.
+     """
+     py_files: List[str] = []
+     for root, dirs, files in os.walk(repo_path):
+         # Skip .ttmm directory
+         rel_root = os.path.relpath(root, repo_path)
+         if rel_root.startswith(".ttmm"):
+             continue
+         for fname in files:
+             if fname.endswith(".py") and not fname.startswith("."):
+                 full_path = os.path.join(root, fname)
+                 rel_path = os.path.relpath(full_path, repo_path).replace(os.sep, "/")
+                 py_files.append(rel_path)
+     return py_files
+
+
+ def index_repo(repo_path: str) -> None:
+     """Index a Python repository into the ttmm database.
+
+     This will parse all ``.py`` files under ``repo_path``, compute
+     symbol definitions, call edges and metrics, and persist them in
+     ``.ttmm/ttmm.db``. Any existing static data in the database is
+     replaced. Dynamic trace data is preserved.
+     """
+     repo_path = os.path.abspath(repo_path)
+     # Discover python files
+     py_files = _iter_python_files(repo_path)
+     # Compute git churn for all files
+     churn_by_file = gitutils.compute_churn(repo_path)
+     files_data: List[Tuple[str, float]] = []
+     symbols_data: List[Dict[str, object]] = []
+     calls_data: List[Dict[str, object]] = []
+     metrics_data: Dict[str, Tuple[float, int, float]] = {}
+     for rel_path in py_files:
+         abs_path = os.path.join(repo_path, rel_path)
+         try:
+             with open(abs_path, "r", encoding="utf-8") as f:
+                 source = f.read()
+         except Exception:
+             # Skip unreadable files
+             continue
+         try:
+             tree = ast.parse(source, filename=abs_path)
+         except SyntaxError:
+             # Skip files with syntax errors
+             continue
+         mtime = os.path.getmtime(abs_path)
+         files_data.append((rel_path, mtime))
+         module_name = rel_path[:-3].replace("/", ".")  # strip .py
+
+         class IndexVisitor(ast.NodeVisitor):
+             """Visitor to collect top‑level functions and methods and their calls."""
+
+             def __init__(self) -> None:
+                 self.class_stack: List[str] = []
+                 self.func_depth = 0  # track nesting of functions
+
+             def visit_ClassDef(self, node: ast.ClassDef) -> None:
+                 self.class_stack.append(node.name)
+                 self.generic_visit(node)
+                 self.class_stack.pop()
+
+             def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
+                 # Index only functions not nested inside another function (func_depth == 0)
+                 if self.func_depth == 0:
+                     qualname = module_name + ":" + (
+                         f"{self.class_stack[-1]}." if self.class_stack else ""
+                     ) + node.name
+                     sym_type = "method" if self.class_stack else "function"
+                     doc = ast.get_docstring(node)
+                     symbols_data.append(
+                         {
+                             "qualname": qualname,
+                             "path": rel_path,
+                             "lineno": node.lineno,
+                             "endlineno": getattr(node, "end_lineno", node.lineno),
+                             "type": sym_type,
+                             "doc": doc.strip() if isinstance(doc, str) else None,
+                         }
+                     )
+                     # Metrics
+                     comp = metrics.compute_complexity(node)
+                     loc = metrics.compute_loc(node)
+                     churn = churn_by_file.get(rel_path, 0.0)
+                     metrics_data[qualname] = (comp, loc, churn)
+                     # Collect calls
+
+                     class CallVisitor(ast.NodeVisitor):
+                         def visit_Call(self, call: ast.Call) -> None:
+                             callee_name: str | None = None
+                             unresolved = False
+                             func = call.func
+                             if isinstance(func, ast.Name):
+                                 callee_name = func.id
+                             elif isinstance(func, ast.Attribute):
+                                 callee_name = func.attr
+                                 unresolved = True
+                             if callee_name:
+                                 calls_data.append(
+                                     {
+                                         "caller_qualname": qualname,
+                                         "callee_name": callee_name,
+                                         "unresolved": unresolved,
+                                     }
+                                 )
+                             # Continue into nested calls
+                             self.generic_visit(call)
+                     CallVisitor().visit(node)
+                 # Recurse into the function to handle nested functions' bodies but not index them
+                 self.func_depth += 1
+                 self.generic_visit(node)
+                 self.func_depth -= 1
+
+             # Also handle async functions similarly
+             def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
+                 # Treat async functions as normal for indexing
+                 self.visit_FunctionDef(node)  # type: ignore[arg-type]
+
+         IndexVisitor().visit(tree)
+     # Insert into DB
+     conn = store.connect(repo_path)
+     try:
+         store.reset_static_tables(conn)
+         store.insert_static_data(conn, files_data, symbols_data, calls_data, metrics_data)
+     finally:
+         store.close(conn)
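
A minimal usage sketch for the indexer, assuming the wheel installs the code as the `zerottmm` package (the module docstring's own example imports `ttmm`) and using a placeholder path:

```python
from zerottmm.index import index_repo
from zerottmm.gitutils import compute_churn

repo = "/path/to/repo"  # placeholder: a Python checkout, ideally a git repository

# Parse every .py file and persist symbols, call edges and metrics to .ttmm/ttmm.db.
index_repo(repo)

# Churn can also be inspected on its own; outside a git checkout this is just {}.
churn = compute_churn(repo)
for path, score in sorted(churn.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(f"{score:8.1f}  {path}")
```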
zerottmm/metrics.py ADDED
@@ -0,0 +1,66 @@
+ """Metrics computation for ttmm.
+
+ This module provides functions to compute cyclomatic complexity and lines
+ of code for Python functions and methods. Complexity is a simple
+ approximation based on counting branching constructs; it is not as
+ sophisticated as tools like radon or mccabe but suffices for guiding
+ reading order. It does not require any third party dependencies.
+ """
+
+ from __future__ import annotations
+
+ import ast
+
+
+ class ComplexityVisitor(ast.NodeVisitor):
+     """AST visitor that accumulates cyclomatic complexity.
+
+     The initial complexity is 1. Each branching or boolean operator
+     increments the count. Attribute calls do not contribute to
+     complexity here.
+     """
+
+     def __init__(self) -> None:
+         self.complexity = 1
+
+     def generic_visit(self, node: ast.AST) -> None:
+         # Branching constructs increase complexity by 1
+         if isinstance(
+             node,
+             (
+                 ast.If,
+                 ast.For,
+                 ast.While,
+                 ast.AsyncFor,
+                 ast.With,
+                 ast.AsyncWith,
+                 ast.Try,
+                 ast.ExceptHandler,
+             ),
+         ):
+             self.complexity += 1
+         elif isinstance(node, ast.BoolOp):
+             # bool operations like ``a and b and c`` count len(values) - 1
+             self.complexity += max(len(node.values) - 1, 0)
+         # Continue traversing
+         super().generic_visit(node)
+
+
+ def compute_complexity(node: ast.AST) -> int:
+     """Compute a simple cyclomatic complexity for a function/method AST node."""
+     visitor = ComplexityVisitor()
+     visitor.visit(node)
+     return visitor.complexity
+
+
+ def compute_loc(node: ast.AST) -> int:
+     """Compute the number of lines of code covered by a node.
+
+     Uses ``lineno`` and ``end_lineno`` attributes available on Python 3.8+
+     AST nodes. Returns at least 1 even if these attributes are missing.
+     """
+     start = getattr(node, "lineno", None)
+     end = getattr(node, "end_lineno", None)
+     if start is not None and end is not None and end >= start:
+         return end - start + 1
+     return 1
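
To illustrate the counting rules, a small sketch (the `clamp` function is just example input; assumes the module is importable as `zerottmm.metrics`):

```python
import ast

from zerottmm.metrics import compute_complexity, compute_loc

src = """\
def clamp(x, lo, hi):
    if x < lo or x > hi:
        return lo if x < lo else hi
    return x
"""
fn = ast.parse(src).body[0]

# 1 (base) + 1 (the if) + 1 (the single "or") = 3; the ternary expression is not counted.
print(compute_complexity(fn))  # 3
print(compute_loc(fn))         # 4: from "def clamp" through "return x"
```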
zerottmm/search.py ADDED
@@ -0,0 +1,121 @@
+ """Simple TF‑IDF search over ttmm symbols.
+
+ This module provides a function to answer a natural language question
+ about a codebase. It ranks functions and methods by combining
+ keyword similarity with their hotspot score. The goal is to return
+ a small set of entry points that the developer should read first to
+ understand the behaviour described by the query.
+ """
+
+ from __future__ import annotations
+
+ import re
+ import math
+ from typing import Dict, List, Tuple
+
+ from . import store
+
+
+ def _tokenize(text: str) -> List[str]:
+     """Split a string into lowercase alphanumeric tokens."""
+     return [t.lower() for t in re.findall(r"[A-Za-z0-9]+", text)]
+
+
+ def answer_question(
+     repo_path: str,
+     question: str,
+     top: int = 5,
+     include_scores: bool = False,
+ ) -> List[Tuple[str, str, int, float]]:
+     """Answer a question by returning a minimal set of relevant symbols.
+
+     Parameters
+     ----------
+     repo_path: str
+         Path to the repository root. The repository must have been
+         indexed previously.
+     question: str
+         Natural language query or keywords.
+     top: int
+         Number of symbols to return.
+     include_scores: bool
+         If True, return the computed score alongside each result.
+
+     Returns
+     -------
+     List[Tuple[qualname, file_path, line_no, score]]
+         Sorted list of symbol descriptors. Each tuple contains the
+         fully qualified name, the relative file path, the starting line
+         number and the ranking score. The score is omitted if
+         ``include_scores`` is False.
+     """
+     question_tokens = _tokenize(question)
+     if not question_tokens:
+         return []
+     conn = store.connect(repo_path)
+     try:
+         cur = conn.cursor()
+         # Load all symbols with metrics
+         cur.execute(
+             """
+             SELECT symbols.id AS id,
+                    symbols.qualname AS qualname,
+                    files.path AS file_path,
+                    symbols.lineno AS lineno,
+                    metrics.complexity AS complexity,
+                    metrics.churn AS churn,
+                    symbols.doc AS doc
+             FROM symbols
+             JOIN metrics ON metrics.symbol_id = symbols.id
+             JOIN files ON files.id = symbols.file_id
+             """
+         )
+         symbols_list = cur.fetchall()
+         # Build inverted index: token -> {symbol_index: tf}
+         token_df: Dict[str, int] = {}
+         token_tf: Dict[str, Dict[int, int]] = {}
+         docs: List[Dict[str, int]] = []
+         for idx, row in enumerate(symbols_list):
+             text_parts = [row["qualname"], row["doc"] or ""]
+             tokens = _tokenize(" ".join(text_parts))
+             tf: Dict[str, int] = {}
+             for t in tokens:
+                 tf[t] = tf.get(t, 0) + 1
+             docs.append(tf)
+             for t in tf:
+                 token_df[t] = token_df.get(t, 0) + 1
+                 token_tf.setdefault(t, {})[idx] = tf[t]
+         n_docs = len(symbols_list)
+         # Precompute idf for query tokens only to speed up
+         idf: Dict[str, float] = {}
+         for t in question_tokens:
+             df = token_df.get(t, 0)
+             # Add 1 to denominator to avoid division by zero
+             idf[t] = math.log((n_docs + 1) / (df + 1)) + 1.0
+         # Compute similarity for each symbol
+         scores: List[Tuple[int, float]] = []
+         for idx, row in enumerate(symbols_list):
+             # Compute TF‑IDF dot product for query and document
+             score = 0.0
+             for t in question_tokens:
+                 tf = token_tf.get(t, {}).get(idx, 0)
+                 score += tf * idf.get(t, 0.0)
+             if score > 0.0:
+                 # Weight by the hotspot score: complexity * (1 + sqrt(churn))
+                 complexity = row["complexity"]
+                 churn = row["churn"]
+                 hotspot = complexity * (1.0 + math.sqrt(churn))
+                 score *= (hotspot + 1e-6)
+             scores.append((idx, score))
+         # Sort descending
+         scores.sort(key=lambda x: x[1], reverse=True)
+         results = []
+         for idx, s in scores[:top]:
+             row = symbols_list[idx]
+             if include_scores:
+                 results.append((row["qualname"], row["file_path"], row["lineno"], s))
+             else:
+                 results.append((row["qualname"], row["file_path"], row["lineno"]))
+         return results
+     finally:
+         store.close(conn)
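
Finally, a usage sketch for the search entry point (placeholder path and query; the repository must already have been indexed, e.g. via `index_repo`):

```python
from zerottmm.search import answer_question

hits = answer_question(
    "/path/to/repo",                      # placeholder; must contain .ttmm/ttmm.db
    "where is the churn decay applied",   # free-form question or keywords
    top=3,
    include_scores=True,
)
for qualname, file_path, lineno, score in hits:
    print(f"{score:10.2f}  {file_path}:{lineno}  {qualname}")
```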