zerottmm-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zerottmm/__init__.py +38 -0
- zerottmm/ai_analysis.py +149 -0
- zerottmm/cli.py +212 -0
- zerottmm/gitingest.py +219 -0
- zerottmm/gitutils.py +117 -0
- zerottmm/index.py +160 -0
- zerottmm/metrics.py +66 -0
- zerottmm/search.py +121 -0
- zerottmm/store.py +380 -0
- zerottmm/trace.py +178 -0
- zerottmm-0.1.0.dist-info/METADATA +176 -0
- zerottmm-0.1.0.dist-info/RECORD +15 -0
- zerottmm-0.1.0.dist-info/WHEEL +5 -0
- zerottmm-0.1.0.dist-info/entry_points.txt +2 -0
- zerottmm-0.1.0.dist-info/top_level.txt +1 -0
zerottmm/gitutils.py
ADDED
@@ -0,0 +1,117 @@
"""Utilities for computing git churn.

The hotspot score used by ttmm combines cyclomatic complexity and recent
development activity. ``compute_churn`` uses ``git log --numstat`` to
approximate how much each file has changed recently. A half‑life decay
is applied so that more recent commits contribute more to the churn.

If ``git`` is not installed or the repository is not a git repository,
``compute_churn`` will return an empty dictionary, in which case
complexity alone determines the hotspot score.
"""

from __future__ import annotations

import subprocess
import time
import os
from typing import Dict, Optional


def _run_git(args, cwd: str) -> Optional[str]:
    try:
        result = subprocess.run(
            ["git"] + args,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        return result.stdout
    except Exception:
        return None


def compute_churn(
    repo_path: str,
    half_life_days: float = 90.0,
    since_days: float = 365.0,
) -> Dict[str, float]:
    """Compute a recency‑weighted churn score for each file in a git repository.

    Parameters
    ----------
    repo_path: str
        Path to the repository root. Must contain a ``.git`` directory.
    half_life_days: float
        The half‑life for decay in days. Commits older than this
        contribute half as much churn as very recent commits.
    since_days: float
        Limit churn calculation to this many days in the past. Setting
        a finite window improves performance on large repositories.

    Returns
    -------
    Dict[str, float]
        Mapping from relative file path to a churn value. Paths use
        forward slashes regardless of platform. If ``git`` is not
        available or no repository exists, returns an empty dict.
    """
    # Verify .git exists
    if not os.path.isdir(os.path.join(repo_path, ".git")):
        return {}
    # Build git log command
    now = time.time()
    since_timestamp = now - since_days * 86400
    since_date = time.strftime("%Y-%m-%d", time.gmtime(since_timestamp))
    output = _run_git([
        "-c", "log.showSignature=false",  # suppress GPG signature noise
        "log",
        "--numstat",
        f"--since={since_date}",
        "--pretty=format:%ct",
    ], cwd=repo_path)
    if output is None:
        return {}
    churn: Dict[str, float] = {}
    lines = output.splitlines()
    commit_time: Optional[int] = None
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.isdigit():
            # Start of a commit: epoch time
            try:
                commit_time = int(line)
            except Exception:
                commit_time = None
        else:
            if commit_time is None:
                continue
            parts = line.split("\t")
            if len(parts) != 3:
                continue
            adds_str, dels_str, path = parts
            # Convert additions/deletions; "-" means binary or untracked
            try:
                adds = int(adds_str) if adds_str != "-" else 0
            except Exception:
                adds = 0
            try:
                dels = int(dels_str) if dels_str != "-" else 0
            except Exception:
                dels = 0
            total = adds + dels
            # Weight by recency
            age_days = (now - commit_time) / 86400.0
            weight = 0.0
            if half_life_days > 0:
                # Exponential decay: half‑life h -> weight = 0.5^(age/h)
                weight = 0.5 ** (age_days / half_life_days)
            else:
                weight = 1.0
            churn.setdefault(path, 0.0)
            churn[path] += total * weight
    return churn
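The half‑life weighting means a commit's contribution halves every ``half_life_days``. A minimal usage sketch (the repository path is hypothetical, and the numbers in the comments assume the default 90‑day half‑life):

```
from zerottmm.gitutils import compute_churn

# Hypothetical checkout; any directory with a .git folder works.
churn = compute_churn("/path/to/repo", half_life_days=90.0, since_days=365.0)

# A commit from today that touches 40 lines adds ~40.0 to its file's churn;
# the same commit 90 days old adds 40 * 0.5 ** (90 / 90) = 20.0.
for path, score in sorted(churn.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(f"{score:8.1f}  {path}")
```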
zerottmm/index.py
ADDED
@@ -0,0 +1,160 @@
"""Repository indexer for ttmm.

The indexer walks a Python repository, parses functions and methods with
``ast`` and stores them in the ttmm database via :mod:`zerottmm.store`.
It computes cyclomatic complexity and lines of code for each symbol via
:mod:`zerottmm.metrics`, and merges in git churn information via
:mod:`zerottmm.gitutils` to produce a hotspot score later on.

Only top‑level functions and class methods are indexed. Nested
functions are ignored to keep the mental model focused on API surfaces.
Calls made within functions are collected, but attribute calls that
cannot be resolved statically are marked as unresolved.

Example usage:

```
from zerottmm.index import index_repo
index_repo("/path/to/repo")
```
"""

from __future__ import annotations

import ast
import os
from typing import Dict, List, Tuple

from . import metrics, gitutils, store


def _iter_python_files(repo_path: str) -> List[str]:
    """Recursively find all Python files in a repository.

    Ignores the ``.ttmm`` directory. Returns relative paths with
    forward slashes.
    """
    py_files: List[str] = []
    for root, dirs, files in os.walk(repo_path):
        # Skip the .ttmm directory (and anything beneath it)
        rel_root = os.path.relpath(root, repo_path)
        if rel_root.startswith(".ttmm"):
            continue
        for fname in files:
            if fname.endswith(".py") and not fname.startswith("."):
                full_path = os.path.join(root, fname)
                rel_path = os.path.relpath(full_path, repo_path).replace(os.sep, "/")
                py_files.append(rel_path)
    return py_files


def index_repo(repo_path: str) -> None:
    """Index a Python repository into the ttmm database.

    This will parse all ``.py`` files under ``repo_path``, compute
    symbol definitions, call edges and metrics, and persist them in
    ``.ttmm/ttmm.db``. Any existing static data in the database is
    replaced. Dynamic trace data is preserved.
    """
    repo_path = os.path.abspath(repo_path)
    # Discover python files
    py_files = _iter_python_files(repo_path)
    # Compute git churn for all files
    churn_by_file = gitutils.compute_churn(repo_path)
    files_data: List[Tuple[str, float]] = []
    symbols_data: List[Dict[str, object]] = []
    calls_data: List[Dict[str, object]] = []
    metrics_data: Dict[str, Tuple[float, int, float]] = {}
    for rel_path in py_files:
        abs_path = os.path.join(repo_path, rel_path)
        try:
            with open(abs_path, "r", encoding="utf-8") as f:
                source = f.read()
        except Exception:
            # Skip unreadable files
            continue
        try:
            tree = ast.parse(source, filename=abs_path)
        except SyntaxError:
            # Skip files with syntax errors
            continue
        mtime = os.path.getmtime(abs_path)
        files_data.append((rel_path, mtime))
        module_name = rel_path[:-3].replace("/", ".")  # strip .py

        class IndexVisitor(ast.NodeVisitor):
            """Visitor to collect top‑level functions and methods and their calls."""

            def __init__(self) -> None:
                self.class_stack: List[str] = []
                self.func_depth = 0  # track nesting of functions

            def visit_ClassDef(self, node: ast.ClassDef) -> None:
                self.class_stack.append(node.name)
                self.generic_visit(node)
                self.class_stack.pop()

            def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
                # Index only functions not nested inside another function (func_depth == 0)
                if self.func_depth == 0:
                    qualname = module_name + ":" + (
                        f"{self.class_stack[-1]}." if self.class_stack else ""
                    ) + node.name
                    sym_type = "method" if self.class_stack else "function"
                    doc = ast.get_docstring(node)
                    symbols_data.append(
                        {
                            "qualname": qualname,
                            "path": rel_path,
                            "lineno": node.lineno,
                            "endlineno": getattr(node, "end_lineno", node.lineno),
                            "type": sym_type,
                            "doc": doc.strip() if isinstance(doc, str) else None,
                        }
                    )
                    # Metrics
                    comp = metrics.compute_complexity(node)
                    loc = metrics.compute_loc(node)
                    churn = churn_by_file.get(rel_path, 0.0)
                    metrics_data[qualname] = (comp, loc, churn)
                    # Collect calls

                    class CallVisitor(ast.NodeVisitor):
                        def visit_Call(self, call: ast.Call) -> None:
                            callee_name: str | None = None
                            unresolved = False
                            func = call.func
                            if isinstance(func, ast.Name):
                                callee_name = func.id
                            elif isinstance(func, ast.Attribute):
                                callee_name = func.attr
                                unresolved = True
                            if callee_name:
                                calls_data.append(
                                    {
                                        "caller_qualname": qualname,
                                        "callee_name": callee_name,
                                        "unresolved": unresolved,
                                    }
                                )
                            # Continue into nested calls
                            self.generic_visit(call)

                    CallVisitor().visit(node)
                # Recurse into the function to handle nested functions' bodies but not index them
                self.func_depth += 1
                self.generic_visit(node)
                self.func_depth -= 1

            # Also handle async functions similarly
            def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
                # Treat async functions as normal for indexing
                self.visit_FunctionDef(node)  # type: ignore[arg-type]

        IndexVisitor().visit(tree)
    # Insert into DB
    conn = store.connect(repo_path)
    try:
        store.reset_static_tables(conn)
        store.insert_static_data(conn, files_data, symbols_data, calls_data, metrics_data)
    finally:
        store.close(conn)
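To make the qualname convention concrete, a short sketch under the assumption of a hypothetical repository containing ``src/pkg/parser.py`` with a top‑level function ``load`` and a class ``Parser`` defining ``parse``:

```
from zerottmm.index import index_repo

index_repo("/path/to/repo")  # persists symbols, calls and metrics in .ttmm/ttmm.db

# Module names are derived from the relative path, so indexed qualnames look like:
#   src/pkg/parser.py            -> module "src.pkg.parser"
#   def load(...)                -> "src.pkg.parser:load"          (type "function")
#   class Parser: def parse(...) -> "src.pkg.parser:Parser.parse"  (type "method")
```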
zerottmm/metrics.py
ADDED
@@ -0,0 +1,66 @@
"""Metrics computation for ttmm.

This module provides functions to compute cyclomatic complexity and lines
of code for Python functions and methods. Complexity is a simple
approximation based on counting branching constructs; it is not as
sophisticated as tools like radon or mccabe but suffices for guiding
reading order. It does not require any third party dependencies.
"""

from __future__ import annotations

import ast


class ComplexityVisitor(ast.NodeVisitor):
    """AST visitor that accumulates cyclomatic complexity.

    The initial complexity is 1. Each branching or boolean operator
    increments the count. Attribute calls do not contribute to
    complexity here.
    """

    def __init__(self) -> None:
        self.complexity = 1

    def generic_visit(self, node: ast.AST) -> None:
        # Branching constructs increase complexity by 1
        if isinstance(
            node,
            (
                ast.If,
                ast.For,
                ast.While,
                ast.AsyncFor,
                ast.With,
                ast.AsyncWith,
                ast.Try,
                ast.ExceptHandler,
            ),
        ):
            self.complexity += 1
        elif isinstance(node, ast.BoolOp):
            # bool operations like ``a and b and c`` count len(values) - 1
            self.complexity += max(len(node.values) - 1, 0)
        # Continue traversing
        super().generic_visit(node)


def compute_complexity(node: ast.AST) -> int:
    """Compute a simple cyclomatic complexity for a function/method AST node."""
    visitor = ComplexityVisitor()
    visitor.visit(node)
    return visitor.complexity


def compute_loc(node: ast.AST) -> int:
    """Compute the number of lines of code covered by a node.

    Uses ``lineno`` and ``end_lineno`` attributes available on Python 3.8+
    AST nodes. Returns at least 1 even if these attributes are missing.
    """
    start = getattr(node, "lineno", None)
    end = getattr(node, "end_lineno", None)
    if start is not None and end is not None and end >= start:
        return end - start + 1
    return 1
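As a quick sanity check of the counting rules (base complexity 1, +1 per branching construct, +1 per extra boolean operand), this small sketch scores a toy function by hand:

```
import ast
from zerottmm.metrics import compute_complexity, compute_loc

source = '''
def check(a, b):
    if a and b:
        return True
    return False
'''
func = ast.parse(source).body[0]

print(compute_complexity(func))  # 3 = base 1 + the `if` + one extra `and` operand
print(compute_loc(func))         # 4: the `def` line through the final `return`
```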
zerottmm/search.py
ADDED
@@ -0,0 +1,121 @@
"""Simple TF‑IDF search over ttmm symbols.

This module provides a function to answer a natural language question
about a codebase. It ranks functions and methods by combining
keyword similarity with their hotspot score. The goal is to return
a small set of entry points that the developer should read first to
understand the behaviour described by the query.
"""

from __future__ import annotations

import re
import math
from typing import Dict, List, Tuple

from . import store


def _tokenize(text: str) -> List[str]:
    """Split a string into lowercase alphanumeric tokens."""
    return [t.lower() for t in re.findall(r"[A-Za-z0-9]+", text)]


def answer_question(
    repo_path: str,
    question: str,
    top: int = 5,
    include_scores: bool = False,
) -> List[Tuple[str, str, int, float]]:
    """Answer a question by returning a minimal set of relevant symbols.

    Parameters
    ----------
    repo_path: str
        Path to the repository root. The repository must have been
        indexed previously.
    question: str
        Natural language query or keywords.
    top: int
        Number of symbols to return.
    include_scores: bool
        If True, return the computed score alongside each result.

    Returns
    -------
    List[Tuple[qualname, file_path, line_no, score]]
        Sorted list of symbol descriptors. Each tuple contains the
        fully qualified name, the relative file path, the starting line
        number and the ranking score. The score is omitted if
        ``include_scores`` is False.
    """
    question_tokens = _tokenize(question)
    if not question_tokens:
        return []
    conn = store.connect(repo_path)
    try:
        cur = conn.cursor()
        # Load all symbols with metrics
        cur.execute(
            """
            SELECT symbols.id AS id,
                   symbols.qualname AS qualname,
                   files.path AS file_path,
                   symbols.lineno AS lineno,
                   metrics.complexity AS complexity,
                   metrics.churn AS churn,
                   symbols.doc AS doc
            FROM symbols
            JOIN metrics ON metrics.symbol_id = symbols.id
            JOIN files ON files.id = symbols.file_id
            """
        )
        symbols_list = cur.fetchall()
        # Build inverted index: token -> {symbol_index: tf}
        token_df: Dict[str, int] = {}
        token_tf: Dict[str, Dict[int, int]] = {}
        docs: List[Dict[str, int]] = []
        for idx, row in enumerate(symbols_list):
            text_parts = [row["qualname"], row["doc"] or ""]
            tokens = _tokenize(" ".join(text_parts))
            tf: Dict[str, int] = {}
            for t in tokens:
                tf[t] = tf.get(t, 0) + 1
            docs.append(tf)
            for t in tf:
                token_df[t] = token_df.get(t, 0) + 1
                token_tf.setdefault(t, {})[idx] = tf[t]
        n_docs = len(symbols_list)
        # Precompute idf for query tokens only to speed up
        idf: Dict[str, float] = {}
        for t in question_tokens:
            df = token_df.get(t, 0)
            # Add 1 to denominator to avoid division by zero
            idf[t] = math.log((n_docs + 1) / (df + 1)) + 1.0
        # Compute similarity for each symbol
        scores: List[Tuple[int, float]] = []
        for idx, row in enumerate(symbols_list):
            # Compute TF‑IDF dot product for query and document
            score = 0.0
            for t in question_tokens:
                tf = token_tf.get(t, {}).get(idx, 0)
                score += tf * idf.get(t, 0.0)
            if score > 0.0:
                # Multiply by hotspot score (complexity * (1 + sqrt(churn)))
                complexity = row["complexity"]
                churn = row["churn"]
                hotspot = complexity * (1.0 + math.sqrt(churn))
                score *= (hotspot + 1e-6)
                scores.append((idx, score))
        # Sort descending
        scores.sort(key=lambda x: x[1], reverse=True)
        results = []
        for idx, s in scores[:top]:
            row = symbols_list[idx]
            if include_scores:
                results.append((row["qualname"], row["file_path"], row["lineno"], s))
            else:
                results.append((row["qualname"], row["file_path"], row["lineno"]))
        return results
    finally:
        store.close(conn)
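A minimal sketch of querying an indexed checkout (the path and question are made up). Each hit's score is the query's tf·idf dot product scaled by ``complexity * (1 + sqrt(churn))``, so hot, complex symbols rise to the top:

```
from zerottmm.search import answer_question

hits = answer_question(
    "/path/to/repo",
    "where is churn computed",
    top=3,
    include_scores=True,  # with False, each hit is a 3-tuple without the score
)
for qualname, file_path, lineno, score in hits:
    print(f"{score:10.2f}  {qualname}  ({file_path}:{lineno})")
```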