archeologist 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- archeologist-0.1.0/PKG-INFO +133 -0
- archeologist-0.1.0/README.md +118 -0
- archeologist-0.1.0/archeologist.egg-info/PKG-INFO +133 -0
- archeologist-0.1.0/archeologist.egg-info/SOURCES.txt +26 -0
- archeologist-0.1.0/archeologist.egg-info/dependency_links.txt +1 -0
- archeologist-0.1.0/archeologist.egg-info/entry_points.txt +2 -0
- archeologist-0.1.0/archeologist.egg-info/requires.txt +8 -0
- archeologist-0.1.0/archeologist.egg-info/top_level.txt +1 -0
- archeologist-0.1.0/pyproject.toml +31 -0
- archeologist-0.1.0/setup.cfg +11 -0
- archeologist-0.1.0/src/__init__.py +1 -0
- archeologist-0.1.0/src/ast/__init__.py +1 -0
- archeologist-0.1.0/src/ast/lineage.py +217 -0
- archeologist-0.1.0/src/ast/parser.py +239 -0
- archeologist-0.1.0/src/cli/__init__.py +753 -0
- archeologist-0.1.0/src/cli/__main__.py +4 -0
- archeologist-0.1.0/src/db/__init__.py +1 -0
- archeologist-0.1.0/src/db/database.py +266 -0
- archeologist-0.1.0/src/git/__init__.py +1 -0
- archeologist-0.1.0/src/git/walker.py +398 -0
- archeologist-0.1.0/src/github/__init__.py +1 -0
- archeologist-0.1.0/src/github/fetcher.py +198 -0
- archeologist-0.1.0/src/github/geographic.py +91 -0
- archeologist-0.1.0/src/synthesis/__init__.py +1 -0
- archeologist-0.1.0/src/synthesis/narrative.py +145 -0
- archeologist-0.1.0/tests/test_e2e.py +232 -0
- archeologist-0.1.0/tests/test_lineage.py +232 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: archeologist
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Code archaeologist - reconstruct function decision history via AST-aware lineage tracking
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: GitPython
|
|
8
|
+
Requires-Dist: PyGithub
|
|
9
|
+
Requires-Dist: tree-sitter>=0.21.0
|
|
10
|
+
Requires-Dist: tree-sitter-languages>=1.9.0
|
|
11
|
+
Requires-Dist: click
|
|
12
|
+
Requires-Dist: pytest
|
|
13
|
+
Requires-Dist: python-dotenv
|
|
14
|
+
Requires-Dist: pyyaml
|
|
15
|
+
|
|
16
|
+
# Archeologist - Semantic Lineage Graph Generator
|
|
17
|
+
|
|
18
|
+
Post-incident archaeology tool that reconstructs function decision history via deterministic AST-aware lineage tracking.
|
|
19
|
+
|
|
20
|
+
## CLI Usage
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Install
|
|
24
|
+
pip install -e .
|
|
25
|
+
|
|
26
|
+
# Analyze file (auto-detects git repo)
|
|
27
|
+
arc analyze path/to/file.py
|
|
28
|
+
|
|
29
|
+
# Analyze specific function
|
|
30
|
+
arc analyze-function path/to/file.py function_name
|
|
31
|
+
|
|
32
|
+
# With GitHub PR integration
|
|
33
|
+
arc analyze-function path/to/file.py function_name --repo owner/repo
|
|
34
|
+
|
|
35
|
+
# With LLM narrative synthesis
|
|
36
|
+
export CLAUDE_API_KEY=sk-ant-xxx
|
|
37
|
+
arc analyze-function path/to/file.py function_name
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Configuration
|
|
41
|
+
|
|
42
|
+
Set environment variables:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
export GITHUB_TOKEN=ghp_xxx
|
|
46
|
+
export CLAUDE_API_KEY=sk-ant-xxx
|
|
47
|
+
export GIT_REPO_PATH=/path/to/local/repo
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Architecture
|
|
51
|
+
|
|
52
|
+
Three-phase pipeline:
|
|
53
|
+
|
|
54
|
+
1. **Semantic Lineage Tracking**
|
|
55
|
+
- GitWalker traverses history (--no-renames flag)
|
|
56
|
+
- ASTParser extracts function boundaries (Python, JS, TS, Go, Rust, Java, C, C++, Ruby, PHP)
|
|
57
|
+
- LineageTracker links nodes via four-tier hierarchy
|
|
58
|
+
|
|
59
|
+
2. **Contextual Slicing**
|
|
60
|
+
- PRFetcher pulls associated PRs
|
|
61
|
+
- Geographic filter maps review comments to AST node line ranges
|
|
62
|
+
|
|
63
|
+
3. **Narrative Synthesis**
|
|
64
|
+
- LiteLLM abstracts LLM calls (Claude, local models)
|
|
65
|
+
- Outputs 5-sentence brief explaining decisions
|
|
66
|
+
|
|
67
|
+
## MCP Server
|
|
68
|
+
|
|
69
|
+
The tool exposes an MCP-compatible JSON-RPC 2.0 server over stdio:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Run MCP server
|
|
73
|
+
arc-mcp
|
|
74
|
+
|
|
75
|
+
# Or run directly
|
|
76
|
+
python -m src.mcp.server
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Available Methods
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
// List functions in a file
|
|
83
|
+
{"jsonrpc": "2.0", "id": 1, "method": "list_functions", "params": {"file_path": "/path/to/file.py"}}
|
|
84
|
+
|
|
85
|
+
// Analyze a specific function
|
|
86
|
+
{"jsonrpc": "2.0", "id": 2, "method": "analyze_function", "params": {"file_path": "/path/to/file.py", "function_name": "foo"}}
|
|
87
|
+
|
|
88
|
+
// Analyze a file's overall lineage
|
|
89
|
+
{"jsonrpc": "2.0", "id": 3, "method": "analyze_file", "params": {"file_path": "/path/to/file.py"}}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Example
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Analyze the `authenticate` function in a FastAPI project
|
|
96
|
+
GIT_REPO_PATH=/Users/fuads/fastapi arc analyze-function app/auth.py authenticate --repo fastapi/fastapi
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Output:
|
|
100
|
+
```
|
|
101
|
+
Analyzing function authenticate in app/auth.py...
|
|
102
|
+
Found 12 lineage edges for authenticate
|
|
103
|
+
Summary: Found 12 historical versions of this code. Change types: physical: 8, identity: 4
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
With LLM synthesis:
|
|
107
|
+
```
|
|
108
|
+
The authenticate function evolved through 12 commits over 18 months.
|
|
109
|
+
Initial implementation used simple token validation, replaced in PR #2341
|
|
110
|
+
with OAuth2 Bearer token parsing after security audit. Several performance
|
|
111
|
+
optimizations were attempted (PRs #1892, #2103) but reverted due to race
|
|
112
|
+
conditions. The current implementation handles both JWT and opaque tokens
|
|
113
|
+
with a unified interface, consolidating three previous approaches.
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Testing
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pytest tests/
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Roadmap
|
|
123
|
+
|
|
124
|
+
- [x] Graph construction (GitWalker, ASTParser, LineageTracker)
|
|
125
|
+
- [x] Contextual slicing (PRFetcher, GeographicFilter)
|
|
126
|
+
- [x] CLI commands with local git repo auto-detection
|
|
127
|
+
- [x] 10 language support (Python, JS, TS, Go, Rust, Java, C, C++, Ruby, PHP)
|
|
128
|
+
- [x] Real-world testing on Flask repo
|
|
129
|
+
- [x] MCP server (JSON-RPC 2.0 over stdio, works with Python 3.9)
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Archeologist - Semantic Lineage Graph Generator
|
|
2
|
+
|
|
3
|
+
Post-incident archaeology tool that reconstructs function decision history via deterministic AST-aware lineage tracking.
|
|
4
|
+
|
|
5
|
+
## CLI Usage
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install
|
|
9
|
+
pip install -e .
|
|
10
|
+
|
|
11
|
+
# Analyze file (auto-detects git repo)
|
|
12
|
+
arc analyze path/to/file.py
|
|
13
|
+
|
|
14
|
+
# Analyze specific function
|
|
15
|
+
arc analyze-function path/to/file.py function_name
|
|
16
|
+
|
|
17
|
+
# With GitHub PR integration
|
|
18
|
+
arc analyze-function path/to/file.py function_name --repo owner/repo
|
|
19
|
+
|
|
20
|
+
# With LLM narrative synthesis
|
|
21
|
+
export CLAUDE_API_KEY=sk-ant-xxx
|
|
22
|
+
arc analyze-function path/to/file.py function_name
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Configuration
|
|
26
|
+
|
|
27
|
+
Set environment variables:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
export GITHUB_TOKEN=ghp_xxx
|
|
31
|
+
export CLAUDE_API_KEY=sk-ant-xxx
|
|
32
|
+
export GIT_REPO_PATH=/path/to/local/repo
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Architecture
|
|
36
|
+
|
|
37
|
+
Three-phase pipeline:
|
|
38
|
+
|
|
39
|
+
1. **Semantic Lineage Tracking**
|
|
40
|
+
- GitWalker traverses history (--no-renames flag)
|
|
41
|
+
- ASTParser extracts function boundaries (Python, JS, TS, Go, Rust, Java, C, C++, Ruby, PHP)
|
|
42
|
+
- LineageTracker links nodes via four-tier hierarchy
|
|
43
|
+
|
|
44
|
+
2. **Contextual Slicing**
|
|
45
|
+
- PRFetcher pulls associated PRs
|
|
46
|
+
- GeographicFilter maps review comments to AST node line ranges
|
|
47
|
+
|
|
48
|
+
3. **Narrative Synthesis**
|
|
49
|
+
- LiteLLM abstracts LLM calls (Claude, local models)
|
|
50
|
+
- Outputs 5-sentence brief explaining decisions
|
|
51
|
+
|
|
52
|
+
## MCP Server
|
|
53
|
+
|
|
54
|
+
The tool exposes an MCP-compatible JSON-RPC 2.0 server over stdio:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Run MCP server
|
|
58
|
+
arc-mcp
|
|
59
|
+
|
|
60
|
+
# Or run directly
|
|
61
|
+
python -m src.mcp.server
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Available Methods
|
|
65
|
+
|
|
66
|
+
```jsonc
|
|
67
|
+
// List functions in a file
|
|
68
|
+
{"jsonrpc": "2.0", "id": 1, "method": "list_functions", "params": {"file_path": "/path/to/file.py"}}
|
|
69
|
+
|
|
70
|
+
// Analyze a specific function
|
|
71
|
+
{"jsonrpc": "2.0", "id": 2, "method": "analyze_function", "params": {"file_path": "/path/to/file.py", "function_name": "foo"}}
|
|
72
|
+
|
|
73
|
+
// Analyze a file's overall lineage
|
|
74
|
+
{"jsonrpc": "2.0", "id": 3, "method": "analyze_file", "params": {"file_path": "/path/to/file.py"}}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Example
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Analyze the `authenticate` function in a FastAPI project
|
|
81
|
+
GIT_REPO_PATH=/path/to/fastapi arc analyze-function app/auth.py authenticate --repo fastapi/fastapi
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Output:
|
|
85
|
+
```
|
|
86
|
+
Analyzing function authenticate in app/auth.py...
|
|
87
|
+
Found 12 lineage edges for authenticate
|
|
88
|
+
Summary: Found 12 historical versions of this code. Change types: physical: 8, identity: 4
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
With LLM synthesis:
|
|
92
|
+
```
|
|
93
|
+
The authenticate function evolved through 12 commits over 18 months.
|
|
94
|
+
Initial implementation used simple token validation, replaced in PR #2341
|
|
95
|
+
with OAuth2 Bearer token parsing after security audit. Several performance
|
|
96
|
+
optimizations were attempted (PRs #1892, #2103) but reverted due to race
|
|
97
|
+
conditions. The current implementation handles both JWT and opaque tokens
|
|
98
|
+
with a unified interface, consolidating three previous approaches.
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Testing
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pytest tests/
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Roadmap
|
|
108
|
+
|
|
109
|
+
- [x] Graph construction (GitWalker, ASTParser, LineageTracker)
|
|
110
|
+
- [x] Contextual slicing (PRFetcher, GeographicFilter)
|
|
111
|
+
- [x] CLI commands with local git repo auto-detection
|
|
112
|
+
- [x] Support for 10 languages (Python, JS, TS, Go, Rust, Java, C, C++, Ruby, PHP)
|
|
113
|
+
- [x] Real-world testing on Flask repo
|
|
114
|
+
- [x] MCP server (JSON-RPC 2.0 over stdio, works with Python 3.9)
|
|
115
|
+
|
|
116
|
+
## License
|
|
117
|
+
|
|
118
|
+
MIT
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: archeologist
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Code archaeologist - reconstruct function decision history via AST-aware lineage tracking
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: GitPython
|
|
8
|
+
Requires-Dist: PyGithub
|
|
9
|
+
Requires-Dist: tree-sitter>=0.21.0
|
|
10
|
+
Requires-Dist: tree-sitter-languages>=1.9.0
|
|
11
|
+
Requires-Dist: click
|
|
12
|
+
Requires-Dist: pytest
|
|
13
|
+
Requires-Dist: python-dotenv
|
|
14
|
+
Requires-Dist: pyyaml
|
|
15
|
+
|
|
16
|
+
# Archeologist - Semantic Lineage Graph Generator
|
|
17
|
+
|
|
18
|
+
Post-incident archaeology tool that reconstructs function decision history via deterministic AST-aware lineage tracking.
|
|
19
|
+
|
|
20
|
+
## CLI Usage
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Install
|
|
24
|
+
pip install -e .
|
|
25
|
+
|
|
26
|
+
# Analyze file (auto-detects git repo)
|
|
27
|
+
arc analyze path/to/file.py
|
|
28
|
+
|
|
29
|
+
# Analyze specific function
|
|
30
|
+
arc analyze-function path/to/file.py function_name
|
|
31
|
+
|
|
32
|
+
# With GitHub PR integration
|
|
33
|
+
arc analyze-function path/to/file.py function_name --repo owner/repo
|
|
34
|
+
|
|
35
|
+
# With LLM narrative synthesis
|
|
36
|
+
export CLAUDE_API_KEY=sk-ant-xxx
|
|
37
|
+
arc analyze-function path/to/file.py function_name
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Configuration
|
|
41
|
+
|
|
42
|
+
Set environment variables:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
export GITHUB_TOKEN=ghp_xxx
|
|
46
|
+
export CLAUDE_API_KEY=sk-ant-xxx
|
|
47
|
+
export GIT_REPO_PATH=/path/to/local/repo
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Architecture
|
|
51
|
+
|
|
52
|
+
Three-phase pipeline:
|
|
53
|
+
|
|
54
|
+
1. **Semantic Lineage Tracking**
|
|
55
|
+
- GitWalker traverses history (--no-renames flag)
|
|
56
|
+
- ASTParser extracts function boundaries (Python, JS, TS, Go, Rust, Java, C, C++, Ruby, PHP)
|
|
57
|
+
- LineageTracker links nodes via four-tier hierarchy
|
|
58
|
+
|
|
59
|
+
2. **Contextual Slicing**
|
|
60
|
+
- PRFetcher pulls associated PRs
|
|
61
|
+
- Geographic filter maps review comments to AST node line ranges
|
|
62
|
+
|
|
63
|
+
3. **Narrative Synthesis**
|
|
64
|
+
- LiteLLM abstracts LLM calls (Claude, local models)
|
|
65
|
+
- Outputs 5-sentence brief explaining decisions
|
|
66
|
+
|
|
67
|
+
## MCP Server
|
|
68
|
+
|
|
69
|
+
The tool exposes an MCP-compatible JSON-RPC 2.0 server over stdio:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Run MCP server
|
|
73
|
+
arc-mcp
|
|
74
|
+
|
|
75
|
+
# Or run directly
|
|
76
|
+
python -m src.mcp.server
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Available Methods
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
// List functions in a file
|
|
83
|
+
{"jsonrpc": "2.0", "id": 1, "method": "list_functions", "params": {"file_path": "/path/to/file.py"}}
|
|
84
|
+
|
|
85
|
+
// Analyze a specific function
|
|
86
|
+
{"jsonrpc": "2.0", "id": 2, "method": "analyze_function", "params": {"file_path": "/path/to/file.py", "function_name": "foo"}}
|
|
87
|
+
|
|
88
|
+
// Analyze a file's overall lineage
|
|
89
|
+
{"jsonrpc": "2.0", "id": 3, "method": "analyze_file", "params": {"file_path": "/path/to/file.py"}}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Example
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Analyze the `authenticate` function in a FastAPI project
|
|
96
|
+
GIT_REPO_PATH=/Users/fuads/fastapi arc analyze-function app/auth.py authenticate --repo fastapi/fastapi
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Output:
|
|
100
|
+
```
|
|
101
|
+
Analyzing function authenticate in app/auth.py...
|
|
102
|
+
Found 12 lineage edges for authenticate
|
|
103
|
+
Summary: Found 12 historical versions of this code. Change types: physical: 8, identity: 4
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
With LLM synthesis:
|
|
107
|
+
```
|
|
108
|
+
The authenticate function evolved through 12 commits over 18 months.
|
|
109
|
+
Initial implementation used simple token validation, replaced in PR #2341
|
|
110
|
+
with OAuth2 Bearer token parsing after security audit. Several performance
|
|
111
|
+
optimizations were attempted (PRs #1892, #2103) but reverted due to race
|
|
112
|
+
conditions. The current implementation handles both JWT and opaque tokens
|
|
113
|
+
with a unified interface, consolidating three previous approaches.
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Testing
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pytest tests/
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Roadmap
|
|
123
|
+
|
|
124
|
+
- [x] Graph construction (GitWalker, ASTParser, LineageTracker)
|
|
125
|
+
- [x] Contextual slicing (PRFetcher, GeographicFilter)
|
|
126
|
+
- [x] CLI commands with local git repo auto-detection
|
|
127
|
+
- [x] 10 language support (Python, JS, TS, Go, Rust, Java, C, C++, Ruby, PHP)
|
|
128
|
+
- [x] Real-world testing on Flask repo
|
|
129
|
+
- [x] MCP server (JSON-RPC 2.0 over stdio, works with Python 3.9)
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
setup.cfg
|
|
4
|
+
archeologist.egg-info/PKG-INFO
|
|
5
|
+
archeologist.egg-info/SOURCES.txt
|
|
6
|
+
archeologist.egg-info/dependency_links.txt
|
|
7
|
+
archeologist.egg-info/entry_points.txt
|
|
8
|
+
archeologist.egg-info/requires.txt
|
|
9
|
+
archeologist.egg-info/top_level.txt
|
|
10
|
+
src/__init__.py
|
|
11
|
+
src/ast/__init__.py
|
|
12
|
+
src/ast/lineage.py
|
|
13
|
+
src/ast/parser.py
|
|
14
|
+
src/cli/__init__.py
|
|
15
|
+
src/cli/__main__.py
|
|
16
|
+
src/db/__init__.py
|
|
17
|
+
src/db/database.py
|
|
18
|
+
src/git/__init__.py
|
|
19
|
+
src/git/walker.py
|
|
20
|
+
src/github/__init__.py
|
|
21
|
+
src/github/fetcher.py
|
|
22
|
+
src/github/geographic.py
|
|
23
|
+
src/synthesis/__init__.py
|
|
24
|
+
src/synthesis/narrative.py
|
|
25
|
+
tests/test_e2e.py
|
|
26
|
+
tests/test_lineage.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
src
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "archeologist"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Code archaeologist - reconstruct function decision history via AST-aware lineage tracking"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.9"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"GitPython",
|
|
9
|
+
"PyGithub",
|
|
10
|
+
"tree-sitter>=0.21.0",
|
|
11
|
+
"tree-sitter-languages>=1.9.0",
|
|
12
|
+
"click",
|
|
13
|
+
"pytest",
|
|
14
|
+
"python-dotenv",
|
|
15
|
+
"pyyaml",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
arc = "src.cli:cli"
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["setuptools>=61.0"]
|
|
23
|
+
build-backend = "setuptools.build_meta"
|
|
24
|
+
|
|
25
|
+
[tool.pytest.ini_options]
|
|
26
|
+
testpaths = ["tests"]
|
|
27
|
+
python_files = ["test_*.py"]
|
|
28
|
+
|
|
29
|
+
[tool.setuptools.packages.find]
|
|
30
|
+
where = ["."]
|
|
31
|
+
include = ["src*"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Archeologist - Code lineage tracking tool
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .parser import ASTParser, ASTNode
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from ..git.walker import GitWalker, DiffHunk
|
|
6
|
+
from ..ast.parser import ASTParser
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class LineageEdge:
    """One link between two successive versions of a tracked AST node.

    The tier methods of ``LineageTracker`` build an edge from the first four
    fields only; ``LineageTracker.track_lineage`` then fills in the commit
    metadata before collecting the edge.
    """

    # Identifier of the node as it appeared in the earlier commit.
    parent_node_id: str
    # Identifier of the node as it appears in the later commit.
    child_node_id: str
    # Which tier matched: "identity", "physical", "signature" or "semantic".
    change_type: str
    # Strength of the match: 1.0 for identity, a fixed score for physical
    # matches, and the Jaccard overlap for signature/semantic matches.
    confidence: float

    # Commit metadata; left empty by the tier methods and populated by the
    # caller once an edge has been detected.
    commit_hash: str = ""
    commit_message: str = ""
    author: str = ""
    date: str = ""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class LineageTracker:
    """Four-tier lineage detection for AST nodes across commits.

    For each pair of successive versions of a function the tiers are tried in
    order and the first match wins:

    1. identity  - stripped content hashes are equal (confidence 1.0)
    2. physical  - the old start line, shifted by the commit's diff, lands
       within ``PHYSICAL_LINE_TOLERANCE`` lines of the new start line
    3. signature - same name and token overlap >= ``SIGNATURE_MIN_OVERLAP``
    4. semantic  - token overlap >= ``SEMANTIC_MIN_OVERLAP``

    The thresholds are class attributes so a subclass (or an instance) can
    tune them without touching the matching logic.
    """

    # Maximum distance (in lines) between projected and actual start line
    # for a tier-2 match.
    PHYSICAL_LINE_TOLERANCE = 2
    # Fixed confidence reported for tier-2 matches.
    PHYSICAL_CONFIDENCE = 0.85
    # Minimum Jaccard token overlap for tier-3 and tier-4 matches.
    SIGNATURE_MIN_OVERLAP = 0.5
    SEMANTIC_MIN_OVERLAP = 0.5

    def __init__(self, git_walker: GitWalker, ast_parser: ASTParser):
        """Store the collaborators used to read history and parse sources."""
        self.git = git_walker
        self.ast = ast_parser

    def track_lineage(
        self, file_path: str, function_name: str, language: str, max_commits: int = 100
    ) -> list[LineageEdge]:
        """Track lineage of a function across git history.

        Args:
            file_path: Path of the file, as understood by the git walker.
            function_name: Name of the function to follow.
            language: Language identifier passed through to the AST parser.
            max_commits: Upper bound on the number of commits requested.

        Returns:
            One ``LineageEdge`` per pair of successive versions for which a
            tier matched, each annotated with the later commit's hash,
            message, author and date. Empty when the file has no commits.
        """
        commits = self.git.get_commits_for_file(file_path, max_count=max_commits)
        if not commits:
            return []

        edges = []
        previous_node = None

        # The walker's order is reversed before pairing versions
        # (presumably newest-first -> oldest-first; confirm against GitWalker).
        for commit in reversed(commits):
            content = self.git.get_file_at_commit(commit.hash, file_path)
            if not content:
                continue

            tree = self.ast.parse_file(content, language)
            if not tree:
                continue

            current_node = self.ast.find_node_by_name(tree, language, function_name)
            if not current_node:
                # Function absent in this commit: keep the last seen version
                # as the comparison baseline.
                continue

            if (
                previous_node
                and previous_node.content
                and previous_node.start_line is not None
            ):
                edge = self._detect_edge_type(
                    previous_node.content,
                    current_node.content,
                    previous_node.start_line,
                    current_node.start_line,
                    previous_node.name,
                    current_node.name,
                    previous_node.node_id,
                    current_node.node_id,
                    commit.hash,
                    file_path,
                )
                if edge is not None:
                    edge.commit_hash = commit.hash
                    edge.commit_message = commit.message
                    edge.author = commit.author
                    edge.date = commit.date
                    edges.append(edge)

            # The baseline advances even when no tier matched.
            previous_node = current_node

        return edges

    def _detect_edge_type(
        self,
        old_content: str,
        new_content: str,
        old_start_line: int,
        new_start_line: int,
        old_name: str,
        new_name: str,
        parent_id: str,
        child_id: str,
        commit_hash: str,
        file_path: str,
    ) -> Optional[LineageEdge]:
        """Determine lineage edge type using four-tier hierarchy.

        Returns the edge produced by the first tier that matches, or ``None``
        when no tier matches.
        """
        edge = self._tier1_identity(old_content, new_content, parent_id, child_id)
        if edge is not None:
            return edge

        # Tier 2 needs the diff against the commit's parent; it is skipped
        # when there is no parent or no diff for this file.
        parent_commit = self.git.get_commit_parent(commit_hash)
        if parent_commit:
            diffs = self.git.get_diff_for_commit(commit_hash, file_path, parent_commit)
            if diffs:
                edge = self._tier2_physical(
                    old_start_line, new_start_line, diffs, parent_id, child_id
                )
                if edge is not None:
                    return edge

        edge = self._tier3_signature(
            old_name, new_name, old_content, new_content, parent_id, child_id
        )
        if edge is not None:
            return edge

        # NOTE(review): with equal default thresholds, tier 4 can only add a
        # match when the names differ; track_lineage looks nodes up by a single
        # name, so this presumably never fires from that path - confirm.
        return self._tier4_semantic(old_content, new_content, parent_id, child_id)

    def _tier1_identity(
        self, old_content: str, new_content: str, parent_id: str, child_id: str
    ) -> Optional[LineageEdge]:
        """Tier 1: Global hash check for identical code.

        Content is stripped of leading/trailing whitespace before hashing.
        """
        old_hash = self.git.compute_content_hash(old_content.strip())
        new_hash = self.git.compute_content_hash(new_content.strip())
        if old_hash != new_hash:
            return None

        return LineageEdge(
            parent_node_id=parent_id,
            child_node_id=child_id,
            change_type="identity",
            confidence=1.0,
        )

    def _tier2_physical(
        self,
        old_start_line: int,
        new_start_line: int,
        diffs: list[DiffHunk],
        parent_id: str,
        child_id: str,
    ) -> Optional[LineageEdge]:
        """Tier 2: Physical intersection via diff math.

        Projects the old start line forward by counting added (+1) and
        deleted (-1) lines numbered below it, then accepts the match when the
        projection is within ``PHYSICAL_LINE_TOLERANCE`` of the new start line.
        """
        # NOTE(review): added and deleted line numbers are both compared with
        # the OLD start line; if DiffHunk.added_lines uses new-file numbering
        # the projection is approximate - confirm against GitWalker.
        line_shift = 0
        for hunk in diffs:
            line_shift += sum(1 for ln in hunk.added_lines if ln < old_start_line)
            line_shift -= sum(1 for ln in hunk.deleted_lines if ln < old_start_line)

        projected = old_start_line + line_shift
        if abs(projected - new_start_line) > self.PHYSICAL_LINE_TOLERANCE:
            return None

        return LineageEdge(
            parent_node_id=parent_id,
            child_node_id=child_id,
            change_type="physical",
            confidence=self.PHYSICAL_CONFIDENCE,
        )

    def _tier3_signature(
        self,
        old_name: str,
        new_name: str,
        old_content: str,
        new_content: str,
        parent_id: str,
        child_id: str,
    ) -> Optional[LineageEdge]:
        """Tier 3: Signature match - name unchanged, content cut-and-paste.

        Requires identical names and a Jaccard overlap of whitespace-separated
        tokens of at least ``SIGNATURE_MIN_OVERLAP``; the overlap is reported
        as the confidence.
        """
        if old_name != new_name:
            return None

        overlap = self._jaccard_similarity(
            set(old_content.split()), set(new_content.split())
        )
        if overlap < self.SIGNATURE_MIN_OVERLAP:
            return None

        return LineageEdge(
            parent_node_id=parent_id,
            child_node_id=child_id,
            change_type="signature",
            confidence=overlap,
        )

    def _tier4_semantic(
        self, old_content: str, new_content: str, parent_id: str, child_id: str
    ) -> Optional[LineageEdge]:
        """Tier 4: Semantic overlap - Jaccard for split/merge.

        Name-independent: matches when the Jaccard overlap of
        whitespace-separated tokens is at least ``SEMANTIC_MIN_OVERLAP``; the
        overlap is reported as the confidence.
        """
        overlap = self._jaccard_similarity(
            set(old_content.split()), set(new_content.split())
        )
        if overlap < self.SEMANTIC_MIN_OVERLAP:
            return None

        return LineageEdge(
            parent_node_id=parent_id,
            child_node_id=child_id,
            change_type="semantic",
            confidence=overlap,
        )

    def _jaccard_similarity(self, set1: set, set2: set) -> float:
        """Calculate Jaccard similarity between two sets.

        Returns ``|set1 & set2| / |set1 | set2|``, or 0.0 when either set is
        empty.
        """
        if not set1 or not set2:
            return 0.0
        # Both sets are non-empty here, so the union cannot be empty.
        return len(set1 & set2) / len(set1 | set2)
|