codemap-python 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {codemap_python-0.1.3 → codemap_python-0.1.5}/PKG-INFO +24 -12
  2. {codemap_python-0.1.3 → codemap_python-0.1.5}/README.md +22 -9
  3. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/call_extractor.py +15 -16
  4. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/core/ast_parser.py +7 -13
  5. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/core/import_extractor.py +46 -46
  6. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/explain_runner.py +50 -47
  7. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/summary_generator.py +8 -5
  8. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/runners/phase4_runner.py +41 -44
  9. codemap_python-0.1.5/analysis/utils/bom_handler.py +119 -0
  10. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/utils/cache_manager.py +26 -21
  11. codemap_python-0.1.5/analysis/utils/progress_spinner.py +85 -0
  12. codemap_python-0.1.5/analysis/utils/repo_walk.py +27 -0
  13. codemap_python-0.1.3/codemap_cli.py → codemap_python-0.1.5/cli.py +11 -11
  14. codemap_python-0.1.3/cli.py → codemap_python-0.1.5/codemap_app.py +203 -132
  15. codemap_python-0.1.5/codemap_cli.py +11 -0
  16. {codemap_python-0.1.3 → codemap_python-0.1.5}/codemap_python.egg-info/PKG-INFO +24 -12
  17. {codemap_python-0.1.3 → codemap_python-0.1.5}/codemap_python.egg-info/SOURCES.txt +8 -0
  18. {codemap_python-0.1.3 → codemap_python-0.1.5}/codemap_python.egg-info/top_level.txt +1 -1
  19. {codemap_python-0.1.3 → codemap_python-0.1.5}/pyproject.toml +3 -4
  20. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_cache_cli_commands.py +35 -30
  21. codemap_python-0.1.5/tests/test_cli_invalid_escape_warnings.py +35 -0
  22. codemap_python-0.1.5/tests/test_codemap_cli_entrypoint.py +12 -0
  23. codemap_python-0.1.5/tests/test_explain_runner_collection.py +37 -0
  24. codemap_python-0.1.5/tests/test_repo_walk_filters.py +57 -0
  25. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_security_cli_integration.py +6 -6
  26. codemap_python-0.1.5/tests/test_summary_generator.py +12 -0
  27. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/app.py +10 -6
  28. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/static/app.js +69 -10
  29. codemap_python-0.1.3/analysis/utils/bom_handler.py +0 -55
  30. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/__init__.py +0 -0
  31. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/architecture/__init__.py +0 -0
  32. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/architecture/architecture_engine.py +0 -0
  33. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/architecture/dependency_cycles.py +0 -0
  34. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/architecture/risk_radar.py +0 -0
  35. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/__init__.py +0 -0
  36. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/call_graph_builder.py +0 -0
  37. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/call_resolver.py +0 -0
  38. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/context_models.py +0 -0
  39. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/cross_file_resolver.py +0 -0
  40. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/execution_tracker.py +0 -0
  41. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/flow_builder.py +0 -0
  42. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/call_graph/models.py +0 -0
  43. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/core/__init__.py +0 -0
  44. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/core/ast_context.py +0 -0
  45. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/core/class_extractor.py +0 -0
  46. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/core/function_extractor.py +0 -0
  47. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/__init__.py +0 -0
  48. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/docstring_extractor.py +0 -0
  49. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/repo_summary_generator.py +0 -0
  50. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/return_analyzer.py +0 -0
  51. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/risk_flags.py +0 -0
  52. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/explain/signature_extractor.py +0 -0
  53. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/graph/__init__.py +0 -0
  54. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/graph/callgraph_index.py +0 -0
  55. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/graph/entrypoint_detector.py +0 -0
  56. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/graph/impact_analyzer.py +0 -0
  57. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/indexing/__init__.py +0 -0
  58. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/indexing/import_resolver.py +0 -0
  59. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/indexing/symbol_index.py +0 -0
  60. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/runners/__init__.py +0 -0
  61. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/utils/__init__.py +0 -0
  62. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/utils/ast_helpers.py +0 -0
  63. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/utils/path_resolver.py +0 -0
  64. {codemap_python-0.1.3 → codemap_python-0.1.5}/analysis/utils/repo_fetcher.py +0 -0
  65. {codemap_python-0.1.3 → codemap_python-0.1.5}/codemap_python.egg-info/dependency_links.txt +0 -0
  66. {codemap_python-0.1.3 → codemap_python-0.1.5}/codemap_python.egg-info/entry_points.txt +0 -0
  67. {codemap_python-0.1.3 → codemap_python-0.1.5}/codemap_python.egg-info/requires.txt +0 -0
  68. {codemap_python-0.1.3 → codemap_python-0.1.5}/security_utils.py +0 -0
  69. {codemap_python-0.1.3 → codemap_python-0.1.5}/setup.cfg +0 -0
  70. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_cache_retention.py +0 -0
  71. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_no_key_persistence.py +0 -0
  72. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_registry_session_mode.py +0 -0
  73. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_security_redaction.py +0 -0
  74. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_symbol_explain_cache.py +0 -0
  75. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_symbol_info_endpoint.py +0 -0
  76. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_ui_private_mode_security.py +0 -0
  77. {codemap_python-0.1.3 → codemap_python-0.1.5}/tests/test_ui_retention_controls.py +0 -0
  78. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/__init__.py +0 -0
  79. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/device_id.py +0 -0
  80. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/static/styles.css +0 -0
  81. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/templates/index.html +0 -0
  82. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/utils/__init__.py +0 -0
  83. {codemap_python-0.1.3 → codemap_python-0.1.5}/ui/utils/registry_manager.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codemap-python
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Local Python code analysis tool - understand architecture, dependencies, and call graphs
5
5
  Author-email: ADITYA <aditykushwaha69@gmail.com>
6
- License: MIT
6
+ License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/ADITYA-kus/codemap_ai
8
8
  Project-URL: Repository, https://github.com/ADITYA-kus/codemap_ai.git
9
9
  Project-URL: Issues, https://github.com/ADITYA-kus/codemap_ai/issues
@@ -11,7 +11,6 @@ Project-URL: Documentation, https://github.com/ADITYA-kus/codemap_ai#readme
11
11
  Keywords: code-analysis,python,architecture,call-graph,cli,dashboard,local,privacy
12
12
  Classifier: Development Status :: 4 - Beta
13
13
  Classifier: Intended Audience :: Developers
14
- Classifier: License :: OSI Approved :: MIT License
15
14
  Classifier: Programming Language :: Python :: 3
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
@@ -159,22 +158,35 @@ codemap open --port 8000
159
158
 
160
159
  ### Cache Management
161
160
  ```bash
162
- # List all analyzed repositories
161
+ # 📋 List all analyzed repositories and their cache info
163
162
  codemap cache list
164
163
 
165
- # Show cache details for a repository
166
- codemap cache info <repo_hash>
164
+ # 📊 Show detailed cache information for a specific repository
165
+ codemap cache info --path <repo_directory>
167
166
 
168
- # Clear a specific repository's cache
169
- codemap cache clear <repo_hash>
167
+ # ⏱️ Set cache retention policy (automatically clean old caches)
168
+ codemap cache retention --path <repo_directory> --days 30 --yes
170
169
 
171
- # Show cache retention policy
172
- codemap cache retention <repo_hash>
170
+ # 🧹 Preview what would be cleaned (safe, no deletion)
171
+ codemap cache sweep --dry-run
173
172
 
174
- # Sweep expired caches (auto-cleanup)
175
- codemap cache sweep
173
+ # 🧹 Actually clean up expired caches (requires --yes confirmation)
174
+ codemap cache sweep --yes
175
+
176
+ # 🗑️ Clear cache for a specific repository (preview first)
177
+ codemap cache clear --path <repo_directory> --dry-run
178
+
179
+ # 🗑️ Actually delete a repository's cache (requires --yes confirmation)
180
+ codemap cache clear --path <repo_directory> --yes
176
181
  ```
177
182
 
183
+ **Cache Management Tips:**
184
+ - ✅ Always use `--dry-run` first to preview changes
185
+ - ✅ Add `--yes` flag to skip confirmation (useful in scripts)
186
+ - ✅ Default retention is 14 days; adjust with `--days <number>`
187
+ - ✅ Cache is stored in: `~/.codemap_cache/` (varies by OS)
188
+ - ✅ Use `cache list` to see all cached repositories and their sizes
189
+
178
190
  **Get GitHub Token (for private repos):**
179
191
  1. Go to https://github.com/settings/tokens
180
192
  2. Click "Generate new token" → "Generate new token (classic)"
@@ -131,22 +131,35 @@ codemap open --port 8000
131
131
 
132
132
  ### Cache Management
133
133
  ```bash
134
- # List all analyzed repositories
134
+ # 📋 List all analyzed repositories and their cache info
135
135
  codemap cache list
136
136
 
137
- # Show cache details for a repository
138
- codemap cache info <repo_hash>
137
+ # 📊 Show detailed cache information for a specific repository
138
+ codemap cache info --path <repo_directory>
139
139
 
140
- # Clear a specific repository's cache
141
- codemap cache clear <repo_hash>
140
+ # ⏱️ Set cache retention policy (automatically clean old caches)
141
+ codemap cache retention --path <repo_directory> --days 30 --yes
142
142
 
143
- # Show cache retention policy
144
- codemap cache retention <repo_hash>
143
+ # 🧹 Preview what would be cleaned (safe, no deletion)
144
+ codemap cache sweep --dry-run
145
145
 
146
- # Sweep expired caches (auto-cleanup)
147
- codemap cache sweep
146
+ # 🧹 Actually clean up expired caches (requires --yes confirmation)
147
+ codemap cache sweep --yes
148
+
149
+ # 🗑️ Clear cache for a specific repository (preview first)
150
+ codemap cache clear --path <repo_directory> --dry-run
151
+
152
+ # 🗑️ Actually delete a repository's cache (requires --yes confirmation)
153
+ codemap cache clear --path <repo_directory> --yes
148
154
  ```
149
155
 
156
+ **Cache Management Tips:**
157
+ - ✅ Always use `--dry-run` first to preview changes
158
+ - ✅ Add `--yes` flag to skip confirmation (useful in scripts)
159
+ - ✅ Default retention is 14 days; adjust with `--days <number>`
160
+ - ✅ Cache is stored in: `~/.codemap_cache/` (varies by OS)
161
+ - ✅ Use `cache list` to see all cached repositories and their sizes
162
+
150
163
  **Get GitHub Token (for private repos):**
151
164
  1. Go to https://github.com/settings/tokens
152
165
  2. Click "Generate new token" → "Generate new token (classic)"
@@ -1,7 +1,7 @@
1
- # AST Call detection
2
-
3
- import ast
4
- from analysis.utils.bom_handler import remove_bom
1
+ # AST Call detection
2
+
3
+ import ast
4
+ from analysis.utils.bom_handler import read_source_file, parse_source_to_ast
5
5
 
6
6
  class FunctionCallVisitor(ast.NodeVisitor):
7
7
  def __init__(self, file_path):
@@ -82,15 +82,14 @@ class FunctionCallVisitor(ast.NodeVisitor):
82
82
  return None
83
83
 
84
84
 
85
- def extract_function_calls(file_path):
86
- with open(file_path, "r", encoding="utf-8") as f:
87
- source = f.read()
88
-
89
- # Remove BOM if present
90
- source = remove_bom(source)
91
- tree = ast.parse(source)
92
-
93
- visitor = FunctionCallVisitor(file_path)
94
- visitor.visit(tree)
95
-
96
- return visitor.calls
85
+ def extract_function_calls(file_path):
86
+ source = read_source_file(file_path)
87
+ tree = parse_source_to_ast(source, file_path=file_path)
88
+ return extract_function_calls_from_tree(tree, file_path)
89
+
90
+
91
+ def extract_function_calls_from_tree(tree, file_path):
92
+ visitor = FunctionCallVisitor(file_path)
93
+ visitor.visit(tree)
94
+
95
+ return visitor.calls
@@ -1,13 +1,12 @@
1
- # AST Parser Module
2
- import ast
3
- from analysis.utils.bom_handler import remove_bom
1
+ # AST Parser Module
2
+ from analysis.utils.bom_handler import read_and_parse_python_file
4
3
 
5
4
 
6
- def parse_python_file(file_path):
7
- """Parse a Python file, automatically handling UTF-8 BOM.
5
+ def parse_python_file(file_path):
6
+ """Parse a Python file with automatic encoding and BOM handling.
8
7
 
9
8
  This function:
10
- 1. Reads the file with UTF-8 encoding
9
+ 1. Reads the file with automatic encoding detection (UTF-8 → Latin-1)
11
10
  2. Removes any BOM characters automatically
12
11
  3. Parses the cleaned source code
13
12
 
@@ -20,12 +19,7 @@ def parse_python_file(file_path):
20
19
  Raises:
21
20
  SyntaxError: If source code has syntax errors
22
21
  FileNotFoundError: If file doesn't exist
22
+ ValueError: If file encoding cannot be determined
23
23
  """
24
- with open(file_path, "r", encoding="utf-8") as f:
25
- source = f.read()
26
-
27
- # Remove BOM if present (handles files from Windows editors, etc.)
28
- source = remove_bom(source)
29
-
30
- return ast.parse(source)
24
+ return read_and_parse_python_file(file_path)
31
25
 
@@ -1,49 +1,49 @@
1
1
  # Import Extractor Module
2
2
  # analysis/import_extractor.py
3
3
 
4
- import ast
5
- from analysis.utils.bom_handler import remove_bom
6
-
7
-
8
- def extract_imports(file_path):
9
- """Extract imports from a Python file, handling UTF-8 BOM automatically."""
10
- with open(file_path, "r", encoding="utf-8") as f:
11
- source = f.read()
12
-
13
- # Remove BOM if present
14
- source = remove_bom(source)
15
-
16
- tree = ast.parse(source)
17
- imports = []
18
-
19
- for node in ast.walk(tree):
20
-
21
- # import module
22
- if isinstance(node, ast.Import):
23
- for alias in node.names:
24
- imports.append({
25
- "type": "import",
26
- "module": alias.name,
27
- "name": None,
28
- "alias": alias.asname,
29
- "line": node.lineno,
30
- "file": file_path
31
- })
32
-
33
- # from module import name
34
- elif isinstance(node, ast.ImportFrom):
35
- module = node.module
36
- level = node.level # 0 = absolute, >0 = relative
37
-
38
- for alias in node.names:
39
- imports.append({
40
- "type": "from_import",
41
- "module": module,
42
- "name": alias.name,
43
- "alias": alias.asname,
44
- "level": level,
45
- "line": node.lineno,
46
- "file": file_path
47
- })
48
-
49
- return imports
4
+ import ast
5
+ from analysis.utils.bom_handler import read_source_file, parse_source_to_ast
6
+
7
+
8
+ def extract_imports_from_tree(tree, file_path):
9
+ """Extract imports from an already-parsed AST tree."""
10
+ imports = []
11
+
12
+ for node in ast.walk(tree):
13
+
14
+ # import module
15
+ if isinstance(node, ast.Import):
16
+ for alias in node.names:
17
+ imports.append({
18
+ "type": "import",
19
+ "module": alias.name,
20
+ "name": None,
21
+ "alias": alias.asname,
22
+ "line": node.lineno,
23
+ "file": file_path
24
+ })
25
+
26
+ # from module import name
27
+ elif isinstance(node, ast.ImportFrom):
28
+ module = node.module
29
+ level = node.level # 0 = absolute, >0 = relative
30
+
31
+ for alias in node.names:
32
+ imports.append({
33
+ "type": "from_import",
34
+ "module": module,
35
+ "name": alias.name,
36
+ "alias": alias.asname,
37
+ "level": level,
38
+ "line": node.lineno,
39
+ "file": file_path
40
+ })
41
+
42
+ return imports
43
+
44
+
45
+ def extract_imports(file_path):
46
+ """Extract imports from a Python file with automatic encoding and BOM handling."""
47
+ source = read_source_file(file_path)
48
+ tree = parse_source_to_ast(source, file_path=file_path)
49
+ return extract_imports_from_tree(tree, file_path)
@@ -5,35 +5,32 @@ from __future__ import annotations
5
5
 
6
6
  from typing import Optional, Dict, Any
7
7
 
8
- import ast
9
- import json
10
- import os
11
- from analysis.utils.bom_handler import remove_bom
12
-
13
- from analysis.indexing.symbol_index import SymbolIndex, SymbolInfo
14
- from analysis.graph.callgraph_index import CallGraphIndex, CallSite
15
- from analysis.explain.docstring_extractor import extract_docstrings
16
- from analysis.explain.signature_extractor import extract_signatures
17
- from analysis.explain.return_analyzer import analyze_returns
18
- from analysis.explain.summary_generator import generate_symbol_summary
19
-
20
-
21
- def collect_python_files(root_dir: str):
22
- py_files = []
23
- for root, _, files in os.walk(root_dir):
24
- for file in files:
25
- if file.endswith(".py") and not file.startswith("__"):
26
- py_files.append(os.path.join(root, file))
27
- return py_files
28
-
29
-
30
- def parse_ast(file_path: str) -> ast.AST:
31
- """Parse a Python file, automatically handling UTF-8 BOM."""
32
- with open(file_path, "r", encoding="utf-8") as f:
33
- source = f.read()
34
- # Remove BOM if present
35
- source = remove_bom(source)
36
- return ast.parse(source)
8
+ import json
9
+ import os
10
+
11
+ from analysis.indexing.symbol_index import SymbolIndex, SymbolInfo
12
+ from analysis.graph.callgraph_index import CallGraphIndex, CallSite
13
+ from analysis.explain.docstring_extractor import extract_docstrings
14
+ from analysis.explain.signature_extractor import extract_signatures
15
+ from analysis.explain.return_analyzer import analyze_returns
16
+ from analysis.explain.summary_generator import generate_symbol_summary
17
+ from analysis.utils.repo_walk import filter_skipped_dirs
18
+
19
+
20
+ def collect_python_files(root_dir: str):
21
+ py_files = []
22
+ for root, dirs, files in os.walk(root_dir):
23
+ dirs[:] = filter_skipped_dirs(dirs)
24
+ for file in files:
25
+ if file.endswith(".py") and not file.startswith("__"):
26
+ py_files.append(os.path.join(root, file))
27
+ return py_files
28
+
29
+
30
+ def parse_ast(file_path: str):
31
+ """Parse a Python file with automatic encoding and BOM handling."""
32
+ from analysis.utils.bom_handler import read_and_parse_python_file
33
+ return read_and_parse_python_file(file_path)
37
34
 
38
35
 
39
36
  def file_to_module(file_path: str, repo_root: str) -> str:
@@ -83,7 +80,11 @@ def merge_maps(dst: dict, src: dict):
83
80
  dst[k].update(src.get(k, {}))
84
81
 
85
82
 
86
- def run(repo_dir: Optional[str] = None, output_dir: Optional[str] = None) -> Dict[str, Any]:
83
+ def run(
84
+ repo_dir: Optional[str] = None,
85
+ output_dir: Optional[str] = None,
86
+ symbol_snapshot: Optional[list] = None,
87
+ ) -> Dict[str, Any]:
87
88
  """
88
89
  Callable explain pipeline (Phase-5/6), suitable for CLI/VS Code.
89
90
 
@@ -119,23 +120,25 @@ def run(repo_dir: Optional[str] = None, output_dir: Optional[str] = None) -> Dic
119
120
  # 2) Collect repo python files
120
121
  python_files = collect_python_files(repo_dir)
121
122
 
122
- # 3) Build symbol index + extractors across repo
123
- symbol_index = SymbolIndex()
124
-
125
- repo_docstrings = {"module": None, "classes": {}, "functions": {}, "methods": {}}
126
- repo_signatures = {"functions": {}, "methods": {}}
127
- repo_returns = {"functions": {}, "methods": {}}
128
-
129
- for file_path in python_files:
130
- tree = parse_ast(file_path)
131
- module_path = file_to_module(file_path, repo_dir)
132
-
133
-
134
- # index symbols
135
- symbol_index.index_file(tree, module_path, file_path)
136
-
137
- # extract per-file and merge
138
- merge_maps(repo_docstrings, extract_docstrings(tree))
123
+ # 3) Build symbol index + extractors across repo
124
+ symbol_index = SymbolIndex()
125
+ loaded_snapshot = False
126
+ if isinstance(symbol_snapshot, list) and symbol_snapshot:
127
+ symbol_index.load_snapshot(symbol_snapshot)
128
+ loaded_snapshot = True
129
+
130
+ repo_docstrings = {"module": None, "classes": {}, "functions": {}, "methods": {}}
131
+ repo_signatures = {"functions": {}, "methods": {}}
132
+ repo_returns = {"functions": {}, "methods": {}}
133
+
134
+ for file_path in python_files:
135
+ tree = parse_ast(file_path)
136
+ if not loaded_snapshot:
137
+ module_path = file_to_module(file_path, repo_dir)
138
+ symbol_index.index_file(tree, module_path, file_path)
139
+
140
+ # extract per-file and merge
141
+ merge_maps(repo_docstrings, extract_docstrings(tree))
139
142
 
140
143
  sigs = extract_signatures(tree)
141
144
  repo_signatures["functions"].update(sigs.get("functions", {}))
@@ -11,11 +11,14 @@ from analysis.indexing.symbol_index import SymbolInfo
11
11
  from analysis.graph.callgraph_index import CallGraphIndex
12
12
 
13
13
 
14
- def _first_line(text: Optional[str]) -> Optional[str]:
15
- if not text:
16
- return None
17
- line = text.strip().splitlines()[0].strip()
18
- return line or None
14
+ def _first_line(text: Optional[str]) -> Optional[str]:
15
+ if not text:
16
+ return None
17
+ stripped = text.strip()
18
+ if not stripped:
19
+ return None
20
+ line = stripped.splitlines()[0].strip()
21
+ return line or None
19
22
 
20
23
 
21
24
  def _humanize_name(name: str) -> str:
@@ -3,39 +3,34 @@ from __future__ import annotations
3
3
 
4
4
  from typing import Optional, Dict, Any, List
5
5
 
6
- import os
7
- import ast
8
- import json
9
- from analysis.indexing.symbol_index import SymbolIndex
10
- from analysis.indexing.import_resolver import ImportResolver
11
- from analysis.call_graph.cross_file_resolver import CrossFileResolver
12
- from analysis.call_graph.call_extractor import extract_function_calls
13
- from analysis.core.import_extractor import extract_imports
14
- from analysis.graph.callgraph_index import build_caller_fqn
15
- from analysis.utils.bom_handler import remove_bom
6
+ import os
7
+ import json
8
+ from analysis.indexing.symbol_index import SymbolIndex
9
+ from analysis.indexing.import_resolver import ImportResolver
10
+ from analysis.call_graph.cross_file_resolver import CrossFileResolver
11
+ from analysis.call_graph.call_extractor import extract_function_calls_from_tree
12
+ from analysis.core.import_extractor import extract_imports_from_tree
13
+ from analysis.graph.callgraph_index import build_caller_fqn
14
+ from analysis.utils.repo_walk import filter_skipped_dirs
16
15
 
17
16
 
18
17
  PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
19
18
 
20
19
 
21
- def collect_python_files(root_dir: str) -> List[str]:
22
- ignore_dirs = {".git", "__pycache__", ".codemap_cache", "node_modules", ".venv", "venv"}
23
- py_files: List[str] = []
24
- for root, dirs, files in os.walk(root_dir):
25
- dirs[:] = [d for d in dirs if d not in ignore_dirs]
26
- for file in files:
27
- if file.endswith(".py") and not file.startswith("__"):
28
- py_files.append(os.path.join(root, file))
20
+ def collect_python_files(root_dir: str) -> List[str]:
21
+ py_files: List[str] = []
22
+ for root, dirs, files in os.walk(root_dir):
23
+ dirs[:] = filter_skipped_dirs(dirs)
24
+ for file in files:
25
+ if file.endswith(".py") and not file.startswith("__"):
26
+ py_files.append(os.path.join(root, file))
29
27
  return py_files
30
28
 
31
29
 
32
- def parse_ast(file_path: str):
33
- """Parse a Python file, automatically handling UTF-8 BOM."""
34
- with open(file_path, "r", encoding="utf-8") as f:
35
- source = f.read()
36
- # Remove BOM if present
37
- source = remove_bom(source)
38
- return ast.parse(source)
30
+ def parse_ast(file_path: str):
31
+ """Parse a Python file, automatically handling encoding and UTF-8 BOM."""
32
+ from analysis.utils.bom_handler import read_and_parse_python_file
33
+ return read_and_parse_python_file(file_path)
39
34
 
40
35
 
41
36
  def file_to_module(file_path: str, repo_root: str) -> str:
@@ -80,25 +75,27 @@ def run(repo_dir: Optional[str] = None, output_dir: Optional[str] = None, force_
80
75
 
81
76
  os.makedirs(output_dir, exist_ok=True)
82
77
 
83
- python_files = collect_python_files(repo_dir)
84
- symbol_index = SymbolIndex()
85
- file_module_map: Dict[str, str] = {}
86
-
87
- for file_path in python_files:
88
- module_path = file_to_module(file_path, repo_dir)
89
- file_module_map[file_path] = module_path
90
- tree = parse_ast(file_path)
91
- symbol_index.index_file(tree, module_path, file_path)
92
-
93
- import_resolver = ImportResolver(symbol_index)
94
- for file_path in python_files:
95
- module_path = file_module_map[file_path]
96
- imports = extract_imports(file_path)
97
- import_resolver.index_module_imports(module_path, imports)
98
-
99
- all_calls = []
100
- for file_path in python_files:
101
- all_calls.extend(extract_function_calls(file_path))
78
+ python_files = collect_python_files(repo_dir)
79
+ symbol_index = SymbolIndex()
80
+ file_module_map: Dict[str, str] = {}
81
+ parsed_trees: Dict[str, Any] = {}
82
+
83
+ for file_path in python_files:
84
+ module_path = file_to_module(file_path, repo_dir)
85
+ file_module_map[file_path] = module_path
86
+ tree = parse_ast(file_path)
87
+ parsed_trees[file_path] = tree
88
+ symbol_index.index_file(tree, module_path, file_path)
89
+
90
+ import_resolver = ImportResolver(symbol_index)
91
+ for file_path in python_files:
92
+ module_path = file_module_map[file_path]
93
+ imports = extract_imports_from_tree(parsed_trees[file_path], file_path)
94
+ import_resolver.index_module_imports(module_path, imports)
95
+
96
+ all_calls = []
97
+ for file_path in python_files:
98
+ all_calls.extend(extract_function_calls_from_tree(parsed_trees[file_path], file_path))
102
99
 
103
100
  cross_resolver = CrossFileResolver(symbol_index, import_resolver)
104
101
  resolved_calls = []
@@ -0,0 +1,119 @@
1
+ """BOM (Byte Order Mark), encoding, and AST parsing utilities for CodeMap.
2
+
3
+ This module provides utilities to handle:
4
+ 1. UTF-8 BOM (Byte Order Mark) characters added by certain editors
5
+ 2. Non-UTF-8 encoded files (e.g., Latin-1, Windows-1252)
6
+
7
+ Issues handled:
8
+ - BOM (U+FEFF): invisible character causing "invalid non-printable character U+FEFF"
9
+ - Non-UTF-8: files with different encodings causing UnicodeDecodeError
10
+
11
+ Solution: Detect encoding with fallback chain, strip BOM, and parse quietly.
12
+ """
13
+
14
+ import ast
15
+ import warnings
16
+ from typing import Tuple
17
+
18
+
19
+ def remove_bom(source: str) -> str:
20
+ """Remove UTF-8 BOM (Byte Order Mark) from source code if present.
21
+
22
+ BOM is a special character (U+FEFF) that some editors (especially Notepad
23
+ on Windows) add to the start of files. Python's AST parser doesn't handle it.
24
+
25
+ This function silently removes it if present, or returns the source unchanged.
26
+
27
+ Args:
28
+ source: Python source code as string
29
+
30
+ Returns:
31
+ Source code with BOM removed if present
32
+
33
+ Example:
34
+ >>> source_with_bom = '\\ufeffdef hello(): pass'
35
+ >>> clean_source = remove_bom(source_with_bom)
36
+ >>> print(clean_source)
37
+ def hello(): pass
38
+ """
39
+ if source.startswith('\ufeff'):
40
+ return source[1:]
41
+ return source
42
+
43
+
44
+ def detect_encoding(file_path: str) -> Tuple[str, bool]:
45
+ """Detect file encoding by trying multiple decodings.
46
+
47
+ Tries encodings in this order:
48
+ 1. UTF-8 (most common for Python files)
49
+ 2. System default encoding
50
+ 3. Latin-1 / ISO-8859-1 (accepts any byte sequence)
51
+
52
+ Args:
53
+ file_path: Path to file to detect encoding for
54
+
55
+ Returns:
56
+ Tuple of (encoding_name: str, is_fallback: bool)
57
+ is_fallback=True means file uses non-standard encoding
58
+
59
+ Raises:
60
+ FileNotFoundError: If file doesn't exist
61
+ """
62
+ import sys
63
+
64
+ encodings_to_try = [
65
+ ('utf-8', False),
66
+ (sys.getdefaultencoding(), False),
67
+ ('latin-1', True), # Latin-1 accepts any byte sequence
68
+ ]
69
+
70
+ for encoding, is_fallback in encodings_to_try:
71
+ try:
72
+ with open(file_path, 'rb') as f:
73
+ f.read().decode(encoding)
74
+ return (encoding, is_fallback)
75
+ except (UnicodeDecodeError, LookupError):
76
+ continue
77
+
78
+ # Should never reach here since Latin-1 accepts all bytes
79
+ return ('latin-1', True)
80
+
81
+
82
+ def read_source_file(file_path: str) -> str:
83
+ """Read a Python file with automatic encoding detection and BOM removal.
84
+
85
+ Handles files with different encodings gracefully by trying multiple
86
+ decodings in order of likelihood, then falling back to Latin-1.
87
+
88
+ Args:
89
+ file_path: Path to Python file to read
90
+
91
+ Returns:
92
+ Source code with BOM removed
93
+
94
+ Raises:
95
+ FileNotFoundError: If file doesn't exist
96
+ """
97
+ encoding, _is_fallback = detect_encoding(file_path)
98
+ with open(file_path, 'r', encoding=encoding, errors='replace') as f:
99
+ source = f.read()
100
+ return remove_bom(source)
101
+
102
+
103
+ def parse_source_to_ast(source: str, file_path: str = "<unknown>") -> ast.AST:
104
+ """Parse source code while suppressing noisy invalid-escape warnings.
105
+
106
+ Some user repositories contain regular string literals like ``"\\S"`` or
107
+ ``"\\["``. Python can emit ``SyntaxWarning: invalid escape sequence`` while
108
+ parsing those files even though analysis can continue normally. For CodeMap,
109
+ these warnings are implementation noise, so we suppress them here.
110
+ """
111
+ with warnings.catch_warnings():
112
+ warnings.filterwarnings("ignore", category=SyntaxWarning)
113
+ return ast.parse(source, filename=file_path)
114
+
115
+
116
+ def read_and_parse_python_file(file_path: str) -> ast.AST:
117
+ """Read a Python file with encoding/BOM handling and return its AST."""
118
+ source = read_source_file(file_path)
119
+ return parse_source_to_ast(source, file_path=file_path)