mcp-code-indexer 2.3.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {mcp_code_indexer-2.3.0/src/mcp_code_indexer.egg-info → mcp_code_indexer-3.0.0}/PKG-INFO +3 -3
  2. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/README.md +2 -2
  3. mcp_code_indexer-3.0.0/migrations/004_remove_branch_dependency.sql +166 -0
  4. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/pyproject.toml +1 -1
  5. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/ask_handler.py +5 -7
  6. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/claude_api_handler.py +2 -2
  7. mcp_code_indexer-3.0.0/src/mcp_code_indexer/cleanup_manager.py +255 -0
  8. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/database/database.py +82 -90
  9. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/database/models.py +3 -5
  10. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/deepask_handler.py +5 -9
  11. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/git_hook_handler.py +2 -9
  12. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/main.py +1 -0
  13. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/server/mcp_server.py +107 -209
  14. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0/src/mcp_code_indexer.egg-info}/PKG-INFO +3 -3
  15. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer.egg-info/SOURCES.txt +2 -0
  16. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/LICENSE +0 -0
  17. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/MANIFEST.in +0 -0
  18. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/api-reference.md +0 -0
  19. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/architecture.md +0 -0
  20. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/configuration.md +0 -0
  21. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/contributing.md +0 -0
  22. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/database-resilience.md +0 -0
  23. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/git-hook-setup.md +0 -0
  24. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/monitoring.md +0 -0
  25. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/docs/performance-tuning.md +0 -0
  26. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/migrations/001_initial.sql +0 -0
  27. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/migrations/002_performance_indexes.sql +0 -0
  28. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/migrations/003_project_overviews.sql +0 -0
  29. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/requirements.txt +0 -0
  30. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/setup.cfg +0 -0
  31. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/setup.py +0 -0
  32. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/__init__.py +0 -0
  33. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/__main__.py +0 -0
  34. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/data/stop_words_english.txt +0 -0
  35. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/database/__init__.py +0 -0
  36. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/database/connection_health.py +0 -0
  37. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/database/exceptions.py +0 -0
  38. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/database/retry_executor.py +0 -0
  39. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/error_handler.py +0 -0
  40. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/file_scanner.py +0 -0
  41. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/logging_config.py +0 -0
  42. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/merge_handler.py +0 -0
  43. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/middleware/__init__.py +0 -0
  44. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/middleware/error_middleware.py +0 -0
  45. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/query_preprocessor.py +0 -0
  46. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/server/__init__.py +0 -0
  47. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 +0 -0
  48. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/token_counter.py +0 -0
  49. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer/tools/__init__.py +0 -0
  50. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer.egg-info/dependency_links.txt +0 -0
  51. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer.egg-info/entry_points.txt +0 -0
  52. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer.egg-info/requires.txt +0 -0
  53. {mcp_code_indexer-2.3.0 → mcp_code_indexer-3.0.0}/src/mcp_code_indexer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcp-code-indexer
3
- Version: 2.3.0
3
+ Version: 3.0.0
4
4
  Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
5
5
  Author: MCP Code Indexer Contributors
6
6
  Maintainer: MCP Code Indexer Contributors
@@ -59,8 +59,8 @@ Dynamic: requires-python
59
59
 
60
60
  # MCP Code Indexer 🚀
61
61
 
62
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?17)](https://badge.fury.io/py/mcp-code-indexer)
63
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?17)](https://pypi.org/project/mcp-code-indexer/)
62
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?19)](https://badge.fury.io/py/mcp-code-indexer)
63
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?19)](https://pypi.org/project/mcp-code-indexer/)
64
64
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
65
65
 
66
66
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -1,7 +1,7 @@
1
1
  # MCP Code Indexer 🚀
2
2
 
3
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?17)](https://badge.fury.io/py/mcp-code-indexer)
4
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?17)](https://pypi.org/project/mcp-code-indexer/)
3
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?19)](https://badge.fury.io/py/mcp-code-indexer)
4
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?19)](https://pypi.org/project/mcp-code-indexer/)
5
5
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
6
6
 
7
7
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -0,0 +1,166 @@
1
-- Migration 004: Remove branch dependency from the database schema.
-- Consolidates multi-branch data and simplifies the schema by removing the
-- branch columns from file_descriptions and project_overviews.

-- Ensure WAL mode is enabled for safe migrations (must run outside the
-- transaction below).
PRAGMA journal_mode=WAL;

-- Enable foreign key support.
PRAGMA foreign_keys=ON;

-- Run the whole migration atomically.
BEGIN TRANSACTION;

-- New file_descriptions table without the branch column.
-- to_be_cleaned: UNIX timestamp marking the row as soft-deleted; NULL = active.
CREATE TABLE file_descriptions_new (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    project_id TEXT NOT NULL,
    file_path TEXT NOT NULL,
    description TEXT NOT NULL,
    file_hash TEXT,
    last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
    version INTEGER DEFAULT 1,
    source_project_id TEXT,
    to_be_cleaned INTEGER DEFAULT NULL,
    UNIQUE(project_id, file_path),
    FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE,
    FOREIGN KEY (source_project_id) REFERENCES projects(id) ON DELETE SET NULL
);

-- Consolidate data from the old table: multi-branch rows collapse to the
-- single most recently modified description per (project_id, file_path).
INSERT INTO file_descriptions_new (
    project_id, file_path, description, file_hash, last_modified, version, source_project_id
)
SELECT
    project_id,
    file_path,
    description,
    file_hash,
    last_modified,
    version,
    source_project_id
FROM (
    SELECT
        project_id,
        file_path,
        description,
        file_hash,
        last_modified,
        version,
        source_project_id,
        ROW_NUMBER() OVER (
            PARTITION BY project_id, file_path
            ORDER BY last_modified DESC
        ) AS rn
    FROM file_descriptions
) ranked_descriptions
WHERE rn = 1;

-- New project_overviews table without the branch column.
CREATE TABLE project_overviews_new (
    project_id TEXT PRIMARY KEY,
    overview TEXT NOT NULL,
    last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    total_files INTEGER NOT NULL DEFAULT 0,
    total_tokens INTEGER NOT NULL DEFAULT 0,
    FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
);

-- Consolidate project overviews: keep the most comprehensive one per project
-- (highest token count, ties broken by recency).
INSERT INTO project_overviews_new (
    project_id, overview, last_modified, total_files, total_tokens
)
SELECT
    project_id,
    overview,
    last_modified,
    total_files,
    total_tokens
FROM (
    SELECT
        project_id,
        overview,
        last_modified,
        total_files,
        total_tokens,
        ROW_NUMBER() OVER (
            PARTITION BY project_id
            ORDER BY total_tokens DESC, last_modified DESC
        ) AS rn
    FROM project_overviews
) ranked_overviews
WHERE rn = 1;

-- Drop FTS5 triggers bound to the old table.
DROP TRIGGER IF EXISTS file_descriptions_ai;
DROP TRIGGER IF EXISTS file_descriptions_ad;
DROP TRIGGER IF EXISTS file_descriptions_au;

-- Drop the old FTS5 virtual table.
DROP TABLE IF EXISTS file_descriptions_fts;

-- Drop old tables (this also drops their indexes, freeing the canonical
-- index names for re-use below).
DROP TABLE file_descriptions;
DROP TABLE project_overviews;

-- Rename new tables to the original names.
ALTER TABLE file_descriptions_new RENAME TO file_descriptions;
ALTER TABLE project_overviews_new RENAME TO project_overviews;

-- Create indexes AFTER the rename so their names do not keep a stale
-- "_new" suffix in the final schema.
CREATE INDEX idx_file_descriptions_project_id ON file_descriptions(project_id);
CREATE INDEX idx_file_descriptions_file_hash ON file_descriptions(file_hash);
CREATE INDEX idx_file_descriptions_last_modified ON file_descriptions(last_modified);
CREATE INDEX idx_file_descriptions_to_be_cleaned ON file_descriptions(to_be_cleaned);
CREATE INDEX idx_project_overviews_last_modified ON project_overviews(last_modified);

-- New FTS5 external-content virtual table without the branch column.
CREATE VIRTUAL TABLE file_descriptions_fts USING fts5(
    project_id,
    file_path,
    description,
    content='file_descriptions',
    content_rowid='id'
);

-- Populate FTS5 with existing data (active records only).
INSERT INTO file_descriptions_fts(rowid, project_id, file_path, description)
SELECT id, project_id, file_path, description
FROM file_descriptions
WHERE to_be_cleaned IS NULL;

-- FTS5 sync triggers.  For external-content FTS5 tables a 'delete' entry must
-- only be issued for rows that were actually indexed, otherwise the index is
-- corrupted.  Rows with to_be_cleaned set are never indexed, so every trigger
-- below is guarded on that flag.

-- Index newly inserted rows only while they are active.
CREATE TRIGGER file_descriptions_ai AFTER INSERT ON file_descriptions
WHEN new.to_be_cleaned IS NULL
BEGIN
    INSERT INTO file_descriptions_fts(rowid, project_id, file_path, description)
    VALUES (new.id, new.project_id, new.file_path, new.description);
END;

-- Remove deleted rows from the index only if they were indexed.
CREATE TRIGGER file_descriptions_ad AFTER DELETE ON file_descriptions
WHEN old.to_be_cleaned IS NULL
BEGIN
    INSERT INTO file_descriptions_fts(file_descriptions_fts, rowid, project_id, file_path, description)
    VALUES ('delete', old.id, old.project_id, old.file_path, old.description);
END;

CREATE TRIGGER file_descriptions_au AFTER UPDATE ON file_descriptions BEGIN
    -- Drop the previous version from FTS only if it was actually indexed.
    INSERT INTO file_descriptions_fts(file_descriptions_fts, rowid, project_id, file_path, description)
    SELECT 'delete', old.id, old.project_id, old.file_path, old.description
    WHERE old.to_be_cleaned IS NULL;

    -- (Re-)index the new version only while it is active.
    INSERT INTO file_descriptions_fts(rowid, project_id, file_path, description)
    SELECT new.id, new.project_id, new.file_path, new.description
    WHERE new.to_be_cleaned IS NULL;
END;

-- merge_conflicts keeps its structure (used only for temporary conflict
-- resolution); just replace the branch-era index with a branch-free one.
DROP INDEX IF EXISTS idx_merge_conflicts_project;
CREATE INDEX idx_merge_conflicts_project ON merge_conflicts(project_id, created);

-- Commit the migration.
COMMIT;
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "mcp-code-indexer"
7
- version = "2.3.0"
7
+ version = "3.0.0"
8
8
  description = "MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews."
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -51,7 +51,7 @@ class AskHandler(ClaudeAPIHandler):
51
51
  Ask a question about the project using Claude API.
52
52
 
53
53
  Args:
54
- project_info: Project information dict with projectName, folderPath, branch, etc.
54
+ project_info: Project information dict with projectName, folderPath, etc.
55
55
  question: User's question about the project
56
56
  include_overview: Whether to include project overview in context
57
57
 
@@ -112,8 +112,7 @@ class AskHandler(ClaudeAPIHandler):
112
112
  "response_tokens": response.usage.get("completion_tokens") if response.usage else None,
113
113
  "total_tokens": response.usage.get("total_tokens") if response.usage else None
114
114
  },
115
- "include_overview": include_overview,
116
- "branch": project_info.get("branch", "unknown")
115
+ "include_overview": include_overview
117
116
  }
118
117
  }
119
118
 
@@ -141,10 +140,9 @@ class AskHandler(ClaudeAPIHandler):
141
140
  Formatted prompt string
142
141
  """
143
142
  project_name = project_info["projectName"]
144
- branch = project_info.get("branch", "unknown")
145
143
 
146
144
  if overview.strip():
147
- prompt = f"""Please answer the following question about the codebase "{project_name}" (branch: {branch}).
145
+ prompt = f"""Please answer the following question about the codebase "{project_name}".
148
146
 
149
147
  PROJECT OVERVIEW:
150
148
  {overview}
@@ -154,7 +152,7 @@ QUESTION:
154
152
 
155
153
  Please provide a clear, detailed answer based on the project overview above. If the overview doesn't contain enough information to fully answer the question, please say so and suggest what additional information might be needed."""
156
154
  else:
157
- prompt = f"""Please answer the following question about the codebase "{project_name}" (branch: {branch}).
155
+ prompt = f"""Please answer the following question about the codebase "{project_name}".
158
156
 
159
157
  Note: No project overview is available for this codebase.
160
158
 
@@ -200,7 +198,7 @@ If the project overview is insufficient to answer the question completely, expla
200
198
 
201
199
  output = []
202
200
  output.append(f"Question: {result['question']}")
203
- output.append(f"Project: {result['project_name']} (branch: {metadata['branch']})")
201
+ output.append(f"Project: {result['project_name']}")
204
202
  output.append("")
205
203
  output.append("Answer:")
206
204
  output.append(answer)
@@ -331,7 +331,7 @@ class ClaudeAPIHandler:
331
331
  Get project overview from database.
332
332
 
333
333
  Args:
334
- project_info: Project information dict with projectName, folderPath, branch, etc.
334
+ project_info: Project information dict with projectName, folderPath, etc.
335
335
 
336
336
  Returns:
337
337
  Project overview text or empty string if not found
@@ -345,7 +345,7 @@ class ClaudeAPIHandler:
345
345
  return ""
346
346
 
347
347
  # Get overview for the project using project.id
348
- overview_result = await self.db_manager.get_project_overview(project.id, project_info["branch"])
348
+ overview_result = await self.db_manager.get_project_overview(project.id)
349
349
  if overview_result:
350
350
  return overview_result.overview
351
351
  else:
@@ -0,0 +1,255 @@
1
+ """
2
+ Cleanup Manager for MCP Code Indexer.
3
+
4
+ Handles soft deletion and retention policies for file descriptions
5
+ that are marked for cleanup. Provides periodic cleanup operations
6
+ and manual cleanup methods.
7
+ """
8
+
9
+ import logging
10
+ import time
11
+ from typing import List, Optional
12
+ from pathlib import Path
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class CleanupManager:
    """
    Manages cleanup operations for file descriptions with retention policies.

    Rows are soft-deleted by stamping their ``to_be_cleaned`` column with a
    UNIX timestamp (``NULL`` means active).  Once a stamp is older than the
    retention period, :meth:`perform_cleanup` permanently deletes the row.
    """

    def __init__(self, db_manager, retention_months: int = 6):
        """
        Initialize cleanup manager.

        Args:
            db_manager: DatabaseManager instance providing connections and
                retry-aware transaction execution.
            retention_months: Number of months soft-deleted records are kept
                before becoming eligible for permanent deletion.
        """
        self.db_manager = db_manager
        self.retention_months = retention_months

    def _retention_cutoff(self) -> int:
        """Return the UNIX timestamp before which soft-deleted rows may be purged.

        A month is approximated as 30 days; rows stamped earlier than the
        returned value have exceeded the retention period.
        """
        return int(time.time()) - self.retention_months * 30 * 24 * 60 * 60

    async def mark_file_for_cleanup(self, project_id: str, file_path: str) -> bool:
        """
        Mark a specific file for cleanup by setting its to_be_cleaned timestamp.

        Rows already marked are left untouched so their retention clock does
        not restart.

        Args:
            project_id: Project identifier
            file_path: Path to file to mark for cleanup

        Returns:
            True if an active row was marked, False if no matching active row
        """
        cleanup_timestamp = int(time.time())

        async with self.db_manager.get_write_connection_with_retry("mark_file_for_cleanup") as db:
            cursor = await db.execute(
                """
                UPDATE file_descriptions
                SET to_be_cleaned = ?
                WHERE project_id = ? AND file_path = ? AND to_be_cleaned IS NULL
                """,
                (cleanup_timestamp, project_id, file_path)
            )
            await db.commit()

            # rowcount > 0 means an active row existed and was stamped.
            return cursor.rowcount > 0

    async def mark_files_for_cleanup(self, project_id: str, file_paths: List[str]) -> int:
        """
        Mark multiple files for cleanup in a single batch transaction.

        Args:
            project_id: Project identifier
            file_paths: List of file paths to mark for cleanup

        Returns:
            Number of files actually marked (already-marked or missing paths
            are not counted)
        """
        if not file_paths:
            return 0

        cleanup_timestamp = int(time.time())

        async def batch_operation(conn):
            # executemany accumulates the total affected-row count in rowcount.
            rows = [(cleanup_timestamp, project_id, path) for path in file_paths]
            cursor = await conn.executemany(
                """
                UPDATE file_descriptions
                SET to_be_cleaned = ?
                WHERE project_id = ? AND file_path = ? AND to_be_cleaned IS NULL
                """,
                rows
            )
            return cursor.rowcount

        marked_count = await self.db_manager.execute_transaction_with_retry(
            batch_operation,
            f"mark_files_for_cleanup_{len(file_paths)}_files",
            timeout_seconds=30.0
        )

        logger.info(f"Marked {marked_count} files for cleanup in project {project_id}")
        return marked_count

    async def restore_file_from_cleanup(self, project_id: str, file_path: str) -> bool:
        """
        Restore a file from cleanup by clearing its to_be_cleaned timestamp.

        Args:
            project_id: Project identifier
            file_path: Path to file to restore

        Returns:
            True if a marked row was restored, False if no marked row matched
        """
        async with self.db_manager.get_write_connection_with_retry("restore_file_from_cleanup") as db:
            cursor = await db.execute(
                """
                UPDATE file_descriptions
                SET to_be_cleaned = NULL
                WHERE project_id = ? AND file_path = ? AND to_be_cleaned IS NOT NULL
                """,
                (project_id, file_path)
            )
            await db.commit()

            return cursor.rowcount > 0

    async def get_files_to_be_cleaned(self, project_id: str) -> List[dict]:
        """
        Get the list of files currently marked for cleanup in a project.

        Args:
            project_id: Project identifier

        Returns:
            List of dicts with ``file_path``, ``marked_for_cleanup`` (UNIX
            timestamp) and ``marked_date`` (local-time string), newest first
        """
        async with self.db_manager.get_connection() as db:
            cursor = await db.execute(
                """
                SELECT file_path, to_be_cleaned
                FROM file_descriptions
                WHERE project_id = ? AND to_be_cleaned IS NOT NULL
                ORDER BY to_be_cleaned DESC, file_path
                """,
                (project_id,)
            )
            rows = await cursor.fetchall()

            return [
                {
                    'file_path': row['file_path'],
                    'marked_for_cleanup': row['to_be_cleaned'],
                    'marked_date': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(row['to_be_cleaned']))
                }
                for row in rows
            ]

    async def perform_cleanup(self, project_id: Optional[str] = None) -> int:
        """
        Permanently delete soft-deleted records older than the retention period.

        Args:
            project_id: If given, only clean up this project; otherwise clean
                all projects.

        Returns:
            Number of records permanently deleted
        """
        cutoff_timestamp = self._retention_cutoff()

        async def cleanup_operation(conn):
            # Same DELETE either way; project_id only adds a scoping predicate.
            where_sql = "to_be_cleaned IS NOT NULL AND to_be_cleaned < ?"
            params: tuple = (cutoff_timestamp,)
            if project_id:
                where_sql = "project_id = ? AND " + where_sql
                params = (project_id, cutoff_timestamp)

            cursor = await conn.execute(
                f"DELETE FROM file_descriptions WHERE {where_sql}",
                params
            )
            return cursor.rowcount

        deleted_count = await self.db_manager.execute_transaction_with_retry(
            cleanup_operation,
            f"perform_cleanup_{project_id or 'all_projects'}",
            timeout_seconds=60.0
        )

        if deleted_count > 0:
            scope = f"project {project_id}" if project_id else "all projects"
            logger.info(f"Permanently deleted {deleted_count} old records from {scope}")

        return deleted_count

    async def get_cleanup_stats(self, project_id: Optional[str] = None) -> dict:
        """
        Get statistics about the cleanup state.

        Args:
            project_id: If given, restrict stats to this project; otherwise
                report across all projects.

        Returns:
            Dict with ``active_files``, ``marked_for_cleanup``,
            ``eligible_for_deletion``, ``retention_months`` and ``cutoff_date``
        """
        cutoff_timestamp = self._retention_cutoff()

        # Optional scoping is kept as a prefix fragment so every query below
        # has a syntactically valid WHERE clause.  Gluing an empty base WHERE
        # onto " AND ..." would otherwise produce invalid SQL in the
        # all-projects case.
        if project_id:
            scope_sql = "project_id = ? AND "
            scope_params: tuple = (project_id,)
        else:
            scope_sql = ""
            scope_params = ()

        async with self.db_manager.get_connection() as db:
            # Active files (not marked for cleanup).
            cursor = await db.execute(
                f"SELECT COUNT(*) FROM file_descriptions WHERE {scope_sql}to_be_cleaned IS NULL",
                scope_params
            )
            active_count = (await cursor.fetchone())[0]

            # Files currently marked for cleanup.
            cursor = await db.execute(
                f"SELECT COUNT(*) FROM file_descriptions WHERE {scope_sql}to_be_cleaned IS NOT NULL",
                scope_params
            )
            marked_count = (await cursor.fetchone())[0]

            # Marked files old enough to be permanently deleted.
            cursor = await db.execute(
                f"SELECT COUNT(*) FROM file_descriptions "
                f"WHERE {scope_sql}to_be_cleaned IS NOT NULL AND to_be_cleaned < ?",
                scope_params + (cutoff_timestamp,)
            )
            eligible_for_deletion = (await cursor.fetchone())[0]

            return {
                'active_files': active_count,
                'marked_for_cleanup': marked_count,
                'eligible_for_deletion': eligible_for_deletion,
                'retention_months': self.retention_months,
                'cutoff_date': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(cutoff_timestamp))
            }