gdmcode 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gdmcode-0.1.0.dist-info/METADATA +240 -0
- gdmcode-0.1.0.dist-info/RECORD +131 -0
- gdmcode-0.1.0.dist-info/WHEEL +4 -0
- gdmcode-0.1.0.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/_internal/__init__.py +0 -0
- src/_internal/constants.py +244 -0
- src/_internal/domain_skills.py +339 -0
- src/agent/__init__.py +0 -0
- src/agent/commit_classifier.py +91 -0
- src/agent/context_budget.py +391 -0
- src/agent/daemon.py +681 -0
- src/agent/dag_validator.py +153 -0
- src/agent/debug_loop.py +473 -0
- src/agent/impact_analyzer.py +149 -0
- src/agent/impact_graph.py +117 -0
- src/agent/loop.py +1410 -0
- src/agent/orchestrator.py +141 -0
- src/agent/regression_guard.py +251 -0
- src/agent/review_gate.py +648 -0
- src/agent/risk_scorer.py +169 -0
- src/agent/self_healing.py +145 -0
- src/agent/smart_test_selector.py +89 -0
- src/agent/system_prompt.py +226 -0
- src/agent/task_tracker.py +320 -0
- src/agent/test_validator.py +210 -0
- src/agent/tool_orchestrator.py +402 -0
- src/agent/transcript.py +230 -0
- src/agent/verification_loop.py +133 -0
- src/agent/work_director.py +136 -0
- src/agent/worktree_manager.py +53 -0
- src/artifacts/__init__.py +16 -0
- src/artifacts/artifact_store.py +456 -0
- src/artifacts/verification_graph.py +75 -0
- src/auth.py +411 -0
- src/cli.py +1290 -0
- src/commands.py +1398 -0
- src/config.py +762 -0
- src/cost_tracker.py +348 -0
- src/db/__init__.py +4 -0
- src/db/migrations.py +337 -0
- src/enterprise/__init__.py +3 -0
- src/enterprise/audit_log.py +182 -0
- src/enterprise/identity.py +90 -0
- src/enterprise/rbac.py +100 -0
- src/enterprise/team_config.py +125 -0
- src/enterprise/usage_analytics.py +261 -0
- src/exceptions.py +207 -0
- src/git_workflow.py +651 -0
- src/integrations/__init__.py +6 -0
- src/integrations/github_actions.py +106 -0
- src/integrations/mcp_server.py +333 -0
- src/integrations/sentry_integration.py +100 -0
- src/integrations/sentry_server.py +82 -0
- src/integrations/webhook_security.py +19 -0
- src/main.py +27 -0
- src/memory/__init__.py +0 -0
- src/memory/code_index.py +376 -0
- src/memory/compressor.py +378 -0
- src/memory/context_memory.py +135 -0
- src/memory/continuous_memory.py +234 -0
- src/memory/conventions.py +495 -0
- src/memory/db.py +1119 -0
- src/memory/document_index.py +205 -0
- src/memory/file_cache.py +128 -0
- src/memory/project_scanner.py +178 -0
- src/memory/session_store.py +201 -0
- src/models/__init__.py +0 -0
- src/models/client.py +715 -0
- src/models/definitions.py +459 -0
- src/models/router.py +418 -0
- src/models/schemas.py +389 -0
- src/permissions.py +294 -0
- src/remote/__init__.py +5 -0
- src/remote/command_filter.py +33 -0
- src/remote/models.py +31 -0
- src/remote/permission_handler.py +79 -0
- src/remote/phone_ui.py +48 -0
- src/remote/protocol.py +59 -0
- src/remote/qr.py +65 -0
- src/remote/server.py +586 -0
- src/remote/token_manager.py +61 -0
- src/remote/tunnel.py +212 -0
- src/repl.py +475 -0
- src/runtime/__init__.py +1 -0
- src/runtime/branch_farm.py +372 -0
- src/runtime/replay.py +351 -0
- src/sandbox/__init__.py +2 -0
- src/sandbox/hermetic.py +214 -0
- src/sandbox/policy.py +44 -0
- src/sdk/__init__.py +3 -0
- src/sdk/plugin_base.py +39 -0
- src/sdk/plugin_host.py +100 -0
- src/sdk/plugin_loader.py +101 -0
- src/security.py +409 -0
- src/server/__init__.py +7 -0
- src/server/bridge.py +427 -0
- src/server/bridge_cli.py +103 -0
- src/server/bridge_client.py +170 -0
- src/server/protocol_version.py +103 -0
- src/session/__init__.py +10 -0
- src/session/event_fanout.py +46 -0
- src/session/input_broker.py +38 -0
- src/session/permission_bridge.py +100 -0
- src/tools/__init__.py +160 -0
- src/tools/_atomic.py +72 -0
- src/tools/agent_tools.py +423 -0
- src/tools/ask_user_tool.py +83 -0
- src/tools/bash_tool.py +384 -0
- src/tools/browser_tool.py +352 -0
- src/tools/browser_tools.py +179 -0
- src/tools/dep_tools.py +210 -0
- src/tools/document_reader.py +167 -0
- src/tools/document_tool.py +240 -0
- src/tools/document_writer.py +171 -0
- src/tools/impact_tools.py +240 -0
- src/tools/playwright_tool.py +172 -0
- src/tools/quality_tools.py +366 -0
- src/tools/read_tools.py +318 -0
- src/tools/result_cache.py +157 -0
- src/tools/search_tools.py +310 -0
- src/tools/shell_tools.py +311 -0
- src/tools/write_tools.py +337 -0
- src/voice/__init__.py +25 -0
- src/voice/audio_capture.py +92 -0
- src/voice/audio_playback.py +68 -0
- src/voice/errors.py +14 -0
- src/voice/models.py +35 -0
- src/voice/providers.py +143 -0
- src/voice/vad.py +55 -0
- src/voice/voice_loop.py +156 -0
src/tools/dep_tools.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Dependency guard tools — CVE scanning and version pinning.
|
|
2
|
+
|
|
3
|
+
DependencyGuardTool: runs pip-audit (preferred) or safety (fallback) to detect
|
|
4
|
+
known CVEs before any new package is installed.
|
|
5
|
+
|
|
6
|
+
PinDepsTool: generates a pinned requirements.lock from the live environment,
|
|
7
|
+
filtering development-only packages out by default.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import shutil
|
|
14
|
+
import subprocess
|
|
15
|
+
import tempfile
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, ClassVar
|
|
18
|
+
|
|
19
|
+
from src.tools import REGISTRY, ToolBase, ToolResult
|
|
20
|
+
|
|
21
|
+
__all__ = ["DependencyGuardTool", "PinDepsTool"]
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Wall-clock limits, in seconds, for the external commands this module runs.
_AUDIT_TIMEOUT: int = 60
_PIN_TIMEOUT: int = 30

# Packages that should be stripped from a production lock file
_DEV_PKG_PREFIXES: frozenset[str] = frozenset(
    (
        # test runners, plugins and test-data helpers
        "pytest",
        "pytest-cov",
        "pytest-xdist",
        "coverage",
        "hypothesis",
        "faker",
        # linters, formatters and type checkers
        "ruff",
        "mypy",
        "black",
        "isort",
        "flake8",
        "pylint",
        "bandit",
        "pre-commit",
        # packaging and release tooling
        "build",
        "twine",
        "wheel",
        "setuptools",
        # dependency inspection / auditing
        "pip-audit",
        "safety",
        "pipdeptree",
    )
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# DependencyGuardTool
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
class DependencyGuardTool(ToolBase):
    """Scan Python dependencies for known CVEs before installing packages.

    Runs ``pip-audit`` (preferred) or ``safety check`` (fallback), whichever is
    found on ``PATH``, and returns the scanner's output unchanged.

    Scan target, in priority order: ``requirements_file`` when given, otherwise
    the listed ``packages`` (written to a temporary requirements file),
    otherwise the scanner is run without ``-r``. When ``requirements_file`` is
    given, ``packages`` is ignored.

    **Always call this before pip install <package>.**
    """

    name: ClassVar[str] = "check_deps"
    description: ClassVar[str] = (
        "Scan Python dependencies for known CVEs. "
        "Run before any 'pip install' to catch security vulnerabilities. "
        "Returns vulnerable packages with CVE IDs, or confirms clean."
    )
    input_schema: ClassVar[dict[str, Any]] = {
        "type": "object",
        "properties": {
            "packages": {
                "type": "array",
                "items": {"type": "string"},
                "description": (
                    "Specific packages to check (e.g. ['requests==2.28.0']). "
                    "Omit to scan all installed packages."
                ),
            },
            "requirements_file": {
                "type": "string",
                "description": "Path to a requirements file to scan (optional).",
            },
        },
    }

    # Exit statuses that mean "the scan ran to completion" for each scanner.
    # pip-audit exits 1 when vulnerabilities are found; safety (2.x and later)
    # exits 64 for the same condition and uses 1 for a generic failure.
    _PIP_AUDIT_OK_CODES: ClassVar[tuple[int, ...]] = (0, 1)
    _SAFETY_OK_CODES: ClassVar[tuple[int, ...]] = (0, 64)

    def execute(self, args: dict[str, Any]) -> ToolResult:
        """Run the first available scanner.

        Args:
            args: Tool arguments; reads the optional ``packages`` list and the
                optional ``requirements_file`` path.

        Returns:
            The scanner's result, or a warning in ``output`` (with no error
            set) when neither scanner is installed.
        """
        packages: list[str] = args.get("packages") or []
        req_file: str | None = args.get("requirements_file")

        if shutil.which("pip-audit") is not None:
            return self._run_pip_audit(packages, req_file)
        if shutil.which("safety") is not None:
            return self._run_safety(packages, req_file)

        return ToolResult(output=(
            "⚠ pip-audit and safety are not installed — cannot scan for CVEs.\n"
            "Install: pip install pip-audit\n"
            + (f"Packages requested: {', '.join(packages)}" if packages else "")
        ))

    # ------------------------------------------------------------------
    # pip-audit
    # ------------------------------------------------------------------

    def _run_pip_audit(self, packages: list[str], req_file: str | None) -> ToolResult:
        """Scan with ``pip-audit --format json``."""
        return self._scan(
            ["pip-audit", "--format", "json"],
            packages,
            req_file,
            self._PIP_AUDIT_OK_CODES,
        )

    # ------------------------------------------------------------------
    # safety
    # ------------------------------------------------------------------

    def _run_safety(self, packages: list[str], req_file: str | None) -> ToolResult:
        """Scan with ``safety check --json``."""
        return self._scan(
            ["safety", "check", "--json"],
            packages,
            req_file,
            self._SAFETY_OK_CODES,
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _scan(
        self,
        base_cmd: list[str],
        packages: list[str],
        req_file: str | None,
        ok_codes: tuple[int, ...],
    ) -> ToolResult:
        """Pick the scan target and run *base_cmd* against it."""
        if req_file:
            return self._exec([*base_cmd, "-r", req_file], ok_codes)
        if packages:
            return self._with_temp_req(
                packages,
                lambda tmp: self._exec([*base_cmd, "-r", tmp], ok_codes),
            )
        return self._exec(base_cmd, ok_codes)

    @staticmethod
    def _with_temp_req(
        packages: list[str],
        fn: Any,
    ) -> ToolResult:
        """Write a temporary requirements file, call fn(path), then delete it."""
        fd, tmp = tempfile.mkstemp(suffix=".txt")
        try:
            with os.fdopen(fd, "w") as f:
                f.write("\n".join(packages))
            return fn(tmp)
        finally:
            # Best-effort cleanup: a leftover temp file must not mask the scan result.
            try:
                os.unlink(tmp)
            except OSError:
                pass

    @staticmethod
    def _exec(cmd: list[str], ok_codes: tuple[int, ...] = (0, 1)) -> ToolResult:
        """Run *cmd* and convert the outcome into a ``ToolResult``.

        Args:
            cmd: Argument vector to run (no shell is involved).
            ok_codes: Exit statuses treated as a completed scan. The default
                matches pip-audit, which exits 1 when vulnerabilities are found.

        Returns:
            ``output`` is stdout, falling back to stderr, then ``"(no output)"``.
            ``error`` is empty for an accepted exit status; otherwise it is the
            process's stderr, or a message naming the exit status when stderr
            is blank so that a failure is never reported with an empty error.
        """
        try:
            res = subprocess.run(
                cmd, capture_output=True, text=True, timeout=_AUDIT_TIMEOUT
            )
        except subprocess.TimeoutExpired:
            return ToolResult(output="", error=f"Audit timed out after {_AUDIT_TIMEOUT}s")
        except Exception as exc:  # noqa: BLE001
            return ToolResult(output="", error=str(exc))

        output = res.stdout or res.stderr or "(no output)"
        if res.returncode in ok_codes:
            return ToolResult(output=output, error="")
        if res.stderr.strip():
            error = res.stderr
        else:
            error = f"{cmd[0]} exited with status {res.returncode}"
        return ToolResult(output=output, error=error)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# PinDepsTool
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
class PinDepsTool(ToolBase):
    """Generate a pinned requirements.lock from the current Python environment.

    Uses ``pip freeze`` (the ``pip`` found on ``PATH``) and filters out
    development-only packages by default.
    Call after installing new dependencies before committing changes.
    """

    name: ClassVar[str] = "pin_deps"
    description: ClassVar[str] = (
        "Generate a pinned requirements.lock from the current Python environment. "
        "Filters out dev/test packages. Use after adding new dependencies."
    )
    input_schema: ClassVar[dict[str, Any]] = {
        "type": "object",
        "properties": {
            "output_path": {
                "type": "string",
                "description": "File to write. Defaults to 'requirements.lock'.",
            },
            "exclude_dev": {
                "type": "boolean",
                "description": "Strip dev/test packages from output (default: true).",
            },
        },
    }

    def execute(self, args: dict[str, Any]) -> ToolResult:
        """Run ``pip freeze`` and write the (optionally filtered) result.

        Args:
            args: Tool arguments; reads the optional ``output_path`` and the
                optional ``exclude_dev`` flag (default ``True``).

        Returns:
            A summary naming the number of lines written and the target file,
            or a result whose ``error`` describes the failure.
        """
        out_path = Path(args.get("output_path") or "requirements.lock")
        exclude_dev: bool = args.get("exclude_dev", True)

        try:
            res = subprocess.run(
                ["pip", "freeze"], capture_output=True, text=True, timeout=_PIN_TIMEOUT
            )
            if res.returncode != 0:
                # Never report a failure with an empty error message.
                error = res.stderr or f"pip freeze exited with status {res.returncode}"
                return ToolResult(output="", error=error)

            lines = res.stdout.splitlines()
            if exclude_dev:
                lines = [ln for ln in lines if not self._is_dev_requirement(ln)]

            out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
            return ToolResult(output=f"Pinned {len(lines)} packages → {out_path}")
        except subprocess.TimeoutExpired:
            return ToolResult(output="", error=f"pip freeze timed out after {_PIN_TIMEOUT}s")
        except Exception as exc:  # noqa: BLE001
            return ToolResult(output="", error=str(exc))

    @staticmethod
    def _is_dev_requirement(line: str) -> bool:
        """Return True when a ``pip freeze`` line names a development-only package.

        The project name is the leading run of name characters (everything
        before ``==``, `` @ `` and similar), lower-cased with ``_`` and ``.``
        folded to ``-``. It matches a dev entry when it equals the entry or
        extends it with a ``-`` suffix, so ``pytest`` and ``pytest-asyncio``
        are dropped while ``ruffus``, ``blacksheep`` or ``buildbot`` are kept;
        a raw string-prefix test would drop those as well.
        """
        end = 0
        while end < len(line) and (line[end].isalnum() or line[end] in "-_."):
            end += 1
        name = line[:end].lower().replace("_", "-").replace(".", "-")
        if not name:
            # Comment lines and anything else without a leading name are kept.
            return False
        for dev in _DEV_PKG_PREFIXES:
            dev_name = dev.lower()
            if name == dev_name or name.startswith(dev_name + "-"):
                return True
        return False
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# Self-register
|
|
209
|
+
# Self-register: one instance of each tool, in declaration order, at import time.
for _tool_cls in (DependencyGuardTool, PinDepsTool):
    REGISTRY.register(_tool_cls())
del _tool_cls  # keep the module namespace free of the loop variable
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""DocumentReader — unified reader for Word, Excel, PDF, and CSV files.
|
|
2
|
+
|
|
3
|
+
All document library imports are guarded with try/except so the module loads
|
|
4
|
+
even when optional deps are not installed. Install with:
|
|
5
|
+
pip install 'gdm-code[docs]'
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
import csv, io, logging
|
|
12
|
+
|
|
13
|
+
log = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
__all__ = ["DocumentReader", "DocumentContent", "SheetData"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class SheetData:
    """One worksheet (or CSV file) as a header row plus data rows."""

    name: str
    headers: list[str]
    rows: list[list[str]]  # all values coerced to str

    def to_text(self) -> str:
        """Render the sheet as tab-separated lines, header line first."""
        table = (self.headers, *self.rows)
        return "\n".join("\t".join(cells) for cells in table)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class DocumentContent:
    """Result of reading one document: extracted text plus optional metadata."""

    file_path: str
    format: str  # "docx" | "xlsx" | "xls" | "pdf" | "csv" | "txt"
    text: str  # full plain-text representation
    title: Optional[str] = None
    author: Optional[str] = None
    page_count: Optional[int] = None
    # non-empty for spreadsheets
    sheets: list[SheetData] = field(default_factory=list)
    # set if partial parse failure
    error: Optional[str] = None

    @property
    def success(self) -> bool:
        """True when no error was recorded for this read."""
        failed = self.error is not None
        return not failed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DocumentReader:
    """Read Word, Excel, PDF, CSV and plain-text files into ``DocumentContent``.

    The parser libraries (python-docx, openpyxl, xlrd, pdfplumber) are imported
    inside the individual readers, so the class is usable even when some of
    them are not installed; only the affected format fails.
    """

    def read(self, path: Path | str) -> DocumentContent:
        """Read *path*, choosing the parser from its lower-cased extension.

        Never raises for an unsupported extension or a parser failure: both
        are reported through ``DocumentContent.error`` with empty ``text``.
        """
        path = Path(path)
        suffix = path.suffix.lower().lstrip(".")
        dispatch = {
            "docx": self._read_docx,
            "xlsx": self._read_xlsx,
            "xls": self._read_xls,
            "pdf": self._read_pdf,
            "csv": self._read_csv,
            "txt": self._read_text,
            "md": self._read_text,
        }
        reader = dispatch.get(suffix)
        if reader is None:
            return DocumentContent(
                file_path=str(path), format=suffix, text="",
                error=f"Unsupported format: .{suffix}"
            )
        try:
            return reader(path)
        except Exception as exc:
            log.warning("Document read failed for %s: %s", path, exc)
            return DocumentContent(
                file_path=str(path), format=suffix, text="",
                error=str(exc)
            )

    @staticmethod
    def _sheet_from_rows(name: str, rows: list[list[str]]) -> SheetData:
        """Build a ``SheetData`` using the first row as headers, the rest as data."""
        headers = rows[0] if rows else []
        return SheetData(name=name, headers=headers, rows=rows[1:])

    def _read_docx(self, path: Path) -> DocumentContent:
        """Extract non-blank paragraphs, then table rows, from a .docx file."""
        try:
            import docx
        except ImportError as exc:
            raise ImportError("python-docx required. pip install 'gdm-code[docs]'") from exc
        doc = docx.Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        # Table rows are appended after all paragraphs, one tab-separated line each.
        for table in doc.tables:
            for row in table.rows:
                paragraphs.append("\t".join(c.text for c in row.cells))
        props = doc.core_properties
        return DocumentContent(
            file_path=str(path), format="docx",
            text="\n".join(paragraphs),
            title=props.title or None,
            author=props.author or None,
        )

    def _read_xlsx(self, path: Path) -> DocumentContent:
        """Read every worksheet of an .xlsx file (cached values, not formulas)."""
        try:
            import openpyxl
        except ImportError as exc:
            raise ImportError("openpyxl required. pip install 'gdm-code[docs]'") from exc
        wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
        sheets, all_text = [], []
        try:
            for ws in wb.worksheets:
                rows = [["" if c.value is None else str(c.value) for c in row]
                        for row in ws.iter_rows()]
                sd = self._sheet_from_rows(ws.title, rows)
                sheets.append(sd)
                all_text.append(f"[Sheet: {ws.title}]\n{sd.to_text()}")
        finally:
            # Close the workbook even when iterating a sheet fails.
            wb.close()
        return DocumentContent(
            file_path=str(path), format="xlsx",
            text="\n\n".join(all_text), sheets=sheets,
        )

    def _read_xls(self, path: Path) -> DocumentContent:
        """Read every worksheet of a legacy .xls file."""
        try:
            import xlrd
        except ImportError as exc:
            raise ImportError("xlrd required. pip install 'gdm-code[docs]'") from exc
        wb = xlrd.open_workbook(str(path))
        sheets, all_text = [], []
        for ws in wb.sheets():
            rows = [[str(ws.cell_value(r, c)) for c in range(ws.ncols)]
                    for r in range(ws.nrows)]
            sd = self._sheet_from_rows(ws.name, rows)
            sheets.append(sd)
            all_text.append(f"[Sheet: {ws.name}]\n{sd.to_text()}")
        return DocumentContent(
            file_path=str(path), format="xls",
            text="\n\n".join(all_text), sheets=sheets,
        )

    def _read_pdf(self, path: Path) -> DocumentContent:
        """Extract text page by page; pages with no text are skipped."""
        try:
            import pdfplumber
        except ImportError as exc:
            raise ImportError("pdfplumber required. pip install 'gdm-code[docs]'") from exc
        pages_text = []
        with pdfplumber.open(str(path)) as pdf:
            # page_count counts every page, including those that yield no text.
            page_count = len(pdf.pages)
            for page in pdf.pages:
                t = page.extract_text() or ""
                if t.strip():
                    pages_text.append(t)
        return DocumentContent(
            file_path=str(path), format="pdf",
            text="\n\n".join(pages_text),
            page_count=page_count,
        )

    def _read_csv(self, path: Path) -> DocumentContent:
        """Parse a CSV file into a single sheet named ``Sheet1``."""
        # utf-8-sig drops a leading BOM; undecodable bytes are replaced, not fatal.
        text = path.read_text(encoding="utf-8-sig", errors="replace")
        rows = list(csv.reader(io.StringIO(text)))
        sd = self._sheet_from_rows("Sheet1", rows)
        return DocumentContent(
            file_path=str(path), format="csv",
            text=sd.to_text(), sheets=[sd],
        )

    def _read_text(self, path: Path) -> DocumentContent:
        """Read a plain-text file (.txt / .md) as UTF-8, replacing bad bytes."""
        text = path.read_text(encoding="utf-8", errors="replace")
        return DocumentContent(
            # Lower-cased so the reported format agrees with the suffix ``read`` dispatches on.
            file_path=str(path), format=path.suffix.lower().lstrip("."), text=text,
        )
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""Document tools — LLM-callable wrappers for document read/write/index/search.
|
|
2
|
+
|
|
3
|
+
Registered tools:
|
|
4
|
+
- read_document : read Word/Excel/PDF/CSV → text
|
|
5
|
+
- generate_document: create docx or xlsx from spec
|
|
6
|
+
- index_document : index a document into the FTS search index
|
|
7
|
+
- search_documents: full-text search across indexed documents
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
import json, logging
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, ClassVar
|
|
13
|
+
|
|
14
|
+
from src.tools import REGISTRY, ToolBase, ToolResult
|
|
15
|
+
from src.tools.document_reader import DocumentReader
|
|
16
|
+
from src.tools.document_writer import DocumentWriter, DocxSpec, XlsxSpec
|
|
17
|
+
|
|
18
|
+
log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"is_document_path",
|
|
22
|
+
"read_document_tool",
|
|
23
|
+
"generate_document_tool",
|
|
24
|
+
"DOCUMENT_EXTENSIONS",
|
|
25
|
+
"READ_DOCUMENT_SCHEMA",
|
|
26
|
+
"GENERATE_DOCUMENT_SCHEMA",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
# File extensions (lower-case, without the dot) treated as documents.
DOCUMENT_EXTENSIONS = frozenset(("docx", "xlsx", "xls", "pdf", "csv"))


def is_document_path(path: str) -> bool:
    """Return True when *path* has one of ``DOCUMENT_EXTENSIONS`` (case-insensitive)."""
    extension = Path(path).suffix.lower().lstrip(".")
    return extension in DOCUMENT_EXTENSIONS
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def read_document_tool(path: str, include_sheet_data: bool = False) -> str:
    """Read a document and return its content as a string.

    Returns ``"Error reading <path>: <reason>"`` when the read fails. When
    *include_sheet_data* is true and the document has sheets, returns JSON with
    ``text`` and per-sheet ``name``/``headers``/``rows``. Otherwise returns the
    text, preceded by any available Title/Author/Pages lines.
    """
    doc = DocumentReader().read(Path(path))
    if not doc.success:
        return f"Error reading {path}: {doc.error}"

    if include_sheet_data and doc.sheets:
        payload = {
            "text": doc.text,
            "sheets": [
                {"name": sheet.name, "headers": sheet.headers, "rows": sheet.rows}
                for sheet in doc.sheets
            ],
        }
        return json.dumps(payload, indent=2)

    # (label, value, include?) — title/author are skipped when empty,
    # the page count only when it is unknown.
    candidates = (
        ("Title", doc.title, bool(doc.title)),
        ("Author", doc.author, bool(doc.author)),
        ("Pages", doc.page_count, doc.page_count is not None),
    )
    header = "\n".join(
        f"{label}: {value}" for label, value, present in candidates if present
    )
    if not header:
        return doc.text
    return f"{header}\n\n{doc.text}".strip()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def generate_document_tool(format: str, output_path: str, spec: dict) -> str:
    """Create a docx or xlsx from a spec dict. Never raises.

    Args:
        format: ``"docx"`` or ``"xlsx"``.
        output_path: File path to write the document to.
        spec: For docx, reads ``title``, ``author`` and ``sections``; for
            xlsx, reads ``sheets``. Missing keys default to empty values.

    Returns:
        ``"Created <path> (<n> bytes)"`` on success, otherwise a message
        starting with ``"Error"``.
    """
    # Validate the cheap inputs first so bad arguments never reach the writer.
    if format not in ("docx", "xlsx"):
        return f"Error: unsupported format '{format}'. Use 'docx' or 'xlsx'."
    if not isinstance(spec, dict):
        return (
            f"Error generating {format}: spec must be an object, "
            f"got {type(spec).__name__}"
        )

    try:
        writer = DocumentWriter()
        out = Path(output_path)
        if format == "docx":
            doc_spec = DocxSpec(
                title=spec.get("title", ""),
                author=spec.get("author", ""),
                sections=spec.get("sections", []),
            )
            result = writer.create_docx(doc_spec, out)
        else:
            xl_spec = XlsxSpec(sheets=spec.get("sheets", []))
            result = writer.create_xlsx(xl_spec, out)
    except Exception as exc:  # noqa: BLE001 - tool boundary: the contract is "never raises"
        log.warning("Document generation failed for %s: %s", output_path, exc)
        return f"Error generating {format}: {exc}"

    if not result.success:
        return f"Error generating {format}: {result.error}"
    return f"Created {result.path} ({result.bytes_written:,} bytes)"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def index_document_tool(path: str) -> str:
    """Index a document for search and describe the outcome.

    Returns an ``"Error: ..."`` message when the index module cannot be
    imported or the index reports a negative count, the chunk count when
    chunks were indexed, and an "already up to date" note for a count of zero.
    """
    try:
        from src.memory.document_index import DocumentIndex
    except ImportError:
        return "Error: document_index module not available."

    chunk_count = DocumentIndex().index_document(path)
    if chunk_count < 0:
        return f"Error: could not index {path} (file not found or unreadable)"
    if chunk_count:
        return f"Indexed {path}: {chunk_count} chunks"
    return f"{path} already up to date"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def search_documents_tool(query: str, limit: int = 10) -> str:
    """Search all indexed documents for *query*.

    Returns one ``[file — location]`` header plus snippet per hit, separated by
    blank lines; a "No results" message when nothing matches; or an
    ``"Error: ..."`` message when the index module cannot be imported.
    """
    try:
        from src.memory.document_index import DocumentIndex
    except ImportError:
        return "Error: document_index module not available."

    hits = DocumentIndex().search(query, limit=limit)
    if not hits:
        return f"No results for '{query}'"
    return "\n\n".join(
        f"[{hit.file_path} — {hit.source_label}]\n{hit.snippet}" for hit in hits
    )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
# LLM function-call schemas
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
# Function-call schema for the ``read_document`` tool; only ``path`` is required.
READ_DOCUMENT_SCHEMA = {
    "name": "read_document",
    "description": (
        "Read a Word (.docx), Excel (.xlsx/.xls), PDF, or CSV file "
        "and return its text content."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "Absolute or relative path to the document",
            },
            "include_sheet_data": {
                "type": "boolean",
                "description": (
                    "For spreadsheets: return JSON with per-sheet structured data"
                ),
                "default": False,
            },
        },
        "required": ["path"],
    },
}
|
|
126
|
+
|
|
127
|
+
# Function-call schema for the ``generate_document`` tool; all three fields are required.
GENERATE_DOCUMENT_SCHEMA = {
    "name": "generate_document",
    "description": (
        "Create a new Word (.docx) or Excel (.xlsx) document "
        "from structured content."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "format": {"type": "string", "enum": ["docx", "xlsx"]},
            "output_path": {
                "type": "string",
                "description": "File path to write the document to",
            },
            "spec": {
                "type": "object",
                "description": (
                    "Document spec. "
                    "For docx: {title, sections:[{heading, paragraphs, table}]}. "
                    "For xlsx: {sheets:[{name, headers, rows}]}"
                ),
            },
        },
        "required": ["format", "output_path", "spec"],
    },
}
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
# ToolBase subclasses
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
class ReadDocumentTool(ToolBase):
    """Read a Word, Excel, PDF, or CSV file."""

    name: ClassVar[str] = "read_document"
    description: ClassVar[str] = (
        "Read a Word (.docx), Excel (.xlsx/.xls), PDF, or CSV file and return its text content. "
        "For spreadsheets returns tab-separated table text per sheet."
    )
    input_schema: ClassVar[dict[str, Any]] = READ_DOCUMENT_SCHEMA["parameters"]

    def execute(self, params: dict[str, Any]) -> ToolResult:
        """Read ``params["path"]`` and wrap the text in a ``ToolResult``.

        Args:
            params: Requires ``path``; ``include_sheet_data`` is optional and
                defaults to ``False``.

        Returns:
            The document text with ``metadata={"path": path}``, or a result
            whose ``error`` carries the reader's failure message.
        """
        path = params["path"]
        include_sheet_data = params.get("include_sheet_data", False)
        text = read_document_tool(path, include_sheet_data=include_sheet_data)
        # read_document_tool reports failure as "Error reading <path>: <reason>".
        # Matching that exact prefix, rather than a bare "Error", keeps a
        # document whose own text starts with "Error" from being
        # misreported as a failed read.
        if text.startswith(f"Error reading {path}: "):
            return ToolResult(output="", error=text)
        return ToolResult(output=text, metadata={"path": path})
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class GenerateDocumentTool(ToolBase):
    """Create a Word or Excel document from structured content."""

    name: ClassVar[str] = "generate_document"
    description: ClassVar[str] = (
        "Create a new Word (.docx) or Excel (.xlsx) document from structured content. "
        "Returns the output file path and byte count."
    )
    input_schema: ClassVar[dict[str, Any]] = GENERATE_DOCUMENT_SCHEMA["parameters"]

    def execute(self, params: dict[str, Any]) -> ToolResult:
        """Generate the document described by *params*.

        Requires ``format`` and ``output_path``; ``spec`` defaults to an empty
        dict. A message starting with "Error" is returned as ``error``, any
        other message as ``output``.
        """
        message = generate_document_tool(
            params["format"],
            params["output_path"],
            params.get("spec", {}),
        )
        failed = message.startswith("Error")
        if failed:
            return ToolResult(output="", error=message)
        return ToolResult(output=message)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class IndexDocumentTool(ToolBase):
    """Index a document for full-text search."""

    name: ClassVar[str] = "index_document"
    description: ClassVar[str] = (
        "Index a document (Word, Excel, PDF, CSV) into the local search index. "
        "Run this before searching for document content."
    )
    input_schema: ClassVar[dict[str, Any]] = {
        "type": "object",
        "properties": {
            "path": {"type": "string", "description": "Path to the document to index"},
        },
        "required": ["path"],
    }

    def execute(self, params: dict[str, Any]) -> ToolResult:
        """Index ``params["path"]`` and report the outcome.

        Returns:
            The indexing summary as ``output``, or a result whose ``error``
            carries the failure message.
        """
        path = params["path"]
        result = index_document_tool(path)
        # Failure messages start with "Error: ". The "already up to date"
        # message starts with the path itself, so it is excluded explicitly:
        # a file such as "Errors.pdf" must not be reported as a failure.
        up_to_date = result == f"{path} already up to date"
        if result.startswith("Error: ") and not up_to_date:
            return ToolResult(output="", error=result)
        return ToolResult(output=result)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class SearchDocumentsTool(ToolBase):
    """Full-text search across all indexed documents."""

    name: ClassVar[str] = "search_documents"
    description: ClassVar[str] = (
        "Search all indexed documents for a query. "
        "Returns ranked passages with source file and location."
    )
    input_schema: ClassVar[dict[str, Any]] = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "Search query"},
            "limit": {"type": "integer", "description": "Max results (default 10)", "default": 10},
        },
        "required": ["query"],
    }

    def execute(self, params: dict[str, Any]) -> ToolResult:
        """Search for ``params["query"]`` (at most ``limit`` results, default 10).

        Returns:
            The formatted passages, or the "No results" message, as
            ``output``; a result with ``error`` set when the search helper
            reports an ``"Error: ..."`` message.
        """
        result = search_documents_tool(params["query"], limit=params.get("limit", 10))
        # Successful results start with "[" or "No results", so this prefix
        # only matches the helper's own failure message. Reporting it via
        # ``error`` matches how the other document tools signal failure.
        if result.startswith("Error: "):
            return ToolResult(output="", error=result)
        return ToolResult(output=result)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
# Auto-register
|
|
235
|
+
# ---------------------------------------------------------------------------
|
|
236
|
+
|
|
237
|
+
# Register one instance of each document tool, in declaration order, at import time.
for _tool_cls in (
    ReadDocumentTool,
    GenerateDocumentTool,
    IndexDocumentTool,
    SearchDocumentsTool,
):
    REGISTRY.register(_tool_cls())
del _tool_cls  # keep the module namespace free of the loop variable
|