proseforge-agent 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proseforge_agent/__init__.py +10 -0
- proseforge_agent/__main__.py +8 -0
- proseforge_agent/_bootstrap.py +108 -0
- proseforge_agent/agent/__init__.py +123 -0
- proseforge_agent/agent/artifacts.py +160 -0
- proseforge_agent/agent/attachments.py +282 -0
- proseforge_agent/agent/audit.py +198 -0
- proseforge_agent/agent/context_window.py +90 -0
- proseforge_agent/agent/control.py +45 -0
- proseforge_agent/agent/degradation.py +166 -0
- proseforge_agent/agent/eval.py +237 -0
- proseforge_agent/agent/events.py +209 -0
- proseforge_agent/agent/execution_guard.py +219 -0
- proseforge_agent/agent/function_calling.py +255 -0
- proseforge_agent/agent/intent_router.py +152 -0
- proseforge_agent/agent/kernel.py +514 -0
- proseforge_agent/agent/loop.py +301 -0
- proseforge_agent/agent/middleware.py +241 -0
- proseforge_agent/agent/modes.py +26 -0
- proseforge_agent/agent/observability.py +385 -0
- proseforge_agent/agent/offline.py +77 -0
- proseforge_agent/agent/permissions.py +101 -0
- proseforge_agent/agent/planner.py +166 -0
- proseforge_agent/agent/profiles.py +122 -0
- proseforge_agent/agent/prompt_templates.py +230 -0
- proseforge_agent/agent/provider_fallback.py +135 -0
- proseforge_agent/agent/reflection.py +136 -0
- proseforge_agent/agent/request_cache.py +186 -0
- proseforge_agent/agent/safety.py +146 -0
- proseforge_agent/agent/sandbox.py +205 -0
- proseforge_agent/agent/structured_output.py +236 -0
- proseforge_agent/agent/subagent.py +111 -0
- proseforge_agent/agent/tools.py +391 -0
- proseforge_agent/agent/types.py +60 -0
- proseforge_agent/capabilities.py +194 -0
- proseforge_agent/chapter/__init__.py +67 -0
- proseforge_agent/chapter/accept.py +81 -0
- proseforge_agent/chapter/context.py +113 -0
- proseforge_agent/chapter/draft.py +137 -0
- proseforge_agent/chapter/lifecycle.py +293 -0
- proseforge_agent/chapter/review.py +162 -0
- proseforge_agent/chapter/rewrite.py +187 -0
- proseforge_agent/chat/__init__.py +60 -0
- proseforge_agent/chat/context.py +66 -0
- proseforge_agent/chat/handoff.py +111 -0
- proseforge_agent/chat/memory.py +115 -0
- proseforge_agent/chat/prompts.py +129 -0
- proseforge_agent/chat/repl.py +207 -0
- proseforge_agent/chat/retrieval.py +127 -0
- proseforge_agent/chat/session.py +934 -0
- proseforge_agent/chat/slash.py +134 -0
- proseforge_agent/chat/system_prompts.py +221 -0
- proseforge_agent/chat/transcript.py +39 -0
- proseforge_agent/cli.py +6746 -0
- proseforge_agent/concurrency.py +156 -0
- proseforge_agent/config.py +138 -0
- proseforge_agent/cron/__init__.py +21 -0
- proseforge_agent/cron/core.py +168 -0
- proseforge_agent/daily/__init__.py +17 -0
- proseforge_agent/daily/recommend.py +84 -0
- proseforge_agent/daily/workbook.py +169 -0
- proseforge_agent/demo.py +281 -0
- proseforge_agent/dotenv.py +126 -0
- proseforge_agent/environments/__init__.py +40 -0
- proseforge_agent/environments/base.py +124 -0
- proseforge_agent/environments/checkpoints.py +45 -0
- proseforge_agent/environments/daytona.py +21 -0
- proseforge_agent/environments/docker.py +64 -0
- proseforge_agent/environments/file_sync.py +60 -0
- proseforge_agent/environments/local.py +58 -0
- proseforge_agent/environments/modal.py +21 -0
- proseforge_agent/environments/process_registry.py +151 -0
- proseforge_agent/environments/process_runner.py +99 -0
- proseforge_agent/environments/serverless.py +46 -0
- proseforge_agent/environments/singularity.py +45 -0
- proseforge_agent/environments/ssh.py +47 -0
- proseforge_agent/errors.py +44 -0
- proseforge_agent/eval/__init__.py +17 -0
- proseforge_agent/eval/trajectories.py +140 -0
- proseforge_agent/extensions/__init__.py +24 -0
- proseforge_agent/extensions/base.py +114 -0
- proseforge_agent/extensions/registry.py +100 -0
- proseforge_agent/gateway/__init__.py +30 -0
- proseforge_agent/gateway/core.py +139 -0
- proseforge_agent/gateway/delivery.py +117 -0
- proseforge_agent/gateway/media.py +99 -0
- proseforge_agent/gateway/platforms/__init__.py +32 -0
- proseforge_agent/gateway/platforms/base.py +164 -0
- proseforge_agent/gateway/platforms/discord.py +82 -0
- proseforge_agent/gateway/platforms/email.py +33 -0
- proseforge_agent/gateway/platforms/mobile_email.py +65 -0
- proseforge_agent/gateway/platforms/signal.py +31 -0
- proseforge_agent/gateway/platforms/slack.py +81 -0
- proseforge_agent/gateway/platforms/telegram.py +199 -0
- proseforge_agent/gateway/platforms/telegram_transport.py +119 -0
- proseforge_agent/gateway/platforms/whatsapp.py +31 -0
- proseforge_agent/gateway/poller.py +76 -0
- proseforge_agent/gateway/relay/__init__.py +5 -0
- proseforge_agent/gateway/relay/auth.py +155 -0
- proseforge_agent/install/__init__.py +60 -0
- proseforge_agent/install/app_dirs.py +97 -0
- proseforge_agent/install/auto_trigger.py +118 -0
- proseforge_agent/install/binary_build.py +147 -0
- proseforge_agent/install/binary_packaging.py +63 -0
- proseforge_agent/install/ci_matrix.py +98 -0
- proseforge_agent/install/docker_plan.py +53 -0
- proseforge_agent/install/doctor.py +263 -0
- proseforge_agent/install/first_run.py +87 -0
- proseforge_agent/install/installer_scripts.py +195 -0
- proseforge_agent/install/installers.py +217 -0
- proseforge_agent/install/linux.py +70 -0
- proseforge_agent/install/local_models.py +98 -0
- proseforge_agent/install/macos.py +66 -0
- proseforge_agent/install/migrations.py +98 -0
- proseforge_agent/install/package_checks.py +96 -0
- proseforge_agent/install/platform_io.py +74 -0
- proseforge_agent/install/provider_setup.py +73 -0
- proseforge_agent/install/qa_matrix.py +125 -0
- proseforge_agent/install/secrets.py +81 -0
- proseforge_agent/install/shell.py +64 -0
- proseforge_agent/install/support_bundle.py +152 -0
- proseforge_agent/install/uninstall.py +60 -0
- proseforge_agent/install/windows.py +81 -0
- proseforge_agent/llm/__init__.py +55 -0
- proseforge_agent/llm/base.py +100 -0
- proseforge_agent/llm/capabilities.py +181 -0
- proseforge_agent/llm/certification.py +167 -0
- proseforge_agent/llm/docs_refresh.py +83 -0
- proseforge_agent/llm/fake.py +62 -0
- proseforge_agent/llm/http.py +121 -0
- proseforge_agent/llm/openai_compatible.py +158 -0
- proseforge_agent/llm/policies.py +69 -0
- proseforge_agent/llm/probes.py +163 -0
- proseforge_agent/llm/profiles.py +108 -0
- proseforge_agent/llm/providers/__init__.py +6 -0
- proseforge_agent/llm/providers/_openai_shape.py +130 -0
- proseforge_agent/llm/providers/anthropic.py +230 -0
- proseforge_agent/llm/providers/deepseek.py +264 -0
- proseforge_agent/llm/providers/doubao.py +258 -0
- proseforge_agent/llm/providers/gemini.py +303 -0
- proseforge_agent/llm/providers/glm.py +245 -0
- proseforge_agent/llm/providers/grok.py +255 -0
- proseforge_agent/llm/providers/mimo.py +254 -0
- proseforge_agent/llm/providers/minimax.py +262 -0
- proseforge_agent/llm/providers/openai.py +261 -0
- proseforge_agent/llm/providers/profiles/anthropic.yaml +21 -0
- proseforge_agent/llm/providers/profiles/deepseek.yaml +22 -0
- proseforge_agent/llm/providers/profiles/doubao.yaml +37 -0
- proseforge_agent/llm/providers/profiles/gemini.yaml +36 -0
- proseforge_agent/llm/providers/profiles/glm.yaml +36 -0
- proseforge_agent/llm/providers/profiles/mimo.yaml +23 -0
- proseforge_agent/llm/providers/profiles/minimax.yaml +22 -0
- proseforge_agent/llm/providers/profiles/openai.yaml +36 -0
- proseforge_agent/llm/providers/profiles/qwen.yaml +22 -0
- proseforge_agent/llm/providers/profiles/xai.yaml +21 -0
- proseforge_agent/llm/providers/qwen.py +270 -0
- proseforge_agent/llm/registry.py +230 -0
- proseforge_agent/llm/router.py +329 -0
- proseforge_agent/llm/streaming.py +81 -0
- proseforge_agent/llm/usage.py +261 -0
- proseforge_agent/mcp/__init__.py +50 -0
- proseforge_agent/mcp/approval.py +180 -0
- proseforge_agent/mcp/client.py +600 -0
- proseforge_agent/mcp/credentials.py +44 -0
- proseforge_agent/mcp/policy.py +146 -0
- proseforge_agent/mcp/registry.py +145 -0
- proseforge_agent/mcp/schema.py +179 -0
- proseforge_agent/memory/__init__.py +18 -0
- proseforge_agent/memory/compact.py +87 -0
- proseforge_agent/memory/ingest.py +79 -0
- proseforge_agent/memory/nudges.py +61 -0
- proseforge_agent/memory/review.py +66 -0
- proseforge_agent/memory/schema.py +62 -0
- proseforge_agent/memory/store.py +254 -0
- proseforge_agent/memory/user_model.py +125 -0
- proseforge_agent/notifications/__init__.py +24 -0
- proseforge_agent/notifications/core.py +98 -0
- proseforge_agent/notifications/desktop.py +55 -0
- proseforge_agent/notifications/jobs.py +171 -0
- proseforge_agent/notifications/webhook.py +97 -0
- proseforge_agent/novel/__init__.py +141 -0
- proseforge_agent/novel/approval_queue.py +153 -0
- proseforge_agent/novel/artifacts.py +107 -0
- proseforge_agent/novel/backup_verification.py +197 -0
- proseforge_agent/novel/bible.py +102 -0
- proseforge_agent/novel/character_arcs.py +153 -0
- proseforge_agent/novel/continuity.py +160 -0
- proseforge_agent/novel/draft_versioning.py +242 -0
- proseforge_agent/novel/editorial_pipeline.py +200 -0
- proseforge_agent/novel/exporter.py +160 -0
- proseforge_agent/novel/foreshadowing.py +147 -0
- proseforge_agent/novel/importer.py +241 -0
- proseforge_agent/novel/literary_regression.py +154 -0
- proseforge_agent/novel/manifest.py +110 -0
- proseforge_agent/novel/manuscript_search.py +105 -0
- proseforge_agent/novel/plot_threads.py +124 -0
- proseforge_agent/novel/project_health.py +156 -0
- proseforge_agent/novel/publishing.py +81 -0
- proseforge_agent/novel/reader_review.py +276 -0
- proseforge_agent/novel/relationship_graph.py +138 -0
- proseforge_agent/novel/reorganize.py +149 -0
- proseforge_agent/novel/rewrite_strategies.py +153 -0
- proseforge_agent/novel/safety.py +84 -0
- proseforge_agent/novel/scenes.py +215 -0
- proseforge_agent/novel/storage.py +69 -0
- proseforge_agent/novel/style_profile.py +184 -0
- proseforge_agent/novel/timeline.py +186 -0
- proseforge_agent/novel/writing_analytics.py +165 -0
- proseforge_agent/novel/writing_quality.py +213 -0
- proseforge_agent/novel/writing_rules.py +119 -0
- proseforge_agent/planning/__init__.py +26 -0
- proseforge_agent/planning/intake.py +69 -0
- proseforge_agent/planning/phase_plan.py +210 -0
- proseforge_agent/plugins/__init__.py +41 -0
- proseforge_agent/plugins/dependencies.py +143 -0
- proseforge_agent/plugins/discovery.py +91 -0
- proseforge_agent/plugins/harness.py +151 -0
- proseforge_agent/plugins/hooks.py +120 -0
- proseforge_agent/plugins/manager.py +118 -0
- proseforge_agent/plugins/manifest.py +72 -0
- proseforge_agent/plugins/permissions.py +60 -0
- proseforge_agent/plugins/sandbox.py +109 -0
- proseforge_agent/proseforge/__init__.py +11 -0
- proseforge_agent/proseforge/adapter.py +158 -0
- proseforge_agent/proseforge/results.py +45 -0
- proseforge_agent/release/__init__.py +151 -0
- proseforge_agent/release/complete_agent_gate.py +98 -0
- proseforge_agent/release/publish.py +227 -0
- proseforge_agent/release/version_policy.py +98 -0
- proseforge_agent/reports/__init__.py +29 -0
- proseforge_agent/reports/registry.py +68 -0
- proseforge_agent/reports/render.py +100 -0
- proseforge_agent/retrieval/__init__.py +45 -0
- proseforge_agent/retrieval/embeddings.py +70 -0
- proseforge_agent/retrieval/evaluation.py +96 -0
- proseforge_agent/retrieval/evidence.py +207 -0
- proseforge_agent/retrieval/hybrid.py +141 -0
- proseforge_agent/retrieval/index.py +65 -0
- proseforge_agent/retrieval/ingestion.py +216 -0
- proseforge_agent/retrieval/router.py +82 -0
- proseforge_agent/retrieval/vector_store.py +199 -0
- proseforge_agent/service/__init__.py +6 -0
- proseforge_agent/service/api.py +111 -0
- proseforge_agent/service/http_server.py +163 -0
- proseforge_agent/setup/__init__.py +27 -0
- proseforge_agent/setup/config_generator.py +271 -0
- proseforge_agent/setup/first_run.py +106 -0
- proseforge_agent/setup/modes.py +144 -0
- proseforge_agent/setup/recovery.py +45 -0
- proseforge_agent/setup/summary.py +25 -0
- proseforge_agent/setup/wizard.py +212 -0
- proseforge_agent/skills/__init__.py +29 -0
- proseforge_agent/skills/audit.py +45 -0
- proseforge_agent/skills/creation.py +159 -0
- proseforge_agent/skills/hub.py +102 -0
- proseforge_agent/skills/improvement.py +148 -0
- proseforge_agent/skills/install.py +157 -0
- proseforge_agent/skills/registry.py +134 -0
- proseforge_agent/skills/usage.py +112 -0
- proseforge_agent/testing/__init__.py +23 -0
- proseforge_agent/testing/fakes.py +140 -0
- proseforge_agent/tools/__init__.py +17 -0
- proseforge_agent/tools/managed/__init__.py +218 -0
- proseforge_agent/tools/managed/cloud_browser.py +174 -0
- proseforge_agent/tools/managed/media.py +130 -0
- proseforge_agent/tools/managed/url_safety.py +105 -0
- proseforge_agent/tools/managed/web_search.py +101 -0
- proseforge_agent/tui/__init__.py +7 -0
- proseforge_agent/tui/ansi.py +72 -0
- proseforge_agent/tui/app.py +367 -0
- proseforge_agent/tui/fullscreen.py +339 -0
- proseforge_agent/tui/keys.py +180 -0
- proseforge_agent/tui/screen.py +111 -0
- proseforge_agent/tui/streaming.py +83 -0
- proseforge_agent/workflow/__init__.py +29 -0
- proseforge_agent/workflow/recovery.py +68 -0
- proseforge_agent/workflow/state.py +289 -0
- proseforge_agent/workspace.py +107 -0
- proseforge_agent-0.2.0.dist-info/METADATA +587 -0
- proseforge_agent-0.2.0.dist-info/RECORD +284 -0
- proseforge_agent-0.2.0.dist-info/WHEEL +5 -0
- proseforge_agent-0.2.0.dist-info/entry_points.txt +2 -0
- proseforge_agent-0.2.0.dist-info/licenses/LICENSE +201 -0
- proseforge_agent-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""ProseForge Agent package.
|
|
2
|
+
|
|
3
|
+
Importing this package must not read configuration, touch the filesystem,
|
|
4
|
+
or require API keys. Subsystem behavior lives behind dedicated submodules
|
|
5
|
+
that are imported explicitly by their callers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.2.0"
|
|
9
|
+
|
|
10
|
+
__all__ = ["__version__"]
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Early process bootstrap (Task 190).
|
|
2
|
+
|
|
3
|
+
Fixes Windows text-encoding issues before any user-facing output and hardens
|
|
4
|
+
the import path. Imported as the very first line of every entry point
|
|
5
|
+
(``cli``, ``chat.repl``, ``demo``, ``__main__``) so a UTF-8-safe interpreter is
|
|
6
|
+
guaranteed before the rest of the package loads.
|
|
7
|
+
|
|
8
|
+
Four responsibilities (plus a best-effort ``.env`` load), each guarded so the bootstrap NEVER raises:
|
|
9
|
+
|
|
10
|
+
1. Force UTF-8 for child processes (``PYTHONUTF8``, ``PYTHONIOENCODING``).
|
|
11
|
+
2. Reconfigure stdout/stderr/stdin to UTF-8 with ``errors="replace"``.
|
|
12
|
+
3. Harden ``sys.path`` (prepend the package source root, strip ``""``/``.``).
|
|
13
|
+
4. Record everything in :data:`STATE` for observability.
|
|
14
|
+
|
|
15
|
+
Safe to import multiple times: a guard on ``sys._pf_agent_bootstrapped``
|
|
16
|
+
makes repeat calls a no-op.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
import sys
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Mutable record of what the bootstrap did. The helper functions in this
# module fill it in and ``install`` returns it to the caller.
STATE: dict[str, Any] = dict(
    # Set to True when ``install`` finds the interpreter already bootstrapped.
    already_installed=False,
    # None until ``_configure_utf8_env`` runs; then "preexisting" or True.
    python_utf8_set=None,
    # None until ``_reconfigure_stream`` runs for the stream; then True,
    # False (reconfigure raised) or "not_a_text_wrapper".
    stdout_reconfigured=None,
    stderr_reconfigured=None,
    stdin_reconfigured=None,
    # Flipped to True by ``_harden_sys_path`` on success.
    sys_path_hardened=False,
    # Human-readable messages for every step that failed.
    warnings=[],
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _configure_utf8_env() -> None:
    """Export UTF-8 defaults through the process environment.

    ``PYTHONUTF8`` is set to ``"1"`` only when the variable is absent; an
    existing value is left untouched and recorded as ``"preexisting"`` in
    ``STATE["python_utf8_set"]``. ``PYTHONIOENCODING`` is defaulted to
    ``"utf-8"`` without overriding a value that is already present.
    """
    had_value = "PYTHONUTF8" in os.environ
    if not had_value:
        os.environ["PYTHONUTF8"] = "1"
    STATE["python_utf8_set"] = "preexisting" if had_value else True
    os.environ.setdefault("PYTHONIOENCODING", "utf-8")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _reconfigure_stream(name: str, key: str) -> None:
    """Switch ``sys.<name>`` to UTF-8 with ``errors="replace"``.

    Args:
        name: Attribute name on ``sys`` (for example ``"stdout"``).
        key: ``STATE`` key that receives the outcome: ``True`` on success,
            ``False`` when ``reconfigure`` raised (a warning is appended), or
            ``"not_a_text_wrapper"`` when the stream is missing or has no
            callable ``reconfigure``.
    """
    method = getattr(getattr(sys, name, None), "reconfigure", None)
    if callable(method):
        try:
            method(encoding="utf-8", errors="replace")
            STATE[key] = True
        except Exception as exc:  # noqa: BLE001 - bootstrap must never raise
            STATE[key] = False
            STATE["warnings"].append(f"{name}.reconfigure failed: {exc}")
        return
    STATE[key] = "not_a_text_wrapper"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _harden_sys_path() -> None:
    """Put the package's parent directory on ``sys.path`` and drop cwd entries.

    The parent of this file's directory is inserted at the front of
    ``sys.path`` when it is not already listed. The entries ``""`` and ``"."``
    are then removed in place so a module in the user's working directory
    cannot shadow package modules with common names. Any failure is recorded
    in ``STATE["warnings"]`` instead of being raised.
    """
    try:
        package_dir = os.path.dirname(os.path.abspath(__file__))
        src_root = os.path.dirname(package_dir)
        if src_root and src_root not in sys.path:
            sys.path.insert(0, src_root)
        kept = []
        for entry in sys.path:
            if entry in ("", "."):
                continue
            kept.append(entry)
        # Slice assignment keeps the same list object that importers hold.
        sys.path[:] = kept
        STATE["sys_path_hardened"] = True
    except Exception as exc:  # noqa: BLE001
        STATE["warnings"].append(f"sys.path hardening failed: {exc}")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _load_dotenv() -> None:
    """Run the package's default ``.env`` loader and keep its result.

    The value returned by ``load_default_files`` is stored under
    ``STATE["dotenv"]``. The import happens inside the ``try`` so that an
    import error is recorded as a warning rather than raised.
    """
    try:
        from .dotenv import load_default_files

        loaded = load_default_files()
        STATE["dotenv"] = loaded
    except Exception as exc:  # noqa: BLE001 - .env loading must never break startup
        STATE["warnings"].append(f".env load failed: {exc}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def install() -> dict[str, Any]:
    """Idempotently apply the bootstrap and return :data:`STATE`.

    A marker attribute on ``sys`` (``_pf_agent_bootstrapped``) makes repeat
    calls return immediately with ``STATE["already_installed"]`` set to
    ``True``. Otherwise the steps run in order: UTF-8 environment variables,
    stdout/stderr/stdin reconfiguration, ``sys.path`` hardening, ``.env``
    loading. The marker is set afterwards even if a step reported a warning.
    """
    if getattr(sys, "_pf_agent_bootstrapped", False):
        STATE["already_installed"] = True
        return STATE

    STATE.update(already_installed=False, warnings=[])
    try:
        _configure_utf8_env()
        for stream_name in ("stdout", "stderr", "stdin"):
            _reconfigure_stream(stream_name, f"{stream_name}_reconfigured")
        _harden_sys_path()
        _load_dotenv()
    except Exception as exc:  # noqa: BLE001 - defense in depth
        STATE["warnings"].append(f"bootstrap error: {exc}")

    setattr(sys, "_pf_agent_bootstrapped", True)
    return STATE
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# Apply automatically on import so entry points only need `import _bootstrap`.
|
|
105
|
+
install()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
__all__ = ["STATE", "install"]
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Agent runtime interfaces."""
|
|
2
|
+
|
|
3
|
+
from .events import BackgroundJobRunner, EventBus, EventRecord, JobResult, ToolOutputChunk
|
|
4
|
+
from .eval import EvalHarness, EvalReport, EvalSuite, EvalTaskResult, GoldenTask
|
|
5
|
+
from .attachments import AttachmentIngestResult, AttachmentIngestor
|
|
6
|
+
from .audit import AuditStep, AuditTrailStore, ReplayResult
|
|
7
|
+
from .execution_guard import ExecutionGuard, ExecutionGuardResult, ExecutionPolicy
|
|
8
|
+
from .function_calling import (
|
|
9
|
+
ProviderToolCall,
|
|
10
|
+
StructuredToolAdapter,
|
|
11
|
+
StructuredToolResult,
|
|
12
|
+
ToolCallLoop,
|
|
13
|
+
ToolLoopResult,
|
|
14
|
+
)
|
|
15
|
+
from .kernel import AgentKernel
|
|
16
|
+
from .intent_router import IntentDecision, IntentRouter
|
|
17
|
+
from .types import AgentIntent, AgentTurnRequest, AgentTurnResult, ToolCallResult
|
|
18
|
+
from .control import ControlSignal, ControlToken
|
|
19
|
+
from .context_window import ContextUsageReport, ContextWindowManager
|
|
20
|
+
from .degradation import CapabilityReport, CapabilityRuntime, FeatureCheck, FeatureDeclaration, FeatureLevel
|
|
21
|
+
from .offline import OfflineDecision, OfflinePolicy
|
|
22
|
+
from .permissions import PERMISSION_LEVELS, PermissionDecision, PermissionPolicy
|
|
23
|
+
from .profiles import AgentProfile, AgentProfileRegistry
|
|
24
|
+
from .provider_fallback import ProviderFallbackAttempt, ProviderFallbackChain, ProviderFallbackResult
|
|
25
|
+
from .prompt_templates import (
|
|
26
|
+
PromptTemplate,
|
|
27
|
+
PromptTemplateRegistry,
|
|
28
|
+
PromptTemplateValidation,
|
|
29
|
+
PromptTemplateValidationError,
|
|
30
|
+
)
|
|
31
|
+
from .request_cache import CachedResponse, RequestCache, RequestCacheKey
|
|
32
|
+
from .sandbox import Approval, ExecRequest, ExecResult, Sandbox
|
|
33
|
+
from .subagent import Scope, SubAgentResult, SubAgentRunner
|
|
34
|
+
from .structured_output import (
|
|
35
|
+
StructuredOutputRepairResult,
|
|
36
|
+
repair_structured_output,
|
|
37
|
+
validate_or_repair,
|
|
38
|
+
)
|
|
39
|
+
from .tools import (
|
|
40
|
+
AgentTool,
|
|
41
|
+
ToolContext,
|
|
42
|
+
ToolRegistry,
|
|
43
|
+
ToolResult,
|
|
44
|
+
default_tool_registry,
|
|
45
|
+
general_tool_registry,
|
|
46
|
+
register_writing_domain_tools,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
"BackgroundJobRunner",
|
|
51
|
+
"AttachmentIngestResult",
|
|
52
|
+
"AttachmentIngestor",
|
|
53
|
+
"AuditStep",
|
|
54
|
+
"AuditTrailStore",
|
|
55
|
+
"EvalHarness",
|
|
56
|
+
"EvalReport",
|
|
57
|
+
"EvalSuite",
|
|
58
|
+
"EvalTaskResult",
|
|
59
|
+
"GoldenTask",
|
|
60
|
+
"EventBus",
|
|
61
|
+
"EventRecord",
|
|
62
|
+
"ExecutionGuard",
|
|
63
|
+
"ExecutionGuardResult",
|
|
64
|
+
"ExecutionPolicy",
|
|
65
|
+
"ProviderToolCall",
|
|
66
|
+
"StructuredToolAdapter",
|
|
67
|
+
"StructuredToolResult",
|
|
68
|
+
"ToolCallLoop",
|
|
69
|
+
"ToolLoopResult",
|
|
70
|
+
"ToolOutputChunk",
|
|
71
|
+
"AgentKernel",
|
|
72
|
+
"IntentDecision",
|
|
73
|
+
"IntentRouter",
|
|
74
|
+
"AgentIntent",
|
|
75
|
+
"AgentTurnRequest",
|
|
76
|
+
"AgentTurnResult",
|
|
77
|
+
"ToolCallResult",
|
|
78
|
+
"ControlSignal",
|
|
79
|
+
"ControlToken",
|
|
80
|
+
"ContextUsageReport",
|
|
81
|
+
"ContextWindowManager",
|
|
82
|
+
"CapabilityReport",
|
|
83
|
+
"CapabilityRuntime",
|
|
84
|
+
"FeatureCheck",
|
|
85
|
+
"FeatureDeclaration",
|
|
86
|
+
"FeatureLevel",
|
|
87
|
+
"OfflineDecision",
|
|
88
|
+
"OfflinePolicy",
|
|
89
|
+
"PERMISSION_LEVELS",
|
|
90
|
+
"PermissionDecision",
|
|
91
|
+
"PermissionPolicy",
|
|
92
|
+
"AgentProfile",
|
|
93
|
+
"AgentProfileRegistry",
|
|
94
|
+
"ProviderFallbackAttempt",
|
|
95
|
+
"ProviderFallbackChain",
|
|
96
|
+
"ProviderFallbackResult",
|
|
97
|
+
"PromptTemplate",
|
|
98
|
+
"PromptTemplateRegistry",
|
|
99
|
+
"PromptTemplateValidation",
|
|
100
|
+
"PromptTemplateValidationError",
|
|
101
|
+
"CachedResponse",
|
|
102
|
+
"RequestCache",
|
|
103
|
+
"RequestCacheKey",
|
|
104
|
+
"ReplayResult",
|
|
105
|
+
"Approval",
|
|
106
|
+
"ExecRequest",
|
|
107
|
+
"ExecResult",
|
|
108
|
+
"Sandbox",
|
|
109
|
+
"Scope",
|
|
110
|
+
"SubAgentResult",
|
|
111
|
+
"SubAgentRunner",
|
|
112
|
+
"StructuredOutputRepairResult",
|
|
113
|
+
"AgentTool",
|
|
114
|
+
"ToolContext",
|
|
115
|
+
"ToolRegistry",
|
|
116
|
+
"ToolResult",
|
|
117
|
+
"JobResult",
|
|
118
|
+
"default_tool_registry",
|
|
119
|
+
"general_tool_registry",
|
|
120
|
+
"register_writing_domain_tools",
|
|
121
|
+
"repair_structured_output",
|
|
122
|
+
"validate_or_repair",
|
|
123
|
+
]
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Bounded artifact storage for large tool results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import re
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from .tools import ToolResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
|
|
16
|
+
class ArtifactRef:
|
|
17
|
+
"""Portable reference to a stored tool artifact."""
|
|
18
|
+
|
|
19
|
+
id: str
|
|
20
|
+
kind: str
|
|
21
|
+
path: str
|
|
22
|
+
content_type: str
|
|
23
|
+
size_bytes: int
|
|
24
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
25
|
+
redaction_applied: bool = False
|
|
26
|
+
|
|
27
|
+
def to_dict(self) -> dict[str, Any]:
|
|
28
|
+
return {
|
|
29
|
+
"id": self.id,
|
|
30
|
+
"kind": self.kind,
|
|
31
|
+
"path": self.path,
|
|
32
|
+
"content_type": self.content_type,
|
|
33
|
+
"size_bytes": self.size_bytes,
|
|
34
|
+
"metadata": self.metadata,
|
|
35
|
+
"redaction_applied": self.redaction_applied,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ArtifactStore:
|
|
40
|
+
"""Path-contained local store for redacted tool artifacts."""
|
|
41
|
+
|
|
42
|
+
def __init__(self, root: str | Path, *, output_limit: int = 4096) -> None:
|
|
43
|
+
self.root = Path(root)
|
|
44
|
+
self.base = self.root / "tool_artifacts"
|
|
45
|
+
self.output_limit = output_limit
|
|
46
|
+
self.base.mkdir(parents=True, exist_ok=True)
|
|
47
|
+
|
|
48
|
+
def write(self, kind: str, content: str | bytes, metadata: dict[str, Any] | None = None) -> ArtifactRef:
|
|
49
|
+
metadata = dict(metadata or {})
|
|
50
|
+
content_type = str(
|
|
51
|
+
metadata.get("content_type")
|
|
52
|
+
or ("application/octet-stream" if isinstance(content, bytes) else "text/plain")
|
|
53
|
+
)
|
|
54
|
+
redaction_applied = False
|
|
55
|
+
if isinstance(content, bytes):
|
|
56
|
+
stored = content
|
|
57
|
+
suffix = ".bin"
|
|
58
|
+
else:
|
|
59
|
+
text, redaction_applied = _redact(content)
|
|
60
|
+
stored = text.encode("utf-8")
|
|
61
|
+
suffix = ".txt"
|
|
62
|
+
digest = hashlib.sha256(stored).hexdigest()
|
|
63
|
+
artifact_id = f"artifact-{kind}-{digest[:16]}"
|
|
64
|
+
artifact_path = self._contained_path(f"{artifact_id}{suffix}")
|
|
65
|
+
artifact_path.write_bytes(stored)
|
|
66
|
+
ref = ArtifactRef(
|
|
67
|
+
id=artifact_id,
|
|
68
|
+
kind=kind,
|
|
69
|
+
path=str(artifact_path.relative_to(self.root)),
|
|
70
|
+
content_type=content_type,
|
|
71
|
+
size_bytes=len(stored),
|
|
72
|
+
metadata=metadata,
|
|
73
|
+
redaction_applied=redaction_applied,
|
|
74
|
+
)
|
|
75
|
+
self._metadata_path(artifact_id).write_text(
|
|
76
|
+
json.dumps(ref.to_dict(), indent=2, sort_keys=True),
|
|
77
|
+
encoding="utf-8",
|
|
78
|
+
)
|
|
79
|
+
return ref
|
|
80
|
+
|
|
81
|
+
def read(self, artifact_id: str) -> str | bytes:
|
|
82
|
+
ref = self.get(artifact_id)
|
|
83
|
+
if ref is None:
|
|
84
|
+
raise FileNotFoundError(artifact_id)
|
|
85
|
+
path = self._contained_path(Path(ref.path).name)
|
|
86
|
+
content = path.read_bytes()
|
|
87
|
+
if ref.content_type.startswith("text/"):
|
|
88
|
+
return content.decode("utf-8")
|
|
89
|
+
return content
|
|
90
|
+
|
|
91
|
+
def get(self, artifact_id: str) -> ArtifactRef | None:
|
|
92
|
+
path = self._metadata_path(artifact_id)
|
|
93
|
+
if not path.exists():
|
|
94
|
+
return None
|
|
95
|
+
payload = json.loads(path.read_text(encoding="utf-8"))
|
|
96
|
+
return ArtifactRef(
|
|
97
|
+
id=str(payload["id"]),
|
|
98
|
+
kind=str(payload["kind"]),
|
|
99
|
+
path=str(payload["path"]),
|
|
100
|
+
content_type=str(payload["content_type"]),
|
|
101
|
+
size_bytes=int(payload["size_bytes"]),
|
|
102
|
+
metadata=dict(payload.get("metadata") or {}),
|
|
103
|
+
redaction_applied=bool(payload.get("redaction_applied", False)),
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
def list(self) -> list[ArtifactRef]:
|
|
107
|
+
refs = []
|
|
108
|
+
for path in sorted(self.base.glob("*.meta.json")):
|
|
109
|
+
refs.append(self.get(path.name.removesuffix(".meta.json")))
|
|
110
|
+
return [ref for ref in refs if ref is not None]
|
|
111
|
+
|
|
112
|
+
def cleanup(self, *, keep_last: int = 100) -> list[str]:
|
|
113
|
+
refs = self.list()
|
|
114
|
+
removed: list[str] = []
|
|
115
|
+
for ref in refs[:-keep_last]:
|
|
116
|
+
artifact_path = self._contained_path(Path(ref.path).name)
|
|
117
|
+
metadata_path = self._metadata_path(ref.id)
|
|
118
|
+
if artifact_path.exists():
|
|
119
|
+
artifact_path.unlink()
|
|
120
|
+
if metadata_path.exists():
|
|
121
|
+
metadata_path.unlink()
|
|
122
|
+
removed.append(ref.id)
|
|
123
|
+
return removed
|
|
124
|
+
|
|
125
|
+
def _metadata_path(self, artifact_id: str) -> Path:
|
|
126
|
+
return self._contained_path(f"{artifact_id}.meta.json")
|
|
127
|
+
|
|
128
|
+
def _contained_path(self, name: str) -> Path:
|
|
129
|
+
root = self.base.resolve()
|
|
130
|
+
path = (self.base / name).resolve()
|
|
131
|
+
if path != root and root not in path.parents:
|
|
132
|
+
raise ValueError("artifact path escapes store")
|
|
133
|
+
return path
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def summarize_tool_output(content: str | bytes, *, store: ArtifactStore, kind: str = "tool") -> ToolResult:
    """Store *content* as an artifact and return a bounded inline summary.

    Args:
        content: Raw tool output. Text is redacted before it is summarized;
            bytes are never inlined.
        store: Destination store; its ``output_limit`` caps the summary length.
        kind: Artifact kind passed to ``store.write`` and used in the
            binary summary line.

    Returns:
        A ``ToolResult`` whose ``output`` and ``summary`` hold the summary and
        whose ``artifact_refs`` points at the stored payload. ``truncated`` is
        always ``True`` for bytes; for text it is ``True`` only when the
        redacted text is longer than the summary.
    """
    ref = store.write(kind, content, {})
    if isinstance(content, bytes):
        summary = f"{kind} binary artifact {ref.id}"
        truncated = True
    else:
        redacted, _ = _redact(content)
        limit = store.output_limit
        summary = redacted[:limit]
        # Compare the redacted length, not the raw one: redaction changes the
        # text length, and the summary is cut from the redacted text.
        truncated = len(redacted) > limit
    return ToolResult(
        ok=True,
        output=summary,
        summary=summary,
        artifact_refs=[ref],
        truncated=truncated,
        redaction_applied=ref.redaction_applied,
        provenance="artifact_store",
    )
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _redact(text: str) -> tuple[str, bool]:
|
|
156
|
+
pattern = re.compile(r"(?i)(token|secret|api_key|password)=\S+")
|
|
157
|
+
return pattern.sub(r"\1=[redacted]", text), bool(pattern.search(text))
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
__all__ = ["ArtifactRef", "ArtifactStore", "summarize_tool_output"]
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""Deterministic attachment ingestion for chat and project context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
import struct
|
|
9
|
+
import zipfile
|
|
10
|
+
from dataclasses import asdict, dataclass, field
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Callable
|
|
14
|
+
from xml.etree import ElementTree
|
|
15
|
+
|
|
16
|
+
from ..chat.transcript import append_jsonl
|
|
17
|
+
from ..errors import ConfigurationError
|
|
18
|
+
from ..novel.artifacts import ArtifactGraphStore, ArtifactRecord
|
|
19
|
+
|
|
20
|
+
UTC = timezone.utc
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# File suffix (lower-case, with leading dot) -> attachment kind label.
TEXT_EXTENSIONS = {
    ".txt": "text",
    ".md": "markdown",
    ".markdown": "markdown",
}
TABLE_EXTENSIONS = {
    ".csv": "csv",
    ".xlsx": "excel",
    ".xlsm": "excel",
}
DOCUMENT_EXTENSIONS = {
    ".pdf": "pdf",
    ".docx": "docx",
}
IMAGE_EXTENSIONS = dict.fromkeys((".png", ".jpg", ".jpeg", ".webp"), "image")
# Union of every suffix listed in the four tables above.
SUPPORTED_EXTENSIONS = {
    *TEXT_EXTENSIONS,
    *TABLE_EXTENSIONS,
    *DOCUMENT_EXTENSIONS,
    *IMAGE_EXTENSIONS,
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
|
|
31
|
+
class AttachmentIngestResult:
|
|
32
|
+
"""One ingested attachment and its persisted project artifacts."""
|
|
33
|
+
|
|
34
|
+
id: str
|
|
35
|
+
status: str
|
|
36
|
+
kind: str
|
|
37
|
+
source_path: Path
|
|
38
|
+
text: str = ""
|
|
39
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
40
|
+
artifact_path: Path = Path()
|
|
41
|
+
searchable_path: Path | None = None
|
|
42
|
+
memory_candidate_path: Path | None = None
|
|
43
|
+
warnings: list[str] = field(default_factory=list)
|
|
44
|
+
|
|
45
|
+
def to_dict(self) -> dict[str, Any]:
|
|
46
|
+
payload = asdict(self)
|
|
47
|
+
payload["source_path"] = str(self.source_path)
|
|
48
|
+
payload["artifact_path"] = str(self.artifact_path)
|
|
49
|
+
payload["searchable_path"] = str(self.searchable_path) if self.searchable_path else None
|
|
50
|
+
payload["memory_candidate_path"] = str(self.memory_candidate_path) if self.memory_candidate_path else None
|
|
51
|
+
return payload
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AttachmentIngestor:
    """Extract searchable text and metadata from supported attachments.

    Every ingested file is written under ``<root>/projects/<slug>/`` as a JSON
    artifact plus a plain-text searchable copy, appended to
    ``<root>/memory_candidates/projects/<slug>.jsonl``, and recorded through
    ``ArtifactGraphStore.add``.
    """

    def __init__(
        self,
        root: str | Path,
        *,
        vision_describer: Callable[[Path], str] | None = None,
    ) -> None:
        """Remember the storage root and the optional image describer.

        Args:
            root: Base directory under which all artifacts are written.
            vision_describer: Optional callable that receives an image path
                and returns descriptive text for it.
        """
        self.root = Path(root)
        self.vision_describer = vision_describer

    def ingest_file(self, source: str | Path, *, slug: str) -> AttachmentIngestResult:
        """Ingest a single file, routing image suffixes to :meth:`ingest_image`.

        Args:
            source: Path of the file to ingest.
            slug: Project slug used to build the output locations.

        Raises:
            ConfigurationError: If ``source`` is not an existing file or its
                suffix is not handled by the extractor.
        """
        path = Path(source)
        self._require_source(path)
        if path.suffix.lower() in IMAGE_EXTENSIONS:
            return self.ingest_image(path, slug=slug)
        text, kind, metadata, warnings = self._extract(path)
        return self._persist(path, slug=slug, kind=kind, text=text, metadata=metadata, warnings=warnings)

    def ingest_image(self, source: str | Path, *, slug: str) -> AttachmentIngestResult:
        """Ingest an image: collect its metadata and, if possible, a description.

        When no describer is configured, or the describer yields nothing, the
        result carries a warning and the searchable text falls back to the
        metadata.

        Raises:
            ConfigurationError: If ``source`` is not an existing file.
        """
        path = Path(source)
        self._require_source(path)
        metadata = _image_metadata(path)
        if not self.vision_describer:
            text = ""
            warnings = ["image description provider not configured"]
        else:
            # Coerce a ``None`` result to "" so the later ``text.strip()`` is safe.
            text = self.vision_describer(path) or ""
            # Distinguish "no provider" from "provider produced nothing" so the
            # warning does not send the operator looking at configuration.
            warnings = [] if text else ["image description provider returned no description"]
        return self._persist(path, slug=slug, kind="image", text=text, metadata=metadata, warnings=warnings)

    def ingest_folder(self, source: str | Path, *, slug: str) -> list[AttachmentIngestResult]:
        """Ingest every supported file directly inside ``source`` (non-recursive).

        Files are processed in name order; entries with unsupported suffixes
        and sub-directories are skipped.

        Raises:
            ConfigurationError: If ``source`` is not a directory.
        """
        folder = Path(source)
        if not folder.is_dir():
            raise ConfigurationError(f"attachment folder does not exist: {folder}")
        return [
            self.ingest_file(item, slug=slug)
            for item in sorted(folder.iterdir(), key=lambda path: path.name)
            if item.is_file() and item.suffix.lower() in SUPPORTED_EXTENSIONS
        ]

    def _extract(self, path: Path) -> tuple[str, str, dict[str, Any], list[str]]:
        """Return ``(text, kind, metadata, warnings)`` for a non-image file.

        Raises:
            ConfigurationError: If the suffix has no extractor.
        """
        suffix = path.suffix.lower()
        metadata = {"extension": suffix, "size_bytes": path.stat().st_size}
        if suffix in TEXT_EXTENSIONS:
            # ``utf-8-sig`` reads plain UTF-8 unchanged and drops a leading BOM
            # so it does not end up in the stored and searchable text.
            return path.read_text(encoding="utf-8-sig"), TEXT_EXTENSIONS[suffix], metadata, []
        if suffix == ".csv":
            return _extract_csv(path), "csv", metadata, []
        if suffix in {".xlsx", ".xlsm"}:
            text, warnings = _extract_excel(path)
            return text, "excel", metadata, warnings
        if suffix == ".docx":
            return _extract_docx(path), "docx", metadata, []
        if suffix == ".pdf":
            text, warnings = _extract_pdf(path)
            return text, "pdf", metadata, warnings
        raise ConfigurationError(f"unsupported attachment format: {suffix}")

    def _persist(
        self,
        source_path: Path,
        *,
        slug: str,
        kind: str,
        text: str,
        metadata: dict[str, Any],
        warnings: list[str],
    ) -> AttachmentIngestResult:
        """Write the artifact, searchable text and memory candidate for one file.

        The attachment id is ``attachment_`` plus the first 12 hex digits of
        the SHA-256 of the source file's bytes. The returned status is
        ``"degraded"`` only when there are warnings and no extracted text.
        """
        project_root = self.root / "projects" / slug
        attachment_id = f"attachment_{_sha256(source_path.read_bytes())[:12]}"
        artifact_path = project_root / "attachments" / f"{attachment_id}.json"
        searchable_path = project_root / "searchable" / f"{attachment_id}.txt"
        candidate_path = self.root / "memory_candidates" / "projects" / f"{slug}.jsonl"
        # With no extracted text, index the metadata so the file is still findable.
        searchable_text = text if text.strip() else _metadata_text(metadata)
        # One timestamp for both records keeps the artifact and its candidate in sync.
        created_at = datetime.now(UTC).isoformat()

        artifact_path.parent.mkdir(parents=True, exist_ok=True)
        searchable_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "id": attachment_id,
            "kind": kind,
            "source": str(source_path),
            "text": text,
            "metadata": metadata,
            "warnings": warnings,
            "created_at": created_at,
        }
        artifact_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
        searchable_path.write_text(searchable_text, encoding="utf-8")
        append_jsonl(
            candidate_path,
            {
                "id": f"memcand_{attachment_id}",
                "kind": "attachment",
                "project_slug": slug,
                "scope": "project",
                "source": str(source_path),
                # Only the first 2000 characters are offered as a memory candidate.
                "text": searchable_text[:2000],
                "status": "candidate",
                "created_at": created_at,
            },
        )
        ArtifactGraphStore(self.root, slug=slug).add(
            ArtifactRecord(
                id=attachment_id,
                type=f"attachment:{kind}",
                generated=[str(searchable_path)],
                # Checksum the bytes actually on disk, not the in-memory string.
                checksum=_sha256(artifact_path.read_bytes()),
                provider="local",
                prompt_version="attachment-ingestion-v1",
            )
        )
        return AttachmentIngestResult(
            id=attachment_id,
            status="degraded" if warnings and not text.strip() else "ok",
            kind=kind,
            source_path=source_path,
            text=text,
            metadata=metadata,
            artifact_path=artifact_path,
            searchable_path=searchable_path,
            memory_candidate_path=candidate_path,
            warnings=warnings,
        )

    @staticmethod
    def _require_source(path: Path) -> None:
        """Raise ``ConfigurationError`` unless ``path`` is an existing regular file."""
        if not path.is_file():
            raise ConfigurationError(f"attachment file does not exist: {path}")
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _extract_csv(path: Path) -> str:
    """Render a CSV file as tab-separated lines of text.

    The file is decoded as UTF-8. A leading byte-order mark is dropped so it
    does not end up inside the first cell. Each CSV record becomes one output
    line with its fields joined by tabs; the result ends with a newline unless
    the file contains no records, in which case it is the empty string.
    """
    # ``utf-8-sig`` decodes plain UTF-8 unchanged and strips a BOM when present.
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        rows = ["\t".join(row) for row in csv.reader(handle)]
    return "\n".join(rows) + ("\n" if rows else "")
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _extract_docx(path: Path) -> str:
    """Return the body text of a ``.docx`` file, one line per paragraph.

    Reads ``word/document.xml`` from the archive and concatenates the ``w:t``
    text runs of each ``w:p`` paragraph. Word often splits one sentence, or
    even one word, across several runs, so runs are joined without a
    separator and only paragraphs are separated by newlines. Paragraphs with
    no text are omitted. Returns ``""`` when the archive has no
    ``word/document.xml`` entry.
    """
    with zipfile.ZipFile(path) as archive:
        try:
            raw = archive.read("word/document.xml")
        except KeyError:
            return ""
    namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    paragraph_tag = f"{namespace}p"
    text_tag = f"{namespace}t"
    # NOTE(review): the stdlib XML parser is not hardened against maliciously
    # constructed documents; consider a hardened parser for untrusted uploads.
    root = ElementTree.fromstring(raw)

    # ``iter()`` walks in document order, so a paragraph element marks the
    # start of a new output line and every text run after it belongs to it.
    lines: list[str] = []
    current: list[str] = []
    for node in root.iter():
        if node.tag == paragraph_tag:
            lines.append("".join(current))
            current = []
        elif node.tag == text_tag:
            current.append(node.text or "")
    lines.append("".join(current))
    return "\n".join(line for line in lines if line)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _extract_pdf(path: Path) -> tuple[str, list[str]]:
    """Extract PDF text with whichever supported reader library is importable.

    Returns ``(text, warnings)``. ``pypdf`` is tried first, then ``PyPDF2``.
    If neither can be imported, or reading the file fails, the text is empty
    and ``warnings`` holds a single message describing why.
    """
    reader_factory = None
    try:
        from pypdf import PdfReader as reader_factory  # type: ignore
    except Exception:
        try:
            from PyPDF2 import PdfReader as reader_factory  # type: ignore
        except Exception:
            # Neither library is usable; reported just below.
            pass
    if reader_factory is None:
        return "", ["pdf text extraction requires pypdf or PyPDF2"]

    try:
        document = reader_factory(str(path))
        # Pages without a text layer contribute an empty line.
        extracted = "\n".join([page.extract_text() or "" for page in document.pages])
    except Exception as error:
        return "", [f"pdf text extraction failed: {error}"]
    return extracted, []
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _extract_excel(path: Path) -> tuple[str, list[str]]:
    """Render every worksheet of an Excel workbook as tab-separated text.

    Returns ``(text, warnings)``. Each sheet contributes a ``# <title>``
    heading followed by one tab-joined line per row that has at least one
    non-empty cell; ``None`` cells become empty strings. If ``openpyxl``
    cannot be imported, or the workbook cannot be read, the text is empty and
    ``warnings`` holds a single message describing why.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except Exception:
        # Broad on purpose: a broken install should degrade, not crash.
        return "", ["excel table extraction requires openpyxl"]
    try:
        workbook = load_workbook(path, read_only=True, data_only=True)
    except Exception as exc:
        # Report unreadable workbooks as a warning instead of raising.
        return "", [f"excel table extraction failed: {exc}"]
    lines: list[str] = []
    try:
        for sheet in workbook.worksheets:
            lines.append(f"# {sheet.title}")
            for row in sheet.iter_rows(values_only=True):
                values = ["" if value is None else str(value) for value in row]
                if any(values):
                    lines.append("\t".join(values))
    except Exception as exc:
        return "", [f"excel table extraction failed: {exc}"]
    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        workbook.close()
    return "\n".join(lines) + ("\n" if lines else ""), []
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _image_metadata(path: Path) -> dict[str, Any]:
    """Describe an image file using only its raw bytes.

    The result always has ``extension`` (lower-cased suffix), ``size_bytes``
    and ``format``. PNG files also get ``width`` and ``height`` read from the
    header; JPEG files get them when ``_jpeg_size`` can find them. For any
    other content the format is the suffix without its dot, or ``"unknown"``
    when there is no suffix.
    """
    raw = path.read_bytes()
    extension = path.suffix.lower()
    info: dict[str, Any] = {"extension": extension, "size_bytes": len(raw)}

    png_signature = b"\x89PNG\r\n\x1a\n"
    if len(raw) >= 24 and raw[:8] == png_signature:
        # Width and height are the two big-endian uint32 values at bytes 16-24.
        width, height = struct.unpack(">II", raw[16:24])
        info["format"] = "png"
        info["width"] = width
        info["height"] = height
    elif raw[:2] == b"\xff\xd8":
        dimensions = _jpeg_size(raw)
        info["format"] = "jpeg"
        if dimensions:
            info["width"], info["height"] = dimensions
    else:
        info["format"] = extension.lstrip(".") or "unknown"
    return info
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _jpeg_size(data: bytes) -> tuple[int, int] | None:
    """Return ``(width, height)`` from the first JPEG frame header in ``data``.

    Scanning starts after the two-byte SOI marker and walks the marker
    segments until a start-of-frame (SOF) segment is found. Returns ``None``
    when no complete frame header is present.
    """
    # SOF0-SOF15 all carry the frame size; 0xC4 (DHT), 0xC8 (JPG) and
    # 0xCC (DAC) share the numeric range but are not frame headers.
    sof_markers = frozenset(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}
    # Markers with no length field: stuffed 0x00, TEM, RST0-7, SOI and EOI.
    standalone_markers = frozenset({0x00, 0x01, 0xD8, 0xD9}) | frozenset(range(0xD0, 0xD8))

    total = len(data)
    index = 2
    while index + 4 <= total:
        if data[index] != 0xFF:
            index += 1
            continue
        marker = data[index + 1]
        if marker == 0xFF:
            # Fill byte: any number of 0xFF may precede the real marker code.
            index += 1
            continue
        if marker in standalone_markers:
            index += 2
            continue
        if marker in sof_markers:
            if index + 9 > total:
                return None  # frame header is truncated
            # Layout: FF Cx | length(2) | precision(1) | height(2) | width(2)
            height = int.from_bytes(data[index + 5 : index + 7], "big")
            width = int.from_bytes(data[index + 7 : index + 9], "big")
            return width, height
        # The length counts its own two bytes but not the marker.
        block_length = int.from_bytes(data[index + 2 : index + 4], "big")
        index += 2 + block_length
    return None
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _metadata_text(metadata: dict[str, Any]) -> str:
    """Format metadata as ``key: value`` lines in key order, newline-terminated."""
    rendered = [f"{name}: {metadata[name]}" for name in sorted(metadata)]
    return "\n".join(rendered) + "\n"
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _sha256(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a lower-case hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# Names exported by ``from ... import *``; the underscore-prefixed helpers in
# this module are not listed.
__all__ = [
    "AttachmentIngestResult",
    "AttachmentIngestor",
    "SUPPORTED_EXTENSIONS",
]
|