memuron 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memuron/__init__.py +3 -0
- memuron/actions/__init__.py +12 -0
- memuron/actions/context.py +63 -0
- memuron/actions/helpers.py +88 -0
- memuron/actions/memory.py +340 -0
- memuron/actions/memory_write.py +290 -0
- memuron/actions/nodes.py +340 -0
- memuron/actions/registry.py +5 -0
- memuron/actions/runtime.py +37 -0
- memuron/actions/spaces_documents.py +720 -0
- memuron/actions/sync.py +155 -0
- memuron/application/__init__.py +1 -0
- memuron/application/api.py +206 -0
- memuron/application/app.py +103 -0
- memuron/application/capabilities.py +82 -0
- memuron/application/cli.py +35 -0
- memuron/application/config.py +176 -0
- memuron/application/mcp.py +44 -0
- memuron/application/mcp_oauth.py +290 -0
- memuron/application/registry.py +52 -0
- memuron/context.py +532 -0
- memuron/documents/__init__.py +1 -0
- memuron/documents/link_guardian.py +192 -0
- memuron/documents/linking.py +292 -0
- memuron/documents/parser.py +1152 -0
- memuron/documents/storage.py +151 -0
- memuron/documents/url_ingest.py +375 -0
- memuron/domain/__init__.py +1 -0
- memuron/domain/decoders.py +1 -0
- memuron/domain/encoders.py +185 -0
- memuron/domain/lifecycles.py +8 -0
- memuron/domain/limits.py +6 -0
- memuron/domain/representations.py +56 -0
- memuron/domain/schemas.py +581 -0
- memuron/domain/scope_filter.py +104 -0
- memuron/graphfs/__init__.py +1 -0
- memuron/graphfs/manual.py +635 -0
- memuron/graphfs/projection.py +578 -0
- memuron/graphfs/query.py +1782 -0
- memuron/graphfs/read_model.py +574 -0
- memuron/ingest/__init__.py +1 -0
- memuron/ingest/guardian.py +213 -0
- memuron/ingest/jobs.py +424 -0
- memuron/ingest/prompts.py +147 -0
- memuron/memory/__init__.py +1 -0
- memuron/memory/engine.py +35 -0
- memuron/memory/projections.py +452 -0
- memuron/memory/recipes.py +3247 -0
- memuron/persistence/__init__.py +1 -0
- memuron/persistence/db_pool.py +57 -0
- memuron/persistence/identity_store.py +918 -0
- memuron/persistence/store_helpers.py +16 -0
- memuron/search/__init__.py +1 -0
- memuron/search/fulltext.py +110 -0
- memuron/search/hybrid.py +284 -0
- memuron/search/pgvector.py +252 -0
- memuron/security/__init__.py +1 -0
- memuron/security/auth.py +143 -0
- memuron/security/auth_provider.py +119 -0
- memuron/security/authorization.py +53 -0
- memuron/security/clerk_scopes.py +94 -0
- memuron/security/clerk_webhooks.py +61 -0
- memuron/security/jwt_tokens.py +53 -0
- memuron/security/passwords.py +38 -0
- memuron/security/tenant.py +58 -0
- memuron/spaces/__init__.py +1 -0
- memuron/spaces/model.py +35 -0
- memuron/spaces/service.py +155 -0
- memuron/sync/__init__.py +25 -0
- memuron/sync/folder.py +828 -0
- memuron-0.1.1.dist-info/METADATA +242 -0
- memuron-0.1.1.dist-info/RECORD +74 -0
- memuron-0.1.1.dist-info/WHEEL +4 -0
- memuron-0.1.1.dist-info/entry_points.txt +4 -0
memuron/sync/folder.py
ADDED
|
@@ -0,0 +1,828 @@
|
|
|
1
|
+
"""CLI-first one-way folder sync.
|
|
2
|
+
|
|
3
|
+
V0 intentionally imports local folders into Memuron collections and documents without
|
|
4
|
+
watching files, deleting remote graph nodes, or attempting a bidirectional mount.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import fnmatch
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import mimetypes
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import UTC, datetime
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Protocol
|
|
17
|
+
|
|
18
|
+
from artha_engine import ArthaEngine
|
|
19
|
+
|
|
20
|
+
from memuron.documents.parser import MAX_DOCUMENT_UPLOAD_BYTES
|
|
21
|
+
from memuron.memory.recipes import (
|
|
22
|
+
create_collection,
|
|
23
|
+
ensure_memory_projections,
|
|
24
|
+
ingest_document_source,
|
|
25
|
+
place_node_in_collection,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Manifest schema version; written into every manifest and into node metadata.
MANIFEST_VERSION = 1
# Manifest location relative to the sync root when no explicit path is given.
DEFAULT_MANIFEST_RELATIVE_PATH = ".memuron/sync-manifest.json"
# Name of the optional ignore file read from the sync root.
IGNORE_FILE_NAME = ".memuronignore"

# Patterns excluded from every scan. A trailing "/" restricts a pattern to
# directories. List order is preserved because patterns are evaluated in order.
DEFAULT_EXCLUDES = [
    # Version-control metadata.
    ".git/",
    ".hg/",
    ".svn/",
    # The directory that holds the sync manifest itself.
    ".memuron/",
    # Dependency, cache, and build-output directories.
    "node_modules/",
    "__pycache__/",
    ".pytest_cache/",
    ".mypy_cache/",
    ".ruff_cache/",
    ".next/",
    ".nuxt/",
    ".turbo/",
    ".venv/",
    "venv/",
    "dist/",
    "build/",
    "coverage/",
    # Operating-system bookkeeping files.
    ".DS_Store",
    "Thumbs.db",
    "desktop.ini",
    # The ignore file is configuration, not content.
    IGNORE_FILE_NAME,
]

# Lower-case file suffixes that a scan accepts for ingestion.
SUPPORTED_EXTENSIONS = {
    # Plain text and markup.
    ".md",
    ".markdown",
    ".text",
    ".txt",
    ".rtf",
    ".htm",
    ".html",
    # Structured data.
    ".csv",
    ".tsv",
    ".json",
    ".jsonl",
    ".xml",
    ".yaml",
    ".yml",
    # Office-style documents.
    ".docx",
    ".pdf",
    ".ppt",
    ".pptx",
    ".xls",
    ".xlsm",
    ".xlsx",
    # Images.
    ".gif",
    ".jpeg",
    ".jpg",
    ".png",
    ".webp",
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _now_iso() -> str:
|
|
87
|
+
return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _rel(path: Path, root: Path) -> str:
|
|
91
|
+
value = path.relative_to(root).as_posix()
|
|
92
|
+
return value or "."
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _normalize_root(path: str | Path) -> Path:
|
|
96
|
+
root = Path(path).expanduser().resolve()
|
|
97
|
+
if not root.exists():
|
|
98
|
+
raise ValueError(f"Folder sync path does not exist: {root}")
|
|
99
|
+
if not root.is_dir():
|
|
100
|
+
raise ValueError(f"Folder sync path must be a directory: {root}")
|
|
101
|
+
return root
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def manifest_path_for_root(root: str | Path) -> Path:
    """Return the default manifest path inside the validated sync root ``root``.

    Raises:
        ValueError: Propagated from ``_normalize_root`` when ``root`` is not an
            existing directory.
    """
    validated_root = _normalize_root(root)
    return validated_root / DEFAULT_MANIFEST_RELATIVE_PATH
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _load_ignore_file(root: Path) -> list[str]:
    """Read ignore patterns from the file named ``IGNORE_FILE_NAME`` directly under ``root``.

    Each line is stripped; blank lines and lines starting with ``#`` are dropped.

    Returns:
        The remaining patterns in file order, or an empty list when there is no
        such regular file.
    """
    ignore_file = root / IGNORE_FILE_NAME
    # is_file() rather than exists(): a directory that happens to carry the
    # ignore file's name must not make read_text() raise and abort the scan.
    if not ignore_file.is_file():
        return []
    stripped_lines = (raw.strip() for raw in ignore_file.read_text(encoding="utf-8").splitlines())
    return [line for line in stripped_lines if line and not line.startswith("#")]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _match_pattern(pattern: str, rel_path: str, *, is_dir: bool) -> bool:
|
|
122
|
+
negated = pattern.startswith("!")
|
|
123
|
+
if negated:
|
|
124
|
+
pattern = pattern[1:]
|
|
125
|
+
pattern = pattern.strip()
|
|
126
|
+
if not pattern:
|
|
127
|
+
return False
|
|
128
|
+
directory_only = pattern.endswith("/")
|
|
129
|
+
pattern = pattern.rstrip("/")
|
|
130
|
+
if directory_only and not is_dir:
|
|
131
|
+
return False
|
|
132
|
+
|
|
133
|
+
rel_path = rel_path.strip("/")
|
|
134
|
+
basename = rel_path.rsplit("/", 1)[-1]
|
|
135
|
+
parts = rel_path.split("/") if rel_path else []
|
|
136
|
+
if "/" in pattern:
|
|
137
|
+
return fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(rel_path, f"{pattern}/*")
|
|
138
|
+
return (
|
|
139
|
+
fnmatch.fnmatch(basename, pattern)
|
|
140
|
+
or any(fnmatch.fnmatch(part, pattern) for part in parts)
|
|
141
|
+
or fnmatch.fnmatch(rel_path, pattern)
|
|
142
|
+
or fnmatch.fnmatch(rel_path, f"{pattern}/*")
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _matches_any(patterns: list[str], rel_path: str, *, is_dir: bool) -> bool:
    """Evaluate ``patterns`` in order and report whether ``rel_path`` ends up selected.

    A plain pattern that matches selects the path. A ``!``-prefixed pattern
    that matches deselects it, but is only consulted while the path is
    currently selected. The last applicable pattern wins.
    """
    selected = False
    for entry in patterns:
        if entry.startswith("!"):
            if selected and _match_pattern(entry, rel_path, is_dir=is_dir):
                selected = False
        elif _match_pattern(entry, rel_path, is_dir=is_dir):
            selected = True
    return selected
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _sha256(path: Path) -> str:
|
|
159
|
+
digest = hashlib.sha256()
|
|
160
|
+
with path.open("rb") as handle:
|
|
161
|
+
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
|
162
|
+
digest.update(chunk)
|
|
163
|
+
return digest.hexdigest()
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _supported(path: Path) -> bool:
    """Report whether the lower-cased suffix of ``path`` is in ``SUPPORTED_EXTENSIONS``."""
    extension = path.suffix.lower()
    return extension in SUPPORTED_EXTENSIONS
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@dataclass(frozen=True)
class ScannedDirectory:
    """A directory that a folder scan kept."""

    # Location of the directory on disk.
    path: Path
    # POSIX-style path relative to the sync root ("." for the root itself).
    relative_path: str
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@dataclass(frozen=True)
class ScannedFile:
    """A file that a folder scan accepted, with the fingerprint taken during the scan."""

    # Location of the file on disk.
    path: Path
    # POSIX-style path relative to the sync root.
    relative_path: str
    # Hex SHA-256 digest of the file content.
    sha256: str
    # Modification time in nanoseconds, as reported by stat().
    mtime_ns: int
    # File size in bytes, as reported by stat().
    size_bytes: int
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
@dataclass(frozen=True)
class SkippedPath:
    """A file or directory that a folder scan left out, with the reason."""

    # Location of the skipped entry on disk.
    path: Path
    # POSIX-style path relative to the sync root.
    relative_path: str
    # Human-readable explanation of why the entry was skipped.
    reason: str
    # Size in bytes when it was known at skip time; None otherwise.
    size_bytes: int | None = None
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@dataclass(frozen=True)
class FolderScan:
    """Result of walking a sync root once."""

    # Resolved sync root that was walked.
    root_path: Path
    # Directories that were kept, the root included.
    directories: list[ScannedDirectory]
    # Files accepted for ingestion.
    files: list[ScannedFile]
    # Entries that were left out, each with its reason.
    skipped: list[SkippedPath]
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@dataclass
class SyncManifest:
    """Configuration and per-path state of one folder sync, as persisted to JSON."""

    # Sync root, stored as a string.
    root_path: str
    # Reference to the target space, as supplied by the caller.
    space_ref: str
    # User include and exclude patterns.
    include: list[str] = field(default_factory=list)
    exclude: list[str] = field(default_factory=list)
    # Files larger than this are skipped by the scan.
    max_file_bytes: int = MAX_DOCUMENT_UPLOAD_BYTES
    version: int = MANIFEST_VERSION
    created_at: str = field(default_factory=_now_iso)
    updated_at: str = field(default_factory=_now_iso)
    space: dict[str, Any] = field(default_factory=dict)
    # State entries keyed by relative path.
    directories: dict[str, dict[str, Any]] = field(default_factory=dict)
    files: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Timestamp and summary of the most recent run.
    last_run: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the manifest as a plain dict; values are shared, not copied."""
        layout = (
            "version",
            "root_path",
            "space_ref",
            "space",
            "include",
            "exclude",
            "max_file_bytes",
            "created_at",
            "updated_at",
            "directories",
            "files",
            "last_run",
        )
        return {key: getattr(self, key) for key in layout}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SyncManifest":
        """Build a manifest from parsed JSON, filling defaults for absent or empty fields.

        Raises:
            ValueError: If the stored version differs from ``MANIFEST_VERSION``.
            KeyError: If ``root_path`` or ``space_ref`` is missing.
        """
        found_version = int(data.get("version") or 0)
        if found_version != MANIFEST_VERSION:
            raise ValueError(f"Unsupported folder sync manifest version: {found_version}")

        def mapping(key: str) -> dict[str, Any]:
            return dict(data.get(key) or {})

        def sequence(key: str) -> list[Any]:
            return list(data.get(key) or [])

        return cls(
            version=found_version,
            root_path=str(data["root_path"]),
            space_ref=str(data["space_ref"]),
            space=mapping("space"),
            include=sequence("include"),
            exclude=sequence("exclude"),
            max_file_bytes=int(data.get("max_file_bytes") or MAX_DOCUMENT_UPLOAD_BYTES),
            created_at=str(data.get("created_at") or _now_iso()),
            updated_at=str(data.get("updated_at") or _now_iso()),
            directories=mapping("directories"),
            files=mapping("files"),
            last_run=mapping("last_run"),
        )
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
@dataclass(frozen=True)
class FilePlanItem:
    """Planned handling of one file path."""

    # POSIX-style path relative to the sync root.
    relative_path: str
    # What to do with the file, e.g. "ingest", "reingest", "unchanged" or "skipped".
    action: str
    # Human-readable explanation of the chosen action.
    reason: str
    # Scan result when the file was accepted; None for skipped files.
    scanned: ScannedFile | None = None
    # Skip record when the file was left out; None otherwise.
    skipped: SkippedPath | None = None
    # Copy of the manifest entry that existed before this run, if any.
    previous: dict[str, Any] = field(default_factory=dict)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@dataclass(frozen=True)
class DirectoryPlanItem:
    """Planned handling of one directory path."""

    # POSIX-style path relative to the sync root ("." for the root).
    relative_path: str
    # What to do with the directory, e.g. "create" or "unchanged".
    action: str
    # Scan result for the directory, when available.
    scanned: ScannedDirectory | None = None
    # Copy of the manifest entry that existed before this run, if any.
    previous: dict[str, Any] = field(default_factory=dict)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@dataclass(frozen=True)
class SyncPlan:
    """Everything a sync run intends to do, computed before anything is changed."""

    root_path: Path
    manifest: SyncManifest
    directories: list[DirectoryPlanItem]
    files: list[FilePlanItem]
    # Manifest paths that the scan no longer found; reported, never deleted remotely.
    deleted_files: list[str]
    deleted_directories: list[str]

    def summary(self) -> dict[str, int]:
        """Return per-action counts for directories and files plus deletion counts."""

        def tally(items: list[Any]) -> dict[str, int]:
            counts: dict[str, int] = {}
            for entry in items:
                counts[entry.action] = counts.get(entry.action, 0) + 1
            return counts

        per_directory = tally(self.directories)
        per_file = tally(self.files)
        report = {f"directories_{name}": per_directory.get(name, 0) for name in ("create", "unchanged")}
        report.update(
            {
                f"files_{name}": per_file.get(name, 0)
                for name in ("ingest", "reingest", "unchanged", "skipped")
            }
        )
        report["deleted_files_reported"] = len(self.deleted_files)
        report["deleted_directories_reported"] = len(self.deleted_directories)
        return report
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class FolderSyncBackend(Protocol):
    """Operations a folder sync run needs from its storage backend."""

    def create_collection(
        self,
        *,
        relative_path: str,
        name: str,
        summary: str,
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """Create a collection for a directory; the sync run reads ``"id"`` from the result."""
        ...

    def place_node(
        self,
        *,
        parent_id: str,
        child_id: str,
        name: str,
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """Place ``child_id`` under ``parent_id``; the sync run reads ``"id"`` from the result."""
        ...

    def ingest_file(
        self,
        file: ScannedFile,
        *,
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """Ingest one scanned file.

        The sync run reads ``"collection"``, ``"document"`` and
        ``"document_key"`` from the result.
        """
        ...
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
class EngineFolderSyncBackend:
    """Folder sync backend that forwards each operation to the memory recipes on one engine."""

    def __init__(
        self,
        engine: ArthaEngine,
        *,
        scope: list[str],
        event_metadata: dict[str, object] | None = None,
    ) -> None:
        """Store the engine together with private copies of ``scope`` and ``event_metadata``."""
        self.engine = engine
        # Copies keep later changes to the caller's containers from leaking in.
        self.scope = [*scope]
        self.event_metadata = dict(event_metadata or {})

    def create_collection(
        self,
        *,
        relative_path: str,
        name: str,
        summary: str,
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """Create a collection through the ``create_collection`` recipe.

        ``relative_path`` is accepted to satisfy the backend protocol and is
        not forwarded.
        """
        created = create_collection(
            self.engine,
            name=name,
            summary=summary,
            scope=self.scope,
            metadata=metadata,
            event_metadata=self.event_metadata,
        )
        return created

    def place_node(
        self,
        *,
        parent_id: str,
        child_id: str,
        name: str,
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """Place a child node under a parent through ``place_node_in_collection``."""
        placement = place_node_in_collection(
            self.engine,
            parent_id=parent_id,
            child_id=child_id,
            name=name,
            scope=self.scope,
            metadata=metadata,
            event_metadata=self.event_metadata,
        )
        return placement

    def ingest_file(
        self,
        file: ScannedFile,
        *,
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """Read the file from disk and hand it to ``ingest_document_source``.

        The content type is guessed from the file name and may be ``None``.
        """
        source = file.path
        guessed_type, _encoding = mimetypes.guess_type(source.name)
        content = source.read_bytes()
        return ingest_document_source(
            self.engine,
            file_name=source.name,
            content_type=guessed_type,
            file_bytes=content,
            scope=self.scope,
            metadata=metadata,
            event_metadata=self.event_metadata,
        )
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def scan_folder(
    root: str | Path,
    *,
    include: list[str] | None = None,
    exclude: list[str] | None = None,
    max_file_bytes: int = MAX_DOCUMENT_UPLOAD_BYTES,
) -> FolderScan:
    """Walk ``root`` once and classify every entry as kept or skipped.

    Exclude patterns are ``DEFAULT_EXCLUDES``, then the root's ignore file,
    then ``exclude``. Excluded directories are pruned and not descended into.
    Include patterns, when given, filter files only.

    A file is kept when it is not excluded, passes the include filter, is no
    larger than ``max_file_bytes``, has a supported suffix, and can be read for
    hashing. Every other file is recorded in ``skipped`` with a reason.

    Args:
        root: Directory to scan.
        include: Optional include patterns; empty or ``None`` means everything.
        exclude: Extra exclude patterns.
        max_file_bytes: Size limit for kept files.

    Returns:
        A ``FolderScan`` with directories sorted by depth then path, and files
        and skipped entries sorted by relative path.

    Raises:
        ValueError: If ``root`` is not an existing directory.
    """
    root_path = _normalize_root(root)
    includes = list(include or [])
    excludes = [*DEFAULT_EXCLUDES, *_load_ignore_file(root_path), *(exclude or [])]
    directories = [ScannedDirectory(root_path, ".")]
    files: list[ScannedFile] = []
    skipped: list[SkippedPath] = []

    for current, dir_names, file_names in root_path.walk():
        current_path = Path(current)
        kept_dirs: list[str] = []
        for dir_name in sorted(dir_names):
            child = current_path / dir_name
            rel_path = _rel(child, root_path)
            if _matches_any(excludes, rel_path, is_dir=True):
                skipped.append(SkippedPath(child, rel_path, "ignored directory"))
                continue
            # Include patterns never prune a directory: a pattern that does not
            # match the directory itself may still match one of its descendants.
            kept_dirs.append(dir_name)
            directories.append(ScannedDirectory(child, rel_path))
        # In-place assignment tells walk() which directories to descend into.
        dir_names[:] = kept_dirs

        for file_name in sorted(file_names):
            path = current_path / file_name
            rel_path = _rel(path, root_path)
            try:
                stat = path.stat()
            except OSError as exc:
                skipped.append(SkippedPath(path, rel_path, f"stat failed: {exc}"))
                continue
            if _matches_any(excludes, rel_path, is_dir=False):
                skipped.append(SkippedPath(path, rel_path, "ignored file", stat.st_size))
                continue
            if includes and not _matches_any(includes, rel_path, is_dir=False):
                skipped.append(SkippedPath(path, rel_path, "not included", stat.st_size))
                continue
            if stat.st_size > max_file_bytes:
                skipped.append(
                    SkippedPath(
                        path,
                        rel_path,
                        f"file exceeds max upload size ({max_file_bytes} bytes)",
                        stat.st_size,
                    )
                )
                continue
            if not _supported(path):
                skipped.append(SkippedPath(path, rel_path, "unsupported file type", stat.st_size))
                continue
            # Hashing opens the file. One unreadable file is reported as
            # skipped, like a failed stat, instead of aborting the scan.
            try:
                digest = _sha256(path)
            except OSError as exc:
                skipped.append(SkippedPath(path, rel_path, f"read failed: {exc}", stat.st_size))
                continue
            files.append(
                ScannedFile(
                    path=path,
                    relative_path=rel_path,
                    sha256=digest,
                    mtime_ns=stat.st_mtime_ns,
                    size_bytes=stat.st_size,
                )
            )

    # Parents sort before children, so later stages can rely on that order.
    directories.sort(key=lambda item: (item.relative_path.count("/"), item.relative_path))
    files.sort(key=lambda item: item.relative_path)
    skipped.sort(key=lambda item: item.relative_path)
    return FolderScan(root_path=root_path, directories=directories, files=files, skipped=skipped)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def init_manifest(
    path: str | Path,
    *,
    space_ref: str,
    include: list[str] | None = None,
    exclude: list[str] | None = None,
    max_file_bytes: int = MAX_DOCUMENT_UPLOAD_BYTES,
    manifest_path: str | Path | None = None,
    overwrite: bool = False,
) -> tuple[SyncManifest, Path]:
    """Create a fresh manifest for the folder at ``path`` and write it to disk.

    Args:
        path: Sync root; must be an existing directory.
        space_ref: Reference to the target space, stored as given.
        include: Include patterns to store.
        exclude: Exclude patterns to store.
        max_file_bytes: Size limit to store.
        manifest_path: Where to write the manifest; defaults to the standard
            location inside the root.
        overwrite: Allow replacing an existing manifest file.

    Returns:
        The new manifest and the path it was written to.

    Raises:
        ValueError: If the root is invalid, or the target already exists and
            ``overwrite`` is false.
    """
    root = _normalize_root(path)
    if manifest_path:
        target = Path(manifest_path).expanduser().resolve()
    else:
        target = manifest_path_for_root(root)

    already_present = target.exists()
    if already_present and not overwrite:
        raise ValueError(f"Folder sync manifest already exists: {target}")

    settings = {
        "root_path": str(root),
        "space_ref": space_ref,
        "include": list(include or []),
        "exclude": list(exclude or []),
        "max_file_bytes": max_file_bytes,
    }
    manifest = SyncManifest(**settings)
    save_manifest(manifest, target)
    return manifest, target
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def load_manifest(path: str | Path) -> SyncManifest:
    """Read the JSON manifest at ``path`` and build a ``SyncManifest`` from it.

    Raises:
        ValueError: If the top-level JSON value is not an object, or the
            manifest is rejected by ``SyncManifest.from_dict``.
    """
    manifest_file = Path(path).expanduser()
    payload = json.loads(manifest_file.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return SyncManifest.from_dict(payload)
    raise ValueError("Folder sync manifest must contain a JSON object")
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def save_manifest(manifest: SyncManifest, path: str | Path) -> None:
    """Stamp ``manifest.updated_at`` and write the manifest as JSON to ``path``.

    Missing parent directories are created. The JSON is indented, key-sorted,
    ASCII-only and ends with a newline.

    The text is first written to a sibling ``<name>.tmp`` file and then renamed
    over the target, so an interrupted write does not leave a truncated
    manifest behind.

    Args:
        manifest: Manifest to persist; its ``updated_at`` is modified.
        path: Destination file; ``~`` is expanded.
    """
    manifest.updated_at = _now_iso()
    target = Path(path).expanduser()
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest.to_dict(), indent=2, sort_keys=True, ensure_ascii=True) + "\n"

    # Same directory as the target, so the rename stays on one filesystem.
    staging = target.with_name(f"{target.name}.tmp")
    try:
        staging.write_text(payload, encoding="utf-8")
        staging.replace(target)
    finally:
        # No-op after a successful rename; removes the leftover after a failure.
        staging.unlink(missing_ok=True)
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def resolve_manifest_path(path: str | Path, manifest_path: str | Path | None = None) -> Path:
    """Work out which manifest file a command should use.

    An explicit ``manifest_path`` wins. Otherwise ``path`` is returned when it
    names a file, and is treated as a sync root with the default manifest
    location when it does not.

    Raises:
        ValueError: If ``path`` is used as a sync root but is not an existing
            directory.
    """
    explicit = manifest_path is not None
    chosen = Path(manifest_path if explicit else path).expanduser().resolve()
    if explicit or chosen.is_file():
        return chosen
    return manifest_path_for_root(chosen)
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def plan_folder_sync(manifest: SyncManifest) -> SyncPlan:
    """Scan the manifest's root and decide what a sync run would do, without changing anything.

    Directories with a recorded ``node_id`` are ``"unchanged"``, all others
    ``"create"``. A file is ``"unchanged"`` only when hash, mtime and size
    equal the manifest entry and that entry's status is ``"synced"``. It is
    ``"reingest"`` when the entry already points at a collection or document,
    and ``"ingest"`` otherwise. Skipped files become ``"skipped"`` items.
    Manifest paths that were neither scanned nor skipped are reported as
    deleted.

    Raises:
        ValueError: If the manifest's root is not an existing directory.
    """
    root = _normalize_root(manifest.root_path)
    scan = scan_folder(
        root,
        include=manifest.include,
        exclude=manifest.exclude,
        max_file_bytes=manifest.max_file_bytes,
    )
    known_dirs = manifest.directories
    known_files = manifest.files
    scanned_dir_paths = {entry.relative_path for entry in scan.directories}
    scanned_file_paths = {entry.relative_path for entry in scan.files}
    # Skip records cover directories too; only those that are files on disk count here.
    skipped_file_paths = {entry.relative_path for entry in scan.skipped if entry.path.is_file()}

    directory_plan: list[DirectoryPlanItem] = []
    for directory in scan.directories:
        prior = dict(known_dirs.get(directory.relative_path) or {})
        directory_plan.append(
            DirectoryPlanItem(
                relative_path=directory.relative_path,
                action="unchanged" if prior.get("node_id") else "create",
                scanned=directory,
                previous=prior,
            )
        )

    file_plan: list[FilePlanItem] = []
    for scanned in scan.files:
        prior = dict(known_files.get(scanned.relative_path) or {})
        same_fingerprint = (
            prior.get("sha256") == scanned.sha256
            and prior.get("mtime_ns") == scanned.mtime_ns
            and prior.get("size_bytes") == scanned.size_bytes
        )
        if same_fingerprint and prior.get("last_sync_status") == "synced":
            verdict = ("unchanged", "hash, mtime, and size unchanged")
        elif prior.get("collection_id") or prior.get("document_id"):
            verdict = ("reingest", "file changed; V0 imports a new document graph and keeps old nodes")
        else:
            verdict = ("ingest", "new supported file")
        file_plan.append(
            FilePlanItem(
                relative_path=scanned.relative_path,
                action=verdict[0],
                reason=verdict[1],
                scanned=scanned,
                previous=prior,
            )
        )

    file_plan.extend(
        FilePlanItem(
            relative_path=record.relative_path,
            action="skipped",
            reason=record.reason,
            skipped=record,
            previous=dict(known_files.get(record.relative_path) or {}),
        )
        for record in scan.skipped
        if record.relative_path in skipped_file_paths
    )
    file_plan.sort(key=lambda entry: entry.relative_path)

    missing_files = sorted(set(known_files) - scanned_file_paths - skipped_file_paths - {"."})
    missing_dirs = sorted(set(known_dirs) - scanned_dir_paths - {"."})
    return SyncPlan(
        root_path=root,
        manifest=manifest,
        directories=directory_plan,
        files=file_plan,
        deleted_files=missing_files,
        deleted_directories=missing_dirs,
    )
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _folder_metadata(root: Path, relative_path: str) -> dict[str, Any]:
    """Build the ``system.folder_sync`` metadata attached to a directory's collection."""
    details: dict[str, Any] = {
        "version": MANIFEST_VERSION,
        "kind": "directory",
        "root_path": str(root),
        "relative_path": relative_path,
    }
    return {"system": {"folder_sync": details}}
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def _file_metadata(root: Path, file: ScannedFile) -> dict[str, Any]:
    """Build the ``system.folder_sync`` metadata attached to an ingested file."""
    details: dict[str, Any] = {
        "version": MANIFEST_VERSION,
        "kind": "file",
        "root_path": str(root),
        "path": str(file.path),
        "relative_path": file.relative_path,
        "sha256": file.sha256,
        "mtime_ns": file.mtime_ns,
        "size_bytes": file.size_bytes,
    }
    return {"system": {"folder_sync": details}}
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def _parent_relative_path(relative_path: str) -> str:
|
|
632
|
+
if relative_path == "." or "/" not in relative_path:
|
|
633
|
+
return "."
|
|
634
|
+
return relative_path.rsplit("/", 1)[0]
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def _scanned_file_fields(scanned: ScannedFile, relative_path: str) -> dict[str, Any]:
    """Return the manifest fields that describe a scanned file's location and fingerprint."""
    return {
        "path": str(scanned.path),
        "relative_path": relative_path,
        "sha256": scanned.sha256,
        "mtime_ns": scanned.mtime_ns,
        "size_bytes": scanned.size_bytes,
    }


def _sync_directories(plan: SyncPlan, manifest: SyncManifest, backend: FolderSyncBackend) -> None:
    """Create collections for new directories and refresh the entries of known ones."""
    for item in plan.directories:
        scanned = item.scanned
        if scanned is None:
            continue
        entry = dict(manifest.directories.get(item.relative_path) or {})
        if item.action == "create":
            is_root = item.relative_path == "."
            collection = backend.create_collection(
                relative_path=item.relative_path,
                name=plan.root_path.name if is_root else scanned.path.name,
                summary=(
                    f"Folder sync root: {plan.root_path}"
                    if is_root
                    else f"Folder synced from {item.relative_path}"
                ),
                metadata=_folder_metadata(plan.root_path, item.relative_path),
            )
            entry.update(
                {
                    "path": str(scanned.path),
                    "relative_path": item.relative_path,
                    "node_id": collection["id"],
                    "last_sync_status": "synced",
                    "last_synced_at": _now_iso(),
                    # Clear a note left by an earlier "deleted_local" marking.
                    "last_skip_reason": None,
                }
            )
            if not is_root:
                # Plan directories are ordered parents-first, so a parent
                # created in this run is already recorded in the manifest.
                parent = manifest.directories.get(_parent_relative_path(item.relative_path)) or {}
                parent_id = parent.get("node_id")
                if parent_id:
                    placement = backend.place_node(
                        parent_id=str(parent_id),
                        child_id=str(collection["id"]),
                        name=scanned.path.name,
                        metadata={
                            "role": "folder_sync_directory",
                            "relative_path": item.relative_path,
                        },
                    )
                    entry["placement_id"] = placement["id"]
        else:
            entry.update(
                {
                    "path": str(scanned.path),
                    "relative_path": item.relative_path,
                    "last_sync_status": "synced",
                    # A directory that reappeared is no longer "missing".
                    "last_skip_reason": None,
                }
            )
        manifest.directories[item.relative_path] = entry


def _sync_files(plan: SyncPlan, manifest: SyncManifest, backend: FolderSyncBackend) -> None:
    """Ingest new or changed files and record the outcome of every planned file."""
    for item in plan.files:
        entry = dict(manifest.files.get(item.relative_path) or {})
        if item.action == "unchanged" and item.scanned is not None:
            entry.update(
                {
                    **_scanned_file_fields(item.scanned, item.relative_path),
                    "last_sync_status": "synced",
                    "last_skip_reason": None,
                }
            )
            manifest.files[item.relative_path] = entry
            continue
        if item.action == "skipped" and item.skipped is not None:
            entry.update(
                {
                    "path": str(item.skipped.path),
                    "relative_path": item.relative_path,
                    "size_bytes": item.skipped.size_bytes,
                    "last_sync_status": "skipped",
                    "last_skip_reason": item.reason,
                    "last_synced_at": _now_iso(),
                }
            )
            manifest.files[item.relative_path] = entry
            continue
        if item.scanned is None:
            continue
        try:
            payload = backend.ingest_file(
                item.scanned,
                metadata=_file_metadata(plan.root_path, item.scanned),
            )
            # Fall back to the root collection when the file's own directory
            # has no manifest entry.
            parent = (
                manifest.directories.get(_parent_relative_path(item.relative_path))
                or manifest.directories.get(".")
                or {}
            )
            parent_id = parent.get("node_id")
            placement_id = None
            collection_id = str(payload["collection"]["id"])
            if parent_id:
                placement = backend.place_node(
                    parent_id=str(parent_id),
                    child_id=collection_id,
                    name=item.scanned.path.name,
                    metadata={"role": "folder_sync_file", "relative_path": item.relative_path},
                )
                placement_id = placement["id"]
            history = list(entry.get("history") or [])
            if entry.get("collection_id") or entry.get("document_id"):
                # Keep a pointer to the superseded import; old nodes are not deleted.
                history.append(
                    {
                        "collection_id": entry.get("collection_id"),
                        "document_id": entry.get("document_id"),
                        "sha256": entry.get("sha256"),
                        "replaced_at": _now_iso(),
                    }
                )
            entry.update(
                {
                    **_scanned_file_fields(item.scanned, item.relative_path),
                    "collection_id": collection_id,
                    "document_id": str(payload["document"]["id"]),
                    "document_key": str(payload["document_key"]),
                    "placement_id": placement_id,
                    "last_sync_status": "synced",
                    "last_sync_action": item.action,
                    "last_error": None,
                    "last_skip_reason": None,
                    "last_synced_at": _now_iso(),
                    "history": history,
                }
            )
        except Exception as exc:
            # Deliberate best effort: one failing file is recorded in the
            # manifest with status "error" and the run moves on.
            entry.update(
                {
                    **_scanned_file_fields(item.scanned, item.relative_path),
                    "last_sync_status": "error",
                    "last_error": str(exc),
                    "last_synced_at": _now_iso(),
                }
            )
        manifest.files[item.relative_path] = entry


def _mark_deleted_locally(entries: dict[str, dict[str, Any]], relative_paths: list[str]) -> None:
    """Flag manifest entries whose local path is gone; nothing is removed."""
    for relative_path in relative_paths:
        entry = dict(entries.get(relative_path) or {})
        entry["last_sync_status"] = "deleted_local"
        entry["last_skip_reason"] = "local path missing; V0 does not delete remote nodes"
        entry["last_synced_at"] = _now_iso()
        entries[relative_path] = entry


def run_folder_sync(
    manifest: SyncManifest,
    *,
    backend: FolderSyncBackend,
    manifest_path: str | Path | None = None,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Plan a folder sync and, unless ``dry_run`` is set, carry it out through ``backend``.

    The run creates collections for new directories, ingests new and changed
    files, and marks manifest entries whose local path disappeared. It then
    stores a run summary on the manifest and saves the manifest when
    ``manifest_path`` is given.

    A failure while ingesting or placing a single file is written to that
    file's manifest entry and does not stop the run. Exceptions raised while
    creating or placing directory collections propagate to the caller.

    Args:
        manifest: Manifest to plan from; updated in place on a real run.
        backend: Storage operations used to create, place and ingest.
        manifest_path: Where to save the manifest; ``None`` skips saving.
        dry_run: Only plan; call neither the backend nor ``save_manifest``.

    Returns:
        A dict with ``"status"`` (``"planned"`` or ``"success"``), the plan's
        ``"summary"`` and the ``"manifest"`` as a dict.
    """
    plan = plan_folder_sync(manifest)
    if dry_run:
        return {"status": "planned", "summary": plan.summary(), "manifest": manifest.to_dict()}

    _sync_directories(plan, manifest, backend)
    _sync_files(plan, manifest, backend)
    _mark_deleted_locally(manifest.files, plan.deleted_files)
    _mark_deleted_locally(manifest.directories, plan.deleted_directories)

    manifest.last_run = {"ran_at": _now_iso(), "summary": plan.summary()}
    if manifest_path is not None:
        save_manifest(manifest, manifest_path)
    return {"status": "success", "summary": plan.summary(), "manifest": manifest.to_dict()}
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def run_engine_folder_sync(
    engine: ArthaEngine,
    manifest: SyncManifest,
    *,
    scope: list[str],
    event_metadata: dict[str, object] | None = None,
    manifest_path: str | Path | None = None,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Run a folder sync against ``engine`` using ``EngineFolderSyncBackend``.

    ``ensure_memory_projections`` is called on the engine first, including for
    dry runs. The remaining arguments and the return value are those of
    ``run_folder_sync``.
    """
    ensure_memory_projections(engine)
    return run_folder_sync(
        manifest,
        backend=EngineFolderSyncBackend(engine, scope=scope, event_metadata=event_metadata),
        manifest_path=manifest_path,
        dry_run=dry_run,
    )
|