chunksmith-adapters 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_adapters-0.3.0/PKG-INFO +21 -0
- chunksmith_adapters-0.3.0/pyproject.toml +33 -0
- chunksmith_adapters-0.3.0/setup.cfg +4 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/__init__.py +40 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/base.py +78 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/config.py +96 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/filesystem.py +50 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/mongodb.py +70 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/mvl_composite.py +173 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/mvl_mongo.py +135 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/mvl_paths.py +61 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/mvl_postgres.py +162 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/mvl_progress.py +21 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/mvl_s3.py +164 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters/s3.py +50 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters.egg-info/PKG-INFO +21 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters.egg-info/SOURCES.txt +18 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters.egg-info/dependency_links.txt +1 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters.egg-info/requires.txt +15 -0
- chunksmith_adapters-0.3.0/src/chunksmith_adapters.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chunksmith-adapters
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Optional ChunkSmith storage adapters (S3, MongoDB, PostgreSQL, MVL composite).
|
|
5
|
+
Author-email: AnshulParate2004 <anshulnparate@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/AnshulParate2004/ChunkSmith
|
|
8
|
+
Project-URL: Repository, https://github.com/AnshulParate2004/ChunkSmith
|
|
9
|
+
Project-URL: Changelog, https://github.com/AnshulParate2004/ChunkSmith/blob/main/CHANGELOG.md
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: chunksmith-core>=0.3.0
|
|
12
|
+
Provides-Extra: s3
|
|
13
|
+
Requires-Dist: boto3>=1.35.0; extra == "s3"
|
|
14
|
+
Provides-Extra: mongo
|
|
15
|
+
Requires-Dist: pymongo>=4.10.0; extra == "mongo"
|
|
16
|
+
Provides-Extra: postgres
|
|
17
|
+
Requires-Dist: psycopg[binary]>=3.2.0; extra == "postgres"
|
|
18
|
+
Provides-Extra: mvl
|
|
19
|
+
Requires-Dist: boto3>=1.35.0; extra == "mvl"
|
|
20
|
+
Requires-Dist: pymongo>=4.10.0; extra == "mvl"
|
|
21
|
+
Requires-Dist: psycopg[binary]>=3.2.0; extra == "mvl"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chunksmith-adapters"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "Optional ChunkSmith storage adapters (S3, MongoDB, PostgreSQL, MVL composite)."
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [{ name = "AnshulParate2004", email = "anshulnparate@gmail.com" }]
|
|
12
|
+
dependencies = [
|
|
13
|
+
"chunksmith-core>=0.3.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
s3 = ["boto3>=1.35.0"]
|
|
18
|
+
mongo = ["pymongo>=4.10.0"]
|
|
19
|
+
postgres = ["psycopg[binary]>=3.2.0"]
|
|
20
|
+
mvl = [
|
|
21
|
+
"boto3>=1.35.0",
|
|
22
|
+
"pymongo>=4.10.0",
|
|
23
|
+
"psycopg[binary]>=3.2.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/AnshulParate2004/ChunkSmith"
|
|
28
|
+
Repository = "https://github.com/AnshulParate2004/ChunkSmith"
|
|
29
|
+
Changelog = "https://github.com/AnshulParate2004/ChunkSmith/blob/main/CHANGELOG.md"
|
|
30
|
+
|
|
31
|
+
[tool.setuptools.packages.find]
|
|
32
|
+
where = ["src"]
|
|
33
|
+
include = ["chunksmith_adapters", "chunksmith_adapters.*"]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Optional storage adapters for ChunkSmith pipelines."""
|
|
2
|
+
|
|
3
|
+
from chunksmith_adapters.base import BasePipelineStorage, ProgressRecord
|
|
4
|
+
from chunksmith_adapters.config import (
|
|
5
|
+
FilesystemStorageConfig,
|
|
6
|
+
MongoStorageConfig,
|
|
7
|
+
S3StorageConfig,
|
|
8
|
+
StorageConfig,
|
|
9
|
+
build_storage,
|
|
10
|
+
)
|
|
11
|
+
from chunksmith_adapters.filesystem import FilesystemArtifactSink, FilesystemStorage
|
|
12
|
+
from chunksmith_adapters.mvl_composite import MvlCompositeStorage, MvlStorageConfig
|
|
13
|
+
|
|
14
|
+
# Public names that are importable no matter which optional extras are installed.
# Optional adapters are appended further down once their import has succeeded.
__all__ = [
    "BasePipelineStorage",
    "FilesystemArtifactSink", "FilesystemStorage", "FilesystemStorageConfig",
    "MongoStorageConfig",
    "MvlCompositeStorage", "MvlStorageConfig",
    "ProgressRecord",
    "S3StorageConfig", "StorageConfig",
    "build_storage",
]
|
|
27
|
+
|
|
28
|
+
# Optional adapters: when the import fails the name is still bound (to None)
# so attribute access on the package does not raise, but it is not exported.
try:
    from chunksmith_adapters.s3 import S3Storage
except ImportError:
    S3Storage = None  # type: ignore[misc, assignment]
else:
    __all__.append("S3Storage")

try:
    from chunksmith_adapters.mongodb import MongoStorage
except ImportError:
    MongoStorage = None  # type: ignore[misc, assignment]
else:
    __all__.append("MongoStorage")
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Shared progressive storage base."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class ProgressRecord:
    """A single progress event together with the time it was recorded."""

    # Event name as reported by the pipeline.
    event: str
    # Data attached to the event.
    payload: dict[str, Any]
    # ISO-8601 UTC timestamp, filled in at construction time when not supplied.
    at: str = field(default_factory=_utc_now_iso)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BasePipelineStorage:
    """
    Common machinery for storage adapters that persist artifacts as a run progresses.

    Concrete adapters override ``_write_blob``; everything else (JSON encoding,
    the progress log and the run metadata) is handled here.
    """

    def __init__(self, *, document_id: str, run_id: str | None = None) -> None:
        """Start a run for ``document_id``; a falsy ``run_id`` is replaced by a UTC timestamp."""
        if not run_id:
            run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        self.document_id = document_id
        self.run_id = run_id
        self.progress_log: list[ProgressRecord] = []
        self._meta: dict[str, Any] = dict(
            document_id=document_id,
            run_id=run_id,
            status="running",
            updated_at=None,
        )

    def on_progress(self, event: str, payload: dict[str, Any]) -> None:
        """Record a progress event and immediately persist the log and metadata."""
        # The payload is shallow-copied so later caller mutations of the
        # top-level dict do not alter the stored record.
        record = ProgressRecord(event=event, payload=dict(payload))
        self.progress_log.append(record)
        self._meta.update(
            last_event=event,
            updated_at=datetime.now(timezone.utc).isoformat(),
        )
        self._persist_progress()
        logger.debug("[chunksmith storage] %s document_id=%s %s", event, self.document_id, payload)

    def save_elements(self, elements: list[dict[str, Any]]) -> None:
        """Write ``elements.json`` and remember the element count in the metadata."""
        self._write_json("elements.json", elements)
        self._meta["element_count"] = len(elements)

    def save_coded_formate(self, text: str) -> None:
        """Write ``coded_formate.txt`` as UTF-8 plain text."""
        encoded = text.encode("utf-8")
        self._write_blob("coded_formate.txt", encoded, "text/plain")

    def save_outline(self, outline: dict[str, Any]) -> None:
        """Write ``outline.json``."""
        self._write_json("outline.json", outline)

    def save_mapper(self, mapper: dict[str, Any]) -> None:
        """Write ``mapper.json``."""
        self._write_json("mapper.json", mapper)

    def finalize(self) -> None:
        """Mark the run as completed and write the final metadata and progress log."""
        self._meta.update(
            status="completed",
            updated_at=datetime.now(timezone.utc).isoformat(),
        )
        self._write_json("meta.json", self._meta)
        self._write_json("progress.json", [vars(rec) for rec in self.progress_log])

    def _persist_progress(self) -> None:
        """Write the current progress log, then the current metadata."""
        self._write_json("progress.json", [vars(rec) for rec in self.progress_log])
        self._write_json("meta.json", self._meta)

    def _write_json(self, name: str, data: Any) -> None:
        """Serialize ``data`` as indented UTF-8 JSON and hand it to ``_write_blob``."""
        text = json.dumps(data, ensure_ascii=False, indent=2)
        self._write_blob(name, text.encode("utf-8"), "application/json")

    def _write_blob(self, name: str, data: bytes, content_type: str) -> None:
        """Persist raw bytes under ``name``; must be overridden by subclasses."""
        raise NotImplementedError

    def _key(self, name: str) -> str:
        """Return the ``document_id/run_id/name`` key for an artifact."""
        return "/".join((f"{self.document_id}", f"{self.run_id}", f"{name}"))
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Class-based storage configuration — no environment variables."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Union
|
|
7
|
+
|
|
8
|
+
from chunksmith_core.ports import PipelineStorage
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class FilesystemStorageConfig:
    """Settings for local-disk storage; artifacts land in ``{root}/{document_id}/{run_id}/``."""

    # Base directory for all runs.
    root: str = "./chunksmith-runs"
    # Identifier of the document being processed.
    document_id: str = "default"

    def build(self) -> PipelineStorage:
        """Create the filesystem storage adapter described by this config."""
        # The adapter module is imported at call time, not at module import.
        from chunksmith_adapters import filesystem

        return filesystem.FilesystemStorage(self.root, document_id=self.document_id)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class S3StorageConfig:
    """
    Generic S3 progressive storage (simple prefix layout).

    A dedicated boto3 client is created when explicit credentials (both
    ``access_key_id`` and ``secret_access_key``) and/or an ``endpoint_url`` are
    given. Otherwise no client is passed and ``S3Storage`` receives only the
    region.
    """

    bucket: str
    document_id: str
    prefix: str = "chunksmith/"
    region: str | None = None
    access_key_id: str = ""
    secret_access_key: str = ""
    endpoint_url: str | None = None

    def _client_kwargs(self) -> dict[str, str]:
        """
        Return the keyword arguments for ``boto3.client("s3", ...)``.

        The result is empty when neither a complete credential pair nor an
        endpoint URL is configured, which means "do not build an explicit
        client". A lone key (only one half of the pair) is ignored.
        """
        kwargs: dict[str, str] = {}
        if self.access_key_id and self.secret_access_key:
            kwargs["aws_access_key_id"] = self.access_key_id
            kwargs["aws_secret_access_key"] = self.secret_access_key
        # Previously the endpoint was honoured only together with explicit
        # credentials, so a custom endpoint combined with ambient credentials
        # was silently dropped.
        if self.endpoint_url:
            kwargs["endpoint_url"] = self.endpoint_url
        # The region alone never triggers an explicit client; it is forwarded
        # to S3Storage separately in build().
        if kwargs and self.region:
            kwargs["region_name"] = self.region
        return kwargs

    def build(self) -> PipelineStorage:
        """
        Create the S3 storage adapter described by this config.

        Raises:
            ImportError: if an explicit client is needed but boto3 is missing.
        """
        from chunksmith_adapters.s3 import S3Storage

        client = None
        client_kwargs = self._client_kwargs()
        if client_kwargs:
            # Only the import is guarded, so errors raised while constructing
            # the client are not mistaken for a missing dependency.
            try:
                import boto3
            except ImportError as exc:
                raise ImportError(
                    "S3Storage requires boto3. Install with: pip install 'chunksmith-adapters[s3]'"
                ) from exc
            client = boto3.client("s3", **client_kwargs)

        return S3Storage(
            bucket=self.bucket,
            prefix=self.prefix,
            document_id=self.document_id,
            region=self.region,
            client=client,
        )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class MongoStorageConfig:
    """Settings for the generic Mongo adapter, which keeps one document per run."""

    # MongoDB connection string.
    uri: str
    # Identifier of the document being processed.
    document_id: str
    # Target database and collection names.
    database: str = "chunksmith"
    collection: str = "indexes"

    def build(self) -> PipelineStorage:
        """Create the Mongo storage adapter described by this config."""
        # The adapter module is imported at call time, not at module import.
        from chunksmith_adapters import mongodb

        options = {
            "uri": self.uri,
            "database": self.database,
            "collection": self.collection,
            "document_id": self.document_id,
        }
        return mongodb.MongoStorage(**options)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Any of the config dataclasses accepted by ``build_storage``.
StorageConfig = Union["FilesystemStorageConfig", "S3StorageConfig", "MongoStorageConfig"]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def build_storage(config: StorageConfig) -> PipelineStorage:
    """Return the storage adapter produced by ``config.build()``."""
    storage = config.build()
    return storage
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Local filesystem progressive storage."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from chunksmith_adapters.base import BasePipelineStorage
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FilesystemArtifactSink:
    """Write indexing artifacts to a local directory (post-run via ``persist_*`` helpers)."""

    def __init__(self, root: str | Path) -> None:
        """Remember ``root`` as a ``Path`` and create the directory if needed."""
        target = Path(root)
        target.mkdir(parents=True, exist_ok=True)
        self.root = target

    def _dump_json(self, filename: str, data: Any) -> Path:
        """Write ``data`` as indented UTF-8 JSON to ``root/filename`` and return the path."""
        destination = self.root / filename
        serialized = json.dumps(data, ensure_ascii=False, indent=2)
        destination.write_text(serialized, encoding="utf-8")
        return destination

    def save_elements(self, elements: list[dict[str, Any]]) -> Path:
        """Write ``elements.json`` and return its path."""
        return self._dump_json("elements.json", elements)

    def save_coded_formate(self, text: str) -> Path:
        """Write ``coded_formate.txt`` as UTF-8 text and return its path."""
        destination = self.root / "coded_formate.txt"
        destination.write_text(text, encoding="utf-8")
        return destination

    def save_outline(self, outline: dict[str, Any]) -> Path:
        """Write ``outline.json`` and return its path."""
        return self._dump_json("outline.json", outline)

    def save_mapper(self, mapper: dict[str, Any]) -> Path:
        """Write ``mapper.json`` and return its path."""
        return self._dump_json("mapper.json", mapper)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class FilesystemStorage(BasePipelineStorage):
    """Progressive adapter that writes each artifact to ``{root}/{document_id}/{run_id}/``."""

    def __init__(self, root: str | Path, *, document_id: str) -> None:
        """Initialise the base storage for ``document_id`` and remember ``root`` as a ``Path``."""
        super().__init__(document_id=document_id)
        self.root = Path(root)

    def _write_blob(self, name: str, data: bytes, content_type: str) -> None:
        """Write ``data`` to the file for ``name``; ``content_type`` is not used on disk."""
        destination = self.root.joinpath(self._key(name))
        # The run directory is created on demand before each write.
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(data)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""MongoDB progressive storage (optional pymongo)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from chunksmith_adapters.base import BasePipelineStorage
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MongoStorage(BasePipelineStorage):
    """
    Upsert one document per run; updates fields as pipeline stages complete.

    The document is keyed by ``document_id`` and ``run_id``. Each artifact is
    stored in a field named after the artifact with dots replaced by
    underscores (``outline.json`` becomes ``outline_json``).

    Requires: ``pip install 'chunksmith-adapters[mongo]'`` or ``pip install pymongo``
    """

    def __init__(
        self,
        *,
        uri: str,
        database: str,
        collection: str,
        document_id: str,
        client: Any | None = None,
    ) -> None:
        """
        Connect (or reuse ``client``) and upsert the run document with status ``running``.

        Args:
            uri: MongoDB connection string; only required when ``client`` is not given.
            database: Database name.
            collection: Collection name.
            document_id: Identifier of the document being processed.
            client: Optional pre-built client supporting ``client[database][collection]``.

        Raises:
            ValueError: if neither ``uri`` nor ``client`` is provided.
            ImportError: if a client must be created but pymongo is not installed.
        """
        # A URI is only needed when this class has to create the client itself.
        if client is None and not uri:
            raise ValueError("MongoStorage requires uri=")
        super().__init__(document_id=document_id)
        self._db_name = database
        self._collection_name = collection
        # Compare against None explicitly: an injected client that evaluates
        # as falsy (e.g. an empty dict-like fake) must not be discarded.
        self._client = client if client is not None else self._make_client(uri)
        self._collection = self._client[database][collection]
        self._doc_key = {"document_id": document_id, "run_id": self.run_id}
        self._collection.update_one(self._doc_key, {"$set": {**self._doc_key, "status": "running"}}, upsert=True)

    @staticmethod
    def _make_client(uri: str):
        """Create a ``pymongo.MongoClient`` for ``uri``, with an install hint if pymongo is missing."""
        try:
            from pymongo import MongoClient
        except ImportError as exc:
            raise ImportError(
                "MongoStorage requires pymongo. Install with: pip install 'chunksmith-adapters[mongo]'"
            ) from exc
        return MongoClient(uri)

    def _write_blob(self, name: str, data: bytes, content_type: str) -> None:
        """Store an artifact as a field of the run document (JSON is stored decoded)."""
        field_name = name.replace(".", "_")
        if content_type == "application/json":
            value: Any = json.loads(data.decode("utf-8"))
        else:
            value = data.decode("utf-8")
        self._collection.update_one(
            self._doc_key,
            {"$set": {field_name: value, "updated_at": self._meta.get("updated_at"), "status": "running"}},
            upsert=True,
        )

    def finalize(self) -> None:
        """Write the final artifacts, then mark the run document as completed."""
        # The base finalize goes through _write_blob, which sets status back
        # to "running"; the update below must therefore come last.
        super().finalize()
        self._collection.update_one(
            self._doc_key,
            {
                "$set": {
                    "status": "completed",
                    "meta": self._meta,
                    "progress": [r.__dict__ for r in self.progress_log],
                }
            },
        )
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Unified MVL storage: S3 + MongoDB + PostgreSQL with graceful partial config."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any, Literal
|
|
8
|
+
|
|
9
|
+
from chunksmith_adapters.mvl_mongo import MvlMongoStorage
|
|
10
|
+
from chunksmith_adapters.mvl_postgres import MvlPostgresStorage
|
|
11
|
+
from chunksmith_adapters.mvl_s3 import MvlS3Storage
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
ContextFormat = Literal["json", "toon"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class MvlStorageConfig:
    """
    Single configuration object covering every MVL backend.

    Fill in only the backends you have; ``MvlCompositeStorage.from_config``
    logs a warning for each backend left unset and skips it. For instance, with
    S3 and Postgres set and Mongo omitted, data goes to S3 and Postgres only.
    """

    user_id: str
    project_id: str
    document_id: str
    mapping_method: str = "page_indexing"
    context_format: ContextFormat = "json"
    indexing_mode: str = "chunksmith_multi_indexing"

    # S3 settings; the backend is enabled when ``s3_bucket`` is set.
    s3_bucket: str | None = None
    s3_access_key_id: str = ""
    s3_secret_access_key: str = ""
    s3_endpoint_url: str | None = None
    s3_region: str | None = None

    # MongoDB settings; the backend is enabled when ``mongo_uri`` is set.
    mongo_uri: str | None = None
    mongo_database: str = "chunksmith"
    mongo_collection: str = "document_indexes"

    # PostgreSQL settings; the backend is enabled when ``postgres_url`` is set.
    postgres_url: str | None = None

    # Filled in by ``MvlCompositeStorage.from_config``; not constructor arguments.
    warnings: list[str] = field(default_factory=list, init=False, repr=False)
    active_backends: list[str] = field(default_factory=list, init=False, repr=False)

    def build(self) -> "MvlCompositeStorage":
        """Create the composite storage for the backends configured here."""
        composite = MvlCompositeStorage.from_config(self)
        return composite
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class MvlCompositeStorage:
    """Forward every storage call to each configured MVL backend, in a fixed order."""

    def __init__(
        self,
        *,
        s3: MvlS3Storage | None,
        mongo: MvlMongoStorage | None,
        postgres: MvlPostgresStorage | None,
        warnings: list[str],
        active_backends: list[str],
    ) -> None:
        """Store the backends (``None`` means "not configured") and the build report."""
        self._s3, self._mongo, self._postgres = s3, mongo, postgres
        self.warnings = warnings
        self.active_backends = active_backends

    @classmethod
    def from_config(cls, config: MvlStorageConfig) -> "MvlCompositeStorage":
        """
        Build the backends that ``config`` enables and report the ones it does not.

        The skip messages and the active backend names are copied onto
        ``config.warnings`` / ``config.active_backends`` and logged.

        Raises:
            ValueError: if no backend at all is configured.
        """
        skipped: list[str] = []
        enabled: list[str] = []

        s3_backend: MvlS3Storage | None = None
        if not config.s3_bucket:
            skipped.append("S3 not configured — skipping S3 storage.")
        else:
            s3_backend = MvlS3Storage(
                user_id=config.user_id,
                project_id=config.project_id,
                document_id=config.document_id,
                bucket=config.s3_bucket,
                mapping_method=config.mapping_method,
                context_format=config.context_format,
                access_key_id=config.s3_access_key_id,
                secret_access_key=config.s3_secret_access_key,
                endpoint_url=config.s3_endpoint_url,
                region=config.s3_region,
            )
            enabled.append("S3")

        mongo_backend: MvlMongoStorage | None = None
        if not config.mongo_uri:
            skipped.append("MongoDB not configured — skipping MongoDB storage.")
        else:
            mongo_backend = MvlMongoStorage(
                uri=config.mongo_uri,
                user_id=config.user_id,
                project_id=config.project_id,
                document_id=config.document_id,
                database=config.mongo_database,
                collection=config.mongo_collection,
            )
            enabled.append("MongoDB")

        postgres_backend: MvlPostgresStorage | None = None
        if not config.postgres_url:
            skipped.append("PostgreSQL not configured — skipping PostgreSQL storage.")
        else:
            postgres_backend = MvlPostgresStorage(
                database_url=config.postgres_url,
                user_id=config.user_id,
                project_id=config.project_id,
                document_id=config.document_id,
                indexing_mode=config.indexing_mode,
            )
            enabled.append("PostgreSQL")

        if len(enabled) == 0:
            raise ValueError("No storage backend configured. Set at least one of: s3_bucket, mongo_uri, postgres_url.")

        # The config receives copies, so it does not share list objects with
        # the storage instance returned below.
        config.warnings = list(skipped)
        config.active_backends = list(enabled)

        enabled_text = ", ".join(enabled)
        for note in skipped:
            logger.warning("%s Will store using: %s.", note, enabled_text)

        logger.info(
            "MVL composite storage active backends: %s (document_id=%s)",
            enabled_text,
            config.document_id,
        )

        return cls(
            s3=s3_backend,
            mongo=mongo_backend,
            postgres=postgres_backend,
            warnings=skipped,
            active_backends=enabled,
        )

    def _call(self, method: str, *args: Any) -> None:
        """
        Invoke ``method(*args)`` on S3, MongoDB and PostgreSQL, in that order.

        Backends that are ``None`` or lack the method are skipped. The first
        exception is logged and re-raised, so later backends are not called.
        """
        backends = {
            "S3": self._s3,
            "MongoDB": self._mongo,
            "PostgreSQL": self._postgres,
        }
        for label, target in backends.items():
            handler = None if target is None else getattr(target, method, None)
            if handler is None:
                continue
            try:
                handler(*args)
            except Exception as exc:
                logger.error("%s storage.%s failed: %s", label, method, exc)
                raise

    def on_progress(self, event: str, payload: dict[str, Any]) -> None:
        """Forward a progress event to every backend."""
        self._call("on_progress", event, payload)

    def save_elements(self, elements: list[dict[str, Any]]) -> None:
        """Forward the extracted elements to every backend."""
        self._call("save_elements", elements)

    def save_coded_formate(self, text: str) -> None:
        """Forward the coded-format text to every backend."""
        self._call("save_coded_formate", text)

    def save_outline(self, outline: dict[str, Any]) -> None:
        """Forward the outline to every backend."""
        self._call("save_outline", outline)

    def save_mapper(self, mapper: dict[str, Any]) -> None:
        """Forward the mapper output to every backend."""
        self._call("save_mapper", mapper)

    def finalize(self) -> None:
        """Ask every backend to finalize the run."""
        self._call("finalize")
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""MVL Backend-style MongoDB storage (scoped by user_id / project_id / document_id)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from chunksmith_adapters.base import ProgressRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class MvlMongoStorageConfig:
    """
    Connection settings for storage in the MVL ``MongoDocumentStore`` document_indexes layout.

    Each run gets one document in the ``document_indexes`` collection, which is
    updated as the pipeline progresses.
    """

    uri: str
    user_id: str
    project_id: str
    document_id: str
    database: str = "chunksmith"
    collection: str = "document_indexes"

    def build(self) -> "MvlMongoStorage":
        """Create the ``MvlMongoStorage`` described by this config."""
        options = {
            "uri": self.uri,
            "user_id": self.user_id,
            "project_id": self.project_id,
            "document_id": self.document_id,
            "database": self.database,
            "collection": self.collection,
        }
        return MvlMongoStorage(**options)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class MvlMongoStorage:
    """
    Progressive Mongo upserts — same scope fields as MVL Backend indexing context.

    One document per run, matched on ``user_id``, ``project_id``,
    ``document_id`` and ``run_id``, is updated as the pipeline progresses.
    """

    def __init__(
        self,
        *,
        uri: str,
        user_id: str,
        project_id: str,
        document_id: str,
        database: str = "chunksmith",
        collection: str = "document_indexes",
        client: Any | None = None,
    ) -> None:
        """
        Connect (or reuse ``client``) and upsert the run document with status ``running``.

        Args:
            uri: MongoDB connection string; only required when ``client`` is not given.
            user_id: Owner scope stored on the document.
            project_id: Project scope stored on the document.
            document_id: Identifier of the document being processed.
            database: Database name.
            collection: Collection name.
            client: Optional pre-built client supporting ``client[database][collection]``.

        Raises:
            ValueError: if neither ``uri`` nor ``client`` is provided.
            ImportError: if a client must be created but pymongo is not installed.
        """
        # A URI is only needed when this class has to create the client itself.
        if client is None and not uri:
            raise ValueError("MvlMongoStorage requires uri=")
        self.user_id = user_id
        self.project_id = project_id
        self.document_id = document_id
        self.run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        self.progress_log: list[ProgressRecord] = []
        # Compare against None explicitly: an injected client that evaluates
        # as falsy (e.g. an empty dict-like fake) must not be discarded.
        self._client = client if client is not None else self._make_client(uri)
        self._collection = self._client[database][collection]
        self._filter = {
            "user_id": user_id,
            "project_id": project_id,
            "document_id": document_id,
            "run_id": self.run_id,
        }
        self._collection.update_one(
            self._filter,
            {
                "$set": {
                    **self._filter,
                    "status": "running",
                    "updated_at": datetime.now(timezone.utc).isoformat(),
                }
            },
            upsert=True,
        )

    @staticmethod
    def _make_client(uri: str):
        """Create a ``pymongo.MongoClient`` for ``uri``, with an install hint if pymongo is missing."""
        try:
            from pymongo import MongoClient
        except ImportError as exc:
            raise ImportError(
                "MvlMongoStorage requires pymongo. Install with: pip install 'chunksmith-adapters[mongo]'"
            ) from exc
        return MongoClient(uri)

    def on_progress(self, event: str, payload: dict[str, Any]) -> None:
        """Append the event to the in-memory log and write the whole log to the run document."""
        self.progress_log.append(ProgressRecord(event=event, payload=dict(payload)))
        self._collection.update_one(
            self._filter,
            {
                "$set": {
                    "last_event": event,
                    "progress": [r.__dict__ for r in self.progress_log],
                    "updated_at": datetime.now(timezone.utc).isoformat(),
                }
            },
        )

    def save_elements(self, elements: list[dict[str, Any]]) -> None:
        """Store the extracted elements in the ``elements`` field."""
        self._set_field("elements", elements)

    def save_coded_formate(self, text: str) -> None:
        """Store the coded-format text in the ``title_coded_formate`` field."""
        self._set_field("title_coded_formate", text)

    def save_outline(self, outline: dict[str, Any]) -> None:
        """Store the outline in the ``outline`` field."""
        self._set_field("outline", outline)

    def save_mapper(self, mapper: dict[str, Any]) -> None:
        """Store the mapper output in the ``mapper_output`` field."""
        self._set_field("mapper_output", mapper)

    def finalize(self) -> None:
        """Mark the run document as completed."""
        self._collection.update_one(
            self._filter,
            {
                "$set": {
                    "status": "completed",
                    "updated_at": datetime.now(timezone.utc).isoformat(),
                }
            },
        )

    def _set_field(self, name: str, value: Any) -> None:
        """Set one field on the run document and refresh ``updated_at`` and ``status``."""
        self._collection.update_one(
            self._filter,
            {
                "$set": {
                    name: value,
                    "updated_at": datetime.now(timezone.utc).isoformat(),
                    "status": "running",
                }
            },
        )
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""MVL S3 object key layout."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
ContextFormat = Literal["json", "toon"]
|
|
8
|
+
|
|
9
|
+
# Mapping methods accepted by ``mapper_json_object_key``.
_VALID_MAPPING_METHODS = frozenset(
    (
        "page_indexing",
        "title_indexing",
        "chunk_assignment_indexing",
        "group_indexing",
        "anchor_indexing",
    )
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def project_prefix(user_id: str, project_id: str) -> str:
    """
    Return the ``{user_id}/{project_id}`` prefix shared by all object keys.

    Surrounding whitespace is stripped from each id first, then leading and
    trailing slashes.

    Raises:
        ValueError: if either id is empty after cleaning.
    """
    cleaned = [(raw or "").strip().strip("/") for raw in (user_id, project_id)]
    if not all(cleaned):
        raise ValueError("user_id and project_id are required for storage paths")
    return "/".join(cleaned)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _scoped(prefix: str, *parts: str) -> str:
    """Join ``prefix`` and ``parts`` into a single ``/``-separated key."""
    segments = [prefix]
    segments.extend(parts)
    return "/".join(segments)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def json_index_object_key(user_id: str, project_id: str, document_id: str) -> str:
    """Return the key ``{user}/{project}/json/{document_id}_index.json``."""
    prefix = project_prefix(user_id, project_id)
    filename = f"{document_id}_index.json"
    return _scoped(prefix, "json", filename)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def mapper_json_object_key(
    document_id: str,
    mapping_method: str,
    *,
    user_id: str,
    project_id: str,
    context_format: ContextFormat,
) -> str:
    """
    Return the key ``{user}/{project}/json/{document_id}_{method}_{format}.json``.

    ``mapping_method`` is stripped and lower-cased before validation. Any
    ``context_format`` other than ``"json"`` or ``"toon"`` falls back to ``"json"``.

    Raises:
        ValueError: if the mapping method is unknown, or if ``project_prefix``
            rejects the ids.
    """
    normalized = (mapping_method or "").strip().lower()
    if normalized not in _VALID_MAPPING_METHODS:
        raise ValueError(f"unknown mapping_method: {mapping_method!r}")

    if context_format in ("json", "toon"):
        suffix = context_format
    else:
        suffix = "json"

    filename = f"{document_id}_{normalized}_{suffix}.json"
    prefix = project_prefix(user_id, project_id)
    return _scoped(prefix, "json", filename)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def text_coded_formate_object_key(user_id: str, project_id: str, document_id: str) -> str:
    """Return the key ``{user}/{project}/text/{document_id}_title_coded_formate.txt``."""
    prefix = project_prefix(user_id, project_id)
    filename = f"{document_id}_title_coded_formate.txt"
    return _scoped(prefix, "text", filename)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def elements_json_object_key(user_id: str, project_id: str, document_id: str) -> str:
    """Return the key ``{user}/{project}/json/{document_id}_elements.json``."""
    prefix = project_prefix(user_id, project_id)
    filename = f"{document_id}_elements.json"
    return _scoped(prefix, "json", filename)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def progress_json_object_key(user_id: str, project_id: str, document_id: str, run_id: str) -> str:
    """Return the key ``{user}/{project}/json/{document_id}_{run_id}_progress.json``."""
    prefix = project_prefix(user_id, project_id)
    filename = f"{document_id}_{run_id}_progress.json"
    return _scoped(prefix, "json", filename)
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""MVL Backend-style PostgreSQL progress + document status (optional psycopg)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from chunksmith_adapters.mvl_progress import progress_tuple
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class MvlPostgresStorageConfig:
    """Connection settings for the MVL PostgreSQL tables (document status + processing_jobs)."""

    database_url: str
    user_id: str
    project_id: str
    document_id: str
    indexing_mode: str = "chunksmith_multi_indexing"

    def is_configured(self) -> bool:
        """Return True when ``database_url`` contains something other than whitespace."""
        url = self.database_url or ""
        return len(url.strip()) > 0

    def build(self) -> "MvlPostgresStorage":
        """
        Create the ``MvlPostgresStorage`` described by this config.

        Raises:
            ValueError: if ``database_url`` is empty or blank.
        """
        if self.is_configured():
            return MvlPostgresStorage(
                database_url=self.database_url,
                user_id=self.user_id,
                project_id=self.project_id,
                document_id=self.document_id,
                indexing_mode=self.indexing_mode,
            )
        raise ValueError("PostgreSQL database_url is required when building MvlPostgresStorage")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class MvlPostgresStorage:
    """
    Update MVL ``documents`` and ``processing_jobs`` tables during pipeline progress.

    Does not store artifact blobs (those go to S3 / Mongo); the ``save_*``
    hooks are intentionally no-ops.

    Connection ownership: a connection passed as ``conn`` belongs to the
    caller. It is committed after every write but never closed here, so it
    stays usable across events. When no connection is injected, each write
    opens one from ``database_url`` and closes it afterwards.
    """

    # Upsert of the job row for a document. ``ON CONFLICT (document_id)``
    # relies on ``processing_jobs.document_id`` carrying a unique constraint.
    _UPSERT_JOB_SQL = """
        INSERT INTO processing_jobs (document_id, project_id, status, progress, message, step, indexing_mode, updated_at)
        VALUES (%s, %s, 'running', %s, %s, %s, %s, %s)
        ON CONFLICT (document_id) DO UPDATE SET
            status = EXCLUDED.status,
            progress = EXCLUDED.progress,
            message = EXCLUDED.message,
            step = EXCLUDED.step,
            indexing_mode = EXCLUDED.indexing_mode,
            updated_at = EXCLUDED.updated_at
    """

    # Payload keys that are echoed into the human-readable job message.
    _DETAIL_KEYS = frozenset({"chunk_count", "elements", "chunks", "nodes"})

    def __init__(
        self,
        *,
        database_url: str,
        user_id: str,
        project_id: str,
        document_id: str,
        indexing_mode: str = "chunksmith_multi_indexing",
        conn: Any | None = None,
    ) -> None:
        """Store the connection settings and document identifiers.

        Args:
            database_url: connection string handed to ``psycopg.connect``
                when no ``conn`` is injected.
            user_id: stored on the instance; not written by this class.
            project_id: written to ``processing_jobs.project_id``.
            document_id: key of the job row and of the ``documents`` row.
            indexing_mode: value written to both tables.
            conn: optional caller-owned connection to reuse for every write.
        """
        self.database_url = database_url
        self.user_id = user_id
        self.project_id = project_id
        self.document_id = document_id
        self.indexing_mode = indexing_mode
        self._conn = conn

    def _connect(self):
        """Return the injected connection, or open a new psycopg connection.

        Raises:
            ImportError: if psycopg is not installed and no connection was injected.
        """
        if self._conn is not None:
            return self._conn
        try:
            import psycopg
        except ImportError as exc:
            raise ImportError(
                "PostgreSQL storage requires psycopg. Install with: pip install 'chunksmith-adapters[postgres]'"
            ) from exc
        return psycopg.connect(self.database_url)

    def _execute(self, statements: list[tuple[str, tuple[Any, ...]]]) -> None:
        """Run ``(sql, params)`` pairs in one transaction and commit.

        The connection is deliberately not used as a context manager:
        psycopg 3 closes a connection when its ``with`` block exits, which
        would break a caller-injected connection after the first write.
        """
        owned = self._conn is None
        conn = self._connect()
        try:
            with conn.cursor() as cur:
                for sql, params in statements:
                    cur.execute(sql, params)
            conn.commit()
        except Exception:
            if not owned:
                # Leave the caller's connection in a usable state.
                conn.rollback()
            raise
        finally:
            if owned:
                conn.close()

    def _upsert_job(self, *, step: Any, pct: Any, message: str) -> None:
        """Insert or refresh this document's ``processing_jobs`` row as 'running'."""
        now = datetime.now(timezone.utc)
        params = (self.document_id, self.project_id, pct, message, step, self.indexing_mode, now)
        self._execute([(self._UPSERT_JOB_SQL, params)])

    def on_progress(self, event: str, payload: dict[str, Any]) -> None:
        """Record a pipeline event in ``processing_jobs``.

        ``llm:token`` events are reported under the ``pipeline:page_index``
        step with the last 200 characters of the token delta as a preview;
        events with an empty delta are ignored. Any other event is mapped
        through ``progress_tuple`` and annotated with the count-like payload
        keys listed in ``_DETAIL_KEYS``.
        """
        if event == "llm:token":
            delta = str(payload.get("delta") or "")
            if not delta:
                return
            step, pct, label = progress_tuple("pipeline:page_index")
            preview = delta[-200:]
            message = f"{label} (streaming): …{preview}"
        else:
            step, pct, label = progress_tuple(event)
            detail = ", ".join(f"{k}={v}" for k, v in payload.items() if k in self._DETAIL_KEYS)
            message = f"{label}: {detail}" if detail else label
        self._upsert_job(step=step, pct=pct, message=message)

    def save_elements(self, elements: list[dict[str, Any]]) -> None:
        """No-op: element blobs are not stored in PostgreSQL."""
        return

    def save_coded_formate(self, text: str) -> None:
        """No-op: coded-format text is not stored in PostgreSQL."""
        return

    def save_outline(self, outline: dict[str, Any]) -> None:
        """No-op: outlines are not stored in PostgreSQL."""
        return

    def save_mapper(self, mapper: dict[str, Any]) -> None:
        """No-op: mapper output is not stored in PostgreSQL."""
        return

    def finalize(self) -> None:
        """Mark the job 'completed' and the document 'indexed' in one transaction."""
        now = datetime.now(timezone.utc)
        complete_job_sql = """
            UPDATE processing_jobs
            SET status = 'completed', progress = 100, message = 'Indexing completed', updated_at = %s
            WHERE document_id = %s
        """
        mark_indexed_sql = """
            UPDATE documents
            SET status = 'indexed', indexing_mode = %s
            WHERE id = %s
        """
        self._execute(
            [
                (complete_job_sql, (now, self.document_id)),
                (mark_indexed_sql, (self.indexing_mode, self.document_id)),
            ]
        )
        logger.info("PostgreSQL: document_id=%s marked indexed", self.document_id)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Pipeline event → progress step mapping (MVL Backend parity)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Maps a pipeline event name to a 3-tuple. Callers unpack it as
# (step, percent, label); the label is used as the human-readable message.
PIPELINE_EVENT_PROGRESS: dict[str, tuple[int, int, str]] = {
    # Step 1: start-up and partitioning.
    "pipeline:start": (1, 12, "Pipeline started"),
    "partition:split": (1, 18, "Partitioning PDF pages"),
    "partition:chunk_done": (1, 24, "Partition chunk completed"),
    "partition:done": (1, 30, "Partitioning completed"),
    # Steps 2 and 3: element loading and title grouping.
    "pipeline:elements": (2, 40, "Elements loaded"),
    "pipeline:group_by_title": (3, 50, "Grouping by title"),
    # Step 4: coded format, page indexing and mapper output.
    "pipeline:coded_formate": (4, 60, "Generating coded format"),
    "pipeline:page_index": (4, 72, "Running page indexing"),
    "pipeline:mapper": (4, 80, "Building mapper output"),
    "pipeline:mapper_saved": (4, 82, "Mapper saved"),
    # Step 6: finished.
    "pipeline:done": (6, 100, "Indexing completed"),
}


def progress_tuple(event: str) -> tuple[int, int, str]:
    """Look up the progress entry for *event*.

    Events missing from ``PIPELINE_EVENT_PROGRESS`` fall back to
    ``(1, 10, event)``, i.e. the event name itself serves as the label.
    """
    try:
        return PIPELINE_EVENT_PROGRESS[event]
    except KeyError:
        return (1, 10, event)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""MVL Backend-style S3 storage (scoped keys + boto3 client from config class)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
from chunksmith_adapters.base import ProgressRecord
|
|
11
|
+
from chunksmith_adapters.mvl_paths import (
|
|
12
|
+
elements_json_object_key,
|
|
13
|
+
json_index_object_key,
|
|
14
|
+
mapper_json_object_key,
|
|
15
|
+
progress_json_object_key,
|
|
16
|
+
text_coded_formate_object_key,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Type of the ``context_format`` setting: either "json" or "toon".
ContextFormat = Literal["json", "toon"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class MvlS3StorageConfig:
    """
    Settings for an S3 store laid out like MVL Backend ``StorageManager`` + ``storage_paths``.

    Object keys live in a single bucket under ``{user_id}/{project_id}/json/...``
    and ``{user_id}/{project_id}/text/...``.
    """

    # Identity of the document and where it is stored.
    user_id: str
    project_id: str
    document_id: str
    bucket: str
    mapping_method: str = "page_indexing"
    context_format: ContextFormat = "json"

    # Client settings; empty / None values are simply forwarded to the storage.
    access_key_id: str = ""
    secret_access_key: str = ""
    endpoint_url: str | None = None
    region: str | None = None

    def build(self) -> "MvlS3Storage":
        """Create an ``MvlS3Storage`` from the values held by this config."""
        settings = {
            "user_id": self.user_id,
            "project_id": self.project_id,
            "document_id": self.document_id,
            "bucket": self.bucket,
            "mapping_method": self.mapping_method,
            "context_format": self.context_format,
            "access_key_id": self.access_key_id,
            "secret_access_key": self.secret_access_key,
            "endpoint_url": self.endpoint_url,
            "region": self.region,
        }
        return MvlS3Storage(**settings)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class MvlS3Storage:
    """Progressive S3 uploads using MVL object key layout.

    Each ``save_*`` hook uploads one artifact as soon as it is produced,
    ``on_progress`` re-uploads the run's full progress log after every
    event, and ``finalize`` writes a per-run ``_meta.json`` summary.
    """

    def __init__(
        self,
        *,
        user_id: str,
        project_id: str,
        document_id: str,
        bucket: str,
        mapping_method: str = "page_indexing",
        context_format: ContextFormat = "json",
        access_key_id: str = "",
        secret_access_key: str = "",
        endpoint_url: str | None = None,
        region: str | None = None,
        client: Any | None = None,
    ) -> None:
        """Store identifiers and obtain the S3 client.

        Args:
            user_id, project_id, document_id: components of every object key.
            bucket: target bucket; must be non-empty.
            mapping_method, context_format: forwarded to the mapper key helper.
            access_key_id, secret_access_key, endpoint_url, region: used to
                build a boto3 client when ``client`` is not supplied.
            client: optional ready-made client exposing ``put_object``.

        Raises:
            ValueError: if ``bucket`` is empty.
            ImportError: if no ``client`` is given and boto3 is not installed.
        """
        if not bucket:
            raise ValueError("MvlS3Storage requires bucket=")
        self.user_id = user_id
        self.project_id = project_id
        self.document_id = document_id
        self.bucket = bucket
        self.mapping_method = mapping_method
        self.context_format = context_format
        # UTC timestamp with one-second resolution; names this run's progress and meta objects.
        self.run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        self.progress_log: list[ProgressRecord] = []
        self._meta: dict[str, Any] = {
            "user_id": user_id,
            "project_id": project_id,
            "document_id": document_id,
            "run_id": self.run_id,
            "status": "running",
        }
        # Explicit None check: an injected client that happens to be falsy
        # (e.g. a container-like test double) must still be used as given.
        if client is not None:
            self._client = client
        else:
            self._client = self._make_client(
                access_key_id=access_key_id,
                secret_access_key=secret_access_key,
                endpoint_url=endpoint_url,
                region=region,
            )

    @staticmethod
    def _make_client(
        *,
        access_key_id: str,
        secret_access_key: str,
        endpoint_url: str | None,
        region: str | None,
    ):
        """Build a boto3 S3 client, passing only the settings that were provided.

        Credentials are forwarded only when both halves are non-empty;
        otherwise boto3 is called without explicit credentials.

        Raises:
            ImportError: if boto3 is not installed.
        """
        try:
            import boto3
        except ImportError as exc:
            raise ImportError(
                "MvlS3Storage requires boto3. Install with: pip install 'chunksmith-adapters[s3]'"
            ) from exc
        kwargs: dict[str, Any] = {}
        if access_key_id and secret_access_key:
            kwargs["aws_access_key_id"] = access_key_id
            kwargs["aws_secret_access_key"] = secret_access_key
        if endpoint_url:
            kwargs["endpoint_url"] = endpoint_url
        if region:
            kwargs["region_name"] = region
        return boto3.client("s3", **kwargs)

    def on_progress(self, event: str, payload: dict[str, Any]) -> None:
        """Append the event to the in-memory log and upload the whole log again."""
        # Copy the payload so later mutation by the caller cannot alter the log.
        self.progress_log.append(ProgressRecord(event=event, payload=dict(payload)))
        self._meta["last_event"] = event
        self._meta["updated_at"] = datetime.now(timezone.utc).isoformat()
        key = progress_json_object_key(self.user_id, self.project_id, self.document_id, self.run_id)
        self._put_json(key, [r.__dict__ for r in self.progress_log])

    def _put(self, key: str, body: str | bytes, content_type: str) -> None:
        """Upload ``body`` to ``key`` in the bucket; ``str`` bodies are UTF-8 encoded."""
        data = body.encode("utf-8") if isinstance(body, str) else body
        self._client.put_object(Bucket=self.bucket, Key=key, Body=data, ContentType=content_type)

    def _put_json(self, key: str, obj: Any) -> None:
        """Serialize ``obj`` as indented, non-ASCII-preserving JSON and upload it."""
        self._put(key, json.dumps(obj, ensure_ascii=False, indent=2), "application/json")

    def save_elements(self, elements: list[dict[str, Any]]) -> None:
        """Upload the elements list and remember its length in the run metadata."""
        key = elements_json_object_key(self.user_id, self.project_id, self.document_id)
        self._put_json(key, elements)
        self._meta["element_count"] = len(elements)

    def save_coded_formate(self, text: str) -> None:
        """Upload the coded-format text as ``text/plain``."""
        key = text_coded_formate_object_key(self.user_id, self.project_id, self.document_id)
        self._put(key, text, "text/plain")

    def save_outline(self, outline: dict[str, Any]) -> None:
        """Upload the outline under the JSON index key."""
        key = json_index_object_key(self.user_id, self.project_id, self.document_id)
        self._put_json(key, outline)

    def save_mapper(self, mapper: dict[str, Any]) -> None:
        """Upload the mapper output under the key for this mapping method and format."""
        key = mapper_json_object_key(
            self.document_id,
            self.mapping_method,
            user_id=self.user_id,
            project_id=self.project_id,
            context_format=self.context_format,
        )
        self._put_json(key, mapper)

    def finalize(self) -> None:
        """Mark the run completed and upload its ``_meta.json`` summary."""
        self._meta["status"] = "completed"
        self._meta["updated_at"] = datetime.now(timezone.utc).isoformat()
        self._meta["bucket"] = self.bucket
        meta_key = f"{self.user_id}/{self.project_id}/json/{self.document_id}_{self.run_id}_meta.json"
        self._put_json(meta_key, self._meta)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""AWS S3 progressive storage (optional boto3)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from chunksmith_adapters.base import BasePipelineStorage
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class S3Storage(BasePipelineStorage):
    """
    Upload artifacts to S3 as each pipeline stage completes.

    Requires: ``pip install 'chunksmith-adapters[s3]'`` or ``pip install boto3``
    """

    def __init__(
        self,
        *,
        bucket: str,
        prefix: str = "chunksmith/",
        document_id: str,
        region: str | None = None,
        client: Any | None = None,
    ) -> None:
        """Store the target location and obtain the S3 client.

        Args:
            bucket: target bucket; must be non-empty.
            prefix: key prefix; normalized to end with exactly one ``/``,
                or to ``""`` when empty.
            document_id: forwarded to ``BasePipelineStorage``.
            region: region for a newly created boto3 client.
            client: optional ready-made client exposing ``put_object``.

        Raises:
            ValueError: if ``bucket`` is empty.
            ImportError: if no ``client`` is given and boto3 is not installed.
        """
        if not bucket:
            raise ValueError("S3Storage requires bucket=")
        super().__init__(document_id=document_id)
        self.bucket = bucket
        self.prefix = prefix.rstrip("/") + "/" if prefix else ""
        # Explicit None check: an injected client that happens to be falsy
        # must still be used rather than replaced by a boto3 client.
        self._client = client if client is not None else self._make_client(region)

    @staticmethod
    def _make_client(region: str | None):
        """Build a boto3 S3 client, passing ``region_name`` only when a region is set.

        Raises:
            ImportError: if boto3 is not installed.
        """
        try:
            import boto3
        except ImportError as exc:
            raise ImportError("S3Storage requires boto3. Install with: pip install 'chunksmith-adapters[s3]'") from exc
        return boto3.client("s3", region_name=region) if region else boto3.client("s3")

    def _object_key(self, name: str) -> str:
        """Return the full object key: the prefix followed by the base class's key for ``name``."""
        return f"{self.prefix}{self._key(name)}"

    def _write_blob(self, name: str, data: bytes, content_type: str) -> None:
        """Upload ``data`` to the bucket under the key derived from ``name``."""
        self._client.put_object(
            Bucket=self.bucket,
            Key=self._object_key(name),
            Body=data,
            ContentType=content_type,
        )
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chunksmith-adapters
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Optional ChunkSmith storage adapters (S3, MongoDB, PostgreSQL, MVL composite).
|
|
5
|
+
Author-email: AnshulParate2004 <anshulnparate@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/AnshulParate2004/ChunkSmith
|
|
8
|
+
Project-URL: Repository, https://github.com/AnshulParate2004/ChunkSmith
|
|
9
|
+
Project-URL: Changelog, https://github.com/AnshulParate2004/ChunkSmith/blob/main/CHANGELOG.md
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: chunksmith-core>=0.3.0
|
|
12
|
+
Provides-Extra: s3
|
|
13
|
+
Requires-Dist: boto3>=1.35.0; extra == "s3"
|
|
14
|
+
Provides-Extra: mongo
|
|
15
|
+
Requires-Dist: pymongo>=4.10.0; extra == "mongo"
|
|
16
|
+
Provides-Extra: postgres
|
|
17
|
+
Requires-Dist: psycopg[binary]>=3.2.0; extra == "postgres"
|
|
18
|
+
Provides-Extra: mvl
|
|
19
|
+
Requires-Dist: boto3>=1.35.0; extra == "mvl"
|
|
20
|
+
Requires-Dist: pymongo>=4.10.0; extra == "mvl"
|
|
21
|
+
Requires-Dist: psycopg[binary]>=3.2.0; extra == "mvl"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
src/chunksmith_adapters/__init__.py
|
|
3
|
+
src/chunksmith_adapters/base.py
|
|
4
|
+
src/chunksmith_adapters/config.py
|
|
5
|
+
src/chunksmith_adapters/filesystem.py
|
|
6
|
+
src/chunksmith_adapters/mongodb.py
|
|
7
|
+
src/chunksmith_adapters/mvl_composite.py
|
|
8
|
+
src/chunksmith_adapters/mvl_mongo.py
|
|
9
|
+
src/chunksmith_adapters/mvl_paths.py
|
|
10
|
+
src/chunksmith_adapters/mvl_postgres.py
|
|
11
|
+
src/chunksmith_adapters/mvl_progress.py
|
|
12
|
+
src/chunksmith_adapters/mvl_s3.py
|
|
13
|
+
src/chunksmith_adapters/s3.py
|
|
14
|
+
src/chunksmith_adapters.egg-info/PKG-INFO
|
|
15
|
+
src/chunksmith_adapters.egg-info/SOURCES.txt
|
|
16
|
+
src/chunksmith_adapters.egg-info/dependency_links.txt
|
|
17
|
+
src/chunksmith_adapters.egg-info/requires.txt
|
|
18
|
+
src/chunksmith_adapters.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chunksmith_adapters
|