morphik 1.2.2__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {morphik-1.2.2 → morphik-1.2.4}/.gitignore +0 -3
- {morphik-1.2.2 → morphik-1.2.4}/PKG-INFO +7 -1
- {morphik-1.2.2 → morphik-1.2.4}/README.md +6 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/__init__.py +4 -2
- {morphik-1.2.2 → morphik-1.2.4}/morphik/_shared.py +54 -1
- {morphik-1.2.2 → morphik-1.2.4}/morphik/async_.py +234 -2
- {morphik-1.2.2 → morphik-1.2.4}/morphik/models.py +21 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/sync.py +221 -2
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_scoped_ops_unit.py +55 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_shared_helpers.py +36 -1
- {morphik-1.2.2 → morphik-1.2.4}/pyproject.toml +1 -1
- {morphik-1.2.2 → morphik-1.2.4}/morphik/_internal.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/_scoped_ops.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/exceptions.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/README.md +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/__init__.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/example_usage.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_app_ops.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_async.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_docs/sample1.txt +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_docs/sample2.txt +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_docs/sample3.txt +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_sync.py +0 -0
- {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_update_document_metadata_rename.py +0 -0
|
--- morphik-1.2.2/PKG-INFO
+++ morphik-1.2.4/PKG-INFO
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: morphik
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.4
|
|
4
4
|
Summary: Morphik Python Client
|
|
5
5
|
Author-email: Morphik <founders@morphik.ai>
|
|
6
6
|
Requires-Python: >=3.8
|
|
@@ -91,6 +91,12 @@ response = db.query(
|
|
|
91
91
|
)
|
|
92
92
|
|
|
93
93
|
print(response.completion)
|
|
94
|
+
|
|
95
|
+
# Migrate this app's documents into another Morphik deployment.
|
|
96
|
+
# Run this from a machine that can reach both source and target, such as
|
|
97
|
+
# inside a customer's VPN for on-prem targets.
|
|
98
|
+
result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
|
|
99
|
+
print(result.created_count, result.skipped_count, result.failed_count)
|
|
94
100
|
```
|
|
95
101
|
|
|
96
102
|
### Nested Folders & Folder Depth
|
|
--- morphik-1.2.2/README.md
+++ morphik-1.2.4/README.md
@@ -78,6 +78,12 @@ response = db.query(
|
|
|
78
78
|
)
|
|
79
79
|
|
|
80
80
|
print(response.completion)
|
|
81
|
+
|
|
82
|
+
# Migrate this app's documents into another Morphik deployment.
|
|
83
|
+
# Run this from a machine that can reach both source and target, such as
|
|
84
|
+
# inside a customer's VPN for on-prem targets.
|
|
85
|
+
result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
|
|
86
|
+
print(result.created_count, result.skipped_count, result.failed_count)
|
|
81
87
|
```
|
|
82
88
|
|
|
83
89
|
### Nested Folders & Folder Depth
|
|
--- morphik-1.2.2/morphik/__init__.py
+++ morphik-1.2.4/morphik/__init__.py
@@ -3,7 +3,7 @@ Morphik Python SDK for document ingestion and querying.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from .async_ import AsyncMorphik
|
|
6
|
-
from .models import Document, DocumentQueryResponse, Summary
|
|
6
|
+
from .models import Document, DocumentQueryResponse, MigrationDocumentResult, MigrationResult, Summary
|
|
7
7
|
from .sync import Morphik
|
|
8
8
|
|
|
9
9
|
__all__ = [
|
|
@@ -12,6 +12,8 @@ __all__ = [
|
|
|
12
12
|
"Document",
|
|
13
13
|
"Summary",
|
|
14
14
|
"DocumentQueryResponse",
|
|
15
|
+
"MigrationDocumentResult",
|
|
16
|
+
"MigrationResult",
|
|
15
17
|
]
|
|
16
18
|
|
|
17
|
-
__version__ = "1.2.2"
|
|
19
|
+
__version__ = "1.2.4"
|
|
--- morphik-1.2.2/morphik/_shared.py
+++ morphik-1.2.4/morphik/_shared.py
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Dict, Iterable, List, Optional, Union
|
|
5
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
6
6
|
from urllib.parse import quote
|
|
7
7
|
|
|
8
8
|
from pydantic import BaseModel
|
|
@@ -10,6 +10,17 @@ from pydantic import BaseModel
|
|
|
10
10
|
MAX_LIMIT = 500
|
|
11
11
|
MIN_LOG_HOURS = 0.1
|
|
12
12
|
MAX_LOG_HOURS = 168.0
|
|
13
|
+
MIGRATION_SOURCE_METADATA_KEY = "_morphik_migration"
|
|
14
|
+
MIGRATION_RESERVED_METADATA_FIELDS = {
|
|
15
|
+
"app_id",
|
|
16
|
+
"end_user_id",
|
|
17
|
+
"external_id",
|
|
18
|
+
"filename",
|
|
19
|
+
"folder_id",
|
|
20
|
+
"folder_name",
|
|
21
|
+
"folder_path",
|
|
22
|
+
"owner_id",
|
|
23
|
+
}
|
|
13
24
|
|
|
14
25
|
|
|
15
26
|
def merge_folders(
|
|
@@ -197,3 +208,45 @@ def normalize_additional_folders(
|
|
|
197
208
|
if additional_folders:
|
|
198
209
|
return list(additional_folders) + folder_list
|
|
199
210
|
return folder_list
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def build_migration_metadata(
|
|
214
|
+
document: Any,
|
|
215
|
+
*,
|
|
216
|
+
include_source_metadata: bool = True,
|
|
217
|
+
) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
|
218
|
+
"""Prepare document metadata for migration ingestion.
|
|
219
|
+
|
|
220
|
+
The target API owns fields such as external_id, folder_name, and app_id, so
|
|
221
|
+
those values must travel through dedicated migration parameters instead of
|
|
222
|
+
user metadata.
|
|
223
|
+
"""
|
|
224
|
+
metadata = dict(getattr(document, "metadata", None) or {})
|
|
225
|
+
metadata_types = dict(getattr(document, "metadata_types", None) or {})
|
|
226
|
+
|
|
227
|
+
for field in MIGRATION_RESERVED_METADATA_FIELDS:
|
|
228
|
+
metadata.pop(field, None)
|
|
229
|
+
metadata_types.pop(field, None)
|
|
230
|
+
|
|
231
|
+
if include_source_metadata:
|
|
232
|
+
system_metadata = getattr(document, "system_metadata", None) or {}
|
|
233
|
+
source_info = {
|
|
234
|
+
"source_document_id": getattr(document, "external_id", None),
|
|
235
|
+
"source_app_id": getattr(document, "app_id", None),
|
|
236
|
+
"source_filename": getattr(document, "filename", None),
|
|
237
|
+
"source_created_at": system_metadata.get("created_at") if isinstance(system_metadata, dict) else None,
|
|
238
|
+
"source_updated_at": system_metadata.get("updated_at") if isinstance(system_metadata, dict) else None,
|
|
239
|
+
}
|
|
240
|
+
source_info = {key: value for key, value in source_info.items() if value is not None}
|
|
241
|
+
|
|
242
|
+
existing_source_info = metadata.get(MIGRATION_SOURCE_METADATA_KEY)
|
|
243
|
+
if isinstance(existing_source_info, dict):
|
|
244
|
+
existing_source_info = dict(existing_source_info)
|
|
245
|
+
for key, value in source_info.items():
|
|
246
|
+
existing_source_info.setdefault(key, value)
|
|
247
|
+
metadata[MIGRATION_SOURCE_METADATA_KEY] = existing_source_info
|
|
248
|
+
else:
|
|
249
|
+
metadata[MIGRATION_SOURCE_METADATA_KEY] = source_info
|
|
250
|
+
metadata_types.setdefault(MIGRATION_SOURCE_METADATA_KEY, "object")
|
|
251
|
+
|
|
252
|
+
return metadata, metadata_types
|
|
--- morphik-1.2.2/morphik/async_.py
+++ morphik-1.2.4/morphik/async_.py
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import json
|
|
2
3
|
import logging
|
|
3
4
|
import warnings
|
|
@@ -12,13 +13,14 @@ from pydantic import BaseModel
|
|
|
12
13
|
from ._internal import FinalChunkResult, _MorphikClientLogic
|
|
13
14
|
from ._scoped_ops import _ScopedOperationsMixin
|
|
14
15
|
from ._shared import (
|
|
16
|
+
build_create_app_payload,
|
|
17
|
+
build_document_by_filename_params,
|
|
15
18
|
build_folder_endpoint_identifier,
|
|
16
19
|
build_folder_move_payload,
|
|
17
20
|
build_folder_rename_path,
|
|
18
|
-
build_create_app_payload,
|
|
19
|
-
build_document_by_filename_params,
|
|
20
21
|
build_list_apps_params,
|
|
21
22
|
build_logs_params,
|
|
23
|
+
build_migration_metadata,
|
|
22
24
|
build_rename_app_params,
|
|
23
25
|
build_requeue_payload,
|
|
24
26
|
build_rotate_app_params,
|
|
@@ -42,6 +44,8 @@ from .models import (
|
|
|
42
44
|
IngestTextRequest,
|
|
43
45
|
ListDocsResponse,
|
|
44
46
|
LogResponse,
|
|
47
|
+
MigrationDocumentResult,
|
|
48
|
+
MigrationResult,
|
|
45
49
|
QueryPromptOverrides,
|
|
46
50
|
RequeueIngestionJob,
|
|
47
51
|
RequeueIngestionResponse,
|
|
@@ -1266,6 +1270,234 @@ class AsyncMorphik(_ScopedOperationsMixin):
|
|
|
1266
1270
|
sort_direction=sort_direction,
|
|
1267
1271
|
)
|
|
1268
1272
|
|
|
1273
|
+
async def migrate(
|
|
1274
|
+
self,
|
|
1275
|
+
target_uri: str,
|
|
1276
|
+
*,
|
|
1277
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
1278
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
|
1279
|
+
folder_depth: Optional[int] = None,
|
|
1280
|
+
end_user_id: Optional[str] = None,
|
|
1281
|
+
skip: int = 0,
|
|
1282
|
+
limit: Optional[int] = None,
|
|
1283
|
+
batch_size: int = 100,
|
|
1284
|
+
completed_only: bool = True,
|
|
1285
|
+
use_colpali: bool = True,
|
|
1286
|
+
preserve_folders: bool = True,
|
|
1287
|
+
preserve_end_user_id: bool = True,
|
|
1288
|
+
preserve_summaries: bool = True,
|
|
1289
|
+
include_source_metadata: bool = True,
|
|
1290
|
+
on_conflict: Literal["skip", "fail"] = "skip",
|
|
1291
|
+
continue_on_error: bool = True,
|
|
1292
|
+
target_timeout: Optional[int] = None,
|
|
1293
|
+
target_is_local: bool = False,
|
|
1294
|
+
progress_callback: Optional[Callable[[MigrationDocumentResult], Any]] = None,
|
|
1295
|
+
) -> MigrationResult:
|
|
1296
|
+
"""Migrate documents from this client into another Morphik URI.
|
|
1297
|
+
|
|
1298
|
+
Set target_is_local=True for local/on-prem targets that should use HTTP
|
|
1299
|
+
or skip TLS verification.
|
|
1300
|
+
"""
|
|
1301
|
+
if batch_size <= 0:
|
|
1302
|
+
raise ValueError("batch_size must be greater than 0")
|
|
1303
|
+
if limit is not None and limit < 0:
|
|
1304
|
+
raise ValueError("limit must be greater than or equal to 0")
|
|
1305
|
+
|
|
1306
|
+
target = AsyncMorphik(target_uri, timeout=target_timeout or self._logic._timeout, is_local=target_is_local)
|
|
1307
|
+
results: List[MigrationDocumentResult] = []
|
|
1308
|
+
total_source_count: Optional[int] = None
|
|
1309
|
+
current_skip = max(skip, 0)
|
|
1310
|
+
remaining = limit
|
|
1311
|
+
page_size = min(batch_size, 500)
|
|
1312
|
+
|
|
1313
|
+
try:
|
|
1314
|
+
while remaining is None or remaining > 0:
|
|
1315
|
+
current_limit = page_size if remaining is None else min(page_size, remaining)
|
|
1316
|
+
page = await self._scoped_list_documents(
|
|
1317
|
+
skip=current_skip,
|
|
1318
|
+
limit=current_limit,
|
|
1319
|
+
filters=filters,
|
|
1320
|
+
folder_name=folder_name,
|
|
1321
|
+
folder_depth=folder_depth,
|
|
1322
|
+
end_user_id=end_user_id,
|
|
1323
|
+
include_total_count=total_source_count is None,
|
|
1324
|
+
include_status_counts=False,
|
|
1325
|
+
include_folder_counts=False,
|
|
1326
|
+
completed_only=completed_only,
|
|
1327
|
+
sort_by="updated_at",
|
|
1328
|
+
sort_direction="desc",
|
|
1329
|
+
)
|
|
1330
|
+
if total_source_count is None:
|
|
1331
|
+
total_source_count = page.total_count
|
|
1332
|
+
if not page.documents:
|
|
1333
|
+
break
|
|
1334
|
+
|
|
1335
|
+
for source_document in page.documents:
|
|
1336
|
+
try:
|
|
1337
|
+
result = await self._migrate_single_document(
|
|
1338
|
+
target=target,
|
|
1339
|
+
source_document=source_document,
|
|
1340
|
+
use_colpali=use_colpali,
|
|
1341
|
+
preserve_folders=preserve_folders,
|
|
1342
|
+
preserve_end_user_id=preserve_end_user_id,
|
|
1343
|
+
preserve_summaries=preserve_summaries,
|
|
1344
|
+
include_source_metadata=include_source_metadata,
|
|
1345
|
+
on_conflict=on_conflict,
|
|
1346
|
+
)
|
|
1347
|
+
except Exception as exc: # noqa: BLE001
|
|
1348
|
+
result = MigrationDocumentResult(
|
|
1349
|
+
source_document_id=source_document.external_id,
|
|
1350
|
+
filename=source_document.filename,
|
|
1351
|
+
status="failed",
|
|
1352
|
+
error=str(exc),
|
|
1353
|
+
)
|
|
1354
|
+
if not continue_on_error:
|
|
1355
|
+
results.append(result)
|
|
1356
|
+
await self._emit_migration_progress(progress_callback, result)
|
|
1357
|
+
raise
|
|
1358
|
+
|
|
1359
|
+
results.append(result)
|
|
1360
|
+
await self._emit_migration_progress(progress_callback, result)
|
|
1361
|
+
|
|
1362
|
+
if remaining is not None:
|
|
1363
|
+
remaining -= len(page.documents)
|
|
1364
|
+
if not page.has_more:
|
|
1365
|
+
break
|
|
1366
|
+
next_skip = page.next_skip if page.next_skip is not None else current_skip + page.returned_count
|
|
1367
|
+
if next_skip <= current_skip:
|
|
1368
|
+
break
|
|
1369
|
+
current_skip = next_skip
|
|
1370
|
+
finally:
|
|
1371
|
+
await target.close()
|
|
1372
|
+
|
|
1373
|
+
return self._build_migration_result(results, total_source_count)
|
|
1374
|
+
|
|
1375
|
+
async def _migrate_single_document(
|
|
1376
|
+
self,
|
|
1377
|
+
*,
|
|
1378
|
+
target: "AsyncMorphik",
|
|
1379
|
+
source_document: Document,
|
|
1380
|
+
use_colpali: bool,
|
|
1381
|
+
preserve_folders: bool,
|
|
1382
|
+
preserve_end_user_id: bool,
|
|
1383
|
+
preserve_summaries: bool,
|
|
1384
|
+
include_source_metadata: bool,
|
|
1385
|
+
on_conflict: Literal["skip", "fail"],
|
|
1386
|
+
) -> MigrationDocumentResult:
|
|
1387
|
+
metadata, metadata_types = build_migration_metadata(
|
|
1388
|
+
source_document,
|
|
1389
|
+
include_source_metadata=include_source_metadata,
|
|
1390
|
+
)
|
|
1391
|
+
file_bytes = await self.get_document_file(source_document.external_id)
|
|
1392
|
+
folder = (source_document.folder_path or source_document.folder_name) if preserve_folders else None
|
|
1393
|
+
end_user = source_document.end_user_id if preserve_end_user_id else None
|
|
1394
|
+
|
|
1395
|
+
status, target_document = await target._ingest_migrated_document(
|
|
1396
|
+
source_document_id=source_document.external_id,
|
|
1397
|
+
file_content=file_bytes,
|
|
1398
|
+
filename=source_document.filename or source_document.external_id,
|
|
1399
|
+
content_type=source_document.content_type,
|
|
1400
|
+
metadata=metadata,
|
|
1401
|
+
metadata_types=metadata_types,
|
|
1402
|
+
folder_name=folder,
|
|
1403
|
+
end_user_id=end_user,
|
|
1404
|
+
use_colpali=use_colpali,
|
|
1405
|
+
on_conflict=on_conflict,
|
|
1406
|
+
)
|
|
1407
|
+
|
|
1408
|
+
if preserve_summaries and status == "created":
|
|
1409
|
+
await self._copy_document_summary(source_document.external_id, target, target_document.external_id)
|
|
1410
|
+
|
|
1411
|
+
return MigrationDocumentResult(
|
|
1412
|
+
source_document_id=source_document.external_id,
|
|
1413
|
+
target_document_id=target_document.external_id,
|
|
1414
|
+
filename=source_document.filename,
|
|
1415
|
+
status=status,
|
|
1416
|
+
)
|
|
1417
|
+
|
|
1418
|
+
async def _ingest_migrated_document(
|
|
1419
|
+
self,
|
|
1420
|
+
*,
|
|
1421
|
+
source_document_id: str,
|
|
1422
|
+
file_content: bytes,
|
|
1423
|
+
filename: str,
|
|
1424
|
+
content_type: Optional[str],
|
|
1425
|
+
metadata: Dict[str, Any],
|
|
1426
|
+
metadata_types: Optional[Dict[str, str]],
|
|
1427
|
+
folder_name: Optional[str],
|
|
1428
|
+
end_user_id: Optional[str],
|
|
1429
|
+
use_colpali: bool,
|
|
1430
|
+
on_conflict: Literal["skip", "fail"],
|
|
1431
|
+
) -> tuple[str, Document]:
|
|
1432
|
+
serialized_metadata, inferred_types = self._logic._serialize_metadata_map(metadata)
|
|
1433
|
+
metadata_type_payload = {**inferred_types, **(metadata_types or {})}
|
|
1434
|
+
metadata_type_payload = {
|
|
1435
|
+
key: value for key, value in metadata_type_payload.items() if key in serialized_metadata
|
|
1436
|
+
}
|
|
1437
|
+
|
|
1438
|
+
form_data: Dict[str, Any] = {
|
|
1439
|
+
"source_document_id": source_document_id,
|
|
1440
|
+
"metadata": json.dumps(serialized_metadata),
|
|
1441
|
+
"metadata_types": json.dumps(metadata_type_payload),
|
|
1442
|
+
"use_colpali": str(use_colpali).lower(),
|
|
1443
|
+
"on_conflict": on_conflict,
|
|
1444
|
+
}
|
|
1445
|
+
if folder_name:
|
|
1446
|
+
form_data["folder_name"] = folder_name
|
|
1447
|
+
if end_user_id:
|
|
1448
|
+
form_data["end_user_id"] = end_user_id
|
|
1449
|
+
|
|
1450
|
+
file_obj = BytesIO(file_content)
|
|
1451
|
+
files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
|
|
1452
|
+
response = await self._request("POST", "migrate/document", data=form_data, files=files)
|
|
1453
|
+
document = self._logic._parse_document_response(response["document"])
|
|
1454
|
+
document._client = self
|
|
1455
|
+
return response.get("status", "created"), document
|
|
1456
|
+
|
|
1457
|
+
async def _copy_document_summary(
|
|
1458
|
+
self,
|
|
1459
|
+
source_document_id: str,
|
|
1460
|
+
target: "AsyncMorphik",
|
|
1461
|
+
target_document_id: str,
|
|
1462
|
+
) -> None:
|
|
1463
|
+
try:
|
|
1464
|
+
summary = await self.get_document_summary(source_document_id)
|
|
1465
|
+
except httpx.HTTPStatusError as exc:
|
|
1466
|
+
if exc.response.status_code == 404:
|
|
1467
|
+
return
|
|
1468
|
+
raise
|
|
1469
|
+
await target.upsert_document_summary(
|
|
1470
|
+
document_id=target_document_id,
|
|
1471
|
+
content=summary.content,
|
|
1472
|
+
versioning=False,
|
|
1473
|
+
overwrite_latest=True,
|
|
1474
|
+
)
|
|
1475
|
+
|
|
1476
|
+
@staticmethod
|
|
1477
|
+
async def _emit_migration_progress(
|
|
1478
|
+
progress_callback: Optional[Callable[[MigrationDocumentResult], Any]],
|
|
1479
|
+
result: MigrationDocumentResult,
|
|
1480
|
+
) -> None:
|
|
1481
|
+
if progress_callback is None:
|
|
1482
|
+
return
|
|
1483
|
+
callback_result = progress_callback(result)
|
|
1484
|
+
if inspect.isawaitable(callback_result):
|
|
1485
|
+
await callback_result
|
|
1486
|
+
|
|
1487
|
+
@staticmethod
|
|
1488
|
+
def _build_migration_result(
|
|
1489
|
+
results: List[MigrationDocumentResult],
|
|
1490
|
+
total_source_count: Optional[int],
|
|
1491
|
+
) -> MigrationResult:
|
|
1492
|
+
return MigrationResult(
|
|
1493
|
+
documents=results,
|
|
1494
|
+
total_source_count=total_source_count,
|
|
1495
|
+
attempted_count=len(results),
|
|
1496
|
+
created_count=sum(1 for item in results if item.status == "created"),
|
|
1497
|
+
skipped_count=sum(1 for item in results if item.status == "skipped"),
|
|
1498
|
+
failed_count=sum(1 for item in results if item.status == "failed"),
|
|
1499
|
+
)
|
|
1500
|
+
|
|
1269
1501
|
async def get_document(self, document_id: str) -> Document:
|
|
1270
1502
|
"""
|
|
1271
1503
|
Get document metadata by ID.
|
|
--- morphik-1.2.2/morphik/models.py
+++ morphik-1.2.4/morphik/models.py
@@ -415,6 +415,27 @@ class ListDocsResponse(BaseModel):
|
|
|
415
415
|
folder_counts: Optional[List[FolderCount]] = Field(None, description="Document counts by folder")
|
|
416
416
|
|
|
417
417
|
|
|
418
|
+
class MigrationDocumentResult(BaseModel):
|
|
419
|
+
"""Per-document result from a migration run."""
|
|
420
|
+
|
|
421
|
+
source_document_id: str = Field(..., description="Document ID in the source Morphik app")
|
|
422
|
+
target_document_id: Optional[str] = Field(None, description="Document ID in the target Morphik app")
|
|
423
|
+
filename: Optional[str] = Field(None, description="Migrated filename")
|
|
424
|
+
status: Literal["created", "skipped", "failed"] = Field(..., description="Migration outcome")
|
|
425
|
+
error: Optional[str] = Field(None, description="Error message when status is failed")
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
class MigrationResult(BaseModel):
|
|
429
|
+
"""Summary returned by client.migrate."""
|
|
430
|
+
|
|
431
|
+
documents: List[MigrationDocumentResult] = Field(default_factory=list)
|
|
432
|
+
total_source_count: Optional[int] = Field(None, description="Total matching source documents, when available")
|
|
433
|
+
attempted_count: int = Field(..., description="Number of source documents attempted")
|
|
434
|
+
created_count: int = Field(..., description="Number of target documents created")
|
|
435
|
+
skipped_count: int = Field(..., description="Number of target documents skipped because they already existed")
|
|
436
|
+
failed_count: int = Field(..., description="Number of documents that failed migration")
|
|
437
|
+
|
|
438
|
+
|
|
418
439
|
class IngestTextRequest(BaseModel):
|
|
419
440
|
"""Request model for ingesting text content"""
|
|
420
441
|
|
|
--- morphik-1.2.2/morphik/sync.py
+++ morphik-1.2.4/morphik/sync.py
@@ -12,13 +12,14 @@ from pydantic import BaseModel
|
|
|
12
12
|
from ._internal import FinalChunkResult, _MorphikClientLogic
|
|
13
13
|
from ._scoped_ops import _ScopedOperationsMixin
|
|
14
14
|
from ._shared import (
|
|
15
|
+
build_create_app_payload,
|
|
16
|
+
build_document_by_filename_params,
|
|
15
17
|
build_folder_endpoint_identifier,
|
|
16
18
|
build_folder_move_payload,
|
|
17
19
|
build_folder_rename_path,
|
|
18
|
-
build_create_app_payload,
|
|
19
|
-
build_document_by_filename_params,
|
|
20
20
|
build_list_apps_params,
|
|
21
21
|
build_logs_params,
|
|
22
|
+
build_migration_metadata,
|
|
22
23
|
build_rename_app_params,
|
|
23
24
|
build_requeue_payload,
|
|
24
25
|
build_rotate_app_params,
|
|
@@ -42,6 +43,8 @@ from .models import (
|
|
|
42
43
|
IngestTextRequest,
|
|
43
44
|
ListDocsResponse,
|
|
44
45
|
LogResponse,
|
|
46
|
+
MigrationDocumentResult,
|
|
47
|
+
MigrationResult,
|
|
45
48
|
QueryPromptOverrides,
|
|
46
49
|
RequeueIngestionJob,
|
|
47
50
|
RequeueIngestionResponse,
|
|
@@ -1305,6 +1308,222 @@ class Morphik(_ScopedOperationsMixin):
|
|
|
1305
1308
|
sort_direction=sort_direction,
|
|
1306
1309
|
)
|
|
1307
1310
|
|
|
1311
|
+
def migrate(
|
|
1312
|
+
self,
|
|
1313
|
+
target_uri: str,
|
|
1314
|
+
*,
|
|
1315
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
1316
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
|
1317
|
+
folder_depth: Optional[int] = None,
|
|
1318
|
+
end_user_id: Optional[str] = None,
|
|
1319
|
+
skip: int = 0,
|
|
1320
|
+
limit: Optional[int] = None,
|
|
1321
|
+
batch_size: int = 100,
|
|
1322
|
+
completed_only: bool = True,
|
|
1323
|
+
use_colpali: bool = True,
|
|
1324
|
+
preserve_folders: bool = True,
|
|
1325
|
+
preserve_end_user_id: bool = True,
|
|
1326
|
+
preserve_summaries: bool = True,
|
|
1327
|
+
include_source_metadata: bool = True,
|
|
1328
|
+
on_conflict: Literal["skip", "fail"] = "skip",
|
|
1329
|
+
continue_on_error: bool = True,
|
|
1330
|
+
target_timeout: Optional[int] = None,
|
|
1331
|
+
target_is_local: bool = False,
|
|
1332
|
+
progress_callback: Optional[Callable[[MigrationDocumentResult], None]] = None,
|
|
1333
|
+
) -> MigrationResult:
|
|
1334
|
+
"""Migrate documents from this client into another Morphik URI.
|
|
1335
|
+
|
|
1336
|
+
The caller should run this from a network location that can reach both
|
|
1337
|
+
the source URI and the target URI, for example from inside a customer's
|
|
1338
|
+
VPN when the target deployment is on-prem. Set target_is_local=True for
|
|
1339
|
+
local/on-prem targets that should use HTTP or skip TLS verification.
|
|
1340
|
+
"""
|
|
1341
|
+
if batch_size <= 0:
|
|
1342
|
+
raise ValueError("batch_size must be greater than 0")
|
|
1343
|
+
if limit is not None and limit < 0:
|
|
1344
|
+
raise ValueError("limit must be greater than or equal to 0")
|
|
1345
|
+
|
|
1346
|
+
target = Morphik(target_uri, timeout=target_timeout or self._logic._timeout, is_local=target_is_local)
|
|
1347
|
+
results: List[MigrationDocumentResult] = []
|
|
1348
|
+
total_source_count: Optional[int] = None
|
|
1349
|
+
current_skip = max(skip, 0)
|
|
1350
|
+
remaining = limit
|
|
1351
|
+
page_size = min(batch_size, 500)
|
|
1352
|
+
|
|
1353
|
+
try:
|
|
1354
|
+
while remaining is None or remaining > 0:
|
|
1355
|
+
current_limit = page_size if remaining is None else min(page_size, remaining)
|
|
1356
|
+
page = self._scoped_list_documents(
|
|
1357
|
+
skip=current_skip,
|
|
1358
|
+
limit=current_limit,
|
|
1359
|
+
filters=filters,
|
|
1360
|
+
folder_name=folder_name,
|
|
1361
|
+
folder_depth=folder_depth,
|
|
1362
|
+
end_user_id=end_user_id,
|
|
1363
|
+
include_total_count=total_source_count is None,
|
|
1364
|
+
include_status_counts=False,
|
|
1365
|
+
include_folder_counts=False,
|
|
1366
|
+
completed_only=completed_only,
|
|
1367
|
+
sort_by="updated_at",
|
|
1368
|
+
sort_direction="desc",
|
|
1369
|
+
)
|
|
1370
|
+
if total_source_count is None:
|
|
1371
|
+
total_source_count = page.total_count
|
|
1372
|
+
if not page.documents:
|
|
1373
|
+
break
|
|
1374
|
+
|
|
1375
|
+
for source_document in page.documents:
|
|
1376
|
+
try:
|
|
1377
|
+
result = self._migrate_single_document(
|
|
1378
|
+
target=target,
|
|
1379
|
+
source_document=source_document,
|
|
1380
|
+
use_colpali=use_colpali,
|
|
1381
|
+
preserve_folders=preserve_folders,
|
|
1382
|
+
preserve_end_user_id=preserve_end_user_id,
|
|
1383
|
+
preserve_summaries=preserve_summaries,
|
|
1384
|
+
include_source_metadata=include_source_metadata,
|
|
1385
|
+
on_conflict=on_conflict,
|
|
1386
|
+
)
|
|
1387
|
+
except Exception as exc: # noqa: BLE001
|
|
1388
|
+
result = MigrationDocumentResult(
|
|
1389
|
+
source_document_id=source_document.external_id,
|
|
1390
|
+
filename=source_document.filename,
|
|
1391
|
+
status="failed",
|
|
1392
|
+
error=str(exc),
|
|
1393
|
+
)
|
|
1394
|
+
if not continue_on_error:
|
|
1395
|
+
results.append(result)
|
|
1396
|
+
if progress_callback:
|
|
1397
|
+
progress_callback(result)
|
|
1398
|
+
raise
|
|
1399
|
+
|
|
1400
|
+
results.append(result)
|
|
1401
|
+
if progress_callback:
|
|
1402
|
+
progress_callback(result)
|
|
1403
|
+
|
|
1404
|
+
if remaining is not None:
|
|
1405
|
+
remaining -= len(page.documents)
|
|
1406
|
+
if not page.has_more:
|
|
1407
|
+
break
|
|
1408
|
+
next_skip = page.next_skip if page.next_skip is not None else current_skip + page.returned_count
|
|
1409
|
+
if next_skip <= current_skip:
|
|
1410
|
+
break
|
|
1411
|
+
current_skip = next_skip
|
|
1412
|
+
finally:
|
|
1413
|
+
target.close()
|
|
1414
|
+
|
|
1415
|
+
return self._build_migration_result(results, total_source_count)
|
|
1416
|
+
|
|
1417
|
+
def _migrate_single_document(
|
|
1418
|
+
self,
|
|
1419
|
+
*,
|
|
1420
|
+
target: "Morphik",
|
|
1421
|
+
source_document: Document,
|
|
1422
|
+
use_colpali: bool,
|
|
1423
|
+
preserve_folders: bool,
|
|
1424
|
+
preserve_end_user_id: bool,
|
|
1425
|
+
preserve_summaries: bool,
|
|
1426
|
+
include_source_metadata: bool,
|
|
1427
|
+
on_conflict: Literal["skip", "fail"],
|
|
1428
|
+
) -> MigrationDocumentResult:
|
|
1429
|
+
metadata, metadata_types = build_migration_metadata(
|
|
1430
|
+
source_document,
|
|
1431
|
+
include_source_metadata=include_source_metadata,
|
|
1432
|
+
)
|
|
1433
|
+
file_bytes = self.get_document_file(source_document.external_id)
|
|
1434
|
+
folder = (source_document.folder_path or source_document.folder_name) if preserve_folders else None
|
|
1435
|
+
end_user = source_document.end_user_id if preserve_end_user_id else None
|
|
1436
|
+
|
|
1437
|
+
status, target_document = target._ingest_migrated_document(
|
|
1438
|
+
source_document_id=source_document.external_id,
|
|
1439
|
+
file_content=file_bytes,
|
|
1440
|
+
filename=source_document.filename or source_document.external_id,
|
|
1441
|
+
content_type=source_document.content_type,
|
|
1442
|
+
metadata=metadata,
|
|
1443
|
+
metadata_types=metadata_types,
|
|
1444
|
+
folder_name=folder,
|
|
1445
|
+
end_user_id=end_user,
|
|
1446
|
+
use_colpali=use_colpali,
|
|
1447
|
+
on_conflict=on_conflict,
|
|
1448
|
+
)
|
|
1449
|
+
|
|
1450
|
+
if preserve_summaries and status == "created":
|
|
1451
|
+
self._copy_document_summary(source_document.external_id, target, target_document.external_id)
|
|
1452
|
+
|
|
1453
|
+
return MigrationDocumentResult(
|
|
1454
|
+
source_document_id=source_document.external_id,
|
|
1455
|
+
target_document_id=target_document.external_id,
|
|
1456
|
+
filename=source_document.filename,
|
|
1457
|
+
status=status,
|
|
1458
|
+
)
|
|
1459
|
+
|
|
1460
|
+
def _ingest_migrated_document(
|
|
1461
|
+
self,
|
|
1462
|
+
*,
|
|
1463
|
+
source_document_id: str,
|
|
1464
|
+
file_content: bytes,
|
|
1465
|
+
filename: str,
|
|
1466
|
+
content_type: Optional[str],
|
|
1467
|
+
metadata: Dict[str, Any],
|
|
1468
|
+
metadata_types: Optional[Dict[str, str]],
|
|
1469
|
+
folder_name: Optional[str],
|
|
1470
|
+
end_user_id: Optional[str],
|
|
1471
|
+
use_colpali: bool,
|
|
1472
|
+
on_conflict: Literal["skip", "fail"],
|
|
1473
|
+
) -> tuple[str, Document]:
|
|
1474
|
+
serialized_metadata, inferred_types = self._logic._serialize_metadata_map(metadata)
|
|
1475
|
+
metadata_type_payload = {**inferred_types, **(metadata_types or {})}
|
|
1476
|
+
metadata_type_payload = {
|
|
1477
|
+
key: value for key, value in metadata_type_payload.items() if key in serialized_metadata
|
|
1478
|
+
}
|
|
1479
|
+
|
|
1480
|
+
form_data: Dict[str, Any] = {
|
|
1481
|
+
"source_document_id": source_document_id,
|
|
1482
|
+
"metadata": json.dumps(serialized_metadata),
|
|
1483
|
+
"metadata_types": json.dumps(metadata_type_payload),
|
|
1484
|
+
"use_colpali": str(use_colpali).lower(),
|
|
1485
|
+
"on_conflict": on_conflict,
|
|
1486
|
+
}
|
|
1487
|
+
if folder_name:
|
|
1488
|
+
form_data["folder_name"] = folder_name
|
|
1489
|
+
if end_user_id:
|
|
1490
|
+
form_data["end_user_id"] = end_user_id
|
|
1491
|
+
|
|
1492
|
+
file_obj = BytesIO(file_content)
|
|
1493
|
+
files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
|
|
1494
|
+
response = self._request("POST", "migrate/document", data=form_data, files=files)
|
|
1495
|
+
document = self._logic._parse_document_response(response["document"])
|
|
1496
|
+
document._client = self
|
|
1497
|
+
return response.get("status", "created"), document
|
|
1498
|
+
|
|
1499
|
+
def _copy_document_summary(self, source_document_id: str, target: "Morphik", target_document_id: str) -> None:
|
|
1500
|
+
try:
|
|
1501
|
+
summary = self.get_document_summary(source_document_id)
|
|
1502
|
+
except httpx.HTTPStatusError as exc:
|
|
1503
|
+
if exc.response.status_code == 404:
|
|
1504
|
+
return
|
|
1505
|
+
raise
|
|
1506
|
+
target.upsert_document_summary(
|
|
1507
|
+
document_id=target_document_id,
|
|
1508
|
+
content=summary.content,
|
|
1509
|
+
versioning=False,
|
|
1510
|
+
overwrite_latest=True,
|
|
1511
|
+
)
|
|
1512
|
+
|
|
1513
|
+
@staticmethod
|
|
1514
|
+
def _build_migration_result(
|
|
1515
|
+
results: List[MigrationDocumentResult],
|
|
1516
|
+
total_source_count: Optional[int],
|
|
1517
|
+
) -> MigrationResult:
|
|
1518
|
+
return MigrationResult(
|
|
1519
|
+
documents=results,
|
|
1520
|
+
total_source_count=total_source_count,
|
|
1521
|
+
attempted_count=len(results),
|
|
1522
|
+
created_count=sum(1 for item in results if item.status == "created"),
|
|
1523
|
+
skipped_count=sum(1 for item in results if item.status == "skipped"),
|
|
1524
|
+
failed_count=sum(1 for item in results if item.status == "failed"),
|
|
1525
|
+
)
|
|
1526
|
+
|
|
1308
1527
|
def get_document(self, document_id: str) -> Document:
|
|
1309
1528
|
"""
|
|
1310
1529
|
Get document metadata by ID.
|
|
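--- morphik-1.2.2/morphik/tests/test_scoped_ops_unit.py
+++ morphik-1.2.4/morphik/tests/test_scoped_ops_unit.py
@@ -1,3 +1,5 @@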
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
1
3
|
import httpx
|
|
2
4
|
import jwt
|
|
3
5
|
import pytest
|
|
@@ -305,6 +307,59 @@ def test_sync_get_document_by_filename_scoped_params_and_encoding():
|
|
|
305
307
|
client.close()
|
|
306
308
|
|
|
307
309
|
|
|
310
|
+
def test_sync_ingest_migrated_document_posts_migration_payload():
    """Check the request that ``_ingest_migrated_document`` sends.

    ``_request`` is replaced with a recording stub, so no network traffic
    occurs. The test asserts the HTTP method and endpoint, the plain form
    fields, the JSON-encoded ``metadata`` / ``metadata_types`` fields, the
    uploaded file's name and content type, and the returned
    ``(status, document)`` pair.
    """
    recorded = []

    def fake_request(method, endpoint, data=None, files=None, params=None):
        # Record the outgoing request and answer with a canned response.
        recorded.append(
            {"method": method, "endpoint": endpoint, "data": data, "files": files, "params": params}
        )
        document_payload = {
            "external_id": "source-doc-1",
            "content_type": "application/pdf",
            "filename": "report.pdf",
        }
        return {"status": "created", "document": document_payload}

    client = Morphik()
    client._request = fake_request  # type: ignore[attr-defined]
    try:
        status, doc = client._ingest_migrated_document(
            source_document_id="source-doc-1",
            file_content=b"pdf-bytes",
            filename="report.pdf",
            content_type="application/pdf",
            metadata={"category": "finance", "_morphik_migration": {"source_document_id": "source-doc-1"}},
            metadata_types={"category": "string", "_morphik_migration": "object"},
            folder_name="/finance/reports",
            end_user_id="customer-1",
            use_colpali=True,
            on_conflict="skip",
        )
        sent = recorded.pop()

        assert sent["method"] == "POST"
        assert sent["endpoint"] == "migrate/document"

        # Plain form fields, checked in the same order as they are listed.
        expected_form_fields = (
            ("source_document_id", "source-doc-1"),
            ("folder_name", "/finance/reports"),
            ("end_user_id", "customer-1"),
            ("use_colpali", "true"),
            ("on_conflict", "skip"),
        )
        for field_name, expected_value in expected_form_fields:
            assert sent["data"][field_name] == expected_value

        # Structured fields are sent as JSON strings; decode before comparing.
        decoded_metadata = json.loads(sent["data"]["metadata"])
        assert decoded_metadata == {
            "category": "finance",
            "_morphik_migration": {"source_document_id": "source-doc-1"},
        }
        decoded_types = json.loads(sent["data"]["metadata_types"])
        assert decoded_types == {
            "category": "string",
            "_morphik_migration": "object",
        }

        # Index 0 of the upload entry is the filename, index 2 the content type.
        upload = sent["files"]["file"]
        assert upload[0] == "report.pdf"
        assert upload[2] == "application/pdf"

        assert status == "created"
        assert doc.external_id == "source-doc-1"
    finally:
        client.close()
|
|
361
|
+
|
|
362
|
+
|
|
308
363
|
def test_sync_folder_get_document_by_filename_scoped():
|
|
309
364
|
client, calls = _make_sync_client()
|
|
310
365
|
try:
|
|
@@ -7,6 +7,7 @@ from morphik._shared import (
|
|
|
7
7
|
build_document_by_filename_params,
|
|
8
8
|
build_list_apps_params,
|
|
9
9
|
build_logs_params,
|
|
10
|
+
build_migration_metadata,
|
|
10
11
|
build_rename_app_params,
|
|
11
12
|
build_requeue_payload,
|
|
12
13
|
build_rotate_app_params,
|
|
@@ -14,7 +15,7 @@ from morphik._shared import (
|
|
|
14
15
|
merge_folders,
|
|
15
16
|
normalize_additional_folders,
|
|
16
17
|
)
|
|
17
|
-
from morphik.models import RequeueIngestionJob
|
|
18
|
+
from morphik.models import Document, RequeueIngestionJob
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
def test_merge_folders_variants():
|
|
@@ -119,3 +120,37 @@ def test_normalize_additional_folders_alias():
|
|
|
119
120
|
assert normalize_additional_folders(None, "b") == ["b"]
|
|
120
121
|
assert normalize_additional_folders(["a"], "b") == ["a", "b"]
|
|
121
122
|
assert normalize_additional_folders(["a"], ["b", "c"]) == ["a", "b", "c"]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def test_build_migration_metadata_strips_managed_fields_and_records_source():
    """Check the metadata that ``build_migration_metadata`` builds for a document.

    The source document carries ``external_id``, ``folder_name`` and
    ``end_user_id`` inside its metadata next to the user-defined ``category``
    key. The test asserts that those three keys are absent from the result,
    that ``category`` is kept, and that a ``_morphik_migration`` entry records
    the source document and app IDs and is typed as ``"object"``.
    """
    source_metadata = {
        "external_id": "source-doc-1",
        "folder_name": "/finance/reports",
        "end_user_id": "customer-1",
        "category": "finance",
    }
    source_metadata_types = {
        "external_id": "string",
        "folder_name": "string",
        "end_user_id": "string",
        "category": "string",
    }
    timestamps = {"created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-02T00:00:00Z"}

    doc = Document(
        external_id="source-doc-1",
        content_type="application/pdf",
        filename="report.pdf",
        app_id="source-app",
        folder_path="/finance/reports",
        end_user_id="customer-1",
        metadata=source_metadata,
        metadata_types=source_metadata_types,
        system_metadata=timestamps,
    )

    metadata, metadata_types = build_migration_metadata(doc)

    # The user-defined key is kept.
    assert metadata["category"] == "finance"

    # These three keys must not appear in the migrated metadata.
    for managed_key in ("external_id", "folder_name", "end_user_id"):
        assert managed_key not in metadata

    # Provenance is recorded under the reserved migration key.
    migration_info = metadata["_morphik_migration"]
    assert migration_info["source_document_id"] == "source-doc-1"
    assert metadata["_morphik_migration"]["source_app_id"] == "source-app"

    assert metadata_types == {"category": "string", "_morphik_migration": "object"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|