morphik 1.2.2__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. {morphik-1.2.2 → morphik-1.2.4}/.gitignore +0 -3
  2. {morphik-1.2.2 → morphik-1.2.4}/PKG-INFO +7 -1
  3. {morphik-1.2.2 → morphik-1.2.4}/README.md +6 -0
  4. {morphik-1.2.2 → morphik-1.2.4}/morphik/__init__.py +4 -2
  5. {morphik-1.2.2 → morphik-1.2.4}/morphik/_shared.py +54 -1
  6. {morphik-1.2.2 → morphik-1.2.4}/morphik/async_.py +234 -2
  7. {morphik-1.2.2 → morphik-1.2.4}/morphik/models.py +21 -0
  8. {morphik-1.2.2 → morphik-1.2.4}/morphik/sync.py +221 -2
  9. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_scoped_ops_unit.py +55 -0
  10. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_shared_helpers.py +36 -1
  11. {morphik-1.2.2 → morphik-1.2.4}/pyproject.toml +1 -1
  12. {morphik-1.2.2 → morphik-1.2.4}/morphik/_internal.py +0 -0
  13. {morphik-1.2.2 → morphik-1.2.4}/morphik/_scoped_ops.py +0 -0
  14. {morphik-1.2.2 → morphik-1.2.4}/morphik/exceptions.py +0 -0
  15. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/README.md +0 -0
  16. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/__init__.py +0 -0
  17. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/example_usage.py +0 -0
  18. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_app_ops.py +0 -0
  19. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_async.py +0 -0
  20. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_docs/sample1.txt +0 -0
  21. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_docs/sample2.txt +0 -0
  22. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_docs/sample3.txt +0 -0
  23. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_sync.py +0 -0
  24. {morphik-1.2.2 → morphik-1.2.4}/morphik/tests/test_update_document_metadata_rename.py +0 -0
@@ -58,6 +58,3 @@ multi_vector_embeddings_*.json
58
58
  # Rust build artifacts
59
59
  morphik_rust/target/
60
60
  morphik_rust/Cargo.lock
61
- DOCS_BY_FACILITIES/
62
- eval_freshworks/
63
- iep_test/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: morphik
3
- Version: 1.2.2
3
+ Version: 1.2.4
4
4
  Summary: Morphik Python Client
5
5
  Author-email: Morphik <founders@morphik.ai>
6
6
  Requires-Python: >=3.8
@@ -91,6 +91,12 @@ response = db.query(
91
91
  )
92
92
 
93
93
  print(response.completion)
94
+
95
+ # Migrate this app's documents into another Morphik deployment.
96
+ # Run this from a machine that can reach both source and target, such as
97
+ # inside a customer's VPN for on-prem targets.
98
+ result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
99
+ print(result.created_count, result.skipped_count, result.failed_count)
94
100
  ```
95
101
 
96
102
  ### Nested Folders & Folder Depth
@@ -78,6 +78,12 @@ response = db.query(
78
78
  )
79
79
 
80
80
  print(response.completion)
81
+
82
+ # Migrate this app's documents into another Morphik deployment.
83
+ # Run this from a machine that can reach both source and target, such as
84
+ # inside a customer's VPN for on-prem targets.
85
+ result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
86
+ print(result.created_count, result.skipped_count, result.failed_count)
81
87
  ```
82
88
 
83
89
  ### Nested Folders & Folder Depth
@@ -3,7 +3,7 @@ Morphik Python SDK for document ingestion and querying.
3
3
  """
4
4
 
5
5
  from .async_ import AsyncMorphik
6
- from .models import Document, DocumentQueryResponse, Summary
6
+ from .models import Document, DocumentQueryResponse, MigrationDocumentResult, MigrationResult, Summary
7
7
  from .sync import Morphik
8
8
 
9
9
  __all__ = [
@@ -12,6 +12,8 @@ __all__ = [
12
12
  "Document",
13
13
  "Summary",
14
14
  "DocumentQueryResponse",
15
+ "MigrationDocumentResult",
16
+ "MigrationResult",
15
17
  ]
16
18
 
17
- __version__ = "1.2.2"
19
+ __version__ = "1.2.4"
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  from pathlib import Path
5
- from typing import Any, Dict, Iterable, List, Optional, Union
5
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
6
6
  from urllib.parse import quote
7
7
 
8
8
  from pydantic import BaseModel
@@ -10,6 +10,17 @@ from pydantic import BaseModel
10
10
  MAX_LIMIT = 500
11
11
  MIN_LOG_HOURS = 0.1
12
12
  MAX_LOG_HOURS = 168.0
13
+ MIGRATION_SOURCE_METADATA_KEY = "_morphik_migration"
14
+ MIGRATION_RESERVED_METADATA_FIELDS = {
15
+ "app_id",
16
+ "end_user_id",
17
+ "external_id",
18
+ "filename",
19
+ "folder_id",
20
+ "folder_name",
21
+ "folder_path",
22
+ "owner_id",
23
+ }
13
24
 
14
25
 
15
26
  def merge_folders(
@@ -197,3 +208,45 @@ def normalize_additional_folders(
197
208
  if additional_folders:
198
209
  return list(additional_folders) + folder_list
199
210
  return folder_list
211
+
212
+
213
def build_migration_metadata(
    document: Any,
    *,
    include_source_metadata: bool = True,
) -> Tuple[Dict[str, Any], Dict[str, str]]:
    """Prepare document metadata for migration ingestion.

    The target API owns fields such as external_id, folder_name, and app_id, so
    those values must travel through dedicated migration parameters instead of
    user metadata.

    Args:
        document: Source document. Its ``metadata``, ``metadata_types``,
            ``system_metadata``, ``external_id``, ``app_id`` and ``filename``
            attributes are read with ``getattr``; missing ones count as absent.
        include_source_metadata: When true, record where the document came
            from under ``MIGRATION_SOURCE_METADATA_KEY``.

    Returns:
        A ``(metadata, metadata_types)`` pair of fresh dicts. The dicts held by
        ``document`` are never mutated.
    """
    # Shallow copies so stripping reserved fields does not touch the source document.
    metadata = dict(getattr(document, "metadata", None) or {})
    metadata_types = dict(getattr(document, "metadata_types", None) or {})

    for field in MIGRATION_RESERVED_METADATA_FIELDS:
        metadata.pop(field, None)
        metadata_types.pop(field, None)

    if not include_source_metadata:
        return metadata, metadata_types

    system_metadata = getattr(document, "system_metadata", None)
    if not isinstance(system_metadata, dict):
        # Anything that is not a dict contributes no timestamps.
        system_metadata = {}

    candidates = {
        "source_document_id": getattr(document, "external_id", None),
        "source_app_id": getattr(document, "app_id", None),
        "source_filename": getattr(document, "filename", None),
        "source_created_at": system_metadata.get("created_at"),
        "source_updated_at": system_metadata.get("updated_at"),
    }
    source_info = {key: value for key, value in candidates.items() if value is not None}

    existing_source_info = metadata.get(MIGRATION_SOURCE_METADATA_KEY)
    if isinstance(existing_source_info, dict):
        # An entry from an earlier migration wins key by key; only gaps are filled.
        merged_source_info = dict(existing_source_info)
        for key, value in source_info.items():
            merged_source_info.setdefault(key, value)
    else:
        merged_source_info = source_info

    metadata[MIGRATION_SOURCE_METADATA_KEY] = merged_source_info
    # The value stored above is always a dict, so the type hint must be "object"
    # even if the source document declared a different type for this key.
    metadata_types[MIGRATION_SOURCE_METADATA_KEY] = "object"

    return metadata, metadata_types
@@ -1,3 +1,4 @@
1
+ import inspect
1
2
  import json
2
3
  import logging
3
4
  import warnings
@@ -12,13 +13,14 @@ from pydantic import BaseModel
12
13
  from ._internal import FinalChunkResult, _MorphikClientLogic
13
14
  from ._scoped_ops import _ScopedOperationsMixin
14
15
  from ._shared import (
16
+ build_create_app_payload,
17
+ build_document_by_filename_params,
15
18
  build_folder_endpoint_identifier,
16
19
  build_folder_move_payload,
17
20
  build_folder_rename_path,
18
- build_create_app_payload,
19
- build_document_by_filename_params,
20
21
  build_list_apps_params,
21
22
  build_logs_params,
23
+ build_migration_metadata,
22
24
  build_rename_app_params,
23
25
  build_requeue_payload,
24
26
  build_rotate_app_params,
@@ -42,6 +44,8 @@ from .models import (
42
44
  IngestTextRequest,
43
45
  ListDocsResponse,
44
46
  LogResponse,
47
+ MigrationDocumentResult,
48
+ MigrationResult,
45
49
  QueryPromptOverrides,
46
50
  RequeueIngestionJob,
47
51
  RequeueIngestionResponse,
@@ -1266,6 +1270,234 @@ class AsyncMorphik(_ScopedOperationsMixin):
1266
1270
  sort_direction=sort_direction,
1267
1271
  )
1268
1272
 
1273
async def migrate(
    self,
    target_uri: str,
    *,
    filters: Optional[Dict[str, Any]] = None,
    folder_name: Optional[Union[str, List[str]]] = None,
    folder_depth: Optional[int] = None,
    end_user_id: Optional[str] = None,
    skip: int = 0,
    limit: Optional[int] = None,
    batch_size: int = 100,
    completed_only: bool = True,
    use_colpali: bool = True,
    preserve_folders: bool = True,
    preserve_end_user_id: bool = True,
    preserve_summaries: bool = True,
    include_source_metadata: bool = True,
    on_conflict: Literal["skip", "fail"] = "skip",
    continue_on_error: bool = True,
    target_timeout: Optional[int] = None,
    target_is_local: bool = False,
    progress_callback: Optional[Callable[[MigrationDocumentResult], Any]] = None,
) -> MigrationResult:
    """Copy this client's documents into the Morphik deployment at ``target_uri``.

    Source documents are listed page by page (newest ``updated_at`` first) and
    migrated one at a time through a temporary target client, which is closed
    before this method returns or raises.

    Args:
        target_uri: Morphik URI of the destination.
        filters, folder_name, folder_depth, end_user_id, completed_only:
            Forwarded to the source document listing to select what to migrate.
        skip: Listing offset to start from; negative values are treated as 0.
        limit: Maximum number of listed documents to process (None = no cap).
        batch_size: Listing page size, capped at 500.
        use_colpali, preserve_folders, preserve_end_user_id,
        preserve_summaries, include_source_metadata, on_conflict:
            Forwarded to the per-document migration step.
        continue_on_error: When false, the first per-document failure is
            reported to ``progress_callback`` and then re-raised.
        target_timeout: Timeout for the target client; falls back to this
            client's timeout when not set.
        target_is_local: Set to True for local/on-prem targets that should use
            HTTP or skip TLS verification.
        progress_callback: Called with each per-document result; an awaitable
            return value is awaited.

    Returns:
        A MigrationResult aggregating every per-document outcome.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``limit`` is negative.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be greater than 0")
    if limit is not None and limit < 0:
        raise ValueError("limit must be greater than or equal to 0")

    destination = AsyncMorphik(
        target_uri,
        timeout=target_timeout or self._logic._timeout,
        is_local=target_is_local,
    )
    outcomes: List[MigrationDocumentResult] = []
    source_total: Optional[int] = None
    offset = max(skip, 0)
    budget = limit
    per_page = min(batch_size, 500)
    document_options = {
        "use_colpali": use_colpali,
        "preserve_folders": preserve_folders,
        "preserve_end_user_id": preserve_end_user_id,
        "preserve_summaries": preserve_summaries,
        "include_source_metadata": include_source_metadata,
        "on_conflict": on_conflict,
    }

    try:
        while budget is None or budget > 0:
            if budget is None:
                fetch_count = per_page
            else:
                fetch_count = min(per_page, budget)

            page = await self._scoped_list_documents(
                skip=offset,
                limit=fetch_count,
                filters=filters,
                folder_name=folder_name,
                folder_depth=folder_depth,
                end_user_id=end_user_id,
                # Ask for the total only until one has been obtained.
                include_total_count=source_total is None,
                include_status_counts=False,
                include_folder_counts=False,
                completed_only=completed_only,
                sort_by="updated_at",
                sort_direction="desc",
            )
            if source_total is None:
                source_total = page.total_count
            if not page.documents:
                break

            for source_document in page.documents:
                try:
                    outcome = await self._migrate_single_document(
                        target=destination,
                        source_document=source_document,
                        **document_options,
                    )
                except Exception as exc:  # noqa: BLE001
                    outcome = MigrationDocumentResult(
                        source_document_id=source_document.external_id,
                        filename=source_document.filename,
                        status="failed",
                        error=str(exc),
                    )
                    if not continue_on_error:
                        # Report the failure before propagating it.
                        outcomes.append(outcome)
                        await self._emit_migration_progress(progress_callback, outcome)
                        raise

                outcomes.append(outcome)
                await self._emit_migration_progress(progress_callback, outcome)

            if budget is not None:
                budget -= len(page.documents)
            if not page.has_more:
                break

            if page.next_skip is None:
                following_offset = offset + page.returned_count
            else:
                following_offset = page.next_skip
            # Stop if the listing would not move forward.
            if following_offset <= offset:
                break
            offset = following_offset
    finally:
        await destination.close()

    return self._build_migration_result(outcomes, source_total)
1374
+
1375
async def _migrate_single_document(
    self,
    *,
    target: "AsyncMorphik",
    source_document: Document,
    use_colpali: bool,
    preserve_folders: bool,
    preserve_end_user_id: bool,
    preserve_summaries: bool,
    include_source_metadata: bool,
    on_conflict: Literal["skip", "fail"],
) -> MigrationDocumentResult:
    """Migrate one source document into ``target`` and report the outcome.

    Downloads the file from this client, ingests it through the target's
    migration endpoint and, when the target reports it as newly created and
    ``preserve_summaries`` is set, copies the document summary as well.
    """
    source_id = source_document.external_id
    metadata, metadata_types = build_migration_metadata(
        source_document,
        include_source_metadata=include_source_metadata,
    )
    payload = await self.get_document_file(source_id)

    destination_folder = None
    if preserve_folders:
        # Prefer the full path; fall back to the plain folder name.
        destination_folder = source_document.folder_path or source_document.folder_name
    destination_end_user = None
    if preserve_end_user_id:
        destination_end_user = source_document.end_user_id

    status, target_document = await target._ingest_migrated_document(
        source_document_id=source_id,
        file_content=payload,
        filename=source_document.filename or source_id,
        content_type=source_document.content_type,
        metadata=metadata,
        metadata_types=metadata_types,
        folder_name=destination_folder,
        end_user_id=destination_end_user,
        use_colpali=use_colpali,
        on_conflict=on_conflict,
    )

    if status == "created" and preserve_summaries:
        await self._copy_document_summary(source_id, target, target_document.external_id)

    return MigrationDocumentResult(
        source_document_id=source_id,
        target_document_id=target_document.external_id,
        filename=source_document.filename,
        status=status,
    )
1417
+
1418
async def _ingest_migrated_document(
    self,
    *,
    source_document_id: str,
    file_content: bytes,
    filename: str,
    content_type: Optional[str],
    metadata: Dict[str, Any],
    metadata_types: Optional[Dict[str, str]],
    folder_name: Optional[str],
    end_user_id: Optional[str],
    use_colpali: bool,
    on_conflict: Literal["skip", "fail"],
) -> "tuple[str, Document]":
    """POST one migrated document to this client's ``migrate/document`` endpoint.

    Args:
        source_document_id: ID of the document in the source app.
        file_content: Raw file bytes to upload.
        filename: Name sent for the uploaded file part.
        content_type: MIME type of the file; ``application/octet-stream`` is
            used when it is empty or None.
        metadata: User metadata to attach to the target document.
        metadata_types: Explicit type hints; they override inferred ones.
        folder_name: Target folder, sent only when truthy.
        end_user_id: End-user scope, sent only when truthy.
        use_colpali: Sent as the lowercase string ``"true"``/``"false"``.
        on_conflict: Conflict policy forwarded to the server.

    Returns:
        ``(status, document)`` where ``status`` comes from the response and
        defaults to ``"created"`` when absent, and ``document`` is the parsed
        response document bound to this client.
    """
    # NOTE: the return annotation is quoted on purpose. Unquoted `tuple[...]`
    # is evaluated when the function is defined and raises TypeError on
    # Python 3.8, which the package metadata still declares as supported.
    serialized_metadata, inferred_types = self._logic._serialize_metadata_map(metadata)
    metadata_type_payload = {**inferred_types, **(metadata_types or {})}
    # Only send hints for keys that survived serialization.
    metadata_type_payload = {
        key: value for key, value in metadata_type_payload.items() if key in serialized_metadata
    }

    form_data: Dict[str, Any] = {
        "source_document_id": source_document_id,
        "metadata": json.dumps(serialized_metadata),
        "metadata_types": json.dumps(metadata_type_payload),
        "use_colpali": str(use_colpali).lower(),
        "on_conflict": on_conflict,
    }
    if folder_name:
        form_data["folder_name"] = folder_name
    if end_user_id:
        form_data["end_user_id"] = end_user_id

    file_obj = BytesIO(file_content)
    files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
    response = await self._request("POST", "migrate/document", data=form_data, files=files)
    document = self._logic._parse_document_response(response["document"])
    document._client = self
    return response.get("status", "created"), document
1456
+
1457
async def _copy_document_summary(
    self,
    source_document_id: str,
    target: "AsyncMorphik",
    target_document_id: str,
) -> None:
    """Copy a document summary from this client to ``target``.

    A 404 while fetching the source summary means there is nothing to copy
    and is ignored; any other HTTP status error propagates.
    """
    try:
        summary = await self.get_document_summary(source_document_id)
    except httpx.HTTPStatusError as http_error:
        if http_error.response.status_code != 404:
            raise
        return
    else:
        await target.upsert_document_summary(
            document_id=target_document_id,
            content=summary.content,
            versioning=False,
            overwrite_latest=True,
        )
1475
+
1476
@staticmethod
async def _emit_migration_progress(
    progress_callback: Optional[Callable[[MigrationDocumentResult], Any]],
    result: MigrationDocumentResult,
) -> None:
    """Invoke ``progress_callback`` with ``result``, awaiting it if needed.

    Does nothing when no callback was supplied. Plain and coroutine callbacks
    are both supported: whatever the callback returns is awaited only when it
    is awaitable.
    """
    if progress_callback is not None:
        maybe_awaitable = progress_callback(result)
        if inspect.isawaitable(maybe_awaitable):
            await maybe_awaitable
1486
+
1487
@staticmethod
def _build_migration_result(
    results: List[MigrationDocumentResult],
    total_source_count: Optional[int],
) -> MigrationResult:
    """Aggregate per-document results into a MigrationResult with status counts."""
    tally = {"created": 0, "skipped": 0, "failed": 0}
    for item in results:
        if item.status in tally:
            tally[item.status] += 1

    return MigrationResult(
        documents=results,
        total_source_count=total_source_count,
        attempted_count=len(results),
        created_count=tally["created"],
        skipped_count=tally["skipped"],
        failed_count=tally["failed"],
    )
1500
+
1269
1501
  async def get_document(self, document_id: str) -> Document:
1270
1502
  """
1271
1503
  Get document metadata by ID.
@@ -415,6 +415,27 @@ class ListDocsResponse(BaseModel):
415
415
  folder_counts: Optional[List[FolderCount]] = Field(None, description="Document counts by folder")
416
416
 
417
417
 
418
class MigrationDocumentResult(BaseModel):
    """Outcome of migrating a single source document."""

    # Required: identifies the document on the source side.
    source_document_id: str = Field(
        default=...,
        description="Document ID in the source Morphik app",
    )
    target_document_id: Optional[str] = Field(
        default=None,
        description="Document ID in the target Morphik app",
    )
    filename: Optional[str] = Field(default=None, description="Migrated filename")
    # Required: one of the three literal outcomes.
    status: Literal["created", "skipped", "failed"] = Field(
        default=...,
        description="Migration outcome",
    )
    error: Optional[str] = Field(
        default=None,
        description="Error message when status is failed",
    )
426
+
427
+
428
class MigrationResult(BaseModel):
    """Aggregate outcome of a ``client.migrate`` run."""

    # Per-document results; empty by default.
    documents: List[MigrationDocumentResult] = Field(default_factory=list)
    total_source_count: Optional[int] = Field(
        default=None,
        description="Total matching source documents, when available",
    )
    attempted_count: int = Field(
        default=...,
        description="Number of source documents attempted",
    )
    created_count: int = Field(
        default=...,
        description="Number of target documents created",
    )
    skipped_count: int = Field(
        default=...,
        description="Number of target documents skipped because they already existed",
    )
    failed_count: int = Field(
        default=...,
        description="Number of documents that failed migration",
    )
437
+
438
+
418
439
  class IngestTextRequest(BaseModel):
419
440
  """Request model for ingesting text content"""
420
441
 
@@ -12,13 +12,14 @@ from pydantic import BaseModel
12
12
  from ._internal import FinalChunkResult, _MorphikClientLogic
13
13
  from ._scoped_ops import _ScopedOperationsMixin
14
14
  from ._shared import (
15
+ build_create_app_payload,
16
+ build_document_by_filename_params,
15
17
  build_folder_endpoint_identifier,
16
18
  build_folder_move_payload,
17
19
  build_folder_rename_path,
18
- build_create_app_payload,
19
- build_document_by_filename_params,
20
20
  build_list_apps_params,
21
21
  build_logs_params,
22
+ build_migration_metadata,
22
23
  build_rename_app_params,
23
24
  build_requeue_payload,
24
25
  build_rotate_app_params,
@@ -42,6 +43,8 @@ from .models import (
42
43
  IngestTextRequest,
43
44
  ListDocsResponse,
44
45
  LogResponse,
46
+ MigrationDocumentResult,
47
+ MigrationResult,
45
48
  QueryPromptOverrides,
46
49
  RequeueIngestionJob,
47
50
  RequeueIngestionResponse,
@@ -1305,6 +1308,222 @@ class Morphik(_ScopedOperationsMixin):
1305
1308
  sort_direction=sort_direction,
1306
1309
  )
1307
1310
 
1311
def migrate(
    self,
    target_uri: str,
    *,
    filters: Optional[Dict[str, Any]] = None,
    folder_name: Optional[Union[str, List[str]]] = None,
    folder_depth: Optional[int] = None,
    end_user_id: Optional[str] = None,
    skip: int = 0,
    limit: Optional[int] = None,
    batch_size: int = 100,
    completed_only: bool = True,
    use_colpali: bool = True,
    preserve_folders: bool = True,
    preserve_end_user_id: bool = True,
    preserve_summaries: bool = True,
    include_source_metadata: bool = True,
    on_conflict: Literal["skip", "fail"] = "skip",
    continue_on_error: bool = True,
    target_timeout: Optional[int] = None,
    target_is_local: bool = False,
    progress_callback: Optional[Callable[[MigrationDocumentResult], None]] = None,
) -> MigrationResult:
    """Copy this client's documents into the Morphik deployment at ``target_uri``.

    Run this from a network location that can reach both the source and the
    target, for example from inside a customer's VPN when the target
    deployment is on-prem. Source documents are listed page by page (newest
    ``updated_at`` first) and migrated one at a time through a temporary
    target client, which is closed before this method returns or raises.

    Args:
        target_uri: Morphik URI of the destination.
        filters, folder_name, folder_depth, end_user_id, completed_only:
            Forwarded to the source document listing to select what to migrate.
        skip: Listing offset to start from; negative values are treated as 0.
        limit: Maximum number of listed documents to process (None = no cap).
        batch_size: Listing page size, capped at 500.
        use_colpali, preserve_folders, preserve_end_user_id,
        preserve_summaries, include_source_metadata, on_conflict:
            Forwarded to the per-document migration step.
        continue_on_error: When false, the first per-document failure is
            reported to ``progress_callback`` and then re-raised.
        target_timeout: Timeout for the target client; falls back to this
            client's timeout when not set.
        target_is_local: Set to True for local/on-prem targets that should use
            HTTP or skip TLS verification.
        progress_callback: Called with each per-document result.

    Returns:
        A MigrationResult aggregating every per-document outcome.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``limit`` is negative.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be greater than 0")
    if limit is not None and limit < 0:
        raise ValueError("limit must be greater than or equal to 0")

    destination = Morphik(
        target_uri,
        timeout=target_timeout or self._logic._timeout,
        is_local=target_is_local,
    )
    outcomes: List[MigrationDocumentResult] = []
    source_total: Optional[int] = None
    offset = max(skip, 0)
    budget = limit
    per_page = min(batch_size, 500)
    document_options = {
        "use_colpali": use_colpali,
        "preserve_folders": preserve_folders,
        "preserve_end_user_id": preserve_end_user_id,
        "preserve_summaries": preserve_summaries,
        "include_source_metadata": include_source_metadata,
        "on_conflict": on_conflict,
    }

    def record(outcome: MigrationDocumentResult) -> None:
        # Store the outcome first, then notify the caller if a callback was given.
        outcomes.append(outcome)
        if progress_callback:
            progress_callback(outcome)

    try:
        while budget is None or budget > 0:
            if budget is None:
                fetch_count = per_page
            else:
                fetch_count = min(per_page, budget)

            page = self._scoped_list_documents(
                skip=offset,
                limit=fetch_count,
                filters=filters,
                folder_name=folder_name,
                folder_depth=folder_depth,
                end_user_id=end_user_id,
                # Ask for the total only until one has been obtained.
                include_total_count=source_total is None,
                include_status_counts=False,
                include_folder_counts=False,
                completed_only=completed_only,
                sort_by="updated_at",
                sort_direction="desc",
            )
            if source_total is None:
                source_total = page.total_count
            if not page.documents:
                break

            for source_document in page.documents:
                try:
                    outcome = self._migrate_single_document(
                        target=destination,
                        source_document=source_document,
                        **document_options,
                    )
                except Exception as exc:  # noqa: BLE001
                    outcome = MigrationDocumentResult(
                        source_document_id=source_document.external_id,
                        filename=source_document.filename,
                        status="failed",
                        error=str(exc),
                    )
                    if not continue_on_error:
                        # Report the failure before propagating it.
                        record(outcome)
                        raise

                record(outcome)

            if budget is not None:
                budget -= len(page.documents)
            if not page.has_more:
                break

            if page.next_skip is None:
                following_offset = offset + page.returned_count
            else:
                following_offset = page.next_skip
            # Stop if the listing would not move forward.
            if following_offset <= offset:
                break
            offset = following_offset
    finally:
        destination.close()

    return self._build_migration_result(outcomes, source_total)
1416
+
1417
def _migrate_single_document(
    self,
    *,
    target: "Morphik",
    source_document: Document,
    use_colpali: bool,
    preserve_folders: bool,
    preserve_end_user_id: bool,
    preserve_summaries: bool,
    include_source_metadata: bool,
    on_conflict: Literal["skip", "fail"],
) -> MigrationDocumentResult:
    """Migrate one source document into ``target`` and report the outcome.

    Downloads the file from this client, ingests it through the target's
    migration endpoint and, when the target reports it as newly created and
    ``preserve_summaries`` is set, copies the document summary as well.
    """
    source_id = source_document.external_id
    metadata, metadata_types = build_migration_metadata(
        source_document,
        include_source_metadata=include_source_metadata,
    )
    payload = self.get_document_file(source_id)

    destination_folder = None
    if preserve_folders:
        # Prefer the full path; fall back to the plain folder name.
        destination_folder = source_document.folder_path or source_document.folder_name
    destination_end_user = None
    if preserve_end_user_id:
        destination_end_user = source_document.end_user_id

    status, target_document = target._ingest_migrated_document(
        source_document_id=source_id,
        file_content=payload,
        filename=source_document.filename or source_id,
        content_type=source_document.content_type,
        metadata=metadata,
        metadata_types=metadata_types,
        folder_name=destination_folder,
        end_user_id=destination_end_user,
        use_colpali=use_colpali,
        on_conflict=on_conflict,
    )

    if status == "created" and preserve_summaries:
        self._copy_document_summary(source_id, target, target_document.external_id)

    return MigrationDocumentResult(
        source_document_id=source_id,
        target_document_id=target_document.external_id,
        filename=source_document.filename,
        status=status,
    )
1459
+
1460
def _ingest_migrated_document(
    self,
    *,
    source_document_id: str,
    file_content: bytes,
    filename: str,
    content_type: Optional[str],
    metadata: Dict[str, Any],
    metadata_types: Optional[Dict[str, str]],
    folder_name: Optional[str],
    end_user_id: Optional[str],
    use_colpali: bool,
    on_conflict: Literal["skip", "fail"],
) -> "tuple[str, Document]":
    """POST one migrated document to this client's ``migrate/document`` endpoint.

    Args:
        source_document_id: ID of the document in the source app.
        file_content: Raw file bytes to upload.
        filename: Name sent for the uploaded file part.
        content_type: MIME type of the file; ``application/octet-stream`` is
            used when it is empty or None.
        metadata: User metadata to attach to the target document.
        metadata_types: Explicit type hints; they override inferred ones.
        folder_name: Target folder, sent only when truthy.
        end_user_id: End-user scope, sent only when truthy.
        use_colpali: Sent as the lowercase string ``"true"``/``"false"``.
        on_conflict: Conflict policy forwarded to the server.

    Returns:
        ``(status, document)`` where ``status`` comes from the response and
        defaults to ``"created"`` when absent, and ``document`` is the parsed
        response document bound to this client.
    """
    # NOTE: the return annotation is quoted on purpose. Unquoted `tuple[...]`
    # is evaluated when the function is defined (unless the module enables
    # postponed annotations) and raises TypeError on Python 3.8, which the
    # package metadata still declares as supported.
    serialized_metadata, inferred_types = self._logic._serialize_metadata_map(metadata)
    metadata_type_payload = {**inferred_types, **(metadata_types or {})}
    # Only send hints for keys that survived serialization.
    metadata_type_payload = {
        key: value for key, value in metadata_type_payload.items() if key in serialized_metadata
    }

    form_data: Dict[str, Any] = {
        "source_document_id": source_document_id,
        "metadata": json.dumps(serialized_metadata),
        "metadata_types": json.dumps(metadata_type_payload),
        "use_colpali": str(use_colpali).lower(),
        "on_conflict": on_conflict,
    }
    if folder_name:
        form_data["folder_name"] = folder_name
    if end_user_id:
        form_data["end_user_id"] = end_user_id

    file_obj = BytesIO(file_content)
    files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
    response = self._request("POST", "migrate/document", data=form_data, files=files)
    document = self._logic._parse_document_response(response["document"])
    document._client = self
    return response.get("status", "created"), document
1498
+
1499
def _copy_document_summary(self, source_document_id: str, target: "Morphik", target_document_id: str) -> None:
    """Copy a document summary from this client to ``target``.

    A 404 while fetching the source summary means there is nothing to copy
    and is ignored; any other HTTP status error propagates.
    """
    try:
        summary = self.get_document_summary(source_document_id)
    except httpx.HTTPStatusError as http_error:
        if http_error.response.status_code != 404:
            raise
        return
    else:
        target.upsert_document_summary(
            document_id=target_document_id,
            content=summary.content,
            versioning=False,
            overwrite_latest=True,
        )
1512
+
1513
@staticmethod
def _build_migration_result(
    results: List[MigrationDocumentResult],
    total_source_count: Optional[int],
) -> MigrationResult:
    """Aggregate per-document results into a MigrationResult with status counts."""
    tally = {"created": 0, "skipped": 0, "failed": 0}
    for item in results:
        if item.status in tally:
            tally[item.status] += 1

    return MigrationResult(
        documents=results,
        total_source_count=total_source_count,
        attempted_count=len(results),
        created_count=tally["created"],
        skipped_count=tally["skipped"],
        failed_count=tally["failed"],
    )
1526
+
1308
1527
  def get_document(self, document_id: str) -> Document:
1309
1528
  """
1310
1529
  Get document metadata by ID.
@@ -1,3 +1,5 @@
1
+ import json
2
+
1
3
  import httpx
2
4
  import jwt
3
5
  import pytest
@@ -305,6 +307,59 @@ def test_sync_get_document_by_filename_scoped_params_and_encoding():
305
307
  client.close()
306
308
 
307
309
 
310
def test_sync_ingest_migrated_document_posts_migration_payload():
    """The sync client posts the expected multipart payload to migrate/document."""
    client = Morphik()
    recorded = []

    def fake_request(method, endpoint, data=None, files=None, params=None):
        # Capture the outgoing request instead of performing any HTTP call.
        recorded.append(
            dict(method=method, endpoint=endpoint, data=data, files=files, params=params)
        )
        return {
            "status": "created",
            "document": {
                "external_id": "source-doc-1",
                "content_type": "application/pdf",
                "filename": "report.pdf",
            },
        }

    client._request = fake_request  # type: ignore[attr-defined]

    sent_metadata = {
        "category": "finance",
        "_morphik_migration": {"source_document_id": "source-doc-1"},
    }
    sent_metadata_types = {"category": "string", "_morphik_migration": "object"}
    expected_form_fields = {
        "source_document_id": "source-doc-1",
        "folder_name": "/finance/reports",
        "end_user_id": "customer-1",
        "use_colpali": "true",
        "on_conflict": "skip",
    }

    try:
        status, doc = client._ingest_migrated_document(
            source_document_id="source-doc-1",
            file_content=b"pdf-bytes",
            filename="report.pdf",
            content_type="application/pdf",
            metadata=sent_metadata,
            metadata_types=sent_metadata_types,
            folder_name="/finance/reports",
            end_user_id="customer-1",
            use_colpali=True,
            on_conflict="skip",
        )

        call = recorded.pop()
        assert call["method"] == "POST"
        assert call["endpoint"] == "migrate/document"

        form = call["data"]
        for field_name, expected_value in expected_form_fields.items():
            assert form[field_name] == expected_value
        assert json.loads(form["metadata"]) == {
            "category": "finance",
            "_morphik_migration": {"source_document_id": "source-doc-1"},
        }
        assert json.loads(form["metadata_types"]) == {
            "category": "string",
            "_morphik_migration": "object",
        }

        file_part = call["files"]["file"]
        assert file_part[0] == "report.pdf"
        assert file_part[2] == "application/pdf"

        assert status == "created"
        assert doc.external_id == "source-doc-1"
    finally:
        client.close()
361
+
362
+
308
363
  def test_sync_folder_get_document_by_filename_scoped():
309
364
  client, calls = _make_sync_client()
310
365
  try:
@@ -7,6 +7,7 @@ from morphik._shared import (
7
7
  build_document_by_filename_params,
8
8
  build_list_apps_params,
9
9
  build_logs_params,
10
+ build_migration_metadata,
10
11
  build_rename_app_params,
11
12
  build_requeue_payload,
12
13
  build_rotate_app_params,
@@ -14,7 +15,7 @@ from morphik._shared import (
14
15
  merge_folders,
15
16
  normalize_additional_folders,
16
17
  )
17
- from morphik.models import RequeueIngestionJob
18
+ from morphik.models import Document, RequeueIngestionJob
18
19
 
19
20
 
20
21
  def test_merge_folders_variants():
@@ -119,3 +120,37 @@ def test_normalize_additional_folders_alias():
119
120
  assert normalize_additional_folders(None, "b") == ["b"]
120
121
  assert normalize_additional_folders(["a"], "b") == ["a", "b"]
121
122
  assert normalize_additional_folders(["a"], ["b", "c"]) == ["a", "b", "c"]
123
+
124
+
125
def test_build_migration_metadata_strips_managed_fields_and_records_source():
    """Managed fields are removed from user metadata and provenance is recorded."""
    managed_values = {
        "external_id": "source-doc-1",
        "folder_name": "/finance/reports",
        "end_user_id": "customer-1",
    }
    source_metadata = {**managed_values, "category": "finance"}
    source_metadata_types = {name: "string" for name in source_metadata}

    doc = Document(
        external_id="source-doc-1",
        content_type="application/pdf",
        filename="report.pdf",
        app_id="source-app",
        folder_path="/finance/reports",
        end_user_id="customer-1",
        metadata=source_metadata,
        metadata_types=source_metadata_types,
        system_metadata={"created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-02T00:00:00Z"},
    )

    metadata, metadata_types = build_migration_metadata(doc)

    # User-defined metadata survives untouched.
    assert metadata["category"] == "finance"
    # Fields owned by the target API must not leak through user metadata.
    for managed_name in managed_values:
        assert managed_name not in metadata

    provenance = metadata["_morphik_migration"]
    assert provenance["source_document_id"] == "source-doc-1"
    assert provenance["source_app_id"] == "source-app"
    assert metadata_types == {"category": "string", "_morphik_migration": "object"}
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "morphik"
7
- version = "1.2.2"
7
+ version = "1.2.4"
8
8
  authors = [
9
9
  { name = "Morphik", email = "founders@morphik.ai" },
10
10
  ]
File without changes
File without changes
File without changes
File without changes