rosetta-cli 2.0.13b0__tar.gz → 2.0.13b2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {rosetta_cli-2.0.13b0/rosetta_cli.egg-info → rosetta_cli-2.0.13b2}/PKG-INFO +2 -2
  2. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/pyproject.toml +2 -2
  3. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/ims_publisher.py +18 -48
  4. rosetta_cli-2.0.13b2/rosetta_cli/ims_utils.py +92 -0
  5. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/ragflow_client.py +229 -90
  6. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2/rosetta_cli.egg-info}/PKG-INFO +2 -2
  7. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli.egg-info/requires.txt +1 -1
  8. rosetta_cli-2.0.13b0/rosetta_cli/ims_utils.py +0 -28
  9. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/MANIFEST.in +0 -0
  10. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/README.md +0 -0
  11. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/env.template +0 -0
  12. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/__init__.py +0 -0
  13. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/__main__.py +0 -0
  14. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/cli.py +0 -0
  15. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/commands/__init__.py +0 -0
  16. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/commands/base_command.py +0 -0
  17. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/commands/cleanup_command.py +0 -0
  18. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/commands/list_command.py +0 -0
  19. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/commands/parse_command.py +0 -0
  20. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/commands/publish_command.py +0 -0
  21. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/commands/verify_command.py +0 -0
  22. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/ims_auth.py +0 -0
  23. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/ims_config.py +0 -0
  24. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/services/__init__.py +0 -0
  25. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/services/auth_service.py +0 -0
  26. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/services/dataset_service.py +0 -0
  27. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/services/document_data.py +0 -0
  28. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/services/document_service.py +0 -0
  29. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/typing_utils.py +0 -0
  30. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli.egg-info/SOURCES.txt +0 -0
  31. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli.egg-info/dependency_links.txt +0 -0
  32. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli.egg-info/entry_points.txt +0 -0
  33. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli.egg-info/top_level.txt +0 -0
  34. {rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/setup.cfg +0 -0

{rosetta_cli-2.0.13b0/rosetta_cli.egg-info → rosetta_cli-2.0.13b2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rosetta-cli
-Version: 2.0.13b0
+Version: 2.0.13b2
 Summary: Rosetta CLI for publishing knowledge base content to RAGFlow
 Author-email: Rosetta Team <rosetta-support@griddynamics.com>
 License-Expression: Apache-2.0
@@ -18,7 +18,7 @@ Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: python-frontmatter<2.0.0,>=1.1.0
-Requires-Dist: ragflow-sdk<0.26.0,>=0.25.0
+Requires-Dist: ragflow-sdk<0.26.0,>=0.25.1
 Requires-Dist: requests<3.0.0,>=2.31.0
 Requires-Dist: tqdm<5.0.0,>=4.67.0
 Provides-Extra: dev

{rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rosetta-cli"
-version = "2.0.13b00"
+version = "2.0.13b02"
 description = "Rosetta CLI for publishing knowledge base content to RAGFlow"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -23,7 +23,7 @@ classifiers = [
 dependencies = [
     "python-dotenv>=1.0.0,<2.0.0",
     "python-frontmatter>=1.1.0,<2.0.0",
-    "ragflow-sdk>=0.25.0,<0.26.0",
+    "ragflow-sdk>=0.25.1,<0.26.0",
     "requests>=2.31.0,<3.0.0",
     "tqdm>=4.67.0,<5.0.0",
 ]

{rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/ims_publisher.py
@@ -388,26 +388,7 @@ class ContentPublisher:
         # Add file size for binary files
         if not is_text:
             metadata['file_size'] = len(content)
-
-
-        if dry_run:
-            print(f"[DRY RUN] Would publish: {metadata.get('doc_title', metadata.get('original_path', file.name))}")
-            print(f" Document ID: {ims_doc_id}")
-            print(f" Dataset: {dataset_name}")
-            print(f" File type: {'text' if is_text else 'binary'}")
-            print(f" Metadata: {metadata}")
-            if is_text and content_str:
-                print(f" Content size: {len(content_str)} characters")
-            else:
-                print(f" File size: {metadata.get('file_size', 0)} bytes")
-
-            return PublishResult(
-                success=True,
-                document_id=ims_doc_id,
-                file_path=str(file),
-                tags=metadata.get('tags', [])
-            )
-
+
         original_path = metadata.get("original_path", "")
 
         # Create DocumentMetadata for RAGFlow
@@ -426,13 +407,16 @@ class ContentPublisher:
         )
 
         # Upload to RAGFlow using pre-read content from cache (no re-reading!)
+        # In dry_run, upload_document gates each SDK write at the call site,
+        # prints the would-be payloads, and returns a sentinel "would-publish" result.
         result = self.client.upload_document(
             file_path=file,
             metadata=doc_metadata,
             dataset_name=dataset_name,
             dataset_template=self.dataset_template,
             force=force,
-            content=content  # Pass pre-read content from cache
+            content=content,  # Pass pre-read content from cache
+            dry_run=dry_run,
         )
 
         # None means document was skipped (unchanged)
@@ -542,10 +526,14 @@ class ContentPublisher:
     def _has_content_changed_cached(self, cache: DocumentData) -> bool:
         """
         Check if document content has changed using pre-calculated hash.
-
+
+        Looks up the existing doc through the client's per-dataset index
+        (RAGFlow 0.25.0 ignores server-side metadata_condition, so the index
+        is the authoritative lookup).
+
         Args:
             cache: DocumentData with pre-calculated hash
-
+
         Returns:
             True if changed or new, False if unchanged
         """
@@ -556,46 +544,28 @@ class ContentPublisher:
                 'tags': cache.tags,
                 'domain': cache.domain
             })
-
+
             # Get dataset
             dataset = self.client.get_dataset(name=dataset_name)
             if not dataset:
                 return True  # New dataset = new document
-
-            # Search for existing document by ims_doc_id
-            metadata_filter = {
-                "logic": "and",
-                "conditions": [{
-                    "name": "ims_doc_id",
-                    "comparison_operator": "is",
-                    "value": cache.ims_doc_id
-                }]
-            }
-
-            docs = self.client.list_documents(
-                dataset,
-                page_size=1,
-                metadata_condition=metadata_filter
-            )
-
-            if not docs:
+
+            existing_doc = self.client.get_existing_doc(dataset, cache.ims_doc_id)
+            if existing_doc is None:
                 return True  # Document doesn't exist
-
-            # Compare hashes (use pre-calculated hash from cache)
-            existing_doc = docs[0]
+
             existing_meta = getattr(existing_doc, 'meta_fields', {}) or {}
-
             if isinstance(existing_meta, dict):
                 existing_hash = existing_meta.get("content_hash")
             else:
                 existing_hash = getattr(existing_meta, 'content_hash', None)
-
+
             if not existing_hash:
                 return True  # No hash = changed
 
             # Compare: cache.content_hash was already calculated in DocumentData
             return cache.content_hash != str(existing_hash)
-
+
         except Exception as e:
             print(f" Warning: Could not check existing document: {e}")
             return True  # Assume changed on error
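
The change check above reduces to one comparison: the hash pre-computed on the cached bytes against the `content_hash` stored in RAGFlow meta_fields. A minimal sketch of that decision, assuming a sha256 hex digest (the actual algorithm lives in `DocumentData` and is not shown in this diff):

```python
import hashlib

def has_content_changed(content: bytes, existing_hash: str | None) -> bool:
    # Mirrors _has_content_changed_cached: a missing doc or missing hash means "changed".
    if not existing_hash:
        return True
    return hashlib.sha256(content).hexdigest() != str(existing_hash)
```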

rosetta_cli-2.0.13b2/rosetta_cli/ims_utils.py
@@ -0,0 +1,92 @@
+"""Shared path utilities for Rosetta CLI."""
+
+import random
+import time
+from pathlib import Path
+from typing import Callable, TypeVar
+
+_T = TypeVar("_T")
+
+_TRANSIENT_RAGFLOW = (
+    "The dataset doesn't own the document",
+    "Documents not found",
+    "mapper_parsing_exception",
+    "Failed to update metadata",
+    "timed out", "timeout",
+    "Connection aborted", "Connection refused", "Connection reset",
+    "status 5",
+)
+_PERMANENT_RAGFLOW = (
+    "The type is not supported",
+    "format_invalid",
+    "Invalid API key",
+    "Insufficient permissions",
+    "lacks permission",
+    "You don't own",
+    "meta_fields must be a dictionary",
+)
+
+
+def is_transient_ragflow(exc: BaseException) -> bool:
+    """Classify a RAGFlow error as transient (retry) or permanent (do not retry).
+
+    Permanent substrings win — a message containing both never retries.
+    """
+    msg = str(exc)
+    if any(s in msg for s in _PERMANENT_RAGFLOW):
+        return False
+    return any(s in msg for s in _TRANSIENT_RAGFLOW)
+
+
+def retry_call(
+    fn: Callable[[], _T],
+    *,
+    attempts: int = 3,
+    jitter_ms_range: tuple[int, int] = (150, 250),
+    retry_on: Callable[[BaseException], bool] = is_transient_ragflow,
+    label: str = "",
+) -> _T:
+    """Call ``fn`` up to ``attempts`` times; sleep flat-random ms between attempts on retry_on(exc).
+
+    Re-raises the last exception when attempts are exhausted or ``retry_on`` returns False.
+    """
+    if attempts < 1:
+        raise ValueError("attempts must be >= 1")
+    last: BaseException | None = None
+    for n in range(1, attempts + 1):
+        try:
+            return fn()
+        except BaseException as exc:
+            last = exc
+            if n >= attempts or not retry_on(exc):
+                raise
+            jitter = random.randint(jitter_ms_range[0], jitter_ms_range[1])
+            print(f" ↻ retry {n}/{attempts - 1} for {label} after {jitter}ms: {str(exc)[:120]}")
+            time.sleep(jitter / 1000.0)
+    assert last is not None
+    raise last
+
+
+def resolve_workspace_root(path: Path) -> Path:
+    """Resolve the workspace root for a publish target.
+
+    Preference order:
+    1. Parent of the topmost `instructions/` directory in the target path.
+    2. Nearest ancestor containing `.git`.
+    3. The target directory itself, or the parent for a file target.
+    """
+    resolved = path.resolve()
+    container = resolved if resolved.is_dir() else resolved.parent
+
+    parts = container.parts
+    for index, part in enumerate(parts):
+        if part == "instructions" and index > 0:
+            return Path(*parts[:index])
+
+    current = container
+    while current != current.parent:
+        if (current / ".git").exists():
+            return current
+        current = current.parent
+
+    return container
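
A minimal usage sketch of the retry helper added above; `flaky_update` is a hypothetical stand-in for any ragflow-sdk write:

```python
from rosetta_cli.ims_utils import is_transient_ragflow, retry_call

def flaky_update() -> None:
    # "Connection reset" is in the transient substring table, so this retries.
    raise RuntimeError("Connection reset by peer")

try:
    retry_call(flaky_update, attempts=3, retry_on=is_transient_ragflow, label="demo")
except RuntimeError:
    pass  # re-raised after two retries with 150-250ms jitter sleeps
```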

{rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli/ragflow_client.py
@@ -22,9 +22,32 @@ from typing import Any, Dict, List, Optional, cast
 from ragflow_sdk import RAGFlow
 from ragflow_sdk.modules.dataset import DataSet
 from ragflow_sdk.modules.document import Document
+from .ims_utils import retry_call
 from .typing_utils import DatasetLike, DocumentLike, JsonDict
 
 
+class _Timer:
+    """Context manager that prints elapsed time for an SDK call."""
+
+    def __init__(self, label: str) -> None:
+        self.label = label
+        self.t0 = 0.0
+
+    def __enter__(self) -> "_Timer":
+        self.t0 = time.perf_counter()
+        return self
+
+    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        elapsed_ms = (time.perf_counter() - self.t0) * 1000.0
+        tag = "FAILED" if exc_type else "ok"
+        print(f" ⏱️ {self.label}: {elapsed_ms:.0f}ms [{tag}]")
+
+
+def _timed(label: str) -> _Timer:
+    """Wrap an SDK call to print its elapsed wall time."""
+    return _Timer(label)
+
+
 
 @dataclass
 class DocumentMetadata:
@@ -128,6 +151,71 @@ class RAGFlowClient:
         # Initialize RAGFlow SDK client
         self._client = RAGFlow(api_key=api_key, base_url=base_url, version=version)
 
+        # Per-dataset index of {ims_doc_id: doc}, lazily built and reused across
+        # the publish session. RAGFlow 0.25.0 ignores `metadata_condition` server-
+        # side, so we cannot rely on filtered list_documents to find an existing
+        # doc by ims_doc_id; we list once and index in memory instead.
+        self._doc_index_by_dataset: dict[str, dict[str, DocumentLike]] = {}
+
+    def _get_doc_index(self, dataset: DatasetLike) -> dict[str, DocumentLike]:
+        """Return a {ims_doc_id: doc} index for the dataset, building it once.
+
+        Reuses across calls in the same client. Mutate via `_remember_doc` after
+        upload and `_forget_doc` after delete to keep it consistent.
+
+        Tolerates ownership/permission errors from list_documents — RAGFlow can
+        return those for team-shared datasets where the API key holder is not the
+        owner. We treat them as "no existing docs visible" so the publish flow
+        proceeds with fresh uploads.
+        """
+        ds_id = str(dataset.id)
+        cached = self._doc_index_by_dataset.get(ds_id)
+        if cached is not None:
+            return cached
+
+        try:
+            all_docs = self.list_documents(dataset, page_size=self.page_size)
+        except RAGFlowClientError as e:
+            msg = str(e).lower()
+            if (
+                "you don't own" in msg
+                or "you do not own" in msg
+                or "lacks permission" in msg
+            ):
+                self._doc_index_by_dataset[ds_id] = {}
+                return {}
+            raise
+
+        index: dict[str, DocumentLike] = {}
+        for doc in all_docs:
+            meta = getattr(doc, "meta_fields", {}) or {}
+            ims_doc_id = (
+                meta.get("ims_doc_id") if isinstance(meta, dict)
+                else getattr(meta, "ims_doc_id", None)
+            )
+            if ims_doc_id:
+                index[str(ims_doc_id)] = doc
+        self._doc_index_by_dataset[ds_id] = index
+        print(f" 📚 doc-index built for dataset {ds_id}: {len(index)} indexed of {len(all_docs)} listed")
+        return index
+
+    def get_existing_doc(self, dataset: DatasetLike, ims_doc_id: str) -> DocumentLike | None:
+        """Return the doc with this ims_doc_id, or None.
+
+        Uses the per-dataset index. Required because RAGFlow 0.25.0 ignores
+        metadata_condition filters server-side.
+        """
+        return self._get_doc_index(dataset).get(str(ims_doc_id))
+
+    def _remember_doc(self, dataset: DatasetLike, ims_doc_id: str, doc: DocumentLike) -> None:
+        ds_id = str(dataset.id)
+        self._doc_index_by_dataset.setdefault(ds_id, {})[str(ims_doc_id)] = doc
+
+    def _forget_doc(self, dataset: DatasetLike, ims_doc_id: str) -> None:
+        ds_id = str(dataset.id)
+        if ds_id in self._doc_index_by_dataset:
+            self._doc_index_by_dataset[ds_id].pop(str(ims_doc_id), None)
+
     def _handle_response_error(self, response: Any, operation: str) -> None:
         """
         Handle API response errors uniformly.
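
The docstrings above pin down the lookup contract: one list call per dataset, then dictionary hits. A hedged sketch of the call pattern, with `client` and `dataset` assumed to come from the surrounding publish flow:

```python
# First call lists the dataset once and builds {ims_doc_id: doc}; later calls are O(1).
doc = client.get_existing_doc(dataset, "ims-0001")
if doc is None:
    print("not indexed; treat as a fresh upload")
```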
@@ -172,8 +260,9 @@ class RAGFlowClient:
         embedding_model: str | None = None,
         permission: str = "team",
         chunk_method: str | None = None,
-        parser_config: JsonDict | None = None
-    ) -> DataSet:
+        parser_config: JsonDict | None = None,
+        dry_run: bool = False
+    ) -> DataSet | None:
         """
         Create a new dataset.
 
@@ -212,11 +301,16 @@
             # Convert parser_config dict to DataSet.ParserConfig object if needed
             if parser_cfg:
                 kwargs["parser_config"] = DataSet.ParserConfig(self._client, parser_cfg)
-
-            dataset = self._client.create_dataset(**kwargs)
-
+
+            if dry_run:
+                print(f" [DRY RUN] would self._client.create_dataset({json.dumps({k: v for k, v in kwargs.items() if k != 'parser_config'})})")
+                return None
+
+            with _timed(f"create_dataset(name={name})"):
+                dataset = self._client.create_dataset(**kwargs)
+
             return dataset
-
+
         except Exception as e:
             raise RAGFlowClientError(f"Failed to create dataset '{name}': {str(e)}")
 
@@ -247,15 +341,16 @@
             RAGFlowClientError: If listing fails
         """
         try:
-            datasets = self._client.list_datasets(
-                page=page,
-                page_size=page_size,
-                orderby=orderby,
-                desc=desc,
-                id=id,
-                name=name
-            )
-
+            with _timed(f"list_datasets(name={name},id={id})"):
+                datasets = self._client.list_datasets(
+                    page=page,
+                    page_size=page_size,
+                    orderby=orderby,
+                    desc=desc,
+                    id=id,
+                    name=name
+                )
+
             return cast(list[DataSet], datasets)
 
         except Exception as e:
@@ -282,10 +377,12 @@
         try:
             if id:
                 # Filter by ID
-                datasets = self._client.list_datasets(id=id, page_size=1)
+                with _timed(f"list_datasets(id={id})"):
+                    datasets = self._client.list_datasets(id=id, page_size=1)
             elif name:
                 # Filter by name (RAGFlow does substring, we verify exact match)
-                datasets = self._client.list_datasets(name=name, page_size=10)
+                with _timed(f"list_datasets(name={name})"):
+                    datasets = self._client.list_datasets(name=name, page_size=10)
                 # Filter for exact match
                 datasets = [ds for ds in datasets if ds.name == name]
             else:
@@ -302,39 +399,45 @@
             return None
         raise RAGFlowClientError(f"Failed to get dataset: {str(e)}")
 
-    def delete_datasets(self, ids: list[str]) -> None:
+    def delete_datasets(self, ids: list[str], dry_run: bool = False) -> None:
         """
         Delete datasets by IDs.
-
+
         Args:
             ids: List of dataset IDs to delete
-
+            dry_run: If True, print would-be call and skip the SDK write.
+
         Raises:
             RAGFlowClientError: If deletion fails
         """
         try:
-            self._client.delete_datasets(ids=ids)
-
+            if dry_run:
+                print(f" [DRY RUN] would self._client.delete_datasets(ids={json.dumps(ids)})")
+                return
+            with _timed(f"delete_datasets(n={len(ids)})"):
+                self._client.delete_datasets(ids=ids)
+
         except Exception as e:
             raise RAGFlowClientError(f"Failed to delete datasets: {str(e)}")
 
-    def _ensure_dataset(self, name: str, description: str = "") -> DataSet:
+    def _ensure_dataset(self, name: str, description: str = "", dry_run: bool = False) -> DataSet | None:
         """
         Get dataset if exists, create if not.
-
+
         Args:
             name: Dataset name
             description: Dataset description (used if creating)
-
+            dry_run: If True and dataset is missing, print would-be create and return None.
+
         Returns:
-            DataSet object
+            DataSet object, or None when dry_run skips a needed create.
         """
         dataset = self.get_dataset(name=name)
         if dataset is not None:
            return dataset
-
-        # Dataset doesn't exist, create it
-        return self.create_dataset(name, description)
+
+        # Dataset doesn't exist, create it (gated by dry_run)
+        return self.create_dataset(name, description, dry_run=dry_run)
 
     def _resolve_dataset_name(self, template: str, release: str | None) -> str:
         """
@@ -379,7 +482,8 @@ class RAGFlowClient:
         dataset_name: str | None = None,
         dataset_template: str = "aia-{release}",
         force: bool = False,
-        content: bytes | None = None  # NEW: Pre-read content from cache
+        content: bytes | None = None,  # NEW: Pre-read content from cache
+        dry_run: bool = False
     ) -> tuple[DocumentLike, str] | None:
         """
         Upload document with upsert semantics and change detection.
@@ -438,11 +542,20 @@
             metadata.release
         )
 
-        # Ensure dataset exists
+        # Ensure dataset exists (dry_run gates the underlying create_dataset call)
         dataset = self._ensure_dataset(
             resolved_name,
-            f"IMS Knowledge - Release {metadata.release}" if metadata.release else "IMS Knowledge"
+            f"IMS Knowledge - Release {metadata.release}" if metadata.release else "IMS Knowledge",
+            dry_run=dry_run,
         )
+        if dataset is None:
+            # dry_run path where dataset would have been created but wasn't.
+            # Return a sentinel so the publisher reports "would-publish", not "skipped".
+            # dataset.id is unavailable here; use resolved_name as the dataset identifier.
+            # Publisher only consumes the dataset_id for parsing (guarded by not dry_run).
+            print(f" [DRY RUN] dataset '{resolved_name}' missing; would be created.")
+            from types import SimpleNamespace
+            return (cast(DocumentLike, SimpleNamespace(id=metadata.ims_doc_id)), resolved_name)
 
         # Build display name from normalized doc title when available.
         # For R1, doc_title is filename; for R2, doc_title is logical path.
@@ -450,37 +563,11 @@
         filename = metadata.doc_title or (file_path.name if file_path else "")
         title = self._build_title_with_tags(metadata.tags, filename)
 
-        # Check if document exists by searching for ims_doc_id in metadata
+        # Check if document exists by ims_doc_id via the per-dataset index.
+        # RAGFlow 0.25.0 ignores metadata_condition server-side, so the index
+        # (built from a single list-all per dataset) is the authoritative lookup.
         start_time = time.time()
-
-        # Use server-side metadata filtering to find document by ims_doc_id.
-        # RAGFlow may return ownership-style errors when the filtered lookup
-        # misses a document in team-shared datasets; treat that as "not found".
-        try:
-            existing_docs = self.list_documents(
-                dataset=dataset,
-                metadata_condition={
-                    "logic": "and",
-                    "conditions": [{
-                        "name": "ims_doc_id",
-                        "comparison_operator": "is",
-                        "value": metadata.ims_doc_id
-                    }]
-                },
-                page_size=1
-            )
-        except RAGFlowClientError as e:
-            msg = str(e).lower()
-            if (
-                "you don't own" in msg
-                or "you do not own" in msg
-                or "lacks permission" in msg
-            ):
-                existing_docs = []
-            else:
-                raise
-
-        existing_doc = existing_docs[0] if existing_docs else None
+        existing_doc = self.get_existing_doc(dataset, metadata.ims_doc_id)
 
         if existing_doc:
             # Check if content changed by comparing hashes
@@ -500,24 +587,36 @@
                 return None
 
             # Content changed, delete old version
-            dataset.delete_documents([existing_doc.id])
+            if dry_run:
+                print(f" [DRY RUN] would dataset.delete_documents({json.dumps([existing_doc.id])})")
+            else:
+                with _timed(f"dataset.delete_documents(id={existing_doc.id})"):
+                    dataset.delete_documents([existing_doc.id])
+                self._forget_doc(dataset, metadata.ims_doc_id)
             print(f" 🔄 Updating: {title}")
         else:
             print(f" ⬆️ Uploading: {title}")
-
+
         # Upload document
         try:
-            documents = dataset.upload_documents([{
-                "display_name": title,
-                "blob": content
-            }])
-
-            if not documents:
-                raise RAGFlowClientError("Upload returned no documents")
-
-            doc = documents[0]
+            upload_payload = [{"display_name": title, "blob_bytes": len(content)}]
+            if dry_run:
+                print(f" [DRY RUN] would dataset.upload_documents({json.dumps(upload_payload)}) (blob_bytes shown instead of raw blob)")
+                doc = None
+            else:
+                with _timed(f"dataset.upload_documents(bytes={len(content)})"):
+                    documents = dataset.upload_documents([{
+                        "display_name": title,
+                        "blob": content
+                    }])
+                if not documents:
+                    raise RAGFlowClientError("Upload returned no documents")
+                doc = documents[0]
 
-            # Update metadata
+            # RAGFlow 0.25.x rejects None and dict values in meta_fields
+            # (validate_document_meta_fields). Drop None entries; JSON-stringify
+            # the frontmatter dict (stored under the "fm" key, see below) so the
+            # validator accepts it. The MCP read side json.loads it on the way back.
             meta_fields: JsonDict = {
                 "ims_doc_id": metadata.ims_doc_id,
                 "tags": metadata.tags,
@@ -525,18 +624,47 @@
                 "release": metadata.release,
                 "content_hash": metadata.content_hash,
                 "original_path": metadata.original_path,
-                "sort_order": metadata.sort_order,
                 "doc_title": metadata.doc_title,
             }
+            if metadata.sort_order is not None:
+                meta_fields["sort_order"] = metadata.sort_order
             if metadata.line_count is not None:
                 meta_fields["line_count"] = metadata.line_count
             if metadata.resource_path is not None:
                 meta_fields["resource_path"] = metadata.resource_path
-            frontmatter_value = getattr(metadata, 'frontmatter', None)
-            if frontmatter_value is not None:
-                meta_fields["frontmatter"] = frontmatter_value
+            if metadata.frontmatter is not None:
+                # Stored under "fm" (not "frontmatter") because the per-tenant
+                # ES doc-meta index (ragflow_doc_meta_{tenant_id}) commits a
+                # sticky "object" dynamic mapping for any key that was first
+                # written as a dict. Dropping datasets does NOT reset the index;
+                # the "frontmatter" key is permanently typed as object in
+                # existing deployments, so any string write to it is rejected.
+                # Using "fm" gets a fresh dynamic mapping as "text/keyword".
+                # MCP readers fall back to the legacy "frontmatter" key so old
+                # documents written before this rename remain readable.
+                meta_fields["fm"] = json.dumps(
+                    metadata.frontmatter,
+                    sort_keys=True,
+                    ensure_ascii=False,
+                    default=str,
+                )
 
-            doc.update({"meta_fields": meta_fields})
+            if dry_run:
+                print(f" [DRY RUN] would doc.update({json.dumps({'meta_fields': meta_fields}, ensure_ascii=False, default=str)})")
+                elapsed = time.time() - start_time
+                print(f" [DRY RUN] would Done ({elapsed:.2f}s): {title}")
+                # Return a sentinel doc so the publisher reports this file as
+                # "would-publish" rather than "skipped (unchanged)".
+                from types import SimpleNamespace
+                return (cast(DocumentLike, SimpleNamespace(id=metadata.ims_doc_id)), dataset.id)
+            assert doc is not None
+            def _do_update() -> None:
+                with _timed(f"doc.update(id={doc.id})"):
+                    doc.update({"meta_fields": meta_fields})
+            retry_call(
+                _do_update,
+                label=f"doc.update({doc.id})",
+            )
             # SDK update() does not echo meta_fields back in the PUT response;
             # re-fetch to get the actual stored state.
             try:
@@ -545,10 +673,10 @@
             if updated_meta:
                 if isinstance(updated_meta, dict):
                     meta_tags = updated_meta.get('tags', [])
-                    meta_fm = updated_meta.get('frontmatter')
+                    meta_fm = updated_meta.get('fm') or updated_meta.get('frontmatter')
                 else:
                     meta_tags = getattr(updated_meta, 'tags', []) or []
-                    meta_fm = getattr(updated_meta, 'frontmatter', None)
+                    meta_fm = getattr(updated_meta, 'fm', None) or getattr(updated_meta, 'frontmatter', None)
                 tag_count = len(meta_tags) if isinstance(meta_tags, list) else 0
                 print(f" ✅ Metadata set: {tag_count} tags, frontmatter={'yes' if meta_fm else 'no'}")
             else:
@@ -558,7 +686,10 @@
 
         elapsed = time.time() - start_time
         print(f" ✅ Done ({elapsed:.2f}s): {title}")
-
+
+        # Remember the new doc so subsequent lookups in this session find it.
+        self._remember_doc(dataset, metadata.ims_doc_id, cast(DocumentLike, doc))
+
         # Return doc object and dataset ID for parsing
         # doc.id is RAGFlow's internal document ID needed for parsing
         return (cast(DocumentLike, doc), dataset.id)
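
The "fm" rename above implies a reader-side fallback. A hedged sketch of the round trip; the helper name is illustrative and the MCP reader's real code is not part of this diff:

```python
import json
from typing import Any

def read_frontmatter(meta_fields: dict[str, Any]) -> dict[str, Any] | None:
    raw = meta_fields.get("fm") or meta_fields.get("frontmatter")  # legacy-key fallback
    if raw is None:
        return None
    return json.loads(raw) if isinstance(raw, str) else raw  # pre-rename docs stored a dict
```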
@@ -566,30 +697,36 @@
         except Exception as e:
             raise RAGFlowClientError(f"Failed to upload document '{title}': {str(e)}")
 
-    def trigger_parse(self, dataset_id: str, document_ids: list[str]) -> None:
+    def trigger_parse(self, dataset_id: str, document_ids: list[str], dry_run: bool = False) -> None:
         """
         Trigger async parsing for documents.
-
+
         Args:
             dataset_id: Dataset ID containing documents
             document_ids: List of document IDs to parse
-
+            dry_run: If True, print would-be call and skip the SDK write.
+
         Raises:
             RAGFlowClientError: If parsing trigger fails
         """
+        if dry_run:
+            print(f" [DRY RUN] would dataset({json.dumps(dataset_id)}).async_parse_documents({json.dumps(document_ids)})")
+            return
         dataset = self.get_dataset(id=dataset_id)
         if not dataset:
             raise NotFoundError(f"Dataset not found: {dataset_id}")
-
+
         try:
-            dataset.async_parse_documents(document_ids)
+            with _timed(f"async_parse_documents(n={len(document_ids)})"):
+                dataset.async_parse_documents(document_ids)
         except Exception as e:
             raise RAGFlowClientError(f"Failed to trigger parsing: {str(e)}")
 
     def parse_documents_batch(
         self,
         documents: list[JsonDict],
-        silent: bool = False
+        silent: bool = False,
+        dry_run: bool = False
     ) -> dict[str, list[str]]:
         """
         Trigger parsing for multiple documents across datasets.
@@ -636,7 +773,7 @@
                 print(f" → Document IDs: {doc_ids[:3]}{'...' if len(doc_ids) > 3 else ''}")
 
             try:
-                self.trigger_parse(dataset_id, doc_ids)
+                self.trigger_parse(dataset_id, doc_ids, dry_run=dry_run)
                 success_datasets.append(dataset_id)
             except Exception as e:
                 failed_datasets.append(dataset_id)
@@ -797,7 +934,9 @@
 
         # Bypass SDK and call HTTP API directly
         # SDK doesn't support run, suffix, metadata_condition parameters
-        res = dataset.get(f"/datasets/{dataset.id}/documents", params=params)
+        cond_keys = sorted(params.keys())
+        with _timed(f"list_documents({','.join(cond_keys)})"):
+            res = dataset.get(f"/datasets/{dataset.id}/documents", params=params)
         res_json = cast(JsonDict, cast(Any, res).json())
 
         if res_json.get("code") != 0:
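
Taken together, the dry_run plumbing means a single flag at the top of upload_document gates every SDK write below it. A hedged end-to-end sketch using only names visible in this diff; the RAGFlowClient constructor arguments and the DocumentMetadata required/optional split are assumptions:

```python
from pathlib import Path
from rosetta_cli.ragflow_client import DocumentMetadata, RAGFlowClient

client = RAGFlowClient(api_key="ragflow-...", base_url="http://localhost:9380")
meta = DocumentMetadata(
    ims_doc_id="ims-0001",
    tags=["howto"],
    domain="ims",
    release="25.1",
    content_hash="<precomputed>",
    original_path="instructions/guide.md",
    doc_title="guide.md",
)
client.upload_document(
    file_path=Path("instructions/guide.md"),
    metadata=meta,
    content=b"# Guide\n",
    dry_run=True,  # prints would-be create/delete/upload/update payloads; no SDK writes
)
```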

{rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2/rosetta_cli.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rosetta-cli
-Version: 2.0.13b0
+Version: 2.0.13b2
 Summary: Rosetta CLI for publishing knowledge base content to RAGFlow
 Author-email: Rosetta Team <rosetta-support@griddynamics.com>
 License-Expression: Apache-2.0
@@ -18,7 +18,7 @@ Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: python-frontmatter<2.0.0,>=1.1.0
-Requires-Dist: ragflow-sdk<0.26.0,>=0.25.0
+Requires-Dist: ragflow-sdk<0.26.0,>=0.25.1
 Requires-Dist: requests<3.0.0,>=2.31.0
 Requires-Dist: tqdm<5.0.0,>=4.67.0
 Provides-Extra: dev

{rosetta_cli-2.0.13b0 → rosetta_cli-2.0.13b2}/rosetta_cli.egg-info/requires.txt
@@ -1,6 +1,6 @@
 python-dotenv<2.0.0,>=1.0.0
 python-frontmatter<2.0.0,>=1.1.0
-ragflow-sdk<0.26.0,>=0.25.0
+ragflow-sdk<0.26.0,>=0.25.1
 requests<3.0.0,>=2.31.0
 tqdm<5.0.0,>=4.67.0
 

rosetta_cli-2.0.13b0/rosetta_cli/ims_utils.py
@@ -1,28 +0,0 @@
-"""Shared path utilities for Rosetta CLI."""
-
-from pathlib import Path
-
-
-def resolve_workspace_root(path: Path) -> Path:
-    """Resolve the workspace root for a publish target.
-
-    Preference order:
-    1. Parent of the topmost `instructions/` directory in the target path.
-    2. Nearest ancestor containing `.git`.
-    3. The target directory itself, or the parent for a file target.
-    """
-    resolved = path.resolve()
-    container = resolved if resolved.is_dir() else resolved.parent
-
-    parts = container.parts
-    for index, part in enumerate(parts):
-        if part == "instructions" and index > 0:
-            return Path(*parts[:index])
-
-    current = container
-    while current != current.parent:
-        if (current / ".git").exists():
-            return current
-        current = current.parent
-
-    return container