rosetta-cli 2.0.13b1__tar.gz → 2.0.13b3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {rosetta_cli-2.0.13b1/rosetta_cli.egg-info → rosetta_cli-2.0.13b3}/PKG-INFO +2 -2
  2. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/pyproject.toml +2 -2
  3. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ims_publisher.py +18 -48
  4. rosetta_cli-2.0.13b3/rosetta_cli/ims_utils.py +92 -0
  5. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ragflow_client.py +258 -92
  6. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3/rosetta_cli.egg-info}/PKG-INFO +2 -2
  7. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/requires.txt +1 -1
  8. rosetta_cli-2.0.13b1/rosetta_cli/ims_utils.py +0 -28
  9. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/MANIFEST.in +0 -0
  10. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/README.md +0 -0
  11. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/env.template +0 -0
  12. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/__init__.py +0 -0
  13. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/__main__.py +0 -0
  14. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/cli.py +0 -0
  15. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/__init__.py +0 -0
  16. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/base_command.py +0 -0
  17. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/cleanup_command.py +0 -0
  18. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/list_command.py +0 -0
  19. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/parse_command.py +0 -0
  20. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/publish_command.py +0 -0
  21. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/verify_command.py +0 -0
  22. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ims_auth.py +0 -0
  23. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ims_config.py +0 -0
  24. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/__init__.py +0 -0
  25. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/auth_service.py +0 -0
  26. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/dataset_service.py +0 -0
  27. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/document_data.py +0 -0
  28. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/document_service.py +0 -0
  29. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/typing_utils.py +0 -0
  30. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/SOURCES.txt +0 -0
  31. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/dependency_links.txt +0 -0
  32. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/entry_points.txt +0 -0
  33. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/top_level.txt +0 -0
  34. {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rosetta-cli
3
- Version: 2.0.13b1
3
+ Version: 2.0.13b3
4
4
  Summary: Rosetta CLI for publishing knowledge base content to RAGFlow
5
5
  Author-email: Rosetta Team <rosetta-support@griddynamics.com>
6
6
  License-Expression: Apache-2.0
@@ -18,7 +18,7 @@ Requires-Python: >=3.12
18
18
  Description-Content-Type: text/markdown
19
19
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
20
20
  Requires-Dist: python-frontmatter<2.0.0,>=1.1.0
21
- Requires-Dist: ragflow-sdk<0.25.1,>=0.25.0
21
+ Requires-Dist: ragflow-sdk<0.26.0,>=0.25.1
22
22
  Requires-Dist: requests<3.0.0,>=2.31.0
23
23
  Requires-Dist: tqdm<5.0.0,>=4.67.0
24
24
  Provides-Extra: dev
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "rosetta-cli"
7
- version = "2.0.13b01"
7
+ version = "2.0.13b03"
8
8
  description = "Rosetta CLI for publishing knowledge base content to RAGFlow"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -23,7 +23,7 @@ classifiers = [
23
23
  dependencies = [
24
24
  "python-dotenv>=1.0.0,<2.0.0",
25
25
  "python-frontmatter>=1.1.0,<2.0.0",
26
- "ragflow-sdk>=0.25.0,<0.25.1",
26
+ "ragflow-sdk>=0.25.1,<0.26.0",
27
27
  "requests>=2.31.0,<3.0.0",
28
28
  "tqdm>=4.67.0,<5.0.0",
29
29
  ]
@@ -388,26 +388,7 @@ class ContentPublisher:
388
388
  # Add file size for binary files
389
389
  if not is_text:
390
390
  metadata['file_size'] = len(content)
391
-
392
-
393
- if dry_run:
394
- print(f"[DRY RUN] Would publish: {metadata.get('doc_title', metadata.get('original_path', file.name))}")
395
- print(f" Document ID: {ims_doc_id}")
396
- print(f" Dataset: {dataset_name}")
397
- print(f" File type: {'text' if is_text else 'binary'}")
398
- print(f" Metadata: {metadata}")
399
- if is_text and content_str:
400
- print(f" Content size: {len(content_str)} characters")
401
- else:
402
- print(f" File size: {metadata.get('file_size', 0)} bytes")
403
-
404
- return PublishResult(
405
- success=True,
406
- document_id=ims_doc_id,
407
- file_path=str(file),
408
- tags=metadata.get('tags', [])
409
- )
410
-
391
+
411
392
  original_path = metadata.get("original_path", "")
412
393
 
413
394
  # Create DocumentMetadata for RAGFlow
@@ -426,13 +407,16 @@ class ContentPublisher:
426
407
  )
427
408
 
428
409
  # Upload to RAGFlow using pre-read content from cache (no re-reading!)
410
+ # In dry_run, upload_document gates each SDK write at the call site,
411
+ # prints the would-be payloads, and returns None (treated as skipped).
429
412
  result = self.client.upload_document(
430
413
  file_path=file,
431
414
  metadata=doc_metadata,
432
415
  dataset_name=dataset_name,
433
416
  dataset_template=self.dataset_template,
434
417
  force=force,
435
- content=content # Pass pre-read content from cache
418
+ content=content, # Pass pre-read content from cache
419
+ dry_run=dry_run,
436
420
  )
437
421
 
438
422
  # None means document was skipped (unchanged)
@@ -542,10 +526,14 @@ class ContentPublisher:
542
526
  def _has_content_changed_cached(self, cache: DocumentData) -> bool:
543
527
  """
544
528
  Check if document content has changed using pre-calculated hash.
545
-
529
+
530
+ Looks up the existing doc through the client's per-dataset index
531
+ (RAGFlow 0.25.0 ignores server-side metadata_condition, so the index
532
+ is the authoritative lookup).
533
+
546
534
  Args:
547
535
  cache: DocumentData with pre-calculated hash
548
-
536
+
549
537
  Returns:
550
538
  True if changed or new, False if unchanged
551
539
  """
@@ -556,46 +544,28 @@ class ContentPublisher:
556
544
  'tags': cache.tags,
557
545
  'domain': cache.domain
558
546
  })
559
-
547
+
560
548
  # Get dataset
561
549
  dataset = self.client.get_dataset(name=dataset_name)
562
550
  if not dataset:
563
551
  return True # New dataset = new document
564
-
565
- # Search for existing document by ims_doc_id
566
- metadata_filter = {
567
- "logic": "and",
568
- "conditions": [{
569
- "name": "ims_doc_id",
570
- "comparison_operator": "is",
571
- "value": cache.ims_doc_id
572
- }]
573
- }
574
-
575
- docs = self.client.list_documents(
576
- dataset,
577
- page_size=1,
578
- metadata_condition=metadata_filter
579
- )
580
-
581
- if not docs:
552
+
553
+ existing_doc = self.client.get_existing_doc(dataset, cache.ims_doc_id)
554
+ if existing_doc is None:
582
555
  return True # Document doesn't exist
583
-
584
- # Compare hashes (use pre-calculated hash from cache)
585
- existing_doc = docs[0]
556
+
586
557
  existing_meta = getattr(existing_doc, 'meta_fields', {}) or {}
587
-
588
558
  if isinstance(existing_meta, dict):
589
559
  existing_hash = existing_meta.get("content_hash")
590
560
  else:
591
561
  existing_hash = getattr(existing_meta, 'content_hash', None)
592
-
562
+
593
563
  if not existing_hash:
594
564
  return True # No hash = changed
595
565
 
596
566
  # Compare: cache.content_hash was already calculated in DocumentData
597
567
  return cache.content_hash != str(existing_hash)
598
-
568
+
599
569
  except Exception as e:
600
570
  print(f" Warning: Could not check existing document: {e}")
601
571
  return True # Assume changed on error
@@ -0,0 +1,92 @@
1
+ """Shared path utilities for Rosetta CLI."""
2
+
3
+ import random
4
+ import time
5
+ from pathlib import Path
6
+ from typing import Callable, TypeVar
7
+
8
+ _T = TypeVar("_T")
9
+
10
+ _TRANSIENT_RAGFLOW = (
11
+ "The dataset doesn't own the document",
12
+ "Documents not found",
13
+ "mapper_parsing_exception",
14
+ "Failed to update metadata",
15
+ "timed out", "timeout",
16
+ "Connection aborted", "Connection refused", "Connection reset",
17
+ "status 5",
18
+ )
19
+ _PERMANENT_RAGFLOW = (
20
+ "The type is not supported",
21
+ "format_invalid",
22
+ "Invalid API key",
23
+ "Insufficient permissions",
24
+ "lacks permission",
25
+ "You don't own",
26
+ "meta_fields must be a dictionary",
27
+ )
28
+
29
+
30
+ def is_transient_ragflow(exc: BaseException) -> bool:
31
+ """Classify a RAGFlow error as transient (retry) or permanent (do not retry).
32
+
33
+ Permanent substrings win — a message containing both never retries.
34
+ """
35
+ msg = str(exc)
36
+ if any(s in msg for s in _PERMANENT_RAGFLOW):
37
+ return False
38
+ return any(s in msg for s in _TRANSIENT_RAGFLOW)
39
+
40
+
41
+ def retry_call(
42
+ fn: Callable[[], _T],
43
+ *,
44
+ attempts: int = 3,
45
+ jitter_ms_range: tuple[int, int] = (150, 250),
46
+ retry_on: Callable[[BaseException], bool] = is_transient_ragflow,
47
+ label: str = "",
48
+ ) -> _T:
49
+ """Call ``fn`` up to ``attempts`` times; sleep flat-random ms between attempts on retry_on(exc).
50
+
51
+ Re-raises the last exception when attempts are exhausted or ``retry_on`` returns False.
52
+ """
53
+ if attempts < 1:
54
+ raise ValueError("attempts must be >= 1")
55
+ last: BaseException | None = None
56
+ for n in range(1, attempts + 1):
57
+ try:
58
+ return fn()
59
+ except BaseException as exc:
60
+ last = exc
61
+ if n >= attempts or not retry_on(exc):
62
+ raise
63
+ jitter = random.randint(jitter_ms_range[0], jitter_ms_range[1])
64
+ print(f" ↻ retry {n}/{attempts - 1} for {label} after {jitter}ms: {str(exc)[:120]}")
65
+ time.sleep(jitter / 1000.0)
66
+ assert last is not None
67
+ raise last
68
+
69
+
70
+ def resolve_workspace_root(path: Path) -> Path:
71
+ """Resolve the workspace root for a publish target.
72
+
73
+ Preference order:
74
+ 1. Parent of the topmost `instructions/` directory in the target path.
75
+ 2. Nearest ancestor containing `.git`.
76
+ 3. The target directory itself, or the parent for a file target.
77
+ """
78
+ resolved = path.resolve()
79
+ container = resolved if resolved.is_dir() else resolved.parent
80
+
81
+ parts = container.parts
82
+ for index, part in enumerate(parts):
83
+ if part == "instructions" and index > 0:
84
+ return Path(*parts[:index])
85
+
86
+ current = container
87
+ while current != current.parent:
88
+ if (current / ".git").exists():
89
+ return current
90
+ current = current.parent
91
+
92
+ return container
@@ -22,9 +22,32 @@ from typing import Any, Dict, List, Optional, cast
22
22
  from ragflow_sdk import RAGFlow
23
23
  from ragflow_sdk.modules.dataset import DataSet
24
24
  from ragflow_sdk.modules.document import Document
25
+ from .ims_utils import retry_call
25
26
  from .typing_utils import DatasetLike, DocumentLike, JsonDict
26
27
 
27
28
 
29
+ class _Timer:
30
+ """Context manager that prints elapsed time for an SDK call."""
31
+
32
+ def __init__(self, label: str) -> None:
33
+ self.label = label
34
+ self.t0 = 0.0
35
+
36
+ def __enter__(self) -> "_Timer":
37
+ self.t0 = time.perf_counter()
38
+ return self
39
+
40
+ def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
41
+ elapsed_ms = (time.perf_counter() - self.t0) * 1000.0
42
+ tag = "FAILED" if exc_type else "ok"
43
+ print(f" ⏱️ {self.label}: {elapsed_ms:.0f}ms [{tag}]")
44
+
45
+
46
+ def _timed(label: str) -> _Timer:
47
+ """Wrap an SDK call to print its elapsed wall time."""
48
+ return _Timer(label)
49
+
50
+
28
51
 
29
52
  @dataclass
30
53
  class DocumentMetadata:
@@ -128,6 +151,88 @@ class RAGFlowClient:
128
151
  # Initialize RAGFlow SDK client
129
152
  self._client = RAGFlow(api_key=api_key, base_url=base_url, version=version)
130
153
 
154
+ # Per-dataset index of {ims_doc_id: doc}, lazily built and reused across
155
+ # the publish session. RAGFlow 0.25.0 ignores `metadata_condition` server-
156
+ # side, so we cannot rely on filtered list_documents to find an existing
157
+ # doc by ims_doc_id; we list once and index in memory instead.
158
+ self._doc_index_by_dataset: dict[str, dict[str, DocumentLike]] = {}
159
+ # Per-client dataset lookup cache. Publishing resolves the same release
160
+ # dataset for every file; keep those list_datasets calls in-process.
161
+ self._dataset_by_id: dict[str, DataSet] = {}
162
+ self._dataset_by_name: dict[str, DataSet] = {}
163
+
164
+ def _clear_dataset_cache(self) -> None:
165
+ self._dataset_by_id.clear()
166
+ self._dataset_by_name.clear()
167
+
168
+ def _remember_dataset(self, dataset: DataSet) -> DataSet:
169
+ dataset_id = getattr(dataset, "id", None)
170
+ dataset_name = getattr(dataset, "name", None)
171
+ if dataset_id:
172
+ self._dataset_by_id[str(dataset_id)] = dataset
173
+ if dataset_name:
174
+ self._dataset_by_name[str(dataset_name)] = dataset
175
+ return dataset
176
+
177
+ def _get_doc_index(self, dataset: DatasetLike) -> dict[str, DocumentLike]:
178
+ """Return a {ims_doc_id: doc} index for the dataset, building it once.
179
+
180
+ Reuses across calls in the same client. Mutate via `_remember_doc` after
181
+ upload and `_forget_doc` after delete to keep it consistent.
182
+
183
+ Tolerates ownership/permission errors from list_documents — RAGFlow can
184
+ return those for team-shared datasets where the API key holder is not the
185
+ owner. We treat them as "no existing docs visible" so the publish flow
186
+ proceeds with fresh uploads.
187
+ """
188
+ ds_id = str(dataset.id)
189
+ cached = self._doc_index_by_dataset.get(ds_id)
190
+ if cached is not None:
191
+ return cached
192
+
193
+ try:
194
+ all_docs = self.list_documents(dataset, page_size=self.page_size)
195
+ except RAGFlowClientError as e:
196
+ msg = str(e).lower()
197
+ if (
198
+ "you don't own" in msg
199
+ or "you do not own" in msg
200
+ or "lacks permission" in msg
201
+ ):
202
+ self._doc_index_by_dataset[ds_id] = {}
203
+ return {}
204
+ raise
205
+
206
+ index: dict[str, DocumentLike] = {}
207
+ for doc in all_docs:
208
+ meta = getattr(doc, "meta_fields", {}) or {}
209
+ ims_doc_id = (
210
+ meta.get("ims_doc_id") if isinstance(meta, dict)
211
+ else getattr(meta, "ims_doc_id", None)
212
+ )
213
+ if ims_doc_id:
214
+ index[str(ims_doc_id)] = doc
215
+ self._doc_index_by_dataset[ds_id] = index
216
+ print(f" 📚 doc-index built for dataset {ds_id}: {len(index)} indexed of {len(all_docs)} listed")
217
+ return index
218
+
219
+ def get_existing_doc(self, dataset: DatasetLike, ims_doc_id: str) -> DocumentLike | None:
220
+ """Return the doc with this ims_doc_id, or None.
221
+
222
+ Uses the per-dataset index. Required because RAGFlow 0.25.0 ignores
223
+ metadata_condition filters server-side.
224
+ """
225
+ return self._get_doc_index(dataset).get(str(ims_doc_id))
226
+
227
+ def _remember_doc(self, dataset: DatasetLike, ims_doc_id: str, doc: DocumentLike) -> None:
228
+ ds_id = str(dataset.id)
229
+ self._doc_index_by_dataset.setdefault(ds_id, {})[str(ims_doc_id)] = doc
230
+
231
+ def _forget_doc(self, dataset: DatasetLike, ims_doc_id: str) -> None:
232
+ ds_id = str(dataset.id)
233
+ if ds_id in self._doc_index_by_dataset:
234
+ self._doc_index_by_dataset[ds_id].pop(str(ims_doc_id), None)
235
+
131
236
  def _handle_response_error(self, response: Any, operation: str) -> None:
132
237
  """
133
238
  Handle API response errors uniformly.
@@ -172,8 +277,9 @@ class RAGFlowClient:
172
277
  embedding_model: str | None = None,
173
278
  permission: str = "team",
174
279
  chunk_method: str | None = None,
175
- parser_config: JsonDict | None = None
176
- ) -> DataSet:
280
+ parser_config: JsonDict | None = None,
281
+ dry_run: bool = False
282
+ ) -> DataSet | None:
177
283
  """
178
284
  Create a new dataset.
179
285
 
@@ -212,11 +318,17 @@ class RAGFlowClient:
212
318
  # Convert parser_config dict to DataSet.ParserConfig object if needed
213
319
  if parser_cfg:
214
320
  kwargs["parser_config"] = DataSet.ParserConfig(self._client, parser_cfg)
215
-
216
- dataset = self._client.create_dataset(**kwargs)
217
-
218
- return dataset
219
-
321
+
322
+ if dry_run:
323
+ print(f" [DRY RUN] would self._client.create_dataset({json.dumps({k: v for k, v in kwargs.items() if k != 'parser_config'})})")
324
+ return None
325
+
326
+ with _timed(f"create_dataset(name={name})"):
327
+ dataset = self._client.create_dataset(**kwargs)
328
+
329
+ self._clear_dataset_cache()
330
+ return self._remember_dataset(cast(DataSet, dataset))
331
+
220
332
  except Exception as e:
221
333
  raise RAGFlowClientError(f"Failed to create dataset '{name}': {str(e)}")
222
334
 
@@ -247,15 +359,16 @@ class RAGFlowClient:
247
359
  RAGFlowClientError: If listing fails
248
360
  """
249
361
  try:
250
- datasets = self._client.list_datasets(
251
- page=page,
252
- page_size=page_size,
253
- orderby=orderby,
254
- desc=desc,
255
- id=id,
256
- name=name
257
- )
258
-
362
+ with _timed(f"list_datasets(name={name},id={id})"):
363
+ datasets = self._client.list_datasets(
364
+ page=page,
365
+ page_size=page_size,
366
+ orderby=orderby,
367
+ desc=desc,
368
+ id=id,
369
+ name=name
370
+ )
371
+
259
372
  return cast(list[DataSet], datasets)
260
373
 
261
374
  except Exception as e:
@@ -281,18 +394,28 @@ class RAGFlowClient:
281
394
  """
282
395
  try:
283
396
  if id:
397
+ cached = self._dataset_by_id.get(str(id))
398
+ if cached is not None:
399
+ return cached
400
+
284
401
  # Filter by ID
285
- datasets = self._client.list_datasets(id=id, page_size=1)
402
+ with _timed(f"list_datasets(id={id})"):
403
+ datasets = self._client.list_datasets(id=id, page_size=1)
286
404
  elif name:
405
+ cached = self._dataset_by_name.get(str(name))
406
+ if cached is not None:
407
+ return cached
408
+
287
409
  # Filter by name (RAGFlow does substring, we verify exact match)
288
- datasets = self._client.list_datasets(name=name, page_size=10)
410
+ with _timed(f"list_datasets(name={name})"):
411
+ datasets = self._client.list_datasets(name=name, page_size=10)
289
412
  # Filter for exact match
290
413
  datasets = [ds for ds in datasets if ds.name == name]
291
414
  else:
292
415
  return None
293
416
 
294
417
  if datasets and len(datasets) > 0:
295
- return datasets[0]
418
+ return self._remember_dataset(cast(DataSet, datasets[0]))
296
419
  return None
297
420
 
298
421
  except Exception as e:
@@ -302,39 +425,46 @@ class RAGFlowClient:
302
425
  return None
303
426
  raise RAGFlowClientError(f"Failed to get dataset: {str(e)}")
304
427
 
305
- def delete_datasets(self, ids: list[str]) -> None:
428
+ def delete_datasets(self, ids: list[str], dry_run: bool = False) -> None:
306
429
  """
307
430
  Delete datasets by IDs.
308
-
431
+
309
432
  Args:
310
433
  ids: List of dataset IDs to delete
311
-
434
+ dry_run: If True, print would-be call and skip the SDK write.
435
+
312
436
  Raises:
313
437
  RAGFlowClientError: If deletion fails
314
438
  """
315
439
  try:
316
- self._client.delete_datasets(ids=ids)
317
-
440
+ if dry_run:
441
+ print(f" [DRY RUN] would self._client.delete_datasets(ids={json.dumps(ids)})")
442
+ return
443
+ with _timed(f"delete_datasets(n={len(ids)})"):
444
+ self._client.delete_datasets(ids=ids)
445
+ self._clear_dataset_cache()
446
+
318
447
  except Exception as e:
319
448
  raise RAGFlowClientError(f"Failed to delete datasets: {str(e)}")
320
449
 
321
- def _ensure_dataset(self, name: str, description: str = "") -> DataSet:
450
+ def _ensure_dataset(self, name: str, description: str = "", dry_run: bool = False) -> DataSet | None:
322
451
  """
323
452
  Get dataset if exists, create if not.
324
-
453
+
325
454
  Args:
326
455
  name: Dataset name
327
456
  description: Dataset description (used if creating)
328
-
457
+ dry_run: If True and dataset is missing, print would-be create and return None.
458
+
329
459
  Returns:
330
- DataSet object
460
+ DataSet object, or None when dry_run skips a needed create.
331
461
  """
332
462
  dataset = self.get_dataset(name=name)
333
463
  if dataset is not None:
334
464
  return dataset
335
-
336
- # Dataset doesn't exist, create it
337
- return self.create_dataset(name, description)
465
+
466
+ # Dataset doesn't exist, create it (gated by dry_run)
467
+ return self.create_dataset(name, description, dry_run=dry_run)
338
468
 
339
469
  def _resolve_dataset_name(self, template: str, release: str | None) -> str:
340
470
  """
@@ -379,7 +509,8 @@ class RAGFlowClient:
379
509
  dataset_name: str | None = None,
380
510
  dataset_template: str = "aia-{release}",
381
511
  force: bool = False,
382
- content: bytes | None = None # NEW: Pre-read content from cache
512
+ content: bytes | None = None, # NEW: Pre-read content from cache
513
+ dry_run: bool = False
383
514
  ) -> tuple[DocumentLike, str] | None:
384
515
  """
385
516
  Upload document with upsert semantics and change detection.
@@ -438,11 +569,20 @@ class RAGFlowClient:
438
569
  metadata.release
439
570
  )
440
571
 
441
- # Ensure dataset exists
572
+ # Ensure dataset exists (dry_run gates the underlying create_dataset call)
442
573
  dataset = self._ensure_dataset(
443
574
  resolved_name,
444
- f"IMS Knowledge - Release {metadata.release}" if metadata.release else "IMS Knowledge"
575
+ f"IMS Knowledge - Release {metadata.release}" if metadata.release else "IMS Knowledge",
576
+ dry_run=dry_run,
445
577
  )
578
+ if dataset is None:
579
+ # dry_run path where dataset would have been created but wasn't.
580
+ # Return a sentinel so the publisher reports "would-publish", not "skipped".
581
+ # dataset.id is unavailable here; use resolved_name as the dataset identifier.
582
+ # Publisher only consumes the dataset_id for parsing (guarded by not dry_run).
583
+ print(f" [DRY RUN] dataset '{resolved_name}' missing; would be created.")
584
+ from types import SimpleNamespace
585
+ return (cast(DocumentLike, SimpleNamespace(id=metadata.ims_doc_id)), resolved_name)
446
586
 
447
587
  # Build display name from normalized doc title when available.
448
588
  # For R1, doc_title is filename; for R2, doc_title is logical path.
@@ -450,37 +590,11 @@ class RAGFlowClient:
450
590
  filename = metadata.doc_title or (file_path.name if file_path else "")
451
591
  title = self._build_title_with_tags(metadata.tags, filename)
452
592
 
453
- # Check if document exists by searching for ims_doc_id in metadata
593
+ # Check if document exists by ims_doc_id via the per-dataset index.
594
+ # RAGFlow 0.25.0 ignores metadata_condition server-side, so the index
595
+ # (built from a single list-all per dataset) is the authoritative lookup.
454
596
  start_time = time.time()
455
-
456
- # Use server-side metadata filtering to find document by ims_doc_id.
457
- # RAGFlow may return ownership-style errors when the filtered lookup
458
- # misses a document in team-shared datasets; treat that as "not found".
459
- try:
460
- existing_docs = self.list_documents(
461
- dataset=dataset,
462
- metadata_condition={
463
- "logic": "and",
464
- "conditions": [{
465
- "name": "ims_doc_id",
466
- "comparison_operator": "is",
467
- "value": metadata.ims_doc_id
468
- }]
469
- },
470
- page_size=1
471
- )
472
- except RAGFlowClientError as e:
473
- msg = str(e).lower()
474
- if (
475
- "you don't own" in msg
476
- or "you do not own" in msg
477
- or "lacks permission" in msg
478
- ):
479
- existing_docs = []
480
- else:
481
- raise
482
-
483
- existing_doc = existing_docs[0] if existing_docs else None
597
+ existing_doc = self.get_existing_doc(dataset, metadata.ims_doc_id)
484
598
 
485
599
  if existing_doc:
486
600
  # Check if content changed by comparing hashes
@@ -500,24 +614,36 @@ class RAGFlowClient:
500
614
  return None
501
615
 
502
616
  # Content changed, delete old version
503
- dataset.delete_documents([existing_doc.id])
617
+ if dry_run:
618
+ print(f" [DRY RUN] would dataset.delete_documents({json.dumps([existing_doc.id])})")
619
+ else:
620
+ with _timed(f"dataset.delete_documents(id={existing_doc.id})"):
621
+ dataset.delete_documents([existing_doc.id])
622
+ self._forget_doc(dataset, metadata.ims_doc_id)
504
623
  print(f" 🔄 Updating: {title}")
505
624
  else:
506
625
  print(f" ⬆️ Uploading: {title}")
507
-
626
+
508
627
  # Upload document
509
628
  try:
510
- documents = dataset.upload_documents([{
511
- "display_name": title,
512
- "blob": content
513
- }])
514
-
515
- if not documents:
516
- raise RAGFlowClientError("Upload returned no documents")
517
-
518
- doc = documents[0]
629
+ upload_payload = [{"display_name": title, "blob_bytes": len(content)}]
630
+ if dry_run:
631
+ print(f" [DRY RUN] would dataset.upload_documents({json.dumps(upload_payload)}) (blob_bytes shown instead of raw blob)")
632
+ doc = None
633
+ else:
634
+ with _timed(f"dataset.upload_documents(bytes={len(content)})"):
635
+ documents = dataset.upload_documents([{
636
+ "display_name": title,
637
+ "blob": content
638
+ }])
639
+ if not documents:
640
+ raise RAGFlowClientError("Upload returned no documents")
641
+ doc = documents[0]
519
642
 
520
- # Update metadata
643
+ # RAGFlow 0.25.x rejects None and dict values in meta_fields
644
+ # (validate_document_meta_fields). Drop None entries; JSON-stringify
645
+ # the frontmatter dict in place under the same key so the validator
646
+ # accepts it. The MCP read side already json.loads it on the way back.
521
647
  meta_fields: JsonDict = {
522
648
  "ims_doc_id": metadata.ims_doc_id,
523
649
  "tags": metadata.tags,
@@ -525,18 +651,47 @@ class RAGFlowClient:
525
651
  "release": metadata.release,
526
652
  "content_hash": metadata.content_hash,
527
653
  "original_path": metadata.original_path,
528
- "sort_order": metadata.sort_order,
529
654
  "doc_title": metadata.doc_title,
530
655
  }
656
+ if metadata.sort_order is not None:
657
+ meta_fields["sort_order"] = metadata.sort_order
531
658
  if metadata.line_count is not None:
532
659
  meta_fields["line_count"] = metadata.line_count
533
660
  if metadata.resource_path is not None:
534
661
  meta_fields["resource_path"] = metadata.resource_path
535
- frontmatter_value = getattr(metadata, 'frontmatter', None)
536
- if frontmatter_value is not None:
537
- meta_fields["frontmatter"] = frontmatter_value
662
+ if metadata.frontmatter is not None:
663
+ # Stored under "fm" (not "frontmatter") because the per-tenant
664
+ # ES doc-meta index (ragflow_doc_meta_{tenant_id}) commits a
665
+ # sticky "object" dynamic mapping for any key that was first
666
+ # written as a dict. Dropping datasets does NOT reset the index;
667
+ # the "frontmatter" key is permanently typed as object in
668
+ # existing deployments, so any string write to it is rejected.
669
+ # Using "fm" gets a fresh dynamic mapping as "text/keyword".
670
+ # MCP readers fall back to the legacy "frontmatter" key so old
671
+ # documents written before this rename remain readable.
672
+ meta_fields["fm"] = json.dumps(
673
+ metadata.frontmatter,
674
+ sort_keys=True,
675
+ ensure_ascii=False,
676
+ default=str,
677
+ )
538
678
 
539
- doc.update({"meta_fields": meta_fields})
679
+ if dry_run:
680
+ print(f" [DRY RUN] would doc.update({json.dumps({'meta_fields': meta_fields}, ensure_ascii=False, default=str)})")
681
+ elapsed = time.time() - start_time
682
+ print(f" [DRY RUN] would Done ({elapsed:.2f}s): {title}")
683
+ # Return a sentinel doc so the publisher reports this file as
684
+ # "would-publish" rather than "skipped (unchanged)".
685
+ from types import SimpleNamespace
686
+ return (cast(DocumentLike, SimpleNamespace(id=metadata.ims_doc_id)), dataset.id)
687
+ assert doc is not None
688
+ def _do_update() -> None:
689
+ with _timed(f"doc.update(id={doc.id})"):
690
+ doc.update({"meta_fields": meta_fields})
691
+ retry_call(
692
+ _do_update,
693
+ label=f"doc.update({doc.id})",
694
+ )
540
695
  # SDK update() does not echo meta_fields back in the PUT response;
541
696
  # re-fetch to get the actual stored state.
542
697
  try:
@@ -545,10 +700,10 @@ class RAGFlowClient:
545
700
  if updated_meta:
546
701
  if isinstance(updated_meta, dict):
547
702
  meta_tags = updated_meta.get('tags', [])
548
- meta_fm = updated_meta.get('frontmatter')
703
+ meta_fm = updated_meta.get('fm') or updated_meta.get('frontmatter')
549
704
  else:
550
705
  meta_tags = getattr(updated_meta, 'tags', []) or []
551
- meta_fm = getattr(updated_meta, 'frontmatter', None)
706
+ meta_fm = getattr(updated_meta, 'fm', None) or getattr(updated_meta, 'frontmatter', None)
552
707
  tag_count = len(meta_tags) if isinstance(meta_tags, list) else 0
553
708
  print(f" ✅ Metadata set: {tag_count} tags, frontmatter={'yes' if meta_fm else 'no'}")
554
709
  else:
@@ -558,7 +713,10 @@ class RAGFlowClient:
558
713
 
559
714
  elapsed = time.time() - start_time
560
715
  print(f" ✅ Done ({elapsed:.2f}s): {title}")
561
-
716
+
717
+ # Remember the new doc so subsequent lookups in this session find it.
718
+ self._remember_doc(dataset, metadata.ims_doc_id, cast(DocumentLike, doc))
719
+
562
720
  # Return doc object and dataset ID for parsing
563
721
  # doc.id is RAGFlow's internal document ID needed for parsing
564
722
  return (cast(DocumentLike, doc), dataset.id)
@@ -566,30 +724,36 @@ class RAGFlowClient:
566
724
  except Exception as e:
567
725
  raise RAGFlowClientError(f"Failed to upload document '{title}': {str(e)}")
568
726
 
569
- def trigger_parse(self, dataset_id: str, document_ids: list[str]) -> None:
727
+ def trigger_parse(self, dataset_id: str, document_ids: list[str], dry_run: bool = False) -> None:
570
728
  """
571
729
  Trigger async parsing for documents.
572
-
730
+
573
731
  Args:
574
732
  dataset_id: Dataset ID containing documents
575
733
  document_ids: List of document IDs to parse
576
-
734
+ dry_run: If True, print would-be call and skip the SDK write.
735
+
577
736
  Raises:
578
737
  RAGFlowClientError: If parsing trigger fails
579
738
  """
739
+ if dry_run:
740
+ print(f" [DRY RUN] would dataset({json.dumps(dataset_id)}).async_parse_documents({json.dumps(document_ids)})")
741
+ return
580
742
  dataset = self.get_dataset(id=dataset_id)
581
743
  if not dataset:
582
744
  raise NotFoundError(f"Dataset not found: {dataset_id}")
583
-
745
+
584
746
  try:
585
- dataset.async_parse_documents(document_ids)
747
+ with _timed(f"async_parse_documents(n={len(document_ids)})"):
748
+ dataset.async_parse_documents(document_ids)
586
749
  except Exception as e:
587
750
  raise RAGFlowClientError(f"Failed to trigger parsing: {str(e)}")
588
751
 
589
752
  def parse_documents_batch(
590
753
  self,
591
754
  documents: list[JsonDict],
592
- silent: bool = False
755
+ silent: bool = False,
756
+ dry_run: bool = False
593
757
  ) -> dict[str, list[str]]:
594
758
  """
595
759
  Trigger parsing for multiple documents across datasets.
@@ -636,7 +800,7 @@ class RAGFlowClient:
636
800
  print(f" → Document IDs: {doc_ids[:3]}{'...' if len(doc_ids) > 3 else ''}")
637
801
 
638
802
  try:
639
- self.trigger_parse(dataset_id, doc_ids)
803
+ self.trigger_parse(dataset_id, doc_ids, dry_run=dry_run)
640
804
  success_datasets.append(dataset_id)
641
805
  except Exception as e:
642
806
  failed_datasets.append(dataset_id)
@@ -797,7 +961,9 @@ class RAGFlowClient:
797
961
 
798
962
  # Bypass SDK and call HTTP API directly
799
963
  # SDK doesn't support run, suffix, metadata_condition parameters
800
- res = dataset.get(f"/datasets/{dataset.id}/documents", params=params)
964
+ cond_keys = sorted(params.keys())
965
+ with _timed(f"list_documents({','.join(cond_keys)})"):
966
+ res = dataset.get(f"/datasets/{dataset.id}/documents", params=params)
801
967
  res_json = cast(JsonDict, cast(Any, res).json())
802
968
 
803
969
  if res_json.get("code") != 0:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rosetta-cli
3
- Version: 2.0.13b1
3
+ Version: 2.0.13b3
4
4
  Summary: Rosetta CLI for publishing knowledge base content to RAGFlow
5
5
  Author-email: Rosetta Team <rosetta-support@griddynamics.com>
6
6
  License-Expression: Apache-2.0
@@ -18,7 +18,7 @@ Requires-Python: >=3.12
18
18
  Description-Content-Type: text/markdown
19
19
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
20
20
  Requires-Dist: python-frontmatter<2.0.0,>=1.1.0
21
- Requires-Dist: ragflow-sdk<0.25.1,>=0.25.0
21
+ Requires-Dist: ragflow-sdk<0.26.0,>=0.25.1
22
22
  Requires-Dist: requests<3.0.0,>=2.31.0
23
23
  Requires-Dist: tqdm<5.0.0,>=4.67.0
24
24
  Provides-Extra: dev
@@ -1,6 +1,6 @@
1
1
  python-dotenv<2.0.0,>=1.0.0
2
2
  python-frontmatter<2.0.0,>=1.1.0
3
- ragflow-sdk<0.25.1,>=0.25.0
3
+ ragflow-sdk<0.26.0,>=0.25.1
4
4
  requests<3.0.0,>=2.31.0
5
5
  tqdm<5.0.0,>=4.67.0
6
6
 
@@ -1,28 +0,0 @@
1
- """Shared path utilities for Rosetta CLI."""
2
-
3
- from pathlib import Path
4
-
5
-
6
- def resolve_workspace_root(path: Path) -> Path:
7
- """Resolve the workspace root for a publish target.
8
-
9
- Preference order:
10
- 1. Parent of the topmost `instructions/` directory in the target path.
11
- 2. Nearest ancestor containing `.git`.
12
- 3. The target directory itself, or the parent for a file target.
13
- """
14
- resolved = path.resolve()
15
- container = resolved if resolved.is_dir() else resolved.parent
16
-
17
- parts = container.parts
18
- for index, part in enumerate(parts):
19
- if part == "instructions" and index > 0:
20
- return Path(*parts[:index])
21
-
22
- current = container
23
- while current != current.parent:
24
- if (current / ".git").exists():
25
- return current
26
- current = current.parent
27
-
28
- return container
File without changes
File without changes